{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.258602711157456, "eval_steps": 100, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013034410844629822, "grad_norm": 752.3058471679688, "learning_rate": 3.6000000000000005e-08, "loss": 36.8414, "step": 10 }, { "epoch": 0.0026068821689259644, "grad_norm": 842.267822265625, "learning_rate": 7.6e-08, "loss": 38.6302, "step": 20 }, { "epoch": 0.003910323253388947, "grad_norm": 743.934326171875, "learning_rate": 1.16e-07, "loss": 36.3021, "step": 30 }, { "epoch": 0.005213764337851929, "grad_norm": 934.6981201171875, "learning_rate": 1.56e-07, "loss": 36.9985, "step": 40 }, { "epoch": 0.006517205422314911, "grad_norm": 649.5113525390625, "learning_rate": 1.96e-07, "loss": 33.0931, "step": 50 }, { "epoch": 0.007820646506777894, "grad_norm": 552.53662109375, "learning_rate": 2.3600000000000002e-07, "loss": 32.0638, "step": 60 }, { "epoch": 0.009124087591240875, "grad_norm": 582.6400146484375, "learning_rate": 2.7600000000000004e-07, "loss": 30.6222, "step": 70 }, { "epoch": 0.010427528675703858, "grad_norm": 445.86651611328125, "learning_rate": 3.160000000000001e-07, "loss": 28.4039, "step": 80 }, { "epoch": 0.01173096976016684, "grad_norm": 409.77642822265625, "learning_rate": 3.56e-07, "loss": 27.1438, "step": 90 }, { "epoch": 0.013034410844629822, "grad_norm": 361.9573059082031, "learning_rate": 3.9600000000000005e-07, "loss": 24.495, "step": 100 }, { "epoch": 0.013034410844629822, "eval/acc": 4.651162624359131, "step": 100 }, { "epoch": 0.013034410844629822, "eval_loss": 18.547975540161133, "eval_runtime": 0.9127, "eval_samples_per_second": 47.114, "eval_steps_per_second": 1.096, "step": 100 }, { "epoch": 0.014337851929092805, "grad_norm": 313.210693359375, "learning_rate": 4.3600000000000004e-07, "loss": 21.7606, "step": 110 }, { "epoch": 0.01564129301355579, "grad_norm": 329.11328125, "learning_rate": 4.760000000000001e-07, "loss": 19.5126, "step": 120 }, { "epoch": 0.01694473409801877, "grad_norm": 208.9153289794922, "learning_rate": 5.16e-07, "loss": 18.0711, "step": 130 }, { "epoch": 0.01824817518248175, "grad_norm": 220.79066467285156, "learning_rate": 5.560000000000001e-07, "loss": 16.9045, "step": 140 }, { "epoch": 0.019551616266944735, "grad_norm": 167.1522216796875, "learning_rate": 5.960000000000001e-07, "loss": 14.9046, "step": 150 }, { "epoch": 0.020855057351407715, "grad_norm": 154.0718994140625, "learning_rate": 6.360000000000001e-07, "loss": 14.7601, "step": 160 }, { "epoch": 0.0221584984358707, "grad_norm": 146.39012145996094, "learning_rate": 6.76e-07, "loss": 12.5387, "step": 170 }, { "epoch": 0.02346193952033368, "grad_norm": 153.8921356201172, "learning_rate": 7.16e-07, "loss": 11.6882, "step": 180 }, { "epoch": 0.024765380604796664, "grad_norm": 187.7710723876953, "learning_rate": 7.56e-07, "loss": 11.9919, "step": 190 }, { "epoch": 0.026068821689259645, "grad_norm": 163.95228576660156, "learning_rate": 7.960000000000001e-07, "loss": 10.8187, "step": 200 }, { "epoch": 0.026068821689259645, "eval/acc": 11.627906799316406, "step": 200 }, { "epoch": 0.026068821689259645, "eval_loss": 8.683622360229492, "eval_runtime": 0.5536, "eval_samples_per_second": 77.674, "eval_steps_per_second": 1.806, "step": 200 }, { "epoch": 0.02737226277372263, "grad_norm": 121.16007232666016, "learning_rate": 8.36e-07, "loss": 9.9573, "step": 210 }, { "epoch": 0.02867570385818561, "grad_norm": 123.3974609375, "learning_rate": 8.760000000000001e-07, "loss": 9.3645, "step": 220 }, { "epoch": 0.029979144942648594, "grad_norm": 149.9007110595703, "learning_rate": 9.160000000000001e-07, "loss": 9.1913, "step": 230 }, { "epoch": 0.03128258602711158, "grad_norm": 142.5546875, "learning_rate": 9.56e-07, "loss": 8.1642, "step": 240 }, { "epoch": 0.03258602711157456, "grad_norm": 111.52351379394531, "learning_rate": 9.96e-07, "loss": 8.1291, "step": 250 }, { "epoch": 0.03388946819603754, "grad_norm": 112.73123931884766, "learning_rate": 1.0360000000000001e-06, "loss": 7.6783, "step": 260 }, { "epoch": 0.03519290928050052, "grad_norm": 94.62492370605469, "learning_rate": 1.0760000000000002e-06, "loss": 7.333, "step": 270 }, { "epoch": 0.0364963503649635, "grad_norm": 105.54913330078125, "learning_rate": 1.1160000000000002e-06, "loss": 6.6041, "step": 280 }, { "epoch": 0.03779979144942649, "grad_norm": 93.97553253173828, "learning_rate": 1.156e-06, "loss": 6.769, "step": 290 }, { "epoch": 0.03910323253388947, "grad_norm": 78.89429473876953, "learning_rate": 1.196e-06, "loss": 6.3188, "step": 300 }, { "epoch": 0.03910323253388947, "eval/acc": 18.604650497436523, "step": 300 }, { "epoch": 0.03910323253388947, "eval_loss": 6.040415287017822, "eval_runtime": 0.5521, "eval_samples_per_second": 77.881, "eval_steps_per_second": 1.811, "step": 300 }, { "epoch": 0.04040667361835245, "grad_norm": 88.73673248291016, "learning_rate": 1.2360000000000001e-06, "loss": 5.643, "step": 310 }, { "epoch": 0.04171011470281543, "grad_norm": 83.74315643310547, "learning_rate": 1.276e-06, "loss": 5.0575, "step": 320 }, { "epoch": 0.04301355578727842, "grad_norm": 83.0094223022461, "learning_rate": 1.316e-06, "loss": 4.8596, "step": 330 }, { "epoch": 0.0443169968717414, "grad_norm": 66.99898529052734, "learning_rate": 1.356e-06, "loss": 4.444, "step": 340 }, { "epoch": 0.04562043795620438, "grad_norm": 61.338409423828125, "learning_rate": 1.396e-06, "loss": 4.1019, "step": 350 }, { "epoch": 0.04692387904066736, "grad_norm": 49.183837890625, "learning_rate": 1.436e-06, "loss": 3.7076, "step": 360 }, { "epoch": 0.04822732012513034, "grad_norm": 43.407833099365234, "learning_rate": 1.4760000000000001e-06, "loss": 3.4065, "step": 370 }, { "epoch": 0.04953076120959333, "grad_norm": 36.92807388305664, "learning_rate": 1.5160000000000002e-06, "loss": 3.2919, "step": 380 }, { "epoch": 0.05083420229405631, "grad_norm": 31.856853485107422, "learning_rate": 1.556e-06, "loss": 2.8133, "step": 390 }, { "epoch": 0.05213764337851929, "grad_norm": 25.495525360107422, "learning_rate": 1.596e-06, "loss": 2.7088, "step": 400 }, { "epoch": 0.05213764337851929, "eval/acc": 16.279069900512695, "step": 400 }, { "epoch": 0.05213764337851929, "eval_loss": 3.6577463150024414, "eval_runtime": 0.5513, "eval_samples_per_second": 77.993, "eval_steps_per_second": 1.814, "step": 400 }, { "epoch": 0.05344108446298227, "grad_norm": 24.631906509399414, "learning_rate": 1.636e-06, "loss": 2.4304, "step": 410 }, { "epoch": 0.05474452554744526, "grad_norm": 21.909217834472656, "learning_rate": 1.6760000000000001e-06, "loss": 2.534, "step": 420 }, { "epoch": 0.05604796663190824, "grad_norm": 22.260988235473633, "learning_rate": 1.7160000000000002e-06, "loss": 2.2622, "step": 430 }, { "epoch": 0.05735140771637122, "grad_norm": 20.961124420166016, "learning_rate": 1.7560000000000002e-06, "loss": 2.203, "step": 440 }, { "epoch": 0.0586548488008342, "grad_norm": 17.357723236083984, "learning_rate": 1.7960000000000003e-06, "loss": 2.0872, "step": 450 }, { "epoch": 0.05995828988529719, "grad_norm": 23.459308624267578, "learning_rate": 1.8360000000000003e-06, "loss": 2.1486, "step": 460 }, { "epoch": 0.06126173096976017, "grad_norm": 16.572664260864258, "learning_rate": 1.8760000000000001e-06, "loss": 1.8763, "step": 470 }, { "epoch": 0.06256517205422316, "grad_norm": 16.683259963989258, "learning_rate": 1.916e-06, "loss": 1.9834, "step": 480 }, { "epoch": 0.06386861313868614, "grad_norm": 17.584997177124023, "learning_rate": 1.956e-06, "loss": 1.9754, "step": 490 }, { "epoch": 0.06517205422314912, "grad_norm": 16.946754455566406, "learning_rate": 1.996e-06, "loss": 1.84, "step": 500 }, { "epoch": 0.06517205422314912, "eval/acc": 30.23255729675293, "step": 500 }, { "epoch": 0.06517205422314912, "eval_loss": 3.3365631103515625, "eval_runtime": 0.5506, "eval_samples_per_second": 78.1, "eval_steps_per_second": 1.816, "step": 500 }, { "epoch": 0.0664754953076121, "grad_norm": 16.998144149780273, "learning_rate": 2.036e-06, "loss": 1.951, "step": 510 }, { "epoch": 0.06777893639207508, "grad_norm": 15.914703369140625, "learning_rate": 2.076e-06, "loss": 1.6538, "step": 520 }, { "epoch": 0.06908237747653806, "grad_norm": 16.67144775390625, "learning_rate": 2.116e-06, "loss": 1.8369, "step": 530 }, { "epoch": 0.07038581856100104, "grad_norm": 17.469003677368164, "learning_rate": 2.156e-06, "loss": 1.7525, "step": 540 }, { "epoch": 0.07168925964546402, "grad_norm": 17.825891494750977, "learning_rate": 2.1960000000000002e-06, "loss": 1.741, "step": 550 }, { "epoch": 0.072992700729927, "grad_norm": 16.591110229492188, "learning_rate": 2.2360000000000003e-06, "loss": 1.7004, "step": 560 }, { "epoch": 0.07429614181439, "grad_norm": 17.972606658935547, "learning_rate": 2.2760000000000003e-06, "loss": 1.5859, "step": 570 }, { "epoch": 0.07559958289885298, "grad_norm": 16.083576202392578, "learning_rate": 2.3160000000000004e-06, "loss": 1.6437, "step": 580 }, { "epoch": 0.07690302398331596, "grad_norm": 18.013198852539062, "learning_rate": 2.3560000000000004e-06, "loss": 1.6496, "step": 590 }, { "epoch": 0.07820646506777894, "grad_norm": 17.562707901000977, "learning_rate": 2.3960000000000004e-06, "loss": 1.5905, "step": 600 }, { "epoch": 0.07820646506777894, "eval/acc": 25.581396102905273, "step": 600 }, { "epoch": 0.07820646506777894, "eval_loss": 3.272217035293579, "eval_runtime": 0.5506, "eval_samples_per_second": 78.096, "eval_steps_per_second": 1.816, "step": 600 }, { "epoch": 0.07950990615224192, "grad_norm": 16.752840042114258, "learning_rate": 2.4360000000000005e-06, "loss": 1.6781, "step": 610 }, { "epoch": 0.0808133472367049, "grad_norm": 15.609387397766113, "learning_rate": 2.476e-06, "loss": 1.6322, "step": 620 }, { "epoch": 0.08211678832116788, "grad_norm": 18.39044952392578, "learning_rate": 2.516e-06, "loss": 1.6085, "step": 630 }, { "epoch": 0.08342022940563086, "grad_norm": 15.455676078796387, "learning_rate": 2.556e-06, "loss": 1.5879, "step": 640 }, { "epoch": 0.08472367049009384, "grad_norm": 17.240724563598633, "learning_rate": 2.5960000000000002e-06, "loss": 1.6769, "step": 650 }, { "epoch": 0.08602711157455684, "grad_norm": 15.329434394836426, "learning_rate": 2.6360000000000003e-06, "loss": 1.623, "step": 660 }, { "epoch": 0.08733055265901982, "grad_norm": 15.390430450439453, "learning_rate": 2.6760000000000003e-06, "loss": 1.5269, "step": 670 }, { "epoch": 0.0886339937434828, "grad_norm": 13.903982162475586, "learning_rate": 2.7160000000000003e-06, "loss": 1.5213, "step": 680 }, { "epoch": 0.08993743482794578, "grad_norm": 15.723600387573242, "learning_rate": 2.7560000000000004e-06, "loss": 1.544, "step": 690 }, { "epoch": 0.09124087591240876, "grad_norm": 16.601280212402344, "learning_rate": 2.7960000000000004e-06, "loss": 1.5074, "step": 700 }, { "epoch": 0.09124087591240876, "eval/acc": 23.255813598632812, "step": 700 }, { "epoch": 0.09124087591240876, "eval_loss": 3.2280378341674805, "eval_runtime": 0.5537, "eval_samples_per_second": 77.661, "eval_steps_per_second": 1.806, "step": 700 }, { "epoch": 0.09254431699687174, "grad_norm": 15.303380966186523, "learning_rate": 2.8360000000000005e-06, "loss": 1.6147, "step": 710 }, { "epoch": 0.09384775808133472, "grad_norm": 15.977986335754395, "learning_rate": 2.8760000000000005e-06, "loss": 1.4851, "step": 720 }, { "epoch": 0.0951511991657977, "grad_norm": 15.908977508544922, "learning_rate": 2.9160000000000005e-06, "loss": 1.517, "step": 730 }, { "epoch": 0.09645464025026068, "grad_norm": 14.383811950683594, "learning_rate": 2.956e-06, "loss": 1.5444, "step": 740 }, { "epoch": 0.09775808133472368, "grad_norm": 12.663350105285645, "learning_rate": 2.996e-06, "loss": 1.5018, "step": 750 }, { "epoch": 0.09906152241918666, "grad_norm": 15.087569236755371, "learning_rate": 3.0360000000000002e-06, "loss": 1.4602, "step": 760 }, { "epoch": 0.10036496350364964, "grad_norm": 13.563980102539062, "learning_rate": 3.0760000000000003e-06, "loss": 1.3855, "step": 770 }, { "epoch": 0.10166840458811262, "grad_norm": 13.872782707214355, "learning_rate": 3.1160000000000003e-06, "loss": 1.508, "step": 780 }, { "epoch": 0.1029718456725756, "grad_norm": 12.23460865020752, "learning_rate": 3.1560000000000004e-06, "loss": 1.3865, "step": 790 }, { "epoch": 0.10427528675703858, "grad_norm": 15.744820594787598, "learning_rate": 3.1960000000000004e-06, "loss": 1.5776, "step": 800 }, { "epoch": 0.10427528675703858, "eval/acc": 23.255813598632812, "step": 800 }, { "epoch": 0.10427528675703858, "eval_loss": 3.1086668968200684, "eval_runtime": 0.5521, "eval_samples_per_second": 77.884, "eval_steps_per_second": 1.811, "step": 800 }, { "epoch": 0.10557872784150156, "grad_norm": 12.964938163757324, "learning_rate": 3.2360000000000004e-06, "loss": 1.4783, "step": 810 }, { "epoch": 0.10688216892596454, "grad_norm": 16.409147262573242, "learning_rate": 3.2760000000000005e-06, "loss": 1.3763, "step": 820 }, { "epoch": 0.10818561001042754, "grad_norm": 13.46617317199707, "learning_rate": 3.3160000000000005e-06, "loss": 1.4161, "step": 830 }, { "epoch": 0.10948905109489052, "grad_norm": 14.7039213180542, "learning_rate": 3.3560000000000006e-06, "loss": 1.5434, "step": 840 }, { "epoch": 0.1107924921793535, "grad_norm": 14.37901782989502, "learning_rate": 3.3960000000000006e-06, "loss": 1.4212, "step": 850 }, { "epoch": 0.11209593326381648, "grad_norm": 13.210816383361816, "learning_rate": 3.4360000000000006e-06, "loss": 1.4053, "step": 860 }, { "epoch": 0.11339937434827946, "grad_norm": 13.743114471435547, "learning_rate": 3.4760000000000007e-06, "loss": 1.4231, "step": 870 }, { "epoch": 0.11470281543274244, "grad_norm": 12.634490013122559, "learning_rate": 3.5160000000000007e-06, "loss": 1.3584, "step": 880 }, { "epoch": 0.11600625651720542, "grad_norm": 15.65221881866455, "learning_rate": 3.5560000000000008e-06, "loss": 1.399, "step": 890 }, { "epoch": 0.1173096976016684, "grad_norm": 14.89765453338623, "learning_rate": 3.596e-06, "loss": 1.3935, "step": 900 }, { "epoch": 0.1173096976016684, "eval/acc": 23.255813598632812, "step": 900 }, { "epoch": 0.1173096976016684, "eval_loss": 3.096344232559204, "eval_runtime": 0.5513, "eval_samples_per_second": 77.992, "eval_steps_per_second": 1.814, "step": 900 }, { "epoch": 0.11861313868613138, "grad_norm": 14.929734230041504, "learning_rate": 3.636e-06, "loss": 1.4005, "step": 910 }, { "epoch": 0.11991657977059438, "grad_norm": 12.793665885925293, "learning_rate": 3.676e-06, "loss": 1.4152, "step": 920 }, { "epoch": 0.12122002085505736, "grad_norm": 13.772797584533691, "learning_rate": 3.716e-06, "loss": 1.3823, "step": 930 }, { "epoch": 0.12252346193952034, "grad_norm": 11.430520057678223, "learning_rate": 3.756e-06, "loss": 1.3623, "step": 940 }, { "epoch": 0.12382690302398332, "grad_norm": 13.903288841247559, "learning_rate": 3.796e-06, "loss": 1.3491, "step": 950 }, { "epoch": 0.1251303441084463, "grad_norm": 14.225196838378906, "learning_rate": 3.836e-06, "loss": 1.3605, "step": 960 }, { "epoch": 0.1264337851929093, "grad_norm": 13.653999328613281, "learning_rate": 3.876000000000001e-06, "loss": 1.4258, "step": 970 }, { "epoch": 0.12773722627737227, "grad_norm": 12.619461059570312, "learning_rate": 3.916e-06, "loss": 1.2765, "step": 980 }, { "epoch": 0.12904066736183525, "grad_norm": 12.887979507446289, "learning_rate": 3.956000000000001e-06, "loss": 1.3446, "step": 990 }, { "epoch": 0.13034410844629823, "grad_norm": 13.362163543701172, "learning_rate": 3.996e-06, "loss": 1.4322, "step": 1000 }, { "epoch": 0.13034410844629823, "eval/acc": 23.255813598632812, "step": 1000 }, { "epoch": 0.13034410844629823, "eval_loss": 3.0436527729034424, "eval_runtime": 0.5774, "eval_samples_per_second": 74.467, "eval_steps_per_second": 1.732, "step": 1000 }, { "epoch": 0.13164754953076122, "grad_norm": 13.34825611114502, "learning_rate": 4.036000000000001e-06, "loss": 1.3434, "step": 1010 }, { "epoch": 0.1329509906152242, "grad_norm": 12.807318687438965, "learning_rate": 4.0760000000000004e-06, "loss": 1.3971, "step": 1020 }, { "epoch": 0.13425443169968718, "grad_norm": 10.88805103302002, "learning_rate": 4.116000000000001e-06, "loss": 1.3324, "step": 1030 }, { "epoch": 0.13555787278415016, "grad_norm": 12.1721830368042, "learning_rate": 4.1560000000000005e-06, "loss": 1.3454, "step": 1040 }, { "epoch": 0.13686131386861314, "grad_norm": 16.927200317382812, "learning_rate": 4.196e-06, "loss": 1.3912, "step": 1050 }, { "epoch": 0.13816475495307612, "grad_norm": 11.07986068725586, "learning_rate": 4.236e-06, "loss": 1.2858, "step": 1060 }, { "epoch": 0.1394681960375391, "grad_norm": 13.776060104370117, "learning_rate": 4.276e-06, "loss": 1.4019, "step": 1070 }, { "epoch": 0.14077163712200208, "grad_norm": 13.49791145324707, "learning_rate": 4.316e-06, "loss": 1.4221, "step": 1080 }, { "epoch": 0.14207507820646506, "grad_norm": 11.622773170471191, "learning_rate": 4.356e-06, "loss": 1.3731, "step": 1090 }, { "epoch": 0.14337851929092804, "grad_norm": 13.743051528930664, "learning_rate": 4.396e-06, "loss": 1.3511, "step": 1100 }, { "epoch": 0.14337851929092804, "eval/acc": 25.581396102905273, "step": 1100 }, { "epoch": 0.14337851929092804, "eval_loss": 3.016920804977417, "eval_runtime": 0.5524, "eval_samples_per_second": 77.847, "eval_steps_per_second": 1.81, "step": 1100 }, { "epoch": 0.14468196037539102, "grad_norm": 14.456645011901855, "learning_rate": 4.436e-06, "loss": 1.3994, "step": 1110 }, { "epoch": 0.145985401459854, "grad_norm": 12.78945255279541, "learning_rate": 4.476e-06, "loss": 1.3486, "step": 1120 }, { "epoch": 0.14728884254431698, "grad_norm": 12.899959564208984, "learning_rate": 4.5160000000000005e-06, "loss": 1.3342, "step": 1130 }, { "epoch": 0.14859228362878, "grad_norm": 12.025766372680664, "learning_rate": 4.556e-06, "loss": 1.3066, "step": 1140 }, { "epoch": 0.14989572471324297, "grad_norm": 11.712949752807617, "learning_rate": 4.5960000000000006e-06, "loss": 1.4095, "step": 1150 }, { "epoch": 0.15119916579770595, "grad_norm": 14.212655067443848, "learning_rate": 4.636e-06, "loss": 1.2781, "step": 1160 }, { "epoch": 0.15250260688216893, "grad_norm": 13.639365196228027, "learning_rate": 4.676000000000001e-06, "loss": 1.3783, "step": 1170 }, { "epoch": 0.15380604796663191, "grad_norm": 11.413806915283203, "learning_rate": 4.716e-06, "loss": 1.2901, "step": 1180 }, { "epoch": 0.1551094890510949, "grad_norm": 11.520100593566895, "learning_rate": 4.756000000000001e-06, "loss": 1.3142, "step": 1190 }, { "epoch": 0.15641293013555788, "grad_norm": 13.1220064163208, "learning_rate": 4.796e-06, "loss": 1.3254, "step": 1200 }, { "epoch": 0.15641293013555788, "eval/acc": 27.9069766998291, "step": 1200 }, { "epoch": 0.15641293013555788, "eval_loss": 2.8769519329071045, "eval_runtime": 0.5522, "eval_samples_per_second": 77.864, "eval_steps_per_second": 1.811, "step": 1200 }, { "epoch": 0.15771637122002086, "grad_norm": 11.807994842529297, "learning_rate": 4.836e-06, "loss": 1.339, "step": 1210 }, { "epoch": 0.15901981230448384, "grad_norm": 11.208297729492188, "learning_rate": 4.876e-06, "loss": 1.1896, "step": 1220 }, { "epoch": 0.16032325338894682, "grad_norm": 13.063114166259766, "learning_rate": 4.916e-06, "loss": 1.2701, "step": 1230 }, { "epoch": 0.1616266944734098, "grad_norm": 11.611763000488281, "learning_rate": 4.9560000000000005e-06, "loss": 1.3212, "step": 1240 }, { "epoch": 0.16293013555787278, "grad_norm": 10.884580612182617, "learning_rate": 4.996e-06, "loss": 1.227, "step": 1250 }, { "epoch": 0.16423357664233576, "grad_norm": 11.97398567199707, "learning_rate": 5.0360000000000006e-06, "loss": 1.3075, "step": 1260 }, { "epoch": 0.16553701772679874, "grad_norm": 13.973258972167969, "learning_rate": 5.076000000000001e-06, "loss": 1.2388, "step": 1270 }, { "epoch": 0.16684045881126172, "grad_norm": 13.00340461730957, "learning_rate": 5.116000000000001e-06, "loss": 1.3462, "step": 1280 }, { "epoch": 0.1681438998957247, "grad_norm": 11.750258445739746, "learning_rate": 5.156e-06, "loss": 1.2093, "step": 1290 }, { "epoch": 0.16944734098018768, "grad_norm": 12.117288589477539, "learning_rate": 5.196e-06, "loss": 1.223, "step": 1300 }, { "epoch": 0.16944734098018768, "eval/acc": 32.55813980102539, "step": 1300 }, { "epoch": 0.16944734098018768, "eval_loss": 2.936992645263672, "eval_runtime": 0.5526, "eval_samples_per_second": 77.814, "eval_steps_per_second": 1.81, "step": 1300 }, { "epoch": 0.1707507820646507, "grad_norm": 12.747390747070312, "learning_rate": 5.236e-06, "loss": 1.2956, "step": 1310 }, { "epoch": 0.17205422314911367, "grad_norm": 10.593498229980469, "learning_rate": 5.276e-06, "loss": 1.1996, "step": 1320 }, { "epoch": 0.17335766423357665, "grad_norm": 11.945181846618652, "learning_rate": 5.3160000000000004e-06, "loss": 1.33, "step": 1330 }, { "epoch": 0.17466110531803963, "grad_norm": 12.65109634399414, "learning_rate": 5.356e-06, "loss": 1.2295, "step": 1340 }, { "epoch": 0.17596454640250261, "grad_norm": 11.467466354370117, "learning_rate": 5.3960000000000005e-06, "loss": 1.2227, "step": 1350 }, { "epoch": 0.1772679874869656, "grad_norm": 12.898762702941895, "learning_rate": 5.436e-06, "loss": 1.2573, "step": 1360 }, { "epoch": 0.17857142857142858, "grad_norm": 11.188071250915527, "learning_rate": 5.476000000000001e-06, "loss": 1.3103, "step": 1370 }, { "epoch": 0.17987486965589156, "grad_norm": 12.179079055786133, "learning_rate": 5.516e-06, "loss": 1.259, "step": 1380 }, { "epoch": 0.18117831074035454, "grad_norm": 12.672003746032715, "learning_rate": 5.556000000000001e-06, "loss": 1.1756, "step": 1390 }, { "epoch": 0.18248175182481752, "grad_norm": 11.671830177307129, "learning_rate": 5.596e-06, "loss": 1.2905, "step": 1400 }, { "epoch": 0.18248175182481752, "eval/acc": 27.9069766998291, "step": 1400 }, { "epoch": 0.18248175182481752, "eval_loss": 2.8736231327056885, "eval_runtime": 0.5533, "eval_samples_per_second": 77.719, "eval_steps_per_second": 1.807, "step": 1400 }, { "epoch": 0.1837851929092805, "grad_norm": 12.279439926147461, "learning_rate": 5.636000000000001e-06, "loss": 1.2422, "step": 1410 }, { "epoch": 0.18508863399374348, "grad_norm": 12.59632396697998, "learning_rate": 5.676e-06, "loss": 1.1998, "step": 1420 }, { "epoch": 0.18639207507820646, "grad_norm": 10.290858268737793, "learning_rate": 5.716000000000001e-06, "loss": 1.3073, "step": 1430 }, { "epoch": 0.18769551616266944, "grad_norm": 11.450456619262695, "learning_rate": 5.7560000000000005e-06, "loss": 1.2917, "step": 1440 }, { "epoch": 0.18899895724713242, "grad_norm": 10.898682594299316, "learning_rate": 5.796000000000001e-06, "loss": 1.1988, "step": 1450 }, { "epoch": 0.1903023983315954, "grad_norm": 11.755196571350098, "learning_rate": 5.8360000000000005e-06, "loss": 1.4151, "step": 1460 }, { "epoch": 0.19160583941605838, "grad_norm": 10.369739532470703, "learning_rate": 5.876000000000001e-06, "loss": 1.2748, "step": 1470 }, { "epoch": 0.19290928050052136, "grad_norm": 10.368874549865723, "learning_rate": 5.916000000000001e-06, "loss": 1.2456, "step": 1480 }, { "epoch": 0.19421272158498437, "grad_norm": 10.07337474822998, "learning_rate": 5.956000000000001e-06, "loss": 1.1918, "step": 1490 }, { "epoch": 0.19551616266944735, "grad_norm": 12.127270698547363, "learning_rate": 5.996000000000001e-06, "loss": 1.1726, "step": 1500 }, { "epoch": 0.19551616266944735, "eval/acc": 32.55813980102539, "step": 1500 }, { "epoch": 0.19551616266944735, "eval_loss": 2.9003522396087646, "eval_runtime": 0.5544, "eval_samples_per_second": 77.556, "eval_steps_per_second": 1.804, "step": 1500 }, { "epoch": 0.19681960375391033, "grad_norm": 12.454967498779297, "learning_rate": 6.036000000000001e-06, "loss": 1.3509, "step": 1510 }, { "epoch": 0.1981230448383733, "grad_norm": 11.402469635009766, "learning_rate": 6.076000000000001e-06, "loss": 1.2365, "step": 1520 }, { "epoch": 0.1994264859228363, "grad_norm": 12.890278816223145, "learning_rate": 6.116000000000001e-06, "loss": 1.2295, "step": 1530 }, { "epoch": 0.20072992700729927, "grad_norm": 12.542150497436523, "learning_rate": 6.156000000000001e-06, "loss": 1.2789, "step": 1540 }, { "epoch": 0.20203336809176226, "grad_norm": 10.868870735168457, "learning_rate": 6.196000000000001e-06, "loss": 1.232, "step": 1550 }, { "epoch": 0.20333680917622524, "grad_norm": 12.972379684448242, "learning_rate": 6.236000000000001e-06, "loss": 1.2111, "step": 1560 }, { "epoch": 0.20464025026068822, "grad_norm": 14.00292682647705, "learning_rate": 6.2760000000000006e-06, "loss": 1.1551, "step": 1570 }, { "epoch": 0.2059436913451512, "grad_norm": 11.713733673095703, "learning_rate": 6.316000000000001e-06, "loss": 1.2256, "step": 1580 }, { "epoch": 0.20724713242961418, "grad_norm": 11.81581974029541, "learning_rate": 6.356000000000001e-06, "loss": 1.2375, "step": 1590 }, { "epoch": 0.20855057351407716, "grad_norm": 9.595722198486328, "learning_rate": 6.396e-06, "loss": 1.2307, "step": 1600 }, { "epoch": 0.20855057351407716, "eval/acc": 34.88372039794922, "step": 1600 }, { "epoch": 0.20855057351407716, "eval_loss": 2.89196515083313, "eval_runtime": 0.5526, "eval_samples_per_second": 77.814, "eval_steps_per_second": 1.81, "step": 1600 }, { "epoch": 0.20985401459854014, "grad_norm": 14.193363189697266, "learning_rate": 6.436e-06, "loss": 1.2503, "step": 1610 }, { "epoch": 0.21115745568300312, "grad_norm": 10.671473503112793, "learning_rate": 6.476e-06, "loss": 1.1697, "step": 1620 }, { "epoch": 0.2124608967674661, "grad_norm": 12.921130180358887, "learning_rate": 6.516e-06, "loss": 1.1124, "step": 1630 }, { "epoch": 0.21376433785192908, "grad_norm": 12.321484565734863, "learning_rate": 6.556e-06, "loss": 1.2272, "step": 1640 }, { "epoch": 0.21506777893639206, "grad_norm": 13.49770450592041, "learning_rate": 6.596e-06, "loss": 1.2014, "step": 1650 }, { "epoch": 0.21637122002085507, "grad_norm": 10.752897262573242, "learning_rate": 6.6360000000000005e-06, "loss": 1.174, "step": 1660 }, { "epoch": 0.21767466110531805, "grad_norm": 12.024086952209473, "learning_rate": 6.676e-06, "loss": 1.1625, "step": 1670 }, { "epoch": 0.21897810218978103, "grad_norm": 12.498847961425781, "learning_rate": 6.716000000000001e-06, "loss": 1.1683, "step": 1680 }, { "epoch": 0.220281543274244, "grad_norm": 11.955095291137695, "learning_rate": 6.756e-06, "loss": 1.2441, "step": 1690 }, { "epoch": 0.221584984358707, "grad_norm": 10.969300270080566, "learning_rate": 6.796000000000001e-06, "loss": 1.2547, "step": 1700 }, { "epoch": 0.221584984358707, "eval/acc": 32.55813980102539, "step": 1700 }, { "epoch": 0.221584984358707, "eval_loss": 2.735595464706421, "eval_runtime": 0.5549, "eval_samples_per_second": 77.488, "eval_steps_per_second": 1.802, "step": 1700 }, { "epoch": 0.22288842544316997, "grad_norm": 10.412980079650879, "learning_rate": 6.836e-06, "loss": 1.2473, "step": 1710 }, { "epoch": 0.22419186652763295, "grad_norm": 12.962031364440918, "learning_rate": 6.876000000000001e-06, "loss": 1.1759, "step": 1720 }, { "epoch": 0.22549530761209594, "grad_norm": 10.370616912841797, "learning_rate": 6.916e-06, "loss": 1.1444, "step": 1730 }, { "epoch": 0.22679874869655892, "grad_norm": 12.069488525390625, "learning_rate": 6.956000000000001e-06, "loss": 1.2177, "step": 1740 }, { "epoch": 0.2281021897810219, "grad_norm": 11.831305503845215, "learning_rate": 6.9960000000000004e-06, "loss": 1.2315, "step": 1750 }, { "epoch": 0.22940563086548488, "grad_norm": 10.261811256408691, "learning_rate": 7.036000000000001e-06, "loss": 1.1478, "step": 1760 }, { "epoch": 0.23070907194994786, "grad_norm": 10.814574241638184, "learning_rate": 7.0760000000000005e-06, "loss": 1.221, "step": 1770 }, { "epoch": 0.23201251303441084, "grad_norm": 11.184773445129395, "learning_rate": 7.116000000000001e-06, "loss": 1.2984, "step": 1780 }, { "epoch": 0.23331595411887382, "grad_norm": 11.853842735290527, "learning_rate": 7.156000000000001e-06, "loss": 1.2325, "step": 1790 }, { "epoch": 0.2346193952033368, "grad_norm": 10.178322792053223, "learning_rate": 7.196000000000001e-06, "loss": 1.1664, "step": 1800 }, { "epoch": 0.2346193952033368, "eval/acc": 34.88372039794922, "step": 1800 }, { "epoch": 0.2346193952033368, "eval_loss": 2.893901824951172, "eval_runtime": 0.5541, "eval_samples_per_second": 77.605, "eval_steps_per_second": 1.805, "step": 1800 }, { "epoch": 0.23592283628779978, "grad_norm": 11.30508804321289, "learning_rate": 7.236000000000001e-06, "loss": 1.2602, "step": 1810 }, { "epoch": 0.23722627737226276, "grad_norm": 11.195526123046875, "learning_rate": 7.276000000000001e-06, "loss": 1.1529, "step": 1820 }, { "epoch": 0.23852971845672574, "grad_norm": 11.082310676574707, "learning_rate": 7.316000000000001e-06, "loss": 1.2024, "step": 1830 }, { "epoch": 0.23983315954118875, "grad_norm": 10.974154472351074, "learning_rate": 7.356000000000001e-06, "loss": 1.1329, "step": 1840 }, { "epoch": 0.24113660062565173, "grad_norm": 10.155501365661621, "learning_rate": 7.396000000000001e-06, "loss": 1.1555, "step": 1850 }, { "epoch": 0.2424400417101147, "grad_norm": 10.690115928649902, "learning_rate": 7.436000000000001e-06, "loss": 1.1916, "step": 1860 }, { "epoch": 0.2437434827945777, "grad_norm": 11.77647876739502, "learning_rate": 7.476000000000001e-06, "loss": 1.0674, "step": 1870 }, { "epoch": 0.24504692387904067, "grad_norm": 13.536336898803711, "learning_rate": 7.516000000000001e-06, "loss": 1.2325, "step": 1880 }, { "epoch": 0.24635036496350365, "grad_norm": 10.988912582397461, "learning_rate": 7.556000000000001e-06, "loss": 1.1597, "step": 1890 }, { "epoch": 0.24765380604796663, "grad_norm": 11.346904754638672, "learning_rate": 7.5960000000000015e-06, "loss": 1.1883, "step": 1900 }, { "epoch": 0.24765380604796663, "eval/acc": 34.88372039794922, "step": 1900 }, { "epoch": 0.24765380604796663, "eval_loss": 2.8580784797668457, "eval_runtime": 0.5538, "eval_samples_per_second": 77.645, "eval_steps_per_second": 1.806, "step": 1900 }, { "epoch": 0.24895724713242962, "grad_norm": 10.486469268798828, "learning_rate": 7.636e-06, "loss": 1.1989, "step": 1910 }, { "epoch": 0.2502606882168926, "grad_norm": 11.191844940185547, "learning_rate": 7.676e-06, "loss": 1.1934, "step": 1920 }, { "epoch": 0.2515641293013556, "grad_norm": 12.818986892700195, "learning_rate": 7.716e-06, "loss": 1.1856, "step": 1930 }, { "epoch": 0.2528675703858186, "grad_norm": 9.980338096618652, "learning_rate": 7.756e-06, "loss": 1.1685, "step": 1940 }, { "epoch": 0.25417101147028154, "grad_norm": 10.59505558013916, "learning_rate": 7.796e-06, "loss": 1.0932, "step": 1950 }, { "epoch": 0.25547445255474455, "grad_norm": 10.21989631652832, "learning_rate": 7.836000000000001e-06, "loss": 1.2254, "step": 1960 }, { "epoch": 0.2567778936392075, "grad_norm": 9.082103729248047, "learning_rate": 7.876e-06, "loss": 1.1439, "step": 1970 }, { "epoch": 0.2580813347236705, "grad_norm": 10.54208755493164, "learning_rate": 7.916e-06, "loss": 1.2031, "step": 1980 }, { "epoch": 0.25938477580813346, "grad_norm": 11.807458877563477, "learning_rate": 7.956e-06, "loss": 1.1575, "step": 1990 }, { "epoch": 0.26068821689259647, "grad_norm": 11.20957088470459, "learning_rate": 7.996000000000001e-06, "loss": 1.152, "step": 2000 }, { "epoch": 0.26068821689259647, "eval/acc": 34.88372039794922, "step": 2000 }, { "epoch": 0.26068821689259647, "eval_loss": 2.9252498149871826, "eval_runtime": 0.5546, "eval_samples_per_second": 77.535, "eval_steps_per_second": 1.803, "step": 2000 }, { "epoch": 0.2619916579770594, "grad_norm": 9.236865997314453, "learning_rate": 8.036e-06, "loss": 1.1518, "step": 2010 }, { "epoch": 0.26329509906152243, "grad_norm": 10.173084259033203, "learning_rate": 8.076e-06, "loss": 1.1738, "step": 2020 }, { "epoch": 0.2645985401459854, "grad_norm": 11.158531188964844, "learning_rate": 8.116e-06, "loss": 1.1942, "step": 2030 }, { "epoch": 0.2659019812304484, "grad_norm": 10.654205322265625, "learning_rate": 8.156000000000001e-06, "loss": 1.1965, "step": 2040 }, { "epoch": 0.26720542231491134, "grad_norm": 10.954093933105469, "learning_rate": 8.196e-06, "loss": 1.0949, "step": 2050 }, { "epoch": 0.26850886339937435, "grad_norm": 10.480634689331055, "learning_rate": 8.236e-06, "loss": 1.2128, "step": 2060 }, { "epoch": 0.2698123044838373, "grad_norm": 9.64358901977539, "learning_rate": 8.276e-06, "loss": 1.1713, "step": 2070 }, { "epoch": 0.2711157455683003, "grad_norm": 9.68060302734375, "learning_rate": 8.316000000000001e-06, "loss": 1.1275, "step": 2080 }, { "epoch": 0.27241918665276327, "grad_norm": 10.211024284362793, "learning_rate": 8.356000000000001e-06, "loss": 1.2368, "step": 2090 }, { "epoch": 0.2737226277372263, "grad_norm": 10.19279670715332, "learning_rate": 8.396e-06, "loss": 1.1649, "step": 2100 }, { "epoch": 0.2737226277372263, "eval/acc": 37.20930099487305, "step": 2100 }, { "epoch": 0.2737226277372263, "eval_loss": 2.894489288330078, "eval_runtime": 0.5551, "eval_samples_per_second": 77.469, "eval_steps_per_second": 1.802, "step": 2100 }, { "epoch": 0.2750260688216893, "grad_norm": 11.496298789978027, "learning_rate": 8.436e-06, "loss": 1.1936, "step": 2110 }, { "epoch": 0.27632950990615224, "grad_norm": 10.342120170593262, "learning_rate": 8.476000000000002e-06, "loss": 1.2169, "step": 2120 }, { "epoch": 0.27763295099061525, "grad_norm": 10.583955764770508, "learning_rate": 8.516000000000001e-06, "loss": 1.1169, "step": 2130 }, { "epoch": 0.2789363920750782, "grad_norm": 10.484763145446777, "learning_rate": 8.556e-06, "loss": 1.1492, "step": 2140 }, { "epoch": 0.2802398331595412, "grad_norm": 10.46810245513916, "learning_rate": 8.596e-06, "loss": 1.2559, "step": 2150 }, { "epoch": 0.28154327424400416, "grad_norm": 10.129209518432617, "learning_rate": 8.636000000000002e-06, "loss": 1.0982, "step": 2160 }, { "epoch": 0.28284671532846717, "grad_norm": 9.844231605529785, "learning_rate": 8.676000000000001e-06, "loss": 1.1378, "step": 2170 }, { "epoch": 0.2841501564129301, "grad_norm": 11.35154914855957, "learning_rate": 8.716000000000001e-06, "loss": 1.2192, "step": 2180 }, { "epoch": 0.28545359749739313, "grad_norm": 11.104358673095703, "learning_rate": 8.756e-06, "loss": 1.1804, "step": 2190 }, { "epoch": 0.2867570385818561, "grad_norm": 8.416515350341797, "learning_rate": 8.796000000000002e-06, "loss": 1.135, "step": 2200 }, { "epoch": 0.2867570385818561, "eval/acc": 32.55813980102539, "step": 2200 }, { "epoch": 0.2867570385818561, "eval_loss": 2.850806713104248, "eval_runtime": 0.5533, "eval_samples_per_second": 77.714, "eval_steps_per_second": 1.807, "step": 2200 }, { "epoch": 0.2880604796663191, "grad_norm": 11.20003890991211, "learning_rate": 8.836000000000001e-06, "loss": 1.1998, "step": 2210 }, { "epoch": 0.28936392075078204, "grad_norm": 12.205933570861816, "learning_rate": 8.876e-06, "loss": 1.1331, "step": 2220 }, { "epoch": 0.29066736183524505, "grad_norm": 9.875853538513184, "learning_rate": 8.916e-06, "loss": 1.0744, "step": 2230 }, { "epoch": 0.291970802919708, "grad_norm": 11.795681953430176, "learning_rate": 8.956e-06, "loss": 1.1378, "step": 2240 }, { "epoch": 0.293274244004171, "grad_norm": 9.370049476623535, "learning_rate": 8.996e-06, "loss": 1.0586, "step": 2250 }, { "epoch": 0.29457768508863397, "grad_norm": 10.6432466506958, "learning_rate": 9.036e-06, "loss": 1.1936, "step": 2260 }, { "epoch": 0.295881126173097, "grad_norm": 10.588776588439941, "learning_rate": 9.076000000000001e-06, "loss": 1.0813, "step": 2270 }, { "epoch": 0.29718456725756, "grad_norm": 10.122645378112793, "learning_rate": 9.116e-06, "loss": 1.2067, "step": 2280 }, { "epoch": 0.29848800834202294, "grad_norm": 9.388029098510742, "learning_rate": 9.156e-06, "loss": 1.1546, "step": 2290 }, { "epoch": 0.29979144942648595, "grad_norm": 9.928315162658691, "learning_rate": 9.196e-06, "loss": 1.1134, "step": 2300 }, { "epoch": 0.29979144942648595, "eval/acc": 41.86046600341797, "step": 2300 }, { "epoch": 0.29979144942648595, "eval_loss": 2.8716952800750732, "eval_runtime": 0.554, "eval_samples_per_second": 77.619, "eval_steps_per_second": 1.805, "step": 2300 }, { "epoch": 0.3010948905109489, "grad_norm": 10.03122615814209, "learning_rate": 9.236000000000001e-06, "loss": 1.3212, "step": 2310 }, { "epoch": 0.3023983315954119, "grad_norm": 10.512228012084961, "learning_rate": 9.276e-06, "loss": 1.1369, "step": 2320 }, { "epoch": 0.30370177267987486, "grad_norm": 10.605701446533203, "learning_rate": 9.316e-06, "loss": 1.2389, "step": 2330 }, { "epoch": 0.30500521376433787, "grad_norm": 11.414910316467285, "learning_rate": 9.356e-06, "loss": 1.1003, "step": 2340 }, { "epoch": 0.3063086548488008, "grad_norm": 9.643972396850586, "learning_rate": 9.396000000000001e-06, "loss": 1.1028, "step": 2350 }, { "epoch": 0.30761209593326383, "grad_norm": 11.462910652160645, "learning_rate": 9.436e-06, "loss": 1.1437, "step": 2360 }, { "epoch": 0.3089155370177268, "grad_norm": 10.556984901428223, "learning_rate": 9.476e-06, "loss": 1.1802, "step": 2370 }, { "epoch": 0.3102189781021898, "grad_norm": 11.555737495422363, "learning_rate": 9.516e-06, "loss": 1.1376, "step": 2380 }, { "epoch": 0.31152241918665274, "grad_norm": 9.358216285705566, "learning_rate": 9.556000000000001e-06, "loss": 1.0511, "step": 2390 }, { "epoch": 0.31282586027111575, "grad_norm": 9.375101089477539, "learning_rate": 9.596000000000001e-06, "loss": 1.0519, "step": 2400 }, { "epoch": 0.31282586027111575, "eval/acc": 32.55813980102539, "step": 2400 }, { "epoch": 0.31282586027111575, "eval_loss": 2.992863655090332, "eval_runtime": 0.5533, "eval_samples_per_second": 77.716, "eval_steps_per_second": 1.807, "step": 2400 }, { "epoch": 0.3141293013555787, "grad_norm": 10.487874984741211, "learning_rate": 9.636e-06, "loss": 1.0795, "step": 2410 }, { "epoch": 0.3154327424400417, "grad_norm": 14.056046485900879, "learning_rate": 9.676e-06, "loss": 1.1586, "step": 2420 }, { "epoch": 0.31673618352450467, "grad_norm": 10.049337387084961, "learning_rate": 9.716000000000002e-06, "loss": 1.1348, "step": 2430 }, { "epoch": 0.3180396246089677, "grad_norm": 11.084318161010742, "learning_rate": 9.756000000000001e-06, "loss": 1.056, "step": 2440 }, { "epoch": 0.3193430656934307, "grad_norm": 10.96147346496582, "learning_rate": 9.796e-06, "loss": 1.1561, "step": 2450 }, { "epoch": 0.32064650677789364, "grad_norm": 9.765122413635254, "learning_rate": 9.836e-06, "loss": 1.1844, "step": 2460 }, { "epoch": 0.32194994786235664, "grad_norm": 12.490370750427246, "learning_rate": 9.876000000000002e-06, "loss": 1.1342, "step": 2470 }, { "epoch": 0.3232533889468196, "grad_norm": 9.971538543701172, "learning_rate": 9.916000000000001e-06, "loss": 1.0907, "step": 2480 }, { "epoch": 0.3245568300312826, "grad_norm": 11.306795120239258, "learning_rate": 9.956000000000001e-06, "loss": 1.1521, "step": 2490 }, { "epoch": 0.32586027111574556, "grad_norm": 10.270991325378418, "learning_rate": 9.996e-06, "loss": 1.1473, "step": 2500 }, { "epoch": 0.32586027111574556, "eval/acc": 37.20930099487305, "step": 2500 }, { "epoch": 0.32586027111574556, "eval_loss": 2.875680446624756, "eval_runtime": 0.554, "eval_samples_per_second": 77.622, "eval_steps_per_second": 1.805, "step": 2500 }, { "epoch": 0.32716371220020857, "grad_norm": 10.23509407043457, "learning_rate": 9.996e-06, "loss": 1.216, "step": 2510 }, { "epoch": 0.3284671532846715, "grad_norm": 12.120686531066895, "learning_rate": 9.991555555555557e-06, "loss": 1.0669, "step": 2520 }, { "epoch": 0.32977059436913453, "grad_norm": 11.20948314666748, "learning_rate": 9.987111111111112e-06, "loss": 1.0889, "step": 2530 }, { "epoch": 0.3310740354535975, "grad_norm": 11.085042953491211, "learning_rate": 9.982666666666667e-06, "loss": 1.212, "step": 2540 }, { "epoch": 0.3323774765380605, "grad_norm": 11.783760070800781, "learning_rate": 9.978222222222223e-06, "loss": 1.2059, "step": 2550 }, { "epoch": 0.33368091762252344, "grad_norm": 11.339371681213379, "learning_rate": 9.973777777777778e-06, "loss": 1.1027, "step": 2560 }, { "epoch": 0.33498435870698645, "grad_norm": 10.946623802185059, "learning_rate": 9.969333333333335e-06, "loss": 1.1663, "step": 2570 }, { "epoch": 0.3362877997914494, "grad_norm": 9.797304153442383, "learning_rate": 9.96488888888889e-06, "loss": 1.147, "step": 2580 }, { "epoch": 0.3375912408759124, "grad_norm": 10.305734634399414, "learning_rate": 9.960444444444444e-06, "loss": 1.2113, "step": 2590 }, { "epoch": 0.33889468196037537, "grad_norm": 9.742680549621582, "learning_rate": 9.956000000000001e-06, "loss": 1.1096, "step": 2600 }, { "epoch": 0.33889468196037537, "eval/acc": 32.55813980102539, "step": 2600 }, { "epoch": 0.33889468196037537, "eval_loss": 2.838628053665161, "eval_runtime": 0.5541, "eval_samples_per_second": 77.604, "eval_steps_per_second": 1.805, "step": 2600 }, { "epoch": 0.3401981230448384, "grad_norm": 11.681222915649414, "learning_rate": 9.951555555555556e-06, "loss": 1.1573, "step": 2610 }, { "epoch": 0.3415015641293014, "grad_norm": 10.580199241638184, "learning_rate": 9.947111111111112e-06, "loss": 1.1942, "step": 2620 }, { "epoch": 0.34280500521376434, "grad_norm": 9.525206565856934, "learning_rate": 9.942666666666667e-06, "loss": 1.139, "step": 2630 }, { "epoch": 0.34410844629822734, "grad_norm": 11.521892547607422, "learning_rate": 9.938222222222224e-06, "loss": 1.2106, "step": 2640 }, { "epoch": 0.3454118873826903, "grad_norm": 10.282144546508789, "learning_rate": 9.933777777777779e-06, "loss": 1.068, "step": 2650 }, { "epoch": 0.3467153284671533, "grad_norm": 10.942089080810547, "learning_rate": 9.929333333333333e-06, "loss": 1.0709, "step": 2660 }, { "epoch": 0.34801876955161626, "grad_norm": 12.269514083862305, "learning_rate": 9.92488888888889e-06, "loss": 1.073, "step": 2670 }, { "epoch": 0.34932221063607927, "grad_norm": 10.467517852783203, "learning_rate": 9.920444444444445e-06, "loss": 1.1178, "step": 2680 }, { "epoch": 0.3506256517205422, "grad_norm": 11.05263900756836, "learning_rate": 9.916000000000001e-06, "loss": 1.1126, "step": 2690 }, { "epoch": 0.35192909280500523, "grad_norm": 10.848026275634766, "learning_rate": 9.911555555555556e-06, "loss": 1.1425, "step": 2700 }, { "epoch": 0.35192909280500523, "eval/acc": 32.55813980102539, "step": 2700 }, { "epoch": 0.35192909280500523, "eval_loss": 2.8443257808685303, "eval_runtime": 0.5552, "eval_samples_per_second": 77.455, "eval_steps_per_second": 1.801, "step": 2700 }, { "epoch": 0.3532325338894682, "grad_norm": 9.627706527709961, "learning_rate": 9.907111111111111e-06, "loss": 1.0892, "step": 2710 }, { "epoch": 0.3545359749739312, "grad_norm": 9.48183822631836, "learning_rate": 9.902666666666668e-06, "loss": 1.1611, "step": 2720 }, { "epoch": 0.35583941605839414, "grad_norm": 10.31680965423584, "learning_rate": 9.898222222222224e-06, "loss": 1.1474, "step": 2730 }, { "epoch": 0.35714285714285715, "grad_norm": 9.613831520080566, "learning_rate": 9.893777777777779e-06, "loss": 1.0592, "step": 2740 }, { "epoch": 0.3584462982273201, "grad_norm": 14.002620697021484, "learning_rate": 9.889333333333334e-06, "loss": 1.1399, "step": 2750 }, { "epoch": 0.3597497393117831, "grad_norm": 9.574627876281738, "learning_rate": 9.884888888888889e-06, "loss": 1.0702, "step": 2760 }, { "epoch": 0.36105318039624607, "grad_norm": 11.370795249938965, "learning_rate": 9.880444444444445e-06, "loss": 1.1089, "step": 2770 }, { "epoch": 0.3623566214807091, "grad_norm": 11.44530963897705, "learning_rate": 9.876000000000002e-06, "loss": 1.0896, "step": 2780 }, { "epoch": 0.3636600625651721, "grad_norm": 10.26310920715332, "learning_rate": 9.871555555555557e-06, "loss": 1.2288, "step": 2790 }, { "epoch": 0.36496350364963503, "grad_norm": 10.72587776184082, "learning_rate": 9.867111111111111e-06, "loss": 1.1154, "step": 2800 }, { "epoch": 0.36496350364963503, "eval/acc": 39.53488540649414, "step": 2800 }, { "epoch": 0.36496350364963503, "eval_loss": 2.9019012451171875, "eval_runtime": 0.553, "eval_samples_per_second": 77.752, "eval_steps_per_second": 1.808, "step": 2800 }, { "epoch": 0.36626694473409804, "grad_norm": 10.30538272857666, "learning_rate": 9.862666666666668e-06, "loss": 1.086, "step": 2810 }, { "epoch": 0.367570385818561, "grad_norm": 9.331382751464844, "learning_rate": 9.858222222222223e-06, "loss": 1.2179, "step": 2820 }, { "epoch": 0.368873826903024, "grad_norm": 9.834467887878418, "learning_rate": 9.85377777777778e-06, "loss": 1.1286, "step": 2830 }, { "epoch": 0.37017726798748696, "grad_norm": 11.874444961547852, "learning_rate": 9.849333333333334e-06, "loss": 1.1325, "step": 2840 }, { "epoch": 0.37148070907194997, "grad_norm": 10.40954875946045, "learning_rate": 9.844888888888889e-06, "loss": 1.1669, "step": 2850 }, { "epoch": 0.3727841501564129, "grad_norm": 10.013657569885254, "learning_rate": 9.840444444444446e-06, "loss": 1.0895, "step": 2860 }, { "epoch": 0.3740875912408759, "grad_norm": 10.641711235046387, "learning_rate": 9.836e-06, "loss": 1.1342, "step": 2870 }, { "epoch": 0.3753910323253389, "grad_norm": 9.41917896270752, "learning_rate": 9.831555555555557e-06, "loss": 1.0698, "step": 2880 }, { "epoch": 0.3766944734098019, "grad_norm": 10.998407363891602, "learning_rate": 9.827111111111112e-06, "loss": 1.0777, "step": 2890 }, { "epoch": 0.37799791449426484, "grad_norm": 10.565347671508789, "learning_rate": 9.822666666666667e-06, "loss": 1.1446, "step": 2900 }, { "epoch": 0.37799791449426484, "eval/acc": 34.88372039794922, "step": 2900 }, { "epoch": 0.37799791449426484, "eval_loss": 2.8377606868743896, "eval_runtime": 0.5537, "eval_samples_per_second": 77.653, "eval_steps_per_second": 1.806, "step": 2900 }, { "epoch": 0.37930135557872785, "grad_norm": 10.49682903289795, "learning_rate": 9.818222222222223e-06, "loss": 1.0774, "step": 2910 }, { "epoch": 0.3806047966631908, "grad_norm": 10.447504997253418, "learning_rate": 9.813777777777778e-06, "loss": 1.0116, "step": 2920 }, { "epoch": 0.3819082377476538, "grad_norm": 9.127096176147461, "learning_rate": 9.809333333333335e-06, "loss": 1.0786, "step": 2930 }, { "epoch": 0.38321167883211676, "grad_norm": 11.165003776550293, "learning_rate": 9.80488888888889e-06, "loss": 1.0451, "step": 2940 }, { "epoch": 0.3845151199165798, "grad_norm": 11.500470161437988, "learning_rate": 9.800444444444446e-06, "loss": 1.1676, "step": 2950 }, { "epoch": 0.3858185610010427, "grad_norm": 9.945548057556152, "learning_rate": 9.796e-06, "loss": 1.0829, "step": 2960 }, { "epoch": 0.38712200208550573, "grad_norm": 9.096894264221191, "learning_rate": 9.791555555555556e-06, "loss": 1.1571, "step": 2970 }, { "epoch": 0.38842544316996874, "grad_norm": 9.676164627075195, "learning_rate": 9.787111111111112e-06, "loss": 1.1088, "step": 2980 }, { "epoch": 0.3897288842544317, "grad_norm": 9.788176536560059, "learning_rate": 9.782666666666667e-06, "loss": 1.1283, "step": 2990 }, { "epoch": 0.3910323253388947, "grad_norm": 9.841941833496094, "learning_rate": 9.778222222222224e-06, "loss": 1.1106, "step": 3000 }, { "epoch": 0.3910323253388947, "eval/acc": 34.88372039794922, "step": 3000 }, { "epoch": 0.3910323253388947, "eval_loss": 2.733646869659424, "eval_runtime": 0.5496, "eval_samples_per_second": 78.234, "eval_steps_per_second": 1.819, "step": 3000 }, { "epoch": 0.39233576642335766, "grad_norm": 11.72174072265625, "learning_rate": 9.773777777777778e-06, "loss": 1.0701, "step": 3010 }, { "epoch": 0.39363920750782067, "grad_norm": 9.394064903259277, "learning_rate": 9.769333333333333e-06, "loss": 1.0966, "step": 3020 }, { "epoch": 0.3949426485922836, "grad_norm": 10.733139991760254, "learning_rate": 9.76488888888889e-06, "loss": 1.1911, "step": 3030 }, { "epoch": 0.3962460896767466, "grad_norm": 10.33555793762207, "learning_rate": 9.760444444444446e-06, "loss": 1.0843, "step": 3040 }, { "epoch": 0.3975495307612096, "grad_norm": 10.826848983764648, "learning_rate": 9.756000000000001e-06, "loss": 1.2552, "step": 3050 }, { "epoch": 0.3988529718456726, "grad_norm": 14.324176788330078, "learning_rate": 9.751555555555556e-06, "loss": 1.0654, "step": 3060 }, { "epoch": 0.40015641293013554, "grad_norm": 9.932692527770996, "learning_rate": 9.74711111111111e-06, "loss": 1.1099, "step": 3070 }, { "epoch": 0.40145985401459855, "grad_norm": 8.769567489624023, "learning_rate": 9.742666666666667e-06, "loss": 1.126, "step": 3080 }, { "epoch": 0.4027632950990615, "grad_norm": 9.914202690124512, "learning_rate": 9.738222222222224e-06, "loss": 1.0349, "step": 3090 }, { "epoch": 0.4040667361835245, "grad_norm": 8.979110717773438, "learning_rate": 9.733777777777779e-06, "loss": 1.1147, "step": 3100 }, { "epoch": 0.4040667361835245, "eval/acc": 34.88372039794922, "step": 3100 }, { "epoch": 0.4040667361835245, "eval_loss": 2.9223451614379883, "eval_runtime": 0.5703, "eval_samples_per_second": 75.404, "eval_steps_per_second": 1.754, "step": 3100 }, { "epoch": 0.40537017726798746, "grad_norm": 10.178040504455566, "learning_rate": 9.729333333333334e-06, "loss": 1.0661, "step": 3110 }, { "epoch": 0.40667361835245047, "grad_norm": 9.03530216217041, "learning_rate": 9.724888888888888e-06, "loss": 1.1282, "step": 3120 }, { "epoch": 0.4079770594369134, "grad_norm": 9.545401573181152, "learning_rate": 9.720444444444445e-06, "loss": 1.0933, "step": 3130 }, { "epoch": 0.40928050052137643, "grad_norm": 9.34640121459961, "learning_rate": 9.716000000000002e-06, "loss": 1.1273, "step": 3140 }, { "epoch": 0.41058394160583944, "grad_norm": 9.456986427307129, "learning_rate": 9.711555555555556e-06, "loss": 1.1527, "step": 3150 }, { "epoch": 0.4118873826903024, "grad_norm": 10.370234489440918, "learning_rate": 9.707111111111111e-06, "loss": 1.0639, "step": 3160 }, { "epoch": 0.4131908237747654, "grad_norm": 9.064216613769531, "learning_rate": 9.702666666666668e-06, "loss": 1.0942, "step": 3170 }, { "epoch": 0.41449426485922836, "grad_norm": 10.234908103942871, "learning_rate": 9.698222222222223e-06, "loss": 1.167, "step": 3180 }, { "epoch": 0.41579770594369136, "grad_norm": 10.56281566619873, "learning_rate": 9.693777777777779e-06, "loss": 1.0659, "step": 3190 }, { "epoch": 0.4171011470281543, "grad_norm": 10.342962265014648, "learning_rate": 9.689333333333334e-06, "loss": 1.0829, "step": 3200 }, { "epoch": 0.4171011470281543, "eval/acc": 37.20930099487305, "step": 3200 }, { "epoch": 0.4171011470281543, "eval_loss": 2.8262035846710205, "eval_runtime": 0.5536, "eval_samples_per_second": 77.675, "eval_steps_per_second": 1.806, "step": 3200 }, { "epoch": 0.4184045881126173, "grad_norm": 10.684874534606934, "learning_rate": 9.684888888888889e-06, "loss": 1.1349, "step": 3210 }, { "epoch": 0.4197080291970803, "grad_norm": 10.516358375549316, "learning_rate": 9.680444444444445e-06, "loss": 1.1277, "step": 3220 }, { "epoch": 0.4210114702815433, "grad_norm": 10.139744758605957, "learning_rate": 9.676e-06, "loss": 1.0163, "step": 3230 }, { "epoch": 0.42231491136600624, "grad_norm": 8.341273307800293, "learning_rate": 9.671555555555557e-06, "loss": 1.0264, "step": 3240 }, { "epoch": 0.42361835245046925, "grad_norm": 10.223051071166992, "learning_rate": 9.667111111111112e-06, "loss": 1.135, "step": 3250 }, { "epoch": 0.4249217935349322, "grad_norm": 10.643424987792969, "learning_rate": 9.662666666666668e-06, "loss": 1.1079, "step": 3260 }, { "epoch": 0.4262252346193952, "grad_norm": 10.129752159118652, "learning_rate": 9.658222222222223e-06, "loss": 1.1081, "step": 3270 }, { "epoch": 0.42752867570385816, "grad_norm": 9.503373146057129, "learning_rate": 9.653777777777778e-06, "loss": 1.1133, "step": 3280 }, { "epoch": 0.42883211678832117, "grad_norm": 10.085280418395996, "learning_rate": 9.649333333333334e-06, "loss": 1.0587, "step": 3290 }, { "epoch": 0.4301355578727841, "grad_norm": 8.572416305541992, "learning_rate": 9.64488888888889e-06, "loss": 1.0355, "step": 3300 }, { "epoch": 0.4301355578727841, "eval/acc": 39.53488540649414, "step": 3300 }, { "epoch": 0.4301355578727841, "eval_loss": 2.875706911087036, "eval_runtime": 0.8759, "eval_samples_per_second": 49.094, "eval_steps_per_second": 1.142, "step": 3300 }, { "epoch": 0.43143899895724713, "grad_norm": 11.9055814743042, "learning_rate": 9.640444444444446e-06, "loss": 1.1596, "step": 3310 }, { "epoch": 0.43274244004171014, "grad_norm": 7.891529083251953, "learning_rate": 9.636e-06, "loss": 1.0478, "step": 3320 }, { "epoch": 0.4340458811261731, "grad_norm": 10.109358787536621, "learning_rate": 9.631555555555555e-06, "loss": 1.037, "step": 3330 }, { "epoch": 0.4353493222106361, "grad_norm": 9.673956871032715, "learning_rate": 9.627111111111112e-06, "loss": 1.0827, "step": 3340 }, { "epoch": 0.43665276329509906, "grad_norm": 9.339848518371582, "learning_rate": 9.622666666666668e-06, "loss": 1.0662, "step": 3350 }, { "epoch": 0.43795620437956206, "grad_norm": 9.403885841369629, "learning_rate": 9.618222222222223e-06, "loss": 1.051, "step": 3360 }, { "epoch": 0.439259645464025, "grad_norm": 10.163128852844238, "learning_rate": 9.613777777777778e-06, "loss": 1.082, "step": 3370 }, { "epoch": 0.440563086548488, "grad_norm": 8.699789047241211, "learning_rate": 9.609333333333333e-06, "loss": 1.1382, "step": 3380 }, { "epoch": 0.441866527632951, "grad_norm": 10.108110427856445, "learning_rate": 9.60488888888889e-06, "loss": 1.1227, "step": 3390 }, { "epoch": 0.443169968717414, "grad_norm": 11.483874320983887, "learning_rate": 9.600444444444446e-06, "loss": 1.0851, "step": 3400 }, { "epoch": 0.443169968717414, "eval/acc": 37.20930099487305, "step": 3400 }, { "epoch": 0.443169968717414, "eval_loss": 2.804619312286377, "eval_runtime": 0.5503, "eval_samples_per_second": 78.14, "eval_steps_per_second": 1.817, "step": 3400 }, { "epoch": 0.44447340980187694, "grad_norm": 9.97952938079834, "learning_rate": 9.596000000000001e-06, "loss": 1.02, "step": 3410 }, { "epoch": 0.44577685088633995, "grad_norm": 9.445475578308105, "learning_rate": 9.591555555555556e-06, "loss": 1.1166, "step": 3420 }, { "epoch": 0.4470802919708029, "grad_norm": 10.408344268798828, "learning_rate": 9.58711111111111e-06, "loss": 1.1305, "step": 3430 }, { "epoch": 0.4483837330552659, "grad_norm": 10.129816055297852, "learning_rate": 9.582666666666667e-06, "loss": 1.0882, "step": 3440 }, { "epoch": 0.44968717413972886, "grad_norm": 11.24634838104248, "learning_rate": 9.578222222222224e-06, "loss": 1.1458, "step": 3450 }, { "epoch": 0.45099061522419187, "grad_norm": 9.015290260314941, "learning_rate": 9.573777777777779e-06, "loss": 1.1692, "step": 3460 }, { "epoch": 0.4522940563086548, "grad_norm": 11.587824821472168, "learning_rate": 9.569333333333333e-06, "loss": 1.1005, "step": 3470 }, { "epoch": 0.45359749739311783, "grad_norm": 11.352563858032227, "learning_rate": 9.56488888888889e-06, "loss": 1.0646, "step": 3480 }, { "epoch": 0.45490093847758084, "grad_norm": 9.49247932434082, "learning_rate": 9.560444444444445e-06, "loss": 1.0682, "step": 3490 }, { "epoch": 0.4562043795620438, "grad_norm": 11.200020790100098, "learning_rate": 9.556000000000001e-06, "loss": 1.0354, "step": 3500 }, { "epoch": 0.4562043795620438, "eval/acc": 32.55813980102539, "step": 3500 }, { "epoch": 0.4562043795620438, "eval_loss": 2.850745916366577, "eval_runtime": 0.5511, "eval_samples_per_second": 78.029, "eval_steps_per_second": 1.815, "step": 3500 }, { "epoch": 0.4575078206465068, "grad_norm": 9.414541244506836, "learning_rate": 9.551555555555556e-06, "loss": 1.0458, "step": 3510 }, { "epoch": 0.45881126173096975, "grad_norm": 11.558536529541016, "learning_rate": 9.547111111111111e-06, "loss": 1.1236, "step": 3520 }, { "epoch": 0.46011470281543276, "grad_norm": 9.692702293395996, "learning_rate": 9.542666666666668e-06, "loss": 1.1055, "step": 3530 }, { "epoch": 0.4614181438998957, "grad_norm": 11.946589469909668, "learning_rate": 9.538222222222222e-06, "loss": 1.0746, "step": 3540 }, { "epoch": 0.4627215849843587, "grad_norm": 9.754605293273926, "learning_rate": 9.533777777777779e-06, "loss": 1.1951, "step": 3550 }, { "epoch": 0.4640250260688217, "grad_norm": 9.67271614074707, "learning_rate": 9.529333333333334e-06, "loss": 1.0727, "step": 3560 }, { "epoch": 0.4653284671532847, "grad_norm": 9.403274536132812, "learning_rate": 9.52488888888889e-06, "loss": 1.0781, "step": 3570 }, { "epoch": 0.46663190823774764, "grad_norm": 9.350010871887207, "learning_rate": 9.520444444444445e-06, "loss": 1.141, "step": 3580 }, { "epoch": 0.46793534932221065, "grad_norm": 11.810049057006836, "learning_rate": 9.516e-06, "loss": 1.0642, "step": 3590 }, { "epoch": 0.4692387904066736, "grad_norm": 9.524765014648438, "learning_rate": 9.511555555555557e-06, "loss": 1.0196, "step": 3600 }, { "epoch": 0.4692387904066736, "eval/acc": 34.88372039794922, "step": 3600 }, { "epoch": 0.4692387904066736, "eval_loss": 2.8087544441223145, "eval_runtime": 0.5519, "eval_samples_per_second": 77.911, "eval_steps_per_second": 1.812, "step": 3600 }, { "epoch": 0.4705422314911366, "grad_norm": 11.370705604553223, "learning_rate": 9.507111111111111e-06, "loss": 1.0253, "step": 3610 }, { "epoch": 0.47184567257559956, "grad_norm": 8.690403938293457, "learning_rate": 9.502666666666668e-06, "loss": 1.071, "step": 3620 }, { "epoch": 0.47314911366006257, "grad_norm": 9.085663795471191, "learning_rate": 9.498222222222223e-06, "loss": 1.1507, "step": 3630 }, { "epoch": 0.4744525547445255, "grad_norm": 9.252151489257812, "learning_rate": 9.493777777777778e-06, "loss": 1.1038, "step": 3640 }, { "epoch": 0.47575599582898853, "grad_norm": 11.12983226776123, "learning_rate": 9.489333333333334e-06, "loss": 1.0457, "step": 3650 }, { "epoch": 0.4770594369134515, "grad_norm": 9.117828369140625, "learning_rate": 9.48488888888889e-06, "loss": 1.0704, "step": 3660 }, { "epoch": 0.4783628779979145, "grad_norm": 10.71731185913086, "learning_rate": 9.480444444444446e-06, "loss": 1.0461, "step": 3670 }, { "epoch": 0.4796663190823775, "grad_norm": 8.495375633239746, "learning_rate": 9.476e-06, "loss": 1.1173, "step": 3680 }, { "epoch": 0.48096976016684045, "grad_norm": 10.223701477050781, "learning_rate": 9.471555555555555e-06, "loss": 1.1301, "step": 3690 }, { "epoch": 0.48227320125130346, "grad_norm": 10.180765151977539, "learning_rate": 9.467111111111112e-06, "loss": 1.1262, "step": 3700 }, { "epoch": 0.48227320125130346, "eval/acc": 37.20930099487305, "step": 3700 }, { "epoch": 0.48227320125130346, "eval_loss": 2.7609646320343018, "eval_runtime": 0.5504, "eval_samples_per_second": 78.122, "eval_steps_per_second": 1.817, "step": 3700 }, { "epoch": 0.4835766423357664, "grad_norm": 10.506937026977539, "learning_rate": 9.462666666666668e-06, "loss": 1.0366, "step": 3710 }, { "epoch": 0.4848800834202294, "grad_norm": 11.131478309631348, "learning_rate": 9.458222222222223e-06, "loss": 0.9718, "step": 3720 }, { "epoch": 0.4861835245046924, "grad_norm": 11.127948760986328, "learning_rate": 9.453777777777778e-06, "loss": 1.1568, "step": 3730 }, { "epoch": 0.4874869655891554, "grad_norm": 10.03661060333252, "learning_rate": 9.449333333333333e-06, "loss": 1.0066, "step": 3740 }, { "epoch": 0.48879040667361834, "grad_norm": 11.38666820526123, "learning_rate": 9.44488888888889e-06, "loss": 1.0457, "step": 3750 }, { "epoch": 0.49009384775808135, "grad_norm": 9.510127067565918, "learning_rate": 9.440444444444446e-06, "loss": 1.1167, "step": 3760 }, { "epoch": 0.4913972888425443, "grad_norm": 10.810651779174805, "learning_rate": 9.436e-06, "loss": 1.1126, "step": 3770 }, { "epoch": 0.4927007299270073, "grad_norm": 9.202433586120605, "learning_rate": 9.431555555555556e-06, "loss": 1.0681, "step": 3780 }, { "epoch": 0.49400417101147026, "grad_norm": 8.647710800170898, "learning_rate": 9.427111111111112e-06, "loss": 1.0295, "step": 3790 }, { "epoch": 0.49530761209593327, "grad_norm": 11.453765869140625, "learning_rate": 9.422666666666667e-06, "loss": 1.015, "step": 3800 }, { "epoch": 0.49530761209593327, "eval/acc": 37.20930099487305, "step": 3800 }, { "epoch": 0.49530761209593327, "eval_loss": 2.7812387943267822, "eval_runtime": 0.5494, "eval_samples_per_second": 78.268, "eval_steps_per_second": 1.82, "step": 3800 }, { "epoch": 0.4966110531803962, "grad_norm": 10.551323890686035, "learning_rate": 9.418222222222224e-06, "loss": 1.1188, "step": 3810 }, { "epoch": 0.49791449426485923, "grad_norm": 10.307533264160156, "learning_rate": 9.413777777777778e-06, "loss": 1.0767, "step": 3820 }, { "epoch": 0.4992179353493222, "grad_norm": 12.098529815673828, "learning_rate": 9.409333333333333e-06, "loss": 1.0597, "step": 3830 }, { "epoch": 0.5005213764337852, "grad_norm": 10.920623779296875, "learning_rate": 9.40488888888889e-06, "loss": 0.9847, "step": 3840 }, { "epoch": 0.5018248175182481, "grad_norm": 10.035759925842285, "learning_rate": 9.400444444444445e-06, "loss": 1.07, "step": 3850 }, { "epoch": 0.5031282586027112, "grad_norm": 10.293031692504883, "learning_rate": 9.396000000000001e-06, "loss": 1.0453, "step": 3860 }, { "epoch": 0.5044316996871742, "grad_norm": 9.7219877243042, "learning_rate": 9.391555555555556e-06, "loss": 1.0783, "step": 3870 }, { "epoch": 0.5057351407716372, "grad_norm": 9.780116081237793, "learning_rate": 9.387111111111113e-06, "loss": 1.021, "step": 3880 }, { "epoch": 0.5070385818561001, "grad_norm": 10.145584106445312, "learning_rate": 9.382666666666667e-06, "loss": 1.0744, "step": 3890 }, { "epoch": 0.5083420229405631, "grad_norm": 9.737056732177734, "learning_rate": 9.378222222222222e-06, "loss": 1.1837, "step": 3900 }, { "epoch": 0.5083420229405631, "eval/acc": 34.88372039794922, "step": 3900 }, { "epoch": 0.5083420229405631, "eval_loss": 2.6774258613586426, "eval_runtime": 0.5548, "eval_samples_per_second": 77.509, "eval_steps_per_second": 1.803, "step": 3900 }, { "epoch": 0.5096454640250261, "grad_norm": 9.52910041809082, "learning_rate": 9.373777777777779e-06, "loss": 1.0, "step": 3910 }, { "epoch": 0.5109489051094891, "grad_norm": 11.480224609375, "learning_rate": 9.369333333333334e-06, "loss": 1.029, "step": 3920 }, { "epoch": 0.512252346193952, "grad_norm": 8.294060707092285, "learning_rate": 9.36488888888889e-06, "loss": 1.0584, "step": 3930 }, { "epoch": 0.513555787278415, "grad_norm": 8.96554946899414, "learning_rate": 9.360444444444445e-06, "loss": 1.0415, "step": 3940 }, { "epoch": 0.514859228362878, "grad_norm": 10.146249771118164, "learning_rate": 9.356e-06, "loss": 0.9763, "step": 3950 }, { "epoch": 0.516162669447341, "grad_norm": 9.620243072509766, "learning_rate": 9.351555555555556e-06, "loss": 1.0677, "step": 3960 }, { "epoch": 0.5174661105318039, "grad_norm": 8.995674133300781, "learning_rate": 9.347111111111113e-06, "loss": 1.0893, "step": 3970 }, { "epoch": 0.5187695516162669, "grad_norm": 10.30301284790039, "learning_rate": 9.342666666666668e-06, "loss": 1.0958, "step": 3980 }, { "epoch": 0.5200729927007299, "grad_norm": 9.020184516906738, "learning_rate": 9.338222222222223e-06, "loss": 1.0115, "step": 3990 }, { "epoch": 0.5213764337851929, "grad_norm": 11.706809997558594, "learning_rate": 9.333777777777777e-06, "loss": 1.1306, "step": 4000 }, { "epoch": 0.5213764337851929, "eval/acc": 34.88372039794922, "step": 4000 }, { "epoch": 0.5213764337851929, "eval_loss": 2.719060182571411, "eval_runtime": 0.5502, "eval_samples_per_second": 78.155, "eval_steps_per_second": 1.818, "step": 4000 }, { "epoch": 0.5226798748696558, "grad_norm": 10.49409294128418, "learning_rate": 9.329333333333334e-06, "loss": 1.0554, "step": 4010 }, { "epoch": 0.5239833159541188, "grad_norm": 7.883603572845459, "learning_rate": 9.32488888888889e-06, "loss": 0.9968, "step": 4020 }, { "epoch": 0.5252867570385819, "grad_norm": 11.045550346374512, "learning_rate": 9.320444444444445e-06, "loss": 1.1689, "step": 4030 }, { "epoch": 0.5265901981230449, "grad_norm": 9.245767593383789, "learning_rate": 9.316e-06, "loss": 1.0647, "step": 4040 }, { "epoch": 0.5278936392075079, "grad_norm": 8.662199974060059, "learning_rate": 9.311555555555555e-06, "loss": 0.9952, "step": 4050 }, { "epoch": 0.5291970802919708, "grad_norm": 8.584678649902344, "learning_rate": 9.307111111111112e-06, "loss": 1.0025, "step": 4060 }, { "epoch": 0.5305005213764338, "grad_norm": 8.951703071594238, "learning_rate": 9.302666666666668e-06, "loss": 1.0182, "step": 4070 }, { "epoch": 0.5318039624608968, "grad_norm": 11.469212532043457, "learning_rate": 9.298222222222223e-06, "loss": 1.086, "step": 4080 }, { "epoch": 0.5331074035453598, "grad_norm": 10.124979972839355, "learning_rate": 9.293777777777778e-06, "loss": 1.0614, "step": 4090 }, { "epoch": 0.5344108446298227, "grad_norm": 9.715713500976562, "learning_rate": 9.289333333333334e-06, "loss": 1.029, "step": 4100 }, { "epoch": 0.5344108446298227, "eval/acc": 39.53488540649414, "step": 4100 }, { "epoch": 0.5344108446298227, "eval_loss": 2.6891214847564697, "eval_runtime": 0.5531, "eval_samples_per_second": 77.75, "eval_steps_per_second": 1.808, "step": 4100 }, { "epoch": 0.5357142857142857, "grad_norm": 10.887805938720703, "learning_rate": 9.28488888888889e-06, "loss": 1.1104, "step": 4110 }, { "epoch": 0.5370177267987487, "grad_norm": 8.276104927062988, "learning_rate": 9.280444444444446e-06, "loss": 1.0563, "step": 4120 }, { "epoch": 0.5383211678832117, "grad_norm": 9.104747772216797, "learning_rate": 9.276e-06, "loss": 1.0732, "step": 4130 }, { "epoch": 0.5396246089676746, "grad_norm": 10.727592468261719, "learning_rate": 9.271555555555555e-06, "loss": 1.0253, "step": 4140 }, { "epoch": 0.5409280500521376, "grad_norm": 10.487238883972168, "learning_rate": 9.267111111111112e-06, "loss": 1.0849, "step": 4150 }, { "epoch": 0.5422314911366006, "grad_norm": 9.830368995666504, "learning_rate": 9.262666666666667e-06, "loss": 1.0699, "step": 4160 }, { "epoch": 0.5435349322210636, "grad_norm": 9.725363731384277, "learning_rate": 9.258222222222223e-06, "loss": 1.0149, "step": 4170 }, { "epoch": 0.5448383733055265, "grad_norm": 10.23435115814209, "learning_rate": 9.253777777777778e-06, "loss": 0.9648, "step": 4180 }, { "epoch": 0.5461418143899895, "grad_norm": 8.573326110839844, "learning_rate": 9.249333333333335e-06, "loss": 1.0607, "step": 4190 }, { "epoch": 0.5474452554744526, "grad_norm": 9.514001846313477, "learning_rate": 9.24488888888889e-06, "loss": 1.0196, "step": 4200 }, { "epoch": 0.5474452554744526, "eval/acc": 32.55813980102539, "step": 4200 }, { "epoch": 0.5474452554744526, "eval_loss": 2.5407004356384277, "eval_runtime": 0.5515, "eval_samples_per_second": 77.969, "eval_steps_per_second": 1.813, "step": 4200 }, { "epoch": 0.5487486965589156, "grad_norm": 9.46273136138916, "learning_rate": 9.240444444444444e-06, "loss": 1.0557, "step": 4210 }, { "epoch": 0.5500521376433786, "grad_norm": 12.82573127746582, "learning_rate": 9.236000000000001e-06, "loss": 1.051, "step": 4220 }, { "epoch": 0.5513555787278415, "grad_norm": 10.965460777282715, "learning_rate": 9.231555555555556e-06, "loss": 0.9239, "step": 4230 }, { "epoch": 0.5526590198123045, "grad_norm": 9.015987396240234, "learning_rate": 9.227111111111112e-06, "loss": 1.0477, "step": 4240 }, { "epoch": 0.5539624608967675, "grad_norm": 8.61673355102539, "learning_rate": 9.222666666666667e-06, "loss": 1.0693, "step": 4250 }, { "epoch": 0.5552659019812305, "grad_norm": 9.152997016906738, "learning_rate": 9.218222222222222e-06, "loss": 1.0104, "step": 4260 }, { "epoch": 0.5565693430656934, "grad_norm": 8.82421588897705, "learning_rate": 9.213777777777779e-06, "loss": 1.0304, "step": 4270 }, { "epoch": 0.5578727841501564, "grad_norm": 9.665721893310547, "learning_rate": 9.209333333333335e-06, "loss": 1.0608, "step": 4280 }, { "epoch": 0.5591762252346194, "grad_norm": 10.174515724182129, "learning_rate": 9.20488888888889e-06, "loss": 1.099, "step": 4290 }, { "epoch": 0.5604796663190824, "grad_norm": 9.723739624023438, "learning_rate": 9.200444444444445e-06, "loss": 1.0327, "step": 4300 }, { "epoch": 0.5604796663190824, "eval/acc": 34.88372039794922, "step": 4300 }, { "epoch": 0.5604796663190824, "eval_loss": 2.560245990753174, "eval_runtime": 0.5506, "eval_samples_per_second": 78.095, "eval_steps_per_second": 1.816, "step": 4300 }, { "epoch": 0.5617831074035453, "grad_norm": 9.028182983398438, "learning_rate": 9.196e-06, "loss": 1.0447, "step": 4310 }, { "epoch": 0.5630865484880083, "grad_norm": 9.231035232543945, "learning_rate": 9.191555555555556e-06, "loss": 1.0014, "step": 4320 }, { "epoch": 0.5643899895724713, "grad_norm": 9.409144401550293, "learning_rate": 9.187111111111113e-06, "loss": 1.0805, "step": 4330 }, { "epoch": 0.5656934306569343, "grad_norm": 9.330337524414062, "learning_rate": 9.182666666666668e-06, "loss": 0.9831, "step": 4340 }, { "epoch": 0.5669968717413972, "grad_norm": 9.44364070892334, "learning_rate": 9.178222222222222e-06, "loss": 1.1507, "step": 4350 }, { "epoch": 0.5683003128258602, "grad_norm": 8.195267677307129, "learning_rate": 9.173777777777777e-06, "loss": 1.0538, "step": 4360 }, { "epoch": 0.5696037539103233, "grad_norm": 10.082292556762695, "learning_rate": 9.169333333333334e-06, "loss": 1.1772, "step": 4370 }, { "epoch": 0.5709071949947863, "grad_norm": 7.957224369049072, "learning_rate": 9.16488888888889e-06, "loss": 0.9671, "step": 4380 }, { "epoch": 0.5722106360792493, "grad_norm": 9.066376686096191, "learning_rate": 9.160444444444445e-06, "loss": 0.9564, "step": 4390 }, { "epoch": 0.5735140771637122, "grad_norm": 9.167228698730469, "learning_rate": 9.156e-06, "loss": 1.0625, "step": 4400 }, { "epoch": 0.5735140771637122, "eval/acc": 37.20930099487305, "step": 4400 }, { "epoch": 0.5735140771637122, "eval_loss": 2.62846040725708, "eval_runtime": 0.5519, "eval_samples_per_second": 77.91, "eval_steps_per_second": 1.812, "step": 4400 }, { "epoch": 0.5748175182481752, "grad_norm": 11.493626594543457, "learning_rate": 9.151555555555557e-06, "loss": 1.0258, "step": 4410 }, { "epoch": 0.5761209593326382, "grad_norm": 11.340927124023438, "learning_rate": 9.147111111111111e-06, "loss": 1.0167, "step": 4420 }, { "epoch": 0.5774244004171012, "grad_norm": 9.083796501159668, "learning_rate": 9.142666666666668e-06, "loss": 1.0704, "step": 4430 }, { "epoch": 0.5787278415015641, "grad_norm": 10.585103988647461, "learning_rate": 9.138222222222223e-06, "loss": 1.1001, "step": 4440 }, { "epoch": 0.5800312825860271, "grad_norm": 10.192399024963379, "learning_rate": 9.133777777777778e-06, "loss": 1.0589, "step": 4450 }, { "epoch": 0.5813347236704901, "grad_norm": 9.637321472167969, "learning_rate": 9.129333333333334e-06, "loss": 1.0217, "step": 4460 }, { "epoch": 0.5826381647549531, "grad_norm": 11.652050018310547, "learning_rate": 9.124888888888889e-06, "loss": 1.1136, "step": 4470 }, { "epoch": 0.583941605839416, "grad_norm": 9.2413969039917, "learning_rate": 9.120444444444446e-06, "loss": 1.0009, "step": 4480 }, { "epoch": 0.585245046923879, "grad_norm": 9.579240798950195, "learning_rate": 9.116e-06, "loss": 1.0948, "step": 4490 }, { "epoch": 0.586548488008342, "grad_norm": 10.748444557189941, "learning_rate": 9.111555555555557e-06, "loss": 1.0969, "step": 4500 }, { "epoch": 0.586548488008342, "eval/acc": 34.88372039794922, "step": 4500 }, { "epoch": 0.586548488008342, "eval_loss": 2.528625726699829, "eval_runtime": 0.5514, "eval_samples_per_second": 77.977, "eval_steps_per_second": 1.813, "step": 4500 }, { "epoch": 0.587851929092805, "grad_norm": 9.715644836425781, "learning_rate": 9.107111111111112e-06, "loss": 1.1046, "step": 4510 }, { "epoch": 0.5891553701772679, "grad_norm": 9.33938980102539, "learning_rate": 9.102666666666667e-06, "loss": 0.9897, "step": 4520 }, { "epoch": 0.5904588112617309, "grad_norm": 8.88958740234375, "learning_rate": 9.098222222222223e-06, "loss": 1.1344, "step": 4530 }, { "epoch": 0.591762252346194, "grad_norm": 10.64390754699707, "learning_rate": 9.093777777777778e-06, "loss": 1.0106, "step": 4540 }, { "epoch": 0.593065693430657, "grad_norm": 9.564251899719238, "learning_rate": 9.089333333333335e-06, "loss": 1.0985, "step": 4550 }, { "epoch": 0.59436913451512, "grad_norm": 9.475229263305664, "learning_rate": 9.08488888888889e-06, "loss": 1.0643, "step": 4560 }, { "epoch": 0.5956725755995829, "grad_norm": 8.694733619689941, "learning_rate": 9.080444444444444e-06, "loss": 1.0899, "step": 4570 }, { "epoch": 0.5969760166840459, "grad_norm": 9.67250919342041, "learning_rate": 9.076000000000001e-06, "loss": 1.0518, "step": 4580 }, { "epoch": 0.5982794577685089, "grad_norm": 9.918119430541992, "learning_rate": 9.071555555555557e-06, "loss": 1.139, "step": 4590 }, { "epoch": 0.5995828988529719, "grad_norm": 11.0655517578125, "learning_rate": 9.067111111111112e-06, "loss": 1.0195, "step": 4600 }, { "epoch": 0.5995828988529719, "eval/acc": 37.20930099487305, "step": 4600 }, { "epoch": 0.5995828988529719, "eval_loss": 2.581063747406006, "eval_runtime": 0.5974, "eval_samples_per_second": 71.985, "eval_steps_per_second": 1.674, "step": 4600 }, { "epoch": 0.6008863399374348, "grad_norm": 9.673873901367188, "learning_rate": 9.062666666666667e-06, "loss": 1.0139, "step": 4610 }, { "epoch": 0.6021897810218978, "grad_norm": 9.95392894744873, "learning_rate": 9.058222222222222e-06, "loss": 1.0544, "step": 4620 }, { "epoch": 0.6034932221063608, "grad_norm": 11.47777271270752, "learning_rate": 9.053777777777778e-06, "loss": 1.0851, "step": 4630 }, { "epoch": 0.6047966631908238, "grad_norm": 8.379030227661133, "learning_rate": 9.049333333333335e-06, "loss": 1.0244, "step": 4640 }, { "epoch": 0.6061001042752867, "grad_norm": 8.413164138793945, "learning_rate": 9.04488888888889e-06, "loss": 1.0619, "step": 4650 }, { "epoch": 0.6074035453597497, "grad_norm": 10.171146392822266, "learning_rate": 9.040444444444445e-06, "loss": 1.0794, "step": 4660 }, { "epoch": 0.6087069864442127, "grad_norm": 10.772948265075684, "learning_rate": 9.036e-06, "loss": 1.0795, "step": 4670 }, { "epoch": 0.6100104275286757, "grad_norm": 9.68770980834961, "learning_rate": 9.031555555555556e-06, "loss": 1.0261, "step": 4680 }, { "epoch": 0.6113138686131386, "grad_norm": 9.47791862487793, "learning_rate": 9.027111111111113e-06, "loss": 1.1494, "step": 4690 }, { "epoch": 0.6126173096976016, "grad_norm": 9.655404090881348, "learning_rate": 9.022666666666667e-06, "loss": 1.1113, "step": 4700 }, { "epoch": 0.6126173096976016, "eval/acc": 41.86046600341797, "step": 4700 }, { "epoch": 0.6126173096976016, "eval_loss": 2.4572794437408447, "eval_runtime": 0.551, "eval_samples_per_second": 78.043, "eval_steps_per_second": 1.815, "step": 4700 }, { "epoch": 0.6139207507820647, "grad_norm": 11.384035110473633, "learning_rate": 9.018222222222222e-06, "loss": 0.9002, "step": 4710 }, { "epoch": 0.6152241918665277, "grad_norm": 9.862360000610352, "learning_rate": 9.013777777777779e-06, "loss": 0.9838, "step": 4720 }, { "epoch": 0.6165276329509907, "grad_norm": 8.860601425170898, "learning_rate": 9.009333333333334e-06, "loss": 1.0471, "step": 4730 }, { "epoch": 0.6178310740354536, "grad_norm": 9.085923194885254, "learning_rate": 9.00488888888889e-06, "loss": 1.1413, "step": 4740 }, { "epoch": 0.6191345151199166, "grad_norm": 7.881019115447998, "learning_rate": 9.000444444444445e-06, "loss": 1.0241, "step": 4750 }, { "epoch": 0.6204379562043796, "grad_norm": 9.55480670928955, "learning_rate": 8.996e-06, "loss": 1.0634, "step": 4760 }, { "epoch": 0.6217413972888426, "grad_norm": 8.191434860229492, "learning_rate": 8.991555555555556e-06, "loss": 1.0264, "step": 4770 }, { "epoch": 0.6230448383733055, "grad_norm": 11.498793601989746, "learning_rate": 8.987111111111111e-06, "loss": 1.0466, "step": 4780 }, { "epoch": 0.6243482794577685, "grad_norm": 8.848291397094727, "learning_rate": 8.982666666666668e-06, "loss": 0.9813, "step": 4790 }, { "epoch": 0.6256517205422315, "grad_norm": 8.858402252197266, "learning_rate": 8.978222222222223e-06, "loss": 1.0143, "step": 4800 }, { "epoch": 0.6256517205422315, "eval/acc": 34.88372039794922, "step": 4800 }, { "epoch": 0.6256517205422315, "eval_loss": 2.6291277408599854, "eval_runtime": 0.557, "eval_samples_per_second": 77.206, "eval_steps_per_second": 1.795, "step": 4800 }, { "epoch": 0.6269551616266945, "grad_norm": 9.703082084655762, "learning_rate": 8.97377777777778e-06, "loss": 1.0354, "step": 4810 }, { "epoch": 0.6282586027111574, "grad_norm": 8.061450004577637, "learning_rate": 8.969333333333334e-06, "loss": 1.0989, "step": 4820 }, { "epoch": 0.6295620437956204, "grad_norm": 8.38237476348877, "learning_rate": 8.964888888888889e-06, "loss": 1.0303, "step": 4830 }, { "epoch": 0.6308654848800834, "grad_norm": 9.098999977111816, "learning_rate": 8.960444444444445e-06, "loss": 1.1374, "step": 4840 }, { "epoch": 0.6321689259645464, "grad_norm": 8.959243774414062, "learning_rate": 8.956e-06, "loss": 1.0242, "step": 4850 }, { "epoch": 0.6334723670490093, "grad_norm": 10.157614707946777, "learning_rate": 8.951555555555557e-06, "loss": 1.134, "step": 4860 }, { "epoch": 0.6347758081334723, "grad_norm": 10.983575820922852, "learning_rate": 8.947111111111112e-06, "loss": 0.9518, "step": 4870 }, { "epoch": 0.6360792492179353, "grad_norm": 11.162731170654297, "learning_rate": 8.942666666666667e-06, "loss": 1.0702, "step": 4880 }, { "epoch": 0.6373826903023984, "grad_norm": 9.005561828613281, "learning_rate": 8.938222222222223e-06, "loss": 1.0228, "step": 4890 }, { "epoch": 0.6386861313868614, "grad_norm": 9.825065612792969, "learning_rate": 8.93377777777778e-06, "loss": 1.0373, "step": 4900 }, { "epoch": 0.6386861313868614, "eval/acc": 32.55813980102539, "step": 4900 }, { "epoch": 0.6386861313868614, "eval_loss": 2.755546808242798, "eval_runtime": 0.5529, "eval_samples_per_second": 77.765, "eval_steps_per_second": 1.808, "step": 4900 }, { "epoch": 0.6399895724713243, "grad_norm": 8.52741813659668, "learning_rate": 8.929333333333334e-06, "loss": 1.066, "step": 4910 }, { "epoch": 0.6412930135557873, "grad_norm": 9.974360466003418, "learning_rate": 8.92488888888889e-06, "loss": 1.0397, "step": 4920 }, { "epoch": 0.6425964546402503, "grad_norm": 8.10251235961914, "learning_rate": 8.920444444444444e-06, "loss": 1.0832, "step": 4930 }, { "epoch": 0.6438998957247133, "grad_norm": 10.143448829650879, "learning_rate": 8.916e-06, "loss": 1.0112, "step": 4940 }, { "epoch": 0.6452033368091762, "grad_norm": 10.25130844116211, "learning_rate": 8.911555555555557e-06, "loss": 1.0808, "step": 4950 }, { "epoch": 0.6465067778936392, "grad_norm": 11.107799530029297, "learning_rate": 8.907111111111112e-06, "loss": 1.0547, "step": 4960 }, { "epoch": 0.6478102189781022, "grad_norm": 10.128641128540039, "learning_rate": 8.902666666666667e-06, "loss": 1.0721, "step": 4970 }, { "epoch": 0.6491136600625652, "grad_norm": 10.3110933303833, "learning_rate": 8.898222222222222e-06, "loss": 0.9976, "step": 4980 }, { "epoch": 0.6504171011470281, "grad_norm": 8.941389083862305, "learning_rate": 8.893777777777778e-06, "loss": 1.0196, "step": 4990 }, { "epoch": 0.6517205422314911, "grad_norm": 10.89724063873291, "learning_rate": 8.889333333333335e-06, "loss": 1.0551, "step": 5000 }, { "epoch": 0.6517205422314911, "eval/acc": 34.88372039794922, "step": 5000 }, { "epoch": 0.6517205422314911, "eval_loss": 2.655290126800537, "eval_runtime": 0.5523, "eval_samples_per_second": 77.86, "eval_steps_per_second": 1.811, "step": 5000 }, { "epoch": 0.6530239833159541, "grad_norm": 8.989163398742676, "learning_rate": 8.88488888888889e-06, "loss": 0.9939, "step": 5010 }, { "epoch": 0.6543274244004171, "grad_norm": 10.607318878173828, "learning_rate": 8.880444444444445e-06, "loss": 0.9934, "step": 5020 }, { "epoch": 0.65563086548488, "grad_norm": 9.469202995300293, "learning_rate": 8.876e-06, "loss": 0.9683, "step": 5030 }, { "epoch": 0.656934306569343, "grad_norm": 8.721243858337402, "learning_rate": 8.871555555555556e-06, "loss": 1.0208, "step": 5040 }, { "epoch": 0.658237747653806, "grad_norm": 10.423583984375, "learning_rate": 8.867111111111112e-06, "loss": 1.0284, "step": 5050 }, { "epoch": 0.6595411887382691, "grad_norm": 9.084837913513184, "learning_rate": 8.862666666666667e-06, "loss": 1.0282, "step": 5060 }, { "epoch": 0.6608446298227321, "grad_norm": 9.282791137695312, "learning_rate": 8.858222222222222e-06, "loss": 1.0565, "step": 5070 }, { "epoch": 0.662148070907195, "grad_norm": 9.868699073791504, "learning_rate": 8.853777777777779e-06, "loss": 1.0749, "step": 5080 }, { "epoch": 0.663451511991658, "grad_norm": 9.887734413146973, "learning_rate": 8.849333333333334e-06, "loss": 1.1138, "step": 5090 }, { "epoch": 0.664754953076121, "grad_norm": 9.476777076721191, "learning_rate": 8.84488888888889e-06, "loss": 0.987, "step": 5100 }, { "epoch": 0.664754953076121, "eval/acc": 34.88372039794922, "step": 5100 }, { "epoch": 0.664754953076121, "eval_loss": 2.7898762226104736, "eval_runtime": 0.555, "eval_samples_per_second": 77.483, "eval_steps_per_second": 1.802, "step": 5100 }, { "epoch": 0.666058394160584, "grad_norm": 8.887063026428223, "learning_rate": 8.840444444444445e-06, "loss": 1.0133, "step": 5110 }, { "epoch": 0.6673618352450469, "grad_norm": 8.670061111450195, "learning_rate": 8.836000000000001e-06, "loss": 1.0152, "step": 5120 }, { "epoch": 0.6686652763295099, "grad_norm": 10.9150390625, "learning_rate": 8.831555555555556e-06, "loss": 1.0866, "step": 5130 }, { "epoch": 0.6699687174139729, "grad_norm": 9.245491027832031, "learning_rate": 8.827111111111111e-06, "loss": 1.0282, "step": 5140 }, { "epoch": 0.6712721584984359, "grad_norm": 8.521747589111328, "learning_rate": 8.822666666666668e-06, "loss": 0.9966, "step": 5150 }, { "epoch": 0.6725755995828988, "grad_norm": 10.43109130859375, "learning_rate": 8.818222222222223e-06, "loss": 1.0034, "step": 5160 }, { "epoch": 0.6738790406673618, "grad_norm": 8.546734809875488, "learning_rate": 8.813777777777779e-06, "loss": 1.034, "step": 5170 }, { "epoch": 0.6751824817518248, "grad_norm": 9.694477081298828, "learning_rate": 8.809333333333334e-06, "loss": 1.0633, "step": 5180 }, { "epoch": 0.6764859228362878, "grad_norm": 9.622178077697754, "learning_rate": 8.804888888888889e-06, "loss": 1.0111, "step": 5190 }, { "epoch": 0.6777893639207507, "grad_norm": 8.308663368225098, "learning_rate": 8.800444444444445e-06, "loss": 1.0342, "step": 5200 }, { "epoch": 0.6777893639207507, "eval/acc": 39.53488540649414, "step": 5200 }, { "epoch": 0.6777893639207507, "eval_loss": 2.570695161819458, "eval_runtime": 0.9101, "eval_samples_per_second": 47.246, "eval_steps_per_second": 1.099, "step": 5200 }, { "epoch": 0.6790928050052137, "grad_norm": 8.973245620727539, "learning_rate": 8.796000000000002e-06, "loss": 1.0253, "step": 5210 }, { "epoch": 0.6803962460896767, "grad_norm": 10.03006649017334, "learning_rate": 8.791555555555557e-06, "loss": 1.0575, "step": 5220 }, { "epoch": 0.6816996871741398, "grad_norm": 9.414715766906738, "learning_rate": 8.787111111111112e-06, "loss": 1.0044, "step": 5230 }, { "epoch": 0.6830031282586028, "grad_norm": 10.497982025146484, "learning_rate": 8.782666666666666e-06, "loss": 1.0556, "step": 5240 }, { "epoch": 0.6843065693430657, "grad_norm": 9.626456260681152, "learning_rate": 8.778222222222223e-06, "loss": 1.0294, "step": 5250 }, { "epoch": 0.6856100104275287, "grad_norm": 8.467864036560059, "learning_rate": 8.77377777777778e-06, "loss": 1.0754, "step": 5260 }, { "epoch": 0.6869134515119917, "grad_norm": 10.255539894104004, "learning_rate": 8.769333333333334e-06, "loss": 1.013, "step": 5270 }, { "epoch": 0.6882168925964547, "grad_norm": 9.694276809692383, "learning_rate": 8.764888888888889e-06, "loss": 1.0573, "step": 5280 }, { "epoch": 0.6895203336809176, "grad_norm": 9.310009002685547, "learning_rate": 8.760444444444444e-06, "loss": 0.9638, "step": 5290 }, { "epoch": 0.6908237747653806, "grad_norm": 8.105212211608887, "learning_rate": 8.756e-06, "loss": 0.9974, "step": 5300 }, { "epoch": 0.6908237747653806, "eval/acc": 34.88372039794922, "step": 5300 }, { "epoch": 0.6908237747653806, "eval_loss": 2.696380376815796, "eval_runtime": 0.5519, "eval_samples_per_second": 77.91, "eval_steps_per_second": 1.812, "step": 5300 }, { "epoch": 0.6921272158498436, "grad_norm": 9.850837707519531, "learning_rate": 8.751555555555557e-06, "loss": 1.0513, "step": 5310 }, { "epoch": 0.6934306569343066, "grad_norm": 9.398372650146484, "learning_rate": 8.747111111111112e-06, "loss": 0.9589, "step": 5320 }, { "epoch": 0.6947340980187695, "grad_norm": 9.533944129943848, "learning_rate": 8.742666666666667e-06, "loss": 1.0183, "step": 5330 }, { "epoch": 0.6960375391032325, "grad_norm": 9.51920223236084, "learning_rate": 8.738222222222222e-06, "loss": 0.9897, "step": 5340 }, { "epoch": 0.6973409801876955, "grad_norm": 8.380166053771973, "learning_rate": 8.733777777777778e-06, "loss": 0.9377, "step": 5350 }, { "epoch": 0.6986444212721585, "grad_norm": 9.34756088256836, "learning_rate": 8.729333333333335e-06, "loss": 0.9827, "step": 5360 }, { "epoch": 0.6999478623566214, "grad_norm": 8.969053268432617, "learning_rate": 8.72488888888889e-06, "loss": 1.0291, "step": 5370 }, { "epoch": 0.7012513034410844, "grad_norm": 11.20063304901123, "learning_rate": 8.720444444444444e-06, "loss": 1.0111, "step": 5380 }, { "epoch": 0.7025547445255474, "grad_norm": 9.322565078735352, "learning_rate": 8.716000000000001e-06, "loss": 0.9815, "step": 5390 }, { "epoch": 0.7038581856100105, "grad_norm": 9.362802505493164, "learning_rate": 8.711555555555556e-06, "loss": 1.0608, "step": 5400 }, { "epoch": 0.7038581856100105, "eval/acc": 32.55813980102539, "step": 5400 }, { "epoch": 0.7038581856100105, "eval_loss": 2.7143032550811768, "eval_runtime": 0.5523, "eval_samples_per_second": 77.854, "eval_steps_per_second": 1.811, "step": 5400 }, { "epoch": 0.7051616266944735, "grad_norm": 7.582075119018555, "learning_rate": 8.707111111111112e-06, "loss": 0.983, "step": 5410 }, { "epoch": 0.7064650677789364, "grad_norm": 9.668004989624023, "learning_rate": 8.702666666666667e-06, "loss": 0.9948, "step": 5420 }, { "epoch": 0.7077685088633994, "grad_norm": 9.572304725646973, "learning_rate": 8.698222222222224e-06, "loss": 0.989, "step": 5430 }, { "epoch": 0.7090719499478624, "grad_norm": 9.887052536010742, "learning_rate": 8.693777777777779e-06, "loss": 1.0045, "step": 5440 }, { "epoch": 0.7103753910323254, "grad_norm": 10.95411205291748, "learning_rate": 8.689333333333333e-06, "loss": 1.1597, "step": 5450 }, { "epoch": 0.7116788321167883, "grad_norm": 8.732405662536621, "learning_rate": 8.68488888888889e-06, "loss": 0.9946, "step": 5460 }, { "epoch": 0.7129822732012513, "grad_norm": 10.515278816223145, "learning_rate": 8.680444444444445e-06, "loss": 1.0129, "step": 5470 }, { "epoch": 0.7142857142857143, "grad_norm": 11.1256685256958, "learning_rate": 8.676000000000001e-06, "loss": 1.0248, "step": 5480 }, { "epoch": 0.7155891553701773, "grad_norm": 8.381686210632324, "learning_rate": 8.671555555555556e-06, "loss": 1.1366, "step": 5490 }, { "epoch": 0.7168925964546402, "grad_norm": 9.804370880126953, "learning_rate": 8.667111111111111e-06, "loss": 1.0714, "step": 5500 }, { "epoch": 0.7168925964546402, "eval/acc": 32.55813980102539, "step": 5500 }, { "epoch": 0.7168925964546402, "eval_loss": 2.6976399421691895, "eval_runtime": 0.5537, "eval_samples_per_second": 77.656, "eval_steps_per_second": 1.806, "step": 5500 }, { "epoch": 0.7181960375391032, "grad_norm": 8.139718055725098, "learning_rate": 8.662666666666668e-06, "loss": 1.0368, "step": 5510 }, { "epoch": 0.7194994786235662, "grad_norm": 10.484297752380371, "learning_rate": 8.658222222222224e-06, "loss": 1.0055, "step": 5520 }, { "epoch": 0.7208029197080292, "grad_norm": 8.326539993286133, "learning_rate": 8.653777777777779e-06, "loss": 1.0275, "step": 5530 }, { "epoch": 0.7221063607924921, "grad_norm": 9.043944358825684, "learning_rate": 8.649333333333334e-06, "loss": 1.0032, "step": 5540 }, { "epoch": 0.7234098018769551, "grad_norm": 8.754300117492676, "learning_rate": 8.644888888888889e-06, "loss": 0.9642, "step": 5550 }, { "epoch": 0.7247132429614181, "grad_norm": 8.33322525024414, "learning_rate": 8.640444444444445e-06, "loss": 1.014, "step": 5560 }, { "epoch": 0.7260166840458812, "grad_norm": 8.533761024475098, "learning_rate": 8.636000000000002e-06, "loss": 1.0439, "step": 5570 }, { "epoch": 0.7273201251303442, "grad_norm": 10.058277130126953, "learning_rate": 8.631555555555557e-06, "loss": 0.9821, "step": 5580 }, { "epoch": 0.7286235662148071, "grad_norm": 7.9940009117126465, "learning_rate": 8.627111111111111e-06, "loss": 1.0351, "step": 5590 }, { "epoch": 0.7299270072992701, "grad_norm": 9.121907234191895, "learning_rate": 8.622666666666666e-06, "loss": 0.9859, "step": 5600 }, { "epoch": 0.7299270072992701, "eval/acc": 39.53488540649414, "step": 5600 }, { "epoch": 0.7299270072992701, "eval_loss": 2.6650607585906982, "eval_runtime": 0.552, "eval_samples_per_second": 77.897, "eval_steps_per_second": 1.812, "step": 5600 }, { "epoch": 0.7312304483837331, "grad_norm": 9.132323265075684, "learning_rate": 8.618222222222223e-06, "loss": 1.0289, "step": 5610 }, { "epoch": 0.7325338894681961, "grad_norm": 8.751036643981934, "learning_rate": 8.61377777777778e-06, "loss": 1.0203, "step": 5620 }, { "epoch": 0.733837330552659, "grad_norm": 8.791621208190918, "learning_rate": 8.609333333333334e-06, "loss": 0.9776, "step": 5630 }, { "epoch": 0.735140771637122, "grad_norm": 7.251979827880859, "learning_rate": 8.604888888888889e-06, "loss": 0.9381, "step": 5640 }, { "epoch": 0.736444212721585, "grad_norm": 9.67597484588623, "learning_rate": 8.600444444444444e-06, "loss": 1.1627, "step": 5650 }, { "epoch": 0.737747653806048, "grad_norm": 9.147563934326172, "learning_rate": 8.596e-06, "loss": 1.0479, "step": 5660 }, { "epoch": 0.7390510948905109, "grad_norm": 11.876523971557617, "learning_rate": 8.591555555555557e-06, "loss": 1.0011, "step": 5670 }, { "epoch": 0.7403545359749739, "grad_norm": 9.589251518249512, "learning_rate": 8.587111111111112e-06, "loss": 1.0448, "step": 5680 }, { "epoch": 0.7416579770594369, "grad_norm": 10.050942420959473, "learning_rate": 8.582666666666667e-06, "loss": 1.0001, "step": 5690 }, { "epoch": 0.7429614181438999, "grad_norm": 7.628367900848389, "learning_rate": 8.578222222222223e-06, "loss": 1.0111, "step": 5700 }, { "epoch": 0.7429614181438999, "eval/acc": 37.20930099487305, "step": 5700 }, { "epoch": 0.7429614181438999, "eval_loss": 2.59824800491333, "eval_runtime": 0.5524, "eval_samples_per_second": 77.842, "eval_steps_per_second": 1.81, "step": 5700 }, { "epoch": 0.7442648592283628, "grad_norm": 9.08203125, "learning_rate": 8.573777777777778e-06, "loss": 1.0043, "step": 5710 }, { "epoch": 0.7455683003128258, "grad_norm": 10.87893009185791, "learning_rate": 8.569333333333335e-06, "loss": 0.981, "step": 5720 }, { "epoch": 0.7468717413972888, "grad_norm": 9.921890258789062, "learning_rate": 8.56488888888889e-06, "loss": 1.0664, "step": 5730 }, { "epoch": 0.7481751824817519, "grad_norm": 8.49359130859375, "learning_rate": 8.560444444444446e-06, "loss": 1.1034, "step": 5740 }, { "epoch": 0.7494786235662148, "grad_norm": 8.523398399353027, "learning_rate": 8.556e-06, "loss": 0.9143, "step": 5750 }, { "epoch": 0.7507820646506778, "grad_norm": 9.348311424255371, "learning_rate": 8.551555555555556e-06, "loss": 1.011, "step": 5760 }, { "epoch": 0.7520855057351408, "grad_norm": 10.186081886291504, "learning_rate": 8.547111111111112e-06, "loss": 1.11, "step": 5770 }, { "epoch": 0.7533889468196038, "grad_norm": 8.896495819091797, "learning_rate": 8.542666666666667e-06, "loss": 1.0081, "step": 5780 }, { "epoch": 0.7546923879040668, "grad_norm": 8.774282455444336, "learning_rate": 8.538222222222224e-06, "loss": 1.0392, "step": 5790 }, { "epoch": 0.7559958289885297, "grad_norm": 10.161205291748047, "learning_rate": 8.533777777777778e-06, "loss": 1.0442, "step": 5800 }, { "epoch": 0.7559958289885297, "eval/acc": 37.20930099487305, "step": 5800 }, { "epoch": 0.7559958289885297, "eval_loss": 2.6203322410583496, "eval_runtime": 0.5612, "eval_samples_per_second": 76.627, "eval_steps_per_second": 1.782, "step": 5800 }, { "epoch": 0.7572992700729927, "grad_norm": 11.288007736206055, "learning_rate": 8.529333333333333e-06, "loss": 1.0112, "step": 5810 }, { "epoch": 0.7586027111574557, "grad_norm": 8.31664752960205, "learning_rate": 8.52488888888889e-06, "loss": 0.9818, "step": 5820 }, { "epoch": 0.7599061522419187, "grad_norm": 9.366721153259277, "learning_rate": 8.520444444444446e-06, "loss": 1.0025, "step": 5830 }, { "epoch": 0.7612095933263816, "grad_norm": 9.547416687011719, "learning_rate": 8.516000000000001e-06, "loss": 1.0241, "step": 5840 }, { "epoch": 0.7625130344108446, "grad_norm": 9.147334098815918, "learning_rate": 8.511555555555556e-06, "loss": 1.0055, "step": 5850 }, { "epoch": 0.7638164754953076, "grad_norm": 9.335505485534668, "learning_rate": 8.50711111111111e-06, "loss": 0.9671, "step": 5860 }, { "epoch": 0.7651199165797706, "grad_norm": 9.130939483642578, "learning_rate": 8.502666666666667e-06, "loss": 0.9679, "step": 5870 }, { "epoch": 0.7664233576642335, "grad_norm": 9.89399528503418, "learning_rate": 8.498222222222224e-06, "loss": 1.0736, "step": 5880 }, { "epoch": 0.7677267987486965, "grad_norm": 9.254303932189941, "learning_rate": 8.493777777777779e-06, "loss": 1.0552, "step": 5890 }, { "epoch": 0.7690302398331595, "grad_norm": 8.202300071716309, "learning_rate": 8.489333333333334e-06, "loss": 0.9509, "step": 5900 }, { "epoch": 0.7690302398331595, "eval/acc": 32.55813980102539, "step": 5900 }, { "epoch": 0.7690302398331595, "eval_loss": 2.7594034671783447, "eval_runtime": 0.5559, "eval_samples_per_second": 77.347, "eval_steps_per_second": 1.799, "step": 5900 }, { "epoch": 0.7703336809176226, "grad_norm": 7.404378890991211, "learning_rate": 8.484888888888888e-06, "loss": 1.0923, "step": 5910 }, { "epoch": 0.7716371220020855, "grad_norm": 9.642816543579102, "learning_rate": 8.480444444444445e-06, "loss": 0.967, "step": 5920 }, { "epoch": 0.7729405630865485, "grad_norm": 10.284126281738281, "learning_rate": 8.476000000000002e-06, "loss": 0.9209, "step": 5930 }, { "epoch": 0.7742440041710115, "grad_norm": 9.019957542419434, "learning_rate": 8.471555555555556e-06, "loss": 1.0556, "step": 5940 }, { "epoch": 0.7755474452554745, "grad_norm": 10.67587661743164, "learning_rate": 8.467111111111111e-06, "loss": 1.0267, "step": 5950 }, { "epoch": 0.7768508863399375, "grad_norm": 9.622302055358887, "learning_rate": 8.462666666666666e-06, "loss": 1.0583, "step": 5960 }, { "epoch": 0.7781543274244004, "grad_norm": 8.635871887207031, "learning_rate": 8.458222222222223e-06, "loss": 0.9977, "step": 5970 }, { "epoch": 0.7794577685088634, "grad_norm": 9.332335472106934, "learning_rate": 8.453777777777779e-06, "loss": 0.9918, "step": 5980 }, { "epoch": 0.7807612095933264, "grad_norm": 8.821697235107422, "learning_rate": 8.449333333333334e-06, "loss": 1.0279, "step": 5990 }, { "epoch": 0.7820646506777894, "grad_norm": 7.9288763999938965, "learning_rate": 8.444888888888889e-06, "loss": 0.9941, "step": 6000 }, { "epoch": 0.7820646506777894, "eval/acc": 41.86046600341797, "step": 6000 }, { "epoch": 0.7820646506777894, "eval_loss": 2.6315321922302246, "eval_runtime": 0.5518, "eval_samples_per_second": 77.927, "eval_steps_per_second": 1.812, "step": 6000 }, { "epoch": 0.7833680917622523, "grad_norm": 8.8925199508667, "learning_rate": 8.440444444444445e-06, "loss": 0.9833, "step": 6010 }, { "epoch": 0.7846715328467153, "grad_norm": 10.178521156311035, "learning_rate": 8.436e-06, "loss": 1.0427, "step": 6020 }, { "epoch": 0.7859749739311783, "grad_norm": 8.328222274780273, "learning_rate": 8.431555555555557e-06, "loss": 0.9223, "step": 6030 }, { "epoch": 0.7872784150156413, "grad_norm": 7.066805839538574, "learning_rate": 8.427111111111112e-06, "loss": 1.0382, "step": 6040 }, { "epoch": 0.7885818561001042, "grad_norm": 10.042593955993652, "learning_rate": 8.422666666666668e-06, "loss": 1.0121, "step": 6050 }, { "epoch": 0.7898852971845672, "grad_norm": 10.048341751098633, "learning_rate": 8.418222222222223e-06, "loss": 1.0191, "step": 6060 }, { "epoch": 0.7911887382690302, "grad_norm": 9.812093734741211, "learning_rate": 8.413777777777778e-06, "loss": 1.0178, "step": 6070 }, { "epoch": 0.7924921793534933, "grad_norm": 8.843996047973633, "learning_rate": 8.409333333333334e-06, "loss": 1.0332, "step": 6080 }, { "epoch": 0.7937956204379562, "grad_norm": 9.375243186950684, "learning_rate": 8.40488888888889e-06, "loss": 0.97, "step": 6090 }, { "epoch": 0.7950990615224192, "grad_norm": 9.695100784301758, "learning_rate": 8.400444444444446e-06, "loss": 0.9974, "step": 6100 }, { "epoch": 0.7950990615224192, "eval/acc": 34.88372039794922, "step": 6100 }, { "epoch": 0.7950990615224192, "eval_loss": 2.6288933753967285, "eval_runtime": 0.5581, "eval_samples_per_second": 77.053, "eval_steps_per_second": 1.792, "step": 6100 }, { "epoch": 0.7964025026068822, "grad_norm": 10.059000015258789, "learning_rate": 8.396e-06, "loss": 0.9742, "step": 6110 }, { "epoch": 0.7977059436913452, "grad_norm": 9.15655517578125, "learning_rate": 8.391555555555555e-06, "loss": 1.0031, "step": 6120 }, { "epoch": 0.7990093847758082, "grad_norm": 10.105188369750977, "learning_rate": 8.387111111111112e-06, "loss": 0.9663, "step": 6130 }, { "epoch": 0.8003128258602711, "grad_norm": 8.860940933227539, "learning_rate": 8.382666666666669e-06, "loss": 0.9867, "step": 6140 }, { "epoch": 0.8016162669447341, "grad_norm": 9.332911491394043, "learning_rate": 8.378222222222223e-06, "loss": 0.9407, "step": 6150 }, { "epoch": 0.8029197080291971, "grad_norm": 9.756667137145996, "learning_rate": 8.373777777777778e-06, "loss": 1.0216, "step": 6160 }, { "epoch": 0.8042231491136601, "grad_norm": 9.770795822143555, "learning_rate": 8.369333333333333e-06, "loss": 1.0218, "step": 6170 }, { "epoch": 0.805526590198123, "grad_norm": 9.790980339050293, "learning_rate": 8.36488888888889e-06, "loss": 1.0977, "step": 6180 }, { "epoch": 0.806830031282586, "grad_norm": 9.125996589660645, "learning_rate": 8.360444444444446e-06, "loss": 1.0185, "step": 6190 }, { "epoch": 0.808133472367049, "grad_norm": 8.784990310668945, "learning_rate": 8.356000000000001e-06, "loss": 0.9859, "step": 6200 }, { "epoch": 0.808133472367049, "eval/acc": 37.20930099487305, "step": 6200 }, { "epoch": 0.808133472367049, "eval_loss": 2.6269452571868896, "eval_runtime": 0.5551, "eval_samples_per_second": 77.468, "eval_steps_per_second": 1.802, "step": 6200 }, { "epoch": 0.809436913451512, "grad_norm": 10.93716049194336, "learning_rate": 8.351555555555556e-06, "loss": 1.0452, "step": 6210 }, { "epoch": 0.8107403545359749, "grad_norm": 8.550542831420898, "learning_rate": 8.34711111111111e-06, "loss": 1.0022, "step": 6220 }, { "epoch": 0.8120437956204379, "grad_norm": 9.354156494140625, "learning_rate": 8.342666666666667e-06, "loss": 0.9901, "step": 6230 }, { "epoch": 0.8133472367049009, "grad_norm": 11.288535118103027, "learning_rate": 8.338222222222224e-06, "loss": 1.0463, "step": 6240 }, { "epoch": 0.814650677789364, "grad_norm": 8.247468948364258, "learning_rate": 8.333777777777779e-06, "loss": 1.0336, "step": 6250 }, { "epoch": 0.8159541188738269, "grad_norm": 9.26513385772705, "learning_rate": 8.329333333333333e-06, "loss": 1.0107, "step": 6260 }, { "epoch": 0.8172575599582899, "grad_norm": 8.089369773864746, "learning_rate": 8.324888888888888e-06, "loss": 0.9727, "step": 6270 }, { "epoch": 0.8185610010427529, "grad_norm": 8.736642837524414, "learning_rate": 8.320444444444445e-06, "loss": 1.0667, "step": 6280 }, { "epoch": 0.8198644421272159, "grad_norm": 8.858651161193848, "learning_rate": 8.316000000000001e-06, "loss": 1.081, "step": 6290 }, { "epoch": 0.8211678832116789, "grad_norm": 8.538246154785156, "learning_rate": 8.311555555555556e-06, "loss": 0.9886, "step": 6300 }, { "epoch": 0.8211678832116789, "eval/acc": 39.53488540649414, "step": 6300 }, { "epoch": 0.8211678832116789, "eval_loss": 2.624267101287842, "eval_runtime": 0.5516, "eval_samples_per_second": 77.959, "eval_steps_per_second": 1.813, "step": 6300 }, { "epoch": 0.8224713242961418, "grad_norm": 9.205613136291504, "learning_rate": 8.307111111111111e-06, "loss": 0.9988, "step": 6310 }, { "epoch": 0.8237747653806048, "grad_norm": 8.61228084564209, "learning_rate": 8.302666666666668e-06, "loss": 1.0361, "step": 6320 }, { "epoch": 0.8250782064650678, "grad_norm": 9.030414581298828, "learning_rate": 8.298222222222222e-06, "loss": 1.0017, "step": 6330 }, { "epoch": 0.8263816475495308, "grad_norm": 7.929698944091797, "learning_rate": 8.293777777777779e-06, "loss": 1.0242, "step": 6340 }, { "epoch": 0.8276850886339937, "grad_norm": 10.961642265319824, "learning_rate": 8.289333333333334e-06, "loss": 1.033, "step": 6350 }, { "epoch": 0.8289885297184567, "grad_norm": 9.38997745513916, "learning_rate": 8.28488888888889e-06, "loss": 1.0846, "step": 6360 }, { "epoch": 0.8302919708029197, "grad_norm": 8.70460033416748, "learning_rate": 8.280444444444445e-06, "loss": 0.9333, "step": 6370 }, { "epoch": 0.8315954118873827, "grad_norm": 9.825383186340332, "learning_rate": 8.276e-06, "loss": 1.0019, "step": 6380 }, { "epoch": 0.8328988529718456, "grad_norm": 8.25622272491455, "learning_rate": 8.271555555555557e-06, "loss": 0.977, "step": 6390 }, { "epoch": 0.8342022940563086, "grad_norm": 10.422846794128418, "learning_rate": 8.267111111111111e-06, "loss": 1.027, "step": 6400 }, { "epoch": 0.8342022940563086, "eval/acc": 37.20930099487305, "step": 6400 }, { "epoch": 0.8342022940563086, "eval_loss": 2.673701763153076, "eval_runtime": 0.5557, "eval_samples_per_second": 77.383, "eval_steps_per_second": 1.8, "step": 6400 }, { "epoch": 0.8355057351407716, "grad_norm": 9.53593921661377, "learning_rate": 8.262666666666668e-06, "loss": 1.0361, "step": 6410 }, { "epoch": 0.8368091762252347, "grad_norm": 9.09162425994873, "learning_rate": 8.258222222222223e-06, "loss": 1.0367, "step": 6420 }, { "epoch": 0.8381126173096975, "grad_norm": 9.458831787109375, "learning_rate": 8.253777777777778e-06, "loss": 1.0165, "step": 6430 }, { "epoch": 0.8394160583941606, "grad_norm": 9.845352172851562, "learning_rate": 8.249333333333334e-06, "loss": 0.9662, "step": 6440 }, { "epoch": 0.8407194994786236, "grad_norm": 7.90129280090332, "learning_rate": 8.24488888888889e-06, "loss": 0.9909, "step": 6450 }, { "epoch": 0.8420229405630866, "grad_norm": 8.902530670166016, "learning_rate": 8.240444444444446e-06, "loss": 1.0187, "step": 6460 }, { "epoch": 0.8433263816475496, "grad_norm": 8.841060638427734, "learning_rate": 8.236e-06, "loss": 0.9962, "step": 6470 }, { "epoch": 0.8446298227320125, "grad_norm": 8.994577407836914, "learning_rate": 8.231555555555555e-06, "loss": 1.0147, "step": 6480 }, { "epoch": 0.8459332638164755, "grad_norm": 8.313756942749023, "learning_rate": 8.227111111111112e-06, "loss": 0.8592, "step": 6490 }, { "epoch": 0.8472367049009385, "grad_norm": 9.097774505615234, "learning_rate": 8.222666666666668e-06, "loss": 1.071, "step": 6500 }, { "epoch": 0.8472367049009385, "eval/acc": 37.20930099487305, "step": 6500 }, { "epoch": 0.8472367049009385, "eval_loss": 2.67804217338562, "eval_runtime": 0.5526, "eval_samples_per_second": 77.809, "eval_steps_per_second": 1.81, "step": 6500 }, { "epoch": 0.8485401459854015, "grad_norm": 8.86130428314209, "learning_rate": 8.218222222222223e-06, "loss": 1.0751, "step": 6510 }, { "epoch": 0.8498435870698644, "grad_norm": 10.352911949157715, "learning_rate": 8.213777777777778e-06, "loss": 1.0069, "step": 6520 }, { "epoch": 0.8511470281543274, "grad_norm": 9.668673515319824, "learning_rate": 8.209333333333333e-06, "loss": 0.9834, "step": 6530 }, { "epoch": 0.8524504692387904, "grad_norm": 10.304662704467773, "learning_rate": 8.20488888888889e-06, "loss": 1.0477, "step": 6540 }, { "epoch": 0.8537539103232534, "grad_norm": 8.507194519042969, "learning_rate": 8.200444444444446e-06, "loss": 1.0732, "step": 6550 }, { "epoch": 0.8550573514077163, "grad_norm": 8.823519706726074, "learning_rate": 8.196e-06, "loss": 1.1194, "step": 6560 }, { "epoch": 0.8563607924921793, "grad_norm": 11.177069664001465, "learning_rate": 8.191555555555556e-06, "loss": 0.9341, "step": 6570 }, { "epoch": 0.8576642335766423, "grad_norm": 8.849434852600098, "learning_rate": 8.18711111111111e-06, "loss": 1.0493, "step": 6580 }, { "epoch": 0.8589676746611054, "grad_norm": 8.759775161743164, "learning_rate": 8.182666666666667e-06, "loss": 0.9479, "step": 6590 }, { "epoch": 0.8602711157455682, "grad_norm": 9.2578706741333, "learning_rate": 8.178222222222224e-06, "loss": 1.0334, "step": 6600 }, { "epoch": 0.8602711157455682, "eval/acc": 34.88372039794922, "step": 6600 }, { "epoch": 0.8602711157455682, "eval_loss": 2.6864330768585205, "eval_runtime": 0.5529, "eval_samples_per_second": 77.777, "eval_steps_per_second": 1.809, "step": 6600 }, { "epoch": 0.8615745568300313, "grad_norm": 8.463264465332031, "learning_rate": 8.173777777777778e-06, "loss": 0.9916, "step": 6610 }, { "epoch": 0.8628779979144943, "grad_norm": 9.883336067199707, "learning_rate": 8.169333333333333e-06, "loss": 0.9955, "step": 6620 }, { "epoch": 0.8641814389989573, "grad_norm": 8.82459545135498, "learning_rate": 8.16488888888889e-06, "loss": 1.0315, "step": 6630 }, { "epoch": 0.8654848800834203, "grad_norm": 10.618040084838867, "learning_rate": 8.160444444444445e-06, "loss": 0.9399, "step": 6640 }, { "epoch": 0.8667883211678832, "grad_norm": 11.280345916748047, "learning_rate": 8.156000000000001e-06, "loss": 1.1035, "step": 6650 }, { "epoch": 0.8680917622523462, "grad_norm": 8.534235000610352, "learning_rate": 8.151555555555556e-06, "loss": 0.9105, "step": 6660 }, { "epoch": 0.8693952033368092, "grad_norm": 9.337313652038574, "learning_rate": 8.147111111111113e-06, "loss": 1.0439, "step": 6670 }, { "epoch": 0.8706986444212722, "grad_norm": 8.120159149169922, "learning_rate": 8.142666666666667e-06, "loss": 0.8715, "step": 6680 }, { "epoch": 0.8720020855057351, "grad_norm": 9.389538764953613, "learning_rate": 8.138222222222222e-06, "loss": 1.0232, "step": 6690 }, { "epoch": 0.8733055265901981, "grad_norm": 9.953109741210938, "learning_rate": 8.133777777777779e-06, "loss": 1.0433, "step": 6700 }, { "epoch": 0.8733055265901981, "eval/acc": 32.55813980102539, "step": 6700 }, { "epoch": 0.8733055265901981, "eval_loss": 2.6176934242248535, "eval_runtime": 0.5514, "eval_samples_per_second": 77.99, "eval_steps_per_second": 1.814, "step": 6700 }, { "epoch": 0.8746089676746611, "grad_norm": 8.203495979309082, "learning_rate": 8.129333333333334e-06, "loss": 1.0338, "step": 6710 }, { "epoch": 0.8759124087591241, "grad_norm": 9.573360443115234, "learning_rate": 8.12488888888889e-06, "loss": 0.9788, "step": 6720 }, { "epoch": 0.877215849843587, "grad_norm": 9.354838371276855, "learning_rate": 8.120444444444445e-06, "loss": 0.9955, "step": 6730 }, { "epoch": 0.87851929092805, "grad_norm": 9.718021392822266, "learning_rate": 8.116e-06, "loss": 1.1349, "step": 6740 }, { "epoch": 0.879822732012513, "grad_norm": 8.48845386505127, "learning_rate": 8.111555555555556e-06, "loss": 1.0507, "step": 6750 }, { "epoch": 0.881126173096976, "grad_norm": 9.579551696777344, "learning_rate": 8.107111111111113e-06, "loss": 0.9856, "step": 6760 }, { "epoch": 0.882429614181439, "grad_norm": 9.837549209594727, "learning_rate": 8.102666666666668e-06, "loss": 1.0175, "step": 6770 }, { "epoch": 0.883733055265902, "grad_norm": 8.538361549377441, "learning_rate": 8.098222222222223e-06, "loss": 1.0427, "step": 6780 }, { "epoch": 0.885036496350365, "grad_norm": 10.268227577209473, "learning_rate": 8.093777777777777e-06, "loss": 0.9472, "step": 6790 }, { "epoch": 0.886339937434828, "grad_norm": 10.689719200134277, "learning_rate": 8.089333333333334e-06, "loss": 1.0209, "step": 6800 }, { "epoch": 0.886339937434828, "eval/acc": 34.88372039794922, "step": 6800 }, { "epoch": 0.886339937434828, "eval_loss": 2.6165432929992676, "eval_runtime": 0.5568, "eval_samples_per_second": 77.234, "eval_steps_per_second": 1.796, "step": 6800 }, { "epoch": 0.887643378519291, "grad_norm": 9.130402565002441, "learning_rate": 8.08488888888889e-06, "loss": 1.01, "step": 6810 }, { "epoch": 0.8889468196037539, "grad_norm": 8.528085708618164, "learning_rate": 8.080444444444445e-06, "loss": 0.9943, "step": 6820 }, { "epoch": 0.8902502606882169, "grad_norm": 10.689129829406738, "learning_rate": 8.076e-06, "loss": 1.0258, "step": 6830 }, { "epoch": 0.8915537017726799, "grad_norm": 9.432913780212402, "learning_rate": 8.071555555555555e-06, "loss": 1.0207, "step": 6840 }, { "epoch": 0.8928571428571429, "grad_norm": 8.51905632019043, "learning_rate": 8.067111111111112e-06, "loss": 1.0492, "step": 6850 }, { "epoch": 0.8941605839416058, "grad_norm": 9.575328826904297, "learning_rate": 8.062666666666668e-06, "loss": 0.919, "step": 6860 }, { "epoch": 0.8954640250260688, "grad_norm": 9.271153450012207, "learning_rate": 8.058222222222223e-06, "loss": 0.9525, "step": 6870 }, { "epoch": 0.8967674661105318, "grad_norm": 9.966239929199219, "learning_rate": 8.053777777777778e-06, "loss": 0.9551, "step": 6880 }, { "epoch": 0.8980709071949948, "grad_norm": 8.789650917053223, "learning_rate": 8.049333333333333e-06, "loss": 0.9712, "step": 6890 }, { "epoch": 0.8993743482794577, "grad_norm": 10.292884826660156, "learning_rate": 8.04488888888889e-06, "loss": 0.9979, "step": 6900 }, { "epoch": 0.8993743482794577, "eval/acc": 32.55813980102539, "step": 6900 }, { "epoch": 0.8993743482794577, "eval_loss": 2.6249427795410156, "eval_runtime": 0.5525, "eval_samples_per_second": 77.821, "eval_steps_per_second": 1.81, "step": 6900 }, { "epoch": 0.9006777893639207, "grad_norm": 9.126242637634277, "learning_rate": 8.040444444444446e-06, "loss": 1.0284, "step": 6910 }, { "epoch": 0.9019812304483837, "grad_norm": 8.590139389038086, "learning_rate": 8.036e-06, "loss": 0.9972, "step": 6920 }, { "epoch": 0.9032846715328468, "grad_norm": 8.998208999633789, "learning_rate": 8.031555555555555e-06, "loss": 0.9327, "step": 6930 }, { "epoch": 0.9045881126173096, "grad_norm": 9.858304977416992, "learning_rate": 8.027111111111112e-06, "loss": 0.9598, "step": 6940 }, { "epoch": 0.9058915537017727, "grad_norm": 9.873553276062012, "learning_rate": 8.022666666666667e-06, "loss": 1.013, "step": 6950 }, { "epoch": 0.9071949947862357, "grad_norm": 7.501348972320557, "learning_rate": 8.018222222222223e-06, "loss": 1.0439, "step": 6960 }, { "epoch": 0.9084984358706987, "grad_norm": 9.33034610748291, "learning_rate": 8.013777777777778e-06, "loss": 1.0219, "step": 6970 }, { "epoch": 0.9098018769551617, "grad_norm": 8.672886848449707, "learning_rate": 8.009333333333335e-06, "loss": 0.9912, "step": 6980 }, { "epoch": 0.9111053180396246, "grad_norm": 9.21350383758545, "learning_rate": 8.00488888888889e-06, "loss": 0.9569, "step": 6990 }, { "epoch": 0.9124087591240876, "grad_norm": 8.252593994140625, "learning_rate": 8.000444444444444e-06, "loss": 0.9535, "step": 7000 }, { "epoch": 0.9124087591240876, "eval/acc": 30.23255729675293, "step": 7000 }, { "epoch": 0.9124087591240876, "eval_loss": 2.6694142818450928, "eval_runtime": 0.5555, "eval_samples_per_second": 77.407, "eval_steps_per_second": 1.8, "step": 7000 }, { "epoch": 0.9137122002085506, "grad_norm": 9.754995346069336, "learning_rate": 7.996000000000001e-06, "loss": 0.9894, "step": 7010 }, { "epoch": 0.9150156412930136, "grad_norm": 9.45035171508789, "learning_rate": 7.991555555555556e-06, "loss": 0.968, "step": 7020 }, { "epoch": 0.9163190823774765, "grad_norm": 10.195062637329102, "learning_rate": 7.987111111111112e-06, "loss": 1.0784, "step": 7030 }, { "epoch": 0.9176225234619395, "grad_norm": 9.188143730163574, "learning_rate": 7.982666666666667e-06, "loss": 0.9223, "step": 7040 }, { "epoch": 0.9189259645464025, "grad_norm": 10.677811622619629, "learning_rate": 7.978222222222222e-06, "loss": 0.9489, "step": 7050 }, { "epoch": 0.9202294056308655, "grad_norm": 8.982565879821777, "learning_rate": 7.973777777777779e-06, "loss": 0.9646, "step": 7060 }, { "epoch": 0.9215328467153284, "grad_norm": 9.741477012634277, "learning_rate": 7.969333333333335e-06, "loss": 0.9839, "step": 7070 }, { "epoch": 0.9228362877997914, "grad_norm": 9.745488166809082, "learning_rate": 7.96488888888889e-06, "loss": 1.0026, "step": 7080 }, { "epoch": 0.9241397288842544, "grad_norm": 9.319021224975586, "learning_rate": 7.960444444444445e-06, "loss": 0.9635, "step": 7090 }, { "epoch": 0.9254431699687174, "grad_norm": 9.34664249420166, "learning_rate": 7.956e-06, "loss": 0.9563, "step": 7100 }, { "epoch": 0.9254431699687174, "eval/acc": 34.88372039794922, "step": 7100 }, { "epoch": 0.9254431699687174, "eval_loss": 2.6430439949035645, "eval_runtime": 0.5521, "eval_samples_per_second": 77.886, "eval_steps_per_second": 1.811, "step": 7100 }, { "epoch": 0.9267466110531803, "grad_norm": 10.529402732849121, "learning_rate": 7.951555555555556e-06, "loss": 0.9977, "step": 7110 }, { "epoch": 0.9280500521376434, "grad_norm": 9.836724281311035, "learning_rate": 7.947111111111113e-06, "loss": 0.9703, "step": 7120 }, { "epoch": 0.9293534932221064, "grad_norm": 10.40678596496582, "learning_rate": 7.942666666666668e-06, "loss": 0.9726, "step": 7130 }, { "epoch": 0.9306569343065694, "grad_norm": 8.559309959411621, "learning_rate": 7.938222222222222e-06, "loss": 0.9991, "step": 7140 }, { "epoch": 0.9319603753910324, "grad_norm": 8.88638973236084, "learning_rate": 7.933777777777777e-06, "loss": 1.0273, "step": 7150 }, { "epoch": 0.9332638164754953, "grad_norm": 8.549494743347168, "learning_rate": 7.929333333333334e-06, "loss": 0.9444, "step": 7160 }, { "epoch": 0.9345672575599583, "grad_norm": 8.719738960266113, "learning_rate": 7.92488888888889e-06, "loss": 0.9725, "step": 7170 }, { "epoch": 0.9358706986444213, "grad_norm": 8.908008575439453, "learning_rate": 7.920444444444445e-06, "loss": 0.9216, "step": 7180 }, { "epoch": 0.9371741397288843, "grad_norm": 8.950148582458496, "learning_rate": 7.916e-06, "loss": 1.0316, "step": 7190 }, { "epoch": 0.9384775808133472, "grad_norm": 8.901386260986328, "learning_rate": 7.911555555555555e-06, "loss": 1.0231, "step": 7200 }, { "epoch": 0.9384775808133472, "eval/acc": 34.88372039794922, "step": 7200 }, { "epoch": 0.9384775808133472, "eval_loss": 2.628709316253662, "eval_runtime": 0.5528, "eval_samples_per_second": 77.786, "eval_steps_per_second": 1.809, "step": 7200 }, { "epoch": 0.9397810218978102, "grad_norm": 8.983177185058594, "learning_rate": 7.907111111111111e-06, "loss": 1.0309, "step": 7210 }, { "epoch": 0.9410844629822732, "grad_norm": 10.097892761230469, "learning_rate": 7.902666666666668e-06, "loss": 1.0109, "step": 7220 }, { "epoch": 0.9423879040667362, "grad_norm": 8.999421119689941, "learning_rate": 7.898222222222223e-06, "loss": 0.9416, "step": 7230 }, { "epoch": 0.9436913451511991, "grad_norm": 9.044880867004395, "learning_rate": 7.893777777777778e-06, "loss": 0.9678, "step": 7240 }, { "epoch": 0.9449947862356621, "grad_norm": 9.498455047607422, "learning_rate": 7.889333333333334e-06, "loss": 1.0159, "step": 7250 }, { "epoch": 0.9462982273201251, "grad_norm": 8.960881233215332, "learning_rate": 7.884888888888889e-06, "loss": 1.0648, "step": 7260 }, { "epoch": 0.9476016684045881, "grad_norm": 8.187444686889648, "learning_rate": 7.880444444444446e-06, "loss": 1.0935, "step": 7270 }, { "epoch": 0.948905109489051, "grad_norm": 8.623115539550781, "learning_rate": 7.876e-06, "loss": 1.0459, "step": 7280 }, { "epoch": 0.950208550573514, "grad_norm": 8.242715835571289, "learning_rate": 7.871555555555557e-06, "loss": 0.9429, "step": 7290 }, { "epoch": 0.9515119916579771, "grad_norm": 8.49636459350586, "learning_rate": 7.867111111111112e-06, "loss": 0.8846, "step": 7300 }, { "epoch": 0.9515119916579771, "eval/acc": 37.20930099487305, "step": 7300 }, { "epoch": 0.9515119916579771, "eval_loss": 2.692638397216797, "eval_runtime": 0.5526, "eval_samples_per_second": 77.807, "eval_steps_per_second": 1.809, "step": 7300 }, { "epoch": 0.9528154327424401, "grad_norm": 8.402484893798828, "learning_rate": 7.862666666666667e-06, "loss": 0.9219, "step": 7310 }, { "epoch": 0.954118873826903, "grad_norm": 9.127314567565918, "learning_rate": 7.858222222222223e-06, "loss": 0.9732, "step": 7320 }, { "epoch": 0.955422314911366, "grad_norm": 8.822267532348633, "learning_rate": 7.853777777777778e-06, "loss": 1.0115, "step": 7330 }, { "epoch": 0.956725755995829, "grad_norm": 9.45130729675293, "learning_rate": 7.849333333333335e-06, "loss": 0.9701, "step": 7340 }, { "epoch": 0.958029197080292, "grad_norm": 9.352774620056152, "learning_rate": 7.84488888888889e-06, "loss": 1.0008, "step": 7350 }, { "epoch": 0.959332638164755, "grad_norm": 8.283885955810547, "learning_rate": 7.840444444444444e-06, "loss": 1.0424, "step": 7360 }, { "epoch": 0.9606360792492179, "grad_norm": 8.821931838989258, "learning_rate": 7.836000000000001e-06, "loss": 0.9506, "step": 7370 }, { "epoch": 0.9619395203336809, "grad_norm": 10.744322776794434, "learning_rate": 7.831555555555557e-06, "loss": 1.0553, "step": 7380 }, { "epoch": 0.9632429614181439, "grad_norm": 8.165914535522461, "learning_rate": 7.827111111111112e-06, "loss": 0.9444, "step": 7390 }, { "epoch": 0.9645464025026069, "grad_norm": 9.222362518310547, "learning_rate": 7.822666666666667e-06, "loss": 1.0401, "step": 7400 }, { "epoch": 0.9645464025026069, "eval/acc": 30.23255729675293, "step": 7400 }, { "epoch": 0.9645464025026069, "eval_loss": 2.5550081729888916, "eval_runtime": 0.5536, "eval_samples_per_second": 77.675, "eval_steps_per_second": 1.806, "step": 7400 }, { "epoch": 0.9658498435870698, "grad_norm": 7.525315284729004, "learning_rate": 7.818222222222222e-06, "loss": 1.0211, "step": 7410 }, { "epoch": 0.9671532846715328, "grad_norm": 10.365772247314453, "learning_rate": 7.813777777777778e-06, "loss": 1.0113, "step": 7420 }, { "epoch": 0.9684567257559958, "grad_norm": 9.825669288635254, "learning_rate": 7.809333333333335e-06, "loss": 0.9951, "step": 7430 }, { "epoch": 0.9697601668404588, "grad_norm": 9.201279640197754, "learning_rate": 7.80488888888889e-06, "loss": 0.9878, "step": 7440 }, { "epoch": 0.9710636079249217, "grad_norm": 9.135232925415039, "learning_rate": 7.800444444444445e-06, "loss": 0.9692, "step": 7450 }, { "epoch": 0.9723670490093848, "grad_norm": 10.133102416992188, "learning_rate": 7.796e-06, "loss": 0.9488, "step": 7460 }, { "epoch": 0.9736704900938478, "grad_norm": 8.100567817687988, "learning_rate": 7.791555555555556e-06, "loss": 0.9558, "step": 7470 }, { "epoch": 0.9749739311783108, "grad_norm": 8.011831283569336, "learning_rate": 7.787111111111113e-06, "loss": 0.9695, "step": 7480 }, { "epoch": 0.9762773722627737, "grad_norm": 8.650604248046875, "learning_rate": 7.782666666666667e-06, "loss": 1.0838, "step": 7490 }, { "epoch": 0.9775808133472367, "grad_norm": 9.186234474182129, "learning_rate": 7.778222222222222e-06, "loss": 0.9779, "step": 7500 }, { "epoch": 0.9775808133472367, "eval/acc": 34.88372039794922, "step": 7500 }, { "epoch": 0.9775808133472367, "eval_loss": 2.6376609802246094, "eval_runtime": 0.5509, "eval_samples_per_second": 78.048, "eval_steps_per_second": 1.815, "step": 7500 }, { "epoch": 0.9788842544316997, "grad_norm": 9.186497688293457, "learning_rate": 7.773777777777777e-06, "loss": 0.9524, "step": 7510 }, { "epoch": 0.9801876955161627, "grad_norm": 8.840336799621582, "learning_rate": 7.769333333333334e-06, "loss": 0.9805, "step": 7520 }, { "epoch": 0.9814911366006257, "grad_norm": 9.51953411102295, "learning_rate": 7.76488888888889e-06, "loss": 0.9898, "step": 7530 }, { "epoch": 0.9827945776850886, "grad_norm": 9.390219688415527, "learning_rate": 7.760444444444445e-06, "loss": 1.0244, "step": 7540 }, { "epoch": 0.9840980187695516, "grad_norm": 8.21574592590332, "learning_rate": 7.756e-06, "loss": 0.9465, "step": 7550 }, { "epoch": 0.9854014598540146, "grad_norm": 9.998575210571289, "learning_rate": 7.751555555555556e-06, "loss": 1.0289, "step": 7560 }, { "epoch": 0.9867049009384776, "grad_norm": 9.237029075622559, "learning_rate": 7.747111111111111e-06, "loss": 0.9701, "step": 7570 }, { "epoch": 0.9880083420229405, "grad_norm": 11.330434799194336, "learning_rate": 7.742666666666668e-06, "loss": 1.0181, "step": 7580 }, { "epoch": 0.9893117831074035, "grad_norm": 7.958991050720215, "learning_rate": 7.738222222222223e-06, "loss": 1.0152, "step": 7590 }, { "epoch": 0.9906152241918665, "grad_norm": 8.143290519714355, "learning_rate": 7.73377777777778e-06, "loss": 1.0054, "step": 7600 }, { "epoch": 0.9906152241918665, "eval/acc": 34.88372039794922, "step": 7600 }, { "epoch": 0.9906152241918665, "eval_loss": 2.6791605949401855, "eval_runtime": 0.553, "eval_samples_per_second": 77.752, "eval_steps_per_second": 1.808, "step": 7600 }, { "epoch": 0.9919186652763295, "grad_norm": 9.640328407287598, "learning_rate": 7.729333333333334e-06, "loss": 1.0106, "step": 7610 }, { "epoch": 0.9932221063607924, "grad_norm": 7.355710029602051, "learning_rate": 7.724888888888889e-06, "loss": 0.9055, "step": 7620 }, { "epoch": 0.9945255474452555, "grad_norm": 9.306617736816406, "learning_rate": 7.720444444444445e-06, "loss": 1.0173, "step": 7630 }, { "epoch": 0.9958289885297185, "grad_norm": 9.533848762512207, "learning_rate": 7.716e-06, "loss": 0.9594, "step": 7640 }, { "epoch": 0.9971324296141815, "grad_norm": 8.485058784484863, "learning_rate": 7.711555555555557e-06, "loss": 0.9638, "step": 7650 }, { "epoch": 0.9984358706986444, "grad_norm": 9.320188522338867, "learning_rate": 7.707111111111112e-06, "loss": 1.0431, "step": 7660 }, { "epoch": 0.9997393117831074, "grad_norm": 9.840314865112305, "learning_rate": 7.702666666666667e-06, "loss": 0.9663, "step": 7670 }, { "epoch": 1.0010427528675705, "grad_norm": 10.212252616882324, "learning_rate": 7.698222222222223e-06, "loss": 0.8872, "step": 7680 }, { "epoch": 1.0023461939520333, "grad_norm": 10.817450523376465, "learning_rate": 7.69377777777778e-06, "loss": 0.8731, "step": 7690 }, { "epoch": 1.0036496350364963, "grad_norm": 9.630830764770508, "learning_rate": 7.689333333333334e-06, "loss": 0.9175, "step": 7700 }, { "epoch": 1.0036496350364963, "eval/acc": 41.86046600341797, "step": 7700 }, { "epoch": 1.0036496350364963, "eval_loss": 2.9220545291900635, "eval_runtime": 0.5716, "eval_samples_per_second": 75.233, "eval_steps_per_second": 1.75, "step": 7700 }, { "epoch": 1.0049530761209593, "grad_norm": 8.57374095916748, "learning_rate": 7.68488888888889e-06, "loss": 0.9758, "step": 7710 }, { "epoch": 1.0062565172054223, "grad_norm": 9.157445907592773, "learning_rate": 7.680444444444444e-06, "loss": 0.8355, "step": 7720 }, { "epoch": 1.0075599582898853, "grad_norm": 9.53898811340332, "learning_rate": 7.676e-06, "loss": 0.8846, "step": 7730 }, { "epoch": 1.0088633993743483, "grad_norm": 11.01609992980957, "learning_rate": 7.671555555555557e-06, "loss": 0.9107, "step": 7740 }, { "epoch": 1.0101668404588113, "grad_norm": 8.74721622467041, "learning_rate": 7.667111111111112e-06, "loss": 0.8947, "step": 7750 }, { "epoch": 1.0114702815432743, "grad_norm": 10.709732055664062, "learning_rate": 7.662666666666667e-06, "loss": 0.96, "step": 7760 }, { "epoch": 1.0127737226277371, "grad_norm": 9.86235523223877, "learning_rate": 7.658222222222222e-06, "loss": 0.9517, "step": 7770 }, { "epoch": 1.0140771637122001, "grad_norm": 9.738570213317871, "learning_rate": 7.653777777777778e-06, "loss": 0.9245, "step": 7780 }, { "epoch": 1.0153806047966631, "grad_norm": 8.564919471740723, "learning_rate": 7.649333333333335e-06, "loss": 0.8769, "step": 7790 }, { "epoch": 1.0166840458811262, "grad_norm": 10.096901893615723, "learning_rate": 7.64488888888889e-06, "loss": 0.9472, "step": 7800 }, { "epoch": 1.0166840458811262, "eval/acc": 39.53488540649414, "step": 7800 }, { "epoch": 1.0166840458811262, "eval_loss": 2.8397738933563232, "eval_runtime": 0.5698, "eval_samples_per_second": 75.459, "eval_steps_per_second": 1.755, "step": 7800 }, { "epoch": 1.0179874869655892, "grad_norm": 7.725857257843018, "learning_rate": 7.640444444444445e-06, "loss": 0.9419, "step": 7810 }, { "epoch": 1.0192909280500522, "grad_norm": 8.992986679077148, "learning_rate": 7.636e-06, "loss": 0.8854, "step": 7820 }, { "epoch": 1.0205943691345152, "grad_norm": 9.632916450500488, "learning_rate": 7.631555555555556e-06, "loss": 0.9982, "step": 7830 }, { "epoch": 1.0218978102189782, "grad_norm": 9.003362655639648, "learning_rate": 7.627111111111112e-06, "loss": 0.8817, "step": 7840 }, { "epoch": 1.023201251303441, "grad_norm": 8.941239356994629, "learning_rate": 7.622666666666667e-06, "loss": 0.9168, "step": 7850 }, { "epoch": 1.024504692387904, "grad_norm": 9.012649536132812, "learning_rate": 7.618222222222222e-06, "loss": 0.9765, "step": 7860 }, { "epoch": 1.025808133472367, "grad_norm": 8.48647689819336, "learning_rate": 7.613777777777779e-06, "loss": 0.9321, "step": 7870 }, { "epoch": 1.02711157455683, "grad_norm": 9.642101287841797, "learning_rate": 7.609333333333334e-06, "loss": 0.9204, "step": 7880 }, { "epoch": 1.028415015641293, "grad_norm": 7.952564239501953, "learning_rate": 7.604888888888889e-06, "loss": 0.9259, "step": 7890 }, { "epoch": 1.029718456725756, "grad_norm": 10.145586013793945, "learning_rate": 7.600444444444445e-06, "loss": 0.963, "step": 7900 }, { "epoch": 1.029718456725756, "eval/acc": 41.86046600341797, "step": 7900 }, { "epoch": 1.029718456725756, "eval_loss": 2.715751886367798, "eval_runtime": 0.5523, "eval_samples_per_second": 77.86, "eval_steps_per_second": 1.811, "step": 7900 }, { "epoch": 1.031021897810219, "grad_norm": 9.017576217651367, "learning_rate": 7.5960000000000015e-06, "loss": 0.8985, "step": 7910 }, { "epoch": 1.032325338894682, "grad_norm": 8.437040328979492, "learning_rate": 7.591555555555556e-06, "loss": 0.9481, "step": 7920 }, { "epoch": 1.033628779979145, "grad_norm": 8.106425285339355, "learning_rate": 7.587111111111112e-06, "loss": 0.8974, "step": 7930 }, { "epoch": 1.0349322210636078, "grad_norm": 7.865415573120117, "learning_rate": 7.582666666666667e-06, "loss": 0.8884, "step": 7940 }, { "epoch": 1.0362356621480708, "grad_norm": 11.911471366882324, "learning_rate": 7.5782222222222225e-06, "loss": 0.8795, "step": 7950 }, { "epoch": 1.0375391032325338, "grad_norm": 8.606977462768555, "learning_rate": 7.573777777777779e-06, "loss": 0.8699, "step": 7960 }, { "epoch": 1.0388425443169969, "grad_norm": 10.2034330368042, "learning_rate": 7.569333333333334e-06, "loss": 0.8918, "step": 7970 }, { "epoch": 1.0401459854014599, "grad_norm": 11.00263786315918, "learning_rate": 7.56488888888889e-06, "loss": 0.889, "step": 7980 }, { "epoch": 1.0414494264859229, "grad_norm": 8.472570419311523, "learning_rate": 7.5604444444444445e-06, "loss": 0.9292, "step": 7990 }, { "epoch": 1.0427528675703859, "grad_norm": 12.965922355651855, "learning_rate": 7.556000000000001e-06, "loss": 0.9053, "step": 8000 }, { "epoch": 1.0427528675703859, "eval/acc": 39.53488540649414, "step": 8000 }, { "epoch": 1.0427528675703859, "eval_loss": 2.7604176998138428, "eval_runtime": 0.5527, "eval_samples_per_second": 77.805, "eval_steps_per_second": 1.809, "step": 8000 }, { "epoch": 1.0440563086548489, "grad_norm": 9.576687812805176, "learning_rate": 7.551555555555557e-06, "loss": 0.952, "step": 8010 }, { "epoch": 1.045359749739312, "grad_norm": 8.876384735107422, "learning_rate": 7.5471111111111115e-06, "loss": 0.8395, "step": 8020 }, { "epoch": 1.0466631908237747, "grad_norm": 8.862028121948242, "learning_rate": 7.542666666666667e-06, "loss": 0.9816, "step": 8030 }, { "epoch": 1.0479666319082377, "grad_norm": 7.471101760864258, "learning_rate": 7.538222222222222e-06, "loss": 0.8824, "step": 8040 }, { "epoch": 1.0492700729927007, "grad_norm": 10.809747695922852, "learning_rate": 7.533777777777779e-06, "loss": 0.9463, "step": 8050 }, { "epoch": 1.0505735140771637, "grad_norm": 8.711124420166016, "learning_rate": 7.529333333333334e-06, "loss": 0.9188, "step": 8060 }, { "epoch": 1.0518769551616267, "grad_norm": 9.918045997619629, "learning_rate": 7.524888888888889e-06, "loss": 0.9275, "step": 8070 }, { "epoch": 1.0531803962460897, "grad_norm": 7.851119518280029, "learning_rate": 7.520444444444445e-06, "loss": 1.0079, "step": 8080 }, { "epoch": 1.0544838373305527, "grad_norm": 9.710519790649414, "learning_rate": 7.516000000000001e-06, "loss": 0.9625, "step": 8090 }, { "epoch": 1.0557872784150157, "grad_norm": 9.297921180725098, "learning_rate": 7.511555555555556e-06, "loss": 0.9715, "step": 8100 }, { "epoch": 1.0557872784150157, "eval/acc": 39.53488540649414, "step": 8100 }, { "epoch": 1.0557872784150157, "eval_loss": 2.6021246910095215, "eval_runtime": 0.5527, "eval_samples_per_second": 77.804, "eval_steps_per_second": 1.809, "step": 8100 }, { "epoch": 1.0570907194994785, "grad_norm": 7.866173267364502, "learning_rate": 7.507111111111112e-06, "loss": 0.9433, "step": 8110 }, { "epoch": 1.0583941605839415, "grad_norm": 9.581183433532715, "learning_rate": 7.502666666666667e-06, "loss": 1.0003, "step": 8120 }, { "epoch": 1.0596976016684045, "grad_norm": 9.06647777557373, "learning_rate": 7.4982222222222225e-06, "loss": 0.8665, "step": 8130 }, { "epoch": 1.0610010427528676, "grad_norm": 10.333742141723633, "learning_rate": 7.493777777777779e-06, "loss": 0.9426, "step": 8140 }, { "epoch": 1.0623044838373306, "grad_norm": 7.566351413726807, "learning_rate": 7.489333333333334e-06, "loss": 0.9917, "step": 8150 }, { "epoch": 1.0636079249217936, "grad_norm": 8.821928977966309, "learning_rate": 7.4848888888888895e-06, "loss": 0.8495, "step": 8160 }, { "epoch": 1.0649113660062566, "grad_norm": 9.295007705688477, "learning_rate": 7.480444444444444e-06, "loss": 0.8889, "step": 8170 }, { "epoch": 1.0662148070907196, "grad_norm": 8.136956214904785, "learning_rate": 7.476000000000001e-06, "loss": 1.0155, "step": 8180 }, { "epoch": 1.0675182481751824, "grad_norm": 8.968605995178223, "learning_rate": 7.471555555555557e-06, "loss": 0.8767, "step": 8190 }, { "epoch": 1.0688216892596454, "grad_norm": 9.742268562316895, "learning_rate": 7.4671111111111115e-06, "loss": 0.9174, "step": 8200 }, { "epoch": 1.0688216892596454, "eval/acc": 39.53488540649414, "step": 8200 }, { "epoch": 1.0688216892596454, "eval_loss": 2.713195562362671, "eval_runtime": 0.5585, "eval_samples_per_second": 76.986, "eval_steps_per_second": 1.79, "step": 8200 }, { "epoch": 1.0701251303441084, "grad_norm": 10.187495231628418, "learning_rate": 7.462666666666667e-06, "loss": 0.9243, "step": 8210 }, { "epoch": 1.0714285714285714, "grad_norm": 8.872064590454102, "learning_rate": 7.458222222222224e-06, "loss": 0.9854, "step": 8220 }, { "epoch": 1.0727320125130344, "grad_norm": 8.593219757080078, "learning_rate": 7.4537777777777785e-06, "loss": 0.8947, "step": 8230 }, { "epoch": 1.0740354535974974, "grad_norm": 8.671141624450684, "learning_rate": 7.449333333333334e-06, "loss": 0.8982, "step": 8240 }, { "epoch": 1.0753388946819604, "grad_norm": 8.44241714477539, "learning_rate": 7.444888888888889e-06, "loss": 0.8571, "step": 8250 }, { "epoch": 1.0766423357664234, "grad_norm": 8.147622108459473, "learning_rate": 7.440444444444445e-06, "loss": 1.0003, "step": 8260 }, { "epoch": 1.0779457768508864, "grad_norm": 7.976443767547607, "learning_rate": 7.436000000000001e-06, "loss": 0.9462, "step": 8270 }, { "epoch": 1.0792492179353492, "grad_norm": 11.551979064941406, "learning_rate": 7.431555555555556e-06, "loss": 0.822, "step": 8280 }, { "epoch": 1.0805526590198122, "grad_norm": 9.746484756469727, "learning_rate": 7.427111111111112e-06, "loss": 0.9055, "step": 8290 }, { "epoch": 1.0818561001042752, "grad_norm": 8.56600570678711, "learning_rate": 7.422666666666667e-06, "loss": 0.9878, "step": 8300 }, { "epoch": 1.0818561001042752, "eval/acc": 41.86046600341797, "step": 8300 }, { "epoch": 1.0818561001042752, "eval_loss": 2.6222879886627197, "eval_runtime": 0.5525, "eval_samples_per_second": 77.832, "eval_steps_per_second": 1.81, "step": 8300 }, { "epoch": 1.0831595411887383, "grad_norm": 9.583529472351074, "learning_rate": 7.418222222222223e-06, "loss": 0.88, "step": 8310 }, { "epoch": 1.0844629822732013, "grad_norm": 8.403987884521484, "learning_rate": 7.413777777777779e-06, "loss": 0.9225, "step": 8320 }, { "epoch": 1.0857664233576643, "grad_norm": 8.449149131774902, "learning_rate": 7.409333333333334e-06, "loss": 0.8888, "step": 8330 }, { "epoch": 1.0870698644421273, "grad_norm": 9.25545883178711, "learning_rate": 7.4048888888888895e-06, "loss": 0.9197, "step": 8340 }, { "epoch": 1.0883733055265903, "grad_norm": 9.97526741027832, "learning_rate": 7.400444444444444e-06, "loss": 0.9448, "step": 8350 }, { "epoch": 1.0896767466110533, "grad_norm": 8.82689380645752, "learning_rate": 7.396000000000001e-06, "loss": 0.9921, "step": 8360 }, { "epoch": 1.090980187695516, "grad_norm": 9.498625755310059, "learning_rate": 7.3915555555555565e-06, "loss": 0.9156, "step": 8370 }, { "epoch": 1.092283628779979, "grad_norm": 9.056727409362793, "learning_rate": 7.387111111111111e-06, "loss": 0.9225, "step": 8380 }, { "epoch": 1.093587069864442, "grad_norm": 10.002091407775879, "learning_rate": 7.382666666666667e-06, "loss": 0.8715, "step": 8390 }, { "epoch": 1.094890510948905, "grad_norm": 10.686137199401855, "learning_rate": 7.378222222222224e-06, "loss": 0.9742, "step": 8400 }, { "epoch": 1.094890510948905, "eval/acc": 41.86046600341797, "step": 8400 }, { "epoch": 1.094890510948905, "eval_loss": 2.6428284645080566, "eval_runtime": 0.5508, "eval_samples_per_second": 78.075, "eval_steps_per_second": 1.816, "step": 8400 }, { "epoch": 1.0961939520333681, "grad_norm": 9.688252449035645, "learning_rate": 7.3737777777777785e-06, "loss": 0.8819, "step": 8410 }, { "epoch": 1.0974973931178311, "grad_norm": 8.46037769317627, "learning_rate": 7.369333333333334e-06, "loss": 0.8957, "step": 8420 }, { "epoch": 1.0988008342022941, "grad_norm": 12.241666793823242, "learning_rate": 7.364888888888889e-06, "loss": 0.9476, "step": 8430 }, { "epoch": 1.1001042752867571, "grad_norm": 9.35562801361084, "learning_rate": 7.360444444444445e-06, "loss": 0.8638, "step": 8440 }, { "epoch": 1.10140771637122, "grad_norm": 9.53718090057373, "learning_rate": 7.356000000000001e-06, "loss": 0.8484, "step": 8450 }, { "epoch": 1.102711157455683, "grad_norm": 8.596136093139648, "learning_rate": 7.351555555555556e-06, "loss": 0.8997, "step": 8460 }, { "epoch": 1.104014598540146, "grad_norm": 8.995074272155762, "learning_rate": 7.347111111111112e-06, "loss": 0.9136, "step": 8470 }, { "epoch": 1.105318039624609, "grad_norm": 9.429573059082031, "learning_rate": 7.342666666666667e-06, "loss": 0.8752, "step": 8480 }, { "epoch": 1.106621480709072, "grad_norm": 8.272865295410156, "learning_rate": 7.338222222222223e-06, "loss": 0.9411, "step": 8490 }, { "epoch": 1.107924921793535, "grad_norm": 9.662363052368164, "learning_rate": 7.333777777777779e-06, "loss": 0.9048, "step": 8500 }, { "epoch": 1.107924921793535, "eval/acc": 41.86046600341797, "step": 8500 }, { "epoch": 1.107924921793535, "eval_loss": 2.7484524250030518, "eval_runtime": 0.5518, "eval_samples_per_second": 77.922, "eval_steps_per_second": 1.812, "step": 8500 }, { "epoch": 1.109228362877998, "grad_norm": 10.379301071166992, "learning_rate": 7.329333333333334e-06, "loss": 0.9296, "step": 8510 }, { "epoch": 1.110531803962461, "grad_norm": 8.337886810302734, "learning_rate": 7.324888888888889e-06, "loss": 0.9018, "step": 8520 }, { "epoch": 1.1118352450469238, "grad_norm": 9.126009941101074, "learning_rate": 7.320444444444446e-06, "loss": 0.9447, "step": 8530 }, { "epoch": 1.1131386861313868, "grad_norm": 9.507220268249512, "learning_rate": 7.316000000000001e-06, "loss": 0.922, "step": 8540 }, { "epoch": 1.1144421272158498, "grad_norm": 9.640961647033691, "learning_rate": 7.3115555555555565e-06, "loss": 0.9118, "step": 8550 }, { "epoch": 1.1157455683003128, "grad_norm": 9.205474853515625, "learning_rate": 7.307111111111111e-06, "loss": 0.888, "step": 8560 }, { "epoch": 1.1170490093847758, "grad_norm": 8.839990615844727, "learning_rate": 7.302666666666667e-06, "loss": 0.9424, "step": 8570 }, { "epoch": 1.1183524504692388, "grad_norm": 9.424860000610352, "learning_rate": 7.2982222222222235e-06, "loss": 0.9087, "step": 8580 }, { "epoch": 1.1196558915537018, "grad_norm": 10.8524169921875, "learning_rate": 7.293777777777778e-06, "loss": 0.8574, "step": 8590 }, { "epoch": 1.1209593326381648, "grad_norm": 9.900219917297363, "learning_rate": 7.289333333333334e-06, "loss": 0.9162, "step": 8600 }, { "epoch": 1.1209593326381648, "eval/acc": 39.53488540649414, "step": 8600 }, { "epoch": 1.1209593326381648, "eval_loss": 2.7909610271453857, "eval_runtime": 0.5532, "eval_samples_per_second": 77.735, "eval_steps_per_second": 1.808, "step": 8600 }, { "epoch": 1.1222627737226278, "grad_norm": 8.885669708251953, "learning_rate": 7.284888888888889e-06, "loss": 0.8483, "step": 8610 }, { "epoch": 1.1235662148070906, "grad_norm": 10.051748275756836, "learning_rate": 7.2804444444444455e-06, "loss": 0.9287, "step": 8620 }, { "epoch": 1.1248696558915536, "grad_norm": 10.11687183380127, "learning_rate": 7.276000000000001e-06, "loss": 0.9642, "step": 8630 }, { "epoch": 1.1261730969760166, "grad_norm": 8.279516220092773, "learning_rate": 7.271555555555556e-06, "loss": 0.8276, "step": 8640 }, { "epoch": 1.1274765380604796, "grad_norm": 9.638724327087402, "learning_rate": 7.267111111111112e-06, "loss": 0.9309, "step": 8650 }, { "epoch": 1.1287799791449427, "grad_norm": 9.2564115524292, "learning_rate": 7.2626666666666665e-06, "loss": 0.8981, "step": 8660 }, { "epoch": 1.1300834202294057, "grad_norm": 11.3716402053833, "learning_rate": 7.258222222222223e-06, "loss": 0.9597, "step": 8670 }, { "epoch": 1.1313868613138687, "grad_norm": 8.568989753723145, "learning_rate": 7.253777777777779e-06, "loss": 0.8171, "step": 8680 }, { "epoch": 1.1326903023983317, "grad_norm": 9.600732803344727, "learning_rate": 7.249333333333334e-06, "loss": 0.9596, "step": 8690 }, { "epoch": 1.1339937434827947, "grad_norm": 8.98791790008545, "learning_rate": 7.244888888888889e-06, "loss": 0.8949, "step": 8700 }, { "epoch": 1.1339937434827947, "eval/acc": 37.20930099487305, "step": 8700 }, { "epoch": 1.1339937434827947, "eval_loss": 2.729647397994995, "eval_runtime": 0.5521, "eval_samples_per_second": 77.885, "eval_steps_per_second": 1.811, "step": 8700 }, { "epoch": 1.1352971845672575, "grad_norm": 7.747639179229736, "learning_rate": 7.240444444444446e-06, "loss": 0.9241, "step": 8710 }, { "epoch": 1.1366006256517205, "grad_norm": 8.257080078125, "learning_rate": 7.236000000000001e-06, "loss": 0.9498, "step": 8720 }, { "epoch": 1.1379040667361835, "grad_norm": 8.281147003173828, "learning_rate": 7.231555555555556e-06, "loss": 0.9207, "step": 8730 }, { "epoch": 1.1392075078206465, "grad_norm": 10.297804832458496, "learning_rate": 7.227111111111111e-06, "loss": 0.9211, "step": 8740 }, { "epoch": 1.1405109489051095, "grad_norm": 8.747051239013672, "learning_rate": 7.222666666666667e-06, "loss": 0.8808, "step": 8750 }, { "epoch": 1.1418143899895725, "grad_norm": 8.297418594360352, "learning_rate": 7.2182222222222235e-06, "loss": 0.9545, "step": 8760 }, { "epoch": 1.1431178310740355, "grad_norm": 8.677216529846191, "learning_rate": 7.213777777777778e-06, "loss": 0.8664, "step": 8770 }, { "epoch": 1.1444212721584983, "grad_norm": 9.342988014221191, "learning_rate": 7.209333333333334e-06, "loss": 0.8342, "step": 8780 }, { "epoch": 1.1457247132429613, "grad_norm": 9.079336166381836, "learning_rate": 7.204888888888889e-06, "loss": 0.9371, "step": 8790 }, { "epoch": 1.1470281543274243, "grad_norm": 9.262984275817871, "learning_rate": 7.200444444444445e-06, "loss": 0.8873, "step": 8800 }, { "epoch": 1.1470281543274243, "eval/acc": 41.86046600341797, "step": 8800 }, { "epoch": 1.1470281543274243, "eval_loss": 2.822016477584839, "eval_runtime": 0.5525, "eval_samples_per_second": 77.829, "eval_steps_per_second": 1.81, "step": 8800 }, { "epoch": 1.1483315954118873, "grad_norm": 10.144986152648926, "learning_rate": 7.196000000000001e-06, "loss": 0.938, "step": 8810 }, { "epoch": 1.1496350364963503, "grad_norm": 8.611014366149902, "learning_rate": 7.191555555555556e-06, "loss": 0.9392, "step": 8820 }, { "epoch": 1.1509384775808134, "grad_norm": 6.575359344482422, "learning_rate": 7.187111111111112e-06, "loss": 0.8865, "step": 8830 }, { "epoch": 1.1522419186652764, "grad_norm": 8.885351181030273, "learning_rate": 7.182666666666668e-06, "loss": 0.908, "step": 8840 }, { "epoch": 1.1535453597497394, "grad_norm": 10.43062973022461, "learning_rate": 7.178222222222223e-06, "loss": 0.9037, "step": 8850 }, { "epoch": 1.1548488008342024, "grad_norm": 9.297091484069824, "learning_rate": 7.173777777777779e-06, "loss": 0.9363, "step": 8860 }, { "epoch": 1.1561522419186652, "grad_norm": 9.116903305053711, "learning_rate": 7.1693333333333335e-06, "loss": 0.9396, "step": 8870 }, { "epoch": 1.1574556830031282, "grad_norm": 9.323028564453125, "learning_rate": 7.164888888888889e-06, "loss": 0.8927, "step": 8880 }, { "epoch": 1.1587591240875912, "grad_norm": 9.512086868286133, "learning_rate": 7.160444444444446e-06, "loss": 0.9617, "step": 8890 }, { "epoch": 1.1600625651720542, "grad_norm": 8.222494125366211, "learning_rate": 7.156000000000001e-06, "loss": 0.8694, "step": 8900 }, { "epoch": 1.1600625651720542, "eval/acc": 41.86046600341797, "step": 8900 }, { "epoch": 1.1600625651720542, "eval_loss": 2.7194950580596924, "eval_runtime": 0.5505, "eval_samples_per_second": 78.114, "eval_steps_per_second": 1.817, "step": 8900 }, { "epoch": 1.1613660062565172, "grad_norm": 10.078699111938477, "learning_rate": 7.151555555555556e-06, "loss": 0.9702, "step": 8910 }, { "epoch": 1.1626694473409802, "grad_norm": 7.739863395690918, "learning_rate": 7.147111111111111e-06, "loss": 0.9196, "step": 8920 }, { "epoch": 1.1639728884254432, "grad_norm": 9.16687297821045, "learning_rate": 7.142666666666668e-06, "loss": 0.8814, "step": 8930 }, { "epoch": 1.1652763295099062, "grad_norm": 9.084263801574707, "learning_rate": 7.138222222222223e-06, "loss": 0.8838, "step": 8940 }, { "epoch": 1.1665797705943692, "grad_norm": 9.483837127685547, "learning_rate": 7.133777777777778e-06, "loss": 0.8867, "step": 8950 }, { "epoch": 1.167883211678832, "grad_norm": 8.17795181274414, "learning_rate": 7.129333333333334e-06, "loss": 0.8548, "step": 8960 }, { "epoch": 1.169186652763295, "grad_norm": 10.508554458618164, "learning_rate": 7.124888888888889e-06, "loss": 0.894, "step": 8970 }, { "epoch": 1.170490093847758, "grad_norm": 9.039753913879395, "learning_rate": 7.120444444444445e-06, "loss": 0.9424, "step": 8980 }, { "epoch": 1.171793534932221, "grad_norm": 10.616847038269043, "learning_rate": 7.116000000000001e-06, "loss": 0.9333, "step": 8990 }, { "epoch": 1.173096976016684, "grad_norm": 11.284839630126953, "learning_rate": 7.111555555555556e-06, "loss": 0.879, "step": 9000 }, { "epoch": 1.173096976016684, "eval/acc": 39.53488540649414, "step": 9000 }, { "epoch": 1.173096976016684, "eval_loss": 2.869804620742798, "eval_runtime": 0.552, "eval_samples_per_second": 77.901, "eval_steps_per_second": 1.812, "step": 9000 }, { "epoch": 1.174400417101147, "grad_norm": 8.72288703918457, "learning_rate": 7.1071111111111115e-06, "loss": 0.8851, "step": 9010 }, { "epoch": 1.17570385818561, "grad_norm": 10.025834083557129, "learning_rate": 7.102666666666668e-06, "loss": 0.9131, "step": 9020 }, { "epoch": 1.177007299270073, "grad_norm": 9.2742338180542, "learning_rate": 7.098222222222223e-06, "loss": 0.9457, "step": 9030 }, { "epoch": 1.178310740354536, "grad_norm": 8.592848777770996, "learning_rate": 7.093777777777779e-06, "loss": 0.8704, "step": 9040 }, { "epoch": 1.1796141814389989, "grad_norm": 9.556818962097168, "learning_rate": 7.0893333333333334e-06, "loss": 0.9106, "step": 9050 }, { "epoch": 1.1809176225234619, "grad_norm": 9.446195602416992, "learning_rate": 7.084888888888889e-06, "loss": 0.8518, "step": 9060 }, { "epoch": 1.182221063607925, "grad_norm": 8.797647476196289, "learning_rate": 7.080444444444446e-06, "loss": 0.927, "step": 9070 }, { "epoch": 1.183524504692388, "grad_norm": 7.664761066436768, "learning_rate": 7.0760000000000005e-06, "loss": 0.8785, "step": 9080 }, { "epoch": 1.184827945776851, "grad_norm": 8.273712158203125, "learning_rate": 7.071555555555556e-06, "loss": 0.9844, "step": 9090 }, { "epoch": 1.186131386861314, "grad_norm": 8.067178726196289, "learning_rate": 7.067111111111111e-06, "loss": 0.9642, "step": 9100 }, { "epoch": 1.186131386861314, "eval/acc": 41.86046600341797, "step": 9100 }, { "epoch": 1.186131386861314, "eval_loss": 2.6579670906066895, "eval_runtime": 0.5546, "eval_samples_per_second": 77.529, "eval_steps_per_second": 1.803, "step": 9100 }, { "epoch": 1.187434827945777, "grad_norm": 9.999368667602539, "learning_rate": 7.062666666666668e-06, "loss": 0.9165, "step": 9110 }, { "epoch": 1.1887382690302397, "grad_norm": 11.097794532775879, "learning_rate": 7.058222222222223e-06, "loss": 0.8975, "step": 9120 }, { "epoch": 1.1900417101147027, "grad_norm": 9.338957786560059, "learning_rate": 7.053777777777778e-06, "loss": 0.9487, "step": 9130 }, { "epoch": 1.1913451511991657, "grad_norm": 8.180441856384277, "learning_rate": 7.049333333333334e-06, "loss": 0.8133, "step": 9140 }, { "epoch": 1.1926485922836287, "grad_norm": 9.26723575592041, "learning_rate": 7.04488888888889e-06, "loss": 0.8796, "step": 9150 }, { "epoch": 1.1939520333680917, "grad_norm": 10.798274993896484, "learning_rate": 7.040444444444445e-06, "loss": 0.9258, "step": 9160 }, { "epoch": 1.1952554744525548, "grad_norm": 9.296907424926758, "learning_rate": 7.036000000000001e-06, "loss": 0.9351, "step": 9170 }, { "epoch": 1.1965589155370178, "grad_norm": 9.426993370056152, "learning_rate": 7.031555555555556e-06, "loss": 0.9557, "step": 9180 }, { "epoch": 1.1978623566214808, "grad_norm": 8.745172500610352, "learning_rate": 7.0271111111111114e-06, "loss": 0.9959, "step": 9190 }, { "epoch": 1.1991657977059438, "grad_norm": 8.444724082946777, "learning_rate": 7.022666666666668e-06, "loss": 0.8816, "step": 9200 }, { "epoch": 1.1991657977059438, "eval/acc": 39.53488540649414, "step": 9200 }, { "epoch": 1.1991657977059438, "eval_loss": 2.6600775718688965, "eval_runtime": 0.5567, "eval_samples_per_second": 77.247, "eval_steps_per_second": 1.796, "step": 9200 }, { "epoch": 1.2004692387904066, "grad_norm": 9.101587295532227, "learning_rate": 7.018222222222223e-06, "loss": 0.9375, "step": 9210 }, { "epoch": 1.2017726798748696, "grad_norm": 9.597663879394531, "learning_rate": 7.0137777777777785e-06, "loss": 0.9135, "step": 9220 }, { "epoch": 1.2030761209593326, "grad_norm": 9.677689552307129, "learning_rate": 7.009333333333333e-06, "loss": 0.9415, "step": 9230 }, { "epoch": 1.2043795620437956, "grad_norm": 8.487961769104004, "learning_rate": 7.00488888888889e-06, "loss": 0.9985, "step": 9240 }, { "epoch": 1.2056830031282586, "grad_norm": 8.874425888061523, "learning_rate": 7.000444444444446e-06, "loss": 0.8465, "step": 9250 }, { "epoch": 1.2069864442127216, "grad_norm": 10.664003372192383, "learning_rate": 6.9960000000000004e-06, "loss": 1.0205, "step": 9260 }, { "epoch": 1.2082898852971846, "grad_norm": 9.072754859924316, "learning_rate": 6.991555555555556e-06, "loss": 0.9317, "step": 9270 }, { "epoch": 1.2095933263816476, "grad_norm": 10.553153038024902, "learning_rate": 6.987111111111111e-06, "loss": 0.9212, "step": 9280 }, { "epoch": 1.2108967674661106, "grad_norm": 7.488556385040283, "learning_rate": 6.9826666666666675e-06, "loss": 0.8059, "step": 9290 }, { "epoch": 1.2122002085505734, "grad_norm": 10.70551586151123, "learning_rate": 6.978222222222223e-06, "loss": 0.949, "step": 9300 }, { "epoch": 1.2122002085505734, "eval/acc": 41.86046600341797, "step": 9300 }, { "epoch": 1.2122002085505734, "eval_loss": 2.724426031112671, "eval_runtime": 0.5754, "eval_samples_per_second": 74.733, "eval_steps_per_second": 1.738, "step": 9300 }, { "epoch": 1.2135036496350364, "grad_norm": 8.544173240661621, "learning_rate": 6.973777777777778e-06, "loss": 0.8677, "step": 9310 }, { "epoch": 1.2148070907194994, "grad_norm": 9.747323036193848, "learning_rate": 6.969333333333334e-06, "loss": 0.9325, "step": 9320 }, { "epoch": 1.2161105318039624, "grad_norm": 10.305643081665039, "learning_rate": 6.96488888888889e-06, "loss": 0.8978, "step": 9330 }, { "epoch": 1.2174139728884255, "grad_norm": 9.521451950073242, "learning_rate": 6.960444444444445e-06, "loss": 0.9385, "step": 9340 }, { "epoch": 1.2187174139728885, "grad_norm": 9.554557800292969, "learning_rate": 6.956000000000001e-06, "loss": 0.806, "step": 9350 }, { "epoch": 1.2200208550573515, "grad_norm": 9.167433738708496, "learning_rate": 6.951555555555556e-06, "loss": 0.8892, "step": 9360 }, { "epoch": 1.2213242961418145, "grad_norm": 9.763519287109375, "learning_rate": 6.947111111111111e-06, "loss": 0.956, "step": 9370 }, { "epoch": 1.2226277372262775, "grad_norm": 10.049515724182129, "learning_rate": 6.942666666666668e-06, "loss": 1.0004, "step": 9380 }, { "epoch": 1.2239311783107403, "grad_norm": 9.015792846679688, "learning_rate": 6.938222222222223e-06, "loss": 0.8952, "step": 9390 }, { "epoch": 1.2252346193952033, "grad_norm": 8.805821418762207, "learning_rate": 6.9337777777777784e-06, "loss": 0.9088, "step": 9400 }, { "epoch": 1.2252346193952033, "eval/acc": 41.86046600341797, "step": 9400 }, { "epoch": 1.2252346193952033, "eval_loss": 2.7320239543914795, "eval_runtime": 0.5515, "eval_samples_per_second": 77.971, "eval_steps_per_second": 1.813, "step": 9400 }, { "epoch": 1.2265380604796663, "grad_norm": 9.7424955368042, "learning_rate": 6.929333333333333e-06, "loss": 0.8769, "step": 9410 }, { "epoch": 1.2278415015641293, "grad_norm": 9.860488891601562, "learning_rate": 6.92488888888889e-06, "loss": 0.9535, "step": 9420 }, { "epoch": 1.2291449426485923, "grad_norm": 9.606118202209473, "learning_rate": 6.9204444444444455e-06, "loss": 0.8852, "step": 9430 }, { "epoch": 1.2304483837330553, "grad_norm": 8.621418952941895, "learning_rate": 6.916e-06, "loss": 0.8938, "step": 9440 }, { "epoch": 1.2317518248175183, "grad_norm": 9.280407905578613, "learning_rate": 6.911555555555556e-06, "loss": 0.8569, "step": 9450 }, { "epoch": 1.2330552659019811, "grad_norm": 9.812846183776855, "learning_rate": 6.907111111111113e-06, "loss": 0.9469, "step": 9460 }, { "epoch": 1.2343587069864441, "grad_norm": 10.344027519226074, "learning_rate": 6.9026666666666674e-06, "loss": 0.9226, "step": 9470 }, { "epoch": 1.2356621480709071, "grad_norm": 9.68948745727539, "learning_rate": 6.898222222222223e-06, "loss": 0.9407, "step": 9480 }, { "epoch": 1.2369655891553701, "grad_norm": 8.801238059997559, "learning_rate": 6.893777777777778e-06, "loss": 0.918, "step": 9490 }, { "epoch": 1.2382690302398331, "grad_norm": 10.129354476928711, "learning_rate": 6.889333333333334e-06, "loss": 0.8354, "step": 9500 }, { "epoch": 1.2382690302398331, "eval/acc": 41.86046600341797, "step": 9500 }, { "epoch": 1.2382690302398331, "eval_loss": 2.6192688941955566, "eval_runtime": 0.5967, "eval_samples_per_second": 72.062, "eval_steps_per_second": 1.676, "step": 9500 }, { "epoch": 1.2395724713242962, "grad_norm": 8.593018531799316, "learning_rate": 6.88488888888889e-06, "loss": 0.8486, "step": 9510 }, { "epoch": 1.2408759124087592, "grad_norm": 10.093417167663574, "learning_rate": 6.880444444444445e-06, "loss": 0.8747, "step": 9520 }, { "epoch": 1.2421793534932222, "grad_norm": 10.845832824707031, "learning_rate": 6.876000000000001e-06, "loss": 0.9131, "step": 9530 }, { "epoch": 1.2434827945776852, "grad_norm": 8.653305053710938, "learning_rate": 6.871555555555556e-06, "loss": 0.8911, "step": 9540 }, { "epoch": 1.244786235662148, "grad_norm": 9.522208213806152, "learning_rate": 6.867111111111112e-06, "loss": 0.9512, "step": 9550 }, { "epoch": 1.246089676746611, "grad_norm": 10.715890884399414, "learning_rate": 6.862666666666668e-06, "loss": 0.7942, "step": 9560 }, { "epoch": 1.247393117831074, "grad_norm": 9.762306213378906, "learning_rate": 6.858222222222223e-06, "loss": 0.8735, "step": 9570 }, { "epoch": 1.248696558915537, "grad_norm": 8.764912605285645, "learning_rate": 6.853777777777778e-06, "loss": 0.9222, "step": 9580 }, { "epoch": 1.25, "grad_norm": 8.265697479248047, "learning_rate": 6.849333333333333e-06, "loss": 0.9455, "step": 9590 }, { "epoch": 1.251303441084463, "grad_norm": 8.233742713928223, "learning_rate": 6.84488888888889e-06, "loss": 0.8745, "step": 9600 }, { "epoch": 1.251303441084463, "eval/acc": 41.86046600341797, "step": 9600 }, { "epoch": 1.251303441084463, "eval_loss": 2.7820498943328857, "eval_runtime": 0.5501, "eval_samples_per_second": 78.165, "eval_steps_per_second": 1.818, "step": 9600 }, { "epoch": 1.252606882168926, "grad_norm": 10.830070495605469, "learning_rate": 6.8404444444444454e-06, "loss": 0.9719, "step": 9610 }, { "epoch": 1.253910323253389, "grad_norm": 11.779263496398926, "learning_rate": 6.836e-06, "loss": 0.9731, "step": 9620 }, { "epoch": 1.255213764337852, "grad_norm": 7.983153820037842, "learning_rate": 6.831555555555556e-06, "loss": 0.8534, "step": 9630 }, { "epoch": 1.2565172054223148, "grad_norm": 11.44545841217041, "learning_rate": 6.8271111111111125e-06, "loss": 0.9109, "step": 9640 }, { "epoch": 1.2578206465067778, "grad_norm": 8.1010103225708, "learning_rate": 6.822666666666667e-06, "loss": 0.919, "step": 9650 }, { "epoch": 1.2591240875912408, "grad_norm": 8.409565925598145, "learning_rate": 6.818222222222223e-06, "loss": 0.9451, "step": 9660 }, { "epoch": 1.2604275286757038, "grad_norm": 9.691168785095215, "learning_rate": 6.813777777777778e-06, "loss": 0.892, "step": 9670 }, { "epoch": 1.2617309697601669, "grad_norm": 10.412053108215332, "learning_rate": 6.809333333333334e-06, "loss": 0.8998, "step": 9680 }, { "epoch": 1.2630344108446299, "grad_norm": 10.049180030822754, "learning_rate": 6.80488888888889e-06, "loss": 0.8622, "step": 9690 }, { "epoch": 1.2643378519290929, "grad_norm": 9.016233444213867, "learning_rate": 6.800444444444445e-06, "loss": 0.9278, "step": 9700 }, { "epoch": 1.2643378519290929, "eval/acc": 41.86046600341797, "step": 9700 }, { "epoch": 1.2643378519290929, "eval_loss": 2.7881505489349365, "eval_runtime": 0.5523, "eval_samples_per_second": 77.86, "eval_steps_per_second": 1.811, "step": 9700 }, { "epoch": 1.2656412930135557, "grad_norm": 9.524649620056152, "learning_rate": 6.796000000000001e-06, "loss": 0.8966, "step": 9710 }, { "epoch": 1.2669447340980189, "grad_norm": 10.176855087280273, "learning_rate": 6.7915555555555555e-06, "loss": 0.9484, "step": 9720 }, { "epoch": 1.2682481751824817, "grad_norm": 8.774044036865234, "learning_rate": 6.787111111111112e-06, "loss": 0.8667, "step": 9730 }, { "epoch": 1.2695516162669447, "grad_norm": 8.964950561523438, "learning_rate": 6.782666666666668e-06, "loss": 0.8335, "step": 9740 }, { "epoch": 1.2708550573514077, "grad_norm": 9.385854721069336, "learning_rate": 6.778222222222223e-06, "loss": 0.8379, "step": 9750 }, { "epoch": 1.2721584984358707, "grad_norm": 10.424607276916504, "learning_rate": 6.773777777777778e-06, "loss": 0.9662, "step": 9760 }, { "epoch": 1.2734619395203337, "grad_norm": 7.88417387008667, "learning_rate": 6.769333333333335e-06, "loss": 0.9382, "step": 9770 }, { "epoch": 1.2747653806047967, "grad_norm": 10.213610649108887, "learning_rate": 6.76488888888889e-06, "loss": 0.8691, "step": 9780 }, { "epoch": 1.2760688216892597, "grad_norm": 9.095560073852539, "learning_rate": 6.760444444444445e-06, "loss": 0.9166, "step": 9790 }, { "epoch": 1.2773722627737225, "grad_norm": 8.787574768066406, "learning_rate": 6.756e-06, "loss": 0.8944, "step": 9800 }, { "epoch": 1.2773722627737225, "eval/acc": 41.86046600341797, "step": 9800 }, { "epoch": 1.2773722627737225, "eval_loss": 2.719513416290283, "eval_runtime": 0.5524, "eval_samples_per_second": 77.849, "eval_steps_per_second": 1.81, "step": 9800 }, { "epoch": 1.2786757038581857, "grad_norm": 8.333978652954102, "learning_rate": 6.751555555555556e-06, "loss": 0.8888, "step": 9810 }, { "epoch": 1.2799791449426485, "grad_norm": 10.098021507263184, "learning_rate": 6.7471111111111124e-06, "loss": 0.9634, "step": 9820 }, { "epoch": 1.2812825860271115, "grad_norm": 9.07473373413086, "learning_rate": 6.742666666666667e-06, "loss": 0.9637, "step": 9830 }, { "epoch": 1.2825860271115745, "grad_norm": 7.207655429840088, "learning_rate": 6.738222222222223e-06, "loss": 0.9221, "step": 9840 }, { "epoch": 1.2838894681960376, "grad_norm": 8.756492614746094, "learning_rate": 6.733777777777778e-06, "loss": 0.8939, "step": 9850 }, { "epoch": 1.2851929092805006, "grad_norm": 8.18359375, "learning_rate": 6.729333333333334e-06, "loss": 0.9374, "step": 9860 }, { "epoch": 1.2864963503649636, "grad_norm": 8.941536903381348, "learning_rate": 6.72488888888889e-06, "loss": 0.9355, "step": 9870 }, { "epoch": 1.2877997914494266, "grad_norm": 10.77589225769043, "learning_rate": 6.720444444444445e-06, "loss": 0.9452, "step": 9880 }, { "epoch": 1.2891032325338894, "grad_norm": 9.430168151855469, "learning_rate": 6.716000000000001e-06, "loss": 0.9284, "step": 9890 }, { "epoch": 1.2904066736183524, "grad_norm": 8.396258354187012, "learning_rate": 6.7115555555555554e-06, "loss": 0.9645, "step": 9900 }, { "epoch": 1.2904066736183524, "eval/acc": 41.86046600341797, "step": 9900 }, { "epoch": 1.2904066736183524, "eval_loss": 2.688626527786255, "eval_runtime": 0.5512, "eval_samples_per_second": 78.016, "eval_steps_per_second": 1.814, "step": 9900 }, { "epoch": 1.2917101147028154, "grad_norm": 9.193182945251465, "learning_rate": 6.707111111111112e-06, "loss": 0.9774, "step": 9910 }, { "epoch": 1.2930135557872784, "grad_norm": 8.28522777557373, "learning_rate": 6.702666666666668e-06, "loss": 0.9131, "step": 9920 }, { "epoch": 1.2943169968717414, "grad_norm": 8.093572616577148, "learning_rate": 6.6982222222222225e-06, "loss": 0.8372, "step": 9930 }, { "epoch": 1.2956204379562044, "grad_norm": 9.774358749389648, "learning_rate": 6.693777777777778e-06, "loss": 0.9098, "step": 9940 }, { "epoch": 1.2969238790406674, "grad_norm": 9.629504203796387, "learning_rate": 6.689333333333335e-06, "loss": 0.8035, "step": 9950 }, { "epoch": 1.2982273201251304, "grad_norm": 8.22866439819336, "learning_rate": 6.68488888888889e-06, "loss": 0.8679, "step": 9960 }, { "epoch": 1.2995307612095934, "grad_norm": 9.282711029052734, "learning_rate": 6.680444444444445e-06, "loss": 0.8877, "step": 9970 }, { "epoch": 1.3008342022940562, "grad_norm": 9.920709609985352, "learning_rate": 6.676e-06, "loss": 0.8942, "step": 9980 }, { "epoch": 1.3021376433785192, "grad_norm": 8.841803550720215, "learning_rate": 6.671555555555556e-06, "loss": 0.8912, "step": 9990 }, { "epoch": 1.3034410844629822, "grad_norm": 7.006541728973389, "learning_rate": 6.667111111111112e-06, "loss": 0.8737, "step": 10000 }, { "epoch": 1.3034410844629822, "eval/acc": 41.86046600341797, "step": 10000 }, { "epoch": 1.3034410844629822, "eval_loss": 2.7103235721588135, "eval_runtime": 0.5532, "eval_samples_per_second": 77.73, "eval_steps_per_second": 1.808, "step": 10000 }, { "epoch": 1.3047445255474452, "grad_norm": 9.201408386230469, "learning_rate": 6.662666666666667e-06, "loss": 0.8694, "step": 10010 }, { "epoch": 1.3060479666319083, "grad_norm": 8.531641960144043, "learning_rate": 6.658222222222223e-06, "loss": 0.8899, "step": 10020 }, { "epoch": 1.3073514077163713, "grad_norm": 7.873185634613037, "learning_rate": 6.653777777777778e-06, "loss": 0.9146, "step": 10030 }, { "epoch": 1.3086548488008343, "grad_norm": 8.658695220947266, "learning_rate": 6.649333333333334e-06, "loss": 0.8503, "step": 10040 }, { "epoch": 1.309958289885297, "grad_norm": 6.5535969734191895, "learning_rate": 6.64488888888889e-06, "loss": 0.8938, "step": 10050 }, { "epoch": 1.3112617309697603, "grad_norm": 8.543478012084961, "learning_rate": 6.640444444444445e-06, "loss": 0.9514, "step": 10060 }, { "epoch": 1.312565172054223, "grad_norm": 9.992765426635742, "learning_rate": 6.6360000000000005e-06, "loss": 0.8396, "step": 10070 }, { "epoch": 1.313868613138686, "grad_norm": 8.397747993469238, "learning_rate": 6.631555555555557e-06, "loss": 0.9288, "step": 10080 }, { "epoch": 1.315172054223149, "grad_norm": 8.088634490966797, "learning_rate": 6.627111111111112e-06, "loss": 0.9153, "step": 10090 }, { "epoch": 1.316475495307612, "grad_norm": 9.067995071411133, "learning_rate": 6.622666666666668e-06, "loss": 0.8775, "step": 10100 }, { "epoch": 1.316475495307612, "eval/acc": 41.86046600341797, "step": 10100 }, { "epoch": 1.316475495307612, "eval_loss": 2.6987268924713135, "eval_runtime": 0.5522, "eval_samples_per_second": 77.866, "eval_steps_per_second": 1.811, "step": 10100 }, { "epoch": 1.317778936392075, "grad_norm": 9.6102876663208, "learning_rate": 6.618222222222222e-06, "loss": 0.8614, "step": 10110 }, { "epoch": 1.3190823774765381, "grad_norm": 9.03238582611084, "learning_rate": 6.613777777777778e-06, "loss": 0.8847, "step": 10120 }, { "epoch": 1.3203858185610011, "grad_norm": 9.015857696533203, "learning_rate": 6.609333333333335e-06, "loss": 0.9204, "step": 10130 }, { "epoch": 1.321689259645464, "grad_norm": 10.332176208496094, "learning_rate": 6.6048888888888895e-06, "loss": 0.9905, "step": 10140 }, { "epoch": 1.3229927007299271, "grad_norm": 9.234716415405273, "learning_rate": 6.600444444444445e-06, "loss": 0.8545, "step": 10150 }, { "epoch": 1.32429614181439, "grad_norm": 7.404505729675293, "learning_rate": 6.596e-06, "loss": 0.8498, "step": 10160 }, { "epoch": 1.325599582898853, "grad_norm": 9.298205375671387, "learning_rate": 6.5915555555555566e-06, "loss": 0.8812, "step": 10170 }, { "epoch": 1.326903023983316, "grad_norm": 9.275193214416504, "learning_rate": 6.587111111111112e-06, "loss": 1.0253, "step": 10180 }, { "epoch": 1.328206465067779, "grad_norm": 8.494884490966797, "learning_rate": 6.582666666666667e-06, "loss": 0.8705, "step": 10190 }, { "epoch": 1.329509906152242, "grad_norm": 9.347415924072266, "learning_rate": 6.578222222222223e-06, "loss": 0.8584, "step": 10200 }, { "epoch": 1.329509906152242, "eval/acc": 41.86046600341797, "step": 10200 }, { "epoch": 1.329509906152242, "eval_loss": 2.7637178897857666, "eval_runtime": 0.7156, "eval_samples_per_second": 60.087, "eval_steps_per_second": 1.397, "step": 10200 }, { "epoch": 1.330813347236705, "grad_norm": 11.133914947509766, "learning_rate": 6.573777777777778e-06, "loss": 0.9911, "step": 10210 }, { "epoch": 1.332116788321168, "grad_norm": 8.803244590759277, "learning_rate": 6.569333333333334e-06, "loss": 0.9088, "step": 10220 }, { "epoch": 1.3334202294056308, "grad_norm": 8.14785385131836, "learning_rate": 6.56488888888889e-06, "loss": 0.8896, "step": 10230 }, { "epoch": 1.3347236704900938, "grad_norm": 8.227129936218262, "learning_rate": 6.560444444444445e-06, "loss": 0.8486, "step": 10240 }, { "epoch": 1.3360271115745568, "grad_norm": 8.84304141998291, "learning_rate": 6.556e-06, "loss": 0.8333, "step": 10250 }, { "epoch": 1.3373305526590198, "grad_norm": 8.489936828613281, "learning_rate": 6.551555555555557e-06, "loss": 0.9373, "step": 10260 }, { "epoch": 1.3386339937434828, "grad_norm": 9.00913143157959, "learning_rate": 6.547111111111112e-06, "loss": 0.838, "step": 10270 }, { "epoch": 1.3399374348279458, "grad_norm": 9.798420906066895, "learning_rate": 6.5426666666666675e-06, "loss": 0.9247, "step": 10280 }, { "epoch": 1.3412408759124088, "grad_norm": 8.319931030273438, "learning_rate": 6.538222222222222e-06, "loss": 0.9001, "step": 10290 }, { "epoch": 1.3425443169968718, "grad_norm": 9.302659034729004, "learning_rate": 6.533777777777778e-06, "loss": 0.8778, "step": 10300 }, { "epoch": 1.3425443169968718, "eval/acc": 41.86046600341797, "step": 10300 }, { "epoch": 1.3425443169968718, "eval_loss": 2.8571131229400635, "eval_runtime": 0.5527, "eval_samples_per_second": 77.795, "eval_steps_per_second": 1.809, "step": 10300 }, { "epoch": 1.3438477580813348, "grad_norm": 7.9171462059021, "learning_rate": 6.5293333333333346e-06, "loss": 0.8653, "step": 10310 }, { "epoch": 1.3451511991657976, "grad_norm": 9.129400253295898, "learning_rate": 6.524888888888889e-06, "loss": 1.0094, "step": 10320 }, { "epoch": 1.3464546402502606, "grad_norm": 9.533576011657715, "learning_rate": 6.520444444444445e-06, "loss": 0.9017, "step": 10330 }, { "epoch": 1.3477580813347236, "grad_norm": 8.139508247375488, "learning_rate": 6.516e-06, "loss": 0.9967, "step": 10340 }, { "epoch": 1.3490615224191866, "grad_norm": 9.283590316772461, "learning_rate": 6.5115555555555565e-06, "loss": 0.9135, "step": 10350 }, { "epoch": 1.3503649635036497, "grad_norm": 12.347064971923828, "learning_rate": 6.507111111111112e-06, "loss": 0.8906, "step": 10360 }, { "epoch": 1.3516684045881127, "grad_norm": 8.940449714660645, "learning_rate": 6.502666666666667e-06, "loss": 0.8458, "step": 10370 }, { "epoch": 1.3529718456725757, "grad_norm": 10.323555946350098, "learning_rate": 6.498222222222223e-06, "loss": 0.9233, "step": 10380 }, { "epoch": 1.3542752867570385, "grad_norm": 8.252403259277344, "learning_rate": 6.493777777777779e-06, "loss": 0.8813, "step": 10390 }, { "epoch": 1.3555787278415017, "grad_norm": 8.905969619750977, "learning_rate": 6.489333333333334e-06, "loss": 0.9202, "step": 10400 }, { "epoch": 1.3555787278415017, "eval/acc": 39.53488540649414, "step": 10400 }, { "epoch": 1.3555787278415017, "eval_loss": 2.795374631881714, "eval_runtime": 0.5527, "eval_samples_per_second": 77.803, "eval_steps_per_second": 1.809, "step": 10400 }, { "epoch": 1.3568821689259645, "grad_norm": 9.175992012023926, "learning_rate": 6.48488888888889e-06, "loss": 0.8884, "step": 10410 }, { "epoch": 1.3581856100104275, "grad_norm": 7.678065299987793, "learning_rate": 6.480444444444445e-06, "loss": 0.8813, "step": 10420 }, { "epoch": 1.3594890510948905, "grad_norm": 8.357683181762695, "learning_rate": 6.476e-06, "loss": 0.9146, "step": 10430 }, { "epoch": 1.3607924921793535, "grad_norm": 8.358731269836426, "learning_rate": 6.471555555555557e-06, "loss": 0.8483, "step": 10440 }, { "epoch": 1.3620959332638165, "grad_norm": 8.581130027770996, "learning_rate": 6.467111111111112e-06, "loss": 0.8168, "step": 10450 }, { "epoch": 1.3633993743482795, "grad_norm": 8.504261016845703, "learning_rate": 6.462666666666667e-06, "loss": 0.8887, "step": 10460 }, { "epoch": 1.3647028154327425, "grad_norm": 11.222940444946289, "learning_rate": 6.458222222222222e-06, "loss": 0.9576, "step": 10470 }, { "epoch": 1.3660062565172053, "grad_norm": 10.263794898986816, "learning_rate": 6.453777777777779e-06, "loss": 0.8617, "step": 10480 }, { "epoch": 1.3673096976016685, "grad_norm": 8.209610939025879, "learning_rate": 6.4493333333333345e-06, "loss": 0.9442, "step": 10490 }, { "epoch": 1.3686131386861313, "grad_norm": 9.299262046813965, "learning_rate": 6.444888888888889e-06, "loss": 0.8732, "step": 10500 }, { "epoch": 1.3686131386861313, "eval/acc": 39.53488540649414, "step": 10500 }, { "epoch": 1.3686131386861313, "eval_loss": 2.783844470977783, "eval_runtime": 0.5526, "eval_samples_per_second": 77.814, "eval_steps_per_second": 1.81, "step": 10500 }, { "epoch": 1.3699165797705943, "grad_norm": 8.058958053588867, "learning_rate": 6.440444444444445e-06, "loss": 0.9469, "step": 10510 }, { "epoch": 1.3712200208550573, "grad_norm": 8.55029582977295, "learning_rate": 6.436e-06, "loss": 0.9351, "step": 10520 }, { "epoch": 1.3725234619395204, "grad_norm": 10.340177536010742, "learning_rate": 6.431555555555556e-06, "loss": 0.9086, "step": 10530 }, { "epoch": 1.3738269030239834, "grad_norm": 9.552031517028809, "learning_rate": 6.427111111111112e-06, "loss": 0.912, "step": 10540 }, { "epoch": 1.3751303441084464, "grad_norm": 10.294907569885254, "learning_rate": 6.422666666666667e-06, "loss": 0.9435, "step": 10550 }, { "epoch": 1.3764337851929094, "grad_norm": 9.045071601867676, "learning_rate": 6.418222222222223e-06, "loss": 0.8511, "step": 10560 }, { "epoch": 1.3777372262773722, "grad_norm": 9.00757122039795, "learning_rate": 6.413777777777779e-06, "loss": 0.8888, "step": 10570 }, { "epoch": 1.3790406673618352, "grad_norm": 10.765233039855957, "learning_rate": 6.409333333333334e-06, "loss": 0.9298, "step": 10580 }, { "epoch": 1.3803441084462982, "grad_norm": 9.451735496520996, "learning_rate": 6.40488888888889e-06, "loss": 0.976, "step": 10590 }, { "epoch": 1.3816475495307612, "grad_norm": 8.657115936279297, "learning_rate": 6.4004444444444446e-06, "loss": 0.8349, "step": 10600 }, { "epoch": 1.3816475495307612, "eval/acc": 39.53488540649414, "step": 10600 }, { "epoch": 1.3816475495307612, "eval_loss": 2.866401433944702, "eval_runtime": 0.6246, "eval_samples_per_second": 68.841, "eval_steps_per_second": 1.601, "step": 10600 }, { "epoch": 1.3829509906152242, "grad_norm": 9.011321067810059, "learning_rate": 6.396e-06, "loss": 0.8357, "step": 10610 }, { "epoch": 1.3842544316996872, "grad_norm": 8.620442390441895, "learning_rate": 6.391555555555557e-06, "loss": 0.981, "step": 10620 }, { "epoch": 1.3855578727841502, "grad_norm": 8.544589042663574, "learning_rate": 6.387111111111112e-06, "loss": 0.8763, "step": 10630 }, { "epoch": 1.3868613138686132, "grad_norm": 9.019304275512695, "learning_rate": 6.382666666666667e-06, "loss": 0.921, "step": 10640 }, { "epoch": 1.3881647549530762, "grad_norm": 8.633360862731934, "learning_rate": 6.378222222222222e-06, "loss": 0.9333, "step": 10650 }, { "epoch": 1.389468196037539, "grad_norm": 8.078035354614258, "learning_rate": 6.373777777777779e-06, "loss": 0.8879, "step": 10660 }, { "epoch": 1.390771637122002, "grad_norm": 8.819733619689941, "learning_rate": 6.369333333333334e-06, "loss": 0.8902, "step": 10670 }, { "epoch": 1.392075078206465, "grad_norm": 10.02749252319336, "learning_rate": 6.364888888888889e-06, "loss": 0.9373, "step": 10680 }, { "epoch": 1.393378519290928, "grad_norm": 9.460182189941406, "learning_rate": 6.360444444444445e-06, "loss": 0.9681, "step": 10690 }, { "epoch": 1.394681960375391, "grad_norm": 8.766586303710938, "learning_rate": 6.356000000000001e-06, "loss": 0.8493, "step": 10700 }, { "epoch": 1.394681960375391, "eval/acc": 41.86046600341797, "step": 10700 }, { "epoch": 1.394681960375391, "eval_loss": 2.8115038871765137, "eval_runtime": 0.5558, "eval_samples_per_second": 77.361, "eval_steps_per_second": 1.799, "step": 10700 }, { "epoch": 1.395985401459854, "grad_norm": 11.57418155670166, "learning_rate": 6.351555555555556e-06, "loss": 0.9588, "step": 10710 }, { "epoch": 1.397288842544317, "grad_norm": 8.195137023925781, "learning_rate": 6.347111111111112e-06, "loss": 0.9191, "step": 10720 }, { "epoch": 1.3985922836287799, "grad_norm": 9.76877498626709, "learning_rate": 6.342666666666667e-06, "loss": 0.9118, "step": 10730 }, { "epoch": 1.399895724713243, "grad_norm": 8.484424591064453, "learning_rate": 6.3382222222222226e-06, "loss": 0.8327, "step": 10740 }, { "epoch": 1.4011991657977059, "grad_norm": 10.10142993927002, "learning_rate": 6.333777777777779e-06, "loss": 0.9729, "step": 10750 }, { "epoch": 1.4025026068821689, "grad_norm": 9.406108856201172, "learning_rate": 6.329333333333334e-06, "loss": 0.8831, "step": 10760 }, { "epoch": 1.4038060479666319, "grad_norm": 8.581574440002441, "learning_rate": 6.32488888888889e-06, "loss": 0.8678, "step": 10770 }, { "epoch": 1.405109489051095, "grad_norm": 9.275774955749512, "learning_rate": 6.3204444444444445e-06, "loss": 0.876, "step": 10780 }, { "epoch": 1.406412930135558, "grad_norm": 9.65285587310791, "learning_rate": 6.316000000000001e-06, "loss": 0.9449, "step": 10790 }, { "epoch": 1.407716371220021, "grad_norm": 8.811188697814941, "learning_rate": 6.311555555555557e-06, "loss": 0.8921, "step": 10800 }, { "epoch": 1.407716371220021, "eval/acc": 39.53488540649414, "step": 10800 }, { "epoch": 1.407716371220021, "eval_loss": 2.8162498474121094, "eval_runtime": 0.5542, "eval_samples_per_second": 77.585, "eval_steps_per_second": 1.804, "step": 10800 }, { "epoch": 1.409019812304484, "grad_norm": 8.699746131896973, "learning_rate": 6.3071111111111116e-06, "loss": 0.8355, "step": 10810 }, { "epoch": 1.4103232533889467, "grad_norm": 9.994730949401855, "learning_rate": 6.302666666666667e-06, "loss": 0.8612, "step": 10820 }, { "epoch": 1.41162669447341, "grad_norm": 9.046276092529297, "learning_rate": 6.298222222222222e-06, "loss": 0.9787, "step": 10830 }, { "epoch": 1.4129301355578727, "grad_norm": 7.892019748687744, "learning_rate": 6.293777777777779e-06, "loss": 0.8488, "step": 10840 }, { "epoch": 1.4142335766423357, "grad_norm": 9.220200538635254, "learning_rate": 6.289333333333334e-06, "loss": 0.8494, "step": 10850 }, { "epoch": 1.4155370177267987, "grad_norm": 10.144383430480957, "learning_rate": 6.284888888888889e-06, "loss": 0.9315, "step": 10860 }, { "epoch": 1.4168404588112617, "grad_norm": 9.87109375, "learning_rate": 6.280444444444445e-06, "loss": 0.8582, "step": 10870 }, { "epoch": 1.4181438998957248, "grad_norm": 9.461145401000977, "learning_rate": 6.2760000000000006e-06, "loss": 0.8907, "step": 10880 }, { "epoch": 1.4194473409801878, "grad_norm": 9.081710815429688, "learning_rate": 6.271555555555556e-06, "loss": 0.837, "step": 10890 }, { "epoch": 1.4207507820646508, "grad_norm": 13.004462242126465, "learning_rate": 6.267111111111112e-06, "loss": 0.9401, "step": 10900 }, { "epoch": 1.4207507820646508, "eval/acc": 41.86046600341797, "step": 10900 }, { "epoch": 1.4207507820646508, "eval_loss": 2.7814087867736816, "eval_runtime": 0.554, "eval_samples_per_second": 77.613, "eval_steps_per_second": 1.805, "step": 10900 }, { "epoch": 1.4220542231491136, "grad_norm": 8.624650001525879, "learning_rate": 6.262666666666667e-06, "loss": 0.8775, "step": 10910 }, { "epoch": 1.4233576642335766, "grad_norm": 8.703479766845703, "learning_rate": 6.2582222222222225e-06, "loss": 0.8844, "step": 10920 }, { "epoch": 1.4246611053180396, "grad_norm": 8.683279991149902, "learning_rate": 6.253777777777779e-06, "loss": 0.8558, "step": 10930 }, { "epoch": 1.4259645464025026, "grad_norm": 10.942120552062988, "learning_rate": 6.249333333333334e-06, "loss": 0.9372, "step": 10940 }, { "epoch": 1.4272679874869656, "grad_norm": 8.699433326721191, "learning_rate": 6.2448888888888896e-06, "loss": 0.8894, "step": 10950 }, { "epoch": 1.4285714285714286, "grad_norm": 9.938445091247559, "learning_rate": 6.240444444444444e-06, "loss": 0.8583, "step": 10960 }, { "epoch": 1.4298748696558916, "grad_norm": 8.051493644714355, "learning_rate": 6.236000000000001e-06, "loss": 0.9549, "step": 10970 }, { "epoch": 1.4311783107403544, "grad_norm": 8.52479362487793, "learning_rate": 6.231555555555557e-06, "loss": 0.8776, "step": 10980 }, { "epoch": 1.4324817518248176, "grad_norm": 8.406006813049316, "learning_rate": 6.2271111111111115e-06, "loss": 0.8878, "step": 10990 }, { "epoch": 1.4337851929092804, "grad_norm": 9.622314453125, "learning_rate": 6.222666666666667e-06, "loss": 0.9299, "step": 11000 }, { "epoch": 1.4337851929092804, "eval/acc": 44.1860466003418, "step": 11000 }, { "epoch": 1.4337851929092804, "eval_loss": 2.6992862224578857, "eval_runtime": 0.5512, "eval_samples_per_second": 78.008, "eval_steps_per_second": 1.814, "step": 11000 }, { "epoch": 1.4350886339937434, "grad_norm": 8.622066497802734, "learning_rate": 6.218222222222223e-06, "loss": 0.8806, "step": 11010 }, { "epoch": 1.4363920750782064, "grad_norm": 7.857709884643555, "learning_rate": 6.2137777777777786e-06, "loss": 0.8938, "step": 11020 }, { "epoch": 1.4376955161626694, "grad_norm": 9.03056526184082, "learning_rate": 6.209333333333334e-06, "loss": 0.9384, "step": 11030 }, { "epoch": 1.4389989572471324, "grad_norm": 9.456419944763184, "learning_rate": 6.204888888888889e-06, "loss": 0.9438, "step": 11040 }, { "epoch": 1.4403023983315955, "grad_norm": 8.256093978881836, "learning_rate": 6.200444444444445e-06, "loss": 0.8903, "step": 11050 }, { "epoch": 1.4416058394160585, "grad_norm": 10.614521980285645, "learning_rate": 6.196000000000001e-06, "loss": 0.9231, "step": 11060 }, { "epoch": 1.4429092805005213, "grad_norm": 11.964981079101562, "learning_rate": 6.191555555555556e-06, "loss": 0.9104, "step": 11070 }, { "epoch": 1.4442127215849845, "grad_norm": 9.800127983093262, "learning_rate": 6.187111111111112e-06, "loss": 1.0013, "step": 11080 }, { "epoch": 1.4455161626694473, "grad_norm": 9.091155052185059, "learning_rate": 6.182666666666667e-06, "loss": 0.875, "step": 11090 }, { "epoch": 1.4468196037539103, "grad_norm": 7.972758769989014, "learning_rate": 6.178222222222223e-06, "loss": 1.0118, "step": 11100 }, { "epoch": 1.4468196037539103, "eval/acc": 41.86046600341797, "step": 11100 }, { "epoch": 1.4468196037539103, "eval_loss": 2.5994858741760254, "eval_runtime": 0.5519, "eval_samples_per_second": 77.916, "eval_steps_per_second": 1.812, "step": 11100 }, { "epoch": 1.4481230448383733, "grad_norm": 9.793561935424805, "learning_rate": 6.173777777777779e-06, "loss": 0.8798, "step": 11110 }, { "epoch": 1.4494264859228363, "grad_norm": 8.747686386108398, "learning_rate": 6.169333333333334e-06, "loss": 0.9374, "step": 11120 }, { "epoch": 1.4507299270072993, "grad_norm": 7.503750801086426, "learning_rate": 6.1648888888888895e-06, "loss": 0.9173, "step": 11130 }, { "epoch": 1.4520333680917623, "grad_norm": 8.862283706665039, "learning_rate": 6.160444444444444e-06, "loss": 0.9371, "step": 11140 }, { "epoch": 1.4533368091762253, "grad_norm": 9.035390853881836, "learning_rate": 6.156000000000001e-06, "loss": 0.8905, "step": 11150 }, { "epoch": 1.454640250260688, "grad_norm": 7.301136493682861, "learning_rate": 6.1515555555555566e-06, "loss": 0.9289, "step": 11160 }, { "epoch": 1.4559436913451513, "grad_norm": 8.715147018432617, "learning_rate": 6.147111111111111e-06, "loss": 0.9882, "step": 11170 }, { "epoch": 1.4572471324296141, "grad_norm": 8.610602378845215, "learning_rate": 6.142666666666667e-06, "loss": 0.844, "step": 11180 }, { "epoch": 1.4585505735140771, "grad_norm": 8.293404579162598, "learning_rate": 6.138222222222223e-06, "loss": 0.7902, "step": 11190 }, { "epoch": 1.4598540145985401, "grad_norm": 10.559432029724121, "learning_rate": 6.1337777777777785e-06, "loss": 0.9425, "step": 11200 }, { "epoch": 1.4598540145985401, "eval/acc": 41.86046600341797, "step": 11200 }, { "epoch": 1.4598540145985401, "eval_loss": 2.852363348007202, "eval_runtime": 0.5528, "eval_samples_per_second": 77.781, "eval_steps_per_second": 1.809, "step": 11200 }, { "epoch": 1.4611574556830031, "grad_norm": 10.356833457946777, "learning_rate": 6.129333333333334e-06, "loss": 1.0593, "step": 11210 }, { "epoch": 1.4624608967674662, "grad_norm": 7.944422245025635, "learning_rate": 6.124888888888889e-06, "loss": 0.9051, "step": 11220 }, { "epoch": 1.4637643378519292, "grad_norm": 10.254863739013672, "learning_rate": 6.120444444444445e-06, "loss": 0.9235, "step": 11230 }, { "epoch": 1.4650677789363922, "grad_norm": 9.219043731689453, "learning_rate": 6.116000000000001e-06, "loss": 0.8744, "step": 11240 }, { "epoch": 1.466371220020855, "grad_norm": 8.306119918823242, "learning_rate": 6.111555555555556e-06, "loss": 0.8564, "step": 11250 }, { "epoch": 1.467674661105318, "grad_norm": 9.30771255493164, "learning_rate": 6.107111111111112e-06, "loss": 0.9322, "step": 11260 }, { "epoch": 1.468978102189781, "grad_norm": 8.586750030517578, "learning_rate": 6.102666666666667e-06, "loss": 0.8966, "step": 11270 }, { "epoch": 1.470281543274244, "grad_norm": 11.305912971496582, "learning_rate": 6.098222222222223e-06, "loss": 0.9073, "step": 11280 }, { "epoch": 1.471584984358707, "grad_norm": 9.06704044342041, "learning_rate": 6.093777777777779e-06, "loss": 0.9724, "step": 11290 }, { "epoch": 1.47288842544317, "grad_norm": 7.793592929840088, "learning_rate": 6.089333333333334e-06, "loss": 0.9722, "step": 11300 }, { "epoch": 1.47288842544317, "eval/acc": 44.1860466003418, "step": 11300 }, { "epoch": 1.47288842544317, "eval_loss": 2.65043306350708, "eval_runtime": 0.5525, "eval_samples_per_second": 77.826, "eval_steps_per_second": 1.81, "step": 11300 }, { "epoch": 1.474191866527633, "grad_norm": 9.574031829833984, "learning_rate": 6.084888888888889e-06, "loss": 0.9068, "step": 11310 }, { "epoch": 1.4754953076120958, "grad_norm": 8.477428436279297, "learning_rate": 6.080444444444445e-06, "loss": 0.8757, "step": 11320 }, { "epoch": 1.476798748696559, "grad_norm": 8.575848579406738, "learning_rate": 6.076000000000001e-06, "loss": 0.9199, "step": 11330 }, { "epoch": 1.4781021897810218, "grad_norm": 8.902753829956055, "learning_rate": 6.0715555555555565e-06, "loss": 0.8908, "step": 11340 }, { "epoch": 1.4794056308654848, "grad_norm": 8.576892852783203, "learning_rate": 6.067111111111111e-06, "loss": 0.8653, "step": 11350 }, { "epoch": 1.4807090719499478, "grad_norm": 8.837362289428711, "learning_rate": 6.062666666666667e-06, "loss": 0.843, "step": 11360 }, { "epoch": 1.4820125130344108, "grad_norm": 9.362685203552246, "learning_rate": 6.058222222222223e-06, "loss": 0.9083, "step": 11370 }, { "epoch": 1.4833159541188738, "grad_norm": 9.474995613098145, "learning_rate": 6.053777777777778e-06, "loss": 0.8347, "step": 11380 }, { "epoch": 1.4846193952033369, "grad_norm": 10.123291015625, "learning_rate": 6.049333333333334e-06, "loss": 0.8961, "step": 11390 }, { "epoch": 1.4859228362877999, "grad_norm": 9.163581848144531, "learning_rate": 6.044888888888889e-06, "loss": 0.9281, "step": 11400 }, { "epoch": 1.4859228362877999, "eval/acc": 41.86046600341797, "step": 11400 }, { "epoch": 1.4859228362877999, "eval_loss": 2.7590243816375732, "eval_runtime": 0.5523, "eval_samples_per_second": 77.856, "eval_steps_per_second": 1.811, "step": 11400 }, { "epoch": 1.4872262773722627, "grad_norm": 8.61991024017334, "learning_rate": 6.0404444444444455e-06, "loss": 0.8984, "step": 11410 }, { "epoch": 1.4885297184567259, "grad_norm": 9.411813735961914, "learning_rate": 6.036000000000001e-06, "loss": 0.958, "step": 11420 }, { "epoch": 1.4898331595411887, "grad_norm": 8.644630432128906, "learning_rate": 6.031555555555556e-06, "loss": 0.925, "step": 11430 }, { "epoch": 1.4911366006256517, "grad_norm": 10.198295593261719, "learning_rate": 6.027111111111112e-06, "loss": 0.8969, "step": 11440 }, { "epoch": 1.4924400417101147, "grad_norm": 11.641205787658691, "learning_rate": 6.0226666666666665e-06, "loss": 0.977, "step": 11450 }, { "epoch": 1.4937434827945777, "grad_norm": 8.234724044799805, "learning_rate": 6.018222222222223e-06, "loss": 0.9378, "step": 11460 }, { "epoch": 1.4950469238790407, "grad_norm": 7.507535934448242, "learning_rate": 6.013777777777779e-06, "loss": 0.9237, "step": 11470 }, { "epoch": 1.4963503649635037, "grad_norm": 8.594752311706543, "learning_rate": 6.009333333333334e-06, "loss": 1.0255, "step": 11480 }, { "epoch": 1.4976538060479667, "grad_norm": 8.283523559570312, "learning_rate": 6.004888888888889e-06, "loss": 0.8837, "step": 11490 }, { "epoch": 1.4989572471324295, "grad_norm": 8.156198501586914, "learning_rate": 6.000444444444445e-06, "loss": 0.8456, "step": 11500 }, { "epoch": 1.4989572471324295, "eval/acc": 41.86046600341797, "step": 11500 }, { "epoch": 1.4989572471324295, "eval_loss": 2.6901562213897705, "eval_runtime": 0.5503, "eval_samples_per_second": 78.14, "eval_steps_per_second": 1.817, "step": 11500 }, { "epoch": 1.5002606882168927, "grad_norm": 9.280471801757812, "learning_rate": 5.996000000000001e-06, "loss": 0.8989, "step": 11510 }, { "epoch": 1.5015641293013555, "grad_norm": 11.96032428741455, "learning_rate": 5.991555555555556e-06, "loss": 0.9634, "step": 11520 }, { "epoch": 1.5028675703858185, "grad_norm": 9.809213638305664, "learning_rate": 5.987111111111111e-06, "loss": 0.8584, "step": 11530 }, { "epoch": 1.5041710114702815, "grad_norm": 10.654325485229492, "learning_rate": 5.982666666666667e-06, "loss": 0.8962, "step": 11540 }, { "epoch": 1.5054744525547445, "grad_norm": 7.823657035827637, "learning_rate": 5.978222222222223e-06, "loss": 0.8916, "step": 11550 }, { "epoch": 1.5067778936392076, "grad_norm": 7.672808647155762, "learning_rate": 5.973777777777778e-06, "loss": 0.898, "step": 11560 }, { "epoch": 1.5080813347236703, "grad_norm": 8.952858924865723, "learning_rate": 5.969333333333334e-06, "loss": 0.8695, "step": 11570 }, { "epoch": 1.5093847758081336, "grad_norm": 10.120991706848145, "learning_rate": 5.964888888888889e-06, "loss": 0.8802, "step": 11580 }, { "epoch": 1.5106882168925964, "grad_norm": 8.758228302001953, "learning_rate": 5.960444444444445e-06, "loss": 0.911, "step": 11590 }, { "epoch": 1.5119916579770596, "grad_norm": 9.951668739318848, "learning_rate": 5.956000000000001e-06, "loss": 0.9807, "step": 11600 }, { "epoch": 1.5119916579770596, "eval/acc": 41.86046600341797, "step": 11600 }, { "epoch": 1.5119916579770596, "eval_loss": 2.691246271133423, "eval_runtime": 0.55, "eval_samples_per_second": 78.178, "eval_steps_per_second": 1.818, "step": 11600 }, { "epoch": 1.5132950990615224, "grad_norm": 8.754342079162598, "learning_rate": 5.951555555555556e-06, "loss": 0.9525, "step": 11610 }, { "epoch": 1.5145985401459854, "grad_norm": 7.778733253479004, "learning_rate": 5.947111111111112e-06, "loss": 0.8477, "step": 11620 }, { "epoch": 1.5159019812304484, "grad_norm": 11.202043533325195, "learning_rate": 5.942666666666667e-06, "loss": 0.903, "step": 11630 }, { "epoch": 1.5172054223149114, "grad_norm": 9.004331588745117, "learning_rate": 5.938222222222223e-06, "loss": 0.8947, "step": 11640 }, { "epoch": 1.5185088633993744, "grad_norm": 8.66901683807373, "learning_rate": 5.933777777777779e-06, "loss": 0.9115, "step": 11650 }, { "epoch": 1.5198123044838372, "grad_norm": 10.1716890335083, "learning_rate": 5.9293333333333335e-06, "loss": 0.8698, "step": 11660 }, { "epoch": 1.5211157455683004, "grad_norm": 9.544024467468262, "learning_rate": 5.924888888888889e-06, "loss": 0.9145, "step": 11670 }, { "epoch": 1.5224191866527632, "grad_norm": 10.033705711364746, "learning_rate": 5.920444444444445e-06, "loss": 0.7864, "step": 11680 }, { "epoch": 1.5237226277372264, "grad_norm": 7.851691722869873, "learning_rate": 5.916000000000001e-06, "loss": 0.9286, "step": 11690 }, { "epoch": 1.5250260688216892, "grad_norm": 8.896501541137695, "learning_rate": 5.911555555555556e-06, "loss": 0.8971, "step": 11700 }, { "epoch": 1.5250260688216892, "eval/acc": 44.1860466003418, "step": 11700 }, { "epoch": 1.5250260688216892, "eval_loss": 2.7321903705596924, "eval_runtime": 0.5514, "eval_samples_per_second": 77.982, "eval_steps_per_second": 1.814, "step": 11700 }, { "epoch": 1.5263295099061522, "grad_norm": 7.569187164306641, "learning_rate": 5.907111111111111e-06, "loss": 0.8678, "step": 11710 }, { "epoch": 1.5276329509906152, "grad_norm": 8.165056228637695, "learning_rate": 5.902666666666668e-06, "loss": 0.9258, "step": 11720 }, { "epoch": 1.5289363920750783, "grad_norm": 10.068973541259766, "learning_rate": 5.8982222222222225e-06, "loss": 0.8216, "step": 11730 }, { "epoch": 1.5302398331595413, "grad_norm": 8.550789833068848, "learning_rate": 5.893777777777778e-06, "loss": 0.9825, "step": 11740 }, { "epoch": 1.531543274244004, "grad_norm": 9.013258934020996, "learning_rate": 5.889333333333334e-06, "loss": 0.8041, "step": 11750 }, { "epoch": 1.5328467153284673, "grad_norm": 7.696575164794922, "learning_rate": 5.884888888888889e-06, "loss": 0.8873, "step": 11760 }, { "epoch": 1.53415015641293, "grad_norm": 9.604674339294434, "learning_rate": 5.880444444444445e-06, "loss": 0.8717, "step": 11770 }, { "epoch": 1.535453597497393, "grad_norm": 9.942161560058594, "learning_rate": 5.876000000000001e-06, "loss": 0.8491, "step": 11780 }, { "epoch": 1.536757038581856, "grad_norm": 10.737907409667969, "learning_rate": 5.871555555555556e-06, "loss": 0.84, "step": 11790 }, { "epoch": 1.538060479666319, "grad_norm": 8.854541778564453, "learning_rate": 5.8671111111111115e-06, "loss": 0.907, "step": 11800 }, { "epoch": 1.538060479666319, "eval/acc": 41.86046600341797, "step": 11800 }, { "epoch": 1.538060479666319, "eval_loss": 2.783064603805542, "eval_runtime": 0.5504, "eval_samples_per_second": 78.13, "eval_steps_per_second": 1.817, "step": 11800 }, { "epoch": 1.539363920750782, "grad_norm": 9.020838737487793, "learning_rate": 5.862666666666667e-06, "loss": 0.9288, "step": 11810 }, { "epoch": 1.5406673618352449, "grad_norm": 7.965185642242432, "learning_rate": 5.858222222222223e-06, "loss": 0.8789, "step": 11820 }, { "epoch": 1.5419708029197081, "grad_norm": 8.109720230102539, "learning_rate": 5.853777777777779e-06, "loss": 0.9817, "step": 11830 }, { "epoch": 1.543274244004171, "grad_norm": 8.252847671508789, "learning_rate": 5.8493333333333335e-06, "loss": 0.9315, "step": 11840 }, { "epoch": 1.5445776850886341, "grad_norm": 8.494192123413086, "learning_rate": 5.844888888888889e-06, "loss": 0.924, "step": 11850 }, { "epoch": 1.545881126173097, "grad_norm": 9.51470947265625, "learning_rate": 5.840444444444445e-06, "loss": 0.9022, "step": 11860 }, { "epoch": 1.54718456725756, "grad_norm": 8.728997230529785, "learning_rate": 5.8360000000000005e-06, "loss": 0.9336, "step": 11870 }, { "epoch": 1.548488008342023, "grad_norm": 6.90728759765625, "learning_rate": 5.831555555555556e-06, "loss": 0.8825, "step": 11880 }, { "epoch": 1.549791449426486, "grad_norm": 7.594155788421631, "learning_rate": 5.827111111111111e-06, "loss": 0.9326, "step": 11890 }, { "epoch": 1.551094890510949, "grad_norm": 9.622396469116211, "learning_rate": 5.822666666666668e-06, "loss": 0.9338, "step": 11900 }, { "epoch": 1.551094890510949, "eval/acc": 41.86046600341797, "step": 11900 }, { "epoch": 1.551094890510949, "eval_loss": 2.7425906658172607, "eval_runtime": 0.5501, "eval_samples_per_second": 78.165, "eval_steps_per_second": 1.818, "step": 11900 }, { "epoch": 1.5523983315954117, "grad_norm": 8.330667495727539, "learning_rate": 5.818222222222223e-06, "loss": 0.9319, "step": 11910 }, { "epoch": 1.553701772679875, "grad_norm": 7.950173377990723, "learning_rate": 5.813777777777778e-06, "loss": 0.9547, "step": 11920 }, { "epoch": 1.5550052137643378, "grad_norm": 9.067012786865234, "learning_rate": 5.809333333333334e-06, "loss": 0.8985, "step": 11930 }, { "epoch": 1.556308654848801, "grad_norm": 9.018157958984375, "learning_rate": 5.8048888888888895e-06, "loss": 0.8627, "step": 11940 }, { "epoch": 1.5576120959332638, "grad_norm": 9.047653198242188, "learning_rate": 5.800444444444445e-06, "loss": 0.9111, "step": 11950 }, { "epoch": 1.5589155370177268, "grad_norm": 9.007577896118164, "learning_rate": 5.796000000000001e-06, "loss": 0.8687, "step": 11960 }, { "epoch": 1.5602189781021898, "grad_norm": 9.922075271606445, "learning_rate": 5.791555555555556e-06, "loss": 0.8776, "step": 11970 }, { "epoch": 1.5615224191866528, "grad_norm": 7.893103122711182, "learning_rate": 5.7871111111111115e-06, "loss": 0.8076, "step": 11980 }, { "epoch": 1.5628258602711158, "grad_norm": 9.148321151733398, "learning_rate": 5.782666666666667e-06, "loss": 0.9792, "step": 11990 }, { "epoch": 1.5641293013555786, "grad_norm": 7.6082611083984375, "learning_rate": 5.778222222222223e-06, "loss": 0.8537, "step": 12000 }, { "epoch": 1.5641293013555786, "eval/acc": 44.1860466003418, "step": 12000 }, { "epoch": 1.5641293013555786, "eval_loss": 2.7407734394073486, "eval_runtime": 0.55, "eval_samples_per_second": 78.181, "eval_steps_per_second": 1.818, "step": 12000 }, { "epoch": 1.5654327424400418, "grad_norm": 7.85395622253418, "learning_rate": 5.7737777777777785e-06, "loss": 0.9308, "step": 12010 }, { "epoch": 1.5667361835245046, "grad_norm": 9.808009147644043, "learning_rate": 5.769333333333333e-06, "loss": 0.8778, "step": 12020 }, { "epoch": 1.5680396246089676, "grad_norm": 8.558539390563965, "learning_rate": 5.76488888888889e-06, "loss": 0.8854, "step": 12030 }, { "epoch": 1.5693430656934306, "grad_norm": 6.804073810577393, "learning_rate": 5.760444444444445e-06, "loss": 0.8707, "step": 12040 }, { "epoch": 1.5706465067778936, "grad_norm": 10.607601165771484, "learning_rate": 5.7560000000000005e-06, "loss": 1.0026, "step": 12050 }, { "epoch": 1.5719499478623566, "grad_norm": 9.236237525939941, "learning_rate": 5.751555555555556e-06, "loss": 0.8743, "step": 12060 }, { "epoch": 1.5732533889468197, "grad_norm": 9.969038963317871, "learning_rate": 5.747111111111111e-06, "loss": 0.8429, "step": 12070 }, { "epoch": 1.5745568300312827, "grad_norm": 10.216775894165039, "learning_rate": 5.7426666666666675e-06, "loss": 0.9631, "step": 12080 }, { "epoch": 1.5758602711157454, "grad_norm": 9.933892250061035, "learning_rate": 5.738222222222223e-06, "loss": 0.8622, "step": 12090 }, { "epoch": 1.5771637122002087, "grad_norm": 9.885002136230469, "learning_rate": 5.733777777777778e-06, "loss": 0.9007, "step": 12100 }, { "epoch": 1.5771637122002087, "eval/acc": 44.1860466003418, "step": 12100 }, { "epoch": 1.5771637122002087, "eval_loss": 2.760734796524048, "eval_runtime": 0.7302, "eval_samples_per_second": 58.889, "eval_steps_per_second": 1.37, "step": 12100 }, { "epoch": 1.5784671532846715, "grad_norm": 9.095500946044922, "learning_rate": 5.729333333333334e-06, "loss": 0.9787, "step": 12110 }, { "epoch": 1.5797705943691345, "grad_norm": 8.187049865722656, "learning_rate": 5.7248888888888895e-06, "loss": 0.8988, "step": 12120 }, { "epoch": 1.5810740354535975, "grad_norm": 8.010296821594238, "learning_rate": 5.720444444444445e-06, "loss": 0.81, "step": 12130 }, { "epoch": 1.5823774765380605, "grad_norm": 8.447101593017578, "learning_rate": 5.716000000000001e-06, "loss": 0.9033, "step": 12140 }, { "epoch": 1.5836809176225235, "grad_norm": 9.873286247253418, "learning_rate": 5.711555555555556e-06, "loss": 0.9477, "step": 12150 }, { "epoch": 1.5849843587069863, "grad_norm": 11.441001892089844, "learning_rate": 5.707111111111111e-06, "loss": 0.913, "step": 12160 }, { "epoch": 1.5862877997914495, "grad_norm": 8.592070579528809, "learning_rate": 5.702666666666667e-06, "loss": 0.9975, "step": 12170 }, { "epoch": 1.5875912408759123, "grad_norm": 7.584151744842529, "learning_rate": 5.698222222222223e-06, "loss": 0.8821, "step": 12180 }, { "epoch": 1.5888946819603755, "grad_norm": 10.436724662780762, "learning_rate": 5.6937777777777785e-06, "loss": 1.0322, "step": 12190 }, { "epoch": 1.5901981230448383, "grad_norm": 8.973846435546875, "learning_rate": 5.689333333333333e-06, "loss": 0.8742, "step": 12200 }, { "epoch": 1.5901981230448383, "eval/acc": 44.1860466003418, "step": 12200 }, { "epoch": 1.5901981230448383, "eval_loss": 2.7265775203704834, "eval_runtime": 0.5512, "eval_samples_per_second": 78.016, "eval_steps_per_second": 1.814, "step": 12200 }, { "epoch": 1.5915015641293013, "grad_norm": 8.307132720947266, "learning_rate": 5.68488888888889e-06, "loss": 0.8625, "step": 12210 }, { "epoch": 1.5928050052137643, "grad_norm": 9.80394458770752, "learning_rate": 5.680444444444445e-06, "loss": 0.9754, "step": 12220 }, { "epoch": 1.5941084462982273, "grad_norm": 9.470157623291016, "learning_rate": 5.676e-06, "loss": 0.9609, "step": 12230 }, { "epoch": 1.5954118873826904, "grad_norm": 9.292947769165039, "learning_rate": 5.671555555555556e-06, "loss": 0.8267, "step": 12240 }, { "epoch": 1.5967153284671531, "grad_norm": 9.344517707824707, "learning_rate": 5.667111111111112e-06, "loss": 0.794, "step": 12250 }, { "epoch": 1.5980187695516164, "grad_norm": 11.099726676940918, "learning_rate": 5.6626666666666675e-06, "loss": 0.9265, "step": 12260 }, { "epoch": 1.5993222106360792, "grad_norm": 9.472885131835938, "learning_rate": 5.658222222222223e-06, "loss": 0.8703, "step": 12270 }, { "epoch": 1.6006256517205424, "grad_norm": 9.212688446044922, "learning_rate": 5.653777777777778e-06, "loss": 0.93, "step": 12280 }, { "epoch": 1.6019290928050052, "grad_norm": 9.390169143676758, "learning_rate": 5.649333333333334e-06, "loss": 0.8868, "step": 12290 }, { "epoch": 1.6032325338894682, "grad_norm": 10.038371086120605, "learning_rate": 5.644888888888889e-06, "loss": 0.8937, "step": 12300 }, { "epoch": 1.6032325338894682, "eval/acc": 44.1860466003418, "step": 12300 }, { "epoch": 1.6032325338894682, "eval_loss": 2.7616920471191406, "eval_runtime": 0.5501, "eval_samples_per_second": 78.161, "eval_steps_per_second": 1.818, "step": 12300 }, { "epoch": 1.6045359749739312, "grad_norm": 11.397320747375488, "learning_rate": 5.640444444444445e-06, "loss": 0.9074, "step": 12310 }, { "epoch": 1.6058394160583942, "grad_norm": 9.590808868408203, "learning_rate": 5.636000000000001e-06, "loss": 0.9424, "step": 12320 }, { "epoch": 1.6071428571428572, "grad_norm": 10.152965545654297, "learning_rate": 5.631555555555556e-06, "loss": 0.9256, "step": 12330 }, { "epoch": 1.60844629822732, "grad_norm": 8.448522567749023, "learning_rate": 5.627111111111112e-06, "loss": 0.8068, "step": 12340 }, { "epoch": 1.6097497393117832, "grad_norm": 8.193599700927734, "learning_rate": 5.622666666666667e-06, "loss": 0.9053, "step": 12350 }, { "epoch": 1.611053180396246, "grad_norm": 9.1607084274292, "learning_rate": 5.618222222222223e-06, "loss": 0.9115, "step": 12360 }, { "epoch": 1.612356621480709, "grad_norm": 8.736509323120117, "learning_rate": 5.613777777777778e-06, "loss": 0.9171, "step": 12370 }, { "epoch": 1.613660062565172, "grad_norm": 9.676508903503418, "learning_rate": 5.609333333333333e-06, "loss": 0.8624, "step": 12380 }, { "epoch": 1.614963503649635, "grad_norm": 9.189021110534668, "learning_rate": 5.60488888888889e-06, "loss": 0.9063, "step": 12390 }, { "epoch": 1.616266944734098, "grad_norm": 9.787127494812012, "learning_rate": 5.600444444444445e-06, "loss": 0.9251, "step": 12400 }, { "epoch": 1.616266944734098, "eval/acc": 41.86046600341797, "step": 12400 }, { "epoch": 1.616266944734098, "eval_loss": 2.6924171447753906, "eval_runtime": 0.5537, "eval_samples_per_second": 77.66, "eval_steps_per_second": 1.806, "step": 12400 }, { "epoch": 1.617570385818561, "grad_norm": 8.809910774230957, "learning_rate": 5.596e-06, "loss": 0.8073, "step": 12410 }, { "epoch": 1.618873826903024, "grad_norm": 10.405462265014648, "learning_rate": 5.591555555555556e-06, "loss": 0.9235, "step": 12420 }, { "epoch": 1.6201772679874868, "grad_norm": 7.922429084777832, "learning_rate": 5.587111111111112e-06, "loss": 0.9334, "step": 12430 }, { "epoch": 1.62148070907195, "grad_norm": 8.95710277557373, "learning_rate": 5.582666666666667e-06, "loss": 0.8827, "step": 12440 }, { "epoch": 1.6227841501564129, "grad_norm": 9.956131935119629, "learning_rate": 5.578222222222223e-06, "loss": 0.8351, "step": 12450 }, { "epoch": 1.6240875912408759, "grad_norm": 8.452085494995117, "learning_rate": 5.573777777777778e-06, "loss": 0.9133, "step": 12460 }, { "epoch": 1.6253910323253389, "grad_norm": 8.975761413574219, "learning_rate": 5.569333333333334e-06, "loss": 0.9549, "step": 12470 }, { "epoch": 1.6266944734098019, "grad_norm": 8.806785583496094, "learning_rate": 5.564888888888889e-06, "loss": 0.909, "step": 12480 }, { "epoch": 1.627997914494265, "grad_norm": 7.5352396965026855, "learning_rate": 5.560444444444445e-06, "loss": 0.8269, "step": 12490 }, { "epoch": 1.6293013555787277, "grad_norm": 8.941774368286133, "learning_rate": 5.556000000000001e-06, "loss": 0.8117, "step": 12500 }, { "epoch": 1.6293013555787277, "eval/acc": 41.86046600341797, "step": 12500 }, { "epoch": 1.6293013555787277, "eval_loss": 2.6788859367370605, "eval_runtime": 0.5535, "eval_samples_per_second": 77.692, "eval_steps_per_second": 1.807, "step": 12500 }, { "epoch": 1.630604796663191, "grad_norm": 9.224435806274414, "learning_rate": 5.5515555555555555e-06, "loss": 0.9428, "step": 12510 }, { "epoch": 1.6319082377476537, "grad_norm": 8.841092109680176, "learning_rate": 5.547111111111112e-06, "loss": 0.8942, "step": 12520 }, { "epoch": 1.633211678832117, "grad_norm": 9.174897193908691, "learning_rate": 5.542666666666667e-06, "loss": 0.8342, "step": 12530 }, { "epoch": 1.6345151199165797, "grad_norm": 10.593732833862305, "learning_rate": 5.538222222222223e-06, "loss": 0.9226, "step": 12540 }, { "epoch": 1.6358185610010427, "grad_norm": 8.261096954345703, "learning_rate": 5.533777777777778e-06, "loss": 0.8812, "step": 12550 }, { "epoch": 1.6371220020855057, "grad_norm": 10.176057815551758, "learning_rate": 5.529333333333334e-06, "loss": 0.8772, "step": 12560 }, { "epoch": 1.6384254431699687, "grad_norm": 10.218589782714844, "learning_rate": 5.52488888888889e-06, "loss": 0.9139, "step": 12570 }, { "epoch": 1.6397288842544318, "grad_norm": 9.200139045715332, "learning_rate": 5.5204444444444445e-06, "loss": 0.8967, "step": 12580 }, { "epoch": 1.6410323253388945, "grad_norm": 10.613264083862305, "learning_rate": 5.516e-06, "loss": 0.8179, "step": 12590 }, { "epoch": 1.6423357664233578, "grad_norm": 10.631454467773438, "learning_rate": 5.511555555555556e-06, "loss": 0.9559, "step": 12600 }, { "epoch": 1.6423357664233578, "eval/acc": 41.86046600341797, "step": 12600 }, { "epoch": 1.6423357664233578, "eval_loss": 2.768066644668579, "eval_runtime": 0.5506, "eval_samples_per_second": 78.098, "eval_steps_per_second": 1.816, "step": 12600 }, { "epoch": 1.6436392075078206, "grad_norm": 11.390617370605469, "learning_rate": 5.507111111111112e-06, "loss": 0.9535, "step": 12610 }, { "epoch": 1.6449426485922838, "grad_norm": 8.572603225708008, "learning_rate": 5.502666666666667e-06, "loss": 0.8902, "step": 12620 }, { "epoch": 1.6462460896767466, "grad_norm": 8.765055656433105, "learning_rate": 5.498222222222223e-06, "loss": 0.8981, "step": 12630 }, { "epoch": 1.6475495307612096, "grad_norm": 11.53538990020752, "learning_rate": 5.493777777777778e-06, "loss": 0.8783, "step": 12640 }, { "epoch": 1.6488529718456726, "grad_norm": 8.511107444763184, "learning_rate": 5.489333333333334e-06, "loss": 0.8433, "step": 12650 }, { "epoch": 1.6501564129301356, "grad_norm": 11.314702033996582, "learning_rate": 5.484888888888889e-06, "loss": 0.9371, "step": 12660 }, { "epoch": 1.6514598540145986, "grad_norm": 9.576703071594238, "learning_rate": 5.480444444444445e-06, "loss": 0.9076, "step": 12670 }, { "epoch": 1.6527632950990614, "grad_norm": 9.437372207641602, "learning_rate": 5.476000000000001e-06, "loss": 0.8284, "step": 12680 }, { "epoch": 1.6540667361835246, "grad_norm": 8.913594245910645, "learning_rate": 5.4715555555555554e-06, "loss": 0.8538, "step": 12690 }, { "epoch": 1.6553701772679874, "grad_norm": 9.062487602233887, "learning_rate": 5.467111111111112e-06, "loss": 0.965, "step": 12700 }, { "epoch": 1.6553701772679874, "eval/acc": 41.86046600341797, "step": 12700 }, { "epoch": 1.6553701772679874, "eval_loss": 2.616363286972046, "eval_runtime": 0.5798, "eval_samples_per_second": 74.16, "eval_steps_per_second": 1.725, "step": 12700 }, { "epoch": 1.6566736183524504, "grad_norm": 9.46473503112793, "learning_rate": 5.462666666666667e-06, "loss": 0.9239, "step": 12710 }, { "epoch": 1.6579770594369134, "grad_norm": 9.126602172851562, "learning_rate": 5.4582222222222225e-06, "loss": 0.9104, "step": 12720 }, { "epoch": 1.6592805005213764, "grad_norm": 8.588539123535156, "learning_rate": 5.453777777777778e-06, "loss": 0.9338, "step": 12730 }, { "epoch": 1.6605839416058394, "grad_norm": 8.043057441711426, "learning_rate": 5.449333333333334e-06, "loss": 0.8664, "step": 12740 }, { "epoch": 1.6618873826903025, "grad_norm": 8.878229141235352, "learning_rate": 5.44488888888889e-06, "loss": 0.9545, "step": 12750 }, { "epoch": 1.6631908237747655, "grad_norm": 9.775492668151855, "learning_rate": 5.4404444444444444e-06, "loss": 0.9144, "step": 12760 }, { "epoch": 1.6644942648592282, "grad_norm": 8.257186889648438, "learning_rate": 5.436e-06, "loss": 0.876, "step": 12770 }, { "epoch": 1.6657977059436915, "grad_norm": 7.558773517608643, "learning_rate": 5.431555555555556e-06, "loss": 0.8863, "step": 12780 }, { "epoch": 1.6671011470281543, "grad_norm": 10.847901344299316, "learning_rate": 5.4271111111111115e-06, "loss": 0.9244, "step": 12790 }, { "epoch": 1.6684045881126173, "grad_norm": 9.977592468261719, "learning_rate": 5.422666666666667e-06, "loss": 0.9048, "step": 12800 }, { "epoch": 1.6684045881126173, "eval/acc": 46.511627197265625, "step": 12800 }, { "epoch": 1.6684045881126173, "eval_loss": 2.6525614261627197, "eval_runtime": 0.796, "eval_samples_per_second": 54.02, "eval_steps_per_second": 1.256, "step": 12800 }, { "epoch": 1.6697080291970803, "grad_norm": 10.569034576416016, "learning_rate": 5.418222222222223e-06, "loss": 0.8241, "step": 12810 }, { "epoch": 1.6710114702815433, "grad_norm": 8.749735832214355, "learning_rate": 5.413777777777778e-06, "loss": 0.892, "step": 12820 }, { "epoch": 1.6723149113660063, "grad_norm": 9.518916130065918, "learning_rate": 5.409333333333334e-06, "loss": 0.8783, "step": 12830 }, { "epoch": 1.673618352450469, "grad_norm": 9.398987770080566, "learning_rate": 5.404888888888889e-06, "loss": 0.9105, "step": 12840 }, { "epoch": 1.6749217935349323, "grad_norm": 9.8145112991333, "learning_rate": 5.400444444444445e-06, "loss": 0.9007, "step": 12850 }, { "epoch": 1.676225234619395, "grad_norm": 8.494111061096191, "learning_rate": 5.3960000000000005e-06, "loss": 0.8184, "step": 12860 }, { "epoch": 1.6775286757038583, "grad_norm": 8.602559089660645, "learning_rate": 5.391555555555556e-06, "loss": 0.9125, "step": 12870 }, { "epoch": 1.6788321167883211, "grad_norm": 8.055842399597168, "learning_rate": 5.387111111111112e-06, "loss": 0.9053, "step": 12880 }, { "epoch": 1.6801355578727841, "grad_norm": 8.72721004486084, "learning_rate": 5.382666666666667e-06, "loss": 0.8576, "step": 12890 }, { "epoch": 1.6814389989572471, "grad_norm": 8.111344337463379, "learning_rate": 5.3782222222222224e-06, "loss": 0.8187, "step": 12900 }, { "epoch": 1.6814389989572471, "eval/acc": 44.1860466003418, "step": 12900 }, { "epoch": 1.6814389989572471, "eval_loss": 2.7945597171783447, "eval_runtime": 0.551, "eval_samples_per_second": 78.037, "eval_steps_per_second": 1.815, "step": 12900 }, { "epoch": 1.6827424400417101, "grad_norm": 9.866985321044922, "learning_rate": 5.373777777777778e-06, "loss": 0.8378, "step": 12910 }, { "epoch": 1.6840458811261731, "grad_norm": 8.585725784301758, "learning_rate": 5.369333333333334e-06, "loss": 0.7471, "step": 12920 }, { "epoch": 1.685349322210636, "grad_norm": 9.177979469299316, "learning_rate": 5.3648888888888895e-06, "loss": 0.8114, "step": 12930 }, { "epoch": 1.6866527632950992, "grad_norm": 10.077980995178223, "learning_rate": 5.360444444444445e-06, "loss": 0.9403, "step": 12940 }, { "epoch": 1.687956204379562, "grad_norm": 8.668931007385254, "learning_rate": 5.356e-06, "loss": 0.8741, "step": 12950 }, { "epoch": 1.6892596454640252, "grad_norm": 10.258094787597656, "learning_rate": 5.351555555555557e-06, "loss": 0.8982, "step": 12960 }, { "epoch": 1.690563086548488, "grad_norm": 9.145166397094727, "learning_rate": 5.3471111111111114e-06, "loss": 0.8866, "step": 12970 }, { "epoch": 1.691866527632951, "grad_norm": 7.475383758544922, "learning_rate": 5.342666666666667e-06, "loss": 0.9094, "step": 12980 }, { "epoch": 1.693169968717414, "grad_norm": 8.732629776000977, "learning_rate": 5.338222222222223e-06, "loss": 0.9394, "step": 12990 }, { "epoch": 1.694473409801877, "grad_norm": 9.685428619384766, "learning_rate": 5.333777777777778e-06, "loss": 0.8522, "step": 13000 }, { "epoch": 1.694473409801877, "eval/acc": 44.1860466003418, "step": 13000 }, { "epoch": 1.694473409801877, "eval_loss": 2.6734085083007812, "eval_runtime": 0.5505, "eval_samples_per_second": 78.115, "eval_steps_per_second": 1.817, "step": 13000 }, { "epoch": 1.69577685088634, "grad_norm": 9.119738578796387, "learning_rate": 5.329333333333334e-06, "loss": 0.872, "step": 13010 }, { "epoch": 1.6970802919708028, "grad_norm": 10.126830101013184, "learning_rate": 5.324888888888889e-06, "loss": 0.8678, "step": 13020 }, { "epoch": 1.698383733055266, "grad_norm": 11.138778686523438, "learning_rate": 5.320444444444445e-06, "loss": 0.8797, "step": 13030 }, { "epoch": 1.6996871741397288, "grad_norm": 7.824183940887451, "learning_rate": 5.3160000000000004e-06, "loss": 0.896, "step": 13040 }, { "epoch": 1.7009906152241918, "grad_norm": 8.033583641052246, "learning_rate": 5.311555555555556e-06, "loss": 0.874, "step": 13050 }, { "epoch": 1.7022940563086548, "grad_norm": 10.687947273254395, "learning_rate": 5.307111111111112e-06, "loss": 0.9535, "step": 13060 }, { "epoch": 1.7035974973931178, "grad_norm": 7.008943557739258, "learning_rate": 5.302666666666667e-06, "loss": 0.8165, "step": 13070 }, { "epoch": 1.7049009384775808, "grad_norm": 9.229698181152344, "learning_rate": 5.298222222222222e-06, "loss": 0.889, "step": 13080 }, { "epoch": 1.7062043795620438, "grad_norm": 8.03006649017334, "learning_rate": 5.293777777777778e-06, "loss": 0.9482, "step": 13090 }, { "epoch": 1.7075078206465069, "grad_norm": 9.35704517364502, "learning_rate": 5.289333333333334e-06, "loss": 0.8916, "step": 13100 }, { "epoch": 1.7075078206465069, "eval/acc": 41.86046600341797, "step": 13100 }, { "epoch": 1.7075078206465069, "eval_loss": 2.545015811920166, "eval_runtime": 0.5517, "eval_samples_per_second": 77.943, "eval_steps_per_second": 1.813, "step": 13100 }, { "epoch": 1.7088112617309696, "grad_norm": 9.768939971923828, "learning_rate": 5.2848888888888894e-06, "loss": 0.8869, "step": 13110 }, { "epoch": 1.7101147028154329, "grad_norm": 10.913764953613281, "learning_rate": 5.280444444444445e-06, "loss": 0.8006, "step": 13120 }, { "epoch": 1.7114181438998957, "grad_norm": 9.386956214904785, "learning_rate": 5.276e-06, "loss": 0.8897, "step": 13130 }, { "epoch": 1.7127215849843587, "grad_norm": 8.580976486206055, "learning_rate": 5.2715555555555565e-06, "loss": 0.9284, "step": 13140 }, { "epoch": 1.7140250260688217, "grad_norm": 9.469084739685059, "learning_rate": 5.267111111111111e-06, "loss": 0.98, "step": 13150 }, { "epoch": 1.7153284671532847, "grad_norm": 8.68073558807373, "learning_rate": 5.262666666666667e-06, "loss": 0.8204, "step": 13160 }, { "epoch": 1.7166319082377477, "grad_norm": 9.238105773925781, "learning_rate": 5.258222222222223e-06, "loss": 0.9089, "step": 13170 }, { "epoch": 1.7179353493222105, "grad_norm": 10.01119327545166, "learning_rate": 5.2537777777777784e-06, "loss": 0.9608, "step": 13180 }, { "epoch": 1.7192387904066737, "grad_norm": 8.633408546447754, "learning_rate": 5.249333333333334e-06, "loss": 0.8658, "step": 13190 }, { "epoch": 1.7205422314911365, "grad_norm": 9.289779663085938, "learning_rate": 5.244888888888889e-06, "loss": 0.8606, "step": 13200 }, { "epoch": 1.7205422314911365, "eval/acc": 44.1860466003418, "step": 13200 }, { "epoch": 1.7205422314911365, "eval_loss": 2.7408595085144043, "eval_runtime": 0.5513, "eval_samples_per_second": 77.998, "eval_steps_per_second": 1.814, "step": 13200 }, { "epoch": 1.7218456725755997, "grad_norm": 9.304854393005371, "learning_rate": 5.240444444444445e-06, "loss": 0.8886, "step": 13210 }, { "epoch": 1.7231491136600625, "grad_norm": 10.647976875305176, "learning_rate": 5.236e-06, "loss": 0.8962, "step": 13220 }, { "epoch": 1.7244525547445255, "grad_norm": 8.663270950317383, "learning_rate": 5.231555555555556e-06, "loss": 0.928, "step": 13230 }, { "epoch": 1.7257559958289885, "grad_norm": 8.543795585632324, "learning_rate": 5.227111111111112e-06, "loss": 0.9272, "step": 13240 }, { "epoch": 1.7270594369134515, "grad_norm": 8.28640079498291, "learning_rate": 5.222666666666667e-06, "loss": 1.0759, "step": 13250 }, { "epoch": 1.7283628779979145, "grad_norm": 8.390782356262207, "learning_rate": 5.218222222222222e-06, "loss": 0.8919, "step": 13260 }, { "epoch": 1.7296663190823773, "grad_norm": 8.103139877319336, "learning_rate": 5.213777777777779e-06, "loss": 0.9241, "step": 13270 }, { "epoch": 1.7309697601668406, "grad_norm": 8.039697647094727, "learning_rate": 5.209333333333334e-06, "loss": 0.9198, "step": 13280 }, { "epoch": 1.7322732012513034, "grad_norm": 8.839466094970703, "learning_rate": 5.204888888888889e-06, "loss": 0.8781, "step": 13290 }, { "epoch": 1.7335766423357666, "grad_norm": 9.11958122253418, "learning_rate": 5.200444444444445e-06, "loss": 0.8538, "step": 13300 }, { "epoch": 1.7335766423357666, "eval/acc": 44.1860466003418, "step": 13300 }, { "epoch": 1.7335766423357666, "eval_loss": 2.781658172607422, "eval_runtime": 0.5521, "eval_samples_per_second": 77.887, "eval_steps_per_second": 1.811, "step": 13300 }, { "epoch": 1.7348800834202294, "grad_norm": 7.923957347869873, "learning_rate": 5.196e-06, "loss": 0.847, "step": 13310 }, { "epoch": 1.7361835245046924, "grad_norm": 9.315792083740234, "learning_rate": 5.1915555555555564e-06, "loss": 0.9223, "step": 13320 }, { "epoch": 1.7374869655891554, "grad_norm": 9.324281692504883, "learning_rate": 5.187111111111111e-06, "loss": 0.9163, "step": 13330 }, { "epoch": 1.7387904066736184, "grad_norm": 9.846336364746094, "learning_rate": 5.182666666666667e-06, "loss": 0.8335, "step": 13340 }, { "epoch": 1.7400938477580814, "grad_norm": 8.53715991973877, "learning_rate": 5.178222222222223e-06, "loss": 0.8905, "step": 13350 }, { "epoch": 1.7413972888425442, "grad_norm": 8.64244556427002, "learning_rate": 5.173777777777778e-06, "loss": 0.9093, "step": 13360 }, { "epoch": 1.7427007299270074, "grad_norm": 7.952944278717041, "learning_rate": 5.169333333333334e-06, "loss": 0.8108, "step": 13370 }, { "epoch": 1.7440041710114702, "grad_norm": 8.999094009399414, "learning_rate": 5.164888888888889e-06, "loss": 0.9165, "step": 13380 }, { "epoch": 1.7453076120959332, "grad_norm": 8.41929817199707, "learning_rate": 5.160444444444445e-06, "loss": 0.9178, "step": 13390 }, { "epoch": 1.7466110531803962, "grad_norm": 7.692058086395264, "learning_rate": 5.156e-06, "loss": 0.9495, "step": 13400 }, { "epoch": 1.7466110531803962, "eval/acc": 41.86046600341797, "step": 13400 }, { "epoch": 1.7466110531803962, "eval_loss": 2.7012240886688232, "eval_runtime": 0.5515, "eval_samples_per_second": 77.969, "eval_steps_per_second": 1.813, "step": 13400 }, { "epoch": 1.7479144942648592, "grad_norm": 9.710464477539062, "learning_rate": 5.151555555555556e-06, "loss": 0.9028, "step": 13410 }, { "epoch": 1.7492179353493222, "grad_norm": 9.615612983703613, "learning_rate": 5.147111111111112e-06, "loss": 0.9338, "step": 13420 }, { "epoch": 1.7505213764337852, "grad_norm": 12.186484336853027, "learning_rate": 5.1426666666666665e-06, "loss": 0.8766, "step": 13430 }, { "epoch": 1.7518248175182483, "grad_norm": 9.344442367553711, "learning_rate": 5.138222222222222e-06, "loss": 0.9485, "step": 13440 }, { "epoch": 1.753128258602711, "grad_norm": 8.845906257629395, "learning_rate": 5.133777777777779e-06, "loss": 0.8931, "step": 13450 }, { "epoch": 1.7544316996871743, "grad_norm": 10.252700805664062, "learning_rate": 5.129333333333334e-06, "loss": 0.9327, "step": 13460 }, { "epoch": 1.755735140771637, "grad_norm": 8.981327056884766, "learning_rate": 5.124888888888889e-06, "loss": 0.8826, "step": 13470 }, { "epoch": 1.7570385818561, "grad_norm": 10.439797401428223, "learning_rate": 5.120444444444445e-06, "loss": 0.9466, "step": 13480 }, { "epoch": 1.758342022940563, "grad_norm": 9.587408065795898, "learning_rate": 5.116000000000001e-06, "loss": 0.8633, "step": 13490 }, { "epoch": 1.759645464025026, "grad_norm": 8.184699058532715, "learning_rate": 5.111555555555556e-06, "loss": 0.9367, "step": 13500 }, { "epoch": 1.759645464025026, "eval/acc": 44.1860466003418, "step": 13500 }, { "epoch": 1.759645464025026, "eval_loss": 2.7659718990325928, "eval_runtime": 0.5494, "eval_samples_per_second": 78.268, "eval_steps_per_second": 1.82, "step": 13500 }, { "epoch": 1.760948905109489, "grad_norm": 10.320651054382324, "learning_rate": 5.107111111111111e-06, "loss": 0.8506, "step": 13510 }, { "epoch": 1.7622523461939519, "grad_norm": 9.348326683044434, "learning_rate": 5.102666666666667e-06, "loss": 0.8818, "step": 13520 }, { "epoch": 1.763555787278415, "grad_norm": 9.354836463928223, "learning_rate": 5.0982222222222226e-06, "loss": 0.9021, "step": 13530 }, { "epoch": 1.764859228362878, "grad_norm": 8.566079139709473, "learning_rate": 5.093777777777778e-06, "loss": 0.8619, "step": 13540 }, { "epoch": 1.7661626694473411, "grad_norm": 8.734844207763672, "learning_rate": 5.089333333333334e-06, "loss": 0.8636, "step": 13550 }, { "epoch": 1.767466110531804, "grad_norm": 8.266152381896973, "learning_rate": 5.084888888888889e-06, "loss": 0.8232, "step": 13560 }, { "epoch": 1.768769551616267, "grad_norm": 11.402979850769043, "learning_rate": 5.0804444444444445e-06, "loss": 0.8227, "step": 13570 }, { "epoch": 1.77007299270073, "grad_norm": 8.30081844329834, "learning_rate": 5.076000000000001e-06, "loss": 0.8818, "step": 13580 }, { "epoch": 1.771376433785193, "grad_norm": 9.175376892089844, "learning_rate": 5.071555555555556e-06, "loss": 0.8995, "step": 13590 }, { "epoch": 1.772679874869656, "grad_norm": 9.377425193786621, "learning_rate": 5.0671111111111116e-06, "loss": 0.943, "step": 13600 }, { "epoch": 1.772679874869656, "eval/acc": 39.53488540649414, "step": 13600 }, { "epoch": 1.772679874869656, "eval_loss": 2.644824504852295, "eval_runtime": 0.5518, "eval_samples_per_second": 77.93, "eval_steps_per_second": 1.812, "step": 13600 }, { "epoch": 1.7739833159541187, "grad_norm": 8.776534080505371, "learning_rate": 5.062666666666666e-06, "loss": 0.9302, "step": 13610 }, { "epoch": 1.775286757038582, "grad_norm": 8.07412052154541, "learning_rate": 5.058222222222222e-06, "loss": 0.7761, "step": 13620 }, { "epoch": 1.7765901981230448, "grad_norm": 10.329318046569824, "learning_rate": 5.053777777777779e-06, "loss": 0.8889, "step": 13630 }, { "epoch": 1.777893639207508, "grad_norm": 8.45678424835205, "learning_rate": 5.0493333333333335e-06, "loss": 0.9088, "step": 13640 }, { "epoch": 1.7791970802919708, "grad_norm": 9.35205078125, "learning_rate": 5.044888888888889e-06, "loss": 0.7881, "step": 13650 }, { "epoch": 1.7805005213764338, "grad_norm": 9.435702323913574, "learning_rate": 5.040444444444445e-06, "loss": 0.9457, "step": 13660 }, { "epoch": 1.7818039624608968, "grad_norm": 10.433883666992188, "learning_rate": 5.0360000000000006e-06, "loss": 0.8582, "step": 13670 }, { "epoch": 1.7831074035453598, "grad_norm": 9.960062026977539, "learning_rate": 5.031555555555556e-06, "loss": 0.9387, "step": 13680 }, { "epoch": 1.7844108446298228, "grad_norm": 7.642147541046143, "learning_rate": 5.027111111111111e-06, "loss": 0.7353, "step": 13690 }, { "epoch": 1.7857142857142856, "grad_norm": 9.296479225158691, "learning_rate": 5.022666666666667e-06, "loss": 0.8311, "step": 13700 }, { "epoch": 1.7857142857142856, "eval/acc": 41.86046600341797, "step": 13700 }, { "epoch": 1.7857142857142856, "eval_loss": 2.8517420291900635, "eval_runtime": 0.5589, "eval_samples_per_second": 76.934, "eval_steps_per_second": 1.789, "step": 13700 }, { "epoch": 1.7870177267987488, "grad_norm": 8.802518844604492, "learning_rate": 5.0182222222222225e-06, "loss": 0.8766, "step": 13710 }, { "epoch": 1.7883211678832116, "grad_norm": 8.179342269897461, "learning_rate": 5.013777777777778e-06, "loss": 0.8817, "step": 13720 }, { "epoch": 1.7896246089676746, "grad_norm": 6.9555535316467285, "learning_rate": 5.009333333333334e-06, "loss": 0.8558, "step": 13730 }, { "epoch": 1.7909280500521376, "grad_norm": 11.052443504333496, "learning_rate": 5.004888888888889e-06, "loss": 0.7967, "step": 13740 }, { "epoch": 1.7922314911366006, "grad_norm": 10.142528533935547, "learning_rate": 5.000444444444444e-06, "loss": 0.8248, "step": 13750 }, { "epoch": 1.7935349322210636, "grad_norm": 8.769235610961914, "learning_rate": 4.996e-06, "loss": 0.9434, "step": 13760 }, { "epoch": 1.7948383733055264, "grad_norm": 9.223167419433594, "learning_rate": 4.991555555555556e-06, "loss": 0.925, "step": 13770 }, { "epoch": 1.7961418143899897, "grad_norm": 8.411652565002441, "learning_rate": 4.9871111111111115e-06, "loss": 0.8926, "step": 13780 }, { "epoch": 1.7974452554744524, "grad_norm": 8.350011825561523, "learning_rate": 4.982666666666667e-06, "loss": 0.9647, "step": 13790 }, { "epoch": 1.7987486965589157, "grad_norm": 9.274157524108887, "learning_rate": 4.978222222222223e-06, "loss": 0.9131, "step": 13800 }, { "epoch": 1.7987486965589157, "eval/acc": 44.1860466003418, "step": 13800 }, { "epoch": 1.7987486965589157, "eval_loss": 2.6356310844421387, "eval_runtime": 0.5489, "eval_samples_per_second": 78.338, "eval_steps_per_second": 1.822, "step": 13800 }, { "epoch": 1.8000521376433785, "grad_norm": 9.579371452331543, "learning_rate": 4.973777777777778e-06, "loss": 0.8722, "step": 13810 }, { "epoch": 1.8013555787278415, "grad_norm": 8.458383560180664, "learning_rate": 4.969333333333333e-06, "loss": 0.927, "step": 13820 }, { "epoch": 1.8026590198123045, "grad_norm": 8.24111270904541, "learning_rate": 4.964888888888889e-06, "loss": 0.9477, "step": 13830 }, { "epoch": 1.8039624608967675, "grad_norm": 9.82101058959961, "learning_rate": 4.960444444444445e-06, "loss": 0.9693, "step": 13840 }, { "epoch": 1.8052659019812305, "grad_norm": 8.676629066467285, "learning_rate": 4.9560000000000005e-06, "loss": 0.8951, "step": 13850 }, { "epoch": 1.8065693430656933, "grad_norm": 8.70731258392334, "learning_rate": 4.951555555555556e-06, "loss": 0.9119, "step": 13860 }, { "epoch": 1.8078727841501565, "grad_norm": 6.890768051147461, "learning_rate": 4.947111111111111e-06, "loss": 0.796, "step": 13870 }, { "epoch": 1.8091762252346193, "grad_norm": 8.783676147460938, "learning_rate": 4.9426666666666676e-06, "loss": 0.8929, "step": 13880 }, { "epoch": 1.8104796663190825, "grad_norm": 9.946412086486816, "learning_rate": 4.938222222222222e-06, "loss": 0.8616, "step": 13890 }, { "epoch": 1.8117831074035453, "grad_norm": 7.610568523406982, "learning_rate": 4.933777777777778e-06, "loss": 0.9022, "step": 13900 }, { "epoch": 1.8117831074035453, "eval/acc": 44.1860466003418, "step": 13900 }, { "epoch": 1.8117831074035453, "eval_loss": 2.734508752822876, "eval_runtime": 0.5529, "eval_samples_per_second": 77.771, "eval_steps_per_second": 1.809, "step": 13900 }, { "epoch": 1.8130865484880083, "grad_norm": 10.21069622039795, "learning_rate": 4.929333333333334e-06, "loss": 0.8551, "step": 13910 }, { "epoch": 1.8143899895724713, "grad_norm": 9.551142692565918, "learning_rate": 4.924888888888889e-06, "loss": 0.9592, "step": 13920 }, { "epoch": 1.8156934306569343, "grad_norm": 8.115945816040039, "learning_rate": 4.920444444444445e-06, "loss": 0.9228, "step": 13930 }, { "epoch": 1.8169968717413973, "grad_norm": 9.605473518371582, "learning_rate": 4.916e-06, "loss": 0.7986, "step": 13940 }, { "epoch": 1.8183003128258601, "grad_norm": 8.493094444274902, "learning_rate": 4.911555555555556e-06, "loss": 0.8875, "step": 13950 }, { "epoch": 1.8196037539103234, "grad_norm": 9.430155754089355, "learning_rate": 4.907111111111111e-06, "loss": 0.8725, "step": 13960 }, { "epoch": 1.8209071949947861, "grad_norm": 9.103509902954102, "learning_rate": 4.902666666666667e-06, "loss": 0.8753, "step": 13970 }, { "epoch": 1.8222106360792494, "grad_norm": 8.458293914794922, "learning_rate": 4.898222222222223e-06, "loss": 0.9498, "step": 13980 }, { "epoch": 1.8235140771637122, "grad_norm": 8.19863224029541, "learning_rate": 4.8937777777777785e-06, "loss": 0.7745, "step": 13990 }, { "epoch": 1.8248175182481752, "grad_norm": 10.201926231384277, "learning_rate": 4.889333333333333e-06, "loss": 0.8675, "step": 14000 }, { "epoch": 1.8248175182481752, "eval/acc": 44.1860466003418, "step": 14000 }, { "epoch": 1.8248175182481752, "eval_loss": 2.824615955352783, "eval_runtime": 0.549, "eval_samples_per_second": 78.328, "eval_steps_per_second": 1.822, "step": 14000 }, { "epoch": 1.8261209593326382, "grad_norm": 7.97217321395874, "learning_rate": 4.884888888888889e-06, "loss": 0.9589, "step": 14010 }, { "epoch": 1.8274244004171012, "grad_norm": 8.716256141662598, "learning_rate": 4.880444444444445e-06, "loss": 0.8702, "step": 14020 }, { "epoch": 1.8287278415015642, "grad_norm": 8.274259567260742, "learning_rate": 4.876e-06, "loss": 0.8167, "step": 14030 }, { "epoch": 1.830031282586027, "grad_norm": 8.204845428466797, "learning_rate": 4.871555555555556e-06, "loss": 0.8727, "step": 14040 }, { "epoch": 1.8313347236704902, "grad_norm": 10.708463668823242, "learning_rate": 4.867111111111111e-06, "loss": 0.9321, "step": 14050 }, { "epoch": 1.832638164754953, "grad_norm": 10.184549331665039, "learning_rate": 4.8626666666666675e-06, "loss": 0.9505, "step": 14060 }, { "epoch": 1.833941605839416, "grad_norm": 9.27289867401123, "learning_rate": 4.858222222222222e-06, "loss": 0.9187, "step": 14070 }, { "epoch": 1.835245046923879, "grad_norm": 9.189624786376953, "learning_rate": 4.853777777777778e-06, "loss": 0.9396, "step": 14080 }, { "epoch": 1.836548488008342, "grad_norm": 10.616105079650879, "learning_rate": 4.849333333333334e-06, "loss": 0.8546, "step": 14090 }, { "epoch": 1.837851929092805, "grad_norm": 9.03197193145752, "learning_rate": 4.844888888888889e-06, "loss": 0.8278, "step": 14100 }, { "epoch": 1.837851929092805, "eval/acc": 41.86046600341797, "step": 14100 }, { "epoch": 1.837851929092805, "eval_loss": 2.814258575439453, "eval_runtime": 0.5497, "eval_samples_per_second": 78.227, "eval_steps_per_second": 1.819, "step": 14100 }, { "epoch": 1.8391553701772678, "grad_norm": 7.356207370758057, "learning_rate": 4.840444444444445e-06, "loss": 0.8294, "step": 14110 }, { "epoch": 1.840458811261731, "grad_norm": 7.77383279800415, "learning_rate": 4.836e-06, "loss": 0.7693, "step": 14120 }, { "epoch": 1.8417622523461938, "grad_norm": 9.256695747375488, "learning_rate": 4.831555555555556e-06, "loss": 0.9065, "step": 14130 }, { "epoch": 1.843065693430657, "grad_norm": 9.7053861618042, "learning_rate": 4.827111111111111e-06, "loss": 0.8214, "step": 14140 }, { "epoch": 1.8443691345151199, "grad_norm": 10.338956832885742, "learning_rate": 4.822666666666667e-06, "loss": 0.8054, "step": 14150 }, { "epoch": 1.8456725755995829, "grad_norm": 8.740114212036133, "learning_rate": 4.818222222222223e-06, "loss": 0.917, "step": 14160 }, { "epoch": 1.8469760166840459, "grad_norm": 10.82303237915039, "learning_rate": 4.813777777777778e-06, "loss": 0.9137, "step": 14170 }, { "epoch": 1.8482794577685089, "grad_norm": 8.188334465026855, "learning_rate": 4.809333333333333e-06, "loss": 0.9297, "step": 14180 }, { "epoch": 1.849582898852972, "grad_norm": 7.886497974395752, "learning_rate": 4.80488888888889e-06, "loss": 0.8766, "step": 14190 }, { "epoch": 1.8508863399374347, "grad_norm": 8.58775806427002, "learning_rate": 4.800444444444445e-06, "loss": 0.8149, "step": 14200 }, { "epoch": 1.8508863399374347, "eval/acc": 44.1860466003418, "step": 14200 }, { "epoch": 1.8508863399374347, "eval_loss": 2.8095524311065674, "eval_runtime": 0.5502, "eval_samples_per_second": 78.159, "eval_steps_per_second": 1.818, "step": 14200 }, { "epoch": 1.852189781021898, "grad_norm": 10.114990234375, "learning_rate": 4.796e-06, "loss": 0.9143, "step": 14210 }, { "epoch": 1.8534932221063607, "grad_norm": 9.910545349121094, "learning_rate": 4.791555555555556e-06, "loss": 0.7914, "step": 14220 }, { "epoch": 1.854796663190824, "grad_norm": 10.009315490722656, "learning_rate": 4.787111111111111e-06, "loss": 0.9096, "step": 14230 }, { "epoch": 1.8561001042752867, "grad_norm": 7.72868013381958, "learning_rate": 4.782666666666667e-06, "loss": 0.8271, "step": 14240 }, { "epoch": 1.8574035453597497, "grad_norm": 9.030556678771973, "learning_rate": 4.778222222222222e-06, "loss": 0.8794, "step": 14250 }, { "epoch": 1.8587069864442127, "grad_norm": 7.498101234436035, "learning_rate": 4.773777777777778e-06, "loss": 0.9072, "step": 14260 }, { "epoch": 1.8600104275286757, "grad_norm": 8.781988143920898, "learning_rate": 4.769333333333334e-06, "loss": 0.7605, "step": 14270 }, { "epoch": 1.8613138686131387, "grad_norm": 10.2689790725708, "learning_rate": 4.764888888888889e-06, "loss": 0.9192, "step": 14280 }, { "epoch": 1.8626173096976015, "grad_norm": 8.778042793273926, "learning_rate": 4.760444444444445e-06, "loss": 0.8405, "step": 14290 }, { "epoch": 1.8639207507820648, "grad_norm": 8.738964080810547, "learning_rate": 4.756000000000001e-06, "loss": 0.9066, "step": 14300 }, { "epoch": 1.8639207507820648, "eval/acc": 44.1860466003418, "step": 14300 }, { "epoch": 1.8639207507820648, "eval_loss": 2.7467243671417236, "eval_runtime": 0.5506, "eval_samples_per_second": 78.096, "eval_steps_per_second": 1.816, "step": 14300 }, { "epoch": 1.8652241918665275, "grad_norm": 9.869569778442383, "learning_rate": 4.7515555555555556e-06, "loss": 0.8738, "step": 14310 }, { "epoch": 1.8665276329509908, "grad_norm": 10.059449195861816, "learning_rate": 4.747111111111111e-06, "loss": 0.9088, "step": 14320 }, { "epoch": 1.8678310740354536, "grad_norm": 8.857345581054688, "learning_rate": 4.742666666666667e-06, "loss": 0.9025, "step": 14330 }, { "epoch": 1.8691345151199166, "grad_norm": 7.883423328399658, "learning_rate": 4.738222222222223e-06, "loss": 0.8895, "step": 14340 }, { "epoch": 1.8704379562043796, "grad_norm": 10.041040420532227, "learning_rate": 4.733777777777778e-06, "loss": 0.9754, "step": 14350 }, { "epoch": 1.8717413972888426, "grad_norm": 9.739842414855957, "learning_rate": 4.729333333333333e-06, "loss": 0.8964, "step": 14360 }, { "epoch": 1.8730448383733056, "grad_norm": 8.323782920837402, "learning_rate": 4.72488888888889e-06, "loss": 0.8254, "step": 14370 }, { "epoch": 1.8743482794577684, "grad_norm": 11.314323425292969, "learning_rate": 4.7204444444444446e-06, "loss": 0.8477, "step": 14380 }, { "epoch": 1.8756517205422316, "grad_norm": 8.608202934265137, "learning_rate": 4.716e-06, "loss": 0.8297, "step": 14390 }, { "epoch": 1.8769551616266944, "grad_norm": 9.095972061157227, "learning_rate": 4.711555555555556e-06, "loss": 0.9633, "step": 14400 }, { "epoch": 1.8769551616266944, "eval/acc": 41.86046600341797, "step": 14400 }, { "epoch": 1.8769551616266944, "eval_loss": 2.710338830947876, "eval_runtime": 0.55, "eval_samples_per_second": 78.179, "eval_steps_per_second": 1.818, "step": 14400 }, { "epoch": 1.8782586027111574, "grad_norm": 8.579903602600098, "learning_rate": 4.707111111111112e-06, "loss": 1.0018, "step": 14410 }, { "epoch": 1.8795620437956204, "grad_norm": 8.766191482543945, "learning_rate": 4.702666666666667e-06, "loss": 0.9306, "step": 14420 }, { "epoch": 1.8808654848800834, "grad_norm": 8.627042770385742, "learning_rate": 4.698222222222222e-06, "loss": 0.9087, "step": 14430 }, { "epoch": 1.8821689259645464, "grad_norm": 7.4542927742004395, "learning_rate": 4.693777777777778e-06, "loss": 0.8616, "step": 14440 }, { "epoch": 1.8834723670490092, "grad_norm": 8.04769515991211, "learning_rate": 4.6893333333333336e-06, "loss": 0.8787, "step": 14450 }, { "epoch": 1.8847758081334725, "grad_norm": 10.944637298583984, "learning_rate": 4.684888888888889e-06, "loss": 0.9169, "step": 14460 }, { "epoch": 1.8860792492179352, "grad_norm": 8.250574111938477, "learning_rate": 4.680444444444445e-06, "loss": 0.8843, "step": 14470 }, { "epoch": 1.8873826903023985, "grad_norm": 9.908997535705566, "learning_rate": 4.676000000000001e-06, "loss": 0.8261, "step": 14480 }, { "epoch": 1.8886861313868613, "grad_norm": 9.308547019958496, "learning_rate": 4.6715555555555555e-06, "loss": 0.8978, "step": 14490 }, { "epoch": 1.8899895724713243, "grad_norm": 9.356900215148926, "learning_rate": 4.667111111111112e-06, "loss": 0.8685, "step": 14500 }, { "epoch": 1.8899895724713243, "eval/acc": 44.1860466003418, "step": 14500 }, { "epoch": 1.8899895724713243, "eval_loss": 2.8063623905181885, "eval_runtime": 1.0484, "eval_samples_per_second": 41.016, "eval_steps_per_second": 0.954, "step": 14500 }, { "epoch": 1.8912930135557873, "grad_norm": 7.892333507537842, "learning_rate": 4.662666666666667e-06, "loss": 0.8365, "step": 14510 }, { "epoch": 1.8925964546402503, "grad_norm": 7.322168827056885, "learning_rate": 4.6582222222222226e-06, "loss": 0.9015, "step": 14520 }, { "epoch": 1.8938998957247133, "grad_norm": 8.697136878967285, "learning_rate": 4.653777777777778e-06, "loss": 0.869, "step": 14530 }, { "epoch": 1.895203336809176, "grad_norm": 8.418469429016113, "learning_rate": 4.649333333333333e-06, "loss": 0.8868, "step": 14540 }, { "epoch": 1.8965067778936393, "grad_norm": 8.133889198303223, "learning_rate": 4.64488888888889e-06, "loss": 0.8792, "step": 14550 }, { "epoch": 1.897810218978102, "grad_norm": 8.766043663024902, "learning_rate": 4.6404444444444445e-06, "loss": 0.9195, "step": 14560 }, { "epoch": 1.8991136600625653, "grad_norm": 10.944380760192871, "learning_rate": 4.636e-06, "loss": 0.9656, "step": 14570 }, { "epoch": 1.900417101147028, "grad_norm": 8.704939842224121, "learning_rate": 4.631555555555556e-06, "loss": 0.8598, "step": 14580 }, { "epoch": 1.9017205422314911, "grad_norm": 10.356460571289062, "learning_rate": 4.6271111111111116e-06, "loss": 0.8563, "step": 14590 }, { "epoch": 1.9030239833159541, "grad_norm": 8.393257141113281, "learning_rate": 4.622666666666667e-06, "loss": 0.8816, "step": 14600 }, { "epoch": 1.9030239833159541, "eval/acc": 44.1860466003418, "step": 14600 }, { "epoch": 1.9030239833159541, "eval_loss": 2.7662394046783447, "eval_runtime": 0.5497, "eval_samples_per_second": 78.218, "eval_steps_per_second": 1.819, "step": 14600 }, { "epoch": 1.9043274244004171, "grad_norm": 9.824382781982422, "learning_rate": 4.618222222222223e-06, "loss": 0.9036, "step": 14610 }, { "epoch": 1.9056308654848801, "grad_norm": 9.598849296569824, "learning_rate": 4.613777777777778e-06, "loss": 0.8779, "step": 14620 }, { "epoch": 1.906934306569343, "grad_norm": 8.416238784790039, "learning_rate": 4.6093333333333335e-06, "loss": 0.892, "step": 14630 }, { "epoch": 1.9082377476538062, "grad_norm": 9.565461158752441, "learning_rate": 4.604888888888889e-06, "loss": 0.9037, "step": 14640 }, { "epoch": 1.909541188738269, "grad_norm": 9.42790699005127, "learning_rate": 4.600444444444445e-06, "loss": 0.8332, "step": 14650 }, { "epoch": 1.9108446298227322, "grad_norm": 8.945188522338867, "learning_rate": 4.5960000000000006e-06, "loss": 0.8583, "step": 14660 }, { "epoch": 1.912148070907195, "grad_norm": 10.122364044189453, "learning_rate": 4.591555555555555e-06, "loss": 0.9026, "step": 14670 }, { "epoch": 1.913451511991658, "grad_norm": 9.518960952758789, "learning_rate": 4.587111111111112e-06, "loss": 0.9566, "step": 14680 }, { "epoch": 1.914754953076121, "grad_norm": 9.283480644226074, "learning_rate": 4.582666666666667e-06, "loss": 0.8769, "step": 14690 }, { "epoch": 1.916058394160584, "grad_norm": 7.251773357391357, "learning_rate": 4.5782222222222225e-06, "loss": 0.8971, "step": 14700 }, { "epoch": 1.916058394160584, "eval/acc": 41.86046600341797, "step": 14700 }, { "epoch": 1.916058394160584, "eval_loss": 2.7316019535064697, "eval_runtime": 0.5494, "eval_samples_per_second": 78.261, "eval_steps_per_second": 1.82, "step": 14700 }, { "epoch": 1.917361835245047, "grad_norm": 10.917113304138184, "learning_rate": 4.573777777777778e-06, "loss": 0.8602, "step": 14710 }, { "epoch": 1.9186652763295098, "grad_norm": 9.944513320922852, "learning_rate": 4.569333333333334e-06, "loss": 0.9087, "step": 14720 }, { "epoch": 1.919968717413973, "grad_norm": 8.63537311553955, "learning_rate": 4.5648888888888895e-06, "loss": 0.8641, "step": 14730 }, { "epoch": 1.9212721584984358, "grad_norm": 8.801549911499023, "learning_rate": 4.560444444444444e-06, "loss": 0.8388, "step": 14740 }, { "epoch": 1.9225755995828988, "grad_norm": 7.719444274902344, "learning_rate": 4.556e-06, "loss": 0.8747, "step": 14750 }, { "epoch": 1.9238790406673618, "grad_norm": 9.879678726196289, "learning_rate": 4.551555555555556e-06, "loss": 0.9214, "step": 14760 }, { "epoch": 1.9251824817518248, "grad_norm": 8.831061363220215, "learning_rate": 4.5471111111111115e-06, "loss": 0.8342, "step": 14770 }, { "epoch": 1.9264859228362878, "grad_norm": 7.933155536651611, "learning_rate": 4.542666666666667e-06, "loss": 0.8463, "step": 14780 }, { "epoch": 1.9277893639207506, "grad_norm": 8.727767944335938, "learning_rate": 4.538222222222223e-06, "loss": 0.8486, "step": 14790 }, { "epoch": 1.9290928050052139, "grad_norm": 9.676631927490234, "learning_rate": 4.533777777777778e-06, "loss": 0.8637, "step": 14800 }, { "epoch": 1.9290928050052139, "eval/acc": 41.86046600341797, "step": 14800 }, { "epoch": 1.9290928050052139, "eval_loss": 2.8342931270599365, "eval_runtime": 0.5508, "eval_samples_per_second": 78.072, "eval_steps_per_second": 1.816, "step": 14800 }, { "epoch": 1.9303962460896766, "grad_norm": 10.179593086242676, "learning_rate": 4.529333333333334e-06, "loss": 0.8602, "step": 14810 }, { "epoch": 1.9316996871741399, "grad_norm": 9.099173545837402, "learning_rate": 4.524888888888889e-06, "loss": 0.9287, "step": 14820 }, { "epoch": 1.9330031282586027, "grad_norm": 7.766209125518799, "learning_rate": 4.520444444444445e-06, "loss": 0.8371, "step": 14830 }, { "epoch": 1.9343065693430657, "grad_norm": 10.427769660949707, "learning_rate": 4.5160000000000005e-06, "loss": 0.7662, "step": 14840 }, { "epoch": 1.9356100104275287, "grad_norm": 8.358800888061523, "learning_rate": 4.511555555555555e-06, "loss": 0.8258, "step": 14850 }, { "epoch": 1.9369134515119917, "grad_norm": 9.416501998901367, "learning_rate": 4.507111111111112e-06, "loss": 0.9169, "step": 14860 }, { "epoch": 1.9382168925964547, "grad_norm": 7.18595027923584, "learning_rate": 4.502666666666667e-06, "loss": 0.9116, "step": 14870 }, { "epoch": 1.9395203336809175, "grad_norm": 9.848918914794922, "learning_rate": 4.498222222222222e-06, "loss": 0.9086, "step": 14880 }, { "epoch": 1.9408237747653807, "grad_norm": 8.464194297790527, "learning_rate": 4.493777777777778e-06, "loss": 0.9313, "step": 14890 }, { "epoch": 1.9421272158498435, "grad_norm": 10.33420467376709, "learning_rate": 4.489333333333334e-06, "loss": 0.8663, "step": 14900 }, { "epoch": 1.9421272158498435, "eval/acc": 41.86046600341797, "step": 14900 }, { "epoch": 1.9421272158498435, "eval_loss": 2.7230069637298584, "eval_runtime": 0.5507, "eval_samples_per_second": 78.077, "eval_steps_per_second": 1.816, "step": 14900 }, { "epoch": 1.9434306569343067, "grad_norm": 9.30990982055664, "learning_rate": 4.4848888888888895e-06, "loss": 0.8596, "step": 14910 }, { "epoch": 1.9447340980187695, "grad_norm": 10.317497253417969, "learning_rate": 4.480444444444445e-06, "loss": 0.8723, "step": 14920 }, { "epoch": 1.9460375391032325, "grad_norm": 9.612330436706543, "learning_rate": 4.476e-06, "loss": 0.8811, "step": 14930 }, { "epoch": 1.9473409801876955, "grad_norm": 9.313211441040039, "learning_rate": 4.471555555555556e-06, "loss": 0.9004, "step": 14940 }, { "epoch": 1.9486444212721585, "grad_norm": 10.339431762695312, "learning_rate": 4.467111111111111e-06, "loss": 0.9151, "step": 14950 }, { "epoch": 1.9499478623566215, "grad_norm": 8.292842864990234, "learning_rate": 4.462666666666667e-06, "loss": 0.8738, "step": 14960 }, { "epoch": 1.9512513034410843, "grad_norm": 9.114078521728516, "learning_rate": 4.458222222222223e-06, "loss": 0.9177, "step": 14970 }, { "epoch": 1.9525547445255476, "grad_norm": 8.515254974365234, "learning_rate": 4.453777777777778e-06, "loss": 0.8424, "step": 14980 }, { "epoch": 1.9538581856100103, "grad_norm": 9.158382415771484, "learning_rate": 4.449333333333334e-06, "loss": 0.8819, "step": 14990 }, { "epoch": 1.9551616266944736, "grad_norm": 9.479183197021484, "learning_rate": 4.444888888888889e-06, "loss": 0.8487, "step": 15000 }, { "epoch": 1.9551616266944736, "eval/acc": 44.1860466003418, "step": 15000 }, { "epoch": 1.9551616266944736, "eval_loss": 2.7125165462493896, "eval_runtime": 0.552, "eval_samples_per_second": 77.896, "eval_steps_per_second": 1.812, "step": 15000 }, { "epoch": 1.9564650677789364, "grad_norm": 9.325910568237305, "learning_rate": 4.440444444444445e-06, "loss": 0.8671, "step": 15010 }, { "epoch": 1.9577685088633994, "grad_norm": 9.51240348815918, "learning_rate": 4.436e-06, "loss": 0.8318, "step": 15020 }, { "epoch": 1.9590719499478624, "grad_norm": 9.367854118347168, "learning_rate": 4.431555555555556e-06, "loss": 0.9166, "step": 15030 }, { "epoch": 1.9603753910323254, "grad_norm": 10.816835403442383, "learning_rate": 4.427111111111112e-06, "loss": 0.9097, "step": 15040 }, { "epoch": 1.9616788321167884, "grad_norm": 9.849903106689453, "learning_rate": 4.422666666666667e-06, "loss": 0.8977, "step": 15050 }, { "epoch": 1.9629822732012512, "grad_norm": 8.08956241607666, "learning_rate": 4.418222222222222e-06, "loss": 0.8673, "step": 15060 }, { "epoch": 1.9642857142857144, "grad_norm": 8.915260314941406, "learning_rate": 4.413777777777778e-06, "loss": 0.8121, "step": 15070 }, { "epoch": 1.9655891553701772, "grad_norm": 8.326038360595703, "learning_rate": 4.409333333333334e-06, "loss": 0.8525, "step": 15080 }, { "epoch": 1.9668925964546402, "grad_norm": 11.110740661621094, "learning_rate": 4.404888888888889e-06, "loss": 0.9314, "step": 15090 }, { "epoch": 1.9681960375391032, "grad_norm": 11.023222923278809, "learning_rate": 4.400444444444445e-06, "loss": 0.9959, "step": 15100 }, { "epoch": 1.9681960375391032, "eval/acc": 44.1860466003418, "step": 15100 }, { "epoch": 1.9681960375391032, "eval_loss": 2.74115252494812, "eval_runtime": 0.5506, "eval_samples_per_second": 78.1, "eval_steps_per_second": 1.816, "step": 15100 }, { "epoch": 1.9694994786235662, "grad_norm": 8.999839782714844, "learning_rate": 4.396e-06, "loss": 0.9375, "step": 15110 }, { "epoch": 1.9708029197080292, "grad_norm": 9.46464729309082, "learning_rate": 4.3915555555555565e-06, "loss": 0.9147, "step": 15120 }, { "epoch": 1.972106360792492, "grad_norm": 8.06843090057373, "learning_rate": 4.387111111111111e-06, "loss": 0.8655, "step": 15130 }, { "epoch": 1.9734098018769552, "grad_norm": 7.990191459655762, "learning_rate": 4.382666666666667e-06, "loss": 0.8895, "step": 15140 }, { "epoch": 1.974713242961418, "grad_norm": 8.925671577453613, "learning_rate": 4.378222222222223e-06, "loss": 0.8448, "step": 15150 }, { "epoch": 1.9760166840458813, "grad_norm": 9.917550086975098, "learning_rate": 4.3737777777777775e-06, "loss": 0.9078, "step": 15160 }, { "epoch": 1.977320125130344, "grad_norm": 9.35695743560791, "learning_rate": 4.369333333333334e-06, "loss": 0.9456, "step": 15170 }, { "epoch": 1.978623566214807, "grad_norm": 8.905348777770996, "learning_rate": 4.364888888888889e-06, "loss": 0.9329, "step": 15180 }, { "epoch": 1.97992700729927, "grad_norm": 8.441768646240234, "learning_rate": 4.360444444444445e-06, "loss": 0.8868, "step": 15190 }, { "epoch": 1.981230448383733, "grad_norm": 8.45506477355957, "learning_rate": 4.356e-06, "loss": 0.9057, "step": 15200 }, { "epoch": 1.981230448383733, "eval/acc": 44.1860466003418, "step": 15200 }, { "epoch": 1.981230448383733, "eval_loss": 2.7594377994537354, "eval_runtime": 0.5497, "eval_samples_per_second": 78.229, "eval_steps_per_second": 1.819, "step": 15200 }, { "epoch": 1.982533889468196, "grad_norm": 9.14447021484375, "learning_rate": 4.351555555555556e-06, "loss": 0.9506, "step": 15210 }, { "epoch": 1.9838373305526589, "grad_norm": 10.687137603759766, "learning_rate": 4.347111111111112e-06, "loss": 0.9221, "step": 15220 }, { "epoch": 1.985140771637122, "grad_norm": 8.657807350158691, "learning_rate": 4.342666666666667e-06, "loss": 0.881, "step": 15230 }, { "epoch": 1.986444212721585, "grad_norm": 9.527717590332031, "learning_rate": 4.338222222222222e-06, "loss": 0.9449, "step": 15240 }, { "epoch": 1.9877476538060481, "grad_norm": 9.977442741394043, "learning_rate": 4.333777777777778e-06, "loss": 0.9146, "step": 15250 }, { "epoch": 1.989051094890511, "grad_norm": 9.08598804473877, "learning_rate": 4.329333333333334e-06, "loss": 0.9198, "step": 15260 }, { "epoch": 1.990354535974974, "grad_norm": 9.887649536132812, "learning_rate": 4.324888888888889e-06, "loss": 0.8849, "step": 15270 }, { "epoch": 1.991657977059437, "grad_norm": 9.53846549987793, "learning_rate": 4.320444444444445e-06, "loss": 0.8554, "step": 15280 }, { "epoch": 1.9929614181439, "grad_norm": 10.340767860412598, "learning_rate": 4.316e-06, "loss": 0.9592, "step": 15290 }, { "epoch": 1.994264859228363, "grad_norm": 8.2573823928833, "learning_rate": 4.311555555555556e-06, "loss": 0.8768, "step": 15300 }, { "epoch": 1.994264859228363, "eval/acc": 41.86046600341797, "step": 15300 }, { "epoch": 1.994264859228363, "eval_loss": 2.7782442569732666, "eval_runtime": 0.5495, "eval_samples_per_second": 78.256, "eval_steps_per_second": 1.82, "step": 15300 }, { "epoch": 1.9955683003128257, "grad_norm": 8.565967559814453, "learning_rate": 4.307111111111111e-06, "loss": 0.876, "step": 15310 }, { "epoch": 1.996871741397289, "grad_norm": 8.339155197143555, "learning_rate": 4.302666666666667e-06, "loss": 0.8927, "step": 15320 }, { "epoch": 1.9981751824817517, "grad_norm": 8.305340766906738, "learning_rate": 4.298222222222223e-06, "loss": 0.9051, "step": 15330 }, { "epoch": 1.9994786235662148, "grad_norm": 7.999139785766602, "learning_rate": 4.293777777777778e-06, "loss": 0.9112, "step": 15340 }, { "epoch": 2.0007820646506778, "grad_norm": 7.436159610748291, "learning_rate": 4.289333333333334e-06, "loss": 0.8019, "step": 15350 }, { "epoch": 2.002085505735141, "grad_norm": 9.319280624389648, "learning_rate": 4.284888888888889e-06, "loss": 0.8252, "step": 15360 }, { "epoch": 2.0033889468196038, "grad_norm": 8.193472862243652, "learning_rate": 4.2804444444444445e-06, "loss": 0.8238, "step": 15370 }, { "epoch": 2.0046923879040666, "grad_norm": 9.857198715209961, "learning_rate": 4.276e-06, "loss": 0.8704, "step": 15380 }, { "epoch": 2.00599582898853, "grad_norm": 9.024703979492188, "learning_rate": 4.271555555555556e-06, "loss": 0.7243, "step": 15390 }, { "epoch": 2.0072992700729926, "grad_norm": 8.167558670043945, "learning_rate": 4.267111111111112e-06, "loss": 0.8248, "step": 15400 }, { "epoch": 2.0072992700729926, "eval/acc": 53.488372802734375, "step": 15400 }, { "epoch": 2.0072992700729926, "eval_loss": 1.8480530977249146, "eval_runtime": 0.6715, "eval_samples_per_second": 64.039, "eval_steps_per_second": 1.489, "step": 15400 }, { "epoch": 2.008602711157456, "grad_norm": 9.35039234161377, "learning_rate": 4.262666666666667e-06, "loss": 0.8212, "step": 15410 }, { "epoch": 2.0099061522419186, "grad_norm": 10.026947975158691, "learning_rate": 4.258222222222222e-06, "loss": 0.8591, "step": 15420 }, { "epoch": 2.011209593326382, "grad_norm": 8.662971496582031, "learning_rate": 4.253777777777779e-06, "loss": 0.919, "step": 15430 }, { "epoch": 2.0125130344108446, "grad_norm": 9.054789543151855, "learning_rate": 4.2493333333333335e-06, "loss": 0.786, "step": 15440 }, { "epoch": 2.0138164754953074, "grad_norm": 9.303889274597168, "learning_rate": 4.244888888888889e-06, "loss": 0.8007, "step": 15450 }, { "epoch": 2.0151199165797706, "grad_norm": 9.099271774291992, "learning_rate": 4.240444444444445e-06, "loss": 0.8589, "step": 15460 }, { "epoch": 2.0164233576642334, "grad_norm": 7.5457940101623535, "learning_rate": 4.236e-06, "loss": 0.7862, "step": 15470 }, { "epoch": 2.0177267987486966, "grad_norm": 8.39581298828125, "learning_rate": 4.231555555555556e-06, "loss": 0.7887, "step": 15480 }, { "epoch": 2.0190302398331594, "grad_norm": 8.494316101074219, "learning_rate": 4.227111111111111e-06, "loss": 0.7363, "step": 15490 }, { "epoch": 2.0203336809176227, "grad_norm": 11.257396697998047, "learning_rate": 4.222666666666667e-06, "loss": 0.8697, "step": 15500 }, { "epoch": 2.0203336809176227, "eval/acc": 53.488372802734375, "step": 15500 }, { "epoch": 2.0203336809176227, "eval_loss": 1.808847427368164, "eval_runtime": 0.5923, "eval_samples_per_second": 72.597, "eval_steps_per_second": 1.688, "step": 15500 }, { "epoch": 2.0216371220020855, "grad_norm": 10.112096786499023, "learning_rate": 4.2182222222222225e-06, "loss": 0.7427, "step": 15510 }, { "epoch": 2.0229405630865487, "grad_norm": 8.01047420501709, "learning_rate": 4.213777777777778e-06, "loss": 0.7793, "step": 15520 }, { "epoch": 2.0242440041710115, "grad_norm": 9.328739166259766, "learning_rate": 4.209333333333334e-06, "loss": 0.9251, "step": 15530 }, { "epoch": 2.0255474452554743, "grad_norm": 8.83740520477295, "learning_rate": 4.20488888888889e-06, "loss": 0.7822, "step": 15540 }, { "epoch": 2.0268508863399375, "grad_norm": 8.313032150268555, "learning_rate": 4.2004444444444445e-06, "loss": 0.8361, "step": 15550 }, { "epoch": 2.0281543274244003, "grad_norm": 10.842329978942871, "learning_rate": 4.196e-06, "loss": 0.8584, "step": 15560 }, { "epoch": 2.0294577685088635, "grad_norm": 7.673077583312988, "learning_rate": 4.191555555555556e-06, "loss": 0.8465, "step": 15570 }, { "epoch": 2.0307612095933263, "grad_norm": 9.108306884765625, "learning_rate": 4.1871111111111115e-06, "loss": 0.7594, "step": 15580 }, { "epoch": 2.0320646506777895, "grad_norm": 10.208512306213379, "learning_rate": 4.182666666666667e-06, "loss": 0.8516, "step": 15590 }, { "epoch": 2.0333680917622523, "grad_norm": 8.632047653198242, "learning_rate": 4.178222222222222e-06, "loss": 0.8486, "step": 15600 }, { "epoch": 2.0333680917622523, "eval/acc": 55.8139533996582, "step": 15600 }, { "epoch": 2.0333680917622523, "eval_loss": 1.751330852508545, "eval_runtime": 0.5923, "eval_samples_per_second": 72.595, "eval_steps_per_second": 1.688, "step": 15600 }, { "epoch": 2.0346715328467155, "grad_norm": 8.031896591186523, "learning_rate": 4.173777777777779e-06, "loss": 0.7694, "step": 15610 }, { "epoch": 2.0359749739311783, "grad_norm": 8.930005073547363, "learning_rate": 4.1693333333333335e-06, "loss": 0.8281, "step": 15620 }, { "epoch": 2.037278415015641, "grad_norm": 8.876046180725098, "learning_rate": 4.164888888888889e-06, "loss": 0.8195, "step": 15630 }, { "epoch": 2.0385818561001043, "grad_norm": 9.0652494430542, "learning_rate": 4.160444444444445e-06, "loss": 0.7868, "step": 15640 }, { "epoch": 2.039885297184567, "grad_norm": 6.786269187927246, "learning_rate": 4.1560000000000005e-06, "loss": 0.7362, "step": 15650 }, { "epoch": 2.0411887382690304, "grad_norm": 7.054460048675537, "learning_rate": 4.151555555555556e-06, "loss": 0.8101, "step": 15660 }, { "epoch": 2.042492179353493, "grad_norm": 9.174240112304688, "learning_rate": 4.147111111111111e-06, "loss": 0.8236, "step": 15670 }, { "epoch": 2.0437956204379564, "grad_norm": 7.598669052124023, "learning_rate": 4.142666666666667e-06, "loss": 0.7286, "step": 15680 }, { "epoch": 2.045099061522419, "grad_norm": 11.250917434692383, "learning_rate": 4.1382222222222224e-06, "loss": 0.9499, "step": 15690 }, { "epoch": 2.046402502606882, "grad_norm": 9.569265365600586, "learning_rate": 4.133777777777778e-06, "loss": 0.8767, "step": 15700 }, { "epoch": 2.046402502606882, "eval/acc": 55.8139533996582, "step": 15700 }, { "epoch": 2.046402502606882, "eval_loss": 1.7277277708053589, "eval_runtime": 0.5905, "eval_samples_per_second": 72.817, "eval_steps_per_second": 1.693, "step": 15700 }, { "epoch": 2.047705943691345, "grad_norm": 8.519957542419434, "learning_rate": 4.129333333333334e-06, "loss": 0.7883, "step": 15710 }, { "epoch": 2.049009384775808, "grad_norm": 7.968056678771973, "learning_rate": 4.1248888888888895e-06, "loss": 0.8081, "step": 15720 }, { "epoch": 2.050312825860271, "grad_norm": 8.23605728149414, "learning_rate": 4.120444444444444e-06, "loss": 0.796, "step": 15730 }, { "epoch": 2.051616266944734, "grad_norm": 7.167626857757568, "learning_rate": 4.116000000000001e-06, "loss": 0.8485, "step": 15740 }, { "epoch": 2.052919708029197, "grad_norm": 10.619958877563477, "learning_rate": 4.111555555555556e-06, "loss": 0.864, "step": 15750 }, { "epoch": 2.05422314911366, "grad_norm": 10.666949272155762, "learning_rate": 4.1071111111111114e-06, "loss": 0.8113, "step": 15760 }, { "epoch": 2.0555265901981232, "grad_norm": 9.4641695022583, "learning_rate": 4.102666666666667e-06, "loss": 0.8055, "step": 15770 }, { "epoch": 2.056830031282586, "grad_norm": 8.168855667114258, "learning_rate": 4.098222222222222e-06, "loss": 0.7808, "step": 15780 }, { "epoch": 2.058133472367049, "grad_norm": 9.684637069702148, "learning_rate": 4.0937777777777785e-06, "loss": 0.7952, "step": 15790 }, { "epoch": 2.059436913451512, "grad_norm": 10.373298645019531, "learning_rate": 4.089333333333333e-06, "loss": 0.8357, "step": 15800 }, { "epoch": 2.059436913451512, "eval/acc": 55.8139533996582, "step": 15800 }, { "epoch": 2.059436913451512, "eval_loss": 1.7130889892578125, "eval_runtime": 0.5935, "eval_samples_per_second": 72.457, "eval_steps_per_second": 1.685, "step": 15800 }, { "epoch": 2.060740354535975, "grad_norm": 9.320963859558105, "learning_rate": 4.084888888888889e-06, "loss": 0.835, "step": 15810 }, { "epoch": 2.062043795620438, "grad_norm": 9.566075325012207, "learning_rate": 4.080444444444445e-06, "loss": 0.9067, "step": 15820 }, { "epoch": 2.063347236704901, "grad_norm": 10.841265678405762, "learning_rate": 4.0760000000000004e-06, "loss": 0.812, "step": 15830 }, { "epoch": 2.064650677789364, "grad_norm": 9.906693458557129, "learning_rate": 4.071555555555556e-06, "loss": 0.8303, "step": 15840 }, { "epoch": 2.065954118873827, "grad_norm": 9.217839241027832, "learning_rate": 4.067111111111112e-06, "loss": 0.8075, "step": 15850 }, { "epoch": 2.06725755995829, "grad_norm": 9.24675464630127, "learning_rate": 4.062666666666667e-06, "loss": 0.8487, "step": 15860 }, { "epoch": 2.068561001042753, "grad_norm": 8.972613334655762, "learning_rate": 4.058222222222222e-06, "loss": 0.8685, "step": 15870 }, { "epoch": 2.0698644421272157, "grad_norm": 8.786476135253906, "learning_rate": 4.053777777777778e-06, "loss": 0.8267, "step": 15880 }, { "epoch": 2.071167883211679, "grad_norm": 8.891913414001465, "learning_rate": 4.049333333333334e-06, "loss": 0.8504, "step": 15890 }, { "epoch": 2.0724713242961417, "grad_norm": 8.825101852416992, "learning_rate": 4.0448888888888894e-06, "loss": 0.8349, "step": 15900 }, { "epoch": 2.0724713242961417, "eval/acc": 55.8139533996582, "step": 15900 }, { "epoch": 2.0724713242961417, "eval_loss": 1.6487507820129395, "eval_runtime": 0.592, "eval_samples_per_second": 72.635, "eval_steps_per_second": 1.689, "step": 15900 }, { "epoch": 2.073774765380605, "grad_norm": 8.748294830322266, "learning_rate": 4.040444444444444e-06, "loss": 0.8199, "step": 15910 }, { "epoch": 2.0750782064650677, "grad_norm": 9.041640281677246, "learning_rate": 4.036000000000001e-06, "loss": 0.8017, "step": 15920 }, { "epoch": 2.076381647549531, "grad_norm": 7.172785758972168, "learning_rate": 4.031555555555556e-06, "loss": 0.8374, "step": 15930 }, { "epoch": 2.0776850886339937, "grad_norm": 9.868871688842773, "learning_rate": 4.027111111111111e-06, "loss": 0.8409, "step": 15940 }, { "epoch": 2.078988529718457, "grad_norm": 9.898670196533203, "learning_rate": 4.022666666666667e-06, "loss": 0.8316, "step": 15950 }, { "epoch": 2.0802919708029197, "grad_norm": 8.59785270690918, "learning_rate": 4.018222222222223e-06, "loss": 0.8594, "step": 15960 }, { "epoch": 2.0815954118873825, "grad_norm": 8.578251838684082, "learning_rate": 4.0137777777777784e-06, "loss": 0.7638, "step": 15970 }, { "epoch": 2.0828988529718457, "grad_norm": 8.776795387268066, "learning_rate": 4.009333333333333e-06, "loss": 0.8671, "step": 15980 }, { "epoch": 2.0842022940563085, "grad_norm": 9.476344108581543, "learning_rate": 4.004888888888889e-06, "loss": 0.7472, "step": 15990 }, { "epoch": 2.0855057351407718, "grad_norm": 8.491658210754395, "learning_rate": 4.000444444444445e-06, "loss": 0.7733, "step": 16000 }, { "epoch": 2.0855057351407718, "eval/acc": 58.13953399658203, "step": 16000 }, { "epoch": 2.0855057351407718, "eval_loss": 1.703324794769287, "eval_runtime": 0.5918, "eval_samples_per_second": 72.654, "eval_steps_per_second": 1.69, "step": 16000 }, { "epoch": 2.0868091762252345, "grad_norm": 8.365692138671875, "learning_rate": 3.996e-06, "loss": 0.7216, "step": 16010 }, { "epoch": 2.0881126173096978, "grad_norm": 9.400110244750977, "learning_rate": 3.991555555555556e-06, "loss": 0.8316, "step": 16020 }, { "epoch": 2.0894160583941606, "grad_norm": 9.060704231262207, "learning_rate": 3.987111111111112e-06, "loss": 0.7972, "step": 16030 }, { "epoch": 2.090719499478624, "grad_norm": 9.138260841369629, "learning_rate": 3.982666666666667e-06, "loss": 0.8028, "step": 16040 }, { "epoch": 2.0920229405630866, "grad_norm": 10.73976993560791, "learning_rate": 3.978222222222223e-06, "loss": 0.8902, "step": 16050 }, { "epoch": 2.0933263816475494, "grad_norm": 8.873048782348633, "learning_rate": 3.973777777777778e-06, "loss": 0.7547, "step": 16060 }, { "epoch": 2.0946298227320126, "grad_norm": 7.576135158538818, "learning_rate": 3.969333333333334e-06, "loss": 0.7965, "step": 16070 }, { "epoch": 2.0959332638164754, "grad_norm": 8.750541687011719, "learning_rate": 3.964888888888889e-06, "loss": 0.7097, "step": 16080 }, { "epoch": 2.0972367049009386, "grad_norm": 9.333820343017578, "learning_rate": 3.960444444444444e-06, "loss": 0.8654, "step": 16090 }, { "epoch": 2.0985401459854014, "grad_norm": 9.30049991607666, "learning_rate": 3.956000000000001e-06, "loss": 0.8405, "step": 16100 }, { "epoch": 2.0985401459854014, "eval/acc": 58.13953399658203, "step": 16100 }, { "epoch": 2.0985401459854014, "eval_loss": 1.7304972410202026, "eval_runtime": 0.5938, "eval_samples_per_second": 72.409, "eval_steps_per_second": 1.684, "step": 16100 }, { "epoch": 2.0998435870698646, "grad_norm": 9.376851081848145, "learning_rate": 3.951555555555556e-06, "loss": 0.7574, "step": 16110 }, { "epoch": 2.1011470281543274, "grad_norm": 7.704640865325928, "learning_rate": 3.947111111111111e-06, "loss": 0.7922, "step": 16120 }, { "epoch": 2.10245046923879, "grad_norm": 9.256245613098145, "learning_rate": 3.942666666666667e-06, "loss": 0.797, "step": 16130 }, { "epoch": 2.1037539103232534, "grad_norm": 9.58582592010498, "learning_rate": 3.938222222222223e-06, "loss": 0.8053, "step": 16140 }, { "epoch": 2.105057351407716, "grad_norm": 9.159411430358887, "learning_rate": 3.933777777777778e-06, "loss": 0.8219, "step": 16150 }, { "epoch": 2.1063607924921794, "grad_norm": 7.1823225021362305, "learning_rate": 3.929333333333334e-06, "loss": 0.8525, "step": 16160 }, { "epoch": 2.1076642335766422, "grad_norm": 10.153651237487793, "learning_rate": 3.924888888888889e-06, "loss": 0.8098, "step": 16170 }, { "epoch": 2.1089676746611055, "grad_norm": 8.096991539001465, "learning_rate": 3.920444444444445e-06, "loss": 0.9122, "step": 16180 }, { "epoch": 2.1102711157455682, "grad_norm": 8.992088317871094, "learning_rate": 3.916e-06, "loss": 0.79, "step": 16190 }, { "epoch": 2.1115745568300315, "grad_norm": 8.178614616394043, "learning_rate": 3.911555555555556e-06, "loss": 0.8499, "step": 16200 }, { "epoch": 2.1115745568300315, "eval/acc": 58.13953399658203, "step": 16200 }, { "epoch": 2.1115745568300315, "eval_loss": 1.7082654237747192, "eval_runtime": 0.5931, "eval_samples_per_second": 72.497, "eval_steps_per_second": 1.686, "step": 16200 }, { "epoch": 2.1128779979144943, "grad_norm": 8.454838752746582, "learning_rate": 3.907111111111112e-06, "loss": 0.8184, "step": 16210 }, { "epoch": 2.114181438998957, "grad_norm": 8.823241233825684, "learning_rate": 3.9026666666666665e-06, "loss": 0.8508, "step": 16220 }, { "epoch": 2.1154848800834203, "grad_norm": 8.29101848602295, "learning_rate": 3.898222222222223e-06, "loss": 0.7465, "step": 16230 }, { "epoch": 2.116788321167883, "grad_norm": 8.946051597595215, "learning_rate": 3.893777777777778e-06, "loss": 0.7212, "step": 16240 }, { "epoch": 2.1180917622523463, "grad_norm": 10.07369613647461, "learning_rate": 3.889333333333334e-06, "loss": 0.8236, "step": 16250 }, { "epoch": 2.119395203336809, "grad_norm": 9.569952964782715, "learning_rate": 3.884888888888889e-06, "loss": 0.7614, "step": 16260 }, { "epoch": 2.1206986444212723, "grad_norm": 10.556382179260254, "learning_rate": 3.880444444444445e-06, "loss": 0.8268, "step": 16270 }, { "epoch": 2.122002085505735, "grad_norm": 9.503656387329102, "learning_rate": 3.876000000000001e-06, "loss": 0.8625, "step": 16280 }, { "epoch": 2.1233055265901983, "grad_norm": 7.187640190124512, "learning_rate": 3.8715555555555555e-06, "loss": 0.8264, "step": 16290 }, { "epoch": 2.124608967674661, "grad_norm": 9.812830924987793, "learning_rate": 3.867111111111111e-06, "loss": 0.8047, "step": 16300 }, { "epoch": 2.124608967674661, "eval/acc": 58.13953399658203, "step": 16300 }, { "epoch": 2.124608967674661, "eval_loss": 1.7577248811721802, "eval_runtime": 0.5938, "eval_samples_per_second": 72.415, "eval_steps_per_second": 1.684, "step": 16300 }, { "epoch": 2.125912408759124, "grad_norm": 6.756389141082764, "learning_rate": 3.862666666666667e-06, "loss": 0.8275, "step": 16310 }, { "epoch": 2.127215849843587, "grad_norm": 9.148049354553223, "learning_rate": 3.858222222222223e-06, "loss": 0.7844, "step": 16320 }, { "epoch": 2.12851929092805, "grad_norm": 11.886138916015625, "learning_rate": 3.853777777777778e-06, "loss": 0.8472, "step": 16330 }, { "epoch": 2.129822732012513, "grad_norm": 9.415099143981934, "learning_rate": 3.849333333333334e-06, "loss": 0.8273, "step": 16340 }, { "epoch": 2.131126173096976, "grad_norm": 10.05582332611084, "learning_rate": 3.844888888888889e-06, "loss": 0.7572, "step": 16350 }, { "epoch": 2.132429614181439, "grad_norm": 8.643566131591797, "learning_rate": 3.840444444444445e-06, "loss": 0.887, "step": 16360 }, { "epoch": 2.133733055265902, "grad_norm": 8.855241775512695, "learning_rate": 3.836e-06, "loss": 0.8601, "step": 16370 }, { "epoch": 2.1350364963503647, "grad_norm": 11.129681587219238, "learning_rate": 3.831555555555556e-06, "loss": 0.8409, "step": 16380 }, { "epoch": 2.136339937434828, "grad_norm": 11.98705005645752, "learning_rate": 3.827111111111112e-06, "loss": 0.855, "step": 16390 }, { "epoch": 2.1376433785192908, "grad_norm": 7.61968469619751, "learning_rate": 3.8226666666666664e-06, "loss": 0.6953, "step": 16400 }, { "epoch": 2.1376433785192908, "eval/acc": 58.13953399658203, "step": 16400 }, { "epoch": 2.1376433785192908, "eval_loss": 1.7353113889694214, "eval_runtime": 0.5953, "eval_samples_per_second": 72.237, "eval_steps_per_second": 1.68, "step": 16400 }, { "epoch": 2.138946819603754, "grad_norm": 8.824946403503418, "learning_rate": 3.818222222222223e-06, "loss": 0.7384, "step": 16410 }, { "epoch": 2.1402502606882168, "grad_norm": 9.723782539367676, "learning_rate": 3.813777777777778e-06, "loss": 0.8602, "step": 16420 }, { "epoch": 2.14155370177268, "grad_norm": 9.958561897277832, "learning_rate": 3.809333333333334e-06, "loss": 0.8567, "step": 16430 }, { "epoch": 2.142857142857143, "grad_norm": 9.253443717956543, "learning_rate": 3.804888888888889e-06, "loss": 0.7919, "step": 16440 }, { "epoch": 2.144160583941606, "grad_norm": 8.461666107177734, "learning_rate": 3.800444444444445e-06, "loss": 0.8155, "step": 16450 }, { "epoch": 2.145464025026069, "grad_norm": 9.44472885131836, "learning_rate": 3.796e-06, "loss": 0.9095, "step": 16460 }, { "epoch": 2.1467674661105316, "grad_norm": 8.522442817687988, "learning_rate": 3.7915555555555563e-06, "loss": 0.796, "step": 16470 }, { "epoch": 2.148070907194995, "grad_norm": 8.793846130371094, "learning_rate": 3.7871111111111115e-06, "loss": 0.8318, "step": 16480 }, { "epoch": 2.1493743482794576, "grad_norm": 8.55291748046875, "learning_rate": 3.782666666666667e-06, "loss": 0.8397, "step": 16490 }, { "epoch": 2.150677789363921, "grad_norm": 8.956378936767578, "learning_rate": 3.7782222222222225e-06, "loss": 0.7799, "step": 16500 }, { "epoch": 2.150677789363921, "eval/acc": 55.8139533996582, "step": 16500 }, { "epoch": 2.150677789363921, "eval_loss": 1.731651782989502, "eval_runtime": 0.6132, "eval_samples_per_second": 70.124, "eval_steps_per_second": 1.631, "step": 16500 }, { "epoch": 2.1519812304483836, "grad_norm": 8.988715171813965, "learning_rate": 3.7737777777777778e-06, "loss": 0.826, "step": 16510 }, { "epoch": 2.153284671532847, "grad_norm": 8.181129455566406, "learning_rate": 3.769333333333334e-06, "loss": 0.8415, "step": 16520 }, { "epoch": 2.1545881126173096, "grad_norm": 8.54606819152832, "learning_rate": 3.764888888888889e-06, "loss": 0.8464, "step": 16530 }, { "epoch": 2.155891553701773, "grad_norm": 8.382131576538086, "learning_rate": 3.760444444444445e-06, "loss": 0.7763, "step": 16540 }, { "epoch": 2.1571949947862357, "grad_norm": 8.597378730773926, "learning_rate": 3.756e-06, "loss": 0.853, "step": 16550 }, { "epoch": 2.1584984358706985, "grad_norm": 9.286585807800293, "learning_rate": 3.7515555555555562e-06, "loss": 0.8351, "step": 16560 }, { "epoch": 2.1598018769551617, "grad_norm": 9.251093864440918, "learning_rate": 3.7471111111111115e-06, "loss": 0.8411, "step": 16570 }, { "epoch": 2.1611053180396245, "grad_norm": 8.470297813415527, "learning_rate": 3.742666666666667e-06, "loss": 0.7818, "step": 16580 }, { "epoch": 2.1624087591240877, "grad_norm": 8.684773445129395, "learning_rate": 3.7382222222222225e-06, "loss": 0.777, "step": 16590 }, { "epoch": 2.1637122002085505, "grad_norm": 8.598529815673828, "learning_rate": 3.7337777777777777e-06, "loss": 0.8065, "step": 16600 }, { "epoch": 2.1637122002085505, "eval/acc": 58.13953399658203, "step": 16600 }, { "epoch": 2.1637122002085505, "eval_loss": 1.7437891960144043, "eval_runtime": 0.6496, "eval_samples_per_second": 66.197, "eval_steps_per_second": 1.539, "step": 16600 }, { "epoch": 2.1650156412930137, "grad_norm": 10.93813419342041, "learning_rate": 3.729333333333334e-06, "loss": 0.7977, "step": 16610 }, { "epoch": 2.1663190823774765, "grad_norm": 7.3532490730285645, "learning_rate": 3.724888888888889e-06, "loss": 0.7108, "step": 16620 }, { "epoch": 2.1676225234619393, "grad_norm": 9.04556655883789, "learning_rate": 3.720444444444445e-06, "loss": 0.8417, "step": 16630 }, { "epoch": 2.1689259645464025, "grad_norm": 10.736461639404297, "learning_rate": 3.716e-06, "loss": 0.8135, "step": 16640 }, { "epoch": 2.1702294056308653, "grad_norm": 9.682230949401855, "learning_rate": 3.711555555555556e-06, "loss": 0.7997, "step": 16650 }, { "epoch": 2.1715328467153285, "grad_norm": 10.539389610290527, "learning_rate": 3.7071111111111115e-06, "loss": 0.8649, "step": 16660 }, { "epoch": 2.1728362877997913, "grad_norm": 10.269004821777344, "learning_rate": 3.702666666666667e-06, "loss": 0.8461, "step": 16670 }, { "epoch": 2.1741397288842546, "grad_norm": 10.077771186828613, "learning_rate": 3.6982222222222224e-06, "loss": 0.8135, "step": 16680 }, { "epoch": 2.1754431699687173, "grad_norm": 9.409977912902832, "learning_rate": 3.6937777777777785e-06, "loss": 0.7994, "step": 16690 }, { "epoch": 2.1767466110531806, "grad_norm": 13.457232475280762, "learning_rate": 3.689333333333334e-06, "loss": 0.7914, "step": 16700 }, { "epoch": 2.1767466110531806, "eval/acc": 58.13953399658203, "step": 16700 }, { "epoch": 2.1767466110531806, "eval_loss": 1.7389909029006958, "eval_runtime": 0.5955, "eval_samples_per_second": 72.208, "eval_steps_per_second": 1.679, "step": 16700 }, { "epoch": 2.1780500521376434, "grad_norm": 9.8069429397583, "learning_rate": 3.684888888888889e-06, "loss": 0.8296, "step": 16710 }, { "epoch": 2.1793534932221066, "grad_norm": 9.093622207641602, "learning_rate": 3.6804444444444448e-06, "loss": 0.8165, "step": 16720 }, { "epoch": 2.1806569343065694, "grad_norm": 9.013671875, "learning_rate": 3.676e-06, "loss": 0.7776, "step": 16730 }, { "epoch": 2.181960375391032, "grad_norm": 8.411311149597168, "learning_rate": 3.671555555555556e-06, "loss": 0.8925, "step": 16740 }, { "epoch": 2.1832638164754954, "grad_norm": 7.932659149169922, "learning_rate": 3.6671111111111114e-06, "loss": 0.7949, "step": 16750 }, { "epoch": 2.184567257559958, "grad_norm": 7.752965927124023, "learning_rate": 3.662666666666667e-06, "loss": 0.8059, "step": 16760 }, { "epoch": 2.1858706986444214, "grad_norm": 8.397247314453125, "learning_rate": 3.6582222222222224e-06, "loss": 0.8152, "step": 16770 }, { "epoch": 2.187174139728884, "grad_norm": 9.063042640686035, "learning_rate": 3.6537777777777785e-06, "loss": 0.9003, "step": 16780 }, { "epoch": 2.1884775808133474, "grad_norm": 8.312545776367188, "learning_rate": 3.6493333333333338e-06, "loss": 0.8314, "step": 16790 }, { "epoch": 2.18978102189781, "grad_norm": 8.772299766540527, "learning_rate": 3.644888888888889e-06, "loss": 0.822, "step": 16800 }, { "epoch": 2.18978102189781, "eval/acc": 58.13953399658203, "step": 16800 }, { "epoch": 2.18978102189781, "eval_loss": 1.6912325620651245, "eval_runtime": 0.5962, "eval_samples_per_second": 72.127, "eval_steps_per_second": 1.677, "step": 16800 }, { "epoch": 2.191084462982273, "grad_norm": 8.423686981201172, "learning_rate": 3.6404444444444447e-06, "loss": 0.7787, "step": 16810 }, { "epoch": 2.1923879040667362, "grad_norm": 12.527349472045898, "learning_rate": 3.636e-06, "loss": 0.8614, "step": 16820 }, { "epoch": 2.193691345151199, "grad_norm": 9.030877113342285, "learning_rate": 3.631555555555556e-06, "loss": 0.8539, "step": 16830 }, { "epoch": 2.1949947862356622, "grad_norm": 7.820774555206299, "learning_rate": 3.6271111111111114e-06, "loss": 0.7738, "step": 16840 }, { "epoch": 2.196298227320125, "grad_norm": 10.639665603637695, "learning_rate": 3.622666666666667e-06, "loss": 0.9302, "step": 16850 }, { "epoch": 2.1976016684045883, "grad_norm": 8.641568183898926, "learning_rate": 3.6182222222222223e-06, "loss": 0.7162, "step": 16860 }, { "epoch": 2.198905109489051, "grad_norm": 7.5878825187683105, "learning_rate": 3.6137777777777785e-06, "loss": 0.8653, "step": 16870 }, { "epoch": 2.2002085505735143, "grad_norm": 10.108068466186523, "learning_rate": 3.6093333333333337e-06, "loss": 0.9035, "step": 16880 }, { "epoch": 2.201511991657977, "grad_norm": 10.614107131958008, "learning_rate": 3.6048888888888894e-06, "loss": 0.8561, "step": 16890 }, { "epoch": 2.20281543274244, "grad_norm": 9.28848934173584, "learning_rate": 3.6004444444444447e-06, "loss": 0.8372, "step": 16900 }, { "epoch": 2.20281543274244, "eval/acc": 58.13953399658203, "step": 16900 }, { "epoch": 2.20281543274244, "eval_loss": 1.7112364768981934, "eval_runtime": 0.5944, "eval_samples_per_second": 72.342, "eval_steps_per_second": 1.682, "step": 16900 }, { "epoch": 2.204118873826903, "grad_norm": 9.058589935302734, "learning_rate": 3.596e-06, "loss": 0.8493, "step": 16910 }, { "epoch": 2.205422314911366, "grad_norm": 8.164351463317871, "learning_rate": 3.591555555555556e-06, "loss": 0.8426, "step": 16920 }, { "epoch": 2.206725755995829, "grad_norm": 9.746597290039062, "learning_rate": 3.5871111111111113e-06, "loss": 0.8459, "step": 16930 }, { "epoch": 2.208029197080292, "grad_norm": 8.436588287353516, "learning_rate": 3.582666666666667e-06, "loss": 0.8397, "step": 16940 }, { "epoch": 2.209332638164755, "grad_norm": 8.43806266784668, "learning_rate": 3.5782222222222223e-06, "loss": 0.7691, "step": 16950 }, { "epoch": 2.210636079249218, "grad_norm": 9.413714408874512, "learning_rate": 3.5737777777777784e-06, "loss": 0.7806, "step": 16960 }, { "epoch": 2.211939520333681, "grad_norm": 10.42586612701416, "learning_rate": 3.5693333333333337e-06, "loss": 0.8395, "step": 16970 }, { "epoch": 2.213242961418144, "grad_norm": 8.827805519104004, "learning_rate": 3.5648888888888894e-06, "loss": 0.868, "step": 16980 }, { "epoch": 2.2145464025026067, "grad_norm": 8.495600700378418, "learning_rate": 3.5604444444444447e-06, "loss": 0.838, "step": 16990 }, { "epoch": 2.21584984358707, "grad_norm": 9.636509895324707, "learning_rate": 3.5560000000000008e-06, "loss": 0.8174, "step": 17000 }, { "epoch": 2.21584984358707, "eval/acc": 58.13953399658203, "step": 17000 }, { "epoch": 2.21584984358707, "eval_loss": 1.6880416870117188, "eval_runtime": 2.5356, "eval_samples_per_second": 16.959, "eval_steps_per_second": 0.394, "step": 17000 }, { "epoch": 2.2171532846715327, "grad_norm": 8.30799674987793, "learning_rate": 3.551555555555556e-06, "loss": 0.8394, "step": 17010 }, { "epoch": 2.218456725755996, "grad_norm": 8.65863037109375, "learning_rate": 3.5471111111111113e-06, "loss": 0.8411, "step": 17020 }, { "epoch": 2.2197601668404587, "grad_norm": 9.91065502166748, "learning_rate": 3.542666666666667e-06, "loss": 0.8738, "step": 17030 }, { "epoch": 2.221063607924922, "grad_norm": 8.491907119750977, "learning_rate": 3.5382222222222223e-06, "loss": 0.8289, "step": 17040 }, { "epoch": 2.2223670490093848, "grad_norm": 8.471418380737305, "learning_rate": 3.5337777777777784e-06, "loss": 0.8105, "step": 17050 }, { "epoch": 2.2236704900938475, "grad_norm": 8.999011993408203, "learning_rate": 3.5293333333333336e-06, "loss": 0.8201, "step": 17060 }, { "epoch": 2.2249739311783108, "grad_norm": 8.173701286315918, "learning_rate": 3.5248888888888893e-06, "loss": 0.7309, "step": 17070 }, { "epoch": 2.2262773722627736, "grad_norm": 9.306352615356445, "learning_rate": 3.5204444444444446e-06, "loss": 0.8052, "step": 17080 }, { "epoch": 2.227580813347237, "grad_norm": 11.817889213562012, "learning_rate": 3.5160000000000007e-06, "loss": 0.8774, "step": 17090 }, { "epoch": 2.2288842544316996, "grad_norm": 8.985753059387207, "learning_rate": 3.511555555555556e-06, "loss": 0.849, "step": 17100 }, { "epoch": 2.2288842544316996, "eval/acc": 58.13953399658203, "step": 17100 }, { "epoch": 2.2288842544316996, "eval_loss": 1.6983453035354614, "eval_runtime": 1.0239, "eval_samples_per_second": 41.998, "eval_steps_per_second": 0.977, "step": 17100 }, { "epoch": 2.230187695516163, "grad_norm": 8.127403259277344, "learning_rate": 3.5071111111111113e-06, "loss": 0.7885, "step": 17110 }, { "epoch": 2.2314911366006256, "grad_norm": 8.712991714477539, "learning_rate": 3.502666666666667e-06, "loss": 0.8378, "step": 17120 }, { "epoch": 2.232794577685089, "grad_norm": 9.402266502380371, "learning_rate": 3.4982222222222222e-06, "loss": 0.8458, "step": 17130 }, { "epoch": 2.2340980187695516, "grad_norm": 9.683643341064453, "learning_rate": 3.4937777777777783e-06, "loss": 0.8539, "step": 17140 }, { "epoch": 2.2354014598540144, "grad_norm": 9.23348617553711, "learning_rate": 3.4893333333333336e-06, "loss": 0.8676, "step": 17150 }, { "epoch": 2.2367049009384776, "grad_norm": 9.799532890319824, "learning_rate": 3.4848888888888893e-06, "loss": 0.8149, "step": 17160 }, { "epoch": 2.2380083420229404, "grad_norm": 9.09245491027832, "learning_rate": 3.4804444444444446e-06, "loss": 0.7953, "step": 17170 }, { "epoch": 2.2393117831074036, "grad_norm": 7.876502990722656, "learning_rate": 3.4760000000000007e-06, "loss": 0.7861, "step": 17180 }, { "epoch": 2.2406152241918664, "grad_norm": 11.36570930480957, "learning_rate": 3.471555555555556e-06, "loss": 0.9264, "step": 17190 }, { "epoch": 2.2419186652763297, "grad_norm": 7.699188709259033, "learning_rate": 3.4671111111111116e-06, "loss": 0.8733, "step": 17200 }, { "epoch": 2.2419186652763297, "eval/acc": 58.13953399658203, "step": 17200 }, { "epoch": 2.2419186652763297, "eval_loss": 1.7120801210403442, "eval_runtime": 0.5944, "eval_samples_per_second": 72.343, "eval_steps_per_second": 1.682, "step": 17200 }, { "epoch": 2.2432221063607924, "grad_norm": 9.601641654968262, "learning_rate": 3.462666666666667e-06, "loss": 0.9126, "step": 17210 }, { "epoch": 2.2445255474452557, "grad_norm": 8.544979095458984, "learning_rate": 3.458222222222222e-06, "loss": 0.7927, "step": 17220 }, { "epoch": 2.2458289885297185, "grad_norm": 9.429412841796875, "learning_rate": 3.4537777777777783e-06, "loss": 0.7645, "step": 17230 }, { "epoch": 2.2471324296141812, "grad_norm": 9.055729866027832, "learning_rate": 3.4493333333333336e-06, "loss": 0.8892, "step": 17240 }, { "epoch": 2.2484358706986445, "grad_norm": 8.690081596374512, "learning_rate": 3.4448888888888893e-06, "loss": 0.8491, "step": 17250 }, { "epoch": 2.2497393117831073, "grad_norm": 7.440013885498047, "learning_rate": 3.4404444444444445e-06, "loss": 0.8624, "step": 17260 }, { "epoch": 2.2510427528675705, "grad_norm": 8.565265655517578, "learning_rate": 3.4360000000000006e-06, "loss": 0.8301, "step": 17270 }, { "epoch": 2.2523461939520333, "grad_norm": 8.482741355895996, "learning_rate": 3.431555555555556e-06, "loss": 0.7969, "step": 17280 }, { "epoch": 2.2536496350364965, "grad_norm": 9.109694480895996, "learning_rate": 3.4271111111111116e-06, "loss": 0.797, "step": 17290 }, { "epoch": 2.2549530761209593, "grad_norm": 11.01980209350586, "learning_rate": 3.422666666666667e-06, "loss": 0.8017, "step": 17300 }, { "epoch": 2.2549530761209593, "eval/acc": 58.13953399658203, "step": 17300 }, { "epoch": 2.2549530761209593, "eval_loss": 1.7680262327194214, "eval_runtime": 0.5958, "eval_samples_per_second": 72.167, "eval_steps_per_second": 1.678, "step": 17300 }, { "epoch": 2.256256517205422, "grad_norm": 11.009764671325684, "learning_rate": 3.4182222222222226e-06, "loss": 0.8278, "step": 17310 }, { "epoch": 2.2575599582898853, "grad_norm": 10.312722206115723, "learning_rate": 3.4137777777777783e-06, "loss": 0.7284, "step": 17320 }, { "epoch": 2.258863399374348, "grad_norm": 9.148292541503906, "learning_rate": 3.4093333333333335e-06, "loss": 0.8191, "step": 17330 }, { "epoch": 2.2601668404588113, "grad_norm": 8.659215927124023, "learning_rate": 3.4048888888888892e-06, "loss": 0.844, "step": 17340 }, { "epoch": 2.261470281543274, "grad_norm": 9.481653213500977, "learning_rate": 3.4004444444444445e-06, "loss": 0.7708, "step": 17350 }, { "epoch": 2.2627737226277373, "grad_norm": 7.66526460647583, "learning_rate": 3.3960000000000006e-06, "loss": 0.8532, "step": 17360 }, { "epoch": 2.2640771637122, "grad_norm": 9.634511947631836, "learning_rate": 3.391555555555556e-06, "loss": 0.9005, "step": 17370 }, { "epoch": 2.2653806047966634, "grad_norm": 8.340860366821289, "learning_rate": 3.3871111111111116e-06, "loss": 0.7791, "step": 17380 }, { "epoch": 2.266684045881126, "grad_norm": 9.459833145141602, "learning_rate": 3.382666666666667e-06, "loss": 0.8383, "step": 17390 }, { "epoch": 2.2679874869655894, "grad_norm": 9.933736801147461, "learning_rate": 3.3782222222222225e-06, "loss": 0.7861, "step": 17400 }, { "epoch": 2.2679874869655894, "eval/acc": 58.13953399658203, "step": 17400 }, { "epoch": 2.2679874869655894, "eval_loss": 1.7251276969909668, "eval_runtime": 0.6661, "eval_samples_per_second": 64.552, "eval_steps_per_second": 1.501, "step": 17400 }, { "epoch": 2.269290928050052, "grad_norm": 7.369382381439209, "learning_rate": 3.3737777777777782e-06, "loss": 0.7912, "step": 17410 }, { "epoch": 2.270594369134515, "grad_norm": 9.374004364013672, "learning_rate": 3.3693333333333335e-06, "loss": 0.8077, "step": 17420 }, { "epoch": 2.271897810218978, "grad_norm": 7.642943859100342, "learning_rate": 3.364888888888889e-06, "loss": 0.808, "step": 17430 }, { "epoch": 2.273201251303441, "grad_norm": 8.486736297607422, "learning_rate": 3.3604444444444444e-06, "loss": 0.756, "step": 17440 }, { "epoch": 2.274504692387904, "grad_norm": 10.820671081542969, "learning_rate": 3.3560000000000006e-06, "loss": 0.7724, "step": 17450 }, { "epoch": 2.275808133472367, "grad_norm": 9.143059730529785, "learning_rate": 3.351555555555556e-06, "loss": 0.8139, "step": 17460 }, { "epoch": 2.27711157455683, "grad_norm": 10.45938491821289, "learning_rate": 3.3471111111111115e-06, "loss": 0.7952, "step": 17470 }, { "epoch": 2.278415015641293, "grad_norm": 11.475374221801758, "learning_rate": 3.342666666666667e-06, "loss": 0.813, "step": 17480 }, { "epoch": 2.279718456725756, "grad_norm": 7.964687347412109, "learning_rate": 3.3382222222222225e-06, "loss": 0.8363, "step": 17490 }, { "epoch": 2.281021897810219, "grad_norm": 10.092107772827148, "learning_rate": 3.333777777777778e-06, "loss": 0.8142, "step": 17500 }, { "epoch": 2.281021897810219, "eval/acc": 58.13953399658203, "step": 17500 }, { "epoch": 2.281021897810219, "eval_loss": 1.7565020322799683, "eval_runtime": 0.6497, "eval_samples_per_second": 66.185, "eval_steps_per_second": 1.539, "step": 17500 }, { "epoch": 2.282325338894682, "grad_norm": 8.953532218933105, "learning_rate": 3.329333333333334e-06, "loss": 0.8225, "step": 17510 }, { "epoch": 2.283628779979145, "grad_norm": 8.757415771484375, "learning_rate": 3.324888888888889e-06, "loss": 0.8243, "step": 17520 }, { "epoch": 2.284932221063608, "grad_norm": 8.159775733947754, "learning_rate": 3.3204444444444444e-06, "loss": 0.744, "step": 17530 }, { "epoch": 2.286235662148071, "grad_norm": 8.831269264221191, "learning_rate": 3.3160000000000005e-06, "loss": 0.824, "step": 17540 }, { "epoch": 2.287539103232534, "grad_norm": 10.947331428527832, "learning_rate": 3.311555555555556e-06, "loss": 0.7939, "step": 17550 }, { "epoch": 2.2888425443169966, "grad_norm": 11.166162490844727, "learning_rate": 3.3071111111111115e-06, "loss": 0.7827, "step": 17560 }, { "epoch": 2.29014598540146, "grad_norm": 8.819451332092285, "learning_rate": 3.3026666666666668e-06, "loss": 0.821, "step": 17570 }, { "epoch": 2.2914494264859226, "grad_norm": 8.286727905273438, "learning_rate": 3.298222222222223e-06, "loss": 0.8177, "step": 17580 }, { "epoch": 2.292752867570386, "grad_norm": 8.996437072753906, "learning_rate": 3.293777777777778e-06, "loss": 0.79, "step": 17590 }, { "epoch": 2.2940563086548487, "grad_norm": 10.241941452026367, "learning_rate": 3.289333333333334e-06, "loss": 0.7033, "step": 17600 }, { "epoch": 2.2940563086548487, "eval/acc": 58.13953399658203, "step": 17600 }, { "epoch": 2.2940563086548487, "eval_loss": 1.754825472831726, "eval_runtime": 0.5977, "eval_samples_per_second": 71.941, "eval_steps_per_second": 1.673, "step": 17600 }, { "epoch": 2.295359749739312, "grad_norm": 7.804232597351074, "learning_rate": 3.284888888888889e-06, "loss": 0.8224, "step": 17610 }, { "epoch": 2.2966631908237747, "grad_norm": 11.357056617736816, "learning_rate": 3.280444444444445e-06, "loss": 0.8497, "step": 17620 }, { "epoch": 2.297966631908238, "grad_norm": 11.514175415039062, "learning_rate": 3.2760000000000005e-06, "loss": 0.8414, "step": 17630 }, { "epoch": 2.2992700729927007, "grad_norm": 8.889968872070312, "learning_rate": 3.2715555555555558e-06, "loss": 0.8768, "step": 17640 }, { "epoch": 2.300573514077164, "grad_norm": 7.368014335632324, "learning_rate": 3.2671111111111114e-06, "loss": 0.7285, "step": 17650 }, { "epoch": 2.3018769551616267, "grad_norm": 8.335658073425293, "learning_rate": 3.2626666666666667e-06, "loss": 0.8032, "step": 17660 }, { "epoch": 2.3031803962460895, "grad_norm": 9.069354057312012, "learning_rate": 3.258222222222223e-06, "loss": 0.7602, "step": 17670 }, { "epoch": 2.3044838373305527, "grad_norm": 9.623358726501465, "learning_rate": 3.253777777777778e-06, "loss": 0.8184, "step": 17680 }, { "epoch": 2.3057872784150155, "grad_norm": 8.727527618408203, "learning_rate": 3.249333333333334e-06, "loss": 0.7464, "step": 17690 }, { "epoch": 2.3070907194994787, "grad_norm": 10.096443176269531, "learning_rate": 3.244888888888889e-06, "loss": 0.8697, "step": 17700 }, { "epoch": 2.3070907194994787, "eval/acc": 55.8139533996582, "step": 17700 }, { "epoch": 2.3070907194994787, "eval_loss": 1.7222002744674683, "eval_runtime": 0.6006, "eval_samples_per_second": 71.597, "eval_steps_per_second": 1.665, "step": 17700 }, { "epoch": 2.3083941605839415, "grad_norm": 8.81919002532959, "learning_rate": 3.2404444444444448e-06, "loss": 0.8745, "step": 17710 }, { "epoch": 2.3096976016684048, "grad_norm": 8.428309440612793, "learning_rate": 3.2360000000000004e-06, "loss": 0.8359, "step": 17720 }, { "epoch": 2.3110010427528676, "grad_norm": 8.93838882446289, "learning_rate": 3.2315555555555557e-06, "loss": 0.7349, "step": 17730 }, { "epoch": 2.3123044838373303, "grad_norm": 10.623334884643555, "learning_rate": 3.2271111111111114e-06, "loss": 0.8263, "step": 17740 }, { "epoch": 2.3136079249217936, "grad_norm": 8.26021957397461, "learning_rate": 3.2226666666666667e-06, "loss": 0.7624, "step": 17750 }, { "epoch": 2.3149113660062564, "grad_norm": 8.875035285949707, "learning_rate": 3.2182222222222228e-06, "loss": 0.821, "step": 17760 }, { "epoch": 2.3162148070907196, "grad_norm": 7.648210525512695, "learning_rate": 3.213777777777778e-06, "loss": 0.8093, "step": 17770 }, { "epoch": 2.3175182481751824, "grad_norm": 8.669422149658203, "learning_rate": 3.2093333333333337e-06, "loss": 0.8163, "step": 17780 }, { "epoch": 2.3188216892596456, "grad_norm": 7.551290035247803, "learning_rate": 3.204888888888889e-06, "loss": 0.7905, "step": 17790 }, { "epoch": 2.3201251303441084, "grad_norm": 9.070842742919922, "learning_rate": 3.2004444444444447e-06, "loss": 0.7389, "step": 17800 }, { "epoch": 2.3201251303441084, "eval/acc": 55.8139533996582, "step": 17800 }, { "epoch": 2.3201251303441084, "eval_loss": 1.7223340272903442, "eval_runtime": 0.5983, "eval_samples_per_second": 71.876, "eval_steps_per_second": 1.672, "step": 17800 }, { "epoch": 2.3214285714285716, "grad_norm": 10.925883293151855, "learning_rate": 3.1960000000000004e-06, "loss": 0.8962, "step": 17810 }, { "epoch": 2.3227320125130344, "grad_norm": 9.021198272705078, "learning_rate": 3.191555555555556e-06, "loss": 0.85, "step": 17820 }, { "epoch": 2.3240354535974976, "grad_norm": 9.038586616516113, "learning_rate": 3.1871111111111114e-06, "loss": 0.8027, "step": 17830 }, { "epoch": 2.3253388946819604, "grad_norm": 9.180644989013672, "learning_rate": 3.1826666666666666e-06, "loss": 0.789, "step": 17840 }, { "epoch": 2.326642335766423, "grad_norm": 10.611841201782227, "learning_rate": 3.1782222222222227e-06, "loss": 0.8333, "step": 17850 }, { "epoch": 2.3279457768508864, "grad_norm": 9.16645336151123, "learning_rate": 3.173777777777778e-06, "loss": 0.7988, "step": 17860 }, { "epoch": 2.3292492179353492, "grad_norm": 8.53354263305664, "learning_rate": 3.1693333333333337e-06, "loss": 0.8492, "step": 17870 }, { "epoch": 2.3305526590198125, "grad_norm": 9.344878196716309, "learning_rate": 3.164888888888889e-06, "loss": 0.8115, "step": 17880 }, { "epoch": 2.3318561001042752, "grad_norm": 7.698447227478027, "learning_rate": 3.1604444444444447e-06, "loss": 0.7206, "step": 17890 }, { "epoch": 2.3331595411887385, "grad_norm": 7.664024829864502, "learning_rate": 3.1560000000000004e-06, "loss": 0.798, "step": 17900 }, { "epoch": 2.3331595411887385, "eval/acc": 58.13953399658203, "step": 17900 }, { "epoch": 2.3331595411887385, "eval_loss": 1.7228178977966309, "eval_runtime": 0.5976, "eval_samples_per_second": 71.949, "eval_steps_per_second": 1.673, "step": 17900 }, { "epoch": 2.3344629822732013, "grad_norm": 9.088309288024902, "learning_rate": 3.151555555555556e-06, "loss": 0.7544, "step": 17910 }, { "epoch": 2.335766423357664, "grad_norm": 8.936786651611328, "learning_rate": 3.1471111111111113e-06, "loss": 0.8105, "step": 17920 }, { "epoch": 2.3370698644421273, "grad_norm": 7.950433254241943, "learning_rate": 3.142666666666667e-06, "loss": 0.7856, "step": 17930 }, { "epoch": 2.33837330552659, "grad_norm": 9.024825096130371, "learning_rate": 3.1382222222222227e-06, "loss": 0.8075, "step": 17940 }, { "epoch": 2.3396767466110533, "grad_norm": 11.19747543334961, "learning_rate": 3.133777777777778e-06, "loss": 0.7978, "step": 17950 }, { "epoch": 2.340980187695516, "grad_norm": 8.113253593444824, "learning_rate": 3.1293333333333337e-06, "loss": 0.7742, "step": 17960 }, { "epoch": 2.3422836287799793, "grad_norm": 8.828620910644531, "learning_rate": 3.124888888888889e-06, "loss": 0.7781, "step": 17970 }, { "epoch": 2.343587069864442, "grad_norm": 7.624085903167725, "learning_rate": 3.1204444444444446e-06, "loss": 0.7877, "step": 17980 }, { "epoch": 2.344890510948905, "grad_norm": 9.706528663635254, "learning_rate": 3.1160000000000003e-06, "loss": 0.8024, "step": 17990 }, { "epoch": 2.346193952033368, "grad_norm": 10.718920707702637, "learning_rate": 3.111555555555556e-06, "loss": 0.8454, "step": 18000 }, { "epoch": 2.346193952033368, "eval/acc": 58.13953399658203, "step": 18000 }, { "epoch": 2.346193952033368, "eval_loss": 1.736668348312378, "eval_runtime": 0.5979, "eval_samples_per_second": 71.916, "eval_steps_per_second": 1.672, "step": 18000 }, { "epoch": 2.347497393117831, "grad_norm": 8.593095779418945, "learning_rate": 3.1071111111111113e-06, "loss": 0.7644, "step": 18010 }, { "epoch": 2.348800834202294, "grad_norm": 9.433939933776855, "learning_rate": 3.102666666666667e-06, "loss": 0.834, "step": 18020 }, { "epoch": 2.350104275286757, "grad_norm": 11.388009071350098, "learning_rate": 3.0982222222222227e-06, "loss": 0.8843, "step": 18030 }, { "epoch": 2.35140771637122, "grad_norm": 8.85029411315918, "learning_rate": 3.093777777777778e-06, "loss": 0.8164, "step": 18040 }, { "epoch": 2.352711157455683, "grad_norm": 8.95462703704834, "learning_rate": 3.0893333333333336e-06, "loss": 0.8184, "step": 18050 }, { "epoch": 2.354014598540146, "grad_norm": 8.794722557067871, "learning_rate": 3.084888888888889e-06, "loss": 0.8928, "step": 18060 }, { "epoch": 2.355318039624609, "grad_norm": 8.893062591552734, "learning_rate": 3.0804444444444446e-06, "loss": 0.8413, "step": 18070 }, { "epoch": 2.356621480709072, "grad_norm": 8.394699096679688, "learning_rate": 3.0760000000000003e-06, "loss": 0.7707, "step": 18080 }, { "epoch": 2.357924921793535, "grad_norm": 8.472186088562012, "learning_rate": 3.071555555555556e-06, "loss": 0.7725, "step": 18090 }, { "epoch": 2.3592283628779978, "grad_norm": 8.91372299194336, "learning_rate": 3.0671111111111112e-06, "loss": 0.8177, "step": 18100 }, { "epoch": 2.3592283628779978, "eval/acc": 58.13953399658203, "step": 18100 }, { "epoch": 2.3592283628779978, "eval_loss": 1.7373141050338745, "eval_runtime": 0.597, "eval_samples_per_second": 72.022, "eval_steps_per_second": 1.675, "step": 18100 }, { "epoch": 2.360531803962461, "grad_norm": 8.862125396728516, "learning_rate": 3.062666666666667e-06, "loss": 0.8077, "step": 18110 }, { "epoch": 2.3618352450469238, "grad_norm": 8.020530700683594, "learning_rate": 3.0582222222222226e-06, "loss": 0.7855, "step": 18120 }, { "epoch": 2.363138686131387, "grad_norm": 8.013257026672363, "learning_rate": 3.0537777777777783e-06, "loss": 0.7992, "step": 18130 }, { "epoch": 2.36444212721585, "grad_norm": 9.200993537902832, "learning_rate": 3.0493333333333336e-06, "loss": 0.8319, "step": 18140 }, { "epoch": 2.365745568300313, "grad_norm": 10.322992324829102, "learning_rate": 3.044888888888889e-06, "loss": 0.845, "step": 18150 }, { "epoch": 2.367049009384776, "grad_norm": 7.696877956390381, "learning_rate": 3.0404444444444445e-06, "loss": 0.8244, "step": 18160 }, { "epoch": 2.3683524504692386, "grad_norm": 8.845235824584961, "learning_rate": 3.0360000000000002e-06, "loss": 0.7634, "step": 18170 }, { "epoch": 2.369655891553702, "grad_norm": 9.789495468139648, "learning_rate": 3.031555555555556e-06, "loss": 0.7536, "step": 18180 }, { "epoch": 2.3709593326381646, "grad_norm": 8.694038391113281, "learning_rate": 3.027111111111111e-06, "loss": 0.7695, "step": 18190 }, { "epoch": 2.372262773722628, "grad_norm": 10.665741920471191, "learning_rate": 3.022666666666667e-06, "loss": 0.842, "step": 18200 }, { "epoch": 2.372262773722628, "eval/acc": 55.8139533996582, "step": 18200 }, { "epoch": 2.372262773722628, "eval_loss": 1.7320786714553833, "eval_runtime": 0.596, "eval_samples_per_second": 72.147, "eval_steps_per_second": 1.678, "step": 18200 }, { "epoch": 2.3735662148070906, "grad_norm": 9.941478729248047, "learning_rate": 3.0182222222222226e-06, "loss": 0.8402, "step": 18210 }, { "epoch": 2.374869655891554, "grad_norm": 10.921478271484375, "learning_rate": 3.0137777777777783e-06, "loss": 0.8759, "step": 18220 }, { "epoch": 2.3761730969760166, "grad_norm": 9.340253829956055, "learning_rate": 3.0093333333333335e-06, "loss": 0.7529, "step": 18230 }, { "epoch": 2.3774765380604794, "grad_norm": 8.433263778686523, "learning_rate": 3.0048888888888892e-06, "loss": 0.7887, "step": 18240 }, { "epoch": 2.3787799791449427, "grad_norm": 10.336188316345215, "learning_rate": 3.0004444444444445e-06, "loss": 0.8693, "step": 18250 }, { "epoch": 2.3800834202294054, "grad_norm": 9.22595500946045, "learning_rate": 2.996e-06, "loss": 0.858, "step": 18260 }, { "epoch": 2.3813868613138687, "grad_norm": 9.79007625579834, "learning_rate": 2.991555555555556e-06, "loss": 0.8302, "step": 18270 }, { "epoch": 2.3826903023983315, "grad_norm": 7.493574142456055, "learning_rate": 2.987111111111111e-06, "loss": 0.7439, "step": 18280 }, { "epoch": 2.3839937434827947, "grad_norm": 9.142792701721191, "learning_rate": 2.982666666666667e-06, "loss": 0.8211, "step": 18290 }, { "epoch": 2.3852971845672575, "grad_norm": 8.349295616149902, "learning_rate": 2.9782222222222225e-06, "loss": 0.7921, "step": 18300 }, { "epoch": 2.3852971845672575, "eval/acc": 55.8139533996582, "step": 18300 }, { "epoch": 2.3852971845672575, "eval_loss": 1.6824078559875488, "eval_runtime": 0.5959, "eval_samples_per_second": 72.156, "eval_steps_per_second": 1.678, "step": 18300 }, { "epoch": 2.3866006256517207, "grad_norm": 9.027983665466309, "learning_rate": 2.9737777777777782e-06, "loss": 0.8652, "step": 18310 }, { "epoch": 2.3879040667361835, "grad_norm": 10.181669235229492, "learning_rate": 2.9693333333333335e-06, "loss": 0.915, "step": 18320 }, { "epoch": 2.3892075078206467, "grad_norm": 8.568914413452148, "learning_rate": 2.964888888888889e-06, "loss": 0.7698, "step": 18330 }, { "epoch": 2.3905109489051095, "grad_norm": 7.892252445220947, "learning_rate": 2.9604444444444445e-06, "loss": 0.8431, "step": 18340 }, { "epoch": 2.3918143899895723, "grad_norm": 9.389975547790527, "learning_rate": 2.956e-06, "loss": 0.7999, "step": 18350 }, { "epoch": 2.3931178310740355, "grad_norm": 9.338902473449707, "learning_rate": 2.951555555555556e-06, "loss": 0.9219, "step": 18360 }, { "epoch": 2.3944212721584983, "grad_norm": 8.417449951171875, "learning_rate": 2.947111111111111e-06, "loss": 0.7626, "step": 18370 }, { "epoch": 2.3957247132429615, "grad_norm": 9.792203903198242, "learning_rate": 2.942666666666667e-06, "loss": 0.7947, "step": 18380 }, { "epoch": 2.3970281543274243, "grad_norm": 8.978912353515625, "learning_rate": 2.9382222222222225e-06, "loss": 0.828, "step": 18390 }, { "epoch": 2.3983315954118876, "grad_norm": 10.301528930664062, "learning_rate": 2.933777777777778e-06, "loss": 0.8807, "step": 18400 }, { "epoch": 2.3983315954118876, "eval/acc": 58.13953399658203, "step": 18400 }, { "epoch": 2.3983315954118876, "eval_loss": 1.7043213844299316, "eval_runtime": 0.5956, "eval_samples_per_second": 72.192, "eval_steps_per_second": 1.679, "step": 18400 }, { "epoch": 2.3996350364963503, "grad_norm": 8.793416976928711, "learning_rate": 2.9293333333333335e-06, "loss": 0.7905, "step": 18410 }, { "epoch": 2.400938477580813, "grad_norm": 9.114569664001465, "learning_rate": 2.924888888888889e-06, "loss": 0.8954, "step": 18420 }, { "epoch": 2.4022419186652764, "grad_norm": 8.437703132629395, "learning_rate": 2.9204444444444444e-06, "loss": 0.8572, "step": 18430 }, { "epoch": 2.403545359749739, "grad_norm": 8.898796081542969, "learning_rate": 2.9160000000000005e-06, "loss": 0.8002, "step": 18440 }, { "epoch": 2.4048488008342024, "grad_norm": 6.8336663246154785, "learning_rate": 2.911555555555556e-06, "loss": 0.8055, "step": 18450 }, { "epoch": 2.406152241918665, "grad_norm": 8.852208137512207, "learning_rate": 2.907111111111111e-06, "loss": 0.8021, "step": 18460 }, { "epoch": 2.4074556830031284, "grad_norm": 8.297391891479492, "learning_rate": 2.9026666666666668e-06, "loss": 0.7849, "step": 18470 }, { "epoch": 2.408759124087591, "grad_norm": 9.60500431060791, "learning_rate": 2.8982222222222225e-06, "loss": 0.887, "step": 18480 }, { "epoch": 2.4100625651720544, "grad_norm": 8.069330215454102, "learning_rate": 2.893777777777778e-06, "loss": 0.8041, "step": 18490 }, { "epoch": 2.411366006256517, "grad_norm": 9.09995174407959, "learning_rate": 2.8893333333333334e-06, "loss": 0.7886, "step": 18500 }, { "epoch": 2.411366006256517, "eval/acc": 58.13953399658203, "step": 18500 }, { "epoch": 2.411366006256517, "eval_loss": 1.732820987701416, "eval_runtime": 0.5961, "eval_samples_per_second": 72.137, "eval_steps_per_second": 1.678, "step": 18500 }, { "epoch": 2.4126694473409804, "grad_norm": 8.060381889343262, "learning_rate": 2.884888888888889e-06, "loss": 0.8123, "step": 18510 }, { "epoch": 2.413972888425443, "grad_norm": 8.432539939880371, "learning_rate": 2.880444444444445e-06, "loss": 0.7672, "step": 18520 }, { "epoch": 2.415276329509906, "grad_norm": 9.973830223083496, "learning_rate": 2.8760000000000005e-06, "loss": 0.8087, "step": 18530 }, { "epoch": 2.4165797705943692, "grad_norm": 8.331082344055176, "learning_rate": 2.8715555555555558e-06, "loss": 0.8696, "step": 18540 }, { "epoch": 2.417883211678832, "grad_norm": 9.16474723815918, "learning_rate": 2.8671111111111115e-06, "loss": 0.9246, "step": 18550 }, { "epoch": 2.4191866527632953, "grad_norm": 8.847817420959473, "learning_rate": 2.8626666666666667e-06, "loss": 0.803, "step": 18560 }, { "epoch": 2.420490093847758, "grad_norm": 7.864871501922607, "learning_rate": 2.8582222222222224e-06, "loss": 0.7799, "step": 18570 }, { "epoch": 2.4217935349322213, "grad_norm": 10.756715774536133, "learning_rate": 2.853777777777778e-06, "loss": 0.8396, "step": 18580 }, { "epoch": 2.423096976016684, "grad_norm": 9.915592193603516, "learning_rate": 2.8493333333333334e-06, "loss": 0.8761, "step": 18590 }, { "epoch": 2.424400417101147, "grad_norm": 8.573412895202637, "learning_rate": 2.844888888888889e-06, "loss": 0.8064, "step": 18600 }, { "epoch": 2.424400417101147, "eval/acc": 55.8139533996582, "step": 18600 }, { "epoch": 2.424400417101147, "eval_loss": 1.7053908109664917, "eval_runtime": 0.6479, "eval_samples_per_second": 66.373, "eval_steps_per_second": 1.544, "step": 18600 }, { "epoch": 2.42570385818561, "grad_norm": 9.568745613098145, "learning_rate": 2.8404444444444448e-06, "loss": 0.9169, "step": 18610 }, { "epoch": 2.427007299270073, "grad_norm": 10.205423355102539, "learning_rate": 2.8360000000000005e-06, "loss": 0.875, "step": 18620 }, { "epoch": 2.428310740354536, "grad_norm": 8.54217529296875, "learning_rate": 2.8315555555555557e-06, "loss": 0.7617, "step": 18630 }, { "epoch": 2.429614181438999, "grad_norm": 8.418920516967773, "learning_rate": 2.8271111111111114e-06, "loss": 0.8104, "step": 18640 }, { "epoch": 2.430917622523462, "grad_norm": 7.629629611968994, "learning_rate": 2.8226666666666667e-06, "loss": 0.7892, "step": 18650 }, { "epoch": 2.432221063607925, "grad_norm": 8.20186996459961, "learning_rate": 2.8182222222222224e-06, "loss": 0.7604, "step": 18660 }, { "epoch": 2.4335245046923877, "grad_norm": 10.07830810546875, "learning_rate": 2.813777777777778e-06, "loss": 0.7799, "step": 18670 }, { "epoch": 2.434827945776851, "grad_norm": 9.589900016784668, "learning_rate": 2.8093333333333333e-06, "loss": 0.7774, "step": 18680 }, { "epoch": 2.4361313868613137, "grad_norm": 7.906235694885254, "learning_rate": 2.804888888888889e-06, "loss": 0.7806, "step": 18690 }, { "epoch": 2.437434827945777, "grad_norm": 7.387449741363525, "learning_rate": 2.8004444444444447e-06, "loss": 0.7948, "step": 18700 }, { "epoch": 2.437434827945777, "eval/acc": 58.13953399658203, "step": 18700 }, { "epoch": 2.437434827945777, "eval_loss": 1.7254258394241333, "eval_runtime": 0.7602, "eval_samples_per_second": 56.563, "eval_steps_per_second": 1.315, "step": 18700 }, { "epoch": 2.4387382690302397, "grad_norm": 9.001852989196777, "learning_rate": 2.7960000000000004e-06, "loss": 0.8126, "step": 18710 }, { "epoch": 2.440041710114703, "grad_norm": 9.350873947143555, "learning_rate": 2.7915555555555557e-06, "loss": 0.7731, "step": 18720 }, { "epoch": 2.4413451511991657, "grad_norm": 10.713414192199707, "learning_rate": 2.7871111111111114e-06, "loss": 0.8611, "step": 18730 }, { "epoch": 2.442648592283629, "grad_norm": 8.005009651184082, "learning_rate": 2.7826666666666666e-06, "loss": 0.8614, "step": 18740 }, { "epoch": 2.4439520333680917, "grad_norm": 8.559712409973145, "learning_rate": 2.7782222222222228e-06, "loss": 0.7755, "step": 18750 }, { "epoch": 2.445255474452555, "grad_norm": 8.788186073303223, "learning_rate": 2.773777777777778e-06, "loss": 0.8453, "step": 18760 }, { "epoch": 2.4465589155370178, "grad_norm": 8.74999713897705, "learning_rate": 2.7693333333333333e-06, "loss": 0.7585, "step": 18770 }, { "epoch": 2.4478623566214806, "grad_norm": 9.652961730957031, "learning_rate": 2.764888888888889e-06, "loss": 0.7698, "step": 18780 }, { "epoch": 2.449165797705944, "grad_norm": 10.831005096435547, "learning_rate": 2.7604444444444447e-06, "loss": 0.8352, "step": 18790 }, { "epoch": 2.4504692387904066, "grad_norm": 9.221595764160156, "learning_rate": 2.7560000000000004e-06, "loss": 0.7731, "step": 18800 }, { "epoch": 2.4504692387904066, "eval/acc": 58.13953399658203, "step": 18800 }, { "epoch": 2.4504692387904066, "eval_loss": 1.7193679809570312, "eval_runtime": 0.5964, "eval_samples_per_second": 72.094, "eval_steps_per_second": 1.677, "step": 18800 }, { "epoch": 2.45177267987487, "grad_norm": 10.366093635559082, "learning_rate": 2.7515555555555556e-06, "loss": 0.8429, "step": 18810 }, { "epoch": 2.4530761209593326, "grad_norm": 10.944464683532715, "learning_rate": 2.7471111111111113e-06, "loss": 0.8524, "step": 18820 }, { "epoch": 2.454379562043796, "grad_norm": 8.571611404418945, "learning_rate": 2.7426666666666666e-06, "loss": 0.8135, "step": 18830 }, { "epoch": 2.4556830031282586, "grad_norm": 8.205272674560547, "learning_rate": 2.7382222222222227e-06, "loss": 0.834, "step": 18840 }, { "epoch": 2.4569864442127214, "grad_norm": 9.873887062072754, "learning_rate": 2.733777777777778e-06, "loss": 0.8428, "step": 18850 }, { "epoch": 2.4582898852971846, "grad_norm": 8.814358711242676, "learning_rate": 2.7293333333333333e-06, "loss": 0.8065, "step": 18860 }, { "epoch": 2.4595933263816474, "grad_norm": 9.057844161987305, "learning_rate": 2.724888888888889e-06, "loss": 0.8158, "step": 18870 }, { "epoch": 2.4608967674661106, "grad_norm": 8.199042320251465, "learning_rate": 2.7204444444444446e-06, "loss": 0.8154, "step": 18880 }, { "epoch": 2.4622002085505734, "grad_norm": 9.592483520507812, "learning_rate": 2.7160000000000003e-06, "loss": 0.7788, "step": 18890 }, { "epoch": 2.4635036496350367, "grad_norm": 8.592867851257324, "learning_rate": 2.7115555555555556e-06, "loss": 0.7358, "step": 18900 }, { "epoch": 2.4635036496350367, "eval/acc": 58.13953399658203, "step": 18900 }, { "epoch": 2.4635036496350367, "eval_loss": 1.7105212211608887, "eval_runtime": 0.5936, "eval_samples_per_second": 72.435, "eval_steps_per_second": 1.685, "step": 18900 }, { "epoch": 2.4648070907194994, "grad_norm": 10.121728897094727, "learning_rate": 2.7071111111111113e-06, "loss": 0.7847, "step": 18910 }, { "epoch": 2.4661105318039622, "grad_norm": 9.763382911682129, "learning_rate": 2.7026666666666666e-06, "loss": 0.8032, "step": 18920 }, { "epoch": 2.4674139728884255, "grad_norm": 8.65837574005127, "learning_rate": 2.6982222222222227e-06, "loss": 0.8417, "step": 18930 }, { "epoch": 2.4687174139728882, "grad_norm": 10.843329429626465, "learning_rate": 2.693777777777778e-06, "loss": 0.8336, "step": 18940 }, { "epoch": 2.4700208550573515, "grad_norm": 8.979269027709961, "learning_rate": 2.6893333333333336e-06, "loss": 0.8747, "step": 18950 }, { "epoch": 2.4713242961418143, "grad_norm": 9.999788284301758, "learning_rate": 2.684888888888889e-06, "loss": 0.8114, "step": 18960 }, { "epoch": 2.4726277372262775, "grad_norm": 8.939599990844727, "learning_rate": 2.6804444444444446e-06, "loss": 0.8323, "step": 18970 }, { "epoch": 2.4739311783107403, "grad_norm": 8.508830070495605, "learning_rate": 2.6760000000000003e-06, "loss": 0.7615, "step": 18980 }, { "epoch": 2.4752346193952035, "grad_norm": 8.749983787536621, "learning_rate": 2.6715555555555556e-06, "loss": 0.8019, "step": 18990 }, { "epoch": 2.4765380604796663, "grad_norm": 8.812922477722168, "learning_rate": 2.6671111111111113e-06, "loss": 0.7822, "step": 19000 }, { "epoch": 2.4765380604796663, "eval/acc": 58.13953399658203, "step": 19000 }, { "epoch": 2.4765380604796663, "eval_loss": 1.7229161262512207, "eval_runtime": 0.5944, "eval_samples_per_second": 72.346, "eval_steps_per_second": 1.682, "step": 19000 }, { "epoch": 2.4778415015641295, "grad_norm": 9.612266540527344, "learning_rate": 2.6626666666666665e-06, "loss": 0.8657, "step": 19010 }, { "epoch": 2.4791449426485923, "grad_norm": 9.715696334838867, "learning_rate": 2.6582222222222226e-06, "loss": 0.8531, "step": 19020 }, { "epoch": 2.480448383733055, "grad_norm": 8.434911727905273, "learning_rate": 2.653777777777778e-06, "loss": 0.84, "step": 19030 }, { "epoch": 2.4817518248175183, "grad_norm": 9.896410942077637, "learning_rate": 2.6493333333333336e-06, "loss": 0.857, "step": 19040 }, { "epoch": 2.483055265901981, "grad_norm": 8.452371597290039, "learning_rate": 2.644888888888889e-06, "loss": 0.8199, "step": 19050 }, { "epoch": 2.4843587069864443, "grad_norm": 7.998941898345947, "learning_rate": 2.640444444444445e-06, "loss": 0.8247, "step": 19060 }, { "epoch": 2.485662148070907, "grad_norm": 9.147958755493164, "learning_rate": 2.6360000000000003e-06, "loss": 0.8084, "step": 19070 }, { "epoch": 2.4869655891553704, "grad_norm": 8.45266342163086, "learning_rate": 2.6315555555555555e-06, "loss": 0.8302, "step": 19080 }, { "epoch": 2.488269030239833, "grad_norm": 9.505598068237305, "learning_rate": 2.6271111111111112e-06, "loss": 0.7856, "step": 19090 }, { "epoch": 2.489572471324296, "grad_norm": 9.163291931152344, "learning_rate": 2.6226666666666665e-06, "loss": 0.8649, "step": 19100 }, { "epoch": 2.489572471324296, "eval/acc": 55.8139533996582, "step": 19100 }, { "epoch": 2.489572471324296, "eval_loss": 1.7023024559020996, "eval_runtime": 0.5971, "eval_samples_per_second": 72.015, "eval_steps_per_second": 1.675, "step": 19100 }, { "epoch": 2.490875912408759, "grad_norm": 10.414417266845703, "learning_rate": 2.6182222222222226e-06, "loss": 0.8299, "step": 19110 }, { "epoch": 2.492179353493222, "grad_norm": 7.410620212554932, "learning_rate": 2.613777777777778e-06, "loss": 0.7835, "step": 19120 }, { "epoch": 2.493482794577685, "grad_norm": 8.88325309753418, "learning_rate": 2.6093333333333336e-06, "loss": 0.8437, "step": 19130 }, { "epoch": 2.494786235662148, "grad_norm": 8.833433151245117, "learning_rate": 2.604888888888889e-06, "loss": 0.7945, "step": 19140 }, { "epoch": 2.496089676746611, "grad_norm": 9.540637016296387, "learning_rate": 2.600444444444445e-06, "loss": 0.7809, "step": 19150 }, { "epoch": 2.497393117831074, "grad_norm": 9.051806449890137, "learning_rate": 2.5960000000000002e-06, "loss": 0.882, "step": 19160 }, { "epoch": 2.4986965589155368, "grad_norm": 9.45577621459961, "learning_rate": 2.5915555555555555e-06, "loss": 0.8434, "step": 19170 }, { "epoch": 2.5, "grad_norm": 8.784090995788574, "learning_rate": 2.587111111111111e-06, "loss": 0.8707, "step": 19180 }, { "epoch": 2.5013034410844632, "grad_norm": 9.114548683166504, "learning_rate": 2.5826666666666664e-06, "loss": 0.9411, "step": 19190 }, { "epoch": 2.502606882168926, "grad_norm": 10.387870788574219, "learning_rate": 2.5782222222222226e-06, "loss": 0.8886, "step": 19200 }, { "epoch": 2.502606882168926, "eval/acc": 58.13953399658203, "step": 19200 }, { "epoch": 2.502606882168926, "eval_loss": 1.6806849241256714, "eval_runtime": 0.6005, "eval_samples_per_second": 71.61, "eval_steps_per_second": 1.665, "step": 19200 }, { "epoch": 2.503910323253389, "grad_norm": 9.6982421875, "learning_rate": 2.573777777777778e-06, "loss": 0.8666, "step": 19210 }, { "epoch": 2.505213764337852, "grad_norm": 9.777280807495117, "learning_rate": 2.5693333333333335e-06, "loss": 0.8425, "step": 19220 }, { "epoch": 2.506517205422315, "grad_norm": 8.246393203735352, "learning_rate": 2.564888888888889e-06, "loss": 0.8693, "step": 19230 }, { "epoch": 2.507820646506778, "grad_norm": 9.383831977844238, "learning_rate": 2.560444444444445e-06, "loss": 0.8128, "step": 19240 }, { "epoch": 2.509124087591241, "grad_norm": 8.326151847839355, "learning_rate": 2.556e-06, "loss": 0.8621, "step": 19250 }, { "epoch": 2.510427528675704, "grad_norm": 9.772537231445312, "learning_rate": 2.551555555555556e-06, "loss": 0.8134, "step": 19260 }, { "epoch": 2.511730969760167, "grad_norm": 9.540831565856934, "learning_rate": 2.547111111111111e-06, "loss": 0.777, "step": 19270 }, { "epoch": 2.5130344108446296, "grad_norm": 7.673141956329346, "learning_rate": 2.5426666666666664e-06, "loss": 0.7608, "step": 19280 }, { "epoch": 2.514337851929093, "grad_norm": 10.192721366882324, "learning_rate": 2.5382222222222225e-06, "loss": 0.8058, "step": 19290 }, { "epoch": 2.5156412930135557, "grad_norm": 9.352919578552246, "learning_rate": 2.533777777777778e-06, "loss": 0.8289, "step": 19300 }, { "epoch": 2.5156412930135557, "eval/acc": 55.8139533996582, "step": 19300 }, { "epoch": 2.5156412930135557, "eval_loss": 1.726179599761963, "eval_runtime": 0.5957, "eval_samples_per_second": 72.18, "eval_steps_per_second": 1.679, "step": 19300 }, { "epoch": 2.516944734098019, "grad_norm": 10.388177871704102, "learning_rate": 2.5293333333333335e-06, "loss": 0.8061, "step": 19310 }, { "epoch": 2.5182481751824817, "grad_norm": 9.670066833496094, "learning_rate": 2.5248888888888888e-06, "loss": 0.8421, "step": 19320 }, { "epoch": 2.519551616266945, "grad_norm": 9.624741554260254, "learning_rate": 2.520444444444445e-06, "loss": 0.8737, "step": 19330 }, { "epoch": 2.5208550573514077, "grad_norm": 9.047123908996582, "learning_rate": 2.516e-06, "loss": 0.8925, "step": 19340 }, { "epoch": 2.5221584984358705, "grad_norm": 8.847010612487793, "learning_rate": 2.511555555555556e-06, "loss": 0.7694, "step": 19350 }, { "epoch": 2.5234619395203337, "grad_norm": 9.46484088897705, "learning_rate": 2.507111111111111e-06, "loss": 0.7791, "step": 19360 }, { "epoch": 2.524765380604797, "grad_norm": 8.407572746276855, "learning_rate": 2.5026666666666672e-06, "loss": 0.8067, "step": 19370 }, { "epoch": 2.5260688216892597, "grad_norm": 8.751404762268066, "learning_rate": 2.4982222222222225e-06, "loss": 0.8447, "step": 19380 }, { "epoch": 2.5273722627737225, "grad_norm": 9.621039390563965, "learning_rate": 2.493777777777778e-06, "loss": 0.8412, "step": 19390 }, { "epoch": 2.5286757038581857, "grad_norm": 9.272809982299805, "learning_rate": 2.4893333333333334e-06, "loss": 0.7829, "step": 19400 }, { "epoch": 2.5286757038581857, "eval/acc": 55.8139533996582, "step": 19400 }, { "epoch": 2.5286757038581857, "eval_loss": 1.705978274345398, "eval_runtime": 0.5948, "eval_samples_per_second": 72.293, "eval_steps_per_second": 1.681, "step": 19400 }, { "epoch": 2.5299791449426485, "grad_norm": 7.593291759490967, "learning_rate": 2.484888888888889e-06, "loss": 0.8125, "step": 19410 }, { "epoch": 2.5312825860271113, "grad_norm": 7.302863597869873, "learning_rate": 2.480444444444445e-06, "loss": 0.8157, "step": 19420 }, { "epoch": 2.5325860271115745, "grad_norm": 9.871075630187988, "learning_rate": 2.476e-06, "loss": 0.7753, "step": 19430 }, { "epoch": 2.5338894681960378, "grad_norm": 8.587265968322754, "learning_rate": 2.4715555555555558e-06, "loss": 0.8444, "step": 19440 }, { "epoch": 2.5351929092805006, "grad_norm": 8.136966705322266, "learning_rate": 2.467111111111111e-06, "loss": 0.8036, "step": 19450 }, { "epoch": 2.5364963503649633, "grad_norm": 9.554794311523438, "learning_rate": 2.4626666666666667e-06, "loss": 0.8582, "step": 19460 }, { "epoch": 2.5377997914494266, "grad_norm": 7.948121070861816, "learning_rate": 2.4582222222222224e-06, "loss": 0.8087, "step": 19470 }, { "epoch": 2.5391032325338894, "grad_norm": 7.665543079376221, "learning_rate": 2.453777777777778e-06, "loss": 0.803, "step": 19480 }, { "epoch": 2.5404066736183526, "grad_norm": 9.550437927246094, "learning_rate": 2.4493333333333334e-06, "loss": 0.7749, "step": 19490 }, { "epoch": 2.5417101147028154, "grad_norm": 7.813060760498047, "learning_rate": 2.444888888888889e-06, "loss": 0.8137, "step": 19500 }, { "epoch": 2.5417101147028154, "eval/acc": 55.8139533996582, "step": 19500 }, { "epoch": 2.5417101147028154, "eval_loss": 1.7320709228515625, "eval_runtime": 0.5954, "eval_samples_per_second": 72.215, "eval_steps_per_second": 1.679, "step": 19500 }, { "epoch": 2.5430135557872786, "grad_norm": 9.900518417358398, "learning_rate": 2.4404444444444448e-06, "loss": 0.8209, "step": 19510 }, { "epoch": 2.5443169968717414, "grad_norm": 9.617853164672852, "learning_rate": 2.4360000000000005e-06, "loss": 0.8187, "step": 19520 }, { "epoch": 2.545620437956204, "grad_norm": 8.513010025024414, "learning_rate": 2.4315555555555557e-06, "loss": 0.8135, "step": 19530 }, { "epoch": 2.5469238790406674, "grad_norm": 10.416341781616211, "learning_rate": 2.427111111111111e-06, "loss": 0.8412, "step": 19540 }, { "epoch": 2.54822732012513, "grad_norm": 8.88688850402832, "learning_rate": 2.4226666666666667e-06, "loss": 0.7538, "step": 19550 }, { "epoch": 2.5495307612095934, "grad_norm": 10.363279342651367, "learning_rate": 2.4182222222222224e-06, "loss": 0.7776, "step": 19560 }, { "epoch": 2.550834202294056, "grad_norm": 7.975240707397461, "learning_rate": 2.413777777777778e-06, "loss": 0.7684, "step": 19570 }, { "epoch": 2.5521376433785194, "grad_norm": 9.642936706542969, "learning_rate": 2.4093333333333334e-06, "loss": 0.805, "step": 19580 }, { "epoch": 2.5534410844629822, "grad_norm": 8.115169525146484, "learning_rate": 2.404888888888889e-06, "loss": 0.8086, "step": 19590 }, { "epoch": 2.554744525547445, "grad_norm": 8.276997566223145, "learning_rate": 2.4004444444444447e-06, "loss": 0.8091, "step": 19600 }, { "epoch": 2.554744525547445, "eval/acc": 55.8139533996582, "step": 19600 }, { "epoch": 2.554744525547445, "eval_loss": 1.72951078414917, "eval_runtime": 0.5947, "eval_samples_per_second": 72.308, "eval_steps_per_second": 1.682, "step": 19600 }, { "epoch": 2.5560479666319083, "grad_norm": 8.742209434509277, "learning_rate": 2.3960000000000004e-06, "loss": 0.7671, "step": 19610 }, { "epoch": 2.5573514077163715, "grad_norm": 9.606656074523926, "learning_rate": 2.3915555555555557e-06, "loss": 0.8687, "step": 19620 }, { "epoch": 2.5586548488008343, "grad_norm": 9.96225643157959, "learning_rate": 2.3871111111111114e-06, "loss": 0.8365, "step": 19630 }, { "epoch": 2.559958289885297, "grad_norm": 10.499612808227539, "learning_rate": 2.3826666666666667e-06, "loss": 0.8111, "step": 19640 }, { "epoch": 2.5612617309697603, "grad_norm": 8.857958793640137, "learning_rate": 2.3782222222222224e-06, "loss": 0.7975, "step": 19650 }, { "epoch": 2.562565172054223, "grad_norm": 9.551183700561523, "learning_rate": 2.373777777777778e-06, "loss": 0.758, "step": 19660 }, { "epoch": 2.563868613138686, "grad_norm": 9.512008666992188, "learning_rate": 2.3693333333333333e-06, "loss": 0.8015, "step": 19670 }, { "epoch": 2.565172054223149, "grad_norm": 8.902203559875488, "learning_rate": 2.364888888888889e-06, "loss": 0.8934, "step": 19680 }, { "epoch": 2.5664754953076123, "grad_norm": 9.198417663574219, "learning_rate": 2.3604444444444447e-06, "loss": 0.8539, "step": 19690 }, { "epoch": 2.567778936392075, "grad_norm": 9.011664390563965, "learning_rate": 2.3560000000000004e-06, "loss": 0.7985, "step": 19700 }, { "epoch": 2.567778936392075, "eval/acc": 55.8139533996582, "step": 19700 }, { "epoch": 2.567778936392075, "eval_loss": 1.7140063047409058, "eval_runtime": 0.5934, "eval_samples_per_second": 72.469, "eval_steps_per_second": 1.685, "step": 19700 }, { "epoch": 2.569082377476538, "grad_norm": 8.956233978271484, "learning_rate": 2.3515555555555557e-06, "loss": 0.8205, "step": 19710 }, { "epoch": 2.570385818561001, "grad_norm": 10.16060733795166, "learning_rate": 2.3471111111111114e-06, "loss": 0.8875, "step": 19720 }, { "epoch": 2.571689259645464, "grad_norm": 10.25076675415039, "learning_rate": 2.342666666666667e-06, "loss": 0.8668, "step": 19730 }, { "epoch": 2.572992700729927, "grad_norm": 10.392470359802246, "learning_rate": 2.3382222222222223e-06, "loss": 0.8584, "step": 19740 }, { "epoch": 2.57429614181439, "grad_norm": 8.213852882385254, "learning_rate": 2.333777777777778e-06, "loss": 0.8476, "step": 19750 }, { "epoch": 2.575599582898853, "grad_norm": 9.253135681152344, "learning_rate": 2.3293333333333333e-06, "loss": 0.7432, "step": 19760 }, { "epoch": 2.576903023983316, "grad_norm": 10.394274711608887, "learning_rate": 2.324888888888889e-06, "loss": 0.8076, "step": 19770 }, { "epoch": 2.5782064650677787, "grad_norm": 8.953391075134277, "learning_rate": 2.3204444444444447e-06, "loss": 0.7678, "step": 19780 }, { "epoch": 2.579509906152242, "grad_norm": 10.66081428527832, "learning_rate": 2.3160000000000004e-06, "loss": 0.7726, "step": 19790 }, { "epoch": 2.5808133472367047, "grad_norm": 10.238149642944336, "learning_rate": 2.3115555555555556e-06, "loss": 0.8436, "step": 19800 }, { "epoch": 2.5808133472367047, "eval/acc": 55.8139533996582, "step": 19800 }, { "epoch": 2.5808133472367047, "eval_loss": 1.7150553464889526, "eval_runtime": 0.596, "eval_samples_per_second": 72.146, "eval_steps_per_second": 1.678, "step": 19800 }, { "epoch": 2.582116788321168, "grad_norm": 8.832317352294922, "learning_rate": 2.3071111111111113e-06, "loss": 0.7451, "step": 19810 }, { "epoch": 2.5834202294056308, "grad_norm": 7.704029560089111, "learning_rate": 2.302666666666667e-06, "loss": 0.8185, "step": 19820 }, { "epoch": 2.584723670490094, "grad_norm": 9.155497550964355, "learning_rate": 2.2982222222222227e-06, "loss": 0.8275, "step": 19830 }, { "epoch": 2.586027111574557, "grad_norm": 10.78309154510498, "learning_rate": 2.293777777777778e-06, "loss": 0.8207, "step": 19840 }, { "epoch": 2.5873305526590196, "grad_norm": 8.628974914550781, "learning_rate": 2.2893333333333332e-06, "loss": 0.8192, "step": 19850 }, { "epoch": 2.588633993743483, "grad_norm": 9.785404205322266, "learning_rate": 2.284888888888889e-06, "loss": 0.8558, "step": 19860 }, { "epoch": 2.589937434827946, "grad_norm": 8.311519622802734, "learning_rate": 2.2804444444444446e-06, "loss": 0.846, "step": 19870 }, { "epoch": 2.591240875912409, "grad_norm": 9.980459213256836, "learning_rate": 2.2760000000000003e-06, "loss": 0.8365, "step": 19880 }, { "epoch": 2.5925443169968716, "grad_norm": 9.531750679016113, "learning_rate": 2.2715555555555556e-06, "loss": 0.8665, "step": 19890 }, { "epoch": 2.593847758081335, "grad_norm": 9.05338191986084, "learning_rate": 2.2671111111111113e-06, "loss": 0.8137, "step": 19900 }, { "epoch": 2.593847758081335, "eval/acc": 58.13953399658203, "step": 19900 }, { "epoch": 2.593847758081335, "eval_loss": 1.7151741981506348, "eval_runtime": 0.5947, "eval_samples_per_second": 72.301, "eval_steps_per_second": 1.681, "step": 19900 }, { "epoch": 2.5951511991657976, "grad_norm": 8.866362571716309, "learning_rate": 2.262666666666667e-06, "loss": 0.7586, "step": 19910 }, { "epoch": 2.596454640250261, "grad_norm": 9.143294334411621, "learning_rate": 2.2582222222222227e-06, "loss": 0.8466, "step": 19920 }, { "epoch": 2.5977580813347236, "grad_norm": 8.100214958190918, "learning_rate": 2.253777777777778e-06, "loss": 0.7925, "step": 19930 }, { "epoch": 2.599061522419187, "grad_norm": 10.791122436523438, "learning_rate": 2.2493333333333336e-06, "loss": 0.8622, "step": 19940 }, { "epoch": 2.6003649635036497, "grad_norm": 9.305837631225586, "learning_rate": 2.244888888888889e-06, "loss": 0.8136, "step": 19950 }, { "epoch": 2.6016684045881124, "grad_norm": 8.93336296081543, "learning_rate": 2.2404444444444446e-06, "loss": 0.7706, "step": 19960 }, { "epoch": 2.6029718456725757, "grad_norm": 9.683979034423828, "learning_rate": 2.2360000000000003e-06, "loss": 0.7822, "step": 19970 }, { "epoch": 2.6042752867570385, "grad_norm": 9.008330345153809, "learning_rate": 2.2315555555555555e-06, "loss": 0.8574, "step": 19980 }, { "epoch": 2.6055787278415017, "grad_norm": 8.597892761230469, "learning_rate": 2.2271111111111112e-06, "loss": 0.8596, "step": 19990 }, { "epoch": 2.6068821689259645, "grad_norm": 8.90829849243164, "learning_rate": 2.222666666666667e-06, "loss": 0.7657, "step": 20000 }, { "epoch": 2.6068821689259645, "eval/acc": 55.8139533996582, "step": 20000 }, { "epoch": 2.6068821689259645, "eval_loss": 1.704869031906128, "eval_runtime": 0.6013, "eval_samples_per_second": 71.506, "eval_steps_per_second": 1.663, "step": 20000 }, { "epoch": 2.6081856100104277, "grad_norm": 8.453849792480469, "learning_rate": 2.2182222222222226e-06, "loss": 0.7928, "step": 20010 }, { "epoch": 2.6094890510948905, "grad_norm": 9.283638000488281, "learning_rate": 2.213777777777778e-06, "loss": 0.8167, "step": 20020 }, { "epoch": 2.6107924921793533, "grad_norm": 8.577330589294434, "learning_rate": 2.2093333333333336e-06, "loss": 0.7852, "step": 20030 }, { "epoch": 2.6120959332638165, "grad_norm": 8.238676071166992, "learning_rate": 2.2048888888888893e-06, "loss": 0.8583, "step": 20040 }, { "epoch": 2.6133993743482793, "grad_norm": 8.749384880065918, "learning_rate": 2.2004444444444445e-06, "loss": 0.7423, "step": 20050 }, { "epoch": 2.6147028154327425, "grad_norm": 8.294389724731445, "learning_rate": 2.1960000000000002e-06, "loss": 0.7263, "step": 20060 }, { "epoch": 2.6160062565172053, "grad_norm": 10.182981491088867, "learning_rate": 2.1915555555555555e-06, "loss": 0.7088, "step": 20070 }, { "epoch": 2.6173096976016685, "grad_norm": 9.129683494567871, "learning_rate": 2.187111111111111e-06, "loss": 0.8517, "step": 20080 }, { "epoch": 2.6186131386861313, "grad_norm": 9.808338165283203, "learning_rate": 2.182666666666667e-06, "loss": 0.8825, "step": 20090 }, { "epoch": 2.619916579770594, "grad_norm": 9.212470054626465, "learning_rate": 2.1782222222222226e-06, "loss": 0.7882, "step": 20100 }, { "epoch": 2.619916579770594, "eval/acc": 55.8139533996582, "step": 20100 }, { "epoch": 2.619916579770594, "eval_loss": 1.76344633102417, "eval_runtime": 0.751, "eval_samples_per_second": 57.26, "eval_steps_per_second": 1.332, "step": 20100 }, { "epoch": 2.6212200208550573, "grad_norm": 8.38420581817627, "learning_rate": 2.173777777777778e-06, "loss": 0.7743, "step": 20110 }, { "epoch": 2.6225234619395206, "grad_norm": 7.478631496429443, "learning_rate": 2.1693333333333335e-06, "loss": 0.8153, "step": 20120 }, { "epoch": 2.6238269030239834, "grad_norm": 8.357779502868652, "learning_rate": 2.1648888888888892e-06, "loss": 0.8325, "step": 20130 }, { "epoch": 2.625130344108446, "grad_norm": 9.877179145812988, "learning_rate": 2.160444444444445e-06, "loss": 0.8657, "step": 20140 }, { "epoch": 2.6264337851929094, "grad_norm": 9.724544525146484, "learning_rate": 2.156e-06, "loss": 0.7548, "step": 20150 }, { "epoch": 2.627737226277372, "grad_norm": 10.089259147644043, "learning_rate": 2.1515555555555555e-06, "loss": 0.9041, "step": 20160 }, { "epoch": 2.6290406673618354, "grad_norm": 7.933990955352783, "learning_rate": 2.147111111111111e-06, "loss": 0.777, "step": 20170 }, { "epoch": 2.630344108446298, "grad_norm": 8.897799491882324, "learning_rate": 2.142666666666667e-06, "loss": 0.8425, "step": 20180 }, { "epoch": 2.6316475495307614, "grad_norm": 9.558837890625, "learning_rate": 2.1382222222222225e-06, "loss": 0.8559, "step": 20190 }, { "epoch": 2.632950990615224, "grad_norm": 9.414146423339844, "learning_rate": 2.133777777777778e-06, "loss": 0.8226, "step": 20200 }, { "epoch": 2.632950990615224, "eval/acc": 55.8139533996582, "step": 20200 }, { "epoch": 2.632950990615224, "eval_loss": 1.73488187789917, "eval_runtime": 0.6112, "eval_samples_per_second": 70.351, "eval_steps_per_second": 1.636, "step": 20200 }, { "epoch": 2.634254431699687, "grad_norm": 7.601212024688721, "learning_rate": 2.1293333333333335e-06, "loss": 0.8004, "step": 20210 }, { "epoch": 2.63555787278415, "grad_norm": 9.234183311462402, "learning_rate": 2.124888888888889e-06, "loss": 0.8305, "step": 20220 }, { "epoch": 2.636861313868613, "grad_norm": 9.179082870483398, "learning_rate": 2.120444444444445e-06, "loss": 0.7903, "step": 20230 }, { "epoch": 2.6381647549530762, "grad_norm": 9.157135009765625, "learning_rate": 2.116e-06, "loss": 0.8007, "step": 20240 }, { "epoch": 2.639468196037539, "grad_norm": 10.37834644317627, "learning_rate": 2.111555555555556e-06, "loss": 0.8345, "step": 20250 }, { "epoch": 2.6407716371220022, "grad_norm": 9.638982772827148, "learning_rate": 2.107111111111111e-06, "loss": 0.8608, "step": 20260 }, { "epoch": 2.642075078206465, "grad_norm": 8.557043075561523, "learning_rate": 2.102666666666667e-06, "loss": 0.8646, "step": 20270 }, { "epoch": 2.643378519290928, "grad_norm": 8.109334945678711, "learning_rate": 2.0982222222222225e-06, "loss": 0.784, "step": 20280 }, { "epoch": 2.644681960375391, "grad_norm": 6.916861057281494, "learning_rate": 2.0937777777777778e-06, "loss": 0.8635, "step": 20290 }, { "epoch": 2.6459854014598543, "grad_norm": 10.789344787597656, "learning_rate": 2.0893333333333335e-06, "loss": 0.8841, "step": 20300 }, { "epoch": 2.6459854014598543, "eval/acc": 55.8139533996582, "step": 20300 }, { "epoch": 2.6459854014598543, "eval_loss": 1.7253895998001099, "eval_runtime": 0.5949, "eval_samples_per_second": 72.278, "eval_steps_per_second": 1.681, "step": 20300 }, { "epoch": 2.647288842544317, "grad_norm": 8.237646102905273, "learning_rate": 2.084888888888889e-06, "loss": 0.766, "step": 20310 }, { "epoch": 2.64859228362878, "grad_norm": 8.624271392822266, "learning_rate": 2.080444444444445e-06, "loss": 0.8947, "step": 20320 }, { "epoch": 2.649895724713243, "grad_norm": 10.324417114257812, "learning_rate": 2.076e-06, "loss": 0.8271, "step": 20330 }, { "epoch": 2.651199165797706, "grad_norm": 7.076232433319092, "learning_rate": 2.071555555555556e-06, "loss": 0.8504, "step": 20340 }, { "epoch": 2.6525026068821687, "grad_norm": 8.949559211730957, "learning_rate": 2.0671111111111115e-06, "loss": 0.7841, "step": 20350 }, { "epoch": 2.653806047966632, "grad_norm": 9.507597923278809, "learning_rate": 2.0626666666666668e-06, "loss": 0.8529, "step": 20360 }, { "epoch": 2.655109489051095, "grad_norm": 8.600275039672852, "learning_rate": 2.0582222222222225e-06, "loss": 0.8194, "step": 20370 }, { "epoch": 2.656412930135558, "grad_norm": 8.702995300292969, "learning_rate": 2.0537777777777777e-06, "loss": 0.8231, "step": 20380 }, { "epoch": 2.6577163712200207, "grad_norm": 10.336079597473145, "learning_rate": 2.0493333333333334e-06, "loss": 0.7455, "step": 20390 }, { "epoch": 2.659019812304484, "grad_norm": 9.913479804992676, "learning_rate": 2.044888888888889e-06, "loss": 0.8347, "step": 20400 }, { "epoch": 2.659019812304484, "eval/acc": 55.8139533996582, "step": 20400 }, { "epoch": 2.659019812304484, "eval_loss": 1.7520560026168823, "eval_runtime": 0.6286, "eval_samples_per_second": 68.405, "eval_steps_per_second": 1.591, "step": 20400 }, { "epoch": 2.6603232533889467, "grad_norm": 10.463825225830078, "learning_rate": 2.040444444444445e-06, "loss": 0.8286, "step": 20410 }, { "epoch": 2.66162669447341, "grad_norm": 9.548503875732422, "learning_rate": 2.036e-06, "loss": 0.8157, "step": 20420 }, { "epoch": 2.6629301355578727, "grad_norm": 8.514922142028809, "learning_rate": 2.0315555555555558e-06, "loss": 0.8322, "step": 20430 }, { "epoch": 2.664233576642336, "grad_norm": 10.52540111541748, "learning_rate": 2.0271111111111115e-06, "loss": 0.7563, "step": 20440 }, { "epoch": 2.6655370177267987, "grad_norm": 8.459930419921875, "learning_rate": 2.022666666666667e-06, "loss": 0.7748, "step": 20450 }, { "epoch": 2.6668404588112615, "grad_norm": 9.70013427734375, "learning_rate": 2.0182222222222224e-06, "loss": 0.8655, "step": 20460 }, { "epoch": 2.6681438998957248, "grad_norm": 11.60466194152832, "learning_rate": 2.0137777777777777e-06, "loss": 0.763, "step": 20470 }, { "epoch": 2.6694473409801875, "grad_norm": 8.21410846710205, "learning_rate": 2.0093333333333334e-06, "loss": 0.6768, "step": 20480 }, { "epoch": 2.6707507820646508, "grad_norm": 9.287160873413086, "learning_rate": 2.004888888888889e-06, "loss": 0.8581, "step": 20490 }, { "epoch": 2.6720542231491136, "grad_norm": 9.349991798400879, "learning_rate": 2.0004444444444448e-06, "loss": 0.8021, "step": 20500 }, { "epoch": 2.6720542231491136, "eval/acc": 55.8139533996582, "step": 20500 }, { "epoch": 2.6720542231491136, "eval_loss": 1.7581194639205933, "eval_runtime": 0.5932, "eval_samples_per_second": 72.482, "eval_steps_per_second": 1.686, "step": 20500 }, { "epoch": 2.673357664233577, "grad_norm": 8.882963180541992, "learning_rate": 1.996e-06, "loss": 0.9018, "step": 20510 }, { "epoch": 2.6746611053180396, "grad_norm": 9.676372528076172, "learning_rate": 1.9915555555555557e-06, "loss": 0.8221, "step": 20520 }, { "epoch": 2.6759645464025024, "grad_norm": 9.17044734954834, "learning_rate": 1.9871111111111114e-06, "loss": 0.7813, "step": 20530 }, { "epoch": 2.6772679874869656, "grad_norm": 8.10533618927002, "learning_rate": 1.982666666666667e-06, "loss": 0.8308, "step": 20540 }, { "epoch": 2.678571428571429, "grad_norm": 9.31989860534668, "learning_rate": 1.9782222222222224e-06, "loss": 0.8135, "step": 20550 }, { "epoch": 2.6798748696558916, "grad_norm": 9.844327926635742, "learning_rate": 1.973777777777778e-06, "loss": 0.8391, "step": 20560 }, { "epoch": 2.6811783107403544, "grad_norm": 7.683537483215332, "learning_rate": 1.9693333333333333e-06, "loss": 0.7482, "step": 20570 }, { "epoch": 2.6824817518248176, "grad_norm": 7.707378387451172, "learning_rate": 1.964888888888889e-06, "loss": 0.8401, "step": 20580 }, { "epoch": 2.6837851929092804, "grad_norm": 9.39203929901123, "learning_rate": 1.9604444444444447e-06, "loss": 0.8429, "step": 20590 }, { "epoch": 2.6850886339937436, "grad_norm": 9.42681884765625, "learning_rate": 1.956e-06, "loss": 0.8566, "step": 20600 }, { "epoch": 2.6850886339937436, "eval/acc": 55.8139533996582, "step": 20600 }, { "epoch": 2.6850886339937436, "eval_loss": 1.7261815071105957, "eval_runtime": 0.8276, "eval_samples_per_second": 51.958, "eval_steps_per_second": 1.208, "step": 20600 }, { "epoch": 2.6863920750782064, "grad_norm": 8.788177490234375, "learning_rate": 1.9515555555555557e-06, "loss": 0.8298, "step": 20610 }, { "epoch": 2.6876955161626697, "grad_norm": 8.58096694946289, "learning_rate": 1.9471111111111114e-06, "loss": 0.7794, "step": 20620 }, { "epoch": 2.6889989572471324, "grad_norm": 8.545759201049805, "learning_rate": 1.942666666666667e-06, "loss": 0.8269, "step": 20630 }, { "epoch": 2.6903023983315952, "grad_norm": 9.065820693969727, "learning_rate": 1.9382222222222223e-06, "loss": 0.8254, "step": 20640 }, { "epoch": 2.6916058394160585, "grad_norm": 12.32685375213623, "learning_rate": 1.933777777777778e-06, "loss": 0.8192, "step": 20650 }, { "epoch": 2.6929092805005213, "grad_norm": 10.891736030578613, "learning_rate": 1.9293333333333337e-06, "loss": 0.7958, "step": 20660 }, { "epoch": 2.6942127215849845, "grad_norm": 9.1018648147583, "learning_rate": 1.924888888888889e-06, "loss": 0.7531, "step": 20670 }, { "epoch": 2.6955161626694473, "grad_norm": 11.012262344360352, "learning_rate": 1.9204444444444447e-06, "loss": 0.8609, "step": 20680 }, { "epoch": 2.6968196037539105, "grad_norm": 8.111937522888184, "learning_rate": 1.916e-06, "loss": 0.7365, "step": 20690 }, { "epoch": 2.6981230448383733, "grad_norm": 8.684410095214844, "learning_rate": 1.9115555555555556e-06, "loss": 0.8194, "step": 20700 }, { "epoch": 2.6981230448383733, "eval/acc": 55.8139533996582, "step": 20700 }, { "epoch": 2.6981230448383733, "eval_loss": 1.715824842453003, "eval_runtime": 0.5947, "eval_samples_per_second": 72.31, "eval_steps_per_second": 1.682, "step": 20700 }, { "epoch": 2.699426485922836, "grad_norm": 9.805110931396484, "learning_rate": 1.9071111111111113e-06, "loss": 0.8631, "step": 20710 }, { "epoch": 2.7007299270072993, "grad_norm": 8.534740447998047, "learning_rate": 1.9026666666666668e-06, "loss": 0.8436, "step": 20720 }, { "epoch": 2.702033368091762, "grad_norm": 8.149726867675781, "learning_rate": 1.8982222222222225e-06, "loss": 0.806, "step": 20730 }, { "epoch": 2.7033368091762253, "grad_norm": 9.574934959411621, "learning_rate": 1.893777777777778e-06, "loss": 0.8009, "step": 20740 }, { "epoch": 2.704640250260688, "grad_norm": 9.412125587463379, "learning_rate": 1.8893333333333335e-06, "loss": 0.8207, "step": 20750 }, { "epoch": 2.7059436913451513, "grad_norm": 10.085877418518066, "learning_rate": 1.8848888888888892e-06, "loss": 0.7766, "step": 20760 }, { "epoch": 2.707247132429614, "grad_norm": 8.625889778137207, "learning_rate": 1.8804444444444444e-06, "loss": 0.7763, "step": 20770 }, { "epoch": 2.708550573514077, "grad_norm": 8.862693786621094, "learning_rate": 1.8760000000000001e-06, "loss": 0.7289, "step": 20780 }, { "epoch": 2.70985401459854, "grad_norm": 12.81855297088623, "learning_rate": 1.8715555555555556e-06, "loss": 0.838, "step": 20790 }, { "epoch": 2.7111574556830034, "grad_norm": 9.329113960266113, "learning_rate": 1.8671111111111113e-06, "loss": 0.9157, "step": 20800 }, { "epoch": 2.7111574556830034, "eval/acc": 55.8139533996582, "step": 20800 }, { "epoch": 2.7111574556830034, "eval_loss": 1.7705872058868408, "eval_runtime": 0.5975, "eval_samples_per_second": 71.968, "eval_steps_per_second": 1.674, "step": 20800 }, { "epoch": 2.712460896767466, "grad_norm": 9.216154098510742, "learning_rate": 1.8626666666666668e-06, "loss": 0.8403, "step": 20810 }, { "epoch": 2.713764337851929, "grad_norm": 8.265249252319336, "learning_rate": 1.8582222222222225e-06, "loss": 0.8327, "step": 20820 }, { "epoch": 2.715067778936392, "grad_norm": 9.86301326751709, "learning_rate": 1.853777777777778e-06, "loss": 0.7988, "step": 20830 }, { "epoch": 2.716371220020855, "grad_norm": 9.455988883972168, "learning_rate": 1.8493333333333336e-06, "loss": 0.8337, "step": 20840 }, { "epoch": 2.717674661105318, "grad_norm": 8.680557250976562, "learning_rate": 1.8448888888888891e-06, "loss": 0.7727, "step": 20850 }, { "epoch": 2.718978102189781, "grad_norm": 7.517144203186035, "learning_rate": 1.8404444444444446e-06, "loss": 0.8316, "step": 20860 }, { "epoch": 2.720281543274244, "grad_norm": 9.021925926208496, "learning_rate": 1.8360000000000003e-06, "loss": 0.8849, "step": 20870 }, { "epoch": 2.721584984358707, "grad_norm": 8.564515113830566, "learning_rate": 1.8315555555555556e-06, "loss": 0.7729, "step": 20880 }, { "epoch": 2.72288842544317, "grad_norm": 8.147209167480469, "learning_rate": 1.8271111111111113e-06, "loss": 0.8401, "step": 20890 }, { "epoch": 2.724191866527633, "grad_norm": 9.876725196838379, "learning_rate": 1.8226666666666667e-06, "loss": 0.8213, "step": 20900 }, { "epoch": 2.724191866527633, "eval/acc": 55.8139533996582, "step": 20900 }, { "epoch": 2.724191866527633, "eval_loss": 1.738500714302063, "eval_runtime": 0.5984, "eval_samples_per_second": 71.855, "eval_steps_per_second": 1.671, "step": 20900 }, { "epoch": 2.725495307612096, "grad_norm": 8.925768852233887, "learning_rate": 1.8182222222222224e-06, "loss": 0.7689, "step": 20910 }, { "epoch": 2.726798748696559, "grad_norm": 8.184511184692383, "learning_rate": 1.813777777777778e-06, "loss": 0.8103, "step": 20920 }, { "epoch": 2.728102189781022, "grad_norm": 8.428443908691406, "learning_rate": 1.8093333333333336e-06, "loss": 0.8449, "step": 20930 }, { "epoch": 2.729405630865485, "grad_norm": 8.153343200683594, "learning_rate": 1.804888888888889e-06, "loss": 0.7802, "step": 20940 }, { "epoch": 2.730709071949948, "grad_norm": 7.602133274078369, "learning_rate": 1.8004444444444446e-06, "loss": 0.7915, "step": 20950 }, { "epoch": 2.7320125130344106, "grad_norm": 8.75157642364502, "learning_rate": 1.7960000000000003e-06, "loss": 0.8533, "step": 20960 }, { "epoch": 2.733315954118874, "grad_norm": 8.349024772644043, "learning_rate": 1.7915555555555557e-06, "loss": 0.7904, "step": 20970 }, { "epoch": 2.734619395203337, "grad_norm": 7.673280715942383, "learning_rate": 1.7871111111111112e-06, "loss": 0.81, "step": 20980 }, { "epoch": 2.7359228362878, "grad_norm": 8.986681938171387, "learning_rate": 1.7826666666666667e-06, "loss": 0.7565, "step": 20990 }, { "epoch": 2.7372262773722627, "grad_norm": 10.555803298950195, "learning_rate": 1.7782222222222224e-06, "loss": 0.825, "step": 21000 }, { "epoch": 2.7372262773722627, "eval/acc": 55.8139533996582, "step": 21000 }, { "epoch": 2.7372262773722627, "eval_loss": 1.7480158805847168, "eval_runtime": 0.595, "eval_samples_per_second": 72.274, "eval_steps_per_second": 1.681, "step": 21000 }, { "epoch": 2.738529718456726, "grad_norm": 8.346331596374512, "learning_rate": 1.7737777777777779e-06, "loss": 0.8415, "step": 21010 }, { "epoch": 2.7398331595411887, "grad_norm": 8.959522247314453, "learning_rate": 1.7693333333333336e-06, "loss": 0.8585, "step": 21020 }, { "epoch": 2.7411366006256515, "grad_norm": 9.007274627685547, "learning_rate": 1.764888888888889e-06, "loss": 0.7772, "step": 21030 }, { "epoch": 2.7424400417101147, "grad_norm": 9.85273265838623, "learning_rate": 1.7604444444444445e-06, "loss": 0.803, "step": 21040 }, { "epoch": 2.743743482794578, "grad_norm": 9.497745513916016, "learning_rate": 1.7560000000000002e-06, "loss": 0.8231, "step": 21050 }, { "epoch": 2.7450469238790407, "grad_norm": 7.569871425628662, "learning_rate": 1.7515555555555557e-06, "loss": 0.7644, "step": 21060 }, { "epoch": 2.7463503649635035, "grad_norm": 8.82552719116211, "learning_rate": 1.7471111111111114e-06, "loss": 0.8311, "step": 21070 }, { "epoch": 2.7476538060479667, "grad_norm": 8.405786514282227, "learning_rate": 1.7426666666666667e-06, "loss": 0.8311, "step": 21080 }, { "epoch": 2.7489572471324295, "grad_norm": 9.895186424255371, "learning_rate": 1.7382222222222223e-06, "loss": 0.76, "step": 21090 }, { "epoch": 2.7502606882168927, "grad_norm": 10.085371971130371, "learning_rate": 1.7337777777777778e-06, "loss": 0.8829, "step": 21100 }, { "epoch": 2.7502606882168927, "eval/acc": 58.13953399658203, "step": 21100 }, { "epoch": 2.7502606882168927, "eval_loss": 1.7588940858840942, "eval_runtime": 0.6282, "eval_samples_per_second": 68.448, "eval_steps_per_second": 1.592, "step": 21100 }, { "epoch": 2.7515641293013555, "grad_norm": 8.851200103759766, "learning_rate": 1.7293333333333335e-06, "loss": 0.7755, "step": 21110 }, { "epoch": 2.7528675703858188, "grad_norm": 10.234457969665527, "learning_rate": 1.724888888888889e-06, "loss": 0.7719, "step": 21120 }, { "epoch": 2.7541710114702815, "grad_norm": 8.145639419555664, "learning_rate": 1.7204444444444445e-06, "loss": 0.7959, "step": 21130 }, { "epoch": 2.7554744525547443, "grad_norm": 8.405341148376465, "learning_rate": 1.7160000000000002e-06, "loss": 0.7324, "step": 21140 }, { "epoch": 2.7567778936392076, "grad_norm": 10.199214935302734, "learning_rate": 1.7115555555555557e-06, "loss": 0.7886, "step": 21150 }, { "epoch": 2.7580813347236703, "grad_norm": 8.674017906188965, "learning_rate": 1.7071111111111113e-06, "loss": 0.8636, "step": 21160 }, { "epoch": 2.7593847758081336, "grad_norm": 7.610052108764648, "learning_rate": 1.7026666666666668e-06, "loss": 0.852, "step": 21170 }, { "epoch": 2.7606882168925964, "grad_norm": 7.873721599578857, "learning_rate": 1.6982222222222225e-06, "loss": 0.7453, "step": 21180 }, { "epoch": 2.7619916579770596, "grad_norm": 9.731517791748047, "learning_rate": 1.6937777777777778e-06, "loss": 0.7407, "step": 21190 }, { "epoch": 2.7632950990615224, "grad_norm": 9.55085277557373, "learning_rate": 1.6893333333333335e-06, "loss": 0.7894, "step": 21200 }, { "epoch": 2.7632950990615224, "eval/acc": 55.8139533996582, "step": 21200 }, { "epoch": 2.7632950990615224, "eval_loss": 1.7706856727600098, "eval_runtime": 0.598, "eval_samples_per_second": 71.91, "eval_steps_per_second": 1.672, "step": 21200 }, { "epoch": 2.764598540145985, "grad_norm": 8.54183578491211, "learning_rate": 1.684888888888889e-06, "loss": 0.8015, "step": 21210 }, { "epoch": 2.7659019812304484, "grad_norm": 9.926834106445312, "learning_rate": 1.6804444444444444e-06, "loss": 0.8217, "step": 21220 }, { "epoch": 2.7672054223149116, "grad_norm": 10.468040466308594, "learning_rate": 1.6760000000000001e-06, "loss": 0.8725, "step": 21230 }, { "epoch": 2.7685088633993744, "grad_norm": 7.767794609069824, "learning_rate": 1.6715555555555556e-06, "loss": 0.7824, "step": 21240 }, { "epoch": 2.769812304483837, "grad_norm": 8.285783767700195, "learning_rate": 1.6671111111111113e-06, "loss": 0.8516, "step": 21250 }, { "epoch": 2.7711157455683004, "grad_norm": 8.21517562866211, "learning_rate": 1.6626666666666668e-06, "loss": 0.8567, "step": 21260 }, { "epoch": 2.772419186652763, "grad_norm": 10.385954856872559, "learning_rate": 1.6582222222222225e-06, "loss": 0.8594, "step": 21270 }, { "epoch": 2.7737226277372264, "grad_norm": 10.0065336227417, "learning_rate": 1.653777777777778e-06, "loss": 0.8324, "step": 21280 }, { "epoch": 2.7750260688216892, "grad_norm": 9.019394874572754, "learning_rate": 1.6493333333333334e-06, "loss": 0.8144, "step": 21290 }, { "epoch": 2.7763295099061525, "grad_norm": 9.957494735717773, "learning_rate": 1.644888888888889e-06, "loss": 0.8109, "step": 21300 }, { "epoch": 2.7763295099061525, "eval/acc": 55.8139533996582, "step": 21300 }, { "epoch": 2.7763295099061525, "eval_loss": 1.7376309633255005, "eval_runtime": 0.5956, "eval_samples_per_second": 72.195, "eval_steps_per_second": 1.679, "step": 21300 }, { "epoch": 2.7776329509906152, "grad_norm": 7.597692489624023, "learning_rate": 1.6404444444444446e-06, "loss": 0.8543, "step": 21310 }, { "epoch": 2.778936392075078, "grad_norm": 9.298222541809082, "learning_rate": 1.636e-06, "loss": 0.8035, "step": 21320 }, { "epoch": 2.7802398331595413, "grad_norm": 9.63909912109375, "learning_rate": 1.6315555555555556e-06, "loss": 0.8199, "step": 21330 }, { "epoch": 2.781543274244004, "grad_norm": 9.966618537902832, "learning_rate": 1.6271111111111113e-06, "loss": 0.8671, "step": 21340 }, { "epoch": 2.7828467153284673, "grad_norm": 7.089530944824219, "learning_rate": 1.6226666666666667e-06, "loss": 0.7366, "step": 21350 }, { "epoch": 2.78415015641293, "grad_norm": 8.88662052154541, "learning_rate": 1.6182222222222224e-06, "loss": 0.8052, "step": 21360 }, { "epoch": 2.7854535974973933, "grad_norm": 8.042808532714844, "learning_rate": 1.613777777777778e-06, "loss": 0.801, "step": 21370 }, { "epoch": 2.786757038581856, "grad_norm": 10.173172950744629, "learning_rate": 1.6093333333333336e-06, "loss": 0.7869, "step": 21380 }, { "epoch": 2.788060479666319, "grad_norm": 9.452740669250488, "learning_rate": 1.6048888888888889e-06, "loss": 0.7967, "step": 21390 }, { "epoch": 2.789363920750782, "grad_norm": 10.909093856811523, "learning_rate": 1.6004444444444446e-06, "loss": 0.8885, "step": 21400 }, { "epoch": 2.789363920750782, "eval/acc": 55.8139533996582, "step": 21400 }, { "epoch": 2.789363920750782, "eval_loss": 1.7504191398620605, "eval_runtime": 0.598, "eval_samples_per_second": 71.902, "eval_steps_per_second": 1.672, "step": 21400 }, { "epoch": 2.790667361835245, "grad_norm": 8.049874305725098, "learning_rate": 1.596e-06, "loss": 0.7642, "step": 21410 }, { "epoch": 2.791970802919708, "grad_norm": 10.671398162841797, "learning_rate": 1.5915555555555555e-06, "loss": 0.8066, "step": 21420 }, { "epoch": 2.793274244004171, "grad_norm": 7.51567268371582, "learning_rate": 1.5871111111111112e-06, "loss": 0.8404, "step": 21430 }, { "epoch": 2.794577685088634, "grad_norm": 11.538567543029785, "learning_rate": 1.5826666666666667e-06, "loss": 0.792, "step": 21440 }, { "epoch": 2.795881126173097, "grad_norm": 8.25501537322998, "learning_rate": 1.5782222222222224e-06, "loss": 0.8429, "step": 21450 }, { "epoch": 2.7971845672575597, "grad_norm": 7.69072151184082, "learning_rate": 1.5737777777777779e-06, "loss": 0.7721, "step": 21460 }, { "epoch": 2.798488008342023, "grad_norm": 9.079118728637695, "learning_rate": 1.5693333333333336e-06, "loss": 0.8482, "step": 21470 }, { "epoch": 2.799791449426486, "grad_norm": 9.573830604553223, "learning_rate": 1.564888888888889e-06, "loss": 0.816, "step": 21480 }, { "epoch": 2.801094890510949, "grad_norm": 10.528756141662598, "learning_rate": 1.5604444444444447e-06, "loss": 0.7285, "step": 21490 }, { "epoch": 2.8023983315954117, "grad_norm": 7.944141864776611, "learning_rate": 1.556e-06, "loss": 0.8056, "step": 21500 }, { "epoch": 2.8023983315954117, "eval/acc": 55.8139533996582, "step": 21500 }, { "epoch": 2.8023983315954117, "eval_loss": 1.7248492240905762, "eval_runtime": 0.5957, "eval_samples_per_second": 72.186, "eval_steps_per_second": 1.679, "step": 21500 }, { "epoch": 2.803701772679875, "grad_norm": 7.831629753112793, "learning_rate": 1.5515555555555555e-06, "loss": 0.8012, "step": 21510 }, { "epoch": 2.8050052137643378, "grad_norm": 9.035334587097168, "learning_rate": 1.5471111111111112e-06, "loss": 0.7636, "step": 21520 }, { "epoch": 2.806308654848801, "grad_norm": 9.537494659423828, "learning_rate": 1.5426666666666667e-06, "loss": 0.9204, "step": 21530 }, { "epoch": 2.8076120959332638, "grad_norm": 7.744075298309326, "learning_rate": 1.5382222222222224e-06, "loss": 0.8169, "step": 21540 }, { "epoch": 2.808915537017727, "grad_norm": 8.111047744750977, "learning_rate": 1.5337777777777778e-06, "loss": 0.7965, "step": 21550 }, { "epoch": 2.81021897810219, "grad_norm": 9.138099670410156, "learning_rate": 1.5293333333333335e-06, "loss": 0.7859, "step": 21560 }, { "epoch": 2.8115224191866526, "grad_norm": 8.989847183227539, "learning_rate": 1.524888888888889e-06, "loss": 0.8174, "step": 21570 }, { "epoch": 2.812825860271116, "grad_norm": 7.629883289337158, "learning_rate": 1.5204444444444447e-06, "loss": 0.6784, "step": 21580 }, { "epoch": 2.8141293013555786, "grad_norm": 9.241663932800293, "learning_rate": 1.5160000000000002e-06, "loss": 0.8228, "step": 21590 }, { "epoch": 2.815432742440042, "grad_norm": 8.300806999206543, "learning_rate": 1.5115555555555554e-06, "loss": 0.8382, "step": 21600 }, { "epoch": 2.815432742440042, "eval/acc": 55.8139533996582, "step": 21600 }, { "epoch": 2.815432742440042, "eval_loss": 1.7526339292526245, "eval_runtime": 0.5944, "eval_samples_per_second": 72.341, "eval_steps_per_second": 1.682, "step": 21600 }, { "epoch": 2.8167361835245046, "grad_norm": 7.783026218414307, "learning_rate": 1.5071111111111111e-06, "loss": 0.7666, "step": 21610 }, { "epoch": 2.818039624608968, "grad_norm": 8.837292671203613, "learning_rate": 1.5026666666666666e-06, "loss": 0.8254, "step": 21620 }, { "epoch": 2.8193430656934306, "grad_norm": 8.256052017211914, "learning_rate": 1.4982222222222223e-06, "loss": 0.8286, "step": 21630 }, { "epoch": 2.8206465067778934, "grad_norm": 8.350088119506836, "learning_rate": 1.4937777777777778e-06, "loss": 0.8416, "step": 21640 }, { "epoch": 2.8219499478623566, "grad_norm": 8.230713844299316, "learning_rate": 1.4893333333333335e-06, "loss": 0.7837, "step": 21650 }, { "epoch": 2.82325338894682, "grad_norm": 9.582507133483887, "learning_rate": 1.484888888888889e-06, "loss": 0.8531, "step": 21660 }, { "epoch": 2.8245568300312827, "grad_norm": 8.942204475402832, "learning_rate": 1.4804444444444447e-06, "loss": 0.8512, "step": 21670 }, { "epoch": 2.8258602711157454, "grad_norm": 9.81450080871582, "learning_rate": 1.4760000000000001e-06, "loss": 0.8324, "step": 21680 }, { "epoch": 2.8271637122002087, "grad_norm": 9.60515022277832, "learning_rate": 1.4715555555555558e-06, "loss": 0.8128, "step": 21690 }, { "epoch": 2.8284671532846715, "grad_norm": 9.353902816772461, "learning_rate": 1.467111111111111e-06, "loss": 0.8133, "step": 21700 }, { "epoch": 2.8284671532846715, "eval/acc": 55.8139533996582, "step": 21700 }, { "epoch": 2.8284671532846715, "eval_loss": 1.7252392768859863, "eval_runtime": 0.6197, "eval_samples_per_second": 69.388, "eval_steps_per_second": 1.614, "step": 21700 }, { "epoch": 2.8297705943691343, "grad_norm": 8.195267677307129, "learning_rate": 1.4626666666666666e-06, "loss": 0.8227, "step": 21710 }, { "epoch": 2.8310740354535975, "grad_norm": 9.377577781677246, "learning_rate": 1.4582222222222223e-06, "loss": 0.8392, "step": 21720 }, { "epoch": 2.8323774765380607, "grad_norm": 8.656903266906738, "learning_rate": 1.4537777777777778e-06, "loss": 0.7891, "step": 21730 }, { "epoch": 2.8336809176225235, "grad_norm": 9.403076171875, "learning_rate": 1.4493333333333334e-06, "loss": 0.8846, "step": 21740 }, { "epoch": 2.8349843587069863, "grad_norm": 8.889082908630371, "learning_rate": 1.444888888888889e-06, "loss": 0.8441, "step": 21750 }, { "epoch": 2.8362877997914495, "grad_norm": 8.883251190185547, "learning_rate": 1.4404444444444446e-06, "loss": 0.7443, "step": 21760 }, { "epoch": 2.8375912408759123, "grad_norm": 8.660350799560547, "learning_rate": 1.436e-06, "loss": 0.6943, "step": 21770 }, { "epoch": 2.8388946819603755, "grad_norm": 9.905976295471191, "learning_rate": 1.4315555555555558e-06, "loss": 0.7677, "step": 21780 }, { "epoch": 2.8401981230448383, "grad_norm": 11.383807182312012, "learning_rate": 1.4271111111111113e-06, "loss": 0.8732, "step": 21790 }, { "epoch": 2.8415015641293015, "grad_norm": 9.875412940979004, "learning_rate": 1.422666666666667e-06, "loss": 0.7804, "step": 21800 }, { "epoch": 2.8415015641293015, "eval/acc": 55.8139533996582, "step": 21800 }, { "epoch": 2.8415015641293015, "eval_loss": 1.738519549369812, "eval_runtime": 0.5938, "eval_samples_per_second": 72.416, "eval_steps_per_second": 1.684, "step": 21800 }, { "epoch": 2.8428050052137643, "grad_norm": 9.082887649536133, "learning_rate": 1.4182222222222222e-06, "loss": 0.7898, "step": 21810 }, { "epoch": 2.844108446298227, "grad_norm": 7.228450298309326, "learning_rate": 1.4137777777777777e-06, "loss": 0.8062, "step": 21820 }, { "epoch": 2.8454118873826904, "grad_norm": 11.01636791229248, "learning_rate": 1.4093333333333334e-06, "loss": 0.8463, "step": 21830 }, { "epoch": 2.846715328467153, "grad_norm": 9.76578140258789, "learning_rate": 1.4048888888888889e-06, "loss": 0.8392, "step": 21840 }, { "epoch": 2.8480187695516164, "grad_norm": 10.003615379333496, "learning_rate": 1.4004444444444446e-06, "loss": 0.8299, "step": 21850 }, { "epoch": 2.849322210636079, "grad_norm": 9.93203353881836, "learning_rate": 1.396e-06, "loss": 0.8896, "step": 21860 }, { "epoch": 2.8506256517205424, "grad_norm": 9.752360343933105, "learning_rate": 1.3915555555555558e-06, "loss": 0.8586, "step": 21870 }, { "epoch": 2.851929092805005, "grad_norm": 7.873197555541992, "learning_rate": 1.3871111111111112e-06, "loss": 0.7965, "step": 21880 }, { "epoch": 2.853232533889468, "grad_norm": 8.04538631439209, "learning_rate": 1.382666666666667e-06, "loss": 0.8035, "step": 21890 }, { "epoch": 2.854535974973931, "grad_norm": 8.153717041015625, "learning_rate": 1.3782222222222224e-06, "loss": 0.8084, "step": 21900 }, { "epoch": 2.854535974973931, "eval/acc": 55.8139533996582, "step": 21900 }, { "epoch": 2.854535974973931, "eval_loss": 1.7067084312438965, "eval_runtime": 0.5964, "eval_samples_per_second": 72.095, "eval_steps_per_second": 1.677, "step": 21900 }, { "epoch": 2.8558394160583944, "grad_norm": 9.104667663574219, "learning_rate": 1.3737777777777777e-06, "loss": 0.7999, "step": 21910 }, { "epoch": 2.857142857142857, "grad_norm": 9.181319236755371, "learning_rate": 1.3693333333333334e-06, "loss": 0.826, "step": 21920 }, { "epoch": 2.85844629822732, "grad_norm": 8.1204195022583, "learning_rate": 1.3648888888888888e-06, "loss": 0.7895, "step": 21930 }, { "epoch": 2.8597497393117832, "grad_norm": 8.179566383361816, "learning_rate": 1.3604444444444445e-06, "loss": 0.7776, "step": 21940 }, { "epoch": 2.861053180396246, "grad_norm": 10.060112953186035, "learning_rate": 1.356e-06, "loss": 0.8448, "step": 21950 }, { "epoch": 2.862356621480709, "grad_norm": 9.05113410949707, "learning_rate": 1.3515555555555557e-06, "loss": 0.8476, "step": 21960 }, { "epoch": 2.863660062565172, "grad_norm": 8.80278205871582, "learning_rate": 1.3471111111111112e-06, "loss": 0.8347, "step": 21970 }, { "epoch": 2.8649635036496353, "grad_norm": 8.517053604125977, "learning_rate": 1.3426666666666669e-06, "loss": 0.7759, "step": 21980 }, { "epoch": 2.866266944734098, "grad_norm": 7.882120132446289, "learning_rate": 1.3382222222222224e-06, "loss": 0.8023, "step": 21990 }, { "epoch": 2.867570385818561, "grad_norm": 9.600735664367676, "learning_rate": 1.333777777777778e-06, "loss": 0.869, "step": 22000 }, { "epoch": 2.867570385818561, "eval/acc": 55.8139533996582, "step": 22000 }, { "epoch": 2.867570385818561, "eval_loss": 1.7035927772521973, "eval_runtime": 0.5967, "eval_samples_per_second": 72.066, "eval_steps_per_second": 1.676, "step": 22000 }, { "epoch": 2.868873826903024, "grad_norm": 7.473567485809326, "learning_rate": 1.3293333333333333e-06, "loss": 0.8553, "step": 22010 }, { "epoch": 2.870177267987487, "grad_norm": 8.016550064086914, "learning_rate": 1.3248888888888888e-06, "loss": 0.6897, "step": 22020 }, { "epoch": 2.87148070907195, "grad_norm": 7.75469446182251, "learning_rate": 1.3204444444444445e-06, "loss": 0.7867, "step": 22030 }, { "epoch": 2.872784150156413, "grad_norm": 8.484375953674316, "learning_rate": 1.316e-06, "loss": 0.7927, "step": 22040 }, { "epoch": 2.874087591240876, "grad_norm": 10.987565994262695, "learning_rate": 1.3115555555555557e-06, "loss": 0.8873, "step": 22050 }, { "epoch": 2.875391032325339, "grad_norm": 8.03238296508789, "learning_rate": 1.3071111111111112e-06, "loss": 0.7856, "step": 22060 }, { "epoch": 2.8766944734098017, "grad_norm": 10.538369178771973, "learning_rate": 1.3026666666666668e-06, "loss": 0.8806, "step": 22070 }, { "epoch": 2.877997914494265, "grad_norm": 8.091436386108398, "learning_rate": 1.2982222222222223e-06, "loss": 0.7605, "step": 22080 }, { "epoch": 2.8793013555787277, "grad_norm": 8.7027587890625, "learning_rate": 1.293777777777778e-06, "loss": 0.8475, "step": 22090 }, { "epoch": 2.880604796663191, "grad_norm": 8.675474166870117, "learning_rate": 1.2893333333333335e-06, "loss": 0.8084, "step": 22100 }, { "epoch": 2.880604796663191, "eval/acc": 55.8139533996582, "step": 22100 }, { "epoch": 2.880604796663191, "eval_loss": 1.704601764678955, "eval_runtime": 0.5954, "eval_samples_per_second": 72.225, "eval_steps_per_second": 1.68, "step": 22100 }, { "epoch": 2.8819082377476537, "grad_norm": 8.718690872192383, "learning_rate": 1.2848888888888892e-06, "loss": 0.8579, "step": 22110 }, { "epoch": 2.883211678832117, "grad_norm": 8.46165657043457, "learning_rate": 1.2804444444444445e-06, "loss": 0.8496, "step": 22120 }, { "epoch": 2.8845151199165797, "grad_norm": 9.558334350585938, "learning_rate": 1.276e-06, "loss": 0.8039, "step": 22130 }, { "epoch": 2.8858185610010425, "grad_norm": 9.370363235473633, "learning_rate": 1.2715555555555556e-06, "loss": 0.7453, "step": 22140 }, { "epoch": 2.8871220020855057, "grad_norm": 9.115830421447754, "learning_rate": 1.2671111111111111e-06, "loss": 0.8762, "step": 22150 }, { "epoch": 2.888425443169969, "grad_norm": 10.963330268859863, "learning_rate": 1.2626666666666668e-06, "loss": 0.8063, "step": 22160 }, { "epoch": 2.8897288842544318, "grad_norm": 9.611781120300293, "learning_rate": 1.2582222222222223e-06, "loss": 0.8045, "step": 22170 }, { "epoch": 2.8910323253388945, "grad_norm": 8.729655265808105, "learning_rate": 1.253777777777778e-06, "loss": 0.8586, "step": 22180 }, { "epoch": 2.8923357664233578, "grad_norm": 9.18319034576416, "learning_rate": 1.2493333333333335e-06, "loss": 0.8338, "step": 22190 }, { "epoch": 2.8936392075078206, "grad_norm": 8.156170845031738, "learning_rate": 1.244888888888889e-06, "loss": 0.7815, "step": 22200 }, { "epoch": 2.8936392075078206, "eval/acc": 55.8139533996582, "step": 22200 }, { "epoch": 2.8936392075078206, "eval_loss": 1.7061247825622559, "eval_runtime": 0.5989, "eval_samples_per_second": 71.798, "eval_steps_per_second": 1.67, "step": 22200 }, { "epoch": 2.894942648592284, "grad_norm": 8.240790367126465, "learning_rate": 1.2404444444444446e-06, "loss": 0.7652, "step": 22210 }, { "epoch": 2.8962460896767466, "grad_norm": 9.448629379272461, "learning_rate": 1.2360000000000001e-06, "loss": 0.7466, "step": 22220 }, { "epoch": 2.89754953076121, "grad_norm": 10.177162170410156, "learning_rate": 1.2315555555555558e-06, "loss": 0.8698, "step": 22230 }, { "epoch": 2.8988529718456726, "grad_norm": 8.812707901000977, "learning_rate": 1.2271111111111113e-06, "loss": 0.8441, "step": 22240 }, { "epoch": 2.9001564129301354, "grad_norm": 8.4219388961792, "learning_rate": 1.2226666666666668e-06, "loss": 0.7985, "step": 22250 }, { "epoch": 2.9014598540145986, "grad_norm": 10.176101684570312, "learning_rate": 1.2182222222222222e-06, "loss": 0.7857, "step": 22260 }, { "epoch": 2.9027632950990614, "grad_norm": 8.970868110656738, "learning_rate": 1.213777777777778e-06, "loss": 0.8488, "step": 22270 }, { "epoch": 2.9040667361835246, "grad_norm": 8.309362411499023, "learning_rate": 1.2093333333333334e-06, "loss": 0.7674, "step": 22280 }, { "epoch": 2.9053701772679874, "grad_norm": 9.352801322937012, "learning_rate": 1.2048888888888891e-06, "loss": 0.8757, "step": 22290 }, { "epoch": 2.9066736183524506, "grad_norm": 9.76472282409668, "learning_rate": 1.2004444444444446e-06, "loss": 0.7713, "step": 22300 }, { "epoch": 2.9066736183524506, "eval/acc": 55.8139533996582, "step": 22300 }, { "epoch": 2.9066736183524506, "eval_loss": 1.7272435426712036, "eval_runtime": 0.5966, "eval_samples_per_second": 72.074, "eval_steps_per_second": 1.676, "step": 22300 }, { "epoch": 2.9079770594369134, "grad_norm": 9.32111644744873, "learning_rate": 1.196e-06, "loss": 0.8285, "step": 22310 }, { "epoch": 2.909280500521376, "grad_norm": 8.500946998596191, "learning_rate": 1.1915555555555558e-06, "loss": 0.8625, "step": 22320 }, { "epoch": 2.9105839416058394, "grad_norm": 8.599331855773926, "learning_rate": 1.1871111111111112e-06, "loss": 0.8084, "step": 22330 }, { "epoch": 2.9118873826903027, "grad_norm": 8.66392993927002, "learning_rate": 1.1826666666666667e-06, "loss": 0.8211, "step": 22340 }, { "epoch": 2.9131908237747655, "grad_norm": 10.023335456848145, "learning_rate": 1.1782222222222222e-06, "loss": 0.9338, "step": 22350 }, { "epoch": 2.9144942648592282, "grad_norm": 9.169831275939941, "learning_rate": 1.173777777777778e-06, "loss": 0.745, "step": 22360 }, { "epoch": 2.9157977059436915, "grad_norm": 8.528318405151367, "learning_rate": 1.1693333333333334e-06, "loss": 0.8145, "step": 22370 }, { "epoch": 2.9171011470281543, "grad_norm": 10.03150463104248, "learning_rate": 1.164888888888889e-06, "loss": 0.8264, "step": 22380 }, { "epoch": 2.918404588112617, "grad_norm": 9.400873184204102, "learning_rate": 1.1604444444444445e-06, "loss": 0.8575, "step": 22390 }, { "epoch": 2.9197080291970803, "grad_norm": 8.157634735107422, "learning_rate": 1.156e-06, "loss": 0.8022, "step": 22400 }, { "epoch": 2.9197080291970803, "eval/acc": 55.8139533996582, "step": 22400 }, { "epoch": 2.9197080291970803, "eval_loss": 1.7023892402648926, "eval_runtime": 0.6004, "eval_samples_per_second": 71.624, "eval_steps_per_second": 1.666, "step": 22400 }, { "epoch": 2.9210114702815435, "grad_norm": 8.085871696472168, "learning_rate": 1.1515555555555557e-06, "loss": 0.7697, "step": 22410 }, { "epoch": 2.9223149113660063, "grad_norm": 8.985044479370117, "learning_rate": 1.1471111111111112e-06, "loss": 0.7715, "step": 22420 }, { "epoch": 2.923618352450469, "grad_norm": 8.769021987915039, "learning_rate": 1.1426666666666667e-06, "loss": 0.7541, "step": 22430 }, { "epoch": 2.9249217935349323, "grad_norm": 9.005128860473633, "learning_rate": 1.1382222222222224e-06, "loss": 0.7914, "step": 22440 }, { "epoch": 2.926225234619395, "grad_norm": 8.189840316772461, "learning_rate": 1.1337777777777779e-06, "loss": 0.8205, "step": 22450 }, { "epoch": 2.9275286757038583, "grad_norm": 8.781454086303711, "learning_rate": 1.1293333333333333e-06, "loss": 0.818, "step": 22460 }, { "epoch": 2.928832116788321, "grad_norm": 8.667879104614258, "learning_rate": 1.124888888888889e-06, "loss": 0.7215, "step": 22470 }, { "epoch": 2.9301355578727843, "grad_norm": 9.384228706359863, "learning_rate": 1.1204444444444445e-06, "loss": 0.6935, "step": 22480 }, { "epoch": 2.931438998957247, "grad_norm": 9.668108940124512, "learning_rate": 1.1160000000000002e-06, "loss": 0.8112, "step": 22490 }, { "epoch": 2.93274244004171, "grad_norm": 9.23719310760498, "learning_rate": 1.1115555555555557e-06, "loss": 0.8172, "step": 22500 }, { "epoch": 2.93274244004171, "eval/acc": 55.8139533996582, "step": 22500 }, { "epoch": 2.93274244004171, "eval_loss": 1.7093583345413208, "eval_runtime": 0.5945, "eval_samples_per_second": 72.331, "eval_steps_per_second": 1.682, "step": 22500 }, { "epoch": 2.934045881126173, "grad_norm": 10.637138366699219, "learning_rate": 1.1071111111111112e-06, "loss": 0.869, "step": 22510 }, { "epoch": 2.935349322210636, "grad_norm": 7.807180404663086, "learning_rate": 1.1026666666666666e-06, "loss": 0.7453, "step": 22520 }, { "epoch": 2.936652763295099, "grad_norm": 9.524556159973145, "learning_rate": 1.0982222222222223e-06, "loss": 0.7958, "step": 22530 }, { "epoch": 2.937956204379562, "grad_norm": 8.069131851196289, "learning_rate": 1.0937777777777778e-06, "loss": 0.7538, "step": 22540 }, { "epoch": 2.939259645464025, "grad_norm": 7.696805953979492, "learning_rate": 1.0893333333333333e-06, "loss": 0.8004, "step": 22550 }, { "epoch": 2.940563086548488, "grad_norm": 7.997035026550293, "learning_rate": 1.084888888888889e-06, "loss": 0.8361, "step": 22560 }, { "epoch": 2.9418665276329508, "grad_norm": 9.125737190246582, "learning_rate": 1.0804444444444445e-06, "loss": 0.8736, "step": 22570 }, { "epoch": 2.943169968717414, "grad_norm": 8.789617538452148, "learning_rate": 1.0760000000000002e-06, "loss": 0.757, "step": 22580 }, { "epoch": 2.944473409801877, "grad_norm": 10.013816833496094, "learning_rate": 1.0715555555555556e-06, "loss": 0.9126, "step": 22590 }, { "epoch": 2.94577685088634, "grad_norm": 6.805258750915527, "learning_rate": 1.0671111111111113e-06, "loss": 0.7293, "step": 22600 }, { "epoch": 2.94577685088634, "eval/acc": 58.13953399658203, "step": 22600 }, { "epoch": 2.94577685088634, "eval_loss": 1.6907391548156738, "eval_runtime": 0.5953, "eval_samples_per_second": 72.236, "eval_steps_per_second": 1.68, "step": 22600 }, { "epoch": 2.947080291970803, "grad_norm": 7.691598415374756, "learning_rate": 1.0626666666666668e-06, "loss": 0.7911, "step": 22610 }, { "epoch": 2.948383733055266, "grad_norm": 10.180532455444336, "learning_rate": 1.0582222222222223e-06, "loss": 0.8116, "step": 22620 }, { "epoch": 2.949687174139729, "grad_norm": 9.354681015014648, "learning_rate": 1.0537777777777778e-06, "loss": 0.863, "step": 22630 }, { "epoch": 2.9509906152241916, "grad_norm": 9.615518569946289, "learning_rate": 1.0493333333333335e-06, "loss": 0.8085, "step": 22640 }, { "epoch": 2.952294056308655, "grad_norm": 8.809122085571289, "learning_rate": 1.044888888888889e-06, "loss": 0.7793, "step": 22650 }, { "epoch": 2.953597497393118, "grad_norm": 9.159653663635254, "learning_rate": 1.0404444444444444e-06, "loss": 0.9068, "step": 22660 }, { "epoch": 2.954900938477581, "grad_norm": 7.889333724975586, "learning_rate": 1.0360000000000001e-06, "loss": 0.7797, "step": 22670 }, { "epoch": 2.9562043795620436, "grad_norm": 9.848904609680176, "learning_rate": 1.0315555555555556e-06, "loss": 0.7948, "step": 22680 }, { "epoch": 2.957507820646507, "grad_norm": 8.004546165466309, "learning_rate": 1.0271111111111113e-06, "loss": 0.7908, "step": 22690 }, { "epoch": 2.9588112617309696, "grad_norm": 9.759391784667969, "learning_rate": 1.0226666666666668e-06, "loss": 0.7236, "step": 22700 }, { "epoch": 2.9588112617309696, "eval/acc": 55.8139533996582, "step": 22700 }, { "epoch": 2.9588112617309696, "eval_loss": 1.709557056427002, "eval_runtime": 0.5959, "eval_samples_per_second": 72.157, "eval_steps_per_second": 1.678, "step": 22700 }, { "epoch": 2.960114702815433, "grad_norm": 8.908174514770508, "learning_rate": 1.0182222222222223e-06, "loss": 0.8257, "step": 22710 }, { "epoch": 2.9614181438998957, "grad_norm": 10.542007446289062, "learning_rate": 1.0137777777777777e-06, "loss": 0.7968, "step": 22720 }, { "epoch": 2.962721584984359, "grad_norm": 9.285322189331055, "learning_rate": 1.0093333333333334e-06, "loss": 0.8002, "step": 22730 }, { "epoch": 2.9640250260688217, "grad_norm": 9.922897338867188, "learning_rate": 1.004888888888889e-06, "loss": 0.8223, "step": 22740 }, { "epoch": 2.9653284671532845, "grad_norm": 9.057666778564453, "learning_rate": 1.0004444444444446e-06, "loss": 0.8733, "step": 22750 }, { "epoch": 2.9666319082377477, "grad_norm": 7.953373432159424, "learning_rate": 9.96e-07, "loss": 0.7555, "step": 22760 }, { "epoch": 2.9679353493222105, "grad_norm": 8.704047203063965, "learning_rate": 9.915555555555556e-07, "loss": 0.7663, "step": 22770 }, { "epoch": 2.9692387904066737, "grad_norm": 10.23957347869873, "learning_rate": 9.871111111111113e-07, "loss": 0.8505, "step": 22780 }, { "epoch": 2.9705422314911365, "grad_norm": 9.663519859313965, "learning_rate": 9.826666666666667e-07, "loss": 0.8052, "step": 22790 }, { "epoch": 2.9718456725755997, "grad_norm": 7.896175384521484, "learning_rate": 9.782222222222224e-07, "loss": 0.7069, "step": 22800 }, { "epoch": 2.9718456725755997, "eval/acc": 55.8139533996582, "step": 22800 }, { "epoch": 2.9718456725755997, "eval_loss": 1.7335095405578613, "eval_runtime": 0.5955, "eval_samples_per_second": 72.214, "eval_steps_per_second": 1.679, "step": 22800 }, { "epoch": 2.9731491136600625, "grad_norm": 11.28175163269043, "learning_rate": 9.737777777777777e-07, "loss": 0.8209, "step": 22810 }, { "epoch": 2.9744525547445253, "grad_norm": 9.087202072143555, "learning_rate": 9.693333333333334e-07, "loss": 0.7846, "step": 22820 }, { "epoch": 2.9757559958289885, "grad_norm": 8.91103458404541, "learning_rate": 9.648888888888889e-07, "loss": 0.7994, "step": 22830 }, { "epoch": 2.9770594369134518, "grad_norm": 8.838888168334961, "learning_rate": 9.604444444444446e-07, "loss": 0.8204, "step": 22840 }, { "epoch": 2.9783628779979145, "grad_norm": 9.16761589050293, "learning_rate": 9.56e-07, "loss": 0.7628, "step": 22850 }, { "epoch": 2.9796663190823773, "grad_norm": 8.374608993530273, "learning_rate": 9.515555555555555e-07, "loss": 0.7794, "step": 22860 }, { "epoch": 2.9809697601668406, "grad_norm": 8.412690162658691, "learning_rate": 9.471111111111111e-07, "loss": 0.8667, "step": 22870 }, { "epoch": 2.9822732012513034, "grad_norm": 10.252215385437012, "learning_rate": 9.426666666666667e-07, "loss": 0.7964, "step": 22880 }, { "epoch": 2.9835766423357666, "grad_norm": 7.88516092300415, "learning_rate": 9.382222222222223e-07, "loss": 0.8069, "step": 22890 }, { "epoch": 2.9848800834202294, "grad_norm": 8.369636535644531, "learning_rate": 9.337777777777779e-07, "loss": 0.9002, "step": 22900 }, { "epoch": 2.9848800834202294, "eval/acc": 55.8139533996582, "step": 22900 }, { "epoch": 2.9848800834202294, "eval_loss": 1.7202376127243042, "eval_runtime": 0.5962, "eval_samples_per_second": 72.118, "eval_steps_per_second": 1.677, "step": 22900 }, { "epoch": 2.9861835245046926, "grad_norm": 8.701763153076172, "learning_rate": 9.293333333333334e-07, "loss": 0.7509, "step": 22910 }, { "epoch": 2.9874869655891554, "grad_norm": 8.373270988464355, "learning_rate": 9.248888888888889e-07, "loss": 0.8195, "step": 22920 }, { "epoch": 2.988790406673618, "grad_norm": 9.346085548400879, "learning_rate": 9.204444444444445e-07, "loss": 0.858, "step": 22930 }, { "epoch": 2.9900938477580814, "grad_norm": 9.157856941223145, "learning_rate": 9.160000000000001e-07, "loss": 0.8083, "step": 22940 }, { "epoch": 2.991397288842544, "grad_norm": 8.524199485778809, "learning_rate": 9.115555555555557e-07, "loss": 0.7312, "step": 22950 }, { "epoch": 2.9927007299270074, "grad_norm": 10.313263893127441, "learning_rate": 9.071111111111113e-07, "loss": 0.8086, "step": 22960 }, { "epoch": 2.99400417101147, "grad_norm": 8.694539070129395, "learning_rate": 9.026666666666666e-07, "loss": 0.834, "step": 22970 }, { "epoch": 2.9953076120959334, "grad_norm": 9.729520797729492, "learning_rate": 8.982222222222222e-07, "loss": 0.8254, "step": 22980 }, { "epoch": 2.9966110531803962, "grad_norm": 11.1943941116333, "learning_rate": 8.937777777777778e-07, "loss": 0.9107, "step": 22990 }, { "epoch": 2.997914494264859, "grad_norm": 7.659653663635254, "learning_rate": 8.893333333333334e-07, "loss": 0.7785, "step": 23000 }, { "epoch": 2.997914494264859, "eval/acc": 55.8139533996582, "step": 23000 }, { "epoch": 2.997914494264859, "eval_loss": 1.7206988334655762, "eval_runtime": 0.5954, "eval_samples_per_second": 72.217, "eval_steps_per_second": 1.679, "step": 23000 }, { "epoch": 2.9992179353493222, "grad_norm": 8.414470672607422, "learning_rate": 8.84888888888889e-07, "loss": 0.8194, "step": 23010 }, { "epoch": 3.000521376433785, "grad_norm": 7.712728977203369, "learning_rate": 8.804444444444445e-07, "loss": 0.7945, "step": 23020 }, { "epoch": 3.0018248175182483, "grad_norm": 9.1021146774292, "learning_rate": 8.760000000000001e-07, "loss": 0.8432, "step": 23030 }, { "epoch": 3.003128258602711, "grad_norm": 8.000706672668457, "learning_rate": 8.715555555555556e-07, "loss": 0.7872, "step": 23040 }, { "epoch": 3.0044316996871743, "grad_norm": 9.666967391967773, "learning_rate": 8.671111111111112e-07, "loss": 0.7302, "step": 23050 }, { "epoch": 3.005735140771637, "grad_norm": 9.65298843383789, "learning_rate": 8.626666666666668e-07, "loss": 0.8211, "step": 23060 }, { "epoch": 3.0070385818561003, "grad_norm": 8.305415153503418, "learning_rate": 8.582222222222222e-07, "loss": 0.816, "step": 23070 }, { "epoch": 3.008342022940563, "grad_norm": 7.433007717132568, "learning_rate": 8.537777777777778e-07, "loss": 0.7012, "step": 23080 }, { "epoch": 3.009645464025026, "grad_norm": 9.865405082702637, "learning_rate": 8.493333333333334e-07, "loss": 0.7318, "step": 23090 }, { "epoch": 3.010948905109489, "grad_norm": 10.633191108703613, "learning_rate": 8.44888888888889e-07, "loss": 0.731, "step": 23100 }, { "epoch": 3.010948905109489, "eval/acc": 44.1860466003418, "step": 23100 }, { "epoch": 3.010948905109489, "eval_loss": 2.3543057441711426, "eval_runtime": 0.6487, "eval_samples_per_second": 66.288, "eval_steps_per_second": 1.542, "step": 23100 }, { "epoch": 3.012252346193952, "grad_norm": 8.203362464904785, "learning_rate": 8.404444444444445e-07, "loss": 0.8021, "step": 23110 }, { "epoch": 3.013555787278415, "grad_norm": 11.412751197814941, "learning_rate": 8.36e-07, "loss": 0.8496, "step": 23120 }, { "epoch": 3.014859228362878, "grad_norm": 8.65821647644043, "learning_rate": 8.315555555555556e-07, "loss": 0.7419, "step": 23130 }, { "epoch": 3.016162669447341, "grad_norm": 8.106006622314453, "learning_rate": 8.271111111111112e-07, "loss": 0.7634, "step": 23140 }, { "epoch": 3.017466110531804, "grad_norm": 8.795356750488281, "learning_rate": 8.226666666666668e-07, "loss": 0.8055, "step": 23150 }, { "epoch": 3.018769551616267, "grad_norm": 10.137787818908691, "learning_rate": 8.182222222222224e-07, "loss": 0.7134, "step": 23160 }, { "epoch": 3.02007299270073, "grad_norm": 8.62680435180664, "learning_rate": 8.137777777777777e-07, "loss": 0.7913, "step": 23170 }, { "epoch": 3.0213764337851927, "grad_norm": 9.772363662719727, "learning_rate": 8.093333333333333e-07, "loss": 0.8055, "step": 23180 }, { "epoch": 3.022679874869656, "grad_norm": 9.606851577758789, "learning_rate": 8.048888888888889e-07, "loss": 0.7847, "step": 23190 }, { "epoch": 3.0239833159541187, "grad_norm": 10.20776653289795, "learning_rate": 8.004444444444445e-07, "loss": 0.8393, "step": 23200 }, { "epoch": 3.0239833159541187, "eval/acc": 44.1860466003418, "step": 23200 }, { "epoch": 3.0239833159541187, "eval_loss": 2.342888116836548, "eval_runtime": 0.598, "eval_samples_per_second": 71.908, "eval_steps_per_second": 1.672, "step": 23200 }, { "epoch": 3.025286757038582, "grad_norm": 10.320840835571289, "learning_rate": 7.960000000000001e-07, "loss": 0.7719, "step": 23210 }, { "epoch": 3.0265901981230448, "grad_norm": 10.175871849060059, "learning_rate": 7.915555555555557e-07, "loss": 0.6825, "step": 23220 }, { "epoch": 3.027893639207508, "grad_norm": 7.87008810043335, "learning_rate": 7.871111111111112e-07, "loss": 0.7914, "step": 23230 }, { "epoch": 3.0291970802919708, "grad_norm": 9.775065422058105, "learning_rate": 7.826666666666667e-07, "loss": 0.7449, "step": 23240 }, { "epoch": 3.030500521376434, "grad_norm": 8.62662410736084, "learning_rate": 7.782222222222223e-07, "loss": 0.7656, "step": 23250 }, { "epoch": 3.031803962460897, "grad_norm": 8.621891021728516, "learning_rate": 7.737777777777779e-07, "loss": 0.7827, "step": 23260 }, { "epoch": 3.0331074035453596, "grad_norm": 8.492998123168945, "learning_rate": 7.693333333333335e-07, "loss": 0.7795, "step": 23270 }, { "epoch": 3.034410844629823, "grad_norm": 10.105307579040527, "learning_rate": 7.648888888888889e-07, "loss": 0.7507, "step": 23280 }, { "epoch": 3.0357142857142856, "grad_norm": 8.7698335647583, "learning_rate": 7.604444444444445e-07, "loss": 0.8074, "step": 23290 }, { "epoch": 3.037017726798749, "grad_norm": 9.208733558654785, "learning_rate": 7.56e-07, "loss": 0.7664, "step": 23300 }, { "epoch": 3.037017726798749, "eval/acc": 44.1860466003418, "step": 23300 }, { "epoch": 3.037017726798749, "eval_loss": 2.3217122554779053, "eval_runtime": 0.5954, "eval_samples_per_second": 72.217, "eval_steps_per_second": 1.679, "step": 23300 }, { "epoch": 3.0383211678832116, "grad_norm": 9.838146209716797, "learning_rate": 7.515555555555556e-07, "loss": 0.6982, "step": 23310 }, { "epoch": 3.039624608967675, "grad_norm": 9.454673767089844, "learning_rate": 7.471111111111112e-07, "loss": 0.7123, "step": 23320 }, { "epoch": 3.0409280500521376, "grad_norm": 9.826740264892578, "learning_rate": 7.426666666666667e-07, "loss": 0.8448, "step": 23330 }, { "epoch": 3.0422314911366004, "grad_norm": 10.036713600158691, "learning_rate": 7.382222222222223e-07, "loss": 0.8399, "step": 23340 }, { "epoch": 3.0435349322210636, "grad_norm": 9.749173164367676, "learning_rate": 7.337777777777779e-07, "loss": 0.7782, "step": 23350 }, { "epoch": 3.0448383733055264, "grad_norm": 9.692205429077148, "learning_rate": 7.293333333333335e-07, "loss": 0.7734, "step": 23360 }, { "epoch": 3.0461418143899897, "grad_norm": 8.908297538757324, "learning_rate": 7.24888888888889e-07, "loss": 0.7462, "step": 23370 }, { "epoch": 3.0474452554744524, "grad_norm": 9.458683967590332, "learning_rate": 7.204444444444444e-07, "loss": 0.7983, "step": 23380 }, { "epoch": 3.0487486965589157, "grad_norm": 8.491473197937012, "learning_rate": 7.16e-07, "loss": 0.7492, "step": 23390 }, { "epoch": 3.0500521376433785, "grad_norm": 9.838336944580078, "learning_rate": 7.115555555555556e-07, "loss": 0.7682, "step": 23400 }, { "epoch": 3.0500521376433785, "eval/acc": 44.1860466003418, "step": 23400 }, { "epoch": 3.0500521376433785, "eval_loss": 2.3281493186950684, "eval_runtime": 0.5939, "eval_samples_per_second": 72.4, "eval_steps_per_second": 1.684, "step": 23400 }, { "epoch": 3.0513555787278417, "grad_norm": 9.328446388244629, "learning_rate": 7.071111111111112e-07, "loss": 0.8455, "step": 23410 }, { "epoch": 3.0526590198123045, "grad_norm": 8.515073776245117, "learning_rate": 7.026666666666668e-07, "loss": 0.7225, "step": 23420 }, { "epoch": 3.0539624608967673, "grad_norm": 9.882821083068848, "learning_rate": 6.982222222222222e-07, "loss": 0.794, "step": 23430 }, { "epoch": 3.0552659019812305, "grad_norm": 9.148018836975098, "learning_rate": 6.937777777777778e-07, "loss": 0.7199, "step": 23440 }, { "epoch": 3.0565693430656933, "grad_norm": 8.159253120422363, "learning_rate": 6.893333333333334e-07, "loss": 0.7133, "step": 23450 }, { "epoch": 3.0578727841501565, "grad_norm": 8.699991226196289, "learning_rate": 6.84888888888889e-07, "loss": 0.8155, "step": 23460 }, { "epoch": 3.0591762252346193, "grad_norm": 8.250022888183594, "learning_rate": 6.804444444444446e-07, "loss": 0.6973, "step": 23470 }, { "epoch": 3.0604796663190825, "grad_norm": 9.82026195526123, "learning_rate": 6.76e-07, "loss": 0.8226, "step": 23480 }, { "epoch": 3.0617831074035453, "grad_norm": 8.168004035949707, "learning_rate": 6.715555555555556e-07, "loss": 0.8224, "step": 23490 }, { "epoch": 3.0630865484880085, "grad_norm": 9.79699993133545, "learning_rate": 6.671111111111111e-07, "loss": 0.8358, "step": 23500 }, { "epoch": 3.0630865484880085, "eval/acc": 44.1860466003418, "step": 23500 }, { "epoch": 3.0630865484880085, "eval_loss": 2.3481953144073486, "eval_runtime": 0.5953, "eval_samples_per_second": 72.235, "eval_steps_per_second": 1.68, "step": 23500 }, { "epoch": 3.0643899895724713, "grad_norm": 10.111868858337402, "learning_rate": 6.626666666666667e-07, "loss": 0.8549, "step": 23510 }, { "epoch": 3.065693430656934, "grad_norm": 10.493847846984863, "learning_rate": 6.582222222222223e-07, "loss": 0.8386, "step": 23520 }, { "epoch": 3.0669968717413973, "grad_norm": 10.280377388000488, "learning_rate": 6.537777777777779e-07, "loss": 0.7867, "step": 23530 }, { "epoch": 3.06830031282586, "grad_norm": 9.221932411193848, "learning_rate": 6.493333333333334e-07, "loss": 0.775, "step": 23540 }, { "epoch": 3.0696037539103234, "grad_norm": 9.999637603759766, "learning_rate": 6.44888888888889e-07, "loss": 0.8096, "step": 23550 }, { "epoch": 3.070907194994786, "grad_norm": 8.87269115447998, "learning_rate": 6.404444444444446e-07, "loss": 0.8125, "step": 23560 }, { "epoch": 3.0722106360792494, "grad_norm": 8.223602294921875, "learning_rate": 6.360000000000001e-07, "loss": 0.8074, "step": 23570 }, { "epoch": 3.073514077163712, "grad_norm": 9.926717758178711, "learning_rate": 6.315555555555557e-07, "loss": 0.79, "step": 23580 }, { "epoch": 3.0748175182481754, "grad_norm": 7.261898517608643, "learning_rate": 6.271111111111111e-07, "loss": 0.806, "step": 23590 }, { "epoch": 3.076120959332638, "grad_norm": 8.577115058898926, "learning_rate": 6.226666666666667e-07, "loss": 0.77, "step": 23600 }, { "epoch": 3.076120959332638, "eval/acc": 44.1860466003418, "step": 23600 }, { "epoch": 3.076120959332638, "eval_loss": 2.30120587348938, "eval_runtime": 0.5942, "eval_samples_per_second": 72.362, "eval_steps_per_second": 1.683, "step": 23600 }, { "epoch": 3.077424400417101, "grad_norm": 10.157139778137207, "learning_rate": 6.182222222222223e-07, "loss": 0.8188, "step": 23610 }, { "epoch": 3.078727841501564, "grad_norm": 7.27759313583374, "learning_rate": 6.137777777777779e-07, "loss": 0.684, "step": 23620 }, { "epoch": 3.080031282586027, "grad_norm": 8.57936954498291, "learning_rate": 6.093333333333333e-07, "loss": 0.7651, "step": 23630 }, { "epoch": 3.08133472367049, "grad_norm": 9.291726112365723, "learning_rate": 6.048888888888889e-07, "loss": 0.859, "step": 23640 }, { "epoch": 3.082638164754953, "grad_norm": 10.239093780517578, "learning_rate": 6.004444444444445e-07, "loss": 0.8297, "step": 23650 }, { "epoch": 3.0839416058394162, "grad_norm": 9.032713890075684, "learning_rate": 5.960000000000001e-07, "loss": 0.7933, "step": 23660 }, { "epoch": 3.085245046923879, "grad_norm": 8.530960083007812, "learning_rate": 5.915555555555557e-07, "loss": 0.7733, "step": 23670 }, { "epoch": 3.086548488008342, "grad_norm": 9.930492401123047, "learning_rate": 5.871111111111112e-07, "loss": 0.7824, "step": 23680 }, { "epoch": 3.087851929092805, "grad_norm": 7.708046913146973, "learning_rate": 5.826666666666667e-07, "loss": 0.8466, "step": 23690 }, { "epoch": 3.089155370177268, "grad_norm": 8.710040092468262, "learning_rate": 5.782222222222222e-07, "loss": 0.811, "step": 23700 }, { "epoch": 3.089155370177268, "eval/acc": 44.1860466003418, "step": 23700 }, { "epoch": 3.089155370177268, "eval_loss": 2.304382562637329, "eval_runtime": 0.6284, "eval_samples_per_second": 68.43, "eval_steps_per_second": 1.591, "step": 23700 }, { "epoch": 3.090458811261731, "grad_norm": 8.081384658813477, "learning_rate": 5.737777777777778e-07, "loss": 0.7537, "step": 23710 }, { "epoch": 3.091762252346194, "grad_norm": 8.750344276428223, "learning_rate": 5.693333333333334e-07, "loss": 0.8633, "step": 23720 }, { "epoch": 3.093065693430657, "grad_norm": 7.804800987243652, "learning_rate": 5.648888888888889e-07, "loss": 0.8154, "step": 23730 }, { "epoch": 3.09436913451512, "grad_norm": 9.655993461608887, "learning_rate": 5.604444444444445e-07, "loss": 0.7911, "step": 23740 }, { "epoch": 3.095672575599583, "grad_norm": 9.007747650146484, "learning_rate": 5.560000000000001e-07, "loss": 0.8206, "step": 23750 }, { "epoch": 3.096976016684046, "grad_norm": 11.500776290893555, "learning_rate": 5.515555555555556e-07, "loss": 0.8642, "step": 23760 }, { "epoch": 3.0982794577685087, "grad_norm": 7.945301055908203, "learning_rate": 5.471111111111112e-07, "loss": 0.7082, "step": 23770 }, { "epoch": 3.099582898852972, "grad_norm": 9.078472137451172, "learning_rate": 5.426666666666667e-07, "loss": 0.8087, "step": 23780 }, { "epoch": 3.1008863399374347, "grad_norm": 9.277877807617188, "learning_rate": 5.382222222222223e-07, "loss": 0.7876, "step": 23790 }, { "epoch": 3.102189781021898, "grad_norm": 10.550357818603516, "learning_rate": 5.337777777777779e-07, "loss": 0.8331, "step": 23800 }, { "epoch": 3.102189781021898, "eval/acc": 44.1860466003418, "step": 23800 }, { "epoch": 3.102189781021898, "eval_loss": 2.293590784072876, "eval_runtime": 0.5959, "eval_samples_per_second": 72.165, "eval_steps_per_second": 1.678, "step": 23800 }, { "epoch": 3.1034932221063607, "grad_norm": 17.503860473632812, "learning_rate": 5.293333333333334e-07, "loss": 0.8179, "step": 23810 }, { "epoch": 3.104796663190824, "grad_norm": 10.381514549255371, "learning_rate": 5.24888888888889e-07, "loss": 0.8179, "step": 23820 }, { "epoch": 3.1061001042752867, "grad_norm": 11.208725929260254, "learning_rate": 5.204444444444444e-07, "loss": 0.8272, "step": 23830 }, { "epoch": 3.10740354535975, "grad_norm": 8.3886079788208, "learning_rate": 5.16e-07, "loss": 0.7942, "step": 23840 }, { "epoch": 3.1087069864442127, "grad_norm": 7.879312515258789, "learning_rate": 5.115555555555556e-07, "loss": 0.7284, "step": 23850 }, { "epoch": 3.1100104275286755, "grad_norm": 8.940388679504395, "learning_rate": 5.071111111111112e-07, "loss": 0.7731, "step": 23860 }, { "epoch": 3.1113138686131387, "grad_norm": 7.582369804382324, "learning_rate": 5.026666666666667e-07, "loss": 0.8052, "step": 23870 }, { "epoch": 3.1126173096976015, "grad_norm": 8.524144172668457, "learning_rate": 4.982222222222223e-07, "loss": 0.7615, "step": 23880 }, { "epoch": 3.1139207507820648, "grad_norm": 7.885721206665039, "learning_rate": 4.937777777777778e-07, "loss": 0.8296, "step": 23890 }, { "epoch": 3.1152241918665275, "grad_norm": 9.29078197479248, "learning_rate": 4.893333333333334e-07, "loss": 0.7253, "step": 23900 }, { "epoch": 3.1152241918665275, "eval/acc": 44.1860466003418, "step": 23900 }, { "epoch": 3.1152241918665275, "eval_loss": 2.2885892391204834, "eval_runtime": 0.5978, "eval_samples_per_second": 71.931, "eval_steps_per_second": 1.673, "step": 23900 }, { "epoch": 3.116527632950991, "grad_norm": 7.733985900878906, "learning_rate": 4.848888888888889e-07, "loss": 0.748, "step": 23910 }, { "epoch": 3.1178310740354536, "grad_norm": 9.445945739746094, "learning_rate": 4.804444444444445e-07, "loss": 0.804, "step": 23920 }, { "epoch": 3.1191345151199164, "grad_norm": 10.450433731079102, "learning_rate": 4.760000000000001e-07, "loss": 0.7528, "step": 23930 }, { "epoch": 3.1204379562043796, "grad_norm": 9.020345687866211, "learning_rate": 4.7155555555555556e-07, "loss": 0.8078, "step": 23940 }, { "epoch": 3.1217413972888424, "grad_norm": 8.772445678710938, "learning_rate": 4.6711111111111115e-07, "loss": 0.8569, "step": 23950 }, { "epoch": 3.1230448383733056, "grad_norm": 8.530732154846191, "learning_rate": 4.626666666666667e-07, "loss": 0.7827, "step": 23960 }, { "epoch": 3.1243482794577684, "grad_norm": 10.08745288848877, "learning_rate": 4.5822222222222227e-07, "loss": 0.7515, "step": 23970 }, { "epoch": 3.1256517205422316, "grad_norm": 8.416505813598633, "learning_rate": 4.5377777777777785e-07, "loss": 0.7119, "step": 23980 }, { "epoch": 3.1269551616266944, "grad_norm": 6.936562538146973, "learning_rate": 4.4933333333333333e-07, "loss": 0.7402, "step": 23990 }, { "epoch": 3.1282586027111576, "grad_norm": 8.483198165893555, "learning_rate": 4.448888888888889e-07, "loss": 0.7469, "step": 24000 }, { "epoch": 3.1282586027111576, "eval/acc": 44.1860466003418, "step": 24000 }, { "epoch": 3.1282586027111576, "eval_loss": 2.2984375953674316, "eval_runtime": 0.5926, "eval_samples_per_second": 72.561, "eval_steps_per_second": 1.687, "step": 24000 }, { "epoch": 3.1295620437956204, "grad_norm": 10.089522361755371, "learning_rate": 4.4044444444444445e-07, "loss": 0.7775, "step": 24010 }, { "epoch": 3.1308654848800836, "grad_norm": 9.412835121154785, "learning_rate": 4.3600000000000004e-07, "loss": 0.7378, "step": 24020 }, { "epoch": 3.1321689259645464, "grad_norm": 7.9646172523498535, "learning_rate": 4.315555555555556e-07, "loss": 0.7772, "step": 24030 }, { "epoch": 3.133472367049009, "grad_norm": 8.067090034484863, "learning_rate": 4.271111111111111e-07, "loss": 0.7543, "step": 24040 }, { "epoch": 3.1347758081334725, "grad_norm": 9.903745651245117, "learning_rate": 4.226666666666667e-07, "loss": 0.7853, "step": 24050 }, { "epoch": 3.1360792492179352, "grad_norm": 7.205333232879639, "learning_rate": 4.182222222222222e-07, "loss": 0.8118, "step": 24060 }, { "epoch": 3.1373826903023985, "grad_norm": 9.490117073059082, "learning_rate": 4.137777777777778e-07, "loss": 0.7729, "step": 24070 }, { "epoch": 3.1386861313868613, "grad_norm": 9.806024551391602, "learning_rate": 4.093333333333334e-07, "loss": 0.7637, "step": 24080 }, { "epoch": 3.1399895724713245, "grad_norm": 9.066428184509277, "learning_rate": 4.048888888888889e-07, "loss": 0.7493, "step": 24090 }, { "epoch": 3.1412930135557873, "grad_norm": 8.659998893737793, "learning_rate": 4.0044444444444447e-07, "loss": 0.7813, "step": 24100 }, { "epoch": 3.1412930135557873, "eval/acc": 44.1860466003418, "step": 24100 }, { "epoch": 3.1412930135557873, "eval_loss": 2.3256468772888184, "eval_runtime": 0.5937, "eval_samples_per_second": 72.426, "eval_steps_per_second": 1.684, "step": 24100 }, { "epoch": 3.14259645464025, "grad_norm": 9.005102157592773, "learning_rate": 3.9600000000000005e-07, "loss": 0.7518, "step": 24110 }, { "epoch": 3.1438998957247133, "grad_norm": 9.493056297302246, "learning_rate": 3.915555555555556e-07, "loss": 0.7889, "step": 24120 }, { "epoch": 3.145203336809176, "grad_norm": 8.895201683044434, "learning_rate": 3.8711111111111117e-07, "loss": 0.7456, "step": 24130 }, { "epoch": 3.1465067778936393, "grad_norm": 10.715651512145996, "learning_rate": 3.8266666666666665e-07, "loss": 0.8079, "step": 24140 }, { "epoch": 3.147810218978102, "grad_norm": 9.14599323272705, "learning_rate": 3.7822222222222224e-07, "loss": 0.7592, "step": 24150 }, { "epoch": 3.1491136600625653, "grad_norm": 9.454031944274902, "learning_rate": 3.737777777777778e-07, "loss": 0.7485, "step": 24160 }, { "epoch": 3.150417101147028, "grad_norm": 9.021618843078613, "learning_rate": 3.6933333333333336e-07, "loss": 0.7607, "step": 24170 }, { "epoch": 3.1517205422314913, "grad_norm": 10.145747184753418, "learning_rate": 3.6488888888888894e-07, "loss": 0.872, "step": 24180 }, { "epoch": 3.153023983315954, "grad_norm": 8.64067554473877, "learning_rate": 3.604444444444444e-07, "loss": 0.7888, "step": 24190 }, { "epoch": 3.154327424400417, "grad_norm": 11.549010276794434, "learning_rate": 3.56e-07, "loss": 0.7563, "step": 24200 }, { "epoch": 3.154327424400417, "eval/acc": 44.1860466003418, "step": 24200 }, { "epoch": 3.154327424400417, "eval_loss": 2.3230156898498535, "eval_runtime": 0.5935, "eval_samples_per_second": 72.457, "eval_steps_per_second": 1.685, "step": 24200 }, { "epoch": 3.15563086548488, "grad_norm": 8.076187133789062, "learning_rate": 3.515555555555556e-07, "loss": 0.7741, "step": 24210 }, { "epoch": 3.156934306569343, "grad_norm": 8.054800033569336, "learning_rate": 3.4711111111111113e-07, "loss": 0.7878, "step": 24220 }, { "epoch": 3.158237747653806, "grad_norm": 7.848004341125488, "learning_rate": 3.426666666666667e-07, "loss": 0.7633, "step": 24230 }, { "epoch": 3.159541188738269, "grad_norm": 10.233086585998535, "learning_rate": 3.382222222222222e-07, "loss": 0.7746, "step": 24240 }, { "epoch": 3.160844629822732, "grad_norm": 9.173324584960938, "learning_rate": 3.337777777777778e-07, "loss": 0.7334, "step": 24250 }, { "epoch": 3.162148070907195, "grad_norm": 8.621508598327637, "learning_rate": 3.2933333333333337e-07, "loss": 0.7215, "step": 24260 }, { "epoch": 3.163451511991658, "grad_norm": 8.829760551452637, "learning_rate": 3.248888888888889e-07, "loss": 0.7677, "step": 24270 }, { "epoch": 3.164754953076121, "grad_norm": 9.101958274841309, "learning_rate": 3.204444444444445e-07, "loss": 0.7497, "step": 24280 }, { "epoch": 3.1660583941605838, "grad_norm": 10.770998001098633, "learning_rate": 3.160000000000001e-07, "loss": 0.8396, "step": 24290 }, { "epoch": 3.167361835245047, "grad_norm": 8.987298011779785, "learning_rate": 3.1155555555555556e-07, "loss": 0.7257, "step": 24300 }, { "epoch": 3.167361835245047, "eval/acc": 44.1860466003418, "step": 24300 }, { "epoch": 3.167361835245047, "eval_loss": 2.3391051292419434, "eval_runtime": 0.5929, "eval_samples_per_second": 72.529, "eval_steps_per_second": 1.687, "step": 24300 }, { "epoch": 3.16866527632951, "grad_norm": 9.821568489074707, "learning_rate": 3.0711111111111114e-07, "loss": 0.7412, "step": 24310 }, { "epoch": 3.169968717413973, "grad_norm": 7.845837593078613, "learning_rate": 3.026666666666667e-07, "loss": 0.7474, "step": 24320 }, { "epoch": 3.171272158498436, "grad_norm": 9.569933891296387, "learning_rate": 2.9822222222222226e-07, "loss": 0.8086, "step": 24330 }, { "epoch": 3.172575599582899, "grad_norm": 9.449197769165039, "learning_rate": 2.937777777777778e-07, "loss": 0.7285, "step": 24340 }, { "epoch": 3.173879040667362, "grad_norm": 8.717931747436523, "learning_rate": 2.8933333333333333e-07, "loss": 0.772, "step": 24350 }, { "epoch": 3.1751824817518246, "grad_norm": 8.253952980041504, "learning_rate": 2.848888888888889e-07, "loss": 0.8267, "step": 24360 }, { "epoch": 3.176485922836288, "grad_norm": 8.124940872192383, "learning_rate": 2.8044444444444445e-07, "loss": 0.7456, "step": 24370 }, { "epoch": 3.1777893639207506, "grad_norm": 9.552311897277832, "learning_rate": 2.7600000000000004e-07, "loss": 0.8471, "step": 24380 }, { "epoch": 3.179092805005214, "grad_norm": 10.212308883666992, "learning_rate": 2.7155555555555557e-07, "loss": 0.7492, "step": 24390 }, { "epoch": 3.1803962460896766, "grad_norm": 12.078847885131836, "learning_rate": 2.6711111111111116e-07, "loss": 0.6964, "step": 24400 }, { "epoch": 3.1803962460896766, "eval/acc": 44.1860466003418, "step": 24400 }, { "epoch": 3.1803962460896766, "eval_loss": 2.3331923484802246, "eval_runtime": 0.5975, "eval_samples_per_second": 71.963, "eval_steps_per_second": 1.674, "step": 24400 }, { "epoch": 3.18169968717414, "grad_norm": 9.165428161621094, "learning_rate": 2.626666666666667e-07, "loss": 0.8607, "step": 24410 }, { "epoch": 3.1830031282586027, "grad_norm": 9.410515785217285, "learning_rate": 2.582222222222222e-07, "loss": 0.7727, "step": 24420 }, { "epoch": 3.184306569343066, "grad_norm": 8.71871566772461, "learning_rate": 2.537777777777778e-07, "loss": 0.823, "step": 24430 }, { "epoch": 3.1856100104275287, "grad_norm": 9.580927848815918, "learning_rate": 2.4933333333333334e-07, "loss": 0.8325, "step": 24440 }, { "epoch": 3.1869134515119915, "grad_norm": 8.831100463867188, "learning_rate": 2.4488888888888893e-07, "loss": 0.7268, "step": 24450 }, { "epoch": 3.1882168925964547, "grad_norm": 9.11598014831543, "learning_rate": 2.4044444444444446e-07, "loss": 0.8279, "step": 24460 }, { "epoch": 3.1895203336809175, "grad_norm": 7.807504177093506, "learning_rate": 2.3600000000000002e-07, "loss": 0.7966, "step": 24470 }, { "epoch": 3.1908237747653807, "grad_norm": 11.205448150634766, "learning_rate": 2.3155555555555556e-07, "loss": 0.8279, "step": 24480 }, { "epoch": 3.1921272158498435, "grad_norm": 10.597332000732422, "learning_rate": 2.2711111111111114e-07, "loss": 0.822, "step": 24490 }, { "epoch": 3.1934306569343067, "grad_norm": 9.206705093383789, "learning_rate": 2.226666666666667e-07, "loss": 0.7624, "step": 24500 }, { "epoch": 3.1934306569343067, "eval/acc": 44.1860466003418, "step": 24500 }, { "epoch": 3.1934306569343067, "eval_loss": 2.314422369003296, "eval_runtime": 0.5936, "eval_samples_per_second": 72.443, "eval_steps_per_second": 1.685, "step": 24500 }, { "epoch": 3.1947340980187695, "grad_norm": 10.281166076660156, "learning_rate": 2.1822222222222224e-07, "loss": 0.7693, "step": 24510 }, { "epoch": 3.1960375391032327, "grad_norm": 9.520174980163574, "learning_rate": 2.137777777777778e-07, "loss": 0.7204, "step": 24520 }, { "epoch": 3.1973409801876955, "grad_norm": 9.11064338684082, "learning_rate": 2.0933333333333335e-07, "loss": 0.8043, "step": 24530 }, { "epoch": 3.1986444212721583, "grad_norm": 8.368624687194824, "learning_rate": 2.0488888888888891e-07, "loss": 0.8166, "step": 24540 }, { "epoch": 3.1999478623566215, "grad_norm": 9.419053077697754, "learning_rate": 2.0044444444444447e-07, "loss": 0.7974, "step": 24550 }, { "epoch": 3.2012513034410843, "grad_norm": 11.183753967285156, "learning_rate": 1.96e-07, "loss": 0.8052, "step": 24560 }, { "epoch": 3.2025547445255476, "grad_norm": 9.21888256072998, "learning_rate": 1.9155555555555557e-07, "loss": 0.7275, "step": 24570 }, { "epoch": 3.2038581856100103, "grad_norm": 11.298639297485352, "learning_rate": 1.8711111111111113e-07, "loss": 0.7249, "step": 24580 }, { "epoch": 3.2051616266944736, "grad_norm": 9.682819366455078, "learning_rate": 1.826666666666667e-07, "loss": 0.7822, "step": 24590 }, { "epoch": 3.2064650677789364, "grad_norm": 8.665870666503906, "learning_rate": 1.7822222222222222e-07, "loss": 0.8087, "step": 24600 }, { "epoch": 3.2064650677789364, "eval/acc": 44.1860466003418, "step": 24600 }, { "epoch": 3.2064650677789364, "eval_loss": 2.3100533485412598, "eval_runtime": 0.5933, "eval_samples_per_second": 72.479, "eval_steps_per_second": 1.686, "step": 24600 }, { "epoch": 3.207768508863399, "grad_norm": 8.488890647888184, "learning_rate": 1.7377777777777778e-07, "loss": 0.7405, "step": 24610 }, { "epoch": 3.2090719499478624, "grad_norm": 8.736209869384766, "learning_rate": 1.6933333333333337e-07, "loss": 0.7494, "step": 24620 }, { "epoch": 3.210375391032325, "grad_norm": 12.234800338745117, "learning_rate": 1.648888888888889e-07, "loss": 0.8531, "step": 24630 }, { "epoch": 3.2116788321167884, "grad_norm": 9.17124080657959, "learning_rate": 1.6044444444444446e-07, "loss": 0.7451, "step": 24640 }, { "epoch": 3.212982273201251, "grad_norm": 8.480875015258789, "learning_rate": 1.56e-07, "loss": 0.773, "step": 24650 }, { "epoch": 3.2142857142857144, "grad_norm": 9.989618301391602, "learning_rate": 1.5155555555555558e-07, "loss": 0.8174, "step": 24660 }, { "epoch": 3.215589155370177, "grad_norm": 8.60275936126709, "learning_rate": 1.4711111111111111e-07, "loss": 0.7893, "step": 24670 }, { "epoch": 3.2168925964546404, "grad_norm": 8.643913269042969, "learning_rate": 1.4266666666666667e-07, "loss": 0.7696, "step": 24680 }, { "epoch": 3.218196037539103, "grad_norm": 9.599669456481934, "learning_rate": 1.3822222222222223e-07, "loss": 0.8098, "step": 24690 }, { "epoch": 3.2194994786235664, "grad_norm": 9.3348970413208, "learning_rate": 1.337777777777778e-07, "loss": 0.7412, "step": 24700 }, { "epoch": 3.2194994786235664, "eval/acc": 44.1860466003418, "step": 24700 }, { "epoch": 3.2194994786235664, "eval_loss": 2.309143543243408, "eval_runtime": 0.5936, "eval_samples_per_second": 72.434, "eval_steps_per_second": 1.685, "step": 24700 }, { "epoch": 3.2208029197080292, "grad_norm": 10.135048866271973, "learning_rate": 1.2933333333333335e-07, "loss": 0.7982, "step": 24710 }, { "epoch": 3.222106360792492, "grad_norm": 8.44097900390625, "learning_rate": 1.248888888888889e-07, "loss": 0.8366, "step": 24720 }, { "epoch": 3.2234098018769552, "grad_norm": 11.066662788391113, "learning_rate": 1.2044444444444445e-07, "loss": 0.8359, "step": 24730 }, { "epoch": 3.224713242961418, "grad_norm": 10.742765426635742, "learning_rate": 1.16e-07, "loss": 0.8291, "step": 24740 }, { "epoch": 3.2260166840458813, "grad_norm": 9.625585556030273, "learning_rate": 1.1155555555555557e-07, "loss": 0.7253, "step": 24750 }, { "epoch": 3.227320125130344, "grad_norm": 7.575984954833984, "learning_rate": 1.0711111111111111e-07, "loss": 0.7829, "step": 24760 }, { "epoch": 3.2286235662148073, "grad_norm": 8.65267276763916, "learning_rate": 1.0266666666666667e-07, "loss": 0.7705, "step": 24770 }, { "epoch": 3.22992700729927, "grad_norm": 9.437275886535645, "learning_rate": 9.822222222222222e-08, "loss": 0.8079, "step": 24780 }, { "epoch": 3.231230448383733, "grad_norm": 9.608490943908691, "learning_rate": 9.377777777777779e-08, "loss": 0.7287, "step": 24790 }, { "epoch": 3.232533889468196, "grad_norm": 9.197142601013184, "learning_rate": 8.933333333333334e-08, "loss": 0.7716, "step": 24800 }, { "epoch": 3.232533889468196, "eval/acc": 44.1860466003418, "step": 24800 }, { "epoch": 3.232533889468196, "eval_loss": 2.311877965927124, "eval_runtime": 0.6131, "eval_samples_per_second": 70.136, "eval_steps_per_second": 1.631, "step": 24800 }, { "epoch": 3.233837330552659, "grad_norm": 8.920914649963379, "learning_rate": 8.48888888888889e-08, "loss": 0.7726, "step": 24810 }, { "epoch": 3.235140771637122, "grad_norm": 9.354734420776367, "learning_rate": 8.044444444444445e-08, "loss": 0.7293, "step": 24820 }, { "epoch": 3.236444212721585, "grad_norm": 8.23311996459961, "learning_rate": 7.6e-08, "loss": 0.7429, "step": 24830 }, { "epoch": 3.237747653806048, "grad_norm": 8.496770858764648, "learning_rate": 7.155555555555557e-08, "loss": 0.8852, "step": 24840 }, { "epoch": 3.239051094890511, "grad_norm": 8.722766876220703, "learning_rate": 6.711111111111111e-08, "loss": 0.8203, "step": 24850 }, { "epoch": 3.240354535974974, "grad_norm": 8.189964294433594, "learning_rate": 6.266666666666667e-08, "loss": 0.7764, "step": 24860 }, { "epoch": 3.241657977059437, "grad_norm": 7.387482166290283, "learning_rate": 5.822222222222223e-08, "loss": 0.7104, "step": 24870 }, { "epoch": 3.2429614181438997, "grad_norm": 9.63283634185791, "learning_rate": 5.3777777777777785e-08, "loss": 0.7579, "step": 24880 }, { "epoch": 3.244264859228363, "grad_norm": 8.49582576751709, "learning_rate": 4.933333333333333e-08, "loss": 0.7313, "step": 24890 }, { "epoch": 3.2455683003128257, "grad_norm": 9.183392524719238, "learning_rate": 4.488888888888889e-08, "loss": 0.7957, "step": 24900 }, { "epoch": 3.2455683003128257, "eval/acc": 44.1860466003418, "step": 24900 }, { "epoch": 3.2455683003128257, "eval_loss": 2.31575345993042, "eval_runtime": 0.5941, "eval_samples_per_second": 72.373, "eval_steps_per_second": 1.683, "step": 24900 }, { "epoch": 3.246871741397289, "grad_norm": 7.979811191558838, "learning_rate": 4.0444444444444445e-08, "loss": 0.8163, "step": 24910 }, { "epoch": 3.2481751824817517, "grad_norm": 7.971841335296631, "learning_rate": 3.6000000000000005e-08, "loss": 0.7097, "step": 24920 }, { "epoch": 3.249478623566215, "grad_norm": 9.18423843383789, "learning_rate": 3.155555555555556e-08, "loss": 0.7837, "step": 24930 }, { "epoch": 3.2507820646506778, "grad_norm": 8.598184585571289, "learning_rate": 2.7111111111111115e-08, "loss": 0.7621, "step": 24940 }, { "epoch": 3.252085505735141, "grad_norm": 8.469486236572266, "learning_rate": 2.266666666666667e-08, "loss": 0.7323, "step": 24950 }, { "epoch": 3.2533889468196038, "grad_norm": 9.469120979309082, "learning_rate": 1.8222222222222224e-08, "loss": 0.7403, "step": 24960 }, { "epoch": 3.2546923879040666, "grad_norm": 9.596359252929688, "learning_rate": 1.3777777777777778e-08, "loss": 0.7815, "step": 24970 }, { "epoch": 3.25599582898853, "grad_norm": 10.055347442626953, "learning_rate": 9.333333333333334e-09, "loss": 0.8386, "step": 24980 }, { "epoch": 3.2572992700729926, "grad_norm": 8.804590225219727, "learning_rate": 4.888888888888889e-09, "loss": 0.7281, "step": 24990 }, { "epoch": 3.258602711157456, "grad_norm": 7.5192484855651855, "learning_rate": 4.444444444444445e-10, "loss": 0.7622, "step": 25000 }, { "epoch": 3.258602711157456, "eval/acc": 44.1860466003418, "step": 25000 }, { "epoch": 3.258602711157456, "eval_loss": 2.3172976970672607, "eval_runtime": 0.5926, "eval_samples_per_second": 72.565, "eval_steps_per_second": 1.688, "step": 25000 } ], "logging_steps": 10, "max_steps": 25000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }