{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6517205422314911, "eval_steps": 100, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013034410844629822, "grad_norm": 752.3058471679688, "learning_rate": 3.6000000000000005e-08, "loss": 36.8414, "step": 10 }, { "epoch": 0.0026068821689259644, "grad_norm": 842.267822265625, "learning_rate": 7.6e-08, "loss": 38.6302, "step": 20 }, { "epoch": 0.003910323253388947, "grad_norm": 743.934326171875, "learning_rate": 1.16e-07, "loss": 36.3021, "step": 30 }, { "epoch": 0.005213764337851929, "grad_norm": 934.6981201171875, "learning_rate": 1.56e-07, "loss": 36.9985, "step": 40 }, { "epoch": 0.006517205422314911, "grad_norm": 649.5113525390625, "learning_rate": 1.96e-07, "loss": 33.0931, "step": 50 }, { "epoch": 0.007820646506777894, "grad_norm": 552.53662109375, "learning_rate": 2.3600000000000002e-07, "loss": 32.0638, "step": 60 }, { "epoch": 0.009124087591240875, "grad_norm": 582.6400146484375, "learning_rate": 2.7600000000000004e-07, "loss": 30.6222, "step": 70 }, { "epoch": 0.010427528675703858, "grad_norm": 445.86651611328125, "learning_rate": 3.160000000000001e-07, "loss": 28.4039, "step": 80 }, { "epoch": 0.01173096976016684, "grad_norm": 409.77642822265625, "learning_rate": 3.56e-07, "loss": 27.1438, "step": 90 }, { "epoch": 0.013034410844629822, "grad_norm": 361.9573059082031, "learning_rate": 3.9600000000000005e-07, "loss": 24.495, "step": 100 }, { "epoch": 0.013034410844629822, "eval/acc": 4.651162624359131, "step": 100 }, { "epoch": 0.013034410844629822, "eval_loss": 18.547975540161133, "eval_runtime": 0.9127, "eval_samples_per_second": 47.114, "eval_steps_per_second": 1.096, "step": 100 }, { "epoch": 0.014337851929092805, "grad_norm": 313.210693359375, "learning_rate": 4.3600000000000004e-07, "loss": 21.7606, "step": 110 }, { "epoch": 0.01564129301355579, "grad_norm": 329.11328125, "learning_rate": 4.760000000000001e-07, "loss": 19.5126, "step": 120 }, { "epoch": 0.01694473409801877, "grad_norm": 208.9153289794922, "learning_rate": 5.16e-07, "loss": 18.0711, "step": 130 }, { "epoch": 0.01824817518248175, "grad_norm": 220.79066467285156, "learning_rate": 5.560000000000001e-07, "loss": 16.9045, "step": 140 }, { "epoch": 0.019551616266944735, "grad_norm": 167.1522216796875, "learning_rate": 5.960000000000001e-07, "loss": 14.9046, "step": 150 }, { "epoch": 0.020855057351407715, "grad_norm": 154.0718994140625, "learning_rate": 6.360000000000001e-07, "loss": 14.7601, "step": 160 }, { "epoch": 0.0221584984358707, "grad_norm": 146.39012145996094, "learning_rate": 6.76e-07, "loss": 12.5387, "step": 170 }, { "epoch": 0.02346193952033368, "grad_norm": 153.8921356201172, "learning_rate": 7.16e-07, "loss": 11.6882, "step": 180 }, { "epoch": 0.024765380604796664, "grad_norm": 187.7710723876953, "learning_rate": 7.56e-07, "loss": 11.9919, "step": 190 }, { "epoch": 0.026068821689259645, "grad_norm": 163.95228576660156, "learning_rate": 7.960000000000001e-07, "loss": 10.8187, "step": 200 }, { "epoch": 0.026068821689259645, "eval/acc": 11.627906799316406, "step": 200 }, { "epoch": 0.026068821689259645, "eval_loss": 8.683622360229492, "eval_runtime": 0.5536, "eval_samples_per_second": 77.674, "eval_steps_per_second": 1.806, "step": 200 }, { "epoch": 0.02737226277372263, "grad_norm": 121.16007232666016, "learning_rate": 8.36e-07, "loss": 9.9573, "step": 210 }, { "epoch": 0.02867570385818561, "grad_norm": 123.3974609375, "learning_rate": 8.760000000000001e-07, "loss": 9.3645, "step": 220 }, { "epoch": 0.029979144942648594, "grad_norm": 149.9007110595703, "learning_rate": 9.160000000000001e-07, "loss": 9.1913, "step": 230 }, { "epoch": 0.03128258602711158, "grad_norm": 142.5546875, "learning_rate": 9.56e-07, "loss": 8.1642, "step": 240 }, { "epoch": 0.03258602711157456, "grad_norm": 111.52351379394531, "learning_rate": 9.96e-07, "loss": 8.1291, "step": 250 }, { "epoch": 0.03388946819603754, "grad_norm": 112.73123931884766, "learning_rate": 1.0360000000000001e-06, "loss": 7.6783, "step": 260 }, { "epoch": 0.03519290928050052, "grad_norm": 94.62492370605469, "learning_rate": 1.0760000000000002e-06, "loss": 7.333, "step": 270 }, { "epoch": 0.0364963503649635, "grad_norm": 105.54913330078125, "learning_rate": 1.1160000000000002e-06, "loss": 6.6041, "step": 280 }, { "epoch": 0.03779979144942649, "grad_norm": 93.97553253173828, "learning_rate": 1.156e-06, "loss": 6.769, "step": 290 }, { "epoch": 0.03910323253388947, "grad_norm": 78.89429473876953, "learning_rate": 1.196e-06, "loss": 6.3188, "step": 300 }, { "epoch": 0.03910323253388947, "eval/acc": 18.604650497436523, "step": 300 }, { "epoch": 0.03910323253388947, "eval_loss": 6.040415287017822, "eval_runtime": 0.5521, "eval_samples_per_second": 77.881, "eval_steps_per_second": 1.811, "step": 300 }, { "epoch": 0.04040667361835245, "grad_norm": 88.73673248291016, "learning_rate": 1.2360000000000001e-06, "loss": 5.643, "step": 310 }, { "epoch": 0.04171011470281543, "grad_norm": 83.74315643310547, "learning_rate": 1.276e-06, "loss": 5.0575, "step": 320 }, { "epoch": 0.04301355578727842, "grad_norm": 83.0094223022461, "learning_rate": 1.316e-06, "loss": 4.8596, "step": 330 }, { "epoch": 0.0443169968717414, "grad_norm": 66.99898529052734, "learning_rate": 1.356e-06, "loss": 4.444, "step": 340 }, { "epoch": 0.04562043795620438, "grad_norm": 61.338409423828125, "learning_rate": 1.396e-06, "loss": 4.1019, "step": 350 }, { "epoch": 0.04692387904066736, "grad_norm": 49.183837890625, "learning_rate": 1.436e-06, "loss": 3.7076, "step": 360 }, { "epoch": 0.04822732012513034, "grad_norm": 43.407833099365234, "learning_rate": 1.4760000000000001e-06, "loss": 3.4065, "step": 370 }, { "epoch": 0.04953076120959333, "grad_norm": 36.92807388305664, "learning_rate": 1.5160000000000002e-06, "loss": 3.2919, "step": 380 }, { "epoch": 0.05083420229405631, "grad_norm": 31.856853485107422, "learning_rate": 1.556e-06, "loss": 2.8133, "step": 390 }, { "epoch": 0.05213764337851929, "grad_norm": 25.495525360107422, "learning_rate": 1.596e-06, "loss": 2.7088, "step": 400 }, { "epoch": 0.05213764337851929, "eval/acc": 16.279069900512695, "step": 400 }, { "epoch": 0.05213764337851929, "eval_loss": 3.6577463150024414, "eval_runtime": 0.5513, "eval_samples_per_second": 77.993, "eval_steps_per_second": 1.814, "step": 400 }, { "epoch": 0.05344108446298227, "grad_norm": 24.631906509399414, "learning_rate": 1.636e-06, "loss": 2.4304, "step": 410 }, { "epoch": 0.05474452554744526, "grad_norm": 21.909217834472656, "learning_rate": 1.6760000000000001e-06, "loss": 2.534, "step": 420 }, { "epoch": 0.05604796663190824, "grad_norm": 22.260988235473633, "learning_rate": 1.7160000000000002e-06, "loss": 2.2622, "step": 430 }, { "epoch": 0.05735140771637122, "grad_norm": 20.961124420166016, "learning_rate": 1.7560000000000002e-06, "loss": 2.203, "step": 440 }, { "epoch": 0.0586548488008342, "grad_norm": 17.357723236083984, "learning_rate": 1.7960000000000003e-06, "loss": 2.0872, "step": 450 }, { "epoch": 0.05995828988529719, "grad_norm": 23.459308624267578, "learning_rate": 1.8360000000000003e-06, "loss": 2.1486, "step": 460 }, { "epoch": 0.06126173096976017, "grad_norm": 16.572664260864258, "learning_rate": 1.8760000000000001e-06, "loss": 1.8763, "step": 470 }, { "epoch": 0.06256517205422316, "grad_norm": 16.683259963989258, "learning_rate": 1.916e-06, "loss": 1.9834, "step": 480 }, { "epoch": 0.06386861313868614, "grad_norm": 17.584997177124023, "learning_rate": 1.956e-06, "loss": 1.9754, "step": 490 }, { "epoch": 0.06517205422314912, "grad_norm": 16.946754455566406, "learning_rate": 1.996e-06, "loss": 1.84, "step": 500 }, { "epoch": 0.06517205422314912, "eval/acc": 30.23255729675293, "step": 500 }, { "epoch": 0.06517205422314912, "eval_loss": 3.3365631103515625, "eval_runtime": 0.5506, "eval_samples_per_second": 78.1, "eval_steps_per_second": 1.816, "step": 500 }, { "epoch": 0.0664754953076121, "grad_norm": 16.998144149780273, "learning_rate": 2.036e-06, "loss": 1.951, "step": 510 }, { "epoch": 0.06777893639207508, "grad_norm": 15.914703369140625, "learning_rate": 2.076e-06, "loss": 1.6538, "step": 520 }, { "epoch": 0.06908237747653806, "grad_norm": 16.67144775390625, "learning_rate": 2.116e-06, "loss": 1.8369, "step": 530 }, { "epoch": 0.07038581856100104, "grad_norm": 17.469003677368164, "learning_rate": 2.156e-06, "loss": 1.7525, "step": 540 }, { "epoch": 0.07168925964546402, "grad_norm": 17.825891494750977, "learning_rate": 2.1960000000000002e-06, "loss": 1.741, "step": 550 }, { "epoch": 0.072992700729927, "grad_norm": 16.591110229492188, "learning_rate": 2.2360000000000003e-06, "loss": 1.7004, "step": 560 }, { "epoch": 0.07429614181439, "grad_norm": 17.972606658935547, "learning_rate": 2.2760000000000003e-06, "loss": 1.5859, "step": 570 }, { "epoch": 0.07559958289885298, "grad_norm": 16.083576202392578, "learning_rate": 2.3160000000000004e-06, "loss": 1.6437, "step": 580 }, { "epoch": 0.07690302398331596, "grad_norm": 18.013198852539062, "learning_rate": 2.3560000000000004e-06, "loss": 1.6496, "step": 590 }, { "epoch": 0.07820646506777894, "grad_norm": 17.562707901000977, "learning_rate": 2.3960000000000004e-06, "loss": 1.5905, "step": 600 }, { "epoch": 0.07820646506777894, "eval/acc": 25.581396102905273, "step": 600 }, { "epoch": 0.07820646506777894, "eval_loss": 3.272217035293579, "eval_runtime": 0.5506, "eval_samples_per_second": 78.096, "eval_steps_per_second": 1.816, "step": 600 }, { "epoch": 0.07950990615224192, "grad_norm": 16.752840042114258, "learning_rate": 2.4360000000000005e-06, "loss": 1.6781, "step": 610 }, { "epoch": 0.0808133472367049, "grad_norm": 15.609387397766113, "learning_rate": 2.476e-06, "loss": 1.6322, "step": 620 }, { "epoch": 0.08211678832116788, "grad_norm": 18.39044952392578, "learning_rate": 2.516e-06, "loss": 1.6085, "step": 630 }, { "epoch": 0.08342022940563086, "grad_norm": 15.455676078796387, "learning_rate": 2.556e-06, "loss": 1.5879, "step": 640 }, { "epoch": 0.08472367049009384, "grad_norm": 17.240724563598633, "learning_rate": 2.5960000000000002e-06, "loss": 1.6769, "step": 650 }, { "epoch": 0.08602711157455684, "grad_norm": 15.329434394836426, "learning_rate": 2.6360000000000003e-06, "loss": 1.623, "step": 660 }, { "epoch": 0.08733055265901982, "grad_norm": 15.390430450439453, "learning_rate": 2.6760000000000003e-06, "loss": 1.5269, "step": 670 }, { "epoch": 0.0886339937434828, "grad_norm": 13.903982162475586, "learning_rate": 2.7160000000000003e-06, "loss": 1.5213, "step": 680 }, { "epoch": 0.08993743482794578, "grad_norm": 15.723600387573242, "learning_rate": 2.7560000000000004e-06, "loss": 1.544, "step": 690 }, { "epoch": 0.09124087591240876, "grad_norm": 16.601280212402344, "learning_rate": 2.7960000000000004e-06, "loss": 1.5074, "step": 700 }, { "epoch": 0.09124087591240876, "eval/acc": 23.255813598632812, "step": 700 }, { "epoch": 0.09124087591240876, "eval_loss": 3.2280378341674805, "eval_runtime": 0.5537, "eval_samples_per_second": 77.661, "eval_steps_per_second": 1.806, "step": 700 }, { "epoch": 0.09254431699687174, "grad_norm": 15.303380966186523, "learning_rate": 2.8360000000000005e-06, "loss": 1.6147, "step": 710 }, { "epoch": 0.09384775808133472, "grad_norm": 15.977986335754395, "learning_rate": 2.8760000000000005e-06, "loss": 1.4851, "step": 720 }, { "epoch": 0.0951511991657977, "grad_norm": 15.908977508544922, "learning_rate": 2.9160000000000005e-06, "loss": 1.517, "step": 730 }, { "epoch": 0.09645464025026068, "grad_norm": 14.383811950683594, "learning_rate": 2.956e-06, "loss": 1.5444, "step": 740 }, { "epoch": 0.09775808133472368, "grad_norm": 12.663350105285645, "learning_rate": 2.996e-06, "loss": 1.5018, "step": 750 }, { "epoch": 0.09906152241918666, "grad_norm": 15.087569236755371, "learning_rate": 3.0360000000000002e-06, "loss": 1.4602, "step": 760 }, { "epoch": 0.10036496350364964, "grad_norm": 13.563980102539062, "learning_rate": 3.0760000000000003e-06, "loss": 1.3855, "step": 770 }, { "epoch": 0.10166840458811262, "grad_norm": 13.872782707214355, "learning_rate": 3.1160000000000003e-06, "loss": 1.508, "step": 780 }, { "epoch": 0.1029718456725756, "grad_norm": 12.23460865020752, "learning_rate": 3.1560000000000004e-06, "loss": 1.3865, "step": 790 }, { "epoch": 0.10427528675703858, "grad_norm": 15.744820594787598, "learning_rate": 3.1960000000000004e-06, "loss": 1.5776, "step": 800 }, { "epoch": 0.10427528675703858, "eval/acc": 23.255813598632812, "step": 800 }, { "epoch": 0.10427528675703858, "eval_loss": 3.1086668968200684, "eval_runtime": 0.5521, "eval_samples_per_second": 77.884, "eval_steps_per_second": 1.811, "step": 800 }, { "epoch": 0.10557872784150156, "grad_norm": 12.964938163757324, "learning_rate": 3.2360000000000004e-06, "loss": 1.4783, "step": 810 }, { "epoch": 0.10688216892596454, "grad_norm": 16.409147262573242, "learning_rate": 3.2760000000000005e-06, "loss": 1.3763, "step": 820 }, { "epoch": 0.10818561001042754, "grad_norm": 13.46617317199707, "learning_rate": 3.3160000000000005e-06, "loss": 1.4161, "step": 830 }, { "epoch": 0.10948905109489052, "grad_norm": 14.7039213180542, "learning_rate": 3.3560000000000006e-06, "loss": 1.5434, "step": 840 }, { "epoch": 0.1107924921793535, "grad_norm": 14.37901782989502, "learning_rate": 3.3960000000000006e-06, "loss": 1.4212, "step": 850 }, { "epoch": 0.11209593326381648, "grad_norm": 13.210816383361816, "learning_rate": 3.4360000000000006e-06, "loss": 1.4053, "step": 860 }, { "epoch": 0.11339937434827946, "grad_norm": 13.743114471435547, "learning_rate": 3.4760000000000007e-06, "loss": 1.4231, "step": 870 }, { "epoch": 0.11470281543274244, "grad_norm": 12.634490013122559, "learning_rate": 3.5160000000000007e-06, "loss": 1.3584, "step": 880 }, { "epoch": 0.11600625651720542, "grad_norm": 15.65221881866455, "learning_rate": 3.5560000000000008e-06, "loss": 1.399, "step": 890 }, { "epoch": 0.1173096976016684, "grad_norm": 14.89765453338623, "learning_rate": 3.596e-06, "loss": 1.3935, "step": 900 }, { "epoch": 0.1173096976016684, "eval/acc": 23.255813598632812, "step": 900 }, { "epoch": 0.1173096976016684, "eval_loss": 3.096344232559204, "eval_runtime": 0.5513, "eval_samples_per_second": 77.992, "eval_steps_per_second": 1.814, "step": 900 }, { "epoch": 0.11861313868613138, "grad_norm": 14.929734230041504, "learning_rate": 3.636e-06, "loss": 1.4005, "step": 910 }, { "epoch": 0.11991657977059438, "grad_norm": 12.793665885925293, "learning_rate": 3.676e-06, "loss": 1.4152, "step": 920 }, { "epoch": 0.12122002085505736, "grad_norm": 13.772797584533691, "learning_rate": 3.716e-06, "loss": 1.3823, "step": 930 }, { "epoch": 0.12252346193952034, "grad_norm": 11.430520057678223, "learning_rate": 3.756e-06, "loss": 1.3623, "step": 940 }, { "epoch": 0.12382690302398332, "grad_norm": 13.903288841247559, "learning_rate": 3.796e-06, "loss": 1.3491, "step": 950 }, { "epoch": 0.1251303441084463, "grad_norm": 14.225196838378906, "learning_rate": 3.836e-06, "loss": 1.3605, "step": 960 }, { "epoch": 0.1264337851929093, "grad_norm": 13.653999328613281, "learning_rate": 3.876000000000001e-06, "loss": 1.4258, "step": 970 }, { "epoch": 0.12773722627737227, "grad_norm": 12.619461059570312, "learning_rate": 3.916e-06, "loss": 1.2765, "step": 980 }, { "epoch": 0.12904066736183525, "grad_norm": 12.887979507446289, "learning_rate": 3.956000000000001e-06, "loss": 1.3446, "step": 990 }, { "epoch": 0.13034410844629823, "grad_norm": 13.362163543701172, "learning_rate": 3.996e-06, "loss": 1.4322, "step": 1000 }, { "epoch": 0.13034410844629823, "eval/acc": 23.255813598632812, "step": 1000 }, { "epoch": 0.13034410844629823, "eval_loss": 3.0436527729034424, "eval_runtime": 0.5774, "eval_samples_per_second": 74.467, "eval_steps_per_second": 1.732, "step": 1000 }, { "epoch": 0.13164754953076122, "grad_norm": 13.34825611114502, "learning_rate": 4.036000000000001e-06, "loss": 1.3434, "step": 1010 }, { "epoch": 0.1329509906152242, "grad_norm": 12.807318687438965, "learning_rate": 4.0760000000000004e-06, "loss": 1.3971, "step": 1020 }, { "epoch": 0.13425443169968718, "grad_norm": 10.88805103302002, "learning_rate": 4.116000000000001e-06, "loss": 1.3324, "step": 1030 }, { "epoch": 0.13555787278415016, "grad_norm": 12.1721830368042, "learning_rate": 4.1560000000000005e-06, "loss": 1.3454, "step": 1040 }, { "epoch": 0.13686131386861314, "grad_norm": 16.927200317382812, "learning_rate": 4.196e-06, "loss": 1.3912, "step": 1050 }, { "epoch": 0.13816475495307612, "grad_norm": 11.07986068725586, "learning_rate": 4.236e-06, "loss": 1.2858, "step": 1060 }, { "epoch": 0.1394681960375391, "grad_norm": 13.776060104370117, "learning_rate": 4.276e-06, "loss": 1.4019, "step": 1070 }, { "epoch": 0.14077163712200208, "grad_norm": 13.49791145324707, "learning_rate": 4.316e-06, "loss": 1.4221, "step": 1080 }, { "epoch": 0.14207507820646506, "grad_norm": 11.622773170471191, "learning_rate": 4.356e-06, "loss": 1.3731, "step": 1090 }, { "epoch": 0.14337851929092804, "grad_norm": 13.743051528930664, "learning_rate": 4.396e-06, "loss": 1.3511, "step": 1100 }, { "epoch": 0.14337851929092804, "eval/acc": 25.581396102905273, "step": 1100 }, { "epoch": 0.14337851929092804, "eval_loss": 3.016920804977417, "eval_runtime": 0.5524, "eval_samples_per_second": 77.847, "eval_steps_per_second": 1.81, "step": 1100 }, { "epoch": 0.14468196037539102, "grad_norm": 14.456645011901855, "learning_rate": 4.436e-06, "loss": 1.3994, "step": 1110 }, { "epoch": 0.145985401459854, "grad_norm": 12.78945255279541, "learning_rate": 4.476e-06, "loss": 1.3486, "step": 1120 }, { "epoch": 0.14728884254431698, "grad_norm": 12.899959564208984, "learning_rate": 4.5160000000000005e-06, "loss": 1.3342, "step": 1130 }, { "epoch": 0.14859228362878, "grad_norm": 12.025766372680664, "learning_rate": 4.556e-06, "loss": 1.3066, "step": 1140 }, { "epoch": 0.14989572471324297, "grad_norm": 11.712949752807617, "learning_rate": 4.5960000000000006e-06, "loss": 1.4095, "step": 1150 }, { "epoch": 0.15119916579770595, "grad_norm": 14.212655067443848, "learning_rate": 4.636e-06, "loss": 1.2781, "step": 1160 }, { "epoch": 0.15250260688216893, "grad_norm": 13.639365196228027, "learning_rate": 4.676000000000001e-06, "loss": 1.3783, "step": 1170 }, { "epoch": 0.15380604796663191, "grad_norm": 11.413806915283203, "learning_rate": 4.716e-06, "loss": 1.2901, "step": 1180 }, { "epoch": 0.1551094890510949, "grad_norm": 11.520100593566895, "learning_rate": 4.756000000000001e-06, "loss": 1.3142, "step": 1190 }, { "epoch": 0.15641293013555788, "grad_norm": 13.1220064163208, "learning_rate": 4.796e-06, "loss": 1.3254, "step": 1200 }, { "epoch": 0.15641293013555788, "eval/acc": 27.9069766998291, "step": 1200 }, { "epoch": 0.15641293013555788, "eval_loss": 2.8769519329071045, "eval_runtime": 0.5522, "eval_samples_per_second": 77.864, "eval_steps_per_second": 1.811, "step": 1200 }, { "epoch": 0.15771637122002086, "grad_norm": 11.807994842529297, "learning_rate": 4.836e-06, "loss": 1.339, "step": 1210 }, { "epoch": 0.15901981230448384, "grad_norm": 11.208297729492188, "learning_rate": 4.876e-06, "loss": 1.1896, "step": 1220 }, { "epoch": 0.16032325338894682, "grad_norm": 13.063114166259766, "learning_rate": 4.916e-06, "loss": 1.2701, "step": 1230 }, { "epoch": 0.1616266944734098, "grad_norm": 11.611763000488281, "learning_rate": 4.9560000000000005e-06, "loss": 1.3212, "step": 1240 }, { "epoch": 0.16293013555787278, "grad_norm": 10.884580612182617, "learning_rate": 4.996e-06, "loss": 1.227, "step": 1250 }, { "epoch": 0.16423357664233576, "grad_norm": 11.97398567199707, "learning_rate": 5.0360000000000006e-06, "loss": 1.3075, "step": 1260 }, { "epoch": 0.16553701772679874, "grad_norm": 13.973258972167969, "learning_rate": 5.076000000000001e-06, "loss": 1.2388, "step": 1270 }, { "epoch": 0.16684045881126172, "grad_norm": 13.00340461730957, "learning_rate": 5.116000000000001e-06, "loss": 1.3462, "step": 1280 }, { "epoch": 0.1681438998957247, "grad_norm": 11.750258445739746, "learning_rate": 5.156e-06, "loss": 1.2093, "step": 1290 }, { "epoch": 0.16944734098018768, "grad_norm": 12.117288589477539, "learning_rate": 5.196e-06, "loss": 1.223, "step": 1300 }, { "epoch": 0.16944734098018768, "eval/acc": 32.55813980102539, "step": 1300 }, { "epoch": 0.16944734098018768, "eval_loss": 2.936992645263672, "eval_runtime": 0.5526, "eval_samples_per_second": 77.814, "eval_steps_per_second": 1.81, "step": 1300 }, { "epoch": 0.1707507820646507, "grad_norm": 12.747390747070312, "learning_rate": 5.236e-06, "loss": 1.2956, "step": 1310 }, { "epoch": 0.17205422314911367, "grad_norm": 10.593498229980469, "learning_rate": 5.276e-06, "loss": 1.1996, "step": 1320 }, { "epoch": 0.17335766423357665, "grad_norm": 11.945181846618652, "learning_rate": 5.3160000000000004e-06, "loss": 1.33, "step": 1330 }, { "epoch": 0.17466110531803963, "grad_norm": 12.65109634399414, "learning_rate": 5.356e-06, "loss": 1.2295, "step": 1340 }, { "epoch": 0.17596454640250261, "grad_norm": 11.467466354370117, "learning_rate": 5.3960000000000005e-06, "loss": 1.2227, "step": 1350 }, { "epoch": 0.1772679874869656, "grad_norm": 12.898762702941895, "learning_rate": 5.436e-06, "loss": 1.2573, "step": 1360 }, { "epoch": 0.17857142857142858, "grad_norm": 11.188071250915527, "learning_rate": 5.476000000000001e-06, "loss": 1.3103, "step": 1370 }, { "epoch": 0.17987486965589156, "grad_norm": 12.179079055786133, "learning_rate": 5.516e-06, "loss": 1.259, "step": 1380 }, { "epoch": 0.18117831074035454, "grad_norm": 12.672003746032715, "learning_rate": 5.556000000000001e-06, "loss": 1.1756, "step": 1390 }, { "epoch": 0.18248175182481752, "grad_norm": 11.671830177307129, "learning_rate": 5.596e-06, "loss": 1.2905, "step": 1400 }, { "epoch": 0.18248175182481752, "eval/acc": 27.9069766998291, "step": 1400 }, { "epoch": 0.18248175182481752, "eval_loss": 2.8736231327056885, "eval_runtime": 0.5533, "eval_samples_per_second": 77.719, "eval_steps_per_second": 1.807, "step": 1400 }, { "epoch": 0.1837851929092805, "grad_norm": 12.279439926147461, "learning_rate": 5.636000000000001e-06, "loss": 1.2422, "step": 1410 }, { "epoch": 0.18508863399374348, "grad_norm": 12.59632396697998, "learning_rate": 5.676e-06, "loss": 1.1998, "step": 1420 }, { "epoch": 0.18639207507820646, "grad_norm": 10.290858268737793, "learning_rate": 5.716000000000001e-06, "loss": 1.3073, "step": 1430 }, { "epoch": 0.18769551616266944, "grad_norm": 11.450456619262695, "learning_rate": 5.7560000000000005e-06, "loss": 1.2917, "step": 1440 }, { "epoch": 0.18899895724713242, "grad_norm": 10.898682594299316, "learning_rate": 5.796000000000001e-06, "loss": 1.1988, "step": 1450 }, { "epoch": 0.1903023983315954, "grad_norm": 11.755196571350098, "learning_rate": 5.8360000000000005e-06, "loss": 1.4151, "step": 1460 }, { "epoch": 0.19160583941605838, "grad_norm": 10.369739532470703, "learning_rate": 5.876000000000001e-06, "loss": 1.2748, "step": 1470 }, { "epoch": 0.19290928050052136, "grad_norm": 10.368874549865723, "learning_rate": 5.916000000000001e-06, "loss": 1.2456, "step": 1480 }, { "epoch": 0.19421272158498437, "grad_norm": 10.07337474822998, "learning_rate": 5.956000000000001e-06, "loss": 1.1918, "step": 1490 }, { "epoch": 0.19551616266944735, "grad_norm": 12.127270698547363, "learning_rate": 5.996000000000001e-06, "loss": 1.1726, "step": 1500 }, { "epoch": 0.19551616266944735, "eval/acc": 32.55813980102539, "step": 1500 }, { "epoch": 0.19551616266944735, "eval_loss": 2.9003522396087646, "eval_runtime": 0.5544, "eval_samples_per_second": 77.556, "eval_steps_per_second": 1.804, "step": 1500 }, { "epoch": 0.19681960375391033, "grad_norm": 12.454967498779297, "learning_rate": 6.036000000000001e-06, "loss": 1.3509, "step": 1510 }, { "epoch": 0.1981230448383733, "grad_norm": 11.402469635009766, "learning_rate": 6.076000000000001e-06, "loss": 1.2365, "step": 1520 }, { "epoch": 0.1994264859228363, "grad_norm": 12.890278816223145, "learning_rate": 6.116000000000001e-06, "loss": 1.2295, "step": 1530 }, { "epoch": 0.20072992700729927, "grad_norm": 12.542150497436523, "learning_rate": 6.156000000000001e-06, "loss": 1.2789, "step": 1540 }, { "epoch": 0.20203336809176226, "grad_norm": 10.868870735168457, "learning_rate": 6.196000000000001e-06, "loss": 1.232, "step": 1550 }, { "epoch": 0.20333680917622524, "grad_norm": 12.972379684448242, "learning_rate": 6.236000000000001e-06, "loss": 1.2111, "step": 1560 }, { "epoch": 0.20464025026068822, "grad_norm": 14.00292682647705, "learning_rate": 6.2760000000000006e-06, "loss": 1.1551, "step": 1570 }, { "epoch": 0.2059436913451512, "grad_norm": 11.713733673095703, "learning_rate": 6.316000000000001e-06, "loss": 1.2256, "step": 1580 }, { "epoch": 0.20724713242961418, "grad_norm": 11.81581974029541, "learning_rate": 6.356000000000001e-06, "loss": 1.2375, "step": 1590 }, { "epoch": 0.20855057351407716, "grad_norm": 9.595722198486328, "learning_rate": 6.396e-06, "loss": 1.2307, "step": 1600 }, { "epoch": 0.20855057351407716, "eval/acc": 34.88372039794922, "step": 1600 }, { "epoch": 0.20855057351407716, "eval_loss": 2.89196515083313, "eval_runtime": 0.5526, "eval_samples_per_second": 77.814, "eval_steps_per_second": 1.81, "step": 1600 }, { "epoch": 0.20985401459854014, "grad_norm": 14.193363189697266, "learning_rate": 6.436e-06, "loss": 1.2503, "step": 1610 }, { "epoch": 0.21115745568300312, "grad_norm": 10.671473503112793, "learning_rate": 6.476e-06, "loss": 1.1697, "step": 1620 }, { "epoch": 0.2124608967674661, "grad_norm": 12.921130180358887, "learning_rate": 6.516e-06, "loss": 1.1124, "step": 1630 }, { "epoch": 0.21376433785192908, "grad_norm": 12.321484565734863, "learning_rate": 6.556e-06, "loss": 1.2272, "step": 1640 }, { "epoch": 0.21506777893639206, "grad_norm": 13.49770450592041, "learning_rate": 6.596e-06, "loss": 1.2014, "step": 1650 }, { "epoch": 0.21637122002085507, "grad_norm": 10.752897262573242, "learning_rate": 6.6360000000000005e-06, "loss": 1.174, "step": 1660 }, { "epoch": 0.21767466110531805, "grad_norm": 12.024086952209473, "learning_rate": 6.676e-06, "loss": 1.1625, "step": 1670 }, { "epoch": 0.21897810218978103, "grad_norm": 12.498847961425781, "learning_rate": 6.716000000000001e-06, "loss": 1.1683, "step": 1680 }, { "epoch": 0.220281543274244, "grad_norm": 11.955095291137695, "learning_rate": 6.756e-06, "loss": 1.2441, "step": 1690 }, { "epoch": 0.221584984358707, "grad_norm": 10.969300270080566, "learning_rate": 6.796000000000001e-06, "loss": 1.2547, "step": 1700 }, { "epoch": 0.221584984358707, "eval/acc": 32.55813980102539, "step": 1700 }, { "epoch": 0.221584984358707, "eval_loss": 2.735595464706421, "eval_runtime": 0.5549, "eval_samples_per_second": 77.488, "eval_steps_per_second": 1.802, "step": 1700 }, { "epoch": 0.22288842544316997, "grad_norm": 10.412980079650879, "learning_rate": 6.836e-06, "loss": 1.2473, "step": 1710 }, { "epoch": 0.22419186652763295, "grad_norm": 12.962031364440918, "learning_rate": 6.876000000000001e-06, "loss": 1.1759, "step": 1720 }, { "epoch": 0.22549530761209594, "grad_norm": 10.370616912841797, "learning_rate": 6.916e-06, "loss": 1.1444, "step": 1730 }, { "epoch": 0.22679874869655892, "grad_norm": 12.069488525390625, "learning_rate": 6.956000000000001e-06, "loss": 1.2177, "step": 1740 }, { "epoch": 0.2281021897810219, "grad_norm": 11.831305503845215, "learning_rate": 6.9960000000000004e-06, "loss": 1.2315, "step": 1750 }, { "epoch": 0.22940563086548488, "grad_norm": 10.261811256408691, "learning_rate": 7.036000000000001e-06, "loss": 1.1478, "step": 1760 }, { "epoch": 0.23070907194994786, "grad_norm": 10.814574241638184, "learning_rate": 7.0760000000000005e-06, "loss": 1.221, "step": 1770 }, { "epoch": 0.23201251303441084, "grad_norm": 11.184773445129395, "learning_rate": 7.116000000000001e-06, "loss": 1.2984, "step": 1780 }, { "epoch": 0.23331595411887382, "grad_norm": 11.853842735290527, "learning_rate": 7.156000000000001e-06, "loss": 1.2325, "step": 1790 }, { "epoch": 0.2346193952033368, "grad_norm": 10.178322792053223, "learning_rate": 7.196000000000001e-06, "loss": 1.1664, "step": 1800 }, { "epoch": 0.2346193952033368, "eval/acc": 34.88372039794922, "step": 1800 }, { "epoch": 0.2346193952033368, "eval_loss": 2.893901824951172, "eval_runtime": 0.5541, "eval_samples_per_second": 77.605, "eval_steps_per_second": 1.805, "step": 1800 }, { "epoch": 0.23592283628779978, "grad_norm": 11.30508804321289, "learning_rate": 7.236000000000001e-06, "loss": 1.2602, "step": 1810 }, { "epoch": 0.23722627737226276, "grad_norm": 11.195526123046875, "learning_rate": 7.276000000000001e-06, "loss": 1.1529, "step": 1820 }, { "epoch": 0.23852971845672574, "grad_norm": 11.082310676574707, "learning_rate": 7.316000000000001e-06, "loss": 1.2024, "step": 1830 }, { "epoch": 0.23983315954118875, "grad_norm": 10.974154472351074, "learning_rate": 7.356000000000001e-06, "loss": 1.1329, "step": 1840 }, { "epoch": 0.24113660062565173, "grad_norm": 10.155501365661621, "learning_rate": 7.396000000000001e-06, "loss": 1.1555, "step": 1850 }, { "epoch": 0.2424400417101147, "grad_norm": 10.690115928649902, "learning_rate": 7.436000000000001e-06, "loss": 1.1916, "step": 1860 }, { "epoch": 0.2437434827945777, "grad_norm": 11.77647876739502, "learning_rate": 7.476000000000001e-06, "loss": 1.0674, "step": 1870 }, { "epoch": 0.24504692387904067, "grad_norm": 13.536336898803711, "learning_rate": 7.516000000000001e-06, "loss": 1.2325, "step": 1880 }, { "epoch": 0.24635036496350365, "grad_norm": 10.988912582397461, "learning_rate": 7.556000000000001e-06, "loss": 1.1597, "step": 1890 }, { "epoch": 0.24765380604796663, "grad_norm": 11.346904754638672, "learning_rate": 7.5960000000000015e-06, "loss": 1.1883, "step": 1900 }, { "epoch": 0.24765380604796663, "eval/acc": 34.88372039794922, "step": 1900 }, { "epoch": 0.24765380604796663, "eval_loss": 2.8580784797668457, "eval_runtime": 0.5538, "eval_samples_per_second": 77.645, "eval_steps_per_second": 1.806, "step": 1900 }, { "epoch": 0.24895724713242962, "grad_norm": 10.486469268798828, "learning_rate": 7.636e-06, "loss": 1.1989, "step": 1910 }, { "epoch": 0.2502606882168926, "grad_norm": 11.191844940185547, "learning_rate": 7.676e-06, "loss": 1.1934, "step": 1920 }, { "epoch": 0.2515641293013556, "grad_norm": 12.818986892700195, "learning_rate": 7.716e-06, "loss": 1.1856, "step": 1930 }, { "epoch": 0.2528675703858186, "grad_norm": 9.980338096618652, "learning_rate": 7.756e-06, "loss": 1.1685, "step": 1940 }, { "epoch": 0.25417101147028154, "grad_norm": 10.59505558013916, "learning_rate": 7.796e-06, "loss": 1.0932, "step": 1950 }, { "epoch": 0.25547445255474455, "grad_norm": 10.21989631652832, "learning_rate": 7.836000000000001e-06, "loss": 1.2254, "step": 1960 }, { "epoch": 0.2567778936392075, "grad_norm": 9.082103729248047, "learning_rate": 7.876e-06, "loss": 1.1439, "step": 1970 }, { "epoch": 0.2580813347236705, "grad_norm": 10.54208755493164, "learning_rate": 7.916e-06, "loss": 1.2031, "step": 1980 }, { "epoch": 0.25938477580813346, "grad_norm": 11.807458877563477, "learning_rate": 7.956e-06, "loss": 1.1575, "step": 1990 }, { "epoch": 0.26068821689259647, "grad_norm": 11.20957088470459, "learning_rate": 7.996000000000001e-06, "loss": 1.152, "step": 2000 }, { "epoch": 0.26068821689259647, "eval/acc": 34.88372039794922, "step": 2000 }, { "epoch": 0.26068821689259647, "eval_loss": 2.9252498149871826, "eval_runtime": 0.5546, "eval_samples_per_second": 77.535, "eval_steps_per_second": 1.803, "step": 2000 }, { "epoch": 0.2619916579770594, "grad_norm": 9.236865997314453, "learning_rate": 8.036e-06, "loss": 1.1518, "step": 2010 }, { "epoch": 0.26329509906152243, "grad_norm": 10.173084259033203, "learning_rate": 8.076e-06, "loss": 1.1738, "step": 2020 }, { "epoch": 0.2645985401459854, "grad_norm": 11.158531188964844, "learning_rate": 8.116e-06, "loss": 1.1942, "step": 2030 }, { "epoch": 0.2659019812304484, "grad_norm": 10.654205322265625, "learning_rate": 8.156000000000001e-06, "loss": 1.1965, "step": 2040 }, { "epoch": 0.26720542231491134, "grad_norm": 10.954093933105469, "learning_rate": 8.196e-06, "loss": 1.0949, "step": 2050 }, { "epoch": 0.26850886339937435, "grad_norm": 10.480634689331055, "learning_rate": 8.236e-06, "loss": 1.2128, "step": 2060 }, { "epoch": 0.2698123044838373, "grad_norm": 9.64358901977539, "learning_rate": 8.276e-06, "loss": 1.1713, "step": 2070 }, { "epoch": 0.2711157455683003, "grad_norm": 9.68060302734375, "learning_rate": 8.316000000000001e-06, "loss": 1.1275, "step": 2080 }, { "epoch": 0.27241918665276327, "grad_norm": 10.211024284362793, "learning_rate": 8.356000000000001e-06, "loss": 1.2368, "step": 2090 }, { "epoch": 0.2737226277372263, "grad_norm": 10.19279670715332, "learning_rate": 8.396e-06, "loss": 1.1649, "step": 2100 }, { "epoch": 0.2737226277372263, "eval/acc": 37.20930099487305, "step": 2100 }, { "epoch": 0.2737226277372263, "eval_loss": 2.894489288330078, "eval_runtime": 0.5551, "eval_samples_per_second": 77.469, "eval_steps_per_second": 1.802, "step": 2100 }, { "epoch": 0.2750260688216893, "grad_norm": 11.496298789978027, "learning_rate": 8.436e-06, "loss": 1.1936, "step": 2110 }, { "epoch": 0.27632950990615224, "grad_norm": 10.342120170593262, "learning_rate": 8.476000000000002e-06, "loss": 1.2169, "step": 2120 }, { "epoch": 0.27763295099061525, "grad_norm": 10.583955764770508, "learning_rate": 8.516000000000001e-06, "loss": 1.1169, "step": 2130 }, { "epoch": 0.2789363920750782, "grad_norm": 10.484763145446777, "learning_rate": 8.556e-06, "loss": 1.1492, "step": 2140 }, { "epoch": 0.2802398331595412, "grad_norm": 10.46810245513916, "learning_rate": 8.596e-06, "loss": 1.2559, "step": 2150 }, { "epoch": 0.28154327424400416, "grad_norm": 10.129209518432617, "learning_rate": 8.636000000000002e-06, "loss": 1.0982, "step": 2160 }, { "epoch": 0.28284671532846717, "grad_norm": 9.844231605529785, "learning_rate": 8.676000000000001e-06, "loss": 1.1378, "step": 2170 }, { "epoch": 0.2841501564129301, "grad_norm": 11.35154914855957, "learning_rate": 8.716000000000001e-06, "loss": 1.2192, "step": 2180 }, { "epoch": 0.28545359749739313, "grad_norm": 11.104358673095703, "learning_rate": 8.756e-06, "loss": 1.1804, "step": 2190 }, { "epoch": 0.2867570385818561, "grad_norm": 8.416515350341797, "learning_rate": 8.796000000000002e-06, "loss": 1.135, "step": 2200 }, { "epoch": 0.2867570385818561, "eval/acc": 32.55813980102539, "step": 2200 }, { "epoch": 0.2867570385818561, "eval_loss": 2.850806713104248, "eval_runtime": 0.5533, "eval_samples_per_second": 77.714, "eval_steps_per_second": 1.807, "step": 2200 }, { "epoch": 0.2880604796663191, "grad_norm": 11.20003890991211, "learning_rate": 8.836000000000001e-06, "loss": 1.1998, "step": 2210 }, { "epoch": 0.28936392075078204, "grad_norm": 12.205933570861816, "learning_rate": 8.876e-06, "loss": 1.1331, "step": 2220 }, { "epoch": 0.29066736183524505, "grad_norm": 9.875853538513184, "learning_rate": 8.916e-06, "loss": 1.0744, "step": 2230 }, { "epoch": 0.291970802919708, "grad_norm": 11.795681953430176, "learning_rate": 8.956e-06, "loss": 1.1378, "step": 2240 }, { "epoch": 0.293274244004171, "grad_norm": 9.370049476623535, "learning_rate": 8.996e-06, "loss": 1.0586, "step": 2250 }, { "epoch": 0.29457768508863397, "grad_norm": 10.6432466506958, "learning_rate": 9.036e-06, "loss": 1.1936, "step": 2260 }, { "epoch": 0.295881126173097, "grad_norm": 10.588776588439941, "learning_rate": 9.076000000000001e-06, "loss": 1.0813, "step": 2270 }, { "epoch": 0.29718456725756, "grad_norm": 10.122645378112793, "learning_rate": 9.116e-06, "loss": 1.2067, "step": 2280 }, { "epoch": 0.29848800834202294, "grad_norm": 9.388029098510742, "learning_rate": 9.156e-06, "loss": 1.1546, "step": 2290 }, { "epoch": 0.29979144942648595, "grad_norm": 9.928315162658691, "learning_rate": 9.196e-06, "loss": 1.1134, "step": 2300 }, { "epoch": 0.29979144942648595, "eval/acc": 41.86046600341797, "step": 2300 }, { "epoch": 0.29979144942648595, "eval_loss": 2.8716952800750732, "eval_runtime": 0.554, "eval_samples_per_second": 77.619, "eval_steps_per_second": 1.805, "step": 2300 }, { "epoch": 0.3010948905109489, "grad_norm": 10.03122615814209, "learning_rate": 9.236000000000001e-06, "loss": 1.3212, "step": 2310 }, { "epoch": 0.3023983315954119, "grad_norm": 10.512228012084961, "learning_rate": 9.276e-06, "loss": 1.1369, "step": 2320 }, { "epoch": 0.30370177267987486, "grad_norm": 10.605701446533203, "learning_rate": 9.316e-06, "loss": 1.2389, "step": 2330 }, { "epoch": 0.30500521376433787, "grad_norm": 11.414910316467285, "learning_rate": 9.356e-06, "loss": 1.1003, "step": 2340 }, { "epoch": 0.3063086548488008, "grad_norm": 9.643972396850586, "learning_rate": 9.396000000000001e-06, "loss": 1.1028, "step": 2350 }, { "epoch": 0.30761209593326383, "grad_norm": 11.462910652160645, "learning_rate": 9.436e-06, "loss": 1.1437, "step": 2360 }, { "epoch": 0.3089155370177268, "grad_norm": 10.556984901428223, "learning_rate": 9.476e-06, "loss": 1.1802, "step": 2370 }, { "epoch": 0.3102189781021898, "grad_norm": 11.555737495422363, "learning_rate": 9.516e-06, "loss": 1.1376, "step": 2380 }, { "epoch": 0.31152241918665274, "grad_norm": 9.358216285705566, "learning_rate": 9.556000000000001e-06, "loss": 1.0511, "step": 2390 }, { "epoch": 0.31282586027111575, "grad_norm": 9.375101089477539, "learning_rate": 9.596000000000001e-06, "loss": 1.0519, "step": 2400 }, { "epoch": 0.31282586027111575, "eval/acc": 32.55813980102539, "step": 2400 }, { "epoch": 0.31282586027111575, "eval_loss": 2.992863655090332, "eval_runtime": 0.5533, "eval_samples_per_second": 77.716, "eval_steps_per_second": 1.807, "step": 2400 }, { "epoch": 0.3141293013555787, "grad_norm": 10.487874984741211, "learning_rate": 9.636e-06, "loss": 1.0795, "step": 2410 }, { "epoch": 0.3154327424400417, "grad_norm": 14.056046485900879, "learning_rate": 9.676e-06, "loss": 1.1586, "step": 2420 }, { "epoch": 0.31673618352450467, "grad_norm": 10.049337387084961, "learning_rate": 9.716000000000002e-06, "loss": 1.1348, "step": 2430 }, { "epoch": 0.3180396246089677, "grad_norm": 11.084318161010742, "learning_rate": 9.756000000000001e-06, "loss": 1.056, "step": 2440 }, { "epoch": 0.3193430656934307, "grad_norm": 10.96147346496582, "learning_rate": 9.796e-06, "loss": 1.1561, "step": 2450 }, { "epoch": 0.32064650677789364, "grad_norm": 9.765122413635254, "learning_rate": 9.836e-06, "loss": 1.1844, "step": 2460 }, { "epoch": 0.32194994786235664, "grad_norm": 12.490370750427246, "learning_rate": 9.876000000000002e-06, "loss": 1.1342, "step": 2470 }, { "epoch": 0.3232533889468196, "grad_norm": 9.971538543701172, "learning_rate": 9.916000000000001e-06, "loss": 1.0907, "step": 2480 }, { "epoch": 0.3245568300312826, "grad_norm": 11.306795120239258, "learning_rate": 9.956000000000001e-06, "loss": 1.1521, "step": 2490 }, { "epoch": 0.32586027111574556, "grad_norm": 10.270991325378418, "learning_rate": 9.996e-06, "loss": 1.1473, "step": 2500 }, { "epoch": 0.32586027111574556, "eval/acc": 37.20930099487305, "step": 2500 }, { "epoch": 0.32586027111574556, "eval_loss": 2.875680446624756, "eval_runtime": 0.554, "eval_samples_per_second": 77.622, "eval_steps_per_second": 1.805, "step": 2500 }, { "epoch": 0.32716371220020857, "grad_norm": 10.23509407043457, "learning_rate": 9.996e-06, "loss": 1.216, "step": 2510 }, { "epoch": 0.3284671532846715, "grad_norm": 12.120686531066895, "learning_rate": 9.991555555555557e-06, "loss": 1.0669, "step": 2520 }, { "epoch": 0.32977059436913453, "grad_norm": 11.20948314666748, "learning_rate": 9.987111111111112e-06, "loss": 1.0889, "step": 2530 }, { "epoch": 0.3310740354535975, "grad_norm": 11.085042953491211, "learning_rate": 9.982666666666667e-06, "loss": 1.212, "step": 2540 }, { "epoch": 0.3323774765380605, "grad_norm": 11.783760070800781, "learning_rate": 9.978222222222223e-06, "loss": 1.2059, "step": 2550 }, { "epoch": 0.33368091762252344, "grad_norm": 11.339371681213379, "learning_rate": 9.973777777777778e-06, "loss": 1.1027, "step": 2560 }, { "epoch": 0.33498435870698645, "grad_norm": 10.946623802185059, "learning_rate": 9.969333333333335e-06, "loss": 1.1663, "step": 2570 }, { "epoch": 0.3362877997914494, "grad_norm": 9.797304153442383, "learning_rate": 9.96488888888889e-06, "loss": 1.147, "step": 2580 }, { "epoch": 0.3375912408759124, "grad_norm": 10.305734634399414, "learning_rate": 9.960444444444444e-06, "loss": 1.2113, "step": 2590 }, { "epoch": 0.33889468196037537, "grad_norm": 9.742680549621582, "learning_rate": 9.956000000000001e-06, "loss": 1.1096, "step": 2600 }, { "epoch": 0.33889468196037537, "eval/acc": 32.55813980102539, "step": 2600 }, { "epoch": 0.33889468196037537, "eval_loss": 2.838628053665161, "eval_runtime": 0.5541, "eval_samples_per_second": 77.604, "eval_steps_per_second": 1.805, "step": 2600 }, { "epoch": 0.3401981230448384, "grad_norm": 11.681222915649414, "learning_rate": 9.951555555555556e-06, "loss": 1.1573, "step": 2610 }, { "epoch": 0.3415015641293014, "grad_norm": 10.580199241638184, "learning_rate": 9.947111111111112e-06, "loss": 1.1942, "step": 2620 }, { "epoch": 0.34280500521376434, "grad_norm": 9.525206565856934, "learning_rate": 9.942666666666667e-06, "loss": 1.139, "step": 2630 }, { "epoch": 0.34410844629822734, "grad_norm": 11.521892547607422, "learning_rate": 9.938222222222224e-06, "loss": 1.2106, "step": 2640 }, { "epoch": 0.3454118873826903, "grad_norm": 10.282144546508789, "learning_rate": 9.933777777777779e-06, "loss": 1.068, "step": 2650 }, { "epoch": 0.3467153284671533, "grad_norm": 10.942089080810547, "learning_rate": 9.929333333333333e-06, "loss": 1.0709, "step": 2660 }, { "epoch": 0.34801876955161626, "grad_norm": 12.269514083862305, "learning_rate": 9.92488888888889e-06, "loss": 1.073, "step": 2670 }, { "epoch": 0.34932221063607927, "grad_norm": 10.467517852783203, "learning_rate": 9.920444444444445e-06, "loss": 1.1178, "step": 2680 }, { "epoch": 0.3506256517205422, "grad_norm": 11.05263900756836, "learning_rate": 9.916000000000001e-06, "loss": 1.1126, "step": 2690 }, { "epoch": 0.35192909280500523, "grad_norm": 10.848026275634766, "learning_rate": 9.911555555555556e-06, "loss": 1.1425, "step": 2700 }, { "epoch": 0.35192909280500523, "eval/acc": 32.55813980102539, "step": 2700 }, { "epoch": 0.35192909280500523, "eval_loss": 2.8443257808685303, "eval_runtime": 0.5552, "eval_samples_per_second": 77.455, "eval_steps_per_second": 1.801, "step": 2700 }, { "epoch": 0.3532325338894682, "grad_norm": 9.627706527709961, "learning_rate": 9.907111111111111e-06, "loss": 1.0892, "step": 2710 }, { "epoch": 0.3545359749739312, "grad_norm": 9.48183822631836, "learning_rate": 9.902666666666668e-06, "loss": 1.1611, "step": 2720 }, { "epoch": 0.35583941605839414, "grad_norm": 10.31680965423584, "learning_rate": 9.898222222222224e-06, "loss": 1.1474, "step": 2730 }, { "epoch": 0.35714285714285715, "grad_norm": 9.613831520080566, "learning_rate": 9.893777777777779e-06, "loss": 1.0592, "step": 2740 }, { "epoch": 0.3584462982273201, "grad_norm": 14.002620697021484, "learning_rate": 9.889333333333334e-06, "loss": 1.1399, "step": 2750 }, { "epoch": 0.3597497393117831, "grad_norm": 9.574627876281738, "learning_rate": 9.884888888888889e-06, "loss": 1.0702, "step": 2760 }, { "epoch": 0.36105318039624607, "grad_norm": 11.370795249938965, "learning_rate": 9.880444444444445e-06, "loss": 1.1089, "step": 2770 }, { "epoch": 0.3623566214807091, "grad_norm": 11.44530963897705, "learning_rate": 9.876000000000002e-06, "loss": 1.0896, "step": 2780 }, { "epoch": 0.3636600625651721, "grad_norm": 10.26310920715332, "learning_rate": 9.871555555555557e-06, "loss": 1.2288, "step": 2790 }, { "epoch": 0.36496350364963503, "grad_norm": 10.72587776184082, "learning_rate": 9.867111111111111e-06, "loss": 1.1154, "step": 2800 }, { "epoch": 0.36496350364963503, "eval/acc": 39.53488540649414, "step": 2800 }, { "epoch": 0.36496350364963503, "eval_loss": 2.9019012451171875, "eval_runtime": 0.553, "eval_samples_per_second": 77.752, "eval_steps_per_second": 1.808, "step": 2800 }, { "epoch": 0.36626694473409804, "grad_norm": 10.30538272857666, "learning_rate": 9.862666666666668e-06, "loss": 1.086, "step": 2810 }, { "epoch": 0.367570385818561, "grad_norm": 9.331382751464844, "learning_rate": 9.858222222222223e-06, "loss": 1.2179, "step": 2820 }, { "epoch": 0.368873826903024, "grad_norm": 9.834467887878418, "learning_rate": 9.85377777777778e-06, "loss": 1.1286, "step": 2830 }, { "epoch": 0.37017726798748696, "grad_norm": 11.874444961547852, "learning_rate": 9.849333333333334e-06, "loss": 1.1325, "step": 2840 }, { "epoch": 0.37148070907194997, "grad_norm": 10.40954875946045, "learning_rate": 9.844888888888889e-06, "loss": 1.1669, "step": 2850 }, { "epoch": 0.3727841501564129, "grad_norm": 10.013657569885254, "learning_rate": 9.840444444444446e-06, "loss": 1.0895, "step": 2860 }, { "epoch": 0.3740875912408759, "grad_norm": 10.641711235046387, "learning_rate": 9.836e-06, "loss": 1.1342, "step": 2870 }, { "epoch": 0.3753910323253389, "grad_norm": 9.41917896270752, "learning_rate": 9.831555555555557e-06, "loss": 1.0698, "step": 2880 }, { "epoch": 0.3766944734098019, "grad_norm": 10.998407363891602, "learning_rate": 9.827111111111112e-06, "loss": 1.0777, "step": 2890 }, { "epoch": 0.37799791449426484, "grad_norm": 10.565347671508789, "learning_rate": 9.822666666666667e-06, "loss": 1.1446, "step": 2900 }, { "epoch": 0.37799791449426484, "eval/acc": 34.88372039794922, "step": 2900 }, { "epoch": 0.37799791449426484, "eval_loss": 2.8377606868743896, "eval_runtime": 0.5537, "eval_samples_per_second": 77.653, "eval_steps_per_second": 1.806, "step": 2900 }, { "epoch": 0.37930135557872785, "grad_norm": 10.49682903289795, "learning_rate": 9.818222222222223e-06, "loss": 1.0774, "step": 2910 }, { "epoch": 0.3806047966631908, "grad_norm": 10.447504997253418, "learning_rate": 9.813777777777778e-06, "loss": 1.0116, "step": 2920 }, { "epoch": 0.3819082377476538, "grad_norm": 9.127096176147461, "learning_rate": 9.809333333333335e-06, "loss": 1.0786, "step": 2930 }, { "epoch": 0.38321167883211676, "grad_norm": 11.165003776550293, "learning_rate": 9.80488888888889e-06, "loss": 1.0451, "step": 2940 }, { "epoch": 0.3845151199165798, "grad_norm": 11.500470161437988, "learning_rate": 9.800444444444446e-06, "loss": 1.1676, "step": 2950 }, { "epoch": 0.3858185610010427, "grad_norm": 9.945548057556152, "learning_rate": 9.796e-06, "loss": 1.0829, "step": 2960 }, { "epoch": 0.38712200208550573, "grad_norm": 9.096894264221191, "learning_rate": 9.791555555555556e-06, "loss": 1.1571, "step": 2970 }, { "epoch": 0.38842544316996874, "grad_norm": 9.676164627075195, "learning_rate": 9.787111111111112e-06, "loss": 1.1088, "step": 2980 }, { "epoch": 0.3897288842544317, "grad_norm": 9.788176536560059, "learning_rate": 9.782666666666667e-06, "loss": 1.1283, "step": 2990 }, { "epoch": 0.3910323253388947, "grad_norm": 9.841941833496094, "learning_rate": 9.778222222222224e-06, "loss": 1.1106, "step": 3000 }, { "epoch": 0.3910323253388947, "eval/acc": 34.88372039794922, "step": 3000 }, { "epoch": 0.3910323253388947, "eval_loss": 2.733646869659424, "eval_runtime": 0.5496, "eval_samples_per_second": 78.234, "eval_steps_per_second": 1.819, "step": 3000 }, { "epoch": 0.39233576642335766, "grad_norm": 11.72174072265625, "learning_rate": 9.773777777777778e-06, "loss": 1.0701, "step": 3010 }, { "epoch": 0.39363920750782067, "grad_norm": 9.394064903259277, "learning_rate": 9.769333333333333e-06, "loss": 1.0966, "step": 3020 }, { "epoch": 0.3949426485922836, "grad_norm": 10.733139991760254, "learning_rate": 9.76488888888889e-06, "loss": 1.1911, "step": 3030 }, { "epoch": 0.3962460896767466, "grad_norm": 10.33555793762207, "learning_rate": 9.760444444444446e-06, "loss": 1.0843, "step": 3040 }, { "epoch": 0.3975495307612096, "grad_norm": 10.826848983764648, "learning_rate": 9.756000000000001e-06, "loss": 1.2552, "step": 3050 }, { "epoch": 0.3988529718456726, "grad_norm": 14.324176788330078, "learning_rate": 9.751555555555556e-06, "loss": 1.0654, "step": 3060 }, { "epoch": 0.40015641293013554, "grad_norm": 9.932692527770996, "learning_rate": 9.74711111111111e-06, "loss": 1.1099, "step": 3070 }, { "epoch": 0.40145985401459855, "grad_norm": 8.769567489624023, "learning_rate": 9.742666666666667e-06, "loss": 1.126, "step": 3080 }, { "epoch": 0.4027632950990615, "grad_norm": 9.914202690124512, "learning_rate": 9.738222222222224e-06, "loss": 1.0349, "step": 3090 }, { "epoch": 0.4040667361835245, "grad_norm": 8.979110717773438, "learning_rate": 9.733777777777779e-06, "loss": 1.1147, "step": 3100 }, { "epoch": 0.4040667361835245, "eval/acc": 34.88372039794922, "step": 3100 }, { "epoch": 0.4040667361835245, "eval_loss": 2.9223451614379883, "eval_runtime": 0.5703, "eval_samples_per_second": 75.404, "eval_steps_per_second": 1.754, "step": 3100 }, { "epoch": 0.40537017726798746, "grad_norm": 10.178040504455566, "learning_rate": 9.729333333333334e-06, "loss": 1.0661, "step": 3110 }, { "epoch": 0.40667361835245047, "grad_norm": 9.03530216217041, "learning_rate": 9.724888888888888e-06, "loss": 1.1282, "step": 3120 }, { "epoch": 0.4079770594369134, "grad_norm": 9.545401573181152, "learning_rate": 9.720444444444445e-06, "loss": 1.0933, "step": 3130 }, { "epoch": 0.40928050052137643, "grad_norm": 9.34640121459961, "learning_rate": 9.716000000000002e-06, "loss": 1.1273, "step": 3140 }, { "epoch": 0.41058394160583944, "grad_norm": 9.456986427307129, "learning_rate": 9.711555555555556e-06, "loss": 1.1527, "step": 3150 }, { "epoch": 0.4118873826903024, "grad_norm": 10.370234489440918, "learning_rate": 9.707111111111111e-06, "loss": 1.0639, "step": 3160 }, { "epoch": 0.4131908237747654, "grad_norm": 9.064216613769531, "learning_rate": 9.702666666666668e-06, "loss": 1.0942, "step": 3170 }, { "epoch": 0.41449426485922836, "grad_norm": 10.234908103942871, "learning_rate": 9.698222222222223e-06, "loss": 1.167, "step": 3180 }, { "epoch": 0.41579770594369136, "grad_norm": 10.56281566619873, "learning_rate": 9.693777777777779e-06, "loss": 1.0659, "step": 3190 }, { "epoch": 0.4171011470281543, "grad_norm": 10.342962265014648, "learning_rate": 9.689333333333334e-06, "loss": 1.0829, "step": 3200 }, { "epoch": 0.4171011470281543, "eval/acc": 37.20930099487305, "step": 3200 }, { "epoch": 0.4171011470281543, "eval_loss": 2.8262035846710205, "eval_runtime": 0.5536, "eval_samples_per_second": 77.675, "eval_steps_per_second": 1.806, "step": 3200 }, { "epoch": 0.4184045881126173, "grad_norm": 10.684874534606934, "learning_rate": 9.684888888888889e-06, "loss": 1.1349, "step": 3210 }, { "epoch": 0.4197080291970803, "grad_norm": 10.516358375549316, "learning_rate": 9.680444444444445e-06, "loss": 1.1277, "step": 3220 }, { "epoch": 0.4210114702815433, "grad_norm": 10.139744758605957, "learning_rate": 9.676e-06, "loss": 1.0163, "step": 3230 }, { "epoch": 0.42231491136600624, "grad_norm": 8.341273307800293, "learning_rate": 9.671555555555557e-06, "loss": 1.0264, "step": 3240 }, { "epoch": 0.42361835245046925, "grad_norm": 10.223051071166992, "learning_rate": 9.667111111111112e-06, "loss": 1.135, "step": 3250 }, { "epoch": 0.4249217935349322, "grad_norm": 10.643424987792969, "learning_rate": 9.662666666666668e-06, "loss": 1.1079, "step": 3260 }, { "epoch": 0.4262252346193952, "grad_norm": 10.129752159118652, "learning_rate": 9.658222222222223e-06, "loss": 1.1081, "step": 3270 }, { "epoch": 0.42752867570385816, "grad_norm": 9.503373146057129, "learning_rate": 9.653777777777778e-06, "loss": 1.1133, "step": 3280 }, { "epoch": 0.42883211678832117, "grad_norm": 10.085280418395996, "learning_rate": 9.649333333333334e-06, "loss": 1.0587, "step": 3290 }, { "epoch": 0.4301355578727841, "grad_norm": 8.572416305541992, "learning_rate": 9.64488888888889e-06, "loss": 1.0355, "step": 3300 }, { "epoch": 0.4301355578727841, "eval/acc": 39.53488540649414, "step": 3300 }, { "epoch": 0.4301355578727841, "eval_loss": 2.875706911087036, "eval_runtime": 0.8759, "eval_samples_per_second": 49.094, "eval_steps_per_second": 1.142, "step": 3300 }, { "epoch": 0.43143899895724713, "grad_norm": 11.9055814743042, "learning_rate": 9.640444444444446e-06, "loss": 1.1596, "step": 3310 }, { "epoch": 0.43274244004171014, "grad_norm": 7.891529083251953, "learning_rate": 9.636e-06, "loss": 1.0478, "step": 3320 }, { "epoch": 0.4340458811261731, "grad_norm": 10.109358787536621, "learning_rate": 9.631555555555555e-06, "loss": 1.037, "step": 3330 }, { "epoch": 0.4353493222106361, "grad_norm": 9.673956871032715, "learning_rate": 9.627111111111112e-06, "loss": 1.0827, "step": 3340 }, { "epoch": 0.43665276329509906, "grad_norm": 9.339848518371582, "learning_rate": 9.622666666666668e-06, "loss": 1.0662, "step": 3350 }, { "epoch": 0.43795620437956206, "grad_norm": 9.403885841369629, "learning_rate": 9.618222222222223e-06, "loss": 1.051, "step": 3360 }, { "epoch": 0.439259645464025, "grad_norm": 10.163128852844238, "learning_rate": 9.613777777777778e-06, "loss": 1.082, "step": 3370 }, { "epoch": 0.440563086548488, "grad_norm": 8.699789047241211, "learning_rate": 9.609333333333333e-06, "loss": 1.1382, "step": 3380 }, { "epoch": 0.441866527632951, "grad_norm": 10.108110427856445, "learning_rate": 9.60488888888889e-06, "loss": 1.1227, "step": 3390 }, { "epoch": 0.443169968717414, "grad_norm": 11.483874320983887, "learning_rate": 9.600444444444446e-06, "loss": 1.0851, "step": 3400 }, { "epoch": 0.443169968717414, "eval/acc": 37.20930099487305, "step": 3400 }, { "epoch": 0.443169968717414, "eval_loss": 2.804619312286377, "eval_runtime": 0.5503, "eval_samples_per_second": 78.14, "eval_steps_per_second": 1.817, "step": 3400 }, { "epoch": 0.44447340980187694, "grad_norm": 9.97952938079834, "learning_rate": 9.596000000000001e-06, "loss": 1.02, "step": 3410 }, { "epoch": 0.44577685088633995, "grad_norm": 9.445475578308105, "learning_rate": 9.591555555555556e-06, "loss": 1.1166, "step": 3420 }, { "epoch": 0.4470802919708029, "grad_norm": 10.408344268798828, "learning_rate": 9.58711111111111e-06, "loss": 1.1305, "step": 3430 }, { "epoch": 0.4483837330552659, "grad_norm": 10.129816055297852, "learning_rate": 9.582666666666667e-06, "loss": 1.0882, "step": 3440 }, { "epoch": 0.44968717413972886, "grad_norm": 11.24634838104248, "learning_rate": 9.578222222222224e-06, "loss": 1.1458, "step": 3450 }, { "epoch": 0.45099061522419187, "grad_norm": 9.015290260314941, "learning_rate": 9.573777777777779e-06, "loss": 1.1692, "step": 3460 }, { "epoch": 0.4522940563086548, "grad_norm": 11.587824821472168, "learning_rate": 9.569333333333333e-06, "loss": 1.1005, "step": 3470 }, { "epoch": 0.45359749739311783, "grad_norm": 11.352563858032227, "learning_rate": 9.56488888888889e-06, "loss": 1.0646, "step": 3480 }, { "epoch": 0.45490093847758084, "grad_norm": 9.49247932434082, "learning_rate": 9.560444444444445e-06, "loss": 1.0682, "step": 3490 }, { "epoch": 0.4562043795620438, "grad_norm": 11.200020790100098, "learning_rate": 9.556000000000001e-06, "loss": 1.0354, "step": 3500 }, { "epoch": 0.4562043795620438, "eval/acc": 32.55813980102539, "step": 3500 }, { "epoch": 0.4562043795620438, "eval_loss": 2.850745916366577, "eval_runtime": 0.5511, "eval_samples_per_second": 78.029, "eval_steps_per_second": 1.815, "step": 3500 }, { "epoch": 0.4575078206465068, "grad_norm": 9.414541244506836, "learning_rate": 9.551555555555556e-06, "loss": 1.0458, "step": 3510 }, { "epoch": 0.45881126173096975, "grad_norm": 11.558536529541016, "learning_rate": 9.547111111111111e-06, "loss": 1.1236, "step": 3520 }, { "epoch": 0.46011470281543276, "grad_norm": 9.692702293395996, "learning_rate": 9.542666666666668e-06, "loss": 1.1055, "step": 3530 }, { "epoch": 0.4614181438998957, "grad_norm": 11.946589469909668, "learning_rate": 9.538222222222222e-06, "loss": 1.0746, "step": 3540 }, { "epoch": 0.4627215849843587, "grad_norm": 9.754605293273926, "learning_rate": 9.533777777777779e-06, "loss": 1.1951, "step": 3550 }, { "epoch": 0.4640250260688217, "grad_norm": 9.67271614074707, "learning_rate": 9.529333333333334e-06, "loss": 1.0727, "step": 3560 }, { "epoch": 0.4653284671532847, "grad_norm": 9.403274536132812, "learning_rate": 9.52488888888889e-06, "loss": 1.0781, "step": 3570 }, { "epoch": 0.46663190823774764, "grad_norm": 9.350010871887207, "learning_rate": 9.520444444444445e-06, "loss": 1.141, "step": 3580 }, { "epoch": 0.46793534932221065, "grad_norm": 11.810049057006836, "learning_rate": 9.516e-06, "loss": 1.0642, "step": 3590 }, { "epoch": 0.4692387904066736, "grad_norm": 9.524765014648438, "learning_rate": 9.511555555555557e-06, "loss": 1.0196, "step": 3600 }, { "epoch": 0.4692387904066736, "eval/acc": 34.88372039794922, "step": 3600 }, { "epoch": 0.4692387904066736, "eval_loss": 2.8087544441223145, "eval_runtime": 0.5519, "eval_samples_per_second": 77.911, "eval_steps_per_second": 1.812, "step": 3600 }, { "epoch": 0.4705422314911366, "grad_norm": 11.370705604553223, "learning_rate": 9.507111111111111e-06, "loss": 1.0253, "step": 3610 }, { "epoch": 0.47184567257559956, "grad_norm": 8.690403938293457, "learning_rate": 9.502666666666668e-06, "loss": 1.071, "step": 3620 }, { "epoch": 0.47314911366006257, "grad_norm": 9.085663795471191, "learning_rate": 9.498222222222223e-06, "loss": 1.1507, "step": 3630 }, { "epoch": 0.4744525547445255, "grad_norm": 9.252151489257812, "learning_rate": 9.493777777777778e-06, "loss": 1.1038, "step": 3640 }, { "epoch": 0.47575599582898853, "grad_norm": 11.12983226776123, "learning_rate": 9.489333333333334e-06, "loss": 1.0457, "step": 3650 }, { "epoch": 0.4770594369134515, "grad_norm": 9.117828369140625, "learning_rate": 9.48488888888889e-06, "loss": 1.0704, "step": 3660 }, { "epoch": 0.4783628779979145, "grad_norm": 10.71731185913086, "learning_rate": 9.480444444444446e-06, "loss": 1.0461, "step": 3670 }, { "epoch": 0.4796663190823775, "grad_norm": 8.495375633239746, "learning_rate": 9.476e-06, "loss": 1.1173, "step": 3680 }, { "epoch": 0.48096976016684045, "grad_norm": 10.223701477050781, "learning_rate": 9.471555555555555e-06, "loss": 1.1301, "step": 3690 }, { "epoch": 0.48227320125130346, "grad_norm": 10.180765151977539, "learning_rate": 9.467111111111112e-06, "loss": 1.1262, "step": 3700 }, { "epoch": 0.48227320125130346, "eval/acc": 37.20930099487305, "step": 3700 }, { "epoch": 0.48227320125130346, "eval_loss": 2.7609646320343018, "eval_runtime": 0.5504, "eval_samples_per_second": 78.122, "eval_steps_per_second": 1.817, "step": 3700 }, { "epoch": 0.4835766423357664, "grad_norm": 10.506937026977539, "learning_rate": 9.462666666666668e-06, "loss": 1.0366, "step": 3710 }, { "epoch": 0.4848800834202294, "grad_norm": 11.131478309631348, "learning_rate": 9.458222222222223e-06, "loss": 0.9718, "step": 3720 }, { "epoch": 0.4861835245046924, "grad_norm": 11.127948760986328, "learning_rate": 9.453777777777778e-06, "loss": 1.1568, "step": 3730 }, { "epoch": 0.4874869655891554, "grad_norm": 10.03661060333252, "learning_rate": 9.449333333333333e-06, "loss": 1.0066, "step": 3740 }, { "epoch": 0.48879040667361834, "grad_norm": 11.38666820526123, "learning_rate": 9.44488888888889e-06, "loss": 1.0457, "step": 3750 }, { "epoch": 0.49009384775808135, "grad_norm": 9.510127067565918, "learning_rate": 9.440444444444446e-06, "loss": 1.1167, "step": 3760 }, { "epoch": 0.4913972888425443, "grad_norm": 10.810651779174805, "learning_rate": 9.436e-06, "loss": 1.1126, "step": 3770 }, { "epoch": 0.4927007299270073, "grad_norm": 9.202433586120605, "learning_rate": 9.431555555555556e-06, "loss": 1.0681, "step": 3780 }, { "epoch": 0.49400417101147026, "grad_norm": 8.647710800170898, "learning_rate": 9.427111111111112e-06, "loss": 1.0295, "step": 3790 }, { "epoch": 0.49530761209593327, "grad_norm": 11.453765869140625, "learning_rate": 9.422666666666667e-06, "loss": 1.015, "step": 3800 }, { "epoch": 0.49530761209593327, "eval/acc": 37.20930099487305, "step": 3800 }, { "epoch": 0.49530761209593327, "eval_loss": 2.7812387943267822, "eval_runtime": 0.5494, "eval_samples_per_second": 78.268, "eval_steps_per_second": 1.82, "step": 3800 }, { "epoch": 0.4966110531803962, "grad_norm": 10.551323890686035, "learning_rate": 9.418222222222224e-06, "loss": 1.1188, "step": 3810 }, { "epoch": 0.49791449426485923, "grad_norm": 10.307533264160156, "learning_rate": 9.413777777777778e-06, "loss": 1.0767, "step": 3820 }, { "epoch": 0.4992179353493222, "grad_norm": 12.098529815673828, "learning_rate": 9.409333333333333e-06, "loss": 1.0597, "step": 3830 }, { "epoch": 0.5005213764337852, "grad_norm": 10.920623779296875, "learning_rate": 9.40488888888889e-06, "loss": 0.9847, "step": 3840 }, { "epoch": 0.5018248175182481, "grad_norm": 10.035759925842285, "learning_rate": 9.400444444444445e-06, "loss": 1.07, "step": 3850 }, { "epoch": 0.5031282586027112, "grad_norm": 10.293031692504883, "learning_rate": 9.396000000000001e-06, "loss": 1.0453, "step": 3860 }, { "epoch": 0.5044316996871742, "grad_norm": 9.7219877243042, "learning_rate": 9.391555555555556e-06, "loss": 1.0783, "step": 3870 }, { "epoch": 0.5057351407716372, "grad_norm": 9.780116081237793, "learning_rate": 9.387111111111113e-06, "loss": 1.021, "step": 3880 }, { "epoch": 0.5070385818561001, "grad_norm": 10.145584106445312, "learning_rate": 9.382666666666667e-06, "loss": 1.0744, "step": 3890 }, { "epoch": 0.5083420229405631, "grad_norm": 9.737056732177734, "learning_rate": 9.378222222222222e-06, "loss": 1.1837, "step": 3900 }, { "epoch": 0.5083420229405631, "eval/acc": 34.88372039794922, "step": 3900 }, { "epoch": 0.5083420229405631, "eval_loss": 2.6774258613586426, "eval_runtime": 0.5548, "eval_samples_per_second": 77.509, "eval_steps_per_second": 1.803, "step": 3900 }, { "epoch": 0.5096454640250261, "grad_norm": 9.52910041809082, "learning_rate": 9.373777777777779e-06, "loss": 1.0, "step": 3910 }, { "epoch": 0.5109489051094891, "grad_norm": 11.480224609375, "learning_rate": 9.369333333333334e-06, "loss": 1.029, "step": 3920 }, { "epoch": 0.512252346193952, "grad_norm": 8.294060707092285, "learning_rate": 9.36488888888889e-06, "loss": 1.0584, "step": 3930 }, { "epoch": 0.513555787278415, "grad_norm": 8.96554946899414, "learning_rate": 9.360444444444445e-06, "loss": 1.0415, "step": 3940 }, { "epoch": 0.514859228362878, "grad_norm": 10.146249771118164, "learning_rate": 9.356e-06, "loss": 0.9763, "step": 3950 }, { "epoch": 0.516162669447341, "grad_norm": 9.620243072509766, "learning_rate": 9.351555555555556e-06, "loss": 1.0677, "step": 3960 }, { "epoch": 0.5174661105318039, "grad_norm": 8.995674133300781, "learning_rate": 9.347111111111113e-06, "loss": 1.0893, "step": 3970 }, { "epoch": 0.5187695516162669, "grad_norm": 10.30301284790039, "learning_rate": 9.342666666666668e-06, "loss": 1.0958, "step": 3980 }, { "epoch": 0.5200729927007299, "grad_norm": 9.020184516906738, "learning_rate": 9.338222222222223e-06, "loss": 1.0115, "step": 3990 }, { "epoch": 0.5213764337851929, "grad_norm": 11.706809997558594, "learning_rate": 9.333777777777777e-06, "loss": 1.1306, "step": 4000 }, { "epoch": 0.5213764337851929, "eval/acc": 34.88372039794922, "step": 4000 }, { "epoch": 0.5213764337851929, "eval_loss": 2.719060182571411, "eval_runtime": 0.5502, "eval_samples_per_second": 78.155, "eval_steps_per_second": 1.818, "step": 4000 }, { "epoch": 0.5226798748696558, "grad_norm": 10.49409294128418, "learning_rate": 9.329333333333334e-06, "loss": 1.0554, "step": 4010 }, { "epoch": 0.5239833159541188, "grad_norm": 7.883603572845459, "learning_rate": 9.32488888888889e-06, "loss": 0.9968, "step": 4020 }, { "epoch": 0.5252867570385819, "grad_norm": 11.045550346374512, "learning_rate": 9.320444444444445e-06, "loss": 1.1689, "step": 4030 }, { "epoch": 0.5265901981230449, "grad_norm": 9.245767593383789, "learning_rate": 9.316e-06, "loss": 1.0647, "step": 4040 }, { "epoch": 0.5278936392075079, "grad_norm": 8.662199974060059, "learning_rate": 9.311555555555555e-06, "loss": 0.9952, "step": 4050 }, { "epoch": 0.5291970802919708, "grad_norm": 8.584678649902344, "learning_rate": 9.307111111111112e-06, "loss": 1.0025, "step": 4060 }, { "epoch": 0.5305005213764338, "grad_norm": 8.951703071594238, "learning_rate": 9.302666666666668e-06, "loss": 1.0182, "step": 4070 }, { "epoch": 0.5318039624608968, "grad_norm": 11.469212532043457, "learning_rate": 9.298222222222223e-06, "loss": 1.086, "step": 4080 }, { "epoch": 0.5331074035453598, "grad_norm": 10.124979972839355, "learning_rate": 9.293777777777778e-06, "loss": 1.0614, "step": 4090 }, { "epoch": 0.5344108446298227, "grad_norm": 9.715713500976562, "learning_rate": 9.289333333333334e-06, "loss": 1.029, "step": 4100 }, { "epoch": 0.5344108446298227, "eval/acc": 39.53488540649414, "step": 4100 }, { "epoch": 0.5344108446298227, "eval_loss": 2.6891214847564697, "eval_runtime": 0.5531, "eval_samples_per_second": 77.75, "eval_steps_per_second": 1.808, "step": 4100 }, { "epoch": 0.5357142857142857, "grad_norm": 10.887805938720703, "learning_rate": 9.28488888888889e-06, "loss": 1.1104, "step": 4110 }, { "epoch": 0.5370177267987487, "grad_norm": 8.276104927062988, "learning_rate": 9.280444444444446e-06, "loss": 1.0563, "step": 4120 }, { "epoch": 0.5383211678832117, "grad_norm": 9.104747772216797, "learning_rate": 9.276e-06, "loss": 1.0732, "step": 4130 }, { "epoch": 0.5396246089676746, "grad_norm": 10.727592468261719, "learning_rate": 9.271555555555555e-06, "loss": 1.0253, "step": 4140 }, { "epoch": 0.5409280500521376, "grad_norm": 10.487238883972168, "learning_rate": 9.267111111111112e-06, "loss": 1.0849, "step": 4150 }, { "epoch": 0.5422314911366006, "grad_norm": 9.830368995666504, "learning_rate": 9.262666666666667e-06, "loss": 1.0699, "step": 4160 }, { "epoch": 0.5435349322210636, "grad_norm": 9.725363731384277, "learning_rate": 9.258222222222223e-06, "loss": 1.0149, "step": 4170 }, { "epoch": 0.5448383733055265, "grad_norm": 10.23435115814209, "learning_rate": 9.253777777777778e-06, "loss": 0.9648, "step": 4180 }, { "epoch": 0.5461418143899895, "grad_norm": 8.573326110839844, "learning_rate": 9.249333333333335e-06, "loss": 1.0607, "step": 4190 }, { "epoch": 0.5474452554744526, "grad_norm": 9.514001846313477, "learning_rate": 9.24488888888889e-06, "loss": 1.0196, "step": 4200 }, { "epoch": 0.5474452554744526, "eval/acc": 32.55813980102539, "step": 4200 }, { "epoch": 0.5474452554744526, "eval_loss": 2.5407004356384277, "eval_runtime": 0.5515, "eval_samples_per_second": 77.969, "eval_steps_per_second": 1.813, "step": 4200 }, { "epoch": 0.5487486965589156, "grad_norm": 9.46273136138916, "learning_rate": 9.240444444444444e-06, "loss": 1.0557, "step": 4210 }, { "epoch": 0.5500521376433786, "grad_norm": 12.82573127746582, "learning_rate": 9.236000000000001e-06, "loss": 1.051, "step": 4220 }, { "epoch": 0.5513555787278415, "grad_norm": 10.965460777282715, "learning_rate": 9.231555555555556e-06, "loss": 0.9239, "step": 4230 }, { "epoch": 0.5526590198123045, "grad_norm": 9.015987396240234, "learning_rate": 9.227111111111112e-06, "loss": 1.0477, "step": 4240 }, { "epoch": 0.5539624608967675, "grad_norm": 8.61673355102539, "learning_rate": 9.222666666666667e-06, "loss": 1.0693, "step": 4250 }, { "epoch": 0.5552659019812305, "grad_norm": 9.152997016906738, "learning_rate": 9.218222222222222e-06, "loss": 1.0104, "step": 4260 }, { "epoch": 0.5565693430656934, "grad_norm": 8.82421588897705, "learning_rate": 9.213777777777779e-06, "loss": 1.0304, "step": 4270 }, { "epoch": 0.5578727841501564, "grad_norm": 9.665721893310547, "learning_rate": 9.209333333333335e-06, "loss": 1.0608, "step": 4280 }, { "epoch": 0.5591762252346194, "grad_norm": 10.174515724182129, "learning_rate": 9.20488888888889e-06, "loss": 1.099, "step": 4290 }, { "epoch": 0.5604796663190824, "grad_norm": 9.723739624023438, "learning_rate": 9.200444444444445e-06, "loss": 1.0327, "step": 4300 }, { "epoch": 0.5604796663190824, "eval/acc": 34.88372039794922, "step": 4300 }, { "epoch": 0.5604796663190824, "eval_loss": 2.560245990753174, "eval_runtime": 0.5506, "eval_samples_per_second": 78.095, "eval_steps_per_second": 1.816, "step": 4300 }, { "epoch": 0.5617831074035453, "grad_norm": 9.028182983398438, "learning_rate": 9.196e-06, "loss": 1.0447, "step": 4310 }, { "epoch": 0.5630865484880083, "grad_norm": 9.231035232543945, "learning_rate": 9.191555555555556e-06, "loss": 1.0014, "step": 4320 }, { "epoch": 0.5643899895724713, "grad_norm": 9.409144401550293, "learning_rate": 9.187111111111113e-06, "loss": 1.0805, "step": 4330 }, { "epoch": 0.5656934306569343, "grad_norm": 9.330337524414062, "learning_rate": 9.182666666666668e-06, "loss": 0.9831, "step": 4340 }, { "epoch": 0.5669968717413972, "grad_norm": 9.44364070892334, "learning_rate": 9.178222222222222e-06, "loss": 1.1507, "step": 4350 }, { "epoch": 0.5683003128258602, "grad_norm": 8.195267677307129, "learning_rate": 9.173777777777777e-06, "loss": 1.0538, "step": 4360 }, { "epoch": 0.5696037539103233, "grad_norm": 10.082292556762695, "learning_rate": 9.169333333333334e-06, "loss": 1.1772, "step": 4370 }, { "epoch": 0.5709071949947863, "grad_norm": 7.957224369049072, "learning_rate": 9.16488888888889e-06, "loss": 0.9671, "step": 4380 }, { "epoch": 0.5722106360792493, "grad_norm": 9.066376686096191, "learning_rate": 9.160444444444445e-06, "loss": 0.9564, "step": 4390 }, { "epoch": 0.5735140771637122, "grad_norm": 9.167228698730469, "learning_rate": 9.156e-06, "loss": 1.0625, "step": 4400 }, { "epoch": 0.5735140771637122, "eval/acc": 37.20930099487305, "step": 4400 }, { "epoch": 0.5735140771637122, "eval_loss": 2.62846040725708, "eval_runtime": 0.5519, "eval_samples_per_second": 77.91, "eval_steps_per_second": 1.812, "step": 4400 }, { "epoch": 0.5748175182481752, "grad_norm": 11.493626594543457, "learning_rate": 9.151555555555557e-06, "loss": 1.0258, "step": 4410 }, { "epoch": 0.5761209593326382, "grad_norm": 11.340927124023438, "learning_rate": 9.147111111111111e-06, "loss": 1.0167, "step": 4420 }, { "epoch": 0.5774244004171012, "grad_norm": 9.083796501159668, "learning_rate": 9.142666666666668e-06, "loss": 1.0704, "step": 4430 }, { "epoch": 0.5787278415015641, "grad_norm": 10.585103988647461, "learning_rate": 9.138222222222223e-06, "loss": 1.1001, "step": 4440 }, { "epoch": 0.5800312825860271, "grad_norm": 10.192399024963379, "learning_rate": 9.133777777777778e-06, "loss": 1.0589, "step": 4450 }, { "epoch": 0.5813347236704901, "grad_norm": 9.637321472167969, "learning_rate": 9.129333333333334e-06, "loss": 1.0217, "step": 4460 }, { "epoch": 0.5826381647549531, "grad_norm": 11.652050018310547, "learning_rate": 9.124888888888889e-06, "loss": 1.1136, "step": 4470 }, { "epoch": 0.583941605839416, "grad_norm": 9.2413969039917, "learning_rate": 9.120444444444446e-06, "loss": 1.0009, "step": 4480 }, { "epoch": 0.585245046923879, "grad_norm": 9.579240798950195, "learning_rate": 9.116e-06, "loss": 1.0948, "step": 4490 }, { "epoch": 0.586548488008342, "grad_norm": 10.748444557189941, "learning_rate": 9.111555555555557e-06, "loss": 1.0969, "step": 4500 }, { "epoch": 0.586548488008342, "eval/acc": 34.88372039794922, "step": 4500 }, { "epoch": 0.586548488008342, "eval_loss": 2.528625726699829, "eval_runtime": 0.5514, "eval_samples_per_second": 77.977, "eval_steps_per_second": 1.813, "step": 4500 }, { "epoch": 0.587851929092805, "grad_norm": 9.715644836425781, "learning_rate": 9.107111111111112e-06, "loss": 1.1046, "step": 4510 }, { "epoch": 0.5891553701772679, "grad_norm": 9.33938980102539, "learning_rate": 9.102666666666667e-06, "loss": 0.9897, "step": 4520 }, { "epoch": 0.5904588112617309, "grad_norm": 8.88958740234375, "learning_rate": 9.098222222222223e-06, "loss": 1.1344, "step": 4530 }, { "epoch": 0.591762252346194, "grad_norm": 10.64390754699707, "learning_rate": 9.093777777777778e-06, "loss": 1.0106, "step": 4540 }, { "epoch": 0.593065693430657, "grad_norm": 9.564251899719238, "learning_rate": 9.089333333333335e-06, "loss": 1.0985, "step": 4550 }, { "epoch": 0.59436913451512, "grad_norm": 9.475229263305664, "learning_rate": 9.08488888888889e-06, "loss": 1.0643, "step": 4560 }, { "epoch": 0.5956725755995829, "grad_norm": 8.694733619689941, "learning_rate": 9.080444444444444e-06, "loss": 1.0899, "step": 4570 }, { "epoch": 0.5969760166840459, "grad_norm": 9.67250919342041, "learning_rate": 9.076000000000001e-06, "loss": 1.0518, "step": 4580 }, { "epoch": 0.5982794577685089, "grad_norm": 9.918119430541992, "learning_rate": 9.071555555555557e-06, "loss": 1.139, "step": 4590 }, { "epoch": 0.5995828988529719, "grad_norm": 11.0655517578125, "learning_rate": 9.067111111111112e-06, "loss": 1.0195, "step": 4600 }, { "epoch": 0.5995828988529719, "eval/acc": 37.20930099487305, "step": 4600 }, { "epoch": 0.5995828988529719, "eval_loss": 2.581063747406006, "eval_runtime": 0.5974, "eval_samples_per_second": 71.985, "eval_steps_per_second": 1.674, "step": 4600 }, { "epoch": 0.6008863399374348, "grad_norm": 9.673873901367188, "learning_rate": 9.062666666666667e-06, "loss": 1.0139, "step": 4610 }, { "epoch": 0.6021897810218978, "grad_norm": 9.95392894744873, "learning_rate": 9.058222222222222e-06, "loss": 1.0544, "step": 4620 }, { "epoch": 0.6034932221063608, "grad_norm": 11.47777271270752, "learning_rate": 9.053777777777778e-06, "loss": 1.0851, "step": 4630 }, { "epoch": 0.6047966631908238, "grad_norm": 8.379030227661133, "learning_rate": 9.049333333333335e-06, "loss": 1.0244, "step": 4640 }, { "epoch": 0.6061001042752867, "grad_norm": 8.413164138793945, "learning_rate": 9.04488888888889e-06, "loss": 1.0619, "step": 4650 }, { "epoch": 0.6074035453597497, "grad_norm": 10.171146392822266, "learning_rate": 9.040444444444445e-06, "loss": 1.0794, "step": 4660 }, { "epoch": 0.6087069864442127, "grad_norm": 10.772948265075684, "learning_rate": 9.036e-06, "loss": 1.0795, "step": 4670 }, { "epoch": 0.6100104275286757, "grad_norm": 9.68770980834961, "learning_rate": 9.031555555555556e-06, "loss": 1.0261, "step": 4680 }, { "epoch": 0.6113138686131386, "grad_norm": 9.47791862487793, "learning_rate": 9.027111111111113e-06, "loss": 1.1494, "step": 4690 }, { "epoch": 0.6126173096976016, "grad_norm": 9.655404090881348, "learning_rate": 9.022666666666667e-06, "loss": 1.1113, "step": 4700 }, { "epoch": 0.6126173096976016, "eval/acc": 41.86046600341797, "step": 4700 }, { "epoch": 0.6126173096976016, "eval_loss": 2.4572794437408447, "eval_runtime": 0.551, "eval_samples_per_second": 78.043, "eval_steps_per_second": 1.815, "step": 4700 }, { "epoch": 0.6139207507820647, "grad_norm": 11.384035110473633, "learning_rate": 9.018222222222222e-06, "loss": 0.9002, "step": 4710 }, { "epoch": 0.6152241918665277, "grad_norm": 9.862360000610352, "learning_rate": 9.013777777777779e-06, "loss": 0.9838, "step": 4720 }, { "epoch": 0.6165276329509907, "grad_norm": 8.860601425170898, "learning_rate": 9.009333333333334e-06, "loss": 1.0471, "step": 4730 }, { "epoch": 0.6178310740354536, "grad_norm": 9.085923194885254, "learning_rate": 9.00488888888889e-06, "loss": 1.1413, "step": 4740 }, { "epoch": 0.6191345151199166, "grad_norm": 7.881019115447998, "learning_rate": 9.000444444444445e-06, "loss": 1.0241, "step": 4750 }, { "epoch": 0.6204379562043796, "grad_norm": 9.55480670928955, "learning_rate": 8.996e-06, "loss": 1.0634, "step": 4760 }, { "epoch": 0.6217413972888426, "grad_norm": 8.191434860229492, "learning_rate": 8.991555555555556e-06, "loss": 1.0264, "step": 4770 }, { "epoch": 0.6230448383733055, "grad_norm": 11.498793601989746, "learning_rate": 8.987111111111111e-06, "loss": 1.0466, "step": 4780 }, { "epoch": 0.6243482794577685, "grad_norm": 8.848291397094727, "learning_rate": 8.982666666666668e-06, "loss": 0.9813, "step": 4790 }, { "epoch": 0.6256517205422315, "grad_norm": 8.858402252197266, "learning_rate": 8.978222222222223e-06, "loss": 1.0143, "step": 4800 }, { "epoch": 0.6256517205422315, "eval/acc": 34.88372039794922, "step": 4800 }, { "epoch": 0.6256517205422315, "eval_loss": 2.6291277408599854, "eval_runtime": 0.557, "eval_samples_per_second": 77.206, "eval_steps_per_second": 1.795, "step": 4800 }, { "epoch": 0.6269551616266945, "grad_norm": 9.703082084655762, "learning_rate": 8.97377777777778e-06, "loss": 1.0354, "step": 4810 }, { "epoch": 0.6282586027111574, "grad_norm": 8.061450004577637, "learning_rate": 8.969333333333334e-06, "loss": 1.0989, "step": 4820 }, { "epoch": 0.6295620437956204, "grad_norm": 8.38237476348877, "learning_rate": 8.964888888888889e-06, "loss": 1.0303, "step": 4830 }, { "epoch": 0.6308654848800834, "grad_norm": 9.098999977111816, "learning_rate": 8.960444444444445e-06, "loss": 1.1374, "step": 4840 }, { "epoch": 0.6321689259645464, "grad_norm": 8.959243774414062, "learning_rate": 8.956e-06, "loss": 1.0242, "step": 4850 }, { "epoch": 0.6334723670490093, "grad_norm": 10.157614707946777, "learning_rate": 8.951555555555557e-06, "loss": 1.134, "step": 4860 }, { "epoch": 0.6347758081334723, "grad_norm": 10.983575820922852, "learning_rate": 8.947111111111112e-06, "loss": 0.9518, "step": 4870 }, { "epoch": 0.6360792492179353, "grad_norm": 11.162731170654297, "learning_rate": 8.942666666666667e-06, "loss": 1.0702, "step": 4880 }, { "epoch": 0.6373826903023984, "grad_norm": 9.005561828613281, "learning_rate": 8.938222222222223e-06, "loss": 1.0228, "step": 4890 }, { "epoch": 0.6386861313868614, "grad_norm": 9.825065612792969, "learning_rate": 8.93377777777778e-06, "loss": 1.0373, "step": 4900 }, { "epoch": 0.6386861313868614, "eval/acc": 32.55813980102539, "step": 4900 }, { "epoch": 0.6386861313868614, "eval_loss": 2.755546808242798, "eval_runtime": 0.5529, "eval_samples_per_second": 77.765, "eval_steps_per_second": 1.808, "step": 4900 }, { "epoch": 0.6399895724713243, "grad_norm": 8.52741813659668, "learning_rate": 8.929333333333334e-06, "loss": 1.066, "step": 4910 }, { "epoch": 0.6412930135557873, "grad_norm": 9.974360466003418, "learning_rate": 8.92488888888889e-06, "loss": 1.0397, "step": 4920 }, { "epoch": 0.6425964546402503, "grad_norm": 8.10251235961914, "learning_rate": 8.920444444444444e-06, "loss": 1.0832, "step": 4930 }, { "epoch": 0.6438998957247133, "grad_norm": 10.143448829650879, "learning_rate": 8.916e-06, "loss": 1.0112, "step": 4940 }, { "epoch": 0.6452033368091762, "grad_norm": 10.25130844116211, "learning_rate": 8.911555555555557e-06, "loss": 1.0808, "step": 4950 }, { "epoch": 0.6465067778936392, "grad_norm": 11.107799530029297, "learning_rate": 8.907111111111112e-06, "loss": 1.0547, "step": 4960 }, { "epoch": 0.6478102189781022, "grad_norm": 10.128641128540039, "learning_rate": 8.902666666666667e-06, "loss": 1.0721, "step": 4970 }, { "epoch": 0.6491136600625652, "grad_norm": 10.3110933303833, "learning_rate": 8.898222222222222e-06, "loss": 0.9976, "step": 4980 }, { "epoch": 0.6504171011470281, "grad_norm": 8.941389083862305, "learning_rate": 8.893777777777778e-06, "loss": 1.0196, "step": 4990 }, { "epoch": 0.6517205422314911, "grad_norm": 10.89724063873291, "learning_rate": 8.889333333333335e-06, "loss": 1.0551, "step": 5000 }, { "epoch": 0.6517205422314911, "eval/acc": 34.88372039794922, "step": 5000 }, { "epoch": 0.6517205422314911, "eval_loss": 2.655290126800537, "eval_runtime": 0.5523, "eval_samples_per_second": 77.86, "eval_steps_per_second": 1.811, "step": 5000 } ], "logging_steps": 10, "max_steps": 25000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }