diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,203341 @@ +{ + "best_metric": 0.4563054144382477, + "best_model_checkpoint": "./w2v-bert-2.0-yoruba_naijavoices_250h/checkpoint-24000", + "epoch": 37.22721437740693, + "eval_steps": 1000, + "global_step": 29000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012836970474967907, + "grad_norm": 24.242416381835938, + "learning_rate": 3.851091142490372e-09, + "loss": 9.1008, + "step": 1 + }, + { + "epoch": 0.0025673940949935813, + "grad_norm": 21.366621017456055, + "learning_rate": 7.702182284980745e-09, + "loss": 8.0803, + "step": 2 + }, + { + "epoch": 0.0038510911424903724, + "grad_norm": 22.873092651367188, + "learning_rate": 1.1553273427471117e-08, + "loss": 7.863, + "step": 3 + }, + { + "epoch": 0.005134788189987163, + "grad_norm": 20.10211944580078, + "learning_rate": 1.540436456996149e-08, + "loss": 7.6799, + "step": 4 + }, + { + "epoch": 0.006418485237483954, + "grad_norm": 19.432950973510742, + "learning_rate": 1.925545571245186e-08, + "loss": 7.5406, + "step": 5 + }, + { + "epoch": 0.007702182284980745, + "grad_norm": 19.68552589416504, + "learning_rate": 2.3106546854942234e-08, + "loss": 7.5649, + "step": 6 + }, + { + "epoch": 0.008985879332477536, + "grad_norm": 19.547300338745117, + "learning_rate": 2.6957637997432606e-08, + "loss": 7.4886, + "step": 7 + }, + { + "epoch": 0.010269576379974325, + "grad_norm": 19.464845657348633, + "learning_rate": 3.080872913992298e-08, + "loss": 7.4435, + "step": 8 + }, + { + "epoch": 0.011553273427471117, + "grad_norm": 19.644046783447266, + "learning_rate": 3.4659820282413354e-08, + "loss": 7.4736, + "step": 9 + }, + { + "epoch": 0.012836970474967908, + "grad_norm": 19.380165100097656, + "learning_rate": 3.851091142490372e-08, + "loss": 7.4247, + "step": 10 + }, + { + "epoch": 0.014120667522464698, + 
"grad_norm": 19.499492645263672, + "learning_rate": 4.236200256739409e-08, + "loss": 7.3995, + "step": 11 + }, + { + "epoch": 0.01540436456996149, + "grad_norm": 19.271892547607422, + "learning_rate": 4.621309370988447e-08, + "loss": 7.3303, + "step": 12 + }, + { + "epoch": 0.01668806161745828, + "grad_norm": 19.00176239013672, + "learning_rate": 5.006418485237484e-08, + "loss": 7.2711, + "step": 13 + }, + { + "epoch": 0.01797175866495507, + "grad_norm": 19.132980346679688, + "learning_rate": 5.391527599486521e-08, + "loss": 7.2443, + "step": 14 + }, + { + "epoch": 0.019255455712451863, + "grad_norm": 19.25206184387207, + "learning_rate": 5.776636713735559e-08, + "loss": 7.2637, + "step": 15 + }, + { + "epoch": 0.02053915275994865, + "grad_norm": 18.955392837524414, + "learning_rate": 6.161745827984596e-08, + "loss": 7.1688, + "step": 16 + }, + { + "epoch": 0.021822849807445442, + "grad_norm": 18.521286010742188, + "learning_rate": 6.546854942233633e-08, + "loss": 7.074, + "step": 17 + }, + { + "epoch": 0.023106546854942234, + "grad_norm": 18.71027946472168, + "learning_rate": 6.931964056482671e-08, + "loss": 7.1596, + "step": 18 + }, + { + "epoch": 0.024390243902439025, + "grad_norm": 18.62795639038086, + "learning_rate": 7.317073170731708e-08, + "loss": 7.1041, + "step": 19 + }, + { + "epoch": 0.025673940949935817, + "grad_norm": 18.663362503051758, + "learning_rate": 7.702182284980745e-08, + "loss": 7.0799, + "step": 20 + }, + { + "epoch": 0.026957637997432605, + "grad_norm": 18.353092193603516, + "learning_rate": 8.087291399229783e-08, + "loss": 7.0006, + "step": 21 + }, + { + "epoch": 0.028241335044929396, + "grad_norm": 18.156808853149414, + "learning_rate": 8.472400513478818e-08, + "loss": 6.9076, + "step": 22 + }, + { + "epoch": 0.029525032092426188, + "grad_norm": 18.829126358032227, + "learning_rate": 8.857509627727857e-08, + "loss": 7.0696, + "step": 23 + }, + { + "epoch": 0.03080872913992298, + "grad_norm": 18.97662925720215, + "learning_rate": 
9.242618741976894e-08, + "loss": 7.0946, + "step": 24 + }, + { + "epoch": 0.03209242618741977, + "grad_norm": 18.850385665893555, + "learning_rate": 9.627727856225932e-08, + "loss": 7.0042, + "step": 25 + }, + { + "epoch": 0.03337612323491656, + "grad_norm": 19.03487205505371, + "learning_rate": 1.0012836970474969e-07, + "loss": 7.0037, + "step": 26 + }, + { + "epoch": 0.03465982028241335, + "grad_norm": 19.012073516845703, + "learning_rate": 1.0397946084724006e-07, + "loss": 7.0155, + "step": 27 + }, + { + "epoch": 0.03594351732991014, + "grad_norm": 19.70283317565918, + "learning_rate": 1.0783055198973042e-07, + "loss": 7.1265, + "step": 28 + }, + { + "epoch": 0.037227214377406934, + "grad_norm": 19.28377342224121, + "learning_rate": 1.1168164313222079e-07, + "loss": 7.0238, + "step": 29 + }, + { + "epoch": 0.038510911424903725, + "grad_norm": 18.43764305114746, + "learning_rate": 1.1553273427471118e-07, + "loss": 6.8514, + "step": 30 + }, + { + "epoch": 0.03979460847240052, + "grad_norm": 18.511457443237305, + "learning_rate": 1.1938382541720153e-07, + "loss": 6.7892, + "step": 31 + }, + { + "epoch": 0.0410783055198973, + "grad_norm": 18.69145965576172, + "learning_rate": 1.2323491655969191e-07, + "loss": 6.8359, + "step": 32 + }, + { + "epoch": 0.04236200256739409, + "grad_norm": 19.1663818359375, + "learning_rate": 1.2708600770218227e-07, + "loss": 6.9343, + "step": 33 + }, + { + "epoch": 0.043645699614890884, + "grad_norm": 18.933828353881836, + "learning_rate": 1.3093709884467265e-07, + "loss": 6.7797, + "step": 34 + }, + { + "epoch": 0.044929396662387676, + "grad_norm": 18.77761459350586, + "learning_rate": 1.3478818998716303e-07, + "loss": 6.8204, + "step": 35 + }, + { + "epoch": 0.04621309370988447, + "grad_norm": 18.645185470581055, + "learning_rate": 1.3863928112965342e-07, + "loss": 6.7408, + "step": 36 + }, + { + "epoch": 0.04749679075738126, + "grad_norm": 18.545621871948242, + "learning_rate": 1.4249037227214377e-07, + "loss": 6.6868, + "step": 37 + 
}, + { + "epoch": 0.04878048780487805, + "grad_norm": 19.080093383789062, + "learning_rate": 1.4634146341463415e-07, + "loss": 6.7747, + "step": 38 + }, + { + "epoch": 0.05006418485237484, + "grad_norm": 18.58884620666504, + "learning_rate": 1.5019255455712454e-07, + "loss": 6.6526, + "step": 39 + }, + { + "epoch": 0.051347881899871634, + "grad_norm": 18.594018936157227, + "learning_rate": 1.540436456996149e-07, + "loss": 6.6477, + "step": 40 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 18.952205657958984, + "learning_rate": 1.5789473684210527e-07, + "loss": 6.6794, + "step": 41 + }, + { + "epoch": 0.05391527599486521, + "grad_norm": 19.520898818969727, + "learning_rate": 1.6174582798459566e-07, + "loss": 6.7687, + "step": 42 + }, + { + "epoch": 0.055198973042362, + "grad_norm": 19.670276641845703, + "learning_rate": 1.65596919127086e-07, + "loss": 6.7709, + "step": 43 + }, + { + "epoch": 0.05648267008985879, + "grad_norm": 19.158912658691406, + "learning_rate": 1.6944801026957637e-07, + "loss": 6.6384, + "step": 44 + }, + { + "epoch": 0.057766367137355584, + "grad_norm": 19.482553482055664, + "learning_rate": 1.7329910141206675e-07, + "loss": 6.7088, + "step": 45 + }, + { + "epoch": 0.059050064184852376, + "grad_norm": 19.068878173828125, + "learning_rate": 1.7715019255455713e-07, + "loss": 6.5213, + "step": 46 + }, + { + "epoch": 0.06033376123234917, + "grad_norm": 18.148954391479492, + "learning_rate": 1.810012836970475e-07, + "loss": 6.3373, + "step": 47 + }, + { + "epoch": 0.06161745827984596, + "grad_norm": 18.164642333984375, + "learning_rate": 1.8485237483953787e-07, + "loss": 6.3373, + "step": 48 + }, + { + "epoch": 0.06290115532734275, + "grad_norm": 18.472932815551758, + "learning_rate": 1.8870346598202825e-07, + "loss": 6.3138, + "step": 49 + }, + { + "epoch": 0.06418485237483953, + "grad_norm": 17.000547409057617, + "learning_rate": 1.9255455712451863e-07, + "loss": 5.9205, + "step": 50 + }, + { + "epoch": 0.06546854942233633, + "grad_norm": 
26.757137298583984, + "learning_rate": 1.96405648267009e-07, + "loss": 8.6358, + "step": 51 + }, + { + "epoch": 0.06675224646983312, + "grad_norm": 24.46841812133789, + "learning_rate": 2.0025673940949937e-07, + "loss": 7.9946, + "step": 52 + }, + { + "epoch": 0.06803594351732992, + "grad_norm": 23.499061584472656, + "learning_rate": 2.0410783055198975e-07, + "loss": 7.6998, + "step": 53 + }, + { + "epoch": 0.0693196405648267, + "grad_norm": 22.740497589111328, + "learning_rate": 2.079589216944801e-07, + "loss": 7.4934, + "step": 54 + }, + { + "epoch": 0.07060333761232349, + "grad_norm": 21.4111270904541, + "learning_rate": 2.1181001283697047e-07, + "loss": 7.1555, + "step": 55 + }, + { + "epoch": 0.07188703465982028, + "grad_norm": 21.672393798828125, + "learning_rate": 2.1566110397946085e-07, + "loss": 7.2315, + "step": 56 + }, + { + "epoch": 0.07317073170731707, + "grad_norm": 22.032957077026367, + "learning_rate": 2.1951219512195123e-07, + "loss": 7.2371, + "step": 57 + }, + { + "epoch": 0.07445442875481387, + "grad_norm": 22.619094848632812, + "learning_rate": 2.2336328626444159e-07, + "loss": 7.2931, + "step": 58 + }, + { + "epoch": 0.07573812580231065, + "grad_norm": 21.71796417236328, + "learning_rate": 2.2721437740693197e-07, + "loss": 7.1159, + "step": 59 + }, + { + "epoch": 0.07702182284980745, + "grad_norm": 22.30376434326172, + "learning_rate": 2.3106546854942235e-07, + "loss": 7.1128, + "step": 60 + }, + { + "epoch": 0.07830551989730423, + "grad_norm": 22.50465965270996, + "learning_rate": 2.3491655969191273e-07, + "loss": 7.1347, + "step": 61 + }, + { + "epoch": 0.07958921694480103, + "grad_norm": 22.50840187072754, + "learning_rate": 2.3876765083440306e-07, + "loss": 7.0818, + "step": 62 + }, + { + "epoch": 0.08087291399229782, + "grad_norm": 21.543643951416016, + "learning_rate": 2.4261874197689344e-07, + "loss": 6.8609, + "step": 63 + }, + { + "epoch": 0.0821566110397946, + "grad_norm": 21.832979202270508, + "learning_rate": 
2.4646983311938383e-07, + "loss": 6.8845, + "step": 64 + }, + { + "epoch": 0.0834403080872914, + "grad_norm": 22.5042781829834, + "learning_rate": 2.503209242618742e-07, + "loss": 6.9685, + "step": 65 + }, + { + "epoch": 0.08472400513478819, + "grad_norm": 21.90555763244629, + "learning_rate": 2.5417201540436454e-07, + "loss": 6.8761, + "step": 66 + }, + { + "epoch": 0.08600770218228498, + "grad_norm": 21.853771209716797, + "learning_rate": 2.58023106546855e-07, + "loss": 6.7529, + "step": 67 + }, + { + "epoch": 0.08729139922978177, + "grad_norm": 22.25796890258789, + "learning_rate": 2.618741976893453e-07, + "loss": 6.7835, + "step": 68 + }, + { + "epoch": 0.08857509627727857, + "grad_norm": 22.163890838623047, + "learning_rate": 2.6572528883183574e-07, + "loss": 6.7643, + "step": 69 + }, + { + "epoch": 0.08985879332477535, + "grad_norm": 21.976062774658203, + "learning_rate": 2.6957637997432607e-07, + "loss": 6.6603, + "step": 70 + }, + { + "epoch": 0.09114249037227215, + "grad_norm": 21.436439514160156, + "learning_rate": 2.7342747111681645e-07, + "loss": 6.5395, + "step": 71 + }, + { + "epoch": 0.09242618741976893, + "grad_norm": 22.0313663482666, + "learning_rate": 2.7727856225930683e-07, + "loss": 6.6215, + "step": 72 + }, + { + "epoch": 0.09370988446726572, + "grad_norm": 21.904924392700195, + "learning_rate": 2.8112965340179716e-07, + "loss": 6.5459, + "step": 73 + }, + { + "epoch": 0.09499358151476252, + "grad_norm": 22.789369583129883, + "learning_rate": 2.8498074454428754e-07, + "loss": 6.6724, + "step": 74 + }, + { + "epoch": 0.0962772785622593, + "grad_norm": 21.565170288085938, + "learning_rate": 2.888318356867779e-07, + "loss": 6.4099, + "step": 75 + }, + { + "epoch": 0.0975609756097561, + "grad_norm": 22.226415634155273, + "learning_rate": 2.926829268292683e-07, + "loss": 6.4872, + "step": 76 + }, + { + "epoch": 0.09884467265725289, + "grad_norm": 22.05906867980957, + "learning_rate": 2.9653401797175864e-07, + "loss": 6.412, + "step": 77 + }, + { + 
"epoch": 0.10012836970474968, + "grad_norm": 22.02336311340332, + "learning_rate": 3.0038510911424907e-07, + "loss": 6.3664, + "step": 78 + }, + { + "epoch": 0.10141206675224647, + "grad_norm": 22.63691520690918, + "learning_rate": 3.042362002567394e-07, + "loss": 6.3829, + "step": 79 + }, + { + "epoch": 0.10269576379974327, + "grad_norm": 23.02330207824707, + "learning_rate": 3.080872913992298e-07, + "loss": 6.4389, + "step": 80 + }, + { + "epoch": 0.10397946084724005, + "grad_norm": 22.768295288085938, + "learning_rate": 3.1193838254172017e-07, + "loss": 6.3512, + "step": 81 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 23.320693969726562, + "learning_rate": 3.1578947368421055e-07, + "loss": 6.3854, + "step": 82 + }, + { + "epoch": 0.10654685494223363, + "grad_norm": 24.200546264648438, + "learning_rate": 3.196405648267009e-07, + "loss": 6.4797, + "step": 83 + }, + { + "epoch": 0.10783055198973042, + "grad_norm": 22.854984283447266, + "learning_rate": 3.234916559691913e-07, + "loss": 6.2464, + "step": 84 + }, + { + "epoch": 0.10911424903722722, + "grad_norm": 22.600597381591797, + "learning_rate": 3.2734274711168164e-07, + "loss": 6.1901, + "step": 85 + }, + { + "epoch": 0.110397946084724, + "grad_norm": 23.933935165405273, + "learning_rate": 3.31193838254172e-07, + "loss": 6.2964, + "step": 86 + }, + { + "epoch": 0.1116816431322208, + "grad_norm": 24.567317962646484, + "learning_rate": 3.350449293966624e-07, + "loss": 6.3571, + "step": 87 + }, + { + "epoch": 0.11296534017971759, + "grad_norm": 24.403837203979492, + "learning_rate": 3.3889602053915274e-07, + "loss": 6.2616, + "step": 88 + }, + { + "epoch": 0.11424903722721438, + "grad_norm": 23.295124053955078, + "learning_rate": 3.4274711168164317e-07, + "loss": 6.0965, + "step": 89 + }, + { + "epoch": 0.11553273427471117, + "grad_norm": 23.897069931030273, + "learning_rate": 3.465982028241335e-07, + "loss": 6.1361, + "step": 90 + }, + { + "epoch": 0.11681643132220795, + "grad_norm": 
24.950023651123047, + "learning_rate": 3.504492939666239e-07, + "loss": 6.1944, + "step": 91 + }, + { + "epoch": 0.11810012836970475, + "grad_norm": 23.701839447021484, + "learning_rate": 3.5430038510911426e-07, + "loss": 6.037, + "step": 92 + }, + { + "epoch": 0.11938382541720154, + "grad_norm": 24.530427932739258, + "learning_rate": 3.5815147625160465e-07, + "loss": 6.0525, + "step": 93 + }, + { + "epoch": 0.12066752246469833, + "grad_norm": 23.55308723449707, + "learning_rate": 3.62002567394095e-07, + "loss": 5.9046, + "step": 94 + }, + { + "epoch": 0.12195121951219512, + "grad_norm": 23.804170608520508, + "learning_rate": 3.658536585365854e-07, + "loss": 5.912, + "step": 95 + }, + { + "epoch": 0.12323491655969192, + "grad_norm": 24.240203857421875, + "learning_rate": 3.6970474967907574e-07, + "loss": 5.8934, + "step": 96 + }, + { + "epoch": 0.1245186136071887, + "grad_norm": 23.929492950439453, + "learning_rate": 3.7355584082156607e-07, + "loss": 5.8405, + "step": 97 + }, + { + "epoch": 0.1258023106546855, + "grad_norm": 23.338584899902344, + "learning_rate": 3.774069319640565e-07, + "loss": 5.7144, + "step": 98 + }, + { + "epoch": 0.12708600770218229, + "grad_norm": NaN, + "learning_rate": 3.774069319640565e-07, + "loss": 5.5659, + "step": 99 + }, + { + "epoch": 0.12836970474967907, + "grad_norm": 20.151845932006836, + "learning_rate": 3.8125802310654683e-07, + "loss": 5.3376, + "step": 100 + }, + { + "epoch": 0.12965340179717585, + "grad_norm": 42.2439079284668, + "learning_rate": 3.8510911424903727e-07, + "loss": 7.7044, + "step": 101 + }, + { + "epoch": 0.13093709884467267, + "grad_norm": 36.17237854003906, + "learning_rate": 3.889602053915276e-07, + "loss": 6.982, + "step": 102 + }, + { + "epoch": 0.13222079589216945, + "grad_norm": 34.319515228271484, + "learning_rate": 3.92811296534018e-07, + "loss": 6.7211, + "step": 103 + }, + { + "epoch": 0.13350449293966624, + "grad_norm": 33.82362365722656, + "learning_rate": 3.9666238767650836e-07, + "loss": 
6.6173, + "step": 104 + }, + { + "epoch": 0.13478818998716302, + "grad_norm": 31.72848892211914, + "learning_rate": 4.0051347881899875e-07, + "loss": 6.3516, + "step": 105 + }, + { + "epoch": 0.13607188703465983, + "grad_norm": 32.95777130126953, + "learning_rate": 4.043645699614891e-07, + "loss": 6.3796, + "step": 106 + }, + { + "epoch": 0.13735558408215662, + "grad_norm": 33.49725341796875, + "learning_rate": 4.082156611039795e-07, + "loss": 6.4071, + "step": 107 + }, + { + "epoch": 0.1386392811296534, + "grad_norm": 31.753347396850586, + "learning_rate": 4.1206675224646984e-07, + "loss": 6.184, + "step": 108 + }, + { + "epoch": 0.1399229781771502, + "grad_norm": 31.021516799926758, + "learning_rate": 4.159178433889602e-07, + "loss": 6.0475, + "step": 109 + }, + { + "epoch": 0.14120667522464697, + "grad_norm": 33.10489273071289, + "learning_rate": 4.197689345314506e-07, + "loss": 6.1509, + "step": 110 + }, + { + "epoch": 0.14249037227214378, + "grad_norm": 32.31212615966797, + "learning_rate": 4.2362002567394093e-07, + "loss": 6.0293, + "step": 111 + }, + { + "epoch": 0.14377406931964057, + "grad_norm": 32.418331146240234, + "learning_rate": 4.274711168164313e-07, + "loss": 5.9888, + "step": 112 + }, + { + "epoch": 0.14505776636713735, + "grad_norm": 31.62362289428711, + "learning_rate": 4.313222079589217e-07, + "loss": 5.8801, + "step": 113 + }, + { + "epoch": 0.14634146341463414, + "grad_norm": 29.90801239013672, + "learning_rate": 4.351732991014121e-07, + "loss": 5.6868, + "step": 114 + }, + { + "epoch": 0.14762516046213095, + "grad_norm": 30.84420394897461, + "learning_rate": 4.3902439024390246e-07, + "loss": 5.7115, + "step": 115 + }, + { + "epoch": 0.14890885750962773, + "grad_norm": 31.115346908569336, + "learning_rate": 4.4287548138639284e-07, + "loss": 5.7102, + "step": 116 + }, + { + "epoch": 0.15019255455712452, + "grad_norm": 30.684629440307617, + "learning_rate": 4.4672657252888317e-07, + "loss": 5.6302, + "step": 117 + }, + { + "epoch": 
0.1514762516046213, + "grad_norm": 31.33417320251465, + "learning_rate": 4.505776636713736e-07, + "loss": 5.6457, + "step": 118 + }, + { + "epoch": 0.1527599486521181, + "grad_norm": 30.707128524780273, + "learning_rate": 4.5442875481386394e-07, + "loss": 5.5676, + "step": 119 + }, + { + "epoch": 0.1540436456996149, + "grad_norm": 30.215208053588867, + "learning_rate": 4.582798459563543e-07, + "loss": 5.4736, + "step": 120 + }, + { + "epoch": 0.15532734274711169, + "grad_norm": 32.085628509521484, + "learning_rate": 4.621309370988447e-07, + "loss": 5.5599, + "step": 121 + }, + { + "epoch": 0.15661103979460847, + "grad_norm": 30.307817459106445, + "learning_rate": 4.6598202824133503e-07, + "loss": 5.4184, + "step": 122 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 30.24565887451172, + "learning_rate": 4.6983311938382547e-07, + "loss": 5.3698, + "step": 123 + }, + { + "epoch": 0.15917843388960207, + "grad_norm": 30.069122314453125, + "learning_rate": 4.736842105263158e-07, + "loss": 5.3172, + "step": 124 + }, + { + "epoch": 0.16046213093709885, + "grad_norm": 29.60153579711914, + "learning_rate": 4.775353016688061e-07, + "loss": 5.2648, + "step": 125 + }, + { + "epoch": 0.16174582798459564, + "grad_norm": 29.234619140625, + "learning_rate": 4.813863928112966e-07, + "loss": 5.1946, + "step": 126 + }, + { + "epoch": 0.16302952503209242, + "grad_norm": 29.561168670654297, + "learning_rate": 4.852374839537869e-07, + "loss": 5.1906, + "step": 127 + }, + { + "epoch": 0.1643132220795892, + "grad_norm": 28.44255828857422, + "learning_rate": 4.890885750962773e-07, + "loss": 5.0867, + "step": 128 + }, + { + "epoch": 0.16559691912708602, + "grad_norm": 28.542892456054688, + "learning_rate": 4.929396662387677e-07, + "loss": 5.0616, + "step": 129 + }, + { + "epoch": 0.1668806161745828, + "grad_norm": 28.772489547729492, + "learning_rate": 4.967907573812581e-07, + "loss": 5.0415, + "step": 130 + }, + { + "epoch": 0.1681643132220796, + "grad_norm": 28.33892059326172, + 
"learning_rate": 5.006418485237484e-07, + "loss": 5.003, + "step": 131 + }, + { + "epoch": 0.16944801026957637, + "grad_norm": 28.298683166503906, + "learning_rate": 5.044929396662387e-07, + "loss": 4.9736, + "step": 132 + }, + { + "epoch": 0.17073170731707318, + "grad_norm": 26.346548080444336, + "learning_rate": 5.083440308087291e-07, + "loss": 4.8456, + "step": 133 + }, + { + "epoch": 0.17201540436456997, + "grad_norm": 27.50910758972168, + "learning_rate": 5.121951219512195e-07, + "loss": 4.8633, + "step": 134 + }, + { + "epoch": 0.17329910141206675, + "grad_norm": 25.500120162963867, + "learning_rate": 5.1604621309371e-07, + "loss": 4.7519, + "step": 135 + }, + { + "epoch": 0.17458279845956354, + "grad_norm": 25.66370391845703, + "learning_rate": 5.198973042362003e-07, + "loss": 4.7337, + "step": 136 + }, + { + "epoch": 0.17586649550706032, + "grad_norm": 23.77637481689453, + "learning_rate": 5.237483953786906e-07, + "loss": 4.6237, + "step": 137 + }, + { + "epoch": 0.17715019255455713, + "grad_norm": 23.799057006835938, + "learning_rate": 5.275994865211809e-07, + "loss": 4.6362, + "step": 138 + }, + { + "epoch": 0.17843388960205392, + "grad_norm": 24.04403305053711, + "learning_rate": 5.314505776636715e-07, + "loss": 4.6148, + "step": 139 + }, + { + "epoch": 0.1797175866495507, + "grad_norm": 24.804710388183594, + "learning_rate": 5.353016688061618e-07, + "loss": 4.5999, + "step": 140 + }, + { + "epoch": 0.1810012836970475, + "grad_norm": 23.099361419677734, + "learning_rate": 5.391527599486521e-07, + "loss": 4.5428, + "step": 141 + }, + { + "epoch": 0.1822849807445443, + "grad_norm": 20.342622756958008, + "learning_rate": 5.430038510911425e-07, + "loss": 4.4497, + "step": 142 + }, + { + "epoch": 0.18356867779204109, + "grad_norm": 20.395980834960938, + "learning_rate": 5.468549422336329e-07, + "loss": 4.4268, + "step": 143 + }, + { + "epoch": 0.18485237483953787, + "grad_norm": 18.99181365966797, + "learning_rate": 5.507060333761232e-07, + "loss": 4.3803, + 
"step": 144 + }, + { + "epoch": 0.18613607188703465, + "grad_norm": 17.128931045532227, + "learning_rate": 5.545571245186137e-07, + "loss": 4.3168, + "step": 145 + }, + { + "epoch": 0.18741976893453144, + "grad_norm": 16.001495361328125, + "learning_rate": 5.58408215661104e-07, + "loss": 4.2653, + "step": 146 + }, + { + "epoch": 0.18870346598202825, + "grad_norm": 15.398704528808594, + "learning_rate": 5.622593068035943e-07, + "loss": 4.2551, + "step": 147 + }, + { + "epoch": 0.18998716302952504, + "grad_norm": 14.399839401245117, + "learning_rate": 5.661103979460848e-07, + "loss": 4.239, + "step": 148 + }, + { + "epoch": 0.19127086007702182, + "grad_norm": 11.779078483581543, + "learning_rate": 5.699614890885751e-07, + "loss": 4.1613, + "step": 149 + }, + { + "epoch": 0.1925545571245186, + "grad_norm": 10.946303367614746, + "learning_rate": 5.738125802310654e-07, + "loss": 4.1447, + "step": 150 + }, + { + "epoch": 0.19383825417201542, + "grad_norm": 27.648740768432617, + "learning_rate": 5.776636713735559e-07, + "loss": 4.4788, + "step": 151 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 22.141569137573242, + "learning_rate": 5.815147625160463e-07, + "loss": 4.3075, + "step": 152 + }, + { + "epoch": 0.196405648267009, + "grad_norm": 16.920921325683594, + "learning_rate": 5.853658536585366e-07, + "loss": 4.2103, + "step": 153 + }, + { + "epoch": 0.19768934531450577, + "grad_norm": 13.896105766296387, + "learning_rate": 5.892169448010269e-07, + "loss": 4.1479, + "step": 154 + }, + { + "epoch": 0.19897304236200256, + "grad_norm": 13.339458465576172, + "learning_rate": 5.930680359435173e-07, + "loss": 4.1475, + "step": 155 + }, + { + "epoch": 0.20025673940949937, + "grad_norm": 10.088788032531738, + "learning_rate": 5.969191270860078e-07, + "loss": 4.0908, + "step": 156 + }, + { + "epoch": 0.20154043645699615, + "grad_norm": 8.751063346862793, + "learning_rate": 6.007702182284981e-07, + "loss": 4.0528, + "step": 157 + }, + { + "epoch": 0.20282413350449294, + 
"grad_norm": 8.005382537841797, + "learning_rate": 6.046213093709885e-07, + "loss": 4.0569, + "step": 158 + }, + { + "epoch": 0.20410783055198972, + "grad_norm": 7.591683387756348, + "learning_rate": 6.084724005134788e-07, + "loss": 4.0394, + "step": 159 + }, + { + "epoch": 0.20539152759948653, + "grad_norm": 7.789794921875, + "learning_rate": 6.123234916559691e-07, + "loss": 4.0156, + "step": 160 + }, + { + "epoch": 0.20667522464698332, + "grad_norm": 8.084864616394043, + "learning_rate": 6.161745827984596e-07, + "loss": 3.996, + "step": 161 + }, + { + "epoch": 0.2079589216944801, + "grad_norm": 8.138855934143066, + "learning_rate": 6.2002567394095e-07, + "loss": 4.0022, + "step": 162 + }, + { + "epoch": 0.2092426187419769, + "grad_norm": 8.559894561767578, + "learning_rate": 6.238767650834403e-07, + "loss": 3.9854, + "step": 163 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 8.370992660522461, + "learning_rate": 6.277278562259307e-07, + "loss": 3.9865, + "step": 164 + }, + { + "epoch": 0.21181001283697048, + "grad_norm": 8.984065055847168, + "learning_rate": 6.315789473684211e-07, + "loss": 3.9426, + "step": 165 + }, + { + "epoch": 0.21309370988446727, + "grad_norm": 8.319439888000488, + "learning_rate": 6.354300385109114e-07, + "loss": 3.933, + "step": 166 + }, + { + "epoch": 0.21437740693196405, + "grad_norm": 7.762672424316406, + "learning_rate": 6.392811296534018e-07, + "loss": 3.9126, + "step": 167 + }, + { + "epoch": 0.21566110397946084, + "grad_norm": 7.059142589569092, + "learning_rate": 6.431322207958922e-07, + "loss": 3.9023, + "step": 168 + }, + { + "epoch": 0.21694480102695765, + "grad_norm": 6.611874103546143, + "learning_rate": 6.469833119383826e-07, + "loss": 3.8951, + "step": 169 + }, + { + "epoch": 0.21822849807445444, + "grad_norm": 6.356005668640137, + "learning_rate": 6.50834403080873e-07, + "loss": 3.8722, + "step": 170 + }, + { + "epoch": 0.21951219512195122, + "grad_norm": 6.268534183502197, + "learning_rate": 
6.546854942233633e-07, + "loss": 3.8689, + "step": 171 + }, + { + "epoch": 0.220795892169448, + "grad_norm": 6.19456148147583, + "learning_rate": 6.585365853658536e-07, + "loss": 3.8433, + "step": 172 + }, + { + "epoch": 0.2220795892169448, + "grad_norm": 5.93058443069458, + "learning_rate": 6.62387676508344e-07, + "loss": 3.8319, + "step": 173 + }, + { + "epoch": 0.2233632862644416, + "grad_norm": 5.988935470581055, + "learning_rate": 6.662387676508345e-07, + "loss": 3.8322, + "step": 174 + }, + { + "epoch": 0.2246469833119384, + "grad_norm": 6.126948833465576, + "learning_rate": 6.700898587933248e-07, + "loss": 3.8223, + "step": 175 + }, + { + "epoch": 0.22593068035943517, + "grad_norm": 5.137011528015137, + "learning_rate": 6.739409499358151e-07, + "loss": 3.7999, + "step": 176 + }, + { + "epoch": 0.22721437740693196, + "grad_norm": 4.9252166748046875, + "learning_rate": 6.777920410783055e-07, + "loss": 3.7852, + "step": 177 + }, + { + "epoch": 0.22849807445442877, + "grad_norm": 4.860875606536865, + "learning_rate": 6.816431322207959e-07, + "loss": 3.7723, + "step": 178 + }, + { + "epoch": 0.22978177150192555, + "grad_norm": 4.579726219177246, + "learning_rate": 6.854942233632863e-07, + "loss": 3.772, + "step": 179 + }, + { + "epoch": 0.23106546854942234, + "grad_norm": 5.291154861450195, + "learning_rate": 6.893453145057767e-07, + "loss": 3.752, + "step": 180 + }, + { + "epoch": 0.23234916559691912, + "grad_norm": 4.533776760101318, + "learning_rate": 6.93196405648267e-07, + "loss": 3.7348, + "step": 181 + }, + { + "epoch": 0.2336328626444159, + "grad_norm": 4.748323440551758, + "learning_rate": 6.970474967907574e-07, + "loss": 3.7372, + "step": 182 + }, + { + "epoch": 0.23491655969191272, + "grad_norm": 4.5923380851745605, + "learning_rate": 7.008985879332478e-07, + "loss": 3.727, + "step": 183 + }, + { + "epoch": 0.2362002567394095, + "grad_norm": 4.573209762573242, + "learning_rate": 7.047496790757382e-07, + "loss": 3.7111, + "step": 184 + }, + { + "epoch": 
0.2374839537869063, + "grad_norm": 4.096446514129639, + "learning_rate": 7.086007702182285e-07, + "loss": 3.6976, + "step": 185 + }, + { + "epoch": 0.23876765083440307, + "grad_norm": 3.9917752742767334, + "learning_rate": 7.124518613607189e-07, + "loss": 3.6943, + "step": 186 + }, + { + "epoch": 0.24005134788189988, + "grad_norm": 4.115462303161621, + "learning_rate": 7.163029525032093e-07, + "loss": 3.675, + "step": 187 + }, + { + "epoch": 0.24133504492939667, + "grad_norm": 4.121078014373779, + "learning_rate": 7.201540436456996e-07, + "loss": 3.6585, + "step": 188 + }, + { + "epoch": 0.24261874197689345, + "grad_norm": 4.0298991203308105, + "learning_rate": 7.2400513478819e-07, + "loss": 3.6545, + "step": 189 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 4.567723751068115, + "learning_rate": 7.278562259306804e-07, + "loss": 3.6615, + "step": 190 + }, + { + "epoch": 0.24518613607188702, + "grad_norm": 3.5754239559173584, + "learning_rate": 7.317073170731708e-07, + "loss": 3.6209, + "step": 191 + }, + { + "epoch": 0.24646983311938384, + "grad_norm": 3.6776413917541504, + "learning_rate": 7.355584082156612e-07, + "loss": 3.6184, + "step": 192 + }, + { + "epoch": 0.24775353016688062, + "grad_norm": 3.6289355754852295, + "learning_rate": 7.394094993581515e-07, + "loss": 3.6027, + "step": 193 + }, + { + "epoch": 0.2490372272143774, + "grad_norm": 3.3749172687530518, + "learning_rate": 7.432605905006418e-07, + "loss": 3.609, + "step": 194 + }, + { + "epoch": 0.2503209242618742, + "grad_norm": 3.633855104446411, + "learning_rate": 7.471116816431321e-07, + "loss": 3.5807, + "step": 195 + }, + { + "epoch": 0.251604621309371, + "grad_norm": 3.375627040863037, + "learning_rate": 7.509627727856227e-07, + "loss": 3.5965, + "step": 196 + }, + { + "epoch": 0.25288831835686776, + "grad_norm": 3.5195882320404053, + "learning_rate": 7.54813863928113e-07, + "loss": 3.5877, + "step": 197 + }, + { + "epoch": 0.25417201540436457, + "grad_norm": 3.8019325733184814, + 
"learning_rate": 7.586649550706033e-07, + "loss": 3.5565, + "step": 198 + }, + { + "epoch": 0.2554557124518614, + "grad_norm": 3.853597640991211, + "learning_rate": 7.625160462130937e-07, + "loss": 3.5414, + "step": 199 + }, + { + "epoch": 0.25673940949935814, + "grad_norm": 5.801620960235596, + "learning_rate": 7.663671373555841e-07, + "loss": 3.5471, + "step": 200 + }, + { + "epoch": 0.25802310654685495, + "grad_norm": 23.119701385498047, + "learning_rate": 7.702182284980745e-07, + "loss": 3.7121, + "step": 201 + }, + { + "epoch": 0.2593068035943517, + "grad_norm": 15.82883071899414, + "learning_rate": 7.740693196405649e-07, + "loss": 3.6294, + "step": 202 + }, + { + "epoch": 0.2605905006418485, + "grad_norm": 12.630388259887695, + "learning_rate": 7.779204107830552e-07, + "loss": 3.5797, + "step": 203 + }, + { + "epoch": 0.26187419768934533, + "grad_norm": 12.007905006408691, + "learning_rate": 7.817715019255456e-07, + "loss": 3.5711, + "step": 204 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 10.191715240478516, + "learning_rate": 7.85622593068036e-07, + "loss": 3.5777, + "step": 205 + }, + { + "epoch": 0.2644415917843389, + "grad_norm": 6.7641754150390625, + "learning_rate": 7.894736842105263e-07, + "loss": 3.5446, + "step": 206 + }, + { + "epoch": 0.26572528883183566, + "grad_norm": 4.6238884925842285, + "learning_rate": 7.933247753530167e-07, + "loss": 3.5115, + "step": 207 + }, + { + "epoch": 0.26700898587933247, + "grad_norm": 3.9241538047790527, + "learning_rate": 7.971758664955071e-07, + "loss": 3.5216, + "step": 208 + }, + { + "epoch": 0.2682926829268293, + "grad_norm": 2.842366933822632, + "learning_rate": 8.010269576379975e-07, + "loss": 3.5094, + "step": 209 + }, + { + "epoch": 0.26957637997432604, + "grad_norm": 4.873263835906982, + "learning_rate": 8.048780487804878e-07, + "loss": 3.5105, + "step": 210 + }, + { + "epoch": 0.27086007702182285, + "grad_norm": 6.30509614944458, + "learning_rate": 8.087291399229781e-07, + "loss": 3.4995, + 
"step": 211 + }, + { + "epoch": 0.27214377406931967, + "grad_norm": 7.079239368438721, + "learning_rate": 8.125802310654685e-07, + "loss": 3.5123, + "step": 212 + }, + { + "epoch": 0.2734274711168164, + "grad_norm": 6.813809871673584, + "learning_rate": 8.16431322207959e-07, + "loss": 3.503, + "step": 213 + }, + { + "epoch": 0.27471116816431324, + "grad_norm": 8.319632530212402, + "learning_rate": 8.202824133504493e-07, + "loss": 3.5104, + "step": 214 + }, + { + "epoch": 0.27599486521181, + "grad_norm": 6.521505355834961, + "learning_rate": 8.241335044929397e-07, + "loss": 3.4858, + "step": 215 + }, + { + "epoch": 0.2772785622593068, + "grad_norm": 5.21783447265625, + "learning_rate": 8.2798459563543e-07, + "loss": 3.4819, + "step": 216 + }, + { + "epoch": 0.2785622593068036, + "grad_norm": 2.893491506576538, + "learning_rate": 8.318356867779204e-07, + "loss": 3.4858, + "step": 217 + }, + { + "epoch": 0.2798459563543004, + "grad_norm": 2.1085124015808105, + "learning_rate": 8.356867779204109e-07, + "loss": 3.4701, + "step": 218 + }, + { + "epoch": 0.2811296534017972, + "grad_norm": 2.3387696743011475, + "learning_rate": 8.395378690629012e-07, + "loss": 3.4642, + "step": 219 + }, + { + "epoch": 0.28241335044929394, + "grad_norm": 4.081897735595703, + "learning_rate": 8.433889602053915e-07, + "loss": 3.4635, + "step": 220 + }, + { + "epoch": 0.28369704749679076, + "grad_norm": 4.365902900695801, + "learning_rate": 8.472400513478819e-07, + "loss": 3.4551, + "step": 221 + }, + { + "epoch": 0.28498074454428757, + "grad_norm": 4.514154434204102, + "learning_rate": 8.510911424903723e-07, + "loss": 3.4676, + "step": 222 + }, + { + "epoch": 0.2862644415917843, + "grad_norm": 6.363986015319824, + "learning_rate": 8.549422336328626e-07, + "loss": 3.4579, + "step": 223 + }, + { + "epoch": 0.28754813863928114, + "grad_norm": 3.433619976043701, + "learning_rate": 8.587933247753531e-07, + "loss": 3.4558, + "step": 224 + }, + { + "epoch": 0.2888318356867779, + "grad_norm": 
1.982452392578125, + "learning_rate": 8.626444159178434e-07, + "loss": 3.4351, + "step": 225 + }, + { + "epoch": 0.2901155327342747, + "grad_norm": 2.435988187789917, + "learning_rate": 8.664955070603338e-07, + "loss": 3.4469, + "step": 226 + }, + { + "epoch": 0.2913992297817715, + "grad_norm": 2.622964382171631, + "learning_rate": 8.703465982028242e-07, + "loss": 3.4348, + "step": 227 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 2.22902774810791, + "learning_rate": 8.741976893453145e-07, + "loss": 3.4472, + "step": 228 + }, + { + "epoch": 0.2939666238767651, + "grad_norm": 3.2561194896698, + "learning_rate": 8.780487804878049e-07, + "loss": 3.4253, + "step": 229 + }, + { + "epoch": 0.2952503209242619, + "grad_norm": 1.8927252292633057, + "learning_rate": 8.818998716302953e-07, + "loss": 3.4384, + "step": 230 + }, + { + "epoch": 0.29653401797175866, + "grad_norm": 1.6343789100646973, + "learning_rate": 8.857509627727857e-07, + "loss": 3.4342, + "step": 231 + }, + { + "epoch": 0.29781771501925547, + "grad_norm": 1.8646332025527954, + "learning_rate": 8.89602053915276e-07, + "loss": 3.4523, + "step": 232 + }, + { + "epoch": 0.2991014120667522, + "grad_norm": 2.4403603076934814, + "learning_rate": 8.934531450577663e-07, + "loss": 3.4247, + "step": 233 + }, + { + "epoch": 0.30038510911424904, + "grad_norm": 3.48620343208313, + "learning_rate": 8.973042362002567e-07, + "loss": 3.4276, + "step": 234 + }, + { + "epoch": 0.30166880616174585, + "grad_norm": 1.9550559520721436, + "learning_rate": 9.011553273427472e-07, + "loss": 3.4139, + "step": 235 + }, + { + "epoch": 0.3029525032092426, + "grad_norm": 1.536184310913086, + "learning_rate": 9.050064184852375e-07, + "loss": 3.416, + "step": 236 + }, + { + "epoch": 0.3042362002567394, + "grad_norm": 1.7124470472335815, + "learning_rate": 9.088575096277279e-07, + "loss": 3.4058, + "step": 237 + }, + { + "epoch": 0.3055198973042362, + "grad_norm": 1.4909348487854004, + "learning_rate": 9.127086007702182e-07, + "loss": 
3.3883, + "step": 238 + }, + { + "epoch": 0.306803594351733, + "grad_norm": 5.2078142166137695, + "learning_rate": 9.165596919127086e-07, + "loss": 3.4008, + "step": 239 + }, + { + "epoch": 0.3080872913992298, + "grad_norm": 1.8005460500717163, + "learning_rate": 9.20410783055199e-07, + "loss": 3.3781, + "step": 240 + }, + { + "epoch": 0.30937098844672656, + "grad_norm": 1.9392553567886353, + "learning_rate": 9.242618741976894e-07, + "loss": 3.3809, + "step": 241 + }, + { + "epoch": 0.31065468549422337, + "grad_norm": 1.5816645622253418, + "learning_rate": 9.281129653401797e-07, + "loss": 3.3838, + "step": 242 + }, + { + "epoch": 0.3119383825417201, + "grad_norm": 1.3630691766738892, + "learning_rate": 9.319640564826701e-07, + "loss": 3.3844, + "step": 243 + }, + { + "epoch": 0.31322207958921694, + "grad_norm": 5.87333869934082, + "learning_rate": 9.358151476251605e-07, + "loss": 3.3939, + "step": 244 + }, + { + "epoch": 0.31450577663671375, + "grad_norm": 2.663943290710449, + "learning_rate": 9.396662387676509e-07, + "loss": 3.3785, + "step": 245 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 4.305357456207275, + "learning_rate": 9.435173299101413e-07, + "loss": 3.3777, + "step": 246 + }, + { + "epoch": 0.3170731707317073, + "grad_norm": 1.4998444318771362, + "learning_rate": 9.473684210526316e-07, + "loss": 3.3821, + "step": 247 + }, + { + "epoch": 0.31835686777920413, + "grad_norm": 3.4124948978424072, + "learning_rate": 9.51219512195122e-07, + "loss": 3.3806, + "step": 248 + }, + { + "epoch": 0.3196405648267009, + "grad_norm": 3.2989046573638916, + "learning_rate": 9.550706033376123e-07, + "loss": 3.3935, + "step": 249 + }, + { + "epoch": 0.3209242618741977, + "grad_norm": 5.360820293426514, + "learning_rate": 9.589216944801027e-07, + "loss": 3.3519, + "step": 250 + }, + { + "epoch": 0.32220795892169446, + "grad_norm": 27.14214324951172, + "learning_rate": 9.627727856225931e-07, + "loss": 3.5269, + "step": 251 + }, + { + "epoch": 0.32349165596919127, + 
"grad_norm": 20.670705795288086, + "learning_rate": 9.666238767650833e-07, + "loss": 3.4741, + "step": 252 + }, + { + "epoch": 0.3247753530166881, + "grad_norm": 17.14813232421875, + "learning_rate": 9.704749679075738e-07, + "loss": 3.4451, + "step": 253 + }, + { + "epoch": 0.32605905006418484, + "grad_norm": 13.764606475830078, + "learning_rate": 9.743260590500642e-07, + "loss": 3.435, + "step": 254 + }, + { + "epoch": 0.32734274711168165, + "grad_norm": 8.495406150817871, + "learning_rate": 9.781771501925546e-07, + "loss": 3.3957, + "step": 255 + }, + { + "epoch": 0.3286264441591784, + "grad_norm": 5.8587117195129395, + "learning_rate": 9.82028241335045e-07, + "loss": 3.3845, + "step": 256 + }, + { + "epoch": 0.3299101412066752, + "grad_norm": 2.3683226108551025, + "learning_rate": 9.858793324775353e-07, + "loss": 3.3785, + "step": 257 + }, + { + "epoch": 0.33119383825417203, + "grad_norm": 5.089169025421143, + "learning_rate": 9.897304236200257e-07, + "loss": 3.375, + "step": 258 + }, + { + "epoch": 0.3324775353016688, + "grad_norm": 6.798583984375, + "learning_rate": 9.935815147625162e-07, + "loss": 3.3807, + "step": 259 + }, + { + "epoch": 0.3337612323491656, + "grad_norm": 9.168935775756836, + "learning_rate": 9.974326059050064e-07, + "loss": 3.3857, + "step": 260 + }, + { + "epoch": 0.33504492939666236, + "grad_norm": 7.749726295471191, + "learning_rate": 1.0012836970474968e-06, + "loss": 3.367, + "step": 261 + }, + { + "epoch": 0.3363286264441592, + "grad_norm": 5.2261457443237305, + "learning_rate": 1.005134788189987e-06, + "loss": 3.3826, + "step": 262 + }, + { + "epoch": 0.337612323491656, + "grad_norm": 4.4711809158325195, + "learning_rate": 1.0089858793324775e-06, + "loss": 3.3641, + "step": 263 + }, + { + "epoch": 0.33889602053915274, + "grad_norm": 1.8908816576004028, + "learning_rate": 1.012836970474968e-06, + "loss": 3.3592, + "step": 264 + }, + { + "epoch": 0.34017971758664955, + "grad_norm": 3.5218827724456787, + "learning_rate": 
1.0166880616174582e-06, + "loss": 3.3641, + "step": 265 + }, + { + "epoch": 0.34146341463414637, + "grad_norm": 6.087357521057129, + "learning_rate": 1.0205391527599488e-06, + "loss": 3.3814, + "step": 266 + }, + { + "epoch": 0.3427471116816431, + "grad_norm": 5.132585048675537, + "learning_rate": 1.024390243902439e-06, + "loss": 3.3712, + "step": 267 + }, + { + "epoch": 0.34403080872913994, + "grad_norm": 5.377570152282715, + "learning_rate": 1.0282413350449295e-06, + "loss": 3.3674, + "step": 268 + }, + { + "epoch": 0.3453145057766367, + "grad_norm": 3.3792009353637695, + "learning_rate": 1.03209242618742e-06, + "loss": 3.353, + "step": 269 + }, + { + "epoch": 0.3465982028241335, + "grad_norm": 1.2656879425048828, + "learning_rate": 1.0359435173299101e-06, + "loss": 3.3583, + "step": 270 + }, + { + "epoch": 0.3478818998716303, + "grad_norm": 2.4479024410247803, + "learning_rate": 1.0397946084724006e-06, + "loss": 3.343, + "step": 271 + }, + { + "epoch": 0.3491655969191271, + "grad_norm": 3.967794418334961, + "learning_rate": 1.043645699614891e-06, + "loss": 3.357, + "step": 272 + }, + { + "epoch": 0.3504492939666239, + "grad_norm": 4.105884075164795, + "learning_rate": 1.0474967907573812e-06, + "loss": 3.3627, + "step": 273 + }, + { + "epoch": 0.35173299101412064, + "grad_norm": 2.7573158740997314, + "learning_rate": 1.0513478818998716e-06, + "loss": 3.3618, + "step": 274 + }, + { + "epoch": 0.35301668806161746, + "grad_norm": 1.6856857538223267, + "learning_rate": 1.0551989730423619e-06, + "loss": 3.3609, + "step": 275 + }, + { + "epoch": 0.35430038510911427, + "grad_norm": 3.3063294887542725, + "learning_rate": 1.0590500641848523e-06, + "loss": 3.3578, + "step": 276 + }, + { + "epoch": 0.355584082156611, + "grad_norm": 2.8285024166107178, + "learning_rate": 1.062901155327343e-06, + "loss": 3.3504, + "step": 277 + }, + { + "epoch": 0.35686777920410784, + "grad_norm": 2.774961471557617, + "learning_rate": 1.0667522464698332e-06, + "loss": 3.3311, + "step": 278 + 
}, + { + "epoch": 0.3581514762516046, + "grad_norm": 1.4063013792037964, + "learning_rate": 1.0706033376123236e-06, + "loss": 3.3605, + "step": 279 + }, + { + "epoch": 0.3594351732991014, + "grad_norm": 2.8284571170806885, + "learning_rate": 1.0744544287548138e-06, + "loss": 3.3378, + "step": 280 + }, + { + "epoch": 0.3607188703465982, + "grad_norm": 2.621030330657959, + "learning_rate": 1.0783055198973043e-06, + "loss": 3.3338, + "step": 281 + }, + { + "epoch": 0.362002567394095, + "grad_norm": 3.2113161087036133, + "learning_rate": 1.0821566110397947e-06, + "loss": 3.3426, + "step": 282 + }, + { + "epoch": 0.3632862644415918, + "grad_norm": 1.6037802696228027, + "learning_rate": 1.086007702182285e-06, + "loss": 3.3446, + "step": 283 + }, + { + "epoch": 0.3645699614890886, + "grad_norm": 1.7215930223464966, + "learning_rate": 1.0898587933247754e-06, + "loss": 3.349, + "step": 284 + }, + { + "epoch": 0.36585365853658536, + "grad_norm": 3.87294864654541, + "learning_rate": 1.0937098844672658e-06, + "loss": 3.3429, + "step": 285 + }, + { + "epoch": 0.36713735558408217, + "grad_norm": 4.677052974700928, + "learning_rate": 1.097560975609756e-06, + "loss": 3.3403, + "step": 286 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 3.7845232486724854, + "learning_rate": 1.1014120667522465e-06, + "loss": 3.3546, + "step": 287 + }, + { + "epoch": 0.36970474967907574, + "grad_norm": 1.221165418624878, + "learning_rate": 1.1052631578947367e-06, + "loss": 3.3262, + "step": 288 + }, + { + "epoch": 0.37098844672657255, + "grad_norm": 1.2050323486328125, + "learning_rate": 1.1091142490372273e-06, + "loss": 3.3307, + "step": 289 + }, + { + "epoch": 0.3722721437740693, + "grad_norm": 3.5935862064361572, + "learning_rate": 1.1129653401797178e-06, + "loss": 3.3292, + "step": 290 + }, + { + "epoch": 0.3735558408215661, + "grad_norm": 2.9169514179229736, + "learning_rate": 1.116816431322208e-06, + "loss": 3.3373, + "step": 291 + }, + { + "epoch": 0.3748395378690629, + "grad_norm": 
2.363762617111206, + "learning_rate": 1.1206675224646984e-06, + "loss": 3.345, + "step": 292 + }, + { + "epoch": 0.3761232349165597, + "grad_norm": 1.9544564485549927, + "learning_rate": 1.1245186136071886e-06, + "loss": 3.3245, + "step": 293 + }, + { + "epoch": 0.3774069319640565, + "grad_norm": 2.5326650142669678, + "learning_rate": 1.128369704749679e-06, + "loss": 3.3415, + "step": 294 + }, + { + "epoch": 0.37869062901155326, + "grad_norm": 1.2009296417236328, + "learning_rate": 1.1322207958921695e-06, + "loss": 3.3145, + "step": 295 + }, + { + "epoch": 0.37997432605905007, + "grad_norm": 1.5298550128936768, + "learning_rate": 1.1360718870346597e-06, + "loss": 3.3171, + "step": 296 + }, + { + "epoch": 0.38125802310654683, + "grad_norm": 1.5165319442749023, + "learning_rate": 1.1399229781771502e-06, + "loss": 3.2885, + "step": 297 + }, + { + "epoch": 0.38254172015404364, + "grad_norm": 2.449878454208374, + "learning_rate": 1.1437740693196406e-06, + "loss": 3.2749, + "step": 298 + }, + { + "epoch": 0.38382541720154045, + "grad_norm": 1.8864243030548096, + "learning_rate": 1.1476251604621308e-06, + "loss": 3.3151, + "step": 299 + }, + { + "epoch": 0.3851091142490372, + "grad_norm": 5.297044277191162, + "learning_rate": 1.1514762516046215e-06, + "loss": 3.2896, + "step": 300 + }, + { + "epoch": 0.386392811296534, + "grad_norm": 31.94978904724121, + "learning_rate": 1.1553273427471117e-06, + "loss": 3.5397, + "step": 301 + }, + { + "epoch": 0.38767650834403083, + "grad_norm": 25.897207260131836, + "learning_rate": 1.1591784338896021e-06, + "loss": 3.4965, + "step": 302 + }, + { + "epoch": 0.3889602053915276, + "grad_norm": 21.828601837158203, + "learning_rate": 1.1630295250320926e-06, + "loss": 3.4567, + "step": 303 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 19.756389617919922, + "learning_rate": 1.1668806161745828e-06, + "loss": 3.4323, + "step": 304 + }, + { + "epoch": 0.39152759948652116, + "grad_norm": 13.417524337768555, + "learning_rate": 
1.1707317073170732e-06, + "loss": 3.3822, + "step": 305 + }, + { + "epoch": 0.392811296534018, + "grad_norm": 9.815092086791992, + "learning_rate": 1.1745827984595635e-06, + "loss": 3.3651, + "step": 306 + }, + { + "epoch": 0.3940949935815148, + "grad_norm": 4.418903350830078, + "learning_rate": 1.1784338896020539e-06, + "loss": 3.3398, + "step": 307 + }, + { + "epoch": 0.39537869062901154, + "grad_norm": 5.8169708251953125, + "learning_rate": 1.1822849807445443e-06, + "loss": 3.3452, + "step": 308 + }, + { + "epoch": 0.39666238767650835, + "grad_norm": 8.733827590942383, + "learning_rate": 1.1861360718870345e-06, + "loss": 3.3523, + "step": 309 + }, + { + "epoch": 0.3979460847240051, + "grad_norm": 11.200616836547852, + "learning_rate": 1.189987163029525e-06, + "loss": 3.3688, + "step": 310 + }, + { + "epoch": 0.3992297817715019, + "grad_norm": 11.395453453063965, + "learning_rate": 1.1938382541720156e-06, + "loss": 3.3729, + "step": 311 + }, + { + "epoch": 0.40051347881899874, + "grad_norm": 10.407209396362305, + "learning_rate": 1.1976893453145059e-06, + "loss": 3.3516, + "step": 312 + }, + { + "epoch": 0.4017971758664955, + "grad_norm": 7.685133934020996, + "learning_rate": 1.2015404364569963e-06, + "loss": 3.3503, + "step": 313 + }, + { + "epoch": 0.4030808729139923, + "grad_norm": 4.913595676422119, + "learning_rate": 1.2053915275994865e-06, + "loss": 3.3158, + "step": 314 + }, + { + "epoch": 0.40436456996148906, + "grad_norm": 1.6458847522735596, + "learning_rate": 1.209242618741977e-06, + "loss": 3.3148, + "step": 315 + }, + { + "epoch": 0.4056482670089859, + "grad_norm": 4.645716667175293, + "learning_rate": 1.2130937098844674e-06, + "loss": 3.3222, + "step": 316 + }, + { + "epoch": 0.4069319640564827, + "grad_norm": 6.796189308166504, + "learning_rate": 1.2169448010269576e-06, + "loss": 3.3239, + "step": 317 + }, + { + "epoch": 0.40821566110397944, + "grad_norm": 5.8429412841796875, + "learning_rate": 1.220795892169448e-06, + "loss": 3.3204, + "step": 318 
+ }, + { + "epoch": 0.40949935815147626, + "grad_norm": 4.108697414398193, + "learning_rate": 1.2246469833119383e-06, + "loss": 3.3165, + "step": 319 + }, + { + "epoch": 0.41078305519897307, + "grad_norm": 3.6560332775115967, + "learning_rate": 1.2284980744544287e-06, + "loss": 3.3162, + "step": 320 + }, + { + "epoch": 0.4120667522464698, + "grad_norm": 1.721331000328064, + "learning_rate": 1.2323491655969191e-06, + "loss": 3.3081, + "step": 321 + }, + { + "epoch": 0.41335044929396664, + "grad_norm": 3.748699426651001, + "learning_rate": 1.2362002567394096e-06, + "loss": 3.3205, + "step": 322 + }, + { + "epoch": 0.4146341463414634, + "grad_norm": 3.9064371585845947, + "learning_rate": 1.2400513478819e-06, + "loss": 3.3053, + "step": 323 + }, + { + "epoch": 0.4159178433889602, + "grad_norm": 2.762563467025757, + "learning_rate": 1.2439024390243904e-06, + "loss": 3.3008, + "step": 324 + }, + { + "epoch": 0.417201540436457, + "grad_norm": 1.687069058418274, + "learning_rate": 1.2477535301668807e-06, + "loss": 3.319, + "step": 325 + }, + { + "epoch": 0.4184852374839538, + "grad_norm": 1.5062334537506104, + "learning_rate": 1.251604621309371e-06, + "loss": 3.3093, + "step": 326 + }, + { + "epoch": 0.4197689345314506, + "grad_norm": 2.5449252128601074, + "learning_rate": 1.2554557124518613e-06, + "loss": 3.2899, + "step": 327 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 1.1288577318191528, + "learning_rate": 1.2593068035943518e-06, + "loss": 3.2995, + "step": 328 + }, + { + "epoch": 0.42233632862644416, + "grad_norm": 1.5257689952850342, + "learning_rate": 1.2631578947368422e-06, + "loss": 3.3064, + "step": 329 + }, + { + "epoch": 0.42362002567394097, + "grad_norm": 1.4540174007415771, + "learning_rate": 1.2670089858793324e-06, + "loss": 3.3096, + "step": 330 + }, + { + "epoch": 0.4249037227214377, + "grad_norm": 1.0269229412078857, + "learning_rate": 1.2708600770218229e-06, + "loss": 3.3078, + "step": 331 + }, + { + "epoch": 0.42618741976893454, + 
"grad_norm": 1.9975014925003052, + "learning_rate": 1.274711168164313e-06, + "loss": 3.3046, + "step": 332 + }, + { + "epoch": 0.4274711168164313, + "grad_norm": 7.5496931076049805, + "learning_rate": 1.2785622593068035e-06, + "loss": 3.2945, + "step": 333 + }, + { + "epoch": 0.4287548138639281, + "grad_norm": 1.0366392135620117, + "learning_rate": 1.2824133504492942e-06, + "loss": 3.2915, + "step": 334 + }, + { + "epoch": 0.4300385109114249, + "grad_norm": 0.9323128461837769, + "learning_rate": 1.2862644415917844e-06, + "loss": 3.2875, + "step": 335 + }, + { + "epoch": 0.4313222079589217, + "grad_norm": 1.7378100156784058, + "learning_rate": 1.2901155327342748e-06, + "loss": 3.2961, + "step": 336 + }, + { + "epoch": 0.4326059050064185, + "grad_norm": 1.624954342842102, + "learning_rate": 1.2939666238767653e-06, + "loss": 3.2916, + "step": 337 + }, + { + "epoch": 0.4338896020539153, + "grad_norm": 1.1825155019760132, + "learning_rate": 1.2978177150192555e-06, + "loss": 3.2762, + "step": 338 + }, + { + "epoch": 0.43517329910141206, + "grad_norm": 1.7932780981063843, + "learning_rate": 1.301668806161746e-06, + "loss": 3.2863, + "step": 339 + }, + { + "epoch": 0.43645699614890887, + "grad_norm": 1.658711552619934, + "learning_rate": 1.3055198973042361e-06, + "loss": 3.2771, + "step": 340 + }, + { + "epoch": 0.43774069319640563, + "grad_norm": 1.4445250034332275, + "learning_rate": 1.3093709884467266e-06, + "loss": 3.2795, + "step": 341 + }, + { + "epoch": 0.43902439024390244, + "grad_norm": 2.142378568649292, + "learning_rate": 1.313222079589217e-06, + "loss": 3.2715, + "step": 342 + }, + { + "epoch": 0.44030808729139925, + "grad_norm": 1.4514838457107544, + "learning_rate": 1.3170731707317072e-06, + "loss": 3.2691, + "step": 343 + }, + { + "epoch": 0.441591784338896, + "grad_norm": 2.0860116481781006, + "learning_rate": 1.3209242618741977e-06, + "loss": 3.2585, + "step": 344 + }, + { + "epoch": 0.4428754813863928, + "grad_norm": 1.7583154439926147, + "learning_rate": 
1.324775353016688e-06, + "loss": 3.2603, + "step": 345 + }, + { + "epoch": 0.4441591784338896, + "grad_norm": 1.4004662036895752, + "learning_rate": 1.3286264441591785e-06, + "loss": 3.2785, + "step": 346 + }, + { + "epoch": 0.4454428754813864, + "grad_norm": 2.8631296157836914, + "learning_rate": 1.332477535301669e-06, + "loss": 3.2395, + "step": 347 + }, + { + "epoch": 0.4467265725288832, + "grad_norm": 3.129737615585327, + "learning_rate": 1.3363286264441592e-06, + "loss": 3.2542, + "step": 348 + }, + { + "epoch": 0.44801026957637996, + "grad_norm": 2.558603286743164, + "learning_rate": 1.3401797175866496e-06, + "loss": 3.2288, + "step": 349 + }, + { + "epoch": 0.4492939666238768, + "grad_norm": 2.5695462226867676, + "learning_rate": 1.34403080872914e-06, + "loss": 3.2799, + "step": 350 + }, + { + "epoch": 0.45057766367137353, + "grad_norm": 27.192651748657227, + "learning_rate": 1.3478818998716303e-06, + "loss": 3.4529, + "step": 351 + }, + { + "epoch": 0.45186136071887034, + "grad_norm": 21.921140670776367, + "learning_rate": 1.3517329910141207e-06, + "loss": 3.4064, + "step": 352 + }, + { + "epoch": 0.45314505776636715, + "grad_norm": 19.45769500732422, + "learning_rate": 1.355584082156611e-06, + "loss": 3.3885, + "step": 353 + }, + { + "epoch": 0.4544287548138639, + "grad_norm": 14.555753707885742, + "learning_rate": 1.3594351732991014e-06, + "loss": 3.3358, + "step": 354 + }, + { + "epoch": 0.4557124518613607, + "grad_norm": 9.946282386779785, + "learning_rate": 1.3632862644415918e-06, + "loss": 3.3004, + "step": 355 + }, + { + "epoch": 0.45699614890885754, + "grad_norm": 3.712834596633911, + "learning_rate": 1.3671373555840822e-06, + "loss": 3.272, + "step": 356 + }, + { + "epoch": 0.4582798459563543, + "grad_norm": 4.932548999786377, + "learning_rate": 1.3709884467265727e-06, + "loss": 3.2785, + "step": 357 + }, + { + "epoch": 0.4595635430038511, + "grad_norm": 8.90805721282959, + "learning_rate": 1.374839537869063e-06, + "loss": 3.2967, + "step": 358 + 
}, + { + "epoch": 0.46084724005134786, + "grad_norm": 9.592296600341797, + "learning_rate": 1.3786906290115533e-06, + "loss": 3.2886, + "step": 359 + }, + { + "epoch": 0.4621309370988447, + "grad_norm": 7.961353778839111, + "learning_rate": 1.3825417201540438e-06, + "loss": 3.2803, + "step": 360 + }, + { + "epoch": 0.4634146341463415, + "grad_norm": 5.245780944824219, + "learning_rate": 1.386392811296534e-06, + "loss": 3.2675, + "step": 361 + }, + { + "epoch": 0.46469833119383824, + "grad_norm": 2.0360219478607178, + "learning_rate": 1.3902439024390244e-06, + "loss": 3.2424, + "step": 362 + }, + { + "epoch": 0.46598202824133506, + "grad_norm": 6.0150604248046875, + "learning_rate": 1.3940949935815149e-06, + "loss": 3.2691, + "step": 363 + }, + { + "epoch": 0.4672657252888318, + "grad_norm": 8.334208488464355, + "learning_rate": 1.397946084724005e-06, + "loss": 3.2688, + "step": 364 + }, + { + "epoch": 0.4685494223363286, + "grad_norm": 9.457432746887207, + "learning_rate": 1.4017971758664955e-06, + "loss": 3.2705, + "step": 365 + }, + { + "epoch": 0.46983311938382544, + "grad_norm": 7.642498016357422, + "learning_rate": 1.4056482670089858e-06, + "loss": 3.2552, + "step": 366 + }, + { + "epoch": 0.4711168164313222, + "grad_norm": 5.331130504608154, + "learning_rate": 1.4094993581514764e-06, + "loss": 3.2352, + "step": 367 + }, + { + "epoch": 0.472400513478819, + "grad_norm": 2.130319833755493, + "learning_rate": 1.4133504492939668e-06, + "loss": 3.2355, + "step": 368 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 3.7069358825683594, + "learning_rate": 1.417201540436457e-06, + "loss": 3.2258, + "step": 369 + }, + { + "epoch": 0.4749679075738126, + "grad_norm": 6.786232948303223, + "learning_rate": 1.4210526315789475e-06, + "loss": 3.2329, + "step": 370 + }, + { + "epoch": 0.4762516046213094, + "grad_norm": 7.719097137451172, + "learning_rate": 1.4249037227214377e-06, + "loss": 3.2359, + "step": 371 + }, + { + "epoch": 0.47753530166880614, + "grad_norm": 
6.512034893035889, + "learning_rate": 1.4287548138639282e-06, + "loss": 3.2491, + "step": 372 + }, + { + "epoch": 0.47881899871630296, + "grad_norm": 3.542940139770508, + "learning_rate": 1.4326059050064186e-06, + "loss": 3.2239, + "step": 373 + }, + { + "epoch": 0.48010269576379977, + "grad_norm": 2.113111734390259, + "learning_rate": 1.4364569961489088e-06, + "loss": 3.219, + "step": 374 + }, + { + "epoch": 0.4813863928112965, + "grad_norm": 4.287517070770264, + "learning_rate": 1.4403080872913992e-06, + "loss": 3.2272, + "step": 375 + }, + { + "epoch": 0.48267008985879334, + "grad_norm": 5.944103717803955, + "learning_rate": 1.4441591784338895e-06, + "loss": 3.2299, + "step": 376 + }, + { + "epoch": 0.4839537869062901, + "grad_norm": 5.198914051055908, + "learning_rate": 1.44801026957638e-06, + "loss": 3.2137, + "step": 377 + }, + { + "epoch": 0.4852374839537869, + "grad_norm": 2.7224936485290527, + "learning_rate": 1.4518613607188703e-06, + "loss": 3.203, + "step": 378 + }, + { + "epoch": 0.4865211810012837, + "grad_norm": 1.4994703531265259, + "learning_rate": 1.4557124518613608e-06, + "loss": 3.1905, + "step": 379 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 4.7617902755737305, + "learning_rate": 1.4595635430038512e-06, + "loss": 3.222, + "step": 380 + }, + { + "epoch": 0.4890885750962773, + "grad_norm": 4.6207404136657715, + "learning_rate": 1.4634146341463416e-06, + "loss": 3.214, + "step": 381 + }, + { + "epoch": 0.49037227214377405, + "grad_norm": 4.329181671142578, + "learning_rate": 1.4672657252888319e-06, + "loss": 3.198, + "step": 382 + }, + { + "epoch": 0.49165596919127086, + "grad_norm": 2.2723121643066406, + "learning_rate": 1.4711168164313223e-06, + "loss": 3.1914, + "step": 383 + }, + { + "epoch": 0.49293966623876767, + "grad_norm": 2.1637392044067383, + "learning_rate": 1.4749679075738125e-06, + "loss": 3.1958, + "step": 384 + }, + { + "epoch": 0.4942233632862644, + "grad_norm": 4.0450592041015625, + "learning_rate": 
1.478818998716303e-06, + "loss": 3.1953, + "step": 385 + }, + { + "epoch": 0.49550706033376124, + "grad_norm": 2.9784770011901855, + "learning_rate": 1.4826700898587934e-06, + "loss": 3.1724, + "step": 386 + }, + { + "epoch": 0.496790757381258, + "grad_norm": 2.8167121410369873, + "learning_rate": 1.4865211810012836e-06, + "loss": 3.1624, + "step": 387 + }, + { + "epoch": 0.4980744544287548, + "grad_norm": 1.5745196342468262, + "learning_rate": 1.490372272143774e-06, + "loss": 3.1681, + "step": 388 + }, + { + "epoch": 0.4993581514762516, + "grad_norm": 1.801265001296997, + "learning_rate": 1.4942233632862643e-06, + "loss": 3.1614, + "step": 389 + }, + { + "epoch": 0.5006418485237484, + "grad_norm": 1.6354237794876099, + "learning_rate": 1.498074454428755e-06, + "loss": 3.1467, + "step": 390 + }, + { + "epoch": 0.5019255455712451, + "grad_norm": 1.6085950136184692, + "learning_rate": 1.5019255455712454e-06, + "loss": 3.1561, + "step": 391 + }, + { + "epoch": 0.503209242618742, + "grad_norm": 1.8036441802978516, + "learning_rate": 1.5057766367137356e-06, + "loss": 3.1446, + "step": 392 + }, + { + "epoch": 0.5044929396662388, + "grad_norm": 2.5143301486968994, + "learning_rate": 1.509627727856226e-06, + "loss": 3.1508, + "step": 393 + }, + { + "epoch": 0.5057766367137355, + "grad_norm": 2.5598814487457275, + "learning_rate": 1.5134788189987165e-06, + "loss": 3.1443, + "step": 394 + }, + { + "epoch": 0.5070603337612324, + "grad_norm": 2.5701630115509033, + "learning_rate": 1.5173299101412067e-06, + "loss": 3.1475, + "step": 395 + }, + { + "epoch": 0.5083440308087291, + "grad_norm": 3.681556224822998, + "learning_rate": 1.5211810012836971e-06, + "loss": 3.1327, + "step": 396 + }, + { + "epoch": 0.5096277278562259, + "grad_norm": 2.4432597160339355, + "learning_rate": 1.5250320924261873e-06, + "loss": 3.1373, + "step": 397 + }, + { + "epoch": 0.5109114249037228, + "grad_norm": 5.495240688323975, + "learning_rate": 1.5288831835686778e-06, + "loss": 3.1289, + "step": 398 + 
}, + { + "epoch": 0.5121951219512195, + "grad_norm": 6.974538326263428, + "learning_rate": 1.5327342747111682e-06, + "loss": 3.1191, + "step": 399 + }, + { + "epoch": 0.5134788189987163, + "grad_norm": 4.2711639404296875, + "learning_rate": 1.5365853658536584e-06, + "loss": 3.1326, + "step": 400 + }, + { + "epoch": 0.5147625160462131, + "grad_norm": 23.033803939819336, + "learning_rate": 1.540436456996149e-06, + "loss": 3.2853, + "step": 401 + }, + { + "epoch": 0.5160462130937099, + "grad_norm": 16.992469787597656, + "learning_rate": 1.5442875481386393e-06, + "loss": 3.2405, + "step": 402 + }, + { + "epoch": 0.5173299101412067, + "grad_norm": 13.806048393249512, + "learning_rate": 1.5481386392811297e-06, + "loss": 3.197, + "step": 403 + }, + { + "epoch": 0.5186136071887034, + "grad_norm": 9.503249168395996, + "learning_rate": 1.5519897304236202e-06, + "loss": 3.1479, + "step": 404 + }, + { + "epoch": 0.5198973042362003, + "grad_norm": 4.36508846282959, + "learning_rate": 1.5558408215661104e-06, + "loss": 3.132, + "step": 405 + }, + { + "epoch": 0.521181001283697, + "grad_norm": 4.394503116607666, + "learning_rate": 1.5596919127086008e-06, + "loss": 3.1198, + "step": 406 + }, + { + "epoch": 0.5224646983311938, + "grad_norm": 7.779834270477295, + "learning_rate": 1.5635430038510913e-06, + "loss": 3.1314, + "step": 407 + }, + { + "epoch": 0.5237483953786907, + "grad_norm": 8.350678443908691, + "learning_rate": 1.5673940949935815e-06, + "loss": 3.107, + "step": 408 + }, + { + "epoch": 0.5250320924261874, + "grad_norm": 6.6670355796813965, + "learning_rate": 1.571245186136072e-06, + "loss": 3.1216, + "step": 409 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 3.0672831535339355, + "learning_rate": 1.5750962772785621e-06, + "loss": 3.0834, + "step": 410 + }, + { + "epoch": 0.527599486521181, + "grad_norm": 2.5408921241760254, + "learning_rate": 1.5789473684210526e-06, + "loss": 3.0988, + "step": 411 + }, + { + "epoch": 0.5288831835686778, + "grad_norm": 
5.678051948547363, + "learning_rate": 1.5827984595635432e-06, + "loss": 3.0854, + "step": 412 + }, + { + "epoch": 0.5301668806161746, + "grad_norm": 5.5434441566467285, + "learning_rate": 1.5866495507060335e-06, + "loss": 3.0806, + "step": 413 + }, + { + "epoch": 0.5314505776636713, + "grad_norm": 4.869655609130859, + "learning_rate": 1.5905006418485239e-06, + "loss": 3.0794, + "step": 414 + }, + { + "epoch": 0.5327342747111682, + "grad_norm": 2.206669569015503, + "learning_rate": 1.5943517329910141e-06, + "loss": 3.0444, + "step": 415 + }, + { + "epoch": 0.5340179717586649, + "grad_norm": 4.958453178405762, + "learning_rate": 1.5982028241335045e-06, + "loss": 3.0675, + "step": 416 + }, + { + "epoch": 0.5353016688061617, + "grad_norm": 5.575069904327393, + "learning_rate": 1.602053915275995e-06, + "loss": 3.0627, + "step": 417 + }, + { + "epoch": 0.5365853658536586, + "grad_norm": 6.711045265197754, + "learning_rate": 1.6059050064184852e-06, + "loss": 3.0461, + "step": 418 + }, + { + "epoch": 0.5378690629011553, + "grad_norm": 6.169948577880859, + "learning_rate": 1.6097560975609756e-06, + "loss": 3.0384, + "step": 419 + }, + { + "epoch": 0.5391527599486521, + "grad_norm": 2.8179290294647217, + "learning_rate": 1.613607188703466e-06, + "loss": 3.036, + "step": 420 + }, + { + "epoch": 0.540436456996149, + "grad_norm": 2.550825834274292, + "learning_rate": 1.6174582798459563e-06, + "loss": 3.0149, + "step": 421 + }, + { + "epoch": 0.5417201540436457, + "grad_norm": 6.49483585357666, + "learning_rate": 1.6213093709884467e-06, + "loss": 3.0123, + "step": 422 + }, + { + "epoch": 0.5430038510911425, + "grad_norm": 4.586013317108154, + "learning_rate": 1.625160462130937e-06, + "loss": 3.0196, + "step": 423 + }, + { + "epoch": 0.5442875481386393, + "grad_norm": 3.7473394870758057, + "learning_rate": 1.6290115532734276e-06, + "loss": 2.9974, + "step": 424 + }, + { + "epoch": 0.5455712451861361, + "grad_norm": 1.6458550691604614, + "learning_rate": 1.632862644415918e-06, + 
"loss": 2.9996, + "step": 425 + }, + { + "epoch": 0.5468549422336328, + "grad_norm": 3.0494046211242676, + "learning_rate": 1.6367137355584083e-06, + "loss": 3.0, + "step": 426 + }, + { + "epoch": 0.5481386392811296, + "grad_norm": 3.776355743408203, + "learning_rate": 1.6405648267008987e-06, + "loss": 2.9768, + "step": 427 + }, + { + "epoch": 0.5494223363286265, + "grad_norm": 36.14421463012695, + "learning_rate": 1.644415917843389e-06, + "loss": 2.9672, + "step": 428 + }, + { + "epoch": 0.5507060333761232, + "grad_norm": 2.1283068656921387, + "learning_rate": 1.6482670089858794e-06, + "loss": 2.9843, + "step": 429 + }, + { + "epoch": 0.55198973042362, + "grad_norm": 3.2613437175750732, + "learning_rate": 1.6521181001283698e-06, + "loss": 2.9826, + "step": 430 + }, + { + "epoch": 0.5532734274711169, + "grad_norm": 3.8119924068450928, + "learning_rate": 1.65596919127086e-06, + "loss": 2.9653, + "step": 431 + }, + { + "epoch": 0.5545571245186136, + "grad_norm": 3.555905818939209, + "learning_rate": 1.6598202824133505e-06, + "loss": 2.9311, + "step": 432 + }, + { + "epoch": 0.5558408215661104, + "grad_norm": 2.9793708324432373, + "learning_rate": 1.6636713735558409e-06, + "loss": 2.95, + "step": 433 + }, + { + "epoch": 0.5571245186136072, + "grad_norm": 2.6017532348632812, + "learning_rate": 1.6675224646983311e-06, + "loss": 2.9528, + "step": 434 + }, + { + "epoch": 0.558408215661104, + "grad_norm": 3.010883331298828, + "learning_rate": 1.6713735558408218e-06, + "loss": 2.9207, + "step": 435 + }, + { + "epoch": 0.5596919127086007, + "grad_norm": 4.232792854309082, + "learning_rate": 1.675224646983312e-06, + "loss": 2.904, + "step": 436 + }, + { + "epoch": 0.5609756097560976, + "grad_norm": 3.223520040512085, + "learning_rate": 1.6790757381258024e-06, + "loss": 2.9149, + "step": 437 + }, + { + "epoch": 0.5622593068035944, + "grad_norm": 2.4745566844940186, + "learning_rate": 1.6829268292682928e-06, + "loss": 2.9018, + "step": 438 + }, + { + "epoch": 
0.5635430038510911, + "grad_norm": 2.2844088077545166, + "learning_rate": 1.686777920410783e-06, + "loss": 2.8972, + "step": 439 + }, + { + "epoch": 0.5648267008985879, + "grad_norm": 2.1709048748016357, + "learning_rate": 1.6906290115532735e-06, + "loss": 2.8892, + "step": 440 + }, + { + "epoch": 0.5661103979460848, + "grad_norm": 6.405858039855957, + "learning_rate": 1.6944801026957637e-06, + "loss": 2.8695, + "step": 441 + }, + { + "epoch": 0.5673940949935815, + "grad_norm": 7.1617937088012695, + "learning_rate": 1.6983311938382542e-06, + "loss": 2.8637, + "step": 442 + }, + { + "epoch": 0.5686777920410783, + "grad_norm": 3.4643394947052, + "learning_rate": 1.7021822849807446e-06, + "loss": 2.8383, + "step": 443 + }, + { + "epoch": 0.5699614890885751, + "grad_norm": 2.7208151817321777, + "learning_rate": 1.7060333761232348e-06, + "loss": 2.8319, + "step": 444 + }, + { + "epoch": 0.5712451861360719, + "grad_norm": 2.770800828933716, + "learning_rate": 1.7098844672657253e-06, + "loss": 2.8466, + "step": 445 + }, + { + "epoch": 0.5725288831835686, + "grad_norm": 2.959151268005371, + "learning_rate": 1.713735558408216e-06, + "loss": 2.831, + "step": 446 + }, + { + "epoch": 0.5738125802310655, + "grad_norm": 4.3938775062561035, + "learning_rate": 1.7175866495507061e-06, + "loss": 2.8188, + "step": 447 + }, + { + "epoch": 0.5750962772785623, + "grad_norm": 4.71693754196167, + "learning_rate": 1.7214377406931966e-06, + "loss": 2.8513, + "step": 448 + }, + { + "epoch": 0.576379974326059, + "grad_norm": 2.847822427749634, + "learning_rate": 1.7252888318356868e-06, + "loss": 2.838, + "step": 449 + }, + { + "epoch": 0.5776636713735558, + "grad_norm": 6.239818096160889, + "learning_rate": 1.7291399229781772e-06, + "loss": 2.9177, + "step": 450 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 15.888236045837402, + "learning_rate": 1.7329910141206677e-06, + "loss": 2.8791, + "step": 451 + }, + { + "epoch": 0.5802310654685494, + "grad_norm": 11.259949684143066, + 
"learning_rate": 1.7368421052631579e-06, + "loss": 2.8215, + "step": 452 + }, + { + "epoch": 0.5815147625160462, + "grad_norm": 8.95538330078125, + "learning_rate": 1.7406931964056483e-06, + "loss": 2.8228, + "step": 453 + }, + { + "epoch": 0.582798459563543, + "grad_norm": 5.1466546058654785, + "learning_rate": 1.7445442875481385e-06, + "loss": 2.7845, + "step": 454 + }, + { + "epoch": 0.5840821566110398, + "grad_norm": 3.8544881343841553, + "learning_rate": 1.748395378690629e-06, + "loss": 2.7459, + "step": 455 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 10.917842864990234, + "learning_rate": 1.7522464698331194e-06, + "loss": 2.7312, + "step": 456 + }, + { + "epoch": 0.5866495507060334, + "grad_norm": 6.054429054260254, + "learning_rate": 1.7560975609756098e-06, + "loss": 2.7254, + "step": 457 + }, + { + "epoch": 0.5879332477535302, + "grad_norm": 5.144399166107178, + "learning_rate": 1.7599486521181003e-06, + "loss": 2.6899, + "step": 458 + }, + { + "epoch": 0.5892169448010269, + "grad_norm": 3.937089204788208, + "learning_rate": 1.7637997432605905e-06, + "loss": 2.6824, + "step": 459 + }, + { + "epoch": 0.5905006418485238, + "grad_norm": 4.6186723709106445, + "learning_rate": 1.767650834403081e-06, + "loss": 2.6566, + "step": 460 + }, + { + "epoch": 0.5917843388960206, + "grad_norm": 5.16159200668335, + "learning_rate": 1.7715019255455714e-06, + "loss": 2.6842, + "step": 461 + }, + { + "epoch": 0.5930680359435173, + "grad_norm": 3.294140100479126, + "learning_rate": 1.7753530166880616e-06, + "loss": 2.6382, + "step": 462 + }, + { + "epoch": 0.5943517329910141, + "grad_norm": 21.5394287109375, + "learning_rate": 1.779204107830552e-06, + "loss": 2.6243, + "step": 463 + }, + { + "epoch": 0.5956354300385109, + "grad_norm": 5.000025749206543, + "learning_rate": 1.7830551989730425e-06, + "loss": 2.6593, + "step": 464 + }, + { + "epoch": 0.5969191270860077, + "grad_norm": 4.962882995605469, + "learning_rate": 1.7869062901155327e-06, + "loss": 2.6177, + 
"step": 465 + }, + { + "epoch": 0.5982028241335045, + "grad_norm": 4.599617958068848, + "learning_rate": 1.7907573812580231e-06, + "loss": 2.6093, + "step": 466 + }, + { + "epoch": 0.5994865211810013, + "grad_norm": 3.3913557529449463, + "learning_rate": 1.7946084724005134e-06, + "loss": 2.5722, + "step": 467 + }, + { + "epoch": 0.6007702182284981, + "grad_norm": 3.5761096477508545, + "learning_rate": 1.7984595635430038e-06, + "loss": 2.5887, + "step": 468 + }, + { + "epoch": 0.6020539152759948, + "grad_norm": 3.406315326690674, + "learning_rate": 1.8023106546854944e-06, + "loss": 2.5682, + "step": 469 + }, + { + "epoch": 0.6033376123234917, + "grad_norm": 3.643446683883667, + "learning_rate": 1.8061617458279847e-06, + "loss": 2.5429, + "step": 470 + }, + { + "epoch": 0.6046213093709885, + "grad_norm": 4.5528106689453125, + "learning_rate": 1.810012836970475e-06, + "loss": 2.5282, + "step": 471 + }, + { + "epoch": 0.6059050064184852, + "grad_norm": 5.149334907531738, + "learning_rate": 1.8138639281129653e-06, + "loss": 2.5438, + "step": 472 + }, + { + "epoch": 0.6071887034659821, + "grad_norm": 13.459592819213867, + "learning_rate": 1.8177150192554558e-06, + "loss": 2.4954, + "step": 473 + }, + { + "epoch": 0.6084724005134788, + "grad_norm": 3.159773826599121, + "learning_rate": 1.8215661103979462e-06, + "loss": 2.517, + "step": 474 + }, + { + "epoch": 0.6097560975609756, + "grad_norm": 2.859253168106079, + "learning_rate": 1.8254172015404364e-06, + "loss": 2.4969, + "step": 475 + }, + { + "epoch": 0.6110397946084724, + "grad_norm": 5.062876224517822, + "learning_rate": 1.8292682926829268e-06, + "loss": 2.4553, + "step": 476 + }, + { + "epoch": 0.6123234916559692, + "grad_norm": 3.475503444671631, + "learning_rate": 1.8331193838254173e-06, + "loss": 2.4418, + "step": 477 + }, + { + "epoch": 0.613607188703466, + "grad_norm": 2.8628828525543213, + "learning_rate": 1.8369704749679075e-06, + "loss": 2.4433, + "step": 478 + }, + { + "epoch": 0.6148908857509627, + 
"grad_norm": 5.238796234130859, + "learning_rate": 1.840821566110398e-06, + "loss": 2.4253, + "step": 479 + }, + { + "epoch": 0.6161745827984596, + "grad_norm": 6.911955833435059, + "learning_rate": 1.8446726572528884e-06, + "loss": 2.3884, + "step": 480 + }, + { + "epoch": 0.6174582798459564, + "grad_norm": 5.33975076675415, + "learning_rate": 1.8485237483953788e-06, + "loss": 2.4222, + "step": 481 + }, + { + "epoch": 0.6187419768934531, + "grad_norm": 3.391489028930664, + "learning_rate": 1.8523748395378692e-06, + "loss": 2.3804, + "step": 482 + }, + { + "epoch": 0.62002567394095, + "grad_norm": 4.331815719604492, + "learning_rate": 1.8562259306803595e-06, + "loss": 2.3711, + "step": 483 + }, + { + "epoch": 0.6213093709884467, + "grad_norm": 3.421475410461426, + "learning_rate": 1.86007702182285e-06, + "loss": 2.3574, + "step": 484 + }, + { + "epoch": 0.6225930680359435, + "grad_norm": 3.045912742614746, + "learning_rate": 1.8639281129653401e-06, + "loss": 2.3106, + "step": 485 + }, + { + "epoch": 0.6238767650834403, + "grad_norm": 3.6695072650909424, + "learning_rate": 1.8677792041078306e-06, + "loss": 2.3295, + "step": 486 + }, + { + "epoch": 0.6251604621309371, + "grad_norm": 3.298065423965454, + "learning_rate": 1.871630295250321e-06, + "loss": 2.3088, + "step": 487 + }, + { + "epoch": 0.6264441591784339, + "grad_norm": 4.256178855895996, + "learning_rate": 1.8754813863928114e-06, + "loss": 2.3164, + "step": 488 + }, + { + "epoch": 0.6277278562259306, + "grad_norm": 3.3660290241241455, + "learning_rate": 1.8793324775353019e-06, + "loss": 2.2996, + "step": 489 + }, + { + "epoch": 0.6290115532734275, + "grad_norm": 21.471315383911133, + "learning_rate": 1.8831835686777923e-06, + "loss": 2.2633, + "step": 490 + }, + { + "epoch": 0.6302952503209243, + "grad_norm": 5.996706485748291, + "learning_rate": 1.8870346598202825e-06, + "loss": 2.2768, + "step": 491 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 3.8171963691711426, + "learning_rate": 
1.890885750962773e-06, + "loss": 2.3135, + "step": 492 + }, + { + "epoch": 0.6328626444159179, + "grad_norm": 3.695248603820801, + "learning_rate": 1.8947368421052632e-06, + "loss": 2.2641, + "step": 493 + }, + { + "epoch": 0.6341463414634146, + "grad_norm": 5.199057579040527, + "learning_rate": 1.8985879332477536e-06, + "loss": 2.2666, + "step": 494 + }, + { + "epoch": 0.6354300385109114, + "grad_norm": 5.159434795379639, + "learning_rate": 1.902439024390244e-06, + "loss": 2.2722, + "step": 495 + }, + { + "epoch": 0.6367137355584083, + "grad_norm": 4.2077202796936035, + "learning_rate": 1.9062901155327343e-06, + "loss": 2.2781, + "step": 496 + }, + { + "epoch": 0.637997432605905, + "grad_norm": 3.2308552265167236, + "learning_rate": 1.9101412066752245e-06, + "loss": 2.255, + "step": 497 + }, + { + "epoch": 0.6392811296534018, + "grad_norm": 3.7804834842681885, + "learning_rate": 1.913992297817715e-06, + "loss": 2.2904, + "step": 498 + }, + { + "epoch": 0.6405648267008985, + "grad_norm": 3.1728270053863525, + "learning_rate": 1.9178433889602054e-06, + "loss": 2.3166, + "step": 499 + }, + { + "epoch": 0.6418485237483954, + "grad_norm": 16.286394119262695, + "learning_rate": 1.9216944801026956e-06, + "loss": 2.3934, + "step": 500 + }, + { + "epoch": 0.6431322207958922, + "grad_norm": 11.94299030303955, + "learning_rate": 1.9255455712451862e-06, + "loss": 2.142, + "step": 501 + }, + { + "epoch": 0.6444159178433889, + "grad_norm": 7.800815582275391, + "learning_rate": 1.9293966623876765e-06, + "loss": 2.1061, + "step": 502 + }, + { + "epoch": 0.6456996148908858, + "grad_norm": 5.40025520324707, + "learning_rate": 1.9332477535301667e-06, + "loss": 2.0816, + "step": 503 + }, + { + "epoch": 0.6469833119383825, + "grad_norm": 3.4121334552764893, + "learning_rate": 1.9370988446726573e-06, + "loss": 2.0458, + "step": 504 + }, + { + "epoch": 0.6482670089858793, + "grad_norm": 4.032678604125977, + "learning_rate": 1.9409499358151476e-06, + "loss": 2.0656, + "step": 505 + }, + 
{ + "epoch": 0.6495507060333762, + "grad_norm": 3.569476842880249, + "learning_rate": 1.9448010269576378e-06, + "loss": 2.046, + "step": 506 + }, + { + "epoch": 0.6508344030808729, + "grad_norm": 2.297093391418457, + "learning_rate": 1.9486521181001284e-06, + "loss": 2.0238, + "step": 507 + }, + { + "epoch": 0.6521181001283697, + "grad_norm": 3.593141794204712, + "learning_rate": 1.952503209242619e-06, + "loss": 2.017, + "step": 508 + }, + { + "epoch": 0.6534017971758665, + "grad_norm": 3.8404839038848877, + "learning_rate": 1.9563543003851093e-06, + "loss": 2.0162, + "step": 509 + }, + { + "epoch": 0.6546854942233633, + "grad_norm": 5.941136360168457, + "learning_rate": 1.9602053915275995e-06, + "loss": 1.9931, + "step": 510 + }, + { + "epoch": 0.6559691912708601, + "grad_norm": 2.2746422290802, + "learning_rate": 1.96405648267009e-06, + "loss": 1.9356, + "step": 511 + }, + { + "epoch": 0.6572528883183568, + "grad_norm": 2.672189235687256, + "learning_rate": 1.9679075738125804e-06, + "loss": 1.9623, + "step": 512 + }, + { + "epoch": 0.6585365853658537, + "grad_norm": 4.078820705413818, + "learning_rate": 1.9717586649550706e-06, + "loss": 1.993, + "step": 513 + }, + { + "epoch": 0.6598202824133504, + "grad_norm": 2.7616958618164062, + "learning_rate": 1.9756097560975613e-06, + "loss": 1.9395, + "step": 514 + }, + { + "epoch": 0.6611039794608472, + "grad_norm": 3.846428871154785, + "learning_rate": 1.9794608472400515e-06, + "loss": 1.926, + "step": 515 + }, + { + "epoch": 0.6623876765083441, + "grad_norm": 4.205070972442627, + "learning_rate": 1.9833119383825417e-06, + "loss": 1.8923, + "step": 516 + }, + { + "epoch": 0.6636713735558408, + "grad_norm": 3.930536985397339, + "learning_rate": 1.9871630295250324e-06, + "loss": 1.8705, + "step": 517 + }, + { + "epoch": 0.6649550706033376, + "grad_norm": 4.312464714050293, + "learning_rate": 1.9910141206675226e-06, + "loss": 1.8852, + "step": 518 + }, + { + "epoch": 0.6662387676508345, + "grad_norm": 2.5857338905334473, + 
"learning_rate": 1.994865211810013e-06, + "loss": 1.8858, + "step": 519 + }, + { + "epoch": 0.6675224646983312, + "grad_norm": 7.249959945678711, + "learning_rate": 1.998716302952503e-06, + "loss": 1.8864, + "step": 520 + }, + { + "epoch": 0.668806161745828, + "grad_norm": 3.217883348464966, + "learning_rate": 2.0025673940949937e-06, + "loss": 1.8279, + "step": 521 + }, + { + "epoch": 0.6700898587933247, + "grad_norm": 3.5921382904052734, + "learning_rate": 2.006418485237484e-06, + "loss": 1.8659, + "step": 522 + }, + { + "epoch": 0.6713735558408216, + "grad_norm": 2.312103271484375, + "learning_rate": 2.010269576379974e-06, + "loss": 1.825, + "step": 523 + }, + { + "epoch": 0.6726572528883183, + "grad_norm": 3.586066961288452, + "learning_rate": 2.0141206675224648e-06, + "loss": 1.8399, + "step": 524 + }, + { + "epoch": 0.6739409499358151, + "grad_norm": 2.3749477863311768, + "learning_rate": 2.017971758664955e-06, + "loss": 1.8189, + "step": 525 + }, + { + "epoch": 0.675224646983312, + "grad_norm": 3.5971667766571045, + "learning_rate": 2.0218228498074452e-06, + "loss": 1.7961, + "step": 526 + }, + { + "epoch": 0.6765083440308087, + "grad_norm": 2.8192286491394043, + "learning_rate": 2.025673940949936e-06, + "loss": 1.8243, + "step": 527 + }, + { + "epoch": 0.6777920410783055, + "grad_norm": 3.166078567504883, + "learning_rate": 2.029525032092426e-06, + "loss": 1.7713, + "step": 528 + }, + { + "epoch": 0.6790757381258024, + "grad_norm": 5.492098331451416, + "learning_rate": 2.0333761232349163e-06, + "loss": 1.7655, + "step": 529 + }, + { + "epoch": 0.6803594351732991, + "grad_norm": 2.9173991680145264, + "learning_rate": 2.037227214377407e-06, + "loss": 1.7925, + "step": 530 + }, + { + "epoch": 0.6816431322207959, + "grad_norm": 4.113759994506836, + "learning_rate": 2.0410783055198976e-06, + "loss": 1.7788, + "step": 531 + }, + { + "epoch": 0.6829268292682927, + "grad_norm": 3.966140031814575, + "learning_rate": 2.044929396662388e-06, + "loss": 1.7188, + "step": 
532 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 2.333364725112915, + "learning_rate": 2.048780487804878e-06, + "loss": 1.7959, + "step": 533 + }, + { + "epoch": 0.6854942233632862, + "grad_norm": 2.7772445678710938, + "learning_rate": 2.0526315789473687e-06, + "loss": 1.7271, + "step": 534 + }, + { + "epoch": 0.686777920410783, + "grad_norm": 5.49709415435791, + "learning_rate": 2.056482670089859e-06, + "loss": 1.7418, + "step": 535 + }, + { + "epoch": 0.6880616174582799, + "grad_norm": 4.052853107452393, + "learning_rate": 2.060333761232349e-06, + "loss": 1.7536, + "step": 536 + }, + { + "epoch": 0.6893453145057766, + "grad_norm": 5.225743293762207, + "learning_rate": 2.06418485237484e-06, + "loss": 1.7056, + "step": 537 + }, + { + "epoch": 0.6906290115532734, + "grad_norm": 4.913660526275635, + "learning_rate": 2.06803594351733e-06, + "loss": 1.7724, + "step": 538 + }, + { + "epoch": 0.6919127086007703, + "grad_norm": 4.794094562530518, + "learning_rate": 2.0718870346598202e-06, + "loss": 1.7796, + "step": 539 + }, + { + "epoch": 0.693196405648267, + "grad_norm": 3.701730251312256, + "learning_rate": 2.075738125802311e-06, + "loss": 1.6599, + "step": 540 + }, + { + "epoch": 0.6944801026957638, + "grad_norm": 5.673757076263428, + "learning_rate": 2.079589216944801e-06, + "loss": 1.7103, + "step": 541 + }, + { + "epoch": 0.6957637997432606, + "grad_norm": 5.089924335479736, + "learning_rate": 2.0834403080872913e-06, + "loss": 1.7444, + "step": 542 + }, + { + "epoch": 0.6970474967907574, + "grad_norm": 3.2783498764038086, + "learning_rate": 2.087291399229782e-06, + "loss": 1.7175, + "step": 543 + }, + { + "epoch": 0.6983311938382541, + "grad_norm": 3.2167928218841553, + "learning_rate": 2.091142490372272e-06, + "loss": 1.7573, + "step": 544 + }, + { + "epoch": 0.699614890885751, + "grad_norm": 5.307199478149414, + "learning_rate": 2.0949935815147624e-06, + "loss": 1.8001, + "step": 545 + }, + { + "epoch": 0.7008985879332478, + "grad_norm": 
6.651915550231934, + "learning_rate": 2.0988446726572526e-06, + "loss": 1.7343, + "step": 546 + }, + { + "epoch": 0.7021822849807445, + "grad_norm": 4.084765434265137, + "learning_rate": 2.1026957637997433e-06, + "loss": 1.7678, + "step": 547 + }, + { + "epoch": 0.7034659820282413, + "grad_norm": 4.867658615112305, + "learning_rate": 2.1065468549422335e-06, + "loss": 1.7713, + "step": 548 + }, + { + "epoch": 0.7047496790757382, + "grad_norm": 4.187880516052246, + "learning_rate": 2.1103979460847237e-06, + "loss": 1.8963, + "step": 549 + }, + { + "epoch": 0.7060333761232349, + "grad_norm": 4.418581962585449, + "learning_rate": 2.1142490372272144e-06, + "loss": 1.9699, + "step": 550 + }, + { + "epoch": 0.7073170731707317, + "grad_norm": 8.570418357849121, + "learning_rate": 2.1181001283697046e-06, + "loss": 1.6575, + "step": 551 + }, + { + "epoch": 0.7086007702182285, + "grad_norm": 7.159685134887695, + "learning_rate": 2.121951219512195e-06, + "loss": 1.5573, + "step": 552 + }, + { + "epoch": 0.7098844672657253, + "grad_norm": 5.4836554527282715, + "learning_rate": 2.125802310654686e-06, + "loss": 1.5812, + "step": 553 + }, + { + "epoch": 0.711168164313222, + "grad_norm": 3.27650785446167, + "learning_rate": 2.129653401797176e-06, + "loss": 1.5598, + "step": 554 + }, + { + "epoch": 0.7124518613607189, + "grad_norm": 3.342855930328369, + "learning_rate": 2.1335044929396664e-06, + "loss": 1.5321, + "step": 555 + }, + { + "epoch": 0.7137355584082157, + "grad_norm": 3.3505494594573975, + "learning_rate": 2.137355584082157e-06, + "loss": 1.537, + "step": 556 + }, + { + "epoch": 0.7150192554557124, + "grad_norm": 3.3080084323883057, + "learning_rate": 2.1412066752246472e-06, + "loss": 1.537, + "step": 557 + }, + { + "epoch": 0.7163029525032092, + "grad_norm": 2.1756575107574463, + "learning_rate": 2.1450577663671374e-06, + "loss": 1.516, + "step": 558 + }, + { + "epoch": 0.7175866495507061, + "grad_norm": 4.2100348472595215, + "learning_rate": 2.1489088575096277e-06, + 
"loss": 1.4779, + "step": 559 + }, + { + "epoch": 0.7188703465982028, + "grad_norm": 5.060901641845703, + "learning_rate": 2.1527599486521183e-06, + "loss": 1.5263, + "step": 560 + }, + { + "epoch": 0.7201540436456996, + "grad_norm": 3.6082191467285156, + "learning_rate": 2.1566110397946085e-06, + "loss": 1.4928, + "step": 561 + }, + { + "epoch": 0.7214377406931964, + "grad_norm": 2.0594840049743652, + "learning_rate": 2.1604621309370988e-06, + "loss": 1.5032, + "step": 562 + }, + { + "epoch": 0.7227214377406932, + "grad_norm": 3.6437103748321533, + "learning_rate": 2.1643132220795894e-06, + "loss": 1.4936, + "step": 563 + }, + { + "epoch": 0.72400513478819, + "grad_norm": 3.771500825881958, + "learning_rate": 2.1681643132220796e-06, + "loss": 1.4852, + "step": 564 + }, + { + "epoch": 0.7252888318356868, + "grad_norm": 5.350396633148193, + "learning_rate": 2.17201540436457e-06, + "loss": 1.5235, + "step": 565 + }, + { + "epoch": 0.7265725288831836, + "grad_norm": 4.163018703460693, + "learning_rate": 2.1758664955070605e-06, + "loss": 1.4814, + "step": 566 + }, + { + "epoch": 0.7278562259306803, + "grad_norm": 2.3174567222595215, + "learning_rate": 2.1797175866495507e-06, + "loss": 1.489, + "step": 567 + }, + { + "epoch": 0.7291399229781772, + "grad_norm": 2.5452702045440674, + "learning_rate": 2.183568677792041e-06, + "loss": 1.4543, + "step": 568 + }, + { + "epoch": 0.730423620025674, + "grad_norm": 10.002145767211914, + "learning_rate": 2.1874197689345316e-06, + "loss": 1.4729, + "step": 569 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 5.464149475097656, + "learning_rate": 2.191270860077022e-06, + "loss": 1.4756, + "step": 570 + }, + { + "epoch": 0.7329910141206675, + "grad_norm": 3.1264593601226807, + "learning_rate": 2.195121951219512e-06, + "loss": 1.4807, + "step": 571 + }, + { + "epoch": 0.7342747111681643, + "grad_norm": 5.05717134475708, + "learning_rate": 2.1989730423620023e-06, + "loss": 1.473, + "step": 572 + }, + { + "epoch": 
0.7355584082156611, + "grad_norm": 2.6096537113189697, + "learning_rate": 2.202824133504493e-06, + "loss": 1.4362, + "step": 573 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 3.0062127113342285, + "learning_rate": 2.206675224646983e-06, + "loss": 1.4395, + "step": 574 + }, + { + "epoch": 0.7381258023106547, + "grad_norm": 2.1466500759124756, + "learning_rate": 2.2105263157894734e-06, + "loss": 1.4506, + "step": 575 + }, + { + "epoch": 0.7394094993581515, + "grad_norm": 2.3291752338409424, + "learning_rate": 2.2143774069319644e-06, + "loss": 1.5157, + "step": 576 + }, + { + "epoch": 0.7406931964056482, + "grad_norm": 5.733763694763184, + "learning_rate": 2.2182284980744547e-06, + "loss": 1.4587, + "step": 577 + }, + { + "epoch": 0.7419768934531451, + "grad_norm": 2.418729543685913, + "learning_rate": 2.222079589216945e-06, + "loss": 1.4235, + "step": 578 + }, + { + "epoch": 0.7432605905006419, + "grad_norm": 2.6055257320404053, + "learning_rate": 2.2259306803594355e-06, + "loss": 1.4523, + "step": 579 + }, + { + "epoch": 0.7445442875481386, + "grad_norm": 4.000535488128662, + "learning_rate": 2.2297817715019257e-06, + "loss": 1.4701, + "step": 580 + }, + { + "epoch": 0.7458279845956355, + "grad_norm": 6.057888507843018, + "learning_rate": 2.233632862644416e-06, + "loss": 1.4762, + "step": 581 + }, + { + "epoch": 0.7471116816431322, + "grad_norm": 1.8840421438217163, + "learning_rate": 2.2374839537869066e-06, + "loss": 1.4544, + "step": 582 + }, + { + "epoch": 0.748395378690629, + "grad_norm": 2.364220142364502, + "learning_rate": 2.241335044929397e-06, + "loss": 1.4831, + "step": 583 + }, + { + "epoch": 0.7496790757381258, + "grad_norm": 3.2425448894500732, + "learning_rate": 2.245186136071887e-06, + "loss": 1.4803, + "step": 584 + }, + { + "epoch": 0.7509627727856226, + "grad_norm": 4.065702438354492, + "learning_rate": 2.2490372272143773e-06, + "loss": 1.489, + "step": 585 + }, + { + "epoch": 0.7522464698331194, + "grad_norm": 2.9406991004943848, + 
"learning_rate": 2.252888318356868e-06, + "loss": 1.4831, + "step": 586 + }, + { + "epoch": 0.7535301668806161, + "grad_norm": 2.476684808731079, + "learning_rate": 2.256739409499358e-06, + "loss": 1.4944, + "step": 587 + }, + { + "epoch": 0.754813863928113, + "grad_norm": 3.712686538696289, + "learning_rate": 2.2605905006418484e-06, + "loss": 1.4766, + "step": 588 + }, + { + "epoch": 0.7560975609756098, + "grad_norm": 4.379526138305664, + "learning_rate": 2.264441591784339e-06, + "loss": 1.4625, + "step": 589 + }, + { + "epoch": 0.7573812580231065, + "grad_norm": 3.0794143676757812, + "learning_rate": 2.2682926829268293e-06, + "loss": 1.4955, + "step": 590 + }, + { + "epoch": 0.7586649550706034, + "grad_norm": 5.108908176422119, + "learning_rate": 2.2721437740693195e-06, + "loss": 1.4673, + "step": 591 + }, + { + "epoch": 0.7599486521181001, + "grad_norm": 20.051380157470703, + "learning_rate": 2.27599486521181e-06, + "loss": 1.4821, + "step": 592 + }, + { + "epoch": 0.7612323491655969, + "grad_norm": 6.290349006652832, + "learning_rate": 2.2798459563543003e-06, + "loss": 1.4827, + "step": 593 + }, + { + "epoch": 0.7625160462130937, + "grad_norm": 2.988668441772461, + "learning_rate": 2.2836970474967906e-06, + "loss": 1.496, + "step": 594 + }, + { + "epoch": 0.7637997432605905, + "grad_norm": 5.114992141723633, + "learning_rate": 2.2875481386392812e-06, + "loss": 1.4913, + "step": 595 + }, + { + "epoch": 0.7650834403080873, + "grad_norm": 3.015044927597046, + "learning_rate": 2.2913992297817714e-06, + "loss": 1.4888, + "step": 596 + }, + { + "epoch": 0.766367137355584, + "grad_norm": 6.18331241607666, + "learning_rate": 2.2952503209242617e-06, + "loss": 1.5574, + "step": 597 + }, + { + "epoch": 0.7676508344030809, + "grad_norm": 4.923129081726074, + "learning_rate": 2.2991014120667523e-06, + "loss": 1.5526, + "step": 598 + }, + { + "epoch": 0.7689345314505777, + "grad_norm": 4.91621732711792, + "learning_rate": 2.302952503209243e-06, + "loss": 1.6422, + "step": 
599 + }, + { + "epoch": 0.7702182284980744, + "grad_norm": 5.8028082847595215, + "learning_rate": 2.306803594351733e-06, + "loss": 1.8994, + "step": 600 + }, + { + "epoch": 0.7715019255455713, + "grad_norm": 5.631858825683594, + "learning_rate": 2.3106546854942234e-06, + "loss": 1.3085, + "step": 601 + }, + { + "epoch": 0.772785622593068, + "grad_norm": 4.195847511291504, + "learning_rate": 2.314505776636714e-06, + "loss": 1.3307, + "step": 602 + }, + { + "epoch": 0.7740693196405648, + "grad_norm": 2.9175002574920654, + "learning_rate": 2.3183568677792043e-06, + "loss": 1.2848, + "step": 603 + }, + { + "epoch": 0.7753530166880617, + "grad_norm": 1.9752349853515625, + "learning_rate": 2.3222079589216945e-06, + "loss": 1.3114, + "step": 604 + }, + { + "epoch": 0.7766367137355584, + "grad_norm": 3.6058242321014404, + "learning_rate": 2.326059050064185e-06, + "loss": 1.3102, + "step": 605 + }, + { + "epoch": 0.7779204107830552, + "grad_norm": 4.959922790527344, + "learning_rate": 2.3299101412066754e-06, + "loss": 1.3138, + "step": 606 + }, + { + "epoch": 0.7792041078305519, + "grad_norm": 3.0089902877807617, + "learning_rate": 2.3337612323491656e-06, + "loss": 1.2957, + "step": 607 + }, + { + "epoch": 0.7804878048780488, + "grad_norm": 2.3154454231262207, + "learning_rate": 2.3376123234916562e-06, + "loss": 1.2863, + "step": 608 + }, + { + "epoch": 0.7817715019255456, + "grad_norm": 2.754892110824585, + "learning_rate": 2.3414634146341465e-06, + "loss": 1.2951, + "step": 609 + }, + { + "epoch": 0.7830551989730423, + "grad_norm": 2.783585786819458, + "learning_rate": 2.3453145057766367e-06, + "loss": 1.3131, + "step": 610 + }, + { + "epoch": 0.7843388960205392, + "grad_norm": 2.0606112480163574, + "learning_rate": 2.349165596919127e-06, + "loss": 1.2715, + "step": 611 + }, + { + "epoch": 0.785622593068036, + "grad_norm": 3.135239362716675, + "learning_rate": 2.3530166880616176e-06, + "loss": 1.3105, + "step": 612 + }, + { + "epoch": 0.7869062901155327, + "grad_norm": 
2.3232169151306152, + "learning_rate": 2.3568677792041078e-06, + "loss": 1.279, + "step": 613 + }, + { + "epoch": 0.7881899871630296, + "grad_norm": 3.4578206539154053, + "learning_rate": 2.360718870346598e-06, + "loss": 1.2987, + "step": 614 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 2.7253754138946533, + "learning_rate": 2.3645699614890887e-06, + "loss": 1.2926, + "step": 615 + }, + { + "epoch": 0.7907573812580231, + "grad_norm": 2.029757022857666, + "learning_rate": 2.368421052631579e-06, + "loss": 1.2769, + "step": 616 + }, + { + "epoch": 0.79204107830552, + "grad_norm": 3.506108045578003, + "learning_rate": 2.372272143774069e-06, + "loss": 1.3, + "step": 617 + }, + { + "epoch": 0.7933247753530167, + "grad_norm": 4.525811672210693, + "learning_rate": 2.3761232349165597e-06, + "loss": 1.2918, + "step": 618 + }, + { + "epoch": 0.7946084724005135, + "grad_norm": 2.5861072540283203, + "learning_rate": 2.37997432605905e-06, + "loss": 1.2954, + "step": 619 + }, + { + "epoch": 0.7958921694480102, + "grad_norm": 3.4657814502716064, + "learning_rate": 2.38382541720154e-06, + "loss": 1.3074, + "step": 620 + }, + { + "epoch": 0.7971758664955071, + "grad_norm": 3.8747518062591553, + "learning_rate": 2.3876765083440313e-06, + "loss": 1.2822, + "step": 621 + }, + { + "epoch": 0.7984595635430038, + "grad_norm": 3.10097599029541, + "learning_rate": 2.3915275994865215e-06, + "loss": 1.2972, + "step": 622 + }, + { + "epoch": 0.7997432605905006, + "grad_norm": 2.387160539627075, + "learning_rate": 2.3953786906290117e-06, + "loss": 1.3136, + "step": 623 + }, + { + "epoch": 0.8010269576379975, + "grad_norm": 4.623098850250244, + "learning_rate": 2.399229781771502e-06, + "loss": 1.2909, + "step": 624 + }, + { + "epoch": 0.8023106546854942, + "grad_norm": 2.40871000289917, + "learning_rate": 2.4030808729139926e-06, + "loss": 1.3008, + "step": 625 + }, + { + "epoch": 0.803594351732991, + "grad_norm": 9.528878211975098, + "learning_rate": 2.406931964056483e-06, + "loss": 
1.3564, + "step": 626 + }, + { + "epoch": 0.8048780487804879, + "grad_norm": 3.270463705062866, + "learning_rate": 2.410783055198973e-06, + "loss": 1.3483, + "step": 627 + }, + { + "epoch": 0.8061617458279846, + "grad_norm": 2.827291965484619, + "learning_rate": 2.4146341463414637e-06, + "loss": 1.2888, + "step": 628 + }, + { + "epoch": 0.8074454428754814, + "grad_norm": 2.2113242149353027, + "learning_rate": 2.418485237483954e-06, + "loss": 1.3113, + "step": 629 + }, + { + "epoch": 0.8087291399229781, + "grad_norm": 6.040677070617676, + "learning_rate": 2.422336328626444e-06, + "loss": 1.313, + "step": 630 + }, + { + "epoch": 0.810012836970475, + "grad_norm": 3.1061787605285645, + "learning_rate": 2.4261874197689348e-06, + "loss": 1.3235, + "step": 631 + }, + { + "epoch": 0.8112965340179717, + "grad_norm": 2.512805461883545, + "learning_rate": 2.430038510911425e-06, + "loss": 1.2876, + "step": 632 + }, + { + "epoch": 0.8125802310654685, + "grad_norm": 2.4858150482177734, + "learning_rate": 2.4338896020539152e-06, + "loss": 1.2482, + "step": 633 + }, + { + "epoch": 0.8138639281129654, + "grad_norm": 5.052182197570801, + "learning_rate": 2.437740693196406e-06, + "loss": 1.394, + "step": 634 + }, + { + "epoch": 0.8151476251604621, + "grad_norm": 3.152029037475586, + "learning_rate": 2.441591784338896e-06, + "loss": 1.3356, + "step": 635 + }, + { + "epoch": 0.8164313222079589, + "grad_norm": 2.3317408561706543, + "learning_rate": 2.4454428754813863e-06, + "loss": 1.3112, + "step": 636 + }, + { + "epoch": 0.8177150192554558, + "grad_norm": 3.6725807189941406, + "learning_rate": 2.4492939666238765e-06, + "loss": 1.3563, + "step": 637 + }, + { + "epoch": 0.8189987163029525, + "grad_norm": 4.382804870605469, + "learning_rate": 2.453145057766367e-06, + "loss": 1.3549, + "step": 638 + }, + { + "epoch": 0.8202824133504493, + "grad_norm": 4.263907432556152, + "learning_rate": 2.4569961489088574e-06, + "loss": 1.3502, + "step": 639 + }, + { + "epoch": 0.8215661103979461, + 
"grad_norm": 4.445984363555908, + "learning_rate": 2.4608472400513476e-06, + "loss": 1.3659, + "step": 640 + }, + { + "epoch": 0.8228498074454429, + "grad_norm": 3.138104200363159, + "learning_rate": 2.4646983311938383e-06, + "loss": 1.3683, + "step": 641 + }, + { + "epoch": 0.8241335044929397, + "grad_norm": 3.1025660037994385, + "learning_rate": 2.4685494223363285e-06, + "loss": 1.3664, + "step": 642 + }, + { + "epoch": 0.8254172015404364, + "grad_norm": 3.978451728820801, + "learning_rate": 2.472400513478819e-06, + "loss": 1.3964, + "step": 643 + }, + { + "epoch": 0.8267008985879333, + "grad_norm": 3.4007551670074463, + "learning_rate": 2.4762516046213098e-06, + "loss": 1.3749, + "step": 644 + }, + { + "epoch": 0.82798459563543, + "grad_norm": 4.44425106048584, + "learning_rate": 2.4801026957638e-06, + "loss": 1.4047, + "step": 645 + }, + { + "epoch": 0.8292682926829268, + "grad_norm": 8.605108261108398, + "learning_rate": 2.4839537869062902e-06, + "loss": 1.3577, + "step": 646 + }, + { + "epoch": 0.8305519897304237, + "grad_norm": 3.1253855228424072, + "learning_rate": 2.487804878048781e-06, + "loss": 1.4316, + "step": 647 + }, + { + "epoch": 0.8318356867779204, + "grad_norm": 3.6354339122772217, + "learning_rate": 2.491655969191271e-06, + "loss": 1.5113, + "step": 648 + }, + { + "epoch": 0.8331193838254172, + "grad_norm": 6.2281575202941895, + "learning_rate": 2.4955070603337613e-06, + "loss": 1.6287, + "step": 649 + }, + { + "epoch": 0.834403080872914, + "grad_norm": 39.96592330932617, + "learning_rate": 2.4993581514762516e-06, + "loss": 1.7126, + "step": 650 + }, + { + "epoch": 0.8356867779204108, + "grad_norm": 4.45025110244751, + "learning_rate": 2.503209242618742e-06, + "loss": 1.2188, + "step": 651 + }, + { + "epoch": 0.8369704749679076, + "grad_norm": 3.0908868312835693, + "learning_rate": 2.5070603337612324e-06, + "loss": 1.1751, + "step": 652 + }, + { + "epoch": 0.8382541720154044, + "grad_norm": 1.9767757654190063, + "learning_rate": 
2.5109114249037226e-06, + "loss": 1.2073, + "step": 653 + }, + { + "epoch": 0.8395378690629012, + "grad_norm": 2.1471564769744873, + "learning_rate": 2.5147625160462133e-06, + "loss": 1.1841, + "step": 654 + }, + { + "epoch": 0.8408215661103979, + "grad_norm": 2.592188835144043, + "learning_rate": 2.5186136071887035e-06, + "loss": 1.1905, + "step": 655 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 5.435050010681152, + "learning_rate": 2.5224646983311937e-06, + "loss": 1.1627, + "step": 656 + }, + { + "epoch": 0.8433889602053916, + "grad_norm": 2.148787021636963, + "learning_rate": 2.5263157894736844e-06, + "loss": 1.1986, + "step": 657 + }, + { + "epoch": 0.8446726572528883, + "grad_norm": 2.3573200702667236, + "learning_rate": 2.5301668806161746e-06, + "loss": 1.232, + "step": 658 + }, + { + "epoch": 0.8459563543003851, + "grad_norm": 2.7651846408843994, + "learning_rate": 2.534017971758665e-06, + "loss": 1.1834, + "step": 659 + }, + { + "epoch": 0.8472400513478819, + "grad_norm": 2.5072567462921143, + "learning_rate": 2.5378690629011555e-06, + "loss": 1.2523, + "step": 660 + }, + { + "epoch": 0.8485237483953787, + "grad_norm": 2.86497163772583, + "learning_rate": 2.5417201540436457e-06, + "loss": 1.2371, + "step": 661 + }, + { + "epoch": 0.8498074454428755, + "grad_norm": 2.229668617248535, + "learning_rate": 2.545571245186136e-06, + "loss": 1.2374, + "step": 662 + }, + { + "epoch": 0.8510911424903723, + "grad_norm": 1.8919779062271118, + "learning_rate": 2.549422336328626e-06, + "loss": 1.1948, + "step": 663 + }, + { + "epoch": 0.8523748395378691, + "grad_norm": 2.695183515548706, + "learning_rate": 2.553273427471117e-06, + "loss": 1.1998, + "step": 664 + }, + { + "epoch": 0.8536585365853658, + "grad_norm": 2.775907039642334, + "learning_rate": 2.557124518613607e-06, + "loss": 1.2517, + "step": 665 + }, + { + "epoch": 0.8549422336328626, + "grad_norm": 2.02734112739563, + "learning_rate": 2.5609756097560977e-06, + "loss": 1.1826, + "step": 666 + }, + { 
+ "epoch": 0.8562259306803595, + "grad_norm": 2.040088415145874, + "learning_rate": 2.5648267008985883e-06, + "loss": 1.174, + "step": 667 + }, + { + "epoch": 0.8575096277278562, + "grad_norm": 2.055387496948242, + "learning_rate": 2.5686777920410785e-06, + "loss": 1.2602, + "step": 668 + }, + { + "epoch": 0.858793324775353, + "grad_norm": 2.205340623855591, + "learning_rate": 2.5725288831835688e-06, + "loss": 1.2483, + "step": 669 + }, + { + "epoch": 0.8600770218228498, + "grad_norm": 2.386551856994629, + "learning_rate": 2.5763799743260594e-06, + "loss": 1.2554, + "step": 670 + }, + { + "epoch": 0.8613607188703466, + "grad_norm": 2.3689513206481934, + "learning_rate": 2.5802310654685496e-06, + "loss": 1.2235, + "step": 671 + }, + { + "epoch": 0.8626444159178434, + "grad_norm": 2.044184446334839, + "learning_rate": 2.58408215661104e-06, + "loss": 1.2382, + "step": 672 + }, + { + "epoch": 0.8639281129653402, + "grad_norm": 2.666015148162842, + "learning_rate": 2.5879332477535305e-06, + "loss": 1.2294, + "step": 673 + }, + { + "epoch": 0.865211810012837, + "grad_norm": 3.4302542209625244, + "learning_rate": 2.5917843388960207e-06, + "loss": 1.1741, + "step": 674 + }, + { + "epoch": 0.8664955070603337, + "grad_norm": 1.8409051895141602, + "learning_rate": 2.595635430038511e-06, + "loss": 1.1619, + "step": 675 + }, + { + "epoch": 0.8677792041078306, + "grad_norm": 4.391714572906494, + "learning_rate": 2.599486521181001e-06, + "loss": 1.2406, + "step": 676 + }, + { + "epoch": 0.8690629011553274, + "grad_norm": 1.9761794805526733, + "learning_rate": 2.603337612323492e-06, + "loss": 1.2189, + "step": 677 + }, + { + "epoch": 0.8703465982028241, + "grad_norm": 2.591115713119507, + "learning_rate": 2.607188703465982e-06, + "loss": 1.2587, + "step": 678 + }, + { + "epoch": 0.8716302952503209, + "grad_norm": 3.524367332458496, + "learning_rate": 2.6110397946084723e-06, + "loss": 1.2001, + "step": 679 + }, + { + "epoch": 0.8729139922978177, + "grad_norm": 4.937044143676758, + 
"learning_rate": 2.614890885750963e-06, + "loss": 1.2209, + "step": 680 + }, + { + "epoch": 0.8741976893453145, + "grad_norm": 1.9175267219543457, + "learning_rate": 2.618741976893453e-06, + "loss": 1.2443, + "step": 681 + }, + { + "epoch": 0.8754813863928113, + "grad_norm": 2.2159852981567383, + "learning_rate": 2.6225930680359434e-06, + "loss": 1.1438, + "step": 682 + }, + { + "epoch": 0.8767650834403081, + "grad_norm": 5.112175464630127, + "learning_rate": 2.626444159178434e-06, + "loss": 1.2548, + "step": 683 + }, + { + "epoch": 0.8780487804878049, + "grad_norm": 4.436075210571289, + "learning_rate": 2.6302952503209242e-06, + "loss": 1.2288, + "step": 684 + }, + { + "epoch": 0.8793324775353016, + "grad_norm": 4.58530855178833, + "learning_rate": 2.6341463414634145e-06, + "loss": 1.2654, + "step": 685 + }, + { + "epoch": 0.8806161745827985, + "grad_norm": 3.6816768646240234, + "learning_rate": 2.637997432605905e-06, + "loss": 1.2501, + "step": 686 + }, + { + "epoch": 0.8818998716302953, + "grad_norm": 2.5975704193115234, + "learning_rate": 2.6418485237483953e-06, + "loss": 1.2141, + "step": 687 + }, + { + "epoch": 0.883183568677792, + "grad_norm": 2.9636318683624268, + "learning_rate": 2.645699614890886e-06, + "loss": 1.267, + "step": 688 + }, + { + "epoch": 0.8844672657252889, + "grad_norm": 3.296534776687622, + "learning_rate": 2.649550706033376e-06, + "loss": 1.2332, + "step": 689 + }, + { + "epoch": 0.8857509627727856, + "grad_norm": 4.089988708496094, + "learning_rate": 2.653401797175867e-06, + "loss": 1.2587, + "step": 690 + }, + { + "epoch": 0.8870346598202824, + "grad_norm": 3.2577221393585205, + "learning_rate": 2.657252888318357e-06, + "loss": 1.3133, + "step": 691 + }, + { + "epoch": 0.8883183568677792, + "grad_norm": 3.444378614425659, + "learning_rate": 2.6611039794608473e-06, + "loss": 1.2814, + "step": 692 + }, + { + "epoch": 0.889602053915276, + "grad_norm": 2.2925021648406982, + "learning_rate": 2.664955070603338e-06, + "loss": 1.2988, + "step": 
693 + }, + { + "epoch": 0.8908857509627728, + "grad_norm": 4.229244709014893, + "learning_rate": 2.668806161745828e-06, + "loss": 1.3282, + "step": 694 + }, + { + "epoch": 0.8921694480102695, + "grad_norm": 3.8736414909362793, + "learning_rate": 2.6726572528883184e-06, + "loss": 1.3179, + "step": 695 + }, + { + "epoch": 0.8934531450577664, + "grad_norm": 5.3678059577941895, + "learning_rate": 2.676508344030809e-06, + "loss": 1.2987, + "step": 696 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 2.962611436843872, + "learning_rate": 2.6803594351732993e-06, + "loss": 1.3377, + "step": 697 + }, + { + "epoch": 0.8960205391527599, + "grad_norm": 3.021817922592163, + "learning_rate": 2.6842105263157895e-06, + "loss": 1.4271, + "step": 698 + }, + { + "epoch": 0.8973042362002568, + "grad_norm": 39.27174377441406, + "learning_rate": 2.68806161745828e-06, + "loss": 1.4776, + "step": 699 + }, + { + "epoch": 0.8985879332477535, + "grad_norm": 5.803427696228027, + "learning_rate": 2.6919127086007703e-06, + "loss": 1.6155, + "step": 700 + }, + { + "epoch": 0.8998716302952503, + "grad_norm": 3.878854274749756, + "learning_rate": 2.6957637997432606e-06, + "loss": 1.2056, + "step": 701 + }, + { + "epoch": 0.9011553273427471, + "grad_norm": 2.5748848915100098, + "learning_rate": 2.699614890885751e-06, + "loss": 1.1039, + "step": 702 + }, + { + "epoch": 0.9024390243902439, + "grad_norm": 1.8553025722503662, + "learning_rate": 2.7034659820282414e-06, + "loss": 1.1148, + "step": 703 + }, + { + "epoch": 0.9037227214377407, + "grad_norm": 2.528930425643921, + "learning_rate": 2.7073170731707317e-06, + "loss": 1.1541, + "step": 704 + }, + { + "epoch": 0.9050064184852374, + "grad_norm": 3.022494077682495, + "learning_rate": 2.711168164313222e-06, + "loss": 1.1517, + "step": 705 + }, + { + "epoch": 0.9062901155327343, + "grad_norm": 2.7120797634124756, + "learning_rate": 2.7150192554557125e-06, + "loss": 1.0908, + "step": 706 + }, + { + "epoch": 0.9075738125802311, + "grad_norm": 
1.9322340488433838, + "learning_rate": 2.7188703465982028e-06, + "loss": 1.1083, + "step": 707 + }, + { + "epoch": 0.9088575096277278, + "grad_norm": 3.3037919998168945, + "learning_rate": 2.722721437740693e-06, + "loss": 1.1261, + "step": 708 + }, + { + "epoch": 0.9101412066752247, + "grad_norm": 1.8992400169372559, + "learning_rate": 2.7265725288831836e-06, + "loss": 1.095, + "step": 709 + }, + { + "epoch": 0.9114249037227214, + "grad_norm": 2.123845100402832, + "learning_rate": 2.730423620025674e-06, + "loss": 1.1329, + "step": 710 + }, + { + "epoch": 0.9127086007702182, + "grad_norm": 2.334078073501587, + "learning_rate": 2.7342747111681645e-06, + "loss": 1.1566, + "step": 711 + }, + { + "epoch": 0.9139922978177151, + "grad_norm": 2.499143123626709, + "learning_rate": 2.7381258023106547e-06, + "loss": 1.1352, + "step": 712 + }, + { + "epoch": 0.9152759948652118, + "grad_norm": 1.8319984674453735, + "learning_rate": 2.7419768934531454e-06, + "loss": 1.1172, + "step": 713 + }, + { + "epoch": 0.9165596919127086, + "grad_norm": 2.322758197784424, + "learning_rate": 2.7458279845956356e-06, + "loss": 1.1403, + "step": 714 + }, + { + "epoch": 0.9178433889602053, + "grad_norm": 3.1399283409118652, + "learning_rate": 2.749679075738126e-06, + "loss": 1.0694, + "step": 715 + }, + { + "epoch": 0.9191270860077022, + "grad_norm": 4.744444370269775, + "learning_rate": 2.7535301668806165e-06, + "loss": 1.1247, + "step": 716 + }, + { + "epoch": 0.920410783055199, + "grad_norm": 2.4238271713256836, + "learning_rate": 2.7573812580231067e-06, + "loss": 1.1361, + "step": 717 + }, + { + "epoch": 0.9216944801026957, + "grad_norm": 2.204378366470337, + "learning_rate": 2.761232349165597e-06, + "loss": 1.1232, + "step": 718 + }, + { + "epoch": 0.9229781771501926, + "grad_norm": 2.8266103267669678, + "learning_rate": 2.7650834403080876e-06, + "loss": 1.1686, + "step": 719 + }, + { + "epoch": 0.9242618741976893, + "grad_norm": 2.879094123840332, + "learning_rate": 2.7689345314505778e-06, 
+ "loss": 1.1269, + "step": 720 + }, + { + "epoch": 0.9255455712451861, + "grad_norm": 10.152116775512695, + "learning_rate": 2.772785622593068e-06, + "loss": 1.1009, + "step": 721 + }, + { + "epoch": 0.926829268292683, + "grad_norm": 1.9774150848388672, + "learning_rate": 2.7766367137355586e-06, + "loss": 1.0895, + "step": 722 + }, + { + "epoch": 0.9281129653401797, + "grad_norm": 2.1980035305023193, + "learning_rate": 2.780487804878049e-06, + "loss": 1.0937, + "step": 723 + }, + { + "epoch": 0.9293966623876765, + "grad_norm": 2.511557102203369, + "learning_rate": 2.784338896020539e-06, + "loss": 1.12, + "step": 724 + }, + { + "epoch": 0.9306803594351734, + "grad_norm": 2.131187915802002, + "learning_rate": 2.7881899871630297e-06, + "loss": 1.1231, + "step": 725 + }, + { + "epoch": 0.9319640564826701, + "grad_norm": 1.919689416885376, + "learning_rate": 2.79204107830552e-06, + "loss": 1.1233, + "step": 726 + }, + { + "epoch": 0.9332477535301669, + "grad_norm": 3.5613670349121094, + "learning_rate": 2.79589216944801e-06, + "loss": 1.1041, + "step": 727 + }, + { + "epoch": 0.9345314505776636, + "grad_norm": 2.112295389175415, + "learning_rate": 2.7997432605905004e-06, + "loss": 1.1265, + "step": 728 + }, + { + "epoch": 0.9358151476251605, + "grad_norm": 2.9397647380828857, + "learning_rate": 2.803594351732991e-06, + "loss": 1.1094, + "step": 729 + }, + { + "epoch": 0.9370988446726572, + "grad_norm": 2.175203561782837, + "learning_rate": 2.8074454428754813e-06, + "loss": 1.1371, + "step": 730 + }, + { + "epoch": 0.938382541720154, + "grad_norm": 2.10927677154541, + "learning_rate": 2.8112965340179715e-06, + "loss": 1.1994, + "step": 731 + }, + { + "epoch": 0.9396662387676509, + "grad_norm": 2.734478235244751, + "learning_rate": 2.815147625160462e-06, + "loss": 1.0914, + "step": 732 + }, + { + "epoch": 0.9409499358151476, + "grad_norm": 3.1416430473327637, + "learning_rate": 2.818998716302953e-06, + "loss": 1.135, + "step": 733 + }, + { + "epoch": 0.9422336328626444, 
+ "grad_norm": 7.097620964050293, + "learning_rate": 2.822849807445443e-06, + "loss": 1.1342, + "step": 734 + }, + { + "epoch": 0.9435173299101413, + "grad_norm": 2.629666805267334, + "learning_rate": 2.8267008985879337e-06, + "loss": 1.0979, + "step": 735 + }, + { + "epoch": 0.944801026957638, + "grad_norm": 2.9584078788757324, + "learning_rate": 2.830551989730424e-06, + "loss": 1.1983, + "step": 736 + }, + { + "epoch": 0.9460847240051348, + "grad_norm": 11.442934036254883, + "learning_rate": 2.834403080872914e-06, + "loss": 1.1906, + "step": 737 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 2.4185807704925537, + "learning_rate": 2.8382541720154043e-06, + "loss": 1.19, + "step": 738 + }, + { + "epoch": 0.9486521181001284, + "grad_norm": 5.49472713470459, + "learning_rate": 2.842105263157895e-06, + "loss": 1.1725, + "step": 739 + }, + { + "epoch": 0.9499358151476252, + "grad_norm": 2.836677312850952, + "learning_rate": 2.845956354300385e-06, + "loss": 1.2212, + "step": 740 + }, + { + "epoch": 0.9512195121951219, + "grad_norm": 3.7951269149780273, + "learning_rate": 2.8498074454428754e-06, + "loss": 1.2059, + "step": 741 + }, + { + "epoch": 0.9525032092426188, + "grad_norm": 2.89170241355896, + "learning_rate": 2.853658536585366e-06, + "loss": 1.1757, + "step": 742 + }, + { + "epoch": 0.9537869062901155, + "grad_norm": 6.169154167175293, + "learning_rate": 2.8575096277278563e-06, + "loss": 1.2162, + "step": 743 + }, + { + "epoch": 0.9550706033376123, + "grad_norm": 3.6270034313201904, + "learning_rate": 2.8613607188703465e-06, + "loss": 1.2494, + "step": 744 + }, + { + "epoch": 0.9563543003851092, + "grad_norm": 3.4046809673309326, + "learning_rate": 2.865211810012837e-06, + "loss": 1.2439, + "step": 745 + }, + { + "epoch": 0.9576379974326059, + "grad_norm": 5.134783744812012, + "learning_rate": 2.8690629011553274e-06, + "loss": 1.2545, + "step": 746 + }, + { + "epoch": 0.9589216944801027, + "grad_norm": 4.775987148284912, + "learning_rate": 
2.8729139922978176e-06, + "loss": 1.2757, + "step": 747 + }, + { + "epoch": 0.9602053915275995, + "grad_norm": 3.1813106536865234, + "learning_rate": 2.8767650834403083e-06, + "loss": 1.2738, + "step": 748 + }, + { + "epoch": 0.9614890885750963, + "grad_norm": 4.960154056549072, + "learning_rate": 2.8806161745827985e-06, + "loss": 1.3122, + "step": 749 + }, + { + "epoch": 0.962772785622593, + "grad_norm": 10.278210639953613, + "learning_rate": 2.8844672657252887e-06, + "loss": 1.6104, + "step": 750 + }, + { + "epoch": 0.9640564826700898, + "grad_norm": 4.428796768188477, + "learning_rate": 2.888318356867779e-06, + "loss": 1.1069, + "step": 751 + }, + { + "epoch": 0.9653401797175867, + "grad_norm": 3.311002254486084, + "learning_rate": 2.8921694480102696e-06, + "loss": 1.0833, + "step": 752 + }, + { + "epoch": 0.9666238767650834, + "grad_norm": 1.4702825546264648, + "learning_rate": 2.89602053915276e-06, + "loss": 1.0082, + "step": 753 + }, + { + "epoch": 0.9679075738125802, + "grad_norm": 2.1273207664489746, + "learning_rate": 2.89987163029525e-06, + "loss": 1.0611, + "step": 754 + }, + { + "epoch": 0.9691912708600771, + "grad_norm": 1.6354931592941284, + "learning_rate": 2.9037227214377407e-06, + "loss": 1.0834, + "step": 755 + }, + { + "epoch": 0.9704749679075738, + "grad_norm": 2.445065975189209, + "learning_rate": 2.9075738125802313e-06, + "loss": 1.0201, + "step": 756 + }, + { + "epoch": 0.9717586649550706, + "grad_norm": 1.6229379177093506, + "learning_rate": 2.9114249037227215e-06, + "loss": 1.1139, + "step": 757 + }, + { + "epoch": 0.9730423620025674, + "grad_norm": 2.1692512035369873, + "learning_rate": 2.915275994865212e-06, + "loss": 1.0728, + "step": 758 + }, + { + "epoch": 0.9743260590500642, + "grad_norm": 4.721465587615967, + "learning_rate": 2.9191270860077024e-06, + "loss": 1.0825, + "step": 759 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 2.579840660095215, + "learning_rate": 2.9229781771501926e-06, + "loss": 1.0488, + "step": 760 + }, + 
{ + "epoch": 0.9768934531450578, + "grad_norm": 2.1408536434173584, + "learning_rate": 2.9268292682926833e-06, + "loss": 1.05, + "step": 761 + }, + { + "epoch": 0.9781771501925546, + "grad_norm": 1.6272233724594116, + "learning_rate": 2.9306803594351735e-06, + "loss": 1.0646, + "step": 762 + }, + { + "epoch": 0.9794608472400513, + "grad_norm": 2.1834299564361572, + "learning_rate": 2.9345314505776637e-06, + "loss": 1.0716, + "step": 763 + }, + { + "epoch": 0.9807445442875481, + "grad_norm": 1.7571799755096436, + "learning_rate": 2.938382541720154e-06, + "loss": 1.0445, + "step": 764 + }, + { + "epoch": 0.982028241335045, + "grad_norm": 2.9753291606903076, + "learning_rate": 2.9422336328626446e-06, + "loss": 1.0962, + "step": 765 + }, + { + "epoch": 0.9833119383825417, + "grad_norm": 2.863264322280884, + "learning_rate": 2.946084724005135e-06, + "loss": 1.1083, + "step": 766 + }, + { + "epoch": 0.9845956354300385, + "grad_norm": 2.2779879570007324, + "learning_rate": 2.949935815147625e-06, + "loss": 1.0919, + "step": 767 + }, + { + "epoch": 0.9858793324775353, + "grad_norm": 5.992298126220703, + "learning_rate": 2.9537869062901157e-06, + "loss": 1.1284, + "step": 768 + }, + { + "epoch": 0.9871630295250321, + "grad_norm": 2.6041667461395264, + "learning_rate": 2.957637997432606e-06, + "loss": 1.0685, + "step": 769 + }, + { + "epoch": 0.9884467265725289, + "grad_norm": 2.71285343170166, + "learning_rate": 2.961489088575096e-06, + "loss": 1.1177, + "step": 770 + }, + { + "epoch": 0.9897304236200257, + "grad_norm": 1.8670170307159424, + "learning_rate": 2.965340179717587e-06, + "loss": 1.1479, + "step": 771 + }, + { + "epoch": 0.9910141206675225, + "grad_norm": 2.4949331283569336, + "learning_rate": 2.969191270860077e-06, + "loss": 1.1408, + "step": 772 + }, + { + "epoch": 0.9922978177150192, + "grad_norm": 2.907778739929199, + "learning_rate": 2.9730423620025672e-06, + "loss": 1.0984, + "step": 773 + }, + { + "epoch": 0.993581514762516, + "grad_norm": 
2.7032947540283203, + "learning_rate": 2.976893453145058e-06, + "loss": 1.1349, + "step": 774 + }, + { + "epoch": 0.9948652118100129, + "grad_norm": 2.3530263900756836, + "learning_rate": 2.980744544287548e-06, + "loss": 1.185, + "step": 775 + }, + { + "epoch": 0.9961489088575096, + "grad_norm": 3.410780191421509, + "learning_rate": 2.9845956354300383e-06, + "loss": 1.1654, + "step": 776 + }, + { + "epoch": 0.9974326059050064, + "grad_norm": 11.987981796264648, + "learning_rate": 2.9884467265725286e-06, + "loss": 1.249, + "step": 777 + }, + { + "epoch": 0.9987163029525032, + "grad_norm": 5.194775104522705, + "learning_rate": 2.9922978177150196e-06, + "loss": 1.2917, + "step": 778 + }, + { + "epoch": 1.0, + "grad_norm": 4.765862941741943, + "learning_rate": 2.99614890885751e-06, + "loss": 1.619, + "step": 779 + }, + { + "epoch": 1.0012836970474968, + "grad_norm": 4.237629413604736, + "learning_rate": 3e-06, + "loss": 1.1059, + "step": 780 + }, + { + "epoch": 1.0025673940949935, + "grad_norm": 2.0991628170013428, + "learning_rate": 3.0038510911424907e-06, + "loss": 1.0186, + "step": 781 + }, + { + "epoch": 1.0038510911424903, + "grad_norm": 1.6874533891677856, + "learning_rate": 3.007702182284981e-06, + "loss": 0.997, + "step": 782 + }, + { + "epoch": 1.0051347881899872, + "grad_norm": 1.6014307737350464, + "learning_rate": 3.011553273427471e-06, + "loss": 1.0264, + "step": 783 + }, + { + "epoch": 1.006418485237484, + "grad_norm": 2.4277760982513428, + "learning_rate": 3.015404364569962e-06, + "loss": 1.0238, + "step": 784 + }, + { + "epoch": 1.0077021822849808, + "grad_norm": 2.440762519836426, + "learning_rate": 3.019255455712452e-06, + "loss": 1.0239, + "step": 785 + }, + { + "epoch": 1.0089858793324775, + "grad_norm": 2.454963445663452, + "learning_rate": 3.0231065468549423e-06, + "loss": 0.9996, + "step": 786 + }, + { + "epoch": 1.0102695763799743, + "grad_norm": 2.496720790863037, + "learning_rate": 3.026957637997433e-06, + "loss": 0.9519, + "step": 787 + }, + 
{ + "epoch": 1.011553273427471, + "grad_norm": 1.6505825519561768, + "learning_rate": 3.030808729139923e-06, + "loss": 0.9667, + "step": 788 + }, + { + "epoch": 1.012836970474968, + "grad_norm": 1.6889688968658447, + "learning_rate": 3.0346598202824134e-06, + "loss": 1.0254, + "step": 789 + }, + { + "epoch": 1.0141206675224648, + "grad_norm": 2.112671375274658, + "learning_rate": 3.0385109114249036e-06, + "loss": 1.0803, + "step": 790 + }, + { + "epoch": 1.0154043645699615, + "grad_norm": 2.2596631050109863, + "learning_rate": 3.0423620025673942e-06, + "loss": 1.0608, + "step": 791 + }, + { + "epoch": 1.0166880616174583, + "grad_norm": 1.7965775728225708, + "learning_rate": 3.0462130937098845e-06, + "loss": 1.0057, + "step": 792 + }, + { + "epoch": 1.017971758664955, + "grad_norm": 1.8527593612670898, + "learning_rate": 3.0500641848523747e-06, + "loss": 1.0299, + "step": 793 + }, + { + "epoch": 1.0192554557124518, + "grad_norm": 3.4219818115234375, + "learning_rate": 3.0539152759948653e-06, + "loss": 1.0744, + "step": 794 + }, + { + "epoch": 1.0205391527599486, + "grad_norm": 1.769473671913147, + "learning_rate": 3.0577663671373555e-06, + "loss": 1.016, + "step": 795 + }, + { + "epoch": 1.0218228498074455, + "grad_norm": 2.510594129562378, + "learning_rate": 3.0616174582798458e-06, + "loss": 1.0, + "step": 796 + }, + { + "epoch": 1.0231065468549423, + "grad_norm": 4.940460681915283, + "learning_rate": 3.0654685494223364e-06, + "loss": 1.0283, + "step": 797 + }, + { + "epoch": 1.024390243902439, + "grad_norm": 2.0861597061157227, + "learning_rate": 3.0693196405648266e-06, + "loss": 1.041, + "step": 798 + }, + { + "epoch": 1.0256739409499358, + "grad_norm": 2.1998379230499268, + "learning_rate": 3.073170731707317e-06, + "loss": 0.9919, + "step": 799 + }, + { + "epoch": 1.0269576379974326, + "grad_norm": 2.7433533668518066, + "learning_rate": 3.0770218228498075e-06, + "loss": 0.9891, + "step": 800 + }, + { + "epoch": 1.0282413350449293, + "grad_norm": 
2.1111090183258057, + "learning_rate": 3.080872913992298e-06, + "loss": 1.0639, + "step": 801 + }, + { + "epoch": 1.0295250320924263, + "grad_norm": 4.122688293457031, + "learning_rate": 3.0847240051347884e-06, + "loss": 1.0682, + "step": 802 + }, + { + "epoch": 1.030808729139923, + "grad_norm": 2.231724262237549, + "learning_rate": 3.0885750962772786e-06, + "loss": 1.053, + "step": 803 + }, + { + "epoch": 1.0320924261874198, + "grad_norm": 2.432349681854248, + "learning_rate": 3.0924261874197692e-06, + "loss": 1.0643, + "step": 804 + }, + { + "epoch": 1.0333761232349166, + "grad_norm": 1.8849841356277466, + "learning_rate": 3.0962772785622595e-06, + "loss": 1.042, + "step": 805 + }, + { + "epoch": 1.0346598202824133, + "grad_norm": 2.1302168369293213, + "learning_rate": 3.1001283697047497e-06, + "loss": 1.0477, + "step": 806 + }, + { + "epoch": 1.03594351732991, + "grad_norm": 3.381016254425049, + "learning_rate": 3.1039794608472403e-06, + "loss": 1.0813, + "step": 807 + }, + { + "epoch": 1.0372272143774068, + "grad_norm": 14.411662101745605, + "learning_rate": 3.1078305519897306e-06, + "loss": 1.0849, + "step": 808 + }, + { + "epoch": 1.0385109114249038, + "grad_norm": 2.894345283508301, + "learning_rate": 3.111681643132221e-06, + "loss": 1.097, + "step": 809 + }, + { + "epoch": 1.0397946084724006, + "grad_norm": 2.540680408477783, + "learning_rate": 3.1155327342747114e-06, + "loss": 1.1281, + "step": 810 + }, + { + "epoch": 1.0410783055198973, + "grad_norm": 2.9149510860443115, + "learning_rate": 3.1193838254172017e-06, + "loss": 1.026, + "step": 811 + }, + { + "epoch": 1.042362002567394, + "grad_norm": 1.8980350494384766, + "learning_rate": 3.123234916559692e-06, + "loss": 1.0134, + "step": 812 + }, + { + "epoch": 1.0436456996148908, + "grad_norm": 3.9234087467193604, + "learning_rate": 3.1270860077021825e-06, + "loss": 1.1381, + "step": 813 + }, + { + "epoch": 1.0449293966623876, + "grad_norm": 3.1239895820617676, + "learning_rate": 3.1309370988446728e-06, + 
"loss": 1.0768, + "step": 814 + }, + { + "epoch": 1.0462130937098846, + "grad_norm": 2.388974189758301, + "learning_rate": 3.134788189987163e-06, + "loss": 1.1166, + "step": 815 + }, + { + "epoch": 1.0474967907573813, + "grad_norm": 2.219332218170166, + "learning_rate": 3.138639281129653e-06, + "loss": 1.1203, + "step": 816 + }, + { + "epoch": 1.048780487804878, + "grad_norm": 2.5492208003997803, + "learning_rate": 3.142490372272144e-06, + "loss": 1.094, + "step": 817 + }, + { + "epoch": 1.0500641848523748, + "grad_norm": 2.104959726333618, + "learning_rate": 3.146341463414634e-06, + "loss": 1.1107, + "step": 818 + }, + { + "epoch": 1.0513478818998716, + "grad_norm": 2.9152565002441406, + "learning_rate": 3.1501925545571243e-06, + "loss": 1.0573, + "step": 819 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 3.079782485961914, + "learning_rate": 3.154043645699615e-06, + "loss": 1.1473, + "step": 820 + }, + { + "epoch": 1.0539152759948651, + "grad_norm": 3.2895939350128174, + "learning_rate": 3.157894736842105e-06, + "loss": 1.1733, + "step": 821 + }, + { + "epoch": 1.055198973042362, + "grad_norm": 8.116179466247559, + "learning_rate": 3.1617458279845954e-06, + "loss": 1.1415, + "step": 822 + }, + { + "epoch": 1.0564826700898589, + "grad_norm": 4.385889053344727, + "learning_rate": 3.1655969191270865e-06, + "loss": 1.1163, + "step": 823 + }, + { + "epoch": 1.0577663671373556, + "grad_norm": 3.568401336669922, + "learning_rate": 3.1694480102695767e-06, + "loss": 1.129, + "step": 824 + }, + { + "epoch": 1.0590500641848524, + "grad_norm": 3.434136152267456, + "learning_rate": 3.173299101412067e-06, + "loss": 1.1859, + "step": 825 + }, + { + "epoch": 1.0603337612323491, + "grad_norm": 4.9043869972229, + "learning_rate": 3.1771501925545576e-06, + "loss": 1.2473, + "step": 826 + }, + { + "epoch": 1.0616174582798459, + "grad_norm": 7.793929100036621, + "learning_rate": 3.1810012836970478e-06, + "loss": 1.2497, + "step": 827 + }, + { + "epoch": 1.0629011553273426, + 
"grad_norm": 5.873870849609375, + "learning_rate": 3.184852374839538e-06, + "loss": 1.3709, + "step": 828 + }, + { + "epoch": 1.0641848523748396, + "grad_norm": 10.312554359436035, + "learning_rate": 3.1887034659820282e-06, + "loss": 1.5671, + "step": 829 + }, + { + "epoch": 1.0654685494223364, + "grad_norm": 4.355302810668945, + "learning_rate": 3.192554557124519e-06, + "loss": 1.0365, + "step": 830 + }, + { + "epoch": 1.0667522464698331, + "grad_norm": 2.324286699295044, + "learning_rate": 3.196405648267009e-06, + "loss": 0.9737, + "step": 831 + }, + { + "epoch": 1.0680359435173299, + "grad_norm": 1.4305967092514038, + "learning_rate": 3.2002567394094993e-06, + "loss": 0.9348, + "step": 832 + }, + { + "epoch": 1.0693196405648266, + "grad_norm": 2.3872175216674805, + "learning_rate": 3.20410783055199e-06, + "loss": 1.06, + "step": 833 + }, + { + "epoch": 1.0706033376123234, + "grad_norm": 2.524951219558716, + "learning_rate": 3.20795892169448e-06, + "loss": 0.9624, + "step": 834 + }, + { + "epoch": 1.0718870346598204, + "grad_norm": 1.8948994874954224, + "learning_rate": 3.2118100128369704e-06, + "loss": 1.0073, + "step": 835 + }, + { + "epoch": 1.0731707317073171, + "grad_norm": 1.9453527927398682, + "learning_rate": 3.215661103979461e-06, + "loss": 0.965, + "step": 836 + }, + { + "epoch": 1.074454428754814, + "grad_norm": 2.335667371749878, + "learning_rate": 3.2195121951219513e-06, + "loss": 1.012, + "step": 837 + }, + { + "epoch": 1.0757381258023107, + "grad_norm": 3.0195329189300537, + "learning_rate": 3.2233632862644415e-06, + "loss": 0.9864, + "step": 838 + }, + { + "epoch": 1.0770218228498074, + "grad_norm": 1.8351755142211914, + "learning_rate": 3.227214377406932e-06, + "loss": 0.9886, + "step": 839 + }, + { + "epoch": 1.0783055198973042, + "grad_norm": 2.102322816848755, + "learning_rate": 3.2310654685494224e-06, + "loss": 1.0558, + "step": 840 + }, + { + "epoch": 1.0795892169448011, + "grad_norm": 2.4685165882110596, + "learning_rate": 
3.2349165596919126e-06, + "loss": 1.017, + "step": 841 + }, + { + "epoch": 1.080872913992298, + "grad_norm": 2.3355042934417725, + "learning_rate": 3.238767650834403e-06, + "loss": 0.9801, + "step": 842 + }, + { + "epoch": 1.0821566110397947, + "grad_norm": 1.7647514343261719, + "learning_rate": 3.2426187419768935e-06, + "loss": 1.0116, + "step": 843 + }, + { + "epoch": 1.0834403080872914, + "grad_norm": 1.7245396375656128, + "learning_rate": 3.2464698331193837e-06, + "loss": 1.0218, + "step": 844 + }, + { + "epoch": 1.0847240051347882, + "grad_norm": 3.952817440032959, + "learning_rate": 3.250320924261874e-06, + "loss": 0.9583, + "step": 845 + }, + { + "epoch": 1.086007702182285, + "grad_norm": 2.510554552078247, + "learning_rate": 3.254172015404365e-06, + "loss": 0.9967, + "step": 846 + }, + { + "epoch": 1.0872913992297817, + "grad_norm": 2.006739854812622, + "learning_rate": 3.258023106546855e-06, + "loss": 1.0441, + "step": 847 + }, + { + "epoch": 1.0885750962772787, + "grad_norm": 1.6986362934112549, + "learning_rate": 3.2618741976893454e-06, + "loss": 0.9866, + "step": 848 + }, + { + "epoch": 1.0898587933247754, + "grad_norm": 1.7898545265197754, + "learning_rate": 3.265725288831836e-06, + "loss": 0.9927, + "step": 849 + }, + { + "epoch": 1.0911424903722722, + "grad_norm": 1.6863536834716797, + "learning_rate": 3.2695763799743263e-06, + "loss": 0.9601, + "step": 850 + }, + { + "epoch": 1.092426187419769, + "grad_norm": 3.7058067321777344, + "learning_rate": 3.2734274711168165e-06, + "loss": 0.997, + "step": 851 + }, + { + "epoch": 1.0937098844672657, + "grad_norm": 1.9217292070388794, + "learning_rate": 3.277278562259307e-06, + "loss": 0.9537, + "step": 852 + }, + { + "epoch": 1.0949935815147624, + "grad_norm": 2.0208520889282227, + "learning_rate": 3.2811296534017974e-06, + "loss": 1.0175, + "step": 853 + }, + { + "epoch": 1.0962772785622592, + "grad_norm": 1.775927186012268, + "learning_rate": 3.2849807445442876e-06, + "loss": 1.0402, + "step": 854 + }, + { 
+ "epoch": 1.0975609756097562, + "grad_norm": 1.9880980253219604, + "learning_rate": 3.288831835686778e-06, + "loss": 1.0401, + "step": 855 + }, + { + "epoch": 1.098844672657253, + "grad_norm": 1.9695115089416504, + "learning_rate": 3.2926829268292685e-06, + "loss": 0.9902, + "step": 856 + }, + { + "epoch": 1.1001283697047497, + "grad_norm": 2.5979747772216797, + "learning_rate": 3.2965340179717587e-06, + "loss": 1.0569, + "step": 857 + }, + { + "epoch": 1.1014120667522465, + "grad_norm": 2.327995538711548, + "learning_rate": 3.300385109114249e-06, + "loss": 1.049, + "step": 858 + }, + { + "epoch": 1.1026957637997432, + "grad_norm": 1.6576415300369263, + "learning_rate": 3.3042362002567396e-06, + "loss": 1.0287, + "step": 859 + }, + { + "epoch": 1.10397946084724, + "grad_norm": 3.0334465503692627, + "learning_rate": 3.30808729139923e-06, + "loss": 1.0113, + "step": 860 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 2.6588449478149414, + "learning_rate": 3.31193838254172e-06, + "loss": 1.0393, + "step": 861 + }, + { + "epoch": 1.1065468549422337, + "grad_norm": 5.114123344421387, + "learning_rate": 3.3157894736842107e-06, + "loss": 1.0633, + "step": 862 + }, + { + "epoch": 1.1078305519897305, + "grad_norm": 1.9863405227661133, + "learning_rate": 3.319640564826701e-06, + "loss": 1.0837, + "step": 863 + }, + { + "epoch": 1.1091142490372272, + "grad_norm": 3.408025026321411, + "learning_rate": 3.323491655969191e-06, + "loss": 1.0815, + "step": 864 + }, + { + "epoch": 1.110397946084724, + "grad_norm": 1.7207955121994019, + "learning_rate": 3.3273427471116818e-06, + "loss": 1.0368, + "step": 865 + }, + { + "epoch": 1.1116816431322207, + "grad_norm": 2.557915210723877, + "learning_rate": 3.331193838254172e-06, + "loss": 1.0726, + "step": 866 + }, + { + "epoch": 1.1129653401797175, + "grad_norm": 2.8229033946990967, + "learning_rate": 3.3350449293966622e-06, + "loss": 1.1003, + "step": 867 + }, + { + "epoch": 1.1142490372272145, + "grad_norm": 3.2910044193267822, + 
"learning_rate": 3.338896020539153e-06, + "loss": 1.125, + "step": 868 + }, + { + "epoch": 1.1155327342747112, + "grad_norm": 3.968010425567627, + "learning_rate": 3.3427471116816435e-06, + "loss": 1.1029, + "step": 869 + }, + { + "epoch": 1.116816431322208, + "grad_norm": 10.94180965423584, + "learning_rate": 3.3465982028241337e-06, + "loss": 1.0384, + "step": 870 + }, + { + "epoch": 1.1181001283697047, + "grad_norm": 2.2803256511688232, + "learning_rate": 3.350449293966624e-06, + "loss": 1.0495, + "step": 871 + }, + { + "epoch": 1.1193838254172015, + "grad_norm": 2.0503134727478027, + "learning_rate": 3.3543003851091146e-06, + "loss": 1.1299, + "step": 872 + }, + { + "epoch": 1.1206675224646983, + "grad_norm": 6.476144790649414, + "learning_rate": 3.358151476251605e-06, + "loss": 1.0466, + "step": 873 + }, + { + "epoch": 1.1219512195121952, + "grad_norm": 1.8384480476379395, + "learning_rate": 3.362002567394095e-06, + "loss": 1.1648, + "step": 874 + }, + { + "epoch": 1.123234916559692, + "grad_norm": 2.2125959396362305, + "learning_rate": 3.3658536585365857e-06, + "loss": 1.1089, + "step": 875 + }, + { + "epoch": 1.1245186136071887, + "grad_norm": 5.504364013671875, + "learning_rate": 3.369704749679076e-06, + "loss": 1.1617, + "step": 876 + }, + { + "epoch": 1.1258023106546855, + "grad_norm": 5.079841136932373, + "learning_rate": 3.373555840821566e-06, + "loss": 1.2052, + "step": 877 + }, + { + "epoch": 1.1270860077021823, + "grad_norm": 4.998704433441162, + "learning_rate": 3.377406931964057e-06, + "loss": 1.2869, + "step": 878 + }, + { + "epoch": 1.128369704749679, + "grad_norm": 5.775645732879639, + "learning_rate": 3.381258023106547e-06, + "loss": 1.3529, + "step": 879 + }, + { + "epoch": 1.1296534017971758, + "grad_norm": 3.3500945568084717, + "learning_rate": 3.3851091142490372e-06, + "loss": 0.9809, + "step": 880 + }, + { + "epoch": 1.1309370988446728, + "grad_norm": 1.6781904697418213, + "learning_rate": 3.3889602053915275e-06, + "loss": 0.9266, + "step": 
881 + }, + { + "epoch": 1.1322207958921695, + "grad_norm": 1.6215959787368774, + "learning_rate": 3.392811296534018e-06, + "loss": 0.9671, + "step": 882 + }, + { + "epoch": 1.1335044929396663, + "grad_norm": 2.2380666732788086, + "learning_rate": 3.3966623876765083e-06, + "loss": 0.9631, + "step": 883 + }, + { + "epoch": 1.134788189987163, + "grad_norm": 2.0975661277770996, + "learning_rate": 3.4005134788189986e-06, + "loss": 0.9465, + "step": 884 + }, + { + "epoch": 1.1360718870346598, + "grad_norm": 5.398200035095215, + "learning_rate": 3.404364569961489e-06, + "loss": 0.912, + "step": 885 + }, + { + "epoch": 1.1373555840821565, + "grad_norm": 2.4923088550567627, + "learning_rate": 3.4082156611039794e-06, + "loss": 0.948, + "step": 886 + }, + { + "epoch": 1.1386392811296533, + "grad_norm": 2.1492464542388916, + "learning_rate": 3.4120667522464697e-06, + "loss": 0.9215, + "step": 887 + }, + { + "epoch": 1.1399229781771503, + "grad_norm": 3.6015772819519043, + "learning_rate": 3.4159178433889603e-06, + "loss": 0.9067, + "step": 888 + }, + { + "epoch": 1.141206675224647, + "grad_norm": 1.6662254333496094, + "learning_rate": 3.4197689345314505e-06, + "loss": 0.9367, + "step": 889 + }, + { + "epoch": 1.1424903722721438, + "grad_norm": 1.89462411403656, + "learning_rate": 3.4236200256739407e-06, + "loss": 0.9392, + "step": 890 + }, + { + "epoch": 1.1437740693196405, + "grad_norm": 2.1868209838867188, + "learning_rate": 3.427471116816432e-06, + "loss": 1.0131, + "step": 891 + }, + { + "epoch": 1.1450577663671373, + "grad_norm": 2.067338466644287, + "learning_rate": 3.431322207958922e-06, + "loss": 0.9665, + "step": 892 + }, + { + "epoch": 1.146341463414634, + "grad_norm": 1.67741060256958, + "learning_rate": 3.4351732991014123e-06, + "loss": 0.9581, + "step": 893 + }, + { + "epoch": 1.147625160462131, + "grad_norm": 1.7599308490753174, + "learning_rate": 3.4390243902439025e-06, + "loss": 0.9273, + "step": 894 + }, + { + "epoch": 1.1489088575096278, + "grad_norm": 
1.6021267175674438, + "learning_rate": 3.442875481386393e-06, + "loss": 0.9763, + "step": 895 + }, + { + "epoch": 1.1501925545571245, + "grad_norm": 1.9802393913269043, + "learning_rate": 3.4467265725288834e-06, + "loss": 1.0011, + "step": 896 + }, + { + "epoch": 1.1514762516046213, + "grad_norm": 2.3911147117614746, + "learning_rate": 3.4505776636713736e-06, + "loss": 0.9627, + "step": 897 + }, + { + "epoch": 1.152759948652118, + "grad_norm": 2.9522151947021484, + "learning_rate": 3.4544287548138642e-06, + "loss": 0.9323, + "step": 898 + }, + { + "epoch": 1.1540436456996148, + "grad_norm": 2.0172269344329834, + "learning_rate": 3.4582798459563544e-06, + "loss": 1.0001, + "step": 899 + }, + { + "epoch": 1.1553273427471118, + "grad_norm": 2.095559597015381, + "learning_rate": 3.4621309370988447e-06, + "loss": 0.9898, + "step": 900 + }, + { + "epoch": 1.1566110397946086, + "grad_norm": 1.9143482446670532, + "learning_rate": 3.4659820282413353e-06, + "loss": 0.9322, + "step": 901 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 2.645573139190674, + "learning_rate": 3.4698331193838255e-06, + "loss": 0.9425, + "step": 902 + }, + { + "epoch": 1.159178433889602, + "grad_norm": 2.1685900688171387, + "learning_rate": 3.4736842105263158e-06, + "loss": 0.9933, + "step": 903 + }, + { + "epoch": 1.1604621309370988, + "grad_norm": 1.7008227109909058, + "learning_rate": 3.4775353016688064e-06, + "loss": 1.0493, + "step": 904 + }, + { + "epoch": 1.1617458279845956, + "grad_norm": 1.4760220050811768, + "learning_rate": 3.4813863928112966e-06, + "loss": 0.9471, + "step": 905 + }, + { + "epoch": 1.1630295250320923, + "grad_norm": 1.9490162134170532, + "learning_rate": 3.485237483953787e-06, + "loss": 0.9362, + "step": 906 + }, + { + "epoch": 1.1643132220795893, + "grad_norm": 1.5922714471817017, + "learning_rate": 3.489088575096277e-06, + "loss": 0.9569, + "step": 907 + }, + { + "epoch": 1.165596919127086, + "grad_norm": 1.8620375394821167, + "learning_rate": 
3.4929396662387677e-06, + "loss": 0.9394, + "step": 908 + }, + { + "epoch": 1.1668806161745828, + "grad_norm": 2.2914927005767822, + "learning_rate": 3.496790757381258e-06, + "loss": 0.9766, + "step": 909 + }, + { + "epoch": 1.1681643132220796, + "grad_norm": 3.862346649169922, + "learning_rate": 3.500641848523748e-06, + "loss": 1.0382, + "step": 910 + }, + { + "epoch": 1.1694480102695763, + "grad_norm": 2.4458365440368652, + "learning_rate": 3.504492939666239e-06, + "loss": 1.01, + "step": 911 + }, + { + "epoch": 1.170731707317073, + "grad_norm": 1.9129828214645386, + "learning_rate": 3.508344030808729e-06, + "loss": 1.0079, + "step": 912 + }, + { + "epoch": 1.1720154043645699, + "grad_norm": 2.1233022212982178, + "learning_rate": 3.5121951219512197e-06, + "loss": 0.9668, + "step": 913 + }, + { + "epoch": 1.1732991014120668, + "grad_norm": 2.24208402633667, + "learning_rate": 3.5160462130937103e-06, + "loss": 0.9362, + "step": 914 + }, + { + "epoch": 1.1745827984595636, + "grad_norm": 2.037706136703491, + "learning_rate": 3.5198973042362006e-06, + "loss": 1.0034, + "step": 915 + }, + { + "epoch": 1.1758664955070603, + "grad_norm": 3.474231719970703, + "learning_rate": 3.5237483953786908e-06, + "loss": 1.0482, + "step": 916 + }, + { + "epoch": 1.177150192554557, + "grad_norm": 2.3047573566436768, + "learning_rate": 3.527599486521181e-06, + "loss": 1.0708, + "step": 917 + }, + { + "epoch": 1.1784338896020539, + "grad_norm": 1.6205596923828125, + "learning_rate": 3.5314505776636717e-06, + "loss": 1.0008, + "step": 918 + }, + { + "epoch": 1.1797175866495506, + "grad_norm": 2.6128625869750977, + "learning_rate": 3.535301668806162e-06, + "loss": 1.072, + "step": 919 + }, + { + "epoch": 1.1810012836970474, + "grad_norm": 16.218250274658203, + "learning_rate": 3.539152759948652e-06, + "loss": 1.0569, + "step": 920 + }, + { + "epoch": 1.1822849807445444, + "grad_norm": 3.1179840564727783, + "learning_rate": 3.5430038510911428e-06, + "loss": 1.138, + "step": 921 + }, + { + 
"epoch": 1.1835686777920411, + "grad_norm": 5.061842441558838, + "learning_rate": 3.546854942233633e-06, + "loss": 1.0565, + "step": 922 + }, + { + "epoch": 1.1848523748395379, + "grad_norm": 2.575146198272705, + "learning_rate": 3.550706033376123e-06, + "loss": 1.0951, + "step": 923 + }, + { + "epoch": 1.1861360718870346, + "grad_norm": 3.3526737689971924, + "learning_rate": 3.554557124518614e-06, + "loss": 1.1534, + "step": 924 + }, + { + "epoch": 1.1874197689345314, + "grad_norm": 3.974761962890625, + "learning_rate": 3.558408215661104e-06, + "loss": 1.1368, + "step": 925 + }, + { + "epoch": 1.1887034659820284, + "grad_norm": 7.883673191070557, + "learning_rate": 3.5622593068035943e-06, + "loss": 1.119, + "step": 926 + }, + { + "epoch": 1.1899871630295251, + "grad_norm": 4.418021202087402, + "learning_rate": 3.566110397946085e-06, + "loss": 1.0923, + "step": 927 + }, + { + "epoch": 1.1912708600770219, + "grad_norm": 4.774886131286621, + "learning_rate": 3.569961489088575e-06, + "loss": 1.2298, + "step": 928 + }, + { + "epoch": 1.1925545571245186, + "grad_norm": 4.911004543304443, + "learning_rate": 3.5738125802310654e-06, + "loss": 1.4733, + "step": 929 + }, + { + "epoch": 1.1938382541720154, + "grad_norm": 3.469620943069458, + "learning_rate": 3.577663671373556e-06, + "loss": 1.0067, + "step": 930 + }, + { + "epoch": 1.1951219512195121, + "grad_norm": 2.382612705230713, + "learning_rate": 3.5815147625160463e-06, + "loss": 0.9456, + "step": 931 + }, + { + "epoch": 1.196405648267009, + "grad_norm": 1.6534301042556763, + "learning_rate": 3.5853658536585365e-06, + "loss": 0.8788, + "step": 932 + }, + { + "epoch": 1.1976893453145059, + "grad_norm": 1.453870415687561, + "learning_rate": 3.5892169448010267e-06, + "loss": 0.9003, + "step": 933 + }, + { + "epoch": 1.1989730423620026, + "grad_norm": 1.6371129751205444, + "learning_rate": 3.5930680359435174e-06, + "loss": 0.9093, + "step": 934 + }, + { + "epoch": 1.2002567394094994, + "grad_norm": 1.648165225982666, + 
"learning_rate": 3.5969191270860076e-06, + "loss": 0.8978, + "step": 935 + }, + { + "epoch": 1.2015404364569962, + "grad_norm": 1.674502968788147, + "learning_rate": 3.6007702182284982e-06, + "loss": 0.948, + "step": 936 + }, + { + "epoch": 1.202824133504493, + "grad_norm": 1.5485057830810547, + "learning_rate": 3.604621309370989e-06, + "loss": 0.9021, + "step": 937 + }, + { + "epoch": 1.2041078305519897, + "grad_norm": 1.5947580337524414, + "learning_rate": 3.608472400513479e-06, + "loss": 0.9214, + "step": 938 + }, + { + "epoch": 1.2053915275994864, + "grad_norm": 1.841200590133667, + "learning_rate": 3.6123234916559693e-06, + "loss": 0.9267, + "step": 939 + }, + { + "epoch": 1.2066752246469834, + "grad_norm": 1.7012795209884644, + "learning_rate": 3.61617458279846e-06, + "loss": 0.8769, + "step": 940 + }, + { + "epoch": 1.2079589216944802, + "grad_norm": 1.3526928424835205, + "learning_rate": 3.62002567394095e-06, + "loss": 0.8838, + "step": 941 + }, + { + "epoch": 1.209242618741977, + "grad_norm": 1.9045579433441162, + "learning_rate": 3.6238767650834404e-06, + "loss": 0.9171, + "step": 942 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 1.2428076267242432, + "learning_rate": 3.6277278562259306e-06, + "loss": 0.9146, + "step": 943 + }, + { + "epoch": 1.2118100128369704, + "grad_norm": 1.5226168632507324, + "learning_rate": 3.6315789473684213e-06, + "loss": 0.8989, + "step": 944 + }, + { + "epoch": 1.2130937098844672, + "grad_norm": 1.7717477083206177, + "learning_rate": 3.6354300385109115e-06, + "loss": 0.9805, + "step": 945 + }, + { + "epoch": 1.214377406931964, + "grad_norm": 1.6874810457229614, + "learning_rate": 3.6392811296534017e-06, + "loss": 0.8984, + "step": 946 + }, + { + "epoch": 1.215661103979461, + "grad_norm": 1.3287967443466187, + "learning_rate": 3.6431322207958924e-06, + "loss": 0.9363, + "step": 947 + }, + { + "epoch": 1.2169448010269577, + "grad_norm": 1.63985013961792, + "learning_rate": 3.6469833119383826e-06, + "loss": 0.9264, + 
"step": 948 + }, + { + "epoch": 1.2182284980744544, + "grad_norm": 2.1210286617279053, + "learning_rate": 3.650834403080873e-06, + "loss": 0.9915, + "step": 949 + }, + { + "epoch": 1.2195121951219512, + "grad_norm": 2.173978567123413, + "learning_rate": 3.6546854942233635e-06, + "loss": 0.9681, + "step": 950 + }, + { + "epoch": 1.220795892169448, + "grad_norm": 2.4399049282073975, + "learning_rate": 3.6585365853658537e-06, + "loss": 0.891, + "step": 951 + }, + { + "epoch": 1.2220795892169447, + "grad_norm": 1.7818540334701538, + "learning_rate": 3.662387676508344e-06, + "loss": 0.9377, + "step": 952 + }, + { + "epoch": 1.2233632862644417, + "grad_norm": 1.470245122909546, + "learning_rate": 3.6662387676508346e-06, + "loss": 0.9647, + "step": 953 + }, + { + "epoch": 1.2246469833119384, + "grad_norm": 1.5370548963546753, + "learning_rate": 3.6700898587933248e-06, + "loss": 0.8851, + "step": 954 + }, + { + "epoch": 1.2259306803594352, + "grad_norm": 7.957584381103516, + "learning_rate": 3.673940949935815e-06, + "loss": 0.9476, + "step": 955 + }, + { + "epoch": 1.227214377406932, + "grad_norm": 3.113050937652588, + "learning_rate": 3.6777920410783052e-06, + "loss": 0.9817, + "step": 956 + }, + { + "epoch": 1.2284980744544287, + "grad_norm": 1.84368097782135, + "learning_rate": 3.681643132220796e-06, + "loss": 0.9507, + "step": 957 + }, + { + "epoch": 1.2297817715019255, + "grad_norm": 2.0930912494659424, + "learning_rate": 3.6854942233632865e-06, + "loss": 0.9919, + "step": 958 + }, + { + "epoch": 1.2310654685494224, + "grad_norm": 1.896472692489624, + "learning_rate": 3.6893453145057767e-06, + "loss": 0.9555, + "step": 959 + }, + { + "epoch": 1.2323491655969192, + "grad_norm": 1.974371314048767, + "learning_rate": 3.6931964056482674e-06, + "loss": 0.9657, + "step": 960 + }, + { + "epoch": 1.233632862644416, + "grad_norm": 2.4248595237731934, + "learning_rate": 3.6970474967907576e-06, + "loss": 0.9898, + "step": 961 + }, + { + "epoch": 1.2349165596919127, + 
"grad_norm": 3.0870821475982666, + "learning_rate": 3.700898587933248e-06, + "loss": 0.9835, + "step": 962 + }, + { + "epoch": 1.2362002567394095, + "grad_norm": 3.844482183456421, + "learning_rate": 3.7047496790757385e-06, + "loss": 0.9887, + "step": 963 + }, + { + "epoch": 1.2374839537869062, + "grad_norm": 3.670504331588745, + "learning_rate": 3.7086007702182287e-06, + "loss": 0.9994, + "step": 964 + }, + { + "epoch": 1.238767650834403, + "grad_norm": 2.65596866607666, + "learning_rate": 3.712451861360719e-06, + "loss": 0.9773, + "step": 965 + }, + { + "epoch": 1.2400513478819, + "grad_norm": 5.298295021057129, + "learning_rate": 3.7163029525032096e-06, + "loss": 0.9873, + "step": 966 + }, + { + "epoch": 1.2413350449293967, + "grad_norm": 3.5446252822875977, + "learning_rate": 3.7201540436457e-06, + "loss": 0.9686, + "step": 967 + }, + { + "epoch": 1.2426187419768935, + "grad_norm": 2.489218235015869, + "learning_rate": 3.72400513478819e-06, + "loss": 1.0118, + "step": 968 + }, + { + "epoch": 1.2439024390243902, + "grad_norm": 1.8840742111206055, + "learning_rate": 3.7278562259306803e-06, + "loss": 1.0049, + "step": 969 + }, + { + "epoch": 1.245186136071887, + "grad_norm": 2.4363183975219727, + "learning_rate": 3.731707317073171e-06, + "loss": 1.0111, + "step": 970 + }, + { + "epoch": 1.2464698331193838, + "grad_norm": 2.3776328563690186, + "learning_rate": 3.735558408215661e-06, + "loss": 1.0289, + "step": 971 + }, + { + "epoch": 1.2477535301668805, + "grad_norm": 3.7378478050231934, + "learning_rate": 3.7394094993581513e-06, + "loss": 1.0452, + "step": 972 + }, + { + "epoch": 1.2490372272143775, + "grad_norm": 3.314236640930176, + "learning_rate": 3.743260590500642e-06, + "loss": 1.0596, + "step": 973 + }, + { + "epoch": 1.2503209242618742, + "grad_norm": 4.042888164520264, + "learning_rate": 3.7471116816431322e-06, + "loss": 1.0835, + "step": 974 + }, + { + "epoch": 1.251604621309371, + "grad_norm": 2.262540340423584, + "learning_rate": 3.750962772785623e-06, 
+ "loss": 1.1075, + "step": 975 + }, + { + "epoch": 1.2528883183568678, + "grad_norm": 2.9646053314208984, + "learning_rate": 3.754813863928113e-06, + "loss": 1.1563, + "step": 976 + }, + { + "epoch": 1.2541720154043645, + "grad_norm": 5.842599868774414, + "learning_rate": 3.7586649550706037e-06, + "loss": 1.1206, + "step": 977 + }, + { + "epoch": 1.2554557124518615, + "grad_norm": 4.317442893981934, + "learning_rate": 3.7625160462130935e-06, + "loss": 1.286, + "step": 978 + }, + { + "epoch": 1.256739409499358, + "grad_norm": 5.816264629364014, + "learning_rate": 3.7663671373555846e-06, + "loss": 1.4089, + "step": 979 + }, + { + "epoch": 1.258023106546855, + "grad_norm": 2.6546268463134766, + "learning_rate": 3.7702182284980744e-06, + "loss": 0.9753, + "step": 980 + }, + { + "epoch": 1.2593068035943518, + "grad_norm": 1.4843813180923462, + "learning_rate": 3.774069319640565e-06, + "loss": 0.9011, + "step": 981 + }, + { + "epoch": 1.2605905006418485, + "grad_norm": 1.1561957597732544, + "learning_rate": 3.777920410783055e-06, + "loss": 0.8779, + "step": 982 + }, + { + "epoch": 1.2618741976893453, + "grad_norm": 1.5607683658599854, + "learning_rate": 3.781771501925546e-06, + "loss": 0.8694, + "step": 983 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.7141096591949463, + "learning_rate": 3.7856225930680357e-06, + "loss": 0.8971, + "step": 984 + }, + { + "epoch": 1.264441591784339, + "grad_norm": 1.4192290306091309, + "learning_rate": 3.7894736842105264e-06, + "loss": 0.8924, + "step": 985 + }, + { + "epoch": 1.2657252888318355, + "grad_norm": 2.1720218658447266, + "learning_rate": 3.7933247753530166e-06, + "loss": 0.8834, + "step": 986 + }, + { + "epoch": 1.2670089858793325, + "grad_norm": 2.2424187660217285, + "learning_rate": 3.7971758664955072e-06, + "loss": 0.883, + "step": 987 + }, + { + "epoch": 1.2682926829268293, + "grad_norm": 1.3853152990341187, + "learning_rate": 3.801026957637997e-06, + "loss": 0.9298, + "step": 988 + }, + { + "epoch": 
1.269576379974326, + "grad_norm": 1.785312294960022, + "learning_rate": 3.804878048780488e-06, + "loss": 0.8919, + "step": 989 + }, + { + "epoch": 1.2708600770218228, + "grad_norm": 2.265261173248291, + "learning_rate": 3.808729139922978e-06, + "loss": 0.8978, + "step": 990 + }, + { + "epoch": 1.2721437740693196, + "grad_norm": 1.6713371276855469, + "learning_rate": 3.8125802310654686e-06, + "loss": 0.9038, + "step": 991 + }, + { + "epoch": 1.2734274711168165, + "grad_norm": 1.4502674341201782, + "learning_rate": 3.81643132220796e-06, + "loss": 0.9189, + "step": 992 + }, + { + "epoch": 1.2747111681643133, + "grad_norm": 1.8913146257400513, + "learning_rate": 3.820282413350449e-06, + "loss": 0.9472, + "step": 993 + }, + { + "epoch": 1.27599486521181, + "grad_norm": 2.120197057723999, + "learning_rate": 3.82413350449294e-06, + "loss": 0.8562, + "step": 994 + }, + { + "epoch": 1.2772785622593068, + "grad_norm": 1.8151116371154785, + "learning_rate": 3.82798459563543e-06, + "loss": 0.9113, + "step": 995 + }, + { + "epoch": 1.2785622593068036, + "grad_norm": 2.0674889087677, + "learning_rate": 3.8318356867779205e-06, + "loss": 0.8487, + "step": 996 + }, + { + "epoch": 1.2798459563543003, + "grad_norm": 1.5543787479400635, + "learning_rate": 3.835686777920411e-06, + "loss": 0.8426, + "step": 997 + }, + { + "epoch": 1.281129653401797, + "grad_norm": 1.7336844205856323, + "learning_rate": 3.839537869062902e-06, + "loss": 0.9298, + "step": 998 + }, + { + "epoch": 1.282413350449294, + "grad_norm": 1.5902352333068848, + "learning_rate": 3.843388960205391e-06, + "loss": 0.912, + "step": 999 + }, + { + "epoch": 1.2836970474967908, + "grad_norm": 1.646222710609436, + "learning_rate": 3.847240051347882e-06, + "loss": 0.9344, + "step": 1000 + }, + { + "epoch": 1.2836970474967908, + "eval_cer": 0.34822302721754456, + "eval_loss": 0.8890576958656311, + "eval_runtime": 13.7101, + "eval_samples_per_second": 71.699, + "eval_steps_per_second": 0.511, + "eval_wer": 0.6860811740426508, + 
"step": 1000 + }, + { + "epoch": 1.2849807445442876, + "grad_norm": 2.2626960277557373, + "learning_rate": 3.8510911424903725e-06, + "loss": 1.0264, + "step": 1001 + }, + { + "epoch": 1.2862644415917843, + "grad_norm": 17.14606285095215, + "learning_rate": 3.854942233632863e-06, + "loss": 0.9438, + "step": 1002 + }, + { + "epoch": 1.287548138639281, + "grad_norm": 2.699713945388794, + "learning_rate": 3.858793324775353e-06, + "loss": 0.9107, + "step": 1003 + }, + { + "epoch": 1.2888318356867778, + "grad_norm": 6.599685192108154, + "learning_rate": 3.862644415917844e-06, + "loss": 0.9165, + "step": 1004 + }, + { + "epoch": 1.2901155327342746, + "grad_norm": 2.591066837310791, + "learning_rate": 3.866495507060333e-06, + "loss": 0.9773, + "step": 1005 + }, + { + "epoch": 1.2913992297817716, + "grad_norm": 1.8000675439834595, + "learning_rate": 3.8703465982028244e-06, + "loss": 0.9067, + "step": 1006 + }, + { + "epoch": 1.2926829268292683, + "grad_norm": 1.660179615020752, + "learning_rate": 3.874197689345315e-06, + "loss": 0.8944, + "step": 1007 + }, + { + "epoch": 1.293966623876765, + "grad_norm": 1.9559460878372192, + "learning_rate": 3.878048780487805e-06, + "loss": 0.9345, + "step": 1008 + }, + { + "epoch": 1.2952503209242618, + "grad_norm": 1.9632720947265625, + "learning_rate": 3.881899871630295e-06, + "loss": 0.9048, + "step": 1009 + }, + { + "epoch": 1.2965340179717586, + "grad_norm": 3.5458920001983643, + "learning_rate": 3.885750962772786e-06, + "loss": 0.9318, + "step": 1010 + }, + { + "epoch": 1.2978177150192556, + "grad_norm": 2.4003000259399414, + "learning_rate": 3.8896020539152756e-06, + "loss": 0.9593, + "step": 1011 + }, + { + "epoch": 1.2991014120667521, + "grad_norm": 2.714249610900879, + "learning_rate": 3.893453145057767e-06, + "loss": 0.9787, + "step": 1012 + }, + { + "epoch": 1.300385109114249, + "grad_norm": 1.7699693441390991, + "learning_rate": 3.897304236200257e-06, + "loss": 0.9652, + "step": 1013 + }, + { + "epoch": 1.3016688061617459, + 
"grad_norm": 3.1569266319274902, + "learning_rate": 3.901155327342747e-06, + "loss": 0.9531, + "step": 1014 + }, + { + "epoch": 1.3029525032092426, + "grad_norm": 2.4255905151367188, + "learning_rate": 3.905006418485238e-06, + "loss": 0.953, + "step": 1015 + }, + { + "epoch": 1.3042362002567394, + "grad_norm": 3.0518829822540283, + "learning_rate": 3.9088575096277275e-06, + "loss": 0.9908, + "step": 1016 + }, + { + "epoch": 1.3055198973042361, + "grad_norm": 3.5554232597351074, + "learning_rate": 3.912708600770219e-06, + "loss": 1.065, + "step": 1017 + }, + { + "epoch": 1.306803594351733, + "grad_norm": 5.235734939575195, + "learning_rate": 3.916559691912709e-06, + "loss": 0.9912, + "step": 1018 + }, + { + "epoch": 1.3080872913992299, + "grad_norm": 2.0717947483062744, + "learning_rate": 3.920410783055199e-06, + "loss": 0.9535, + "step": 1019 + }, + { + "epoch": 1.3093709884467266, + "grad_norm": 2.6200661659240723, + "learning_rate": 3.924261874197689e-06, + "loss": 0.9958, + "step": 1020 + }, + { + "epoch": 1.3106546854942234, + "grad_norm": 2.3386669158935547, + "learning_rate": 3.92811296534018e-06, + "loss": 0.9996, + "step": 1021 + }, + { + "epoch": 1.3119383825417201, + "grad_norm": 5.348982810974121, + "learning_rate": 3.93196405648267e-06, + "loss": 0.9285, + "step": 1022 + }, + { + "epoch": 1.3132220795892169, + "grad_norm": 2.9138526916503906, + "learning_rate": 3.935815147625161e-06, + "loss": 1.039, + "step": 1023 + }, + { + "epoch": 1.3145057766367136, + "grad_norm": 2.8626205921173096, + "learning_rate": 3.939666238767651e-06, + "loss": 1.0015, + "step": 1024 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 3.1565914154052734, + "learning_rate": 3.943517329910141e-06, + "loss": 1.043, + "step": 1025 + }, + { + "epoch": 1.3170731707317074, + "grad_norm": 2.526036024093628, + "learning_rate": 3.9473684210526315e-06, + "loss": 1.1275, + "step": 1026 + }, + { + "epoch": 1.3183568677792041, + "grad_norm": 3.7077279090881348, + "learning_rate": 
3.9512195121951225e-06, + "loss": 1.1129, + "step": 1027 + }, + { + "epoch": 1.319640564826701, + "grad_norm": 3.8750245571136475, + "learning_rate": 3.955070603337612e-06, + "loss": 1.2073, + "step": 1028 + }, + { + "epoch": 1.3209242618741976, + "grad_norm": 3.790834665298462, + "learning_rate": 3.958921694480103e-06, + "loss": 1.2928, + "step": 1029 + }, + { + "epoch": 1.3222079589216944, + "grad_norm": 3.1766088008880615, + "learning_rate": 3.962772785622593e-06, + "loss": 0.956, + "step": 1030 + }, + { + "epoch": 1.3234916559691912, + "grad_norm": 1.7448266744613647, + "learning_rate": 3.966623876765083e-06, + "loss": 0.8765, + "step": 1031 + }, + { + "epoch": 1.3247753530166881, + "grad_norm": 1.4577847719192505, + "learning_rate": 3.970474967907574e-06, + "loss": 0.8566, + "step": 1032 + }, + { + "epoch": 1.326059050064185, + "grad_norm": 2.9100565910339355, + "learning_rate": 3.974326059050065e-06, + "loss": 0.9164, + "step": 1033 + }, + { + "epoch": 1.3273427471116817, + "grad_norm": 1.978783130645752, + "learning_rate": 3.978177150192554e-06, + "loss": 0.8752, + "step": 1034 + }, + { + "epoch": 1.3286264441591784, + "grad_norm": 1.6881825923919678, + "learning_rate": 3.982028241335045e-06, + "loss": 0.897, + "step": 1035 + }, + { + "epoch": 1.3299101412066752, + "grad_norm": 4.072053909301758, + "learning_rate": 3.985879332477535e-06, + "loss": 0.863, + "step": 1036 + }, + { + "epoch": 1.3311938382541721, + "grad_norm": 2.0853073596954346, + "learning_rate": 3.989730423620026e-06, + "loss": 0.8452, + "step": 1037 + }, + { + "epoch": 1.3324775353016687, + "grad_norm": 1.8696050643920898, + "learning_rate": 3.993581514762517e-06, + "loss": 0.9144, + "step": 1038 + }, + { + "epoch": 1.3337612323491657, + "grad_norm": 1.4932132959365845, + "learning_rate": 3.997432605905006e-06, + "loss": 0.8922, + "step": 1039 + }, + { + "epoch": 1.3350449293966624, + "grad_norm": 2.3064846992492676, + "learning_rate": 4.001283697047497e-06, + "loss": 0.8596, + "step": 1040 
+ }, + { + "epoch": 1.3363286264441592, + "grad_norm": 2.08608341217041, + "learning_rate": 4.005134788189987e-06, + "loss": 0.8368, + "step": 1041 + }, + { + "epoch": 1.337612323491656, + "grad_norm": 2.0808510780334473, + "learning_rate": 4.0089858793324776e-06, + "loss": 0.8713, + "step": 1042 + }, + { + "epoch": 1.3388960205391527, + "grad_norm": 2.609572649002075, + "learning_rate": 4.012836970474968e-06, + "loss": 0.8918, + "step": 1043 + }, + { + "epoch": 1.3401797175866497, + "grad_norm": 8.042173385620117, + "learning_rate": 4.016688061617459e-06, + "loss": 0.8937, + "step": 1044 + }, + { + "epoch": 1.3414634146341464, + "grad_norm": 2.2746338844299316, + "learning_rate": 4.020539152759948e-06, + "loss": 0.865, + "step": 1045 + }, + { + "epoch": 1.3427471116816432, + "grad_norm": 2.3915915489196777, + "learning_rate": 4.024390243902439e-06, + "loss": 0.9067, + "step": 1046 + }, + { + "epoch": 1.34403080872914, + "grad_norm": 3.3308656215667725, + "learning_rate": 4.0282413350449295e-06, + "loss": 0.9318, + "step": 1047 + }, + { + "epoch": 1.3453145057766367, + "grad_norm": 1.8745924234390259, + "learning_rate": 4.03209242618742e-06, + "loss": 0.915, + "step": 1048 + }, + { + "epoch": 1.3465982028241335, + "grad_norm": 2.431544542312622, + "learning_rate": 4.03594351732991e-06, + "loss": 0.9216, + "step": 1049 + }, + { + "epoch": 1.3478818998716302, + "grad_norm": 1.7090404033660889, + "learning_rate": 4.039794608472401e-06, + "loss": 0.8891, + "step": 1050 + }, + { + "epoch": 1.3491655969191272, + "grad_norm": 2.6835219860076904, + "learning_rate": 4.0436456996148904e-06, + "loss": 0.8601, + "step": 1051 + }, + { + "epoch": 1.350449293966624, + "grad_norm": 1.878157615661621, + "learning_rate": 4.0474967907573815e-06, + "loss": 0.8513, + "step": 1052 + }, + { + "epoch": 1.3517329910141207, + "grad_norm": 2.249499559402466, + "learning_rate": 4.051347881899872e-06, + "loss": 0.8469, + "step": 1053 + }, + { + "epoch": 1.3530166880616175, + "grad_norm": 
2.173388719558716, + "learning_rate": 4.055198973042362e-06, + "loss": 0.8407, + "step": 1054 + }, + { + "epoch": 1.3543003851091142, + "grad_norm": 1.8022676706314087, + "learning_rate": 4.059050064184852e-06, + "loss": 0.8779, + "step": 1055 + }, + { + "epoch": 1.355584082156611, + "grad_norm": 1.7957738637924194, + "learning_rate": 4.062901155327343e-06, + "loss": 0.8917, + "step": 1056 + }, + { + "epoch": 1.3568677792041077, + "grad_norm": 1.9600023031234741, + "learning_rate": 4.066752246469833e-06, + "loss": 0.9109, + "step": 1057 + }, + { + "epoch": 1.3581514762516047, + "grad_norm": 1.6045281887054443, + "learning_rate": 4.070603337612324e-06, + "loss": 0.8645, + "step": 1058 + }, + { + "epoch": 1.3594351732991015, + "grad_norm": 2.019740581512451, + "learning_rate": 4.074454428754814e-06, + "loss": 0.9043, + "step": 1059 + }, + { + "epoch": 1.3607188703465982, + "grad_norm": 1.8842427730560303, + "learning_rate": 4.078305519897304e-06, + "loss": 0.9297, + "step": 1060 + }, + { + "epoch": 1.362002567394095, + "grad_norm": 2.0369672775268555, + "learning_rate": 4.082156611039795e-06, + "loss": 0.9587, + "step": 1061 + }, + { + "epoch": 1.3632862644415917, + "grad_norm": 2.0122289657592773, + "learning_rate": 4.0860077021822854e-06, + "loss": 0.925, + "step": 1062 + }, + { + "epoch": 1.3645699614890887, + "grad_norm": 1.9722161293029785, + "learning_rate": 4.089858793324776e-06, + "loss": 0.9835, + "step": 1063 + }, + { + "epoch": 1.3658536585365852, + "grad_norm": 2.0234317779541016, + "learning_rate": 4.093709884467266e-06, + "loss": 0.9043, + "step": 1064 + }, + { + "epoch": 1.3671373555840822, + "grad_norm": 2.76041316986084, + "learning_rate": 4.097560975609756e-06, + "loss": 0.9558, + "step": 1065 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.925584316253662, + "learning_rate": 4.101412066752246e-06, + "loss": 0.9254, + "step": 1066 + }, + { + "epoch": 1.3697047496790757, + "grad_norm": 1.8600817918777466, + "learning_rate": 
4.105263157894737e-06, + "loss": 0.9147, + "step": 1067 + }, + { + "epoch": 1.3709884467265725, + "grad_norm": 2.832670211791992, + "learning_rate": 4.109114249037227e-06, + "loss": 0.9437, + "step": 1068 + }, + { + "epoch": 1.3722721437740693, + "grad_norm": 2.093510389328003, + "learning_rate": 4.112965340179718e-06, + "loss": 1.019, + "step": 1069 + }, + { + "epoch": 1.3735558408215662, + "grad_norm": 3.0238966941833496, + "learning_rate": 4.116816431322208e-06, + "loss": 0.966, + "step": 1070 + }, + { + "epoch": 1.3748395378690628, + "grad_norm": 2.780505657196045, + "learning_rate": 4.120667522464698e-06, + "loss": 0.9768, + "step": 1071 + }, + { + "epoch": 1.3761232349165597, + "grad_norm": 1.9343726634979248, + "learning_rate": 4.1245186136071885e-06, + "loss": 1.0337, + "step": 1072 + }, + { + "epoch": 1.3774069319640565, + "grad_norm": 5.661340713500977, + "learning_rate": 4.12836970474968e-06, + "loss": 1.0307, + "step": 1073 + }, + { + "epoch": 1.3786906290115533, + "grad_norm": 4.2190070152282715, + "learning_rate": 4.132220795892169e-06, + "loss": 1.0217, + "step": 1074 + }, + { + "epoch": 1.37997432605905, + "grad_norm": 3.4702093601226807, + "learning_rate": 4.13607188703466e-06, + "loss": 1.0466, + "step": 1075 + }, + { + "epoch": 1.3812580231065468, + "grad_norm": 4.1294331550598145, + "learning_rate": 4.13992297817715e-06, + "loss": 1.0178, + "step": 1076 + }, + { + "epoch": 1.3825417201540438, + "grad_norm": 3.6319777965545654, + "learning_rate": 4.1437740693196405e-06, + "loss": 1.121, + "step": 1077 + }, + { + "epoch": 1.3838254172015405, + "grad_norm": 3.4962055683135986, + "learning_rate": 4.147625160462131e-06, + "loss": 1.1619, + "step": 1078 + }, + { + "epoch": 1.3851091142490373, + "grad_norm": 5.333896160125732, + "learning_rate": 4.151476251604622e-06, + "loss": 1.2993, + "step": 1079 + }, + { + "epoch": 1.386392811296534, + "grad_norm": 3.0166397094726562, + "learning_rate": 4.155327342747111e-06, + "loss": 0.8961, + "step": 1080 + }, 
+ { + "epoch": 1.3876765083440308, + "grad_norm": 2.055769205093384, + "learning_rate": 4.159178433889602e-06, + "loss": 0.839, + "step": 1081 + }, + { + "epoch": 1.3889602053915275, + "grad_norm": 1.3765147924423218, + "learning_rate": 4.163029525032093e-06, + "loss": 0.8528, + "step": 1082 + }, + { + "epoch": 1.3902439024390243, + "grad_norm": 2.2609667778015137, + "learning_rate": 4.166880616174583e-06, + "loss": 0.8758, + "step": 1083 + }, + { + "epoch": 1.3915275994865213, + "grad_norm": 1.745911955833435, + "learning_rate": 4.170731707317074e-06, + "loss": 0.8151, + "step": 1084 + }, + { + "epoch": 1.392811296534018, + "grad_norm": 1.470701813697815, + "learning_rate": 4.174582798459564e-06, + "loss": 0.8238, + "step": 1085 + }, + { + "epoch": 1.3940949935815148, + "grad_norm": 4.677919864654541, + "learning_rate": 4.178433889602054e-06, + "loss": 0.8781, + "step": 1086 + }, + { + "epoch": 1.3953786906290115, + "grad_norm": 1.9172484874725342, + "learning_rate": 4.182284980744544e-06, + "loss": 0.873, + "step": 1087 + }, + { + "epoch": 1.3966623876765083, + "grad_norm": 1.6542209386825562, + "learning_rate": 4.1861360718870355e-06, + "loss": 0.8785, + "step": 1088 + }, + { + "epoch": 1.397946084724005, + "grad_norm": 2.784259557723999, + "learning_rate": 4.189987163029525e-06, + "loss": 0.8705, + "step": 1089 + }, + { + "epoch": 1.3992297817715018, + "grad_norm": 2.6584033966064453, + "learning_rate": 4.193838254172016e-06, + "loss": 0.8422, + "step": 1090 + }, + { + "epoch": 1.4005134788189988, + "grad_norm": 1.7797914743423462, + "learning_rate": 4.197689345314505e-06, + "loss": 0.8272, + "step": 1091 + }, + { + "epoch": 1.4017971758664955, + "grad_norm": 1.838279128074646, + "learning_rate": 4.201540436456996e-06, + "loss": 0.8702, + "step": 1092 + }, + { + "epoch": 1.4030808729139923, + "grad_norm": 1.5037357807159424, + "learning_rate": 4.205391527599487e-06, + "loss": 0.8309, + "step": 1093 + }, + { + "epoch": 1.404364569961489, + "grad_norm": 
1.752588152885437, + "learning_rate": 4.209242618741977e-06, + "loss": 0.7988, + "step": 1094 + }, + { + "epoch": 1.4056482670089858, + "grad_norm": 1.3989521265029907, + "learning_rate": 4.213093709884467e-06, + "loss": 0.8773, + "step": 1095 + }, + { + "epoch": 1.4069319640564828, + "grad_norm": 1.7519266605377197, + "learning_rate": 4.216944801026958e-06, + "loss": 0.9064, + "step": 1096 + }, + { + "epoch": 1.4082156611039793, + "grad_norm": 2.0358083248138428, + "learning_rate": 4.2207958921694475e-06, + "loss": 0.8573, + "step": 1097 + }, + { + "epoch": 1.4094993581514763, + "grad_norm": 1.6560968160629272, + "learning_rate": 4.2246469833119386e-06, + "loss": 0.8443, + "step": 1098 + }, + { + "epoch": 1.410783055198973, + "grad_norm": 3.4469940662384033, + "learning_rate": 4.228498074454429e-06, + "loss": 0.8957, + "step": 1099 + }, + { + "epoch": 1.4120667522464698, + "grad_norm": 1.798743486404419, + "learning_rate": 4.232349165596919e-06, + "loss": 0.8512, + "step": 1100 + }, + { + "epoch": 1.4133504492939666, + "grad_norm": 2.195523977279663, + "learning_rate": 4.236200256739409e-06, + "loss": 0.8925, + "step": 1101 + }, + { + "epoch": 1.4146341463414633, + "grad_norm": 2.098417282104492, + "learning_rate": 4.2400513478819e-06, + "loss": 0.8848, + "step": 1102 + }, + { + "epoch": 1.4159178433889603, + "grad_norm": 1.8591166734695435, + "learning_rate": 4.24390243902439e-06, + "loss": 0.8856, + "step": 1103 + }, + { + "epoch": 1.417201540436457, + "grad_norm": 1.6930562257766724, + "learning_rate": 4.247753530166881e-06, + "loss": 0.8576, + "step": 1104 + }, + { + "epoch": 1.4184852374839538, + "grad_norm": 1.6047241687774658, + "learning_rate": 4.251604621309372e-06, + "loss": 0.9015, + "step": 1105 + }, + { + "epoch": 1.4197689345314506, + "grad_norm": 2.5010480880737305, + "learning_rate": 4.255455712451861e-06, + "loss": 0.8292, + "step": 1106 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 2.398621082305908, + "learning_rate": 
4.259306803594352e-06, + "loss": 0.8109, + "step": 1107 + }, + { + "epoch": 1.422336328626444, + "grad_norm": 1.7723544836044312, + "learning_rate": 4.2631578947368425e-06, + "loss": 0.8708, + "step": 1108 + }, + { + "epoch": 1.4236200256739409, + "grad_norm": 1.9489495754241943, + "learning_rate": 4.267008985879333e-06, + "loss": 0.9005, + "step": 1109 + }, + { + "epoch": 1.4249037227214378, + "grad_norm": 1.9350708723068237, + "learning_rate": 4.270860077021823e-06, + "loss": 0.9432, + "step": 1110 + }, + { + "epoch": 1.4261874197689346, + "grad_norm": 1.9795074462890625, + "learning_rate": 4.274711168164314e-06, + "loss": 0.8408, + "step": 1111 + }, + { + "epoch": 1.4274711168164314, + "grad_norm": 2.249767541885376, + "learning_rate": 4.278562259306803e-06, + "loss": 0.9187, + "step": 1112 + }, + { + "epoch": 1.428754813863928, + "grad_norm": 2.2618868350982666, + "learning_rate": 4.2824133504492944e-06, + "loss": 0.9152, + "step": 1113 + }, + { + "epoch": 1.4300385109114249, + "grad_norm": 3.5552420616149902, + "learning_rate": 4.286264441591785e-06, + "loss": 0.937, + "step": 1114 + }, + { + "epoch": 1.4313222079589216, + "grad_norm": 1.9924381971359253, + "learning_rate": 4.290115532734275e-06, + "loss": 0.9681, + "step": 1115 + }, + { + "epoch": 1.4326059050064184, + "grad_norm": 1.6732163429260254, + "learning_rate": 4.293966623876765e-06, + "loss": 0.9425, + "step": 1116 + }, + { + "epoch": 1.4338896020539154, + "grad_norm": 1.7781575918197632, + "learning_rate": 4.297817715019255e-06, + "loss": 0.9688, + "step": 1117 + }, + { + "epoch": 1.4351732991014121, + "grad_norm": 1.877545714378357, + "learning_rate": 4.3016688061617456e-06, + "loss": 0.9631, + "step": 1118 + }, + { + "epoch": 1.4364569961489089, + "grad_norm": 2.457772731781006, + "learning_rate": 4.305519897304237e-06, + "loss": 0.8779, + "step": 1119 + }, + { + "epoch": 1.4377406931964056, + "grad_norm": 3.306103467941284, + "learning_rate": 4.309370988446726e-06, + "loss": 0.9681, + "step": 
1120 + }, + { + "epoch": 1.4390243902439024, + "grad_norm": 2.351863145828247, + "learning_rate": 4.313222079589217e-06, + "loss": 0.9545, + "step": 1121 + }, + { + "epoch": 1.4403080872913994, + "grad_norm": 2.1245923042297363, + "learning_rate": 4.317073170731707e-06, + "loss": 0.9236, + "step": 1122 + }, + { + "epoch": 1.441591784338896, + "grad_norm": 2.0607879161834717, + "learning_rate": 4.3209242618741975e-06, + "loss": 1.0049, + "step": 1123 + }, + { + "epoch": 1.4428754813863929, + "grad_norm": 2.6257970333099365, + "learning_rate": 4.324775353016688e-06, + "loss": 1.0005, + "step": 1124 + }, + { + "epoch": 1.4441591784338896, + "grad_norm": 2.7483763694763184, + "learning_rate": 4.328626444159179e-06, + "loss": 1.0338, + "step": 1125 + }, + { + "epoch": 1.4454428754813864, + "grad_norm": 2.4914960861206055, + "learning_rate": 4.332477535301668e-06, + "loss": 1.0256, + "step": 1126 + }, + { + "epoch": 1.4467265725288831, + "grad_norm": 3.32716703414917, + "learning_rate": 4.336328626444159e-06, + "loss": 1.1411, + "step": 1127 + }, + { + "epoch": 1.44801026957638, + "grad_norm": 3.5923783779144287, + "learning_rate": 4.34017971758665e-06, + "loss": 1.078, + "step": 1128 + }, + { + "epoch": 1.4492939666238769, + "grad_norm": 4.5023193359375, + "learning_rate": 4.34403080872914e-06, + "loss": 1.4054, + "step": 1129 + }, + { + "epoch": 1.4505776636713734, + "grad_norm": 2.9784412384033203, + "learning_rate": 4.347881899871631e-06, + "loss": 0.8764, + "step": 1130 + }, + { + "epoch": 1.4518613607188704, + "grad_norm": 1.874756932258606, + "learning_rate": 4.351732991014121e-06, + "loss": 0.8499, + "step": 1131 + }, + { + "epoch": 1.4531450577663672, + "grad_norm": 1.4087258577346802, + "learning_rate": 4.355584082156611e-06, + "loss": 0.8234, + "step": 1132 + }, + { + "epoch": 1.454428754813864, + "grad_norm": 1.3553149700164795, + "learning_rate": 4.3594351732991015e-06, + "loss": 0.8429, + "step": 1133 + }, + { + "epoch": 1.4557124518613607, + "grad_norm": 
1.3703463077545166, + "learning_rate": 4.3632862644415925e-06, + "loss": 0.8635, + "step": 1134 + }, + { + "epoch": 1.4569961489088574, + "grad_norm": 2.4241058826446533, + "learning_rate": 4.367137355584082e-06, + "loss": 0.7998, + "step": 1135 + }, + { + "epoch": 1.4582798459563544, + "grad_norm": 2.503262996673584, + "learning_rate": 4.370988446726573e-06, + "loss": 0.8343, + "step": 1136 + }, + { + "epoch": 1.4595635430038512, + "grad_norm": 1.7744805812835693, + "learning_rate": 4.374839537869063e-06, + "loss": 0.8887, + "step": 1137 + }, + { + "epoch": 1.460847240051348, + "grad_norm": 1.4863355159759521, + "learning_rate": 4.378690629011553e-06, + "loss": 0.7723, + "step": 1138 + }, + { + "epoch": 1.4621309370988447, + "grad_norm": 1.5493770837783813, + "learning_rate": 4.382541720154044e-06, + "loss": 0.8443, + "step": 1139 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 3.4100677967071533, + "learning_rate": 4.386392811296535e-06, + "loss": 0.8266, + "step": 1140 + }, + { + "epoch": 1.4646983311938382, + "grad_norm": 1.9554704427719116, + "learning_rate": 4.390243902439024e-06, + "loss": 0.8753, + "step": 1141 + }, + { + "epoch": 1.465982028241335, + "grad_norm": 5.135269641876221, + "learning_rate": 4.394094993581515e-06, + "loss": 0.8226, + "step": 1142 + }, + { + "epoch": 1.467265725288832, + "grad_norm": 1.961816430091858, + "learning_rate": 4.3979460847240045e-06, + "loss": 0.8974, + "step": 1143 + }, + { + "epoch": 1.4685494223363287, + "grad_norm": 1.7965461015701294, + "learning_rate": 4.401797175866496e-06, + "loss": 0.9051, + "step": 1144 + }, + { + "epoch": 1.4698331193838254, + "grad_norm": 1.3681248426437378, + "learning_rate": 4.405648267008986e-06, + "loss": 0.8403, + "step": 1145 + }, + { + "epoch": 1.4711168164313222, + "grad_norm": 1.7375599145889282, + "learning_rate": 4.409499358151476e-06, + "loss": 0.8313, + "step": 1146 + }, + { + "epoch": 1.472400513478819, + "grad_norm": 2.1059093475341797, + "learning_rate": 
4.413350449293966e-06, + "loss": 0.8437, + "step": 1147 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.9037929773330688, + "learning_rate": 4.417201540436457e-06, + "loss": 0.8256, + "step": 1148 + }, + { + "epoch": 1.4749679075738125, + "grad_norm": 1.4835832118988037, + "learning_rate": 4.421052631578947e-06, + "loss": 0.8506, + "step": 1149 + }, + { + "epoch": 1.4762516046213094, + "grad_norm": 1.7527414560317993, + "learning_rate": 4.424903722721438e-06, + "loss": 0.8401, + "step": 1150 + }, + { + "epoch": 1.4775353016688062, + "grad_norm": 1.874907374382019, + "learning_rate": 4.428754813863929e-06, + "loss": 0.8653, + "step": 1151 + }, + { + "epoch": 1.478818998716303, + "grad_norm": 1.8505196571350098, + "learning_rate": 4.432605905006418e-06, + "loss": 0.8629, + "step": 1152 + }, + { + "epoch": 1.4801026957637997, + "grad_norm": 1.7728815078735352, + "learning_rate": 4.436456996148909e-06, + "loss": 0.8401, + "step": 1153 + }, + { + "epoch": 1.4813863928112965, + "grad_norm": 2.0461418628692627, + "learning_rate": 4.4403080872913995e-06, + "loss": 0.8742, + "step": 1154 + }, + { + "epoch": 1.4826700898587934, + "grad_norm": 1.7129602432250977, + "learning_rate": 4.44415917843389e-06, + "loss": 0.8892, + "step": 1155 + }, + { + "epoch": 1.48395378690629, + "grad_norm": 2.136181354522705, + "learning_rate": 4.44801026957638e-06, + "loss": 0.8646, + "step": 1156 + }, + { + "epoch": 1.485237483953787, + "grad_norm": 2.8285648822784424, + "learning_rate": 4.451861360718871e-06, + "loss": 0.8705, + "step": 1157 + }, + { + "epoch": 1.4865211810012837, + "grad_norm": 1.8856934309005737, + "learning_rate": 4.4557124518613604e-06, + "loss": 0.9374, + "step": 1158 + }, + { + "epoch": 1.4878048780487805, + "grad_norm": 1.7079908847808838, + "learning_rate": 4.4595635430038515e-06, + "loss": 0.9002, + "step": 1159 + }, + { + "epoch": 1.4890885750962772, + "grad_norm": 2.724578380584717, + "learning_rate": 4.463414634146342e-06, + "loss": 0.9415, + "step": 1160 
+ }, + { + "epoch": 1.490372272143774, + "grad_norm": 1.6274868249893188, + "learning_rate": 4.467265725288832e-06, + "loss": 0.8614, + "step": 1161 + }, + { + "epoch": 1.491655969191271, + "grad_norm": 2.325700283050537, + "learning_rate": 4.471116816431322e-06, + "loss": 0.8494, + "step": 1162 + }, + { + "epoch": 1.4929396662387677, + "grad_norm": 1.6611632108688354, + "learning_rate": 4.474967907573813e-06, + "loss": 1.0131, + "step": 1163 + }, + { + "epoch": 1.4942233632862645, + "grad_norm": 5.581430912017822, + "learning_rate": 4.478818998716303e-06, + "loss": 0.8809, + "step": 1164 + }, + { + "epoch": 1.4955070603337612, + "grad_norm": 2.07230806350708, + "learning_rate": 4.482670089858794e-06, + "loss": 0.9456, + "step": 1165 + }, + { + "epoch": 1.496790757381258, + "grad_norm": 3.276052474975586, + "learning_rate": 4.486521181001284e-06, + "loss": 0.9495, + "step": 1166 + }, + { + "epoch": 1.4980744544287548, + "grad_norm": 2.6287448406219482, + "learning_rate": 4.490372272143774e-06, + "loss": 0.9303, + "step": 1167 + }, + { + "epoch": 1.4993581514762515, + "grad_norm": 2.014803171157837, + "learning_rate": 4.494223363286264e-06, + "loss": 0.9069, + "step": 1168 + }, + { + "epoch": 1.5006418485237485, + "grad_norm": 2.4965524673461914, + "learning_rate": 4.498074454428755e-06, + "loss": 0.9228, + "step": 1169 + }, + { + "epoch": 1.501925545571245, + "grad_norm": 2.1323587894439697, + "learning_rate": 4.501925545571245e-06, + "loss": 0.9553, + "step": 1170 + }, + { + "epoch": 1.503209242618742, + "grad_norm": 4.508212566375732, + "learning_rate": 4.505776636713736e-06, + "loss": 0.983, + "step": 1171 + }, + { + "epoch": 1.5044929396662388, + "grad_norm": 2.191310167312622, + "learning_rate": 4.509627727856226e-06, + "loss": 1.0281, + "step": 1172 + }, + { + "epoch": 1.5057766367137355, + "grad_norm": 2.9856228828430176, + "learning_rate": 4.513478818998716e-06, + "loss": 0.9663, + "step": 1173 + }, + { + "epoch": 1.5070603337612325, + "grad_norm": 
2.7094099521636963, + "learning_rate": 4.517329910141207e-06, + "loss": 0.9947, + "step": 1174 + }, + { + "epoch": 1.508344030808729, + "grad_norm": 2.6477952003479004, + "learning_rate": 4.521181001283697e-06, + "loss": 1.0231, + "step": 1175 + }, + { + "epoch": 1.509627727856226, + "grad_norm": 4.041821002960205, + "learning_rate": 4.525032092426188e-06, + "loss": 1.0625, + "step": 1176 + }, + { + "epoch": 1.5109114249037228, + "grad_norm": 2.93692684173584, + "learning_rate": 4.528883183568678e-06, + "loss": 1.0889, + "step": 1177 + }, + { + "epoch": 1.5121951219512195, + "grad_norm": 5.7777509689331055, + "learning_rate": 4.532734274711168e-06, + "loss": 1.2442, + "step": 1178 + }, + { + "epoch": 1.5134788189987163, + "grad_norm": 5.9870781898498535, + "learning_rate": 4.5365853658536585e-06, + "loss": 1.4003, + "step": 1179 + }, + { + "epoch": 1.514762516046213, + "grad_norm": 2.531604051589966, + "learning_rate": 4.54043645699615e-06, + "loss": 0.8616, + "step": 1180 + }, + { + "epoch": 1.51604621309371, + "grad_norm": 1.76259446144104, + "learning_rate": 4.544287548138639e-06, + "loss": 0.853, + "step": 1181 + }, + { + "epoch": 1.5173299101412066, + "grad_norm": 1.5425573587417603, + "learning_rate": 4.54813863928113e-06, + "loss": 0.861, + "step": 1182 + }, + { + "epoch": 1.5186136071887035, + "grad_norm": 1.719773769378662, + "learning_rate": 4.55198973042362e-06, + "loss": 0.8365, + "step": 1183 + }, + { + "epoch": 1.5198973042362003, + "grad_norm": 1.5106749534606934, + "learning_rate": 4.5558408215661105e-06, + "loss": 0.8682, + "step": 1184 + }, + { + "epoch": 1.521181001283697, + "grad_norm": 1.9121061563491821, + "learning_rate": 4.559691912708601e-06, + "loss": 0.8095, + "step": 1185 + }, + { + "epoch": 1.5224646983311938, + "grad_norm": 1.6359730958938599, + "learning_rate": 4.563543003851092e-06, + "loss": 0.8203, + "step": 1186 + }, + { + "epoch": 1.5237483953786906, + "grad_norm": 1.7229688167572021, + "learning_rate": 4.567394094993581e-06, + 
"loss": 0.835, + "step": 1187 + }, + { + "epoch": 1.5250320924261875, + "grad_norm": 1.5083376169204712, + "learning_rate": 4.571245186136072e-06, + "loss": 0.8034, + "step": 1188 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 1.3764948844909668, + "learning_rate": 4.5750962772785624e-06, + "loss": 0.8244, + "step": 1189 + }, + { + "epoch": 1.527599486521181, + "grad_norm": 1.761120080947876, + "learning_rate": 4.578947368421053e-06, + "loss": 0.8049, + "step": 1190 + }, + { + "epoch": 1.5288831835686778, + "grad_norm": 1.9975556135177612, + "learning_rate": 4.582798459563543e-06, + "loss": 0.8797, + "step": 1191 + }, + { + "epoch": 1.5301668806161746, + "grad_norm": 1.5109292268753052, + "learning_rate": 4.586649550706034e-06, + "loss": 0.814, + "step": 1192 + }, + { + "epoch": 1.5314505776636713, + "grad_norm": 2.025286912918091, + "learning_rate": 4.590500641848523e-06, + "loss": 0.8097, + "step": 1193 + }, + { + "epoch": 1.532734274711168, + "grad_norm": 1.5455119609832764, + "learning_rate": 4.594351732991014e-06, + "loss": 0.8335, + "step": 1194 + }, + { + "epoch": 1.534017971758665, + "grad_norm": 3.459369421005249, + "learning_rate": 4.598202824133505e-06, + "loss": 0.8555, + "step": 1195 + }, + { + "epoch": 1.5353016688061616, + "grad_norm": 1.709786057472229, + "learning_rate": 4.602053915275995e-06, + "loss": 0.792, + "step": 1196 + }, + { + "epoch": 1.5365853658536586, + "grad_norm": 1.8654648065567017, + "learning_rate": 4.605905006418486e-06, + "loss": 0.7669, + "step": 1197 + }, + { + "epoch": 1.5378690629011553, + "grad_norm": 1.7607251405715942, + "learning_rate": 4.609756097560975e-06, + "loss": 0.8368, + "step": 1198 + }, + { + "epoch": 1.539152759948652, + "grad_norm": 2.2149899005889893, + "learning_rate": 4.613607188703466e-06, + "loss": 0.8613, + "step": 1199 + }, + { + "epoch": 1.540436456996149, + "grad_norm": 1.4869202375411987, + "learning_rate": 4.617458279845957e-06, + "loss": 0.8181, + "step": 1200 + }, + { + "epoch": 
1.5417201540436456, + "grad_norm": 1.5974550247192383, + "learning_rate": 4.621309370988447e-06, + "loss": 0.8331, + "step": 1201 + }, + { + "epoch": 1.5430038510911426, + "grad_norm": 1.7709903717041016, + "learning_rate": 4.625160462130937e-06, + "loss": 0.8039, + "step": 1202 + }, + { + "epoch": 1.5442875481386393, + "grad_norm": 1.6900080442428589, + "learning_rate": 4.629011553273428e-06, + "loss": 0.8118, + "step": 1203 + }, + { + "epoch": 1.545571245186136, + "grad_norm": 2.0574450492858887, + "learning_rate": 4.6328626444159175e-06, + "loss": 0.8088, + "step": 1204 + }, + { + "epoch": 1.5468549422336328, + "grad_norm": 2.3783392906188965, + "learning_rate": 4.6367137355584086e-06, + "loss": 0.8809, + "step": 1205 + }, + { + "epoch": 1.5481386392811296, + "grad_norm": 3.130692958831787, + "learning_rate": 4.640564826700899e-06, + "loss": 0.8562, + "step": 1206 + }, + { + "epoch": 1.5494223363286266, + "grad_norm": 1.7931820154190063, + "learning_rate": 4.644415917843389e-06, + "loss": 0.8178, + "step": 1207 + }, + { + "epoch": 1.5507060333761231, + "grad_norm": 3.1430106163024902, + "learning_rate": 4.648267008985879e-06, + "loss": 0.8641, + "step": 1208 + }, + { + "epoch": 1.55198973042362, + "grad_norm": 2.000171184539795, + "learning_rate": 4.65211810012837e-06, + "loss": 0.8189, + "step": 1209 + }, + { + "epoch": 1.5532734274711169, + "grad_norm": 2.8287642002105713, + "learning_rate": 4.65596919127086e-06, + "loss": 0.8376, + "step": 1210 + }, + { + "epoch": 1.5545571245186136, + "grad_norm": 3.315699815750122, + "learning_rate": 4.659820282413351e-06, + "loss": 0.8305, + "step": 1211 + }, + { + "epoch": 1.5558408215661104, + "grad_norm": 1.4703121185302734, + "learning_rate": 4.663671373555841e-06, + "loss": 0.8713, + "step": 1212 + }, + { + "epoch": 1.5571245186136071, + "grad_norm": 2.1241953372955322, + "learning_rate": 4.667522464698331e-06, + "loss": 0.8978, + "step": 1213 + }, + { + "epoch": 1.558408215661104, + "grad_norm": 9.521953582763672, + 
"learning_rate": 4.671373555840821e-06, + "loss": 0.8491, + "step": 1214 + }, + { + "epoch": 1.5596919127086006, + "grad_norm": 4.049274444580078, + "learning_rate": 4.6752246469833125e-06, + "loss": 0.9205, + "step": 1215 + }, + { + "epoch": 1.5609756097560976, + "grad_norm": 2.698082447052002, + "learning_rate": 4.679075738125802e-06, + "loss": 0.8821, + "step": 1216 + }, + { + "epoch": 1.5622593068035944, + "grad_norm": 1.9899661540985107, + "learning_rate": 4.682926829268293e-06, + "loss": 0.9211, + "step": 1217 + }, + { + "epoch": 1.5635430038510911, + "grad_norm": 3.800088882446289, + "learning_rate": 4.686777920410783e-06, + "loss": 0.9205, + "step": 1218 + }, + { + "epoch": 1.5648267008985879, + "grad_norm": 4.687465667724609, + "learning_rate": 4.690629011553273e-06, + "loss": 0.9291, + "step": 1219 + }, + { + "epoch": 1.5661103979460846, + "grad_norm": 2.512397050857544, + "learning_rate": 4.6944801026957644e-06, + "loss": 0.9284, + "step": 1220 + }, + { + "epoch": 1.5673940949935816, + "grad_norm": 3.3319482803344727, + "learning_rate": 4.698331193838254e-06, + "loss": 1.0122, + "step": 1221 + }, + { + "epoch": 1.5686777920410782, + "grad_norm": 2.772609233856201, + "learning_rate": 4.702182284980745e-06, + "loss": 0.919, + "step": 1222 + }, + { + "epoch": 1.5699614890885751, + "grad_norm": 3.17008638381958, + "learning_rate": 4.706033376123235e-06, + "loss": 0.8912, + "step": 1223 + }, + { + "epoch": 1.571245186136072, + "grad_norm": 4.457746982574463, + "learning_rate": 4.709884467265725e-06, + "loss": 1.0122, + "step": 1224 + }, + { + "epoch": 1.5725288831835686, + "grad_norm": 3.027817487716675, + "learning_rate": 4.7137355584082156e-06, + "loss": 1.0037, + "step": 1225 + }, + { + "epoch": 1.5738125802310656, + "grad_norm": 3.006171464920044, + "learning_rate": 4.717586649550707e-06, + "loss": 0.9251, + "step": 1226 + }, + { + "epoch": 1.5750962772785622, + "grad_norm": 2.466762065887451, + "learning_rate": 4.721437740693196e-06, + "loss": 1.0066, + 
"step": 1227 + }, + { + "epoch": 1.5763799743260591, + "grad_norm": 6.422748565673828, + "learning_rate": 4.725288831835687e-06, + "loss": 1.1445, + "step": 1228 + }, + { + "epoch": 1.5776636713735557, + "grad_norm": 4.179295539855957, + "learning_rate": 4.729139922978177e-06, + "loss": 1.2903, + "step": 1229 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 2.419856071472168, + "learning_rate": 4.7329910141206675e-06, + "loss": 0.8065, + "step": 1230 + }, + { + "epoch": 1.5802310654685494, + "grad_norm": 1.4205104112625122, + "learning_rate": 4.736842105263158e-06, + "loss": 0.7817, + "step": 1231 + }, + { + "epoch": 1.5815147625160462, + "grad_norm": 1.7063740491867065, + "learning_rate": 4.740693196405649e-06, + "loss": 0.7995, + "step": 1232 + }, + { + "epoch": 1.5827984595635431, + "grad_norm": 1.5249727964401245, + "learning_rate": 4.744544287548138e-06, + "loss": 0.8021, + "step": 1233 + }, + { + "epoch": 1.5840821566110397, + "grad_norm": 1.4178117513656616, + "learning_rate": 4.748395378690629e-06, + "loss": 0.8412, + "step": 1234 + }, + { + "epoch": 1.5853658536585367, + "grad_norm": 1.3699384927749634, + "learning_rate": 4.7522464698331195e-06, + "loss": 0.826, + "step": 1235 + }, + { + "epoch": 1.5866495507060334, + "grad_norm": 3.414459466934204, + "learning_rate": 4.75609756097561e-06, + "loss": 0.8291, + "step": 1236 + }, + { + "epoch": 1.5879332477535302, + "grad_norm": 2.167909622192383, + "learning_rate": 4.7599486521181e-06, + "loss": 0.786, + "step": 1237 + }, + { + "epoch": 1.589216944801027, + "grad_norm": 1.590160608291626, + "learning_rate": 4.763799743260591e-06, + "loss": 0.8417, + "step": 1238 + }, + { + "epoch": 1.5905006418485237, + "grad_norm": 1.9643218517303467, + "learning_rate": 4.76765083440308e-06, + "loss": 0.795, + "step": 1239 + }, + { + "epoch": 1.5917843388960207, + "grad_norm": 1.7530301809310913, + "learning_rate": 4.7715019255455715e-06, + "loss": 0.8028, + "step": 1240 + }, + { + "epoch": 1.5930680359435172, + 
"grad_norm": 1.4616026878356934, + "learning_rate": 4.7753530166880625e-06, + "loss": 0.8248, + "step": 1241 + }, + { + "epoch": 1.5943517329910142, + "grad_norm": 2.1300599575042725, + "learning_rate": 4.779204107830552e-06, + "loss": 0.8325, + "step": 1242 + }, + { + "epoch": 1.595635430038511, + "grad_norm": 1.9434114694595337, + "learning_rate": 4.783055198973043e-06, + "loss": 0.8074, + "step": 1243 + }, + { + "epoch": 1.5969191270860077, + "grad_norm": 2.1004881858825684, + "learning_rate": 4.786906290115532e-06, + "loss": 0.8313, + "step": 1244 + }, + { + "epoch": 1.5982028241335045, + "grad_norm": 1.8329432010650635, + "learning_rate": 4.790757381258023e-06, + "loss": 0.821, + "step": 1245 + }, + { + "epoch": 1.5994865211810012, + "grad_norm": 1.8780674934387207, + "learning_rate": 4.794608472400514e-06, + "loss": 0.7811, + "step": 1246 + }, + { + "epoch": 1.6007702182284982, + "grad_norm": 1.611238718032837, + "learning_rate": 4.798459563543004e-06, + "loss": 0.7725, + "step": 1247 + }, + { + "epoch": 1.6020539152759947, + "grad_norm": 1.4771357774734497, + "learning_rate": 4.802310654685494e-06, + "loss": 0.8324, + "step": 1248 + }, + { + "epoch": 1.6033376123234917, + "grad_norm": 1.4288347959518433, + "learning_rate": 4.806161745827985e-06, + "loss": 0.8474, + "step": 1249 + }, + { + "epoch": 1.6046213093709885, + "grad_norm": 2.3017404079437256, + "learning_rate": 4.8100128369704745e-06, + "loss": 0.8212, + "step": 1250 + }, + { + "epoch": 1.6059050064184852, + "grad_norm": 1.7176234722137451, + "learning_rate": 4.813863928112966e-06, + "loss": 0.8046, + "step": 1251 + }, + { + "epoch": 1.6071887034659822, + "grad_norm": 2.3746635913848877, + "learning_rate": 4.817715019255456e-06, + "loss": 0.8181, + "step": 1252 + }, + { + "epoch": 1.6084724005134787, + "grad_norm": 1.3555657863616943, + "learning_rate": 4.821566110397946e-06, + "loss": 0.7772, + "step": 1253 + }, + { + "epoch": 1.6097560975609757, + "grad_norm": 1.619770884513855, + "learning_rate": 
4.825417201540436e-06, + "loss": 0.8399, + "step": 1254 + }, + { + "epoch": 1.6110397946084722, + "grad_norm": 1.5324815511703491, + "learning_rate": 4.829268292682927e-06, + "loss": 0.7858, + "step": 1255 + }, + { + "epoch": 1.6123234916559692, + "grad_norm": 1.863723635673523, + "learning_rate": 4.833119383825417e-06, + "loss": 0.8765, + "step": 1256 + }, + { + "epoch": 1.613607188703466, + "grad_norm": 1.9319173097610474, + "learning_rate": 4.836970474967908e-06, + "loss": 0.8506, + "step": 1257 + }, + { + "epoch": 1.6148908857509627, + "grad_norm": 7.748546600341797, + "learning_rate": 4.840821566110398e-06, + "loss": 0.8059, + "step": 1258 + }, + { + "epoch": 1.6161745827984597, + "grad_norm": 2.37045955657959, + "learning_rate": 4.844672657252888e-06, + "loss": 0.8412, + "step": 1259 + }, + { + "epoch": 1.6174582798459562, + "grad_norm": 2.052738904953003, + "learning_rate": 4.8485237483953785e-06, + "loss": 0.8375, + "step": 1260 + }, + { + "epoch": 1.6187419768934532, + "grad_norm": 2.098395347595215, + "learning_rate": 4.8523748395378695e-06, + "loss": 0.806, + "step": 1261 + }, + { + "epoch": 1.62002567394095, + "grad_norm": 2.8876397609710693, + "learning_rate": 4.85622593068036e-06, + "loss": 0.8451, + "step": 1262 + }, + { + "epoch": 1.6213093709884467, + "grad_norm": 2.062074661254883, + "learning_rate": 4.86007702182285e-06, + "loss": 0.8894, + "step": 1263 + }, + { + "epoch": 1.6225930680359435, + "grad_norm": 6.103091716766357, + "learning_rate": 4.863928112965341e-06, + "loss": 0.8508, + "step": 1264 + }, + { + "epoch": 1.6238767650834403, + "grad_norm": 1.87628972530365, + "learning_rate": 4.8677792041078304e-06, + "loss": 0.896, + "step": 1265 + }, + { + "epoch": 1.6251604621309372, + "grad_norm": 3.7247085571289062, + "learning_rate": 4.8716302952503215e-06, + "loss": 0.898, + "step": 1266 + }, + { + "epoch": 1.6264441591784338, + "grad_norm": 2.315091371536255, + "learning_rate": 4.875481386392812e-06, + "loss": 0.8986, + "step": 1267 + }, + { 
+ "epoch": 1.6277278562259307, + "grad_norm": 2.9335837364196777, + "learning_rate": 4.879332477535302e-06, + "loss": 0.9717, + "step": 1268 + }, + { + "epoch": 1.6290115532734275, + "grad_norm": 2.6642210483551025, + "learning_rate": 4.883183568677792e-06, + "loss": 0.9209, + "step": 1269 + }, + { + "epoch": 1.6302952503209243, + "grad_norm": 1.8723340034484863, + "learning_rate": 4.887034659820282e-06, + "loss": 0.8358, + "step": 1270 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 3.110351800918579, + "learning_rate": 4.890885750962773e-06, + "loss": 0.9385, + "step": 1271 + }, + { + "epoch": 1.6328626444159178, + "grad_norm": 3.7334847450256348, + "learning_rate": 4.894736842105264e-06, + "loss": 0.9369, + "step": 1272 + }, + { + "epoch": 1.6341463414634148, + "grad_norm": 3.2843239307403564, + "learning_rate": 4.898587933247753e-06, + "loss": 0.9139, + "step": 1273 + }, + { + "epoch": 1.6354300385109113, + "grad_norm": 2.4176018238067627, + "learning_rate": 4.902439024390244e-06, + "loss": 0.9147, + "step": 1274 + }, + { + "epoch": 1.6367137355584083, + "grad_norm": 2.0011696815490723, + "learning_rate": 4.906290115532734e-06, + "loss": 0.9361, + "step": 1275 + }, + { + "epoch": 1.637997432605905, + "grad_norm": 2.4837310314178467, + "learning_rate": 4.910141206675225e-06, + "loss": 1.0253, + "step": 1276 + }, + { + "epoch": 1.6392811296534018, + "grad_norm": 3.3052310943603516, + "learning_rate": 4.913992297817715e-06, + "loss": 1.0047, + "step": 1277 + }, + { + "epoch": 1.6405648267008985, + "grad_norm": 2.875164270401001, + "learning_rate": 4.917843388960206e-06, + "loss": 1.0986, + "step": 1278 + }, + { + "epoch": 1.6418485237483953, + "grad_norm": 4.226201057434082, + "learning_rate": 4.921694480102695e-06, + "loss": 1.3467, + "step": 1279 + }, + { + "epoch": 1.6431322207958923, + "grad_norm": 2.2717061042785645, + "learning_rate": 4.925545571245186e-06, + "loss": 0.8065, + "step": 1280 + }, + { + "epoch": 1.6444159178433888, + "grad_norm": 
1.5243593454360962, + "learning_rate": 4.9293966623876765e-06, + "loss": 0.7893, + "step": 1281 + }, + { + "epoch": 1.6456996148908858, + "grad_norm": 1.7355163097381592, + "learning_rate": 4.933247753530167e-06, + "loss": 0.7993, + "step": 1282 + }, + { + "epoch": 1.6469833119383825, + "grad_norm": 1.5010086297988892, + "learning_rate": 4.937098844672657e-06, + "loss": 0.8073, + "step": 1283 + }, + { + "epoch": 1.6482670089858793, + "grad_norm": 2.0433831214904785, + "learning_rate": 4.940949935815148e-06, + "loss": 0.776, + "step": 1284 + }, + { + "epoch": 1.6495507060333763, + "grad_norm": 1.3164143562316895, + "learning_rate": 4.944801026957638e-06, + "loss": 0.7454, + "step": 1285 + }, + { + "epoch": 1.6508344030808728, + "grad_norm": 1.6024892330169678, + "learning_rate": 4.9486521181001285e-06, + "loss": 0.8057, + "step": 1286 + }, + { + "epoch": 1.6521181001283698, + "grad_norm": 1.7750194072723389, + "learning_rate": 4.9525032092426196e-06, + "loss": 0.7894, + "step": 1287 + }, + { + "epoch": 1.6534017971758665, + "grad_norm": 2.61865496635437, + "learning_rate": 4.956354300385109e-06, + "loss": 0.7545, + "step": 1288 + }, + { + "epoch": 1.6546854942233633, + "grad_norm": 1.6156693696975708, + "learning_rate": 4.9602053915276e-06, + "loss": 0.7825, + "step": 1289 + }, + { + "epoch": 1.65596919127086, + "grad_norm": 1.6188161373138428, + "learning_rate": 4.96405648267009e-06, + "loss": 0.7764, + "step": 1290 + }, + { + "epoch": 1.6572528883183568, + "grad_norm": 1.8639492988586426, + "learning_rate": 4.9679075738125805e-06, + "loss": 0.831, + "step": 1291 + }, + { + "epoch": 1.6585365853658538, + "grad_norm": 1.731979250907898, + "learning_rate": 4.971758664955071e-06, + "loss": 0.8924, + "step": 1292 + }, + { + "epoch": 1.6598202824133503, + "grad_norm": 2.8837296962738037, + "learning_rate": 4.975609756097562e-06, + "loss": 0.8462, + "step": 1293 + }, + { + "epoch": 1.6611039794608473, + "grad_norm": 1.6158130168914795, + "learning_rate": 
4.979460847240051e-06, + "loss": 0.7827, + "step": 1294 + }, + { + "epoch": 1.662387676508344, + "grad_norm": 2.3465070724487305, + "learning_rate": 4.983311938382542e-06, + "loss": 0.8218, + "step": 1295 + }, + { + "epoch": 1.6636713735558408, + "grad_norm": 1.6884779930114746, + "learning_rate": 4.987163029525032e-06, + "loss": 0.8373, + "step": 1296 + }, + { + "epoch": 1.6649550706033376, + "grad_norm": 1.5519567728042603, + "learning_rate": 4.991014120667523e-06, + "loss": 0.7594, + "step": 1297 + }, + { + "epoch": 1.6662387676508343, + "grad_norm": 1.489770531654358, + "learning_rate": 4.994865211810013e-06, + "loss": 0.7897, + "step": 1298 + }, + { + "epoch": 1.6675224646983313, + "grad_norm": 1.759024977684021, + "learning_rate": 4.998716302952503e-06, + "loss": 0.818, + "step": 1299 + }, + { + "epoch": 1.6688061617458279, + "grad_norm": 2.3115427494049072, + "learning_rate": 5.002567394094993e-06, + "loss": 0.8369, + "step": 1300 + }, + { + "epoch": 1.6700898587933248, + "grad_norm": 1.5673967599868774, + "learning_rate": 5.006418485237484e-06, + "loss": 0.8537, + "step": 1301 + }, + { + "epoch": 1.6713735558408216, + "grad_norm": 1.6094388961791992, + "learning_rate": 5.010269576379974e-06, + "loss": 0.8349, + "step": 1302 + }, + { + "epoch": 1.6726572528883183, + "grad_norm": 2.1261141300201416, + "learning_rate": 5.014120667522465e-06, + "loss": 0.758, + "step": 1303 + }, + { + "epoch": 1.673940949935815, + "grad_norm": 2.203700542449951, + "learning_rate": 5.017971758664955e-06, + "loss": 0.851, + "step": 1304 + }, + { + "epoch": 1.6752246469833119, + "grad_norm": 2.153761386871338, + "learning_rate": 5.021822849807445e-06, + "loss": 0.8703, + "step": 1305 + }, + { + "epoch": 1.6765083440308088, + "grad_norm": 1.6710529327392578, + "learning_rate": 5.0256739409499355e-06, + "loss": 0.7669, + "step": 1306 + }, + { + "epoch": 1.6777920410783054, + "grad_norm": 1.9920768737792969, + "learning_rate": 5.029525032092427e-06, + "loss": 0.8066, + "step": 1307 + 
}, + { + "epoch": 1.6790757381258024, + "grad_norm": 1.909638524055481, + "learning_rate": 5.033376123234917e-06, + "loss": 0.81, + "step": 1308 + }, + { + "epoch": 1.680359435173299, + "grad_norm": 1.7178623676300049, + "learning_rate": 5.037227214377407e-06, + "loss": 0.8471, + "step": 1309 + }, + { + "epoch": 1.6816431322207959, + "grad_norm": 3.4491488933563232, + "learning_rate": 5.041078305519898e-06, + "loss": 0.7917, + "step": 1310 + }, + { + "epoch": 1.6829268292682928, + "grad_norm": 2.0493929386138916, + "learning_rate": 5.0449293966623875e-06, + "loss": 0.8141, + "step": 1311 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 2.2411441802978516, + "learning_rate": 5.0487804878048785e-06, + "loss": 0.8275, + "step": 1312 + }, + { + "epoch": 1.6854942233632864, + "grad_norm": 1.8534127473831177, + "learning_rate": 5.052631578947369e-06, + "loss": 0.8948, + "step": 1313 + }, + { + "epoch": 1.686777920410783, + "grad_norm": 1.6553359031677246, + "learning_rate": 5.056482670089859e-06, + "loss": 0.8554, + "step": 1314 + }, + { + "epoch": 1.6880616174582799, + "grad_norm": 2.580989360809326, + "learning_rate": 5.060333761232349e-06, + "loss": 0.8088, + "step": 1315 + }, + { + "epoch": 1.6893453145057766, + "grad_norm": 2.2903711795806885, + "learning_rate": 5.06418485237484e-06, + "loss": 0.8357, + "step": 1316 + }, + { + "epoch": 1.6906290115532734, + "grad_norm": 3.0633113384246826, + "learning_rate": 5.06803594351733e-06, + "loss": 0.8306, + "step": 1317 + }, + { + "epoch": 1.6919127086007704, + "grad_norm": 2.1019012928009033, + "learning_rate": 5.071887034659821e-06, + "loss": 0.9138, + "step": 1318 + }, + { + "epoch": 1.693196405648267, + "grad_norm": 2.112954616546631, + "learning_rate": 5.075738125802311e-06, + "loss": 0.8512, + "step": 1319 + }, + { + "epoch": 1.6944801026957639, + "grad_norm": 2.100004196166992, + "learning_rate": 5.079589216944801e-06, + "loss": 0.9274, + "step": 1320 + }, + { + "epoch": 1.6957637997432606, + "grad_norm": 
2.6734371185302734, + "learning_rate": 5.083440308087291e-06, + "loss": 1.0384, + "step": 1321 + }, + { + "epoch": 1.6970474967907574, + "grad_norm": 3.521921157836914, + "learning_rate": 5.087291399229782e-06, + "loss": 0.9442, + "step": 1322 + }, + { + "epoch": 1.6983311938382541, + "grad_norm": 2.504472255706787, + "learning_rate": 5.091142490372272e-06, + "loss": 0.9624, + "step": 1323 + }, + { + "epoch": 1.699614890885751, + "grad_norm": 2.659069299697876, + "learning_rate": 5.094993581514763e-06, + "loss": 0.9398, + "step": 1324 + }, + { + "epoch": 1.7008985879332479, + "grad_norm": 6.208556652069092, + "learning_rate": 5.098844672657252e-06, + "loss": 0.9512, + "step": 1325 + }, + { + "epoch": 1.7021822849807444, + "grad_norm": 9.597379684448242, + "learning_rate": 5.102695763799743e-06, + "loss": 1.0228, + "step": 1326 + }, + { + "epoch": 1.7034659820282414, + "grad_norm": 4.915440082550049, + "learning_rate": 5.106546854942234e-06, + "loss": 0.994, + "step": 1327 + }, + { + "epoch": 1.7047496790757382, + "grad_norm": 7.385984897613525, + "learning_rate": 5.110397946084724e-06, + "loss": 1.1125, + "step": 1328 + }, + { + "epoch": 1.706033376123235, + "grad_norm": 7.261928558349609, + "learning_rate": 5.114249037227214e-06, + "loss": 1.2161, + "step": 1329 + }, + { + "epoch": 1.7073170731707317, + "grad_norm": 1.9865444898605347, + "learning_rate": 5.118100128369705e-06, + "loss": 0.8051, + "step": 1330 + }, + { + "epoch": 1.7086007702182284, + "grad_norm": 1.245772361755371, + "learning_rate": 5.121951219512195e-06, + "loss": 0.7846, + "step": 1331 + }, + { + "epoch": 1.7098844672657254, + "grad_norm": 1.5057305097579956, + "learning_rate": 5.1258023106546856e-06, + "loss": 0.7897, + "step": 1332 + }, + { + "epoch": 1.711168164313222, + "grad_norm": 1.43209707736969, + "learning_rate": 5.129653401797177e-06, + "loss": 0.7622, + "step": 1333 + }, + { + "epoch": 1.712451861360719, + "grad_norm": 1.3988522291183472, + "learning_rate": 5.133504492939666e-06, + 
"loss": 0.7685, + "step": 1334 + }, + { + "epoch": 1.7137355584082157, + "grad_norm": 1.9765589237213135, + "learning_rate": 5.137355584082157e-06, + "loss": 0.7855, + "step": 1335 + }, + { + "epoch": 1.7150192554557124, + "grad_norm": 1.5465184450149536, + "learning_rate": 5.141206675224647e-06, + "loss": 0.78, + "step": 1336 + }, + { + "epoch": 1.7163029525032092, + "grad_norm": 4.030391216278076, + "learning_rate": 5.1450577663671375e-06, + "loss": 0.8258, + "step": 1337 + }, + { + "epoch": 1.717586649550706, + "grad_norm": 1.559979796409607, + "learning_rate": 5.148908857509628e-06, + "loss": 0.7624, + "step": 1338 + }, + { + "epoch": 1.718870346598203, + "grad_norm": 1.5050727128982544, + "learning_rate": 5.152759948652119e-06, + "loss": 0.797, + "step": 1339 + }, + { + "epoch": 1.7201540436456995, + "grad_norm": 1.831518530845642, + "learning_rate": 5.156611039794608e-06, + "loss": 0.7682, + "step": 1340 + }, + { + "epoch": 1.7214377406931964, + "grad_norm": 1.689405083656311, + "learning_rate": 5.160462130937099e-06, + "loss": 0.7544, + "step": 1341 + }, + { + "epoch": 1.7227214377406932, + "grad_norm": 1.2821015119552612, + "learning_rate": 5.1643132220795895e-06, + "loss": 0.7821, + "step": 1342 + }, + { + "epoch": 1.72400513478819, + "grad_norm": 2.881837844848633, + "learning_rate": 5.16816431322208e-06, + "loss": 0.8101, + "step": 1343 + }, + { + "epoch": 1.725288831835687, + "grad_norm": 1.6624459028244019, + "learning_rate": 5.17201540436457e-06, + "loss": 0.7309, + "step": 1344 + }, + { + "epoch": 1.7265725288831835, + "grad_norm": 1.5423120260238647, + "learning_rate": 5.175866495507061e-06, + "loss": 0.7658, + "step": 1345 + }, + { + "epoch": 1.7278562259306804, + "grad_norm": 1.5667173862457275, + "learning_rate": 5.17971758664955e-06, + "loss": 0.8178, + "step": 1346 + }, + { + "epoch": 1.7291399229781772, + "grad_norm": 1.609457015991211, + "learning_rate": 5.1835686777920414e-06, + "loss": 0.7941, + "step": 1347 + }, + { + "epoch": 
1.730423620025674, + "grad_norm": 1.6331952810287476, + "learning_rate": 5.187419768934531e-06, + "loss": 0.7839, + "step": 1348 + }, + { + "epoch": 1.7317073170731707, + "grad_norm": 1.5256834030151367, + "learning_rate": 5.191270860077022e-06, + "loss": 0.7823, + "step": 1349 + }, + { + "epoch": 1.7329910141206675, + "grad_norm": 1.6659702062606812, + "learning_rate": 5.195121951219512e-06, + "loss": 0.7709, + "step": 1350 + }, + { + "epoch": 1.7342747111681645, + "grad_norm": 1.8938156366348267, + "learning_rate": 5.198973042362002e-06, + "loss": 0.8013, + "step": 1351 + }, + { + "epoch": 1.735558408215661, + "grad_norm": 2.4259591102600098, + "learning_rate": 5.202824133504493e-06, + "loss": 0.7902, + "step": 1352 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 2.029754638671875, + "learning_rate": 5.206675224646984e-06, + "loss": 0.8268, + "step": 1353 + }, + { + "epoch": 1.7381258023106547, + "grad_norm": 1.7311152219772339, + "learning_rate": 5.210526315789474e-06, + "loss": 0.8201, + "step": 1354 + }, + { + "epoch": 1.7394094993581515, + "grad_norm": 3.684406280517578, + "learning_rate": 5.214377406931964e-06, + "loss": 0.8349, + "step": 1355 + }, + { + "epoch": 1.7406931964056482, + "grad_norm": 1.9023122787475586, + "learning_rate": 5.218228498074455e-06, + "loss": 0.812, + "step": 1356 + }, + { + "epoch": 1.741976893453145, + "grad_norm": 1.8334126472473145, + "learning_rate": 5.2220795892169445e-06, + "loss": 0.7618, + "step": 1357 + }, + { + "epoch": 1.743260590500642, + "grad_norm": 1.6258822679519653, + "learning_rate": 5.225930680359436e-06, + "loss": 0.7964, + "step": 1358 + }, + { + "epoch": 1.7445442875481385, + "grad_norm": 2.09722638130188, + "learning_rate": 5.229781771501926e-06, + "loss": 0.7929, + "step": 1359 + }, + { + "epoch": 1.7458279845956355, + "grad_norm": 1.690600872039795, + "learning_rate": 5.233632862644416e-06, + "loss": 0.8065, + "step": 1360 + }, + { + "epoch": 1.7471116816431322, + "grad_norm": 2.0227601528167725, + 
"learning_rate": 5.237483953786906e-06, + "loss": 0.8148, + "step": 1361 + }, + { + "epoch": 1.748395378690629, + "grad_norm": 1.7663267850875854, + "learning_rate": 5.241335044929397e-06, + "loss": 0.8129, + "step": 1362 + }, + { + "epoch": 1.7496790757381258, + "grad_norm": 2.2325069904327393, + "learning_rate": 5.245186136071887e-06, + "loss": 0.9014, + "step": 1363 + }, + { + "epoch": 1.7509627727856225, + "grad_norm": 1.7365281581878662, + "learning_rate": 5.249037227214378e-06, + "loss": 0.7906, + "step": 1364 + }, + { + "epoch": 1.7522464698331195, + "grad_norm": 2.1432666778564453, + "learning_rate": 5.252888318356868e-06, + "loss": 0.8647, + "step": 1365 + }, + { + "epoch": 1.753530166880616, + "grad_norm": 5.010144233703613, + "learning_rate": 5.256739409499358e-06, + "loss": 0.8925, + "step": 1366 + }, + { + "epoch": 1.754813863928113, + "grad_norm": 8.943716049194336, + "learning_rate": 5.2605905006418485e-06, + "loss": 0.8471, + "step": 1367 + }, + { + "epoch": 1.7560975609756098, + "grad_norm": 3.1290252208709717, + "learning_rate": 5.2644415917843395e-06, + "loss": 0.9119, + "step": 1368 + }, + { + "epoch": 1.7573812580231065, + "grad_norm": 3.165363311767578, + "learning_rate": 5.268292682926829e-06, + "loss": 0.9086, + "step": 1369 + }, + { + "epoch": 1.7586649550706035, + "grad_norm": 2.9145076274871826, + "learning_rate": 5.27214377406932e-06, + "loss": 0.8286, + "step": 1370 + }, + { + "epoch": 1.7599486521181, + "grad_norm": 2.661939859390259, + "learning_rate": 5.27599486521181e-06, + "loss": 0.9039, + "step": 1371 + }, + { + "epoch": 1.761232349165597, + "grad_norm": 2.7030370235443115, + "learning_rate": 5.2798459563543e-06, + "loss": 0.823, + "step": 1372 + }, + { + "epoch": 1.7625160462130935, + "grad_norm": 4.422377109527588, + "learning_rate": 5.283697047496791e-06, + "loss": 0.9371, + "step": 1373 + }, + { + "epoch": 1.7637997432605905, + "grad_norm": 2.266887664794922, + "learning_rate": 5.287548138639281e-06, + "loss": 1.0089, + 
"step": 1374 + }, + { + "epoch": 1.7650834403080873, + "grad_norm": 2.6590404510498047, + "learning_rate": 5.291399229781772e-06, + "loss": 0.9471, + "step": 1375 + }, + { + "epoch": 1.766367137355584, + "grad_norm": 4.557197093963623, + "learning_rate": 5.295250320924262e-06, + "loss": 0.9763, + "step": 1376 + }, + { + "epoch": 1.767650834403081, + "grad_norm": 3.53073787689209, + "learning_rate": 5.299101412066752e-06, + "loss": 0.9894, + "step": 1377 + }, + { + "epoch": 1.7689345314505776, + "grad_norm": 11.883139610290527, + "learning_rate": 5.302952503209243e-06, + "loss": 1.1033, + "step": 1378 + }, + { + "epoch": 1.7702182284980745, + "grad_norm": 7.190917491912842, + "learning_rate": 5.306803594351734e-06, + "loss": 1.2653, + "step": 1379 + }, + { + "epoch": 1.7715019255455713, + "grad_norm": 1.9887049198150635, + "learning_rate": 5.310654685494223e-06, + "loss": 0.8055, + "step": 1380 + }, + { + "epoch": 1.772785622593068, + "grad_norm": 2.5005829334259033, + "learning_rate": 5.314505776636714e-06, + "loss": 0.777, + "step": 1381 + }, + { + "epoch": 1.7740693196405648, + "grad_norm": 1.6502679586410522, + "learning_rate": 5.318356867779204e-06, + "loss": 0.7762, + "step": 1382 + }, + { + "epoch": 1.7753530166880616, + "grad_norm": 1.917419672012329, + "learning_rate": 5.3222079589216946e-06, + "loss": 0.7512, + "step": 1383 + }, + { + "epoch": 1.7766367137355585, + "grad_norm": 2.185181140899658, + "learning_rate": 5.326059050064185e-06, + "loss": 0.7853, + "step": 1384 + }, + { + "epoch": 1.777920410783055, + "grad_norm": 3.9230546951293945, + "learning_rate": 5.329910141206676e-06, + "loss": 0.8158, + "step": 1385 + }, + { + "epoch": 1.779204107830552, + "grad_norm": 2.3328912258148193, + "learning_rate": 5.333761232349165e-06, + "loss": 0.782, + "step": 1386 + }, + { + "epoch": 1.7804878048780488, + "grad_norm": 1.7279127836227417, + "learning_rate": 5.337612323491656e-06, + "loss": 0.7644, + "step": 1387 + }, + { + "epoch": 1.7817715019255456, + 
"grad_norm": 2.14528489112854, + "learning_rate": 5.3414634146341465e-06, + "loss": 0.7543, + "step": 1388 + }, + { + "epoch": 1.7830551989730423, + "grad_norm": 2.343752145767212, + "learning_rate": 5.345314505776637e-06, + "loss": 0.8337, + "step": 1389 + }, + { + "epoch": 1.784338896020539, + "grad_norm": 1.8980597257614136, + "learning_rate": 5.349165596919127e-06, + "loss": 0.7797, + "step": 1390 + }, + { + "epoch": 1.785622593068036, + "grad_norm": 1.531702995300293, + "learning_rate": 5.353016688061618e-06, + "loss": 0.8136, + "step": 1391 + }, + { + "epoch": 1.7869062901155326, + "grad_norm": 1.7557865381240845, + "learning_rate": 5.3568677792041074e-06, + "loss": 0.7553, + "step": 1392 + }, + { + "epoch": 1.7881899871630296, + "grad_norm": 1.5961699485778809, + "learning_rate": 5.3607188703465985e-06, + "loss": 0.7427, + "step": 1393 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 1.6741262674331665, + "learning_rate": 5.364569961489089e-06, + "loss": 0.7815, + "step": 1394 + }, + { + "epoch": 1.790757381258023, + "grad_norm": 2.1062843799591064, + "learning_rate": 5.368421052631579e-06, + "loss": 0.8153, + "step": 1395 + }, + { + "epoch": 1.79204107830552, + "grad_norm": 1.5005778074264526, + "learning_rate": 5.372272143774069e-06, + "loss": 0.7793, + "step": 1396 + }, + { + "epoch": 1.7933247753530166, + "grad_norm": 4.9874267578125, + "learning_rate": 5.37612323491656e-06, + "loss": 0.7754, + "step": 1397 + }, + { + "epoch": 1.7946084724005136, + "grad_norm": 5.748123645782471, + "learning_rate": 5.3799743260590505e-06, + "loss": 0.8052, + "step": 1398 + }, + { + "epoch": 1.7958921694480101, + "grad_norm": 1.7794653177261353, + "learning_rate": 5.383825417201541e-06, + "loss": 0.7997, + "step": 1399 + }, + { + "epoch": 1.797175866495507, + "grad_norm": 1.7688692808151245, + "learning_rate": 5.387676508344031e-06, + "loss": 0.7862, + "step": 1400 + }, + { + "epoch": 1.7984595635430038, + "grad_norm": 1.5728416442871094, + "learning_rate": 
5.391527599486521e-06, + "loss": 0.8026, + "step": 1401 + }, + { + "epoch": 1.7997432605905006, + "grad_norm": 13.011189460754395, + "learning_rate": 5.395378690629012e-06, + "loss": 0.8038, + "step": 1402 + }, + { + "epoch": 1.8010269576379976, + "grad_norm": 2.069424867630005, + "learning_rate": 5.399229781771502e-06, + "loss": 0.8367, + "step": 1403 + }, + { + "epoch": 1.8023106546854941, + "grad_norm": 2.1398582458496094, + "learning_rate": 5.403080872913993e-06, + "loss": 0.7941, + "step": 1404 + }, + { + "epoch": 1.803594351732991, + "grad_norm": 2.424912452697754, + "learning_rate": 5.406931964056483e-06, + "loss": 0.8449, + "step": 1405 + }, + { + "epoch": 1.8048780487804879, + "grad_norm": 1.7023310661315918, + "learning_rate": 5.410783055198973e-06, + "loss": 0.8315, + "step": 1406 + }, + { + "epoch": 1.8061617458279846, + "grad_norm": 2.4309139251708984, + "learning_rate": 5.414634146341463e-06, + "loss": 0.785, + "step": 1407 + }, + { + "epoch": 1.8074454428754814, + "grad_norm": 2.1415693759918213, + "learning_rate": 5.418485237483954e-06, + "loss": 0.7946, + "step": 1408 + }, + { + "epoch": 1.8087291399229781, + "grad_norm": 1.658610224723816, + "learning_rate": 5.422336328626444e-06, + "loss": 0.7825, + "step": 1409 + }, + { + "epoch": 1.810012836970475, + "grad_norm": 1.768970012664795, + "learning_rate": 5.426187419768935e-06, + "loss": 0.8553, + "step": 1410 + }, + { + "epoch": 1.8112965340179716, + "grad_norm": 2.55000638961792, + "learning_rate": 5.430038510911425e-06, + "loss": 0.9041, + "step": 1411 + }, + { + "epoch": 1.8125802310654686, + "grad_norm": 2.3782191276550293, + "learning_rate": 5.433889602053915e-06, + "loss": 0.8596, + "step": 1412 + }, + { + "epoch": 1.8138639281129654, + "grad_norm": 1.9737181663513184, + "learning_rate": 5.4377406931964055e-06, + "loss": 0.821, + "step": 1413 + }, + { + "epoch": 1.8151476251604621, + "grad_norm": 2.2545249462127686, + "learning_rate": 5.441591784338897e-06, + "loss": 0.8472, + "step": 1414 + 
}, + { + "epoch": 1.8164313222079589, + "grad_norm": 2.795321226119995, + "learning_rate": 5.445442875481386e-06, + "loss": 0.8301, + "step": 1415 + }, + { + "epoch": 1.8177150192554556, + "grad_norm": 3.1141021251678467, + "learning_rate": 5.449293966623877e-06, + "loss": 0.7967, + "step": 1416 + }, + { + "epoch": 1.8189987163029526, + "grad_norm": 2.5269522666931152, + "learning_rate": 5.453145057766367e-06, + "loss": 0.8651, + "step": 1417 + }, + { + "epoch": 1.8202824133504492, + "grad_norm": 1.9397028684616089, + "learning_rate": 5.4569961489088575e-06, + "loss": 0.8358, + "step": 1418 + }, + { + "epoch": 1.8215661103979461, + "grad_norm": 3.2529122829437256, + "learning_rate": 5.460847240051348e-06, + "loss": 0.8698, + "step": 1419 + }, + { + "epoch": 1.822849807445443, + "grad_norm": 2.2545995712280273, + "learning_rate": 5.464698331193839e-06, + "loss": 0.8694, + "step": 1420 + }, + { + "epoch": 1.8241335044929397, + "grad_norm": 2.4143943786621094, + "learning_rate": 5.468549422336329e-06, + "loss": 0.8501, + "step": 1421 + }, + { + "epoch": 1.8254172015404364, + "grad_norm": 2.1708545684814453, + "learning_rate": 5.472400513478819e-06, + "loss": 0.9054, + "step": 1422 + }, + { + "epoch": 1.8267008985879332, + "grad_norm": 2.0110480785369873, + "learning_rate": 5.4762516046213094e-06, + "loss": 0.9006, + "step": 1423 + }, + { + "epoch": 1.8279845956354301, + "grad_norm": 2.236804962158203, + "learning_rate": 5.4801026957638e-06, + "loss": 0.9035, + "step": 1424 + }, + { + "epoch": 1.8292682926829267, + "grad_norm": 1.8953403234481812, + "learning_rate": 5.483953786906291e-06, + "loss": 0.8568, + "step": 1425 + }, + { + "epoch": 1.8305519897304237, + "grad_norm": 2.655092477798462, + "learning_rate": 5.48780487804878e-06, + "loss": 0.9689, + "step": 1426 + }, + { + "epoch": 1.8318356867779204, + "grad_norm": 2.8191285133361816, + "learning_rate": 5.491655969191271e-06, + "loss": 1.0355, + "step": 1427 + }, + { + "epoch": 1.8331193838254172, + "grad_norm": 
3.9550411701202393, + "learning_rate": 5.495507060333761e-06, + "loss": 1.0394, + "step": 1428 + }, + { + "epoch": 1.8344030808729141, + "grad_norm": 5.280532360076904, + "learning_rate": 5.499358151476252e-06, + "loss": 1.2152, + "step": 1429 + }, + { + "epoch": 1.8356867779204107, + "grad_norm": 2.5155625343322754, + "learning_rate": 5.503209242618742e-06, + "loss": 0.8135, + "step": 1430 + }, + { + "epoch": 1.8369704749679077, + "grad_norm": 1.472406268119812, + "learning_rate": 5.507060333761233e-06, + "loss": 0.7433, + "step": 1431 + }, + { + "epoch": 1.8382541720154044, + "grad_norm": 1.8811485767364502, + "learning_rate": 5.510911424903722e-06, + "loss": 0.7782, + "step": 1432 + }, + { + "epoch": 1.8395378690629012, + "grad_norm": 1.7714413404464722, + "learning_rate": 5.514762516046213e-06, + "loss": 0.7615, + "step": 1433 + }, + { + "epoch": 1.840821566110398, + "grad_norm": 1.5416642427444458, + "learning_rate": 5.518613607188704e-06, + "loss": 0.7806, + "step": 1434 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 2.0709874629974365, + "learning_rate": 5.522464698331194e-06, + "loss": 0.7656, + "step": 1435 + }, + { + "epoch": 1.8433889602053917, + "grad_norm": 1.7069028615951538, + "learning_rate": 5.526315789473684e-06, + "loss": 0.7773, + "step": 1436 + }, + { + "epoch": 1.8446726572528882, + "grad_norm": 1.4919087886810303, + "learning_rate": 5.530166880616175e-06, + "loss": 0.7748, + "step": 1437 + }, + { + "epoch": 1.8459563543003852, + "grad_norm": 1.6732155084609985, + "learning_rate": 5.5340179717586645e-06, + "loss": 0.8158, + "step": 1438 + }, + { + "epoch": 1.847240051347882, + "grad_norm": 1.5237563848495483, + "learning_rate": 5.5378690629011556e-06, + "loss": 0.7595, + "step": 1439 + }, + { + "epoch": 1.8485237483953787, + "grad_norm": 2.4435250759124756, + "learning_rate": 5.541720154043646e-06, + "loss": 0.7944, + "step": 1440 + }, + { + "epoch": 1.8498074454428755, + "grad_norm": 1.571574091911316, + "learning_rate": 
5.545571245186136e-06, + "loss": 0.8154, + "step": 1441 + }, + { + "epoch": 1.8510911424903722, + "grad_norm": 1.4353934526443481, + "learning_rate": 5.549422336328627e-06, + "loss": 0.7462, + "step": 1442 + }, + { + "epoch": 1.8523748395378692, + "grad_norm": 1.5112600326538086, + "learning_rate": 5.553273427471117e-06, + "loss": 0.7407, + "step": 1443 + }, + { + "epoch": 1.8536585365853657, + "grad_norm": 1.9727880954742432, + "learning_rate": 5.5571245186136075e-06, + "loss": 0.7357, + "step": 1444 + }, + { + "epoch": 1.8549422336328627, + "grad_norm": 1.972170114517212, + "learning_rate": 5.560975609756098e-06, + "loss": 0.7578, + "step": 1445 + }, + { + "epoch": 1.8562259306803595, + "grad_norm": 2.20935320854187, + "learning_rate": 5.564826700898589e-06, + "loss": 0.7795, + "step": 1446 + }, + { + "epoch": 1.8575096277278562, + "grad_norm": 1.7336899042129517, + "learning_rate": 5.568677792041078e-06, + "loss": 0.7799, + "step": 1447 + }, + { + "epoch": 1.858793324775353, + "grad_norm": 1.3770073652267456, + "learning_rate": 5.572528883183569e-06, + "loss": 0.782, + "step": 1448 + }, + { + "epoch": 1.8600770218228497, + "grad_norm": 1.6547600030899048, + "learning_rate": 5.5763799743260595e-06, + "loss": 0.7844, + "step": 1449 + }, + { + "epoch": 1.8613607188703467, + "grad_norm": 2.1183316707611084, + "learning_rate": 5.58023106546855e-06, + "loss": 0.8055, + "step": 1450 + }, + { + "epoch": 1.8626444159178432, + "grad_norm": 1.7332578897476196, + "learning_rate": 5.58408215661104e-06, + "loss": 0.8351, + "step": 1451 + }, + { + "epoch": 1.8639281129653402, + "grad_norm": 1.8125332593917847, + "learning_rate": 5.58793324775353e-06, + "loss": 0.8142, + "step": 1452 + }, + { + "epoch": 1.865211810012837, + "grad_norm": 2.6305906772613525, + "learning_rate": 5.59178433889602e-06, + "loss": 0.7653, + "step": 1453 + }, + { + "epoch": 1.8664955070603337, + "grad_norm": 1.7893224954605103, + "learning_rate": 5.5956354300385114e-06, + "loss": 0.7748, + "step": 1454 
+ }, + { + "epoch": 1.8677792041078307, + "grad_norm": 2.7238190174102783, + "learning_rate": 5.599486521181001e-06, + "loss": 0.7914, + "step": 1455 + }, + { + "epoch": 1.8690629011553272, + "grad_norm": 2.4554357528686523, + "learning_rate": 5.603337612323492e-06, + "loss": 0.7942, + "step": 1456 + }, + { + "epoch": 1.8703465982028242, + "grad_norm": 2.1668176651000977, + "learning_rate": 5.607188703465982e-06, + "loss": 0.8342, + "step": 1457 + }, + { + "epoch": 1.8716302952503208, + "grad_norm": 1.8895982503890991, + "learning_rate": 5.611039794608472e-06, + "loss": 0.7897, + "step": 1458 + }, + { + "epoch": 1.8729139922978177, + "grad_norm": 1.6011098623275757, + "learning_rate": 5.6148908857509626e-06, + "loss": 0.8199, + "step": 1459 + }, + { + "epoch": 1.8741976893453145, + "grad_norm": 1.6772363185882568, + "learning_rate": 5.618741976893454e-06, + "loss": 0.7991, + "step": 1460 + }, + { + "epoch": 1.8754813863928113, + "grad_norm": 2.024881601333618, + "learning_rate": 5.622593068035943e-06, + "loss": 0.8381, + "step": 1461 + }, + { + "epoch": 1.8767650834403082, + "grad_norm": 2.9943082332611084, + "learning_rate": 5.626444159178434e-06, + "loss": 0.7902, + "step": 1462 + }, + { + "epoch": 1.8780487804878048, + "grad_norm": 2.3227345943450928, + "learning_rate": 5.630295250320924e-06, + "loss": 0.8682, + "step": 1463 + }, + { + "epoch": 1.8793324775353017, + "grad_norm": 3.3954648971557617, + "learning_rate": 5.6341463414634145e-06, + "loss": 0.8473, + "step": 1464 + }, + { + "epoch": 1.8806161745827985, + "grad_norm": 2.909148931503296, + "learning_rate": 5.637997432605906e-06, + "loss": 0.846, + "step": 1465 + }, + { + "epoch": 1.8818998716302953, + "grad_norm": 2.748157024383545, + "learning_rate": 5.641848523748396e-06, + "loss": 0.8512, + "step": 1466 + }, + { + "epoch": 1.883183568677792, + "grad_norm": 2.208620309829712, + "learning_rate": 5.645699614890886e-06, + "loss": 0.7904, + "step": 1467 + }, + { + "epoch": 1.8844672657252888, + 
"grad_norm": 2.778942584991455, + "learning_rate": 5.649550706033376e-06, + "loss": 0.7705, + "step": 1468 + }, + { + "epoch": 1.8857509627727858, + "grad_norm": 2.48745059967041, + "learning_rate": 5.653401797175867e-06, + "loss": 0.8331, + "step": 1469 + }, + { + "epoch": 1.8870346598202823, + "grad_norm": 2.5527689456939697, + "learning_rate": 5.657252888318357e-06, + "loss": 0.8448, + "step": 1470 + }, + { + "epoch": 1.8883183568677793, + "grad_norm": 6.363381385803223, + "learning_rate": 5.661103979460848e-06, + "loss": 0.8868, + "step": 1471 + }, + { + "epoch": 1.889602053915276, + "grad_norm": 3.199928045272827, + "learning_rate": 5.664955070603338e-06, + "loss": 0.854, + "step": 1472 + }, + { + "epoch": 1.8908857509627728, + "grad_norm": 2.086308479309082, + "learning_rate": 5.668806161745828e-06, + "loss": 0.8431, + "step": 1473 + }, + { + "epoch": 1.8921694480102695, + "grad_norm": 3.708441972732544, + "learning_rate": 5.6726572528883185e-06, + "loss": 0.866, + "step": 1474 + }, + { + "epoch": 1.8934531450577663, + "grad_norm": 3.289707899093628, + "learning_rate": 5.676508344030809e-06, + "loss": 0.9039, + "step": 1475 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 2.9239871501922607, + "learning_rate": 5.680359435173299e-06, + "loss": 0.919, + "step": 1476 + }, + { + "epoch": 1.8960205391527598, + "grad_norm": 3.7417044639587402, + "learning_rate": 5.68421052631579e-06, + "loss": 0.9533, + "step": 1477 + }, + { + "epoch": 1.8973042362002568, + "grad_norm": 3.1392247676849365, + "learning_rate": 5.688061617458279e-06, + "loss": 0.9506, + "step": 1478 + }, + { + "epoch": 1.8985879332477535, + "grad_norm": 3.1313633918762207, + "learning_rate": 5.69191270860077e-06, + "loss": 1.1295, + "step": 1479 + }, + { + "epoch": 1.8998716302952503, + "grad_norm": 1.9305497407913208, + "learning_rate": 5.695763799743261e-06, + "loss": 0.7959, + "step": 1480 + }, + { + "epoch": 1.901155327342747, + "grad_norm": 1.3088008165359497, + "learning_rate": 
5.699614890885751e-06, + "loss": 0.7484, + "step": 1481 + }, + { + "epoch": 1.9024390243902438, + "grad_norm": 1.676769495010376, + "learning_rate": 5.703465982028241e-06, + "loss": 0.7755, + "step": 1482 + }, + { + "epoch": 1.9037227214377408, + "grad_norm": 1.4382950067520142, + "learning_rate": 5.707317073170732e-06, + "loss": 0.7501, + "step": 1483 + }, + { + "epoch": 1.9050064184852373, + "grad_norm": 1.3181179761886597, + "learning_rate": 5.7111681643132215e-06, + "loss": 0.7691, + "step": 1484 + }, + { + "epoch": 1.9062901155327343, + "grad_norm": 1.5302432775497437, + "learning_rate": 5.715019255455713e-06, + "loss": 0.7622, + "step": 1485 + }, + { + "epoch": 1.907573812580231, + "grad_norm": 1.3199634552001953, + "learning_rate": 5.718870346598203e-06, + "loss": 0.7609, + "step": 1486 + }, + { + "epoch": 1.9088575096277278, + "grad_norm": 1.370903730392456, + "learning_rate": 5.722721437740693e-06, + "loss": 0.8148, + "step": 1487 + }, + { + "epoch": 1.9101412066752248, + "grad_norm": 1.6248453855514526, + "learning_rate": 5.726572528883184e-06, + "loss": 0.772, + "step": 1488 + }, + { + "epoch": 1.9114249037227213, + "grad_norm": 1.3796658515930176, + "learning_rate": 5.730423620025674e-06, + "loss": 0.7158, + "step": 1489 + }, + { + "epoch": 1.9127086007702183, + "grad_norm": 1.6035256385803223, + "learning_rate": 5.7342747111681646e-06, + "loss": 0.776, + "step": 1490 + }, + { + "epoch": 1.913992297817715, + "grad_norm": 1.3770534992218018, + "learning_rate": 5.738125802310655e-06, + "loss": 0.7254, + "step": 1491 + }, + { + "epoch": 1.9152759948652118, + "grad_norm": 1.642757534980774, + "learning_rate": 5.741976893453146e-06, + "loss": 0.7739, + "step": 1492 + }, + { + "epoch": 1.9165596919127086, + "grad_norm": 1.4043824672698975, + "learning_rate": 5.745827984595635e-06, + "loss": 0.773, + "step": 1493 + }, + { + "epoch": 1.9178433889602053, + "grad_norm": 1.7742558717727661, + "learning_rate": 5.749679075738126e-06, + "loss": 0.7618, + "step": 1494 
+ }, + { + "epoch": 1.9191270860077023, + "grad_norm": 1.5724658966064453, + "learning_rate": 5.7535301668806165e-06, + "loss": 0.7904, + "step": 1495 + }, + { + "epoch": 1.9204107830551989, + "grad_norm": 1.7831647396087646, + "learning_rate": 5.757381258023107e-06, + "loss": 0.7545, + "step": 1496 + }, + { + "epoch": 1.9216944801026958, + "grad_norm": 1.5997084379196167, + "learning_rate": 5.761232349165597e-06, + "loss": 0.734, + "step": 1497 + }, + { + "epoch": 1.9229781771501926, + "grad_norm": 2.223217487335205, + "learning_rate": 5.765083440308088e-06, + "loss": 0.7705, + "step": 1498 + }, + { + "epoch": 1.9242618741976893, + "grad_norm": 2.237731456756592, + "learning_rate": 5.7689345314505774e-06, + "loss": 0.8038, + "step": 1499 + }, + { + "epoch": 1.925545571245186, + "grad_norm": 1.5142958164215088, + "learning_rate": 5.7727856225930685e-06, + "loss": 0.7473, + "step": 1500 + }, + { + "epoch": 1.9268292682926829, + "grad_norm": 1.9462628364562988, + "learning_rate": 5.776636713735558e-06, + "loss": 0.7448, + "step": 1501 + }, + { + "epoch": 1.9281129653401798, + "grad_norm": 2.3250038623809814, + "learning_rate": 5.780487804878049e-06, + "loss": 0.7867, + "step": 1502 + }, + { + "epoch": 1.9293966623876764, + "grad_norm": 1.5466455221176147, + "learning_rate": 5.784338896020539e-06, + "loss": 0.7344, + "step": 1503 + }, + { + "epoch": 1.9306803594351734, + "grad_norm": 1.8115874528884888, + "learning_rate": 5.788189987163029e-06, + "loss": 0.751, + "step": 1504 + }, + { + "epoch": 1.9319640564826701, + "grad_norm": 1.7308756113052368, + "learning_rate": 5.79204107830552e-06, + "loss": 0.7452, + "step": 1505 + }, + { + "epoch": 1.9332477535301669, + "grad_norm": 1.7284057140350342, + "learning_rate": 5.795892169448011e-06, + "loss": 0.78, + "step": 1506 + }, + { + "epoch": 1.9345314505776636, + "grad_norm": 2.309591054916382, + "learning_rate": 5.7997432605905e-06, + "loss": 0.7793, + "step": 1507 + }, + { + "epoch": 1.9358151476251604, + "grad_norm": 
2.2311527729034424, + "learning_rate": 5.803594351732991e-06, + "loss": 0.8253, + "step": 1508 + }, + { + "epoch": 1.9370988446726574, + "grad_norm": 2.10443377494812, + "learning_rate": 5.807445442875481e-06, + "loss": 0.8258, + "step": 1509 + }, + { + "epoch": 1.938382541720154, + "grad_norm": 1.9053466320037842, + "learning_rate": 5.811296534017972e-06, + "loss": 0.8039, + "step": 1510 + }, + { + "epoch": 1.9396662387676509, + "grad_norm": 2.188899040222168, + "learning_rate": 5.815147625160463e-06, + "loss": 0.79, + "step": 1511 + }, + { + "epoch": 1.9409499358151476, + "grad_norm": 1.8809665441513062, + "learning_rate": 5.818998716302953e-06, + "loss": 0.8012, + "step": 1512 + }, + { + "epoch": 1.9422336328626444, + "grad_norm": 1.9047067165374756, + "learning_rate": 5.822849807445443e-06, + "loss": 0.8209, + "step": 1513 + }, + { + "epoch": 1.9435173299101414, + "grad_norm": 2.9734208583831787, + "learning_rate": 5.826700898587933e-06, + "loss": 0.8573, + "step": 1514 + }, + { + "epoch": 1.944801026957638, + "grad_norm": 2.403470277786255, + "learning_rate": 5.830551989730424e-06, + "loss": 0.7851, + "step": 1515 + }, + { + "epoch": 1.9460847240051349, + "grad_norm": 3.578660726547241, + "learning_rate": 5.834403080872914e-06, + "loss": 0.879, + "step": 1516 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 2.1649973392486572, + "learning_rate": 5.838254172015405e-06, + "loss": 0.8359, + "step": 1517 + }, + { + "epoch": 1.9486521181001284, + "grad_norm": 2.671109676361084, + "learning_rate": 5.842105263157895e-06, + "loss": 0.9083, + "step": 1518 + }, + { + "epoch": 1.9499358151476252, + "grad_norm": 2.1065287590026855, + "learning_rate": 5.845956354300385e-06, + "loss": 0.7914, + "step": 1519 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 6.381000518798828, + "learning_rate": 5.8498074454428755e-06, + "loss": 0.8338, + "step": 1520 + }, + { + "epoch": 1.9525032092426189, + "grad_norm": 4.0105061531066895, + "learning_rate": 5.853658536585367e-06, 
+ "loss": 0.8151, + "step": 1521 + }, + { + "epoch": 1.9537869062901154, + "grad_norm": 2.6209473609924316, + "learning_rate": 5.857509627727856e-06, + "loss": 0.8174, + "step": 1522 + }, + { + "epoch": 1.9550706033376124, + "grad_norm": 3.4394774436950684, + "learning_rate": 5.861360718870347e-06, + "loss": 0.9273, + "step": 1523 + }, + { + "epoch": 1.9563543003851092, + "grad_norm": 2.3990962505340576, + "learning_rate": 5.865211810012837e-06, + "loss": 0.8462, + "step": 1524 + }, + { + "epoch": 1.957637997432606, + "grad_norm": 3.15592098236084, + "learning_rate": 5.8690629011553275e-06, + "loss": 0.9171, + "step": 1525 + }, + { + "epoch": 1.9589216944801027, + "grad_norm": 4.355759620666504, + "learning_rate": 5.872913992297818e-06, + "loss": 0.9454, + "step": 1526 + }, + { + "epoch": 1.9602053915275994, + "grad_norm": 3.930795669555664, + "learning_rate": 5.876765083440308e-06, + "loss": 0.9872, + "step": 1527 + }, + { + "epoch": 1.9614890885750964, + "grad_norm": 4.212033748626709, + "learning_rate": 5.880616174582798e-06, + "loss": 0.9136, + "step": 1528 + }, + { + "epoch": 1.962772785622593, + "grad_norm": NaN, + "learning_rate": 5.880616174582798e-06, + "loss": 1.1568, + "step": 1529 + }, + { + "epoch": 1.96405648267009, + "grad_norm": 1.8599194288253784, + "learning_rate": 5.884467265725289e-06, + "loss": 0.7616, + "step": 1530 + }, + { + "epoch": 1.9653401797175867, + "grad_norm": 1.6734473705291748, + "learning_rate": 5.888318356867779e-06, + "loss": 0.7866, + "step": 1531 + }, + { + "epoch": 1.9666238767650834, + "grad_norm": 1.7172845602035522, + "learning_rate": 5.89216944801027e-06, + "loss": 0.7132, + "step": 1532 + }, + { + "epoch": 1.9679075738125802, + "grad_norm": 1.8678314685821533, + "learning_rate": 5.896020539152761e-06, + "loss": 0.7199, + "step": 1533 + }, + { + "epoch": 1.969191270860077, + "grad_norm": 1.3816288709640503, + "learning_rate": 5.89987163029525e-06, + "loss": 0.7433, + "step": 1534 + }, + { + "epoch": 1.970474967907574, + 
"grad_norm": 1.5060051679611206, + "learning_rate": 5.903722721437741e-06, + "loss": 0.7521, + "step": 1535 + }, + { + "epoch": 1.9717586649550705, + "grad_norm": 1.6861637830734253, + "learning_rate": 5.907573812580231e-06, + "loss": 0.7358, + "step": 1536 + }, + { + "epoch": 1.9730423620025674, + "grad_norm": 1.6095949411392212, + "learning_rate": 5.911424903722722e-06, + "loss": 0.7702, + "step": 1537 + }, + { + "epoch": 1.9743260590500642, + "grad_norm": 2.3325228691101074, + "learning_rate": 5.915275994865212e-06, + "loss": 0.7533, + "step": 1538 + }, + { + "epoch": 1.975609756097561, + "grad_norm": 2.0828750133514404, + "learning_rate": 5.919127086007703e-06, + "loss": 0.7411, + "step": 1539 + }, + { + "epoch": 1.976893453145058, + "grad_norm": 1.7412211894989014, + "learning_rate": 5.922978177150192e-06, + "loss": 0.7893, + "step": 1540 + }, + { + "epoch": 1.9781771501925545, + "grad_norm": 1.7784903049468994, + "learning_rate": 5.926829268292683e-06, + "loss": 0.7222, + "step": 1541 + }, + { + "epoch": 1.9794608472400514, + "grad_norm": 2.088312864303589, + "learning_rate": 5.930680359435174e-06, + "loss": 0.7158, + "step": 1542 + }, + { + "epoch": 1.980744544287548, + "grad_norm": 2.1615777015686035, + "learning_rate": 5.934531450577664e-06, + "loss": 0.7465, + "step": 1543 + }, + { + "epoch": 1.982028241335045, + "grad_norm": 2.0436060428619385, + "learning_rate": 5.938382541720154e-06, + "loss": 0.7957, + "step": 1544 + }, + { + "epoch": 1.9833119383825417, + "grad_norm": 1.7182424068450928, + "learning_rate": 5.942233632862645e-06, + "loss": 0.7504, + "step": 1545 + }, + { + "epoch": 1.9845956354300385, + "grad_norm": 2.267867088317871, + "learning_rate": 5.9460847240051345e-06, + "loss": 0.8268, + "step": 1546 + }, + { + "epoch": 1.9858793324775355, + "grad_norm": 2.7819106578826904, + "learning_rate": 5.9499358151476256e-06, + "loss": 0.8144, + "step": 1547 + }, + { + "epoch": 1.987163029525032, + "grad_norm": 2.3286678791046143, + "learning_rate": 
5.953786906290116e-06, + "loss": 0.8023, + "step": 1548 + }, + { + "epoch": 1.988446726572529, + "grad_norm": 2.3523731231689453, + "learning_rate": 5.957637997432606e-06, + "loss": 0.8847, + "step": 1549 + }, + { + "epoch": 1.9897304236200257, + "grad_norm": 3.509908676147461, + "learning_rate": 5.961489088575096e-06, + "loss": 0.7913, + "step": 1550 + }, + { + "epoch": 1.9910141206675225, + "grad_norm": 2.544581413269043, + "learning_rate": 5.965340179717587e-06, + "loss": 0.8144, + "step": 1551 + }, + { + "epoch": 1.9922978177150192, + "grad_norm": 2.7753536701202393, + "learning_rate": 5.969191270860077e-06, + "loss": 0.8765, + "step": 1552 + }, + { + "epoch": 1.993581514762516, + "grad_norm": 2.9341447353363037, + "learning_rate": 5.973042362002568e-06, + "loss": 0.8261, + "step": 1553 + }, + { + "epoch": 1.994865211810013, + "grad_norm": 2.3511977195739746, + "learning_rate": 5.976893453145057e-06, + "loss": 0.8331, + "step": 1554 + }, + { + "epoch": 1.9961489088575095, + "grad_norm": 7.132384300231934, + "learning_rate": 5.980744544287548e-06, + "loss": 0.9153, + "step": 1555 + }, + { + "epoch": 1.9974326059050065, + "grad_norm": 3.2519428730010986, + "learning_rate": 5.984595635430039e-06, + "loss": 0.8802, + "step": 1556 + }, + { + "epoch": 1.9987163029525032, + "grad_norm": 4.2981953620910645, + "learning_rate": 5.988446726572529e-06, + "loss": 0.9419, + "step": 1557 + }, + { + "epoch": 2.0, + "grad_norm": 13.71098804473877, + "learning_rate": 5.99229781771502e-06, + "loss": 1.2074, + "step": 1558 + }, + { + "epoch": 2.001283697047497, + "grad_norm": 2.5463271141052246, + "learning_rate": 5.99614890885751e-06, + "loss": 0.7586, + "step": 1559 + }, + { + "epoch": 2.0025673940949935, + "grad_norm": 1.3912371397018433, + "learning_rate": 6e-06, + "loss": 0.7253, + "step": 1560 + }, + { + "epoch": 2.0038510911424905, + "grad_norm": 1.7173439264297485, + "learning_rate": 6.00385109114249e-06, + "loss": 0.7365, + "step": 1561 + }, + { + "epoch": 
2.005134788189987, + "grad_norm": 1.5195232629776, + "learning_rate": 6.0077021822849814e-06, + "loss": 0.6978, + "step": 1562 + }, + { + "epoch": 2.006418485237484, + "grad_norm": 1.364099144935608, + "learning_rate": 6.011553273427471e-06, + "loss": 0.7756, + "step": 1563 + }, + { + "epoch": 2.0077021822849805, + "grad_norm": 1.3880313634872437, + "learning_rate": 6.015404364569962e-06, + "loss": 0.7334, + "step": 1564 + }, + { + "epoch": 2.0089858793324775, + "grad_norm": 2.0267395973205566, + "learning_rate": 6.019255455712452e-06, + "loss": 0.7315, + "step": 1565 + }, + { + "epoch": 2.0102695763799745, + "grad_norm": 1.713462233543396, + "learning_rate": 6.023106546854942e-06, + "loss": 0.7795, + "step": 1566 + }, + { + "epoch": 2.011553273427471, + "grad_norm": 1.9033687114715576, + "learning_rate": 6.0269576379974326e-06, + "loss": 0.6925, + "step": 1567 + }, + { + "epoch": 2.012836970474968, + "grad_norm": 2.154813528060913, + "learning_rate": 6.030808729139924e-06, + "loss": 0.7147, + "step": 1568 + }, + { + "epoch": 2.0141206675224645, + "grad_norm": 1.956391453742981, + "learning_rate": 6.034659820282413e-06, + "loss": 0.744, + "step": 1569 + }, + { + "epoch": 2.0154043645699615, + "grad_norm": 2.850402355194092, + "learning_rate": 6.038510911424904e-06, + "loss": 0.7629, + "step": 1570 + }, + { + "epoch": 2.016688061617458, + "grad_norm": 2.223367691040039, + "learning_rate": 6.042362002567394e-06, + "loss": 0.7795, + "step": 1571 + }, + { + "epoch": 2.017971758664955, + "grad_norm": 1.5354862213134766, + "learning_rate": 6.0462130937098845e-06, + "loss": 0.7662, + "step": 1572 + }, + { + "epoch": 2.019255455712452, + "grad_norm": 1.4932434558868408, + "learning_rate": 6.050064184852375e-06, + "loss": 0.7817, + "step": 1573 + }, + { + "epoch": 2.0205391527599486, + "grad_norm": 2.0526251792907715, + "learning_rate": 6.053915275994866e-06, + "loss": 0.7473, + "step": 1574 + }, + { + "epoch": 2.0218228498074455, + "grad_norm": 1.7709757089614868, + 
"learning_rate": 6.057766367137355e-06, + "loss": 0.7423, + "step": 1575 + }, + { + "epoch": 2.023106546854942, + "grad_norm": 2.008697271347046, + "learning_rate": 6.061617458279846e-06, + "loss": 0.7255, + "step": 1576 + }, + { + "epoch": 2.024390243902439, + "grad_norm": 15.6879301071167, + "learning_rate": 6.0654685494223365e-06, + "loss": 0.7571, + "step": 1577 + }, + { + "epoch": 2.025673940949936, + "grad_norm": 2.0465943813323975, + "learning_rate": 6.069319640564827e-06, + "loss": 0.766, + "step": 1578 + }, + { + "epoch": 2.0269576379974326, + "grad_norm": 1.5307010412216187, + "learning_rate": 6.073170731707318e-06, + "loss": 0.7726, + "step": 1579 + }, + { + "epoch": 2.0282413350449295, + "grad_norm": 1.6262271404266357, + "learning_rate": 6.077021822849807e-06, + "loss": 0.7099, + "step": 1580 + }, + { + "epoch": 2.029525032092426, + "grad_norm": 1.6266850233078003, + "learning_rate": 6.080872913992298e-06, + "loss": 0.7263, + "step": 1581 + }, + { + "epoch": 2.030808729139923, + "grad_norm": 1.8722282648086548, + "learning_rate": 6.0847240051347885e-06, + "loss": 0.7909, + "step": 1582 + }, + { + "epoch": 2.0320924261874196, + "grad_norm": 1.827039361000061, + "learning_rate": 6.088575096277279e-06, + "loss": 0.7839, + "step": 1583 + }, + { + "epoch": 2.0333761232349166, + "grad_norm": 2.603746175765991, + "learning_rate": 6.092426187419769e-06, + "loss": 0.7109, + "step": 1584 + }, + { + "epoch": 2.0346598202824135, + "grad_norm": 1.9702829122543335, + "learning_rate": 6.09627727856226e-06, + "loss": 0.7458, + "step": 1585 + }, + { + "epoch": 2.03594351732991, + "grad_norm": 1.8424110412597656, + "learning_rate": 6.100128369704749e-06, + "loss": 0.7445, + "step": 1586 + }, + { + "epoch": 2.037227214377407, + "grad_norm": 2.3478522300720215, + "learning_rate": 6.10397946084724e-06, + "loss": 0.7523, + "step": 1587 + }, + { + "epoch": 2.0385109114249036, + "grad_norm": 2.360131025314331, + "learning_rate": 6.107830551989731e-06, + "loss": 0.8059, + 
"step": 1588 + }, + { + "epoch": 2.0397946084724006, + "grad_norm": 2.4755403995513916, + "learning_rate": 6.111681643132221e-06, + "loss": 0.7543, + "step": 1589 + }, + { + "epoch": 2.041078305519897, + "grad_norm": 3.6436901092529297, + "learning_rate": 6.115532734274711e-06, + "loss": 0.8046, + "step": 1590 + }, + { + "epoch": 2.042362002567394, + "grad_norm": 2.190985679626465, + "learning_rate": 6.119383825417202e-06, + "loss": 0.7913, + "step": 1591 + }, + { + "epoch": 2.043645699614891, + "grad_norm": 4.561742305755615, + "learning_rate": 6.1232349165596915e-06, + "loss": 0.7864, + "step": 1592 + }, + { + "epoch": 2.0449293966623876, + "grad_norm": 2.381412982940674, + "learning_rate": 6.127086007702183e-06, + "loss": 0.8068, + "step": 1593 + }, + { + "epoch": 2.0462130937098846, + "grad_norm": 2.7931458950042725, + "learning_rate": 6.130937098844673e-06, + "loss": 0.8889, + "step": 1594 + }, + { + "epoch": 2.047496790757381, + "grad_norm": 1.9566290378570557, + "learning_rate": 6.134788189987163e-06, + "loss": 0.7664, + "step": 1595 + }, + { + "epoch": 2.048780487804878, + "grad_norm": 2.3645718097686768, + "learning_rate": 6.138639281129653e-06, + "loss": 0.869, + "step": 1596 + }, + { + "epoch": 2.0500641848523746, + "grad_norm": 2.7810637950897217, + "learning_rate": 6.142490372272144e-06, + "loss": 0.8087, + "step": 1597 + }, + { + "epoch": 2.0513478818998716, + "grad_norm": 3.890446901321411, + "learning_rate": 6.146341463414634e-06, + "loss": 0.8374, + "step": 1598 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 4.18187952041626, + "learning_rate": 6.150192554557125e-06, + "loss": 0.8237, + "step": 1599 + }, + { + "epoch": 2.053915275994865, + "grad_norm": 3.183194160461426, + "learning_rate": 6.154043645699615e-06, + "loss": 0.9011, + "step": 1600 + }, + { + "epoch": 2.055198973042362, + "grad_norm": 4.183389186859131, + "learning_rate": 6.157894736842105e-06, + "loss": 0.7719, + "step": 1601 + }, + { + "epoch": 2.0564826700898586, + 
"grad_norm": 2.6897239685058594, + "learning_rate": 6.161745827984596e-06, + "loss": 0.8039, + "step": 1602 + }, + { + "epoch": 2.0577663671373556, + "grad_norm": 2.7941324710845947, + "learning_rate": 6.1655969191270865e-06, + "loss": 0.9124, + "step": 1603 + }, + { + "epoch": 2.0590500641848526, + "grad_norm": 2.9560792446136475, + "learning_rate": 6.169448010269577e-06, + "loss": 0.9633, + "step": 1604 + }, + { + "epoch": 2.060333761232349, + "grad_norm": 3.56866455078125, + "learning_rate": 6.173299101412067e-06, + "loss": 0.9238, + "step": 1605 + }, + { + "epoch": 2.061617458279846, + "grad_norm": 5.009387016296387, + "learning_rate": 6.177150192554557e-06, + "loss": 0.9251, + "step": 1606 + }, + { + "epoch": 2.0629011553273426, + "grad_norm": 3.168916940689087, + "learning_rate": 6.1810012836970474e-06, + "loss": 1.0126, + "step": 1607 + }, + { + "epoch": 2.0641848523748396, + "grad_norm": 4.643386363983154, + "learning_rate": 6.1848523748395385e-06, + "loss": 1.1652, + "step": 1608 + }, + { + "epoch": 2.065468549422336, + "grad_norm": 2.3194832801818848, + "learning_rate": 6.188703465982028e-06, + "loss": 0.7539, + "step": 1609 + }, + { + "epoch": 2.066752246469833, + "grad_norm": 1.4303942918777466, + "learning_rate": 6.192554557124519e-06, + "loss": 0.7083, + "step": 1610 + }, + { + "epoch": 2.06803594351733, + "grad_norm": 1.577144980430603, + "learning_rate": 6.196405648267009e-06, + "loss": 0.7633, + "step": 1611 + }, + { + "epoch": 2.0693196405648266, + "grad_norm": 1.8768211603164673, + "learning_rate": 6.200256739409499e-06, + "loss": 0.7316, + "step": 1612 + }, + { + "epoch": 2.0706033376123236, + "grad_norm": 1.7942155599594116, + "learning_rate": 6.20410783055199e-06, + "loss": 0.7244, + "step": 1613 + }, + { + "epoch": 2.07188703465982, + "grad_norm": 2.2462313175201416, + "learning_rate": 6.207958921694481e-06, + "loss": 0.7096, + "step": 1614 + }, + { + "epoch": 2.073170731707317, + "grad_norm": 1.7616679668426514, + "learning_rate": 
6.21181001283697e-06, + "loss": 0.7563, + "step": 1615 + }, + { + "epoch": 2.0744544287548137, + "grad_norm": 1.843617558479309, + "learning_rate": 6.215661103979461e-06, + "loss": 0.6927, + "step": 1616 + }, + { + "epoch": 2.0757381258023107, + "grad_norm": 2.0180912017822266, + "learning_rate": 6.219512195121951e-06, + "loss": 0.7316, + "step": 1617 + }, + { + "epoch": 2.0770218228498076, + "grad_norm": 1.9316391944885254, + "learning_rate": 6.223363286264442e-06, + "loss": 0.7396, + "step": 1618 + }, + { + "epoch": 2.078305519897304, + "grad_norm": 2.3898143768310547, + "learning_rate": 6.227214377406932e-06, + "loss": 0.7686, + "step": 1619 + }, + { + "epoch": 2.079589216944801, + "grad_norm": 1.972963809967041, + "learning_rate": 6.231065468549423e-06, + "loss": 0.7105, + "step": 1620 + }, + { + "epoch": 2.0808729139922977, + "grad_norm": 1.6520049571990967, + "learning_rate": 6.234916559691912e-06, + "loss": 0.7578, + "step": 1621 + }, + { + "epoch": 2.0821566110397947, + "grad_norm": 2.5335652828216553, + "learning_rate": 6.238767650834403e-06, + "loss": 0.7495, + "step": 1622 + }, + { + "epoch": 2.083440308087291, + "grad_norm": 1.3965239524841309, + "learning_rate": 6.2426187419768935e-06, + "loss": 0.7004, + "step": 1623 + }, + { + "epoch": 2.084724005134788, + "grad_norm": 2.453742027282715, + "learning_rate": 6.246469833119384e-06, + "loss": 0.7689, + "step": 1624 + }, + { + "epoch": 2.086007702182285, + "grad_norm": 3.4303698539733887, + "learning_rate": 6.250320924261875e-06, + "loss": 0.7063, + "step": 1625 + }, + { + "epoch": 2.0872913992297817, + "grad_norm": 1.9820915460586548, + "learning_rate": 6.254172015404365e-06, + "loss": 0.7147, + "step": 1626 + }, + { + "epoch": 2.0885750962772787, + "grad_norm": 2.9993793964385986, + "learning_rate": 6.258023106546855e-06, + "loss": 0.7296, + "step": 1627 + }, + { + "epoch": 2.089858793324775, + "grad_norm": 2.8399484157562256, + "learning_rate": 6.2618741976893455e-06, + "loss": 0.7421, + "step": 1628 + 
}, + { + "epoch": 2.091142490372272, + "grad_norm": 3.524822473526001, + "learning_rate": 6.265725288831836e-06, + "loss": 0.7281, + "step": 1629 + }, + { + "epoch": 2.092426187419769, + "grad_norm": 2.0225021839141846, + "learning_rate": 6.269576379974326e-06, + "loss": 0.7636, + "step": 1630 + }, + { + "epoch": 2.0937098844672657, + "grad_norm": 2.5709612369537354, + "learning_rate": 6.273427471116817e-06, + "loss": 0.7104, + "step": 1631 + }, + { + "epoch": 2.0949935815147627, + "grad_norm": 3.1324565410614014, + "learning_rate": 6.277278562259306e-06, + "loss": 0.7609, + "step": 1632 + }, + { + "epoch": 2.096277278562259, + "grad_norm": 2.223921537399292, + "learning_rate": 6.2811296534017975e-06, + "loss": 0.7801, + "step": 1633 + }, + { + "epoch": 2.097560975609756, + "grad_norm": 2.281264305114746, + "learning_rate": 6.284980744544288e-06, + "loss": 0.7764, + "step": 1634 + }, + { + "epoch": 2.0988446726572527, + "grad_norm": 2.2272696495056152, + "learning_rate": 6.288831835686778e-06, + "loss": 0.765, + "step": 1635 + }, + { + "epoch": 2.1001283697047497, + "grad_norm": 2.3255605697631836, + "learning_rate": 6.292682926829268e-06, + "loss": 0.7134, + "step": 1636 + }, + { + "epoch": 2.1014120667522467, + "grad_norm": 2.7561490535736084, + "learning_rate": 6.296534017971759e-06, + "loss": 0.807, + "step": 1637 + }, + { + "epoch": 2.102695763799743, + "grad_norm": 4.295324802398682, + "learning_rate": 6.300385109114249e-06, + "loss": 0.7687, + "step": 1638 + }, + { + "epoch": 2.10397946084724, + "grad_norm": 2.5178308486938477, + "learning_rate": 6.30423620025674e-06, + "loss": 0.8436, + "step": 1639 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 3.359222888946533, + "learning_rate": 6.30808729139923e-06, + "loss": 0.7723, + "step": 1640 + }, + { + "epoch": 2.1065468549422337, + "grad_norm": 2.502647876739502, + "learning_rate": 6.31193838254172e-06, + "loss": 0.7844, + "step": 1641 + }, + { + "epoch": 2.1078305519897302, + "grad_norm": 
1.7365052700042725, + "learning_rate": 6.31578947368421e-06, + "loss": 0.7255, + "step": 1642 + }, + { + "epoch": 2.109114249037227, + "grad_norm": 6.241297245025635, + "learning_rate": 6.319640564826701e-06, + "loss": 0.7851, + "step": 1643 + }, + { + "epoch": 2.110397946084724, + "grad_norm": 5.314526081085205, + "learning_rate": 6.323491655969191e-06, + "loss": 0.7866, + "step": 1644 + }, + { + "epoch": 2.1116816431322207, + "grad_norm": 2.9286420345306396, + "learning_rate": 6.327342747111682e-06, + "loss": 0.8178, + "step": 1645 + }, + { + "epoch": 2.1129653401797177, + "grad_norm": 4.282406330108643, + "learning_rate": 6.331193838254173e-06, + "loss": 0.8039, + "step": 1646 + }, + { + "epoch": 2.1142490372272142, + "grad_norm": 3.0099124908447266, + "learning_rate": 6.335044929396662e-06, + "loss": 0.8061, + "step": 1647 + }, + { + "epoch": 2.1155327342747112, + "grad_norm": 2.430280923843384, + "learning_rate": 6.338896020539153e-06, + "loss": 0.8423, + "step": 1648 + }, + { + "epoch": 2.1168164313222078, + "grad_norm": 2.727905035018921, + "learning_rate": 6.342747111681644e-06, + "loss": 0.8885, + "step": 1649 + }, + { + "epoch": 2.1181001283697047, + "grad_norm": 3.171844720840454, + "learning_rate": 6.346598202824134e-06, + "loss": 0.8298, + "step": 1650 + }, + { + "epoch": 2.1193838254172017, + "grad_norm": 3.8987674713134766, + "learning_rate": 6.350449293966624e-06, + "loss": 0.8701, + "step": 1651 + }, + { + "epoch": 2.1206675224646983, + "grad_norm": 2.567955255508423, + "learning_rate": 6.354300385109115e-06, + "loss": 0.8776, + "step": 1652 + }, + { + "epoch": 2.1219512195121952, + "grad_norm": 3.231529951095581, + "learning_rate": 6.3581514762516045e-06, + "loss": 0.8375, + "step": 1653 + }, + { + "epoch": 2.1232349165596918, + "grad_norm": 4.136504173278809, + "learning_rate": 6.3620025673940956e-06, + "loss": 0.8026, + "step": 1654 + }, + { + "epoch": 2.1245186136071887, + "grad_norm": 3.864003896713257, + "learning_rate": 
6.365853658536586e-06, + "loss": 0.8462, + "step": 1655 + }, + { + "epoch": 2.1258023106546853, + "grad_norm": 4.128402233123779, + "learning_rate": 6.369704749679076e-06, + "loss": 0.9241, + "step": 1656 + }, + { + "epoch": 2.1270860077021823, + "grad_norm": 3.691511869430542, + "learning_rate": 6.373555840821566e-06, + "loss": 0.9846, + "step": 1657 + }, + { + "epoch": 2.1283697047496792, + "grad_norm": 4.987913131713867, + "learning_rate": 6.3774069319640564e-06, + "loss": 1.2577, + "step": 1658 + }, + { + "epoch": 2.1296534017971758, + "grad_norm": 2.228168249130249, + "learning_rate": 6.381258023106547e-06, + "loss": 0.7231, + "step": 1659 + }, + { + "epoch": 2.1309370988446728, + "grad_norm": 1.8283109664916992, + "learning_rate": 6.385109114249038e-06, + "loss": 0.7562, + "step": 1660 + }, + { + "epoch": 2.1322207958921693, + "grad_norm": 1.7744511365890503, + "learning_rate": 6.388960205391527e-06, + "loss": 0.6822, + "step": 1661 + }, + { + "epoch": 2.1335044929396663, + "grad_norm": 1.7182488441467285, + "learning_rate": 6.392811296534018e-06, + "loss": 0.7718, + "step": 1662 + }, + { + "epoch": 2.1347881899871632, + "grad_norm": 1.4648926258087158, + "learning_rate": 6.396662387676508e-06, + "loss": 0.7622, + "step": 1663 + }, + { + "epoch": 2.1360718870346598, + "grad_norm": 2.109588384628296, + "learning_rate": 6.400513478818999e-06, + "loss": 0.7055, + "step": 1664 + }, + { + "epoch": 2.1373555840821568, + "grad_norm": 2.4787302017211914, + "learning_rate": 6.404364569961489e-06, + "loss": 0.7543, + "step": 1665 + }, + { + "epoch": 2.1386392811296533, + "grad_norm": 1.6417633295059204, + "learning_rate": 6.40821566110398e-06, + "loss": 0.7486, + "step": 1666 + }, + { + "epoch": 2.1399229781771503, + "grad_norm": 1.5459940433502197, + "learning_rate": 6.412066752246469e-06, + "loss": 0.6952, + "step": 1667 + }, + { + "epoch": 2.141206675224647, + "grad_norm": 1.9439849853515625, + "learning_rate": 6.41591784338896e-06, + "loss": 0.7472, + "step": 1668 
+ }, + { + "epoch": 2.142490372272144, + "grad_norm": 1.914218783378601, + "learning_rate": 6.4197689345314514e-06, + "loss": 0.7275, + "step": 1669 + }, + { + "epoch": 2.1437740693196408, + "grad_norm": 1.687313199043274, + "learning_rate": 6.423620025673941e-06, + "loss": 0.7753, + "step": 1670 + }, + { + "epoch": 2.1450577663671373, + "grad_norm": 2.721252202987671, + "learning_rate": 6.427471116816432e-06, + "loss": 0.7015, + "step": 1671 + }, + { + "epoch": 2.1463414634146343, + "grad_norm": 1.6778963804244995, + "learning_rate": 6.431322207958922e-06, + "loss": 0.7374, + "step": 1672 + }, + { + "epoch": 2.147625160462131, + "grad_norm": 1.5212852954864502, + "learning_rate": 6.435173299101412e-06, + "loss": 0.7438, + "step": 1673 + }, + { + "epoch": 2.148908857509628, + "grad_norm": 1.8767420053482056, + "learning_rate": 6.4390243902439026e-06, + "loss": 0.6981, + "step": 1674 + }, + { + "epoch": 2.1501925545571243, + "grad_norm": 2.0418570041656494, + "learning_rate": 6.442875481386394e-06, + "loss": 0.7784, + "step": 1675 + }, + { + "epoch": 2.1514762516046213, + "grad_norm": 2.715923547744751, + "learning_rate": 6.446726572528883e-06, + "loss": 0.7572, + "step": 1676 + }, + { + "epoch": 2.1527599486521183, + "grad_norm": 1.6924865245819092, + "learning_rate": 6.450577663671374e-06, + "loss": 0.696, + "step": 1677 + }, + { + "epoch": 2.154043645699615, + "grad_norm": 2.340894937515259, + "learning_rate": 6.454428754813864e-06, + "loss": 0.7699, + "step": 1678 + }, + { + "epoch": 2.155327342747112, + "grad_norm": 2.3373913764953613, + "learning_rate": 6.4582798459563545e-06, + "loss": 0.7067, + "step": 1679 + }, + { + "epoch": 2.1566110397946083, + "grad_norm": 2.2018649578094482, + "learning_rate": 6.462130937098845e-06, + "loss": 0.8045, + "step": 1680 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 3.7178008556365967, + "learning_rate": 6.465982028241335e-06, + "loss": 0.7722, + "step": 1681 + }, + { + "epoch": 2.1591784338896023, + "grad_norm": 
2.0105066299438477, + "learning_rate": 6.469833119383825e-06, + "loss": 0.7002, + "step": 1682 + }, + { + "epoch": 2.160462130937099, + "grad_norm": 2.2779154777526855, + "learning_rate": 6.473684210526316e-06, + "loss": 0.7781, + "step": 1683 + }, + { + "epoch": 2.161745827984596, + "grad_norm": 2.718207836151123, + "learning_rate": 6.477535301668806e-06, + "loss": 0.7996, + "step": 1684 + }, + { + "epoch": 2.1630295250320923, + "grad_norm": 2.3596205711364746, + "learning_rate": 6.481386392811297e-06, + "loss": 0.7765, + "step": 1685 + }, + { + "epoch": 2.1643132220795893, + "grad_norm": 2.3189146518707275, + "learning_rate": 6.485237483953787e-06, + "loss": 0.7572, + "step": 1686 + }, + { + "epoch": 2.165596919127086, + "grad_norm": 2.1305816173553467, + "learning_rate": 6.489088575096277e-06, + "loss": 0.7214, + "step": 1687 + }, + { + "epoch": 2.166880616174583, + "grad_norm": 2.232067584991455, + "learning_rate": 6.492939666238767e-06, + "loss": 0.7792, + "step": 1688 + }, + { + "epoch": 2.1681643132220794, + "grad_norm": 3.370026111602783, + "learning_rate": 6.4967907573812585e-06, + "loss": 0.798, + "step": 1689 + }, + { + "epoch": 2.1694480102695763, + "grad_norm": 2.151223659515381, + "learning_rate": 6.500641848523748e-06, + "loss": 0.7808, + "step": 1690 + }, + { + "epoch": 2.1707317073170733, + "grad_norm": 1.702335238456726, + "learning_rate": 6.504492939666239e-06, + "loss": 0.7806, + "step": 1691 + }, + { + "epoch": 2.17201540436457, + "grad_norm": 2.2111949920654297, + "learning_rate": 6.50834403080873e-06, + "loss": 0.755, + "step": 1692 + }, + { + "epoch": 2.173299101412067, + "grad_norm": 4.23499059677124, + "learning_rate": 6.512195121951219e-06, + "loss": 0.8025, + "step": 1693 + }, + { + "epoch": 2.1745827984595634, + "grad_norm": 2.1080093383789062, + "learning_rate": 6.51604621309371e-06, + "loss": 0.7975, + "step": 1694 + }, + { + "epoch": 2.1758664955070603, + "grad_norm": 2.261770248413086, + "learning_rate": 6.519897304236201e-06, + 
"loss": 0.837, + "step": 1695 + }, + { + "epoch": 2.1771501925545573, + "grad_norm": 2.4744255542755127, + "learning_rate": 6.523748395378691e-06, + "loss": 0.8257, + "step": 1696 + }, + { + "epoch": 2.178433889602054, + "grad_norm": 2.1199259757995605, + "learning_rate": 6.527599486521181e-06, + "loss": 0.7873, + "step": 1697 + }, + { + "epoch": 2.179717586649551, + "grad_norm": 2.30055570602417, + "learning_rate": 6.531450577663672e-06, + "loss": 0.8046, + "step": 1698 + }, + { + "epoch": 2.1810012836970474, + "grad_norm": 3.3697383403778076, + "learning_rate": 6.5353016688061615e-06, + "loss": 0.8501, + "step": 1699 + }, + { + "epoch": 2.1822849807445444, + "grad_norm": 2.786519765853882, + "learning_rate": 6.539152759948653e-06, + "loss": 0.8963, + "step": 1700 + }, + { + "epoch": 2.183568677792041, + "grad_norm": 2.1549735069274902, + "learning_rate": 6.543003851091143e-06, + "loss": 0.8357, + "step": 1701 + }, + { + "epoch": 2.184852374839538, + "grad_norm": 2.4145445823669434, + "learning_rate": 6.546854942233633e-06, + "loss": 0.8444, + "step": 1702 + }, + { + "epoch": 2.186136071887035, + "grad_norm": 3.101074695587158, + "learning_rate": 6.550706033376123e-06, + "loss": 0.8498, + "step": 1703 + }, + { + "epoch": 2.1874197689345314, + "grad_norm": 2.6718242168426514, + "learning_rate": 6.554557124518614e-06, + "loss": 0.8926, + "step": 1704 + }, + { + "epoch": 2.1887034659820284, + "grad_norm": 3.012305498123169, + "learning_rate": 6.558408215661104e-06, + "loss": 0.9276, + "step": 1705 + }, + { + "epoch": 2.189987163029525, + "grad_norm": 3.968097686767578, + "learning_rate": 6.562259306803595e-06, + "loss": 0.9874, + "step": 1706 + }, + { + "epoch": 2.191270860077022, + "grad_norm": 3.22845458984375, + "learning_rate": 6.566110397946084e-06, + "loss": 0.939, + "step": 1707 + }, + { + "epoch": 2.1925545571245184, + "grad_norm": 5.027524948120117, + "learning_rate": 6.569961489088575e-06, + "loss": 1.139, + "step": 1708 + }, + { + "epoch": 
2.1938382541720154, + "grad_norm": 1.9296035766601562, + "learning_rate": 6.5738125802310655e-06, + "loss": 0.7472, + "step": 1709 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 1.4763509035110474, + "learning_rate": 6.577663671373556e-06, + "loss": 0.704, + "step": 1710 + }, + { + "epoch": 2.196405648267009, + "grad_norm": 1.4714782238006592, + "learning_rate": 6.581514762516046e-06, + "loss": 0.697, + "step": 1711 + }, + { + "epoch": 2.197689345314506, + "grad_norm": 2.080443859100342, + "learning_rate": 6.585365853658537e-06, + "loss": 0.6773, + "step": 1712 + }, + { + "epoch": 2.1989730423620024, + "grad_norm": 2.4243502616882324, + "learning_rate": 6.589216944801026e-06, + "loss": 0.7178, + "step": 1713 + }, + { + "epoch": 2.2002567394094994, + "grad_norm": 1.532309889793396, + "learning_rate": 6.5930680359435174e-06, + "loss": 0.7077, + "step": 1714 + }, + { + "epoch": 2.2015404364569964, + "grad_norm": 2.251777172088623, + "learning_rate": 6.5969191270860085e-06, + "loss": 0.715, + "step": 1715 + }, + { + "epoch": 2.202824133504493, + "grad_norm": 1.4279649257659912, + "learning_rate": 6.600770218228498e-06, + "loss": 0.7612, + "step": 1716 + }, + { + "epoch": 2.20410783055199, + "grad_norm": 1.7135405540466309, + "learning_rate": 6.604621309370989e-06, + "loss": 0.7125, + "step": 1717 + }, + { + "epoch": 2.2053915275994864, + "grad_norm": 1.720750093460083, + "learning_rate": 6.608472400513479e-06, + "loss": 0.7464, + "step": 1718 + }, + { + "epoch": 2.2066752246469834, + "grad_norm": 1.5271373987197876, + "learning_rate": 6.612323491655969e-06, + "loss": 0.7295, + "step": 1719 + }, + { + "epoch": 2.20795892169448, + "grad_norm": 1.7914084196090698, + "learning_rate": 6.61617458279846e-06, + "loss": 0.6887, + "step": 1720 + }, + { + "epoch": 2.209242618741977, + "grad_norm": 1.2946442365646362, + "learning_rate": 6.620025673940951e-06, + "loss": 0.7077, + "step": 1721 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 2.0234453678131104, + 
"learning_rate": 6.62387676508344e-06, + "loss": 0.7607, + "step": 1722 + }, + { + "epoch": 2.2118100128369704, + "grad_norm": 1.9707190990447998, + "learning_rate": 6.627727856225931e-06, + "loss": 0.7998, + "step": 1723 + }, + { + "epoch": 2.2130937098844674, + "grad_norm": 1.547577142715454, + "learning_rate": 6.631578947368421e-06, + "loss": 0.7635, + "step": 1724 + }, + { + "epoch": 2.214377406931964, + "grad_norm": 2.195845365524292, + "learning_rate": 6.635430038510912e-06, + "loss": 0.7417, + "step": 1725 + }, + { + "epoch": 2.215661103979461, + "grad_norm": 1.6878598928451538, + "learning_rate": 6.639281129653402e-06, + "loss": 0.7446, + "step": 1726 + }, + { + "epoch": 2.2169448010269575, + "grad_norm": 1.8954248428344727, + "learning_rate": 6.643132220795893e-06, + "loss": 0.7313, + "step": 1727 + }, + { + "epoch": 2.2182284980744544, + "grad_norm": 1.5534183979034424, + "learning_rate": 6.646983311938382e-06, + "loss": 0.7439, + "step": 1728 + }, + { + "epoch": 2.2195121951219514, + "grad_norm": 1.8070266246795654, + "learning_rate": 6.650834403080873e-06, + "loss": 0.7467, + "step": 1729 + }, + { + "epoch": 2.220795892169448, + "grad_norm": 1.9085168838500977, + "learning_rate": 6.6546854942233635e-06, + "loss": 0.7694, + "step": 1730 + }, + { + "epoch": 2.222079589216945, + "grad_norm": 1.7086241245269775, + "learning_rate": 6.658536585365854e-06, + "loss": 0.7395, + "step": 1731 + }, + { + "epoch": 2.2233632862644415, + "grad_norm": 2.5762650966644287, + "learning_rate": 6.662387676508344e-06, + "loss": 0.7614, + "step": 1732 + }, + { + "epoch": 2.2246469833119384, + "grad_norm": 1.709143877029419, + "learning_rate": 6.666238767650834e-06, + "loss": 0.7644, + "step": 1733 + }, + { + "epoch": 2.225930680359435, + "grad_norm": 2.25691556930542, + "learning_rate": 6.6700898587933244e-06, + "loss": 0.7426, + "step": 1734 + }, + { + "epoch": 2.227214377406932, + "grad_norm": 2.2994158267974854, + "learning_rate": 6.6739409499358155e-06, + "loss": 0.7506, 
+ "step": 1735 + }, + { + "epoch": 2.228498074454429, + "grad_norm": 2.181989908218384, + "learning_rate": 6.677792041078306e-06, + "loss": 0.7677, + "step": 1736 + }, + { + "epoch": 2.2297817715019255, + "grad_norm": 2.195775032043457, + "learning_rate": 6.681643132220796e-06, + "loss": 0.7552, + "step": 1737 + }, + { + "epoch": 2.2310654685494224, + "grad_norm": 2.6668970584869385, + "learning_rate": 6.685494223363287e-06, + "loss": 0.8272, + "step": 1738 + }, + { + "epoch": 2.232349165596919, + "grad_norm": 2.000627040863037, + "learning_rate": 6.689345314505776e-06, + "loss": 0.6973, + "step": 1739 + }, + { + "epoch": 2.233632862644416, + "grad_norm": 1.930127501487732, + "learning_rate": 6.6931964056482675e-06, + "loss": 0.7497, + "step": 1740 + }, + { + "epoch": 2.2349165596919125, + "grad_norm": 1.920986533164978, + "learning_rate": 6.697047496790758e-06, + "loss": 0.7786, + "step": 1741 + }, + { + "epoch": 2.2362002567394095, + "grad_norm": 2.4109294414520264, + "learning_rate": 6.700898587933248e-06, + "loss": 0.7873, + "step": 1742 + }, + { + "epoch": 2.2374839537869065, + "grad_norm": 2.295384407043457, + "learning_rate": 6.704749679075738e-06, + "loss": 0.7143, + "step": 1743 + }, + { + "epoch": 2.238767650834403, + "grad_norm": 2.1234288215637207, + "learning_rate": 6.708600770218229e-06, + "loss": 0.7704, + "step": 1744 + }, + { + "epoch": 2.2400513478819, + "grad_norm": 2.244422197341919, + "learning_rate": 6.712451861360719e-06, + "loss": 0.8297, + "step": 1745 + }, + { + "epoch": 2.2413350449293965, + "grad_norm": 2.898283004760742, + "learning_rate": 6.71630295250321e-06, + "loss": 0.8284, + "step": 1746 + }, + { + "epoch": 2.2426187419768935, + "grad_norm": 2.8173458576202393, + "learning_rate": 6.7201540436457e-06, + "loss": 0.7982, + "step": 1747 + }, + { + "epoch": 2.2439024390243905, + "grad_norm": 2.4676144123077393, + "learning_rate": 6.72400513478819e-06, + "loss": 0.7651, + "step": 1748 + }, + { + "epoch": 2.245186136071887, + 
"grad_norm": 1.8276346921920776, + "learning_rate": 6.72785622593068e-06, + "loss": 0.8552, + "step": 1749 + }, + { + "epoch": 2.246469833119384, + "grad_norm": 2.940765142440796, + "learning_rate": 6.731707317073171e-06, + "loss": 0.7938, + "step": 1750 + }, + { + "epoch": 2.2477535301668805, + "grad_norm": 2.0617709159851074, + "learning_rate": 6.735558408215661e-06, + "loss": 0.7712, + "step": 1751 + }, + { + "epoch": 2.2490372272143775, + "grad_norm": 3.629711627960205, + "learning_rate": 6.739409499358152e-06, + "loss": 0.9254, + "step": 1752 + }, + { + "epoch": 2.250320924261874, + "grad_norm": 3.5587692260742188, + "learning_rate": 6.743260590500642e-06, + "loss": 0.8729, + "step": 1753 + }, + { + "epoch": 2.251604621309371, + "grad_norm": 2.472480535507202, + "learning_rate": 6.747111681643132e-06, + "loss": 0.8516, + "step": 1754 + }, + { + "epoch": 2.2528883183568675, + "grad_norm": 2.72774600982666, + "learning_rate": 6.7509627727856225e-06, + "loss": 0.9009, + "step": 1755 + }, + { + "epoch": 2.2541720154043645, + "grad_norm": 2.313599109649658, + "learning_rate": 6.754813863928114e-06, + "loss": 0.9379, + "step": 1756 + }, + { + "epoch": 2.2554557124518615, + "grad_norm": 3.393209457397461, + "learning_rate": 6.758664955070603e-06, + "loss": 1.012, + "step": 1757 + }, + { + "epoch": 2.256739409499358, + "grad_norm": 10.1670560836792, + "learning_rate": 6.762516046213094e-06, + "loss": 1.1161, + "step": 1758 + }, + { + "epoch": 2.258023106546855, + "grad_norm": 2.573876142501831, + "learning_rate": 6.766367137355584e-06, + "loss": 0.7437, + "step": 1759 + }, + { + "epoch": 2.2593068035943515, + "grad_norm": 1.4934701919555664, + "learning_rate": 6.7702182284980745e-06, + "loss": 0.7333, + "step": 1760 + }, + { + "epoch": 2.2605905006418485, + "grad_norm": 1.8912155628204346, + "learning_rate": 6.7740693196405655e-06, + "loss": 0.713, + "step": 1761 + }, + { + "epoch": 2.2618741976893455, + "grad_norm": 1.533610463142395, + "learning_rate": 
6.777920410783055e-06, + "loss": 0.7195, + "step": 1762 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.2622944116592407, + "learning_rate": 6.781771501925546e-06, + "loss": 0.7138, + "step": 1763 + }, + { + "epoch": 2.264441591784339, + "grad_norm": 1.4029089212417603, + "learning_rate": 6.785622593068036e-06, + "loss": 0.7236, + "step": 1764 + }, + { + "epoch": 2.2657252888318355, + "grad_norm": 1.4670441150665283, + "learning_rate": 6.7894736842105264e-06, + "loss": 0.7458, + "step": 1765 + }, + { + "epoch": 2.2670089858793325, + "grad_norm": 1.4024269580841064, + "learning_rate": 6.793324775353017e-06, + "loss": 0.6822, + "step": 1766 + }, + { + "epoch": 2.2682926829268295, + "grad_norm": 2.0888235569000244, + "learning_rate": 6.797175866495508e-06, + "loss": 0.711, + "step": 1767 + }, + { + "epoch": 2.269576379974326, + "grad_norm": 1.2418979406356812, + "learning_rate": 6.801026957637997e-06, + "loss": 0.6961, + "step": 1768 + }, + { + "epoch": 2.270860077021823, + "grad_norm": 1.7657526731491089, + "learning_rate": 6.804878048780488e-06, + "loss": 0.7567, + "step": 1769 + }, + { + "epoch": 2.2721437740693196, + "grad_norm": 1.5038788318634033, + "learning_rate": 6.808729139922978e-06, + "loss": 0.7067, + "step": 1770 + }, + { + "epoch": 2.2734274711168165, + "grad_norm": 1.522554874420166, + "learning_rate": 6.812580231065469e-06, + "loss": 0.7037, + "step": 1771 + }, + { + "epoch": 2.274711168164313, + "grad_norm": 2.6487700939178467, + "learning_rate": 6.816431322207959e-06, + "loss": 0.7319, + "step": 1772 + }, + { + "epoch": 2.27599486521181, + "grad_norm": 1.3566423654556274, + "learning_rate": 6.82028241335045e-06, + "loss": 0.7115, + "step": 1773 + }, + { + "epoch": 2.2772785622593066, + "grad_norm": 1.6638473272323608, + "learning_rate": 6.824133504492939e-06, + "loss": 0.7399, + "step": 1774 + }, + { + "epoch": 2.2785622593068036, + "grad_norm": 2.213059425354004, + "learning_rate": 6.82798459563543e-06, + "loss": 0.7442, + "step": 1775 + }, 
+ { + "epoch": 2.2798459563543005, + "grad_norm": 1.9682117700576782, + "learning_rate": 6.831835686777921e-06, + "loss": 0.7241, + "step": 1776 + }, + { + "epoch": 2.281129653401797, + "grad_norm": 1.7402169704437256, + "learning_rate": 6.835686777920411e-06, + "loss": 0.6959, + "step": 1777 + }, + { + "epoch": 2.282413350449294, + "grad_norm": 1.8042877912521362, + "learning_rate": 6.839537869062901e-06, + "loss": 0.7374, + "step": 1778 + }, + { + "epoch": 2.2836970474967906, + "grad_norm": 1.5333688259124756, + "learning_rate": 6.843388960205392e-06, + "loss": 0.7324, + "step": 1779 + }, + { + "epoch": 2.2849807445442876, + "grad_norm": 1.6414953470230103, + "learning_rate": 6.8472400513478815e-06, + "loss": 0.7557, + "step": 1780 + }, + { + "epoch": 2.2862644415917845, + "grad_norm": 2.310960531234741, + "learning_rate": 6.8510911424903726e-06, + "loss": 0.7075, + "step": 1781 + }, + { + "epoch": 2.287548138639281, + "grad_norm": 1.8483844995498657, + "learning_rate": 6.854942233632864e-06, + "loss": 0.7268, + "step": 1782 + }, + { + "epoch": 2.288831835686778, + "grad_norm": 3.26617693901062, + "learning_rate": 6.858793324775353e-06, + "loss": 0.7131, + "step": 1783 + }, + { + "epoch": 2.2901155327342746, + "grad_norm": 2.663968563079834, + "learning_rate": 6.862644415917844e-06, + "loss": 0.7664, + "step": 1784 + }, + { + "epoch": 2.2913992297817716, + "grad_norm": 1.69324791431427, + "learning_rate": 6.8664955070603335e-06, + "loss": 0.703, + "step": 1785 + }, + { + "epoch": 2.292682926829268, + "grad_norm": 1.7891249656677246, + "learning_rate": 6.8703465982028245e-06, + "loss": 0.7579, + "step": 1786 + }, + { + "epoch": 2.293966623876765, + "grad_norm": 3.1515653133392334, + "learning_rate": 6.874197689345315e-06, + "loss": 0.8074, + "step": 1787 + }, + { + "epoch": 2.295250320924262, + "grad_norm": 2.0920276641845703, + "learning_rate": 6.878048780487805e-06, + "loss": 0.7666, + "step": 1788 + }, + { + "epoch": 2.2965340179717586, + "grad_norm": 
2.6227288246154785, + "learning_rate": 6.881899871630295e-06, + "loss": 0.8633, + "step": 1789 + }, + { + "epoch": 2.2978177150192556, + "grad_norm": 3.053483009338379, + "learning_rate": 6.885750962772786e-06, + "loss": 0.8238, + "step": 1790 + }, + { + "epoch": 2.299101412066752, + "grad_norm": 2.2175662517547607, + "learning_rate": 6.889602053915276e-06, + "loss": 0.8116, + "step": 1791 + }, + { + "epoch": 2.300385109114249, + "grad_norm": 3.4466917514801025, + "learning_rate": 6.893453145057767e-06, + "loss": 0.8076, + "step": 1792 + }, + { + "epoch": 2.3016688061617456, + "grad_norm": 2.1490542888641357, + "learning_rate": 6.897304236200257e-06, + "loss": 0.783, + "step": 1793 + }, + { + "epoch": 2.3029525032092426, + "grad_norm": 1.7471797466278076, + "learning_rate": 6.901155327342747e-06, + "loss": 0.7962, + "step": 1794 + }, + { + "epoch": 2.3042362002567396, + "grad_norm": 2.795239210128784, + "learning_rate": 6.905006418485237e-06, + "loss": 0.7948, + "step": 1795 + }, + { + "epoch": 2.305519897304236, + "grad_norm": 2.7132856845855713, + "learning_rate": 6.9088575096277285e-06, + "loss": 0.8229, + "step": 1796 + }, + { + "epoch": 2.306803594351733, + "grad_norm": 3.877049684524536, + "learning_rate": 6.912708600770218e-06, + "loss": 0.7975, + "step": 1797 + }, + { + "epoch": 2.3080872913992296, + "grad_norm": 2.4308671951293945, + "learning_rate": 6.916559691912709e-06, + "loss": 0.7972, + "step": 1798 + }, + { + "epoch": 2.3093709884467266, + "grad_norm": 2.5257458686828613, + "learning_rate": 6.920410783055199e-06, + "loss": 0.8102, + "step": 1799 + }, + { + "epoch": 2.3106546854942236, + "grad_norm": 2.5649187564849854, + "learning_rate": 6.924261874197689e-06, + "loss": 0.8283, + "step": 1800 + }, + { + "epoch": 2.31193838254172, + "grad_norm": 2.765362501144409, + "learning_rate": 6.9281129653401796e-06, + "loss": 0.8108, + "step": 1801 + }, + { + "epoch": 2.313222079589217, + "grad_norm": 3.7559266090393066, + "learning_rate": 
6.931964056482671e-06, + "loss": 0.7945, + "step": 1802 + }, + { + "epoch": 2.3145057766367136, + "grad_norm": 3.669818162918091, + "learning_rate": 6.93581514762516e-06, + "loss": 0.8932, + "step": 1803 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 5.079169750213623, + "learning_rate": 6.939666238767651e-06, + "loss": 0.919, + "step": 1804 + }, + { + "epoch": 2.317073170731707, + "grad_norm": 2.720966339111328, + "learning_rate": 6.943517329910142e-06, + "loss": 0.9155, + "step": 1805 + }, + { + "epoch": 2.318356867779204, + "grad_norm": 3.286515712738037, + "learning_rate": 6.9473684210526315e-06, + "loss": 0.8543, + "step": 1806 + }, + { + "epoch": 2.3196405648267007, + "grad_norm": 4.1572651863098145, + "learning_rate": 6.951219512195123e-06, + "loss": 0.9635, + "step": 1807 + }, + { + "epoch": 2.3209242618741976, + "grad_norm": 7.731895923614502, + "learning_rate": 6.955070603337613e-06, + "loss": 1.1165, + "step": 1808 + }, + { + "epoch": 2.3222079589216946, + "grad_norm": 2.444322347640991, + "learning_rate": 6.958921694480103e-06, + "loss": 0.7443, + "step": 1809 + }, + { + "epoch": 2.323491655969191, + "grad_norm": 1.4779698848724365, + "learning_rate": 6.962772785622593e-06, + "loss": 0.6689, + "step": 1810 + }, + { + "epoch": 2.324775353016688, + "grad_norm": 1.755470633506775, + "learning_rate": 6.9666238767650835e-06, + "loss": 0.6947, + "step": 1811 + }, + { + "epoch": 2.3260590500641847, + "grad_norm": 1.7388988733291626, + "learning_rate": 6.970474967907574e-06, + "loss": 0.7181, + "step": 1812 + }, + { + "epoch": 2.3273427471116817, + "grad_norm": 1.847212553024292, + "learning_rate": 6.974326059050065e-06, + "loss": 0.6771, + "step": 1813 + }, + { + "epoch": 2.3286264441591786, + "grad_norm": 2.283888339996338, + "learning_rate": 6.978177150192554e-06, + "loss": 0.7562, + "step": 1814 + }, + { + "epoch": 2.329910141206675, + "grad_norm": 2.016596555709839, + "learning_rate": 6.982028241335045e-06, + "loss": 0.74, + "step": 1815 + }, + { + 
"epoch": 2.331193838254172, + "grad_norm": 2.7272679805755615, + "learning_rate": 6.9858793324775355e-06, + "loss": 0.7666, + "step": 1816 + }, + { + "epoch": 2.3324775353016687, + "grad_norm": 2.4127988815307617, + "learning_rate": 6.989730423620026e-06, + "loss": 0.7113, + "step": 1817 + }, + { + "epoch": 2.3337612323491657, + "grad_norm": 1.938099980354309, + "learning_rate": 6.993581514762516e-06, + "loss": 0.7311, + "step": 1818 + }, + { + "epoch": 2.335044929396662, + "grad_norm": 2.425567388534546, + "learning_rate": 6.997432605905007e-06, + "loss": 0.71, + "step": 1819 + }, + { + "epoch": 2.336328626444159, + "grad_norm": 1.8010281324386597, + "learning_rate": 7.001283697047496e-06, + "loss": 0.7233, + "step": 1820 + }, + { + "epoch": 2.337612323491656, + "grad_norm": 2.3880224227905273, + "learning_rate": 7.0051347881899874e-06, + "loss": 0.7814, + "step": 1821 + }, + { + "epoch": 2.3388960205391527, + "grad_norm": 2.0849931240081787, + "learning_rate": 7.008985879332478e-06, + "loss": 0.6926, + "step": 1822 + }, + { + "epoch": 2.3401797175866497, + "grad_norm": 2.1298954486846924, + "learning_rate": 7.012836970474968e-06, + "loss": 0.7113, + "step": 1823 + }, + { + "epoch": 2.341463414634146, + "grad_norm": 2.2092416286468506, + "learning_rate": 7.016688061617458e-06, + "loss": 0.6831, + "step": 1824 + }, + { + "epoch": 2.342747111681643, + "grad_norm": 2.387113571166992, + "learning_rate": 7.020539152759949e-06, + "loss": 0.7208, + "step": 1825 + }, + { + "epoch": 2.3440308087291397, + "grad_norm": 2.063612222671509, + "learning_rate": 7.024390243902439e-06, + "loss": 0.6839, + "step": 1826 + }, + { + "epoch": 2.3453145057766367, + "grad_norm": 1.9188421964645386, + "learning_rate": 7.02824133504493e-06, + "loss": 0.7331, + "step": 1827 + }, + { + "epoch": 2.3465982028241337, + "grad_norm": 2.923539161682129, + "learning_rate": 7.032092426187421e-06, + "loss": 0.7001, + "step": 1828 + }, + { + "epoch": 2.34788189987163, + "grad_norm": 2.287684679031372, 
+ "learning_rate": 7.03594351732991e-06, + "loss": 0.6564, + "step": 1829 + }, + { + "epoch": 2.349165596919127, + "grad_norm": 2.023214817047119, + "learning_rate": 7.039794608472401e-06, + "loss": 0.7326, + "step": 1830 + }, + { + "epoch": 2.3504492939666237, + "grad_norm": 2.513319730758667, + "learning_rate": 7.043645699614891e-06, + "loss": 0.7215, + "step": 1831 + }, + { + "epoch": 2.3517329910141207, + "grad_norm": 2.3860838413238525, + "learning_rate": 7.0474967907573816e-06, + "loss": 0.7639, + "step": 1832 + }, + { + "epoch": 2.3530166880616177, + "grad_norm": 2.4009170532226562, + "learning_rate": 7.051347881899872e-06, + "loss": 0.7284, + "step": 1833 + }, + { + "epoch": 2.354300385109114, + "grad_norm": 2.118798017501831, + "learning_rate": 7.055198973042362e-06, + "loss": 0.714, + "step": 1834 + }, + { + "epoch": 2.355584082156611, + "grad_norm": 1.90394926071167, + "learning_rate": 7.059050064184852e-06, + "loss": 0.7517, + "step": 1835 + }, + { + "epoch": 2.3568677792041077, + "grad_norm": 2.3333301544189453, + "learning_rate": 7.062901155327343e-06, + "loss": 0.7032, + "step": 1836 + }, + { + "epoch": 2.3581514762516047, + "grad_norm": 1.825311303138733, + "learning_rate": 7.066752246469833e-06, + "loss": 0.7409, + "step": 1837 + }, + { + "epoch": 2.3594351732991012, + "grad_norm": 2.249018907546997, + "learning_rate": 7.070603337612324e-06, + "loss": 0.732, + "step": 1838 + }, + { + "epoch": 2.360718870346598, + "grad_norm": 3.6433024406433105, + "learning_rate": 7.074454428754814e-06, + "loss": 0.7072, + "step": 1839 + }, + { + "epoch": 2.3620025673940948, + "grad_norm": 2.0886130332946777, + "learning_rate": 7.078305519897304e-06, + "loss": 0.7415, + "step": 1840 + }, + { + "epoch": 2.3632862644415917, + "grad_norm": 3.009723424911499, + "learning_rate": 7.0821566110397944e-06, + "loss": 0.7376, + "step": 1841 + }, + { + "epoch": 2.3645699614890887, + "grad_norm": 2.1183972358703613, + "learning_rate": 7.0860077021822855e-06, + "loss": 0.8195, + 
"step": 1842 + }, + { + "epoch": 2.3658536585365852, + "grad_norm": 2.012146472930908, + "learning_rate": 7.089858793324775e-06, + "loss": 0.8048, + "step": 1843 + }, + { + "epoch": 2.3671373555840822, + "grad_norm": 2.7662904262542725, + "learning_rate": 7.093709884467266e-06, + "loss": 0.7964, + "step": 1844 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 3.382594108581543, + "learning_rate": 7.097560975609756e-06, + "loss": 0.8311, + "step": 1845 + }, + { + "epoch": 2.3697047496790757, + "grad_norm": 2.2102553844451904, + "learning_rate": 7.101412066752246e-06, + "loss": 0.8063, + "step": 1846 + }, + { + "epoch": 2.3709884467265727, + "grad_norm": 2.9142749309539795, + "learning_rate": 7.105263157894737e-06, + "loss": 0.7642, + "step": 1847 + }, + { + "epoch": 2.3722721437740693, + "grad_norm": 2.292635440826416, + "learning_rate": 7.109114249037228e-06, + "loss": 0.7994, + "step": 1848 + }, + { + "epoch": 2.3735558408215662, + "grad_norm": 2.701422929763794, + "learning_rate": 7.112965340179718e-06, + "loss": 0.8122, + "step": 1849 + }, + { + "epoch": 2.3748395378690628, + "grad_norm": 12.489653587341309, + "learning_rate": 7.116816431322208e-06, + "loss": 0.7397, + "step": 1850 + }, + { + "epoch": 2.3761232349165597, + "grad_norm": 2.2861177921295166, + "learning_rate": 7.120667522464699e-06, + "loss": 0.8286, + "step": 1851 + }, + { + "epoch": 2.3774069319640567, + "grad_norm": 2.5226283073425293, + "learning_rate": 7.124518613607189e-06, + "loss": 0.8724, + "step": 1852 + }, + { + "epoch": 2.3786906290115533, + "grad_norm": 3.760920286178589, + "learning_rate": 7.12836970474968e-06, + "loss": 0.811, + "step": 1853 + }, + { + "epoch": 2.3799743260590502, + "grad_norm": 2.261559009552002, + "learning_rate": 7.13222079589217e-06, + "loss": 0.8025, + "step": 1854 + }, + { + "epoch": 2.3812580231065468, + "grad_norm": 1.9274379014968872, + "learning_rate": 7.13607188703466e-06, + "loss": 0.8136, + "step": 1855 + }, + { + "epoch": 2.3825417201540438, + 
"grad_norm": 4.83708381652832, + "learning_rate": 7.13992297817715e-06, + "loss": 0.8595, + "step": 1856 + }, + { + "epoch": 2.3838254172015403, + "grad_norm": 3.9294850826263428, + "learning_rate": 7.143774069319641e-06, + "loss": 0.9834, + "step": 1857 + }, + { + "epoch": 2.3851091142490373, + "grad_norm": 6.133410930633545, + "learning_rate": 7.147625160462131e-06, + "loss": 1.0288, + "step": 1858 + }, + { + "epoch": 2.386392811296534, + "grad_norm": 2.0339181423187256, + "learning_rate": 7.151476251604622e-06, + "loss": 0.705, + "step": 1859 + }, + { + "epoch": 2.387676508344031, + "grad_norm": 1.7154656648635864, + "learning_rate": 7.155327342747112e-06, + "loss": 0.7056, + "step": 1860 + }, + { + "epoch": 2.3889602053915278, + "grad_norm": 1.9781588315963745, + "learning_rate": 7.159178433889602e-06, + "loss": 0.7202, + "step": 1861 + }, + { + "epoch": 2.3902439024390243, + "grad_norm": 2.2620060443878174, + "learning_rate": 7.1630295250320925e-06, + "loss": 0.6896, + "step": 1862 + }, + { + "epoch": 2.3915275994865213, + "grad_norm": 1.6026866436004639, + "learning_rate": 7.166880616174583e-06, + "loss": 0.7497, + "step": 1863 + }, + { + "epoch": 2.392811296534018, + "grad_norm": 1.3941253423690796, + "learning_rate": 7.170731707317073e-06, + "loss": 0.6947, + "step": 1864 + }, + { + "epoch": 2.394094993581515, + "grad_norm": 1.3430030345916748, + "learning_rate": 7.174582798459564e-06, + "loss": 0.6868, + "step": 1865 + }, + { + "epoch": 2.3953786906290118, + "grad_norm": 2.1172292232513428, + "learning_rate": 7.178433889602053e-06, + "loss": 0.678, + "step": 1866 + }, + { + "epoch": 2.3966623876765083, + "grad_norm": 1.8160053491592407, + "learning_rate": 7.1822849807445445e-06, + "loss": 0.7582, + "step": 1867 + }, + { + "epoch": 2.3979460847240053, + "grad_norm": 1.6807740926742554, + "learning_rate": 7.186136071887035e-06, + "loss": 0.6657, + "step": 1868 + }, + { + "epoch": 2.399229781771502, + "grad_norm": 1.4259889125823975, + "learning_rate": 
7.189987163029525e-06, + "loss": 0.698, + "step": 1869 + }, + { + "epoch": 2.400513478818999, + "grad_norm": 2.2750799655914307, + "learning_rate": 7.193838254172015e-06, + "loss": 0.737, + "step": 1870 + }, + { + "epoch": 2.4017971758664953, + "grad_norm": 1.6814138889312744, + "learning_rate": 7.197689345314506e-06, + "loss": 0.7093, + "step": 1871 + }, + { + "epoch": 2.4030808729139923, + "grad_norm": 1.9263029098510742, + "learning_rate": 7.2015404364569964e-06, + "loss": 0.732, + "step": 1872 + }, + { + "epoch": 2.404364569961489, + "grad_norm": 1.4952023029327393, + "learning_rate": 7.205391527599487e-06, + "loss": 0.6941, + "step": 1873 + }, + { + "epoch": 2.405648267008986, + "grad_norm": 2.0284438133239746, + "learning_rate": 7.209242618741978e-06, + "loss": 0.7277, + "step": 1874 + }, + { + "epoch": 2.406931964056483, + "grad_norm": 1.9570928812026978, + "learning_rate": 7.213093709884467e-06, + "loss": 0.7106, + "step": 1875 + }, + { + "epoch": 2.4082156611039793, + "grad_norm": 1.6523432731628418, + "learning_rate": 7.216944801026958e-06, + "loss": 0.7059, + "step": 1876 + }, + { + "epoch": 2.4094993581514763, + "grad_norm": 1.9205541610717773, + "learning_rate": 7.220795892169448e-06, + "loss": 0.6997, + "step": 1877 + }, + { + "epoch": 2.410783055198973, + "grad_norm": 4.426148414611816, + "learning_rate": 7.224646983311939e-06, + "loss": 0.687, + "step": 1878 + }, + { + "epoch": 2.41206675224647, + "grad_norm": 2.9146974086761475, + "learning_rate": 7.228498074454429e-06, + "loss": 0.7347, + "step": 1879 + }, + { + "epoch": 2.413350449293967, + "grad_norm": 2.567379951477051, + "learning_rate": 7.23234916559692e-06, + "loss": 0.7607, + "step": 1880 + }, + { + "epoch": 2.4146341463414633, + "grad_norm": 1.7294012308120728, + "learning_rate": 7.236200256739409e-06, + "loss": 0.6994, + "step": 1881 + }, + { + "epoch": 2.4159178433889603, + "grad_norm": 4.949089527130127, + "learning_rate": 7.2400513478819e-06, + "loss": 0.7611, + "step": 1882 + }, + { + 
"epoch": 2.417201540436457, + "grad_norm": 2.989018201828003, + "learning_rate": 7.243902439024391e-06, + "loss": 0.733, + "step": 1883 + }, + { + "epoch": 2.418485237483954, + "grad_norm": 4.773780822753906, + "learning_rate": 7.247753530166881e-06, + "loss": 0.7721, + "step": 1884 + }, + { + "epoch": 2.419768934531451, + "grad_norm": 1.9948393106460571, + "learning_rate": 7.251604621309371e-06, + "loss": 0.6965, + "step": 1885 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 2.5019404888153076, + "learning_rate": 7.255455712451861e-06, + "loss": 0.7256, + "step": 1886 + }, + { + "epoch": 2.4223363286264443, + "grad_norm": 1.9773975610733032, + "learning_rate": 7.2593068035943515e-06, + "loss": 0.7634, + "step": 1887 + }, + { + "epoch": 2.423620025673941, + "grad_norm": 3.217463254928589, + "learning_rate": 7.2631578947368426e-06, + "loss": 0.7365, + "step": 1888 + }, + { + "epoch": 2.424903722721438, + "grad_norm": 2.108593463897705, + "learning_rate": 7.267008985879332e-06, + "loss": 0.7534, + "step": 1889 + }, + { + "epoch": 2.4261874197689344, + "grad_norm": 2.615530014038086, + "learning_rate": 7.270860077021823e-06, + "loss": 0.7252, + "step": 1890 + }, + { + "epoch": 2.4274711168164314, + "grad_norm": 2.096041679382324, + "learning_rate": 7.274711168164313e-06, + "loss": 0.7911, + "step": 1891 + }, + { + "epoch": 2.428754813863928, + "grad_norm": 1.9700123071670532, + "learning_rate": 7.2785622593068035e-06, + "loss": 0.8391, + "step": 1892 + }, + { + "epoch": 2.430038510911425, + "grad_norm": 2.6953890323638916, + "learning_rate": 7.282413350449294e-06, + "loss": 0.8184, + "step": 1893 + }, + { + "epoch": 2.431322207958922, + "grad_norm": 2.7659621238708496, + "learning_rate": 7.286264441591785e-06, + "loss": 0.7772, + "step": 1894 + }, + { + "epoch": 2.4326059050064184, + "grad_norm": 2.3411028385162354, + "learning_rate": 7.290115532734275e-06, + "loss": 0.7862, + "step": 1895 + }, + { + "epoch": 2.4338896020539154, + "grad_norm": 
2.033334255218506, + "learning_rate": 7.293966623876765e-06, + "loss": 0.7781, + "step": 1896 + }, + { + "epoch": 2.435173299101412, + "grad_norm": 2.0999388694763184, + "learning_rate": 7.297817715019256e-06, + "loss": 0.7736, + "step": 1897 + }, + { + "epoch": 2.436456996148909, + "grad_norm": 1.9475162029266357, + "learning_rate": 7.301668806161746e-06, + "loss": 0.7463, + "step": 1898 + }, + { + "epoch": 2.437740693196406, + "grad_norm": 2.3356435298919678, + "learning_rate": 7.305519897304237e-06, + "loss": 0.7564, + "step": 1899 + }, + { + "epoch": 2.4390243902439024, + "grad_norm": 2.1365532875061035, + "learning_rate": 7.309370988446727e-06, + "loss": 0.8201, + "step": 1900 + }, + { + "epoch": 2.4403080872913994, + "grad_norm": 3.2961173057556152, + "learning_rate": 7.313222079589217e-06, + "loss": 0.8046, + "step": 1901 + }, + { + "epoch": 2.441591784338896, + "grad_norm": 4.210921764373779, + "learning_rate": 7.317073170731707e-06, + "loss": 0.862, + "step": 1902 + }, + { + "epoch": 2.442875481386393, + "grad_norm": 3.01894211769104, + "learning_rate": 7.3209242618741984e-06, + "loss": 0.8368, + "step": 1903 + }, + { + "epoch": 2.4441591784338894, + "grad_norm": 2.3942835330963135, + "learning_rate": 7.324775353016688e-06, + "loss": 0.7836, + "step": 1904 + }, + { + "epoch": 2.4454428754813864, + "grad_norm": 2.1849570274353027, + "learning_rate": 7.328626444159179e-06, + "loss": 0.8703, + "step": 1905 + }, + { + "epoch": 2.4467265725288834, + "grad_norm": 2.7399399280548096, + "learning_rate": 7.332477535301669e-06, + "loss": 0.9, + "step": 1906 + }, + { + "epoch": 2.44801026957638, + "grad_norm": 3.760615348815918, + "learning_rate": 7.336328626444159e-06, + "loss": 1.058, + "step": 1907 + }, + { + "epoch": 2.449293966623877, + "grad_norm": 3.982959508895874, + "learning_rate": 7.3401797175866496e-06, + "loss": 1.1137, + "step": 1908 + }, + { + "epoch": 2.4505776636713734, + "grad_norm": 1.7186707258224487, + "learning_rate": 7.344030808729141e-06, + 
"loss": 0.7522, + "step": 1909 + }, + { + "epoch": 2.4518613607188704, + "grad_norm": 2.305250883102417, + "learning_rate": 7.34788189987163e-06, + "loss": 0.722, + "step": 1910 + }, + { + "epoch": 2.453145057766367, + "grad_norm": 1.8333852291107178, + "learning_rate": 7.351732991014121e-06, + "loss": 0.7419, + "step": 1911 + }, + { + "epoch": 2.454428754813864, + "grad_norm": 2.0835366249084473, + "learning_rate": 7.3555840821566105e-06, + "loss": 0.7137, + "step": 1912 + }, + { + "epoch": 2.455712451861361, + "grad_norm": 2.081230640411377, + "learning_rate": 7.3594351732991015e-06, + "loss": 0.6954, + "step": 1913 + }, + { + "epoch": 2.4569961489088574, + "grad_norm": 2.7168636322021484, + "learning_rate": 7.363286264441592e-06, + "loss": 0.7054, + "step": 1914 + }, + { + "epoch": 2.4582798459563544, + "grad_norm": 1.6006404161453247, + "learning_rate": 7.367137355584082e-06, + "loss": 0.7336, + "step": 1915 + }, + { + "epoch": 2.459563543003851, + "grad_norm": 1.6543618440628052, + "learning_rate": 7.370988446726573e-06, + "loss": 0.7284, + "step": 1916 + }, + { + "epoch": 2.460847240051348, + "grad_norm": 1.72433602809906, + "learning_rate": 7.374839537869063e-06, + "loss": 0.7028, + "step": 1917 + }, + { + "epoch": 2.462130937098845, + "grad_norm": 1.7411807775497437, + "learning_rate": 7.3786906290115535e-06, + "loss": 0.746, + "step": 1918 + }, + { + "epoch": 2.4634146341463414, + "grad_norm": 3.6324210166931152, + "learning_rate": 7.382541720154044e-06, + "loss": 0.7114, + "step": 1919 + }, + { + "epoch": 2.4646983311938384, + "grad_norm": 1.8743780851364136, + "learning_rate": 7.386392811296535e-06, + "loss": 0.6652, + "step": 1920 + }, + { + "epoch": 2.465982028241335, + "grad_norm": 1.8245840072631836, + "learning_rate": 7.390243902439024e-06, + "loss": 0.6812, + "step": 1921 + }, + { + "epoch": 2.467265725288832, + "grad_norm": 1.6993836164474487, + "learning_rate": 7.394094993581515e-06, + "loss": 0.6414, + "step": 1922 + }, + { + "epoch": 
2.4685494223363285, + "grad_norm": 2.1495940685272217, + "learning_rate": 7.3979460847240055e-06, + "loss": 0.7069, + "step": 1923 + }, + { + "epoch": 2.4698331193838254, + "grad_norm": 2.1574909687042236, + "learning_rate": 7.401797175866496e-06, + "loss": 0.7121, + "step": 1924 + }, + { + "epoch": 2.471116816431322, + "grad_norm": 1.9475167989730835, + "learning_rate": 7.405648267008986e-06, + "loss": 0.7152, + "step": 1925 + }, + { + "epoch": 2.472400513478819, + "grad_norm": 3.5151658058166504, + "learning_rate": 7.409499358151477e-06, + "loss": 0.698, + "step": 1926 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 2.224065065383911, + "learning_rate": 7.413350449293966e-06, + "loss": 0.7879, + "step": 1927 + }, + { + "epoch": 2.4749679075738125, + "grad_norm": 2.181615114212036, + "learning_rate": 7.417201540436457e-06, + "loss": 0.7212, + "step": 1928 + }, + { + "epoch": 2.4762516046213094, + "grad_norm": 1.886335849761963, + "learning_rate": 7.421052631578948e-06, + "loss": 0.7129, + "step": 1929 + }, + { + "epoch": 2.477535301668806, + "grad_norm": 4.37954568862915, + "learning_rate": 7.424903722721438e-06, + "loss": 0.7528, + "step": 1930 + }, + { + "epoch": 2.478818998716303, + "grad_norm": 2.1341216564178467, + "learning_rate": 7.428754813863928e-06, + "loss": 0.7401, + "step": 1931 + }, + { + "epoch": 2.4801026957638, + "grad_norm": 2.1121106147766113, + "learning_rate": 7.432605905006419e-06, + "loss": 0.7111, + "step": 1932 + }, + { + "epoch": 2.4813863928112965, + "grad_norm": 2.5326457023620605, + "learning_rate": 7.4364569961489085e-06, + "loss": 0.7624, + "step": 1933 + }, + { + "epoch": 2.4826700898587934, + "grad_norm": 1.992046594619751, + "learning_rate": 7.4403080872914e-06, + "loss": 0.7074, + "step": 1934 + }, + { + "epoch": 2.48395378690629, + "grad_norm": 2.220059394836426, + "learning_rate": 7.44415917843389e-06, + "loss": 0.693, + "step": 1935 + }, + { + "epoch": 2.485237483953787, + "grad_norm": 1.8103829622268677, + 
"learning_rate": 7.44801026957638e-06, + "loss": 0.7425, + "step": 1936 + }, + { + "epoch": 2.486521181001284, + "grad_norm": 3.4204020500183105, + "learning_rate": 7.45186136071887e-06, + "loss": 0.7636, + "step": 1937 + }, + { + "epoch": 2.4878048780487805, + "grad_norm": 4.032724380493164, + "learning_rate": 7.4557124518613605e-06, + "loss": 0.8546, + "step": 1938 + }, + { + "epoch": 2.4890885750962775, + "grad_norm": 3.213239908218384, + "learning_rate": 7.4595635430038516e-06, + "loss": 0.7267, + "step": 1939 + }, + { + "epoch": 2.490372272143774, + "grad_norm": 2.501004695892334, + "learning_rate": 7.463414634146342e-06, + "loss": 0.7454, + "step": 1940 + }, + { + "epoch": 2.491655969191271, + "grad_norm": 2.262531280517578, + "learning_rate": 7.467265725288832e-06, + "loss": 0.7831, + "step": 1941 + }, + { + "epoch": 2.4929396662387675, + "grad_norm": 3.0158321857452393, + "learning_rate": 7.471116816431322e-06, + "loss": 0.784, + "step": 1942 + }, + { + "epoch": 2.4942233632862645, + "grad_norm": 2.657529830932617, + "learning_rate": 7.474967907573813e-06, + "loss": 0.7499, + "step": 1943 + }, + { + "epoch": 2.495507060333761, + "grad_norm": 3.530212163925171, + "learning_rate": 7.478818998716303e-06, + "loss": 0.7823, + "step": 1944 + }, + { + "epoch": 2.496790757381258, + "grad_norm": 1.9463351964950562, + "learning_rate": 7.482670089858794e-06, + "loss": 0.7976, + "step": 1945 + }, + { + "epoch": 2.498074454428755, + "grad_norm": 2.9653244018554688, + "learning_rate": 7.486521181001284e-06, + "loss": 0.7711, + "step": 1946 + }, + { + "epoch": 2.4993581514762515, + "grad_norm": 2.359682559967041, + "learning_rate": 7.490372272143774e-06, + "loss": 0.765, + "step": 1947 + }, + { + "epoch": 2.5006418485237485, + "grad_norm": 2.204012393951416, + "learning_rate": 7.4942233632862644e-06, + "loss": 0.7831, + "step": 1948 + }, + { + "epoch": 2.501925545571245, + "grad_norm": 2.5900774002075195, + "learning_rate": 7.4980744544287555e-06, + "loss": 0.801, + 
"step": 1949 + }, + { + "epoch": 2.503209242618742, + "grad_norm": 11.998871803283691, + "learning_rate": 7.501925545571246e-06, + "loss": 0.7296, + "step": 1950 + }, + { + "epoch": 2.504492939666239, + "grad_norm": 3.9064342975616455, + "learning_rate": 7.505776636713736e-06, + "loss": 0.7926, + "step": 1951 + }, + { + "epoch": 2.5057766367137355, + "grad_norm": 4.258646011352539, + "learning_rate": 7.509627727856226e-06, + "loss": 0.8854, + "step": 1952 + }, + { + "epoch": 2.5070603337612325, + "grad_norm": 2.6175737380981445, + "learning_rate": 7.5134788189987155e-06, + "loss": 0.8479, + "step": 1953 + }, + { + "epoch": 2.508344030808729, + "grad_norm": 7.4966864585876465, + "learning_rate": 7.5173299101412075e-06, + "loss": 0.8078, + "step": 1954 + }, + { + "epoch": 2.509627727856226, + "grad_norm": 3.2520487308502197, + "learning_rate": 7.521181001283698e-06, + "loss": 0.8546, + "step": 1955 + }, + { + "epoch": 2.510911424903723, + "grad_norm": 4.323993682861328, + "learning_rate": 7.525032092426187e-06, + "loss": 0.8512, + "step": 1956 + }, + { + "epoch": 2.5121951219512195, + "grad_norm": 5.760152339935303, + "learning_rate": 7.528883183568677e-06, + "loss": 0.9068, + "step": 1957 + }, + { + "epoch": 2.513478818998716, + "grad_norm": 5.366430282592773, + "learning_rate": 7.532734274711169e-06, + "loss": 1.0974, + "step": 1958 + }, + { + "epoch": 2.514762516046213, + "grad_norm": 1.98116135597229, + "learning_rate": 7.536585365853659e-06, + "loss": 0.7263, + "step": 1959 + }, + { + "epoch": 2.51604621309371, + "grad_norm": 1.7964240312576294, + "learning_rate": 7.540436456996149e-06, + "loss": 0.7376, + "step": 1960 + }, + { + "epoch": 2.5173299101412066, + "grad_norm": 1.5737152099609375, + "learning_rate": 7.544287548138641e-06, + "loss": 0.718, + "step": 1961 + }, + { + "epoch": 2.5186136071887035, + "grad_norm": 1.3770389556884766, + "learning_rate": 7.54813863928113e-06, + "loss": 0.704, + "step": 1962 + }, + { + "epoch": 2.5198973042362, + "grad_norm": 
1.9693607091903687, + "learning_rate": 7.55198973042362e-06, + "loss": 0.6895, + "step": 1963 + }, + { + "epoch": 2.521181001283697, + "grad_norm": 2.749272584915161, + "learning_rate": 7.55584082156611e-06, + "loss": 0.6809, + "step": 1964 + }, + { + "epoch": 2.522464698331194, + "grad_norm": 1.6003674268722534, + "learning_rate": 7.559691912708602e-06, + "loss": 0.7174, + "step": 1965 + }, + { + "epoch": 2.5237483953786906, + "grad_norm": 1.4683071374893188, + "learning_rate": 7.563543003851092e-06, + "loss": 0.6818, + "step": 1966 + }, + { + "epoch": 2.5250320924261875, + "grad_norm": 1.9415161609649658, + "learning_rate": 7.567394094993581e-06, + "loss": 0.6717, + "step": 1967 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 3.1242849826812744, + "learning_rate": 7.5712451861360714e-06, + "loss": 0.6869, + "step": 1968 + }, + { + "epoch": 2.527599486521181, + "grad_norm": 2.0313668251037598, + "learning_rate": 7.575096277278563e-06, + "loss": 0.7155, + "step": 1969 + }, + { + "epoch": 2.528883183568678, + "grad_norm": 2.0598368644714355, + "learning_rate": 7.578947368421053e-06, + "loss": 0.7421, + "step": 1970 + }, + { + "epoch": 2.5301668806161746, + "grad_norm": 2.216919422149658, + "learning_rate": 7.582798459563543e-06, + "loss": 0.7225, + "step": 1971 + }, + { + "epoch": 2.531450577663671, + "grad_norm": 2.2301573753356934, + "learning_rate": 7.586649550706033e-06, + "loss": 0.72, + "step": 1972 + }, + { + "epoch": 2.532734274711168, + "grad_norm": 1.485937476158142, + "learning_rate": 7.590500641848524e-06, + "loss": 0.7201, + "step": 1973 + }, + { + "epoch": 2.534017971758665, + "grad_norm": 3.890249013900757, + "learning_rate": 7.5943517329910145e-06, + "loss": 0.7123, + "step": 1974 + }, + { + "epoch": 2.5353016688061616, + "grad_norm": 1.9547008275985718, + "learning_rate": 7.598202824133505e-06, + "loss": 0.7298, + "step": 1975 + }, + { + "epoch": 2.5365853658536586, + "grad_norm": 1.7426552772521973, + "learning_rate": 7.602053915275994e-06, + 
"loss": 0.7487, + "step": 1976 + }, + { + "epoch": 2.537869062901155, + "grad_norm": 1.5906040668487549, + "learning_rate": 7.605905006418486e-06, + "loss": 0.6897, + "step": 1977 + }, + { + "epoch": 2.539152759948652, + "grad_norm": 1.8577150106430054, + "learning_rate": 7.609756097560976e-06, + "loss": 0.7316, + "step": 1978 + }, + { + "epoch": 2.540436456996149, + "grad_norm": 2.4344217777252197, + "learning_rate": 7.613607188703466e-06, + "loss": 0.7007, + "step": 1979 + }, + { + "epoch": 2.5417201540436456, + "grad_norm": 3.6037774085998535, + "learning_rate": 7.617458279845956e-06, + "loss": 0.7122, + "step": 1980 + }, + { + "epoch": 2.5430038510911426, + "grad_norm": 2.680894613265991, + "learning_rate": 7.621309370988448e-06, + "loss": 0.7293, + "step": 1981 + }, + { + "epoch": 2.544287548138639, + "grad_norm": 1.8831545114517212, + "learning_rate": 7.625160462130937e-06, + "loss": 0.7351, + "step": 1982 + }, + { + "epoch": 2.545571245186136, + "grad_norm": 1.9315872192382812, + "learning_rate": 7.629011553273427e-06, + "loss": 0.7206, + "step": 1983 + }, + { + "epoch": 2.546854942233633, + "grad_norm": 3.3853394985198975, + "learning_rate": 7.63286264441592e-06, + "loss": 0.6913, + "step": 1984 + }, + { + "epoch": 2.5481386392811296, + "grad_norm": 4.272487640380859, + "learning_rate": 7.636713735558409e-06, + "loss": 0.7781, + "step": 1985 + }, + { + "epoch": 2.5494223363286266, + "grad_norm": 1.974852204322815, + "learning_rate": 7.640564826700898e-06, + "loss": 0.7096, + "step": 1986 + }, + { + "epoch": 2.550706033376123, + "grad_norm": 1.9306293725967407, + "learning_rate": 7.644415917843389e-06, + "loss": 0.7336, + "step": 1987 + }, + { + "epoch": 2.55198973042362, + "grad_norm": 2.2347288131713867, + "learning_rate": 7.64826700898588e-06, + "loss": 0.7057, + "step": 1988 + }, + { + "epoch": 2.553273427471117, + "grad_norm": 2.7814571857452393, + "learning_rate": 7.65211810012837e-06, + "loss": 0.7225, + "step": 1989 + }, + { + "epoch": 
2.5545571245186136, + "grad_norm": 2.2914586067199707, + "learning_rate": 7.65596919127086e-06, + "loss": 0.719, + "step": 1990 + }, + { + "epoch": 2.55584082156611, + "grad_norm": 2.0528714656829834, + "learning_rate": 7.65982028241335e-06, + "loss": 0.7153, + "step": 1991 + }, + { + "epoch": 2.557124518613607, + "grad_norm": 2.0788707733154297, + "learning_rate": 7.663671373555841e-06, + "loss": 0.7632, + "step": 1992 + }, + { + "epoch": 2.558408215661104, + "grad_norm": 2.302419900894165, + "learning_rate": 7.667522464698332e-06, + "loss": 0.7336, + "step": 1993 + }, + { + "epoch": 2.5596919127086006, + "grad_norm": 4.28538703918457, + "learning_rate": 7.671373555840821e-06, + "loss": 0.7774, + "step": 1994 + }, + { + "epoch": 2.5609756097560976, + "grad_norm": 2.624204158782959, + "learning_rate": 7.675224646983311e-06, + "loss": 0.7325, + "step": 1995 + }, + { + "epoch": 2.562259306803594, + "grad_norm": 3.534843921661377, + "learning_rate": 7.679075738125804e-06, + "loss": 0.7806, + "step": 1996 + }, + { + "epoch": 2.563543003851091, + "grad_norm": 2.8356964588165283, + "learning_rate": 7.682926829268293e-06, + "loss": 0.833, + "step": 1997 + }, + { + "epoch": 2.564826700898588, + "grad_norm": 2.2230989933013916, + "learning_rate": 7.686777920410782e-06, + "loss": 0.8205, + "step": 1998 + }, + { + "epoch": 2.5661103979460846, + "grad_norm": 3.969996452331543, + "learning_rate": 7.690629011553273e-06, + "loss": 0.8111, + "step": 1999 + }, + { + "epoch": 2.5673940949935816, + "grad_norm": 2.715332269668579, + "learning_rate": 7.694480102695765e-06, + "loss": 0.7957, + "step": 2000 + }, + { + "epoch": 2.5673940949935816, + "eval_cer": 0.3201488153514784, + "eval_loss": 0.6957088708877563, + "eval_runtime": 14.6326, + "eval_samples_per_second": 67.179, + "eval_steps_per_second": 0.478, + "eval_wer": 0.6096078880990599, + "step": 2000 + }, + { + "epoch": 2.568677792041078, + "grad_norm": 5.308093547821045, + "learning_rate": 7.698331193838254e-06, + "loss": 
0.8429, + "step": 2001 + }, + { + "epoch": 2.569961489088575, + "grad_norm": 2.0056324005126953, + "learning_rate": 7.702182284980745e-06, + "loss": 0.7633, + "step": 2002 + }, + { + "epoch": 2.571245186136072, + "grad_norm": 3.516998052597046, + "learning_rate": 7.706033376123234e-06, + "loss": 0.7705, + "step": 2003 + }, + { + "epoch": 2.5725288831835686, + "grad_norm": 3.503478527069092, + "learning_rate": 7.709884467265725e-06, + "loss": 0.8182, + "step": 2004 + }, + { + "epoch": 2.5738125802310656, + "grad_norm": 2.830655813217163, + "learning_rate": 7.713735558408216e-06, + "loss": 0.8722, + "step": 2005 + }, + { + "epoch": 2.575096277278562, + "grad_norm": 4.934455394744873, + "learning_rate": 7.717586649550706e-06, + "loss": 0.9202, + "step": 2006 + }, + { + "epoch": 2.576379974326059, + "grad_norm": 4.88229513168335, + "learning_rate": 7.721437740693197e-06, + "loss": 0.9115, + "step": 2007 + }, + { + "epoch": 2.5776636713735557, + "grad_norm": 8.74393367767334, + "learning_rate": 7.725288831835688e-06, + "loss": 1.0944, + "step": 2008 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 2.3037712574005127, + "learning_rate": 7.729139922978177e-06, + "loss": 0.7245, + "step": 2009 + }, + { + "epoch": 2.580231065468549, + "grad_norm": 3.0056824684143066, + "learning_rate": 7.732991014120667e-06, + "loss": 0.688, + "step": 2010 + }, + { + "epoch": 2.581514762516046, + "grad_norm": 2.066807985305786, + "learning_rate": 7.73684210526316e-06, + "loss": 0.6739, + "step": 2011 + }, + { + "epoch": 2.582798459563543, + "grad_norm": 1.8481614589691162, + "learning_rate": 7.740693196405649e-06, + "loss": 0.7207, + "step": 2012 + }, + { + "epoch": 2.5840821566110397, + "grad_norm": 2.012495994567871, + "learning_rate": 7.744544287548138e-06, + "loss": 0.7147, + "step": 2013 + }, + { + "epoch": 2.5853658536585367, + "grad_norm": 1.9220106601715088, + "learning_rate": 7.74839537869063e-06, + "loss": 0.7071, + "step": 2014 + }, + { + "epoch": 2.586649550706033, + 
"grad_norm": 2.3769333362579346, + "learning_rate": 7.75224646983312e-06, + "loss": 0.7116, + "step": 2015 + }, + { + "epoch": 2.58793324775353, + "grad_norm": 3.2910382747650146, + "learning_rate": 7.75609756097561e-06, + "loss": 0.6952, + "step": 2016 + }, + { + "epoch": 2.589216944801027, + "grad_norm": 2.0557937622070312, + "learning_rate": 7.759948652118101e-06, + "loss": 0.6616, + "step": 2017 + }, + { + "epoch": 2.5905006418485237, + "grad_norm": 2.403759479522705, + "learning_rate": 7.76379974326059e-06, + "loss": 0.6973, + "step": 2018 + }, + { + "epoch": 2.5917843388960207, + "grad_norm": 2.348902940750122, + "learning_rate": 7.767650834403081e-06, + "loss": 0.6894, + "step": 2019 + }, + { + "epoch": 2.593068035943517, + "grad_norm": 2.2182934284210205, + "learning_rate": 7.771501925545572e-06, + "loss": 0.7289, + "step": 2020 + }, + { + "epoch": 2.594351732991014, + "grad_norm": 1.998344898223877, + "learning_rate": 7.775353016688062e-06, + "loss": 0.7261, + "step": 2021 + }, + { + "epoch": 2.595635430038511, + "grad_norm": 2.153170347213745, + "learning_rate": 7.779204107830551e-06, + "loss": 0.7221, + "step": 2022 + }, + { + "epoch": 2.5969191270860077, + "grad_norm": 1.8082236051559448, + "learning_rate": 7.783055198973044e-06, + "loss": 0.7027, + "step": 2023 + }, + { + "epoch": 2.5982028241335042, + "grad_norm": 1.948409080505371, + "learning_rate": 7.786906290115533e-06, + "loss": 0.6832, + "step": 2024 + }, + { + "epoch": 2.599486521181001, + "grad_norm": 1.8560880422592163, + "learning_rate": 7.790757381258023e-06, + "loss": 0.6877, + "step": 2025 + }, + { + "epoch": 2.600770218228498, + "grad_norm": 1.6946041584014893, + "learning_rate": 7.794608472400514e-06, + "loss": 0.6789, + "step": 2026 + }, + { + "epoch": 2.6020539152759947, + "grad_norm": 1.878175973892212, + "learning_rate": 7.798459563543005e-06, + "loss": 0.7338, + "step": 2027 + }, + { + "epoch": 2.6033376123234917, + "grad_norm": 6.351782321929932, + "learning_rate": 
7.802310654685494e-06, + "loss": 0.7497, + "step": 2028 + }, + { + "epoch": 2.6046213093709882, + "grad_norm": 2.267516851425171, + "learning_rate": 7.806161745827984e-06, + "loss": 0.7401, + "step": 2029 + }, + { + "epoch": 2.605905006418485, + "grad_norm": 2.4305388927459717, + "learning_rate": 7.810012836970476e-06, + "loss": 0.7402, + "step": 2030 + }, + { + "epoch": 2.607188703465982, + "grad_norm": 2.7717108726501465, + "learning_rate": 7.813863928112966e-06, + "loss": 0.666, + "step": 2031 + }, + { + "epoch": 2.6084724005134787, + "grad_norm": 5.0639519691467285, + "learning_rate": 7.817715019255455e-06, + "loss": 0.6935, + "step": 2032 + }, + { + "epoch": 2.6097560975609757, + "grad_norm": 2.6423511505126953, + "learning_rate": 7.821566110397946e-06, + "loss": 0.7094, + "step": 2033 + }, + { + "epoch": 2.6110397946084722, + "grad_norm": 2.5543696880340576, + "learning_rate": 7.825417201540437e-06, + "loss": 0.7486, + "step": 2034 + }, + { + "epoch": 2.612323491655969, + "grad_norm": 2.4507572650909424, + "learning_rate": 7.829268292682927e-06, + "loss": 0.6916, + "step": 2035 + }, + { + "epoch": 2.613607188703466, + "grad_norm": 2.713169813156128, + "learning_rate": 7.833119383825418e-06, + "loss": 0.7424, + "step": 2036 + }, + { + "epoch": 2.6148908857509627, + "grad_norm": 2.476253032684326, + "learning_rate": 7.836970474967907e-06, + "loss": 0.7567, + "step": 2037 + }, + { + "epoch": 2.6161745827984597, + "grad_norm": 1.8716075420379639, + "learning_rate": 7.840821566110398e-06, + "loss": 0.7248, + "step": 2038 + }, + { + "epoch": 2.6174582798459562, + "grad_norm": 2.1893043518066406, + "learning_rate": 7.84467265725289e-06, + "loss": 0.7039, + "step": 2039 + }, + { + "epoch": 2.6187419768934532, + "grad_norm": 2.7815959453582764, + "learning_rate": 7.848523748395379e-06, + "loss": 0.7442, + "step": 2040 + }, + { + "epoch": 2.62002567394095, + "grad_norm": 3.218972682952881, + "learning_rate": 7.852374839537868e-06, + "loss": 0.727, + "step": 2041 + }, + 
{ + "epoch": 2.6213093709884467, + "grad_norm": 2.4592370986938477, + "learning_rate": 7.85622593068036e-06, + "loss": 0.6956, + "step": 2042 + }, + { + "epoch": 2.6225930680359433, + "grad_norm": 1.918702483177185, + "learning_rate": 7.86007702182285e-06, + "loss": 0.7666, + "step": 2043 + }, + { + "epoch": 2.6238767650834403, + "grad_norm": 2.8523895740509033, + "learning_rate": 7.86392811296534e-06, + "loss": 0.759, + "step": 2044 + }, + { + "epoch": 2.6251604621309372, + "grad_norm": 3.804471015930176, + "learning_rate": 7.86777920410783e-06, + "loss": 0.737, + "step": 2045 + }, + { + "epoch": 2.6264441591784338, + "grad_norm": 2.881148099899292, + "learning_rate": 7.871630295250322e-06, + "loss": 0.7631, + "step": 2046 + }, + { + "epoch": 2.6277278562259307, + "grad_norm": 4.3504767417907715, + "learning_rate": 7.875481386392811e-06, + "loss": 0.7941, + "step": 2047 + }, + { + "epoch": 2.6290115532734273, + "grad_norm": 3.2181901931762695, + "learning_rate": 7.879332477535302e-06, + "loss": 0.7953, + "step": 2048 + }, + { + "epoch": 2.6302952503209243, + "grad_norm": 2.5805485248565674, + "learning_rate": 7.883183568677791e-06, + "loss": 0.7952, + "step": 2049 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 3.130258560180664, + "learning_rate": 7.887034659820282e-06, + "loss": 0.8451, + "step": 2050 + }, + { + "epoch": 2.6328626444159178, + "grad_norm": 3.271657705307007, + "learning_rate": 7.890885750962774e-06, + "loss": 0.7614, + "step": 2051 + }, + { + "epoch": 2.6341463414634148, + "grad_norm": 3.2390191555023193, + "learning_rate": 7.894736842105263e-06, + "loss": 0.7996, + "step": 2052 + }, + { + "epoch": 2.6354300385109113, + "grad_norm": 4.438181400299072, + "learning_rate": 7.898587933247754e-06, + "loss": 0.7758, + "step": 2053 + }, + { + "epoch": 2.6367137355584083, + "grad_norm": 2.5000100135803223, + "learning_rate": 7.902439024390245e-06, + "loss": 0.8283, + "step": 2054 + }, + { + "epoch": 2.6379974326059052, + "grad_norm": 
4.1029839515686035, + "learning_rate": 7.906290115532734e-06, + "loss": 0.8846, + "step": 2055 + }, + { + "epoch": 2.639281129653402, + "grad_norm": 3.065624952316284, + "learning_rate": 7.910141206675224e-06, + "loss": 0.9019, + "step": 2056 + }, + { + "epoch": 2.6405648267008983, + "grad_norm": 4.0107316970825195, + "learning_rate": 7.913992297817717e-06, + "loss": 0.9637, + "step": 2057 + }, + { + "epoch": 2.6418485237483953, + "grad_norm": 10.800861358642578, + "learning_rate": 7.917843388960206e-06, + "loss": 1.1928, + "step": 2058 + }, + { + "epoch": 2.6431322207958923, + "grad_norm": 2.7875585556030273, + "learning_rate": 7.921694480102695e-06, + "loss": 0.7275, + "step": 2059 + }, + { + "epoch": 2.644415917843389, + "grad_norm": 1.876816749572754, + "learning_rate": 7.925545571245186e-06, + "loss": 0.6818, + "step": 2060 + }, + { + "epoch": 2.645699614890886, + "grad_norm": 2.103193521499634, + "learning_rate": 7.929396662387677e-06, + "loss": 0.6653, + "step": 2061 + }, + { + "epoch": 2.6469833119383823, + "grad_norm": 1.5445969104766846, + "learning_rate": 7.933247753530167e-06, + "loss": 0.7101, + "step": 2062 + }, + { + "epoch": 2.6482670089858793, + "grad_norm": 1.8879718780517578, + "learning_rate": 7.937098844672658e-06, + "loss": 0.6993, + "step": 2063 + }, + { + "epoch": 2.6495507060333763, + "grad_norm": 1.9753280878067017, + "learning_rate": 7.940949935815147e-06, + "loss": 0.6477, + "step": 2064 + }, + { + "epoch": 2.650834403080873, + "grad_norm": 1.6821177005767822, + "learning_rate": 7.944801026957638e-06, + "loss": 0.6598, + "step": 2065 + }, + { + "epoch": 2.65211810012837, + "grad_norm": 1.8339284658432007, + "learning_rate": 7.94865211810013e-06, + "loss": 0.695, + "step": 2066 + }, + { + "epoch": 2.6534017971758663, + "grad_norm": 2.047773838043213, + "learning_rate": 7.952503209242619e-06, + "loss": 0.6818, + "step": 2067 + }, + { + "epoch": 2.6546854942233633, + "grad_norm": 3.418534994125366, + "learning_rate": 7.956354300385108e-06, 
+ "loss": 0.7119, + "step": 2068 + }, + { + "epoch": 2.6559691912708603, + "grad_norm": 2.612258195877075, + "learning_rate": 7.960205391527601e-06, + "loss": 0.6566, + "step": 2069 + }, + { + "epoch": 2.657252888318357, + "grad_norm": 2.262711763381958, + "learning_rate": 7.96405648267009e-06, + "loss": 0.6573, + "step": 2070 + }, + { + "epoch": 2.658536585365854, + "grad_norm": 1.944033145904541, + "learning_rate": 7.96790757381258e-06, + "loss": 0.6966, + "step": 2071 + }, + { + "epoch": 2.6598202824133503, + "grad_norm": 1.7932415008544922, + "learning_rate": 7.97175866495507e-06, + "loss": 0.6755, + "step": 2072 + }, + { + "epoch": 2.6611039794608473, + "grad_norm": 3.426957368850708, + "learning_rate": 7.975609756097562e-06, + "loss": 0.6619, + "step": 2073 + }, + { + "epoch": 2.6623876765083443, + "grad_norm": 3.230947494506836, + "learning_rate": 7.979460847240051e-06, + "loss": 0.7013, + "step": 2074 + }, + { + "epoch": 2.663671373555841, + "grad_norm": 2.340233325958252, + "learning_rate": 7.983311938382542e-06, + "loss": 0.6741, + "step": 2075 + }, + { + "epoch": 2.6649550706033374, + "grad_norm": 2.2414891719818115, + "learning_rate": 7.987163029525033e-06, + "loss": 0.6883, + "step": 2076 + }, + { + "epoch": 2.6662387676508343, + "grad_norm": 2.2187957763671875, + "learning_rate": 7.991014120667523e-06, + "loss": 0.6852, + "step": 2077 + }, + { + "epoch": 2.6675224646983313, + "grad_norm": 3.0733206272125244, + "learning_rate": 7.994865211810012e-06, + "loss": 0.728, + "step": 2078 + }, + { + "epoch": 2.668806161745828, + "grad_norm": 2.078970193862915, + "learning_rate": 7.998716302952503e-06, + "loss": 0.7163, + "step": 2079 + }, + { + "epoch": 2.670089858793325, + "grad_norm": 1.8431140184402466, + "learning_rate": 8.002567394094994e-06, + "loss": 0.6848, + "step": 2080 + }, + { + "epoch": 2.6713735558408214, + "grad_norm": 2.4920871257781982, + "learning_rate": 8.006418485237484e-06, + "loss": 0.7413, + "step": 2081 + }, + { + "epoch": 
2.6726572528883183, + "grad_norm": 2.878321409225464, + "learning_rate": 8.010269576379975e-06, + "loss": 0.7012, + "step": 2082 + }, + { + "epoch": 2.6739409499358153, + "grad_norm": 4.225757122039795, + "learning_rate": 8.014120667522464e-06, + "loss": 0.7466, + "step": 2083 + }, + { + "epoch": 2.675224646983312, + "grad_norm": 3.068026542663574, + "learning_rate": 8.017971758664955e-06, + "loss": 0.7244, + "step": 2084 + }, + { + "epoch": 2.676508344030809, + "grad_norm": 2.0471417903900146, + "learning_rate": 8.021822849807446e-06, + "loss": 0.6737, + "step": 2085 + }, + { + "epoch": 2.6777920410783054, + "grad_norm": 2.5882177352905273, + "learning_rate": 8.025673940949936e-06, + "loss": 0.7101, + "step": 2086 + }, + { + "epoch": 2.6790757381258024, + "grad_norm": 2.920259475708008, + "learning_rate": 8.029525032092425e-06, + "loss": 0.7128, + "step": 2087 + }, + { + "epoch": 2.6803594351732993, + "grad_norm": 2.545740842819214, + "learning_rate": 8.033376123234918e-06, + "loss": 0.7189, + "step": 2088 + }, + { + "epoch": 2.681643132220796, + "grad_norm": 2.480940103530884, + "learning_rate": 8.037227214377407e-06, + "loss": 0.7102, + "step": 2089 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 3.395078182220459, + "learning_rate": 8.041078305519896e-06, + "loss": 0.7694, + "step": 2090 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 2.669445037841797, + "learning_rate": 8.044929396662388e-06, + "loss": 0.7538, + "step": 2091 + }, + { + "epoch": 2.6854942233632864, + "grad_norm": 2.7372710704803467, + "learning_rate": 8.048780487804879e-06, + "loss": 0.7643, + "step": 2092 + }, + { + "epoch": 2.686777920410783, + "grad_norm": 3.57773494720459, + "learning_rate": 8.052631578947368e-06, + "loss": 0.6953, + "step": 2093 + }, + { + "epoch": 2.68806161745828, + "grad_norm": 2.1781322956085205, + "learning_rate": 8.056482670089859e-06, + "loss": 0.7205, + "step": 2094 + }, + { + "epoch": 2.6893453145057764, + "grad_norm": 2.792222261428833, + 
"learning_rate": 8.060333761232348e-06, + "loss": 0.7024, + "step": 2095 + }, + { + "epoch": 2.6906290115532734, + "grad_norm": 4.950434684753418, + "learning_rate": 8.06418485237484e-06, + "loss": 0.7592, + "step": 2096 + }, + { + "epoch": 2.6919127086007704, + "grad_norm": 3.0283758640289307, + "learning_rate": 8.06803594351733e-06, + "loss": 0.7797, + "step": 2097 + }, + { + "epoch": 2.693196405648267, + "grad_norm": 3.073754072189331, + "learning_rate": 8.07188703465982e-06, + "loss": 0.7654, + "step": 2098 + }, + { + "epoch": 2.694480102695764, + "grad_norm": 3.3720569610595703, + "learning_rate": 8.075738125802311e-06, + "loss": 0.7247, + "step": 2099 + }, + { + "epoch": 2.6957637997432604, + "grad_norm": 3.835718870162964, + "learning_rate": 8.079589216944802e-06, + "loss": 0.7835, + "step": 2100 + }, + { + "epoch": 2.6970474967907574, + "grad_norm": 4.170992374420166, + "learning_rate": 8.083440308087291e-06, + "loss": 0.7569, + "step": 2101 + }, + { + "epoch": 2.6983311938382544, + "grad_norm": 5.629909992218018, + "learning_rate": 8.087291399229781e-06, + "loss": 0.8512, + "step": 2102 + }, + { + "epoch": 2.699614890885751, + "grad_norm": 2.7098913192749023, + "learning_rate": 8.091142490372274e-06, + "loss": 0.8249, + "step": 2103 + }, + { + "epoch": 2.700898587933248, + "grad_norm": 3.132858991622925, + "learning_rate": 8.094993581514763e-06, + "loss": 0.8636, + "step": 2104 + }, + { + "epoch": 2.7021822849807444, + "grad_norm": 4.499757766723633, + "learning_rate": 8.098844672657252e-06, + "loss": 0.8262, + "step": 2105 + }, + { + "epoch": 2.7034659820282414, + "grad_norm": 4.087027072906494, + "learning_rate": 8.102695763799743e-06, + "loss": 0.7867, + "step": 2106 + }, + { + "epoch": 2.7047496790757384, + "grad_norm": 3.406259775161743, + "learning_rate": 8.106546854942235e-06, + "loss": 0.8559, + "step": 2107 + }, + { + "epoch": 2.706033376123235, + "grad_norm": 6.020706653594971, + "learning_rate": 8.110397946084724e-06, + "loss": 1.0404, + "step": 
2108 + }, + { + "epoch": 2.7073170731707314, + "grad_norm": 2.020845413208008, + "learning_rate": 8.114249037227215e-06, + "loss": 0.7137, + "step": 2109 + }, + { + "epoch": 2.7086007702182284, + "grad_norm": 2.591505527496338, + "learning_rate": 8.118100128369704e-06, + "loss": 0.6787, + "step": 2110 + }, + { + "epoch": 2.7098844672657254, + "grad_norm": 1.6995986700057983, + "learning_rate": 8.121951219512195e-06, + "loss": 0.7184, + "step": 2111 + }, + { + "epoch": 2.711168164313222, + "grad_norm": 2.237093687057495, + "learning_rate": 8.125802310654686e-06, + "loss": 0.7213, + "step": 2112 + }, + { + "epoch": 2.712451861360719, + "grad_norm": 2.1191422939300537, + "learning_rate": 8.129653401797176e-06, + "loss": 0.682, + "step": 2113 + }, + { + "epoch": 2.7137355584082155, + "grad_norm": 2.375401496887207, + "learning_rate": 8.133504492939665e-06, + "loss": 0.6398, + "step": 2114 + }, + { + "epoch": 2.7150192554557124, + "grad_norm": 1.9156849384307861, + "learning_rate": 8.137355584082158e-06, + "loss": 0.7072, + "step": 2115 + }, + { + "epoch": 2.7163029525032094, + "grad_norm": 1.5622645616531372, + "learning_rate": 8.141206675224647e-06, + "loss": 0.6982, + "step": 2116 + }, + { + "epoch": 2.717586649550706, + "grad_norm": 1.8527215719223022, + "learning_rate": 8.145057766367137e-06, + "loss": 0.7107, + "step": 2117 + }, + { + "epoch": 2.718870346598203, + "grad_norm": 2.8598368167877197, + "learning_rate": 8.148908857509628e-06, + "loss": 0.7097, + "step": 2118 + }, + { + "epoch": 2.7201540436456995, + "grad_norm": 1.8807742595672607, + "learning_rate": 8.152759948652119e-06, + "loss": 0.7064, + "step": 2119 + }, + { + "epoch": 2.7214377406931964, + "grad_norm": 1.801220178604126, + "learning_rate": 8.156611039794608e-06, + "loss": 0.7099, + "step": 2120 + }, + { + "epoch": 2.7227214377406934, + "grad_norm": 1.928882360458374, + "learning_rate": 8.1604621309371e-06, + "loss": 0.6965, + "step": 2121 + }, + { + "epoch": 2.72400513478819, + "grad_norm": 
2.2771427631378174, + "learning_rate": 8.16431322207959e-06, + "loss": 0.6884, + "step": 2122 + }, + { + "epoch": 2.725288831835687, + "grad_norm": 4.316100120544434, + "learning_rate": 8.16816431322208e-06, + "loss": 0.705, + "step": 2123 + }, + { + "epoch": 2.7265725288831835, + "grad_norm": 1.7974181175231934, + "learning_rate": 8.172015404364571e-06, + "loss": 0.6871, + "step": 2124 + }, + { + "epoch": 2.7278562259306804, + "grad_norm": 2.602572441101074, + "learning_rate": 8.17586649550706e-06, + "loss": 0.7078, + "step": 2125 + }, + { + "epoch": 2.7291399229781774, + "grad_norm": 1.7945259809494019, + "learning_rate": 8.179717586649551e-06, + "loss": 0.7297, + "step": 2126 + }, + { + "epoch": 2.730423620025674, + "grad_norm": 1.7496601343154907, + "learning_rate": 8.183568677792042e-06, + "loss": 0.6699, + "step": 2127 + }, + { + "epoch": 2.7317073170731705, + "grad_norm": 2.028233528137207, + "learning_rate": 8.187419768934532e-06, + "loss": 0.7303, + "step": 2128 + }, + { + "epoch": 2.7329910141206675, + "grad_norm": 2.2645654678344727, + "learning_rate": 8.191270860077021e-06, + "loss": 0.7488, + "step": 2129 + }, + { + "epoch": 2.7342747111681645, + "grad_norm": 2.1540451049804688, + "learning_rate": 8.195121951219512e-06, + "loss": 0.6739, + "step": 2130 + }, + { + "epoch": 2.735558408215661, + "grad_norm": 2.1010475158691406, + "learning_rate": 8.198973042362003e-06, + "loss": 0.7098, + "step": 2131 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 2.443317174911499, + "learning_rate": 8.202824133504493e-06, + "loss": 0.6531, + "step": 2132 + }, + { + "epoch": 2.7381258023106545, + "grad_norm": 2.301025867462158, + "learning_rate": 8.206675224646982e-06, + "loss": 0.669, + "step": 2133 + }, + { + "epoch": 2.7394094993581515, + "grad_norm": 3.028306484222412, + "learning_rate": 8.210526315789475e-06, + "loss": 0.7173, + "step": 2134 + }, + { + "epoch": 2.7406931964056485, + "grad_norm": 1.7959245443344116, + "learning_rate": 8.214377406931964e-06, + 
"loss": 0.7022, + "step": 2135 + }, + { + "epoch": 2.741976893453145, + "grad_norm": 2.611470937728882, + "learning_rate": 8.218228498074454e-06, + "loss": 0.6931, + "step": 2136 + }, + { + "epoch": 2.743260590500642, + "grad_norm": 5.3553547859191895, + "learning_rate": 8.222079589216945e-06, + "loss": 0.7564, + "step": 2137 + }, + { + "epoch": 2.7445442875481385, + "grad_norm": 2.0330188274383545, + "learning_rate": 8.225930680359436e-06, + "loss": 0.7263, + "step": 2138 + }, + { + "epoch": 2.7458279845956355, + "grad_norm": 1.9592126607894897, + "learning_rate": 8.229781771501925e-06, + "loss": 0.7274, + "step": 2139 + }, + { + "epoch": 2.7471116816431325, + "grad_norm": 2.9866883754730225, + "learning_rate": 8.233632862644416e-06, + "loss": 0.7265, + "step": 2140 + }, + { + "epoch": 2.748395378690629, + "grad_norm": 2.0948076248168945, + "learning_rate": 8.237483953786907e-06, + "loss": 0.7225, + "step": 2141 + }, + { + "epoch": 2.7496790757381255, + "grad_norm": 2.9775516986846924, + "learning_rate": 8.241335044929397e-06, + "loss": 0.7741, + "step": 2142 + }, + { + "epoch": 2.7509627727856225, + "grad_norm": 2.5146214962005615, + "learning_rate": 8.245186136071888e-06, + "loss": 0.8222, + "step": 2143 + }, + { + "epoch": 2.7522464698331195, + "grad_norm": 5.031277656555176, + "learning_rate": 8.249037227214377e-06, + "loss": 0.7584, + "step": 2144 + }, + { + "epoch": 2.753530166880616, + "grad_norm": 2.027562379837036, + "learning_rate": 8.252888318356868e-06, + "loss": 0.7433, + "step": 2145 + }, + { + "epoch": 2.754813863928113, + "grad_norm": 2.422111749649048, + "learning_rate": 8.25673940949936e-06, + "loss": 0.7358, + "step": 2146 + }, + { + "epoch": 2.7560975609756095, + "grad_norm": 3.5474655628204346, + "learning_rate": 8.260590500641849e-06, + "loss": 0.701, + "step": 2147 + }, + { + "epoch": 2.7573812580231065, + "grad_norm": 2.7640202045440674, + "learning_rate": 8.264441591784338e-06, + "loss": 0.7724, + "step": 2148 + }, + { + "epoch": 
2.7586649550706035, + "grad_norm": 4.56760311126709, + "learning_rate": 8.26829268292683e-06, + "loss": 0.7994, + "step": 2149 + }, + { + "epoch": 2.7599486521181, + "grad_norm": 3.5492961406707764, + "learning_rate": 8.27214377406932e-06, + "loss": 0.7877, + "step": 2150 + }, + { + "epoch": 2.761232349165597, + "grad_norm": 3.1943113803863525, + "learning_rate": 8.27599486521181e-06, + "loss": 0.8052, + "step": 2151 + }, + { + "epoch": 2.7625160462130935, + "grad_norm": 3.166848659515381, + "learning_rate": 8.2798459563543e-06, + "loss": 0.7131, + "step": 2152 + }, + { + "epoch": 2.7637997432605905, + "grad_norm": 6.033725738525391, + "learning_rate": 8.283697047496792e-06, + "loss": 0.8333, + "step": 2153 + }, + { + "epoch": 2.7650834403080875, + "grad_norm": 2.3792359828948975, + "learning_rate": 8.287548138639281e-06, + "loss": 0.9036, + "step": 2154 + }, + { + "epoch": 2.766367137355584, + "grad_norm": 4.297341346740723, + "learning_rate": 8.291399229781772e-06, + "loss": 0.8442, + "step": 2155 + }, + { + "epoch": 2.767650834403081, + "grad_norm": 3.041365623474121, + "learning_rate": 8.295250320924261e-06, + "loss": 0.8462, + "step": 2156 + }, + { + "epoch": 2.7689345314505776, + "grad_norm": 6.618650913238525, + "learning_rate": 8.299101412066752e-06, + "loss": 0.8673, + "step": 2157 + }, + { + "epoch": 2.7702182284980745, + "grad_norm": 7.356569290161133, + "learning_rate": 8.302952503209244e-06, + "loss": 1.1417, + "step": 2158 + }, + { + "epoch": 2.7715019255455715, + "grad_norm": 2.283582925796509, + "learning_rate": 8.306803594351733e-06, + "loss": 0.7227, + "step": 2159 + }, + { + "epoch": 2.772785622593068, + "grad_norm": 1.581229567527771, + "learning_rate": 8.310654685494222e-06, + "loss": 0.6902, + "step": 2160 + }, + { + "epoch": 2.7740693196405646, + "grad_norm": 1.8156702518463135, + "learning_rate": 8.314505776636715e-06, + "loss": 0.6861, + "step": 2161 + }, + { + "epoch": 2.7753530166880616, + "grad_norm": 1.7232365608215332, + 
"learning_rate": 8.318356867779204e-06, + "loss": 0.6618, + "step": 2162 + }, + { + "epoch": 2.7766367137355585, + "grad_norm": 2.098971366882324, + "learning_rate": 8.322207958921694e-06, + "loss": 0.7073, + "step": 2163 + }, + { + "epoch": 2.777920410783055, + "grad_norm": 1.4133437871932983, + "learning_rate": 8.326059050064187e-06, + "loss": 0.6768, + "step": 2164 + }, + { + "epoch": 2.779204107830552, + "grad_norm": 2.5567712783813477, + "learning_rate": 8.329910141206676e-06, + "loss": 0.6525, + "step": 2165 + }, + { + "epoch": 2.7804878048780486, + "grad_norm": 3.144292116165161, + "learning_rate": 8.333761232349165e-06, + "loss": 0.6691, + "step": 2166 + }, + { + "epoch": 2.7817715019255456, + "grad_norm": 2.5657975673675537, + "learning_rate": 8.337612323491656e-06, + "loss": 0.6803, + "step": 2167 + }, + { + "epoch": 2.7830551989730425, + "grad_norm": 3.4326186180114746, + "learning_rate": 8.341463414634147e-06, + "loss": 0.7055, + "step": 2168 + }, + { + "epoch": 2.784338896020539, + "grad_norm": 1.9632521867752075, + "learning_rate": 8.345314505776637e-06, + "loss": 0.6685, + "step": 2169 + }, + { + "epoch": 2.785622593068036, + "grad_norm": 1.8903900384902954, + "learning_rate": 8.349165596919128e-06, + "loss": 0.715, + "step": 2170 + }, + { + "epoch": 2.7869062901155326, + "grad_norm": 1.5877869129180908, + "learning_rate": 8.353016688061617e-06, + "loss": 0.6612, + "step": 2171 + }, + { + "epoch": 2.7881899871630296, + "grad_norm": 1.9044854640960693, + "learning_rate": 8.356867779204108e-06, + "loss": 0.6726, + "step": 2172 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.8508479595184326, + "learning_rate": 8.3607188703466e-06, + "loss": 0.6527, + "step": 2173 + }, + { + "epoch": 2.790757381258023, + "grad_norm": 2.6960597038269043, + "learning_rate": 8.364569961489089e-06, + "loss": 0.7462, + "step": 2174 + }, + { + "epoch": 2.79204107830552, + "grad_norm": 2.6527867317199707, + "learning_rate": 8.368421052631578e-06, + "loss": 0.679, + 
"step": 2175 + }, + { + "epoch": 2.7933247753530166, + "grad_norm": 3.211280345916748, + "learning_rate": 8.372272143774071e-06, + "loss": 0.7003, + "step": 2176 + }, + { + "epoch": 2.7946084724005136, + "grad_norm": 5.035701274871826, + "learning_rate": 8.37612323491656e-06, + "loss": 0.7305, + "step": 2177 + }, + { + "epoch": 2.79589216944801, + "grad_norm": 3.033606767654419, + "learning_rate": 8.37997432605905e-06, + "loss": 0.7109, + "step": 2178 + }, + { + "epoch": 2.797175866495507, + "grad_norm": 2.640246629714966, + "learning_rate": 8.38382541720154e-06, + "loss": 0.6588, + "step": 2179 + }, + { + "epoch": 2.7984595635430036, + "grad_norm": 4.058398246765137, + "learning_rate": 8.387676508344032e-06, + "loss": 0.7409, + "step": 2180 + }, + { + "epoch": 2.7997432605905006, + "grad_norm": 3.7943270206451416, + "learning_rate": 8.391527599486521e-06, + "loss": 0.7219, + "step": 2181 + }, + { + "epoch": 2.8010269576379976, + "grad_norm": 2.6023612022399902, + "learning_rate": 8.39537869062901e-06, + "loss": 0.6852, + "step": 2182 + }, + { + "epoch": 2.802310654685494, + "grad_norm": 2.8472042083740234, + "learning_rate": 8.399229781771502e-06, + "loss": 0.6621, + "step": 2183 + }, + { + "epoch": 2.803594351732991, + "grad_norm": 2.3160905838012695, + "learning_rate": 8.403080872913993e-06, + "loss": 0.7498, + "step": 2184 + }, + { + "epoch": 2.8048780487804876, + "grad_norm": 3.8490610122680664, + "learning_rate": 8.406931964056482e-06, + "loss": 0.7541, + "step": 2185 + }, + { + "epoch": 2.8061617458279846, + "grad_norm": 1.8834904432296753, + "learning_rate": 8.410783055198973e-06, + "loss": 0.7171, + "step": 2186 + }, + { + "epoch": 2.8074454428754816, + "grad_norm": 2.271907329559326, + "learning_rate": 8.414634146341464e-06, + "loss": 0.7296, + "step": 2187 + }, + { + "epoch": 2.808729139922978, + "grad_norm": 2.820845365524292, + "learning_rate": 8.418485237483954e-06, + "loss": 0.6828, + "step": 2188 + }, + { + "epoch": 2.810012836970475, + "grad_norm": 
2.9421732425689697, + "learning_rate": 8.422336328626445e-06, + "loss": 0.7359, + "step": 2189 + }, + { + "epoch": 2.8112965340179716, + "grad_norm": 2.453494071960449, + "learning_rate": 8.426187419768934e-06, + "loss": 0.7642, + "step": 2190 + }, + { + "epoch": 2.8125802310654686, + "grad_norm": 3.3176536560058594, + "learning_rate": 8.430038510911425e-06, + "loss": 0.7135, + "step": 2191 + }, + { + "epoch": 2.8138639281129656, + "grad_norm": 2.3338394165039062, + "learning_rate": 8.433889602053916e-06, + "loss": 0.7547, + "step": 2192 + }, + { + "epoch": 2.815147625160462, + "grad_norm": 3.281736373901367, + "learning_rate": 8.437740693196406e-06, + "loss": 0.7281, + "step": 2193 + }, + { + "epoch": 2.8164313222079587, + "grad_norm": 3.5960707664489746, + "learning_rate": 8.441591784338895e-06, + "loss": 0.7678, + "step": 2194 + }, + { + "epoch": 2.8177150192554556, + "grad_norm": 3.465277910232544, + "learning_rate": 8.445442875481388e-06, + "loss": 0.7402, + "step": 2195 + }, + { + "epoch": 2.8189987163029526, + "grad_norm": 3.262300729751587, + "learning_rate": 8.449293966623877e-06, + "loss": 0.7308, + "step": 2196 + }, + { + "epoch": 2.820282413350449, + "grad_norm": 4.210052013397217, + "learning_rate": 8.453145057766366e-06, + "loss": 0.7846, + "step": 2197 + }, + { + "epoch": 2.821566110397946, + "grad_norm": 2.6095035076141357, + "learning_rate": 8.456996148908858e-06, + "loss": 0.7753, + "step": 2198 + }, + { + "epoch": 2.8228498074454427, + "grad_norm": 2.0655441284179688, + "learning_rate": 8.460847240051349e-06, + "loss": 0.6934, + "step": 2199 + }, + { + "epoch": 2.8241335044929397, + "grad_norm": 4.204373359680176, + "learning_rate": 8.464698331193838e-06, + "loss": 0.8122, + "step": 2200 + }, + { + "epoch": 2.8254172015404366, + "grad_norm": 4.611855983734131, + "learning_rate": 8.468549422336329e-06, + "loss": 0.8057, + "step": 2201 + }, + { + "epoch": 2.826700898587933, + "grad_norm": 2.540212869644165, + "learning_rate": 8.472400513478818e-06, 
+ "loss": 0.7576, + "step": 2202 + }, + { + "epoch": 2.82798459563543, + "grad_norm": 4.582738399505615, + "learning_rate": 8.47625160462131e-06, + "loss": 0.8731, + "step": 2203 + }, + { + "epoch": 2.8292682926829267, + "grad_norm": 3.5047457218170166, + "learning_rate": 8.4801026957638e-06, + "loss": 0.9448, + "step": 2204 + }, + { + "epoch": 2.8305519897304237, + "grad_norm": 6.130603790283203, + "learning_rate": 8.48395378690629e-06, + "loss": 0.8251, + "step": 2205 + }, + { + "epoch": 2.8318356867779206, + "grad_norm": 3.2252602577209473, + "learning_rate": 8.48780487804878e-06, + "loss": 0.9148, + "step": 2206 + }, + { + "epoch": 2.833119383825417, + "grad_norm": 6.846226215362549, + "learning_rate": 8.491655969191272e-06, + "loss": 0.8982, + "step": 2207 + }, + { + "epoch": 2.834403080872914, + "grad_norm": 6.030252456665039, + "learning_rate": 8.495507060333761e-06, + "loss": 1.0882, + "step": 2208 + }, + { + "epoch": 2.8356867779204107, + "grad_norm": 2.3805532455444336, + "learning_rate": 8.499358151476251e-06, + "loss": 0.7074, + "step": 2209 + }, + { + "epoch": 2.8369704749679077, + "grad_norm": 3.005258798599243, + "learning_rate": 8.503209242618744e-06, + "loss": 0.6792, + "step": 2210 + }, + { + "epoch": 2.8382541720154046, + "grad_norm": 2.059990406036377, + "learning_rate": 8.507060333761233e-06, + "loss": 0.6861, + "step": 2211 + }, + { + "epoch": 2.839537869062901, + "grad_norm": 2.019052028656006, + "learning_rate": 8.510911424903722e-06, + "loss": 0.7281, + "step": 2212 + }, + { + "epoch": 2.8408215661103977, + "grad_norm": 2.5243542194366455, + "learning_rate": 8.514762516046213e-06, + "loss": 0.6796, + "step": 2213 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 2.4102604389190674, + "learning_rate": 8.518613607188705e-06, + "loss": 0.6836, + "step": 2214 + }, + { + "epoch": 2.8433889602053917, + "grad_norm": 3.283890962600708, + "learning_rate": 8.522464698331194e-06, + "loss": 0.6288, + "step": 2215 + }, + { + "epoch": 
2.844672657252888, + "grad_norm": 2.1024818420410156, + "learning_rate": 8.526315789473685e-06, + "loss": 0.6788, + "step": 2216 + }, + { + "epoch": 2.845956354300385, + "grad_norm": 1.8132563829421997, + "learning_rate": 8.530166880616174e-06, + "loss": 0.7117, + "step": 2217 + }, + { + "epoch": 2.8472400513478817, + "grad_norm": 1.4907162189483643, + "learning_rate": 8.534017971758665e-06, + "loss": 0.7341, + "step": 2218 + }, + { + "epoch": 2.8485237483953787, + "grad_norm": 1.4846165180206299, + "learning_rate": 8.537869062901156e-06, + "loss": 0.6623, + "step": 2219 + }, + { + "epoch": 2.8498074454428757, + "grad_norm": 3.2766358852386475, + "learning_rate": 8.541720154043646e-06, + "loss": 0.7058, + "step": 2220 + }, + { + "epoch": 2.851091142490372, + "grad_norm": 3.366291046142578, + "learning_rate": 8.545571245186135e-06, + "loss": 0.6689, + "step": 2221 + }, + { + "epoch": 2.852374839537869, + "grad_norm": 2.318279981613159, + "learning_rate": 8.549422336328628e-06, + "loss": 0.762, + "step": 2222 + }, + { + "epoch": 2.8536585365853657, + "grad_norm": 1.9969494342803955, + "learning_rate": 8.553273427471117e-06, + "loss": 0.7294, + "step": 2223 + }, + { + "epoch": 2.8549422336328627, + "grad_norm": 3.5938947200775146, + "learning_rate": 8.557124518613607e-06, + "loss": 0.7067, + "step": 2224 + }, + { + "epoch": 2.8562259306803597, + "grad_norm": 2.2427561283111572, + "learning_rate": 8.560975609756098e-06, + "loss": 0.7071, + "step": 2225 + }, + { + "epoch": 2.857509627727856, + "grad_norm": 5.874627113342285, + "learning_rate": 8.564826700898589e-06, + "loss": 0.6868, + "step": 2226 + }, + { + "epoch": 2.8587933247753528, + "grad_norm": 2.386624813079834, + "learning_rate": 8.568677792041078e-06, + "loss": 0.7091, + "step": 2227 + }, + { + "epoch": 2.8600770218228497, + "grad_norm": 2.5839030742645264, + "learning_rate": 8.57252888318357e-06, + "loss": 0.7222, + "step": 2228 + }, + { + "epoch": 2.8613607188703467, + "grad_norm": 3.0969560146331787, + 
"learning_rate": 8.576379974326059e-06, + "loss": 0.6825, + "step": 2229 + }, + { + "epoch": 2.8626444159178432, + "grad_norm": 2.420563220977783, + "learning_rate": 8.58023106546855e-06, + "loss": 0.7153, + "step": 2230 + }, + { + "epoch": 2.8639281129653402, + "grad_norm": 2.349046230316162, + "learning_rate": 8.584082156611041e-06, + "loss": 0.6602, + "step": 2231 + }, + { + "epoch": 2.8652118100128368, + "grad_norm": 2.1024107933044434, + "learning_rate": 8.58793324775353e-06, + "loss": 0.7001, + "step": 2232 + }, + { + "epoch": 2.8664955070603337, + "grad_norm": 2.8969578742980957, + "learning_rate": 8.591784338896021e-06, + "loss": 0.7895, + "step": 2233 + }, + { + "epoch": 2.8677792041078307, + "grad_norm": 2.4546751976013184, + "learning_rate": 8.59563543003851e-06, + "loss": 0.714, + "step": 2234 + }, + { + "epoch": 2.8690629011553272, + "grad_norm": 2.6919116973876953, + "learning_rate": 8.599486521181002e-06, + "loss": 0.7379, + "step": 2235 + }, + { + "epoch": 2.8703465982028242, + "grad_norm": 2.6627039909362793, + "learning_rate": 8.603337612323491e-06, + "loss": 0.7048, + "step": 2236 + }, + { + "epoch": 2.8716302952503208, + "grad_norm": 4.0759968757629395, + "learning_rate": 8.607188703465982e-06, + "loss": 0.6752, + "step": 2237 + }, + { + "epoch": 2.8729139922978177, + "grad_norm": 2.6061036586761475, + "learning_rate": 8.611039794608473e-06, + "loss": 0.7541, + "step": 2238 + }, + { + "epoch": 2.8741976893453147, + "grad_norm": 2.291761875152588, + "learning_rate": 8.614890885750963e-06, + "loss": 0.6891, + "step": 2239 + }, + { + "epoch": 2.8754813863928113, + "grad_norm": 1.9719867706298828, + "learning_rate": 8.618741976893452e-06, + "loss": 0.677, + "step": 2240 + }, + { + "epoch": 2.8767650834403082, + "grad_norm": 3.034999132156372, + "learning_rate": 8.622593068035945e-06, + "loss": 0.8157, + "step": 2241 + }, + { + "epoch": 2.8780487804878048, + "grad_norm": 2.966590642929077, + "learning_rate": 8.626444159178434e-06, + "loss": 0.7573, + 
"step": 2242 + }, + { + "epoch": 2.8793324775353017, + "grad_norm": 2.4253644943237305, + "learning_rate": 8.630295250320924e-06, + "loss": 0.7725, + "step": 2243 + }, + { + "epoch": 2.8806161745827987, + "grad_norm": 2.554767370223999, + "learning_rate": 8.634146341463415e-06, + "loss": 0.7668, + "step": 2244 + }, + { + "epoch": 2.8818998716302953, + "grad_norm": 3.5036745071411133, + "learning_rate": 8.637997432605906e-06, + "loss": 0.7491, + "step": 2245 + }, + { + "epoch": 2.883183568677792, + "grad_norm": 1.9704941511154175, + "learning_rate": 8.641848523748395e-06, + "loss": 0.7607, + "step": 2246 + }, + { + "epoch": 2.8844672657252888, + "grad_norm": 2.9324557781219482, + "learning_rate": 8.645699614890886e-06, + "loss": 0.7189, + "step": 2247 + }, + { + "epoch": 2.8857509627727858, + "grad_norm": 3.2250144481658936, + "learning_rate": 8.649550706033375e-06, + "loss": 0.7781, + "step": 2248 + }, + { + "epoch": 2.8870346598202823, + "grad_norm": 2.0151126384735107, + "learning_rate": 8.653401797175867e-06, + "loss": 0.7422, + "step": 2249 + }, + { + "epoch": 2.8883183568677793, + "grad_norm": 2.4734575748443604, + "learning_rate": 8.657252888318358e-06, + "loss": 0.786, + "step": 2250 + }, + { + "epoch": 2.889602053915276, + "grad_norm": 4.564517498016357, + "learning_rate": 8.661103979460847e-06, + "loss": 0.7927, + "step": 2251 + }, + { + "epoch": 2.890885750962773, + "grad_norm": 2.6971397399902344, + "learning_rate": 8.664955070603336e-06, + "loss": 0.7597, + "step": 2252 + }, + { + "epoch": 2.8921694480102698, + "grad_norm": 4.449679851531982, + "learning_rate": 8.66880616174583e-06, + "loss": 0.719, + "step": 2253 + }, + { + "epoch": 2.8934531450577663, + "grad_norm": 1.9961192607879639, + "learning_rate": 8.672657252888319e-06, + "loss": 0.8096, + "step": 2254 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 4.675631523132324, + "learning_rate": 8.676508344030808e-06, + "loss": 0.7742, + "step": 2255 + }, + { + "epoch": 2.89602053915276, + 
"grad_norm": 4.419381141662598, + "learning_rate": 8.6803594351733e-06, + "loss": 0.8803, + "step": 2256 + }, + { + "epoch": 2.897304236200257, + "grad_norm": 3.8990871906280518, + "learning_rate": 8.68421052631579e-06, + "loss": 0.8692, + "step": 2257 + }, + { + "epoch": 2.8985879332477538, + "grad_norm": 3.071547269821167, + "learning_rate": 8.68806161745828e-06, + "loss": 1.058, + "step": 2258 + }, + { + "epoch": 2.8998716302952503, + "grad_norm": 2.6805953979492188, + "learning_rate": 8.69191270860077e-06, + "loss": 0.7522, + "step": 2259 + }, + { + "epoch": 2.901155327342747, + "grad_norm": 1.9730786085128784, + "learning_rate": 8.695763799743262e-06, + "loss": 0.6598, + "step": 2260 + }, + { + "epoch": 2.902439024390244, + "grad_norm": 3.0343422889709473, + "learning_rate": 8.699614890885751e-06, + "loss": 0.6648, + "step": 2261 + }, + { + "epoch": 2.903722721437741, + "grad_norm": 2.1699540615081787, + "learning_rate": 8.703465982028242e-06, + "loss": 0.6904, + "step": 2262 + }, + { + "epoch": 2.9050064184852373, + "grad_norm": 1.5241106748580933, + "learning_rate": 8.707317073170731e-06, + "loss": 0.6674, + "step": 2263 + }, + { + "epoch": 2.9062901155327343, + "grad_norm": 3.713940143585205, + "learning_rate": 8.711168164313222e-06, + "loss": 0.6453, + "step": 2264 + }, + { + "epoch": 2.907573812580231, + "grad_norm": 2.7034690380096436, + "learning_rate": 8.715019255455714e-06, + "loss": 0.7032, + "step": 2265 + }, + { + "epoch": 2.908857509627728, + "grad_norm": 2.5460093021392822, + "learning_rate": 8.718870346598203e-06, + "loss": 0.6703, + "step": 2266 + }, + { + "epoch": 2.910141206675225, + "grad_norm": 2.0520424842834473, + "learning_rate": 8.722721437740692e-06, + "loss": 0.6509, + "step": 2267 + }, + { + "epoch": 2.9114249037227213, + "grad_norm": 3.7805559635162354, + "learning_rate": 8.726572528883185e-06, + "loss": 0.6825, + "step": 2268 + }, + { + "epoch": 2.9127086007702183, + "grad_norm": 2.7273008823394775, + "learning_rate": 
8.730423620025674e-06, + "loss": 0.7057, + "step": 2269 + }, + { + "epoch": 2.913992297817715, + "grad_norm": 2.825770854949951, + "learning_rate": 8.734274711168164e-06, + "loss": 0.6947, + "step": 2270 + }, + { + "epoch": 2.915275994865212, + "grad_norm": 2.0990333557128906, + "learning_rate": 8.738125802310655e-06, + "loss": 0.6821, + "step": 2271 + }, + { + "epoch": 2.916559691912709, + "grad_norm": 2.015371084213257, + "learning_rate": 8.741976893453146e-06, + "loss": 0.7127, + "step": 2272 + }, + { + "epoch": 2.9178433889602053, + "grad_norm": 2.3248279094696045, + "learning_rate": 8.745827984595635e-06, + "loss": 0.704, + "step": 2273 + }, + { + "epoch": 2.9191270860077023, + "grad_norm": 2.7459826469421387, + "learning_rate": 8.749679075738126e-06, + "loss": 0.6505, + "step": 2274 + }, + { + "epoch": 2.920410783055199, + "grad_norm": 2.015878200531006, + "learning_rate": 8.753530166880616e-06, + "loss": 0.6707, + "step": 2275 + }, + { + "epoch": 2.921694480102696, + "grad_norm": 2.6902718544006348, + "learning_rate": 8.757381258023107e-06, + "loss": 0.7032, + "step": 2276 + }, + { + "epoch": 2.922978177150193, + "grad_norm": 2.354093074798584, + "learning_rate": 8.761232349165598e-06, + "loss": 0.6935, + "step": 2277 + }, + { + "epoch": 2.9242618741976893, + "grad_norm": 4.5245041847229, + "learning_rate": 8.765083440308087e-06, + "loss": 0.6703, + "step": 2278 + }, + { + "epoch": 2.925545571245186, + "grad_norm": 2.5599536895751953, + "learning_rate": 8.768934531450578e-06, + "loss": 0.6691, + "step": 2279 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 2.176313638687134, + "learning_rate": 8.77278562259307e-06, + "loss": 0.6659, + "step": 2280 + }, + { + "epoch": 2.92811296534018, + "grad_norm": 6.875970840454102, + "learning_rate": 8.776636713735559e-06, + "loss": 0.7691, + "step": 2281 + }, + { + "epoch": 2.9293966623876764, + "grad_norm": 2.0779225826263428, + "learning_rate": 8.780487804878048e-06, + "loss": 0.6702, + "step": 2282 + }, + { + 
"epoch": 2.9306803594351734, + "grad_norm": 3.9919939041137695, + "learning_rate": 8.78433889602054e-06, + "loss": 0.6755, + "step": 2283 + }, + { + "epoch": 2.93196405648267, + "grad_norm": 2.404475450515747, + "learning_rate": 8.78818998716303e-06, + "loss": 0.6711, + "step": 2284 + }, + { + "epoch": 2.933247753530167, + "grad_norm": 2.3055453300476074, + "learning_rate": 8.79204107830552e-06, + "loss": 0.6744, + "step": 2285 + }, + { + "epoch": 2.934531450577664, + "grad_norm": 2.440845012664795, + "learning_rate": 8.795892169448009e-06, + "loss": 0.6611, + "step": 2286 + }, + { + "epoch": 2.9358151476251604, + "grad_norm": 2.6731860637664795, + "learning_rate": 8.799743260590502e-06, + "loss": 0.6919, + "step": 2287 + }, + { + "epoch": 2.9370988446726574, + "grad_norm": 2.2174315452575684, + "learning_rate": 8.803594351732991e-06, + "loss": 0.6913, + "step": 2288 + }, + { + "epoch": 2.938382541720154, + "grad_norm": 3.424057960510254, + "learning_rate": 8.80744544287548e-06, + "loss": 0.7644, + "step": 2289 + }, + { + "epoch": 2.939666238767651, + "grad_norm": 8.10401439666748, + "learning_rate": 8.811296534017972e-06, + "loss": 0.718, + "step": 2290 + }, + { + "epoch": 2.940949935815148, + "grad_norm": 3.3283536434173584, + "learning_rate": 8.815147625160463e-06, + "loss": 0.74, + "step": 2291 + }, + { + "epoch": 2.9422336328626444, + "grad_norm": 2.0525758266448975, + "learning_rate": 8.818998716302952e-06, + "loss": 0.7065, + "step": 2292 + }, + { + "epoch": 2.9435173299101414, + "grad_norm": 5.348016262054443, + "learning_rate": 8.822849807445443e-06, + "loss": 0.6966, + "step": 2293 + }, + { + "epoch": 2.944801026957638, + "grad_norm": 3.909383535385132, + "learning_rate": 8.826700898587933e-06, + "loss": 0.7292, + "step": 2294 + }, + { + "epoch": 2.946084724005135, + "grad_norm": 4.137389183044434, + "learning_rate": 8.830551989730424e-06, + "loss": 0.7713, + "step": 2295 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 2.428713798522949, + 
"learning_rate": 8.834403080872915e-06, + "loss": 0.7671, + "step": 2296 + }, + { + "epoch": 2.9486521181001284, + "grad_norm": 2.1299216747283936, + "learning_rate": 8.838254172015404e-06, + "loss": 0.7113, + "step": 2297 + }, + { + "epoch": 2.949935815147625, + "grad_norm": 3.220607280731201, + "learning_rate": 8.842105263157893e-06, + "loss": 0.7788, + "step": 2298 + }, + { + "epoch": 2.951219512195122, + "grad_norm": 2.3428702354431152, + "learning_rate": 8.845956354300386e-06, + "loss": 0.7383, + "step": 2299 + }, + { + "epoch": 2.952503209242619, + "grad_norm": 3.2431089878082275, + "learning_rate": 8.849807445442876e-06, + "loss": 0.783, + "step": 2300 + }, + { + "epoch": 2.9537869062901154, + "grad_norm": 2.576369047164917, + "learning_rate": 8.853658536585365e-06, + "loss": 0.8157, + "step": 2301 + }, + { + "epoch": 2.9550706033376124, + "grad_norm": 2.6856234073638916, + "learning_rate": 8.857509627727858e-06, + "loss": 0.7911, + "step": 2302 + }, + { + "epoch": 2.956354300385109, + "grad_norm": 2.5286848545074463, + "learning_rate": 8.861360718870347e-06, + "loss": 0.7825, + "step": 2303 + }, + { + "epoch": 2.957637997432606, + "grad_norm": 10.058030128479004, + "learning_rate": 8.865211810012836e-06, + "loss": 0.8663, + "step": 2304 + }, + { + "epoch": 2.958921694480103, + "grad_norm": 4.029667377471924, + "learning_rate": 8.869062901155328e-06, + "loss": 0.8577, + "step": 2305 + }, + { + "epoch": 2.9602053915275994, + "grad_norm": 3.8689398765563965, + "learning_rate": 8.872913992297819e-06, + "loss": 0.9737, + "step": 2306 + }, + { + "epoch": 2.9614890885750964, + "grad_norm": 3.2342147827148438, + "learning_rate": 8.876765083440308e-06, + "loss": 0.8776, + "step": 2307 + }, + { + "epoch": 2.962772785622593, + "grad_norm": 4.779581069946289, + "learning_rate": 8.880616174582799e-06, + "loss": 1.0682, + "step": 2308 + }, + { + "epoch": 2.96405648267009, + "grad_norm": 1.9930979013442993, + "learning_rate": 8.884467265725288e-06, + "loss": 0.7165, + 
"step": 2309 + }, + { + "epoch": 2.965340179717587, + "grad_norm": 2.4096648693084717, + "learning_rate": 8.88831835686778e-06, + "loss": 0.6865, + "step": 2310 + }, + { + "epoch": 2.9666238767650834, + "grad_norm": 1.821736454963684, + "learning_rate": 8.89216944801027e-06, + "loss": 0.698, + "step": 2311 + }, + { + "epoch": 2.96790757381258, + "grad_norm": 3.4791290760040283, + "learning_rate": 8.89602053915276e-06, + "loss": 0.6679, + "step": 2312 + }, + { + "epoch": 2.969191270860077, + "grad_norm": 1.9345762729644775, + "learning_rate": 8.89987163029525e-06, + "loss": 0.6777, + "step": 2313 + }, + { + "epoch": 2.970474967907574, + "grad_norm": 3.4148504734039307, + "learning_rate": 8.903722721437742e-06, + "loss": 0.7343, + "step": 2314 + }, + { + "epoch": 2.9717586649550705, + "grad_norm": 2.200183868408203, + "learning_rate": 8.907573812580231e-06, + "loss": 0.7071, + "step": 2315 + }, + { + "epoch": 2.9730423620025674, + "grad_norm": 2.27124285697937, + "learning_rate": 8.911424903722721e-06, + "loss": 0.6724, + "step": 2316 + }, + { + "epoch": 2.974326059050064, + "grad_norm": 2.268874406814575, + "learning_rate": 8.915275994865212e-06, + "loss": 0.6866, + "step": 2317 + }, + { + "epoch": 2.975609756097561, + "grad_norm": 2.1315693855285645, + "learning_rate": 8.919127086007703e-06, + "loss": 0.7408, + "step": 2318 + }, + { + "epoch": 2.976893453145058, + "grad_norm": 2.5551140308380127, + "learning_rate": 8.922978177150192e-06, + "loss": 0.7061, + "step": 2319 + }, + { + "epoch": 2.9781771501925545, + "grad_norm": 2.565521478652954, + "learning_rate": 8.926829268292683e-06, + "loss": 0.6532, + "step": 2320 + }, + { + "epoch": 2.9794608472400514, + "grad_norm": 1.6763194799423218, + "learning_rate": 8.930680359435175e-06, + "loss": 0.6887, + "step": 2321 + }, + { + "epoch": 2.980744544287548, + "grad_norm": 2.283702850341797, + "learning_rate": 8.934531450577664e-06, + "loss": 0.6951, + "step": 2322 + }, + { + "epoch": 2.982028241335045, + "grad_norm": 
3.0261032581329346, + "learning_rate": 8.938382541720155e-06, + "loss": 0.7153, + "step": 2323 + }, + { + "epoch": 2.983311938382542, + "grad_norm": 2.860058546066284, + "learning_rate": 8.942233632862644e-06, + "loss": 0.7225, + "step": 2324 + }, + { + "epoch": 2.9845956354300385, + "grad_norm": 2.345562696456909, + "learning_rate": 8.946084724005135e-06, + "loss": 0.7022, + "step": 2325 + }, + { + "epoch": 2.9858793324775355, + "grad_norm": 3.916023015975952, + "learning_rate": 8.949935815147626e-06, + "loss": 0.7504, + "step": 2326 + }, + { + "epoch": 2.987163029525032, + "grad_norm": 2.067283868789673, + "learning_rate": 8.953786906290116e-06, + "loss": 0.674, + "step": 2327 + }, + { + "epoch": 2.988446726572529, + "grad_norm": 2.7379372119903564, + "learning_rate": 8.957637997432605e-06, + "loss": 0.7389, + "step": 2328 + }, + { + "epoch": 2.989730423620026, + "grad_norm": 4.811281681060791, + "learning_rate": 8.961489088575098e-06, + "loss": 0.7662, + "step": 2329 + }, + { + "epoch": 2.9910141206675225, + "grad_norm": 2.292196750640869, + "learning_rate": 8.965340179717587e-06, + "loss": 0.7038, + "step": 2330 + }, + { + "epoch": 2.992297817715019, + "grad_norm": 2.6087710857391357, + "learning_rate": 8.969191270860077e-06, + "loss": 0.7357, + "step": 2331 + }, + { + "epoch": 2.993581514762516, + "grad_norm": 9.89484977722168, + "learning_rate": 8.973042362002568e-06, + "loss": 0.7739, + "step": 2332 + }, + { + "epoch": 2.994865211810013, + "grad_norm": 3.3655097484588623, + "learning_rate": 8.976893453145059e-06, + "loss": 0.8173, + "step": 2333 + }, + { + "epoch": 2.9961489088575095, + "grad_norm": 3.4282162189483643, + "learning_rate": 8.980744544287548e-06, + "loss": 0.7918, + "step": 2334 + }, + { + "epoch": 2.9974326059050065, + "grad_norm": 5.786780834197998, + "learning_rate": 8.984595635430038e-06, + "loss": 0.8371, + "step": 2335 + }, + { + "epoch": 2.998716302952503, + "grad_norm": 3.4928877353668213, + "learning_rate": 8.988446726572529e-06, + 
"loss": 0.8937, + "step": 2336 + }, + { + "epoch": 3.0, + "grad_norm": 5.168969631195068, + "learning_rate": 8.99229781771502e-06, + "loss": 1.0156, + "step": 2337 + }, + { + "epoch": 3.001283697047497, + "grad_norm": 3.5122787952423096, + "learning_rate": 8.99614890885751e-06, + "loss": 0.6849, + "step": 2338 + }, + { + "epoch": 3.0025673940949935, + "grad_norm": 1.6443812847137451, + "learning_rate": 9e-06, + "loss": 0.6638, + "step": 2339 + }, + { + "epoch": 3.0038510911424905, + "grad_norm": 1.8338737487792969, + "learning_rate": 9.00385109114249e-06, + "loss": 0.6981, + "step": 2340 + }, + { + "epoch": 3.005134788189987, + "grad_norm": 1.5994874238967896, + "learning_rate": 9.00770218228498e-06, + "loss": 0.6453, + "step": 2341 + }, + { + "epoch": 3.006418485237484, + "grad_norm": 1.9567056894302368, + "learning_rate": 9.011553273427472e-06, + "loss": 0.7294, + "step": 2342 + }, + { + "epoch": 3.0077021822849805, + "grad_norm": 1.6555129289627075, + "learning_rate": 9.015404364569961e-06, + "loss": 0.71, + "step": 2343 + }, + { + "epoch": 3.0089858793324775, + "grad_norm": 1.6425855159759521, + "learning_rate": 9.019255455712452e-06, + "loss": 0.6685, + "step": 2344 + }, + { + "epoch": 3.0102695763799745, + "grad_norm": 1.6872600317001343, + "learning_rate": 9.023106546854943e-06, + "loss": 0.6515, + "step": 2345 + }, + { + "epoch": 3.011553273427471, + "grad_norm": 1.8007631301879883, + "learning_rate": 9.026957637997433e-06, + "loss": 0.6663, + "step": 2346 + }, + { + "epoch": 3.012836970474968, + "grad_norm": 3.6651346683502197, + "learning_rate": 9.030808729139922e-06, + "loss": 0.6468, + "step": 2347 + }, + { + "epoch": 3.0141206675224645, + "grad_norm": 1.4094648361206055, + "learning_rate": 9.034659820282415e-06, + "loss": 0.6679, + "step": 2348 + }, + { + "epoch": 3.0154043645699615, + "grad_norm": 2.974575996398926, + "learning_rate": 9.038510911424904e-06, + "loss": 0.6773, + "step": 2349 + }, + { + "epoch": 3.016688061617458, + "grad_norm": 
1.56626296043396, + "learning_rate": 9.042362002567394e-06, + "loss": 0.6794, + "step": 2350 + }, + { + "epoch": 3.017971758664955, + "grad_norm": 3.0723793506622314, + "learning_rate": 9.046213093709885e-06, + "loss": 0.7052, + "step": 2351 + }, + { + "epoch": 3.019255455712452, + "grad_norm": 2.6497888565063477, + "learning_rate": 9.050064184852376e-06, + "loss": 0.6998, + "step": 2352 + }, + { + "epoch": 3.0205391527599486, + "grad_norm": 4.9337334632873535, + "learning_rate": 9.053915275994865e-06, + "loss": 0.7197, + "step": 2353 + }, + { + "epoch": 3.0218228498074455, + "grad_norm": 2.864807367324829, + "learning_rate": 9.057766367137356e-06, + "loss": 0.7137, + "step": 2354 + }, + { + "epoch": 3.023106546854942, + "grad_norm": 2.304229974746704, + "learning_rate": 9.061617458279845e-06, + "loss": 0.6688, + "step": 2355 + }, + { + "epoch": 3.024390243902439, + "grad_norm": 2.3239965438842773, + "learning_rate": 9.065468549422337e-06, + "loss": 0.6605, + "step": 2356 + }, + { + "epoch": 3.025673940949936, + "grad_norm": 1.751045823097229, + "learning_rate": 9.069319640564828e-06, + "loss": 0.6512, + "step": 2357 + }, + { + "epoch": 3.0269576379974326, + "grad_norm": 2.7599165439605713, + "learning_rate": 9.073170731707317e-06, + "loss": 0.7029, + "step": 2358 + }, + { + "epoch": 3.0282413350449295, + "grad_norm": 2.9206583499908447, + "learning_rate": 9.077021822849806e-06, + "loss": 0.6756, + "step": 2359 + }, + { + "epoch": 3.029525032092426, + "grad_norm": 2.5323073863983154, + "learning_rate": 9.0808729139923e-06, + "loss": 0.6888, + "step": 2360 + }, + { + "epoch": 3.030808729139923, + "grad_norm": 1.9038052558898926, + "learning_rate": 9.084724005134789e-06, + "loss": 0.661, + "step": 2361 + }, + { + "epoch": 3.0320924261874196, + "grad_norm": 1.9830734729766846, + "learning_rate": 9.088575096277278e-06, + "loss": 0.7224, + "step": 2362 + }, + { + "epoch": 3.0333761232349166, + "grad_norm": 4.012494087219238, + "learning_rate": 9.092426187419769e-06, + 
"loss": 0.6567, + "step": 2363 + }, + { + "epoch": 3.0346598202824135, + "grad_norm": 2.3096859455108643, + "learning_rate": 9.09627727856226e-06, + "loss": 0.6342, + "step": 2364 + }, + { + "epoch": 3.03594351732991, + "grad_norm": 5.869582176208496, + "learning_rate": 9.10012836970475e-06, + "loss": 0.7254, + "step": 2365 + }, + { + "epoch": 3.037227214377407, + "grad_norm": 1.6924775838851929, + "learning_rate": 9.10397946084724e-06, + "loss": 0.7092, + "step": 2366 + }, + { + "epoch": 3.0385109114249036, + "grad_norm": 4.200716972351074, + "learning_rate": 9.107830551989732e-06, + "loss": 0.6897, + "step": 2367 + }, + { + "epoch": 3.0397946084724006, + "grad_norm": 14.233000755310059, + "learning_rate": 9.111681643132221e-06, + "loss": 0.748, + "step": 2368 + }, + { + "epoch": 3.041078305519897, + "grad_norm": 2.182210683822632, + "learning_rate": 9.115532734274712e-06, + "loss": 0.7243, + "step": 2369 + }, + { + "epoch": 3.042362002567394, + "grad_norm": 2.780930995941162, + "learning_rate": 9.119383825417201e-06, + "loss": 0.7188, + "step": 2370 + }, + { + "epoch": 3.043645699614891, + "grad_norm": 5.654487609863281, + "learning_rate": 9.123234916559692e-06, + "loss": 0.7662, + "step": 2371 + }, + { + "epoch": 3.0449293966623876, + "grad_norm": 2.4501760005950928, + "learning_rate": 9.127086007702184e-06, + "loss": 0.7605, + "step": 2372 + }, + { + "epoch": 3.0462130937098846, + "grad_norm": 2.2088303565979004, + "learning_rate": 9.130937098844673e-06, + "loss": 0.7106, + "step": 2373 + }, + { + "epoch": 3.047496790757381, + "grad_norm": 2.5282394886016846, + "learning_rate": 9.134788189987162e-06, + "loss": 0.8085, + "step": 2374 + }, + { + "epoch": 3.048780487804878, + "grad_norm": 5.709721088409424, + "learning_rate": 9.138639281129655e-06, + "loss": 0.7369, + "step": 2375 + }, + { + "epoch": 3.0500641848523746, + "grad_norm": 2.160633087158203, + "learning_rate": 9.142490372272144e-06, + "loss": 0.7339, + "step": 2376 + }, + { + "epoch": 
3.0513478818998716, + "grad_norm": 3.692995548248291, + "learning_rate": 9.146341463414634e-06, + "loss": 0.7581, + "step": 2377 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 2.232238292694092, + "learning_rate": 9.150192554557125e-06, + "loss": 0.7591, + "step": 2378 + }, + { + "epoch": 3.053915275994865, + "grad_norm": 3.2158141136169434, + "learning_rate": 9.154043645699616e-06, + "loss": 0.7931, + "step": 2379 + }, + { + "epoch": 3.055198973042362, + "grad_norm": 3.165811777114868, + "learning_rate": 9.157894736842105e-06, + "loss": 0.7738, + "step": 2380 + }, + { + "epoch": 3.0564826700898586, + "grad_norm": 2.826587677001953, + "learning_rate": 9.161745827984596e-06, + "loss": 0.8004, + "step": 2381 + }, + { + "epoch": 3.0577663671373556, + "grad_norm": 2.9806008338928223, + "learning_rate": 9.165596919127086e-06, + "loss": 0.7343, + "step": 2382 + }, + { + "epoch": 3.0590500641848526, + "grad_norm": 2.983980894088745, + "learning_rate": 9.169448010269577e-06, + "loss": 0.789, + "step": 2383 + }, + { + "epoch": 3.060333761232349, + "grad_norm": 2.8929009437561035, + "learning_rate": 9.173299101412068e-06, + "loss": 0.8658, + "step": 2384 + }, + { + "epoch": 3.061617458279846, + "grad_norm": 6.77714729309082, + "learning_rate": 9.177150192554557e-06, + "loss": 0.7806, + "step": 2385 + }, + { + "epoch": 3.0629011553273426, + "grad_norm": 9.112574577331543, + "learning_rate": 9.181001283697047e-06, + "loss": 0.9202, + "step": 2386 + }, + { + "epoch": 3.0641848523748396, + "grad_norm": 4.300556182861328, + "learning_rate": 9.184852374839538e-06, + "loss": 1.0443, + "step": 2387 + }, + { + "epoch": 3.065468549422336, + "grad_norm": 1.5341871976852417, + "learning_rate": 9.188703465982029e-06, + "loss": 0.7001, + "step": 2388 + }, + { + "epoch": 3.066752246469833, + "grad_norm": 1.8849012851715088, + "learning_rate": 9.192554557124518e-06, + "loss": 0.6974, + "step": 2389 + }, + { + "epoch": 3.06803594351733, + "grad_norm": 2.166802406311035, + 
"learning_rate": 9.19640564826701e-06, + "loss": 0.6837, + "step": 2390 + }, + { + "epoch": 3.0693196405648266, + "grad_norm": 1.7537646293640137, + "learning_rate": 9.2002567394095e-06, + "loss": 0.6537, + "step": 2391 + }, + { + "epoch": 3.0706033376123236, + "grad_norm": 1.552445411682129, + "learning_rate": 9.20410783055199e-06, + "loss": 0.6421, + "step": 2392 + }, + { + "epoch": 3.07188703465982, + "grad_norm": 1.638474464416504, + "learning_rate": 9.207958921694479e-06, + "loss": 0.6914, + "step": 2393 + }, + { + "epoch": 3.073170731707317, + "grad_norm": 2.4698147773742676, + "learning_rate": 9.211810012836972e-06, + "loss": 0.6478, + "step": 2394 + }, + { + "epoch": 3.0744544287548137, + "grad_norm": 2.2801990509033203, + "learning_rate": 9.215661103979461e-06, + "loss": 0.6664, + "step": 2395 + }, + { + "epoch": 3.0757381258023107, + "grad_norm": 1.5799906253814697, + "learning_rate": 9.21951219512195e-06, + "loss": 0.6791, + "step": 2396 + }, + { + "epoch": 3.0770218228498076, + "grad_norm": 1.86100172996521, + "learning_rate": 9.223363286264442e-06, + "loss": 0.6473, + "step": 2397 + }, + { + "epoch": 3.078305519897304, + "grad_norm": 5.366511821746826, + "learning_rate": 9.227214377406933e-06, + "loss": 0.6622, + "step": 2398 + }, + { + "epoch": 3.079589216944801, + "grad_norm": 2.0688390731811523, + "learning_rate": 9.231065468549422e-06, + "loss": 0.6512, + "step": 2399 + }, + { + "epoch": 3.0808729139922977, + "grad_norm": 2.5611162185668945, + "learning_rate": 9.234916559691913e-06, + "loss": 0.685, + "step": 2400 + }, + { + "epoch": 3.0821566110397947, + "grad_norm": 3.312483549118042, + "learning_rate": 9.238767650834403e-06, + "loss": 0.6671, + "step": 2401 + }, + { + "epoch": 3.083440308087291, + "grad_norm": 2.1584484577178955, + "learning_rate": 9.242618741976894e-06, + "loss": 0.7438, + "step": 2402 + }, + { + "epoch": 3.084724005134788, + "grad_norm": 2.3419992923736572, + "learning_rate": 9.246469833119385e-06, + "loss": 0.669, + "step": 
2403 + }, + { + "epoch": 3.086007702182285, + "grad_norm": 1.3946990966796875, + "learning_rate": 9.250320924261874e-06, + "loss": 0.6954, + "step": 2404 + }, + { + "epoch": 3.0872913992297817, + "grad_norm": 2.110443115234375, + "learning_rate": 9.254172015404363e-06, + "loss": 0.6968, + "step": 2405 + }, + { + "epoch": 3.0885750962772787, + "grad_norm": 1.7148596048355103, + "learning_rate": 9.258023106546856e-06, + "loss": 0.6987, + "step": 2406 + }, + { + "epoch": 3.089858793324775, + "grad_norm": 2.0458242893218994, + "learning_rate": 9.261874197689346e-06, + "loss": 0.6803, + "step": 2407 + }, + { + "epoch": 3.091142490372272, + "grad_norm": 2.8675200939178467, + "learning_rate": 9.265725288831835e-06, + "loss": 0.6788, + "step": 2408 + }, + { + "epoch": 3.092426187419769, + "grad_norm": 2.485707998275757, + "learning_rate": 9.269576379974326e-06, + "loss": 0.6721, + "step": 2409 + }, + { + "epoch": 3.0937098844672657, + "grad_norm": 2.834352493286133, + "learning_rate": 9.273427471116817e-06, + "loss": 0.6613, + "step": 2410 + }, + { + "epoch": 3.0949935815147627, + "grad_norm": 2.0163776874542236, + "learning_rate": 9.277278562259306e-06, + "loss": 0.6611, + "step": 2411 + }, + { + "epoch": 3.096277278562259, + "grad_norm": 2.8616855144500732, + "learning_rate": 9.281129653401798e-06, + "loss": 0.7066, + "step": 2412 + }, + { + "epoch": 3.097560975609756, + "grad_norm": 6.78464937210083, + "learning_rate": 9.284980744544289e-06, + "loss": 0.6647, + "step": 2413 + }, + { + "epoch": 3.0988446726572527, + "grad_norm": 2.3900492191314697, + "learning_rate": 9.288831835686778e-06, + "loss": 0.7098, + "step": 2414 + }, + { + "epoch": 3.1001283697047497, + "grad_norm": 2.4187610149383545, + "learning_rate": 9.292682926829269e-06, + "loss": 0.6878, + "step": 2415 + }, + { + "epoch": 3.1014120667522467, + "grad_norm": 2.528691291809082, + "learning_rate": 9.296534017971758e-06, + "loss": 0.6809, + "step": 2416 + }, + { + "epoch": 3.102695763799743, + "grad_norm": 
2.5034923553466797, + "learning_rate": 9.30038510911425e-06, + "loss": 0.745, + "step": 2417 + }, + { + "epoch": 3.10397946084724, + "grad_norm": 2.406090021133423, + "learning_rate": 9.30423620025674e-06, + "loss": 0.6445, + "step": 2418 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 2.493788719177246, + "learning_rate": 9.30808729139923e-06, + "loss": 0.7026, + "step": 2419 + }, + { + "epoch": 3.1065468549422337, + "grad_norm": 2.3980588912963867, + "learning_rate": 9.31193838254172e-06, + "loss": 0.7425, + "step": 2420 + }, + { + "epoch": 3.1078305519897302, + "grad_norm": 2.5383551120758057, + "learning_rate": 9.315789473684212e-06, + "loss": 0.7019, + "step": 2421 + }, + { + "epoch": 3.109114249037227, + "grad_norm": 2.1249351501464844, + "learning_rate": 9.319640564826701e-06, + "loss": 0.7152, + "step": 2422 + }, + { + "epoch": 3.110397946084724, + "grad_norm": 2.263658285140991, + "learning_rate": 9.323491655969191e-06, + "loss": 0.7525, + "step": 2423 + }, + { + "epoch": 3.1116816431322207, + "grad_norm": 2.606584072113037, + "learning_rate": 9.327342747111682e-06, + "loss": 0.788, + "step": 2424 + }, + { + "epoch": 3.1129653401797177, + "grad_norm": 2.3829281330108643, + "learning_rate": 9.331193838254173e-06, + "loss": 0.7805, + "step": 2425 + }, + { + "epoch": 3.1142490372272142, + "grad_norm": 2.0398693084716797, + "learning_rate": 9.335044929396662e-06, + "loss": 0.7888, + "step": 2426 + }, + { + "epoch": 3.1155327342747112, + "grad_norm": 2.460555076599121, + "learning_rate": 9.338896020539153e-06, + "loss": 0.7218, + "step": 2427 + }, + { + "epoch": 3.1168164313222078, + "grad_norm": 3.019925117492676, + "learning_rate": 9.342747111681643e-06, + "loss": 0.7847, + "step": 2428 + }, + { + "epoch": 3.1181001283697047, + "grad_norm": 2.6943771839141846, + "learning_rate": 9.346598202824134e-06, + "loss": 0.8234, + "step": 2429 + }, + { + "epoch": 3.1193838254172017, + "grad_norm": 3.591064691543579, + "learning_rate": 9.350449293966625e-06, + 
"loss": 0.7681, + "step": 2430 + }, + { + "epoch": 3.1206675224646983, + "grad_norm": 2.236135721206665, + "learning_rate": 9.354300385109114e-06, + "loss": 0.7695, + "step": 2431 + }, + { + "epoch": 3.1219512195121952, + "grad_norm": 2.0993361473083496, + "learning_rate": 9.358151476251604e-06, + "loss": 0.7737, + "step": 2432 + }, + { + "epoch": 3.1232349165596918, + "grad_norm": 2.4853687286376953, + "learning_rate": 9.362002567394096e-06, + "loss": 0.8398, + "step": 2433 + }, + { + "epoch": 3.1245186136071887, + "grad_norm": 3.3764772415161133, + "learning_rate": 9.365853658536586e-06, + "loss": 0.8423, + "step": 2434 + }, + { + "epoch": 3.1258023106546853, + "grad_norm": 6.9105963706970215, + "learning_rate": 9.369704749679075e-06, + "loss": 0.8679, + "step": 2435 + }, + { + "epoch": 3.1270860077021823, + "grad_norm": 3.4740774631500244, + "learning_rate": 9.373555840821566e-06, + "loss": 0.9297, + "step": 2436 + }, + { + "epoch": 3.1283697047496792, + "grad_norm": 3.446113109588623, + "learning_rate": 9.377406931964057e-06, + "loss": 1.0658, + "step": 2437 + }, + { + "epoch": 3.1296534017971758, + "grad_norm": 2.5678396224975586, + "learning_rate": 9.381258023106547e-06, + "loss": 0.7101, + "step": 2438 + }, + { + "epoch": 3.1309370988446728, + "grad_norm": 1.654732584953308, + "learning_rate": 9.385109114249036e-06, + "loss": 0.6658, + "step": 2439 + }, + { + "epoch": 3.1322207958921693, + "grad_norm": 2.246267795562744, + "learning_rate": 9.388960205391529e-06, + "loss": 0.6472, + "step": 2440 + }, + { + "epoch": 3.1335044929396663, + "grad_norm": 2.397711992263794, + "learning_rate": 9.392811296534018e-06, + "loss": 0.657, + "step": 2441 + }, + { + "epoch": 3.1347881899871632, + "grad_norm": 2.1072590351104736, + "learning_rate": 9.396662387676508e-06, + "loss": 0.6866, + "step": 2442 + }, + { + "epoch": 3.1360718870346598, + "grad_norm": 1.6506563425064087, + "learning_rate": 9.400513478818999e-06, + "loss": 0.6519, + "step": 2443 + }, + { + "epoch": 
3.1373555840821568, + "grad_norm": 1.6300950050354004, + "learning_rate": 9.40436456996149e-06, + "loss": 0.7106, + "step": 2444 + }, + { + "epoch": 3.1386392811296533, + "grad_norm": 1.8177329301834106, + "learning_rate": 9.40821566110398e-06, + "loss": 0.6909, + "step": 2445 + }, + { + "epoch": 3.1399229781771503, + "grad_norm": 1.9296201467514038, + "learning_rate": 9.41206675224647e-06, + "loss": 0.6777, + "step": 2446 + }, + { + "epoch": 3.141206675224647, + "grad_norm": 1.883845329284668, + "learning_rate": 9.41591784338896e-06, + "loss": 0.6775, + "step": 2447 + }, + { + "epoch": 3.142490372272144, + "grad_norm": 1.6102008819580078, + "learning_rate": 9.41976893453145e-06, + "loss": 0.6696, + "step": 2448 + }, + { + "epoch": 3.1437740693196408, + "grad_norm": 4.904893398284912, + "learning_rate": 9.423620025673942e-06, + "loss": 0.6781, + "step": 2449 + }, + { + "epoch": 3.1450577663671373, + "grad_norm": 1.720186710357666, + "learning_rate": 9.427471116816431e-06, + "loss": 0.6539, + "step": 2450 + }, + { + "epoch": 3.1463414634146343, + "grad_norm": 3.933713436126709, + "learning_rate": 9.43132220795892e-06, + "loss": 0.7121, + "step": 2451 + }, + { + "epoch": 3.147625160462131, + "grad_norm": 2.8619043827056885, + "learning_rate": 9.435173299101413e-06, + "loss": 0.6747, + "step": 2452 + }, + { + "epoch": 3.148908857509628, + "grad_norm": 1.6462944746017456, + "learning_rate": 9.439024390243903e-06, + "loss": 0.6911, + "step": 2453 + }, + { + "epoch": 3.1501925545571243, + "grad_norm": 2.986778736114502, + "learning_rate": 9.442875481386392e-06, + "loss": 0.6646, + "step": 2454 + }, + { + "epoch": 3.1514762516046213, + "grad_norm": 1.6767094135284424, + "learning_rate": 9.446726572528883e-06, + "loss": 0.6914, + "step": 2455 + }, + { + "epoch": 3.1527599486521183, + "grad_norm": 2.276010751724243, + "learning_rate": 9.450577663671374e-06, + "loss": 0.6682, + "step": 2456 + }, + { + "epoch": 3.154043645699615, + "grad_norm": 2.1815640926361084, + 
"learning_rate": 9.454428754813864e-06, + "loss": 0.7559, + "step": 2457 + }, + { + "epoch": 3.155327342747112, + "grad_norm": 1.862032175064087, + "learning_rate": 9.458279845956355e-06, + "loss": 0.6385, + "step": 2458 + }, + { + "epoch": 3.1566110397946083, + "grad_norm": 12.425849914550781, + "learning_rate": 9.462130937098846e-06, + "loss": 0.7052, + "step": 2459 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 1.8777782917022705, + "learning_rate": 9.465982028241335e-06, + "loss": 0.651, + "step": 2460 + }, + { + "epoch": 3.1591784338896023, + "grad_norm": 1.8475215435028076, + "learning_rate": 9.469833119383826e-06, + "loss": 0.6899, + "step": 2461 + }, + { + "epoch": 3.160462130937099, + "grad_norm": 1.5634076595306396, + "learning_rate": 9.473684210526315e-06, + "loss": 0.6655, + "step": 2462 + }, + { + "epoch": 3.161745827984596, + "grad_norm": 1.7393640279769897, + "learning_rate": 9.477535301668807e-06, + "loss": 0.6852, + "step": 2463 + }, + { + "epoch": 3.1630295250320923, + "grad_norm": 3.6600992679595947, + "learning_rate": 9.481386392811298e-06, + "loss": 0.7101, + "step": 2464 + }, + { + "epoch": 3.1643132220795893, + "grad_norm": 1.6988465785980225, + "learning_rate": 9.485237483953787e-06, + "loss": 0.6753, + "step": 2465 + }, + { + "epoch": 3.165596919127086, + "grad_norm": 1.5884363651275635, + "learning_rate": 9.489088575096276e-06, + "loss": 0.723, + "step": 2466 + }, + { + "epoch": 3.166880616174583, + "grad_norm": 2.0328733921051025, + "learning_rate": 9.492939666238769e-06, + "loss": 0.7397, + "step": 2467 + }, + { + "epoch": 3.1681643132220794, + "grad_norm": 2.866434335708618, + "learning_rate": 9.496790757381259e-06, + "loss": 0.6744, + "step": 2468 + }, + { + "epoch": 3.1694480102695763, + "grad_norm": 1.998705506324768, + "learning_rate": 9.500641848523748e-06, + "loss": 0.6898, + "step": 2469 + }, + { + "epoch": 3.1707317073170733, + "grad_norm": 2.840484142303467, + "learning_rate": 9.504492939666239e-06, + "loss": 0.7229, + 
"step": 2470 + }, + { + "epoch": 3.17201540436457, + "grad_norm": 2.1589643955230713, + "learning_rate": 9.50834403080873e-06, + "loss": 0.7675, + "step": 2471 + }, + { + "epoch": 3.173299101412067, + "grad_norm": 2.458932399749756, + "learning_rate": 9.51219512195122e-06, + "loss": 0.7599, + "step": 2472 + }, + { + "epoch": 3.1745827984595634, + "grad_norm": 2.395489454269409, + "learning_rate": 9.51604621309371e-06, + "loss": 0.7185, + "step": 2473 + }, + { + "epoch": 3.1758664955070603, + "grad_norm": 2.208071708679199, + "learning_rate": 9.5198973042362e-06, + "loss": 0.7055, + "step": 2474 + }, + { + "epoch": 3.1771501925545573, + "grad_norm": 2.6386783123016357, + "learning_rate": 9.523748395378691e-06, + "loss": 0.7405, + "step": 2475 + }, + { + "epoch": 3.178433889602054, + "grad_norm": 2.3254706859588623, + "learning_rate": 9.527599486521182e-06, + "loss": 0.6924, + "step": 2476 + }, + { + "epoch": 3.179717586649551, + "grad_norm": 2.5932185649871826, + "learning_rate": 9.531450577663671e-06, + "loss": 0.7316, + "step": 2477 + }, + { + "epoch": 3.1810012836970474, + "grad_norm": 2.328387975692749, + "learning_rate": 9.53530166880616e-06, + "loss": 0.7127, + "step": 2478 + }, + { + "epoch": 3.1822849807445444, + "grad_norm": 2.888840675354004, + "learning_rate": 9.539152759948654e-06, + "loss": 0.8047, + "step": 2479 + }, + { + "epoch": 3.183568677792041, + "grad_norm": 3.076129913330078, + "learning_rate": 9.543003851091143e-06, + "loss": 0.7724, + "step": 2480 + }, + { + "epoch": 3.184852374839538, + "grad_norm": 2.4889137744903564, + "learning_rate": 9.546854942233632e-06, + "loss": 0.7281, + "step": 2481 + }, + { + "epoch": 3.186136071887035, + "grad_norm": 2.9186465740203857, + "learning_rate": 9.550706033376125e-06, + "loss": 0.7592, + "step": 2482 + }, + { + "epoch": 3.1874197689345314, + "grad_norm": 2.492819309234619, + "learning_rate": 9.554557124518614e-06, + "loss": 0.8392, + "step": 2483 + }, + { + "epoch": 3.1887034659820284, + "grad_norm": 
2.951016426086426, + "learning_rate": 9.558408215661104e-06, + "loss": 0.8115, + "step": 2484 + }, + { + "epoch": 3.189987163029525, + "grad_norm": 5.50147819519043, + "learning_rate": 9.562259306803595e-06, + "loss": 0.9274, + "step": 2485 + }, + { + "epoch": 3.191270860077022, + "grad_norm": 2.729022979736328, + "learning_rate": 9.566110397946086e-06, + "loss": 0.9844, + "step": 2486 + }, + { + "epoch": 3.1925545571245184, + "grad_norm": 4.294699668884277, + "learning_rate": 9.569961489088575e-06, + "loss": 1.0922, + "step": 2487 + }, + { + "epoch": 3.1938382541720154, + "grad_norm": 1.5660442113876343, + "learning_rate": 9.573812580231065e-06, + "loss": 0.6733, + "step": 2488 + }, + { + "epoch": 3.1951219512195124, + "grad_norm": 1.824267029762268, + "learning_rate": 9.577663671373556e-06, + "loss": 0.6652, + "step": 2489 + }, + { + "epoch": 3.196405648267009, + "grad_norm": 1.7696256637573242, + "learning_rate": 9.581514762516047e-06, + "loss": 0.6552, + "step": 2490 + }, + { + "epoch": 3.197689345314506, + "grad_norm": 2.2354934215545654, + "learning_rate": 9.585365853658536e-06, + "loss": 0.6568, + "step": 2491 + }, + { + "epoch": 3.1989730423620024, + "grad_norm": 2.042802572250366, + "learning_rate": 9.589216944801027e-06, + "loss": 0.6696, + "step": 2492 + }, + { + "epoch": 3.2002567394094994, + "grad_norm": 3.8039016723632812, + "learning_rate": 9.593068035943517e-06, + "loss": 0.6659, + "step": 2493 + }, + { + "epoch": 3.2015404364569964, + "grad_norm": 1.7443662881851196, + "learning_rate": 9.596919127086008e-06, + "loss": 0.6477, + "step": 2494 + }, + { + "epoch": 3.202824133504493, + "grad_norm": 1.8784061670303345, + "learning_rate": 9.600770218228499e-06, + "loss": 0.7089, + "step": 2495 + }, + { + "epoch": 3.20410783055199, + "grad_norm": 1.706976056098938, + "learning_rate": 9.604621309370988e-06, + "loss": 0.6658, + "step": 2496 + }, + { + "epoch": 3.2053915275994864, + "grad_norm": 1.8325352668762207, + "learning_rate": 9.608472400513478e-06, + 
"loss": 0.7056, + "step": 2497 + }, + { + "epoch": 3.2066752246469834, + "grad_norm": 2.2913334369659424, + "learning_rate": 9.61232349165597e-06, + "loss": 0.6377, + "step": 2498 + }, + { + "epoch": 3.20795892169448, + "grad_norm": 1.5883381366729736, + "learning_rate": 9.61617458279846e-06, + "loss": 0.6363, + "step": 2499 + }, + { + "epoch": 3.209242618741977, + "grad_norm": 1.2745251655578613, + "learning_rate": 9.620025673940949e-06, + "loss": 0.6658, + "step": 2500 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 1.7422935962677002, + "learning_rate": 9.62387676508344e-06, + "loss": 0.6855, + "step": 2501 + }, + { + "epoch": 3.2118100128369704, + "grad_norm": 2.3470349311828613, + "learning_rate": 9.627727856225931e-06, + "loss": 0.6818, + "step": 2502 + }, + { + "epoch": 3.2130937098844674, + "grad_norm": 1.4792391061782837, + "learning_rate": 9.63157894736842e-06, + "loss": 0.6598, + "step": 2503 + }, + { + "epoch": 3.214377406931964, + "grad_norm": 1.76111900806427, + "learning_rate": 9.635430038510912e-06, + "loss": 0.6275, + "step": 2504 + }, + { + "epoch": 3.215661103979461, + "grad_norm": 3.5816073417663574, + "learning_rate": 9.639281129653403e-06, + "loss": 0.6367, + "step": 2505 + }, + { + "epoch": 3.2169448010269575, + "grad_norm": 2.09645676612854, + "learning_rate": 9.643132220795892e-06, + "loss": 0.7007, + "step": 2506 + }, + { + "epoch": 3.2182284980744544, + "grad_norm": 1.9414172172546387, + "learning_rate": 9.646983311938383e-06, + "loss": 0.6733, + "step": 2507 + }, + { + "epoch": 3.2195121951219514, + "grad_norm": 1.3362104892730713, + "learning_rate": 9.650834403080873e-06, + "loss": 0.6889, + "step": 2508 + }, + { + "epoch": 3.220795892169448, + "grad_norm": 2.0286028385162354, + "learning_rate": 9.654685494223364e-06, + "loss": 0.6711, + "step": 2509 + }, + { + "epoch": 3.222079589216945, + "grad_norm": 4.032346725463867, + "learning_rate": 9.658536585365855e-06, + "loss": 0.6688, + "step": 2510 + }, + { + "epoch": 
3.2233632862644415, + "grad_norm": 1.955578088760376, + "learning_rate": 9.662387676508344e-06, + "loss": 0.6767, + "step": 2511 + }, + { + "epoch": 3.2246469833119384, + "grad_norm": 1.430690884590149, + "learning_rate": 9.666238767650833e-06, + "loss": 0.6796, + "step": 2512 + }, + { + "epoch": 3.225930680359435, + "grad_norm": 1.7800604104995728, + "learning_rate": 9.670089858793326e-06, + "loss": 0.6729, + "step": 2513 + }, + { + "epoch": 3.227214377406932, + "grad_norm": 1.8123184442520142, + "learning_rate": 9.673940949935816e-06, + "loss": 0.6993, + "step": 2514 + }, + { + "epoch": 3.228498074454429, + "grad_norm": 2.743222713470459, + "learning_rate": 9.677792041078305e-06, + "loss": 0.6712, + "step": 2515 + }, + { + "epoch": 3.2297817715019255, + "grad_norm": 3.069122314453125, + "learning_rate": 9.681643132220796e-06, + "loss": 0.6829, + "step": 2516 + }, + { + "epoch": 3.2310654685494224, + "grad_norm": 2.663360595703125, + "learning_rate": 9.685494223363287e-06, + "loss": 0.6946, + "step": 2517 + }, + { + "epoch": 3.232349165596919, + "grad_norm": 5.244438171386719, + "learning_rate": 9.689345314505776e-06, + "loss": 0.708, + "step": 2518 + }, + { + "epoch": 3.233632862644416, + "grad_norm": 2.7481515407562256, + "learning_rate": 9.693196405648268e-06, + "loss": 0.7732, + "step": 2519 + }, + { + "epoch": 3.2349165596919125, + "grad_norm": 2.3706092834472656, + "learning_rate": 9.697047496790757e-06, + "loss": 0.7332, + "step": 2520 + }, + { + "epoch": 3.2362002567394095, + "grad_norm": 4.70185661315918, + "learning_rate": 9.700898587933248e-06, + "loss": 0.6651, + "step": 2521 + }, + { + "epoch": 3.2374839537869065, + "grad_norm": 2.3072922229766846, + "learning_rate": 9.704749679075739e-06, + "loss": 0.7331, + "step": 2522 + }, + { + "epoch": 3.238767650834403, + "grad_norm": 2.4920785427093506, + "learning_rate": 9.708600770218228e-06, + "loss": 0.7203, + "step": 2523 + }, + { + "epoch": 3.2400513478819, + "grad_norm": 2.1816813945770264, + 
"learning_rate": 9.71245186136072e-06, + "loss": 0.6908, + "step": 2524 + }, + { + "epoch": 3.2413350449293965, + "grad_norm": 2.0309441089630127, + "learning_rate": 9.71630295250321e-06, + "loss": 0.7529, + "step": 2525 + }, + { + "epoch": 3.2426187419768935, + "grad_norm": 4.268661022186279, + "learning_rate": 9.7201540436457e-06, + "loss": 0.7241, + "step": 2526 + }, + { + "epoch": 3.2439024390243905, + "grad_norm": 3.710942029953003, + "learning_rate": 9.72400513478819e-06, + "loss": 0.7447, + "step": 2527 + }, + { + "epoch": 3.245186136071887, + "grad_norm": 3.858847141265869, + "learning_rate": 9.727856225930682e-06, + "loss": 0.7172, + "step": 2528 + }, + { + "epoch": 3.246469833119384, + "grad_norm": 2.5997045040130615, + "learning_rate": 9.731707317073171e-06, + "loss": 0.7475, + "step": 2529 + }, + { + "epoch": 3.2477535301668805, + "grad_norm": 2.1152217388153076, + "learning_rate": 9.735558408215661e-06, + "loss": 0.7201, + "step": 2530 + }, + { + "epoch": 3.2490372272143775, + "grad_norm": 3.457461357116699, + "learning_rate": 9.739409499358152e-06, + "loss": 0.7413, + "step": 2531 + }, + { + "epoch": 3.250320924261874, + "grad_norm": 2.8669941425323486, + "learning_rate": 9.743260590500643e-06, + "loss": 0.7776, + "step": 2532 + }, + { + "epoch": 3.251604621309371, + "grad_norm": 2.383333683013916, + "learning_rate": 9.747111681643132e-06, + "loss": 0.7523, + "step": 2533 + }, + { + "epoch": 3.2528883183568675, + "grad_norm": 3.391280174255371, + "learning_rate": 9.750962772785623e-06, + "loss": 0.7917, + "step": 2534 + }, + { + "epoch": 3.2541720154043645, + "grad_norm": 4.26154088973999, + "learning_rate": 9.754813863928113e-06, + "loss": 0.937, + "step": 2535 + }, + { + "epoch": 3.2554557124518615, + "grad_norm": 2.5984368324279785, + "learning_rate": 9.758664955070604e-06, + "loss": 0.893, + "step": 2536 + }, + { + "epoch": 3.256739409499358, + "grad_norm": 3.77146053314209, + "learning_rate": 9.762516046213095e-06, + "loss": 1.006, + "step": 2537 
+ }, + { + "epoch": 3.258023106546855, + "grad_norm": 3.017975091934204, + "learning_rate": 9.766367137355584e-06, + "loss": 0.6765, + "step": 2538 + }, + { + "epoch": 3.2593068035943515, + "grad_norm": 1.2657771110534668, + "learning_rate": 9.770218228498074e-06, + "loss": 0.6787, + "step": 2539 + }, + { + "epoch": 3.2605905006418485, + "grad_norm": 3.2487432956695557, + "learning_rate": 9.774069319640565e-06, + "loss": 0.6627, + "step": 2540 + }, + { + "epoch": 3.2618741976893455, + "grad_norm": 2.6313841342926025, + "learning_rate": 9.777920410783056e-06, + "loss": 0.6936, + "step": 2541 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 4.684909343719482, + "learning_rate": 9.781771501925545e-06, + "loss": 0.6631, + "step": 2542 + }, + { + "epoch": 3.264441591784339, + "grad_norm": 1.7175875902175903, + "learning_rate": 9.785622593068035e-06, + "loss": 0.6432, + "step": 2543 + }, + { + "epoch": 3.2657252888318355, + "grad_norm": 1.640836477279663, + "learning_rate": 9.789473684210527e-06, + "loss": 0.7195, + "step": 2544 + }, + { + "epoch": 3.2670089858793325, + "grad_norm": 1.9716116189956665, + "learning_rate": 9.793324775353017e-06, + "loss": 0.665, + "step": 2545 + }, + { + "epoch": 3.2682926829268295, + "grad_norm": 1.390350341796875, + "learning_rate": 9.797175866495506e-06, + "loss": 0.644, + "step": 2546 + }, + { + "epoch": 3.269576379974326, + "grad_norm": 1.7749978303909302, + "learning_rate": 9.801026957637999e-06, + "loss": 0.6797, + "step": 2547 + }, + { + "epoch": 3.270860077021823, + "grad_norm": 1.8695571422576904, + "learning_rate": 9.804878048780488e-06, + "loss": 0.7369, + "step": 2548 + }, + { + "epoch": 3.2721437740693196, + "grad_norm": 2.5882723331451416, + "learning_rate": 9.808729139922978e-06, + "loss": 0.6334, + "step": 2549 + }, + { + "epoch": 3.2734274711168165, + "grad_norm": 1.5827269554138184, + "learning_rate": 9.812580231065469e-06, + "loss": 0.6422, + "step": 2550 + }, + { + "epoch": 3.274711168164313, + "grad_norm": 
2.14029860496521, + "learning_rate": 9.81643132220796e-06, + "loss": 0.6707, + "step": 2551 + }, + { + "epoch": 3.27599486521181, + "grad_norm": 9.572820663452148, + "learning_rate": 9.82028241335045e-06, + "loss": 0.6955, + "step": 2552 + }, + { + "epoch": 3.2772785622593066, + "grad_norm": 2.394468307495117, + "learning_rate": 9.82413350449294e-06, + "loss": 0.6712, + "step": 2553 + }, + { + "epoch": 3.2785622593068036, + "grad_norm": 2.0095090866088867, + "learning_rate": 9.82798459563543e-06, + "loss": 0.6945, + "step": 2554 + }, + { + "epoch": 3.2798459563543005, + "grad_norm": 1.3282877206802368, + "learning_rate": 9.83183568677792e-06, + "loss": 0.6613, + "step": 2555 + }, + { + "epoch": 3.281129653401797, + "grad_norm": 2.0237998962402344, + "learning_rate": 9.835686777920412e-06, + "loss": 0.6445, + "step": 2556 + }, + { + "epoch": 3.282413350449294, + "grad_norm": 2.165588617324829, + "learning_rate": 9.839537869062901e-06, + "loss": 0.6826, + "step": 2557 + }, + { + "epoch": 3.2836970474967906, + "grad_norm": 1.6499741077423096, + "learning_rate": 9.84338896020539e-06, + "loss": 0.6841, + "step": 2558 + }, + { + "epoch": 3.2849807445442876, + "grad_norm": 3.7708845138549805, + "learning_rate": 9.847240051347883e-06, + "loss": 0.6494, + "step": 2559 + }, + { + "epoch": 3.2862644415917845, + "grad_norm": 8.40917682647705, + "learning_rate": 9.851091142490373e-06, + "loss": 0.7419, + "step": 2560 + }, + { + "epoch": 3.287548138639281, + "grad_norm": 1.858520746231079, + "learning_rate": 9.854942233632862e-06, + "loss": 0.7184, + "step": 2561 + }, + { + "epoch": 3.288831835686778, + "grad_norm": 1.5316368341445923, + "learning_rate": 9.858793324775353e-06, + "loss": 0.682, + "step": 2562 + }, + { + "epoch": 3.2901155327342746, + "grad_norm": 1.9896385669708252, + "learning_rate": 9.862644415917844e-06, + "loss": 0.6914, + "step": 2563 + }, + { + "epoch": 3.2913992297817716, + "grad_norm": 1.5724245309829712, + "learning_rate": 9.866495507060334e-06, + 
"loss": 0.7051, + "step": 2564 + }, + { + "epoch": 3.292682926829268, + "grad_norm": 4.058265686035156, + "learning_rate": 9.870346598202825e-06, + "loss": 0.7417, + "step": 2565 + }, + { + "epoch": 3.293966623876765, + "grad_norm": 1.9844075441360474, + "learning_rate": 9.874197689345314e-06, + "loss": 0.7194, + "step": 2566 + }, + { + "epoch": 3.295250320924262, + "grad_norm": 5.136328220367432, + "learning_rate": 9.878048780487805e-06, + "loss": 0.7024, + "step": 2567 + }, + { + "epoch": 3.2965340179717586, + "grad_norm": 2.696220874786377, + "learning_rate": 9.881899871630296e-06, + "loss": 0.7302, + "step": 2568 + }, + { + "epoch": 3.2978177150192556, + "grad_norm": 2.0439581871032715, + "learning_rate": 9.885750962772785e-06, + "loss": 0.7282, + "step": 2569 + }, + { + "epoch": 3.299101412066752, + "grad_norm": 2.5489721298217773, + "learning_rate": 9.889602053915277e-06, + "loss": 0.7301, + "step": 2570 + }, + { + "epoch": 3.300385109114249, + "grad_norm": 5.20399284362793, + "learning_rate": 9.893453145057768e-06, + "loss": 0.7356, + "step": 2571 + }, + { + "epoch": 3.3016688061617456, + "grad_norm": 2.251498222351074, + "learning_rate": 9.897304236200257e-06, + "loss": 0.7308, + "step": 2572 + }, + { + "epoch": 3.3029525032092426, + "grad_norm": 2.5346872806549072, + "learning_rate": 9.901155327342746e-06, + "loss": 0.6805, + "step": 2573 + }, + { + "epoch": 3.3042362002567396, + "grad_norm": 3.494720935821533, + "learning_rate": 9.905006418485239e-06, + "loss": 0.7332, + "step": 2574 + }, + { + "epoch": 3.305519897304236, + "grad_norm": 3.0152790546417236, + "learning_rate": 9.908857509627729e-06, + "loss": 0.7318, + "step": 2575 + }, + { + "epoch": 3.306803594351733, + "grad_norm": 2.989993095397949, + "learning_rate": 9.912708600770218e-06, + "loss": 0.7246, + "step": 2576 + }, + { + "epoch": 3.3080872913992296, + "grad_norm": 3.006300210952759, + "learning_rate": 9.916559691912709e-06, + "loss": 0.7004, + "step": 2577 + }, + { + "epoch": 
3.3093709884467266, + "grad_norm": 2.106321096420288, + "learning_rate": 9.9204107830552e-06, + "loss": 0.7548, + "step": 2578 + }, + { + "epoch": 3.3106546854942236, + "grad_norm": 4.522500038146973, + "learning_rate": 9.92426187419769e-06, + "loss": 0.7691, + "step": 2579 + }, + { + "epoch": 3.31193838254172, + "grad_norm": 4.361861228942871, + "learning_rate": 9.92811296534018e-06, + "loss": 0.7862, + "step": 2580 + }, + { + "epoch": 3.313222079589217, + "grad_norm": 3.024074077606201, + "learning_rate": 9.93196405648267e-06, + "loss": 0.7038, + "step": 2581 + }, + { + "epoch": 3.3145057766367136, + "grad_norm": 2.705073356628418, + "learning_rate": 9.935815147625161e-06, + "loss": 0.8218, + "step": 2582 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 2.3161964416503906, + "learning_rate": 9.939666238767652e-06, + "loss": 0.8069, + "step": 2583 + }, + { + "epoch": 3.317073170731707, + "grad_norm": 2.495593547821045, + "learning_rate": 9.943517329910141e-06, + "loss": 0.8323, + "step": 2584 + }, + { + "epoch": 3.318356867779204, + "grad_norm": 2.892319679260254, + "learning_rate": 9.94736842105263e-06, + "loss": 0.8038, + "step": 2585 + }, + { + "epoch": 3.3196405648267007, + "grad_norm": 5.871700763702393, + "learning_rate": 9.951219512195124e-06, + "loss": 0.9054, + "step": 2586 + }, + { + "epoch": 3.3209242618741976, + "grad_norm": 4.821722507476807, + "learning_rate": 9.955070603337613e-06, + "loss": 1.0044, + "step": 2587 + }, + { + "epoch": 3.3222079589216946, + "grad_norm": 1.7482359409332275, + "learning_rate": 9.958921694480102e-06, + "loss": 0.6726, + "step": 2588 + }, + { + "epoch": 3.323491655969191, + "grad_norm": 2.077394962310791, + "learning_rate": 9.962772785622593e-06, + "loss": 0.6404, + "step": 2589 + }, + { + "epoch": 3.324775353016688, + "grad_norm": 2.400057315826416, + "learning_rate": 9.966623876765084e-06, + "loss": 0.6539, + "step": 2590 + }, + { + "epoch": 3.3260590500641847, + "grad_norm": 1.9741259813308716, + "learning_rate": 
9.970474967907574e-06, + "loss": 0.7087, + "step": 2591 + }, + { + "epoch": 3.3273427471116817, + "grad_norm": 3.8041129112243652, + "learning_rate": 9.974326059050063e-06, + "loss": 0.6873, + "step": 2592 + }, + { + "epoch": 3.3286264441591786, + "grad_norm": 1.5027475357055664, + "learning_rate": 9.978177150192556e-06, + "loss": 0.6563, + "step": 2593 + }, + { + "epoch": 3.329910141206675, + "grad_norm": 3.2767715454101562, + "learning_rate": 9.982028241335045e-06, + "loss": 0.6209, + "step": 2594 + }, + { + "epoch": 3.331193838254172, + "grad_norm": 1.831133484840393, + "learning_rate": 9.985879332477535e-06, + "loss": 0.675, + "step": 2595 + }, + { + "epoch": 3.3324775353016687, + "grad_norm": 2.574585199356079, + "learning_rate": 9.989730423620026e-06, + "loss": 0.6436, + "step": 2596 + }, + { + "epoch": 3.3337612323491657, + "grad_norm": 10.255441665649414, + "learning_rate": 9.993581514762517e-06, + "loss": 0.7006, + "step": 2597 + }, + { + "epoch": 3.335044929396662, + "grad_norm": 1.7449803352355957, + "learning_rate": 9.997432605905006e-06, + "loss": 0.6611, + "step": 2598 + }, + { + "epoch": 3.336328626444159, + "grad_norm": 2.5885207653045654, + "learning_rate": 1.0001283697047497e-05, + "loss": 0.7314, + "step": 2599 + }, + { + "epoch": 3.337612323491656, + "grad_norm": 1.5108708143234253, + "learning_rate": 1.0005134788189987e-05, + "loss": 0.6865, + "step": 2600 + }, + { + "epoch": 3.3388960205391527, + "grad_norm": 1.454325556755066, + "learning_rate": 1.0008985879332478e-05, + "loss": 0.6942, + "step": 2601 + }, + { + "epoch": 3.3401797175866497, + "grad_norm": 1.6301778554916382, + "learning_rate": 1.0012836970474969e-05, + "loss": 0.6821, + "step": 2602 + }, + { + "epoch": 3.341463414634146, + "grad_norm": 1.5830568075180054, + "learning_rate": 1.0016688061617458e-05, + "loss": 0.7276, + "step": 2603 + }, + { + "epoch": 3.342747111681643, + "grad_norm": 2.8989977836608887, + "learning_rate": 1.0020539152759948e-05, + "loss": 0.676, + "step": 2604 
+ }, + { + "epoch": 3.3440308087291397, + "grad_norm": 7.976902008056641, + "learning_rate": 1.002439024390244e-05, + "loss": 0.6677, + "step": 2605 + }, + { + "epoch": 3.3453145057766367, + "grad_norm": 2.565958023071289, + "learning_rate": 1.002824133504493e-05, + "loss": 0.6667, + "step": 2606 + }, + { + "epoch": 3.3465982028241337, + "grad_norm": 6.487285137176514, + "learning_rate": 1.0032092426187419e-05, + "loss": 0.6716, + "step": 2607 + }, + { + "epoch": 3.34788189987163, + "grad_norm": 2.7885935306549072, + "learning_rate": 1.003594351732991e-05, + "loss": 0.6657, + "step": 2608 + }, + { + "epoch": 3.349165596919127, + "grad_norm": 2.071410894393921, + "learning_rate": 1.0039794608472401e-05, + "loss": 0.7071, + "step": 2609 + }, + { + "epoch": 3.3504492939666237, + "grad_norm": 1.8216063976287842, + "learning_rate": 1.004364569961489e-05, + "loss": 0.686, + "step": 2610 + }, + { + "epoch": 3.3517329910141207, + "grad_norm": 1.6617577075958252, + "learning_rate": 1.0047496790757382e-05, + "loss": 0.6973, + "step": 2611 + }, + { + "epoch": 3.3530166880616177, + "grad_norm": 2.5453720092773438, + "learning_rate": 1.0051347881899871e-05, + "loss": 0.7055, + "step": 2612 + }, + { + "epoch": 3.354300385109114, + "grad_norm": 2.7993967533111572, + "learning_rate": 1.0055198973042362e-05, + "loss": 0.6759, + "step": 2613 + }, + { + "epoch": 3.355584082156611, + "grad_norm": 1.4948097467422485, + "learning_rate": 1.0059050064184853e-05, + "loss": 0.6959, + "step": 2614 + }, + { + "epoch": 3.3568677792041077, + "grad_norm": 1.7085416316986084, + "learning_rate": 1.0062901155327343e-05, + "loss": 0.6712, + "step": 2615 + }, + { + "epoch": 3.3581514762516047, + "grad_norm": 2.334636926651001, + "learning_rate": 1.0066752246469834e-05, + "loss": 0.674, + "step": 2616 + }, + { + "epoch": 3.3594351732991012, + "grad_norm": 30.597360610961914, + "learning_rate": 1.0070603337612325e-05, + "loss": 0.6669, + "step": 2617 + }, + { + "epoch": 3.360718870346598, + 
"grad_norm": 4.540186405181885, + "learning_rate": 1.0074454428754814e-05, + "loss": 0.6498, + "step": 2618 + }, + { + "epoch": 3.3620025673940948, + "grad_norm": 3.5724310874938965, + "learning_rate": 1.0078305519897303e-05, + "loss": 0.7024, + "step": 2619 + }, + { + "epoch": 3.3632862644415917, + "grad_norm": 2.3734123706817627, + "learning_rate": 1.0082156611039796e-05, + "loss": 0.708, + "step": 2620 + }, + { + "epoch": 3.3645699614890887, + "grad_norm": 2.7290289402008057, + "learning_rate": 1.0086007702182286e-05, + "loss": 0.7175, + "step": 2621 + }, + { + "epoch": 3.3658536585365852, + "grad_norm": 3.531831979751587, + "learning_rate": 1.0089858793324775e-05, + "loss": 0.6947, + "step": 2622 + }, + { + "epoch": 3.3671373555840822, + "grad_norm": 3.1566991806030273, + "learning_rate": 1.0093709884467266e-05, + "loss": 0.7254, + "step": 2623 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 2.1668848991394043, + "learning_rate": 1.0097560975609757e-05, + "loss": 0.7289, + "step": 2624 + }, + { + "epoch": 3.3697047496790757, + "grad_norm": 2.1822216510772705, + "learning_rate": 1.0101412066752246e-05, + "loss": 0.6768, + "step": 2625 + }, + { + "epoch": 3.3709884467265727, + "grad_norm": 2.442375898361206, + "learning_rate": 1.0105263157894738e-05, + "loss": 0.7048, + "step": 2626 + }, + { + "epoch": 3.3722721437740693, + "grad_norm": 3.8795149326324463, + "learning_rate": 1.0109114249037227e-05, + "loss": 0.7541, + "step": 2627 + }, + { + "epoch": 3.3735558408215662, + "grad_norm": 3.541533946990967, + "learning_rate": 1.0112965340179718e-05, + "loss": 0.7204, + "step": 2628 + }, + { + "epoch": 3.3748395378690628, + "grad_norm": 2.1666183471679688, + "learning_rate": 1.0116816431322209e-05, + "loss": 0.7744, + "step": 2629 + }, + { + "epoch": 3.3761232349165597, + "grad_norm": 3.5979068279266357, + "learning_rate": 1.0120667522464698e-05, + "loss": 0.7523, + "step": 2630 + }, + { + "epoch": 3.3774069319640567, + "grad_norm": 4.787511348724365, + 
"learning_rate": 1.0124518613607188e-05, + "loss": 0.7244, + "step": 2631 + }, + { + "epoch": 3.3786906290115533, + "grad_norm": 4.099532127380371, + "learning_rate": 1.012836970474968e-05, + "loss": 0.8178, + "step": 2632 + }, + { + "epoch": 3.3799743260590502, + "grad_norm": 4.069502353668213, + "learning_rate": 1.013222079589217e-05, + "loss": 0.7683, + "step": 2633 + }, + { + "epoch": 3.3812580231065468, + "grad_norm": 2.6777796745300293, + "learning_rate": 1.013607188703466e-05, + "loss": 0.8405, + "step": 2634 + }, + { + "epoch": 3.3825417201540438, + "grad_norm": 5.08932638168335, + "learning_rate": 1.013992297817715e-05, + "loss": 0.7793, + "step": 2635 + }, + { + "epoch": 3.3838254172015403, + "grad_norm": 3.687075614929199, + "learning_rate": 1.0143774069319641e-05, + "loss": 0.8689, + "step": 2636 + }, + { + "epoch": 3.3851091142490373, + "grad_norm": 3.927088737487793, + "learning_rate": 1.0147625160462131e-05, + "loss": 0.9702, + "step": 2637 + }, + { + "epoch": 3.386392811296534, + "grad_norm": 1.6923471689224243, + "learning_rate": 1.0151476251604622e-05, + "loss": 0.6693, + "step": 2638 + }, + { + "epoch": 3.387676508344031, + "grad_norm": 2.0049941539764404, + "learning_rate": 1.0155327342747113e-05, + "loss": 0.6531, + "step": 2639 + }, + { + "epoch": 3.3889602053915278, + "grad_norm": 1.5738885402679443, + "learning_rate": 1.0159178433889602e-05, + "loss": 0.6359, + "step": 2640 + }, + { + "epoch": 3.3902439024390243, + "grad_norm": 2.2502171993255615, + "learning_rate": 1.0163029525032093e-05, + "loss": 0.6404, + "step": 2641 + }, + { + "epoch": 3.3915275994865213, + "grad_norm": 1.640234351158142, + "learning_rate": 1.0166880616174583e-05, + "loss": 0.6809, + "step": 2642 + }, + { + "epoch": 3.392811296534018, + "grad_norm": 1.2786725759506226, + "learning_rate": 1.0170731707317074e-05, + "loss": 0.6567, + "step": 2643 + }, + { + "epoch": 3.394094993581515, + "grad_norm": 2.2593166828155518, + "learning_rate": 1.0174582798459563e-05, + "loss": 
0.695, + "step": 2644 + }, + { + "epoch": 3.3953786906290118, + "grad_norm": 1.4220527410507202, + "learning_rate": 1.0178433889602054e-05, + "loss": 0.6352, + "step": 2645 + }, + { + "epoch": 3.3966623876765083, + "grad_norm": 1.6717321872711182, + "learning_rate": 1.0182284980744544e-05, + "loss": 0.6118, + "step": 2646 + }, + { + "epoch": 3.3979460847240053, + "grad_norm": 3.75848388671875, + "learning_rate": 1.0186136071887035e-05, + "loss": 0.659, + "step": 2647 + }, + { + "epoch": 3.399229781771502, + "grad_norm": 1.724541187286377, + "learning_rate": 1.0189987163029526e-05, + "loss": 0.6699, + "step": 2648 + }, + { + "epoch": 3.400513478818999, + "grad_norm": 2.771974563598633, + "learning_rate": 1.0193838254172015e-05, + "loss": 0.6212, + "step": 2649 + }, + { + "epoch": 3.4017971758664953, + "grad_norm": 2.152066469192505, + "learning_rate": 1.0197689345314505e-05, + "loss": 0.6611, + "step": 2650 + }, + { + "epoch": 3.4030808729139923, + "grad_norm": 1.6945726871490479, + "learning_rate": 1.0201540436456997e-05, + "loss": 0.6624, + "step": 2651 + }, + { + "epoch": 3.404364569961489, + "grad_norm": 1.9470324516296387, + "learning_rate": 1.0205391527599487e-05, + "loss": 0.634, + "step": 2652 + }, + { + "epoch": 3.405648267008986, + "grad_norm": 3.0002262592315674, + "learning_rate": 1.0209242618741976e-05, + "loss": 0.6579, + "step": 2653 + }, + { + "epoch": 3.406931964056483, + "grad_norm": 1.7488524913787842, + "learning_rate": 1.0213093709884467e-05, + "loss": 0.6509, + "step": 2654 + }, + { + "epoch": 3.4082156611039793, + "grad_norm": 1.4402393102645874, + "learning_rate": 1.0216944801026958e-05, + "loss": 0.6622, + "step": 2655 + }, + { + "epoch": 3.4094993581514763, + "grad_norm": 1.8496170043945312, + "learning_rate": 1.0220795892169448e-05, + "loss": 0.6718, + "step": 2656 + }, + { + "epoch": 3.410783055198973, + "grad_norm": 3.756636619567871, + "learning_rate": 1.0224646983311939e-05, + "loss": 0.7399, + "step": 2657 + }, + { + "epoch": 
3.41206675224647, + "grad_norm": 1.8873943090438843, + "learning_rate": 1.0228498074454428e-05, + "loss": 0.6657, + "step": 2658 + }, + { + "epoch": 3.413350449293967, + "grad_norm": 1.8343943357467651, + "learning_rate": 1.0232349165596919e-05, + "loss": 0.7191, + "step": 2659 + }, + { + "epoch": 3.4146341463414633, + "grad_norm": 2.0018391609191895, + "learning_rate": 1.023620025673941e-05, + "loss": 0.6909, + "step": 2660 + }, + { + "epoch": 3.4159178433889603, + "grad_norm": 2.4849514961242676, + "learning_rate": 1.02400513478819e-05, + "loss": 0.6658, + "step": 2661 + }, + { + "epoch": 3.417201540436457, + "grad_norm": 11.364225387573242, + "learning_rate": 1.024390243902439e-05, + "loss": 0.6111, + "step": 2662 + }, + { + "epoch": 3.418485237483954, + "grad_norm": 2.292902708053589, + "learning_rate": 1.0247753530166882e-05, + "loss": 0.6767, + "step": 2663 + }, + { + "epoch": 3.419768934531451, + "grad_norm": 2.922189474105835, + "learning_rate": 1.0251604621309371e-05, + "loss": 0.6707, + "step": 2664 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 1.5623844861984253, + "learning_rate": 1.025545571245186e-05, + "loss": 0.7322, + "step": 2665 + }, + { + "epoch": 3.4223363286264443, + "grad_norm": 2.7736432552337646, + "learning_rate": 1.0259306803594353e-05, + "loss": 0.656, + "step": 2666 + }, + { + "epoch": 3.423620025673941, + "grad_norm": 1.8777670860290527, + "learning_rate": 1.0263157894736843e-05, + "loss": 0.6793, + "step": 2667 + }, + { + "epoch": 3.424903722721438, + "grad_norm": 1.7730512619018555, + "learning_rate": 1.0267008985879332e-05, + "loss": 0.6876, + "step": 2668 + }, + { + "epoch": 3.4261874197689344, + "grad_norm": 2.345289945602417, + "learning_rate": 1.0270860077021823e-05, + "loss": 0.7143, + "step": 2669 + }, + { + "epoch": 3.4274711168164314, + "grad_norm": 2.2299625873565674, + "learning_rate": 1.0274711168164314e-05, + "loss": 0.7319, + "step": 2670 + }, + { + "epoch": 3.428754813863928, + "grad_norm": 6.863197326660156, 
+ "learning_rate": 1.0278562259306804e-05, + "loss": 0.7369, + "step": 2671 + }, + { + "epoch": 3.430038510911425, + "grad_norm": 2.958873748779297, + "learning_rate": 1.0282413350449295e-05, + "loss": 0.7204, + "step": 2672 + }, + { + "epoch": 3.431322207958922, + "grad_norm": 5.3017706871032715, + "learning_rate": 1.0286264441591784e-05, + "loss": 0.7667, + "step": 2673 + }, + { + "epoch": 3.4326059050064184, + "grad_norm": 2.173703193664551, + "learning_rate": 1.0290115532734275e-05, + "loss": 0.717, + "step": 2674 + }, + { + "epoch": 3.4338896020539154, + "grad_norm": 2.1987478733062744, + "learning_rate": 1.0293966623876766e-05, + "loss": 0.7438, + "step": 2675 + }, + { + "epoch": 3.435173299101412, + "grad_norm": 2.896998405456543, + "learning_rate": 1.0297817715019255e-05, + "loss": 0.6866, + "step": 2676 + }, + { + "epoch": 3.436456996148909, + "grad_norm": 2.3428561687469482, + "learning_rate": 1.0301668806161745e-05, + "loss": 0.7522, + "step": 2677 + }, + { + "epoch": 3.437740693196406, + "grad_norm": 3.1956288814544678, + "learning_rate": 1.0305519897304238e-05, + "loss": 0.735, + "step": 2678 + }, + { + "epoch": 3.4390243902439024, + "grad_norm": 3.5329596996307373, + "learning_rate": 1.0309370988446727e-05, + "loss": 0.7609, + "step": 2679 + }, + { + "epoch": 3.4403080872913994, + "grad_norm": 2.3965768814086914, + "learning_rate": 1.0313222079589216e-05, + "loss": 0.7296, + "step": 2680 + }, + { + "epoch": 3.441591784338896, + "grad_norm": 2.9084808826446533, + "learning_rate": 1.0317073170731707e-05, + "loss": 0.7882, + "step": 2681 + }, + { + "epoch": 3.442875481386393, + "grad_norm": 5.431397914886475, + "learning_rate": 1.0320924261874199e-05, + "loss": 0.807, + "step": 2682 + }, + { + "epoch": 3.4441591784338894, + "grad_norm": 2.9117109775543213, + "learning_rate": 1.0324775353016688e-05, + "loss": 0.7856, + "step": 2683 + }, + { + "epoch": 3.4454428754813864, + "grad_norm": 1.9668043851852417, + "learning_rate": 1.0328626444159179e-05, + 
"loss": 0.7813, + "step": 2684 + }, + { + "epoch": 3.4467265725288834, + "grad_norm": 3.219043731689453, + "learning_rate": 1.033247753530167e-05, + "loss": 0.7878, + "step": 2685 + }, + { + "epoch": 3.44801026957638, + "grad_norm": 3.0886707305908203, + "learning_rate": 1.033632862644416e-05, + "loss": 0.8622, + "step": 2686 + }, + { + "epoch": 3.449293966623877, + "grad_norm": 3.565145492553711, + "learning_rate": 1.034017971758665e-05, + "loss": 1.014, + "step": 2687 + }, + { + "epoch": 3.4505776636713734, + "grad_norm": 1.9772050380706787, + "learning_rate": 1.034403080872914e-05, + "loss": 0.6819, + "step": 2688 + }, + { + "epoch": 3.4518613607188704, + "grad_norm": 1.3088300228118896, + "learning_rate": 1.0347881899871631e-05, + "loss": 0.6421, + "step": 2689 + }, + { + "epoch": 3.453145057766367, + "grad_norm": 1.2858500480651855, + "learning_rate": 1.0351732991014122e-05, + "loss": 0.6634, + "step": 2690 + }, + { + "epoch": 3.454428754813864, + "grad_norm": 2.2246499061584473, + "learning_rate": 1.0355584082156611e-05, + "loss": 0.7005, + "step": 2691 + }, + { + "epoch": 3.455712451861361, + "grad_norm": 2.1758694648742676, + "learning_rate": 1.03594351732991e-05, + "loss": 0.7249, + "step": 2692 + }, + { + "epoch": 3.4569961489088574, + "grad_norm": 2.7966580390930176, + "learning_rate": 1.0363286264441592e-05, + "loss": 0.652, + "step": 2693 + }, + { + "epoch": 3.4582798459563544, + "grad_norm": 2.0973262786865234, + "learning_rate": 1.0367137355584083e-05, + "loss": 0.6185, + "step": 2694 + }, + { + "epoch": 3.459563543003851, + "grad_norm": 2.800713300704956, + "learning_rate": 1.0370988446726572e-05, + "loss": 0.6968, + "step": 2695 + }, + { + "epoch": 3.460847240051348, + "grad_norm": 3.0424370765686035, + "learning_rate": 1.0374839537869062e-05, + "loss": 0.6456, + "step": 2696 + }, + { + "epoch": 3.462130937098845, + "grad_norm": 2.2481205463409424, + "learning_rate": 1.0378690629011554e-05, + "loss": 0.7371, + "step": 2697 + }, + { + "epoch": 
3.4634146341463414, + "grad_norm": 1.7109023332595825, + "learning_rate": 1.0382541720154044e-05, + "loss": 0.6735, + "step": 2698 + }, + { + "epoch": 3.4646983311938384, + "grad_norm": 1.65056312084198, + "learning_rate": 1.0386392811296533e-05, + "loss": 0.6283, + "step": 2699 + }, + { + "epoch": 3.465982028241335, + "grad_norm": 4.390285968780518, + "learning_rate": 1.0390243902439024e-05, + "loss": 0.681, + "step": 2700 + }, + { + "epoch": 3.467265725288832, + "grad_norm": 1.5667589902877808, + "learning_rate": 1.0394094993581515e-05, + "loss": 0.6853, + "step": 2701 + }, + { + "epoch": 3.4685494223363285, + "grad_norm": 2.007737874984741, + "learning_rate": 1.0397946084724005e-05, + "loss": 0.6437, + "step": 2702 + }, + { + "epoch": 3.4698331193838254, + "grad_norm": 4.287989616394043, + "learning_rate": 1.0401797175866496e-05, + "loss": 0.6258, + "step": 2703 + }, + { + "epoch": 3.471116816431322, + "grad_norm": 2.201944351196289, + "learning_rate": 1.0405648267008987e-05, + "loss": 0.717, + "step": 2704 + }, + { + "epoch": 3.472400513478819, + "grad_norm": 3.2241880893707275, + "learning_rate": 1.0409499358151476e-05, + "loss": 0.6677, + "step": 2705 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 2.2323505878448486, + "learning_rate": 1.0413350449293967e-05, + "loss": 0.686, + "step": 2706 + }, + { + "epoch": 3.4749679075738125, + "grad_norm": 2.5817039012908936, + "learning_rate": 1.0417201540436457e-05, + "loss": 0.6728, + "step": 2707 + }, + { + "epoch": 3.4762516046213094, + "grad_norm": 1.8959835767745972, + "learning_rate": 1.0421052631578948e-05, + "loss": 0.6552, + "step": 2708 + }, + { + "epoch": 3.477535301668806, + "grad_norm": 1.5487585067749023, + "learning_rate": 1.0424903722721439e-05, + "loss": 0.6197, + "step": 2709 + }, + { + "epoch": 3.478818998716303, + "grad_norm": 2.0449607372283936, + "learning_rate": 1.0428754813863928e-05, + "loss": 0.6832, + "step": 2710 + }, + { + "epoch": 3.4801026957638, + "grad_norm": 1.5700403451919556, 
+ "learning_rate": 1.0432605905006418e-05, + "loss": 0.7125, + "step": 2711 + }, + { + "epoch": 3.4813863928112965, + "grad_norm": 2.3290951251983643, + "learning_rate": 1.043645699614891e-05, + "loss": 0.6971, + "step": 2712 + }, + { + "epoch": 3.4826700898587934, + "grad_norm": 4.425687789916992, + "learning_rate": 1.04403080872914e-05, + "loss": 0.6507, + "step": 2713 + }, + { + "epoch": 3.48395378690629, + "grad_norm": 3.125411033630371, + "learning_rate": 1.0444159178433889e-05, + "loss": 0.7, + "step": 2714 + }, + { + "epoch": 3.485237483953787, + "grad_norm": 1.5769413709640503, + "learning_rate": 1.044801026957638e-05, + "loss": 0.6507, + "step": 2715 + }, + { + "epoch": 3.486521181001284, + "grad_norm": 2.3121981620788574, + "learning_rate": 1.0451861360718871e-05, + "loss": 0.6901, + "step": 2716 + }, + { + "epoch": 3.4878048780487805, + "grad_norm": 2.0128684043884277, + "learning_rate": 1.045571245186136e-05, + "loss": 0.6953, + "step": 2717 + }, + { + "epoch": 3.4890885750962775, + "grad_norm": 3.9026482105255127, + "learning_rate": 1.0459563543003852e-05, + "loss": 0.727, + "step": 2718 + }, + { + "epoch": 3.490372272143774, + "grad_norm": 1.6467941999435425, + "learning_rate": 1.0463414634146341e-05, + "loss": 0.6819, + "step": 2719 + }, + { + "epoch": 3.491655969191271, + "grad_norm": 1.6889469623565674, + "learning_rate": 1.0467265725288832e-05, + "loss": 0.6843, + "step": 2720 + }, + { + "epoch": 3.4929396662387675, + "grad_norm": 3.01202392578125, + "learning_rate": 1.0471116816431323e-05, + "loss": 0.7507, + "step": 2721 + }, + { + "epoch": 3.4942233632862645, + "grad_norm": 1.7998861074447632, + "learning_rate": 1.0474967907573813e-05, + "loss": 0.6829, + "step": 2722 + }, + { + "epoch": 3.495507060333761, + "grad_norm": 1.6339247226715088, + "learning_rate": 1.0478818998716302e-05, + "loss": 0.7335, + "step": 2723 + }, + { + "epoch": 3.496790757381258, + "grad_norm": 1.8176803588867188, + "learning_rate": 1.0482670089858795e-05, + "loss": 
0.7313, + "step": 2724 + }, + { + "epoch": 3.498074454428755, + "grad_norm": 2.4157323837280273, + "learning_rate": 1.0486521181001284e-05, + "loss": 0.6757, + "step": 2725 + }, + { + "epoch": 3.4993581514762515, + "grad_norm": 2.4111318588256836, + "learning_rate": 1.0490372272143773e-05, + "loss": 0.7542, + "step": 2726 + }, + { + "epoch": 3.5006418485237485, + "grad_norm": 2.2800240516662598, + "learning_rate": 1.0494223363286266e-05, + "loss": 0.6609, + "step": 2727 + }, + { + "epoch": 3.501925545571245, + "grad_norm": 3.4658279418945312, + "learning_rate": 1.0498074454428756e-05, + "loss": 0.7329, + "step": 2728 + }, + { + "epoch": 3.503209242618742, + "grad_norm": 2.6632635593414307, + "learning_rate": 1.0501925545571245e-05, + "loss": 0.7791, + "step": 2729 + }, + { + "epoch": 3.504492939666239, + "grad_norm": 3.1832828521728516, + "learning_rate": 1.0505776636713736e-05, + "loss": 0.762, + "step": 2730 + }, + { + "epoch": 3.5057766367137355, + "grad_norm": 5.023934364318848, + "learning_rate": 1.0509627727856227e-05, + "loss": 0.7284, + "step": 2731 + }, + { + "epoch": 3.5070603337612325, + "grad_norm": 2.7985947132110596, + "learning_rate": 1.0513478818998716e-05, + "loss": 0.7301, + "step": 2732 + }, + { + "epoch": 3.508344030808729, + "grad_norm": 4.390094757080078, + "learning_rate": 1.0517329910141208e-05, + "loss": 0.7593, + "step": 2733 + }, + { + "epoch": 3.509627727856226, + "grad_norm": 3.4498445987701416, + "learning_rate": 1.0521181001283697e-05, + "loss": 0.7554, + "step": 2734 + }, + { + "epoch": 3.510911424903723, + "grad_norm": 4.802944660186768, + "learning_rate": 1.0525032092426188e-05, + "loss": 0.8686, + "step": 2735 + }, + { + "epoch": 3.5121951219512195, + "grad_norm": 8.122929573059082, + "learning_rate": 1.0528883183568679e-05, + "loss": 0.8971, + "step": 2736 + }, + { + "epoch": 3.513478818998716, + "grad_norm": 3.0878679752349854, + "learning_rate": 1.0532734274711168e-05, + "loss": 1.1129, + "step": 2737 + }, + { + "epoch": 
3.514762516046213, + "grad_norm": 1.788765549659729, + "learning_rate": 1.0536585365853658e-05, + "loss": 0.6724, + "step": 2738 + }, + { + "epoch": 3.51604621309371, + "grad_norm": 1.8363198041915894, + "learning_rate": 1.054043645699615e-05, + "loss": 0.6492, + "step": 2739 + }, + { + "epoch": 3.5173299101412066, + "grad_norm": 2.1365373134613037, + "learning_rate": 1.054428754813864e-05, + "loss": 0.6503, + "step": 2740 + }, + { + "epoch": 3.5186136071887035, + "grad_norm": 1.9668110609054565, + "learning_rate": 1.054813863928113e-05, + "loss": 0.6571, + "step": 2741 + }, + { + "epoch": 3.5198973042362, + "grad_norm": 1.8018510341644287, + "learning_rate": 1.055198973042362e-05, + "loss": 0.6531, + "step": 2742 + }, + { + "epoch": 3.521181001283697, + "grad_norm": 1.4630478620529175, + "learning_rate": 1.0555840821566111e-05, + "loss": 0.6638, + "step": 2743 + }, + { + "epoch": 3.522464698331194, + "grad_norm": 2.1341710090637207, + "learning_rate": 1.05596919127086e-05, + "loss": 0.6327, + "step": 2744 + }, + { + "epoch": 3.5237483953786906, + "grad_norm": 2.2735159397125244, + "learning_rate": 1.056354300385109e-05, + "loss": 0.6567, + "step": 2745 + }, + { + "epoch": 3.5250320924261875, + "grad_norm": 2.9256818294525146, + "learning_rate": 1.0567394094993581e-05, + "loss": 0.6647, + "step": 2746 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 2.568089485168457, + "learning_rate": 1.0571245186136072e-05, + "loss": 0.6852, + "step": 2747 + }, + { + "epoch": 3.527599486521181, + "grad_norm": 1.8420544862747192, + "learning_rate": 1.0575096277278562e-05, + "loss": 0.6932, + "step": 2748 + }, + { + "epoch": 3.528883183568678, + "grad_norm": 1.5248273611068726, + "learning_rate": 1.0578947368421053e-05, + "loss": 0.6946, + "step": 2749 + }, + { + "epoch": 3.5301668806161746, + "grad_norm": 1.5497679710388184, + "learning_rate": 1.0582798459563544e-05, + "loss": 0.6257, + "step": 2750 + }, + { + "epoch": 3.531450577663671, + "grad_norm": 1.8012884855270386, + 
"learning_rate": 1.0586649550706033e-05, + "loss": 0.6085, + "step": 2751 + }, + { + "epoch": 3.532734274711168, + "grad_norm": 4.382095813751221, + "learning_rate": 1.0590500641848524e-05, + "loss": 0.6236, + "step": 2752 + }, + { + "epoch": 3.534017971758665, + "grad_norm": 2.1914174556732178, + "learning_rate": 1.0594351732991014e-05, + "loss": 0.6739, + "step": 2753 + }, + { + "epoch": 3.5353016688061616, + "grad_norm": 6.73301887512207, + "learning_rate": 1.0598202824133505e-05, + "loss": 0.6284, + "step": 2754 + }, + { + "epoch": 3.5365853658536586, + "grad_norm": 2.7553844451904297, + "learning_rate": 1.0602053915275996e-05, + "loss": 0.6676, + "step": 2755 + }, + { + "epoch": 3.537869062901155, + "grad_norm": 2.8822529315948486, + "learning_rate": 1.0605905006418485e-05, + "loss": 0.6339, + "step": 2756 + }, + { + "epoch": 3.539152759948652, + "grad_norm": 5.2742228507995605, + "learning_rate": 1.0609756097560975e-05, + "loss": 0.6277, + "step": 2757 + }, + { + "epoch": 3.540436456996149, + "grad_norm": 1.7863982915878296, + "learning_rate": 1.0613607188703467e-05, + "loss": 0.6674, + "step": 2758 + }, + { + "epoch": 3.5417201540436456, + "grad_norm": 2.876051664352417, + "learning_rate": 1.0617458279845957e-05, + "loss": 0.679, + "step": 2759 + }, + { + "epoch": 3.5430038510911426, + "grad_norm": 2.554091215133667, + "learning_rate": 1.0621309370988446e-05, + "loss": 0.6709, + "step": 2760 + }, + { + "epoch": 3.544287548138639, + "grad_norm": 2.3293848037719727, + "learning_rate": 1.0625160462130937e-05, + "loss": 0.6688, + "step": 2761 + }, + { + "epoch": 3.545571245186136, + "grad_norm": 3.528057098388672, + "learning_rate": 1.0629011553273428e-05, + "loss": 0.6939, + "step": 2762 + }, + { + "epoch": 3.546854942233633, + "grad_norm": 2.1664092540740967, + "learning_rate": 1.0632862644415918e-05, + "loss": 0.6728, + "step": 2763 + }, + { + "epoch": 3.5481386392811296, + "grad_norm": 2.8826305866241455, + "learning_rate": 1.0636713735558409e-05, + "loss": 
0.6472, + "step": 2764 + }, + { + "epoch": 3.5494223363286266, + "grad_norm": 2.8962128162384033, + "learning_rate": 1.0640564826700898e-05, + "loss": 0.6485, + "step": 2765 + }, + { + "epoch": 3.550706033376123, + "grad_norm": 2.8615052700042725, + "learning_rate": 1.0644415917843389e-05, + "loss": 0.7278, + "step": 2766 + }, + { + "epoch": 3.55198973042362, + "grad_norm": 1.5696109533309937, + "learning_rate": 1.064826700898588e-05, + "loss": 0.7244, + "step": 2767 + }, + { + "epoch": 3.553273427471117, + "grad_norm": 4.15334939956665, + "learning_rate": 1.065211810012837e-05, + "loss": 0.6543, + "step": 2768 + }, + { + "epoch": 3.5545571245186136, + "grad_norm": 2.7940566539764404, + "learning_rate": 1.0655969191270859e-05, + "loss": 0.6741, + "step": 2769 + }, + { + "epoch": 3.55584082156611, + "grad_norm": 2.8141818046569824, + "learning_rate": 1.0659820282413352e-05, + "loss": 0.6733, + "step": 2770 + }, + { + "epoch": 3.557124518613607, + "grad_norm": 2.3002448081970215, + "learning_rate": 1.0663671373555841e-05, + "loss": 0.7042, + "step": 2771 + }, + { + "epoch": 3.558408215661104, + "grad_norm": 3.001811981201172, + "learning_rate": 1.066752246469833e-05, + "loss": 0.6711, + "step": 2772 + }, + { + "epoch": 3.5596919127086006, + "grad_norm": 2.6376566886901855, + "learning_rate": 1.0671373555840823e-05, + "loss": 0.7488, + "step": 2773 + }, + { + "epoch": 3.5609756097560976, + "grad_norm": 4.62398624420166, + "learning_rate": 1.0675224646983313e-05, + "loss": 0.7995, + "step": 2774 + }, + { + "epoch": 3.562259306803594, + "grad_norm": 2.2808187007904053, + "learning_rate": 1.0679075738125802e-05, + "loss": 0.8004, + "step": 2775 + }, + { + "epoch": 3.563543003851091, + "grad_norm": 4.226193428039551, + "learning_rate": 1.0682926829268293e-05, + "loss": 0.7846, + "step": 2776 + }, + { + "epoch": 3.564826700898588, + "grad_norm": 2.225804567337036, + "learning_rate": 1.0686777920410784e-05, + "loss": 0.7682, + "step": 2777 + }, + { + "epoch": 
3.5661103979460846, + "grad_norm": 2.9855406284332275, + "learning_rate": 1.0690629011553274e-05, + "loss": 0.7358, + "step": 2778 + }, + { + "epoch": 3.5673940949935816, + "grad_norm": 3.539872646331787, + "learning_rate": 1.0694480102695765e-05, + "loss": 0.7947, + "step": 2779 + }, + { + "epoch": 3.568677792041078, + "grad_norm": 2.2244155406951904, + "learning_rate": 1.0698331193838254e-05, + "loss": 0.7542, + "step": 2780 + }, + { + "epoch": 3.569961489088575, + "grad_norm": 2.2028253078460693, + "learning_rate": 1.0702182284980745e-05, + "loss": 0.8178, + "step": 2781 + }, + { + "epoch": 3.571245186136072, + "grad_norm": 1.7794792652130127, + "learning_rate": 1.0706033376123236e-05, + "loss": 0.7807, + "step": 2782 + }, + { + "epoch": 3.5725288831835686, + "grad_norm": 2.918006420135498, + "learning_rate": 1.0709884467265725e-05, + "loss": 0.7761, + "step": 2783 + }, + { + "epoch": 3.5738125802310656, + "grad_norm": 2.64506196975708, + "learning_rate": 1.0713735558408215e-05, + "loss": 0.8007, + "step": 2784 + }, + { + "epoch": 3.575096277278562, + "grad_norm": 2.2880332469940186, + "learning_rate": 1.0717586649550708e-05, + "loss": 0.8047, + "step": 2785 + }, + { + "epoch": 3.576379974326059, + "grad_norm": 20.795516967773438, + "learning_rate": 1.0721437740693197e-05, + "loss": 0.8448, + "step": 2786 + }, + { + "epoch": 3.5776636713735557, + "grad_norm": 3.8426754474639893, + "learning_rate": 1.0725288831835686e-05, + "loss": 1.0371, + "step": 2787 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 2.4430694580078125, + "learning_rate": 1.0729139922978177e-05, + "loss": 0.6807, + "step": 2788 + }, + { + "epoch": 3.580231065468549, + "grad_norm": 4.516372203826904, + "learning_rate": 1.0732991014120669e-05, + "loss": 0.6478, + "step": 2789 + }, + { + "epoch": 3.581514762516046, + "grad_norm": 2.176475763320923, + "learning_rate": 1.0736842105263158e-05, + "loss": 0.6736, + "step": 2790 + }, + { + "epoch": 3.582798459563543, + "grad_norm": 
2.7832391262054443, + "learning_rate": 1.0740693196405649e-05, + "loss": 0.6668, + "step": 2791 + }, + { + "epoch": 3.5840821566110397, + "grad_norm": 2.4124886989593506, + "learning_rate": 1.0744544287548138e-05, + "loss": 0.6355, + "step": 2792 + }, + { + "epoch": 3.5853658536585367, + "grad_norm": 1.5303518772125244, + "learning_rate": 1.074839537869063e-05, + "loss": 0.6303, + "step": 2793 + }, + { + "epoch": 3.586649550706033, + "grad_norm": 1.8016492128372192, + "learning_rate": 1.075224646983312e-05, + "loss": 0.6405, + "step": 2794 + }, + { + "epoch": 3.58793324775353, + "grad_norm": 3.0918045043945312, + "learning_rate": 1.075609756097561e-05, + "loss": 0.7345, + "step": 2795 + }, + { + "epoch": 3.589216944801027, + "grad_norm": 1.7776738405227661, + "learning_rate": 1.0759948652118101e-05, + "loss": 0.6578, + "step": 2796 + }, + { + "epoch": 3.5905006418485237, + "grad_norm": 2.004969835281372, + "learning_rate": 1.076379974326059e-05, + "loss": 0.6921, + "step": 2797 + }, + { + "epoch": 3.5917843388960207, + "grad_norm": 2.2060635089874268, + "learning_rate": 1.0767650834403081e-05, + "loss": 0.6195, + "step": 2798 + }, + { + "epoch": 3.593068035943517, + "grad_norm": 1.940295934677124, + "learning_rate": 1.077150192554557e-05, + "loss": 0.6945, + "step": 2799 + }, + { + "epoch": 3.594351732991014, + "grad_norm": 3.1322901248931885, + "learning_rate": 1.0775353016688062e-05, + "loss": 0.6565, + "step": 2800 + }, + { + "epoch": 3.595635430038511, + "grad_norm": 1.800782561302185, + "learning_rate": 1.0779204107830553e-05, + "loss": 0.7298, + "step": 2801 + }, + { + "epoch": 3.5969191270860077, + "grad_norm": 2.69040846824646, + "learning_rate": 1.0783055198973042e-05, + "loss": 0.6434, + "step": 2802 + }, + { + "epoch": 3.5982028241335042, + "grad_norm": 4.5971760749816895, + "learning_rate": 1.0786906290115532e-05, + "loss": 0.6371, + "step": 2803 + }, + { + "epoch": 3.599486521181001, + "grad_norm": 2.0600321292877197, + "learning_rate": 
1.0790757381258024e-05, + "loss": 0.6954, + "step": 2804 + }, + { + "epoch": 3.600770218228498, + "grad_norm": 1.6707507371902466, + "learning_rate": 1.0794608472400514e-05, + "loss": 0.6167, + "step": 2805 + }, + { + "epoch": 3.6020539152759947, + "grad_norm": 1.1290979385375977, + "learning_rate": 1.0798459563543003e-05, + "loss": 0.6489, + "step": 2806 + }, + { + "epoch": 3.6033376123234917, + "grad_norm": 2.4071202278137207, + "learning_rate": 1.0802310654685494e-05, + "loss": 0.6645, + "step": 2807 + }, + { + "epoch": 3.6046213093709882, + "grad_norm": 2.6364355087280273, + "learning_rate": 1.0806161745827985e-05, + "loss": 0.6641, + "step": 2808 + }, + { + "epoch": 3.605905006418485, + "grad_norm": 1.5802568197250366, + "learning_rate": 1.0810012836970475e-05, + "loss": 0.6457, + "step": 2809 + }, + { + "epoch": 3.607188703465982, + "grad_norm": 1.3845248222351074, + "learning_rate": 1.0813863928112966e-05, + "loss": 0.6539, + "step": 2810 + }, + { + "epoch": 3.6084724005134787, + "grad_norm": 1.4641008377075195, + "learning_rate": 1.0817715019255455e-05, + "loss": 0.6837, + "step": 2811 + }, + { + "epoch": 3.6097560975609757, + "grad_norm": 1.9971882104873657, + "learning_rate": 1.0821566110397946e-05, + "loss": 0.7107, + "step": 2812 + }, + { + "epoch": 3.6110397946084722, + "grad_norm": 2.062098503112793, + "learning_rate": 1.0825417201540437e-05, + "loss": 0.6476, + "step": 2813 + }, + { + "epoch": 3.612323491655969, + "grad_norm": 3.081530809402466, + "learning_rate": 1.0829268292682927e-05, + "loss": 0.6501, + "step": 2814 + }, + { + "epoch": 3.613607188703466, + "grad_norm": 2.1729626655578613, + "learning_rate": 1.0833119383825416e-05, + "loss": 0.6242, + "step": 2815 + }, + { + "epoch": 3.6148908857509627, + "grad_norm": 3.5942978858947754, + "learning_rate": 1.0836970474967909e-05, + "loss": 0.7077, + "step": 2816 + }, + { + "epoch": 3.6161745827984597, + "grad_norm": 2.109971761703491, + "learning_rate": 1.0840821566110398e-05, + "loss": 0.7191, + 
"step": 2817 + }, + { + "epoch": 3.6174582798459562, + "grad_norm": 2.999929666519165, + "learning_rate": 1.0844672657252888e-05, + "loss": 0.7266, + "step": 2818 + }, + { + "epoch": 3.6187419768934532, + "grad_norm": 3.9464051723480225, + "learning_rate": 1.084852374839538e-05, + "loss": 0.6739, + "step": 2819 + }, + { + "epoch": 3.62002567394095, + "grad_norm": 1.9143067598342896, + "learning_rate": 1.085237483953787e-05, + "loss": 0.7307, + "step": 2820 + }, + { + "epoch": 3.6213093709884467, + "grad_norm": 2.4911069869995117, + "learning_rate": 1.0856225930680359e-05, + "loss": 0.6744, + "step": 2821 + }, + { + "epoch": 3.6225930680359433, + "grad_norm": 3.127211332321167, + "learning_rate": 1.086007702182285e-05, + "loss": 0.7045, + "step": 2822 + }, + { + "epoch": 3.6238767650834403, + "grad_norm": 3.9292354583740234, + "learning_rate": 1.0863928112965341e-05, + "loss": 0.7231, + "step": 2823 + }, + { + "epoch": 3.6251604621309372, + "grad_norm": 2.4024040699005127, + "learning_rate": 1.086777920410783e-05, + "loss": 0.713, + "step": 2824 + }, + { + "epoch": 3.6264441591784338, + "grad_norm": 3.6075334548950195, + "learning_rate": 1.0871630295250322e-05, + "loss": 0.6942, + "step": 2825 + }, + { + "epoch": 3.6277278562259307, + "grad_norm": 2.8800413608551025, + "learning_rate": 1.0875481386392811e-05, + "loss": 0.7279, + "step": 2826 + }, + { + "epoch": 3.6290115532734273, + "grad_norm": 2.4282357692718506, + "learning_rate": 1.0879332477535302e-05, + "loss": 0.7485, + "step": 2827 + }, + { + "epoch": 3.6302952503209243, + "grad_norm": 2.10608172416687, + "learning_rate": 1.0883183568677793e-05, + "loss": 0.7675, + "step": 2828 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 2.2926530838012695, + "learning_rate": 1.0887034659820283e-05, + "loss": 0.7367, + "step": 2829 + }, + { + "epoch": 3.6328626444159178, + "grad_norm": 1.872338891029358, + "learning_rate": 1.0890885750962772e-05, + "loss": 0.7253, + "step": 2830 + }, + { + "epoch": 
3.6341463414634148, + "grad_norm": 4.354413986206055, + "learning_rate": 1.0894736842105265e-05, + "loss": 0.724, + "step": 2831 + }, + { + "epoch": 3.6354300385109113, + "grad_norm": 2.228896379470825, + "learning_rate": 1.0898587933247754e-05, + "loss": 0.822, + "step": 2832 + }, + { + "epoch": 3.6367137355584083, + "grad_norm": 6.737888336181641, + "learning_rate": 1.0902439024390243e-05, + "loss": 0.7713, + "step": 2833 + }, + { + "epoch": 3.6379974326059052, + "grad_norm": 2.462984800338745, + "learning_rate": 1.0906290115532735e-05, + "loss": 0.7273, + "step": 2834 + }, + { + "epoch": 3.639281129653402, + "grad_norm": 2.8446404933929443, + "learning_rate": 1.0910141206675226e-05, + "loss": 0.7545, + "step": 2835 + }, + { + "epoch": 3.6405648267008983, + "grad_norm": 7.535951137542725, + "learning_rate": 1.0913992297817715e-05, + "loss": 0.8319, + "step": 2836 + }, + { + "epoch": 3.6418485237483953, + "grad_norm": 3.281895399093628, + "learning_rate": 1.0917843388960206e-05, + "loss": 1.008, + "step": 2837 + }, + { + "epoch": 3.6431322207958923, + "grad_norm": 2.9976718425750732, + "learning_rate": 1.0921694480102695e-05, + "loss": 0.7167, + "step": 2838 + }, + { + "epoch": 3.644415917843389, + "grad_norm": 1.763967752456665, + "learning_rate": 1.0925545571245186e-05, + "loss": 0.6603, + "step": 2839 + }, + { + "epoch": 3.645699614890886, + "grad_norm": 4.471979141235352, + "learning_rate": 1.0929396662387678e-05, + "loss": 0.6675, + "step": 2840 + }, + { + "epoch": 3.6469833119383823, + "grad_norm": 2.3416695594787598, + "learning_rate": 1.0933247753530167e-05, + "loss": 0.6436, + "step": 2841 + }, + { + "epoch": 3.6482670089858793, + "grad_norm": 2.1426548957824707, + "learning_rate": 1.0937098844672658e-05, + "loss": 0.6798, + "step": 2842 + }, + { + "epoch": 3.6495507060333763, + "grad_norm": 1.7073636054992676, + "learning_rate": 1.0940949935815149e-05, + "loss": 0.6276, + "step": 2843 + }, + { + "epoch": 3.650834403080873, + "grad_norm": 
2.5803704261779785, + "learning_rate": 1.0944801026957638e-05, + "loss": 0.6897, + "step": 2844 + }, + { + "epoch": 3.65211810012837, + "grad_norm": 2.5074663162231445, + "learning_rate": 1.0948652118100128e-05, + "loss": 0.6148, + "step": 2845 + }, + { + "epoch": 3.6534017971758663, + "grad_norm": 2.08964467048645, + "learning_rate": 1.0952503209242619e-05, + "loss": 0.6266, + "step": 2846 + }, + { + "epoch": 3.6546854942233633, + "grad_norm": 2.6217145919799805, + "learning_rate": 1.095635430038511e-05, + "loss": 0.6611, + "step": 2847 + }, + { + "epoch": 3.6559691912708603, + "grad_norm": 2.2076950073242188, + "learning_rate": 1.09602053915276e-05, + "loss": 0.6623, + "step": 2848 + }, + { + "epoch": 3.657252888318357, + "grad_norm": 2.1183159351348877, + "learning_rate": 1.0964056482670089e-05, + "loss": 0.6744, + "step": 2849 + }, + { + "epoch": 3.658536585365854, + "grad_norm": 3.0336403846740723, + "learning_rate": 1.0967907573812581e-05, + "loss": 0.6455, + "step": 2850 + }, + { + "epoch": 3.6598202824133503, + "grad_norm": 2.7612838745117188, + "learning_rate": 1.097175866495507e-05, + "loss": 0.6188, + "step": 2851 + }, + { + "epoch": 3.6611039794608473, + "grad_norm": 2.5765268802642822, + "learning_rate": 1.097560975609756e-05, + "loss": 0.7196, + "step": 2852 + }, + { + "epoch": 3.6623876765083443, + "grad_norm": 1.6684560775756836, + "learning_rate": 1.0979460847240051e-05, + "loss": 0.6747, + "step": 2853 + }, + { + "epoch": 3.663671373555841, + "grad_norm": 2.780754327774048, + "learning_rate": 1.0983311938382542e-05, + "loss": 0.618, + "step": 2854 + }, + { + "epoch": 3.6649550706033374, + "grad_norm": 2.8047335147857666, + "learning_rate": 1.0987163029525032e-05, + "loss": 0.6826, + "step": 2855 + }, + { + "epoch": 3.6662387676508343, + "grad_norm": 2.7431790828704834, + "learning_rate": 1.0991014120667523e-05, + "loss": 0.6905, + "step": 2856 + }, + { + "epoch": 3.6675224646983313, + "grad_norm": 2.6066460609436035, + "learning_rate": 
1.0994865211810012e-05, + "loss": 0.69, + "step": 2857 + }, + { + "epoch": 3.668806161745828, + "grad_norm": 4.651686191558838, + "learning_rate": 1.0998716302952503e-05, + "loss": 0.6348, + "step": 2858 + }, + { + "epoch": 3.670089858793325, + "grad_norm": 1.749392032623291, + "learning_rate": 1.1002567394094994e-05, + "loss": 0.6144, + "step": 2859 + }, + { + "epoch": 3.6713735558408214, + "grad_norm": 5.416270732879639, + "learning_rate": 1.1006418485237484e-05, + "loss": 0.6765, + "step": 2860 + }, + { + "epoch": 3.6726572528883183, + "grad_norm": 2.0829567909240723, + "learning_rate": 1.1010269576379973e-05, + "loss": 0.6442, + "step": 2861 + }, + { + "epoch": 3.6739409499358153, + "grad_norm": 2.9364914894104004, + "learning_rate": 1.1014120667522466e-05, + "loss": 0.7491, + "step": 2862 + }, + { + "epoch": 3.675224646983312, + "grad_norm": 5.5813422203063965, + "learning_rate": 1.1017971758664955e-05, + "loss": 0.639, + "step": 2863 + }, + { + "epoch": 3.676508344030809, + "grad_norm": 2.7028021812438965, + "learning_rate": 1.1021822849807445e-05, + "loss": 0.6813, + "step": 2864 + }, + { + "epoch": 3.6777920410783054, + "grad_norm": 1.8003289699554443, + "learning_rate": 1.1025673940949937e-05, + "loss": 0.6646, + "step": 2865 + }, + { + "epoch": 3.6790757381258024, + "grad_norm": 2.6060359477996826, + "learning_rate": 1.1029525032092427e-05, + "loss": 0.6798, + "step": 2866 + }, + { + "epoch": 3.6803594351732993, + "grad_norm": 1.4999314546585083, + "learning_rate": 1.1033376123234916e-05, + "loss": 0.6839, + "step": 2867 + }, + { + "epoch": 3.681643132220796, + "grad_norm": 2.414327621459961, + "learning_rate": 1.1037227214377407e-05, + "loss": 0.6932, + "step": 2868 + }, + { + "epoch": 3.682926829268293, + "grad_norm": 6.005912780761719, + "learning_rate": 1.1041078305519898e-05, + "loss": 0.7225, + "step": 2869 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 1.8063210248947144, + "learning_rate": 1.1044929396662388e-05, + "loss": 0.7176, + 
"step": 2870 + }, + { + "epoch": 3.6854942233632864, + "grad_norm": 3.4945952892303467, + "learning_rate": 1.1048780487804879e-05, + "loss": 0.6822, + "step": 2871 + }, + { + "epoch": 3.686777920410783, + "grad_norm": 2.805783271789551, + "learning_rate": 1.1052631578947368e-05, + "loss": 0.6892, + "step": 2872 + }, + { + "epoch": 3.68806161745828, + "grad_norm": 2.5900182723999023, + "learning_rate": 1.1056482670089859e-05, + "loss": 0.7235, + "step": 2873 + }, + { + "epoch": 3.6893453145057764, + "grad_norm": 4.464431285858154, + "learning_rate": 1.106033376123235e-05, + "loss": 0.7672, + "step": 2874 + }, + { + "epoch": 3.6906290115532734, + "grad_norm": 3.0983753204345703, + "learning_rate": 1.106418485237484e-05, + "loss": 0.709, + "step": 2875 + }, + { + "epoch": 3.6919127086007704, + "grad_norm": 2.6528639793395996, + "learning_rate": 1.1068035943517329e-05, + "loss": 0.756, + "step": 2876 + }, + { + "epoch": 3.693196405648267, + "grad_norm": 3.511378288269043, + "learning_rate": 1.1071887034659822e-05, + "loss": 0.7181, + "step": 2877 + }, + { + "epoch": 3.694480102695764, + "grad_norm": 4.017459869384766, + "learning_rate": 1.1075738125802311e-05, + "loss": 0.7193, + "step": 2878 + }, + { + "epoch": 3.6957637997432604, + "grad_norm": 3.051758050918579, + "learning_rate": 1.10795892169448e-05, + "loss": 0.7682, + "step": 2879 + }, + { + "epoch": 3.6970474967907574, + "grad_norm": 3.754910707473755, + "learning_rate": 1.1083440308087292e-05, + "loss": 0.7437, + "step": 2880 + }, + { + "epoch": 3.6983311938382544, + "grad_norm": 5.3515625, + "learning_rate": 1.1087291399229783e-05, + "loss": 0.7742, + "step": 2881 + }, + { + "epoch": 3.699614890885751, + "grad_norm": 2.1085643768310547, + "learning_rate": 1.1091142490372272e-05, + "loss": 0.8151, + "step": 2882 + }, + { + "epoch": 3.700898587933248, + "grad_norm": 5.811005592346191, + "learning_rate": 1.1094993581514763e-05, + "loss": 0.7894, + "step": 2883 + }, + { + "epoch": 3.7021822849807444, + 
"grad_norm": 3.0162734985351562, + "learning_rate": 1.1098844672657254e-05, + "loss": 0.8417, + "step": 2884 + }, + { + "epoch": 3.7034659820282414, + "grad_norm": 3.5497043132781982, + "learning_rate": 1.1102695763799744e-05, + "loss": 0.8669, + "step": 2885 + }, + { + "epoch": 3.7047496790757384, + "grad_norm": 2.6347641944885254, + "learning_rate": 1.1106546854942235e-05, + "loss": 0.8674, + "step": 2886 + }, + { + "epoch": 3.706033376123235, + "grad_norm": 3.5236423015594482, + "learning_rate": 1.1110397946084724e-05, + "loss": 0.9846, + "step": 2887 + }, + { + "epoch": 3.7073170731707314, + "grad_norm": 1.9987376928329468, + "learning_rate": 1.1114249037227215e-05, + "loss": 0.6683, + "step": 2888 + }, + { + "epoch": 3.7086007702182284, + "grad_norm": 1.6169006824493408, + "learning_rate": 1.1118100128369706e-05, + "loss": 0.6523, + "step": 2889 + }, + { + "epoch": 3.7098844672657254, + "grad_norm": 2.9951364994049072, + "learning_rate": 1.1121951219512195e-05, + "loss": 0.6991, + "step": 2890 + }, + { + "epoch": 3.711168164313222, + "grad_norm": 1.4916179180145264, + "learning_rate": 1.1125802310654685e-05, + "loss": 0.6699, + "step": 2891 + }, + { + "epoch": 3.712451861360719, + "grad_norm": 3.5947422981262207, + "learning_rate": 1.1129653401797178e-05, + "loss": 0.6565, + "step": 2892 + }, + { + "epoch": 3.7137355584082155, + "grad_norm": 2.948129177093506, + "learning_rate": 1.1133504492939667e-05, + "loss": 0.6384, + "step": 2893 + }, + { + "epoch": 3.7150192554557124, + "grad_norm": 2.1827828884124756, + "learning_rate": 1.1137355584082156e-05, + "loss": 0.6758, + "step": 2894 + }, + { + "epoch": 3.7163029525032094, + "grad_norm": 2.1657466888427734, + "learning_rate": 1.1141206675224647e-05, + "loss": 0.6305, + "step": 2895 + }, + { + "epoch": 3.717586649550706, + "grad_norm": 2.24788761138916, + "learning_rate": 1.1145057766367139e-05, + "loss": 0.6367, + "step": 2896 + }, + { + "epoch": 3.718870346598203, + "grad_norm": 1.8341532945632935, + 
"learning_rate": 1.1148908857509628e-05, + "loss": 0.6758, + "step": 2897 + }, + { + "epoch": 3.7201540436456995, + "grad_norm": 1.9717410802841187, + "learning_rate": 1.1152759948652119e-05, + "loss": 0.6511, + "step": 2898 + }, + { + "epoch": 3.7214377406931964, + "grad_norm": 2.715895891189575, + "learning_rate": 1.1156611039794608e-05, + "loss": 0.6818, + "step": 2899 + }, + { + "epoch": 3.7227214377406934, + "grad_norm": 2.026254177093506, + "learning_rate": 1.11604621309371e-05, + "loss": 0.6991, + "step": 2900 + }, + { + "epoch": 3.72400513478819, + "grad_norm": 1.929201364517212, + "learning_rate": 1.1164313222079589e-05, + "loss": 0.6626, + "step": 2901 + }, + { + "epoch": 3.725288831835687, + "grad_norm": 5.788325786590576, + "learning_rate": 1.116816431322208e-05, + "loss": 0.6351, + "step": 2902 + }, + { + "epoch": 3.7265725288831835, + "grad_norm": 2.4845168590545654, + "learning_rate": 1.117201540436457e-05, + "loss": 0.6512, + "step": 2903 + }, + { + "epoch": 3.7278562259306804, + "grad_norm": 3.3266255855560303, + "learning_rate": 1.117586649550706e-05, + "loss": 0.6977, + "step": 2904 + }, + { + "epoch": 3.7291399229781774, + "grad_norm": 2.8276190757751465, + "learning_rate": 1.1179717586649551e-05, + "loss": 0.6665, + "step": 2905 + }, + { + "epoch": 3.730423620025674, + "grad_norm": 1.5739461183547974, + "learning_rate": 1.118356867779204e-05, + "loss": 0.7094, + "step": 2906 + }, + { + "epoch": 3.7317073170731705, + "grad_norm": 2.27877140045166, + "learning_rate": 1.1187419768934532e-05, + "loss": 0.6344, + "step": 2907 + }, + { + "epoch": 3.7329910141206675, + "grad_norm": 2.4490966796875, + "learning_rate": 1.1191270860077023e-05, + "loss": 0.6907, + "step": 2908 + }, + { + "epoch": 3.7342747111681645, + "grad_norm": 1.6394484043121338, + "learning_rate": 1.1195121951219512e-05, + "loss": 0.6577, + "step": 2909 + }, + { + "epoch": 3.735558408215661, + "grad_norm": 3.2184135913848877, + "learning_rate": 1.1198973042362002e-05, + "loss": 
0.6305, + "step": 2910 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 5.265477180480957, + "learning_rate": 1.1202824133504494e-05, + "loss": 0.64, + "step": 2911 + }, + { + "epoch": 3.7381258023106545, + "grad_norm": 3.7437283992767334, + "learning_rate": 1.1206675224646984e-05, + "loss": 0.7034, + "step": 2912 + }, + { + "epoch": 3.7394094993581515, + "grad_norm": 2.2659947872161865, + "learning_rate": 1.1210526315789473e-05, + "loss": 0.6535, + "step": 2913 + }, + { + "epoch": 3.7406931964056485, + "grad_norm": 2.5936036109924316, + "learning_rate": 1.1214377406931964e-05, + "loss": 0.6711, + "step": 2914 + }, + { + "epoch": 3.741976893453145, + "grad_norm": 3.3692264556884766, + "learning_rate": 1.1218228498074455e-05, + "loss": 0.6692, + "step": 2915 + }, + { + "epoch": 3.743260590500642, + "grad_norm": 2.7716987133026123, + "learning_rate": 1.1222079589216945e-05, + "loss": 0.6923, + "step": 2916 + }, + { + "epoch": 3.7445442875481385, + "grad_norm": 2.377476692199707, + "learning_rate": 1.1225930680359436e-05, + "loss": 0.6747, + "step": 2917 + }, + { + "epoch": 3.7458279845956355, + "grad_norm": 2.050652503967285, + "learning_rate": 1.1229781771501925e-05, + "loss": 0.6848, + "step": 2918 + }, + { + "epoch": 3.7471116816431325, + "grad_norm": 6.73836088180542, + "learning_rate": 1.1233632862644416e-05, + "loss": 0.6947, + "step": 2919 + }, + { + "epoch": 3.748395378690629, + "grad_norm": 1.884226679801941, + "learning_rate": 1.1237483953786907e-05, + "loss": 0.6484, + "step": 2920 + }, + { + "epoch": 3.7496790757381255, + "grad_norm": 4.857021808624268, + "learning_rate": 1.1241335044929397e-05, + "loss": 0.7054, + "step": 2921 + }, + { + "epoch": 3.7509627727856225, + "grad_norm": 2.554112195968628, + "learning_rate": 1.1245186136071886e-05, + "loss": 0.7037, + "step": 2922 + }, + { + "epoch": 3.7522464698331195, + "grad_norm": 2.2839577198028564, + "learning_rate": 1.1249037227214379e-05, + "loss": 0.703, + "step": 2923 + }, + { + "epoch": 
3.753530166880616, + "grad_norm": 3.340862989425659, + "learning_rate": 1.1252888318356868e-05, + "loss": 0.6979, + "step": 2924 + }, + { + "epoch": 3.754813863928113, + "grad_norm": 2.3391880989074707, + "learning_rate": 1.1256739409499358e-05, + "loss": 0.6841, + "step": 2925 + }, + { + "epoch": 3.7560975609756095, + "grad_norm": 2.6361145973205566, + "learning_rate": 1.1260590500641849e-05, + "loss": 0.6994, + "step": 2926 + }, + { + "epoch": 3.7573812580231065, + "grad_norm": 4.131369113922119, + "learning_rate": 1.126444159178434e-05, + "loss": 0.7397, + "step": 2927 + }, + { + "epoch": 3.7586649550706035, + "grad_norm": 3.07676100730896, + "learning_rate": 1.1268292682926829e-05, + "loss": 0.747, + "step": 2928 + }, + { + "epoch": 3.7599486521181, + "grad_norm": 2.2340753078460693, + "learning_rate": 1.127214377406932e-05, + "loss": 0.7112, + "step": 2929 + }, + { + "epoch": 3.761232349165597, + "grad_norm": 10.79589557647705, + "learning_rate": 1.1275994865211811e-05, + "loss": 0.7179, + "step": 2930 + }, + { + "epoch": 3.7625160462130935, + "grad_norm": 4.1465020179748535, + "learning_rate": 1.12798459563543e-05, + "loss": 0.7263, + "step": 2931 + }, + { + "epoch": 3.7637997432605905, + "grad_norm": 2.0457422733306885, + "learning_rate": 1.1283697047496792e-05, + "loss": 0.6818, + "step": 2932 + }, + { + "epoch": 3.7650834403080875, + "grad_norm": 3.3322112560272217, + "learning_rate": 1.1287548138639281e-05, + "loss": 0.735, + "step": 2933 + }, + { + "epoch": 3.766367137355584, + "grad_norm": 4.248661518096924, + "learning_rate": 1.1291399229781772e-05, + "loss": 0.7822, + "step": 2934 + }, + { + "epoch": 3.767650834403081, + "grad_norm": 3.432234287261963, + "learning_rate": 1.1295250320924263e-05, + "loss": 0.762, + "step": 2935 + }, + { + "epoch": 3.7689345314505776, + "grad_norm": 14.342144012451172, + "learning_rate": 1.1299101412066753e-05, + "loss": 0.8681, + "step": 2936 + }, + { + "epoch": 3.7702182284980745, + "grad_norm": 5.435516834259033, + 
"learning_rate": 1.1302952503209242e-05, + "loss": 0.9965, + "step": 2937 + }, + { + "epoch": 3.7715019255455715, + "grad_norm": 2.45017147064209, + "learning_rate": 1.1306803594351735e-05, + "loss": 0.6721, + "step": 2938 + }, + { + "epoch": 3.772785622593068, + "grad_norm": 1.559478998184204, + "learning_rate": 1.1310654685494224e-05, + "loss": 0.6742, + "step": 2939 + }, + { + "epoch": 3.7740693196405646, + "grad_norm": 2.930619239807129, + "learning_rate": 1.1314505776636713e-05, + "loss": 0.6573, + "step": 2940 + }, + { + "epoch": 3.7753530166880616, + "grad_norm": 5.45550012588501, + "learning_rate": 1.1318356867779205e-05, + "loss": 0.6243, + "step": 2941 + }, + { + "epoch": 3.7766367137355585, + "grad_norm": 1.804495096206665, + "learning_rate": 1.1322207958921696e-05, + "loss": 0.6401, + "step": 2942 + }, + { + "epoch": 3.777920410783055, + "grad_norm": 1.9514539241790771, + "learning_rate": 1.1326059050064185e-05, + "loss": 0.6866, + "step": 2943 + }, + { + "epoch": 3.779204107830552, + "grad_norm": 1.430029034614563, + "learning_rate": 1.1329910141206676e-05, + "loss": 0.6339, + "step": 2944 + }, + { + "epoch": 3.7804878048780486, + "grad_norm": 1.7000049352645874, + "learning_rate": 1.1333761232349165e-05, + "loss": 0.6662, + "step": 2945 + }, + { + "epoch": 3.7817715019255456, + "grad_norm": 1.9137924909591675, + "learning_rate": 1.1337612323491656e-05, + "loss": 0.6352, + "step": 2946 + }, + { + "epoch": 3.7830551989730425, + "grad_norm": 2.0836920738220215, + "learning_rate": 1.1341463414634148e-05, + "loss": 0.6498, + "step": 2947 + }, + { + "epoch": 3.784338896020539, + "grad_norm": 2.26442813873291, + "learning_rate": 1.1345314505776637e-05, + "loss": 0.7268, + "step": 2948 + }, + { + "epoch": 3.785622593068036, + "grad_norm": 2.2818827629089355, + "learning_rate": 1.1349165596919126e-05, + "loss": 0.6645, + "step": 2949 + }, + { + "epoch": 3.7869062901155326, + "grad_norm": 2.305413007736206, + "learning_rate": 1.1353016688061617e-05, + "loss": 
0.6387, + "step": 2950 + }, + { + "epoch": 3.7881899871630296, + "grad_norm": 1.7073622941970825, + "learning_rate": 1.1356867779204108e-05, + "loss": 0.6688, + "step": 2951 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 2.2034411430358887, + "learning_rate": 1.1360718870346598e-05, + "loss": 0.6872, + "step": 2952 + }, + { + "epoch": 3.790757381258023, + "grad_norm": 5.601833343505859, + "learning_rate": 1.1364569961489089e-05, + "loss": 0.6882, + "step": 2953 + }, + { + "epoch": 3.79204107830552, + "grad_norm": 4.092410087585449, + "learning_rate": 1.136842105263158e-05, + "loss": 0.6192, + "step": 2954 + }, + { + "epoch": 3.7933247753530166, + "grad_norm": 2.5810723304748535, + "learning_rate": 1.137227214377407e-05, + "loss": 0.6872, + "step": 2955 + }, + { + "epoch": 3.7946084724005136, + "grad_norm": 2.2946579456329346, + "learning_rate": 1.1376123234916559e-05, + "loss": 0.677, + "step": 2956 + }, + { + "epoch": 3.79589216944801, + "grad_norm": 3.221383810043335, + "learning_rate": 1.1379974326059051e-05, + "loss": 0.7266, + "step": 2957 + }, + { + "epoch": 3.797175866495507, + "grad_norm": 2.2158420085906982, + "learning_rate": 1.138382541720154e-05, + "loss": 0.6706, + "step": 2958 + }, + { + "epoch": 3.7984595635430036, + "grad_norm": 2.791719913482666, + "learning_rate": 1.138767650834403e-05, + "loss": 0.7385, + "step": 2959 + }, + { + "epoch": 3.7997432605905006, + "grad_norm": 1.9573147296905518, + "learning_rate": 1.1391527599486521e-05, + "loss": 0.6524, + "step": 2960 + }, + { + "epoch": 3.8010269576379976, + "grad_norm": 3.0228261947631836, + "learning_rate": 1.1395378690629012e-05, + "loss": 0.6693, + "step": 2961 + }, + { + "epoch": 3.802310654685494, + "grad_norm": 3.22149395942688, + "learning_rate": 1.1399229781771502e-05, + "loss": 0.6945, + "step": 2962 + }, + { + "epoch": 3.803594351732991, + "grad_norm": 1.667860746383667, + "learning_rate": 1.1403080872913993e-05, + "loss": 0.6976, + "step": 2963 + }, + { + "epoch": 
3.8048780487804876, + "grad_norm": 3.060971975326538, + "learning_rate": 1.1406931964056482e-05, + "loss": 0.6989, + "step": 2964 + }, + { + "epoch": 3.8061617458279846, + "grad_norm": 3.3124115467071533, + "learning_rate": 1.1410783055198973e-05, + "loss": 0.6923, + "step": 2965 + }, + { + "epoch": 3.8074454428754816, + "grad_norm": 2.2055137157440186, + "learning_rate": 1.1414634146341464e-05, + "loss": 0.718, + "step": 2966 + }, + { + "epoch": 3.808729139922978, + "grad_norm": 2.6839921474456787, + "learning_rate": 1.1418485237483954e-05, + "loss": 0.7092, + "step": 2967 + }, + { + "epoch": 3.810012836970475, + "grad_norm": 4.2640886306762695, + "learning_rate": 1.1422336328626443e-05, + "loss": 0.6666, + "step": 2968 + }, + { + "epoch": 3.8112965340179716, + "grad_norm": 5.405551910400391, + "learning_rate": 1.1426187419768936e-05, + "loss": 0.6388, + "step": 2969 + }, + { + "epoch": 3.8125802310654686, + "grad_norm": 3.37727952003479, + "learning_rate": 1.1430038510911425e-05, + "loss": 0.6951, + "step": 2970 + }, + { + "epoch": 3.8138639281129656, + "grad_norm": 3.925217390060425, + "learning_rate": 1.1433889602053915e-05, + "loss": 0.7388, + "step": 2971 + }, + { + "epoch": 3.815147625160462, + "grad_norm": 2.348768711090088, + "learning_rate": 1.1437740693196406e-05, + "loss": 0.6964, + "step": 2972 + }, + { + "epoch": 3.8164313222079587, + "grad_norm": 6.175014495849609, + "learning_rate": 1.1441591784338897e-05, + "loss": 0.6908, + "step": 2973 + }, + { + "epoch": 3.8177150192554556, + "grad_norm": 2.3749020099639893, + "learning_rate": 1.1445442875481386e-05, + "loss": 0.7295, + "step": 2974 + }, + { + "epoch": 3.8189987163029526, + "grad_norm": 2.633089780807495, + "learning_rate": 1.1449293966623877e-05, + "loss": 0.7113, + "step": 2975 + }, + { + "epoch": 3.820282413350449, + "grad_norm": 2.6181674003601074, + "learning_rate": 1.1453145057766368e-05, + "loss": 0.7015, + "step": 2976 + }, + { + "epoch": 3.821566110397946, + "grad_norm": 
2.8240280151367188, + "learning_rate": 1.1456996148908858e-05, + "loss": 0.7287, + "step": 2977 + }, + { + "epoch": 3.8228498074454427, + "grad_norm": 3.68876576423645, + "learning_rate": 1.1460847240051349e-05, + "loss": 0.7369, + "step": 2978 + }, + { + "epoch": 3.8241335044929397, + "grad_norm": 4.14552116394043, + "learning_rate": 1.1464698331193838e-05, + "loss": 0.7928, + "step": 2979 + }, + { + "epoch": 3.8254172015404366, + "grad_norm": 8.171554565429688, + "learning_rate": 1.1468549422336329e-05, + "loss": 0.7369, + "step": 2980 + }, + { + "epoch": 3.826700898587933, + "grad_norm": 2.118396759033203, + "learning_rate": 1.147240051347882e-05, + "loss": 0.7891, + "step": 2981 + }, + { + "epoch": 3.82798459563543, + "grad_norm": 2.6813623905181885, + "learning_rate": 1.147625160462131e-05, + "loss": 0.7671, + "step": 2982 + }, + { + "epoch": 3.8292682926829267, + "grad_norm": 2.160726547241211, + "learning_rate": 1.1480102695763799e-05, + "loss": 0.7432, + "step": 2983 + }, + { + "epoch": 3.8305519897304237, + "grad_norm": 2.262117624282837, + "learning_rate": 1.1483953786906292e-05, + "loss": 0.7971, + "step": 2984 + }, + { + "epoch": 3.8318356867779206, + "grad_norm": 3.1118133068084717, + "learning_rate": 1.1487804878048781e-05, + "loss": 0.7865, + "step": 2985 + }, + { + "epoch": 3.833119383825417, + "grad_norm": 6.432515621185303, + "learning_rate": 1.149165596919127e-05, + "loss": 0.9122, + "step": 2986 + }, + { + "epoch": 3.834403080872914, + "grad_norm": 3.6464920043945312, + "learning_rate": 1.1495507060333762e-05, + "loss": 1.0098, + "step": 2987 + }, + { + "epoch": 3.8356867779204107, + "grad_norm": 2.3470582962036133, + "learning_rate": 1.1499358151476253e-05, + "loss": 0.6635, + "step": 2988 + }, + { + "epoch": 3.8369704749679077, + "grad_norm": 2.287388801574707, + "learning_rate": 1.1503209242618742e-05, + "loss": 0.6279, + "step": 2989 + }, + { + "epoch": 3.8382541720154046, + "grad_norm": 1.5859156847000122, + "learning_rate": 
1.1507060333761233e-05, + "loss": 0.6462, + "step": 2990 + }, + { + "epoch": 3.839537869062901, + "grad_norm": 1.4124661684036255, + "learning_rate": 1.1510911424903722e-05, + "loss": 0.6581, + "step": 2991 + }, + { + "epoch": 3.8408215661103977, + "grad_norm": 2.483283758163452, + "learning_rate": 1.1514762516046214e-05, + "loss": 0.6312, + "step": 2992 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 1.5051345825195312, + "learning_rate": 1.1518613607188705e-05, + "loss": 0.6311, + "step": 2993 + }, + { + "epoch": 3.8433889602053917, + "grad_norm": 2.404860496520996, + "learning_rate": 1.1522464698331194e-05, + "loss": 0.6503, + "step": 2994 + }, + { + "epoch": 3.844672657252888, + "grad_norm": 2.111002206802368, + "learning_rate": 1.1526315789473683e-05, + "loss": 0.6735, + "step": 2995 + }, + { + "epoch": 3.845956354300385, + "grad_norm": 2.0336291790008545, + "learning_rate": 1.1530166880616176e-05, + "loss": 0.6231, + "step": 2996 + }, + { + "epoch": 3.8472400513478817, + "grad_norm": 3.621968984603882, + "learning_rate": 1.1534017971758665e-05, + "loss": 0.6313, + "step": 2997 + }, + { + "epoch": 3.8485237483953787, + "grad_norm": 2.395406484603882, + "learning_rate": 1.1537869062901155e-05, + "loss": 0.6459, + "step": 2998 + }, + { + "epoch": 3.8498074454428757, + "grad_norm": 1.7996222972869873, + "learning_rate": 1.1541720154043646e-05, + "loss": 0.6444, + "step": 2999 + }, + { + "epoch": 3.851091142490372, + "grad_norm": 3.246734142303467, + "learning_rate": 1.1545571245186137e-05, + "loss": 0.6502, + "step": 3000 + }, + { + "epoch": 3.851091142490372, + "eval_cer": 0.3087184256902291, + "eval_loss": 0.6525594592094421, + "eval_runtime": 13.9955, + "eval_samples_per_second": 70.237, + "eval_steps_per_second": 0.5, + "eval_wer": 0.5753267599174501, + "step": 3000 + }, + { + "epoch": 3.852374839537869, + "grad_norm": 1.792728066444397, + "learning_rate": 1.1549422336328626e-05, + "loss": 0.6866, + "step": 3001 + }, + { + "epoch": 3.8536585365853657, 
+ "grad_norm": 2.7342748641967773, + "learning_rate": 1.1553273427471116e-05, + "loss": 0.6549, + "step": 3002 + }, + { + "epoch": 3.8549422336328627, + "grad_norm": 1.316015601158142, + "learning_rate": 1.1557124518613609e-05, + "loss": 0.6621, + "step": 3003 + }, + { + "epoch": 3.8562259306803597, + "grad_norm": 2.33286452293396, + "learning_rate": 1.1560975609756098e-05, + "loss": 0.6172, + "step": 3004 + }, + { + "epoch": 3.857509627727856, + "grad_norm": 3.139249324798584, + "learning_rate": 1.1564826700898587e-05, + "loss": 0.6182, + "step": 3005 + }, + { + "epoch": 3.8587933247753528, + "grad_norm": 1.7843209505081177, + "learning_rate": 1.1568677792041078e-05, + "loss": 0.6197, + "step": 3006 + }, + { + "epoch": 3.8600770218228497, + "grad_norm": 2.8754384517669678, + "learning_rate": 1.157252888318357e-05, + "loss": 0.6462, + "step": 3007 + }, + { + "epoch": 3.8613607188703467, + "grad_norm": 2.238109588623047, + "learning_rate": 1.1576379974326059e-05, + "loss": 0.6952, + "step": 3008 + }, + { + "epoch": 3.8626444159178432, + "grad_norm": 1.6081974506378174, + "learning_rate": 1.158023106546855e-05, + "loss": 0.6437, + "step": 3009 + }, + { + "epoch": 3.8639281129653402, + "grad_norm": 2.763451099395752, + "learning_rate": 1.158408215661104e-05, + "loss": 0.7041, + "step": 3010 + }, + { + "epoch": 3.8652118100128368, + "grad_norm": 2.4930827617645264, + "learning_rate": 1.158793324775353e-05, + "loss": 0.6868, + "step": 3011 + }, + { + "epoch": 3.8664955070603337, + "grad_norm": 2.135392189025879, + "learning_rate": 1.1591784338896021e-05, + "loss": 0.6724, + "step": 3012 + }, + { + "epoch": 3.8677792041078307, + "grad_norm": 2.0166213512420654, + "learning_rate": 1.159563543003851e-05, + "loss": 0.7067, + "step": 3013 + }, + { + "epoch": 3.8690629011553272, + "grad_norm": 2.0125229358673096, + "learning_rate": 1.1599486521181e-05, + "loss": 0.6479, + "step": 3014 + }, + { + "epoch": 3.8703465982028242, + "grad_norm": 2.6983389854431152, + 
"learning_rate": 1.1603337612323493e-05, + "loss": 0.6645, + "step": 3015 + }, + { + "epoch": 3.8716302952503208, + "grad_norm": 3.194549560546875, + "learning_rate": 1.1607188703465982e-05, + "loss": 0.6368, + "step": 3016 + }, + { + "epoch": 3.8729139922978177, + "grad_norm": 3.017703056335449, + "learning_rate": 1.1611039794608472e-05, + "loss": 0.6408, + "step": 3017 + }, + { + "epoch": 3.8741976893453147, + "grad_norm": 3.7856714725494385, + "learning_rate": 1.1614890885750963e-05, + "loss": 0.7515, + "step": 3018 + }, + { + "epoch": 3.8754813863928113, + "grad_norm": 2.6735877990722656, + "learning_rate": 1.1618741976893454e-05, + "loss": 0.6724, + "step": 3019 + }, + { + "epoch": 3.8767650834403082, + "grad_norm": 3.370357036590576, + "learning_rate": 1.1622593068035943e-05, + "loss": 0.7359, + "step": 3020 + }, + { + "epoch": 3.8780487804878048, + "grad_norm": 3.271918296813965, + "learning_rate": 1.1626444159178434e-05, + "loss": 0.7386, + "step": 3021 + }, + { + "epoch": 3.8793324775353017, + "grad_norm": 2.516101360321045, + "learning_rate": 1.1630295250320925e-05, + "loss": 0.7614, + "step": 3022 + }, + { + "epoch": 3.8806161745827987, + "grad_norm": 1.6166038513183594, + "learning_rate": 1.1634146341463415e-05, + "loss": 0.7408, + "step": 3023 + }, + { + "epoch": 3.8818998716302953, + "grad_norm": 3.453735828399658, + "learning_rate": 1.1637997432605906e-05, + "loss": 0.6648, + "step": 3024 + }, + { + "epoch": 3.883183568677792, + "grad_norm": 2.8543787002563477, + "learning_rate": 1.1641848523748395e-05, + "loss": 0.7447, + "step": 3025 + }, + { + "epoch": 3.8844672657252888, + "grad_norm": 1.5774363279342651, + "learning_rate": 1.1645699614890886e-05, + "loss": 0.7031, + "step": 3026 + }, + { + "epoch": 3.8857509627727858, + "grad_norm": 3.1518003940582275, + "learning_rate": 1.1649550706033377e-05, + "loss": 0.6988, + "step": 3027 + }, + { + "epoch": 3.8870346598202823, + "grad_norm": 6.283576965332031, + "learning_rate": 1.1653401797175867e-05, + 
"loss": 0.715, + "step": 3028 + }, + { + "epoch": 3.8883183568677793, + "grad_norm": 5.082571983337402, + "learning_rate": 1.1657252888318356e-05, + "loss": 0.7812, + "step": 3029 + }, + { + "epoch": 3.889602053915276, + "grad_norm": 4.280940532684326, + "learning_rate": 1.1661103979460849e-05, + "loss": 0.7557, + "step": 3030 + }, + { + "epoch": 3.890885750962773, + "grad_norm": 2.8256239891052246, + "learning_rate": 1.1664955070603338e-05, + "loss": 0.7003, + "step": 3031 + }, + { + "epoch": 3.8921694480102698, + "grad_norm": 5.595981121063232, + "learning_rate": 1.1668806161745828e-05, + "loss": 0.7344, + "step": 3032 + }, + { + "epoch": 3.8934531450577663, + "grad_norm": 3.2095022201538086, + "learning_rate": 1.1672657252888319e-05, + "loss": 0.7934, + "step": 3033 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 2.595877170562744, + "learning_rate": 1.167650834403081e-05, + "loss": 0.8037, + "step": 3034 + }, + { + "epoch": 3.89602053915276, + "grad_norm": 3.904137134552002, + "learning_rate": 1.1680359435173299e-05, + "loss": 0.8345, + "step": 3035 + }, + { + "epoch": 3.897304236200257, + "grad_norm": 4.35256814956665, + "learning_rate": 1.168421052631579e-05, + "loss": 0.8739, + "step": 3036 + }, + { + "epoch": 3.8985879332477538, + "grad_norm": 6.592706680297852, + "learning_rate": 1.168806161745828e-05, + "loss": 1.0291, + "step": 3037 + }, + { + "epoch": 3.8998716302952503, + "grad_norm": 1.7046579122543335, + "learning_rate": 1.169191270860077e-05, + "loss": 0.6523, + "step": 3038 + }, + { + "epoch": 3.901155327342747, + "grad_norm": 1.7130005359649658, + "learning_rate": 1.1695763799743262e-05, + "loss": 0.6819, + "step": 3039 + }, + { + "epoch": 3.902439024390244, + "grad_norm": 3.0240538120269775, + "learning_rate": 1.1699614890885751e-05, + "loss": 0.69, + "step": 3040 + }, + { + "epoch": 3.903722721437741, + "grad_norm": 1.948569655418396, + "learning_rate": 1.170346598202824e-05, + "loss": 0.6565, + "step": 3041 + }, + { + "epoch": 
3.9050064184852373, + "grad_norm": 1.9431180953979492, + "learning_rate": 1.1707317073170733e-05, + "loss": 0.6674, + "step": 3042 + }, + { + "epoch": 3.9062901155327343, + "grad_norm": 1.90803861618042, + "learning_rate": 1.1711168164313223e-05, + "loss": 0.6465, + "step": 3043 + }, + { + "epoch": 3.907573812580231, + "grad_norm": 2.520472526550293, + "learning_rate": 1.1715019255455712e-05, + "loss": 0.6584, + "step": 3044 + }, + { + "epoch": 3.908857509627728, + "grad_norm": 1.6146435737609863, + "learning_rate": 1.1718870346598205e-05, + "loss": 0.6101, + "step": 3045 + }, + { + "epoch": 3.910141206675225, + "grad_norm": 2.641333818435669, + "learning_rate": 1.1722721437740694e-05, + "loss": 0.6783, + "step": 3046 + }, + { + "epoch": 3.9114249037227213, + "grad_norm": 2.161029100418091, + "learning_rate": 1.1726572528883183e-05, + "loss": 0.6859, + "step": 3047 + }, + { + "epoch": 3.9127086007702183, + "grad_norm": 2.4665393829345703, + "learning_rate": 1.1730423620025675e-05, + "loss": 0.6256, + "step": 3048 + }, + { + "epoch": 3.913992297817715, + "grad_norm": 1.7688900232315063, + "learning_rate": 1.1734274711168166e-05, + "loss": 0.6532, + "step": 3049 + }, + { + "epoch": 3.915275994865212, + "grad_norm": 3.9664788246154785, + "learning_rate": 1.1738125802310655e-05, + "loss": 0.6914, + "step": 3050 + }, + { + "epoch": 3.916559691912709, + "grad_norm": 2.507550001144409, + "learning_rate": 1.1741976893453146e-05, + "loss": 0.6057, + "step": 3051 + }, + { + "epoch": 3.9178433889602053, + "grad_norm": 1.897536277770996, + "learning_rate": 1.1745827984595635e-05, + "loss": 0.6268, + "step": 3052 + }, + { + "epoch": 3.9191270860077023, + "grad_norm": 2.4640419483184814, + "learning_rate": 1.1749679075738126e-05, + "loss": 0.6469, + "step": 3053 + }, + { + "epoch": 3.920410783055199, + "grad_norm": 2.616018533706665, + "learning_rate": 1.1753530166880616e-05, + "loss": 0.67, + "step": 3054 + }, + { + "epoch": 3.921694480102696, + "grad_norm": 4.238605499267578, 
+ "learning_rate": 1.1757381258023107e-05, + "loss": 0.6778, + "step": 3055 + }, + { + "epoch": 3.922978177150193, + "grad_norm": 2.273465394973755, + "learning_rate": 1.1761232349165596e-05, + "loss": 0.6658, + "step": 3056 + }, + { + "epoch": 3.9242618741976893, + "grad_norm": 2.290295362472534, + "learning_rate": 1.1765083440308087e-05, + "loss": 0.6602, + "step": 3057 + }, + { + "epoch": 3.925545571245186, + "grad_norm": 1.55820894241333, + "learning_rate": 1.1768934531450578e-05, + "loss": 0.6844, + "step": 3058 + }, + { + "epoch": 3.926829268292683, + "grad_norm": 3.8937771320343018, + "learning_rate": 1.1772785622593068e-05, + "loss": 0.6716, + "step": 3059 + }, + { + "epoch": 3.92811296534018, + "grad_norm": 2.275902509689331, + "learning_rate": 1.1776636713735557e-05, + "loss": 0.6395, + "step": 3060 + }, + { + "epoch": 3.9293966623876764, + "grad_norm": 2.2751643657684326, + "learning_rate": 1.178048780487805e-05, + "loss": 0.6141, + "step": 3061 + }, + { + "epoch": 3.9306803594351734, + "grad_norm": 2.084245204925537, + "learning_rate": 1.178433889602054e-05, + "loss": 0.6783, + "step": 3062 + }, + { + "epoch": 3.93196405648267, + "grad_norm": 4.9176788330078125, + "learning_rate": 1.1788189987163029e-05, + "loss": 0.6559, + "step": 3063 + }, + { + "epoch": 3.933247753530167, + "grad_norm": 3.0740466117858887, + "learning_rate": 1.1792041078305521e-05, + "loss": 0.6318, + "step": 3064 + }, + { + "epoch": 3.934531450577664, + "grad_norm": 2.1555933952331543, + "learning_rate": 1.179589216944801e-05, + "loss": 0.7263, + "step": 3065 + }, + { + "epoch": 3.9358151476251604, + "grad_norm": 3.056734800338745, + "learning_rate": 1.17997432605905e-05, + "loss": 0.6858, + "step": 3066 + }, + { + "epoch": 3.9370988446726574, + "grad_norm": 2.7016944885253906, + "learning_rate": 1.1803594351732991e-05, + "loss": 0.6753, + "step": 3067 + }, + { + "epoch": 3.938382541720154, + "grad_norm": 3.754875421524048, + "learning_rate": 1.1807445442875482e-05, + "loss": 
0.6974, + "step": 3068 + }, + { + "epoch": 3.939666238767651, + "grad_norm": 9.351109504699707, + "learning_rate": 1.1811296534017972e-05, + "loss": 0.6905, + "step": 3069 + }, + { + "epoch": 3.940949935815148, + "grad_norm": 4.632996082305908, + "learning_rate": 1.1815147625160463e-05, + "loss": 0.7595, + "step": 3070 + }, + { + "epoch": 3.9422336328626444, + "grad_norm": 10.101603507995605, + "learning_rate": 1.1818998716302952e-05, + "loss": 0.6963, + "step": 3071 + }, + { + "epoch": 3.9435173299101414, + "grad_norm": 3.2640109062194824, + "learning_rate": 1.1822849807445443e-05, + "loss": 0.6769, + "step": 3072 + }, + { + "epoch": 3.944801026957638, + "grad_norm": 4.898396015167236, + "learning_rate": 1.1826700898587934e-05, + "loss": 0.7451, + "step": 3073 + }, + { + "epoch": 3.946084724005135, + "grad_norm": 3.2837908267974854, + "learning_rate": 1.1830551989730424e-05, + "loss": 0.6691, + "step": 3074 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 2.820760488510132, + "learning_rate": 1.1834403080872913e-05, + "loss": 0.7306, + "step": 3075 + }, + { + "epoch": 3.9486521181001284, + "grad_norm": 2.6235344409942627, + "learning_rate": 1.1838254172015406e-05, + "loss": 0.738, + "step": 3076 + }, + { + "epoch": 3.949935815147625, + "grad_norm": 2.8767013549804688, + "learning_rate": 1.1842105263157895e-05, + "loss": 0.6878, + "step": 3077 + }, + { + "epoch": 3.951219512195122, + "grad_norm": 3.461038827896118, + "learning_rate": 1.1845956354300385e-05, + "loss": 0.7505, + "step": 3078 + }, + { + "epoch": 3.952503209242619, + "grad_norm": 3.1920580863952637, + "learning_rate": 1.1849807445442876e-05, + "loss": 0.7071, + "step": 3079 + }, + { + "epoch": 3.9537869062901154, + "grad_norm": 3.437826156616211, + "learning_rate": 1.1853658536585367e-05, + "loss": 0.7461, + "step": 3080 + }, + { + "epoch": 3.9550706033376124, + "grad_norm": 2.351640462875366, + "learning_rate": 1.1857509627727856e-05, + "loss": 0.7778, + "step": 3081 + }, + { + "epoch": 
3.956354300385109, + "grad_norm": 13.904263496398926, + "learning_rate": 1.1861360718870347e-05, + "loss": 0.724, + "step": 3082 + }, + { + "epoch": 3.957637997432606, + "grad_norm": 2.9690284729003906, + "learning_rate": 1.1865211810012837e-05, + "loss": 0.8142, + "step": 3083 + }, + { + "epoch": 3.958921694480103, + "grad_norm": 5.928182601928711, + "learning_rate": 1.1869062901155328e-05, + "loss": 0.7545, + "step": 3084 + }, + { + "epoch": 3.9602053915275994, + "grad_norm": 4.316997051239014, + "learning_rate": 1.1872913992297819e-05, + "loss": 0.8771, + "step": 3085 + }, + { + "epoch": 3.9614890885750964, + "grad_norm": 3.385546922683716, + "learning_rate": 1.1876765083440308e-05, + "loss": 0.7971, + "step": 3086 + }, + { + "epoch": 3.962772785622593, + "grad_norm": 9.08889389038086, + "learning_rate": 1.1880616174582799e-05, + "loss": 1.0059, + "step": 3087 + }, + { + "epoch": 3.96405648267009, + "grad_norm": 2.2218239307403564, + "learning_rate": 1.188446726572529e-05, + "loss": 0.6135, + "step": 3088 + }, + { + "epoch": 3.965340179717587, + "grad_norm": 1.5301423072814941, + "learning_rate": 1.188831835686778e-05, + "loss": 0.6144, + "step": 3089 + }, + { + "epoch": 3.9666238767650834, + "grad_norm": 2.042039632797241, + "learning_rate": 1.1892169448010269e-05, + "loss": 0.6263, + "step": 3090 + }, + { + "epoch": 3.96790757381258, + "grad_norm": 2.438535451889038, + "learning_rate": 1.1896020539152762e-05, + "loss": 0.6386, + "step": 3091 + }, + { + "epoch": 3.969191270860077, + "grad_norm": 1.8046808242797852, + "learning_rate": 1.1899871630295251e-05, + "loss": 0.6412, + "step": 3092 + }, + { + "epoch": 3.970474967907574, + "grad_norm": 2.7717931270599365, + "learning_rate": 1.190372272143774e-05, + "loss": 0.7232, + "step": 3093 + }, + { + "epoch": 3.9717586649550705, + "grad_norm": 3.658745050430298, + "learning_rate": 1.1907573812580232e-05, + "loss": 0.6578, + "step": 3094 + }, + { + "epoch": 3.9730423620025674, + "grad_norm": 1.6206958293914795, + 
"learning_rate": 1.1911424903722723e-05, + "loss": 0.6128, + "step": 3095 + }, + { + "epoch": 3.974326059050064, + "grad_norm": 2.7003872394561768, + "learning_rate": 1.1915275994865212e-05, + "loss": 0.6774, + "step": 3096 + }, + { + "epoch": 3.975609756097561, + "grad_norm": 2.2906386852264404, + "learning_rate": 1.1919127086007703e-05, + "loss": 0.6427, + "step": 3097 + }, + { + "epoch": 3.976893453145058, + "grad_norm": 2.8072309494018555, + "learning_rate": 1.1922978177150192e-05, + "loss": 0.6628, + "step": 3098 + }, + { + "epoch": 3.9781771501925545, + "grad_norm": 3.212967872619629, + "learning_rate": 1.1926829268292684e-05, + "loss": 0.7139, + "step": 3099 + }, + { + "epoch": 3.9794608472400514, + "grad_norm": 2.615215301513672, + "learning_rate": 1.1930680359435175e-05, + "loss": 0.6765, + "step": 3100 + }, + { + "epoch": 3.980744544287548, + "grad_norm": 5.263926982879639, + "learning_rate": 1.1934531450577664e-05, + "loss": 0.6649, + "step": 3101 + }, + { + "epoch": 3.982028241335045, + "grad_norm": 2.3321094512939453, + "learning_rate": 1.1938382541720153e-05, + "loss": 0.6966, + "step": 3102 + }, + { + "epoch": 3.983311938382542, + "grad_norm": 3.155553102493286, + "learning_rate": 1.1942233632862644e-05, + "loss": 0.685, + "step": 3103 + }, + { + "epoch": 3.9845956354300385, + "grad_norm": 2.474677562713623, + "learning_rate": 1.1946084724005135e-05, + "loss": 0.6626, + "step": 3104 + }, + { + "epoch": 3.9858793324775355, + "grad_norm": 3.516493558883667, + "learning_rate": 1.1949935815147625e-05, + "loss": 0.7071, + "step": 3105 + }, + { + "epoch": 3.987163029525032, + "grad_norm": 2.0702338218688965, + "learning_rate": 1.1953786906290114e-05, + "loss": 0.7003, + "step": 3106 + }, + { + "epoch": 3.988446726572529, + "grad_norm": 4.304792881011963, + "learning_rate": 1.1957637997432607e-05, + "loss": 0.6746, + "step": 3107 + }, + { + "epoch": 3.989730423620026, + "grad_norm": 3.3157541751861572, + "learning_rate": 1.1961489088575096e-05, + "loss": 
0.6992, + "step": 3108 + }, + { + "epoch": 3.9910141206675225, + "grad_norm": 3.127227306365967, + "learning_rate": 1.1965340179717586e-05, + "loss": 0.6959, + "step": 3109 + }, + { + "epoch": 3.992297817715019, + "grad_norm": 3.0063230991363525, + "learning_rate": 1.1969191270860079e-05, + "loss": 0.6932, + "step": 3110 + }, + { + "epoch": 3.993581514762516, + "grad_norm": 6.509521484375, + "learning_rate": 1.1973042362002568e-05, + "loss": 0.6435, + "step": 3111 + }, + { + "epoch": 3.994865211810013, + "grad_norm": 2.9662625789642334, + "learning_rate": 1.1976893453145057e-05, + "loss": 0.7487, + "step": 3112 + }, + { + "epoch": 3.9961489088575095, + "grad_norm": 1.8409380912780762, + "learning_rate": 1.1980744544287548e-05, + "loss": 0.7738, + "step": 3113 + }, + { + "epoch": 3.9974326059050065, + "grad_norm": 2.5172882080078125, + "learning_rate": 1.198459563543004e-05, + "loss": 0.8037, + "step": 3114 + }, + { + "epoch": 3.998716302952503, + "grad_norm": 3.765941858291626, + "learning_rate": 1.1988446726572529e-05, + "loss": 0.81, + "step": 3115 + }, + { + "epoch": 4.0, + "grad_norm": 3.2630486488342285, + "learning_rate": 1.199229781771502e-05, + "loss": 0.9605, + "step": 3116 + }, + { + "epoch": 4.001283697047497, + "grad_norm": 2.0363235473632812, + "learning_rate": 1.199614890885751e-05, + "loss": 0.6712, + "step": 3117 + }, + { + "epoch": 4.002567394094994, + "grad_norm": 6.794885158538818, + "learning_rate": 1.2e-05, + "loss": 0.6356, + "step": 3118 + }, + { + "epoch": 4.00385109114249, + "grad_norm": 2.046112298965454, + "learning_rate": 1.2003851091142491e-05, + "loss": 0.6228, + "step": 3119 + }, + { + "epoch": 4.005134788189987, + "grad_norm": 4.171177387237549, + "learning_rate": 1.200770218228498e-05, + "loss": 0.686, + "step": 3120 + }, + { + "epoch": 4.006418485237484, + "grad_norm": 2.124150514602661, + "learning_rate": 1.201155327342747e-05, + "loss": 0.6899, + "step": 3121 + }, + { + "epoch": 4.007702182284981, + "grad_norm": 
1.6275590658187866, + "learning_rate": 1.2015404364569963e-05, + "loss": 0.6234, + "step": 3122 + }, + { + "epoch": 4.008985879332478, + "grad_norm": 4.088416576385498, + "learning_rate": 1.2019255455712452e-05, + "loss": 0.6464, + "step": 3123 + }, + { + "epoch": 4.010269576379974, + "grad_norm": 2.523959159851074, + "learning_rate": 1.2023106546854942e-05, + "loss": 0.6605, + "step": 3124 + }, + { + "epoch": 4.011553273427471, + "grad_norm": 2.213289260864258, + "learning_rate": 1.2026957637997433e-05, + "loss": 0.6618, + "step": 3125 + }, + { + "epoch": 4.012836970474968, + "grad_norm": 2.2113020420074463, + "learning_rate": 1.2030808729139924e-05, + "loss": 0.6236, + "step": 3126 + }, + { + "epoch": 4.014120667522465, + "grad_norm": 2.5315890312194824, + "learning_rate": 1.2034659820282413e-05, + "loss": 0.6198, + "step": 3127 + }, + { + "epoch": 4.015404364569961, + "grad_norm": 2.261427640914917, + "learning_rate": 1.2038510911424904e-05, + "loss": 0.6815, + "step": 3128 + }, + { + "epoch": 4.016688061617458, + "grad_norm": 2.7608883380889893, + "learning_rate": 1.2042362002567394e-05, + "loss": 0.6471, + "step": 3129 + }, + { + "epoch": 4.017971758664955, + "grad_norm": 2.2305643558502197, + "learning_rate": 1.2046213093709885e-05, + "loss": 0.6342, + "step": 3130 + }, + { + "epoch": 4.019255455712452, + "grad_norm": 13.066144943237305, + "learning_rate": 1.2050064184852376e-05, + "loss": 0.6619, + "step": 3131 + }, + { + "epoch": 4.020539152759949, + "grad_norm": 3.478980541229248, + "learning_rate": 1.2053915275994865e-05, + "loss": 0.6545, + "step": 3132 + }, + { + "epoch": 4.021822849807445, + "grad_norm": 2.5053648948669434, + "learning_rate": 1.2057766367137356e-05, + "loss": 0.6528, + "step": 3133 + }, + { + "epoch": 4.023106546854942, + "grad_norm": 2.4522602558135986, + "learning_rate": 1.2061617458279847e-05, + "loss": 0.6814, + "step": 3134 + }, + { + "epoch": 4.024390243902439, + "grad_norm": 2.1894028186798096, + "learning_rate": 
1.2065468549422337e-05, + "loss": 0.664, + "step": 3135 + }, + { + "epoch": 4.025673940949936, + "grad_norm": 2.90376877784729, + "learning_rate": 1.2069319640564826e-05, + "loss": 0.6657, + "step": 3136 + }, + { + "epoch": 4.026957637997433, + "grad_norm": 2.0876684188842773, + "learning_rate": 1.2073170731707319e-05, + "loss": 0.6297, + "step": 3137 + }, + { + "epoch": 4.028241335044929, + "grad_norm": 2.9359490871429443, + "learning_rate": 1.2077021822849808e-05, + "loss": 0.6332, + "step": 3138 + }, + { + "epoch": 4.029525032092426, + "grad_norm": 1.9591219425201416, + "learning_rate": 1.2080872913992298e-05, + "loss": 0.6224, + "step": 3139 + }, + { + "epoch": 4.030808729139923, + "grad_norm": 2.9350674152374268, + "learning_rate": 1.2084724005134789e-05, + "loss": 0.6506, + "step": 3140 + }, + { + "epoch": 4.03209242618742, + "grad_norm": 3.4206511974334717, + "learning_rate": 1.208857509627728e-05, + "loss": 0.6309, + "step": 3141 + }, + { + "epoch": 4.033376123234916, + "grad_norm": 1.649977684020996, + "learning_rate": 1.2092426187419769e-05, + "loss": 0.6451, + "step": 3142 + }, + { + "epoch": 4.034659820282413, + "grad_norm": 2.8713133335113525, + "learning_rate": 1.209627727856226e-05, + "loss": 0.7141, + "step": 3143 + }, + { + "epoch": 4.03594351732991, + "grad_norm": 2.5255446434020996, + "learning_rate": 1.210012836970475e-05, + "loss": 0.707, + "step": 3144 + }, + { + "epoch": 4.037227214377407, + "grad_norm": 2.016578435897827, + "learning_rate": 1.210397946084724e-05, + "loss": 0.7369, + "step": 3145 + }, + { + "epoch": 4.038510911424904, + "grad_norm": 2.485837936401367, + "learning_rate": 1.2107830551989732e-05, + "loss": 0.647, + "step": 3146 + }, + { + "epoch": 4.0397946084724, + "grad_norm": 1.6693201065063477, + "learning_rate": 1.2111681643132221e-05, + "loss": 0.7156, + "step": 3147 + }, + { + "epoch": 4.041078305519897, + "grad_norm": 1.684945821762085, + "learning_rate": 1.211553273427471e-05, + "loss": 0.6517, + "step": 3148 + }, + { + 
"epoch": 4.042362002567394, + "grad_norm": 5.631387710571289, + "learning_rate": 1.2119383825417203e-05, + "loss": 0.6685, + "step": 3149 + }, + { + "epoch": 4.043645699614891, + "grad_norm": 2.3175270557403564, + "learning_rate": 1.2123234916559693e-05, + "loss": 0.7301, + "step": 3150 + }, + { + "epoch": 4.044929396662388, + "grad_norm": 2.8160481452941895, + "learning_rate": 1.2127086007702182e-05, + "loss": 0.7249, + "step": 3151 + }, + { + "epoch": 4.046213093709884, + "grad_norm": 2.490939140319824, + "learning_rate": 1.2130937098844673e-05, + "loss": 0.6709, + "step": 3152 + }, + { + "epoch": 4.047496790757381, + "grad_norm": 1.6626834869384766, + "learning_rate": 1.2134788189987164e-05, + "loss": 0.7108, + "step": 3153 + }, + { + "epoch": 4.048780487804878, + "grad_norm": 2.011749505996704, + "learning_rate": 1.2138639281129653e-05, + "loss": 0.6666, + "step": 3154 + }, + { + "epoch": 4.050064184852375, + "grad_norm": 3.1682119369506836, + "learning_rate": 1.2142490372272143e-05, + "loss": 0.7388, + "step": 3155 + }, + { + "epoch": 4.051347881899872, + "grad_norm": 2.983905553817749, + "learning_rate": 1.2146341463414636e-05, + "loss": 0.6975, + "step": 3156 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 2.1070680618286133, + "learning_rate": 1.2150192554557125e-05, + "loss": 0.7854, + "step": 3157 + }, + { + "epoch": 4.053915275994865, + "grad_norm": 3.086822509765625, + "learning_rate": 1.2154043645699614e-05, + "loss": 0.7585, + "step": 3158 + }, + { + "epoch": 4.055198973042362, + "grad_norm": 2.170424222946167, + "learning_rate": 1.2157894736842105e-05, + "loss": 0.7392, + "step": 3159 + }, + { + "epoch": 4.056482670089859, + "grad_norm": 9.6680908203125, + "learning_rate": 1.2161745827984596e-05, + "loss": 0.7682, + "step": 3160 + }, + { + "epoch": 4.057766367137355, + "grad_norm": 2.9424304962158203, + "learning_rate": 1.2165596919127086e-05, + "loss": 0.765, + "step": 3161 + }, + { + "epoch": 4.059050064184852, + "grad_norm": 
3.458472728729248, + "learning_rate": 1.2169448010269577e-05, + "loss": 0.7622, + "step": 3162 + }, + { + "epoch": 4.060333761232349, + "grad_norm": 2.827880620956421, + "learning_rate": 1.2173299101412066e-05, + "loss": 0.7776, + "step": 3163 + }, + { + "epoch": 4.061617458279846, + "grad_norm": 3.532113552093506, + "learning_rate": 1.2177150192554557e-05, + "loss": 0.8245, + "step": 3164 + }, + { + "epoch": 4.062901155327343, + "grad_norm": 2.420865774154663, + "learning_rate": 1.2181001283697048e-05, + "loss": 0.8283, + "step": 3165 + }, + { + "epoch": 4.064184852374839, + "grad_norm": 8.850095748901367, + "learning_rate": 1.2184852374839538e-05, + "loss": 0.9896, + "step": 3166 + }, + { + "epoch": 4.065468549422336, + "grad_norm": 2.800421714782715, + "learning_rate": 1.2188703465982027e-05, + "loss": 0.6587, + "step": 3167 + }, + { + "epoch": 4.066752246469833, + "grad_norm": 1.8159726858139038, + "learning_rate": 1.219255455712452e-05, + "loss": 0.6598, + "step": 3168 + }, + { + "epoch": 4.06803594351733, + "grad_norm": 2.029420852661133, + "learning_rate": 1.219640564826701e-05, + "loss": 0.6564, + "step": 3169 + }, + { + "epoch": 4.069319640564827, + "grad_norm": 2.0095911026000977, + "learning_rate": 1.2200256739409499e-05, + "loss": 0.6578, + "step": 3170 + }, + { + "epoch": 4.070603337612323, + "grad_norm": 1.557378888130188, + "learning_rate": 1.220410783055199e-05, + "loss": 0.6627, + "step": 3171 + }, + { + "epoch": 4.07188703465982, + "grad_norm": 2.1156787872314453, + "learning_rate": 1.220795892169448e-05, + "loss": 0.6338, + "step": 3172 + }, + { + "epoch": 4.073170731707317, + "grad_norm": 2.0238125324249268, + "learning_rate": 1.221181001283697e-05, + "loss": 0.6613, + "step": 3173 + }, + { + "epoch": 4.074454428754814, + "grad_norm": 1.8898366689682007, + "learning_rate": 1.2215661103979461e-05, + "loss": 0.6697, + "step": 3174 + }, + { + "epoch": 4.07573812580231, + "grad_norm": 1.7432814836502075, + "learning_rate": 1.221951219512195e-05, + 
"loss": 0.623, + "step": 3175 + }, + { + "epoch": 4.077021822849807, + "grad_norm": 2.8907394409179688, + "learning_rate": 1.2223363286264442e-05, + "loss": 0.6776, + "step": 3176 + }, + { + "epoch": 4.078305519897304, + "grad_norm": 1.8317153453826904, + "learning_rate": 1.2227214377406933e-05, + "loss": 0.6714, + "step": 3177 + }, + { + "epoch": 4.079589216944801, + "grad_norm": 2.729153871536255, + "learning_rate": 1.2231065468549422e-05, + "loss": 0.6797, + "step": 3178 + }, + { + "epoch": 4.080872913992298, + "grad_norm": 7.438006401062012, + "learning_rate": 1.2234916559691913e-05, + "loss": 0.6328, + "step": 3179 + }, + { + "epoch": 4.082156611039794, + "grad_norm": 2.2751710414886475, + "learning_rate": 1.2238767650834404e-05, + "loss": 0.6756, + "step": 3180 + }, + { + "epoch": 4.083440308087291, + "grad_norm": 2.4823663234710693, + "learning_rate": 1.2242618741976894e-05, + "loss": 0.6591, + "step": 3181 + }, + { + "epoch": 4.084724005134788, + "grad_norm": 3.1859302520751953, + "learning_rate": 1.2246469833119383e-05, + "loss": 0.6339, + "step": 3182 + }, + { + "epoch": 4.086007702182285, + "grad_norm": 3.4867491722106934, + "learning_rate": 1.2250320924261876e-05, + "loss": 0.6439, + "step": 3183 + }, + { + "epoch": 4.087291399229782, + "grad_norm": 1.7116711139678955, + "learning_rate": 1.2254172015404365e-05, + "loss": 0.6369, + "step": 3184 + }, + { + "epoch": 4.088575096277278, + "grad_norm": 1.4042531251907349, + "learning_rate": 1.2258023106546855e-05, + "loss": 0.6502, + "step": 3185 + }, + { + "epoch": 4.089858793324775, + "grad_norm": 4.148313999176025, + "learning_rate": 1.2261874197689346e-05, + "loss": 0.593, + "step": 3186 + }, + { + "epoch": 4.091142490372272, + "grad_norm": 1.719758152961731, + "learning_rate": 1.2265725288831837e-05, + "loss": 0.6726, + "step": 3187 + }, + { + "epoch": 4.092426187419769, + "grad_norm": 3.4055535793304443, + "learning_rate": 1.2269576379974326e-05, + "loss": 0.6324, + "step": 3188 + }, + { + "epoch": 
4.093709884467266, + "grad_norm": 4.535161018371582, + "learning_rate": 1.2273427471116817e-05, + "loss": 0.6624, + "step": 3189 + }, + { + "epoch": 4.094993581514762, + "grad_norm": 10.36131477355957, + "learning_rate": 1.2277278562259307e-05, + "loss": 0.64, + "step": 3190 + }, + { + "epoch": 4.096277278562259, + "grad_norm": 2.1253185272216797, + "learning_rate": 1.2281129653401798e-05, + "loss": 0.6655, + "step": 3191 + }, + { + "epoch": 4.097560975609756, + "grad_norm": 9.418538093566895, + "learning_rate": 1.2284980744544289e-05, + "loss": 0.6954, + "step": 3192 + }, + { + "epoch": 4.098844672657253, + "grad_norm": 4.079799652099609, + "learning_rate": 1.2288831835686778e-05, + "loss": 0.6624, + "step": 3193 + }, + { + "epoch": 4.100128369704749, + "grad_norm": 2.1069765090942383, + "learning_rate": 1.2292682926829267e-05, + "loss": 0.6534, + "step": 3194 + }, + { + "epoch": 4.101412066752246, + "grad_norm": 3.099727153778076, + "learning_rate": 1.229653401797176e-05, + "loss": 0.7071, + "step": 3195 + }, + { + "epoch": 4.102695763799743, + "grad_norm": 2.925197124481201, + "learning_rate": 1.230038510911425e-05, + "loss": 0.6754, + "step": 3196 + }, + { + "epoch": 4.10397946084724, + "grad_norm": 3.5942134857177734, + "learning_rate": 1.2304236200256739e-05, + "loss": 0.6781, + "step": 3197 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 3.9849853515625, + "learning_rate": 1.230808729139923e-05, + "loss": 0.6848, + "step": 3198 + }, + { + "epoch": 4.106546854942233, + "grad_norm": 2.436768054962158, + "learning_rate": 1.2311938382541721e-05, + "loss": 0.6564, + "step": 3199 + }, + { + "epoch": 4.10783055198973, + "grad_norm": 7.69277811050415, + "learning_rate": 1.231578947368421e-05, + "loss": 0.7492, + "step": 3200 + }, + { + "epoch": 4.109114249037227, + "grad_norm": 5.1054911613464355, + "learning_rate": 1.2319640564826702e-05, + "loss": 0.7353, + "step": 3201 + }, + { + "epoch": 4.110397946084724, + "grad_norm": 1.6034210920333862, + 
"learning_rate": 1.2323491655969193e-05, + "loss": 0.668, + "step": 3202 + }, + { + "epoch": 4.111681643132221, + "grad_norm": 2.965888023376465, + "learning_rate": 1.2327342747111682e-05, + "loss": 0.7328, + "step": 3203 + }, + { + "epoch": 4.112965340179717, + "grad_norm": 1.8310799598693848, + "learning_rate": 1.2331193838254173e-05, + "loss": 0.674, + "step": 3204 + }, + { + "epoch": 4.114249037227214, + "grad_norm": 3.0915000438690186, + "learning_rate": 1.2335044929396662e-05, + "loss": 0.7518, + "step": 3205 + }, + { + "epoch": 4.115532734274711, + "grad_norm": 5.282622337341309, + "learning_rate": 1.2338896020539154e-05, + "loss": 0.7074, + "step": 3206 + }, + { + "epoch": 4.116816431322208, + "grad_norm": 2.6369762420654297, + "learning_rate": 1.2342747111681643e-05, + "loss": 0.7604, + "step": 3207 + }, + { + "epoch": 4.118100128369705, + "grad_norm": 2.3682217597961426, + "learning_rate": 1.2346598202824134e-05, + "loss": 0.7016, + "step": 3208 + }, + { + "epoch": 4.119383825417201, + "grad_norm": 2.4278786182403564, + "learning_rate": 1.2350449293966623e-05, + "loss": 0.7118, + "step": 3209 + }, + { + "epoch": 4.120667522464698, + "grad_norm": 2.5555758476257324, + "learning_rate": 1.2354300385109114e-05, + "loss": 0.6998, + "step": 3210 + }, + { + "epoch": 4.121951219512195, + "grad_norm": 3.9388766288757324, + "learning_rate": 1.2358151476251605e-05, + "loss": 0.7185, + "step": 3211 + }, + { + "epoch": 4.123234916559692, + "grad_norm": 4.523067951202393, + "learning_rate": 1.2362002567394095e-05, + "loss": 0.7284, + "step": 3212 + }, + { + "epoch": 4.124518613607188, + "grad_norm": 4.143263816833496, + "learning_rate": 1.2365853658536584e-05, + "loss": 0.7095, + "step": 3213 + }, + { + "epoch": 4.125802310654685, + "grad_norm": 3.2779784202575684, + "learning_rate": 1.2369704749679077e-05, + "loss": 0.8067, + "step": 3214 + }, + { + "epoch": 4.127086007702182, + "grad_norm": 4.193215370178223, + "learning_rate": 1.2373555840821566e-05, + "loss": 
0.8115, + "step": 3215 + }, + { + "epoch": 4.128369704749679, + "grad_norm": 3.4954092502593994, + "learning_rate": 1.2377406931964056e-05, + "loss": 0.9563, + "step": 3216 + }, + { + "epoch": 4.129653401797176, + "grad_norm": 2.9369983673095703, + "learning_rate": 1.2381258023106547e-05, + "loss": 0.644, + "step": 3217 + }, + { + "epoch": 4.130937098844672, + "grad_norm": 1.322764277458191, + "learning_rate": 1.2385109114249038e-05, + "loss": 0.6449, + "step": 3218 + }, + { + "epoch": 4.132220795892169, + "grad_norm": 1.7610279321670532, + "learning_rate": 1.2388960205391527e-05, + "loss": 0.6403, + "step": 3219 + }, + { + "epoch": 4.133504492939666, + "grad_norm": 1.989323616027832, + "learning_rate": 1.2392811296534018e-05, + "loss": 0.63, + "step": 3220 + }, + { + "epoch": 4.134788189987163, + "grad_norm": 2.0439293384552, + "learning_rate": 1.2396662387676508e-05, + "loss": 0.6161, + "step": 3221 + }, + { + "epoch": 4.13607188703466, + "grad_norm": 3.112009048461914, + "learning_rate": 1.2400513478818999e-05, + "loss": 0.6313, + "step": 3222 + }, + { + "epoch": 4.137355584082156, + "grad_norm": 1.2900968790054321, + "learning_rate": 1.240436456996149e-05, + "loss": 0.6401, + "step": 3223 + }, + { + "epoch": 4.138639281129653, + "grad_norm": 1.615221381187439, + "learning_rate": 1.240821566110398e-05, + "loss": 0.6528, + "step": 3224 + }, + { + "epoch": 4.13992297817715, + "grad_norm": 3.080510139465332, + "learning_rate": 1.241206675224647e-05, + "loss": 0.6561, + "step": 3225 + }, + { + "epoch": 4.141206675224647, + "grad_norm": 3.344534158706665, + "learning_rate": 1.2415917843388961e-05, + "loss": 0.6314, + "step": 3226 + }, + { + "epoch": 4.142490372272144, + "grad_norm": 2.2957170009613037, + "learning_rate": 1.241976893453145e-05, + "loss": 0.6227, + "step": 3227 + }, + { + "epoch": 4.14377406931964, + "grad_norm": 2.788856029510498, + "learning_rate": 1.242362002567394e-05, + "loss": 0.6484, + "step": 3228 + }, + { + "epoch": 4.145057766367137, + 
"grad_norm": 2.0284781455993652, + "learning_rate": 1.2427471116816433e-05, + "loss": 0.6373, + "step": 3229 + }, + { + "epoch": 4.146341463414634, + "grad_norm": 1.4312763214111328, + "learning_rate": 1.2431322207958922e-05, + "loss": 0.6478, + "step": 3230 + }, + { + "epoch": 4.147625160462131, + "grad_norm": 2.751462459564209, + "learning_rate": 1.2435173299101412e-05, + "loss": 0.6582, + "step": 3231 + }, + { + "epoch": 4.148908857509627, + "grad_norm": 1.8752819299697876, + "learning_rate": 1.2439024390243903e-05, + "loss": 0.676, + "step": 3232 + }, + { + "epoch": 4.150192554557124, + "grad_norm": 3.828355550765991, + "learning_rate": 1.2442875481386394e-05, + "loss": 0.6459, + "step": 3233 + }, + { + "epoch": 4.151476251604621, + "grad_norm": 1.2234357595443726, + "learning_rate": 1.2446726572528883e-05, + "loss": 0.6441, + "step": 3234 + }, + { + "epoch": 4.152759948652118, + "grad_norm": 2.829296827316284, + "learning_rate": 1.2450577663671374e-05, + "loss": 0.6934, + "step": 3235 + }, + { + "epoch": 4.154043645699615, + "grad_norm": 1.7559527158737183, + "learning_rate": 1.2454428754813864e-05, + "loss": 0.6688, + "step": 3236 + }, + { + "epoch": 4.155327342747111, + "grad_norm": 2.8839492797851562, + "learning_rate": 1.2458279845956355e-05, + "loss": 0.64, + "step": 3237 + }, + { + "epoch": 4.156611039794608, + "grad_norm": 2.513946056365967, + "learning_rate": 1.2462130937098846e-05, + "loss": 0.6694, + "step": 3238 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 3.007761240005493, + "learning_rate": 1.2465982028241335e-05, + "loss": 0.6827, + "step": 3239 + }, + { + "epoch": 4.159178433889602, + "grad_norm": 1.8453326225280762, + "learning_rate": 1.2469833119383825e-05, + "loss": 0.6637, + "step": 3240 + }, + { + "epoch": 4.160462130937099, + "grad_norm": 1.9659446477890015, + "learning_rate": 1.2473684210526317e-05, + "loss": 0.652, + "step": 3241 + }, + { + "epoch": 4.161745827984595, + "grad_norm": 2.310500383377075, + "learning_rate": 
1.2477535301668807e-05, + "loss": 0.6885, + "step": 3242 + }, + { + "epoch": 4.163029525032092, + "grad_norm": 1.6308566331863403, + "learning_rate": 1.2481386392811296e-05, + "loss": 0.6334, + "step": 3243 + }, + { + "epoch": 4.164313222079589, + "grad_norm": 1.7294384241104126, + "learning_rate": 1.2485237483953787e-05, + "loss": 0.6664, + "step": 3244 + }, + { + "epoch": 4.165596919127086, + "grad_norm": 2.9325320720672607, + "learning_rate": 1.2489088575096278e-05, + "loss": 0.7009, + "step": 3245 + }, + { + "epoch": 4.166880616174582, + "grad_norm": 2.216224193572998, + "learning_rate": 1.2492939666238768e-05, + "loss": 0.6765, + "step": 3246 + }, + { + "epoch": 4.168164313222079, + "grad_norm": 5.176456928253174, + "learning_rate": 1.2496790757381259e-05, + "loss": 0.6575, + "step": 3247 + }, + { + "epoch": 4.169448010269576, + "grad_norm": 8.30102825164795, + "learning_rate": 1.250064184852375e-05, + "loss": 0.6505, + "step": 3248 + }, + { + "epoch": 4.170731707317073, + "grad_norm": 2.927788257598877, + "learning_rate": 1.2504492939666239e-05, + "loss": 0.6854, + "step": 3249 + }, + { + "epoch": 4.17201540436457, + "grad_norm": 1.6757737398147583, + "learning_rate": 1.250834403080873e-05, + "loss": 0.6864, + "step": 3250 + }, + { + "epoch": 4.173299101412066, + "grad_norm": 1.9092236757278442, + "learning_rate": 1.251219512195122e-05, + "loss": 0.7057, + "step": 3251 + }, + { + "epoch": 4.174582798459563, + "grad_norm": 3.1402931213378906, + "learning_rate": 1.251604621309371e-05, + "loss": 0.666, + "step": 3252 + }, + { + "epoch": 4.17586649550706, + "grad_norm": 3.3460328578948975, + "learning_rate": 1.2519897304236202e-05, + "loss": 0.7989, + "step": 3253 + }, + { + "epoch": 4.177150192554557, + "grad_norm": 3.9069247245788574, + "learning_rate": 1.2523748395378691e-05, + "loss": 0.7149, + "step": 3254 + }, + { + "epoch": 4.178433889602054, + "grad_norm": 3.1910011768341064, + "learning_rate": 1.252759948652118e-05, + "loss": 0.6858, + "step": 3255 + }, 
+ { + "epoch": 4.17971758664955, + "grad_norm": 2.2810497283935547, + "learning_rate": 1.2531450577663671e-05, + "loss": 0.704, + "step": 3256 + }, + { + "epoch": 4.181001283697047, + "grad_norm": 2.441014289855957, + "learning_rate": 1.2535301668806163e-05, + "loss": 0.6847, + "step": 3257 + }, + { + "epoch": 4.182284980744544, + "grad_norm": null, + "learning_rate": 1.2535301668806163e-05, + "loss": 0.7123, + "step": 3258 + }, + { + "epoch": 4.183568677792041, + "grad_norm": 2.775900363922119, + "learning_rate": 1.2539152759948652e-05, + "loss": 0.7964, + "step": 3259 + }, + { + "epoch": 4.184852374839538, + "grad_norm": 4.798354148864746, + "learning_rate": 1.2543003851091141e-05, + "loss": 0.7221, + "step": 3260 + }, + { + "epoch": 4.186136071887034, + "grad_norm": 2.5993735790252686, + "learning_rate": 1.2546854942233634e-05, + "loss": 0.7721, + "step": 3261 + }, + { + "epoch": 4.187419768934531, + "grad_norm": 6.680633544921875, + "learning_rate": 1.2550706033376123e-05, + "loss": 0.7545, + "step": 3262 + }, + { + "epoch": 4.188703465982028, + "grad_norm": 2.2058603763580322, + "learning_rate": 1.2554557124518613e-05, + "loss": 0.777, + "step": 3263 + }, + { + "epoch": 4.189987163029525, + "grad_norm": 3.692336082458496, + "learning_rate": 1.2558408215661104e-05, + "loss": 0.7699, + "step": 3264 + }, + { + "epoch": 4.191270860077021, + "grad_norm": 4.424408435821533, + "learning_rate": 1.2562259306803595e-05, + "loss": 0.8376, + "step": 3265 + }, + { + "epoch": 4.192554557124518, + "grad_norm": 4.78484582901001, + "learning_rate": 1.2566110397946084e-05, + "loss": 0.9943, + "step": 3266 + }, + { + "epoch": 4.193838254172015, + "grad_norm": 2.2205965518951416, + "learning_rate": 1.2569961489088575e-05, + "loss": 0.6549, + "step": 3267 + }, + { + "epoch": 4.195121951219512, + "grad_norm": 1.739512324333191, + "learning_rate": 1.2573812580231066e-05, + "loss": 0.6186, + "step": 3268 + }, + { + "epoch": 4.196405648267009, + "grad_norm": 2.061314821243286, + 
"learning_rate": 1.2577663671373556e-05, + "loss": 0.6683, + "step": 3269 + }, + { + "epoch": 4.197689345314505, + "grad_norm": 2.3205835819244385, + "learning_rate": 1.2581514762516047e-05, + "loss": 0.6324, + "step": 3270 + }, + { + "epoch": 4.198973042362002, + "grad_norm": 1.6203197240829468, + "learning_rate": 1.2585365853658536e-05, + "loss": 0.6725, + "step": 3271 + }, + { + "epoch": 4.200256739409499, + "grad_norm": 1.6820629835128784, + "learning_rate": 1.2589216944801027e-05, + "loss": 0.651, + "step": 3272 + }, + { + "epoch": 4.201540436456996, + "grad_norm": 3.8214943408966064, + "learning_rate": 1.2593068035943518e-05, + "loss": 0.6676, + "step": 3273 + }, + { + "epoch": 4.202824133504493, + "grad_norm": 2.1532063484191895, + "learning_rate": 1.2596919127086008e-05, + "loss": 0.6449, + "step": 3274 + }, + { + "epoch": 4.2041078305519894, + "grad_norm": 4.421689510345459, + "learning_rate": 1.2600770218228497e-05, + "loss": 0.6368, + "step": 3275 + }, + { + "epoch": 4.205391527599486, + "grad_norm": 1.7917817831039429, + "learning_rate": 1.260462130937099e-05, + "loss": 0.662, + "step": 3276 + }, + { + "epoch": 4.206675224646983, + "grad_norm": 2.824801445007324, + "learning_rate": 1.260847240051348e-05, + "loss": 0.6357, + "step": 3277 + }, + { + "epoch": 4.20795892169448, + "grad_norm": 3.7081258296966553, + "learning_rate": 1.2612323491655969e-05, + "loss": 0.7074, + "step": 3278 + }, + { + "epoch": 4.2092426187419765, + "grad_norm": 1.6545590162277222, + "learning_rate": 1.261617458279846e-05, + "loss": 0.5993, + "step": 3279 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 1.766332745552063, + "learning_rate": 1.262002567394095e-05, + "loss": 0.6699, + "step": 3280 + }, + { + "epoch": 4.21181001283697, + "grad_norm": 2.8286514282226562, + "learning_rate": 1.262387676508344e-05, + "loss": 0.6778, + "step": 3281 + }, + { + "epoch": 4.213093709884467, + "grad_norm": 1.737268328666687, + "learning_rate": 1.2627727856225931e-05, + "loss": 0.6469, 
+ "step": 3282 + }, + { + "epoch": 4.214377406931964, + "grad_norm": 2.8411800861358643, + "learning_rate": 1.263157894736842e-05, + "loss": 0.6213, + "step": 3283 + }, + { + "epoch": 4.2156611039794605, + "grad_norm": 1.7574305534362793, + "learning_rate": 1.2635430038510912e-05, + "loss": 0.6247, + "step": 3284 + }, + { + "epoch": 4.2169448010269575, + "grad_norm": 3.4624366760253906, + "learning_rate": 1.2639281129653403e-05, + "loss": 0.6455, + "step": 3285 + }, + { + "epoch": 4.218228498074454, + "grad_norm": 2.7046523094177246, + "learning_rate": 1.2643132220795892e-05, + "loss": 0.6576, + "step": 3286 + }, + { + "epoch": 4.219512195121951, + "grad_norm": 3.389801025390625, + "learning_rate": 1.2646983311938382e-05, + "loss": 0.6232, + "step": 3287 + }, + { + "epoch": 4.220795892169448, + "grad_norm": 3.0457687377929688, + "learning_rate": 1.2650834403080874e-05, + "loss": 0.7106, + "step": 3288 + }, + { + "epoch": 4.2220795892169445, + "grad_norm": 2.6800785064697266, + "learning_rate": 1.2654685494223364e-05, + "loss": 0.6307, + "step": 3289 + }, + { + "epoch": 4.2233632862644415, + "grad_norm": 1.977806568145752, + "learning_rate": 1.2658536585365853e-05, + "loss": 0.7103, + "step": 3290 + }, + { + "epoch": 4.224646983311938, + "grad_norm": 3.8138792514801025, + "learning_rate": 1.2662387676508346e-05, + "loss": 0.6209, + "step": 3291 + }, + { + "epoch": 4.225930680359435, + "grad_norm": 4.2376556396484375, + "learning_rate": 1.2666238767650835e-05, + "loss": 0.6815, + "step": 3292 + }, + { + "epoch": 4.227214377406932, + "grad_norm": 2.4886062145233154, + "learning_rate": 1.2670089858793325e-05, + "loss": 0.6749, + "step": 3293 + }, + { + "epoch": 4.2284980744544285, + "grad_norm": 7.602123737335205, + "learning_rate": 1.2673940949935816e-05, + "loss": 0.6522, + "step": 3294 + }, + { + "epoch": 4.2297817715019255, + "grad_norm": 5.310977458953857, + "learning_rate": 1.2677792041078307e-05, + "loss": 0.6768, + "step": 3295 + }, + { + "epoch": 
4.2310654685494224, + "grad_norm": 3.4782912731170654, + "learning_rate": 1.2681643132220796e-05, + "loss": 0.7205, + "step": 3296 + }, + { + "epoch": 4.232349165596919, + "grad_norm": 2.5888617038726807, + "learning_rate": 1.2685494223363287e-05, + "loss": 0.7188, + "step": 3297 + }, + { + "epoch": 4.2336328626444155, + "grad_norm": 1.8210296630859375, + "learning_rate": 1.2689345314505777e-05, + "loss": 0.6702, + "step": 3298 + }, + { + "epoch": 4.2349165596919125, + "grad_norm": 4.391339302062988, + "learning_rate": 1.2693196405648268e-05, + "loss": 0.6741, + "step": 3299 + }, + { + "epoch": 4.2362002567394095, + "grad_norm": 2.958566427230835, + "learning_rate": 1.2697047496790759e-05, + "loss": 0.6702, + "step": 3300 + }, + { + "epoch": 4.2374839537869065, + "grad_norm": 2.124781847000122, + "learning_rate": 1.2700898587933248e-05, + "loss": 0.684, + "step": 3301 + }, + { + "epoch": 4.238767650834403, + "grad_norm": 2.6849448680877686, + "learning_rate": 1.2704749679075737e-05, + "loss": 0.7062, + "step": 3302 + }, + { + "epoch": 4.2400513478818995, + "grad_norm": 3.217407464981079, + "learning_rate": 1.270860077021823e-05, + "loss": 0.683, + "step": 3303 + }, + { + "epoch": 4.2413350449293965, + "grad_norm": 9.984580993652344, + "learning_rate": 1.271245186136072e-05, + "loss": 0.7524, + "step": 3304 + }, + { + "epoch": 4.2426187419768935, + "grad_norm": 2.87575101852417, + "learning_rate": 1.2716302952503209e-05, + "loss": 0.7385, + "step": 3305 + }, + { + "epoch": 4.2439024390243905, + "grad_norm": 2.5191307067871094, + "learning_rate": 1.27201540436457e-05, + "loss": 0.7422, + "step": 3306 + }, + { + "epoch": 4.245186136071887, + "grad_norm": 5.960726737976074, + "learning_rate": 1.2724005134788191e-05, + "loss": 0.7038, + "step": 3307 + }, + { + "epoch": 4.2464698331193835, + "grad_norm": 2.625760078430176, + "learning_rate": 1.272785622593068e-05, + "loss": 0.7368, + "step": 3308 + }, + { + "epoch": 4.2477535301668805, + "grad_norm": 4.800004959106445, + 
"learning_rate": 1.2731707317073172e-05, + "loss": 0.7713, + "step": 3309 + }, + { + "epoch": 4.2490372272143775, + "grad_norm": 4.785364151000977, + "learning_rate": 1.2735558408215661e-05, + "loss": 0.7337, + "step": 3310 + }, + { + "epoch": 4.2503209242618745, + "grad_norm": 2.6971545219421387, + "learning_rate": 1.2739409499358152e-05, + "loss": 0.8075, + "step": 3311 + }, + { + "epoch": 4.251604621309371, + "grad_norm": 2.6664226055145264, + "learning_rate": 1.2743260590500641e-05, + "loss": 0.7896, + "step": 3312 + }, + { + "epoch": 4.2528883183568675, + "grad_norm": 5.76943302154541, + "learning_rate": 1.2747111681643132e-05, + "loss": 0.7806, + "step": 3313 + }, + { + "epoch": 4.2541720154043645, + "grad_norm": 4.64140510559082, + "learning_rate": 1.2750962772785624e-05, + "loss": 0.8247, + "step": 3314 + }, + { + "epoch": 4.2554557124518615, + "grad_norm": 4.682523250579834, + "learning_rate": 1.2754813863928113e-05, + "loss": 0.8419, + "step": 3315 + }, + { + "epoch": 4.2567394094993585, + "grad_norm": 3.1252293586730957, + "learning_rate": 1.2758664955070604e-05, + "loss": 0.9965, + "step": 3316 + }, + { + "epoch": 4.258023106546855, + "grad_norm": 3.1876657009124756, + "learning_rate": 1.2762516046213093e-05, + "loss": 0.6899, + "step": 3317 + }, + { + "epoch": 4.2593068035943515, + "grad_norm": 2.831209421157837, + "learning_rate": 1.2766367137355584e-05, + "loss": 0.6312, + "step": 3318 + }, + { + "epoch": 4.2605905006418485, + "grad_norm": 2.1277387142181396, + "learning_rate": 1.2770218228498075e-05, + "loss": 0.6632, + "step": 3319 + }, + { + "epoch": 4.2618741976893455, + "grad_norm": 4.691371440887451, + "learning_rate": 1.2774069319640565e-05, + "loss": 0.6881, + "step": 3320 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 1.6199148893356323, + "learning_rate": 1.2777920410783054e-05, + "loss": 0.6384, + "step": 3321 + }, + { + "epoch": 4.264441591784339, + "grad_norm": 2.749446153640747, + "learning_rate": 1.2781771501925547e-05, + 
"loss": 0.6292, + "step": 3322 + }, + { + "epoch": 4.2657252888318355, + "grad_norm": 1.4867244958877563, + "learning_rate": 1.2785622593068036e-05, + "loss": 0.6307, + "step": 3323 + }, + { + "epoch": 4.2670089858793325, + "grad_norm": 1.8495131731033325, + "learning_rate": 1.2789473684210526e-05, + "loss": 0.6431, + "step": 3324 + }, + { + "epoch": 4.2682926829268295, + "grad_norm": 3.292001962661743, + "learning_rate": 1.2793324775353017e-05, + "loss": 0.6443, + "step": 3325 + }, + { + "epoch": 4.2695763799743265, + "grad_norm": 2.06020188331604, + "learning_rate": 1.2797175866495508e-05, + "loss": 0.6342, + "step": 3326 + }, + { + "epoch": 4.270860077021823, + "grad_norm": 3.6539504528045654, + "learning_rate": 1.2801026957637997e-05, + "loss": 0.6302, + "step": 3327 + }, + { + "epoch": 4.2721437740693196, + "grad_norm": 1.6358009576797485, + "learning_rate": 1.2804878048780488e-05, + "loss": 0.6823, + "step": 3328 + }, + { + "epoch": 4.2734274711168165, + "grad_norm": 2.116194486618042, + "learning_rate": 1.2808729139922978e-05, + "loss": 0.6636, + "step": 3329 + }, + { + "epoch": 4.2747111681643135, + "grad_norm": 1.4569824934005737, + "learning_rate": 1.2812580231065469e-05, + "loss": 0.6427, + "step": 3330 + }, + { + "epoch": 4.27599486521181, + "grad_norm": 6.621343612670898, + "learning_rate": 1.281643132220796e-05, + "loss": 0.6413, + "step": 3331 + }, + { + "epoch": 4.277278562259307, + "grad_norm": 3.8457934856414795, + "learning_rate": 1.282028241335045e-05, + "loss": 0.6273, + "step": 3332 + }, + { + "epoch": 4.278562259306804, + "grad_norm": 3.015479326248169, + "learning_rate": 1.2824133504492939e-05, + "loss": 0.6222, + "step": 3333 + }, + { + "epoch": 4.2798459563543005, + "grad_norm": 2.0153677463531494, + "learning_rate": 1.2827984595635431e-05, + "loss": 0.6489, + "step": 3334 + }, + { + "epoch": 4.2811296534017975, + "grad_norm": 2.397758960723877, + "learning_rate": 1.283183568677792e-05, + "loss": 0.6678, + "step": 3335 + }, + { + "epoch": 
4.282413350449294, + "grad_norm": 2.309772491455078, + "learning_rate": 1.283568677792041e-05, + "loss": 0.6387, + "step": 3336 + }, + { + "epoch": 4.283697047496791, + "grad_norm": 2.726792573928833, + "learning_rate": 1.2839537869062903e-05, + "loss": 0.6364, + "step": 3337 + }, + { + "epoch": 4.284980744544288, + "grad_norm": 2.1724369525909424, + "learning_rate": 1.2843388960205392e-05, + "loss": 0.6453, + "step": 3338 + }, + { + "epoch": 4.2862644415917845, + "grad_norm": 2.9143269062042236, + "learning_rate": 1.2847240051347882e-05, + "loss": 0.6663, + "step": 3339 + }, + { + "epoch": 4.2875481386392815, + "grad_norm": 1.9788635969161987, + "learning_rate": 1.2851091142490373e-05, + "loss": 0.6316, + "step": 3340 + }, + { + "epoch": 4.288831835686778, + "grad_norm": 5.0425124168396, + "learning_rate": 1.2854942233632864e-05, + "loss": 0.6487, + "step": 3341 + }, + { + "epoch": 4.290115532734275, + "grad_norm": 4.086379528045654, + "learning_rate": 1.2858793324775353e-05, + "loss": 0.6426, + "step": 3342 + }, + { + "epoch": 4.291399229781772, + "grad_norm": 2.2067718505859375, + "learning_rate": 1.2862644415917844e-05, + "loss": 0.6112, + "step": 3343 + }, + { + "epoch": 4.2926829268292686, + "grad_norm": 2.015493392944336, + "learning_rate": 1.2866495507060334e-05, + "loss": 0.6562, + "step": 3344 + }, + { + "epoch": 4.293966623876765, + "grad_norm": 3.187141180038452, + "learning_rate": 1.2870346598202825e-05, + "loss": 0.6168, + "step": 3345 + }, + { + "epoch": 4.295250320924262, + "grad_norm": 2.0046045780181885, + "learning_rate": 1.2874197689345316e-05, + "loss": 0.6546, + "step": 3346 + }, + { + "epoch": 4.296534017971759, + "grad_norm": 2.2802236080169678, + "learning_rate": 1.2878048780487805e-05, + "loss": 0.6694, + "step": 3347 + }, + { + "epoch": 4.297817715019256, + "grad_norm": 3.168705701828003, + "learning_rate": 1.2881899871630294e-05, + "loss": 0.6197, + "step": 3348 + }, + { + "epoch": 4.299101412066753, + "grad_norm": 1.9617705345153809, + 
"learning_rate": 1.2885750962772787e-05, + "loss": 0.7297, + "step": 3349 + }, + { + "epoch": 4.300385109114249, + "grad_norm": 3.32619047164917, + "learning_rate": 1.2889602053915277e-05, + "loss": 0.7253, + "step": 3350 + }, + { + "epoch": 4.301668806161746, + "grad_norm": 2.270246744155884, + "learning_rate": 1.2893453145057766e-05, + "loss": 0.6452, + "step": 3351 + }, + { + "epoch": 4.302952503209243, + "grad_norm": 2.416959524154663, + "learning_rate": 1.2897304236200257e-05, + "loss": 0.6621, + "step": 3352 + }, + { + "epoch": 4.30423620025674, + "grad_norm": 2.6204679012298584, + "learning_rate": 1.2901155327342748e-05, + "loss": 0.7028, + "step": 3353 + }, + { + "epoch": 4.305519897304237, + "grad_norm": 2.1505017280578613, + "learning_rate": 1.2905006418485238e-05, + "loss": 0.709, + "step": 3354 + }, + { + "epoch": 4.306803594351733, + "grad_norm": 1.9130804538726807, + "learning_rate": 1.2908857509627729e-05, + "loss": 0.6811, + "step": 3355 + }, + { + "epoch": 4.30808729139923, + "grad_norm": 2.4412343502044678, + "learning_rate": 1.2912708600770218e-05, + "loss": 0.7228, + "step": 3356 + }, + { + "epoch": 4.309370988446727, + "grad_norm": 2.6887025833129883, + "learning_rate": 1.2916559691912709e-05, + "loss": 0.6527, + "step": 3357 + }, + { + "epoch": 4.310654685494224, + "grad_norm": 3.1685166358947754, + "learning_rate": 1.29204107830552e-05, + "loss": 0.7373, + "step": 3358 + }, + { + "epoch": 4.311938382541721, + "grad_norm": 7.258768558502197, + "learning_rate": 1.292426187419769e-05, + "loss": 0.7183, + "step": 3359 + }, + { + "epoch": 4.313222079589217, + "grad_norm": 2.6268441677093506, + "learning_rate": 1.292811296534018e-05, + "loss": 0.7343, + "step": 3360 + }, + { + "epoch": 4.314505776636714, + "grad_norm": 5.591504096984863, + "learning_rate": 1.293196405648267e-05, + "loss": 0.7448, + "step": 3361 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 4.589951992034912, + "learning_rate": 1.2935815147625161e-05, + "loss": 0.7039, + 
"step": 3362 + }, + { + "epoch": 4.317073170731708, + "grad_norm": 2.622962236404419, + "learning_rate": 1.293966623876765e-05, + "loss": 0.7121, + "step": 3363 + }, + { + "epoch": 4.318356867779205, + "grad_norm": 4.5382843017578125, + "learning_rate": 1.2943517329910141e-05, + "loss": 0.7743, + "step": 3364 + }, + { + "epoch": 4.319640564826701, + "grad_norm": 3.3144962787628174, + "learning_rate": 1.2947368421052633e-05, + "loss": 0.875, + "step": 3365 + }, + { + "epoch": 4.320924261874198, + "grad_norm": 4.204307556152344, + "learning_rate": 1.2951219512195122e-05, + "loss": 1.0023, + "step": 3366 + }, + { + "epoch": 4.322207958921695, + "grad_norm": 1.8162118196487427, + "learning_rate": 1.2955070603337611e-05, + "loss": 0.6216, + "step": 3367 + }, + { + "epoch": 4.323491655969192, + "grad_norm": 2.0635976791381836, + "learning_rate": 1.2958921694480104e-05, + "loss": 0.641, + "step": 3368 + }, + { + "epoch": 4.324775353016688, + "grad_norm": 2.2817070484161377, + "learning_rate": 1.2962772785622593e-05, + "loss": 0.6271, + "step": 3369 + }, + { + "epoch": 4.326059050064185, + "grad_norm": 2.6855313777923584, + "learning_rate": 1.2966623876765083e-05, + "loss": 0.6471, + "step": 3370 + }, + { + "epoch": 4.327342747111682, + "grad_norm": 1.9757068157196045, + "learning_rate": 1.2970474967907574e-05, + "loss": 0.6535, + "step": 3371 + }, + { + "epoch": 4.328626444159179, + "grad_norm": 2.6303892135620117, + "learning_rate": 1.2974326059050065e-05, + "loss": 0.6471, + "step": 3372 + }, + { + "epoch": 4.329910141206676, + "grad_norm": 2.8627967834472656, + "learning_rate": 1.2978177150192554e-05, + "loss": 0.6758, + "step": 3373 + }, + { + "epoch": 4.331193838254172, + "grad_norm": 3.3168795108795166, + "learning_rate": 1.2982028241335045e-05, + "loss": 0.6127, + "step": 3374 + }, + { + "epoch": 4.332477535301669, + "grad_norm": 2.6311402320861816, + "learning_rate": 1.2985879332477535e-05, + "loss": 0.6819, + "step": 3375 + }, + { + "epoch": 4.333761232349166, + 
"grad_norm": 1.676558494567871, + "learning_rate": 1.2989730423620026e-05, + "loss": 0.702, + "step": 3376 + }, + { + "epoch": 4.335044929396663, + "grad_norm": 2.634799003601074, + "learning_rate": 1.2993581514762517e-05, + "loss": 0.6405, + "step": 3377 + }, + { + "epoch": 4.336328626444159, + "grad_norm": 2.001227378845215, + "learning_rate": 1.2997432605905006e-05, + "loss": 0.6648, + "step": 3378 + }, + { + "epoch": 4.337612323491656, + "grad_norm": 2.470529556274414, + "learning_rate": 1.3001283697047496e-05, + "loss": 0.6025, + "step": 3379 + }, + { + "epoch": 4.338896020539153, + "grad_norm": 3.0058045387268066, + "learning_rate": 1.3005134788189988e-05, + "loss": 0.6954, + "step": 3380 + }, + { + "epoch": 4.34017971758665, + "grad_norm": 2.6539649963378906, + "learning_rate": 1.3008985879332478e-05, + "loss": 0.6636, + "step": 3381 + }, + { + "epoch": 4.341463414634147, + "grad_norm": 2.422954797744751, + "learning_rate": 1.3012836970474967e-05, + "loss": 0.6076, + "step": 3382 + }, + { + "epoch": 4.342747111681643, + "grad_norm": 2.7511067390441895, + "learning_rate": 1.301668806161746e-05, + "loss": 0.6901, + "step": 3383 + }, + { + "epoch": 4.34403080872914, + "grad_norm": 4.952669620513916, + "learning_rate": 1.302053915275995e-05, + "loss": 0.6337, + "step": 3384 + }, + { + "epoch": 4.345314505776637, + "grad_norm": 3.735468864440918, + "learning_rate": 1.3024390243902439e-05, + "loss": 0.6472, + "step": 3385 + }, + { + "epoch": 4.346598202824134, + "grad_norm": 2.404334545135498, + "learning_rate": 1.302824133504493e-05, + "loss": 0.6441, + "step": 3386 + }, + { + "epoch": 4.347881899871631, + "grad_norm": 2.1884498596191406, + "learning_rate": 1.303209242618742e-05, + "loss": 0.657, + "step": 3387 + }, + { + "epoch": 4.349165596919127, + "grad_norm": 4.3115668296813965, + "learning_rate": 1.303594351732991e-05, + "loss": 0.6616, + "step": 3388 + }, + { + "epoch": 4.350449293966624, + "grad_norm": 1.9935731887817383, + "learning_rate": 
1.3039794608472401e-05, + "loss": 0.6244, + "step": 3389 + }, + { + "epoch": 4.351732991014121, + "grad_norm": 2.6682074069976807, + "learning_rate": 1.304364569961489e-05, + "loss": 0.6472, + "step": 3390 + }, + { + "epoch": 4.353016688061618, + "grad_norm": 4.020568370819092, + "learning_rate": 1.3047496790757382e-05, + "loss": 0.721, + "step": 3391 + }, + { + "epoch": 4.354300385109115, + "grad_norm": 2.243328094482422, + "learning_rate": 1.3051347881899873e-05, + "loss": 0.619, + "step": 3392 + }, + { + "epoch": 4.355584082156611, + "grad_norm": 1.9237300157546997, + "learning_rate": 1.3055198973042362e-05, + "loss": 0.6625, + "step": 3393 + }, + { + "epoch": 4.356867779204108, + "grad_norm": 2.5841057300567627, + "learning_rate": 1.3059050064184852e-05, + "loss": 0.6731, + "step": 3394 + }, + { + "epoch": 4.358151476251605, + "grad_norm": 1.7472128868103027, + "learning_rate": 1.3062901155327344e-05, + "loss": 0.7192, + "step": 3395 + }, + { + "epoch": 4.359435173299102, + "grad_norm": 12.529579162597656, + "learning_rate": 1.3066752246469834e-05, + "loss": 0.6499, + "step": 3396 + }, + { + "epoch": 4.360718870346599, + "grad_norm": 5.454257965087891, + "learning_rate": 1.3070603337612323e-05, + "loss": 0.7418, + "step": 3397 + }, + { + "epoch": 4.362002567394095, + "grad_norm": 7.750865459442139, + "learning_rate": 1.3074454428754814e-05, + "loss": 0.6969, + "step": 3398 + }, + { + "epoch": 4.363286264441592, + "grad_norm": 2.8395538330078125, + "learning_rate": 1.3078305519897305e-05, + "loss": 0.6836, + "step": 3399 + }, + { + "epoch": 4.364569961489089, + "grad_norm": 4.086432933807373, + "learning_rate": 1.3082156611039795e-05, + "loss": 0.6924, + "step": 3400 + }, + { + "epoch": 4.365853658536586, + "grad_norm": 3.4866316318511963, + "learning_rate": 1.3086007702182286e-05, + "loss": 0.6552, + "step": 3401 + }, + { + "epoch": 4.367137355584082, + "grad_norm": 3.942544460296631, + "learning_rate": 1.3089858793324775e-05, + "loss": 0.6983, + "step": 3402 + 
}, + { + "epoch": 4.368421052631579, + "grad_norm": 5.723913192749023, + "learning_rate": 1.3093709884467266e-05, + "loss": 0.6694, + "step": 3403 + }, + { + "epoch": 4.369704749679076, + "grad_norm": 5.347942352294922, + "learning_rate": 1.3097560975609757e-05, + "loss": 0.6468, + "step": 3404 + }, + { + "epoch": 4.370988446726573, + "grad_norm": 6.626699447631836, + "learning_rate": 1.3101412066752247e-05, + "loss": 0.6546, + "step": 3405 + }, + { + "epoch": 4.37227214377407, + "grad_norm": 2.8508918285369873, + "learning_rate": 1.3105263157894738e-05, + "loss": 0.6876, + "step": 3406 + }, + { + "epoch": 4.373555840821566, + "grad_norm": 8.764588356018066, + "learning_rate": 1.3109114249037229e-05, + "loss": 0.7096, + "step": 3407 + }, + { + "epoch": 4.374839537869063, + "grad_norm": 3.5583200454711914, + "learning_rate": 1.3112965340179718e-05, + "loss": 0.7564, + "step": 3408 + }, + { + "epoch": 4.37612323491656, + "grad_norm": 4.106095790863037, + "learning_rate": 1.3116816431322207e-05, + "loss": 0.7103, + "step": 3409 + }, + { + "epoch": 4.377406931964057, + "grad_norm": 3.0100884437561035, + "learning_rate": 1.31206675224647e-05, + "loss": 0.7773, + "step": 3410 + }, + { + "epoch": 4.378690629011553, + "grad_norm": 3.3296449184417725, + "learning_rate": 1.312451861360719e-05, + "loss": 0.71, + "step": 3411 + }, + { + "epoch": 4.37997432605905, + "grad_norm": 4.962299346923828, + "learning_rate": 1.3128369704749679e-05, + "loss": 0.7438, + "step": 3412 + }, + { + "epoch": 4.381258023106547, + "grad_norm": 3.1759026050567627, + "learning_rate": 1.3132220795892168e-05, + "loss": 0.7821, + "step": 3413 + }, + { + "epoch": 4.382541720154044, + "grad_norm": 6.120744705200195, + "learning_rate": 1.3136071887034661e-05, + "loss": 0.7733, + "step": 3414 + }, + { + "epoch": 4.383825417201541, + "grad_norm": 2.725799083709717, + "learning_rate": 1.313992297817715e-05, + "loss": 0.8693, + "step": 3415 + }, + { + "epoch": 4.385109114249037, + "grad_norm": 
5.931942462921143, + "learning_rate": 1.314377406931964e-05, + "loss": 1.008, + "step": 3416 + }, + { + "epoch": 4.386392811296534, + "grad_norm": 2.481506824493408, + "learning_rate": 1.3147625160462131e-05, + "loss": 0.6421, + "step": 3417 + }, + { + "epoch": 4.387676508344031, + "grad_norm": 2.612780809402466, + "learning_rate": 1.3151476251604622e-05, + "loss": 0.6415, + "step": 3418 + }, + { + "epoch": 4.388960205391528, + "grad_norm": 1.9243428707122803, + "learning_rate": 1.3155327342747111e-05, + "loss": 0.6392, + "step": 3419 + }, + { + "epoch": 4.390243902439025, + "grad_norm": 2.1053013801574707, + "learning_rate": 1.3159178433889602e-05, + "loss": 0.5968, + "step": 3420 + }, + { + "epoch": 4.391527599486521, + "grad_norm": 2.0879874229431152, + "learning_rate": 1.3163029525032092e-05, + "loss": 0.6601, + "step": 3421 + }, + { + "epoch": 4.392811296534018, + "grad_norm": 2.470064640045166, + "learning_rate": 1.3166880616174583e-05, + "loss": 0.6207, + "step": 3422 + }, + { + "epoch": 4.394094993581515, + "grad_norm": 2.0495553016662598, + "learning_rate": 1.3170731707317074e-05, + "loss": 0.6124, + "step": 3423 + }, + { + "epoch": 4.395378690629012, + "grad_norm": 2.1566343307495117, + "learning_rate": 1.3174582798459563e-05, + "loss": 0.6647, + "step": 3424 + }, + { + "epoch": 4.396662387676509, + "grad_norm": 1.8400535583496094, + "learning_rate": 1.3178433889602053e-05, + "loss": 0.6424, + "step": 3425 + }, + { + "epoch": 4.397946084724005, + "grad_norm": 9.514046669006348, + "learning_rate": 1.3182284980744545e-05, + "loss": 0.641, + "step": 3426 + }, + { + "epoch": 4.399229781771502, + "grad_norm": 1.7749872207641602, + "learning_rate": 1.3186136071887035e-05, + "loss": 0.5985, + "step": 3427 + }, + { + "epoch": 4.400513478818999, + "grad_norm": 6.5749192237854, + "learning_rate": 1.3189987163029524e-05, + "loss": 0.673, + "step": 3428 + }, + { + "epoch": 4.401797175866496, + "grad_norm": 3.777582883834839, + "learning_rate": 1.3193838254172017e-05, 
+ "loss": 0.6564, + "step": 3429 + }, + { + "epoch": 4.403080872913993, + "grad_norm": 1.6176332235336304, + "learning_rate": 1.3197689345314506e-05, + "loss": 0.6661, + "step": 3430 + }, + { + "epoch": 4.404364569961489, + "grad_norm": 2.4316396713256836, + "learning_rate": 1.3201540436456996e-05, + "loss": 0.6191, + "step": 3431 + }, + { + "epoch": 4.405648267008986, + "grad_norm": 1.5059982538223267, + "learning_rate": 1.3205391527599487e-05, + "loss": 0.6412, + "step": 3432 + }, + { + "epoch": 4.406931964056483, + "grad_norm": 2.857445478439331, + "learning_rate": 1.3209242618741978e-05, + "loss": 0.6426, + "step": 3433 + }, + { + "epoch": 4.40821566110398, + "grad_norm": 1.6902027130126953, + "learning_rate": 1.3213093709884467e-05, + "loss": 0.6486, + "step": 3434 + }, + { + "epoch": 4.409499358151476, + "grad_norm": 1.8003451824188232, + "learning_rate": 1.3216944801026958e-05, + "loss": 0.6378, + "step": 3435 + }, + { + "epoch": 4.410783055198973, + "grad_norm": 1.8134925365447998, + "learning_rate": 1.3220795892169448e-05, + "loss": 0.6331, + "step": 3436 + }, + { + "epoch": 4.41206675224647, + "grad_norm": 2.2862815856933594, + "learning_rate": 1.3224646983311939e-05, + "loss": 0.7003, + "step": 3437 + }, + { + "epoch": 4.413350449293967, + "grad_norm": 2.029926300048828, + "learning_rate": 1.322849807445443e-05, + "loss": 0.6727, + "step": 3438 + }, + { + "epoch": 4.414634146341464, + "grad_norm": 3.266576051712036, + "learning_rate": 1.323234916559692e-05, + "loss": 0.6339, + "step": 3439 + }, + { + "epoch": 4.41591784338896, + "grad_norm": 6.6075263023376465, + "learning_rate": 1.3236200256739409e-05, + "loss": 0.67, + "step": 3440 + }, + { + "epoch": 4.417201540436457, + "grad_norm": 4.637826442718506, + "learning_rate": 1.3240051347881901e-05, + "loss": 0.6805, + "step": 3441 + }, + { + "epoch": 4.418485237483954, + "grad_norm": 2.470994234085083, + "learning_rate": 1.324390243902439e-05, + "loss": 0.625, + "step": 3442 + }, + { + "epoch": 
4.419768934531451, + "grad_norm": 2.098045587539673, + "learning_rate": 1.324775353016688e-05, + "loss": 0.7308, + "step": 3443 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 2.6316492557525635, + "learning_rate": 1.3251604621309371e-05, + "loss": 0.6723, + "step": 3444 + }, + { + "epoch": 4.422336328626444, + "grad_norm": 2.294240713119507, + "learning_rate": 1.3255455712451862e-05, + "loss": 0.6806, + "step": 3445 + }, + { + "epoch": 4.423620025673941, + "grad_norm": 5.8277974128723145, + "learning_rate": 1.3259306803594352e-05, + "loss": 0.6835, + "step": 3446 + }, + { + "epoch": 4.424903722721438, + "grad_norm": 2.473039388656616, + "learning_rate": 1.3263157894736843e-05, + "loss": 0.6808, + "step": 3447 + }, + { + "epoch": 4.426187419768935, + "grad_norm": 1.6265640258789062, + "learning_rate": 1.3267008985879334e-05, + "loss": 0.6746, + "step": 3448 + }, + { + "epoch": 4.427471116816431, + "grad_norm": 3.581561803817749, + "learning_rate": 1.3270860077021823e-05, + "loss": 0.6854, + "step": 3449 + }, + { + "epoch": 4.428754813863928, + "grad_norm": 2.40073561668396, + "learning_rate": 1.3274711168164314e-05, + "loss": 0.763, + "step": 3450 + }, + { + "epoch": 4.430038510911425, + "grad_norm": 1.75571870803833, + "learning_rate": 1.3278562259306804e-05, + "loss": 0.6543, + "step": 3451 + }, + { + "epoch": 4.431322207958922, + "grad_norm": 2.348451614379883, + "learning_rate": 1.3282413350449295e-05, + "loss": 0.6496, + "step": 3452 + }, + { + "epoch": 4.432605905006419, + "grad_norm": 3.6198203563690186, + "learning_rate": 1.3286264441591786e-05, + "loss": 0.7, + "step": 3453 + }, + { + "epoch": 4.433889602053915, + "grad_norm": 12.183846473693848, + "learning_rate": 1.3290115532734275e-05, + "loss": 0.6887, + "step": 3454 + }, + { + "epoch": 4.435173299101412, + "grad_norm": 4.293573379516602, + "learning_rate": 1.3293966623876764e-05, + "loss": 0.7047, + "step": 3455 + }, + { + "epoch": 4.436456996148909, + "grad_norm": 3.5919382572174072, + 
"learning_rate": 1.3297817715019257e-05, + "loss": 0.7335, + "step": 3456 + }, + { + "epoch": 4.437740693196406, + "grad_norm": 2.016634702682495, + "learning_rate": 1.3301668806161747e-05, + "loss": 0.7308, + "step": 3457 + }, + { + "epoch": 4.439024390243903, + "grad_norm": 2.50534725189209, + "learning_rate": 1.3305519897304236e-05, + "loss": 0.7593, + "step": 3458 + }, + { + "epoch": 4.440308087291399, + "grad_norm": 2.4431846141815186, + "learning_rate": 1.3309370988446727e-05, + "loss": 0.7149, + "step": 3459 + }, + { + "epoch": 4.441591784338896, + "grad_norm": 2.536360740661621, + "learning_rate": 1.3313222079589218e-05, + "loss": 0.7246, + "step": 3460 + }, + { + "epoch": 4.442875481386393, + "grad_norm": 2.535590410232544, + "learning_rate": 1.3317073170731708e-05, + "loss": 0.7732, + "step": 3461 + }, + { + "epoch": 4.44415917843389, + "grad_norm": 1.9154232740402222, + "learning_rate": 1.3320924261874199e-05, + "loss": 0.7887, + "step": 3462 + }, + { + "epoch": 4.445442875481387, + "grad_norm": 3.507335901260376, + "learning_rate": 1.3324775353016688e-05, + "loss": 0.8136, + "step": 3463 + }, + { + "epoch": 4.446726572528883, + "grad_norm": 4.054792881011963, + "learning_rate": 1.3328626444159179e-05, + "loss": 0.8116, + "step": 3464 + }, + { + "epoch": 4.44801026957638, + "grad_norm": 3.3277053833007812, + "learning_rate": 1.3332477535301668e-05, + "loss": 0.834, + "step": 3465 + }, + { + "epoch": 4.449293966623877, + "grad_norm": 3.111039876937866, + "learning_rate": 1.333632862644416e-05, + "loss": 1.0027, + "step": 3466 + }, + { + "epoch": 4.450577663671374, + "grad_norm": 2.821777820587158, + "learning_rate": 1.3340179717586649e-05, + "loss": 0.662, + "step": 3467 + }, + { + "epoch": 4.45186136071887, + "grad_norm": 1.477008581161499, + "learning_rate": 1.334403080872914e-05, + "loss": 0.642, + "step": 3468 + }, + { + "epoch": 4.453145057766367, + "grad_norm": 4.2637619972229, + "learning_rate": 1.3347881899871631e-05, + "loss": 0.6596, + "step": 
3469 + }, + { + "epoch": 4.454428754813864, + "grad_norm": 4.656399726867676, + "learning_rate": 1.335173299101412e-05, + "loss": 0.6458, + "step": 3470 + }, + { + "epoch": 4.455712451861361, + "grad_norm": 1.721492052078247, + "learning_rate": 1.3355584082156611e-05, + "loss": 0.6721, + "step": 3471 + }, + { + "epoch": 4.456996148908858, + "grad_norm": 5.557583808898926, + "learning_rate": 1.3359435173299103e-05, + "loss": 0.6121, + "step": 3472 + }, + { + "epoch": 4.458279845956354, + "grad_norm": 1.1559371948242188, + "learning_rate": 1.3363286264441592e-05, + "loss": 0.6228, + "step": 3473 + }, + { + "epoch": 4.459563543003851, + "grad_norm": 1.8708699941635132, + "learning_rate": 1.3367137355584081e-05, + "loss": 0.622, + "step": 3474 + }, + { + "epoch": 4.460847240051348, + "grad_norm": 1.6656376123428345, + "learning_rate": 1.3370988446726574e-05, + "loss": 0.6107, + "step": 3475 + }, + { + "epoch": 4.462130937098845, + "grad_norm": 3.402296543121338, + "learning_rate": 1.3374839537869063e-05, + "loss": 0.6216, + "step": 3476 + }, + { + "epoch": 4.463414634146342, + "grad_norm": 1.843571424484253, + "learning_rate": 1.3378690629011553e-05, + "loss": 0.6905, + "step": 3477 + }, + { + "epoch": 4.464698331193838, + "grad_norm": 2.4576468467712402, + "learning_rate": 1.3382541720154044e-05, + "loss": 0.6333, + "step": 3478 + }, + { + "epoch": 4.465982028241335, + "grad_norm": 1.640250563621521, + "learning_rate": 1.3386392811296535e-05, + "loss": 0.6199, + "step": 3479 + }, + { + "epoch": 4.467265725288832, + "grad_norm": 2.413520336151123, + "learning_rate": 1.3390243902439024e-05, + "loss": 0.6702, + "step": 3480 + }, + { + "epoch": 4.468549422336329, + "grad_norm": 3.0589187145233154, + "learning_rate": 1.3394094993581515e-05, + "loss": 0.6337, + "step": 3481 + }, + { + "epoch": 4.469833119383825, + "grad_norm": 1.8749454021453857, + "learning_rate": 1.3397946084724005e-05, + "loss": 0.6372, + "step": 3482 + }, + { + "epoch": 4.471116816431322, + "grad_norm": 
1.603542685508728, + "learning_rate": 1.3401797175866496e-05, + "loss": 0.6558, + "step": 3483 + }, + { + "epoch": 4.472400513478819, + "grad_norm": 1.0290957689285278, + "learning_rate": 1.3405648267008987e-05, + "loss": 0.6427, + "step": 3484 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 1.9906377792358398, + "learning_rate": 1.3409499358151476e-05, + "loss": 0.6131, + "step": 3485 + }, + { + "epoch": 4.474967907573813, + "grad_norm": 2.4847214221954346, + "learning_rate": 1.3413350449293966e-05, + "loss": 0.6466, + "step": 3486 + }, + { + "epoch": 4.476251604621309, + "grad_norm": 5.198046684265137, + "learning_rate": 1.3417201540436458e-05, + "loss": 0.6823, + "step": 3487 + }, + { + "epoch": 4.477535301668806, + "grad_norm": 3.1284964084625244, + "learning_rate": 1.3421052631578948e-05, + "loss": 0.6909, + "step": 3488 + }, + { + "epoch": 4.478818998716303, + "grad_norm": 1.343506097793579, + "learning_rate": 1.3424903722721437e-05, + "loss": 0.6643, + "step": 3489 + }, + { + "epoch": 4.4801026957638, + "grad_norm": 3.4105255603790283, + "learning_rate": 1.3428754813863928e-05, + "loss": 0.623, + "step": 3490 + }, + { + "epoch": 4.481386392811297, + "grad_norm": 2.6384825706481934, + "learning_rate": 1.343260590500642e-05, + "loss": 0.6914, + "step": 3491 + }, + { + "epoch": 4.482670089858793, + "grad_norm": 1.6878842115402222, + "learning_rate": 1.3436456996148909e-05, + "loss": 0.6533, + "step": 3492 + }, + { + "epoch": 4.48395378690629, + "grad_norm": 1.2512280941009521, + "learning_rate": 1.34403080872914e-05, + "loss": 0.6376, + "step": 3493 + }, + { + "epoch": 4.485237483953787, + "grad_norm": 2.698636531829834, + "learning_rate": 1.344415917843389e-05, + "loss": 0.6318, + "step": 3494 + }, + { + "epoch": 4.486521181001284, + "grad_norm": 2.518793821334839, + "learning_rate": 1.344801026957638e-05, + "loss": 0.6183, + "step": 3495 + }, + { + "epoch": 4.487804878048781, + "grad_norm": 1.4921869039535522, + "learning_rate": 1.3451861360718871e-05, 
+ "loss": 0.5995, + "step": 3496 + }, + { + "epoch": 4.489088575096277, + "grad_norm": 3.611677646636963, + "learning_rate": 1.345571245186136e-05, + "loss": 0.6654, + "step": 3497 + }, + { + "epoch": 4.490372272143774, + "grad_norm": 4.262780666351318, + "learning_rate": 1.3459563543003852e-05, + "loss": 0.6941, + "step": 3498 + }, + { + "epoch": 4.491655969191271, + "grad_norm": 2.484379529953003, + "learning_rate": 1.3463414634146343e-05, + "loss": 0.6568, + "step": 3499 + }, + { + "epoch": 4.492939666238768, + "grad_norm": 2.9691572189331055, + "learning_rate": 1.3467265725288832e-05, + "loss": 0.6108, + "step": 3500 + }, + { + "epoch": 4.494223363286264, + "grad_norm": 7.789943695068359, + "learning_rate": 1.3471116816431322e-05, + "loss": 0.6625, + "step": 3501 + }, + { + "epoch": 4.495507060333761, + "grad_norm": 2.2699966430664062, + "learning_rate": 1.3474967907573814e-05, + "loss": 0.6767, + "step": 3502 + }, + { + "epoch": 4.496790757381258, + "grad_norm": 1.9700543880462646, + "learning_rate": 1.3478818998716304e-05, + "loss": 0.7247, + "step": 3503 + }, + { + "epoch": 4.498074454428755, + "grad_norm": 8.220869064331055, + "learning_rate": 1.3482670089858793e-05, + "loss": 0.6779, + "step": 3504 + }, + { + "epoch": 4.499358151476252, + "grad_norm": 1.37590754032135, + "learning_rate": 1.3486521181001284e-05, + "loss": 0.7061, + "step": 3505 + }, + { + "epoch": 4.500641848523748, + "grad_norm": 2.839165449142456, + "learning_rate": 1.3490372272143775e-05, + "loss": 0.6748, + "step": 3506 + }, + { + "epoch": 4.501925545571245, + "grad_norm": 1.8499342203140259, + "learning_rate": 1.3494223363286265e-05, + "loss": 0.6689, + "step": 3507 + }, + { + "epoch": 4.503209242618742, + "grad_norm": 1.9314569234848022, + "learning_rate": 1.3498074454428756e-05, + "loss": 0.721, + "step": 3508 + }, + { + "epoch": 4.504492939666239, + "grad_norm": 1.934360146522522, + "learning_rate": 1.3501925545571245e-05, + "loss": 0.6733, + "step": 3509 + }, + { + "epoch": 
4.505776636713735, + "grad_norm": 3.1641335487365723, + "learning_rate": 1.3505776636713736e-05, + "loss": 0.6886, + "step": 3510 + }, + { + "epoch": 4.507060333761232, + "grad_norm": 3.495015859603882, + "learning_rate": 1.3509627727856227e-05, + "loss": 0.7183, + "step": 3511 + }, + { + "epoch": 4.508344030808729, + "grad_norm": 3.847470998764038, + "learning_rate": 1.3513478818998717e-05, + "loss": 0.7445, + "step": 3512 + }, + { + "epoch": 4.509627727856226, + "grad_norm": 3.3805253505706787, + "learning_rate": 1.3517329910141206e-05, + "loss": 0.8145, + "step": 3513 + }, + { + "epoch": 4.510911424903723, + "grad_norm": 2.420107364654541, + "learning_rate": 1.3521181001283697e-05, + "loss": 0.785, + "step": 3514 + }, + { + "epoch": 4.512195121951219, + "grad_norm": 10.438972473144531, + "learning_rate": 1.3525032092426188e-05, + "loss": 0.8319, + "step": 3515 + }, + { + "epoch": 4.513478818998716, + "grad_norm": 3.0136115550994873, + "learning_rate": 1.3528883183568677e-05, + "loss": 0.8914, + "step": 3516 + }, + { + "epoch": 4.514762516046213, + "grad_norm": 1.8900691270828247, + "learning_rate": 1.3532734274711169e-05, + "loss": 0.6333, + "step": 3517 + }, + { + "epoch": 4.51604621309371, + "grad_norm": 1.5225495100021362, + "learning_rate": 1.353658536585366e-05, + "loss": 0.6489, + "step": 3518 + }, + { + "epoch": 4.517329910141207, + "grad_norm": 4.300928592681885, + "learning_rate": 1.3540436456996149e-05, + "loss": 0.6567, + "step": 3519 + }, + { + "epoch": 4.518613607188703, + "grad_norm": 1.3909735679626465, + "learning_rate": 1.3544287548138638e-05, + "loss": 0.647, + "step": 3520 + }, + { + "epoch": 4.5198973042362, + "grad_norm": 1.5022659301757812, + "learning_rate": 1.3548138639281131e-05, + "loss": 0.6566, + "step": 3521 + }, + { + "epoch": 4.521181001283697, + "grad_norm": 3.2527246475219727, + "learning_rate": 1.355198973042362e-05, + "loss": 0.6137, + "step": 3522 + }, + { + "epoch": 4.522464698331194, + "grad_norm": 2.542114019393921, + 
"learning_rate": 1.355584082156611e-05, + "loss": 0.6307, + "step": 3523 + }, + { + "epoch": 4.523748395378691, + "grad_norm": 2.1909279823303223, + "learning_rate": 1.3559691912708601e-05, + "loss": 0.6606, + "step": 3524 + }, + { + "epoch": 4.525032092426187, + "grad_norm": 4.521922588348389, + "learning_rate": 1.3563543003851092e-05, + "loss": 0.6265, + "step": 3525 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 1.5220341682434082, + "learning_rate": 1.3567394094993581e-05, + "loss": 0.6094, + "step": 3526 + }, + { + "epoch": 4.527599486521181, + "grad_norm": 1.3035955429077148, + "learning_rate": 1.3571245186136072e-05, + "loss": 0.6434, + "step": 3527 + }, + { + "epoch": 4.528883183568678, + "grad_norm": 1.6495271921157837, + "learning_rate": 1.3575096277278562e-05, + "loss": 0.6531, + "step": 3528 + }, + { + "epoch": 4.530166880616175, + "grad_norm": 2.326941728591919, + "learning_rate": 1.3578947368421053e-05, + "loss": 0.6694, + "step": 3529 + }, + { + "epoch": 4.531450577663671, + "grad_norm": 1.7119232416152954, + "learning_rate": 1.3582798459563544e-05, + "loss": 0.6371, + "step": 3530 + }, + { + "epoch": 4.532734274711168, + "grad_norm": 2.770507574081421, + "learning_rate": 1.3586649550706033e-05, + "loss": 0.614, + "step": 3531 + }, + { + "epoch": 4.534017971758665, + "grad_norm": 2.7036960124969482, + "learning_rate": 1.3590500641848523e-05, + "loss": 0.6299, + "step": 3532 + }, + { + "epoch": 4.535301668806162, + "grad_norm": 2.2154734134674072, + "learning_rate": 1.3594351732991015e-05, + "loss": 0.6501, + "step": 3533 + }, + { + "epoch": 4.536585365853659, + "grad_norm": 9.596296310424805, + "learning_rate": 1.3598202824133505e-05, + "loss": 0.5857, + "step": 3534 + }, + { + "epoch": 4.537869062901155, + "grad_norm": 3.315000057220459, + "learning_rate": 1.3602053915275994e-05, + "loss": 0.6271, + "step": 3535 + }, + { + "epoch": 4.539152759948652, + "grad_norm": 2.4529953002929688, + "learning_rate": 1.3605905006418485e-05, + "loss": 
0.6543, + "step": 3536 + }, + { + "epoch": 4.540436456996149, + "grad_norm": 1.9394006729125977, + "learning_rate": 1.3609756097560976e-05, + "loss": 0.6445, + "step": 3537 + }, + { + "epoch": 4.541720154043646, + "grad_norm": 2.6691079139709473, + "learning_rate": 1.3613607188703466e-05, + "loss": 0.6903, + "step": 3538 + }, + { + "epoch": 4.543003851091142, + "grad_norm": 2.7452244758605957, + "learning_rate": 1.3617458279845957e-05, + "loss": 0.6668, + "step": 3539 + }, + { + "epoch": 4.544287548138639, + "grad_norm": 7.415032863616943, + "learning_rate": 1.3621309370988448e-05, + "loss": 0.6882, + "step": 3540 + }, + { + "epoch": 4.545571245186136, + "grad_norm": 3.9640610218048096, + "learning_rate": 1.3625160462130937e-05, + "loss": 0.6, + "step": 3541 + }, + { + "epoch": 4.546854942233633, + "grad_norm": 2.560948133468628, + "learning_rate": 1.3629011553273428e-05, + "loss": 0.6381, + "step": 3542 + }, + { + "epoch": 4.548138639281129, + "grad_norm": 2.1469950675964355, + "learning_rate": 1.3632862644415918e-05, + "loss": 0.6541, + "step": 3543 + }, + { + "epoch": 4.549422336328626, + "grad_norm": 2.52624249458313, + "learning_rate": 1.3636713735558409e-05, + "loss": 0.6237, + "step": 3544 + }, + { + "epoch": 4.550706033376123, + "grad_norm": 2.576451301574707, + "learning_rate": 1.36405648267009e-05, + "loss": 0.6557, + "step": 3545 + }, + { + "epoch": 4.55198973042362, + "grad_norm": 2.763746976852417, + "learning_rate": 1.364441591784339e-05, + "loss": 0.6654, + "step": 3546 + }, + { + "epoch": 4.553273427471117, + "grad_norm": 1.8971164226531982, + "learning_rate": 1.3648267008985879e-05, + "loss": 0.6459, + "step": 3547 + }, + { + "epoch": 4.554557124518613, + "grad_norm": 2.0645601749420166, + "learning_rate": 1.3652118100128371e-05, + "loss": 0.6838, + "step": 3548 + }, + { + "epoch": 4.55584082156611, + "grad_norm": 1.8221906423568726, + "learning_rate": 1.365596919127086e-05, + "loss": 0.6892, + "step": 3549 + }, + { + "epoch": 4.557124518613607, + 
"grad_norm": 1.941619873046875, + "learning_rate": 1.365982028241335e-05, + "loss": 0.7104, + "step": 3550 + }, + { + "epoch": 4.558408215661104, + "grad_norm": 2.2242672443389893, + "learning_rate": 1.3663671373555841e-05, + "loss": 0.6571, + "step": 3551 + }, + { + "epoch": 4.559691912708601, + "grad_norm": 2.3662049770355225, + "learning_rate": 1.3667522464698332e-05, + "loss": 0.6583, + "step": 3552 + }, + { + "epoch": 4.560975609756097, + "grad_norm": 2.7204666137695312, + "learning_rate": 1.3671373555840822e-05, + "loss": 0.7125, + "step": 3553 + }, + { + "epoch": 4.562259306803594, + "grad_norm": 4.526642799377441, + "learning_rate": 1.3675224646983313e-05, + "loss": 0.7327, + "step": 3554 + }, + { + "epoch": 4.563543003851091, + "grad_norm": 2.7929511070251465, + "learning_rate": 1.3679075738125802e-05, + "loss": 0.6928, + "step": 3555 + }, + { + "epoch": 4.564826700898588, + "grad_norm": 2.866408109664917, + "learning_rate": 1.3682926829268293e-05, + "loss": 0.7479, + "step": 3556 + }, + { + "epoch": 4.566110397946085, + "grad_norm": 3.8196377754211426, + "learning_rate": 1.3686777920410784e-05, + "loss": 0.6805, + "step": 3557 + }, + { + "epoch": 4.567394094993581, + "grad_norm": 2.982245445251465, + "learning_rate": 1.3690629011553274e-05, + "loss": 0.816, + "step": 3558 + }, + { + "epoch": 4.568677792041078, + "grad_norm": 1.7507147789001465, + "learning_rate": 1.3694480102695763e-05, + "loss": 0.702, + "step": 3559 + }, + { + "epoch": 4.569961489088575, + "grad_norm": 2.1880404949188232, + "learning_rate": 1.3698331193838256e-05, + "loss": 0.7069, + "step": 3560 + }, + { + "epoch": 4.571245186136072, + "grad_norm": 5.076693058013916, + "learning_rate": 1.3702182284980745e-05, + "loss": 0.7312, + "step": 3561 + }, + { + "epoch": 4.572528883183569, + "grad_norm": 6.348215579986572, + "learning_rate": 1.3706033376123234e-05, + "loss": 0.7336, + "step": 3562 + }, + { + "epoch": 4.573812580231065, + "grad_norm": 3.3910932540893555, + "learning_rate": 
1.3709884467265727e-05, + "loss": 0.7593, + "step": 3563 + }, + { + "epoch": 4.575096277278562, + "grad_norm": 4.467121601104736, + "learning_rate": 1.3713735558408217e-05, + "loss": 0.8133, + "step": 3564 + }, + { + "epoch": 4.576379974326059, + "grad_norm": 3.4917662143707275, + "learning_rate": 1.3717586649550706e-05, + "loss": 0.909, + "step": 3565 + }, + { + "epoch": 4.577663671373556, + "grad_norm": 2.9260332584381104, + "learning_rate": 1.3721437740693195e-05, + "loss": 0.9744, + "step": 3566 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 1.6225426197052002, + "learning_rate": 1.3725288831835688e-05, + "loss": 0.6286, + "step": 3567 + }, + { + "epoch": 4.580231065468549, + "grad_norm": 2.062398910522461, + "learning_rate": 1.3729139922978178e-05, + "loss": 0.5942, + "step": 3568 + }, + { + "epoch": 4.581514762516046, + "grad_norm": 2.365770101547241, + "learning_rate": 1.3732991014120667e-05, + "loss": 0.6768, + "step": 3569 + }, + { + "epoch": 4.582798459563543, + "grad_norm": 1.6894731521606445, + "learning_rate": 1.3736842105263158e-05, + "loss": 0.6385, + "step": 3570 + }, + { + "epoch": 4.58408215661104, + "grad_norm": 1.464408278465271, + "learning_rate": 1.3740693196405649e-05, + "loss": 0.6706, + "step": 3571 + }, + { + "epoch": 4.585365853658536, + "grad_norm": 4.063952445983887, + "learning_rate": 1.3744544287548138e-05, + "loss": 0.5854, + "step": 3572 + }, + { + "epoch": 4.586649550706033, + "grad_norm": 1.6691702604293823, + "learning_rate": 1.374839537869063e-05, + "loss": 0.6208, + "step": 3573 + }, + { + "epoch": 4.58793324775353, + "grad_norm": 1.5032470226287842, + "learning_rate": 1.3752246469833119e-05, + "loss": 0.6015, + "step": 3574 + }, + { + "epoch": 4.589216944801027, + "grad_norm": 0.8975000977516174, + "learning_rate": 1.375609756097561e-05, + "loss": 0.6212, + "step": 3575 + }, + { + "epoch": 4.590500641848524, + "grad_norm": 1.6820130348205566, + "learning_rate": 1.3759948652118101e-05, + "loss": 0.6328, + "step": 3576 + 
}, + { + "epoch": 4.59178433889602, + "grad_norm": 3.6742188930511475, + "learning_rate": 1.376379974326059e-05, + "loss": 0.709, + "step": 3577 + }, + { + "epoch": 4.593068035943517, + "grad_norm": 1.8029155731201172, + "learning_rate": 1.376765083440308e-05, + "loss": 0.6892, + "step": 3578 + }, + { + "epoch": 4.594351732991014, + "grad_norm": 3.627568244934082, + "learning_rate": 1.3771501925545573e-05, + "loss": 0.6366, + "step": 3579 + }, + { + "epoch": 4.595635430038511, + "grad_norm": 2.1963350772857666, + "learning_rate": 1.3775353016688062e-05, + "loss": 0.6267, + "step": 3580 + }, + { + "epoch": 4.596919127086007, + "grad_norm": 1.3089522123336792, + "learning_rate": 1.3779204107830551e-05, + "loss": 0.6492, + "step": 3581 + }, + { + "epoch": 4.598202824133504, + "grad_norm": 2.7393627166748047, + "learning_rate": 1.3783055198973042e-05, + "loss": 0.6249, + "step": 3582 + }, + { + "epoch": 4.599486521181001, + "grad_norm": 2.8920156955718994, + "learning_rate": 1.3786906290115533e-05, + "loss": 0.6453, + "step": 3583 + }, + { + "epoch": 4.600770218228498, + "grad_norm": 1.6221612691879272, + "learning_rate": 1.3790757381258023e-05, + "loss": 0.672, + "step": 3584 + }, + { + "epoch": 4.602053915275995, + "grad_norm": 1.9675793647766113, + "learning_rate": 1.3794608472400514e-05, + "loss": 0.6396, + "step": 3585 + }, + { + "epoch": 4.603337612323491, + "grad_norm": 2.305333375930786, + "learning_rate": 1.3798459563543005e-05, + "loss": 0.6282, + "step": 3586 + }, + { + "epoch": 4.604621309370988, + "grad_norm": 1.7639187574386597, + "learning_rate": 1.3802310654685494e-05, + "loss": 0.6463, + "step": 3587 + }, + { + "epoch": 4.605905006418485, + "grad_norm": 1.9851964712142944, + "learning_rate": 1.3806161745827985e-05, + "loss": 0.6448, + "step": 3588 + }, + { + "epoch": 4.607188703465982, + "grad_norm": 1.5776150226593018, + "learning_rate": 1.3810012836970475e-05, + "loss": 0.6395, + "step": 3589 + }, + { + "epoch": 4.608472400513479, + "grad_norm": 
1.7448033094406128, + "learning_rate": 1.3813863928112966e-05, + "loss": 0.6465, + "step": 3590 + }, + { + "epoch": 4.609756097560975, + "grad_norm": 2.6948468685150146, + "learning_rate": 1.3817715019255457e-05, + "loss": 0.688, + "step": 3591 + }, + { + "epoch": 4.611039794608472, + "grad_norm": 1.6825883388519287, + "learning_rate": 1.3821566110397946e-05, + "loss": 0.6411, + "step": 3592 + }, + { + "epoch": 4.612323491655969, + "grad_norm": 1.3071162700653076, + "learning_rate": 1.3825417201540436e-05, + "loss": 0.6558, + "step": 3593 + }, + { + "epoch": 4.613607188703466, + "grad_norm": 2.2715179920196533, + "learning_rate": 1.3829268292682928e-05, + "loss": 0.5925, + "step": 3594 + }, + { + "epoch": 4.614890885750963, + "grad_norm": 1.813273310661316, + "learning_rate": 1.3833119383825418e-05, + "loss": 0.6144, + "step": 3595 + }, + { + "epoch": 4.616174582798459, + "grad_norm": 1.9654120206832886, + "learning_rate": 1.3836970474967907e-05, + "loss": 0.683, + "step": 3596 + }, + { + "epoch": 4.617458279845956, + "grad_norm": 6.790087699890137, + "learning_rate": 1.3840821566110398e-05, + "loss": 0.6839, + "step": 3597 + }, + { + "epoch": 4.618741976893453, + "grad_norm": 2.354400396347046, + "learning_rate": 1.384467265725289e-05, + "loss": 0.6721, + "step": 3598 + }, + { + "epoch": 4.62002567394095, + "grad_norm": 2.0693132877349854, + "learning_rate": 1.3848523748395379e-05, + "loss": 0.6653, + "step": 3599 + }, + { + "epoch": 4.621309370988447, + "grad_norm": 1.6776219606399536, + "learning_rate": 1.385237483953787e-05, + "loss": 0.6585, + "step": 3600 + }, + { + "epoch": 4.622593068035943, + "grad_norm": 2.1669516563415527, + "learning_rate": 1.3856225930680359e-05, + "loss": 0.662, + "step": 3601 + }, + { + "epoch": 4.62387676508344, + "grad_norm": 2.2646543979644775, + "learning_rate": 1.386007702182285e-05, + "loss": 0.7127, + "step": 3602 + }, + { + "epoch": 4.625160462130937, + "grad_norm": 2.748486280441284, + "learning_rate": 
1.3863928112965341e-05, + "loss": 0.6828, + "step": 3603 + }, + { + "epoch": 4.626444159178434, + "grad_norm": 6.142194747924805, + "learning_rate": 1.386777920410783e-05, + "loss": 0.6949, + "step": 3604 + }, + { + "epoch": 4.62772785622593, + "grad_norm": 4.795902252197266, + "learning_rate": 1.387163029525032e-05, + "loss": 0.6745, + "step": 3605 + }, + { + "epoch": 4.629011553273427, + "grad_norm": 5.2103190422058105, + "learning_rate": 1.3875481386392813e-05, + "loss": 0.6753, + "step": 3606 + }, + { + "epoch": 4.630295250320924, + "grad_norm": 2.511737585067749, + "learning_rate": 1.3879332477535302e-05, + "loss": 0.7597, + "step": 3607 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 2.38720703125, + "learning_rate": 1.3883183568677792e-05, + "loss": 0.7353, + "step": 3608 + }, + { + "epoch": 4.632862644415918, + "grad_norm": 3.1041390895843506, + "learning_rate": 1.3887034659820284e-05, + "loss": 0.7454, + "step": 3609 + }, + { + "epoch": 4.634146341463414, + "grad_norm": 3.607099771499634, + "learning_rate": 1.3890885750962774e-05, + "loss": 0.7103, + "step": 3610 + }, + { + "epoch": 4.635430038510911, + "grad_norm": 4.154376029968262, + "learning_rate": 1.3894736842105263e-05, + "loss": 0.7566, + "step": 3611 + }, + { + "epoch": 4.636713735558408, + "grad_norm": 2.2878293991088867, + "learning_rate": 1.3898587933247754e-05, + "loss": 0.7404, + "step": 3612 + }, + { + "epoch": 4.637997432605905, + "grad_norm": 5.042761325836182, + "learning_rate": 1.3902439024390245e-05, + "loss": 0.8079, + "step": 3613 + }, + { + "epoch": 4.639281129653401, + "grad_norm": 2.187863826751709, + "learning_rate": 1.3906290115532735e-05, + "loss": 0.7738, + "step": 3614 + }, + { + "epoch": 4.640564826700898, + "grad_norm": 3.746642827987671, + "learning_rate": 1.3910141206675226e-05, + "loss": 0.8307, + "step": 3615 + }, + { + "epoch": 4.641848523748395, + "grad_norm": 2.355031967163086, + "learning_rate": 1.3913992297817715e-05, + "loss": 0.9892, + "step": 3616 + }, + { 
+ "epoch": 4.643132220795892, + "grad_norm": 1.5329734086990356, + "learning_rate": 1.3917843388960206e-05, + "loss": 0.6479, + "step": 3617 + }, + { + "epoch": 4.644415917843389, + "grad_norm": 2.2159576416015625, + "learning_rate": 1.3921694480102695e-05, + "loss": 0.6575, + "step": 3618 + }, + { + "epoch": 4.645699614890885, + "grad_norm": 1.6434321403503418, + "learning_rate": 1.3925545571245187e-05, + "loss": 0.6176, + "step": 3619 + }, + { + "epoch": 4.646983311938382, + "grad_norm": 4.026822566986084, + "learning_rate": 1.3929396662387676e-05, + "loss": 0.6426, + "step": 3620 + }, + { + "epoch": 4.648267008985879, + "grad_norm": 2.159381628036499, + "learning_rate": 1.3933247753530167e-05, + "loss": 0.6271, + "step": 3621 + }, + { + "epoch": 4.649550706033376, + "grad_norm": 2.138383388519287, + "learning_rate": 1.3937098844672658e-05, + "loss": 0.6442, + "step": 3622 + }, + { + "epoch": 4.650834403080873, + "grad_norm": 1.6912673711776733, + "learning_rate": 1.3940949935815147e-05, + "loss": 0.6745, + "step": 3623 + }, + { + "epoch": 4.652118100128369, + "grad_norm": 3.6643526554107666, + "learning_rate": 1.3944801026957637e-05, + "loss": 0.6494, + "step": 3624 + }, + { + "epoch": 4.653401797175866, + "grad_norm": 1.8867673873901367, + "learning_rate": 1.394865211810013e-05, + "loss": 0.6189, + "step": 3625 + }, + { + "epoch": 4.654685494223363, + "grad_norm": 4.409117221832275, + "learning_rate": 1.3952503209242619e-05, + "loss": 0.6505, + "step": 3626 + }, + { + "epoch": 4.65596919127086, + "grad_norm": 1.2359845638275146, + "learning_rate": 1.3956354300385108e-05, + "loss": 0.6299, + "step": 3627 + }, + { + "epoch": 4.657252888318357, + "grad_norm": 2.320859670639038, + "learning_rate": 1.3960205391527601e-05, + "loss": 0.6841, + "step": 3628 + }, + { + "epoch": 4.658536585365853, + "grad_norm": 2.5293374061584473, + "learning_rate": 1.396405648267009e-05, + "loss": 0.626, + "step": 3629 + }, + { + "epoch": 4.65982028241335, + "grad_norm": 
3.5735092163085938, + "learning_rate": 1.396790757381258e-05, + "loss": 0.6486, + "step": 3630 + }, + { + "epoch": 4.661103979460847, + "grad_norm": 1.8440452814102173, + "learning_rate": 1.3971758664955071e-05, + "loss": 0.614, + "step": 3631 + }, + { + "epoch": 4.662387676508344, + "grad_norm": 5.093088626861572, + "learning_rate": 1.3975609756097562e-05, + "loss": 0.6692, + "step": 3632 + }, + { + "epoch": 4.663671373555841, + "grad_norm": 1.9079310894012451, + "learning_rate": 1.3979460847240051e-05, + "loss": 0.6591, + "step": 3633 + }, + { + "epoch": 4.664955070603337, + "grad_norm": 2.2160534858703613, + "learning_rate": 1.3983311938382542e-05, + "loss": 0.6121, + "step": 3634 + }, + { + "epoch": 4.666238767650834, + "grad_norm": 4.196793079376221, + "learning_rate": 1.3987163029525032e-05, + "loss": 0.6431, + "step": 3635 + }, + { + "epoch": 4.667522464698331, + "grad_norm": 2.499753475189209, + "learning_rate": 1.3991014120667523e-05, + "loss": 0.7055, + "step": 3636 + }, + { + "epoch": 4.668806161745828, + "grad_norm": 3.769678831100464, + "learning_rate": 1.3994865211810014e-05, + "loss": 0.6836, + "step": 3637 + }, + { + "epoch": 4.670089858793324, + "grad_norm": 3.676784038543701, + "learning_rate": 1.3998716302952503e-05, + "loss": 0.612, + "step": 3638 + }, + { + "epoch": 4.671373555840821, + "grad_norm": 3.0453617572784424, + "learning_rate": 1.4002567394094993e-05, + "loss": 0.6305, + "step": 3639 + }, + { + "epoch": 4.672657252888318, + "grad_norm": 2.0329322814941406, + "learning_rate": 1.4006418485237485e-05, + "loss": 0.6894, + "step": 3640 + }, + { + "epoch": 4.673940949935815, + "grad_norm": 6.603466033935547, + "learning_rate": 1.4010269576379975e-05, + "loss": 0.6568, + "step": 3641 + }, + { + "epoch": 4.675224646983312, + "grad_norm": 6.371218681335449, + "learning_rate": 1.4014120667522464e-05, + "loss": 0.6508, + "step": 3642 + }, + { + "epoch": 4.676508344030808, + "grad_norm": 2.4572842121124268, + "learning_rate": 
1.4017971758664955e-05, + "loss": 0.6597, + "step": 3643 + }, + { + "epoch": 4.677792041078305, + "grad_norm": 2.393498659133911, + "learning_rate": 1.4021822849807446e-05, + "loss": 0.641, + "step": 3644 + }, + { + "epoch": 4.679075738125802, + "grad_norm": 2.5035204887390137, + "learning_rate": 1.4025673940949936e-05, + "loss": 0.7071, + "step": 3645 + }, + { + "epoch": 4.680359435173299, + "grad_norm": 1.6539585590362549, + "learning_rate": 1.4029525032092427e-05, + "loss": 0.6422, + "step": 3646 + }, + { + "epoch": 4.681643132220795, + "grad_norm": 2.7131128311157227, + "learning_rate": 1.4033376123234916e-05, + "loss": 0.6676, + "step": 3647 + }, + { + "epoch": 4.682926829268292, + "grad_norm": 4.478026390075684, + "learning_rate": 1.4037227214377407e-05, + "loss": 0.6368, + "step": 3648 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 1.9707468748092651, + "learning_rate": 1.4041078305519898e-05, + "loss": 0.6898, + "step": 3649 + }, + { + "epoch": 4.685494223363286, + "grad_norm": 1.8982208967208862, + "learning_rate": 1.4044929396662388e-05, + "loss": 0.6745, + "step": 3650 + }, + { + "epoch": 4.686777920410783, + "grad_norm": 3.8534369468688965, + "learning_rate": 1.4048780487804879e-05, + "loss": 0.7197, + "step": 3651 + }, + { + "epoch": 4.688061617458279, + "grad_norm": 2.7069499492645264, + "learning_rate": 1.405263157894737e-05, + "loss": 0.721, + "step": 3652 + }, + { + "epoch": 4.689345314505776, + "grad_norm": 5.993485450744629, + "learning_rate": 1.405648267008986e-05, + "loss": 0.7347, + "step": 3653 + }, + { + "epoch": 4.690629011553273, + "grad_norm": 3.418572425842285, + "learning_rate": 1.4060333761232349e-05, + "loss": 0.6704, + "step": 3654 + }, + { + "epoch": 4.69191270860077, + "grad_norm": 2.3834023475646973, + "learning_rate": 1.4064184852374841e-05, + "loss": 0.7111, + "step": 3655 + }, + { + "epoch": 4.693196405648267, + "grad_norm": 2.908858299255371, + "learning_rate": 1.406803594351733e-05, + "loss": 0.7101, + "step": 3656 + 
}, + { + "epoch": 4.694480102695763, + "grad_norm": 1.834916353225708, + "learning_rate": 1.407188703465982e-05, + "loss": 0.7487, + "step": 3657 + }, + { + "epoch": 4.69576379974326, + "grad_norm": 2.754737377166748, + "learning_rate": 1.4075738125802311e-05, + "loss": 0.7323, + "step": 3658 + }, + { + "epoch": 4.697047496790757, + "grad_norm": 40.3231315612793, + "learning_rate": 1.4079589216944802e-05, + "loss": 0.7154, + "step": 3659 + }, + { + "epoch": 4.698331193838254, + "grad_norm": 2.1038341522216797, + "learning_rate": 1.4083440308087292e-05, + "loss": 0.6859, + "step": 3660 + }, + { + "epoch": 4.699614890885751, + "grad_norm": 2.705817222595215, + "learning_rate": 1.4087291399229783e-05, + "loss": 0.775, + "step": 3661 + }, + { + "epoch": 4.700898587933247, + "grad_norm": 8.796205520629883, + "learning_rate": 1.4091142490372272e-05, + "loss": 0.7674, + "step": 3662 + }, + { + "epoch": 4.702182284980744, + "grad_norm": 2.024674892425537, + "learning_rate": 1.4094993581514763e-05, + "loss": 0.7865, + "step": 3663 + }, + { + "epoch": 4.703465982028241, + "grad_norm": 4.796106338500977, + "learning_rate": 1.4098844672657254e-05, + "loss": 0.8027, + "step": 3664 + }, + { + "epoch": 4.704749679075738, + "grad_norm": 4.6941094398498535, + "learning_rate": 1.4102695763799744e-05, + "loss": 0.8324, + "step": 3665 + }, + { + "epoch": 4.706033376123235, + "grad_norm": 2.6940064430236816, + "learning_rate": 1.4106546854942233e-05, + "loss": 0.9568, + "step": 3666 + }, + { + "epoch": 4.7073170731707314, + "grad_norm": 3.2810375690460205, + "learning_rate": 1.4110397946084724e-05, + "loss": 0.6311, + "step": 3667 + }, + { + "epoch": 4.708600770218228, + "grad_norm": 1.417772889137268, + "learning_rate": 1.4114249037227215e-05, + "loss": 0.6027, + "step": 3668 + }, + { + "epoch": 4.709884467265725, + "grad_norm": 1.8193384408950806, + "learning_rate": 1.4118100128369704e-05, + "loss": 0.6134, + "step": 3669 + }, + { + "epoch": 4.711168164313222, + "grad_norm": 
4.513727188110352, + "learning_rate": 1.4121951219512194e-05, + "loss": 0.5872, + "step": 3670 + }, + { + "epoch": 4.712451861360719, + "grad_norm": 2.1248583793640137, + "learning_rate": 1.4125802310654687e-05, + "loss": 0.5969, + "step": 3671 + }, + { + "epoch": 4.7137355584082155, + "grad_norm": 5.54728889465332, + "learning_rate": 1.4129653401797176e-05, + "loss": 0.6489, + "step": 3672 + }, + { + "epoch": 4.715019255455712, + "grad_norm": 1.8955414295196533, + "learning_rate": 1.4133504492939665e-05, + "loss": 0.6124, + "step": 3673 + }, + { + "epoch": 4.716302952503209, + "grad_norm": 3.144876718521118, + "learning_rate": 1.4137355584082158e-05, + "loss": 0.6247, + "step": 3674 + }, + { + "epoch": 4.717586649550706, + "grad_norm": 2.5689847469329834, + "learning_rate": 1.4141206675224648e-05, + "loss": 0.5944, + "step": 3675 + }, + { + "epoch": 4.7188703465982025, + "grad_norm": 7.240097522735596, + "learning_rate": 1.4145057766367137e-05, + "loss": 0.638, + "step": 3676 + }, + { + "epoch": 4.7201540436456995, + "grad_norm": 2.126587390899658, + "learning_rate": 1.4148908857509628e-05, + "loss": 0.5755, + "step": 3677 + }, + { + "epoch": 4.721437740693196, + "grad_norm": 2.8318302631378174, + "learning_rate": 1.4152759948652119e-05, + "loss": 0.6518, + "step": 3678 + }, + { + "epoch": 4.722721437740693, + "grad_norm": 3.843390464782715, + "learning_rate": 1.4156611039794608e-05, + "loss": 0.6211, + "step": 3679 + }, + { + "epoch": 4.7240051347881895, + "grad_norm": 2.8749146461486816, + "learning_rate": 1.41604621309371e-05, + "loss": 0.6295, + "step": 3680 + }, + { + "epoch": 4.7252888318356865, + "grad_norm": 1.7528612613677979, + "learning_rate": 1.4164313222079589e-05, + "loss": 0.5882, + "step": 3681 + }, + { + "epoch": 4.7265725288831835, + "grad_norm": 5.506635665893555, + "learning_rate": 1.416816431322208e-05, + "loss": 0.6344, + "step": 3682 + }, + { + "epoch": 4.7278562259306804, + "grad_norm": 1.9961678981781006, + "learning_rate": 
1.4172015404364571e-05, + "loss": 0.648, + "step": 3683 + }, + { + "epoch": 4.729139922978177, + "grad_norm": 2.6863467693328857, + "learning_rate": 1.417586649550706e-05, + "loss": 0.576, + "step": 3684 + }, + { + "epoch": 4.7304236200256735, + "grad_norm": 1.7931251525878906, + "learning_rate": 1.417971758664955e-05, + "loss": 0.6368, + "step": 3685 + }, + { + "epoch": 4.7317073170731705, + "grad_norm": 4.935589790344238, + "learning_rate": 1.4183568677792043e-05, + "loss": 0.6458, + "step": 3686 + }, + { + "epoch": 4.7329910141206675, + "grad_norm": 1.771243929862976, + "learning_rate": 1.4187419768934532e-05, + "loss": 0.6541, + "step": 3687 + }, + { + "epoch": 4.7342747111681645, + "grad_norm": 1.8071458339691162, + "learning_rate": 1.4191270860077021e-05, + "loss": 0.572, + "step": 3688 + }, + { + "epoch": 4.735558408215661, + "grad_norm": 1.7541165351867676, + "learning_rate": 1.4195121951219512e-05, + "loss": 0.6036, + "step": 3689 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 3.265692949295044, + "learning_rate": 1.4198973042362003e-05, + "loss": 0.6409, + "step": 3690 + }, + { + "epoch": 4.7381258023106545, + "grad_norm": 1.7742706537246704, + "learning_rate": 1.4202824133504493e-05, + "loss": 0.609, + "step": 3691 + }, + { + "epoch": 4.7394094993581515, + "grad_norm": 2.7301688194274902, + "learning_rate": 1.4206675224646984e-05, + "loss": 0.6693, + "step": 3692 + }, + { + "epoch": 4.7406931964056485, + "grad_norm": 3.0110602378845215, + "learning_rate": 1.4210526315789473e-05, + "loss": 0.672, + "step": 3693 + }, + { + "epoch": 4.741976893453145, + "grad_norm": 2.388313055038452, + "learning_rate": 1.4214377406931964e-05, + "loss": 0.6, + "step": 3694 + }, + { + "epoch": 4.7432605905006415, + "grad_norm": 4.893860816955566, + "learning_rate": 1.4218228498074455e-05, + "loss": 0.6731, + "step": 3695 + }, + { + "epoch": 4.7445442875481385, + "grad_norm": 4.323074817657471, + "learning_rate": 1.4222079589216945e-05, + "loss": 0.6171, + "step": 
3696 + }, + { + "epoch": 4.7458279845956355, + "grad_norm": 4.559858798980713, + "learning_rate": 1.4225930680359436e-05, + "loss": 0.6458, + "step": 3697 + }, + { + "epoch": 4.7471116816431325, + "grad_norm": 4.262350559234619, + "learning_rate": 1.4229781771501927e-05, + "loss": 0.6855, + "step": 3698 + }, + { + "epoch": 4.748395378690629, + "grad_norm": 2.0495729446411133, + "learning_rate": 1.4233632862644416e-05, + "loss": 0.67, + "step": 3699 + }, + { + "epoch": 4.7496790757381255, + "grad_norm": 2.435680866241455, + "learning_rate": 1.4237483953786906e-05, + "loss": 0.6907, + "step": 3700 + }, + { + "epoch": 4.7509627727856225, + "grad_norm": 2.6012659072875977, + "learning_rate": 1.4241335044929398e-05, + "loss": 0.696, + "step": 3701 + }, + { + "epoch": 4.7522464698331195, + "grad_norm": 3.1274466514587402, + "learning_rate": 1.4245186136071888e-05, + "loss": 0.6277, + "step": 3702 + }, + { + "epoch": 4.7535301668806165, + "grad_norm": 2.435572385787964, + "learning_rate": 1.4249037227214377e-05, + "loss": 0.7023, + "step": 3703 + }, + { + "epoch": 4.7548138639281134, + "grad_norm": 1.865195393562317, + "learning_rate": 1.4252888318356868e-05, + "loss": 0.6674, + "step": 3704 + }, + { + "epoch": 4.7560975609756095, + "grad_norm": 2.421724796295166, + "learning_rate": 1.425673940949936e-05, + "loss": 0.7514, + "step": 3705 + }, + { + "epoch": 4.7573812580231065, + "grad_norm": 20.492692947387695, + "learning_rate": 1.4260590500641849e-05, + "loss": 0.6781, + "step": 3706 + }, + { + "epoch": 4.7586649550706035, + "grad_norm": 3.437230348587036, + "learning_rate": 1.426444159178434e-05, + "loss": 0.714, + "step": 3707 + }, + { + "epoch": 4.7599486521181005, + "grad_norm": 6.396162986755371, + "learning_rate": 1.4268292682926829e-05, + "loss": 0.7248, + "step": 3708 + }, + { + "epoch": 4.761232349165597, + "grad_norm": 3.8680026531219482, + "learning_rate": 1.427214377406932e-05, + "loss": 0.6963, + "step": 3709 + }, + { + "epoch": 4.7625160462130935, + 
"grad_norm": 2.653327465057373, + "learning_rate": 1.4275994865211811e-05, + "loss": 0.7297, + "step": 3710 + }, + { + "epoch": 4.7637997432605905, + "grad_norm": 2.8584537506103516, + "learning_rate": 1.42798459563543e-05, + "loss": 0.6782, + "step": 3711 + }, + { + "epoch": 4.7650834403080875, + "grad_norm": 3.550938844680786, + "learning_rate": 1.428369704749679e-05, + "loss": 0.7097, + "step": 3712 + }, + { + "epoch": 4.766367137355584, + "grad_norm": 2.184204339981079, + "learning_rate": 1.4287548138639283e-05, + "loss": 0.746, + "step": 3713 + }, + { + "epoch": 4.767650834403081, + "grad_norm": 4.425995826721191, + "learning_rate": 1.4291399229781772e-05, + "loss": 0.836, + "step": 3714 + }, + { + "epoch": 4.7689345314505776, + "grad_norm": 3.7686753273010254, + "learning_rate": 1.4295250320924262e-05, + "loss": 0.842, + "step": 3715 + }, + { + "epoch": 4.7702182284980745, + "grad_norm": 26.07315444946289, + "learning_rate": 1.4299101412066753e-05, + "loss": 0.9527, + "step": 3716 + }, + { + "epoch": 4.7715019255455715, + "grad_norm": 1.7350465059280396, + "learning_rate": 1.4302952503209244e-05, + "loss": 0.6098, + "step": 3717 + }, + { + "epoch": 4.772785622593068, + "grad_norm": 2.803227186203003, + "learning_rate": 1.4306803594351733e-05, + "loss": 0.5895, + "step": 3718 + }, + { + "epoch": 4.774069319640565, + "grad_norm": 2.40563702583313, + "learning_rate": 1.4310654685494224e-05, + "loss": 0.648, + "step": 3719 + }, + { + "epoch": 4.775353016688062, + "grad_norm": 1.5553208589553833, + "learning_rate": 1.4314505776636715e-05, + "loss": 0.6759, + "step": 3720 + }, + { + "epoch": 4.7766367137355585, + "grad_norm": 1.8043572902679443, + "learning_rate": 1.4318356867779205e-05, + "loss": 0.6358, + "step": 3721 + }, + { + "epoch": 4.7779204107830555, + "grad_norm": 4.3506011962890625, + "learning_rate": 1.4322207958921694e-05, + "loss": 0.6347, + "step": 3722 + }, + { + "epoch": 4.779204107830552, + "grad_norm": 1.880060076713562, + "learning_rate": 
1.4326059050064185e-05, + "loss": 0.6684, + "step": 3723 + }, + { + "epoch": 4.780487804878049, + "grad_norm": 3.7996699810028076, + "learning_rate": 1.4329910141206676e-05, + "loss": 0.5976, + "step": 3724 + }, + { + "epoch": 4.781771501925546, + "grad_norm": 3.804753303527832, + "learning_rate": 1.4333761232349165e-05, + "loss": 0.6254, + "step": 3725 + }, + { + "epoch": 4.7830551989730425, + "grad_norm": 4.534526348114014, + "learning_rate": 1.4337612323491657e-05, + "loss": 0.6931, + "step": 3726 + }, + { + "epoch": 4.7843388960205395, + "grad_norm": 2.576131820678711, + "learning_rate": 1.4341463414634146e-05, + "loss": 0.5656, + "step": 3727 + }, + { + "epoch": 4.785622593068036, + "grad_norm": 2.6990623474121094, + "learning_rate": 1.4345314505776637e-05, + "loss": 0.6164, + "step": 3728 + }, + { + "epoch": 4.786906290115533, + "grad_norm": 2.2406113147735596, + "learning_rate": 1.4349165596919128e-05, + "loss": 0.5988, + "step": 3729 + }, + { + "epoch": 4.78818998716303, + "grad_norm": 1.9423021078109741, + "learning_rate": 1.4353016688061617e-05, + "loss": 0.6154, + "step": 3730 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 12.791632652282715, + "learning_rate": 1.4356867779204107e-05, + "loss": 0.6912, + "step": 3731 + }, + { + "epoch": 4.7907573812580235, + "grad_norm": 3.223900318145752, + "learning_rate": 1.43607188703466e-05, + "loss": 0.693, + "step": 3732 + }, + { + "epoch": 4.79204107830552, + "grad_norm": 4.03583288192749, + "learning_rate": 1.4364569961489089e-05, + "loss": 0.6304, + "step": 3733 + }, + { + "epoch": 4.793324775353017, + "grad_norm": 6.974352836608887, + "learning_rate": 1.4368421052631578e-05, + "loss": 0.6563, + "step": 3734 + }, + { + "epoch": 4.794608472400514, + "grad_norm": 3.533383846282959, + "learning_rate": 1.437227214377407e-05, + "loss": 0.6385, + "step": 3735 + }, + { + "epoch": 4.7958921694480106, + "grad_norm": 3.36889386177063, + "learning_rate": 1.437612323491656e-05, + "loss": 0.6779, + "step": 3736 + }, 
+ { + "epoch": 4.7971758664955075, + "grad_norm": 2.387166738510132, + "learning_rate": 1.437997432605905e-05, + "loss": 0.6671, + "step": 3737 + }, + { + "epoch": 4.798459563543004, + "grad_norm": 3.2753725051879883, + "learning_rate": 1.4383825417201541e-05, + "loss": 0.6528, + "step": 3738 + }, + { + "epoch": 4.799743260590501, + "grad_norm": 1.6215611696243286, + "learning_rate": 1.438767650834403e-05, + "loss": 0.6951, + "step": 3739 + }, + { + "epoch": 4.801026957637998, + "grad_norm": 2.372898578643799, + "learning_rate": 1.4391527599486521e-05, + "loss": 0.6215, + "step": 3740 + }, + { + "epoch": 4.802310654685495, + "grad_norm": 2.8197271823883057, + "learning_rate": 1.4395378690629012e-05, + "loss": 0.6414, + "step": 3741 + }, + { + "epoch": 4.803594351732991, + "grad_norm": 3.9805803298950195, + "learning_rate": 1.4399229781771502e-05, + "loss": 0.6248, + "step": 3742 + }, + { + "epoch": 4.804878048780488, + "grad_norm": 4.330574989318848, + "learning_rate": 1.4403080872913993e-05, + "loss": 0.6513, + "step": 3743 + }, + { + "epoch": 4.806161745827985, + "grad_norm": 1.6372910737991333, + "learning_rate": 1.4406931964056484e-05, + "loss": 0.6145, + "step": 3744 + }, + { + "epoch": 4.807445442875482, + "grad_norm": 2.6062026023864746, + "learning_rate": 1.4410783055198973e-05, + "loss": 0.6836, + "step": 3745 + }, + { + "epoch": 4.808729139922978, + "grad_norm": 2.6414313316345215, + "learning_rate": 1.4414634146341463e-05, + "loss": 0.6711, + "step": 3746 + }, + { + "epoch": 4.810012836970475, + "grad_norm": 1.9975999593734741, + "learning_rate": 1.4418485237483955e-05, + "loss": 0.6403, + "step": 3747 + }, + { + "epoch": 4.811296534017972, + "grad_norm": 1.935106635093689, + "learning_rate": 1.4422336328626445e-05, + "loss": 0.6763, + "step": 3748 + }, + { + "epoch": 4.812580231065469, + "grad_norm": 3.213732957839966, + "learning_rate": 1.4426187419768934e-05, + "loss": 0.6724, + "step": 3749 + }, + { + "epoch": 4.813863928112966, + "grad_norm": 
13.168999671936035, + "learning_rate": 1.4430038510911425e-05, + "loss": 0.7016, + "step": 3750 + }, + { + "epoch": 4.815147625160462, + "grad_norm": 2.925152540206909, + "learning_rate": 1.4433889602053916e-05, + "loss": 0.638, + "step": 3751 + }, + { + "epoch": 4.816431322207959, + "grad_norm": 4.394402503967285, + "learning_rate": 1.4437740693196406e-05, + "loss": 0.6725, + "step": 3752 + }, + { + "epoch": 4.817715019255456, + "grad_norm": 2.486682653427124, + "learning_rate": 1.4441591784338897e-05, + "loss": 0.7115, + "step": 3753 + }, + { + "epoch": 4.818998716302953, + "grad_norm": 3.0419604778289795, + "learning_rate": 1.4445442875481386e-05, + "loss": 0.673, + "step": 3754 + }, + { + "epoch": 4.82028241335045, + "grad_norm": 5.838217735290527, + "learning_rate": 1.4449293966623877e-05, + "loss": 0.7009, + "step": 3755 + }, + { + "epoch": 4.821566110397946, + "grad_norm": 6.5063157081604, + "learning_rate": 1.4453145057766368e-05, + "loss": 0.7468, + "step": 3756 + }, + { + "epoch": 4.822849807445443, + "grad_norm": 1.8287824392318726, + "learning_rate": 1.4456996148908858e-05, + "loss": 0.6911, + "step": 3757 + }, + { + "epoch": 4.82413350449294, + "grad_norm": 1.9128645658493042, + "learning_rate": 1.4460847240051347e-05, + "loss": 0.7212, + "step": 3758 + }, + { + "epoch": 4.825417201540437, + "grad_norm": 3.5483992099761963, + "learning_rate": 1.446469833119384e-05, + "loss": 0.7226, + "step": 3759 + }, + { + "epoch": 4.826700898587934, + "grad_norm": 7.291738510131836, + "learning_rate": 1.446854942233633e-05, + "loss": 0.7478, + "step": 3760 + }, + { + "epoch": 4.82798459563543, + "grad_norm": 7.280688285827637, + "learning_rate": 1.4472400513478819e-05, + "loss": 0.6745, + "step": 3761 + }, + { + "epoch": 4.829268292682927, + "grad_norm": 5.1352386474609375, + "learning_rate": 1.447625160462131e-05, + "loss": 0.7733, + "step": 3762 + }, + { + "epoch": 4.830551989730424, + "grad_norm": 12.322234153747559, + "learning_rate": 1.44801026957638e-05, + 
"loss": 0.7155, + "step": 3763 + }, + { + "epoch": 4.831835686777921, + "grad_norm": 4.98524808883667, + "learning_rate": 1.448395378690629e-05, + "loss": 0.7395, + "step": 3764 + }, + { + "epoch": 4.833119383825418, + "grad_norm": 5.393190860748291, + "learning_rate": 1.4487804878048781e-05, + "loss": 0.818, + "step": 3765 + }, + { + "epoch": 4.834403080872914, + "grad_norm": 3.13525390625, + "learning_rate": 1.4491655969191272e-05, + "loss": 0.9898, + "step": 3766 + }, + { + "epoch": 4.835686777920411, + "grad_norm": 2.8742101192474365, + "learning_rate": 1.4495507060333762e-05, + "loss": 0.6452, + "step": 3767 + }, + { + "epoch": 4.836970474967908, + "grad_norm": 1.5355174541473389, + "learning_rate": 1.4499358151476253e-05, + "loss": 0.6193, + "step": 3768 + }, + { + "epoch": 4.838254172015405, + "grad_norm": 4.961114406585693, + "learning_rate": 1.4503209242618742e-05, + "loss": 0.6086, + "step": 3769 + }, + { + "epoch": 4.839537869062902, + "grad_norm": 4.768793106079102, + "learning_rate": 1.4507060333761233e-05, + "loss": 0.6309, + "step": 3770 + }, + { + "epoch": 4.840821566110398, + "grad_norm": 2.524158239364624, + "learning_rate": 1.4510911424903723e-05, + "loss": 0.6161, + "step": 3771 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 2.1969079971313477, + "learning_rate": 1.4514762516046214e-05, + "loss": 0.6156, + "step": 3772 + }, + { + "epoch": 4.843388960205392, + "grad_norm": 1.7901767492294312, + "learning_rate": 1.4518613607188703e-05, + "loss": 0.6597, + "step": 3773 + }, + { + "epoch": 4.844672657252889, + "grad_norm": 1.7498522996902466, + "learning_rate": 1.4522464698331194e-05, + "loss": 0.6673, + "step": 3774 + }, + { + "epoch": 4.845956354300385, + "grad_norm": 3.075732707977295, + "learning_rate": 1.4526315789473685e-05, + "loss": 0.6375, + "step": 3775 + }, + { + "epoch": 4.847240051347882, + "grad_norm": 8.10799789428711, + "learning_rate": 1.4530166880616174e-05, + "loss": 0.676, + "step": 3776 + }, + { + "epoch": 
4.848523748395379, + "grad_norm": 2.0844051837921143, + "learning_rate": 1.4534017971758664e-05, + "loss": 0.6128, + "step": 3777 + }, + { + "epoch": 4.849807445442876, + "grad_norm": 2.462209463119507, + "learning_rate": 1.4537869062901157e-05, + "loss": 0.62, + "step": 3778 + }, + { + "epoch": 4.851091142490373, + "grad_norm": 3.332681179046631, + "learning_rate": 1.4541720154043646e-05, + "loss": 0.6564, + "step": 3779 + }, + { + "epoch": 4.852374839537869, + "grad_norm": 1.4025728702545166, + "learning_rate": 1.4545571245186135e-05, + "loss": 0.62, + "step": 3780 + }, + { + "epoch": 4.853658536585366, + "grad_norm": 2.2888243198394775, + "learning_rate": 1.4549422336328626e-05, + "loss": 0.6344, + "step": 3781 + }, + { + "epoch": 4.854942233632863, + "grad_norm": 2.239173173904419, + "learning_rate": 1.4553273427471118e-05, + "loss": 0.6371, + "step": 3782 + }, + { + "epoch": 4.85622593068036, + "grad_norm": 2.1272120475769043, + "learning_rate": 1.4557124518613607e-05, + "loss": 0.6345, + "step": 3783 + }, + { + "epoch": 4.857509627727856, + "grad_norm": 2.565242290496826, + "learning_rate": 1.4560975609756098e-05, + "loss": 0.6684, + "step": 3784 + }, + { + "epoch": 4.858793324775353, + "grad_norm": 6.582728862762451, + "learning_rate": 1.4564826700898587e-05, + "loss": 0.6404, + "step": 3785 + }, + { + "epoch": 4.86007702182285, + "grad_norm": 3.226679801940918, + "learning_rate": 1.4568677792041078e-05, + "loss": 0.638, + "step": 3786 + }, + { + "epoch": 4.861360718870347, + "grad_norm": 2.220064640045166, + "learning_rate": 1.457252888318357e-05, + "loss": 0.6091, + "step": 3787 + }, + { + "epoch": 4.862644415917844, + "grad_norm": 1.8520734310150146, + "learning_rate": 1.4576379974326059e-05, + "loss": 0.6557, + "step": 3788 + }, + { + "epoch": 4.86392811296534, + "grad_norm": 1.6550629138946533, + "learning_rate": 1.458023106546855e-05, + "loss": 0.6536, + "step": 3789 + }, + { + "epoch": 4.865211810012837, + "grad_norm": 1.4712133407592773, + 
"learning_rate": 1.4584082156611041e-05, + "loss": 0.6382, + "step": 3790 + }, + { + "epoch": 4.866495507060334, + "grad_norm": 1.6675407886505127, + "learning_rate": 1.458793324775353e-05, + "loss": 0.6469, + "step": 3791 + }, + { + "epoch": 4.867779204107831, + "grad_norm": 2.868675470352173, + "learning_rate": 1.459178433889602e-05, + "loss": 0.6533, + "step": 3792 + }, + { + "epoch": 4.869062901155328, + "grad_norm": 3.611725091934204, + "learning_rate": 1.4595635430038513e-05, + "loss": 0.631, + "step": 3793 + }, + { + "epoch": 4.870346598202824, + "grad_norm": 4.101836681365967, + "learning_rate": 1.4599486521181002e-05, + "loss": 0.6453, + "step": 3794 + }, + { + "epoch": 4.871630295250321, + "grad_norm": 2.631080150604248, + "learning_rate": 1.4603337612323491e-05, + "loss": 0.6754, + "step": 3795 + }, + { + "epoch": 4.872913992297818, + "grad_norm": 1.386109709739685, + "learning_rate": 1.4607188703465982e-05, + "loss": 0.6524, + "step": 3796 + }, + { + "epoch": 4.874197689345315, + "grad_norm": 1.760048508644104, + "learning_rate": 1.4611039794608473e-05, + "loss": 0.6456, + "step": 3797 + }, + { + "epoch": 4.875481386392812, + "grad_norm": 2.6397151947021484, + "learning_rate": 1.4614890885750963e-05, + "loss": 0.6887, + "step": 3798 + }, + { + "epoch": 4.876765083440308, + "grad_norm": 1.5372743606567383, + "learning_rate": 1.4618741976893454e-05, + "loss": 0.6711, + "step": 3799 + }, + { + "epoch": 4.878048780487805, + "grad_norm": 4.193472862243652, + "learning_rate": 1.4622593068035943e-05, + "loss": 0.6548, + "step": 3800 + }, + { + "epoch": 4.879332477535302, + "grad_norm": 2.2579686641693115, + "learning_rate": 1.4626444159178434e-05, + "loss": 0.6731, + "step": 3801 + }, + { + "epoch": 4.880616174582799, + "grad_norm": 2.0195913314819336, + "learning_rate": 1.4630295250320925e-05, + "loss": 0.6402, + "step": 3802 + }, + { + "epoch": 4.881899871630296, + "grad_norm": 1.608717679977417, + "learning_rate": 1.4634146341463415e-05, + "loss": 0.6617, + 
"step": 3803 + }, + { + "epoch": 4.883183568677792, + "grad_norm": 1.823927640914917, + "learning_rate": 1.4637997432605904e-05, + "loss": 0.6721, + "step": 3804 + }, + { + "epoch": 4.884467265725289, + "grad_norm": 3.9647650718688965, + "learning_rate": 1.4641848523748397e-05, + "loss": 0.6878, + "step": 3805 + }, + { + "epoch": 4.885750962772786, + "grad_norm": 2.825023889541626, + "learning_rate": 1.4645699614890886e-05, + "loss": 0.7184, + "step": 3806 + }, + { + "epoch": 4.887034659820283, + "grad_norm": 1.8389570713043213, + "learning_rate": 1.4649550706033376e-05, + "loss": 0.6625, + "step": 3807 + }, + { + "epoch": 4.888318356867779, + "grad_norm": 2.2967100143432617, + "learning_rate": 1.4653401797175868e-05, + "loss": 0.6635, + "step": 3808 + }, + { + "epoch": 4.889602053915276, + "grad_norm": 2.5147476196289062, + "learning_rate": 1.4657252888318358e-05, + "loss": 0.6935, + "step": 3809 + }, + { + "epoch": 4.890885750962773, + "grad_norm": 2.411224126815796, + "learning_rate": 1.4661103979460847e-05, + "loss": 0.6839, + "step": 3810 + }, + { + "epoch": 4.89216944801027, + "grad_norm": 3.3753082752227783, + "learning_rate": 1.4664955070603338e-05, + "loss": 0.7259, + "step": 3811 + }, + { + "epoch": 4.893453145057767, + "grad_norm": 5.1162943840026855, + "learning_rate": 1.466880616174583e-05, + "loss": 0.7433, + "step": 3812 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 4.139003753662109, + "learning_rate": 1.4672657252888319e-05, + "loss": 0.727, + "step": 3813 + }, + { + "epoch": 4.89602053915276, + "grad_norm": 11.909112930297852, + "learning_rate": 1.467650834403081e-05, + "loss": 0.8699, + "step": 3814 + }, + { + "epoch": 4.897304236200257, + "grad_norm": 4.785507678985596, + "learning_rate": 1.4680359435173299e-05, + "loss": 0.7958, + "step": 3815 + }, + { + "epoch": 4.898587933247754, + "grad_norm": 10.494651794433594, + "learning_rate": 1.468421052631579e-05, + "loss": 0.9697, + "step": 3816 + }, + { + "epoch": 4.89987163029525, + 
"grad_norm": 1.8197473287582397, + "learning_rate": 1.4688061617458281e-05, + "loss": 0.6608, + "step": 3817 + }, + { + "epoch": 4.901155327342747, + "grad_norm": 1.5792315006256104, + "learning_rate": 1.469191270860077e-05, + "loss": 0.6186, + "step": 3818 + }, + { + "epoch": 4.902439024390244, + "grad_norm": 3.7511866092681885, + "learning_rate": 1.469576379974326e-05, + "loss": 0.602, + "step": 3819 + }, + { + "epoch": 4.903722721437741, + "grad_norm": 2.523516893386841, + "learning_rate": 1.4699614890885753e-05, + "loss": 0.628, + "step": 3820 + }, + { + "epoch": 4.905006418485238, + "grad_norm": 2.2303779125213623, + "learning_rate": 1.4703465982028242e-05, + "loss": 0.6012, + "step": 3821 + }, + { + "epoch": 4.906290115532734, + "grad_norm": 1.8046683073043823, + "learning_rate": 1.4707317073170732e-05, + "loss": 0.6097, + "step": 3822 + }, + { + "epoch": 4.907573812580231, + "grad_norm": 1.9285749197006226, + "learning_rate": 1.4711168164313221e-05, + "loss": 0.6182, + "step": 3823 + }, + { + "epoch": 4.908857509627728, + "grad_norm": 1.5394097566604614, + "learning_rate": 1.4715019255455714e-05, + "loss": 0.6035, + "step": 3824 + }, + { + "epoch": 4.910141206675225, + "grad_norm": 2.018237352371216, + "learning_rate": 1.4718870346598203e-05, + "loss": 0.6084, + "step": 3825 + }, + { + "epoch": 4.911424903722722, + "grad_norm": 1.7907087802886963, + "learning_rate": 1.4722721437740692e-05, + "loss": 0.6442, + "step": 3826 + }, + { + "epoch": 4.912708600770218, + "grad_norm": 2.859980821609497, + "learning_rate": 1.4726572528883184e-05, + "loss": 0.6346, + "step": 3827 + }, + { + "epoch": 4.913992297817715, + "grad_norm": 1.9881126880645752, + "learning_rate": 1.4730423620025675e-05, + "loss": 0.6208, + "step": 3828 + }, + { + "epoch": 4.915275994865212, + "grad_norm": 1.9527497291564941, + "learning_rate": 1.4734274711168164e-05, + "loss": 0.6029, + "step": 3829 + }, + { + "epoch": 4.916559691912709, + "grad_norm": 2.083634376525879, + "learning_rate": 
1.4738125802310655e-05, + "loss": 0.6415, + "step": 3830 + }, + { + "epoch": 4.917843388960206, + "grad_norm": 3.5152547359466553, + "learning_rate": 1.4741976893453146e-05, + "loss": 0.604, + "step": 3831 + }, + { + "epoch": 4.919127086007702, + "grad_norm": 1.9861507415771484, + "learning_rate": 1.4745827984595635e-05, + "loss": 0.665, + "step": 3832 + }, + { + "epoch": 4.920410783055199, + "grad_norm": 2.289703130722046, + "learning_rate": 1.4749679075738127e-05, + "loss": 0.656, + "step": 3833 + }, + { + "epoch": 4.921694480102696, + "grad_norm": 1.8452891111373901, + "learning_rate": 1.4753530166880616e-05, + "loss": 0.63, + "step": 3834 + }, + { + "epoch": 4.922978177150193, + "grad_norm": 1.6517425775527954, + "learning_rate": 1.4757381258023107e-05, + "loss": 0.632, + "step": 3835 + }, + { + "epoch": 4.92426187419769, + "grad_norm": 1.6050456762313843, + "learning_rate": 1.4761232349165598e-05, + "loss": 0.6602, + "step": 3836 + }, + { + "epoch": 4.925545571245186, + "grad_norm": 1.8594657182693481, + "learning_rate": 1.4765083440308087e-05, + "loss": 0.6032, + "step": 3837 + }, + { + "epoch": 4.926829268292683, + "grad_norm": 1.9160237312316895, + "learning_rate": 1.4768934531450577e-05, + "loss": 0.6036, + "step": 3838 + }, + { + "epoch": 4.92811296534018, + "grad_norm": 3.864582061767578, + "learning_rate": 1.477278562259307e-05, + "loss": 0.6421, + "step": 3839 + }, + { + "epoch": 4.929396662387677, + "grad_norm": 1.965776801109314, + "learning_rate": 1.4776636713735559e-05, + "loss": 0.5845, + "step": 3840 + }, + { + "epoch": 4.930680359435174, + "grad_norm": 1.960839033126831, + "learning_rate": 1.4780487804878048e-05, + "loss": 0.6049, + "step": 3841 + }, + { + "epoch": 4.93196405648267, + "grad_norm": 1.6280443668365479, + "learning_rate": 1.478433889602054e-05, + "loss": 0.6606, + "step": 3842 + }, + { + "epoch": 4.933247753530167, + "grad_norm": 2.338820695877075, + "learning_rate": 1.478818998716303e-05, + "loss": 0.6039, + "step": 3843 + }, + { 
+ "epoch": 4.934531450577664, + "grad_norm": 1.6715290546417236, + "learning_rate": 1.479204107830552e-05, + "loss": 0.6707, + "step": 3844 + }, + { + "epoch": 4.935815147625161, + "grad_norm": 2.6474924087524414, + "learning_rate": 1.4795892169448011e-05, + "loss": 0.6379, + "step": 3845 + }, + { + "epoch": 4.937098844672657, + "grad_norm": 4.5628204345703125, + "learning_rate": 1.47997432605905e-05, + "loss": 0.6821, + "step": 3846 + }, + { + "epoch": 4.938382541720154, + "grad_norm": 2.995866537094116, + "learning_rate": 1.4803594351732991e-05, + "loss": 0.6564, + "step": 3847 + }, + { + "epoch": 4.939666238767651, + "grad_norm": 2.8220856189727783, + "learning_rate": 1.4807445442875482e-05, + "loss": 0.6778, + "step": 3848 + }, + { + "epoch": 4.940949935815148, + "grad_norm": 3.7010321617126465, + "learning_rate": 1.4811296534017972e-05, + "loss": 0.6887, + "step": 3849 + }, + { + "epoch": 4.942233632862644, + "grad_norm": 4.2348809242248535, + "learning_rate": 1.4815147625160461e-05, + "loss": 0.6716, + "step": 3850 + }, + { + "epoch": 4.943517329910141, + "grad_norm": 1.8488143682479858, + "learning_rate": 1.4818998716302954e-05, + "loss": 0.7268, + "step": 3851 + }, + { + "epoch": 4.944801026957638, + "grad_norm": 2.024533271789551, + "learning_rate": 1.4822849807445443e-05, + "loss": 0.7241, + "step": 3852 + }, + { + "epoch": 4.946084724005135, + "grad_norm": 3.966815948486328, + "learning_rate": 1.4826700898587933e-05, + "loss": 0.6499, + "step": 3853 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 4.61466121673584, + "learning_rate": 1.4830551989730425e-05, + "loss": 0.686, + "step": 3854 + }, + { + "epoch": 4.948652118100128, + "grad_norm": 2.6070640087127686, + "learning_rate": 1.4834403080872915e-05, + "loss": 0.6669, + "step": 3855 + }, + { + "epoch": 4.949935815147625, + "grad_norm": 5.107367038726807, + "learning_rate": 1.4838254172015404e-05, + "loss": 0.66, + "step": 3856 + }, + { + "epoch": 4.951219512195122, + "grad_norm": 
2.4440605640411377, + "learning_rate": 1.4842105263157895e-05, + "loss": 0.641, + "step": 3857 + }, + { + "epoch": 4.952503209242619, + "grad_norm": 8.691120147705078, + "learning_rate": 1.4845956354300386e-05, + "loss": 0.7144, + "step": 3858 + }, + { + "epoch": 4.953786906290116, + "grad_norm": 4.953622341156006, + "learning_rate": 1.4849807445442876e-05, + "loss": 0.7171, + "step": 3859 + }, + { + "epoch": 4.955070603337612, + "grad_norm": 1.8871644735336304, + "learning_rate": 1.4853658536585367e-05, + "loss": 0.7201, + "step": 3860 + }, + { + "epoch": 4.956354300385109, + "grad_norm": 2.9130444526672363, + "learning_rate": 1.4857509627727856e-05, + "loss": 0.7151, + "step": 3861 + }, + { + "epoch": 4.957637997432606, + "grad_norm": 2.09570574760437, + "learning_rate": 1.4861360718870347e-05, + "loss": 0.7754, + "step": 3862 + }, + { + "epoch": 4.958921694480103, + "grad_norm": 3.851083517074585, + "learning_rate": 1.4865211810012838e-05, + "loss": 0.7698, + "step": 3863 + }, + { + "epoch": 4.9602053915276, + "grad_norm": 4.830481052398682, + "learning_rate": 1.4869062901155328e-05, + "loss": 0.8271, + "step": 3864 + }, + { + "epoch": 4.961489088575096, + "grad_norm": 3.357377052307129, + "learning_rate": 1.4872913992297817e-05, + "loss": 0.8824, + "step": 3865 + }, + { + "epoch": 4.962772785622593, + "grad_norm": 2.997957706451416, + "learning_rate": 1.487676508344031e-05, + "loss": 0.9215, + "step": 3866 + }, + { + "epoch": 4.96405648267009, + "grad_norm": 1.4914250373840332, + "learning_rate": 1.48806161745828e-05, + "loss": 0.627, + "step": 3867 + }, + { + "epoch": 4.965340179717587, + "grad_norm": 1.8998911380767822, + "learning_rate": 1.4884467265725289e-05, + "loss": 0.6268, + "step": 3868 + }, + { + "epoch": 4.966623876765084, + "grad_norm": 1.8255681991577148, + "learning_rate": 1.488831835686778e-05, + "loss": 0.6187, + "step": 3869 + }, + { + "epoch": 4.96790757381258, + "grad_norm": 2.1671135425567627, + "learning_rate": 1.489216944801027e-05, + 
"loss": 0.6409, + "step": 3870 + }, + { + "epoch": 4.969191270860077, + "grad_norm": 4.054187774658203, + "learning_rate": 1.489602053915276e-05, + "loss": 0.6815, + "step": 3871 + }, + { + "epoch": 4.970474967907574, + "grad_norm": 2.408535957336426, + "learning_rate": 1.4899871630295251e-05, + "loss": 0.6295, + "step": 3872 + }, + { + "epoch": 4.971758664955071, + "grad_norm": 2.4462502002716064, + "learning_rate": 1.490372272143774e-05, + "loss": 0.6757, + "step": 3873 + }, + { + "epoch": 4.973042362002568, + "grad_norm": 2.1040847301483154, + "learning_rate": 1.4907573812580232e-05, + "loss": 0.5957, + "step": 3874 + }, + { + "epoch": 4.974326059050064, + "grad_norm": 3.2682361602783203, + "learning_rate": 1.4911424903722721e-05, + "loss": 0.6506, + "step": 3875 + }, + { + "epoch": 4.975609756097561, + "grad_norm": 4.905563831329346, + "learning_rate": 1.4915275994865212e-05, + "loss": 0.6192, + "step": 3876 + }, + { + "epoch": 4.976893453145058, + "grad_norm": 2.9334354400634766, + "learning_rate": 1.4919127086007703e-05, + "loss": 0.6237, + "step": 3877 + }, + { + "epoch": 4.978177150192555, + "grad_norm": 5.810489654541016, + "learning_rate": 1.4922978177150193e-05, + "loss": 0.6714, + "step": 3878 + }, + { + "epoch": 4.979460847240051, + "grad_norm": 13.639580726623535, + "learning_rate": 1.4926829268292684e-05, + "loss": 0.6369, + "step": 3879 + }, + { + "epoch": 4.980744544287548, + "grad_norm": 2.1293082237243652, + "learning_rate": 1.4930680359435173e-05, + "loss": 0.614, + "step": 3880 + }, + { + "epoch": 4.982028241335045, + "grad_norm": 2.391047954559326, + "learning_rate": 1.4934531450577664e-05, + "loss": 0.643, + "step": 3881 + }, + { + "epoch": 4.983311938382542, + "grad_norm": 2.092421293258667, + "learning_rate": 1.4938382541720155e-05, + "loss": 0.6524, + "step": 3882 + }, + { + "epoch": 4.984595635430038, + "grad_norm": 2.4765639305114746, + "learning_rate": 1.4942233632862644e-05, + "loss": 0.6266, + "step": 3883 + }, + { + "epoch": 
4.985879332477535, + "grad_norm": 4.090422630310059, + "learning_rate": 1.4946084724005134e-05, + "loss": 0.668, + "step": 3884 + }, + { + "epoch": 4.987163029525032, + "grad_norm": 4.263540267944336, + "learning_rate": 1.4949935815147627e-05, + "loss": 0.6698, + "step": 3885 + }, + { + "epoch": 4.988446726572529, + "grad_norm": 83.31165313720703, + "learning_rate": 1.4953786906290116e-05, + "loss": 0.714, + "step": 3886 + }, + { + "epoch": 4.989730423620026, + "grad_norm": 4.868527889251709, + "learning_rate": 1.4957637997432605e-05, + "loss": 0.7231, + "step": 3887 + }, + { + "epoch": 4.991014120667522, + "grad_norm": 1.819318413734436, + "learning_rate": 1.4961489088575096e-05, + "loss": 0.7233, + "step": 3888 + }, + { + "epoch": 4.992297817715019, + "grad_norm": 1.4202648401260376, + "learning_rate": 1.4965340179717588e-05, + "loss": 0.6076, + "step": 3889 + }, + { + "epoch": 4.993581514762516, + "grad_norm": 10.079683303833008, + "learning_rate": 1.4969191270860077e-05, + "loss": 0.7336, + "step": 3890 + }, + { + "epoch": 4.994865211810013, + "grad_norm": 2.4588570594787598, + "learning_rate": 1.4973042362002568e-05, + "loss": 0.7588, + "step": 3891 + }, + { + "epoch": 4.99614890885751, + "grad_norm": 2.1113529205322266, + "learning_rate": 1.4976893453145057e-05, + "loss": 0.7804, + "step": 3892 + }, + { + "epoch": 4.997432605905006, + "grad_norm": 1.8981006145477295, + "learning_rate": 1.4980744544287548e-05, + "loss": 0.7708, + "step": 3893 + }, + { + "epoch": 4.998716302952503, + "grad_norm": 5.721512317657471, + "learning_rate": 1.498459563543004e-05, + "loss": 0.8637, + "step": 3894 + }, + { + "epoch": 5.0, + "grad_norm": 3.9736506938934326, + "learning_rate": 1.4988446726572529e-05, + "loss": 0.9796, + "step": 3895 + }, + { + "epoch": 5.001283697047497, + "grad_norm": 3.360368490219116, + "learning_rate": 1.4992297817715018e-05, + "loss": 0.6173, + "step": 3896 + }, + { + "epoch": 5.002567394094994, + "grad_norm": 1.922447919845581, + "learning_rate": 
1.4996148908857511e-05, + "loss": 0.6068, + "step": 3897 + }, + { + "epoch": 5.00385109114249, + "grad_norm": 1.4429768323898315, + "learning_rate": 1.5e-05, + "loss": 0.5886, + "step": 3898 + }, + { + "epoch": 5.005134788189987, + "grad_norm": 1.5944563150405884, + "learning_rate": 1.5003851091142491e-05, + "loss": 0.6084, + "step": 3899 + }, + { + "epoch": 5.006418485237484, + "grad_norm": 2.517897367477417, + "learning_rate": 1.500770218228498e-05, + "loss": 0.6213, + "step": 3900 + }, + { + "epoch": 5.007702182284981, + "grad_norm": 5.218886852264404, + "learning_rate": 1.5011553273427472e-05, + "loss": 0.6478, + "step": 3901 + }, + { + "epoch": 5.008985879332478, + "grad_norm": 3.1537084579467773, + "learning_rate": 1.5015404364569963e-05, + "loss": 0.6593, + "step": 3902 + }, + { + "epoch": 5.010269576379974, + "grad_norm": 3.095590353012085, + "learning_rate": 1.5019255455712452e-05, + "loss": 0.6111, + "step": 3903 + }, + { + "epoch": 5.011553273427471, + "grad_norm": 2.208683490753174, + "learning_rate": 1.5023106546854943e-05, + "loss": 0.6371, + "step": 3904 + }, + { + "epoch": 5.012836970474968, + "grad_norm": 2.308093309402466, + "learning_rate": 1.5026957637997431e-05, + "loss": 0.6254, + "step": 3905 + }, + { + "epoch": 5.014120667522465, + "grad_norm": 2.1444883346557617, + "learning_rate": 1.5030808729139924e-05, + "loss": 0.6324, + "step": 3906 + }, + { + "epoch": 5.015404364569961, + "grad_norm": 3.025191307067871, + "learning_rate": 1.5034659820282415e-05, + "loss": 0.6551, + "step": 3907 + }, + { + "epoch": 5.016688061617458, + "grad_norm": 6.6089606285095215, + "learning_rate": 1.5038510911424903e-05, + "loss": 0.6081, + "step": 3908 + }, + { + "epoch": 5.017971758664955, + "grad_norm": 2.0255210399627686, + "learning_rate": 1.5042362002567395e-05, + "loss": 0.6579, + "step": 3909 + }, + { + "epoch": 5.019255455712452, + "grad_norm": 1.2673782110214233, + "learning_rate": 1.5046213093709886e-05, + "loss": 0.619, + "step": 3910 + }, + { + 
"epoch": 5.020539152759949, + "grad_norm": 2.068720579147339, + "learning_rate": 1.5050064184852374e-05, + "loss": 0.6342, + "step": 3911 + }, + { + "epoch": 5.021822849807445, + "grad_norm": 3.1107757091522217, + "learning_rate": 1.5053915275994867e-05, + "loss": 0.608, + "step": 3912 + }, + { + "epoch": 5.023106546854942, + "grad_norm": 1.4303964376449585, + "learning_rate": 1.5057766367137355e-05, + "loss": 0.6208, + "step": 3913 + }, + { + "epoch": 5.024390243902439, + "grad_norm": 1.9452592134475708, + "learning_rate": 1.5061617458279846e-05, + "loss": 0.6174, + "step": 3914 + }, + { + "epoch": 5.025673940949936, + "grad_norm": 3.3999996185302734, + "learning_rate": 1.5065468549422338e-05, + "loss": 0.6064, + "step": 3915 + }, + { + "epoch": 5.026957637997433, + "grad_norm": 1.8991845846176147, + "learning_rate": 1.5069319640564826e-05, + "loss": 0.678, + "step": 3916 + }, + { + "epoch": 5.028241335044929, + "grad_norm": 2.376041889190674, + "learning_rate": 1.5073170731707317e-05, + "loss": 0.6569, + "step": 3917 + }, + { + "epoch": 5.029525032092426, + "grad_norm": 19.81144905090332, + "learning_rate": 1.507702182284981e-05, + "loss": 0.6692, + "step": 3918 + }, + { + "epoch": 5.030808729139923, + "grad_norm": 1.5247185230255127, + "learning_rate": 1.5080872913992298e-05, + "loss": 0.6809, + "step": 3919 + }, + { + "epoch": 5.03209242618742, + "grad_norm": 2.606527328491211, + "learning_rate": 1.5084724005134789e-05, + "loss": 0.6523, + "step": 3920 + }, + { + "epoch": 5.033376123234916, + "grad_norm": 1.6783899068832397, + "learning_rate": 1.5088575096277281e-05, + "loss": 0.6257, + "step": 3921 + }, + { + "epoch": 5.034659820282413, + "grad_norm": 2.446645736694336, + "learning_rate": 1.5092426187419769e-05, + "loss": 0.6388, + "step": 3922 + }, + { + "epoch": 5.03594351732991, + "grad_norm": 1.6245474815368652, + "learning_rate": 1.509627727856226e-05, + "loss": 0.6304, + "step": 3923 + }, + { + "epoch": 5.037227214377407, + "grad_norm": 
2.9583542346954346, + "learning_rate": 1.510012836970475e-05, + "loss": 0.6366, + "step": 3924 + }, + { + "epoch": 5.038510911424904, + "grad_norm": 2.3510870933532715, + "learning_rate": 1.510397946084724e-05, + "loss": 0.6809, + "step": 3925 + }, + { + "epoch": 5.0397946084724, + "grad_norm": 1.64139986038208, + "learning_rate": 1.5107830551989732e-05, + "loss": 0.741, + "step": 3926 + }, + { + "epoch": 5.041078305519897, + "grad_norm": 4.047347545623779, + "learning_rate": 1.511168164313222e-05, + "loss": 0.6477, + "step": 3927 + }, + { + "epoch": 5.042362002567394, + "grad_norm": 4.828742980957031, + "learning_rate": 1.5115532734274712e-05, + "loss": 0.6695, + "step": 3928 + }, + { + "epoch": 5.043645699614891, + "grad_norm": 6.67119836807251, + "learning_rate": 1.5119383825417203e-05, + "loss": 0.6858, + "step": 3929 + }, + { + "epoch": 5.044929396662388, + "grad_norm": 2.0728964805603027, + "learning_rate": 1.5123234916559691e-05, + "loss": 0.6956, + "step": 3930 + }, + { + "epoch": 5.046213093709884, + "grad_norm": 1.8471554517745972, + "learning_rate": 1.5127086007702184e-05, + "loss": 0.6578, + "step": 3931 + }, + { + "epoch": 5.047496790757381, + "grad_norm": 9.754461288452148, + "learning_rate": 1.5130937098844671e-05, + "loss": 0.7247, + "step": 3932 + }, + { + "epoch": 5.048780487804878, + "grad_norm": 2.732081890106201, + "learning_rate": 1.5134788189987162e-05, + "loss": 0.6751, + "step": 3933 + }, + { + "epoch": 5.050064184852375, + "grad_norm": 1.9410662651062012, + "learning_rate": 1.5138639281129655e-05, + "loss": 0.671, + "step": 3934 + }, + { + "epoch": 5.051347881899872, + "grad_norm": 2.529433250427246, + "learning_rate": 1.5142490372272143e-05, + "loss": 0.6566, + "step": 3935 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 2.8129732608795166, + "learning_rate": 1.5146341463414634e-05, + "loss": 0.6753, + "step": 3936 + }, + { + "epoch": 5.053915275994865, + "grad_norm": 2.8749194145202637, + "learning_rate": 1.5150192554557127e-05, + 
"loss": 0.6731, + "step": 3937 + }, + { + "epoch": 5.055198973042362, + "grad_norm": 1.9310553073883057, + "learning_rate": 1.5154043645699614e-05, + "loss": 0.6804, + "step": 3938 + }, + { + "epoch": 5.056482670089859, + "grad_norm": 2.781045436859131, + "learning_rate": 1.5157894736842105e-05, + "loss": 0.6769, + "step": 3939 + }, + { + "epoch": 5.057766367137355, + "grad_norm": 3.1413414478302, + "learning_rate": 1.5161745827984595e-05, + "loss": 0.6817, + "step": 3940 + }, + { + "epoch": 5.059050064184852, + "grad_norm": 3.782827854156494, + "learning_rate": 1.5165596919127086e-05, + "loss": 0.7377, + "step": 3941 + }, + { + "epoch": 5.060333761232349, + "grad_norm": 1.8712363243103027, + "learning_rate": 1.5169448010269577e-05, + "loss": 0.7475, + "step": 3942 + }, + { + "epoch": 5.061617458279846, + "grad_norm": 4.357664108276367, + "learning_rate": 1.5173299101412066e-05, + "loss": 0.7802, + "step": 3943 + }, + { + "epoch": 5.062901155327343, + "grad_norm": 3.843656539916992, + "learning_rate": 1.5177150192554557e-05, + "loss": 0.8001, + "step": 3944 + }, + { + "epoch": 5.064184852374839, + "grad_norm": 2.5906481742858887, + "learning_rate": 1.5181001283697049e-05, + "loss": 0.8809, + "step": 3945 + }, + { + "epoch": 5.065468549422336, + "grad_norm": 1.2883588075637817, + "learning_rate": 1.5184852374839538e-05, + "loss": 0.657, + "step": 3946 + }, + { + "epoch": 5.066752246469833, + "grad_norm": 2.1596391201019287, + "learning_rate": 1.5188703465982029e-05, + "loss": 0.6276, + "step": 3947 + }, + { + "epoch": 5.06803594351733, + "grad_norm": 2.816128730773926, + "learning_rate": 1.519255455712452e-05, + "loss": 0.6442, + "step": 3948 + }, + { + "epoch": 5.069319640564827, + "grad_norm": 1.8410507440567017, + "learning_rate": 1.519640564826701e-05, + "loss": 0.6609, + "step": 3949 + }, + { + "epoch": 5.070603337612323, + "grad_norm": 2.7896227836608887, + "learning_rate": 1.52002567394095e-05, + "loss": 0.599, + "step": 3950 + }, + { + "epoch": 
5.07188703465982, + "grad_norm": 1.5719587802886963, + "learning_rate": 1.5204107830551988e-05, + "loss": 0.663, + "step": 3951 + }, + { + "epoch": 5.073170731707317, + "grad_norm": 1.5101051330566406, + "learning_rate": 1.5207958921694481e-05, + "loss": 0.6488, + "step": 3952 + }, + { + "epoch": 5.074454428754814, + "grad_norm": 1.5917270183563232, + "learning_rate": 1.5211810012836972e-05, + "loss": 0.6266, + "step": 3953 + }, + { + "epoch": 5.07573812580231, + "grad_norm": 1.0596908330917358, + "learning_rate": 1.521566110397946e-05, + "loss": 0.653, + "step": 3954 + }, + { + "epoch": 5.077021822849807, + "grad_norm": 6.126104831695557, + "learning_rate": 1.5219512195121952e-05, + "loss": 0.6276, + "step": 3955 + }, + { + "epoch": 5.078305519897304, + "grad_norm": 2.4984240531921387, + "learning_rate": 1.5223363286264444e-05, + "loss": 0.631, + "step": 3956 + }, + { + "epoch": 5.079589216944801, + "grad_norm": 1.675424337387085, + "learning_rate": 1.5227214377406931e-05, + "loss": 0.6517, + "step": 3957 + }, + { + "epoch": 5.080872913992298, + "grad_norm": 4.8762617111206055, + "learning_rate": 1.5231065468549424e-05, + "loss": 0.622, + "step": 3958 + }, + { + "epoch": 5.082156611039794, + "grad_norm": 3.368661403656006, + "learning_rate": 1.5234916559691912e-05, + "loss": 0.643, + "step": 3959 + }, + { + "epoch": 5.083440308087291, + "grad_norm": 1.6366950273513794, + "learning_rate": 1.5238767650834403e-05, + "loss": 0.6176, + "step": 3960 + }, + { + "epoch": 5.084724005134788, + "grad_norm": 1.4518698453903198, + "learning_rate": 1.5242618741976895e-05, + "loss": 0.6174, + "step": 3961 + }, + { + "epoch": 5.086007702182285, + "grad_norm": 2.1018238067626953, + "learning_rate": 1.5246469833119383e-05, + "loss": 0.6482, + "step": 3962 + }, + { + "epoch": 5.087291399229782, + "grad_norm": 1.5334125757217407, + "learning_rate": 1.5250320924261874e-05, + "loss": 0.6396, + "step": 3963 + }, + { + "epoch": 5.088575096277278, + "grad_norm": 4.7900190353393555, + 
"learning_rate": 1.5254172015404367e-05, + "loss": 0.6197, + "step": 3964 + }, + { + "epoch": 5.089858793324775, + "grad_norm": 2.0788509845733643, + "learning_rate": 1.5258023106546855e-05, + "loss": 0.6299, + "step": 3965 + }, + { + "epoch": 5.091142490372272, + "grad_norm": 1.5702170133590698, + "learning_rate": 1.5261874197689347e-05, + "loss": 0.6129, + "step": 3966 + }, + { + "epoch": 5.092426187419769, + "grad_norm": 1.3942809104919434, + "learning_rate": 1.526572528883184e-05, + "loss": 0.6337, + "step": 3967 + }, + { + "epoch": 5.093709884467266, + "grad_norm": 1.8182300329208374, + "learning_rate": 1.5269576379974326e-05, + "loss": 0.6398, + "step": 3968 + }, + { + "epoch": 5.094993581514762, + "grad_norm": 4.593632698059082, + "learning_rate": 1.5273427471116817e-05, + "loss": 0.6213, + "step": 3969 + }, + { + "epoch": 5.096277278562259, + "grad_norm": 1.6568801403045654, + "learning_rate": 1.5277278562259305e-05, + "loss": 0.6592, + "step": 3970 + }, + { + "epoch": 5.097560975609756, + "grad_norm": 5.728292465209961, + "learning_rate": 1.5281129653401796e-05, + "loss": 0.6935, + "step": 3971 + }, + { + "epoch": 5.098844672657253, + "grad_norm": 2.642376661300659, + "learning_rate": 1.528498074454429e-05, + "loss": 0.6593, + "step": 3972 + }, + { + "epoch": 5.100128369704749, + "grad_norm": 3.6254360675811768, + "learning_rate": 1.5288831835686778e-05, + "loss": 0.6952, + "step": 3973 + }, + { + "epoch": 5.101412066752246, + "grad_norm": 1.4772969484329224, + "learning_rate": 1.529268292682927e-05, + "loss": 0.6445, + "step": 3974 + }, + { + "epoch": 5.102695763799743, + "grad_norm": 1.9420469999313354, + "learning_rate": 1.529653401797176e-05, + "loss": 0.7119, + "step": 3975 + }, + { + "epoch": 5.10397946084724, + "grad_norm": 1.7813183069229126, + "learning_rate": 1.5300385109114248e-05, + "loss": 0.6339, + "step": 3976 + }, + { + "epoch": 5.105263157894737, + "grad_norm": 2.325282573699951, + "learning_rate": 1.530423620025674e-05, + "loss": 0.6389, 
+ "step": 3977 + }, + { + "epoch": 5.106546854942233, + "grad_norm": 2.105792999267578, + "learning_rate": 1.530808729139923e-05, + "loss": 0.6504, + "step": 3978 + }, + { + "epoch": 5.10783055198973, + "grad_norm": 1.9284250736236572, + "learning_rate": 1.531193838254172e-05, + "loss": 0.6875, + "step": 3979 + }, + { + "epoch": 5.109114249037227, + "grad_norm": 1.581079363822937, + "learning_rate": 1.5315789473684212e-05, + "loss": 0.713, + "step": 3980 + }, + { + "epoch": 5.110397946084724, + "grad_norm": 2.3697896003723145, + "learning_rate": 1.53196405648267e-05, + "loss": 0.6611, + "step": 3981 + }, + { + "epoch": 5.111681643132221, + "grad_norm": 2.836477518081665, + "learning_rate": 1.532349165596919e-05, + "loss": 0.687, + "step": 3982 + }, + { + "epoch": 5.112965340179717, + "grad_norm": 1.6905009746551514, + "learning_rate": 1.5327342747111682e-05, + "loss": 0.6606, + "step": 3983 + }, + { + "epoch": 5.114249037227214, + "grad_norm": 2.485922336578369, + "learning_rate": 1.5331193838254173e-05, + "loss": 0.7251, + "step": 3984 + }, + { + "epoch": 5.115532734274711, + "grad_norm": 3.2538414001464844, + "learning_rate": 1.5335044929396664e-05, + "loss": 0.6501, + "step": 3985 + }, + { + "epoch": 5.116816431322208, + "grad_norm": 2.882143497467041, + "learning_rate": 1.5338896020539152e-05, + "loss": 0.701, + "step": 3986 + }, + { + "epoch": 5.118100128369705, + "grad_norm": 3.1189205646514893, + "learning_rate": 1.5342747111681643e-05, + "loss": 0.6818, + "step": 3987 + }, + { + "epoch": 5.119383825417201, + "grad_norm": 6.8380889892578125, + "learning_rate": 1.5346598202824134e-05, + "loss": 0.7243, + "step": 3988 + }, + { + "epoch": 5.120667522464698, + "grad_norm": 3.0373611450195312, + "learning_rate": 1.5350449293966622e-05, + "loss": 0.7221, + "step": 3989 + }, + { + "epoch": 5.121951219512195, + "grad_norm": 2.055469274520874, + "learning_rate": 1.5354300385109116e-05, + "loss": 0.7331, + "step": 3990 + }, + { + "epoch": 5.123234916559692, + 
"grad_norm": 5.13417387008667, + "learning_rate": 1.5358151476251607e-05, + "loss": 0.7632, + "step": 3991 + }, + { + "epoch": 5.124518613607188, + "grad_norm": 1.9284995794296265, + "learning_rate": 1.5362002567394095e-05, + "loss": 0.7369, + "step": 3992 + }, + { + "epoch": 5.125802310654685, + "grad_norm": 2.1192986965179443, + "learning_rate": 1.5365853658536586e-05, + "loss": 0.746, + "step": 3993 + }, + { + "epoch": 5.127086007702182, + "grad_norm": 2.747610092163086, + "learning_rate": 1.5369704749679077e-05, + "loss": 0.7769, + "step": 3994 + }, + { + "epoch": 5.128369704749679, + "grad_norm": 5.943600654602051, + "learning_rate": 1.5373555840821565e-05, + "loss": 0.9944, + "step": 3995 + }, + { + "epoch": 5.129653401797176, + "grad_norm": 2.1144442558288574, + "learning_rate": 1.537740693196406e-05, + "loss": 0.6009, + "step": 3996 + }, + { + "epoch": 5.130937098844672, + "grad_norm": 3.8656973838806152, + "learning_rate": 1.5381258023106547e-05, + "loss": 0.6096, + "step": 3997 + }, + { + "epoch": 5.132220795892169, + "grad_norm": 2.583643913269043, + "learning_rate": 1.5385109114249038e-05, + "loss": 0.6523, + "step": 3998 + }, + { + "epoch": 5.133504492939666, + "grad_norm": 2.1181888580322266, + "learning_rate": 1.538896020539153e-05, + "loss": 0.6928, + "step": 3999 + }, + { + "epoch": 5.134788189987163, + "grad_norm": 1.172457218170166, + "learning_rate": 1.5392811296534017e-05, + "loss": 0.6207, + "step": 4000 + }, + { + "epoch": 5.134788189987163, + "eval_cer": 0.3178725279028784, + "eval_loss": 0.6331427097320557, + "eval_runtime": 14.3565, + "eval_samples_per_second": 68.471, + "eval_steps_per_second": 0.488, + "eval_wer": 0.5624856684246733, + "step": 4000 + }, + { + "epoch": 5.13607188703466, + "grad_norm": 2.420711040496826, + "learning_rate": 1.5396662387676508e-05, + "loss": 0.5914, + "step": 4001 + }, + { + "epoch": 5.137355584082156, + "grad_norm": 1.6120367050170898, + "learning_rate": 1.5400513478819e-05, + "loss": 0.6335, + "step": 4002 
+ }, + { + "epoch": 5.138639281129653, + "grad_norm": 2.5044116973876953, + "learning_rate": 1.540436456996149e-05, + "loss": 0.6659, + "step": 4003 + }, + { + "epoch": 5.13992297817715, + "grad_norm": 2.6025197505950928, + "learning_rate": 1.540821566110398e-05, + "loss": 0.6281, + "step": 4004 + }, + { + "epoch": 5.141206675224647, + "grad_norm": 1.4615975618362427, + "learning_rate": 1.541206675224647e-05, + "loss": 0.6211, + "step": 4005 + }, + { + "epoch": 5.142490372272144, + "grad_norm": 1.4774707555770874, + "learning_rate": 1.541591784338896e-05, + "loss": 0.5859, + "step": 4006 + }, + { + "epoch": 5.14377406931964, + "grad_norm": 1.8570183515548706, + "learning_rate": 1.541976893453145e-05, + "loss": 0.6326, + "step": 4007 + }, + { + "epoch": 5.145057766367137, + "grad_norm": 1.2365422248840332, + "learning_rate": 1.542362002567394e-05, + "loss": 0.6071, + "step": 4008 + }, + { + "epoch": 5.146341463414634, + "grad_norm": 1.8495573997497559, + "learning_rate": 1.5427471116816433e-05, + "loss": 0.5853, + "step": 4009 + }, + { + "epoch": 5.147625160462131, + "grad_norm": 2.987886428833008, + "learning_rate": 1.5431322207958924e-05, + "loss": 0.5977, + "step": 4010 + }, + { + "epoch": 5.148908857509627, + "grad_norm": 1.8687331676483154, + "learning_rate": 1.5435173299101412e-05, + "loss": 0.6047, + "step": 4011 + }, + { + "epoch": 5.150192554557124, + "grad_norm": 1.7513742446899414, + "learning_rate": 1.5439024390243903e-05, + "loss": 0.6283, + "step": 4012 + }, + { + "epoch": 5.151476251604621, + "grad_norm": 1.5433536767959595, + "learning_rate": 1.5442875481386394e-05, + "loss": 0.6249, + "step": 4013 + }, + { + "epoch": 5.152759948652118, + "grad_norm": 2.0689198970794678, + "learning_rate": 1.544672657252888e-05, + "loss": 0.6357, + "step": 4014 + }, + { + "epoch": 5.154043645699615, + "grad_norm": 2.1885576248168945, + "learning_rate": 1.5450577663671376e-05, + "loss": 0.6389, + "step": 4015 + }, + { + "epoch": 5.155327342747111, + "grad_norm": 
2.1141345500946045, + "learning_rate": 1.5454428754813864e-05, + "loss": 0.6436, + "step": 4016 + }, + { + "epoch": 5.156611039794608, + "grad_norm": 1.789086937904358, + "learning_rate": 1.5458279845956355e-05, + "loss": 0.5651, + "step": 4017 + }, + { + "epoch": 5.157894736842105, + "grad_norm": 1.461124300956726, + "learning_rate": 1.5462130937098846e-05, + "loss": 0.6184, + "step": 4018 + }, + { + "epoch": 5.159178433889602, + "grad_norm": 1.626528263092041, + "learning_rate": 1.5465982028241334e-05, + "loss": 0.6332, + "step": 4019 + }, + { + "epoch": 5.160462130937099, + "grad_norm": 2.1285767555236816, + "learning_rate": 1.5469833119383825e-05, + "loss": 0.6569, + "step": 4020 + }, + { + "epoch": 5.161745827984595, + "grad_norm": 2.842850923538208, + "learning_rate": 1.547368421052632e-05, + "loss": 0.5922, + "step": 4021 + }, + { + "epoch": 5.163029525032092, + "grad_norm": 1.7535349130630493, + "learning_rate": 1.5477535301668807e-05, + "loss": 0.5961, + "step": 4022 + }, + { + "epoch": 5.164313222079589, + "grad_norm": 1.9477663040161133, + "learning_rate": 1.5481386392811298e-05, + "loss": 0.6409, + "step": 4023 + }, + { + "epoch": 5.165596919127086, + "grad_norm": 3.3688204288482666, + "learning_rate": 1.5485237483953785e-05, + "loss": 0.6224, + "step": 4024 + }, + { + "epoch": 5.166880616174582, + "grad_norm": 2.1343772411346436, + "learning_rate": 1.5489088575096277e-05, + "loss": 0.6735, + "step": 4025 + }, + { + "epoch": 5.168164313222079, + "grad_norm": 8.016094207763672, + "learning_rate": 1.5492939666238768e-05, + "loss": 0.6602, + "step": 4026 + }, + { + "epoch": 5.169448010269576, + "grad_norm": 2.2168521881103516, + "learning_rate": 1.549679075738126e-05, + "loss": 0.7123, + "step": 4027 + }, + { + "epoch": 5.170731707317073, + "grad_norm": 10.036051750183105, + "learning_rate": 1.550064184852375e-05, + "loss": 0.6838, + "step": 4028 + }, + { + "epoch": 5.17201540436457, + "grad_norm": 3.8218164443969727, + "learning_rate": 
1.550449293966624e-05, + "loss": 0.6378, + "step": 4029 + }, + { + "epoch": 5.173299101412066, + "grad_norm": 3.765359878540039, + "learning_rate": 1.550834403080873e-05, + "loss": 0.6415, + "step": 4030 + }, + { + "epoch": 5.174582798459563, + "grad_norm": 1.4550719261169434, + "learning_rate": 1.551219512195122e-05, + "loss": 0.6938, + "step": 4031 + }, + { + "epoch": 5.17586649550706, + "grad_norm": 2.871358871459961, + "learning_rate": 1.5516046213093707e-05, + "loss": 0.6889, + "step": 4032 + }, + { + "epoch": 5.177150192554557, + "grad_norm": 1.9088983535766602, + "learning_rate": 1.5519897304236202e-05, + "loss": 0.7254, + "step": 4033 + }, + { + "epoch": 5.178433889602054, + "grad_norm": 7.882320880889893, + "learning_rate": 1.5523748395378693e-05, + "loss": 0.6871, + "step": 4034 + }, + { + "epoch": 5.17971758664955, + "grad_norm": 2.1229441165924072, + "learning_rate": 1.552759948652118e-05, + "loss": 0.676, + "step": 4035 + }, + { + "epoch": 5.181001283697047, + "grad_norm": 3.1849756240844727, + "learning_rate": 1.553145057766367e-05, + "loss": 0.6751, + "step": 4036 + }, + { + "epoch": 5.182284980744544, + "grad_norm": 3.8020858764648438, + "learning_rate": 1.5535301668806163e-05, + "loss": 0.6991, + "step": 4037 + }, + { + "epoch": 5.183568677792041, + "grad_norm": 3.630796432495117, + "learning_rate": 1.553915275994865e-05, + "loss": 0.7171, + "step": 4038 + }, + { + "epoch": 5.184852374839538, + "grad_norm": 2.2445461750030518, + "learning_rate": 1.5543003851091145e-05, + "loss": 0.6656, + "step": 4039 + }, + { + "epoch": 5.186136071887034, + "grad_norm": 5.3850274085998535, + "learning_rate": 1.5546854942233636e-05, + "loss": 0.7889, + "step": 4040 + }, + { + "epoch": 5.187419768934531, + "grad_norm": 5.747956275939941, + "learning_rate": 1.5550706033376124e-05, + "loss": 0.7311, + "step": 4041 + }, + { + "epoch": 5.188703465982028, + "grad_norm": 4.084462642669678, + "learning_rate": 1.5554557124518615e-05, + "loss": 0.7866, + "step": 4042 + }, + 
{ + "epoch": 5.189987163029525, + "grad_norm": 3.5855093002319336, + "learning_rate": 1.5558408215661102e-05, + "loss": 0.8249, + "step": 4043 + }, + { + "epoch": 5.191270860077021, + "grad_norm": 2.009455680847168, + "learning_rate": 1.5562259306803593e-05, + "loss": 0.7387, + "step": 4044 + }, + { + "epoch": 5.192554557124518, + "grad_norm": 36.48512649536133, + "learning_rate": 1.5566110397946088e-05, + "loss": 0.9754, + "step": 4045 + }, + { + "epoch": 5.193838254172015, + "grad_norm": 2.2737877368927, + "learning_rate": 1.5569961489088575e-05, + "loss": 0.6069, + "step": 4046 + }, + { + "epoch": 5.195121951219512, + "grad_norm": 2.697852611541748, + "learning_rate": 1.5573812580231067e-05, + "loss": 0.5922, + "step": 4047 + }, + { + "epoch": 5.196405648267009, + "grad_norm": 1.9616758823394775, + "learning_rate": 1.5577663671373558e-05, + "loss": 0.616, + "step": 4048 + }, + { + "epoch": 5.197689345314505, + "grad_norm": 1.45371413230896, + "learning_rate": 1.5581514762516045e-05, + "loss": 0.5917, + "step": 4049 + }, + { + "epoch": 5.198973042362002, + "grad_norm": 2.63128399848938, + "learning_rate": 1.5585365853658536e-05, + "loss": 0.6622, + "step": 4050 + }, + { + "epoch": 5.200256739409499, + "grad_norm": 2.516995906829834, + "learning_rate": 1.5589216944801027e-05, + "loss": 0.6147, + "step": 4051 + }, + { + "epoch": 5.201540436456996, + "grad_norm": 2.0516762733459473, + "learning_rate": 1.559306803594352e-05, + "loss": 0.596, + "step": 4052 + }, + { + "epoch": 5.202824133504493, + "grad_norm": 1.9088331460952759, + "learning_rate": 1.559691912708601e-05, + "loss": 0.6282, + "step": 4053 + }, + { + "epoch": 5.2041078305519894, + "grad_norm": 4.954780101776123, + "learning_rate": 1.5600770218228497e-05, + "loss": 0.5965, + "step": 4054 + }, + { + "epoch": 5.205391527599486, + "grad_norm": 7.494540214538574, + "learning_rate": 1.560462130937099e-05, + "loss": 0.639, + "step": 4055 + }, + { + "epoch": 5.206675224646983, + "grad_norm": 3.509826421737671, + 
"learning_rate": 1.560847240051348e-05, + "loss": 0.6575, + "step": 4056 + }, + { + "epoch": 5.20795892169448, + "grad_norm": 2.3960888385772705, + "learning_rate": 1.5612323491655967e-05, + "loss": 0.6016, + "step": 4057 + }, + { + "epoch": 5.2092426187419765, + "grad_norm": 4.947741985321045, + "learning_rate": 1.561617458279846e-05, + "loss": 0.6034, + "step": 4058 + }, + { + "epoch": 5.2105263157894735, + "grad_norm": 2.2851099967956543, + "learning_rate": 1.5620025673940953e-05, + "loss": 0.6189, + "step": 4059 + }, + { + "epoch": 5.21181001283697, + "grad_norm": 2.1529147624969482, + "learning_rate": 1.562387676508344e-05, + "loss": 0.6134, + "step": 4060 + }, + { + "epoch": 5.213093709884467, + "grad_norm": 2.3009793758392334, + "learning_rate": 1.562772785622593e-05, + "loss": 0.6238, + "step": 4061 + }, + { + "epoch": 5.214377406931964, + "grad_norm": 2.663614511489868, + "learning_rate": 1.563157894736842e-05, + "loss": 0.6211, + "step": 4062 + }, + { + "epoch": 5.2156611039794605, + "grad_norm": 1.6837427616119385, + "learning_rate": 1.563543003851091e-05, + "loss": 0.5963, + "step": 4063 + }, + { + "epoch": 5.2169448010269575, + "grad_norm": 2.453132390975952, + "learning_rate": 1.5639281129653405e-05, + "loss": 0.6608, + "step": 4064 + }, + { + "epoch": 5.218228498074454, + "grad_norm": 2.8986377716064453, + "learning_rate": 1.5643132220795892e-05, + "loss": 0.6312, + "step": 4065 + }, + { + "epoch": 5.219512195121951, + "grad_norm": 1.6387622356414795, + "learning_rate": 1.5646983311938383e-05, + "loss": 0.6594, + "step": 4066 + }, + { + "epoch": 5.220795892169448, + "grad_norm": 2.1605520248413086, + "learning_rate": 1.5650834403080874e-05, + "loss": 0.6483, + "step": 4067 + }, + { + "epoch": 5.2220795892169445, + "grad_norm": 4.567575454711914, + "learning_rate": 1.5654685494223362e-05, + "loss": 0.6378, + "step": 4068 + }, + { + "epoch": 5.2233632862644415, + "grad_norm": 1.3426835536956787, + "learning_rate": 1.5658536585365853e-05, + "loss": 
0.5874, + "step": 4069 + }, + { + "epoch": 5.224646983311938, + "grad_norm": 3.0690102577209473, + "learning_rate": 1.5662387676508344e-05, + "loss": 0.629, + "step": 4070 + }, + { + "epoch": 5.225930680359435, + "grad_norm": 2.8091282844543457, + "learning_rate": 1.5666238767650835e-05, + "loss": 0.6653, + "step": 4071 + }, + { + "epoch": 5.227214377406932, + "grad_norm": 2.6515963077545166, + "learning_rate": 1.5670089858793326e-05, + "loss": 0.6159, + "step": 4072 + }, + { + "epoch": 5.2284980744544285, + "grad_norm": 1.834327220916748, + "learning_rate": 1.5673940949935814e-05, + "loss": 0.6162, + "step": 4073 + }, + { + "epoch": 5.2297817715019255, + "grad_norm": 3.4233291149139404, + "learning_rate": 1.5677792041078305e-05, + "loss": 0.6091, + "step": 4074 + }, + { + "epoch": 5.2310654685494224, + "grad_norm": 1.5054512023925781, + "learning_rate": 1.5681643132220796e-05, + "loss": 0.6158, + "step": 4075 + }, + { + "epoch": 5.232349165596919, + "grad_norm": 4.3566083908081055, + "learning_rate": 1.5685494223363287e-05, + "loss": 0.648, + "step": 4076 + }, + { + "epoch": 5.2336328626444155, + "grad_norm": 1.9112563133239746, + "learning_rate": 1.568934531450578e-05, + "loss": 0.583, + "step": 4077 + }, + { + "epoch": 5.2349165596919125, + "grad_norm": 2.4928629398345947, + "learning_rate": 1.5693196405648266e-05, + "loss": 0.7056, + "step": 4078 + }, + { + "epoch": 5.2362002567394095, + "grad_norm": 6.269789695739746, + "learning_rate": 1.5697047496790757e-05, + "loss": 0.6811, + "step": 4079 + }, + { + "epoch": 5.2374839537869065, + "grad_norm": 2.625793695449829, + "learning_rate": 1.5700898587933248e-05, + "loss": 0.6469, + "step": 4080 + }, + { + "epoch": 5.238767650834403, + "grad_norm": 2.6589369773864746, + "learning_rate": 1.5704749679075736e-05, + "loss": 0.6512, + "step": 4081 + }, + { + "epoch": 5.2400513478818995, + "grad_norm": 4.319664001464844, + "learning_rate": 1.570860077021823e-05, + "loss": 0.6984, + "step": 4082 + }, + { + "epoch": 
5.2413350449293965, + "grad_norm": 2.403660535812378, + "learning_rate": 1.571245186136072e-05, + "loss": 0.6895, + "step": 4083 + }, + { + "epoch": 5.2426187419768935, + "grad_norm": 19.452503204345703, + "learning_rate": 1.571630295250321e-05, + "loss": 0.6754, + "step": 4084 + }, + { + "epoch": 5.2439024390243905, + "grad_norm": 2.0949747562408447, + "learning_rate": 1.57201540436457e-05, + "loss": 0.7165, + "step": 4085 + }, + { + "epoch": 5.245186136071887, + "grad_norm": 2.9447314739227295, + "learning_rate": 1.572400513478819e-05, + "loss": 0.7093, + "step": 4086 + }, + { + "epoch": 5.2464698331193835, + "grad_norm": 1.3162916898727417, + "learning_rate": 1.572785622593068e-05, + "loss": 0.681, + "step": 4087 + }, + { + "epoch": 5.2477535301668805, + "grad_norm": 2.8145792484283447, + "learning_rate": 1.5731707317073173e-05, + "loss": 0.7583, + "step": 4088 + }, + { + "epoch": 5.2490372272143775, + "grad_norm": 3.131622552871704, + "learning_rate": 1.573555840821566e-05, + "loss": 0.6946, + "step": 4089 + }, + { + "epoch": 5.2503209242618745, + "grad_norm": 7.287126064300537, + "learning_rate": 1.5739409499358152e-05, + "loss": 0.6954, + "step": 4090 + }, + { + "epoch": 5.251604621309371, + "grad_norm": 2.429964303970337, + "learning_rate": 1.5743260590500643e-05, + "loss": 0.7548, + "step": 4091 + }, + { + "epoch": 5.2528883183568675, + "grad_norm": 3.164731502532959, + "learning_rate": 1.574711168164313e-05, + "loss": 0.7829, + "step": 4092 + }, + { + "epoch": 5.2541720154043645, + "grad_norm": 2.317563772201538, + "learning_rate": 1.5750962772785622e-05, + "loss": 0.7559, + "step": 4093 + }, + { + "epoch": 5.2554557124518615, + "grad_norm": 5.85449743270874, + "learning_rate": 1.5754813863928116e-05, + "loss": 0.8088, + "step": 4094 + }, + { + "epoch": 5.2567394094993585, + "grad_norm": 3.5910463333129883, + "learning_rate": 1.5758664955070604e-05, + "loss": 0.911, + "step": 4095 + }, + { + "epoch": 5.258023106546855, + "grad_norm": 3.2000274658203125, + 
"learning_rate": 1.5762516046213095e-05, + "loss": 0.6263, + "step": 4096 + }, + { + "epoch": 5.2593068035943515, + "grad_norm": 1.5977823734283447, + "learning_rate": 1.5766367137355583e-05, + "loss": 0.6164, + "step": 4097 + }, + { + "epoch": 5.2605905006418485, + "grad_norm": 9.146986961364746, + "learning_rate": 1.5770218228498074e-05, + "loss": 0.6133, + "step": 4098 + }, + { + "epoch": 5.2618741976893455, + "grad_norm": 5.546446323394775, + "learning_rate": 1.5774069319640565e-05, + "loss": 0.6151, + "step": 4099 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 2.101184368133545, + "learning_rate": 1.5777920410783056e-05, + "loss": 0.6485, + "step": 4100 + }, + { + "epoch": 5.264441591784339, + "grad_norm": 3.179877996444702, + "learning_rate": 1.5781771501925547e-05, + "loss": 0.6456, + "step": 4101 + }, + { + "epoch": 5.2657252888318355, + "grad_norm": 1.1774340867996216, + "learning_rate": 1.5785622593068038e-05, + "loss": 0.6099, + "step": 4102 + }, + { + "epoch": 5.2670089858793325, + "grad_norm": 1.7437776327133179, + "learning_rate": 1.5789473684210526e-05, + "loss": 0.5778, + "step": 4103 + }, + { + "epoch": 5.2682926829268295, + "grad_norm": 1.4571985006332397, + "learning_rate": 1.5793324775353017e-05, + "loss": 0.6113, + "step": 4104 + }, + { + "epoch": 5.2695763799743265, + "grad_norm": 1.5323083400726318, + "learning_rate": 1.5797175866495508e-05, + "loss": 0.6239, + "step": 4105 + }, + { + "epoch": 5.270860077021823, + "grad_norm": 7.099809646606445, + "learning_rate": 1.5801026957637996e-05, + "loss": 0.5784, + "step": 4106 + }, + { + "epoch": 5.2721437740693196, + "grad_norm": 2.308004379272461, + "learning_rate": 1.580487804878049e-05, + "loss": 0.6356, + "step": 4107 + }, + { + "epoch": 5.2734274711168165, + "grad_norm": 14.21533489227295, + "learning_rate": 1.5808729139922978e-05, + "loss": 0.6202, + "step": 4108 + }, + { + "epoch": 5.2747111681643135, + "grad_norm": 1.8083785772323608, + "learning_rate": 1.581258023106547e-05, + 
"loss": 0.6684, + "step": 4109 + }, + { + "epoch": 5.27599486521181, + "grad_norm": 1.8340864181518555, + "learning_rate": 1.581643132220796e-05, + "loss": 0.6389, + "step": 4110 + }, + { + "epoch": 5.277278562259307, + "grad_norm": 7.9027814865112305, + "learning_rate": 1.5820282413350448e-05, + "loss": 0.6841, + "step": 4111 + }, + { + "epoch": 5.278562259306804, + "grad_norm": 2.60162615776062, + "learning_rate": 1.582413350449294e-05, + "loss": 0.6658, + "step": 4112 + }, + { + "epoch": 5.2798459563543005, + "grad_norm": 1.888907551765442, + "learning_rate": 1.5827984595635433e-05, + "loss": 0.6115, + "step": 4113 + }, + { + "epoch": 5.2811296534017975, + "grad_norm": 2.5081405639648438, + "learning_rate": 1.583183568677792e-05, + "loss": 0.64, + "step": 4114 + }, + { + "epoch": 5.282413350449294, + "grad_norm": 3.5566790103912354, + "learning_rate": 1.5835686777920412e-05, + "loss": 0.6587, + "step": 4115 + }, + { + "epoch": 5.283697047496791, + "grad_norm": 6.237599849700928, + "learning_rate": 1.58395378690629e-05, + "loss": 0.5831, + "step": 4116 + }, + { + "epoch": 5.284980744544288, + "grad_norm": 4.221958160400391, + "learning_rate": 1.584338896020539e-05, + "loss": 0.6442, + "step": 4117 + }, + { + "epoch": 5.2862644415917845, + "grad_norm": 3.516270399093628, + "learning_rate": 1.5847240051347882e-05, + "loss": 0.6158, + "step": 4118 + }, + { + "epoch": 5.2875481386392815, + "grad_norm": 3.4115724563598633, + "learning_rate": 1.5851091142490373e-05, + "loss": 0.6279, + "step": 4119 + }, + { + "epoch": 5.288831835686778, + "grad_norm": 2.304248809814453, + "learning_rate": 1.5854942233632864e-05, + "loss": 0.6428, + "step": 4120 + }, + { + "epoch": 5.290115532734275, + "grad_norm": 1.56059730052948, + "learning_rate": 1.5858793324775355e-05, + "loss": 0.628, + "step": 4121 + }, + { + "epoch": 5.291399229781772, + "grad_norm": 1.8371672630310059, + "learning_rate": 1.5862644415917843e-05, + "loss": 0.5911, + "step": 4122 + }, + { + "epoch": 
5.2926829268292686, + "grad_norm": 2.2626194953918457, + "learning_rate": 1.5866495507060334e-05, + "loss": 0.672, + "step": 4123 + }, + { + "epoch": 5.293966623876765, + "grad_norm": 1.180273175239563, + "learning_rate": 1.5870346598202825e-05, + "loss": 0.6487, + "step": 4124 + }, + { + "epoch": 5.295250320924262, + "grad_norm": 1.640741229057312, + "learning_rate": 1.5874197689345316e-05, + "loss": 0.6052, + "step": 4125 + }, + { + "epoch": 5.296534017971759, + "grad_norm": 2.2496235370635986, + "learning_rate": 1.5878048780487807e-05, + "loss": 0.6404, + "step": 4126 + }, + { + "epoch": 5.297817715019256, + "grad_norm": 1.4191396236419678, + "learning_rate": 1.5881899871630295e-05, + "loss": 0.6248, + "step": 4127 + }, + { + "epoch": 5.299101412066753, + "grad_norm": 5.299356937408447, + "learning_rate": 1.5885750962772786e-05, + "loss": 0.6351, + "step": 4128 + }, + { + "epoch": 5.300385109114249, + "grad_norm": 2.2928271293640137, + "learning_rate": 1.5889602053915277e-05, + "loss": 0.6847, + "step": 4129 + }, + { + "epoch": 5.301668806161746, + "grad_norm": 3.045933246612549, + "learning_rate": 1.5893453145057764e-05, + "loss": 0.6493, + "step": 4130 + }, + { + "epoch": 5.302952503209243, + "grad_norm": 3.974154472351074, + "learning_rate": 1.589730423620026e-05, + "loss": 0.6406, + "step": 4131 + }, + { + "epoch": 5.30423620025674, + "grad_norm": 2.092911958694458, + "learning_rate": 1.590115532734275e-05, + "loss": 0.6556, + "step": 4132 + }, + { + "epoch": 5.305519897304237, + "grad_norm": 2.7398102283477783, + "learning_rate": 1.5905006418485238e-05, + "loss": 0.6521, + "step": 4133 + }, + { + "epoch": 5.306803594351733, + "grad_norm": 5.432845115661621, + "learning_rate": 1.590885750962773e-05, + "loss": 0.6245, + "step": 4134 + }, + { + "epoch": 5.30808729139923, + "grad_norm": 2.029982566833496, + "learning_rate": 1.5912708600770216e-05, + "loss": 0.6625, + "step": 4135 + }, + { + "epoch": 5.309370988446727, + "grad_norm": 1.7351845502853394, + 
"learning_rate": 1.5916559691912707e-05, + "loss": 0.6782, + "step": 4136 + }, + { + "epoch": 5.310654685494224, + "grad_norm": 2.4285616874694824, + "learning_rate": 1.5920410783055202e-05, + "loss": 0.7527, + "step": 4137 + }, + { + "epoch": 5.311938382541721, + "grad_norm": 3.9706833362579346, + "learning_rate": 1.592426187419769e-05, + "loss": 0.6589, + "step": 4138 + }, + { + "epoch": 5.313222079589217, + "grad_norm": 6.281116485595703, + "learning_rate": 1.592811296534018e-05, + "loss": 0.6611, + "step": 4139 + }, + { + "epoch": 5.314505776636714, + "grad_norm": 3.925813674926758, + "learning_rate": 1.5931964056482672e-05, + "loss": 0.72, + "step": 4140 + }, + { + "epoch": 5.315789473684211, + "grad_norm": 3.2339844703674316, + "learning_rate": 1.593581514762516e-05, + "loss": 0.7915, + "step": 4141 + }, + { + "epoch": 5.317073170731708, + "grad_norm": 1.8605577945709229, + "learning_rate": 1.593966623876765e-05, + "loss": 0.7529, + "step": 4142 + }, + { + "epoch": 5.318356867779205, + "grad_norm": 1.6788089275360107, + "learning_rate": 1.594351732991014e-05, + "loss": 0.7913, + "step": 4143 + }, + { + "epoch": 5.319640564826701, + "grad_norm": 4.34873104095459, + "learning_rate": 1.5947368421052633e-05, + "loss": 0.8572, + "step": 4144 + }, + { + "epoch": 5.320924261874198, + "grad_norm": 9.100040435791016, + "learning_rate": 1.5951219512195124e-05, + "loss": 0.982, + "step": 4145 + }, + { + "epoch": 5.322207958921695, + "grad_norm": 1.8978099822998047, + "learning_rate": 1.595507060333761e-05, + "loss": 0.6323, + "step": 4146 + }, + { + "epoch": 5.323491655969192, + "grad_norm": 1.5450056791305542, + "learning_rate": 1.5958921694480102e-05, + "loss": 0.5867, + "step": 4147 + }, + { + "epoch": 5.324775353016688, + "grad_norm": 3.5902135372161865, + "learning_rate": 1.5962772785622594e-05, + "loss": 0.6374, + "step": 4148 + }, + { + "epoch": 5.326059050064185, + "grad_norm": 2.049207925796509, + "learning_rate": 1.5966623876765085e-05, + "loss": 0.6669, + 
"step": 4149 + }, + { + "epoch": 5.327342747111682, + "grad_norm": 1.8231462240219116, + "learning_rate": 1.5970474967907576e-05, + "loss": 0.6361, + "step": 4150 + }, + { + "epoch": 5.328626444159179, + "grad_norm": 1.0615911483764648, + "learning_rate": 1.5974326059050067e-05, + "loss": 0.6298, + "step": 4151 + }, + { + "epoch": 5.329910141206676, + "grad_norm": 2.2421278953552246, + "learning_rate": 1.5978177150192554e-05, + "loss": 0.6318, + "step": 4152 + }, + { + "epoch": 5.331193838254172, + "grad_norm": 2.9801878929138184, + "learning_rate": 1.5982028241335045e-05, + "loss": 0.6798, + "step": 4153 + }, + { + "epoch": 5.332477535301669, + "grad_norm": 3.6053545475006104, + "learning_rate": 1.5985879332477533e-05, + "loss": 0.6145, + "step": 4154 + }, + { + "epoch": 5.333761232349166, + "grad_norm": 2.288904905319214, + "learning_rate": 1.5989730423620024e-05, + "loss": 0.6989, + "step": 4155 + }, + { + "epoch": 5.335044929396663, + "grad_norm": 2.5273609161376953, + "learning_rate": 1.599358151476252e-05, + "loss": 0.6489, + "step": 4156 + }, + { + "epoch": 5.336328626444159, + "grad_norm": 1.7449934482574463, + "learning_rate": 1.5997432605905006e-05, + "loss": 0.6794, + "step": 4157 + }, + { + "epoch": 5.337612323491656, + "grad_norm": 1.6166746616363525, + "learning_rate": 1.6001283697047497e-05, + "loss": 0.5947, + "step": 4158 + }, + { + "epoch": 5.338896020539153, + "grad_norm": 4.780966758728027, + "learning_rate": 1.600513478818999e-05, + "loss": 0.6832, + "step": 4159 + }, + { + "epoch": 5.34017971758665, + "grad_norm": 1.6528953313827515, + "learning_rate": 1.6008985879332476e-05, + "loss": 0.6244, + "step": 4160 + }, + { + "epoch": 5.341463414634147, + "grad_norm": 1.4498509168624878, + "learning_rate": 1.6012836970474967e-05, + "loss": 0.6383, + "step": 4161 + }, + { + "epoch": 5.342747111681643, + "grad_norm": 1.8704229593276978, + "learning_rate": 1.601668806161746e-05, + "loss": 0.6103, + "step": 4162 + }, + { + "epoch": 5.34403080872914, + 
"grad_norm": 1.1405516862869263, + "learning_rate": 1.602053915275995e-05, + "loss": 0.6216, + "step": 4163 + }, + { + "epoch": 5.345314505776637, + "grad_norm": 1.5496256351470947, + "learning_rate": 1.602439024390244e-05, + "loss": 0.6436, + "step": 4164 + }, + { + "epoch": 5.346598202824134, + "grad_norm": 2.501094102859497, + "learning_rate": 1.6028241335044928e-05, + "loss": 0.6498, + "step": 4165 + }, + { + "epoch": 5.347881899871631, + "grad_norm": 2.3501970767974854, + "learning_rate": 1.603209242618742e-05, + "loss": 0.6341, + "step": 4166 + }, + { + "epoch": 5.349165596919127, + "grad_norm": 3.337913990020752, + "learning_rate": 1.603594351732991e-05, + "loss": 0.6341, + "step": 4167 + }, + { + "epoch": 5.350449293966624, + "grad_norm": 2.976102828979492, + "learning_rate": 1.60397946084724e-05, + "loss": 0.6295, + "step": 4168 + }, + { + "epoch": 5.351732991014121, + "grad_norm": 2.3169658184051514, + "learning_rate": 1.6043645699614892e-05, + "loss": 0.6572, + "step": 4169 + }, + { + "epoch": 5.353016688061618, + "grad_norm": 6.096192836761475, + "learning_rate": 1.6047496790757383e-05, + "loss": 0.6309, + "step": 4170 + }, + { + "epoch": 5.354300385109115, + "grad_norm": 1.6961110830307007, + "learning_rate": 1.605134788189987e-05, + "loss": 0.6169, + "step": 4171 + }, + { + "epoch": 5.355584082156611, + "grad_norm": 2.024836778640747, + "learning_rate": 1.6055198973042362e-05, + "loss": 0.6811, + "step": 4172 + }, + { + "epoch": 5.356867779204108, + "grad_norm": 2.17374849319458, + "learning_rate": 1.605905006418485e-05, + "loss": 0.7078, + "step": 4173 + }, + { + "epoch": 5.358151476251605, + "grad_norm": 1.9120925664901733, + "learning_rate": 1.6062901155327344e-05, + "loss": 0.6124, + "step": 4174 + }, + { + "epoch": 5.359435173299102, + "grad_norm": 2.2921624183654785, + "learning_rate": 1.6066752246469835e-05, + "loss": 0.6551, + "step": 4175 + }, + { + "epoch": 5.360718870346599, + "grad_norm": 2.084669828414917, + "learning_rate": 
1.6070603337612323e-05, + "loss": 0.6787, + "step": 4176 + }, + { + "epoch": 5.362002567394095, + "grad_norm": 2.4389591217041016, + "learning_rate": 1.6074454428754814e-05, + "loss": 0.65, + "step": 4177 + }, + { + "epoch": 5.363286264441592, + "grad_norm": 2.5594663619995117, + "learning_rate": 1.6078305519897305e-05, + "loss": 0.651, + "step": 4178 + }, + { + "epoch": 5.364569961489089, + "grad_norm": 3.7944207191467285, + "learning_rate": 1.6082156611039793e-05, + "loss": 0.6525, + "step": 4179 + }, + { + "epoch": 5.365853658536586, + "grad_norm": 2.9976887702941895, + "learning_rate": 1.6086007702182287e-05, + "loss": 0.6702, + "step": 4180 + }, + { + "epoch": 5.367137355584082, + "grad_norm": 2.2871134281158447, + "learning_rate": 1.6089858793324775e-05, + "loss": 0.6707, + "step": 4181 + }, + { + "epoch": 5.368421052631579, + "grad_norm": 3.244774103164673, + "learning_rate": 1.6093709884467266e-05, + "loss": 0.6701, + "step": 4182 + }, + { + "epoch": 5.369704749679076, + "grad_norm": 4.555275917053223, + "learning_rate": 1.6097560975609757e-05, + "loss": 0.696, + "step": 4183 + }, + { + "epoch": 5.370988446726573, + "grad_norm": 3.702625036239624, + "learning_rate": 1.6101412066752245e-05, + "loss": 0.7, + "step": 4184 + }, + { + "epoch": 5.37227214377407, + "grad_norm": 6.576122760772705, + "learning_rate": 1.6105263157894736e-05, + "loss": 0.674, + "step": 4185 + }, + { + "epoch": 5.373555840821566, + "grad_norm": 2.180065155029297, + "learning_rate": 1.610911424903723e-05, + "loss": 0.6818, + "step": 4186 + }, + { + "epoch": 5.374839537869063, + "grad_norm": 3.074937343597412, + "learning_rate": 1.6112965340179718e-05, + "loss": 0.7539, + "step": 4187 + }, + { + "epoch": 5.37612323491656, + "grad_norm": 2.6715006828308105, + "learning_rate": 1.611681643132221e-05, + "loss": 0.6857, + "step": 4188 + }, + { + "epoch": 5.377406931964057, + "grad_norm": 3.364168167114258, + "learning_rate": 1.6120667522464697e-05, + "loss": 0.686, + "step": 4189 + }, + { + 
"epoch": 5.378690629011553, + "grad_norm": 1.9851948022842407, + "learning_rate": 1.6124518613607188e-05, + "loss": 0.7218, + "step": 4190 + }, + { + "epoch": 5.37997432605905, + "grad_norm": 3.238682746887207, + "learning_rate": 1.612836970474968e-05, + "loss": 0.721, + "step": 4191 + }, + { + "epoch": 5.381258023106547, + "grad_norm": 1.7978001832962036, + "learning_rate": 1.613222079589217e-05, + "loss": 0.7878, + "step": 4192 + }, + { + "epoch": 5.382541720154044, + "grad_norm": 3.4052321910858154, + "learning_rate": 1.613607188703466e-05, + "loss": 0.8058, + "step": 4193 + }, + { + "epoch": 5.383825417201541, + "grad_norm": 2.748804807662964, + "learning_rate": 1.6139922978177152e-05, + "loss": 0.7777, + "step": 4194 + }, + { + "epoch": 5.385109114249037, + "grad_norm": 4.264201641082764, + "learning_rate": 1.614377406931964e-05, + "loss": 1.0527, + "step": 4195 + }, + { + "epoch": 5.386392811296534, + "grad_norm": 2.6514720916748047, + "learning_rate": 1.614762516046213e-05, + "loss": 0.6756, + "step": 4196 + }, + { + "epoch": 5.387676508344031, + "grad_norm": 1.505454659461975, + "learning_rate": 1.6151476251604622e-05, + "loss": 0.6322, + "step": 4197 + }, + { + "epoch": 5.388960205391528, + "grad_norm": 1.814753770828247, + "learning_rate": 1.6155327342747113e-05, + "loss": 0.6222, + "step": 4198 + }, + { + "epoch": 5.390243902439025, + "grad_norm": 2.473555088043213, + "learning_rate": 1.6159178433889604e-05, + "loss": 0.644, + "step": 4199 + }, + { + "epoch": 5.391527599486521, + "grad_norm": 2.266329050064087, + "learning_rate": 1.6163029525032092e-05, + "loss": 0.633, + "step": 4200 + }, + { + "epoch": 5.392811296534018, + "grad_norm": 1.2017114162445068, + "learning_rate": 1.6166880616174583e-05, + "loss": 0.6318, + "step": 4201 + }, + { + "epoch": 5.394094993581515, + "grad_norm": 1.6466405391693115, + "learning_rate": 1.6170731707317074e-05, + "loss": 0.6439, + "step": 4202 + }, + { + "epoch": 5.395378690629012, + "grad_norm": 1.4348183870315552, + 
"learning_rate": 1.6174582798459562e-05, + "loss": 0.6336, + "step": 4203 + }, + { + "epoch": 5.396662387676509, + "grad_norm": 1.4761970043182373, + "learning_rate": 1.6178433889602056e-05, + "loss": 0.6545, + "step": 4204 + }, + { + "epoch": 5.397946084724005, + "grad_norm": 1.4091612100601196, + "learning_rate": 1.6182284980744547e-05, + "loss": 0.627, + "step": 4205 + }, + { + "epoch": 5.399229781771502, + "grad_norm": 2.1815450191497803, + "learning_rate": 1.6186136071887035e-05, + "loss": 0.6297, + "step": 4206 + }, + { + "epoch": 5.400513478818999, + "grad_norm": 4.05512809753418, + "learning_rate": 1.6189987163029526e-05, + "loss": 0.6421, + "step": 4207 + }, + { + "epoch": 5.401797175866496, + "grad_norm": 1.7763680219650269, + "learning_rate": 1.6193838254172014e-05, + "loss": 0.7053, + "step": 4208 + }, + { + "epoch": 5.403080872913993, + "grad_norm": 1.7794417142868042, + "learning_rate": 1.6197689345314505e-05, + "loss": 0.6622, + "step": 4209 + }, + { + "epoch": 5.404364569961489, + "grad_norm": 2.8104262351989746, + "learning_rate": 1.6201540436456996e-05, + "loss": 0.6154, + "step": 4210 + }, + { + "epoch": 5.405648267008986, + "grad_norm": 2.2033474445343018, + "learning_rate": 1.6205391527599487e-05, + "loss": 0.68, + "step": 4211 + }, + { + "epoch": 5.406931964056483, + "grad_norm": 6.414041519165039, + "learning_rate": 1.6209242618741978e-05, + "loss": 0.6208, + "step": 4212 + }, + { + "epoch": 5.40821566110398, + "grad_norm": 9.502405166625977, + "learning_rate": 1.621309370988447e-05, + "loss": 0.6299, + "step": 4213 + }, + { + "epoch": 5.409499358151476, + "grad_norm": 3.7223703861236572, + "learning_rate": 1.6216944801026957e-05, + "loss": 0.6062, + "step": 4214 + }, + { + "epoch": 5.410783055198973, + "grad_norm": 10.446114540100098, + "learning_rate": 1.6220795892169448e-05, + "loss": 0.614, + "step": 4215 + }, + { + "epoch": 5.41206675224647, + "grad_norm": 1.5935193300247192, + "learning_rate": 1.622464698331194e-05, + "loss": 0.629, + 
"step": 4216 + }, + { + "epoch": 5.413350449293967, + "grad_norm": 1.9681264162063599, + "learning_rate": 1.622849807445443e-05, + "loss": 0.6379, + "step": 4217 + }, + { + "epoch": 5.414634146341464, + "grad_norm": 1.6943492889404297, + "learning_rate": 1.623234916559692e-05, + "loss": 0.6033, + "step": 4218 + }, + { + "epoch": 5.41591784338896, + "grad_norm": 1.427199363708496, + "learning_rate": 1.623620025673941e-05, + "loss": 0.6216, + "step": 4219 + }, + { + "epoch": 5.417201540436457, + "grad_norm": 1.86269211769104, + "learning_rate": 1.62400513478819e-05, + "loss": 0.6559, + "step": 4220 + }, + { + "epoch": 5.418485237483954, + "grad_norm": 1.6022456884384155, + "learning_rate": 1.624390243902439e-05, + "loss": 0.6223, + "step": 4221 + }, + { + "epoch": 5.419768934531451, + "grad_norm": 2.5826303958892822, + "learning_rate": 1.624775353016688e-05, + "loss": 0.6615, + "step": 4222 + }, + { + "epoch": 5.421052631578947, + "grad_norm": 1.8516720533370972, + "learning_rate": 1.6251604621309373e-05, + "loss": 0.6166, + "step": 4223 + }, + { + "epoch": 5.422336328626444, + "grad_norm": 1.4761958122253418, + "learning_rate": 1.6255455712451864e-05, + "loss": 0.6485, + "step": 4224 + }, + { + "epoch": 5.423620025673941, + "grad_norm": 1.4821816682815552, + "learning_rate": 1.6259306803594352e-05, + "loss": 0.615, + "step": 4225 + }, + { + "epoch": 5.424903722721438, + "grad_norm": 2.990129232406616, + "learning_rate": 1.6263157894736843e-05, + "loss": 0.6346, + "step": 4226 + }, + { + "epoch": 5.426187419768935, + "grad_norm": 2.245415687561035, + "learning_rate": 1.626700898587933e-05, + "loss": 0.6352, + "step": 4227 + }, + { + "epoch": 5.427471116816431, + "grad_norm": 1.5954135656356812, + "learning_rate": 1.627086007702182e-05, + "loss": 0.663, + "step": 4228 + }, + { + "epoch": 5.428754813863928, + "grad_norm": 2.0165088176727295, + "learning_rate": 1.6274711168164316e-05, + "loss": 0.676, + "step": 4229 + }, + { + "epoch": 5.430038510911425, + "grad_norm": 
1.808259129524231, + "learning_rate": 1.6278562259306804e-05, + "loss": 0.7015, + "step": 4230 + }, + { + "epoch": 5.431322207958922, + "grad_norm": 1.840050458908081, + "learning_rate": 1.6282413350449295e-05, + "loss": 0.7129, + "step": 4231 + }, + { + "epoch": 5.432605905006419, + "grad_norm": 1.8677133321762085, + "learning_rate": 1.6286264441591786e-05, + "loss": 0.6775, + "step": 4232 + }, + { + "epoch": 5.433889602053915, + "grad_norm": 3.5101356506347656, + "learning_rate": 1.6290115532734274e-05, + "loss": 0.6576, + "step": 4233 + }, + { + "epoch": 5.435173299101412, + "grad_norm": 2.973928213119507, + "learning_rate": 1.6293966623876765e-05, + "loss": 0.7117, + "step": 4234 + }, + { + "epoch": 5.436456996148909, + "grad_norm": 3.0258452892303467, + "learning_rate": 1.6297817715019256e-05, + "loss": 0.6624, + "step": 4235 + }, + { + "epoch": 5.437740693196406, + "grad_norm": 2.2028701305389404, + "learning_rate": 1.6301668806161747e-05, + "loss": 0.754, + "step": 4236 + }, + { + "epoch": 5.439024390243903, + "grad_norm": 2.851534366607666, + "learning_rate": 1.6305519897304238e-05, + "loss": 0.7498, + "step": 4237 + }, + { + "epoch": 5.440308087291399, + "grad_norm": 3.4250876903533936, + "learning_rate": 1.6309370988446725e-05, + "loss": 0.7067, + "step": 4238 + }, + { + "epoch": 5.441591784338896, + "grad_norm": 3.87798810005188, + "learning_rate": 1.6313222079589217e-05, + "loss": 0.7034, + "step": 4239 + }, + { + "epoch": 5.442875481386393, + "grad_norm": 3.2009384632110596, + "learning_rate": 1.6317073170731708e-05, + "loss": 0.6988, + "step": 4240 + }, + { + "epoch": 5.44415917843389, + "grad_norm": 6.432138919830322, + "learning_rate": 1.63209242618742e-05, + "loss": 0.7502, + "step": 4241 + }, + { + "epoch": 5.445442875481387, + "grad_norm": 3.933030605316162, + "learning_rate": 1.632477535301669e-05, + "loss": 0.7855, + "step": 4242 + }, + { + "epoch": 5.446726572528883, + "grad_norm": 3.0735092163085938, + "learning_rate": 1.632862644415918e-05, 
+ "loss": 0.7703, + "step": 4243 + }, + { + "epoch": 5.44801026957638, + "grad_norm": 4.943604946136475, + "learning_rate": 1.633247753530167e-05, + "loss": 0.8611, + "step": 4244 + }, + { + "epoch": 5.449293966623877, + "grad_norm": 2.8642776012420654, + "learning_rate": 1.633632862644416e-05, + "loss": 0.8572, + "step": 4245 + }, + { + "epoch": 5.450577663671374, + "grad_norm": 2.483442783355713, + "learning_rate": 1.6340179717586647e-05, + "loss": 0.6459, + "step": 4246 + }, + { + "epoch": 5.45186136071887, + "grad_norm": 1.4243509769439697, + "learning_rate": 1.6344030808729142e-05, + "loss": 0.6492, + "step": 4247 + }, + { + "epoch": 5.453145057766367, + "grad_norm": 1.4967914819717407, + "learning_rate": 1.6347881899871633e-05, + "loss": 0.6283, + "step": 4248 + }, + { + "epoch": 5.454428754813864, + "grad_norm": 1.923837423324585, + "learning_rate": 1.635173299101412e-05, + "loss": 0.5964, + "step": 4249 + }, + { + "epoch": 5.455712451861361, + "grad_norm": 1.7870192527770996, + "learning_rate": 1.635558408215661e-05, + "loss": 0.6381, + "step": 4250 + }, + { + "epoch": 5.456996148908858, + "grad_norm": 1.4920954704284668, + "learning_rate": 1.6359435173299103e-05, + "loss": 0.5927, + "step": 4251 + }, + { + "epoch": 5.458279845956354, + "grad_norm": 2.593395471572876, + "learning_rate": 1.636328626444159e-05, + "loss": 0.6313, + "step": 4252 + }, + { + "epoch": 5.459563543003851, + "grad_norm": 1.5453187227249146, + "learning_rate": 1.6367137355584085e-05, + "loss": 0.6, + "step": 4253 + }, + { + "epoch": 5.460847240051348, + "grad_norm": 1.3348695039749146, + "learning_rate": 1.6370988446726572e-05, + "loss": 0.6141, + "step": 4254 + }, + { + "epoch": 5.462130937098845, + "grad_norm": 2.678779125213623, + "learning_rate": 1.6374839537869063e-05, + "loss": 0.626, + "step": 4255 + }, + { + "epoch": 5.463414634146342, + "grad_norm": 1.5165486335754395, + "learning_rate": 1.6378690629011555e-05, + "loss": 0.6248, + "step": 4256 + }, + { + "epoch": 
5.464698331193838, + "grad_norm": 19.677757263183594, + "learning_rate": 1.6382541720154042e-05, + "loss": 0.5903, + "step": 4257 + }, + { + "epoch": 5.465982028241335, + "grad_norm": 1.386756181716919, + "learning_rate": 1.6386392811296533e-05, + "loss": 0.596, + "step": 4258 + }, + { + "epoch": 5.467265725288832, + "grad_norm": 3.6565072536468506, + "learning_rate": 1.6390243902439024e-05, + "loss": 0.6032, + "step": 4259 + }, + { + "epoch": 5.468549422336329, + "grad_norm": 1.21598482131958, + "learning_rate": 1.6394094993581515e-05, + "loss": 0.605, + "step": 4260 + }, + { + "epoch": 5.469833119383825, + "grad_norm": 5.892523288726807, + "learning_rate": 1.6397946084724007e-05, + "loss": 0.613, + "step": 4261 + }, + { + "epoch": 5.471116816431322, + "grad_norm": 2.5575027465820312, + "learning_rate": 1.6401797175866498e-05, + "loss": 0.6539, + "step": 4262 + }, + { + "epoch": 5.472400513478819, + "grad_norm": 2.816608428955078, + "learning_rate": 1.6405648267008985e-05, + "loss": 0.6534, + "step": 4263 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 2.716818332672119, + "learning_rate": 1.6409499358151476e-05, + "loss": 0.6742, + "step": 4264 + }, + { + "epoch": 5.474967907573813, + "grad_norm": 2.644439220428467, + "learning_rate": 1.6413350449293964e-05, + "loss": 0.6302, + "step": 4265 + }, + { + "epoch": 5.476251604621309, + "grad_norm": 2.003223180770874, + "learning_rate": 1.641720154043646e-05, + "loss": 0.6202, + "step": 4266 + }, + { + "epoch": 5.477535301668806, + "grad_norm": 2.279010772705078, + "learning_rate": 1.642105263157895e-05, + "loss": 0.6245, + "step": 4267 + }, + { + "epoch": 5.478818998716303, + "grad_norm": 4.138490200042725, + "learning_rate": 1.6424903722721437e-05, + "loss": 0.6044, + "step": 4268 + }, + { + "epoch": 5.4801026957638, + "grad_norm": 3.2843174934387207, + "learning_rate": 1.642875481386393e-05, + "loss": 0.665, + "step": 4269 + }, + { + "epoch": 5.481386392811297, + "grad_norm": 1.7014896869659424, + 
"learning_rate": 1.643260590500642e-05, + "loss": 0.66, + "step": 4270 + }, + { + "epoch": 5.482670089858793, + "grad_norm": 3.5983829498291016, + "learning_rate": 1.6436456996148907e-05, + "loss": 0.6618, + "step": 4271 + }, + { + "epoch": 5.48395378690629, + "grad_norm": 6.3533034324646, + "learning_rate": 1.64403080872914e-05, + "loss": 0.6632, + "step": 4272 + }, + { + "epoch": 5.485237483953787, + "grad_norm": 4.74854850769043, + "learning_rate": 1.644415917843389e-05, + "loss": 0.5826, + "step": 4273 + }, + { + "epoch": 5.486521181001284, + "grad_norm": 3.3803746700286865, + "learning_rate": 1.644801026957638e-05, + "loss": 0.6616, + "step": 4274 + }, + { + "epoch": 5.487804878048781, + "grad_norm": 2.8762547969818115, + "learning_rate": 1.645186136071887e-05, + "loss": 0.6714, + "step": 4275 + }, + { + "epoch": 5.489088575096277, + "grad_norm": 2.9515435695648193, + "learning_rate": 1.645571245186136e-05, + "loss": 0.6261, + "step": 4276 + }, + { + "epoch": 5.490372272143774, + "grad_norm": 7.585009574890137, + "learning_rate": 1.645956354300385e-05, + "loss": 0.681, + "step": 4277 + }, + { + "epoch": 5.491655969191271, + "grad_norm": 3.0360584259033203, + "learning_rate": 1.6463414634146345e-05, + "loss": 0.6467, + "step": 4278 + }, + { + "epoch": 5.492939666238768, + "grad_norm": 2.883394718170166, + "learning_rate": 1.6467265725288832e-05, + "loss": 0.6688, + "step": 4279 + }, + { + "epoch": 5.494223363286264, + "grad_norm": 4.011213779449463, + "learning_rate": 1.6471116816431323e-05, + "loss": 0.7228, + "step": 4280 + }, + { + "epoch": 5.495507060333761, + "grad_norm": 3.079350709915161, + "learning_rate": 1.6474967907573814e-05, + "loss": 0.7075, + "step": 4281 + }, + { + "epoch": 5.496790757381258, + "grad_norm": 2.5539839267730713, + "learning_rate": 1.6478818998716302e-05, + "loss": 0.6459, + "step": 4282 + }, + { + "epoch": 5.498074454428755, + "grad_norm": 4.945079803466797, + "learning_rate": 1.6482670089858793e-05, + "loss": 0.7226, + "step": 
4283 + }, + { + "epoch": 5.499358151476252, + "grad_norm": 5.020328044891357, + "learning_rate": 1.6486521181001284e-05, + "loss": 0.7125, + "step": 4284 + }, + { + "epoch": 5.500641848523748, + "grad_norm": 3.052297592163086, + "learning_rate": 1.6490372272143775e-05, + "loss": 0.6494, + "step": 4285 + }, + { + "epoch": 5.501925545571245, + "grad_norm": 2.1950740814208984, + "learning_rate": 1.6494223363286266e-05, + "loss": 0.6692, + "step": 4286 + }, + { + "epoch": 5.503209242618742, + "grad_norm": 9.971721649169922, + "learning_rate": 1.6498074454428754e-05, + "loss": 0.6967, + "step": 4287 + }, + { + "epoch": 5.504492939666239, + "grad_norm": 2.2380504608154297, + "learning_rate": 1.6501925545571245e-05, + "loss": 0.6866, + "step": 4288 + }, + { + "epoch": 5.505776636713735, + "grad_norm": 6.13014554977417, + "learning_rate": 1.6505776636713736e-05, + "loss": 0.6716, + "step": 4289 + }, + { + "epoch": 5.507060333761232, + "grad_norm": 2.5367045402526855, + "learning_rate": 1.6509627727856227e-05, + "loss": 0.6564, + "step": 4290 + }, + { + "epoch": 5.508344030808729, + "grad_norm": 1.5381401777267456, + "learning_rate": 1.651347881899872e-05, + "loss": 0.671, + "step": 4291 + }, + { + "epoch": 5.509627727856226, + "grad_norm": 2.6815764904022217, + "learning_rate": 1.6517329910141206e-05, + "loss": 0.6941, + "step": 4292 + }, + { + "epoch": 5.510911424903723, + "grad_norm": 3.2153635025024414, + "learning_rate": 1.6521181001283697e-05, + "loss": 0.7235, + "step": 4293 + }, + { + "epoch": 5.512195121951219, + "grad_norm": 3.7667267322540283, + "learning_rate": 1.6525032092426188e-05, + "loss": 0.7924, + "step": 4294 + }, + { + "epoch": 5.513478818998716, + "grad_norm": 3.993846893310547, + "learning_rate": 1.6528883183568676e-05, + "loss": 0.9617, + "step": 4295 + }, + { + "epoch": 5.514762516046213, + "grad_norm": 3.3489573001861572, + "learning_rate": 1.653273427471117e-05, + "loss": 0.6082, + "step": 4296 + }, + { + "epoch": 5.51604621309371, + "grad_norm": 
2.3644936084747314, + "learning_rate": 1.653658536585366e-05, + "loss": 0.5947, + "step": 4297 + }, + { + "epoch": 5.517329910141207, + "grad_norm": 3.332613945007324, + "learning_rate": 1.654043645699615e-05, + "loss": 0.6158, + "step": 4298 + }, + { + "epoch": 5.518613607188703, + "grad_norm": 1.8721575736999512, + "learning_rate": 1.654428754813864e-05, + "loss": 0.6381, + "step": 4299 + }, + { + "epoch": 5.5198973042362, + "grad_norm": 2.2883872985839844, + "learning_rate": 1.6548138639281128e-05, + "loss": 0.6657, + "step": 4300 + }, + { + "epoch": 5.521181001283697, + "grad_norm": 1.824894666671753, + "learning_rate": 1.655198973042362e-05, + "loss": 0.6154, + "step": 4301 + }, + { + "epoch": 5.522464698331194, + "grad_norm": 1.4057930707931519, + "learning_rate": 1.6555840821566113e-05, + "loss": 0.6167, + "step": 4302 + }, + { + "epoch": 5.523748395378691, + "grad_norm": 2.3687801361083984, + "learning_rate": 1.65596919127086e-05, + "loss": 0.603, + "step": 4303 + }, + { + "epoch": 5.525032092426187, + "grad_norm": 1.9276845455169678, + "learning_rate": 1.6563543003851092e-05, + "loss": 0.5893, + "step": 4304 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 1.3847469091415405, + "learning_rate": 1.6567394094993583e-05, + "loss": 0.6201, + "step": 4305 + }, + { + "epoch": 5.527599486521181, + "grad_norm": 1.7054003477096558, + "learning_rate": 1.657124518613607e-05, + "loss": 0.5761, + "step": 4306 + }, + { + "epoch": 5.528883183568678, + "grad_norm": 1.9869316816329956, + "learning_rate": 1.6575096277278562e-05, + "loss": 0.6033, + "step": 4307 + }, + { + "epoch": 5.530166880616175, + "grad_norm": 4.805994510650635, + "learning_rate": 1.6578947368421053e-05, + "loss": 0.6483, + "step": 4308 + }, + { + "epoch": 5.531450577663671, + "grad_norm": 1.836843729019165, + "learning_rate": 1.6582798459563544e-05, + "loss": 0.6412, + "step": 4309 + }, + { + "epoch": 5.532734274711168, + "grad_norm": 2.5635082721710205, + "learning_rate": 1.6586649550706035e-05, 
+ "loss": 0.6234, + "step": 4310 + }, + { + "epoch": 5.534017971758665, + "grad_norm": 2.1477386951446533, + "learning_rate": 1.6590500641848523e-05, + "loss": 0.6386, + "step": 4311 + }, + { + "epoch": 5.535301668806162, + "grad_norm": 2.714256763458252, + "learning_rate": 1.6594351732991014e-05, + "loss": 0.6215, + "step": 4312 + }, + { + "epoch": 5.536585365853659, + "grad_norm": 2.581988573074341, + "learning_rate": 1.6598202824133505e-05, + "loss": 0.6211, + "step": 4313 + }, + { + "epoch": 5.537869062901155, + "grad_norm": 4.338551998138428, + "learning_rate": 1.6602053915275993e-05, + "loss": 0.6516, + "step": 4314 + }, + { + "epoch": 5.539152759948652, + "grad_norm": 2.1291327476501465, + "learning_rate": 1.6605905006418487e-05, + "loss": 0.6108, + "step": 4315 + }, + { + "epoch": 5.540436456996149, + "grad_norm": 1.8580949306488037, + "learning_rate": 1.6609756097560978e-05, + "loss": 0.6357, + "step": 4316 + }, + { + "epoch": 5.541720154043646, + "grad_norm": 3.388636350631714, + "learning_rate": 1.6613607188703466e-05, + "loss": 0.6626, + "step": 4317 + }, + { + "epoch": 5.543003851091142, + "grad_norm": 1.3433785438537598, + "learning_rate": 1.6617458279845957e-05, + "loss": 0.6411, + "step": 4318 + }, + { + "epoch": 5.544287548138639, + "grad_norm": 1.5026334524154663, + "learning_rate": 1.6621309370988445e-05, + "loss": 0.6467, + "step": 4319 + }, + { + "epoch": 5.545571245186136, + "grad_norm": 4.253480434417725, + "learning_rate": 1.6625160462130936e-05, + "loss": 0.6346, + "step": 4320 + }, + { + "epoch": 5.546854942233633, + "grad_norm": 2.4640932083129883, + "learning_rate": 1.662901155327343e-05, + "loss": 0.626, + "step": 4321 + }, + { + "epoch": 5.548138639281129, + "grad_norm": 1.9935659170150757, + "learning_rate": 1.6632862644415918e-05, + "loss": 0.5945, + "step": 4322 + }, + { + "epoch": 5.549422336328626, + "grad_norm": 2.3665196895599365, + "learning_rate": 1.663671373555841e-05, + "loss": 0.6869, + "step": 4323 + }, + { + "epoch": 
5.550706033376123, + "grad_norm": 3.3962454795837402, + "learning_rate": 1.66405648267009e-05, + "loss": 0.6676, + "step": 4324 + }, + { + "epoch": 5.55198973042362, + "grad_norm": 3.6323158740997314, + "learning_rate": 1.6644415917843388e-05, + "loss": 0.6459, + "step": 4325 + }, + { + "epoch": 5.553273427471117, + "grad_norm": 2.462651014328003, + "learning_rate": 1.664826700898588e-05, + "loss": 0.6892, + "step": 4326 + }, + { + "epoch": 5.554557124518613, + "grad_norm": 2.525251626968384, + "learning_rate": 1.6652118100128373e-05, + "loss": 0.6276, + "step": 4327 + }, + { + "epoch": 5.55584082156611, + "grad_norm": 4.528709888458252, + "learning_rate": 1.665596919127086e-05, + "loss": 0.7187, + "step": 4328 + }, + { + "epoch": 5.557124518613607, + "grad_norm": 2.061924457550049, + "learning_rate": 1.6659820282413352e-05, + "loss": 0.6134, + "step": 4329 + }, + { + "epoch": 5.558408215661104, + "grad_norm": 4.350642681121826, + "learning_rate": 1.666367137355584e-05, + "loss": 0.693, + "step": 4330 + }, + { + "epoch": 5.559691912708601, + "grad_norm": 2.764559507369995, + "learning_rate": 1.666752246469833e-05, + "loss": 0.6895, + "step": 4331 + }, + { + "epoch": 5.560975609756097, + "grad_norm": 3.3674495220184326, + "learning_rate": 1.6671373555840822e-05, + "loss": 0.6769, + "step": 4332 + }, + { + "epoch": 5.562259306803594, + "grad_norm": 1.8086693286895752, + "learning_rate": 1.6675224646983313e-05, + "loss": 0.6854, + "step": 4333 + }, + { + "epoch": 5.563543003851091, + "grad_norm": 1.7951706647872925, + "learning_rate": 1.6679075738125804e-05, + "loss": 0.6492, + "step": 4334 + }, + { + "epoch": 5.564826700898588, + "grad_norm": 4.247420787811279, + "learning_rate": 1.6682926829268295e-05, + "loss": 0.7175, + "step": 4335 + }, + { + "epoch": 5.566110397946085, + "grad_norm": 1.558682918548584, + "learning_rate": 1.6686777920410783e-05, + "loss": 0.6514, + "step": 4336 + }, + { + "epoch": 5.567394094993581, + "grad_norm": 3.9076411724090576, + 
"learning_rate": 1.6690629011553274e-05, + "loss": 0.6766, + "step": 4337 + }, + { + "epoch": 5.568677792041078, + "grad_norm": 2.5382139682769775, + "learning_rate": 1.669448010269576e-05, + "loss": 0.7545, + "step": 4338 + }, + { + "epoch": 5.569961489088575, + "grad_norm": 2.1908323764801025, + "learning_rate": 1.6698331193838256e-05, + "loss": 0.6941, + "step": 4339 + }, + { + "epoch": 5.571245186136072, + "grad_norm": 2.195509672164917, + "learning_rate": 1.6702182284980747e-05, + "loss": 0.6497, + "step": 4340 + }, + { + "epoch": 5.572528883183569, + "grad_norm": 2.9520161151885986, + "learning_rate": 1.6706033376123235e-05, + "loss": 0.7413, + "step": 4341 + }, + { + "epoch": 5.573812580231065, + "grad_norm": 4.592132568359375, + "learning_rate": 1.6709884467265726e-05, + "loss": 0.7774, + "step": 4342 + }, + { + "epoch": 5.575096277278562, + "grad_norm": 3.7807116508483887, + "learning_rate": 1.6713735558408217e-05, + "loss": 0.7416, + "step": 4343 + }, + { + "epoch": 5.576379974326059, + "grad_norm": 4.71297550201416, + "learning_rate": 1.6717586649550704e-05, + "loss": 0.8421, + "step": 4344 + }, + { + "epoch": 5.577663671373556, + "grad_norm": 2.589552164077759, + "learning_rate": 1.67214377406932e-05, + "loss": 0.9425, + "step": 4345 + }, + { + "epoch": 5.578947368421053, + "grad_norm": 2.305396318435669, + "learning_rate": 1.6725288831835687e-05, + "loss": 0.648, + "step": 4346 + }, + { + "epoch": 5.580231065468549, + "grad_norm": 2.516794443130493, + "learning_rate": 1.6729139922978178e-05, + "loss": 0.6002, + "step": 4347 + }, + { + "epoch": 5.581514762516046, + "grad_norm": 2.174468517303467, + "learning_rate": 1.673299101412067e-05, + "loss": 0.641, + "step": 4348 + }, + { + "epoch": 5.582798459563543, + "grad_norm": 1.9253032207489014, + "learning_rate": 1.6736842105263156e-05, + "loss": 0.6492, + "step": 4349 + }, + { + "epoch": 5.58408215661104, + "grad_norm": 3.1297905445098877, + "learning_rate": 1.6740693196405647e-05, + "loss": 0.6532, + 
"step": 4350 + }, + { + "epoch": 5.585365853658536, + "grad_norm": 1.3400427103042603, + "learning_rate": 1.6744544287548142e-05, + "loss": 0.625, + "step": 4351 + }, + { + "epoch": 5.586649550706033, + "grad_norm": 4.95754861831665, + "learning_rate": 1.674839537869063e-05, + "loss": 0.6383, + "step": 4352 + }, + { + "epoch": 5.58793324775353, + "grad_norm": 1.8568506240844727, + "learning_rate": 1.675224646983312e-05, + "loss": 0.6235, + "step": 4353 + }, + { + "epoch": 5.589216944801027, + "grad_norm": 2.9159092903137207, + "learning_rate": 1.6756097560975612e-05, + "loss": 0.5912, + "step": 4354 + }, + { + "epoch": 5.590500641848524, + "grad_norm": 2.6532351970672607, + "learning_rate": 1.67599486521181e-05, + "loss": 0.6483, + "step": 4355 + }, + { + "epoch": 5.59178433889602, + "grad_norm": 2.2968456745147705, + "learning_rate": 1.676379974326059e-05, + "loss": 0.6348, + "step": 4356 + }, + { + "epoch": 5.593068035943517, + "grad_norm": 4.521332740783691, + "learning_rate": 1.676765083440308e-05, + "loss": 0.6583, + "step": 4357 + }, + { + "epoch": 5.594351732991014, + "grad_norm": 2.484927177429199, + "learning_rate": 1.6771501925545573e-05, + "loss": 0.6024, + "step": 4358 + }, + { + "epoch": 5.595635430038511, + "grad_norm": 2.378330707550049, + "learning_rate": 1.6775353016688064e-05, + "loss": 0.609, + "step": 4359 + }, + { + "epoch": 5.596919127086007, + "grad_norm": 2.110781669616699, + "learning_rate": 1.677920410783055e-05, + "loss": 0.643, + "step": 4360 + }, + { + "epoch": 5.598202824133504, + "grad_norm": 4.264458179473877, + "learning_rate": 1.6783055198973042e-05, + "loss": 0.6576, + "step": 4361 + }, + { + "epoch": 5.599486521181001, + "grad_norm": 2.4229564666748047, + "learning_rate": 1.6786906290115533e-05, + "loss": 0.6439, + "step": 4362 + }, + { + "epoch": 5.600770218228498, + "grad_norm": 3.1604971885681152, + "learning_rate": 1.679075738125802e-05, + "loss": 0.6246, + "step": 4363 + }, + { + "epoch": 5.602053915275995, + "grad_norm": 
2.7759346961975098, + "learning_rate": 1.6794608472400516e-05, + "loss": 0.6587, + "step": 4364 + }, + { + "epoch": 5.603337612323491, + "grad_norm": 2.4598405361175537, + "learning_rate": 1.6798459563543003e-05, + "loss": 0.6372, + "step": 4365 + }, + { + "epoch": 5.604621309370988, + "grad_norm": 3.9049203395843506, + "learning_rate": 1.6802310654685494e-05, + "loss": 0.5916, + "step": 4366 + }, + { + "epoch": 5.605905006418485, + "grad_norm": 3.312135696411133, + "learning_rate": 1.6806161745827985e-05, + "loss": 0.6144, + "step": 4367 + }, + { + "epoch": 5.607188703465982, + "grad_norm": 2.9578628540039062, + "learning_rate": 1.6810012836970473e-05, + "loss": 0.6562, + "step": 4368 + }, + { + "epoch": 5.608472400513479, + "grad_norm": 2.6465468406677246, + "learning_rate": 1.6813863928112964e-05, + "loss": 0.6325, + "step": 4369 + }, + { + "epoch": 5.609756097560975, + "grad_norm": 2.4080727100372314, + "learning_rate": 1.681771501925546e-05, + "loss": 0.6756, + "step": 4370 + }, + { + "epoch": 5.611039794608472, + "grad_norm": 2.403632164001465, + "learning_rate": 1.6821566110397946e-05, + "loss": 0.6976, + "step": 4371 + }, + { + "epoch": 5.612323491655969, + "grad_norm": 2.190084934234619, + "learning_rate": 1.6825417201540437e-05, + "loss": 0.6082, + "step": 4372 + }, + { + "epoch": 5.613607188703466, + "grad_norm": 2.940154552459717, + "learning_rate": 1.682926829268293e-05, + "loss": 0.6903, + "step": 4373 + }, + { + "epoch": 5.614890885750963, + "grad_norm": 3.24169659614563, + "learning_rate": 1.6833119383825416e-05, + "loss": 0.6668, + "step": 4374 + }, + { + "epoch": 5.616174582798459, + "grad_norm": 2.917513132095337, + "learning_rate": 1.6836970474967907e-05, + "loss": 0.6177, + "step": 4375 + }, + { + "epoch": 5.617458279845956, + "grad_norm": 5.584379196166992, + "learning_rate": 1.68408215661104e-05, + "loss": 0.6913, + "step": 4376 + }, + { + "epoch": 5.618741976893453, + "grad_norm": 2.663752317428589, + "learning_rate": 1.684467265725289e-05, 
+ "loss": 0.6394, + "step": 4377 + }, + { + "epoch": 5.62002567394095, + "grad_norm": 2.474339485168457, + "learning_rate": 1.684852374839538e-05, + "loss": 0.6422, + "step": 4378 + }, + { + "epoch": 5.621309370988447, + "grad_norm": 4.444998741149902, + "learning_rate": 1.6852374839537868e-05, + "loss": 0.7499, + "step": 4379 + }, + { + "epoch": 5.622593068035943, + "grad_norm": 9.19686222076416, + "learning_rate": 1.685622593068036e-05, + "loss": 0.6266, + "step": 4380 + }, + { + "epoch": 5.62387676508344, + "grad_norm": 3.1083970069885254, + "learning_rate": 1.686007702182285e-05, + "loss": 0.7124, + "step": 4381 + }, + { + "epoch": 5.625160462130937, + "grad_norm": 2.1154308319091797, + "learning_rate": 1.686392811296534e-05, + "loss": 0.7245, + "step": 4382 + }, + { + "epoch": 5.626444159178434, + "grad_norm": 3.4584929943084717, + "learning_rate": 1.6867779204107832e-05, + "loss": 0.7284, + "step": 4383 + }, + { + "epoch": 5.62772785622593, + "grad_norm": 4.085469722747803, + "learning_rate": 1.687163029525032e-05, + "loss": 0.6553, + "step": 4384 + }, + { + "epoch": 5.629011553273427, + "grad_norm": 2.0517473220825195, + "learning_rate": 1.687548138639281e-05, + "loss": 0.6428, + "step": 4385 + }, + { + "epoch": 5.630295250320924, + "grad_norm": 2.2369680404663086, + "learning_rate": 1.6879332477535302e-05, + "loss": 0.7436, + "step": 4386 + }, + { + "epoch": 5.631578947368421, + "grad_norm": 1.7089945077896118, + "learning_rate": 1.688318356867779e-05, + "loss": 0.6964, + "step": 4387 + }, + { + "epoch": 5.632862644415918, + "grad_norm": 2.8962581157684326, + "learning_rate": 1.6887034659820284e-05, + "loss": 0.6968, + "step": 4388 + }, + { + "epoch": 5.634146341463414, + "grad_norm": 3.274390935897827, + "learning_rate": 1.6890885750962775e-05, + "loss": 0.7901, + "step": 4389 + }, + { + "epoch": 5.635430038510911, + "grad_norm": 3.1983983516693115, + "learning_rate": 1.6894736842105263e-05, + "loss": 0.7605, + "step": 4390 + }, + { + "epoch": 
5.636713735558408, + "grad_norm": 3.614473581314087, + "learning_rate": 1.6898587933247754e-05, + "loss": 0.7413, + "step": 4391 + }, + { + "epoch": 5.637997432605905, + "grad_norm": 16.169057846069336, + "learning_rate": 1.6902439024390242e-05, + "loss": 0.6799, + "step": 4392 + }, + { + "epoch": 5.639281129653401, + "grad_norm": 3.8098740577697754, + "learning_rate": 1.6906290115532733e-05, + "loss": 0.8095, + "step": 4393 + }, + { + "epoch": 5.640564826700898, + "grad_norm": 4.8458638191223145, + "learning_rate": 1.6910141206675227e-05, + "loss": 0.7507, + "step": 4394 + }, + { + "epoch": 5.641848523748395, + "grad_norm": 3.4554738998413086, + "learning_rate": 1.6913992297817715e-05, + "loss": 0.9342, + "step": 4395 + }, + { + "epoch": 5.643132220795892, + "grad_norm": 2.1446499824523926, + "learning_rate": 1.6917843388960206e-05, + "loss": 0.6345, + "step": 4396 + }, + { + "epoch": 5.644415917843389, + "grad_norm": 2.0035576820373535, + "learning_rate": 1.6921694480102697e-05, + "loss": 0.6133, + "step": 4397 + }, + { + "epoch": 5.645699614890885, + "grad_norm": 2.0232245922088623, + "learning_rate": 1.6925545571245185e-05, + "loss": 0.6304, + "step": 4398 + }, + { + "epoch": 5.646983311938382, + "grad_norm": 1.6669145822525024, + "learning_rate": 1.6929396662387676e-05, + "loss": 0.5938, + "step": 4399 + }, + { + "epoch": 5.648267008985879, + "grad_norm": 4.056438446044922, + "learning_rate": 1.693324775353017e-05, + "loss": 0.619, + "step": 4400 + }, + { + "epoch": 5.649550706033376, + "grad_norm": 4.373841285705566, + "learning_rate": 1.6937098844672658e-05, + "loss": 0.6062, + "step": 4401 + }, + { + "epoch": 5.650834403080873, + "grad_norm": 2.1510438919067383, + "learning_rate": 1.694094993581515e-05, + "loss": 0.6078, + "step": 4402 + }, + { + "epoch": 5.652118100128369, + "grad_norm": 4.173067569732666, + "learning_rate": 1.6944801026957637e-05, + "loss": 0.6168, + "step": 4403 + }, + { + "epoch": 5.653401797175866, + "grad_norm": 1.769837498664856, + 
"learning_rate": 1.6948652118100128e-05, + "loss": 0.6351, + "step": 4404 + }, + { + "epoch": 5.654685494223363, + "grad_norm": 2.0561766624450684, + "learning_rate": 1.695250320924262e-05, + "loss": 0.6784, + "step": 4405 + }, + { + "epoch": 5.65596919127086, + "grad_norm": 1.664709448814392, + "learning_rate": 1.695635430038511e-05, + "loss": 0.5924, + "step": 4406 + }, + { + "epoch": 5.657252888318357, + "grad_norm": 2.035088539123535, + "learning_rate": 1.69602053915276e-05, + "loss": 0.6666, + "step": 4407 + }, + { + "epoch": 5.658536585365853, + "grad_norm": 2.3717055320739746, + "learning_rate": 1.6964056482670092e-05, + "loss": 0.6037, + "step": 4408 + }, + { + "epoch": 5.65982028241335, + "grad_norm": 1.5741952657699585, + "learning_rate": 1.696790757381258e-05, + "loss": 0.628, + "step": 4409 + }, + { + "epoch": 5.661103979460847, + "grad_norm": 2.4346437454223633, + "learning_rate": 1.697175866495507e-05, + "loss": 0.6631, + "step": 4410 + }, + { + "epoch": 5.662387676508344, + "grad_norm": 2.898308753967285, + "learning_rate": 1.697560975609756e-05, + "loss": 0.592, + "step": 4411 + }, + { + "epoch": 5.663671373555841, + "grad_norm": 2.1973934173583984, + "learning_rate": 1.697946084724005e-05, + "loss": 0.6114, + "step": 4412 + }, + { + "epoch": 5.664955070603337, + "grad_norm": 1.6031795740127563, + "learning_rate": 1.6983311938382544e-05, + "loss": 0.569, + "step": 4413 + }, + { + "epoch": 5.666238767650834, + "grad_norm": 2.107370138168335, + "learning_rate": 1.6987163029525032e-05, + "loss": 0.5989, + "step": 4414 + }, + { + "epoch": 5.667522464698331, + "grad_norm": 2.057605028152466, + "learning_rate": 1.6991014120667523e-05, + "loss": 0.5763, + "step": 4415 + }, + { + "epoch": 5.668806161745828, + "grad_norm": 2.176241397857666, + "learning_rate": 1.6994865211810014e-05, + "loss": 0.6723, + "step": 4416 + }, + { + "epoch": 5.670089858793324, + "grad_norm": 2.1223912239074707, + "learning_rate": 1.6998716302952502e-05, + "loss": 0.6054, + "step": 
4417 + }, + { + "epoch": 5.671373555840821, + "grad_norm": 1.9455161094665527, + "learning_rate": 1.7002567394094993e-05, + "loss": 0.6119, + "step": 4418 + }, + { + "epoch": 5.672657252888318, + "grad_norm": 2.630560874938965, + "learning_rate": 1.7006418485237487e-05, + "loss": 0.624, + "step": 4419 + }, + { + "epoch": 5.673940949935815, + "grad_norm": 2.374250888824463, + "learning_rate": 1.7010269576379975e-05, + "loss": 0.6207, + "step": 4420 + }, + { + "epoch": 5.675224646983312, + "grad_norm": 2.8154783248901367, + "learning_rate": 1.7014120667522466e-05, + "loss": 0.5988, + "step": 4421 + }, + { + "epoch": 5.676508344030808, + "grad_norm": 6.841653823852539, + "learning_rate": 1.7017971758664954e-05, + "loss": 0.6074, + "step": 4422 + }, + { + "epoch": 5.677792041078305, + "grad_norm": 1.7842504978179932, + "learning_rate": 1.7021822849807445e-05, + "loss": 0.6255, + "step": 4423 + }, + { + "epoch": 5.679075738125802, + "grad_norm": 3.8720810413360596, + "learning_rate": 1.7025673940949936e-05, + "loss": 0.6718, + "step": 4424 + }, + { + "epoch": 5.680359435173299, + "grad_norm": 2.9874179363250732, + "learning_rate": 1.7029525032092427e-05, + "loss": 0.6373, + "step": 4425 + }, + { + "epoch": 5.681643132220795, + "grad_norm": 3.939159870147705, + "learning_rate": 1.7033376123234918e-05, + "loss": 0.6418, + "step": 4426 + }, + { + "epoch": 5.682926829268292, + "grad_norm": 2.3429551124572754, + "learning_rate": 1.703722721437741e-05, + "loss": 0.6721, + "step": 4427 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 1.5716575384140015, + "learning_rate": 1.7041078305519897e-05, + "loss": 0.6733, + "step": 4428 + }, + { + "epoch": 5.685494223363286, + "grad_norm": 5.758504390716553, + "learning_rate": 1.7044929396662388e-05, + "loss": 0.6445, + "step": 4429 + }, + { + "epoch": 5.686777920410783, + "grad_norm": 1.4819997549057007, + "learning_rate": 1.7048780487804875e-05, + "loss": 0.6108, + "step": 4430 + }, + { + "epoch": 5.688061617458279, + 
"grad_norm": 8.936493873596191, + "learning_rate": 1.705263157894737e-05, + "loss": 0.6386, + "step": 4431 + }, + { + "epoch": 5.689345314505776, + "grad_norm": 2.6222000122070312, + "learning_rate": 1.705648267008986e-05, + "loss": 0.7008, + "step": 4432 + }, + { + "epoch": 5.690629011553273, + "grad_norm": 2.7086145877838135, + "learning_rate": 1.706033376123235e-05, + "loss": 0.6494, + "step": 4433 + }, + { + "epoch": 5.69191270860077, + "grad_norm": 4.2630615234375, + "learning_rate": 1.706418485237484e-05, + "loss": 0.7402, + "step": 4434 + }, + { + "epoch": 5.693196405648267, + "grad_norm": 2.8409583568573, + "learning_rate": 1.706803594351733e-05, + "loss": 0.7329, + "step": 4435 + }, + { + "epoch": 5.694480102695763, + "grad_norm": 2.845559597015381, + "learning_rate": 1.707188703465982e-05, + "loss": 0.7387, + "step": 4436 + }, + { + "epoch": 5.69576379974326, + "grad_norm": 2.713653326034546, + "learning_rate": 1.7075738125802313e-05, + "loss": 0.7699, + "step": 4437 + }, + { + "epoch": 5.697047496790757, + "grad_norm": 3.085509777069092, + "learning_rate": 1.70795892169448e-05, + "loss": 0.7082, + "step": 4438 + }, + { + "epoch": 5.698331193838254, + "grad_norm": 3.2184159755706787, + "learning_rate": 1.7083440308087292e-05, + "loss": 0.7288, + "step": 4439 + }, + { + "epoch": 5.699614890885751, + "grad_norm": 3.959197521209717, + "learning_rate": 1.7087291399229783e-05, + "loss": 0.7299, + "step": 4440 + }, + { + "epoch": 5.700898587933247, + "grad_norm": 3.5031375885009766, + "learning_rate": 1.709114249037227e-05, + "loss": 0.7011, + "step": 4441 + }, + { + "epoch": 5.702182284980744, + "grad_norm": 10.685999870300293, + "learning_rate": 1.709499358151476e-05, + "loss": 0.7183, + "step": 4442 + }, + { + "epoch": 5.703465982028241, + "grad_norm": 2.4554712772369385, + "learning_rate": 1.7098844672657256e-05, + "loss": 0.7662, + "step": 4443 + }, + { + "epoch": 5.704749679075738, + "grad_norm": 2.4095635414123535, + "learning_rate": 
1.7102695763799744e-05, + "loss": 0.8003, + "step": 4444 + }, + { + "epoch": 5.706033376123235, + "grad_norm": 4.248488903045654, + "learning_rate": 1.7106546854942235e-05, + "loss": 0.8614, + "step": 4445 + }, + { + "epoch": 5.7073170731707314, + "grad_norm": 1.4764927625656128, + "learning_rate": 1.7110397946084726e-05, + "loss": 0.6155, + "step": 4446 + }, + { + "epoch": 5.708600770218228, + "grad_norm": 2.3267407417297363, + "learning_rate": 1.7114249037227213e-05, + "loss": 0.614, + "step": 4447 + }, + { + "epoch": 5.709884467265725, + "grad_norm": 1.7198165655136108, + "learning_rate": 1.7118100128369705e-05, + "loss": 0.6021, + "step": 4448 + }, + { + "epoch": 5.711168164313222, + "grad_norm": 1.8720952272415161, + "learning_rate": 1.7121951219512196e-05, + "loss": 0.622, + "step": 4449 + }, + { + "epoch": 5.712451861360719, + "grad_norm": 5.425906181335449, + "learning_rate": 1.7125802310654687e-05, + "loss": 0.6105, + "step": 4450 + }, + { + "epoch": 5.7137355584082155, + "grad_norm": 3.0833544731140137, + "learning_rate": 1.7129653401797178e-05, + "loss": 0.637, + "step": 4451 + }, + { + "epoch": 5.715019255455712, + "grad_norm": 3.5162670612335205, + "learning_rate": 1.7133504492939665e-05, + "loss": 0.5975, + "step": 4452 + }, + { + "epoch": 5.716302952503209, + "grad_norm": 2.710115909576416, + "learning_rate": 1.7137355584082157e-05, + "loss": 0.663, + "step": 4453 + }, + { + "epoch": 5.717586649550706, + "grad_norm": 2.083320140838623, + "learning_rate": 1.7141206675224648e-05, + "loss": 0.6208, + "step": 4454 + }, + { + "epoch": 5.7188703465982025, + "grad_norm": 2.0430781841278076, + "learning_rate": 1.714505776636714e-05, + "loss": 0.6421, + "step": 4455 + }, + { + "epoch": 5.7201540436456995, + "grad_norm": 3.8381803035736084, + "learning_rate": 1.714890885750963e-05, + "loss": 0.6548, + "step": 4456 + }, + { + "epoch": 5.721437740693196, + "grad_norm": 3.084186553955078, + "learning_rate": 1.7152759948652117e-05, + "loss": 0.6552, + "step": 4457 
+ }, + { + "epoch": 5.722721437740693, + "grad_norm": 1.5742462873458862, + "learning_rate": 1.715661103979461e-05, + "loss": 0.6084, + "step": 4458 + }, + { + "epoch": 5.7240051347881895, + "grad_norm": 10.036101341247559, + "learning_rate": 1.71604621309371e-05, + "loss": 0.6284, + "step": 4459 + }, + { + "epoch": 5.7252888318356865, + "grad_norm": 4.1023335456848145, + "learning_rate": 1.7164313222079587e-05, + "loss": 0.6198, + "step": 4460 + }, + { + "epoch": 5.7265725288831835, + "grad_norm": 3.743346691131592, + "learning_rate": 1.7168164313222082e-05, + "loss": 0.621, + "step": 4461 + }, + { + "epoch": 5.7278562259306804, + "grad_norm": 2.9867172241210938, + "learning_rate": 1.7172015404364573e-05, + "loss": 0.5809, + "step": 4462 + }, + { + "epoch": 5.729139922978177, + "grad_norm": 2.1854310035705566, + "learning_rate": 1.717586649550706e-05, + "loss": 0.6543, + "step": 4463 + }, + { + "epoch": 5.7304236200256735, + "grad_norm": 5.963146686553955, + "learning_rate": 1.717971758664955e-05, + "loss": 0.5855, + "step": 4464 + }, + { + "epoch": 5.7317073170731705, + "grad_norm": 4.273271083831787, + "learning_rate": 1.7183568677792043e-05, + "loss": 0.6519, + "step": 4465 + }, + { + "epoch": 5.7329910141206675, + "grad_norm": 7.289885997772217, + "learning_rate": 1.718741976893453e-05, + "loss": 0.6357, + "step": 4466 + }, + { + "epoch": 5.7342747111681645, + "grad_norm": 3.114041566848755, + "learning_rate": 1.719127086007702e-05, + "loss": 0.6606, + "step": 4467 + }, + { + "epoch": 5.735558408215661, + "grad_norm": 4.267681121826172, + "learning_rate": 1.7195121951219512e-05, + "loss": 0.6351, + "step": 4468 + }, + { + "epoch": 5.7368421052631575, + "grad_norm": 3.01507568359375, + "learning_rate": 1.7198973042362003e-05, + "loss": 0.6134, + "step": 4469 + }, + { + "epoch": 5.7381258023106545, + "grad_norm": 2.1498420238494873, + "learning_rate": 1.7202824133504495e-05, + "loss": 0.5946, + "step": 4470 + }, + { + "epoch": 5.7394094993581515, + "grad_norm": 
1.7204375267028809, + "learning_rate": 1.7206675224646982e-05, + "loss": 0.652, + "step": 4471 + }, + { + "epoch": 5.7406931964056485, + "grad_norm": 2.637770414352417, + "learning_rate": 1.7210526315789473e-05, + "loss": 0.6324, + "step": 4472 + }, + { + "epoch": 5.741976893453145, + "grad_norm": 10.165199279785156, + "learning_rate": 1.7214377406931964e-05, + "loss": 0.6296, + "step": 4473 + }, + { + "epoch": 5.7432605905006415, + "grad_norm": 1.4224802255630493, + "learning_rate": 1.7218228498074455e-05, + "loss": 0.6091, + "step": 4474 + }, + { + "epoch": 5.7445442875481385, + "grad_norm": 2.08256196975708, + "learning_rate": 1.7222079589216947e-05, + "loss": 0.6679, + "step": 4475 + }, + { + "epoch": 5.7458279845956355, + "grad_norm": 3.55830454826355, + "learning_rate": 1.7225930680359434e-05, + "loss": 0.6397, + "step": 4476 + }, + { + "epoch": 5.7471116816431325, + "grad_norm": 3.0241928100585938, + "learning_rate": 1.7229781771501925e-05, + "loss": 0.6243, + "step": 4477 + }, + { + "epoch": 5.748395378690629, + "grad_norm": 2.731635570526123, + "learning_rate": 1.7233632862644416e-05, + "loss": 0.7122, + "step": 4478 + }, + { + "epoch": 5.7496790757381255, + "grad_norm": 1.9038891792297363, + "learning_rate": 1.7237483953786904e-05, + "loss": 0.6368, + "step": 4479 + }, + { + "epoch": 5.7509627727856225, + "grad_norm": 1.7401635646820068, + "learning_rate": 1.72413350449294e-05, + "loss": 0.5922, + "step": 4480 + }, + { + "epoch": 5.7522464698331195, + "grad_norm": 2.953303575515747, + "learning_rate": 1.724518613607189e-05, + "loss": 0.6476, + "step": 4481 + }, + { + "epoch": 5.7535301668806165, + "grad_norm": 2.7816691398620605, + "learning_rate": 1.7249037227214377e-05, + "loss": 0.6472, + "step": 4482 + }, + { + "epoch": 5.7548138639281134, + "grad_norm": 3.3974928855895996, + "learning_rate": 1.725288831835687e-05, + "loss": 0.6331, + "step": 4483 + }, + { + "epoch": 5.7560975609756095, + "grad_norm": 2.863156795501709, + "learning_rate": 
1.725673940949936e-05, + "loss": 0.6681, + "step": 4484 + }, + { + "epoch": 5.7573812580231065, + "grad_norm": 2.8710901737213135, + "learning_rate": 1.7260590500641847e-05, + "loss": 0.6674, + "step": 4485 + }, + { + "epoch": 5.7586649550706035, + "grad_norm": 2.8076629638671875, + "learning_rate": 1.726444159178434e-05, + "loss": 0.6836, + "step": 4486 + }, + { + "epoch": 5.7599486521181005, + "grad_norm": 7.42437744140625, + "learning_rate": 1.726829268292683e-05, + "loss": 0.7324, + "step": 4487 + }, + { + "epoch": 5.761232349165597, + "grad_norm": 2.7311789989471436, + "learning_rate": 1.727214377406932e-05, + "loss": 0.6371, + "step": 4488 + }, + { + "epoch": 5.7625160462130935, + "grad_norm": 3.1428184509277344, + "learning_rate": 1.727599486521181e-05, + "loss": 0.6679, + "step": 4489 + }, + { + "epoch": 5.7637997432605905, + "grad_norm": 3.3958559036254883, + "learning_rate": 1.72798459563543e-05, + "loss": 0.6567, + "step": 4490 + }, + { + "epoch": 5.7650834403080875, + "grad_norm": 3.0600337982177734, + "learning_rate": 1.728369704749679e-05, + "loss": 0.6979, + "step": 4491 + }, + { + "epoch": 5.766367137355584, + "grad_norm": 3.141420364379883, + "learning_rate": 1.7287548138639285e-05, + "loss": 0.7581, + "step": 4492 + }, + { + "epoch": 5.767650834403081, + "grad_norm": 2.704113483428955, + "learning_rate": 1.7291399229781772e-05, + "loss": 0.6483, + "step": 4493 + }, + { + "epoch": 5.7689345314505776, + "grad_norm": 7.634315490722656, + "learning_rate": 1.7295250320924263e-05, + "loss": 0.8258, + "step": 4494 + }, + { + "epoch": 5.7702182284980745, + "grad_norm": 4.205955505371094, + "learning_rate": 1.729910141206675e-05, + "loss": 0.8798, + "step": 4495 + }, + { + "epoch": 5.7715019255455715, + "grad_norm": 2.10744309425354, + "learning_rate": 1.7302952503209242e-05, + "loss": 0.6196, + "step": 4496 + }, + { + "epoch": 5.772785622593068, + "grad_norm": 10.574149131774902, + "learning_rate": 1.7306803594351733e-05, + "loss": 0.578, + "step": 4497 + 
}, + { + "epoch": 5.774069319640565, + "grad_norm": 3.0922861099243164, + "learning_rate": 1.7310654685494224e-05, + "loss": 0.595, + "step": 4498 + }, + { + "epoch": 5.775353016688062, + "grad_norm": 1.6011080741882324, + "learning_rate": 1.7314505776636715e-05, + "loss": 0.6146, + "step": 4499 + }, + { + "epoch": 5.7766367137355585, + "grad_norm": 2.562373161315918, + "learning_rate": 1.7318356867779206e-05, + "loss": 0.6405, + "step": 4500 + }, + { + "epoch": 5.7779204107830555, + "grad_norm": 3.8408679962158203, + "learning_rate": 1.7322207958921694e-05, + "loss": 0.5854, + "step": 4501 + }, + { + "epoch": 5.779204107830552, + "grad_norm": 9.999032974243164, + "learning_rate": 1.7326059050064185e-05, + "loss": 0.6076, + "step": 4502 + }, + { + "epoch": 5.780487804878049, + "grad_norm": 3.50333833694458, + "learning_rate": 1.7329910141206673e-05, + "loss": 0.6122, + "step": 4503 + }, + { + "epoch": 5.781771501925546, + "grad_norm": 2.5854010581970215, + "learning_rate": 1.7333761232349167e-05, + "loss": 0.6252, + "step": 4504 + }, + { + "epoch": 5.7830551989730425, + "grad_norm": 1.8389009237289429, + "learning_rate": 1.733761232349166e-05, + "loss": 0.6449, + "step": 4505 + }, + { + "epoch": 5.7843388960205395, + "grad_norm": 9.7117280960083, + "learning_rate": 1.7341463414634146e-05, + "loss": 0.6095, + "step": 4506 + }, + { + "epoch": 5.785622593068036, + "grad_norm": 2.2726259231567383, + "learning_rate": 1.7345314505776637e-05, + "loss": 0.6244, + "step": 4507 + }, + { + "epoch": 5.786906290115533, + "grad_norm": 2.116105556488037, + "learning_rate": 1.7349165596919128e-05, + "loss": 0.6386, + "step": 4508 + }, + { + "epoch": 5.78818998716303, + "grad_norm": 2.922210693359375, + "learning_rate": 1.7353016688061616e-05, + "loss": 0.626, + "step": 4509 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 1.8339923620224, + "learning_rate": 1.735686777920411e-05, + "loss": 0.6235, + "step": 4510 + }, + { + "epoch": 5.7907573812580235, + "grad_norm": 
1.440148949623108, + "learning_rate": 1.73607188703466e-05, + "loss": 0.636, + "step": 4511 + }, + { + "epoch": 5.79204107830552, + "grad_norm": 3.072077751159668, + "learning_rate": 1.736456996148909e-05, + "loss": 0.6143, + "step": 4512 + }, + { + "epoch": 5.793324775353017, + "grad_norm": 1.8953731060028076, + "learning_rate": 1.736842105263158e-05, + "loss": 0.6129, + "step": 4513 + }, + { + "epoch": 5.794608472400514, + "grad_norm": 7.498276710510254, + "learning_rate": 1.7372272143774068e-05, + "loss": 0.5757, + "step": 4514 + }, + { + "epoch": 5.7958921694480106, + "grad_norm": 3.267096996307373, + "learning_rate": 1.737612323491656e-05, + "loss": 0.6467, + "step": 4515 + }, + { + "epoch": 5.7971758664955075, + "grad_norm": 5.510442733764648, + "learning_rate": 1.737997432605905e-05, + "loss": 0.6008, + "step": 4516 + }, + { + "epoch": 5.798459563543004, + "grad_norm": 2.1455681324005127, + "learning_rate": 1.738382541720154e-05, + "loss": 0.6915, + "step": 4517 + }, + { + "epoch": 5.799743260590501, + "grad_norm": 3.6319916248321533, + "learning_rate": 1.7387676508344032e-05, + "loss": 0.6374, + "step": 4518 + }, + { + "epoch": 5.801026957637998, + "grad_norm": 3.0522780418395996, + "learning_rate": 1.7391527599486523e-05, + "loss": 0.6034, + "step": 4519 + }, + { + "epoch": 5.802310654685495, + "grad_norm": 2.978384017944336, + "learning_rate": 1.739537869062901e-05, + "loss": 0.6886, + "step": 4520 + }, + { + "epoch": 5.803594351732991, + "grad_norm": 3.149320363998413, + "learning_rate": 1.7399229781771502e-05, + "loss": 0.631, + "step": 4521 + }, + { + "epoch": 5.804878048780488, + "grad_norm": 4.661674976348877, + "learning_rate": 1.740308087291399e-05, + "loss": 0.6347, + "step": 4522 + }, + { + "epoch": 5.806161745827985, + "grad_norm": 2.9227044582366943, + "learning_rate": 1.7406931964056484e-05, + "loss": 0.5845, + "step": 4523 + }, + { + "epoch": 5.807445442875482, + "grad_norm": 3.397857427597046, + "learning_rate": 1.7410783055198975e-05, + 
"loss": 0.6429, + "step": 4524 + }, + { + "epoch": 5.808729139922978, + "grad_norm": 2.5639500617980957, + "learning_rate": 1.7414634146341463e-05, + "loss": 0.6378, + "step": 4525 + }, + { + "epoch": 5.810012836970475, + "grad_norm": 2.016998529434204, + "learning_rate": 1.7418485237483954e-05, + "loss": 0.6658, + "step": 4526 + }, + { + "epoch": 5.811296534017972, + "grad_norm": 2.3819520473480225, + "learning_rate": 1.7422336328626445e-05, + "loss": 0.6933, + "step": 4527 + }, + { + "epoch": 5.812580231065469, + "grad_norm": 4.191798210144043, + "learning_rate": 1.7426187419768933e-05, + "loss": 0.6403, + "step": 4528 + }, + { + "epoch": 5.813863928112966, + "grad_norm": 2.9059183597564697, + "learning_rate": 1.7430038510911427e-05, + "loss": 0.6902, + "step": 4529 + }, + { + "epoch": 5.815147625160462, + "grad_norm": 2.7533130645751953, + "learning_rate": 1.7433889602053918e-05, + "loss": 0.6348, + "step": 4530 + }, + { + "epoch": 5.816431322207959, + "grad_norm": 1.7973166704177856, + "learning_rate": 1.7437740693196406e-05, + "loss": 0.6508, + "step": 4531 + }, + { + "epoch": 5.817715019255456, + "grad_norm": 1.8578767776489258, + "learning_rate": 1.7441591784338897e-05, + "loss": 0.6925, + "step": 4532 + }, + { + "epoch": 5.818998716302953, + "grad_norm": 10.057832717895508, + "learning_rate": 1.7445442875481385e-05, + "loss": 0.6496, + "step": 4533 + }, + { + "epoch": 5.82028241335045, + "grad_norm": 1.8311811685562134, + "learning_rate": 1.7449293966623876e-05, + "loss": 0.668, + "step": 4534 + }, + { + "epoch": 5.821566110397946, + "grad_norm": 4.419598579406738, + "learning_rate": 1.745314505776637e-05, + "loss": 0.7533, + "step": 4535 + }, + { + "epoch": 5.822849807445443, + "grad_norm": 7.219152450561523, + "learning_rate": 1.7456996148908858e-05, + "loss": 0.6227, + "step": 4536 + }, + { + "epoch": 5.82413350449294, + "grad_norm": 3.8570265769958496, + "learning_rate": 1.746084724005135e-05, + "loss": 0.7058, + "step": 4537 + }, + { + "epoch": 
5.825417201540437, + "grad_norm": 6.391698360443115, + "learning_rate": 1.746469833119384e-05, + "loss": 0.6876, + "step": 4538 + }, + { + "epoch": 5.826700898587934, + "grad_norm": 6.173409938812256, + "learning_rate": 1.7468549422336328e-05, + "loss": 0.7232, + "step": 4539 + }, + { + "epoch": 5.82798459563543, + "grad_norm": 1.6179472208023071, + "learning_rate": 1.747240051347882e-05, + "loss": 0.7211, + "step": 4540 + }, + { + "epoch": 5.829268292682927, + "grad_norm": 15.770174980163574, + "learning_rate": 1.747625160462131e-05, + "loss": 0.723, + "step": 4541 + }, + { + "epoch": 5.830551989730424, + "grad_norm": 2.690871477127075, + "learning_rate": 1.74801026957638e-05, + "loss": 0.7164, + "step": 4542 + }, + { + "epoch": 5.831835686777921, + "grad_norm": 9.120123863220215, + "learning_rate": 1.7483953786906292e-05, + "loss": 0.87, + "step": 4543 + }, + { + "epoch": 5.833119383825418, + "grad_norm": 5.620954513549805, + "learning_rate": 1.748780487804878e-05, + "loss": 0.8663, + "step": 4544 + }, + { + "epoch": 5.834403080872914, + "grad_norm": 6.882968902587891, + "learning_rate": 1.749165596919127e-05, + "loss": 0.8354, + "step": 4545 + }, + { + "epoch": 5.835686777920411, + "grad_norm": 2.740356922149658, + "learning_rate": 1.7495507060333762e-05, + "loss": 0.6171, + "step": 4546 + }, + { + "epoch": 5.836970474967908, + "grad_norm": 1.714073657989502, + "learning_rate": 1.7499358151476253e-05, + "loss": 0.6149, + "step": 4547 + }, + { + "epoch": 5.838254172015405, + "grad_norm": 1.7132099866867065, + "learning_rate": 1.7503209242618744e-05, + "loss": 0.6148, + "step": 4548 + }, + { + "epoch": 5.839537869062902, + "grad_norm": 5.157397270202637, + "learning_rate": 1.750706033376123e-05, + "loss": 0.6073, + "step": 4549 + }, + { + "epoch": 5.840821566110398, + "grad_norm": 6.392593860626221, + "learning_rate": 1.7510911424903723e-05, + "loss": 0.5713, + "step": 4550 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 4.187321186065674, + "learning_rate": 
1.7514762516046214e-05, + "loss": 0.6238, + "step": 4551 + }, + { + "epoch": 5.843388960205392, + "grad_norm": 16.70899772644043, + "learning_rate": 1.75186136071887e-05, + "loss": 0.5893, + "step": 4552 + }, + { + "epoch": 5.844672657252889, + "grad_norm": 5.195616245269775, + "learning_rate": 1.7522464698331196e-05, + "loss": 0.6593, + "step": 4553 + }, + { + "epoch": 5.845956354300385, + "grad_norm": 1.9338083267211914, + "learning_rate": 1.7526315789473687e-05, + "loss": 0.5961, + "step": 4554 + }, + { + "epoch": 5.847240051347882, + "grad_norm": 2.618783712387085, + "learning_rate": 1.7530166880616175e-05, + "loss": 0.6414, + "step": 4555 + }, + { + "epoch": 5.848523748395379, + "grad_norm": 2.1100456714630127, + "learning_rate": 1.7534017971758666e-05, + "loss": 0.6226, + "step": 4556 + }, + { + "epoch": 5.849807445442876, + "grad_norm": 7.153615474700928, + "learning_rate": 1.7537869062901157e-05, + "loss": 0.6304, + "step": 4557 + }, + { + "epoch": 5.851091142490373, + "grad_norm": 2.9916775226593018, + "learning_rate": 1.7541720154043644e-05, + "loss": 0.6491, + "step": 4558 + }, + { + "epoch": 5.852374839537869, + "grad_norm": 2.9151549339294434, + "learning_rate": 1.754557124518614e-05, + "loss": 0.6674, + "step": 4559 + }, + { + "epoch": 5.853658536585366, + "grad_norm": 2.733275890350342, + "learning_rate": 1.7549422336328627e-05, + "loss": 0.655, + "step": 4560 + }, + { + "epoch": 5.854942233632863, + "grad_norm": 4.410878658294678, + "learning_rate": 1.7553273427471118e-05, + "loss": 0.6573, + "step": 4561 + }, + { + "epoch": 5.85622593068036, + "grad_norm": 1.8935075998306274, + "learning_rate": 1.755712451861361e-05, + "loss": 0.6582, + "step": 4562 + }, + { + "epoch": 5.857509627727856, + "grad_norm": 3.572547674179077, + "learning_rate": 1.7560975609756096e-05, + "loss": 0.6521, + "step": 4563 + }, + { + "epoch": 5.858793324775353, + "grad_norm": 1.6123640537261963, + "learning_rate": 1.7564826700898587e-05, + "loss": 0.6188, + "step": 4564 + }, 
+ { + "epoch": 5.86007702182285, + "grad_norm": 2.1853387355804443, + "learning_rate": 1.756867779204108e-05, + "loss": 0.6528, + "step": 4565 + }, + { + "epoch": 5.861360718870347, + "grad_norm": 3.3572559356689453, + "learning_rate": 1.757252888318357e-05, + "loss": 0.6458, + "step": 4566 + }, + { + "epoch": 5.862644415917844, + "grad_norm": 1.8785300254821777, + "learning_rate": 1.757637997432606e-05, + "loss": 0.6786, + "step": 4567 + }, + { + "epoch": 5.86392811296534, + "grad_norm": 2.330883264541626, + "learning_rate": 1.758023106546855e-05, + "loss": 0.6527, + "step": 4568 + }, + { + "epoch": 5.865211810012837, + "grad_norm": 1.8862265348434448, + "learning_rate": 1.758408215661104e-05, + "loss": 0.6505, + "step": 4569 + }, + { + "epoch": 5.866495507060334, + "grad_norm": 7.288956165313721, + "learning_rate": 1.758793324775353e-05, + "loss": 0.6081, + "step": 4570 + }, + { + "epoch": 5.867779204107831, + "grad_norm": 2.6151115894317627, + "learning_rate": 1.7591784338896018e-05, + "loss": 0.6774, + "step": 4571 + }, + { + "epoch": 5.869062901155328, + "grad_norm": 2.931999683380127, + "learning_rate": 1.7595635430038513e-05, + "loss": 0.7056, + "step": 4572 + }, + { + "epoch": 5.870346598202824, + "grad_norm": 3.8364341259002686, + "learning_rate": 1.7599486521181004e-05, + "loss": 0.7269, + "step": 4573 + }, + { + "epoch": 5.871630295250321, + "grad_norm": 2.934802532196045, + "learning_rate": 1.760333761232349e-05, + "loss": 0.6622, + "step": 4574 + }, + { + "epoch": 5.872913992297818, + "grad_norm": 2.7258617877960205, + "learning_rate": 1.7607188703465982e-05, + "loss": 0.6666, + "step": 4575 + }, + { + "epoch": 5.874197689345315, + "grad_norm": 2.857389211654663, + "learning_rate": 1.7611039794608473e-05, + "loss": 0.6733, + "step": 4576 + }, + { + "epoch": 5.875481386392812, + "grad_norm": 1.3953887224197388, + "learning_rate": 1.761489088575096e-05, + "loss": 0.6345, + "step": 4577 + }, + { + "epoch": 5.876765083440308, + "grad_norm": 
5.598424911499023, + "learning_rate": 1.7618741976893456e-05, + "loss": 0.64, + "step": 4578 + }, + { + "epoch": 5.878048780487805, + "grad_norm": 2.9792654514312744, + "learning_rate": 1.7622593068035943e-05, + "loss": 0.6422, + "step": 4579 + }, + { + "epoch": 5.879332477535302, + "grad_norm": 3.4948747158050537, + "learning_rate": 1.7626444159178434e-05, + "loss": 0.6673, + "step": 4580 + }, + { + "epoch": 5.880616174582799, + "grad_norm": 2.471785068511963, + "learning_rate": 1.7630295250320925e-05, + "loss": 0.6802, + "step": 4581 + }, + { + "epoch": 5.881899871630296, + "grad_norm": 2.447373151779175, + "learning_rate": 1.7634146341463413e-05, + "loss": 0.6707, + "step": 4582 + }, + { + "epoch": 5.883183568677792, + "grad_norm": 3.6867566108703613, + "learning_rate": 1.7637997432605904e-05, + "loss": 0.649, + "step": 4583 + }, + { + "epoch": 5.884467265725289, + "grad_norm": 2.428753137588501, + "learning_rate": 1.76418485237484e-05, + "loss": 0.6619, + "step": 4584 + }, + { + "epoch": 5.885750962772786, + "grad_norm": 3.56917667388916, + "learning_rate": 1.7645699614890886e-05, + "loss": 0.6694, + "step": 4585 + }, + { + "epoch": 5.887034659820283, + "grad_norm": 2.4784533977508545, + "learning_rate": 1.7649550706033377e-05, + "loss": 0.7501, + "step": 4586 + }, + { + "epoch": 5.888318356867779, + "grad_norm": 5.969811916351318, + "learning_rate": 1.7653401797175865e-05, + "loss": 0.6624, + "step": 4587 + }, + { + "epoch": 5.889602053915276, + "grad_norm": 3.6161842346191406, + "learning_rate": 1.7657252888318356e-05, + "loss": 0.6989, + "step": 4588 + }, + { + "epoch": 5.890885750962773, + "grad_norm": 3.899855375289917, + "learning_rate": 1.7661103979460847e-05, + "loss": 0.6105, + "step": 4589 + }, + { + "epoch": 5.89216944801027, + "grad_norm": 2.309581756591797, + "learning_rate": 1.766495507060334e-05, + "loss": 0.7291, + "step": 4590 + }, + { + "epoch": 5.893453145057767, + "grad_norm": 5.6342973709106445, + "learning_rate": 1.766880616174583e-05, + 
"loss": 0.7442, + "step": 4591 + }, + { + "epoch": 5.894736842105263, + "grad_norm": NaN, + "learning_rate": 1.766880616174583e-05, + "loss": 0.7708, + "step": 4592 + }, + { + "epoch": 5.89602053915276, + "grad_norm": 5.313625812530518, + "learning_rate": 1.767265725288832e-05, + "loss": 0.745, + "step": 4593 + }, + { + "epoch": 5.897304236200257, + "grad_norm": 2.698484420776367, + "learning_rate": 1.7676508344030808e-05, + "loss": 0.7831, + "step": 4594 + }, + { + "epoch": 5.898587933247754, + "grad_norm": 2.4332175254821777, + "learning_rate": 1.76803594351733e-05, + "loss": 0.9296, + "step": 4595 + }, + { + "epoch": 5.89987163029525, + "grad_norm": 2.060483932495117, + "learning_rate": 1.7684210526315787e-05, + "loss": 0.6181, + "step": 4596 + }, + { + "epoch": 5.901155327342747, + "grad_norm": 1.731611967086792, + "learning_rate": 1.768806161745828e-05, + "loss": 0.6162, + "step": 4597 + }, + { + "epoch": 5.902439024390244, + "grad_norm": 1.8080580234527588, + "learning_rate": 1.7691912708600772e-05, + "loss": 0.6241, + "step": 4598 + }, + { + "epoch": 5.903722721437741, + "grad_norm": 1.5914385318756104, + "learning_rate": 1.769576379974326e-05, + "loss": 0.6533, + "step": 4599 + }, + { + "epoch": 5.905006418485238, + "grad_norm": 1.4388878345489502, + "learning_rate": 1.769961489088575e-05, + "loss": 0.6108, + "step": 4600 + }, + { + "epoch": 5.906290115532734, + "grad_norm": 1.7072020769119263, + "learning_rate": 1.7703465982028242e-05, + "loss": 0.6322, + "step": 4601 + }, + { + "epoch": 5.907573812580231, + "grad_norm": 2.9093377590179443, + "learning_rate": 1.770731707317073e-05, + "loss": 0.6242, + "step": 4602 + }, + { + "epoch": 5.908857509627728, + "grad_norm": 2.402216911315918, + "learning_rate": 1.7711168164313224e-05, + "loss": 0.6217, + "step": 4603 + }, + { + "epoch": 5.910141206675225, + "grad_norm": 5.415169715881348, + "learning_rate": 1.7715019255455715e-05, + "loss": 0.6173, + "step": 4604 + }, + { + "epoch": 5.911424903722722, + 
"grad_norm": 1.7306002378463745, + "learning_rate": 1.7718870346598203e-05, + "loss": 0.6179, + "step": 4605 + }, + { + "epoch": 5.912708600770218, + "grad_norm": 3.7607858180999756, + "learning_rate": 1.7722721437740694e-05, + "loss": 0.6106, + "step": 4606 + }, + { + "epoch": 5.913992297817715, + "grad_norm": 2.9524312019348145, + "learning_rate": 1.7726572528883182e-05, + "loss": 0.6267, + "step": 4607 + }, + { + "epoch": 5.915275994865212, + "grad_norm": 2.078030824661255, + "learning_rate": 1.7730423620025673e-05, + "loss": 0.617, + "step": 4608 + }, + { + "epoch": 5.916559691912709, + "grad_norm": 2.983609914779663, + "learning_rate": 1.7734274711168167e-05, + "loss": 0.6584, + "step": 4609 + }, + { + "epoch": 5.917843388960206, + "grad_norm": 2.596736192703247, + "learning_rate": 1.7738125802310655e-05, + "loss": 0.5821, + "step": 4610 + }, + { + "epoch": 5.919127086007702, + "grad_norm": 6.0481696128845215, + "learning_rate": 1.7741976893453146e-05, + "loss": 0.6255, + "step": 4611 + }, + { + "epoch": 5.920410783055199, + "grad_norm": 2.396156072616577, + "learning_rate": 1.7745827984595637e-05, + "loss": 0.5746, + "step": 4612 + }, + { + "epoch": 5.921694480102696, + "grad_norm": 5.278446197509766, + "learning_rate": 1.7749679075738125e-05, + "loss": 0.6707, + "step": 4613 + }, + { + "epoch": 5.922978177150193, + "grad_norm": 5.221423149108887, + "learning_rate": 1.7753530166880616e-05, + "loss": 0.6779, + "step": 4614 + }, + { + "epoch": 5.92426187419769, + "grad_norm": 2.8212034702301025, + "learning_rate": 1.7757381258023107e-05, + "loss": 0.6244, + "step": 4615 + }, + { + "epoch": 5.925545571245186, + "grad_norm": 3.3495571613311768, + "learning_rate": 1.7761232349165598e-05, + "loss": 0.6157, + "step": 4616 + }, + { + "epoch": 5.926829268292683, + "grad_norm": 2.5836358070373535, + "learning_rate": 1.776508344030809e-05, + "loss": 0.6111, + "step": 4617 + }, + { + "epoch": 5.92811296534018, + "grad_norm": 1.4045295715332031, + "learning_rate": 
1.7768934531450577e-05, + "loss": 0.6512, + "step": 4618 + }, + { + "epoch": 5.929396662387677, + "grad_norm": 1.3976218700408936, + "learning_rate": 1.7772785622593068e-05, + "loss": 0.625, + "step": 4619 + }, + { + "epoch": 5.930680359435174, + "grad_norm": 1.4909189939498901, + "learning_rate": 1.777663671373556e-05, + "loss": 0.6152, + "step": 4620 + }, + { + "epoch": 5.93196405648267, + "grad_norm": 5.2021098136901855, + "learning_rate": 1.7780487804878047e-05, + "loss": 0.6129, + "step": 4621 + }, + { + "epoch": 5.933247753530167, + "grad_norm": 2.936063051223755, + "learning_rate": 1.778433889602054e-05, + "loss": 0.6928, + "step": 4622 + }, + { + "epoch": 5.934531450577664, + "grad_norm": 3.288605213165283, + "learning_rate": 1.7788189987163032e-05, + "loss": 0.5984, + "step": 4623 + }, + { + "epoch": 5.935815147625161, + "grad_norm": 2.751842498779297, + "learning_rate": 1.779204107830552e-05, + "loss": 0.5993, + "step": 4624 + }, + { + "epoch": 5.937098844672657, + "grad_norm": 1.6027302742004395, + "learning_rate": 1.779589216944801e-05, + "loss": 0.6432, + "step": 4625 + }, + { + "epoch": 5.938382541720154, + "grad_norm": 1.7446179389953613, + "learning_rate": 1.77997432605905e-05, + "loss": 0.651, + "step": 4626 + }, + { + "epoch": 5.939666238767651, + "grad_norm": 3.0849075317382812, + "learning_rate": 1.780359435173299e-05, + "loss": 0.6235, + "step": 4627 + }, + { + "epoch": 5.940949935815148, + "grad_norm": 1.8960349559783936, + "learning_rate": 1.7807445442875484e-05, + "loss": 0.6808, + "step": 4628 + }, + { + "epoch": 5.942233632862644, + "grad_norm": 4.444762229919434, + "learning_rate": 1.7811296534017972e-05, + "loss": 0.6675, + "step": 4629 + }, + { + "epoch": 5.943517329910141, + "grad_norm": 1.5570729970932007, + "learning_rate": 1.7815147625160463e-05, + "loss": 0.6684, + "step": 4630 + }, + { + "epoch": 5.944801026957638, + "grad_norm": 2.927471160888672, + "learning_rate": 1.7818998716302954e-05, + "loss": 0.6957, + "step": 4631 + }, + 
{ + "epoch": 5.946084724005135, + "grad_norm": 5.709187030792236, + "learning_rate": 1.7822849807445442e-05, + "loss": 0.6683, + "step": 4632 + }, + { + "epoch": 5.947368421052632, + "grad_norm": 2.8162529468536377, + "learning_rate": 1.7826700898587933e-05, + "loss": 0.6367, + "step": 4633 + }, + { + "epoch": 5.948652118100128, + "grad_norm": 4.859838962554932, + "learning_rate": 1.7830551989730424e-05, + "loss": 0.6295, + "step": 4634 + }, + { + "epoch": 5.949935815147625, + "grad_norm": 5.201690196990967, + "learning_rate": 1.7834403080872915e-05, + "loss": 0.7101, + "step": 4635 + }, + { + "epoch": 5.951219512195122, + "grad_norm": 3.8242056369781494, + "learning_rate": 1.7838254172015406e-05, + "loss": 0.6469, + "step": 4636 + }, + { + "epoch": 5.952503209242619, + "grad_norm": 2.7861480712890625, + "learning_rate": 1.7842105263157894e-05, + "loss": 0.6471, + "step": 4637 + }, + { + "epoch": 5.953786906290116, + "grad_norm": 2.236196994781494, + "learning_rate": 1.7845956354300385e-05, + "loss": 0.736, + "step": 4638 + }, + { + "epoch": 5.955070603337612, + "grad_norm": 5.582145690917969, + "learning_rate": 1.7849807445442876e-05, + "loss": 0.6851, + "step": 4639 + }, + { + "epoch": 5.956354300385109, + "grad_norm": 4.344070911407471, + "learning_rate": 1.7853658536585367e-05, + "loss": 0.683, + "step": 4640 + }, + { + "epoch": 5.957637997432606, + "grad_norm": 2.853447198867798, + "learning_rate": 1.7857509627727858e-05, + "loss": 0.7359, + "step": 4641 + }, + { + "epoch": 5.958921694480103, + "grad_norm": 3.5913796424865723, + "learning_rate": 1.786136071887035e-05, + "loss": 0.7504, + "step": 4642 + }, + { + "epoch": 5.9602053915276, + "grad_norm": 2.118105888366699, + "learning_rate": 1.7865211810012837e-05, + "loss": 0.7508, + "step": 4643 + }, + { + "epoch": 5.961489088575096, + "grad_norm": 3.521005868911743, + "learning_rate": 1.7869062901155328e-05, + "loss": 0.8338, + "step": 4644 + }, + { + "epoch": 5.962772785622593, + "grad_norm": 
3.815791130065918, + "learning_rate": 1.7872913992297815e-05, + "loss": 0.9315, + "step": 4645 + }, + { + "epoch": 5.96405648267009, + "grad_norm": 2.0616118907928467, + "learning_rate": 1.787676508344031e-05, + "loss": 0.6147, + "step": 4646 + }, + { + "epoch": 5.965340179717587, + "grad_norm": 3.8521080017089844, + "learning_rate": 1.78806161745828e-05, + "loss": 0.59, + "step": 4647 + }, + { + "epoch": 5.966623876765084, + "grad_norm": 3.3083479404449463, + "learning_rate": 1.788446726572529e-05, + "loss": 0.5848, + "step": 4648 + }, + { + "epoch": 5.96790757381258, + "grad_norm": 3.7859838008880615, + "learning_rate": 1.788831835686778e-05, + "loss": 0.6225, + "step": 4649 + }, + { + "epoch": 5.969191270860077, + "grad_norm": 3.8039002418518066, + "learning_rate": 1.789216944801027e-05, + "loss": 0.5904, + "step": 4650 + }, + { + "epoch": 5.970474967907574, + "grad_norm": 15.597793579101562, + "learning_rate": 1.789602053915276e-05, + "loss": 0.614, + "step": 4651 + }, + { + "epoch": 5.971758664955071, + "grad_norm": 2.075531482696533, + "learning_rate": 1.7899871630295253e-05, + "loss": 0.6437, + "step": 4652 + }, + { + "epoch": 5.973042362002568, + "grad_norm": 0.9460281729698181, + "learning_rate": 1.790372272143774e-05, + "loss": 0.5774, + "step": 4653 + }, + { + "epoch": 5.974326059050064, + "grad_norm": 5.627438068389893, + "learning_rate": 1.7907573812580232e-05, + "loss": 0.6559, + "step": 4654 + }, + { + "epoch": 5.975609756097561, + "grad_norm": 1.8562909364700317, + "learning_rate": 1.7911424903722723e-05, + "loss": 0.6327, + "step": 4655 + }, + { + "epoch": 5.976893453145058, + "grad_norm": 1.7667286396026611, + "learning_rate": 1.791527599486521e-05, + "loss": 0.6113, + "step": 4656 + }, + { + "epoch": 5.978177150192555, + "grad_norm": 3.02710223197937, + "learning_rate": 1.79191270860077e-05, + "loss": 0.6595, + "step": 4657 + }, + { + "epoch": 5.979460847240051, + "grad_norm": 3.6229588985443115, + "learning_rate": 1.7922978177150196e-05, + 
"loss": 0.5927, + "step": 4658 + }, + { + "epoch": 5.980744544287548, + "grad_norm": 7.67933464050293, + "learning_rate": 1.7926829268292684e-05, + "loss": 0.6699, + "step": 4659 + }, + { + "epoch": 5.982028241335045, + "grad_norm": 9.838215827941895, + "learning_rate": 1.7930680359435175e-05, + "loss": 0.5896, + "step": 4660 + }, + { + "epoch": 5.983311938382542, + "grad_norm": 2.1962552070617676, + "learning_rate": 1.7934531450577662e-05, + "loss": 0.6144, + "step": 4661 + }, + { + "epoch": 5.984595635430038, + "grad_norm": 2.568796396255493, + "learning_rate": 1.7938382541720153e-05, + "loss": 0.6796, + "step": 4662 + }, + { + "epoch": 5.985879332477535, + "grad_norm": 2.4772326946258545, + "learning_rate": 1.7942233632862645e-05, + "loss": 0.6325, + "step": 4663 + }, + { + "epoch": 5.987163029525032, + "grad_norm": 3.550342321395874, + "learning_rate": 1.7946084724005136e-05, + "loss": 0.695, + "step": 4664 + }, + { + "epoch": 5.988446726572529, + "grad_norm": 4.690410614013672, + "learning_rate": 1.7949935815147627e-05, + "loss": 0.6744, + "step": 4665 + }, + { + "epoch": 5.989730423620026, + "grad_norm": 2.4556326866149902, + "learning_rate": 1.7953786906290118e-05, + "loss": 0.6514, + "step": 4666 + }, + { + "epoch": 5.991014120667522, + "grad_norm": 2.992429733276367, + "learning_rate": 1.7957637997432605e-05, + "loss": 0.6688, + "step": 4667 + }, + { + "epoch": 5.992297817715019, + "grad_norm": 4.492903709411621, + "learning_rate": 1.7961489088575097e-05, + "loss": 0.6716, + "step": 4668 + }, + { + "epoch": 5.993581514762516, + "grad_norm": 2.409193992614746, + "learning_rate": 1.7965340179717588e-05, + "loss": 0.6756, + "step": 4669 + }, + { + "epoch": 5.994865211810013, + "grad_norm": 1.9924466609954834, + "learning_rate": 1.7969191270860075e-05, + "loss": 0.6958, + "step": 4670 + }, + { + "epoch": 5.99614890885751, + "grad_norm": 8.760640144348145, + "learning_rate": 1.797304236200257e-05, + "loss": 0.7378, + "step": 4671 + }, + { + "epoch": 
5.997432605905006, + "grad_norm": 2.8134403228759766, + "learning_rate": 1.7976893453145057e-05, + "loss": 0.6644, + "step": 4672 + }, + { + "epoch": 5.998716302952503, + "grad_norm": 3.834331750869751, + "learning_rate": 1.798074454428755e-05, + "loss": 0.7961, + "step": 4673 + }, + { + "epoch": 6.0, + "grad_norm": 6.247755527496338, + "learning_rate": 1.798459563543004e-05, + "loss": 1.0221, + "step": 4674 + }, + { + "epoch": 6.001283697047497, + "grad_norm": 1.7479954957962036, + "learning_rate": 1.7988446726572527e-05, + "loss": 0.5917, + "step": 4675 + }, + { + "epoch": 6.002567394094994, + "grad_norm": 2.011643648147583, + "learning_rate": 1.799229781771502e-05, + "loss": 0.6089, + "step": 4676 + }, + { + "epoch": 6.00385109114249, + "grad_norm": 2.048703670501709, + "learning_rate": 1.7996148908857513e-05, + "loss": 0.5946, + "step": 4677 + }, + { + "epoch": 6.005134788189987, + "grad_norm": 3.801301956176758, + "learning_rate": 1.8e-05, + "loss": 0.6022, + "step": 4678 + }, + { + "epoch": 6.006418485237484, + "grad_norm": 1.2403218746185303, + "learning_rate": 1.800385109114249e-05, + "loss": 0.6256, + "step": 4679 + }, + { + "epoch": 6.007702182284981, + "grad_norm": 1.2759920358657837, + "learning_rate": 1.800770218228498e-05, + "loss": 0.5883, + "step": 4680 + }, + { + "epoch": 6.008985879332478, + "grad_norm": 1.7840625047683716, + "learning_rate": 1.801155327342747e-05, + "loss": 0.6252, + "step": 4681 + }, + { + "epoch": 6.010269576379974, + "grad_norm": 1.6369751691818237, + "learning_rate": 1.801540436456996e-05, + "loss": 0.5643, + "step": 4682 + }, + { + "epoch": 6.011553273427471, + "grad_norm": 1.676872968673706, + "learning_rate": 1.8019255455712452e-05, + "loss": 0.5651, + "step": 4683 + }, + { + "epoch": 6.012836970474968, + "grad_norm": 9.841994285583496, + "learning_rate": 1.8023106546854943e-05, + "loss": 0.6072, + "step": 4684 + }, + { + "epoch": 6.014120667522465, + "grad_norm": 1.7028534412384033, + "learning_rate": 
1.8026957637997435e-05, + "loss": 0.5951, + "step": 4685 + }, + { + "epoch": 6.015404364569961, + "grad_norm": 1.4485937356948853, + "learning_rate": 1.8030808729139922e-05, + "loss": 0.6334, + "step": 4686 + }, + { + "epoch": 6.016688061617458, + "grad_norm": 1.9398579597473145, + "learning_rate": 1.8034659820282413e-05, + "loss": 0.6029, + "step": 4687 + }, + { + "epoch": 6.017971758664955, + "grad_norm": 1.789252758026123, + "learning_rate": 1.8038510911424904e-05, + "loss": 0.6401, + "step": 4688 + }, + { + "epoch": 6.019255455712452, + "grad_norm": 1.5968458652496338, + "learning_rate": 1.8042362002567395e-05, + "loss": 0.5847, + "step": 4689 + }, + { + "epoch": 6.020539152759949, + "grad_norm": 2.41624116897583, + "learning_rate": 1.8046213093709887e-05, + "loss": 0.5994, + "step": 4690 + }, + { + "epoch": 6.021822849807445, + "grad_norm": 2.0365211963653564, + "learning_rate": 1.8050064184852374e-05, + "loss": 0.6137, + "step": 4691 + }, + { + "epoch": 6.023106546854942, + "grad_norm": 1.5019164085388184, + "learning_rate": 1.8053915275994865e-05, + "loss": 0.6178, + "step": 4692 + }, + { + "epoch": 6.024390243902439, + "grad_norm": 2.3145153522491455, + "learning_rate": 1.8057766367137356e-05, + "loss": 0.6155, + "step": 4693 + }, + { + "epoch": 6.025673940949936, + "grad_norm": 6.448663234710693, + "learning_rate": 1.8061617458279844e-05, + "loss": 0.5826, + "step": 4694 + }, + { + "epoch": 6.026957637997433, + "grad_norm": 2.2047033309936523, + "learning_rate": 1.806546854942234e-05, + "loss": 0.5915, + "step": 4695 + }, + { + "epoch": 6.028241335044929, + "grad_norm": 3.8488216400146484, + "learning_rate": 1.806931964056483e-05, + "loss": 0.5779, + "step": 4696 + }, + { + "epoch": 6.029525032092426, + "grad_norm": 3.2727859020233154, + "learning_rate": 1.8073170731707317e-05, + "loss": 0.6334, + "step": 4697 + }, + { + "epoch": 6.030808729139923, + "grad_norm": 3.3458988666534424, + "learning_rate": 1.807702182284981e-05, + "loss": 0.6459, + "step": 4698 
+ }, + { + "epoch": 6.03209242618742, + "grad_norm": 1.7338917255401611, + "learning_rate": 1.8080872913992296e-05, + "loss": 0.6299, + "step": 4699 + }, + { + "epoch": 6.033376123234916, + "grad_norm": 1.5838537216186523, + "learning_rate": 1.8084724005134787e-05, + "loss": 0.6056, + "step": 4700 + }, + { + "epoch": 6.034659820282413, + "grad_norm": 3.8158633708953857, + "learning_rate": 1.808857509627728e-05, + "loss": 0.6964, + "step": 4701 + }, + { + "epoch": 6.03594351732991, + "grad_norm": 4.208054065704346, + "learning_rate": 1.809242618741977e-05, + "loss": 0.6341, + "step": 4702 + }, + { + "epoch": 6.037227214377407, + "grad_norm": 2.913938522338867, + "learning_rate": 1.809627727856226e-05, + "loss": 0.6255, + "step": 4703 + }, + { + "epoch": 6.038510911424904, + "grad_norm": 2.2427785396575928, + "learning_rate": 1.810012836970475e-05, + "loss": 0.6352, + "step": 4704 + }, + { + "epoch": 6.0397946084724, + "grad_norm": 1.880159616470337, + "learning_rate": 1.810397946084724e-05, + "loss": 0.6093, + "step": 4705 + }, + { + "epoch": 6.041078305519897, + "grad_norm": 1.768330454826355, + "learning_rate": 1.810783055198973e-05, + "loss": 0.653, + "step": 4706 + }, + { + "epoch": 6.042362002567394, + "grad_norm": 3.2513785362243652, + "learning_rate": 1.811168164313222e-05, + "loss": 0.6244, + "step": 4707 + }, + { + "epoch": 6.043645699614891, + "grad_norm": 1.7937164306640625, + "learning_rate": 1.8115532734274712e-05, + "loss": 0.6402, + "step": 4708 + }, + { + "epoch": 6.044929396662388, + "grad_norm": 2.959239959716797, + "learning_rate": 1.8119383825417203e-05, + "loss": 0.7291, + "step": 4709 + }, + { + "epoch": 6.046213093709884, + "grad_norm": 17.991683959960938, + "learning_rate": 1.812323491655969e-05, + "loss": 0.7201, + "step": 4710 + }, + { + "epoch": 6.047496790757381, + "grad_norm": 3.9552838802337646, + "learning_rate": 1.8127086007702182e-05, + "loss": 0.6532, + "step": 4711 + }, + { + "epoch": 6.048780487804878, + "grad_norm": 
2.2317798137664795, + "learning_rate": 1.8130937098844673e-05, + "loss": 0.6723, + "step": 4712 + }, + { + "epoch": 6.050064184852375, + "grad_norm": 3.212873697280884, + "learning_rate": 1.8134788189987164e-05, + "loss": 0.6922, + "step": 4713 + }, + { + "epoch": 6.051347881899872, + "grad_norm": 2.314055919647217, + "learning_rate": 1.8138639281129655e-05, + "loss": 0.7175, + "step": 4714 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 6.0000996589660645, + "learning_rate": 1.8142490372272146e-05, + "loss": 0.7098, + "step": 4715 + }, + { + "epoch": 6.053915275994865, + "grad_norm": 34.4080810546875, + "learning_rate": 1.8146341463414634e-05, + "loss": 0.6567, + "step": 4716 + }, + { + "epoch": 6.055198973042362, + "grad_norm": 2.287336826324463, + "learning_rate": 1.8150192554557125e-05, + "loss": 0.6686, + "step": 4717 + }, + { + "epoch": 6.056482670089859, + "grad_norm": 7.152041912078857, + "learning_rate": 1.8154043645699613e-05, + "loss": 0.6448, + "step": 4718 + }, + { + "epoch": 6.057766367137355, + "grad_norm": 3.917750120162964, + "learning_rate": 1.8157894736842107e-05, + "loss": 0.6813, + "step": 4719 + }, + { + "epoch": 6.059050064184852, + "grad_norm": 2.0006563663482666, + "learning_rate": 1.81617458279846e-05, + "loss": 0.7245, + "step": 4720 + }, + { + "epoch": 6.060333761232349, + "grad_norm": 8.06617546081543, + "learning_rate": 1.8165596919127086e-05, + "loss": 0.7279, + "step": 4721 + }, + { + "epoch": 6.061617458279846, + "grad_norm": 5.57946252822876, + "learning_rate": 1.8169448010269577e-05, + "loss": 0.7201, + "step": 4722 + }, + { + "epoch": 6.062901155327343, + "grad_norm": 4.280914306640625, + "learning_rate": 1.8173299101412068e-05, + "loss": 0.7851, + "step": 4723 + }, + { + "epoch": 6.064184852374839, + "grad_norm": 5.788909435272217, + "learning_rate": 1.8177150192554556e-05, + "loss": 0.9079, + "step": 4724 + }, + { + "epoch": 6.065468549422336, + "grad_norm": 2.083768367767334, + "learning_rate": 1.8181001283697047e-05, + 
"loss": 0.6017, + "step": 4725 + }, + { + "epoch": 6.066752246469833, + "grad_norm": 2.376084327697754, + "learning_rate": 1.8184852374839538e-05, + "loss": 0.665, + "step": 4726 + }, + { + "epoch": 6.06803594351733, + "grad_norm": 1.8653205633163452, + "learning_rate": 1.818870346598203e-05, + "loss": 0.6149, + "step": 4727 + }, + { + "epoch": 6.069319640564827, + "grad_norm": 3.863668441772461, + "learning_rate": 1.819255455712452e-05, + "loss": 0.6029, + "step": 4728 + }, + { + "epoch": 6.070603337612323, + "grad_norm": 5.010998725891113, + "learning_rate": 1.8196405648267008e-05, + "loss": 0.5985, + "step": 4729 + }, + { + "epoch": 6.07188703465982, + "grad_norm": 3.7758100032806396, + "learning_rate": 1.82002567394095e-05, + "loss": 0.6273, + "step": 4730 + }, + { + "epoch": 6.073170731707317, + "grad_norm": 3.507781982421875, + "learning_rate": 1.820410783055199e-05, + "loss": 0.6091, + "step": 4731 + }, + { + "epoch": 6.074454428754814, + "grad_norm": 1.9885247945785522, + "learning_rate": 1.820795892169448e-05, + "loss": 0.5849, + "step": 4732 + }, + { + "epoch": 6.07573812580231, + "grad_norm": 1.2455687522888184, + "learning_rate": 1.8211810012836972e-05, + "loss": 0.5781, + "step": 4733 + }, + { + "epoch": 6.077021822849807, + "grad_norm": 2.8441615104675293, + "learning_rate": 1.8215661103979463e-05, + "loss": 0.6423, + "step": 4734 + }, + { + "epoch": 6.078305519897304, + "grad_norm": 2.0404467582702637, + "learning_rate": 1.821951219512195e-05, + "loss": 0.6128, + "step": 4735 + }, + { + "epoch": 6.079589216944801, + "grad_norm": 2.2113778591156006, + "learning_rate": 1.8223363286264442e-05, + "loss": 0.6031, + "step": 4736 + }, + { + "epoch": 6.080872913992298, + "grad_norm": 1.7952781915664673, + "learning_rate": 1.822721437740693e-05, + "loss": 0.6729, + "step": 4737 + }, + { + "epoch": 6.082156611039794, + "grad_norm": 3.6958656311035156, + "learning_rate": 1.8231065468549424e-05, + "loss": 0.608, + "step": 4738 + }, + { + "epoch": 
6.083440308087291, + "grad_norm": 2.1955113410949707, + "learning_rate": 1.8234916559691915e-05, + "loss": 0.5876, + "step": 4739 + }, + { + "epoch": 6.084724005134788, + "grad_norm": 1.449797511100769, + "learning_rate": 1.8238767650834403e-05, + "loss": 0.6164, + "step": 4740 + }, + { + "epoch": 6.086007702182285, + "grad_norm": 2.8089346885681152, + "learning_rate": 1.8242618741976894e-05, + "loss": 0.574, + "step": 4741 + }, + { + "epoch": 6.087291399229782, + "grad_norm": 3.118582010269165, + "learning_rate": 1.8246469833119385e-05, + "loss": 0.5933, + "step": 4742 + }, + { + "epoch": 6.088575096277278, + "grad_norm": 2.9262564182281494, + "learning_rate": 1.8250320924261873e-05, + "loss": 0.6119, + "step": 4743 + }, + { + "epoch": 6.089858793324775, + "grad_norm": 2.7786097526550293, + "learning_rate": 1.8254172015404367e-05, + "loss": 0.6384, + "step": 4744 + }, + { + "epoch": 6.091142490372272, + "grad_norm": 1.5401548147201538, + "learning_rate": 1.8258023106546855e-05, + "loss": 0.6183, + "step": 4745 + }, + { + "epoch": 6.092426187419769, + "grad_norm": 3.3360517024993896, + "learning_rate": 1.8261874197689346e-05, + "loss": 0.6139, + "step": 4746 + }, + { + "epoch": 6.093709884467266, + "grad_norm": 1.7209011316299438, + "learning_rate": 1.8265725288831837e-05, + "loss": 0.6345, + "step": 4747 + }, + { + "epoch": 6.094993581514762, + "grad_norm": 7.096157550811768, + "learning_rate": 1.8269576379974325e-05, + "loss": 0.565, + "step": 4748 + }, + { + "epoch": 6.096277278562259, + "grad_norm": 3.025418758392334, + "learning_rate": 1.8273427471116816e-05, + "loss": 0.5838, + "step": 4749 + }, + { + "epoch": 6.097560975609756, + "grad_norm": 3.1456069946289062, + "learning_rate": 1.827727856225931e-05, + "loss": 0.6929, + "step": 4750 + }, + { + "epoch": 6.098844672657253, + "grad_norm": 3.5662178993225098, + "learning_rate": 1.8281129653401798e-05, + "loss": 0.6644, + "step": 4751 + }, + { + "epoch": 6.100128369704749, + "grad_norm": 2.09277081489563, + 
"learning_rate": 1.828498074454429e-05, + "loss": 0.6593, + "step": 4752 + }, + { + "epoch": 6.101412066752246, + "grad_norm": 2.070751667022705, + "learning_rate": 1.8288831835686777e-05, + "loss": 0.6252, + "step": 4753 + }, + { + "epoch": 6.102695763799743, + "grad_norm": 3.4292256832122803, + "learning_rate": 1.8292682926829268e-05, + "loss": 0.6602, + "step": 4754 + }, + { + "epoch": 6.10397946084724, + "grad_norm": 3.267699956893921, + "learning_rate": 1.829653401797176e-05, + "loss": 0.6512, + "step": 4755 + }, + { + "epoch": 6.105263157894737, + "grad_norm": 3.1265556812286377, + "learning_rate": 1.830038510911425e-05, + "loss": 0.6719, + "step": 4756 + }, + { + "epoch": 6.106546854942233, + "grad_norm": 3.5977137088775635, + "learning_rate": 1.830423620025674e-05, + "loss": 0.6188, + "step": 4757 + }, + { + "epoch": 6.10783055198973, + "grad_norm": 4.0741143226623535, + "learning_rate": 1.8308087291399232e-05, + "loss": 0.697, + "step": 4758 + }, + { + "epoch": 6.109114249037227, + "grad_norm": 2.508234739303589, + "learning_rate": 1.831193838254172e-05, + "loss": 0.6453, + "step": 4759 + }, + { + "epoch": 6.110397946084724, + "grad_norm": 1.9835100173950195, + "learning_rate": 1.831578947368421e-05, + "loss": 0.63, + "step": 4760 + }, + { + "epoch": 6.111681643132221, + "grad_norm": 2.239386796951294, + "learning_rate": 1.8319640564826702e-05, + "loss": 0.6637, + "step": 4761 + }, + { + "epoch": 6.112965340179717, + "grad_norm": 2.9566104412078857, + "learning_rate": 1.8323491655969193e-05, + "loss": 0.6579, + "step": 4762 + }, + { + "epoch": 6.114249037227214, + "grad_norm": 2.880815267562866, + "learning_rate": 1.8327342747111684e-05, + "loss": 0.6349, + "step": 4763 + }, + { + "epoch": 6.115532734274711, + "grad_norm": 1.8686546087265015, + "learning_rate": 1.833119383825417e-05, + "loss": 0.6639, + "step": 4764 + }, + { + "epoch": 6.116816431322208, + "grad_norm": 5.02849006652832, + "learning_rate": 1.8335044929396663e-05, + "loss": 0.7055, + "step": 
4765 + }, + { + "epoch": 6.118100128369705, + "grad_norm": 5.042064666748047, + "learning_rate": 1.8338896020539154e-05, + "loss": 0.7071, + "step": 4766 + }, + { + "epoch": 6.119383825417201, + "grad_norm": 3.407546281814575, + "learning_rate": 1.834274711168164e-05, + "loss": 0.6835, + "step": 4767 + }, + { + "epoch": 6.120667522464698, + "grad_norm": 5.781942844390869, + "learning_rate": 1.8346598202824136e-05, + "loss": 0.6525, + "step": 4768 + }, + { + "epoch": 6.121951219512195, + "grad_norm": 2.8627982139587402, + "learning_rate": 1.8350449293966627e-05, + "loss": 0.6934, + "step": 4769 + }, + { + "epoch": 6.123234916559692, + "grad_norm": 6.7778401374816895, + "learning_rate": 1.8354300385109115e-05, + "loss": 0.7356, + "step": 4770 + }, + { + "epoch": 6.124518613607188, + "grad_norm": 3.3808512687683105, + "learning_rate": 1.8358151476251606e-05, + "loss": 0.6896, + "step": 4771 + }, + { + "epoch": 6.125802310654685, + "grad_norm": 8.38050651550293, + "learning_rate": 1.8362002567394093e-05, + "loss": 0.7412, + "step": 4772 + }, + { + "epoch": 6.127086007702182, + "grad_norm": 7.272341728210449, + "learning_rate": 1.8365853658536584e-05, + "loss": 0.7827, + "step": 4773 + }, + { + "epoch": 6.128369704749679, + "grad_norm": 3.9332096576690674, + "learning_rate": 1.8369704749679075e-05, + "loss": 0.9039, + "step": 4774 + }, + { + "epoch": 6.129653401797176, + "grad_norm": 2.697736978530884, + "learning_rate": 1.8373555840821567e-05, + "loss": 0.5842, + "step": 4775 + }, + { + "epoch": 6.130937098844672, + "grad_norm": 2.5550591945648193, + "learning_rate": 1.8377406931964058e-05, + "loss": 0.5706, + "step": 4776 + }, + { + "epoch": 6.132220795892169, + "grad_norm": 1.5319504737854004, + "learning_rate": 1.838125802310655e-05, + "loss": 0.5867, + "step": 4777 + }, + { + "epoch": 6.133504492939666, + "grad_norm": 2.641186237335205, + "learning_rate": 1.8385109114249036e-05, + "loss": 0.66, + "step": 4778 + }, + { + "epoch": 6.134788189987163, + "grad_norm": 
2.2687013149261475, + "learning_rate": 1.8388960205391527e-05, + "loss": 0.6148, + "step": 4779 + }, + { + "epoch": 6.13607188703466, + "grad_norm": 2.24701189994812, + "learning_rate": 1.839281129653402e-05, + "loss": 0.6326, + "step": 4780 + }, + { + "epoch": 6.137355584082156, + "grad_norm": 4.105069637298584, + "learning_rate": 1.839666238767651e-05, + "loss": 0.6099, + "step": 4781 + }, + { + "epoch": 6.138639281129653, + "grad_norm": 3.4631874561309814, + "learning_rate": 1.8400513478819e-05, + "loss": 0.5903, + "step": 4782 + }, + { + "epoch": 6.13992297817715, + "grad_norm": 1.6153866052627563, + "learning_rate": 1.840436456996149e-05, + "loss": 0.6003, + "step": 4783 + }, + { + "epoch": 6.141206675224647, + "grad_norm": 1.4082375764846802, + "learning_rate": 1.840821566110398e-05, + "loss": 0.6236, + "step": 4784 + }, + { + "epoch": 6.142490372272144, + "grad_norm": 1.4914350509643555, + "learning_rate": 1.841206675224647e-05, + "loss": 0.5726, + "step": 4785 + }, + { + "epoch": 6.14377406931964, + "grad_norm": 2.0777013301849365, + "learning_rate": 1.8415917843388958e-05, + "loss": 0.6178, + "step": 4786 + }, + { + "epoch": 6.145057766367137, + "grad_norm": 1.4128334522247314, + "learning_rate": 1.8419768934531453e-05, + "loss": 0.5845, + "step": 4787 + }, + { + "epoch": 6.146341463414634, + "grad_norm": 4.636231422424316, + "learning_rate": 1.8423620025673944e-05, + "loss": 0.6367, + "step": 4788 + }, + { + "epoch": 6.147625160462131, + "grad_norm": 1.901389718055725, + "learning_rate": 1.842747111681643e-05, + "loss": 0.6136, + "step": 4789 + }, + { + "epoch": 6.148908857509627, + "grad_norm": 3.007521152496338, + "learning_rate": 1.8431322207958922e-05, + "loss": 0.6289, + "step": 4790 + }, + { + "epoch": 6.150192554557124, + "grad_norm": 2.634308338165283, + "learning_rate": 1.843517329910141e-05, + "loss": 0.5936, + "step": 4791 + }, + { + "epoch": 6.151476251604621, + "grad_norm": 1.7571370601654053, + "learning_rate": 1.84390243902439e-05, + 
"loss": 0.6283, + "step": 4792 + }, + { + "epoch": 6.152759948652118, + "grad_norm": 2.2088029384613037, + "learning_rate": 1.8442875481386396e-05, + "loss": 0.6111, + "step": 4793 + }, + { + "epoch": 6.154043645699615, + "grad_norm": 4.173379898071289, + "learning_rate": 1.8446726572528883e-05, + "loss": 0.6192, + "step": 4794 + }, + { + "epoch": 6.155327342747111, + "grad_norm": 7.226598262786865, + "learning_rate": 1.8450577663671374e-05, + "loss": 0.5835, + "step": 4795 + }, + { + "epoch": 6.156611039794608, + "grad_norm": 4.948644161224365, + "learning_rate": 1.8454428754813865e-05, + "loss": 0.6258, + "step": 4796 + }, + { + "epoch": 6.157894736842105, + "grad_norm": 5.542666912078857, + "learning_rate": 1.8458279845956353e-05, + "loss": 0.6243, + "step": 4797 + }, + { + "epoch": 6.159178433889602, + "grad_norm": 4.711223602294922, + "learning_rate": 1.8462130937098844e-05, + "loss": 0.5818, + "step": 4798 + }, + { + "epoch": 6.160462130937099, + "grad_norm": 16.1571044921875, + "learning_rate": 1.8465982028241335e-05, + "loss": 0.6643, + "step": 4799 + }, + { + "epoch": 6.161745827984595, + "grad_norm": 10.673842430114746, + "learning_rate": 1.8469833119383826e-05, + "loss": 0.6157, + "step": 4800 + }, + { + "epoch": 6.163029525032092, + "grad_norm": 3.3671491146087646, + "learning_rate": 1.8473684210526317e-05, + "loss": 0.6034, + "step": 4801 + }, + { + "epoch": 6.164313222079589, + "grad_norm": 3.9880967140197754, + "learning_rate": 1.8477535301668805e-05, + "loss": 0.6131, + "step": 4802 + }, + { + "epoch": 6.165596919127086, + "grad_norm": 2.1781954765319824, + "learning_rate": 1.8481386392811296e-05, + "loss": 0.6719, + "step": 4803 + }, + { + "epoch": 6.166880616174582, + "grad_norm": 2.9440319538116455, + "learning_rate": 1.8485237483953787e-05, + "loss": 0.6191, + "step": 4804 + }, + { + "epoch": 6.168164313222079, + "grad_norm": 9.885763168334961, + "learning_rate": 1.848908857509628e-05, + "loss": 0.6296, + "step": 4805 + }, + { + "epoch": 
6.169448010269576, + "grad_norm": 2.1220157146453857, + "learning_rate": 1.849293966623877e-05, + "loss": 0.5694, + "step": 4806 + }, + { + "epoch": 6.170731707317073, + "grad_norm": 3.507615327835083, + "learning_rate": 1.849679075738126e-05, + "loss": 0.7131, + "step": 4807 + }, + { + "epoch": 6.17201540436457, + "grad_norm": 8.21433162689209, + "learning_rate": 1.8500641848523748e-05, + "loss": 0.6574, + "step": 4808 + }, + { + "epoch": 6.173299101412066, + "grad_norm": 3.4292569160461426, + "learning_rate": 1.850449293966624e-05, + "loss": 0.6273, + "step": 4809 + }, + { + "epoch": 6.174582798459563, + "grad_norm": 1.8804502487182617, + "learning_rate": 1.8508344030808727e-05, + "loss": 0.6386, + "step": 4810 + }, + { + "epoch": 6.17586649550706, + "grad_norm": 1.7292202711105347, + "learning_rate": 1.851219512195122e-05, + "loss": 0.6664, + "step": 4811 + }, + { + "epoch": 6.177150192554557, + "grad_norm": 3.151402235031128, + "learning_rate": 1.8516046213093712e-05, + "loss": 0.6275, + "step": 4812 + }, + { + "epoch": 6.178433889602054, + "grad_norm": 3.8103017807006836, + "learning_rate": 1.85198973042362e-05, + "loss": 0.6259, + "step": 4813 + }, + { + "epoch": 6.17971758664955, + "grad_norm": 2.1367714405059814, + "learning_rate": 1.852374839537869e-05, + "loss": 0.682, + "step": 4814 + }, + { + "epoch": 6.181001283697047, + "grad_norm": 6.838284969329834, + "learning_rate": 1.8527599486521182e-05, + "loss": 0.6812, + "step": 4815 + }, + { + "epoch": 6.182284980744544, + "grad_norm": 2.1353068351745605, + "learning_rate": 1.853145057766367e-05, + "loss": 0.7658, + "step": 4816 + }, + { + "epoch": 6.183568677792041, + "grad_norm": 1.8470903635025024, + "learning_rate": 1.8535301668806164e-05, + "loss": 0.7294, + "step": 4817 + }, + { + "epoch": 6.184852374839538, + "grad_norm": 2.2996785640716553, + "learning_rate": 1.8539152759948652e-05, + "loss": 0.7171, + "step": 4818 + }, + { + "epoch": 6.186136071887034, + "grad_norm": 3.1601781845092773, + 
"learning_rate": 1.8543003851091143e-05, + "loss": 0.6408, + "step": 4819 + }, + { + "epoch": 6.187419768934531, + "grad_norm": 5.856865882873535, + "learning_rate": 1.8546854942233634e-05, + "loss": 0.7329, + "step": 4820 + }, + { + "epoch": 6.188703465982028, + "grad_norm": 2.444986581802368, + "learning_rate": 1.8550706033376122e-05, + "loss": 0.787, + "step": 4821 + }, + { + "epoch": 6.189987163029525, + "grad_norm": 4.367455005645752, + "learning_rate": 1.8554557124518613e-05, + "loss": 0.8456, + "step": 4822 + }, + { + "epoch": 6.191270860077021, + "grad_norm": 3.6789016723632812, + "learning_rate": 1.8558408215661104e-05, + "loss": 0.8048, + "step": 4823 + }, + { + "epoch": 6.192554557124518, + "grad_norm": 28.05374526977539, + "learning_rate": 1.8562259306803595e-05, + "loss": 0.9129, + "step": 4824 + }, + { + "epoch": 6.193838254172015, + "grad_norm": 3.8688340187072754, + "learning_rate": 1.8566110397946086e-05, + "loss": 0.6026, + "step": 4825 + }, + { + "epoch": 6.195121951219512, + "grad_norm": 3.4481942653656006, + "learning_rate": 1.8569961489088577e-05, + "loss": 0.5949, + "step": 4826 + }, + { + "epoch": 6.196405648267009, + "grad_norm": 2.2170090675354004, + "learning_rate": 1.8573812580231065e-05, + "loss": 0.6147, + "step": 4827 + }, + { + "epoch": 6.197689345314505, + "grad_norm": 2.027272939682007, + "learning_rate": 1.8577663671373556e-05, + "loss": 0.6125, + "step": 4828 + }, + { + "epoch": 6.198973042362002, + "grad_norm": 2.709331750869751, + "learning_rate": 1.8581514762516044e-05, + "loss": 0.5829, + "step": 4829 + }, + { + "epoch": 6.200256739409499, + "grad_norm": 3.5334551334381104, + "learning_rate": 1.8585365853658538e-05, + "loss": 0.5802, + "step": 4830 + }, + { + "epoch": 6.201540436456996, + "grad_norm": 1.9116179943084717, + "learning_rate": 1.858921694480103e-05, + "loss": 0.607, + "step": 4831 + }, + { + "epoch": 6.202824133504493, + "grad_norm": 2.7910430431365967, + "learning_rate": 1.8593068035943517e-05, + "loss": 0.676, 
+ "step": 4832 + }, + { + "epoch": 6.2041078305519894, + "grad_norm": 3.0372703075408936, + "learning_rate": 1.8596919127086008e-05, + "loss": 0.6171, + "step": 4833 + }, + { + "epoch": 6.205391527599486, + "grad_norm": 1.931715965270996, + "learning_rate": 1.86007702182285e-05, + "loss": 0.659, + "step": 4834 + }, + { + "epoch": 6.206675224646983, + "grad_norm": 4.4634222984313965, + "learning_rate": 1.8604621309370987e-05, + "loss": 0.5976, + "step": 4835 + }, + { + "epoch": 6.20795892169448, + "grad_norm": 2.572479248046875, + "learning_rate": 1.860847240051348e-05, + "loss": 0.6459, + "step": 4836 + }, + { + "epoch": 6.2092426187419765, + "grad_norm": 1.8585799932479858, + "learning_rate": 1.861232349165597e-05, + "loss": 0.6417, + "step": 4837 + }, + { + "epoch": 6.2105263157894735, + "grad_norm": 2.5949506759643555, + "learning_rate": 1.861617458279846e-05, + "loss": 0.6209, + "step": 4838 + }, + { + "epoch": 6.21181001283697, + "grad_norm": 1.6043561697006226, + "learning_rate": 1.862002567394095e-05, + "loss": 0.6401, + "step": 4839 + }, + { + "epoch": 6.213093709884467, + "grad_norm": 1.5364545583724976, + "learning_rate": 1.862387676508344e-05, + "loss": 0.6203, + "step": 4840 + }, + { + "epoch": 6.214377406931964, + "grad_norm": 1.9469295740127563, + "learning_rate": 1.862772785622593e-05, + "loss": 0.6703, + "step": 4841 + }, + { + "epoch": 6.2156611039794605, + "grad_norm": 1.894809603691101, + "learning_rate": 1.8631578947368424e-05, + "loss": 0.6209, + "step": 4842 + }, + { + "epoch": 6.2169448010269575, + "grad_norm": 1.7163490056991577, + "learning_rate": 1.8635430038510912e-05, + "loss": 0.6636, + "step": 4843 + }, + { + "epoch": 6.218228498074454, + "grad_norm": 2.8062798976898193, + "learning_rate": 1.8639281129653403e-05, + "loss": 0.6338, + "step": 4844 + }, + { + "epoch": 6.219512195121951, + "grad_norm": 4.6206374168396, + "learning_rate": 1.8643132220795894e-05, + "loss": 0.6301, + "step": 4845 + }, + { + "epoch": 6.220795892169448, + 
"grad_norm": 2.987718343734741, + "learning_rate": 1.8646983311938382e-05, + "loss": 0.6292, + "step": 4846 + }, + { + "epoch": 6.2220795892169445, + "grad_norm": 3.104222059249878, + "learning_rate": 1.8650834403080873e-05, + "loss": 0.5807, + "step": 4847 + }, + { + "epoch": 6.2233632862644415, + "grad_norm": 1.4220091104507446, + "learning_rate": 1.8654685494223364e-05, + "loss": 0.6487, + "step": 4848 + }, + { + "epoch": 6.224646983311938, + "grad_norm": 3.259417772293091, + "learning_rate": 1.8658536585365855e-05, + "loss": 0.5857, + "step": 4849 + }, + { + "epoch": 6.225930680359435, + "grad_norm": 1.4865076541900635, + "learning_rate": 1.8662387676508346e-05, + "loss": 0.5895, + "step": 4850 + }, + { + "epoch": 6.227214377406932, + "grad_norm": 2.723262071609497, + "learning_rate": 1.8666238767650834e-05, + "loss": 0.6154, + "step": 4851 + }, + { + "epoch": 6.2284980744544285, + "grad_norm": 1.3412531614303589, + "learning_rate": 1.8670089858793325e-05, + "loss": 0.6197, + "step": 4852 + }, + { + "epoch": 6.2297817715019255, + "grad_norm": 1.6043167114257812, + "learning_rate": 1.8673940949935816e-05, + "loss": 0.6151, + "step": 4853 + }, + { + "epoch": 6.2310654685494224, + "grad_norm": 1.6967617273330688, + "learning_rate": 1.8677792041078307e-05, + "loss": 0.7223, + "step": 4854 + }, + { + "epoch": 6.232349165596919, + "grad_norm": 2.661783456802368, + "learning_rate": 1.8681643132220798e-05, + "loss": 0.6616, + "step": 4855 + }, + { + "epoch": 6.2336328626444155, + "grad_norm": 7.898088455200195, + "learning_rate": 1.8685494223363286e-05, + "loss": 0.6815, + "step": 4856 + }, + { + "epoch": 6.2349165596919125, + "grad_norm": 6.870777130126953, + "learning_rate": 1.8689345314505777e-05, + "loss": 0.6124, + "step": 4857 + }, + { + "epoch": 6.2362002567394095, + "grad_norm": 2.615131139755249, + "learning_rate": 1.8693196405648268e-05, + "loss": 0.5846, + "step": 4858 + }, + { + "epoch": 6.2374839537869065, + "grad_norm": 2.5022692680358887, + 
"learning_rate": 1.8697047496790755e-05, + "loss": 0.6643, + "step": 4859 + }, + { + "epoch": 6.238767650834403, + "grad_norm": 5.906851768493652, + "learning_rate": 1.870089858793325e-05, + "loss": 0.6537, + "step": 4860 + }, + { + "epoch": 6.2400513478818995, + "grad_norm": 11.1333646774292, + "learning_rate": 1.870474967907574e-05, + "loss": 0.6898, + "step": 4861 + }, + { + "epoch": 6.2413350449293965, + "grad_norm": 1.7406432628631592, + "learning_rate": 1.870860077021823e-05, + "loss": 0.6613, + "step": 4862 + }, + { + "epoch": 6.2426187419768935, + "grad_norm": 2.110323429107666, + "learning_rate": 1.871245186136072e-05, + "loss": 0.6154, + "step": 4863 + }, + { + "epoch": 6.2439024390243905, + "grad_norm": 2.804143190383911, + "learning_rate": 1.8716302952503207e-05, + "loss": 0.634, + "step": 4864 + }, + { + "epoch": 6.245186136071887, + "grad_norm": 2.3246169090270996, + "learning_rate": 1.87201540436457e-05, + "loss": 0.644, + "step": 4865 + }, + { + "epoch": 6.2464698331193835, + "grad_norm": 2.8713369369506836, + "learning_rate": 1.8724005134788193e-05, + "loss": 0.6835, + "step": 4866 + }, + { + "epoch": 6.2477535301668805, + "grad_norm": 8.922639846801758, + "learning_rate": 1.872785622593068e-05, + "loss": 0.6729, + "step": 4867 + }, + { + "epoch": 6.2490372272143775, + "grad_norm": 3.4021925926208496, + "learning_rate": 1.8731707317073172e-05, + "loss": 0.6696, + "step": 4868 + }, + { + "epoch": 6.2503209242618745, + "grad_norm": 3.355670213699341, + "learning_rate": 1.8735558408215663e-05, + "loss": 0.726, + "step": 4869 + }, + { + "epoch": 6.251604621309371, + "grad_norm": 4.330885887145996, + "learning_rate": 1.873940949935815e-05, + "loss": 0.6543, + "step": 4870 + }, + { + "epoch": 6.2528883183568675, + "grad_norm": 2.4820470809936523, + "learning_rate": 1.874326059050064e-05, + "loss": 0.6907, + "step": 4871 + }, + { + "epoch": 6.2541720154043645, + "grad_norm": 2.8027851581573486, + "learning_rate": 1.8747111681643133e-05, + "loss": 0.7415, 
+ "step": 4872 + }, + { + "epoch": 6.2554557124518615, + "grad_norm": 5.677217960357666, + "learning_rate": 1.8750962772785624e-05, + "loss": 0.8504, + "step": 4873 + }, + { + "epoch": 6.2567394094993585, + "grad_norm": 5.304971694946289, + "learning_rate": 1.8754813863928115e-05, + "loss": 0.9753, + "step": 4874 + }, + { + "epoch": 6.258023106546855, + "grad_norm": 3.5023880004882812, + "learning_rate": 1.8758664955070602e-05, + "loss": 0.5911, + "step": 4875 + }, + { + "epoch": 6.2593068035943515, + "grad_norm": 3.5626189708709717, + "learning_rate": 1.8762516046213093e-05, + "loss": 0.5889, + "step": 4876 + }, + { + "epoch": 6.2605905006418485, + "grad_norm": 6.343590259552002, + "learning_rate": 1.8766367137355585e-05, + "loss": 0.6003, + "step": 4877 + }, + { + "epoch": 6.2618741976893455, + "grad_norm": 3.561370611190796, + "learning_rate": 1.8770218228498072e-05, + "loss": 0.6003, + "step": 4878 + }, + { + "epoch": 6.2631578947368425, + "grad_norm": 2.3311119079589844, + "learning_rate": 1.8774069319640567e-05, + "loss": 0.5834, + "step": 4879 + }, + { + "epoch": 6.264441591784339, + "grad_norm": 1.8246641159057617, + "learning_rate": 1.8777920410783058e-05, + "loss": 0.5679, + "step": 4880 + }, + { + "epoch": 6.2657252888318355, + "grad_norm": 1.753788709640503, + "learning_rate": 1.8781771501925545e-05, + "loss": 0.6088, + "step": 4881 + }, + { + "epoch": 6.2670089858793325, + "grad_norm": 3.0163745880126953, + "learning_rate": 1.8785622593068037e-05, + "loss": 0.6302, + "step": 4882 + }, + { + "epoch": 6.2682926829268295, + "grad_norm": 2.0782952308654785, + "learning_rate": 1.8789473684210524e-05, + "loss": 0.5916, + "step": 4883 + }, + { + "epoch": 6.2695763799743265, + "grad_norm": 1.6182760000228882, + "learning_rate": 1.8793324775353015e-05, + "loss": 0.6313, + "step": 4884 + }, + { + "epoch": 6.270860077021823, + "grad_norm": 1.6948176622390747, + "learning_rate": 1.879717586649551e-05, + "loss": 0.6196, + "step": 4885 + }, + { + "epoch": 
6.2721437740693196, + "grad_norm": 5.2019853591918945, + "learning_rate": 1.8801026957637997e-05, + "loss": 0.5461, + "step": 4886 + }, + { + "epoch": 6.2734274711168165, + "grad_norm": 4.076870441436768, + "learning_rate": 1.880487804878049e-05, + "loss": 0.5962, + "step": 4887 + }, + { + "epoch": 6.2747111681643135, + "grad_norm": 1.5213204622268677, + "learning_rate": 1.880872913992298e-05, + "loss": 0.5984, + "step": 4888 + }, + { + "epoch": 6.27599486521181, + "grad_norm": 3.058394193649292, + "learning_rate": 1.8812580231065467e-05, + "loss": 0.5908, + "step": 4889 + }, + { + "epoch": 6.277278562259307, + "grad_norm": 1.633893609046936, + "learning_rate": 1.881643132220796e-05, + "loss": 0.6252, + "step": 4890 + }, + { + "epoch": 6.278562259306804, + "grad_norm": 2.768070936203003, + "learning_rate": 1.8820282413350453e-05, + "loss": 0.5984, + "step": 4891 + }, + { + "epoch": 6.2798459563543005, + "grad_norm": 2.3409745693206787, + "learning_rate": 1.882413350449294e-05, + "loss": 0.5902, + "step": 4892 + }, + { + "epoch": 6.2811296534017975, + "grad_norm": 2.821040630340576, + "learning_rate": 1.882798459563543e-05, + "loss": 0.6328, + "step": 4893 + }, + { + "epoch": 6.282413350449294, + "grad_norm": 3.0665340423583984, + "learning_rate": 1.883183568677792e-05, + "loss": 0.5795, + "step": 4894 + }, + { + "epoch": 6.283697047496791, + "grad_norm": 3.8551619052886963, + "learning_rate": 1.883568677792041e-05, + "loss": 0.5855, + "step": 4895 + }, + { + "epoch": 6.284980744544288, + "grad_norm": 2.0198216438293457, + "learning_rate": 1.88395378690629e-05, + "loss": 0.6518, + "step": 4896 + }, + { + "epoch": 6.2862644415917845, + "grad_norm": 2.444192409515381, + "learning_rate": 1.8843388960205392e-05, + "loss": 0.6115, + "step": 4897 + }, + { + "epoch": 6.2875481386392815, + "grad_norm": 2.0692026615142822, + "learning_rate": 1.8847240051347883e-05, + "loss": 0.6477, + "step": 4898 + }, + { + "epoch": 6.288831835686778, + "grad_norm": 1.8437806367874146, + 
"learning_rate": 1.8851091142490375e-05, + "loss": 0.6256, + "step": 4899 + }, + { + "epoch": 6.290115532734275, + "grad_norm": 14.040907859802246, + "learning_rate": 1.8854942233632862e-05, + "loss": 0.6055, + "step": 4900 + }, + { + "epoch": 6.291399229781772, + "grad_norm": 1.9972745180130005, + "learning_rate": 1.8858793324775353e-05, + "loss": 0.6255, + "step": 4901 + }, + { + "epoch": 6.2926829268292686, + "grad_norm": 3.440342426300049, + "learning_rate": 1.886264441591784e-05, + "loss": 0.607, + "step": 4902 + }, + { + "epoch": 6.293966623876765, + "grad_norm": 2.2385404109954834, + "learning_rate": 1.8866495507060335e-05, + "loss": 0.5971, + "step": 4903 + }, + { + "epoch": 6.295250320924262, + "grad_norm": 3.819347858428955, + "learning_rate": 1.8870346598202827e-05, + "loss": 0.6421, + "step": 4904 + }, + { + "epoch": 6.296534017971759, + "grad_norm": 9.690832138061523, + "learning_rate": 1.8874197689345314e-05, + "loss": 0.6144, + "step": 4905 + }, + { + "epoch": 6.297817715019256, + "grad_norm": 3.7554681301116943, + "learning_rate": 1.8878048780487805e-05, + "loss": 0.6127, + "step": 4906 + }, + { + "epoch": 6.299101412066753, + "grad_norm": 3.680988311767578, + "learning_rate": 1.8881899871630296e-05, + "loss": 0.6305, + "step": 4907 + }, + { + "epoch": 6.300385109114249, + "grad_norm": 3.1213700771331787, + "learning_rate": 1.8885750962772784e-05, + "loss": 0.6467, + "step": 4908 + }, + { + "epoch": 6.301668806161746, + "grad_norm": 5.247738838195801, + "learning_rate": 1.888960205391528e-05, + "loss": 0.6557, + "step": 4909 + }, + { + "epoch": 6.302952503209243, + "grad_norm": 3.3909077644348145, + "learning_rate": 1.8893453145057766e-05, + "loss": 0.6422, + "step": 4910 + }, + { + "epoch": 6.30423620025674, + "grad_norm": 5.416999340057373, + "learning_rate": 1.8897304236200257e-05, + "loss": 0.6538, + "step": 4911 + }, + { + "epoch": 6.305519897304237, + "grad_norm": 2.9370601177215576, + "learning_rate": 1.890115532734275e-05, + "loss": 0.6657, 
+ "step": 4912 + }, + { + "epoch": 6.306803594351733, + "grad_norm": 4.858405590057373, + "learning_rate": 1.8905006418485236e-05, + "loss": 0.6595, + "step": 4913 + }, + { + "epoch": 6.30808729139923, + "grad_norm": 3.262401819229126, + "learning_rate": 1.8908857509627727e-05, + "loss": 0.627, + "step": 4914 + }, + { + "epoch": 6.309370988446727, + "grad_norm": 1.7836699485778809, + "learning_rate": 1.891270860077022e-05, + "loss": 0.6619, + "step": 4915 + }, + { + "epoch": 6.310654685494224, + "grad_norm": 3.917588949203491, + "learning_rate": 1.891655969191271e-05, + "loss": 0.6995, + "step": 4916 + }, + { + "epoch": 6.311938382541721, + "grad_norm": 4.116262435913086, + "learning_rate": 1.89204107830552e-05, + "loss": 0.6201, + "step": 4917 + }, + { + "epoch": 6.313222079589217, + "grad_norm": 7.241180419921875, + "learning_rate": 1.892426187419769e-05, + "loss": 0.6821, + "step": 4918 + }, + { + "epoch": 6.314505776636714, + "grad_norm": 3.083113431930542, + "learning_rate": 1.892811296534018e-05, + "loss": 0.6852, + "step": 4919 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 2.8202872276306152, + "learning_rate": 1.893196405648267e-05, + "loss": 0.6842, + "step": 4920 + }, + { + "epoch": 6.317073170731708, + "grad_norm": 3.1716299057006836, + "learning_rate": 1.893581514762516e-05, + "loss": 0.6827, + "step": 4921 + }, + { + "epoch": 6.318356867779205, + "grad_norm": 6.689996242523193, + "learning_rate": 1.8939666238767652e-05, + "loss": 0.7113, + "step": 4922 + }, + { + "epoch": 6.319640564826701, + "grad_norm": 4.292172431945801, + "learning_rate": 1.8943517329910143e-05, + "loss": 0.7657, + "step": 4923 + }, + { + "epoch": 6.320924261874198, + "grad_norm": 6.770341873168945, + "learning_rate": 1.894736842105263e-05, + "loss": 0.9102, + "step": 4924 + }, + { + "epoch": 6.322207958921695, + "grad_norm": 1.6812621355056763, + "learning_rate": 1.8951219512195122e-05, + "loss": 0.6035, + "step": 4925 + }, + { + "epoch": 6.323491655969192, + "grad_norm": 
1.8009265661239624, + "learning_rate": 1.8955070603337613e-05, + "loss": 0.606, + "step": 4926 + }, + { + "epoch": 6.324775353016688, + "grad_norm": 3.650876045227051, + "learning_rate": 1.89589216944801e-05, + "loss": 0.5716, + "step": 4927 + }, + { + "epoch": 6.326059050064185, + "grad_norm": 3.3982226848602295, + "learning_rate": 1.8962772785622595e-05, + "loss": 0.5871, + "step": 4928 + }, + { + "epoch": 6.327342747111682, + "grad_norm": 3.1135740280151367, + "learning_rate": 1.8966623876765083e-05, + "loss": 0.5918, + "step": 4929 + }, + { + "epoch": 6.328626444159179, + "grad_norm": 4.872652530670166, + "learning_rate": 1.8970474967907574e-05, + "loss": 0.6409, + "step": 4930 + }, + { + "epoch": 6.329910141206676, + "grad_norm": 2.5010509490966797, + "learning_rate": 1.8974326059050065e-05, + "loss": 0.6232, + "step": 4931 + }, + { + "epoch": 6.331193838254172, + "grad_norm": 7.323482036590576, + "learning_rate": 1.8978177150192553e-05, + "loss": 0.6179, + "step": 4932 + }, + { + "epoch": 6.332477535301669, + "grad_norm": 2.1791112422943115, + "learning_rate": 1.8982028241335044e-05, + "loss": 0.6471, + "step": 4933 + }, + { + "epoch": 6.333761232349166, + "grad_norm": 2.6867403984069824, + "learning_rate": 1.8985879332477538e-05, + "loss": 0.6028, + "step": 4934 + }, + { + "epoch": 6.335044929396663, + "grad_norm": 9.859064102172852, + "learning_rate": 1.8989730423620026e-05, + "loss": 0.6188, + "step": 4935 + }, + { + "epoch": 6.336328626444159, + "grad_norm": 3.3881688117980957, + "learning_rate": 1.8993581514762517e-05, + "loss": 0.6779, + "step": 4936 + }, + { + "epoch": 6.337612323491656, + "grad_norm": 5.2866530418396, + "learning_rate": 1.8997432605905008e-05, + "loss": 0.5603, + "step": 4937 + }, + { + "epoch": 6.338896020539153, + "grad_norm": 2.552427291870117, + "learning_rate": 1.9001283697047496e-05, + "loss": 0.6227, + "step": 4938 + }, + { + "epoch": 6.34017971758665, + "grad_norm": 3.4049766063690186, + "learning_rate": 
1.9005134788189987e-05, + "loss": 0.5833, + "step": 4939 + }, + { + "epoch": 6.341463414634147, + "grad_norm": 7.589990139007568, + "learning_rate": 1.9008985879332478e-05, + "loss": 0.6081, + "step": 4940 + }, + { + "epoch": 6.342747111681643, + "grad_norm": 6.223786354064941, + "learning_rate": 1.901283697047497e-05, + "loss": 0.629, + "step": 4941 + }, + { + "epoch": 6.34403080872914, + "grad_norm": 2.0272445678710938, + "learning_rate": 1.901668806161746e-05, + "loss": 0.5701, + "step": 4942 + }, + { + "epoch": 6.345314505776637, + "grad_norm": 1.8749927282333374, + "learning_rate": 1.9020539152759948e-05, + "loss": 0.5696, + "step": 4943 + }, + { + "epoch": 6.346598202824134, + "grad_norm": 4.054995059967041, + "learning_rate": 1.902439024390244e-05, + "loss": 0.624, + "step": 4944 + }, + { + "epoch": 6.347881899871631, + "grad_norm": 2.8275721073150635, + "learning_rate": 1.902824133504493e-05, + "loss": 0.5785, + "step": 4945 + }, + { + "epoch": 6.349165596919127, + "grad_norm": 6.585011959075928, + "learning_rate": 1.903209242618742e-05, + "loss": 0.6056, + "step": 4946 + }, + { + "epoch": 6.350449293966624, + "grad_norm": 7.262278079986572, + "learning_rate": 1.9035943517329912e-05, + "loss": 0.5987, + "step": 4947 + }, + { + "epoch": 6.351732991014121, + "grad_norm": 6.48416805267334, + "learning_rate": 1.90397946084724e-05, + "loss": 0.6604, + "step": 4948 + }, + { + "epoch": 6.353016688061618, + "grad_norm": 3.259094476699829, + "learning_rate": 1.904364569961489e-05, + "loss": 0.6615, + "step": 4949 + }, + { + "epoch": 6.354300385109115, + "grad_norm": 6.280877590179443, + "learning_rate": 1.9047496790757382e-05, + "loss": 0.6279, + "step": 4950 + }, + { + "epoch": 6.355584082156611, + "grad_norm": 3.023618459701538, + "learning_rate": 1.905134788189987e-05, + "loss": 0.5947, + "step": 4951 + }, + { + "epoch": 6.356867779204108, + "grad_norm": 9.747068405151367, + "learning_rate": 1.9055198973042364e-05, + "loss": 0.6285, + "step": 4952 + }, + { + 
"epoch": 6.358151476251605, + "grad_norm": 4.871889114379883, + "learning_rate": 1.9059050064184855e-05, + "loss": 0.6622, + "step": 4953 + }, + { + "epoch": 6.359435173299102, + "grad_norm": 3.5675535202026367, + "learning_rate": 1.9062901155327343e-05, + "loss": 0.6756, + "step": 4954 + }, + { + "epoch": 6.360718870346599, + "grad_norm": 7.1746368408203125, + "learning_rate": 1.9066752246469834e-05, + "loss": 0.6407, + "step": 4955 + }, + { + "epoch": 6.362002567394095, + "grad_norm": 2.9055697917938232, + "learning_rate": 1.907060333761232e-05, + "loss": 0.6029, + "step": 4956 + }, + { + "epoch": 6.363286264441592, + "grad_norm": 3.4323699474334717, + "learning_rate": 1.9074454428754813e-05, + "loss": 0.6901, + "step": 4957 + }, + { + "epoch": 6.364569961489089, + "grad_norm": 3.154954195022583, + "learning_rate": 1.9078305519897307e-05, + "loss": 0.6546, + "step": 4958 + }, + { + "epoch": 6.365853658536586, + "grad_norm": 4.329569339752197, + "learning_rate": 1.9082156611039795e-05, + "loss": 0.6975, + "step": 4959 + }, + { + "epoch": 6.367137355584082, + "grad_norm": 1.6624407768249512, + "learning_rate": 1.9086007702182286e-05, + "loss": 0.6831, + "step": 4960 + }, + { + "epoch": 6.368421052631579, + "grad_norm": Infinity, + "learning_rate": 1.9086007702182286e-05, + "loss": 0.6641, + "step": 4961 + }, + { + "epoch": 6.369704749679076, + "grad_norm": 5.109246730804443, + "learning_rate": 1.9089858793324777e-05, + "loss": 0.6328, + "step": 4962 + }, + { + "epoch": 6.370988446726573, + "grad_norm": 2.733424663543701, + "learning_rate": 1.9093709884467265e-05, + "loss": 0.6388, + "step": 4963 + }, + { + "epoch": 6.37227214377407, + "grad_norm": 74.6393814086914, + "learning_rate": 1.9097560975609756e-05, + "loss": 0.6728, + "step": 4964 + }, + { + "epoch": 6.373555840821566, + "grad_norm": 6.293520450592041, + "learning_rate": 1.910141206675225e-05, + "loss": 0.6318, + "step": 4965 + }, + { + "epoch": 6.374839537869063, + "grad_norm": 16.020112991333008, + 
"learning_rate": 1.9105263157894738e-05, + "loss": 0.7034, + "step": 4966 + }, + { + "epoch": 6.37612323491656, + "grad_norm": 3.4325268268585205, + "learning_rate": 1.910911424903723e-05, + "loss": 0.7157, + "step": 4967 + }, + { + "epoch": 6.377406931964057, + "grad_norm": 5.194688320159912, + "learning_rate": 1.9112965340179717e-05, + "loss": 0.7195, + "step": 4968 + }, + { + "epoch": 6.378690629011553, + "grad_norm": 8.598408699035645, + "learning_rate": 1.9116816431322208e-05, + "loss": 0.6502, + "step": 4969 + }, + { + "epoch": 6.37997432605905, + "grad_norm": 3.3595974445343018, + "learning_rate": 1.91206675224647e-05, + "loss": 0.6915, + "step": 4970 + }, + { + "epoch": 6.381258023106547, + "grad_norm": 6.9943413734436035, + "learning_rate": 1.912451861360719e-05, + "loss": 0.7891, + "step": 4971 + }, + { + "epoch": 6.382541720154044, + "grad_norm": 7.708628177642822, + "learning_rate": 1.912836970474968e-05, + "loss": 0.754, + "step": 4972 + }, + { + "epoch": 6.383825417201541, + "grad_norm": 5.49733304977417, + "learning_rate": 1.9132220795892172e-05, + "loss": 0.782, + "step": 4973 + }, + { + "epoch": 6.385109114249037, + "grad_norm": 5.138641834259033, + "learning_rate": 1.913607188703466e-05, + "loss": 0.9337, + "step": 4974 + }, + { + "epoch": 6.386392811296534, + "grad_norm": 3.4391725063323975, + "learning_rate": 1.913992297817715e-05, + "loss": 0.5941, + "step": 4975 + }, + { + "epoch": 6.387676508344031, + "grad_norm": 4.996433734893799, + "learning_rate": 1.914377406931964e-05, + "loss": 0.6086, + "step": 4976 + }, + { + "epoch": 6.388960205391528, + "grad_norm": 5.000182151794434, + "learning_rate": 1.914762516046213e-05, + "loss": 0.5931, + "step": 4977 + }, + { + "epoch": 6.390243902439025, + "grad_norm": 4.069394588470459, + "learning_rate": 1.9151476251604624e-05, + "loss": 0.65, + "step": 4978 + }, + { + "epoch": 6.391527599486521, + "grad_norm": 1.973142147064209, + "learning_rate": 1.915532734274711e-05, + "loss": 0.6399, + "step": 4979 + 
}, + { + "epoch": 6.392811296534018, + "grad_norm": 26.800268173217773, + "learning_rate": 1.9159178433889603e-05, + "loss": 0.5807, + "step": 4980 + }, + { + "epoch": 6.394094993581515, + "grad_norm": 6.7457380294799805, + "learning_rate": 1.9163029525032094e-05, + "loss": 0.5893, + "step": 4981 + }, + { + "epoch": 6.395378690629012, + "grad_norm": 2.6553940773010254, + "learning_rate": 1.916688061617458e-05, + "loss": 0.5961, + "step": 4982 + }, + { + "epoch": 6.396662387676509, + "grad_norm": 4.368968963623047, + "learning_rate": 1.9170731707317072e-05, + "loss": 0.607, + "step": 4983 + }, + { + "epoch": 6.397946084724005, + "grad_norm": 3.024369239807129, + "learning_rate": 1.9174582798459567e-05, + "loss": 0.6723, + "step": 4984 + }, + { + "epoch": 6.399229781771502, + "grad_norm": 20.029773712158203, + "learning_rate": 1.9178433889602055e-05, + "loss": 0.6485, + "step": 4985 + }, + { + "epoch": 6.400513478818999, + "grad_norm": 4.0302510261535645, + "learning_rate": 1.9182284980744546e-05, + "loss": 0.6284, + "step": 4986 + }, + { + "epoch": 6.401797175866496, + "grad_norm": 1.7498149871826172, + "learning_rate": 1.9186136071887033e-05, + "loss": 0.6227, + "step": 4987 + }, + { + "epoch": 6.403080872913993, + "grad_norm": 2.298487663269043, + "learning_rate": 1.9189987163029524e-05, + "loss": 0.6106, + "step": 4988 + }, + { + "epoch": 6.404364569961489, + "grad_norm": 2.447861433029175, + "learning_rate": 1.9193838254172015e-05, + "loss": 0.6408, + "step": 4989 + }, + { + "epoch": 6.405648267008986, + "grad_norm": 2.68552827835083, + "learning_rate": 1.9197689345314507e-05, + "loss": 0.5995, + "step": 4990 + }, + { + "epoch": 6.406931964056483, + "grad_norm": 7.45361852645874, + "learning_rate": 1.9201540436456998e-05, + "loss": 0.6212, + "step": 4991 + }, + { + "epoch": 6.40821566110398, + "grad_norm": 6.056588172912598, + "learning_rate": 1.920539152759949e-05, + "loss": 0.6057, + "step": 4992 + }, + { + "epoch": 6.409499358151476, + "grad_norm": 
3.5462522506713867, + "learning_rate": 1.9209242618741976e-05, + "loss": 0.6061, + "step": 4993 + }, + { + "epoch": 6.410783055198973, + "grad_norm": 28.03691291809082, + "learning_rate": 1.9213093709884467e-05, + "loss": 0.6144, + "step": 4994 + }, + { + "epoch": 6.41206675224647, + "grad_norm": 3.2115371227264404, + "learning_rate": 1.9216944801026955e-05, + "loss": 0.6696, + "step": 4995 + }, + { + "epoch": 6.413350449293967, + "grad_norm": 1.598698377609253, + "learning_rate": 1.922079589216945e-05, + "loss": 0.6386, + "step": 4996 + }, + { + "epoch": 6.414634146341464, + "grad_norm": 3.6865015029907227, + "learning_rate": 1.922464698331194e-05, + "loss": 0.6103, + "step": 4997 + }, + { + "epoch": 6.41591784338896, + "grad_norm": 3.5170247554779053, + "learning_rate": 1.922849807445443e-05, + "loss": 0.5615, + "step": 4998 + }, + { + "epoch": 6.417201540436457, + "grad_norm": 2.6983063220977783, + "learning_rate": 1.923234916559692e-05, + "loss": 0.6216, + "step": 4999 + }, + { + "epoch": 6.418485237483954, + "grad_norm": 18.692522048950195, + "learning_rate": 1.923620025673941e-05, + "loss": 0.6032, + "step": 5000 + }, + { + "epoch": 6.418485237483954, + "eval_cer": 0.315547288036029, + "eval_loss": 0.6054366827011108, + "eval_runtime": 14.0874, + "eval_samples_per_second": 69.779, + "eval_steps_per_second": 0.497, + "eval_wer": 0.5552625544599863, + "step": 5000 + }, + { + "epoch": 6.419768934531451, + "grad_norm": 2.2975351810455322, + "learning_rate": 1.9240051347881898e-05, + "loss": 0.5944, + "step": 5001 + }, + { + "epoch": 6.421052631578947, + "grad_norm": 2.3103432655334473, + "learning_rate": 1.9243902439024393e-05, + "loss": 0.6055, + "step": 5002 + }, + { + "epoch": 6.422336328626444, + "grad_norm": 1.786382794380188, + "learning_rate": 1.924775353016688e-05, + "loss": 0.6232, + "step": 5003 + }, + { + "epoch": 6.423620025673941, + "grad_norm": 3.4013969898223877, + "learning_rate": 1.925160462130937e-05, + "loss": 0.6483, + "step": 5004 + }, + { + 
"epoch": 6.424903722721438, + "grad_norm": 2.8121466636657715, + "learning_rate": 1.9255455712451862e-05, + "loss": 0.6047, + "step": 5005 + }, + { + "epoch": 6.426187419768935, + "grad_norm": 3.810835838317871, + "learning_rate": 1.925930680359435e-05, + "loss": 0.6393, + "step": 5006 + }, + { + "epoch": 6.427471116816431, + "grad_norm": 1.4713099002838135, + "learning_rate": 1.926315789473684e-05, + "loss": 0.6775, + "step": 5007 + }, + { + "epoch": 6.428754813863928, + "grad_norm": 9.417214393615723, + "learning_rate": 1.9267008985879336e-05, + "loss": 0.6655, + "step": 5008 + }, + { + "epoch": 6.430038510911425, + "grad_norm": 3.709599733352661, + "learning_rate": 1.9270860077021823e-05, + "loss": 0.6536, + "step": 5009 + }, + { + "epoch": 6.431322207958922, + "grad_norm": 2.5424466133117676, + "learning_rate": 1.9274711168164314e-05, + "loss": 0.6076, + "step": 5010 + }, + { + "epoch": 6.432605905006419, + "grad_norm": 3.7910943031311035, + "learning_rate": 1.9278562259306805e-05, + "loss": 0.7197, + "step": 5011 + }, + { + "epoch": 6.433889602053915, + "grad_norm": 2.721829891204834, + "learning_rate": 1.9282413350449293e-05, + "loss": 0.7208, + "step": 5012 + }, + { + "epoch": 6.435173299101412, + "grad_norm": 2.333061456680298, + "learning_rate": 1.9286264441591784e-05, + "loss": 0.6429, + "step": 5013 + }, + { + "epoch": 6.436456996148909, + "grad_norm": 2.5965657234191895, + "learning_rate": 1.9290115532734275e-05, + "loss": 0.7065, + "step": 5014 + }, + { + "epoch": 6.437740693196406, + "grad_norm": 3.8799307346343994, + "learning_rate": 1.9293966623876766e-05, + "loss": 0.6403, + "step": 5015 + }, + { + "epoch": 6.439024390243903, + "grad_norm": 2.0571956634521484, + "learning_rate": 1.9297817715019257e-05, + "loss": 0.6472, + "step": 5016 + }, + { + "epoch": 6.440308087291399, + "grad_norm": 3.4744253158569336, + "learning_rate": 1.9301668806161745e-05, + "loss": 0.6676, + "step": 5017 + }, + { + "epoch": 6.441591784338896, + "grad_norm": 
6.4581170082092285, + "learning_rate": 1.9305519897304236e-05, + "loss": 0.654, + "step": 5018 + }, + { + "epoch": 6.442875481386393, + "grad_norm": 3.670205593109131, + "learning_rate": 1.9309370988446727e-05, + "loss": 0.6727, + "step": 5019 + }, + { + "epoch": 6.44415917843389, + "grad_norm": 27.66307830810547, + "learning_rate": 1.931322207958922e-05, + "loss": 0.7141, + "step": 5020 + }, + { + "epoch": 6.445442875481387, + "grad_norm": 4.5060200691223145, + "learning_rate": 1.931707317073171e-05, + "loss": 0.6466, + "step": 5021 + }, + { + "epoch": 6.446726572528883, + "grad_norm": 26.21676254272461, + "learning_rate": 1.9320924261874197e-05, + "loss": 0.775, + "step": 5022 + }, + { + "epoch": 6.44801026957638, + "grad_norm": 5.428861618041992, + "learning_rate": 1.9324775353016688e-05, + "loss": 0.8071, + "step": 5023 + }, + { + "epoch": 6.449293966623877, + "grad_norm": 3.634796142578125, + "learning_rate": 1.932862644415918e-05, + "loss": 0.9093, + "step": 5024 + }, + { + "epoch": 6.450577663671374, + "grad_norm": 3.3652093410491943, + "learning_rate": 1.9332477535301667e-05, + "loss": 0.5906, + "step": 5025 + }, + { + "epoch": 6.45186136071887, + "grad_norm": 4.720381259918213, + "learning_rate": 1.933632862644416e-05, + "loss": 0.5641, + "step": 5026 + }, + { + "epoch": 6.453145057766367, + "grad_norm": 3.9999756813049316, + "learning_rate": 1.9340179717586652e-05, + "loss": 0.5897, + "step": 5027 + }, + { + "epoch": 6.454428754813864, + "grad_norm": 6.475478172302246, + "learning_rate": 1.934403080872914e-05, + "loss": 0.5932, + "step": 5028 + }, + { + "epoch": 6.455712451861361, + "grad_norm": 2.2521090507507324, + "learning_rate": 1.934788189987163e-05, + "loss": 0.6307, + "step": 5029 + }, + { + "epoch": 6.456996148908858, + "grad_norm": 2.470698595046997, + "learning_rate": 1.9351732991014122e-05, + "loss": 0.5861, + "step": 5030 + }, + { + "epoch": 6.458279845956354, + "grad_norm": 3.3486785888671875, + "learning_rate": 1.935558408215661e-05, + 
"loss": 0.6239, + "step": 5031 + }, + { + "epoch": 6.459563543003851, + "grad_norm": 3.1489012241363525, + "learning_rate": 1.93594351732991e-05, + "loss": 0.5943, + "step": 5032 + }, + { + "epoch": 6.460847240051348, + "grad_norm": 2.70442533493042, + "learning_rate": 1.9363286264441592e-05, + "loss": 0.568, + "step": 5033 + }, + { + "epoch": 6.462130937098845, + "grad_norm": 5.323875904083252, + "learning_rate": 1.9367137355584083e-05, + "loss": 0.5816, + "step": 5034 + }, + { + "epoch": 6.463414634146342, + "grad_norm": 3.409332036972046, + "learning_rate": 1.9370988446726574e-05, + "loss": 0.5713, + "step": 5035 + }, + { + "epoch": 6.464698331193838, + "grad_norm": 7.36746883392334, + "learning_rate": 1.9374839537869062e-05, + "loss": 0.5989, + "step": 5036 + }, + { + "epoch": 6.465982028241335, + "grad_norm": 2.5057015419006348, + "learning_rate": 1.9378690629011553e-05, + "loss": 0.5982, + "step": 5037 + }, + { + "epoch": 6.467265725288832, + "grad_norm": 5.192127227783203, + "learning_rate": 1.9382541720154044e-05, + "loss": 0.6277, + "step": 5038 + }, + { + "epoch": 6.468549422336329, + "grad_norm": 4.511173248291016, + "learning_rate": 1.9386392811296535e-05, + "loss": 0.6353, + "step": 5039 + }, + { + "epoch": 6.469833119383825, + "grad_norm": 1.9392547607421875, + "learning_rate": 1.9390243902439026e-05, + "loss": 0.6374, + "step": 5040 + }, + { + "epoch": 6.471116816431322, + "grad_norm": 2.301023006439209, + "learning_rate": 1.9394094993581514e-05, + "loss": 0.6379, + "step": 5041 + }, + { + "epoch": 6.472400513478819, + "grad_norm": 3.1599934101104736, + "learning_rate": 1.9397946084724005e-05, + "loss": 0.6191, + "step": 5042 + }, + { + "epoch": 6.473684210526316, + "grad_norm": 5.227588653564453, + "learning_rate": 1.9401797175866496e-05, + "loss": 0.6658, + "step": 5043 + }, + { + "epoch": 6.474967907573813, + "grad_norm": 2.2661826610565186, + "learning_rate": 1.9405648267008984e-05, + "loss": 0.6216, + "step": 5044 + }, + { + "epoch": 
6.476251604621309, + "grad_norm": 2.230027198791504, + "learning_rate": 1.9409499358151478e-05, + "loss": 0.6607, + "step": 5045 + }, + { + "epoch": 6.477535301668806, + "grad_norm": 8.555346488952637, + "learning_rate": 1.941335044929397e-05, + "loss": 0.6026, + "step": 5046 + }, + { + "epoch": 6.478818998716303, + "grad_norm": 4.6775288581848145, + "learning_rate": 1.9417201540436457e-05, + "loss": 0.609, + "step": 5047 + }, + { + "epoch": 6.4801026957638, + "grad_norm": 1.9924241304397583, + "learning_rate": 1.9421052631578948e-05, + "loss": 0.6362, + "step": 5048 + }, + { + "epoch": 6.481386392811297, + "grad_norm": 3.570024251937866, + "learning_rate": 1.942490372272144e-05, + "loss": 0.6007, + "step": 5049 + }, + { + "epoch": 6.482670089858793, + "grad_norm": 3.177940607070923, + "learning_rate": 1.9428754813863927e-05, + "loss": 0.6232, + "step": 5050 + }, + { + "epoch": 6.48395378690629, + "grad_norm": 4.229503631591797, + "learning_rate": 1.943260590500642e-05, + "loss": 0.6183, + "step": 5051 + }, + { + "epoch": 6.485237483953787, + "grad_norm": 4.795351505279541, + "learning_rate": 1.943645699614891e-05, + "loss": 0.6301, + "step": 5052 + }, + { + "epoch": 6.486521181001284, + "grad_norm": 6.340800762176514, + "learning_rate": 1.94403080872914e-05, + "loss": 0.662, + "step": 5053 + }, + { + "epoch": 6.487804878048781, + "grad_norm": 8.203522682189941, + "learning_rate": 1.944415917843389e-05, + "loss": 0.6235, + "step": 5054 + }, + { + "epoch": 6.489088575096277, + "grad_norm": 3.967496871948242, + "learning_rate": 1.944801026957638e-05, + "loss": 0.655, + "step": 5055 + }, + { + "epoch": 6.490372272143774, + "grad_norm": 2.7473411560058594, + "learning_rate": 1.945186136071887e-05, + "loss": 0.6786, + "step": 5056 + }, + { + "epoch": 6.491655969191271, + "grad_norm": 3.065098285675049, + "learning_rate": 1.9455712451861364e-05, + "loss": 0.655, + "step": 5057 + }, + { + "epoch": 6.492939666238768, + "grad_norm": 2.9032704830169678, + "learning_rate": 
1.9459563543003852e-05, + "loss": 0.6331, + "step": 5058 + }, + { + "epoch": 6.494223363286264, + "grad_norm": 10.331042289733887, + "learning_rate": 1.9463414634146343e-05, + "loss": 0.6111, + "step": 5059 + }, + { + "epoch": 6.495507060333761, + "grad_norm": 5.061710834503174, + "learning_rate": 1.946726572528883e-05, + "loss": 0.6441, + "step": 5060 + }, + { + "epoch": 6.496790757381258, + "grad_norm": 2.521440029144287, + "learning_rate": 1.9471116816431322e-05, + "loss": 0.763, + "step": 5061 + }, + { + "epoch": 6.498074454428755, + "grad_norm": 2.815645217895508, + "learning_rate": 1.9474967907573813e-05, + "loss": 0.6443, + "step": 5062 + }, + { + "epoch": 6.499358151476252, + "grad_norm": 6.204456806182861, + "learning_rate": 1.9478818998716304e-05, + "loss": 0.6415, + "step": 5063 + }, + { + "epoch": 6.500641848523748, + "grad_norm": 3.0769577026367188, + "learning_rate": 1.9482670089858795e-05, + "loss": 0.6332, + "step": 5064 + }, + { + "epoch": 6.501925545571245, + "grad_norm": 4.096156120300293, + "learning_rate": 1.9486521181001286e-05, + "loss": 0.6597, + "step": 5065 + }, + { + "epoch": 6.503209242618742, + "grad_norm": 3.0003530979156494, + "learning_rate": 1.9490372272143774e-05, + "loss": 0.6902, + "step": 5066 + }, + { + "epoch": 6.504492939666239, + "grad_norm": 3.2781496047973633, + "learning_rate": 1.9494223363286265e-05, + "loss": 0.7096, + "step": 5067 + }, + { + "epoch": 6.505776636713735, + "grad_norm": 3.813278913497925, + "learning_rate": 1.9498074454428752e-05, + "loss": 0.646, + "step": 5068 + }, + { + "epoch": 6.507060333761232, + "grad_norm": 4.227931976318359, + "learning_rate": 1.9501925545571247e-05, + "loss": 0.7359, + "step": 5069 + }, + { + "epoch": 6.508344030808729, + "grad_norm": 2.5771913528442383, + "learning_rate": 1.9505776636713738e-05, + "loss": 0.6856, + "step": 5070 + }, + { + "epoch": 6.509627727856226, + "grad_norm": 3.233279228210449, + "learning_rate": 1.9509627727856226e-05, + "loss": 0.6935, + "step": 5071 + 
}, + { + "epoch": 6.510911424903723, + "grad_norm": 9.766013145446777, + "learning_rate": 1.9513478818998717e-05, + "loss": 0.7164, + "step": 5072 + }, + { + "epoch": 6.512195121951219, + "grad_norm": 5.506912708282471, + "learning_rate": 1.9517329910141208e-05, + "loss": 0.7769, + "step": 5073 + }, + { + "epoch": 6.513478818998716, + "grad_norm": 5.945881366729736, + "learning_rate": 1.9521181001283695e-05, + "loss": 1.0077, + "step": 5074 + }, + { + "epoch": 6.514762516046213, + "grad_norm": 15.191120147705078, + "learning_rate": 1.952503209242619e-05, + "loss": 0.6009, + "step": 5075 + }, + { + "epoch": 6.51604621309371, + "grad_norm": 5.947819232940674, + "learning_rate": 1.952888318356868e-05, + "loss": 0.5695, + "step": 5076 + }, + { + "epoch": 6.517329910141207, + "grad_norm": 5.3926825523376465, + "learning_rate": 1.953273427471117e-05, + "loss": 0.6278, + "step": 5077 + }, + { + "epoch": 6.518613607188703, + "grad_norm": 2.43156099319458, + "learning_rate": 1.953658536585366e-05, + "loss": 0.5899, + "step": 5078 + }, + { + "epoch": 6.5198973042362, + "grad_norm": 7.527767658233643, + "learning_rate": 1.9540436456996147e-05, + "loss": 0.6047, + "step": 5079 + }, + { + "epoch": 6.521181001283697, + "grad_norm": 2.8828542232513428, + "learning_rate": 1.954428754813864e-05, + "loss": 0.6279, + "step": 5080 + }, + { + "epoch": 6.522464698331194, + "grad_norm": 2.7252674102783203, + "learning_rate": 1.954813863928113e-05, + "loss": 0.6618, + "step": 5081 + }, + { + "epoch": 6.523748395378691, + "grad_norm": 13.140124320983887, + "learning_rate": 1.955198973042362e-05, + "loss": 0.5994, + "step": 5082 + }, + { + "epoch": 6.525032092426187, + "grad_norm": 3.7532925605773926, + "learning_rate": 1.9555840821566112e-05, + "loss": 0.5734, + "step": 5083 + }, + { + "epoch": 6.526315789473684, + "grad_norm": 2.156463146209717, + "learning_rate": 1.9559691912708603e-05, + "loss": 0.6163, + "step": 5084 + }, + { + "epoch": 6.527599486521181, + "grad_norm": 
2.6996731758117676, + "learning_rate": 1.956354300385109e-05, + "loss": 0.684, + "step": 5085 + }, + { + "epoch": 6.528883183568678, + "grad_norm": 3.9190316200256348, + "learning_rate": 1.956739409499358e-05, + "loss": 0.6562, + "step": 5086 + }, + { + "epoch": 6.530166880616175, + "grad_norm": 2.1243057250976562, + "learning_rate": 1.957124518613607e-05, + "loss": 0.613, + "step": 5087 + }, + { + "epoch": 6.531450577663671, + "grad_norm": 2.8246049880981445, + "learning_rate": 1.9575096277278564e-05, + "loss": 0.6384, + "step": 5088 + }, + { + "epoch": 6.532734274711168, + "grad_norm": 6.298262596130371, + "learning_rate": 1.9578947368421055e-05, + "loss": 0.5875, + "step": 5089 + }, + { + "epoch": 6.534017971758665, + "grad_norm": 7.636255741119385, + "learning_rate": 1.9582798459563542e-05, + "loss": 0.6107, + "step": 5090 + }, + { + "epoch": 6.535301668806162, + "grad_norm": 3.214010000228882, + "learning_rate": 1.9586649550706033e-05, + "loss": 0.5948, + "step": 5091 + }, + { + "epoch": 6.536585365853659, + "grad_norm": 5.942883014678955, + "learning_rate": 1.9590500641848525e-05, + "loss": 0.5856, + "step": 5092 + }, + { + "epoch": 6.537869062901155, + "grad_norm": 1.7369279861450195, + "learning_rate": 1.9594351732991012e-05, + "loss": 0.6007, + "step": 5093 + }, + { + "epoch": 6.539152759948652, + "grad_norm": 2.6363959312438965, + "learning_rate": 1.9598202824133507e-05, + "loss": 0.7012, + "step": 5094 + }, + { + "epoch": 6.540436456996149, + "grad_norm": 2.2937986850738525, + "learning_rate": 1.9602053915275998e-05, + "loss": 0.6353, + "step": 5095 + }, + { + "epoch": 6.541720154043646, + "grad_norm": 2.4174342155456543, + "learning_rate": 1.9605905006418485e-05, + "loss": 0.6157, + "step": 5096 + }, + { + "epoch": 6.543003851091142, + "grad_norm": 2.4360272884368896, + "learning_rate": 1.9609756097560977e-05, + "loss": 0.629, + "step": 5097 + }, + { + "epoch": 6.544287548138639, + "grad_norm": 11.039366722106934, + "learning_rate": 
1.9613607188703464e-05, + "loss": 0.6329, + "step": 5098 + }, + { + "epoch": 6.545571245186136, + "grad_norm": 2.4984397888183594, + "learning_rate": 1.9617458279845955e-05, + "loss": 0.632, + "step": 5099 + }, + { + "epoch": 6.546854942233633, + "grad_norm": 2.3754026889801025, + "learning_rate": 1.962130937098845e-05, + "loss": 0.6287, + "step": 5100 + }, + { + "epoch": 6.548138639281129, + "grad_norm": 15.47170352935791, + "learning_rate": 1.9625160462130937e-05, + "loss": 0.6353, + "step": 5101 + }, + { + "epoch": 6.549422336328626, + "grad_norm": 4.178474426269531, + "learning_rate": 1.962901155327343e-05, + "loss": 0.5924, + "step": 5102 + }, + { + "epoch": 6.550706033376123, + "grad_norm": 4.09556245803833, + "learning_rate": 1.963286264441592e-05, + "loss": 0.6258, + "step": 5103 + }, + { + "epoch": 6.55198973042362, + "grad_norm": 3.239051580429077, + "learning_rate": 1.9636713735558407e-05, + "loss": 0.6461, + "step": 5104 + }, + { + "epoch": 6.553273427471117, + "grad_norm": 2.47107195854187, + "learning_rate": 1.96405648267009e-05, + "loss": 0.6235, + "step": 5105 + }, + { + "epoch": 6.554557124518613, + "grad_norm": 3.722731590270996, + "learning_rate": 1.964441591784339e-05, + "loss": 0.5994, + "step": 5106 + }, + { + "epoch": 6.55584082156611, + "grad_norm": 2.963383436203003, + "learning_rate": 1.964826700898588e-05, + "loss": 0.578, + "step": 5107 + }, + { + "epoch": 6.557124518613607, + "grad_norm": 4.119647026062012, + "learning_rate": 1.965211810012837e-05, + "loss": 0.653, + "step": 5108 + }, + { + "epoch": 6.558408215661104, + "grad_norm": 5.230415344238281, + "learning_rate": 1.965596919127086e-05, + "loss": 0.63, + "step": 5109 + }, + { + "epoch": 6.559691912708601, + "grad_norm": 4.9626359939575195, + "learning_rate": 1.965982028241335e-05, + "loss": 0.6302, + "step": 5110 + }, + { + "epoch": 6.560975609756097, + "grad_norm": 3.5852229595184326, + "learning_rate": 1.966367137355584e-05, + "loss": 0.6349, + "step": 5111 + }, + { + "epoch": 
6.562259306803594, + "grad_norm": 3.310105800628662, + "learning_rate": 1.9667522464698332e-05, + "loss": 0.6605, + "step": 5112 + }, + { + "epoch": 6.563543003851091, + "grad_norm": 4.450253009796143, + "learning_rate": 1.9671373555840823e-05, + "loss": 0.6966, + "step": 5113 + }, + { + "epoch": 6.564826700898588, + "grad_norm": 2.0033655166625977, + "learning_rate": 1.967522464698331e-05, + "loss": 0.5996, + "step": 5114 + }, + { + "epoch": 6.566110397946085, + "grad_norm": 14.418978691101074, + "learning_rate": 1.9679075738125802e-05, + "loss": 0.6775, + "step": 5115 + }, + { + "epoch": 6.567394094993581, + "grad_norm": 8.394737243652344, + "learning_rate": 1.9682926829268293e-05, + "loss": 0.6956, + "step": 5116 + }, + { + "epoch": 6.568677792041078, + "grad_norm": 4.419279098510742, + "learning_rate": 1.968677792041078e-05, + "loss": 0.6639, + "step": 5117 + }, + { + "epoch": 6.569961489088575, + "grad_norm": 7.766975402832031, + "learning_rate": 1.9690629011553275e-05, + "loss": 0.7292, + "step": 5118 + }, + { + "epoch": 6.571245186136072, + "grad_norm": 5.226737976074219, + "learning_rate": 1.9694480102695767e-05, + "loss": 0.6831, + "step": 5119 + }, + { + "epoch": 6.572528883183569, + "grad_norm": 3.6269021034240723, + "learning_rate": 1.9698331193838254e-05, + "loss": 0.759, + "step": 5120 + }, + { + "epoch": 6.573812580231065, + "grad_norm": 3.333017349243164, + "learning_rate": 1.9702182284980745e-05, + "loss": 0.75, + "step": 5121 + }, + { + "epoch": 6.575096277278562, + "grad_norm": 4.0777153968811035, + "learning_rate": 1.9706033376123236e-05, + "loss": 0.7885, + "step": 5122 + }, + { + "epoch": 6.576379974326059, + "grad_norm": 5.388266563415527, + "learning_rate": 1.9709884467265724e-05, + "loss": 0.8179, + "step": 5123 + }, + { + "epoch": 6.577663671373556, + "grad_norm": 3.483943462371826, + "learning_rate": 1.971373555840822e-05, + "loss": 0.9019, + "step": 5124 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 1.656564712524414, + 
"learning_rate": 1.9717586649550706e-05, + "loss": 0.5926, + "step": 5125 + }, + { + "epoch": 6.580231065468549, + "grad_norm": 3.9596211910247803, + "learning_rate": 1.9721437740693197e-05, + "loss": 0.6328, + "step": 5126 + }, + { + "epoch": 6.581514762516046, + "grad_norm": 2.592365026473999, + "learning_rate": 1.9725288831835688e-05, + "loss": 0.6217, + "step": 5127 + }, + { + "epoch": 6.582798459563543, + "grad_norm": 2.511162757873535, + "learning_rate": 1.9729139922978176e-05, + "loss": 0.6165, + "step": 5128 + }, + { + "epoch": 6.58408215661104, + "grad_norm": 2.030179977416992, + "learning_rate": 1.9732991014120667e-05, + "loss": 0.6091, + "step": 5129 + }, + { + "epoch": 6.585365853658536, + "grad_norm": 3.0206081867218018, + "learning_rate": 1.9736842105263158e-05, + "loss": 0.5511, + "step": 5130 + }, + { + "epoch": 6.586649550706033, + "grad_norm": 2.6357126235961914, + "learning_rate": 1.974069319640565e-05, + "loss": 0.5962, + "step": 5131 + }, + { + "epoch": 6.58793324775353, + "grad_norm": 2.213209629058838, + "learning_rate": 1.974454428754814e-05, + "loss": 0.6077, + "step": 5132 + }, + { + "epoch": 6.589216944801027, + "grad_norm": 2.704979419708252, + "learning_rate": 1.9748395378690628e-05, + "loss": 0.5923, + "step": 5133 + }, + { + "epoch": 6.590500641848524, + "grad_norm": 3.7963345050811768, + "learning_rate": 1.975224646983312e-05, + "loss": 0.6157, + "step": 5134 + }, + { + "epoch": 6.59178433889602, + "grad_norm": 3.6157188415527344, + "learning_rate": 1.975609756097561e-05, + "loss": 0.6207, + "step": 5135 + }, + { + "epoch": 6.593068035943517, + "grad_norm": 2.698970317840576, + "learning_rate": 1.9759948652118098e-05, + "loss": 0.6222, + "step": 5136 + }, + { + "epoch": 6.594351732991014, + "grad_norm": 23.686105728149414, + "learning_rate": 1.9763799743260592e-05, + "loss": 0.5667, + "step": 5137 + }, + { + "epoch": 6.595635430038511, + "grad_norm": 1.8845969438552856, + "learning_rate": 1.9767650834403083e-05, + "loss": 0.6151, + 
"step": 5138 + }, + { + "epoch": 6.596919127086007, + "grad_norm": 3.0037317276000977, + "learning_rate": 1.977150192554557e-05, + "loss": 0.5547, + "step": 5139 + }, + { + "epoch": 6.598202824133504, + "grad_norm": 4.422155857086182, + "learning_rate": 1.9775353016688062e-05, + "loss": 0.6023, + "step": 5140 + }, + { + "epoch": 6.599486521181001, + "grad_norm": 2.67535400390625, + "learning_rate": 1.9779204107830553e-05, + "loss": 0.6058, + "step": 5141 + }, + { + "epoch": 6.600770218228498, + "grad_norm": 3.3052797317504883, + "learning_rate": 1.978305519897304e-05, + "loss": 0.5997, + "step": 5142 + }, + { + "epoch": 6.602053915275995, + "grad_norm": 1.4101612567901611, + "learning_rate": 1.9786906290115535e-05, + "loss": 0.5758, + "step": 5143 + }, + { + "epoch": 6.603337612323491, + "grad_norm": 3.0743680000305176, + "learning_rate": 1.9790757381258023e-05, + "loss": 0.5599, + "step": 5144 + }, + { + "epoch": 6.604621309370988, + "grad_norm": 3.2354841232299805, + "learning_rate": 1.9794608472400514e-05, + "loss": 0.5805, + "step": 5145 + }, + { + "epoch": 6.605905006418485, + "grad_norm": 4.063085556030273, + "learning_rate": 1.9798459563543005e-05, + "loss": 0.6217, + "step": 5146 + }, + { + "epoch": 6.607188703465982, + "grad_norm": 2.5837230682373047, + "learning_rate": 1.9802310654685493e-05, + "loss": 0.5663, + "step": 5147 + }, + { + "epoch": 6.608472400513479, + "grad_norm": 5.648983478546143, + "learning_rate": 1.9806161745827984e-05, + "loss": 0.6197, + "step": 5148 + }, + { + "epoch": 6.609756097560975, + "grad_norm": 2.780802011489868, + "learning_rate": 1.9810012836970478e-05, + "loss": 0.6233, + "step": 5149 + }, + { + "epoch": 6.611039794608472, + "grad_norm": 2.4066083431243896, + "learning_rate": 1.9813863928112966e-05, + "loss": 0.6278, + "step": 5150 + }, + { + "epoch": 6.612323491655969, + "grad_norm": 2.3797478675842285, + "learning_rate": 1.9817715019255457e-05, + "loss": 0.6357, + "step": 5151 + }, + { + "epoch": 6.613607188703466, + 
"grad_norm": 2.3359501361846924, + "learning_rate": 1.9821566110397945e-05, + "loss": 0.6341, + "step": 5152 + }, + { + "epoch": 6.614890885750963, + "grad_norm": 3.903362989425659, + "learning_rate": 1.9825417201540436e-05, + "loss": 0.6109, + "step": 5153 + }, + { + "epoch": 6.616174582798459, + "grad_norm": 6.713561534881592, + "learning_rate": 1.9829268292682927e-05, + "loss": 0.5877, + "step": 5154 + }, + { + "epoch": 6.617458279845956, + "grad_norm": 2.8721420764923096, + "learning_rate": 1.9833119383825418e-05, + "loss": 0.6174, + "step": 5155 + }, + { + "epoch": 6.618741976893453, + "grad_norm": 2.4373977184295654, + "learning_rate": 1.983697047496791e-05, + "loss": 0.6083, + "step": 5156 + }, + { + "epoch": 6.62002567394095, + "grad_norm": 2.181081533432007, + "learning_rate": 1.98408215661104e-05, + "loss": 0.6111, + "step": 5157 + }, + { + "epoch": 6.621309370988447, + "grad_norm": 3.889352560043335, + "learning_rate": 1.9844672657252888e-05, + "loss": 0.668, + "step": 5158 + }, + { + "epoch": 6.622593068035943, + "grad_norm": 3.2816178798675537, + "learning_rate": 1.984852374839538e-05, + "loss": 0.6559, + "step": 5159 + }, + { + "epoch": 6.62387676508344, + "grad_norm": 5.458675384521484, + "learning_rate": 1.9852374839537867e-05, + "loss": 0.6954, + "step": 5160 + }, + { + "epoch": 6.625160462130937, + "grad_norm": 2.8922150135040283, + "learning_rate": 1.985622593068036e-05, + "loss": 0.6442, + "step": 5161 + }, + { + "epoch": 6.626444159178434, + "grad_norm": 4.644096374511719, + "learning_rate": 1.9860077021822852e-05, + "loss": 0.6853, + "step": 5162 + }, + { + "epoch": 6.62772785622593, + "grad_norm": 2.758763074874878, + "learning_rate": 1.986392811296534e-05, + "loss": 0.6824, + "step": 5163 + }, + { + "epoch": 6.629011553273427, + "grad_norm": 2.0356650352478027, + "learning_rate": 1.986777920410783e-05, + "loss": 0.5758, + "step": 5164 + }, + { + "epoch": 6.630295250320924, + "grad_norm": 1.925083041191101, + "learning_rate": 
1.9871630295250322e-05, + "loss": 0.7015, + "step": 5165 + }, + { + "epoch": 6.631578947368421, + "grad_norm": 1.6401667594909668, + "learning_rate": 1.987548138639281e-05, + "loss": 0.7222, + "step": 5166 + }, + { + "epoch": 6.632862644415918, + "grad_norm": 8.646139144897461, + "learning_rate": 1.9879332477535304e-05, + "loss": 0.6891, + "step": 5167 + }, + { + "epoch": 6.634146341463414, + "grad_norm": 2.8169898986816406, + "learning_rate": 1.9883183568677795e-05, + "loss": 0.6685, + "step": 5168 + }, + { + "epoch": 6.635430038510911, + "grad_norm": 2.851240396499634, + "learning_rate": 1.9887034659820283e-05, + "loss": 0.7265, + "step": 5169 + }, + { + "epoch": 6.636713735558408, + "grad_norm": 3.7300283908843994, + "learning_rate": 1.9890885750962774e-05, + "loss": 0.7107, + "step": 5170 + }, + { + "epoch": 6.637997432605905, + "grad_norm": 1.5346428155899048, + "learning_rate": 1.989473684210526e-05, + "loss": 0.7886, + "step": 5171 + }, + { + "epoch": 6.639281129653401, + "grad_norm": 1.8048051595687866, + "learning_rate": 1.9898587933247753e-05, + "loss": 0.7697, + "step": 5172 + }, + { + "epoch": 6.640564826700898, + "grad_norm": 1.8952772617340088, + "learning_rate": 1.9902439024390247e-05, + "loss": 0.8059, + "step": 5173 + }, + { + "epoch": 6.641848523748395, + "grad_norm": 6.91889762878418, + "learning_rate": 1.9906290115532735e-05, + "loss": 0.8992, + "step": 5174 + }, + { + "epoch": 6.643132220795892, + "grad_norm": 2.4935643672943115, + "learning_rate": 1.9910141206675226e-05, + "loss": 0.6393, + "step": 5175 + }, + { + "epoch": 6.644415917843389, + "grad_norm": 3.2076985836029053, + "learning_rate": 1.9913992297817717e-05, + "loss": 0.628, + "step": 5176 + }, + { + "epoch": 6.645699614890885, + "grad_norm": 1.9703295230865479, + "learning_rate": 1.9917843388960205e-05, + "loss": 0.5917, + "step": 5177 + }, + { + "epoch": 6.646983311938382, + "grad_norm": 3.2596659660339355, + "learning_rate": 1.9921694480102696e-05, + "loss": 0.6074, + "step": 5178 
+ }, + { + "epoch": 6.648267008985879, + "grad_norm": 31.3284854888916, + "learning_rate": 1.9925545571245187e-05, + "loss": 0.587, + "step": 5179 + }, + { + "epoch": 6.649550706033376, + "grad_norm": 2.2144904136657715, + "learning_rate": 1.9929396662387678e-05, + "loss": 0.6007, + "step": 5180 + }, + { + "epoch": 6.650834403080873, + "grad_norm": 1.9912430047988892, + "learning_rate": 1.993324775353017e-05, + "loss": 0.5946, + "step": 5181 + }, + { + "epoch": 6.652118100128369, + "grad_norm": 1.91506028175354, + "learning_rate": 1.9937098844672657e-05, + "loss": 0.5692, + "step": 5182 + }, + { + "epoch": 6.653401797175866, + "grad_norm": 3.8810482025146484, + "learning_rate": 1.9940949935815148e-05, + "loss": 0.6083, + "step": 5183 + }, + { + "epoch": 6.654685494223363, + "grad_norm": 6.862310409545898, + "learning_rate": 1.994480102695764e-05, + "loss": 0.591, + "step": 5184 + }, + { + "epoch": 6.65596919127086, + "grad_norm": 1.793291687965393, + "learning_rate": 1.9948652118100126e-05, + "loss": 0.5787, + "step": 5185 + }, + { + "epoch": 6.657252888318357, + "grad_norm": 1.566235899925232, + "learning_rate": 1.995250320924262e-05, + "loss": 0.5988, + "step": 5186 + }, + { + "epoch": 6.658536585365853, + "grad_norm": 3.4507603645324707, + "learning_rate": 1.9956354300385112e-05, + "loss": 0.6148, + "step": 5187 + }, + { + "epoch": 6.65982028241335, + "grad_norm": 2.2324395179748535, + "learning_rate": 1.99602053915276e-05, + "loss": 0.5972, + "step": 5188 + }, + { + "epoch": 6.661103979460847, + "grad_norm": 2.2803683280944824, + "learning_rate": 1.996405648267009e-05, + "loss": 0.638, + "step": 5189 + }, + { + "epoch": 6.662387676508344, + "grad_norm": 2.5750739574432373, + "learning_rate": 1.996790757381258e-05, + "loss": 0.6203, + "step": 5190 + }, + { + "epoch": 6.663671373555841, + "grad_norm": 1.3446725606918335, + "learning_rate": 1.997175866495507e-05, + "loss": 0.575, + "step": 5191 + }, + { + "epoch": 6.664955070603337, + "grad_norm": 
3.164612293243408, + "learning_rate": 1.9975609756097564e-05, + "loss": 0.6346, + "step": 5192 + }, + { + "epoch": 6.666238767650834, + "grad_norm": 1.3605201244354248, + "learning_rate": 1.997946084724005e-05, + "loss": 0.6035, + "step": 5193 + }, + { + "epoch": 6.667522464698331, + "grad_norm": 1.8273344039916992, + "learning_rate": 1.9983311938382543e-05, + "loss": 0.6327, + "step": 5194 + }, + { + "epoch": 6.668806161745828, + "grad_norm": 3.7890541553497314, + "learning_rate": 1.9987163029525034e-05, + "loss": 0.587, + "step": 5195 + }, + { + "epoch": 6.670089858793324, + "grad_norm": 4.602201461791992, + "learning_rate": 1.999101412066752e-05, + "loss": 0.6048, + "step": 5196 + }, + { + "epoch": 6.671373555840821, + "grad_norm": 2.3685481548309326, + "learning_rate": 1.9994865211810012e-05, + "loss": 0.6248, + "step": 5197 + }, + { + "epoch": 6.672657252888318, + "grad_norm": 5.004748821258545, + "learning_rate": 1.9998716302952503e-05, + "loss": 0.6592, + "step": 5198 + }, + { + "epoch": 6.673940949935815, + "grad_norm": 1.932856559753418, + "learning_rate": 2.0002567394094995e-05, + "loss": 0.6659, + "step": 5199 + }, + { + "epoch": 6.675224646983312, + "grad_norm": 2.904660940170288, + "learning_rate": 2.0006418485237486e-05, + "loss": 0.6295, + "step": 5200 + }, + { + "epoch": 6.676508344030808, + "grad_norm": 3.837172508239746, + "learning_rate": 2.0010269576379973e-05, + "loss": 0.6042, + "step": 5201 + }, + { + "epoch": 6.677792041078305, + "grad_norm": 2.6973860263824463, + "learning_rate": 2.0014120667522464e-05, + "loss": 0.6658, + "step": 5202 + }, + { + "epoch": 6.679075738125802, + "grad_norm": 2.6544764041900635, + "learning_rate": 2.0017971758664955e-05, + "loss": 0.6821, + "step": 5203 + }, + { + "epoch": 6.680359435173299, + "grad_norm": 2.8627169132232666, + "learning_rate": 2.0021822849807447e-05, + "loss": 0.6193, + "step": 5204 + }, + { + "epoch": 6.681643132220795, + "grad_norm": 3.6807467937469482, + "learning_rate": 
2.0025673940949938e-05, + "loss": 0.7018, + "step": 5205 + }, + { + "epoch": 6.682926829268292, + "grad_norm": 2.0515525341033936, + "learning_rate": 2.002952503209243e-05, + "loss": 0.6684, + "step": 5206 + }, + { + "epoch": 6.684210526315789, + "grad_norm": 2.2237558364868164, + "learning_rate": 2.0033376123234916e-05, + "loss": 0.6414, + "step": 5207 + }, + { + "epoch": 6.685494223363286, + "grad_norm": 2.594024181365967, + "learning_rate": 2.0037227214377407e-05, + "loss": 0.6556, + "step": 5208 + }, + { + "epoch": 6.686777920410783, + "grad_norm": 15.678549766540527, + "learning_rate": 2.0041078305519895e-05, + "loss": 0.6694, + "step": 5209 + }, + { + "epoch": 6.688061617458279, + "grad_norm": 3.0090274810791016, + "learning_rate": 2.004492939666239e-05, + "loss": 0.6224, + "step": 5210 + }, + { + "epoch": 6.689345314505776, + "grad_norm": 2.486276388168335, + "learning_rate": 2.004878048780488e-05, + "loss": 0.609, + "step": 5211 + }, + { + "epoch": 6.690629011553273, + "grad_norm": 3.0671637058258057, + "learning_rate": 2.0052631578947368e-05, + "loss": 0.6321, + "step": 5212 + }, + { + "epoch": 6.69191270860077, + "grad_norm": 5.090473175048828, + "learning_rate": 2.005648267008986e-05, + "loss": 0.6593, + "step": 5213 + }, + { + "epoch": 6.693196405648267, + "grad_norm": 7.203182220458984, + "learning_rate": 2.006033376123235e-05, + "loss": 0.6835, + "step": 5214 + }, + { + "epoch": 6.694480102695763, + "grad_norm": 8.873734474182129, + "learning_rate": 2.0064184852374838e-05, + "loss": 0.6756, + "step": 5215 + }, + { + "epoch": 6.69576379974326, + "grad_norm": 3.953580141067505, + "learning_rate": 2.0068035943517333e-05, + "loss": 0.6573, + "step": 5216 + }, + { + "epoch": 6.697047496790757, + "grad_norm": 2.0636370182037354, + "learning_rate": 2.007188703465982e-05, + "loss": 0.6941, + "step": 5217 + }, + { + "epoch": 6.698331193838254, + "grad_norm": 1.7662370204925537, + "learning_rate": 2.007573812580231e-05, + "loss": 0.6802, + "step": 5218 + }, + { 
+ "epoch": 6.699614890885751, + "grad_norm": 3.9022324085235596, + "learning_rate": 2.0079589216944802e-05, + "loss": 0.7292, + "step": 5219 + }, + { + "epoch": 6.700898587933247, + "grad_norm": 1.890702486038208, + "learning_rate": 2.008344030808729e-05, + "loss": 0.6888, + "step": 5220 + }, + { + "epoch": 6.702182284980744, + "grad_norm": 1.9601335525512695, + "learning_rate": 2.008729139922978e-05, + "loss": 0.6621, + "step": 5221 + }, + { + "epoch": 6.703465982028241, + "grad_norm": 6.979122161865234, + "learning_rate": 2.0091142490372276e-05, + "loss": 0.8048, + "step": 5222 + }, + { + "epoch": 6.704749679075738, + "grad_norm": 4.614778995513916, + "learning_rate": 2.0094993581514763e-05, + "loss": 0.7804, + "step": 5223 + }, + { + "epoch": 6.706033376123235, + "grad_norm": 3.2895493507385254, + "learning_rate": 2.0098844672657254e-05, + "loss": 0.8556, + "step": 5224 + }, + { + "epoch": 6.7073170731707314, + "grad_norm": 2.198467493057251, + "learning_rate": 2.0102695763799742e-05, + "loss": 0.5969, + "step": 5225 + }, + { + "epoch": 6.708600770218228, + "grad_norm": 1.571539044380188, + "learning_rate": 2.0106546854942233e-05, + "loss": 0.5957, + "step": 5226 + }, + { + "epoch": 6.709884467265725, + "grad_norm": 4.053508758544922, + "learning_rate": 2.0110397946084724e-05, + "loss": 0.6272, + "step": 5227 + }, + { + "epoch": 6.711168164313222, + "grad_norm": 1.6774996519088745, + "learning_rate": 2.0114249037227215e-05, + "loss": 0.5784, + "step": 5228 + }, + { + "epoch": 6.712451861360719, + "grad_norm": 1.5397371053695679, + "learning_rate": 2.0118100128369706e-05, + "loss": 0.62, + "step": 5229 + }, + { + "epoch": 6.7137355584082155, + "grad_norm": 5.887795448303223, + "learning_rate": 2.0121951219512197e-05, + "loss": 0.6432, + "step": 5230 + }, + { + "epoch": 6.715019255455712, + "grad_norm": 1.558815598487854, + "learning_rate": 2.0125802310654685e-05, + "loss": 0.5898, + "step": 5231 + }, + { + "epoch": 6.716302952503209, + "grad_norm": 
1.4546338319778442, + "learning_rate": 2.0129653401797176e-05, + "loss": 0.5646, + "step": 5232 + }, + { + "epoch": 6.717586649550706, + "grad_norm": 5.129533767700195, + "learning_rate": 2.0133504492939667e-05, + "loss": 0.6107, + "step": 5233 + }, + { + "epoch": 6.7188703465982025, + "grad_norm": 2.0559914112091064, + "learning_rate": 2.0137355584082155e-05, + "loss": 0.6257, + "step": 5234 + }, + { + "epoch": 6.7201540436456995, + "grad_norm": 4.706529140472412, + "learning_rate": 2.014120667522465e-05, + "loss": 0.5875, + "step": 5235 + }, + { + "epoch": 6.721437740693196, + "grad_norm": 1.8905051946640015, + "learning_rate": 2.0145057766367137e-05, + "loss": 0.6123, + "step": 5236 + }, + { + "epoch": 6.722721437740693, + "grad_norm": 2.7304511070251465, + "learning_rate": 2.0148908857509628e-05, + "loss": 0.5624, + "step": 5237 + }, + { + "epoch": 6.7240051347881895, + "grad_norm": 2.5913562774658203, + "learning_rate": 2.015275994865212e-05, + "loss": 0.6428, + "step": 5238 + }, + { + "epoch": 6.7252888318356865, + "grad_norm": 2.3494253158569336, + "learning_rate": 2.0156611039794607e-05, + "loss": 0.5438, + "step": 5239 + }, + { + "epoch": 6.7265725288831835, + "grad_norm": 5.061111927032471, + "learning_rate": 2.0160462130937098e-05, + "loss": 0.6095, + "step": 5240 + }, + { + "epoch": 6.7278562259306804, + "grad_norm": 2.623396396636963, + "learning_rate": 2.0164313222079592e-05, + "loss": 0.6283, + "step": 5241 + }, + { + "epoch": 6.729139922978177, + "grad_norm": 1.8087037801742554, + "learning_rate": 2.016816431322208e-05, + "loss": 0.6088, + "step": 5242 + }, + { + "epoch": 6.7304236200256735, + "grad_norm": 1.744834065437317, + "learning_rate": 2.017201540436457e-05, + "loss": 0.627, + "step": 5243 + }, + { + "epoch": 6.7317073170731705, + "grad_norm": 3.062586784362793, + "learning_rate": 2.017586649550706e-05, + "loss": 0.5896, + "step": 5244 + }, + { + "epoch": 6.7329910141206675, + "grad_norm": 3.7027790546417236, + "learning_rate": 
2.017971758664955e-05, + "loss": 0.6744, + "step": 5245 + }, + { + "epoch": 6.7342747111681645, + "grad_norm": 2.0974555015563965, + "learning_rate": 2.018356867779204e-05, + "loss": 0.6138, + "step": 5246 + }, + { + "epoch": 6.735558408215661, + "grad_norm": 3.850707530975342, + "learning_rate": 2.0187419768934532e-05, + "loss": 0.6302, + "step": 5247 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 3.3302741050720215, + "learning_rate": 2.0191270860077023e-05, + "loss": 0.5633, + "step": 5248 + }, + { + "epoch": 6.7381258023106545, + "grad_norm": 1.5403571128845215, + "learning_rate": 2.0195121951219514e-05, + "loss": 0.6653, + "step": 5249 + }, + { + "epoch": 6.7394094993581515, + "grad_norm": 3.566045045852661, + "learning_rate": 2.0198973042362002e-05, + "loss": 0.6204, + "step": 5250 + }, + { + "epoch": 6.7406931964056485, + "grad_norm": 2.2035017013549805, + "learning_rate": 2.0202824133504493e-05, + "loss": 0.5798, + "step": 5251 + }, + { + "epoch": 6.741976893453145, + "grad_norm": 2.720479726791382, + "learning_rate": 2.0206675224646984e-05, + "loss": 0.5562, + "step": 5252 + }, + { + "epoch": 6.7432605905006415, + "grad_norm": 3.108750820159912, + "learning_rate": 2.0210526315789475e-05, + "loss": 0.5883, + "step": 5253 + }, + { + "epoch": 6.7445442875481385, + "grad_norm": 4.2905073165893555, + "learning_rate": 2.0214377406931966e-05, + "loss": 0.648, + "step": 5254 + }, + { + "epoch": 6.7458279845956355, + "grad_norm": 3.623642683029175, + "learning_rate": 2.0218228498074454e-05, + "loss": 0.6391, + "step": 5255 + }, + { + "epoch": 6.7471116816431325, + "grad_norm": 1.60041344165802, + "learning_rate": 2.0222079589216945e-05, + "loss": 0.6069, + "step": 5256 + }, + { + "epoch": 6.748395378690629, + "grad_norm": 2.4632065296173096, + "learning_rate": 2.0225930680359436e-05, + "loss": 0.6593, + "step": 5257 + }, + { + "epoch": 6.7496790757381255, + "grad_norm": 12.612724304199219, + "learning_rate": 2.0229781771501924e-05, + "loss": 0.6778, + 
"step": 5258 + }, + { + "epoch": 6.7509627727856225, + "grad_norm": 8.7681303024292, + "learning_rate": 2.0233632862644418e-05, + "loss": 0.6436, + "step": 5259 + }, + { + "epoch": 6.7522464698331195, + "grad_norm": 3.2643239498138428, + "learning_rate": 2.023748395378691e-05, + "loss": 0.6732, + "step": 5260 + }, + { + "epoch": 6.7535301668806165, + "grad_norm": 2.7247536182403564, + "learning_rate": 2.0241335044929397e-05, + "loss": 0.634, + "step": 5261 + }, + { + "epoch": 6.7548138639281134, + "grad_norm": 2.9768893718719482, + "learning_rate": 2.0245186136071888e-05, + "loss": 0.6412, + "step": 5262 + }, + { + "epoch": 6.7560975609756095, + "grad_norm": 4.478882789611816, + "learning_rate": 2.0249037227214376e-05, + "loss": 0.6567, + "step": 5263 + }, + { + "epoch": 6.7573812580231065, + "grad_norm": 4.290363311767578, + "learning_rate": 2.0252888318356867e-05, + "loss": 0.6301, + "step": 5264 + }, + { + "epoch": 6.7586649550706035, + "grad_norm": 5.543417453765869, + "learning_rate": 2.025673940949936e-05, + "loss": 0.7369, + "step": 5265 + }, + { + "epoch": 6.7599486521181005, + "grad_norm": 4.874026298522949, + "learning_rate": 2.026059050064185e-05, + "loss": 0.6301, + "step": 5266 + }, + { + "epoch": 6.761232349165597, + "grad_norm": 5.531477451324463, + "learning_rate": 2.026444159178434e-05, + "loss": 0.6747, + "step": 5267 + }, + { + "epoch": 6.7625160462130935, + "grad_norm": 4.59235143661499, + "learning_rate": 2.026829268292683e-05, + "loss": 0.6543, + "step": 5268 + }, + { + "epoch": 6.7637997432605905, + "grad_norm": 2.981031656265259, + "learning_rate": 2.027214377406932e-05, + "loss": 0.7213, + "step": 5269 + }, + { + "epoch": 6.7650834403080875, + "grad_norm": 2.6968719959259033, + "learning_rate": 2.027599486521181e-05, + "loss": 0.7196, + "step": 5270 + }, + { + "epoch": 6.766367137355584, + "grad_norm": 2.877741575241089, + "learning_rate": 2.02798459563543e-05, + "loss": 0.7778, + "step": 5271 + }, + { + "epoch": 6.767650834403081, + 
"grad_norm": 2.6630520820617676, + "learning_rate": 2.0283697047496792e-05, + "loss": 0.6445, + "step": 5272 + }, + { + "epoch": 6.7689345314505776, + "grad_norm": 3.2792632579803467, + "learning_rate": 2.0287548138639283e-05, + "loss": 0.8197, + "step": 5273 + }, + { + "epoch": 6.7702182284980745, + "grad_norm": 2.7567198276519775, + "learning_rate": 2.029139922978177e-05, + "loss": 0.9346, + "step": 5274 + }, + { + "epoch": 6.7715019255455715, + "grad_norm": 2.5596184730529785, + "learning_rate": 2.0295250320924262e-05, + "loss": 0.6148, + "step": 5275 + }, + { + "epoch": 6.772785622593068, + "grad_norm": 5.9510178565979, + "learning_rate": 2.0299101412066753e-05, + "loss": 0.5818, + "step": 5276 + }, + { + "epoch": 6.774069319640565, + "grad_norm": 4.42221736907959, + "learning_rate": 2.0302952503209244e-05, + "loss": 0.5949, + "step": 5277 + }, + { + "epoch": 6.775353016688062, + "grad_norm": 2.449709415435791, + "learning_rate": 2.0306803594351735e-05, + "loss": 0.6454, + "step": 5278 + }, + { + "epoch": 6.7766367137355585, + "grad_norm": 2.9515433311462402, + "learning_rate": 2.0310654685494226e-05, + "loss": 0.6204, + "step": 5279 + }, + { + "epoch": 6.7779204107830555, + "grad_norm": 3.195063352584839, + "learning_rate": 2.0314505776636714e-05, + "loss": 0.6578, + "step": 5280 + }, + { + "epoch": 6.779204107830552, + "grad_norm": 2.471693277359009, + "learning_rate": 2.0318356867779205e-05, + "loss": 0.6446, + "step": 5281 + }, + { + "epoch": 6.780487804878049, + "grad_norm": 1.350743055343628, + "learning_rate": 2.0322207958921692e-05, + "loss": 0.5977, + "step": 5282 + }, + { + "epoch": 6.781771501925546, + "grad_norm": 1.2057892084121704, + "learning_rate": 2.0326059050064187e-05, + "loss": 0.6282, + "step": 5283 + }, + { + "epoch": 6.7830551989730425, + "grad_norm": 1.9129750728607178, + "learning_rate": 2.0329910141206678e-05, + "loss": 0.6447, + "step": 5284 + }, + { + "epoch": 6.7843388960205395, + "grad_norm": 1.4530320167541504, + "learning_rate": 
2.0333761232349166e-05, + "loss": 0.5723, + "step": 5285 + }, + { + "epoch": 6.785622593068036, + "grad_norm": 8.055242538452148, + "learning_rate": 2.0337612323491657e-05, + "loss": 0.6293, + "step": 5286 + }, + { + "epoch": 6.786906290115533, + "grad_norm": 2.325076103210449, + "learning_rate": 2.0341463414634148e-05, + "loss": 0.5746, + "step": 5287 + }, + { + "epoch": 6.78818998716303, + "grad_norm": 5.468558311462402, + "learning_rate": 2.0345314505776635e-05, + "loss": 0.6, + "step": 5288 + }, + { + "epoch": 6.7894736842105265, + "grad_norm": 6.090128421783447, + "learning_rate": 2.0349165596919127e-05, + "loss": 0.6718, + "step": 5289 + }, + { + "epoch": 6.7907573812580235, + "grad_norm": 2.702422618865967, + "learning_rate": 2.0353016688061618e-05, + "loss": 0.62, + "step": 5290 + }, + { + "epoch": 6.79204107830552, + "grad_norm": 1.6155058145523071, + "learning_rate": 2.035686777920411e-05, + "loss": 0.5995, + "step": 5291 + }, + { + "epoch": 6.793324775353017, + "grad_norm": 1.928475022315979, + "learning_rate": 2.03607188703466e-05, + "loss": 0.5703, + "step": 5292 + }, + { + "epoch": 6.794608472400514, + "grad_norm": 2.4246506690979004, + "learning_rate": 2.0364569961489087e-05, + "loss": 0.586, + "step": 5293 + }, + { + "epoch": 6.7958921694480106, + "grad_norm": 3.626568078994751, + "learning_rate": 2.036842105263158e-05, + "loss": 0.5778, + "step": 5294 + }, + { + "epoch": 6.7971758664955075, + "grad_norm": 4.926537990570068, + "learning_rate": 2.037227214377407e-05, + "loss": 0.6115, + "step": 5295 + }, + { + "epoch": 6.798459563543004, + "grad_norm": 2.1761250495910645, + "learning_rate": 2.037612323491656e-05, + "loss": 0.5877, + "step": 5296 + }, + { + "epoch": 6.799743260590501, + "grad_norm": 3.492673873901367, + "learning_rate": 2.037997432605905e-05, + "loss": 0.6182, + "step": 5297 + }, + { + "epoch": 6.801026957637998, + "grad_norm": 1.969265341758728, + "learning_rate": 2.0383825417201543e-05, + "loss": 0.5805, + "step": 5298 + }, + { + 
"epoch": 6.802310654685495, + "grad_norm": 3.517812728881836, + "learning_rate": 2.038767650834403e-05, + "loss": 0.6037, + "step": 5299 + }, + { + "epoch": 6.803594351732991, + "grad_norm": 2.2799837589263916, + "learning_rate": 2.039152759948652e-05, + "loss": 0.5828, + "step": 5300 + }, + { + "epoch": 6.804878048780488, + "grad_norm": 2.456418752670288, + "learning_rate": 2.039537869062901e-05, + "loss": 0.6107, + "step": 5301 + }, + { + "epoch": 6.806161745827985, + "grad_norm": 3.7949631214141846, + "learning_rate": 2.0399229781771504e-05, + "loss": 0.6251, + "step": 5302 + }, + { + "epoch": 6.807445442875482, + "grad_norm": 4.828501224517822, + "learning_rate": 2.0403080872913995e-05, + "loss": 0.6343, + "step": 5303 + }, + { + "epoch": 6.808729139922978, + "grad_norm": 2.75652813911438, + "learning_rate": 2.0406931964056482e-05, + "loss": 0.7314, + "step": 5304 + }, + { + "epoch": 6.810012836970475, + "grad_norm": 3.425234794616699, + "learning_rate": 2.0410783055198973e-05, + "loss": 0.6253, + "step": 5305 + }, + { + "epoch": 6.811296534017972, + "grad_norm": 4.1986799240112305, + "learning_rate": 2.0414634146341465e-05, + "loss": 0.6702, + "step": 5306 + }, + { + "epoch": 6.812580231065469, + "grad_norm": 2.334785223007202, + "learning_rate": 2.0418485237483952e-05, + "loss": 0.6774, + "step": 5307 + }, + { + "epoch": 6.813863928112966, + "grad_norm": 8.461898803710938, + "learning_rate": 2.0422336328626447e-05, + "loss": 0.6973, + "step": 5308 + }, + { + "epoch": 6.815147625160462, + "grad_norm": 5.359068870544434, + "learning_rate": 2.0426187419768934e-05, + "loss": 0.6671, + "step": 5309 + }, + { + "epoch": 6.816431322207959, + "grad_norm": 3.340909242630005, + "learning_rate": 2.0430038510911425e-05, + "loss": 0.6164, + "step": 5310 + }, + { + "epoch": 6.817715019255456, + "grad_norm": 3.0657248497009277, + "learning_rate": 2.0433889602053917e-05, + "loss": 0.6502, + "step": 5311 + }, + { + "epoch": 6.818998716302953, + "grad_norm": 2.4099299907684326, 
+ "learning_rate": 2.0437740693196404e-05, + "loss": 0.6373, + "step": 5312 + }, + { + "epoch": 6.82028241335045, + "grad_norm": 2.184587001800537, + "learning_rate": 2.0441591784338895e-05, + "loss": 0.6696, + "step": 5313 + }, + { + "epoch": 6.821566110397946, + "grad_norm": 2.499552011489868, + "learning_rate": 2.044544287548139e-05, + "loss": 0.6147, + "step": 5314 + }, + { + "epoch": 6.822849807445443, + "grad_norm": 3.2620184421539307, + "learning_rate": 2.0449293966623877e-05, + "loss": 0.6643, + "step": 5315 + }, + { + "epoch": 6.82413350449294, + "grad_norm": 1.9280459880828857, + "learning_rate": 2.045314505776637e-05, + "loss": 0.7201, + "step": 5316 + }, + { + "epoch": 6.825417201540437, + "grad_norm": 3.7425012588500977, + "learning_rate": 2.0456996148908856e-05, + "loss": 0.7199, + "step": 5317 + }, + { + "epoch": 6.826700898587934, + "grad_norm": 3.234793186187744, + "learning_rate": 2.0460847240051347e-05, + "loss": 0.6445, + "step": 5318 + }, + { + "epoch": 6.82798459563543, + "grad_norm": 2.54042387008667, + "learning_rate": 2.0464698331193838e-05, + "loss": 0.7215, + "step": 5319 + }, + { + "epoch": 6.829268292682927, + "grad_norm": 2.190727710723877, + "learning_rate": 2.046854942233633e-05, + "loss": 0.6849, + "step": 5320 + }, + { + "epoch": 6.830551989730424, + "grad_norm": 4.301336288452148, + "learning_rate": 2.047240051347882e-05, + "loss": 0.6908, + "step": 5321 + }, + { + "epoch": 6.831835686777921, + "grad_norm": 12.026751518249512, + "learning_rate": 2.047625160462131e-05, + "loss": 0.785, + "step": 5322 + }, + { + "epoch": 6.833119383825418, + "grad_norm": 2.410461664199829, + "learning_rate": 2.04801026957638e-05, + "loss": 0.7647, + "step": 5323 + }, + { + "epoch": 6.834403080872914, + "grad_norm": 3.644963502883911, + "learning_rate": 2.048395378690629e-05, + "loss": 0.8526, + "step": 5324 + }, + { + "epoch": 6.835686777920411, + "grad_norm": 4.336236000061035, + "learning_rate": 2.048780487804878e-05, + "loss": 0.5953, + "step": 
5325 + }, + { + "epoch": 6.836970474967908, + "grad_norm": 1.4263554811477661, + "learning_rate": 2.0491655969191272e-05, + "loss": 0.5716, + "step": 5326 + }, + { + "epoch": 6.838254172015405, + "grad_norm": 2.000931739807129, + "learning_rate": 2.0495507060333763e-05, + "loss": 0.624, + "step": 5327 + }, + { + "epoch": 6.839537869062902, + "grad_norm": 2.0610203742980957, + "learning_rate": 2.049935815147625e-05, + "loss": 0.6333, + "step": 5328 + }, + { + "epoch": 6.840821566110398, + "grad_norm": 1.240579605102539, + "learning_rate": 2.0503209242618742e-05, + "loss": 0.6178, + "step": 5329 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 1.4613327980041504, + "learning_rate": 2.0507060333761233e-05, + "loss": 0.6038, + "step": 5330 + }, + { + "epoch": 6.843388960205392, + "grad_norm": 2.3981616497039795, + "learning_rate": 2.051091142490372e-05, + "loss": 0.5746, + "step": 5331 + }, + { + "epoch": 6.844672657252889, + "grad_norm": 11.141934394836426, + "learning_rate": 2.0514762516046215e-05, + "loss": 0.6297, + "step": 5332 + }, + { + "epoch": 6.845956354300385, + "grad_norm": 4.308582305908203, + "learning_rate": 2.0518613607188707e-05, + "loss": 0.5824, + "step": 5333 + }, + { + "epoch": 6.847240051347882, + "grad_norm": 3.129981756210327, + "learning_rate": 2.0522464698331194e-05, + "loss": 0.5765, + "step": 5334 + }, + { + "epoch": 6.848523748395379, + "grad_norm": 2.912457227706909, + "learning_rate": 2.0526315789473685e-05, + "loss": 0.619, + "step": 5335 + }, + { + "epoch": 6.849807445442876, + "grad_norm": 7.546358108520508, + "learning_rate": 2.0530166880616173e-05, + "loss": 0.5981, + "step": 5336 + }, + { + "epoch": 6.851091142490373, + "grad_norm": 2.7037887573242188, + "learning_rate": 2.0534017971758664e-05, + "loss": 0.6043, + "step": 5337 + }, + { + "epoch": 6.852374839537869, + "grad_norm": 3.134864330291748, + "learning_rate": 2.0537869062901155e-05, + "loss": 0.6132, + "step": 5338 + }, + { + "epoch": 6.853658536585366, + "grad_norm": 
3.61263108253479, + "learning_rate": 2.0541720154043646e-05, + "loss": 0.6049, + "step": 5339 + }, + { + "epoch": 6.854942233632863, + "grad_norm": 2.575803518295288, + "learning_rate": 2.0545571245186137e-05, + "loss": 0.5928, + "step": 5340 + }, + { + "epoch": 6.85622593068036, + "grad_norm": 2.948751449584961, + "learning_rate": 2.0549422336328628e-05, + "loss": 0.6268, + "step": 5341 + }, + { + "epoch": 6.857509627727856, + "grad_norm": 2.285862922668457, + "learning_rate": 2.0553273427471116e-05, + "loss": 0.6096, + "step": 5342 + }, + { + "epoch": 6.858793324775353, + "grad_norm": 2.1373202800750732, + "learning_rate": 2.0557124518613607e-05, + "loss": 0.6289, + "step": 5343 + }, + { + "epoch": 6.86007702182285, + "grad_norm": 1.1956250667572021, + "learning_rate": 2.0560975609756098e-05, + "loss": 0.6468, + "step": 5344 + }, + { + "epoch": 6.861360718870347, + "grad_norm": 5.104648113250732, + "learning_rate": 2.056482670089859e-05, + "loss": 0.5976, + "step": 5345 + }, + { + "epoch": 6.862644415917844, + "grad_norm": 1.5427240133285522, + "learning_rate": 2.056867779204108e-05, + "loss": 0.5997, + "step": 5346 + }, + { + "epoch": 6.86392811296534, + "grad_norm": 1.5954360961914062, + "learning_rate": 2.0572528883183568e-05, + "loss": 0.6341, + "step": 5347 + }, + { + "epoch": 6.865211810012837, + "grad_norm": 2.693861484527588, + "learning_rate": 2.057637997432606e-05, + "loss": 0.6055, + "step": 5348 + }, + { + "epoch": 6.866495507060334, + "grad_norm": 4.602025508880615, + "learning_rate": 2.058023106546855e-05, + "loss": 0.6585, + "step": 5349 + }, + { + "epoch": 6.867779204107831, + "grad_norm": 2.321883201599121, + "learning_rate": 2.0584082156611038e-05, + "loss": 0.6055, + "step": 5350 + }, + { + "epoch": 6.869062901155328, + "grad_norm": 2.04648756980896, + "learning_rate": 2.0587933247753532e-05, + "loss": 0.6219, + "step": 5351 + }, + { + "epoch": 6.870346598202824, + "grad_norm": 2.8371975421905518, + "learning_rate": 2.0591784338896023e-05, + 
"loss": 0.5974, + "step": 5352 + }, + { + "epoch": 6.871630295250321, + "grad_norm": 2.3735597133636475, + "learning_rate": 2.059563543003851e-05, + "loss": 0.6427, + "step": 5353 + }, + { + "epoch": 6.872913992297818, + "grad_norm": 2.911323070526123, + "learning_rate": 2.0599486521181002e-05, + "loss": 0.6479, + "step": 5354 + }, + { + "epoch": 6.874197689345315, + "grad_norm": 2.137035369873047, + "learning_rate": 2.060333761232349e-05, + "loss": 0.6448, + "step": 5355 + }, + { + "epoch": 6.875481386392812, + "grad_norm": 1.3886644840240479, + "learning_rate": 2.060718870346598e-05, + "loss": 0.6228, + "step": 5356 + }, + { + "epoch": 6.876765083440308, + "grad_norm": 1.1555219888687134, + "learning_rate": 2.0611039794608475e-05, + "loss": 0.6396, + "step": 5357 + }, + { + "epoch": 6.878048780487805, + "grad_norm": 1.6166319847106934, + "learning_rate": 2.0614890885750963e-05, + "loss": 0.6691, + "step": 5358 + }, + { + "epoch": 6.879332477535302, + "grad_norm": 1.9875421524047852, + "learning_rate": 2.0618741976893454e-05, + "loss": 0.7077, + "step": 5359 + }, + { + "epoch": 6.880616174582799, + "grad_norm": 3.5660431385040283, + "learning_rate": 2.0622593068035945e-05, + "loss": 0.6327, + "step": 5360 + }, + { + "epoch": 6.881899871630296, + "grad_norm": 2.0939273834228516, + "learning_rate": 2.0626444159178433e-05, + "loss": 0.6893, + "step": 5361 + }, + { + "epoch": 6.883183568677792, + "grad_norm": 4.785465717315674, + "learning_rate": 2.0630295250320924e-05, + "loss": 0.6336, + "step": 5362 + }, + { + "epoch": 6.884467265725289, + "grad_norm": 1.3174779415130615, + "learning_rate": 2.0634146341463415e-05, + "loss": 0.7005, + "step": 5363 + }, + { + "epoch": 6.885750962772786, + "grad_norm": 1.7900949716567993, + "learning_rate": 2.0637997432605906e-05, + "loss": 0.6941, + "step": 5364 + }, + { + "epoch": 6.887034659820283, + "grad_norm": 2.0519487857818604, + "learning_rate": 2.0641848523748397e-05, + "loss": 0.6617, + "step": 5365 + }, + { + "epoch": 
6.888318356867779, + "grad_norm": 1.8319686651229858, + "learning_rate": 2.0645699614890885e-05, + "loss": 0.7041, + "step": 5366 + }, + { + "epoch": 6.889602053915276, + "grad_norm": 2.0546481609344482, + "learning_rate": 2.0649550706033376e-05, + "loss": 0.659, + "step": 5367 + }, + { + "epoch": 6.890885750962773, + "grad_norm": 19.141254425048828, + "learning_rate": 2.0653401797175867e-05, + "loss": 0.6795, + "step": 5368 + }, + { + "epoch": 6.89216944801027, + "grad_norm": 1.5567783117294312, + "learning_rate": 2.0657252888318358e-05, + "loss": 0.7243, + "step": 5369 + }, + { + "epoch": 6.893453145057767, + "grad_norm": 4.279959678649902, + "learning_rate": 2.066110397946085e-05, + "loss": 0.7354, + "step": 5370 + }, + { + "epoch": 6.894736842105263, + "grad_norm": 3.8248918056488037, + "learning_rate": 2.066495507060334e-05, + "loss": 0.7727, + "step": 5371 + }, + { + "epoch": 6.89602053915276, + "grad_norm": 3.2239575386047363, + "learning_rate": 2.0668806161745828e-05, + "loss": 0.7159, + "step": 5372 + }, + { + "epoch": 6.897304236200257, + "grad_norm": 3.6363770961761475, + "learning_rate": 2.067265725288832e-05, + "loss": 0.8329, + "step": 5373 + }, + { + "epoch": 6.898587933247754, + "grad_norm": 2.0313825607299805, + "learning_rate": 2.0676508344030807e-05, + "loss": 0.9356, + "step": 5374 + }, + { + "epoch": 6.89987163029525, + "grad_norm": 2.2541401386260986, + "learning_rate": 2.06803594351733e-05, + "loss": 0.6532, + "step": 5375 + }, + { + "epoch": 6.901155327342747, + "grad_norm": 3.5934414863586426, + "learning_rate": 2.0684210526315792e-05, + "loss": 0.5784, + "step": 5376 + }, + { + "epoch": 6.902439024390244, + "grad_norm": 1.6167678833007812, + "learning_rate": 2.068806161745828e-05, + "loss": 0.5987, + "step": 5377 + }, + { + "epoch": 6.903722721437741, + "grad_norm": 1.5723881721496582, + "learning_rate": 2.069191270860077e-05, + "loss": 0.5964, + "step": 5378 + }, + { + "epoch": 6.905006418485238, + "grad_norm": 4.5549492835998535, + 
"learning_rate": 2.0695763799743262e-05, + "loss": 0.5885, + "step": 5379 + }, + { + "epoch": 6.906290115532734, + "grad_norm": 2.035534620285034, + "learning_rate": 2.069961489088575e-05, + "loss": 0.6151, + "step": 5380 + }, + { + "epoch": 6.907573812580231, + "grad_norm": 2.5063631534576416, + "learning_rate": 2.0703465982028244e-05, + "loss": 0.5915, + "step": 5381 + }, + { + "epoch": 6.908857509627728, + "grad_norm": 1.1093859672546387, + "learning_rate": 2.0707317073170732e-05, + "loss": 0.6018, + "step": 5382 + }, + { + "epoch": 6.910141206675225, + "grad_norm": 1.4971181154251099, + "learning_rate": 2.0711168164313223e-05, + "loss": 0.601, + "step": 5383 + }, + { + "epoch": 6.911424903722722, + "grad_norm": 1.2109909057617188, + "learning_rate": 2.0715019255455714e-05, + "loss": 0.6086, + "step": 5384 + }, + { + "epoch": 6.912708600770218, + "grad_norm": 1.6431167125701904, + "learning_rate": 2.07188703465982e-05, + "loss": 0.5936, + "step": 5385 + }, + { + "epoch": 6.913992297817715, + "grad_norm": 1.3162871599197388, + "learning_rate": 2.0722721437740693e-05, + "loss": 0.622, + "step": 5386 + }, + { + "epoch": 6.915275994865212, + "grad_norm": 1.727985143661499, + "learning_rate": 2.0726572528883184e-05, + "loss": 0.5927, + "step": 5387 + }, + { + "epoch": 6.916559691912709, + "grad_norm": 1.5794039964675903, + "learning_rate": 2.0730423620025675e-05, + "loss": 0.6174, + "step": 5388 + }, + { + "epoch": 6.917843388960206, + "grad_norm": 1.069381594657898, + "learning_rate": 2.0734274711168166e-05, + "loss": 0.5867, + "step": 5389 + }, + { + "epoch": 6.919127086007702, + "grad_norm": 3.883310079574585, + "learning_rate": 2.0738125802310657e-05, + "loss": 0.6105, + "step": 5390 + }, + { + "epoch": 6.920410783055199, + "grad_norm": 1.3357009887695312, + "learning_rate": 2.0741976893453145e-05, + "loss": 0.6168, + "step": 5391 + }, + { + "epoch": 6.921694480102696, + "grad_norm": 1.9900634288787842, + "learning_rate": 2.0745827984595636e-05, + "loss": 0.5973, 
+ "step": 5392 + }, + { + "epoch": 6.922978177150193, + "grad_norm": 1.7274260520935059, + "learning_rate": 2.0749679075738123e-05, + "loss": 0.5923, + "step": 5393 + }, + { + "epoch": 6.92426187419769, + "grad_norm": 1.7957922220230103, + "learning_rate": 2.0753530166880618e-05, + "loss": 0.6052, + "step": 5394 + }, + { + "epoch": 6.925545571245186, + "grad_norm": 2.3923721313476562, + "learning_rate": 2.075738125802311e-05, + "loss": 0.6308, + "step": 5395 + }, + { + "epoch": 6.926829268292683, + "grad_norm": 2.3926923274993896, + "learning_rate": 2.0761232349165597e-05, + "loss": 0.6547, + "step": 5396 + }, + { + "epoch": 6.92811296534018, + "grad_norm": 1.7824616432189941, + "learning_rate": 2.0765083440308088e-05, + "loss": 0.6384, + "step": 5397 + }, + { + "epoch": 6.929396662387677, + "grad_norm": 1.613910436630249, + "learning_rate": 2.076893453145058e-05, + "loss": 0.6424, + "step": 5398 + }, + { + "epoch": 6.930680359435174, + "grad_norm": 1.0211304426193237, + "learning_rate": 2.0772785622593066e-05, + "loss": 0.6208, + "step": 5399 + }, + { + "epoch": 6.93196405648267, + "grad_norm": 1.6301207542419434, + "learning_rate": 2.077663671373556e-05, + "loss": 0.607, + "step": 5400 + }, + { + "epoch": 6.933247753530167, + "grad_norm": 2.368760347366333, + "learning_rate": 2.078048780487805e-05, + "loss": 0.5869, + "step": 5401 + }, + { + "epoch": 6.934531450577664, + "grad_norm": 1.55342435836792, + "learning_rate": 2.078433889602054e-05, + "loss": 0.5705, + "step": 5402 + }, + { + "epoch": 6.935815147625161, + "grad_norm": 11.182762145996094, + "learning_rate": 2.078818998716303e-05, + "loss": 0.6213, + "step": 5403 + }, + { + "epoch": 6.937098844672657, + "grad_norm": 1.6221427917480469, + "learning_rate": 2.0792041078305518e-05, + "loss": 0.6308, + "step": 5404 + }, + { + "epoch": 6.938382541720154, + "grad_norm": 2.7158055305480957, + "learning_rate": 2.079589216944801e-05, + "loss": 0.6506, + "step": 5405 + }, + { + "epoch": 6.939666238767651, + 
"grad_norm": 1.519768476486206, + "learning_rate": 2.0799743260590504e-05, + "loss": 0.6071, + "step": 5406 + }, + { + "epoch": 6.940949935815148, + "grad_norm": 1.2988704442977905, + "learning_rate": 2.080359435173299e-05, + "loss": 0.6038, + "step": 5407 + }, + { + "epoch": 6.942233632862644, + "grad_norm": 1.5983607769012451, + "learning_rate": 2.0807445442875483e-05, + "loss": 0.6159, + "step": 5408 + }, + { + "epoch": 6.943517329910141, + "grad_norm": 1.7140417098999023, + "learning_rate": 2.0811296534017974e-05, + "loss": 0.6526, + "step": 5409 + }, + { + "epoch": 6.944801026957638, + "grad_norm": 1.936116099357605, + "learning_rate": 2.081514762516046e-05, + "loss": 0.649, + "step": 5410 + }, + { + "epoch": 6.946084724005135, + "grad_norm": 5.3017258644104, + "learning_rate": 2.0818998716302952e-05, + "loss": 0.6606, + "step": 5411 + }, + { + "epoch": 6.947368421052632, + "grad_norm": 1.4532647132873535, + "learning_rate": 2.0822849807445443e-05, + "loss": 0.6256, + "step": 5412 + }, + { + "epoch": 6.948652118100128, + "grad_norm": 3.515427589416504, + "learning_rate": 2.0826700898587935e-05, + "loss": 0.6057, + "step": 5413 + }, + { + "epoch": 6.949935815147625, + "grad_norm": 12.457624435424805, + "learning_rate": 2.0830551989730426e-05, + "loss": 0.6909, + "step": 5414 + }, + { + "epoch": 6.951219512195122, + "grad_norm": 2.9832746982574463, + "learning_rate": 2.0834403080872913e-05, + "loss": 0.6896, + "step": 5415 + }, + { + "epoch": 6.952503209242619, + "grad_norm": 4.064573764801025, + "learning_rate": 2.0838254172015404e-05, + "loss": 0.6918, + "step": 5416 + }, + { + "epoch": 6.953786906290116, + "grad_norm": 4.35247278213501, + "learning_rate": 2.0842105263157895e-05, + "loss": 0.6785, + "step": 5417 + }, + { + "epoch": 6.955070603337612, + "grad_norm": 3.727851390838623, + "learning_rate": 2.0845956354300387e-05, + "loss": 0.7069, + "step": 5418 + }, + { + "epoch": 6.956354300385109, + "grad_norm": 1.9612274169921875, + "learning_rate": 
2.0849807445442878e-05, + "loss": 0.6962, + "step": 5419 + }, + { + "epoch": 6.957637997432606, + "grad_norm": 23.579254150390625, + "learning_rate": 2.0853658536585365e-05, + "loss": 0.7735, + "step": 5420 + }, + { + "epoch": 6.958921694480103, + "grad_norm": 2.0949318408966064, + "learning_rate": 2.0857509627727856e-05, + "loss": 0.7403, + "step": 5421 + }, + { + "epoch": 6.9602053915276, + "grad_norm": 1.990571141242981, + "learning_rate": 2.0861360718870347e-05, + "loss": 0.7204, + "step": 5422 + }, + { + "epoch": 6.961489088575096, + "grad_norm": 2.8670761585235596, + "learning_rate": 2.0865211810012835e-05, + "loss": 0.7993, + "step": 5423 + }, + { + "epoch": 6.962772785622593, + "grad_norm": 4.0123701095581055, + "learning_rate": 2.086906290115533e-05, + "loss": 0.9085, + "step": 5424 + }, + { + "epoch": 6.96405648267009, + "grad_norm": 3.1241204738616943, + "learning_rate": 2.087291399229782e-05, + "loss": 0.5768, + "step": 5425 + }, + { + "epoch": 6.965340179717587, + "grad_norm": 1.5961389541625977, + "learning_rate": 2.0876765083440308e-05, + "loss": 0.5813, + "step": 5426 + }, + { + "epoch": 6.966623876765084, + "grad_norm": 1.643295407295227, + "learning_rate": 2.08806161745828e-05, + "loss": 0.6278, + "step": 5427 + }, + { + "epoch": 6.96790757381258, + "grad_norm": 3.3729631900787354, + "learning_rate": 2.0884467265725287e-05, + "loss": 0.6108, + "step": 5428 + }, + { + "epoch": 6.969191270860077, + "grad_norm": 1.362946629524231, + "learning_rate": 2.0888318356867778e-05, + "loss": 0.6172, + "step": 5429 + }, + { + "epoch": 6.970474967907574, + "grad_norm": 1.3671350479125977, + "learning_rate": 2.0892169448010273e-05, + "loss": 0.6159, + "step": 5430 + }, + { + "epoch": 6.971758664955071, + "grad_norm": 1.6196576356887817, + "learning_rate": 2.089602053915276e-05, + "loss": 0.5973, + "step": 5431 + }, + { + "epoch": 6.973042362002568, + "grad_norm": 3.545499086380005, + "learning_rate": 2.089987163029525e-05, + "loss": 0.6002, + "step": 5432 + }, + 
{ + "epoch": 6.974326059050064, + "grad_norm": 2.67429780960083, + "learning_rate": 2.0903722721437742e-05, + "loss": 0.636, + "step": 5433 + }, + { + "epoch": 6.975609756097561, + "grad_norm": 1.971407175064087, + "learning_rate": 2.090757381258023e-05, + "loss": 0.6185, + "step": 5434 + }, + { + "epoch": 6.976893453145058, + "grad_norm": 14.178318977355957, + "learning_rate": 2.091142490372272e-05, + "loss": 0.6115, + "step": 5435 + }, + { + "epoch": 6.978177150192555, + "grad_norm": 1.787670373916626, + "learning_rate": 2.0915275994865212e-05, + "loss": 0.5991, + "step": 5436 + }, + { + "epoch": 6.979460847240051, + "grad_norm": 2.959357500076294, + "learning_rate": 2.0919127086007703e-05, + "loss": 0.6006, + "step": 5437 + }, + { + "epoch": 6.980744544287548, + "grad_norm": 2.4344849586486816, + "learning_rate": 2.0922978177150194e-05, + "loss": 0.5725, + "step": 5438 + }, + { + "epoch": 6.982028241335045, + "grad_norm": 2.254779100418091, + "learning_rate": 2.0926829268292682e-05, + "loss": 0.6342, + "step": 5439 + }, + { + "epoch": 6.983311938382542, + "grad_norm": 1.4022800922393799, + "learning_rate": 2.0930680359435173e-05, + "loss": 0.653, + "step": 5440 + }, + { + "epoch": 6.984595635430038, + "grad_norm": 1.8061323165893555, + "learning_rate": 2.0934531450577664e-05, + "loss": 0.6458, + "step": 5441 + }, + { + "epoch": 6.985879332477535, + "grad_norm": 2.534029245376587, + "learning_rate": 2.0938382541720152e-05, + "loss": 0.6796, + "step": 5442 + }, + { + "epoch": 6.987163029525032, + "grad_norm": 2.6002607345581055, + "learning_rate": 2.0942233632862646e-05, + "loss": 0.6407, + "step": 5443 + }, + { + "epoch": 6.988446726572529, + "grad_norm": 2.6067886352539062, + "learning_rate": 2.0946084724005137e-05, + "loss": 0.6926, + "step": 5444 + }, + { + "epoch": 6.989730423620026, + "grad_norm": 2.2369019985198975, + "learning_rate": 2.0949935815147625e-05, + "loss": 0.6109, + "step": 5445 + }, + { + "epoch": 6.991014120667522, + "grad_norm": 
2.0529723167419434, + "learning_rate": 2.0953786906290116e-05, + "loss": 0.6591, + "step": 5446 + }, + { + "epoch": 6.992297817715019, + "grad_norm": 3.7195708751678467, + "learning_rate": 2.0957637997432604e-05, + "loss": 0.7138, + "step": 5447 + }, + { + "epoch": 6.993581514762516, + "grad_norm": 4.0842485427856445, + "learning_rate": 2.0961489088575095e-05, + "loss": 0.67, + "step": 5448 + }, + { + "epoch": 6.994865211810013, + "grad_norm": 2.7839913368225098, + "learning_rate": 2.096534017971759e-05, + "loss": 0.681, + "step": 5449 + }, + { + "epoch": 6.99614890885751, + "grad_norm": 2.6842117309570312, + "learning_rate": 2.0969191270860077e-05, + "loss": 0.6674, + "step": 5450 + }, + { + "epoch": 6.997432605905006, + "grad_norm": 2.5041773319244385, + "learning_rate": 2.0973042362002568e-05, + "loss": 0.6965, + "step": 5451 + }, + { + "epoch": 6.998716302952503, + "grad_norm": 3.455267906188965, + "learning_rate": 2.097689345314506e-05, + "loss": 0.7071, + "step": 5452 + }, + { + "epoch": 7.0, + "grad_norm": 2.98055362701416, + "learning_rate": 2.0980744544287547e-05, + "loss": 0.9143, + "step": 5453 + }, + { + "epoch": 7.001283697047497, + "grad_norm": 3.2954390048980713, + "learning_rate": 2.0984595635430038e-05, + "loss": 0.6355, + "step": 5454 + }, + { + "epoch": 7.002567394094994, + "grad_norm": 3.5004994869232178, + "learning_rate": 2.0988446726572532e-05, + "loss": 0.5852, + "step": 5455 + }, + { + "epoch": 7.00385109114249, + "grad_norm": 3.5346519947052, + "learning_rate": 2.099229781771502e-05, + "loss": 0.5948, + "step": 5456 + }, + { + "epoch": 7.005134788189987, + "grad_norm": 1.3666939735412598, + "learning_rate": 2.099614890885751e-05, + "loss": 0.6101, + "step": 5457 + }, + { + "epoch": 7.006418485237484, + "grad_norm": 1.3313369750976562, + "learning_rate": 2.1e-05, + "loss": 0.5911, + "step": 5458 + }, + { + "epoch": 7.007702182284981, + "grad_norm": 1.1835342645645142, + "learning_rate": 2.100385109114249e-05, + "loss": 0.5817, + "step": 
5459 + }, + { + "epoch": 7.008985879332478, + "grad_norm": 4.0181145668029785, + "learning_rate": 2.100770218228498e-05, + "loss": 0.6103, + "step": 5460 + }, + { + "epoch": 7.010269576379974, + "grad_norm": 2.2018909454345703, + "learning_rate": 2.1011553273427472e-05, + "loss": 0.6102, + "step": 5461 + }, + { + "epoch": 7.011553273427471, + "grad_norm": 1.8786920309066772, + "learning_rate": 2.1015404364569963e-05, + "loss": 0.5731, + "step": 5462 + }, + { + "epoch": 7.012836970474968, + "grad_norm": 4.307723522186279, + "learning_rate": 2.1019255455712454e-05, + "loss": 0.6506, + "step": 5463 + }, + { + "epoch": 7.014120667522465, + "grad_norm": 1.5640264749526978, + "learning_rate": 2.1023106546854942e-05, + "loss": 0.6007, + "step": 5464 + }, + { + "epoch": 7.015404364569961, + "grad_norm": 1.6627204418182373, + "learning_rate": 2.1026957637997433e-05, + "loss": 0.6203, + "step": 5465 + }, + { + "epoch": 7.016688061617458, + "grad_norm": 2.0994248390197754, + "learning_rate": 2.103080872913992e-05, + "loss": 0.5574, + "step": 5466 + }, + { + "epoch": 7.017971758664955, + "grad_norm": 1.2262989282608032, + "learning_rate": 2.1034659820282415e-05, + "loss": 0.5718, + "step": 5467 + }, + { + "epoch": 7.019255455712452, + "grad_norm": 5.396697044372559, + "learning_rate": 2.1038510911424906e-05, + "loss": 0.6255, + "step": 5468 + }, + { + "epoch": 7.020539152759949, + "grad_norm": 2.575977325439453, + "learning_rate": 2.1042362002567394e-05, + "loss": 0.6399, + "step": 5469 + }, + { + "epoch": 7.021822849807445, + "grad_norm": 1.783273696899414, + "learning_rate": 2.1046213093709885e-05, + "loss": 0.555, + "step": 5470 + }, + { + "epoch": 7.023106546854942, + "grad_norm": 1.735529899597168, + "learning_rate": 2.1050064184852376e-05, + "loss": 0.604, + "step": 5471 + }, + { + "epoch": 7.024390243902439, + "grad_norm": 3.091629981994629, + "learning_rate": 2.1053915275994864e-05, + "loss": 0.584, + "step": 5472 + }, + { + "epoch": 7.025673940949936, + "grad_norm": 
1.7584329843521118, + "learning_rate": 2.1057766367137358e-05, + "loss": 0.6259, + "step": 5473 + }, + { + "epoch": 7.026957637997433, + "grad_norm": 1.7028896808624268, + "learning_rate": 2.1061617458279846e-05, + "loss": 0.5927, + "step": 5474 + }, + { + "epoch": 7.028241335044929, + "grad_norm": 2.0476796627044678, + "learning_rate": 2.1065468549422337e-05, + "loss": 0.6052, + "step": 5475 + }, + { + "epoch": 7.029525032092426, + "grad_norm": 1.7270768880844116, + "learning_rate": 2.1069319640564828e-05, + "loss": 0.6107, + "step": 5476 + }, + { + "epoch": 7.030808729139923, + "grad_norm": 1.4897518157958984, + "learning_rate": 2.1073170731707316e-05, + "loss": 0.5305, + "step": 5477 + }, + { + "epoch": 7.03209242618742, + "grad_norm": 2.0645911693573, + "learning_rate": 2.1077021822849807e-05, + "loss": 0.5686, + "step": 5478 + }, + { + "epoch": 7.033376123234916, + "grad_norm": 1.5646510124206543, + "learning_rate": 2.10808729139923e-05, + "loss": 0.6068, + "step": 5479 + }, + { + "epoch": 7.034659820282413, + "grad_norm": 1.8632152080535889, + "learning_rate": 2.108472400513479e-05, + "loss": 0.6161, + "step": 5480 + }, + { + "epoch": 7.03594351732991, + "grad_norm": 2.3430516719818115, + "learning_rate": 2.108857509627728e-05, + "loss": 0.583, + "step": 5481 + }, + { + "epoch": 7.037227214377407, + "grad_norm": 3.4290051460266113, + "learning_rate": 2.109242618741977e-05, + "loss": 0.6373, + "step": 5482 + }, + { + "epoch": 7.038510911424904, + "grad_norm": 3.5818073749542236, + "learning_rate": 2.109627727856226e-05, + "loss": 0.624, + "step": 5483 + }, + { + "epoch": 7.0397946084724, + "grad_norm": 1.7327529191970825, + "learning_rate": 2.110012836970475e-05, + "loss": 0.6352, + "step": 5484 + }, + { + "epoch": 7.041078305519897, + "grad_norm": 2.791433572769165, + "learning_rate": 2.110397946084724e-05, + "loss": 0.634, + "step": 5485 + }, + { + "epoch": 7.042362002567394, + "grad_norm": 2.532531499862671, + "learning_rate": 2.1107830551989732e-05, + 
"loss": 0.6967, + "step": 5486 + }, + { + "epoch": 7.043645699614891, + "grad_norm": 2.8835718631744385, + "learning_rate": 2.1111681643132223e-05, + "loss": 0.6345, + "step": 5487 + }, + { + "epoch": 7.044929396662388, + "grad_norm": 3.2065482139587402, + "learning_rate": 2.111553273427471e-05, + "loss": 0.6611, + "step": 5488 + }, + { + "epoch": 7.046213093709884, + "grad_norm": 2.748946189880371, + "learning_rate": 2.11193838254172e-05, + "loss": 0.694, + "step": 5489 + }, + { + "epoch": 7.047496790757381, + "grad_norm": 8.48807144165039, + "learning_rate": 2.1123234916559693e-05, + "loss": 0.6372, + "step": 5490 + }, + { + "epoch": 7.048780487804878, + "grad_norm": 3.6314804553985596, + "learning_rate": 2.112708600770218e-05, + "loss": 0.5827, + "step": 5491 + }, + { + "epoch": 7.050064184852375, + "grad_norm": 3.198526382446289, + "learning_rate": 2.1130937098844675e-05, + "loss": 0.673, + "step": 5492 + }, + { + "epoch": 7.051347881899872, + "grad_norm": 3.909763813018799, + "learning_rate": 2.1134788189987163e-05, + "loss": 0.5812, + "step": 5493 + }, + { + "epoch": 7.052631578947368, + "grad_norm": 2.9374821186065674, + "learning_rate": 2.1138639281129654e-05, + "loss": 0.6373, + "step": 5494 + }, + { + "epoch": 7.053915275994865, + "grad_norm": 5.418295383453369, + "learning_rate": 2.1142490372272145e-05, + "loss": 0.6838, + "step": 5495 + }, + { + "epoch": 7.055198973042362, + "grad_norm": 3.721503257751465, + "learning_rate": 2.1146341463414632e-05, + "loss": 0.6532, + "step": 5496 + }, + { + "epoch": 7.056482670089859, + "grad_norm": 3.7423555850982666, + "learning_rate": 2.1150192554557123e-05, + "loss": 0.6849, + "step": 5497 + }, + { + "epoch": 7.057766367137355, + "grad_norm": 1.6475554704666138, + "learning_rate": 2.1154043645699618e-05, + "loss": 0.6897, + "step": 5498 + }, + { + "epoch": 7.059050064184852, + "grad_norm": 12.881848335266113, + "learning_rate": 2.1157894736842106e-05, + "loss": 0.7239, + "step": 5499 + }, + { + "epoch": 
7.060333761232349, + "grad_norm": 2.2481064796447754, + "learning_rate": 2.1161745827984597e-05, + "loss": 0.673, + "step": 5500 + }, + { + "epoch": 7.061617458279846, + "grad_norm": 2.3204357624053955, + "learning_rate": 2.1165596919127088e-05, + "loss": 0.8174, + "step": 5501 + }, + { + "epoch": 7.062901155327343, + "grad_norm": 3.9216103553771973, + "learning_rate": 2.1169448010269575e-05, + "loss": 0.7638, + "step": 5502 + }, + { + "epoch": 7.064184852374839, + "grad_norm": 3.354693651199341, + "learning_rate": 2.1173299101412067e-05, + "loss": 0.873, + "step": 5503 + }, + { + "epoch": 7.065468549422336, + "grad_norm": 3.0318968296051025, + "learning_rate": 2.1177150192554558e-05, + "loss": 0.5996, + "step": 5504 + }, + { + "epoch": 7.066752246469833, + "grad_norm": 4.711429595947266, + "learning_rate": 2.118100128369705e-05, + "loss": 0.5701, + "step": 5505 + }, + { + "epoch": 7.06803594351733, + "grad_norm": 2.1299054622650146, + "learning_rate": 2.118485237483954e-05, + "loss": 0.5842, + "step": 5506 + }, + { + "epoch": 7.069319640564827, + "grad_norm": 9.604924201965332, + "learning_rate": 2.1188703465982027e-05, + "loss": 0.5568, + "step": 5507 + }, + { + "epoch": 7.070603337612323, + "grad_norm": 2.1004981994628906, + "learning_rate": 2.119255455712452e-05, + "loss": 0.59, + "step": 5508 + }, + { + "epoch": 7.07188703465982, + "grad_norm": 1.2180442810058594, + "learning_rate": 2.119640564826701e-05, + "loss": 0.6021, + "step": 5509 + }, + { + "epoch": 7.073170731707317, + "grad_norm": 1.8727470636367798, + "learning_rate": 2.12002567394095e-05, + "loss": 0.5966, + "step": 5510 + }, + { + "epoch": 7.074454428754814, + "grad_norm": 1.5666619539260864, + "learning_rate": 2.120410783055199e-05, + "loss": 0.5864, + "step": 5511 + }, + { + "epoch": 7.07573812580231, + "grad_norm": 2.351361036300659, + "learning_rate": 2.120795892169448e-05, + "loss": 0.6387, + "step": 5512 + }, + { + "epoch": 7.077021822849807, + "grad_norm": 2.335383892059326, + 
"learning_rate": 2.121181001283697e-05, + "loss": 0.5974, + "step": 5513 + }, + { + "epoch": 7.078305519897304, + "grad_norm": 4.580857753753662, + "learning_rate": 2.121566110397946e-05, + "loss": 0.5749, + "step": 5514 + }, + { + "epoch": 7.079589216944801, + "grad_norm": 2.305529832839966, + "learning_rate": 2.121951219512195e-05, + "loss": 0.5962, + "step": 5515 + }, + { + "epoch": 7.080872913992298, + "grad_norm": 2.8953909873962402, + "learning_rate": 2.1223363286264444e-05, + "loss": 0.6271, + "step": 5516 + }, + { + "epoch": 7.082156611039794, + "grad_norm": 1.8378080129623413, + "learning_rate": 2.1227214377406935e-05, + "loss": 0.6322, + "step": 5517 + }, + { + "epoch": 7.083440308087291, + "grad_norm": 1.2857074737548828, + "learning_rate": 2.1231065468549422e-05, + "loss": 0.5864, + "step": 5518 + }, + { + "epoch": 7.084724005134788, + "grad_norm": 3.785032272338867, + "learning_rate": 2.1234916559691913e-05, + "loss": 0.6332, + "step": 5519 + }, + { + "epoch": 7.086007702182285, + "grad_norm": 1.5067558288574219, + "learning_rate": 2.12387676508344e-05, + "loss": 0.6302, + "step": 5520 + }, + { + "epoch": 7.087291399229782, + "grad_norm": 3.29121994972229, + "learning_rate": 2.1242618741976892e-05, + "loss": 0.5694, + "step": 5521 + }, + { + "epoch": 7.088575096277278, + "grad_norm": 2.7980093955993652, + "learning_rate": 2.1246469833119387e-05, + "loss": 0.5599, + "step": 5522 + }, + { + "epoch": 7.089858793324775, + "grad_norm": 3.0528604984283447, + "learning_rate": 2.1250320924261874e-05, + "loss": 0.6001, + "step": 5523 + }, + { + "epoch": 7.091142490372272, + "grad_norm": 1.9516193866729736, + "learning_rate": 2.1254172015404365e-05, + "loss": 0.5684, + "step": 5524 + }, + { + "epoch": 7.092426187419769, + "grad_norm": 2.3352034091949463, + "learning_rate": 2.1258023106546857e-05, + "loss": 0.6252, + "step": 5525 + }, + { + "epoch": 7.093709884467266, + "grad_norm": 2.1159870624542236, + "learning_rate": 2.1261874197689344e-05, + "loss": 0.5611, 
+ "step": 5526 + }, + { + "epoch": 7.094993581514762, + "grad_norm": 3.4739089012145996, + "learning_rate": 2.1265725288831835e-05, + "loss": 0.5913, + "step": 5527 + }, + { + "epoch": 7.096277278562259, + "grad_norm": 4.553858757019043, + "learning_rate": 2.126957637997433e-05, + "loss": 0.5766, + "step": 5528 + }, + { + "epoch": 7.097560975609756, + "grad_norm": 12.43681812286377, + "learning_rate": 2.1273427471116817e-05, + "loss": 0.6243, + "step": 5529 + }, + { + "epoch": 7.098844672657253, + "grad_norm": 5.358736991882324, + "learning_rate": 2.127727856225931e-05, + "loss": 0.6341, + "step": 5530 + }, + { + "epoch": 7.100128369704749, + "grad_norm": 5.522582054138184, + "learning_rate": 2.1281129653401796e-05, + "loss": 0.6253, + "step": 5531 + }, + { + "epoch": 7.101412066752246, + "grad_norm": 2.942537784576416, + "learning_rate": 2.1284980744544287e-05, + "loss": 0.6116, + "step": 5532 + }, + { + "epoch": 7.102695763799743, + "grad_norm": 3.520751953125, + "learning_rate": 2.1288831835686778e-05, + "loss": 0.6374, + "step": 5533 + }, + { + "epoch": 7.10397946084724, + "grad_norm": 3.4594571590423584, + "learning_rate": 2.129268292682927e-05, + "loss": 0.6728, + "step": 5534 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 1.6186084747314453, + "learning_rate": 2.129653401797176e-05, + "loss": 0.592, + "step": 5535 + }, + { + "epoch": 7.106546854942233, + "grad_norm": 5.369798183441162, + "learning_rate": 2.130038510911425e-05, + "loss": 0.6095, + "step": 5536 + }, + { + "epoch": 7.10783055198973, + "grad_norm": 4.827002048492432, + "learning_rate": 2.130423620025674e-05, + "loss": 0.6337, + "step": 5537 + }, + { + "epoch": 7.109114249037227, + "grad_norm": 6.07631778717041, + "learning_rate": 2.130808729139923e-05, + "loss": 0.6025, + "step": 5538 + }, + { + "epoch": 7.110397946084724, + "grad_norm": 3.953873872756958, + "learning_rate": 2.1311938382541718e-05, + "loss": 0.6199, + "step": 5539 + }, + { + "epoch": 7.111681643132221, + "grad_norm": 
2.8529269695281982, + "learning_rate": 2.1315789473684212e-05, + "loss": 0.6465, + "step": 5540 + }, + { + "epoch": 7.112965340179717, + "grad_norm": 5.132993698120117, + "learning_rate": 2.1319640564826703e-05, + "loss": 0.686, + "step": 5541 + }, + { + "epoch": 7.114249037227214, + "grad_norm": 3.4178643226623535, + "learning_rate": 2.132349165596919e-05, + "loss": 0.6642, + "step": 5542 + }, + { + "epoch": 7.115532734274711, + "grad_norm": 1.9573394060134888, + "learning_rate": 2.1327342747111682e-05, + "loss": 0.6324, + "step": 5543 + }, + { + "epoch": 7.116816431322208, + "grad_norm": 4.931451797485352, + "learning_rate": 2.1331193838254173e-05, + "loss": 0.6471, + "step": 5544 + }, + { + "epoch": 7.118100128369705, + "grad_norm": 2.341395854949951, + "learning_rate": 2.133504492939666e-05, + "loss": 0.6688, + "step": 5545 + }, + { + "epoch": 7.119383825417201, + "grad_norm": 3.966892719268799, + "learning_rate": 2.1338896020539152e-05, + "loss": 0.662, + "step": 5546 + }, + { + "epoch": 7.120667522464698, + "grad_norm": 1.600299596786499, + "learning_rate": 2.1342747111681647e-05, + "loss": 0.6152, + "step": 5547 + }, + { + "epoch": 7.121951219512195, + "grad_norm": 5.166574954986572, + "learning_rate": 2.1346598202824134e-05, + "loss": 0.7214, + "step": 5548 + }, + { + "epoch": 7.123234916559692, + "grad_norm": 1.7991598844528198, + "learning_rate": 2.1350449293966625e-05, + "loss": 0.6765, + "step": 5549 + }, + { + "epoch": 7.124518613607188, + "grad_norm": 7.43255615234375, + "learning_rate": 2.1354300385109113e-05, + "loss": 0.6733, + "step": 5550 + }, + { + "epoch": 7.125802310654685, + "grad_norm": 5.474233150482178, + "learning_rate": 2.1358151476251604e-05, + "loss": 0.7872, + "step": 5551 + }, + { + "epoch": 7.127086007702182, + "grad_norm": 2.681502103805542, + "learning_rate": 2.1362002567394095e-05, + "loss": 0.7907, + "step": 5552 + }, + { + "epoch": 7.128369704749679, + "grad_norm": 27.0837345123291, + "learning_rate": 2.1365853658536586e-05, + 
"loss": 0.8396, + "step": 5553 + }, + { + "epoch": 7.129653401797176, + "grad_norm": 2.1325929164886475, + "learning_rate": 2.1369704749679077e-05, + "loss": 0.5675, + "step": 5554 + }, + { + "epoch": 7.130937098844672, + "grad_norm": 5.855559825897217, + "learning_rate": 2.1373555840821568e-05, + "loss": 0.5961, + "step": 5555 + }, + { + "epoch": 7.132220795892169, + "grad_norm": 4.1363091468811035, + "learning_rate": 2.1377406931964056e-05, + "loss": 0.5947, + "step": 5556 + }, + { + "epoch": 7.133504492939666, + "grad_norm": 9.347902297973633, + "learning_rate": 2.1381258023106547e-05, + "loss": 0.6175, + "step": 5557 + }, + { + "epoch": 7.134788189987163, + "grad_norm": 3.4081320762634277, + "learning_rate": 2.1385109114249035e-05, + "loss": 0.6366, + "step": 5558 + }, + { + "epoch": 7.13607188703466, + "grad_norm": 2.32798171043396, + "learning_rate": 2.138896020539153e-05, + "loss": 0.6075, + "step": 5559 + }, + { + "epoch": 7.137355584082156, + "grad_norm": 2.570291042327881, + "learning_rate": 2.139281129653402e-05, + "loss": 0.616, + "step": 5560 + }, + { + "epoch": 7.138639281129653, + "grad_norm": 2.749497175216675, + "learning_rate": 2.1396662387676508e-05, + "loss": 0.6272, + "step": 5561 + }, + { + "epoch": 7.13992297817715, + "grad_norm": 4.240452766418457, + "learning_rate": 2.1400513478819e-05, + "loss": 0.5608, + "step": 5562 + }, + { + "epoch": 7.141206675224647, + "grad_norm": 3.380115270614624, + "learning_rate": 2.140436456996149e-05, + "loss": 0.6129, + "step": 5563 + }, + { + "epoch": 7.142490372272144, + "grad_norm": 2.898052453994751, + "learning_rate": 2.1408215661103978e-05, + "loss": 0.5802, + "step": 5564 + }, + { + "epoch": 7.14377406931964, + "grad_norm": 2.8370940685272217, + "learning_rate": 2.1412066752246472e-05, + "loss": 0.5861, + "step": 5565 + }, + { + "epoch": 7.145057766367137, + "grad_norm": 3.3867523670196533, + "learning_rate": 2.141591784338896e-05, + "loss": 0.5935, + "step": 5566 + }, + { + "epoch": 7.146341463414634, 
+ "grad_norm": 4.735891342163086, + "learning_rate": 2.141976893453145e-05, + "loss": 0.623, + "step": 5567 + }, + { + "epoch": 7.147625160462131, + "grad_norm": 3.1271719932556152, + "learning_rate": 2.1423620025673942e-05, + "loss": 0.5494, + "step": 5568 + }, + { + "epoch": 7.148908857509627, + "grad_norm": 3.3894872665405273, + "learning_rate": 2.142747111681643e-05, + "loss": 0.5817, + "step": 5569 + }, + { + "epoch": 7.150192554557124, + "grad_norm": 6.948281288146973, + "learning_rate": 2.143132220795892e-05, + "loss": 0.571, + "step": 5570 + }, + { + "epoch": 7.151476251604621, + "grad_norm": 3.3112404346466064, + "learning_rate": 2.1435173299101415e-05, + "loss": 0.598, + "step": 5571 + }, + { + "epoch": 7.152759948652118, + "grad_norm": 3.323343276977539, + "learning_rate": 2.1439024390243903e-05, + "loss": 0.6217, + "step": 5572 + }, + { + "epoch": 7.154043645699615, + "grad_norm": 34.79995346069336, + "learning_rate": 2.1442875481386394e-05, + "loss": 0.6325, + "step": 5573 + }, + { + "epoch": 7.155327342747111, + "grad_norm": 2.847470760345459, + "learning_rate": 2.1446726572528885e-05, + "loss": 0.5996, + "step": 5574 + }, + { + "epoch": 7.156611039794608, + "grad_norm": 5.355295658111572, + "learning_rate": 2.1450577663671373e-05, + "loss": 0.6624, + "step": 5575 + }, + { + "epoch": 7.157894736842105, + "grad_norm": 1.5999785661697388, + "learning_rate": 2.1454428754813864e-05, + "loss": 0.5857, + "step": 5576 + }, + { + "epoch": 7.159178433889602, + "grad_norm": 4.6140456199646, + "learning_rate": 2.1458279845956355e-05, + "loss": 0.5891, + "step": 5577 + }, + { + "epoch": 7.160462130937099, + "grad_norm": 2.493122100830078, + "learning_rate": 2.1462130937098846e-05, + "loss": 0.6265, + "step": 5578 + }, + { + "epoch": 7.161745827984595, + "grad_norm": 2.524339199066162, + "learning_rate": 2.1465982028241337e-05, + "loss": 0.593, + "step": 5579 + }, + { + "epoch": 7.163029525032092, + "grad_norm": 5.521852016448975, + "learning_rate": 
2.1469833119383825e-05, + "loss": 0.5926, + "step": 5580 + }, + { + "epoch": 7.164313222079589, + "grad_norm": 3.9642674922943115, + "learning_rate": 2.1473684210526316e-05, + "loss": 0.6076, + "step": 5581 + }, + { + "epoch": 7.165596919127086, + "grad_norm": 2.9376227855682373, + "learning_rate": 2.1477535301668807e-05, + "loss": 0.5595, + "step": 5582 + }, + { + "epoch": 7.166880616174582, + "grad_norm": 4.056319236755371, + "learning_rate": 2.1481386392811298e-05, + "loss": 0.6076, + "step": 5583 + }, + { + "epoch": 7.168164313222079, + "grad_norm": 5.1462321281433105, + "learning_rate": 2.148523748395379e-05, + "loss": 0.6628, + "step": 5584 + }, + { + "epoch": 7.169448010269576, + "grad_norm": 8.789929389953613, + "learning_rate": 2.1489088575096277e-05, + "loss": 0.6139, + "step": 5585 + }, + { + "epoch": 7.170731707317073, + "grad_norm": 2.2675745487213135, + "learning_rate": 2.1492939666238768e-05, + "loss": 0.6721, + "step": 5586 + }, + { + "epoch": 7.17201540436457, + "grad_norm": 5.151346206665039, + "learning_rate": 2.149679075738126e-05, + "loss": 0.6645, + "step": 5587 + }, + { + "epoch": 7.173299101412066, + "grad_norm": 4.8626933097839355, + "learning_rate": 2.1500641848523747e-05, + "loss": 0.6328, + "step": 5588 + }, + { + "epoch": 7.174582798459563, + "grad_norm": 3.2575113773345947, + "learning_rate": 2.150449293966624e-05, + "loss": 0.5846, + "step": 5589 + }, + { + "epoch": 7.17586649550706, + "grad_norm": 4.204816818237305, + "learning_rate": 2.1508344030808732e-05, + "loss": 0.6457, + "step": 5590 + }, + { + "epoch": 7.177150192554557, + "grad_norm": 5.054786682128906, + "learning_rate": 2.151219512195122e-05, + "loss": 0.6455, + "step": 5591 + }, + { + "epoch": 7.178433889602054, + "grad_norm": 9.645527839660645, + "learning_rate": 2.151604621309371e-05, + "loss": 0.6561, + "step": 5592 + }, + { + "epoch": 7.17971758664955, + "grad_norm": 4.084712505340576, + "learning_rate": 2.1519897304236202e-05, + "loss": 0.6294, + "step": 5593 + }, + 
{ + "epoch": 7.181001283697047, + "grad_norm": 7.470680236816406, + "learning_rate": 2.152374839537869e-05, + "loss": 0.678, + "step": 5594 + }, + { + "epoch": 7.182284980744544, + "grad_norm": 3.5143516063690186, + "learning_rate": 2.152759948652118e-05, + "loss": 0.6721, + "step": 5595 + }, + { + "epoch": 7.183568677792041, + "grad_norm": 4.3650665283203125, + "learning_rate": 2.153145057766367e-05, + "loss": 0.7197, + "step": 5596 + }, + { + "epoch": 7.184852374839538, + "grad_norm": 24.946840286254883, + "learning_rate": 2.1535301668806163e-05, + "loss": 0.6935, + "step": 5597 + }, + { + "epoch": 7.186136071887034, + "grad_norm": 5.733648300170898, + "learning_rate": 2.1539152759948654e-05, + "loss": 0.6384, + "step": 5598 + }, + { + "epoch": 7.187419768934531, + "grad_norm": 4.13720178604126, + "learning_rate": 2.154300385109114e-05, + "loss": 0.6928, + "step": 5599 + }, + { + "epoch": 7.188703465982028, + "grad_norm": 5.636670112609863, + "learning_rate": 2.1546854942233633e-05, + "loss": 0.7139, + "step": 5600 + }, + { + "epoch": 7.189987163029525, + "grad_norm": 5.815284729003906, + "learning_rate": 2.1550706033376124e-05, + "loss": 0.7089, + "step": 5601 + }, + { + "epoch": 7.191270860077021, + "grad_norm": 4.777956485748291, + "learning_rate": 2.1554557124518615e-05, + "loss": 0.7712, + "step": 5602 + }, + { + "epoch": 7.192554557124518, + "grad_norm": 14.14599323272705, + "learning_rate": 2.1558408215661106e-05, + "loss": 1.01, + "step": 5603 + }, + { + "epoch": 7.193838254172015, + "grad_norm": 2.751286745071411, + "learning_rate": 2.1562259306803593e-05, + "loss": 0.5426, + "step": 5604 + }, + { + "epoch": 7.195121951219512, + "grad_norm": 2.4070258140563965, + "learning_rate": 2.1566110397946085e-05, + "loss": 0.5945, + "step": 5605 + }, + { + "epoch": 7.196405648267009, + "grad_norm": 4.4579691886901855, + "learning_rate": 2.1569961489088576e-05, + "loss": 0.5739, + "step": 5606 + }, + { + "epoch": 7.197689345314505, + "grad_norm": 2.215747117996216, 
+ "learning_rate": 2.1573812580231063e-05, + "loss": 0.6114, + "step": 5607 + }, + { + "epoch": 7.198973042362002, + "grad_norm": 4.647279739379883, + "learning_rate": 2.1577663671373558e-05, + "loss": 0.5718, + "step": 5608 + }, + { + "epoch": 7.200256739409499, + "grad_norm": 3.3893353939056396, + "learning_rate": 2.158151476251605e-05, + "loss": 0.5952, + "step": 5609 + }, + { + "epoch": 7.201540436456996, + "grad_norm": 2.787783622741699, + "learning_rate": 2.1585365853658537e-05, + "loss": 0.5617, + "step": 5610 + }, + { + "epoch": 7.202824133504493, + "grad_norm": 2.7469701766967773, + "learning_rate": 2.1589216944801028e-05, + "loss": 0.5671, + "step": 5611 + }, + { + "epoch": 7.2041078305519894, + "grad_norm": 32.59522247314453, + "learning_rate": 2.159306803594352e-05, + "loss": 0.59, + "step": 5612 + }, + { + "epoch": 7.205391527599486, + "grad_norm": 3.2164793014526367, + "learning_rate": 2.1596919127086006e-05, + "loss": 0.6271, + "step": 5613 + }, + { + "epoch": 7.206675224646983, + "grad_norm": 2.778270721435547, + "learning_rate": 2.16007702182285e-05, + "loss": 0.6179, + "step": 5614 + }, + { + "epoch": 7.20795892169448, + "grad_norm": 3.086540937423706, + "learning_rate": 2.160462130937099e-05, + "loss": 0.5795, + "step": 5615 + }, + { + "epoch": 7.2092426187419765, + "grad_norm": 2.8150365352630615, + "learning_rate": 2.160847240051348e-05, + "loss": 0.5738, + "step": 5616 + }, + { + "epoch": 7.2105263157894735, + "grad_norm": 2.599649429321289, + "learning_rate": 2.161232349165597e-05, + "loss": 0.6202, + "step": 5617 + }, + { + "epoch": 7.21181001283697, + "grad_norm": 2.002566337585449, + "learning_rate": 2.1616174582798458e-05, + "loss": 0.602, + "step": 5618 + }, + { + "epoch": 7.213093709884467, + "grad_norm": 3.0190675258636475, + "learning_rate": 2.162002567394095e-05, + "loss": 0.6168, + "step": 5619 + }, + { + "epoch": 7.214377406931964, + "grad_norm": 4.084872245788574, + "learning_rate": 2.1623876765083444e-05, + "loss": 0.6442, + 
"step": 5620 + }, + { + "epoch": 7.2156611039794605, + "grad_norm": 2.9106171131134033, + "learning_rate": 2.162772785622593e-05, + "loss": 0.5833, + "step": 5621 + }, + { + "epoch": 7.2169448010269575, + "grad_norm": 5.093480587005615, + "learning_rate": 2.1631578947368423e-05, + "loss": 0.5759, + "step": 5622 + }, + { + "epoch": 7.218228498074454, + "grad_norm": 2.5821282863616943, + "learning_rate": 2.163543003851091e-05, + "loss": 0.5608, + "step": 5623 + }, + { + "epoch": 7.219512195121951, + "grad_norm": 3.5790555477142334, + "learning_rate": 2.16392811296534e-05, + "loss": 0.5863, + "step": 5624 + }, + { + "epoch": 7.220795892169448, + "grad_norm": 10.031783103942871, + "learning_rate": 2.1643132220795892e-05, + "loss": 0.6188, + "step": 5625 + }, + { + "epoch": 7.2220795892169445, + "grad_norm": 1.935137391090393, + "learning_rate": 2.1646983311938383e-05, + "loss": 0.5595, + "step": 5626 + }, + { + "epoch": 7.2233632862644415, + "grad_norm": 9.178366661071777, + "learning_rate": 2.1650834403080875e-05, + "loss": 0.6025, + "step": 5627 + }, + { + "epoch": 7.224646983311938, + "grad_norm": 3.558631658554077, + "learning_rate": 2.1654685494223366e-05, + "loss": 0.5939, + "step": 5628 + }, + { + "epoch": 7.225930680359435, + "grad_norm": 4.116735458374023, + "learning_rate": 2.1658536585365853e-05, + "loss": 0.6231, + "step": 5629 + }, + { + "epoch": 7.227214377406932, + "grad_norm": 6.534857749938965, + "learning_rate": 2.1662387676508344e-05, + "loss": 0.6384, + "step": 5630 + }, + { + "epoch": 7.2284980744544285, + "grad_norm": 3.3008837699890137, + "learning_rate": 2.1666238767650832e-05, + "loss": 0.6227, + "step": 5631 + }, + { + "epoch": 7.2297817715019255, + "grad_norm": 2.019623041152954, + "learning_rate": 2.1670089858793327e-05, + "loss": 0.6207, + "step": 5632 + }, + { + "epoch": 7.2310654685494224, + "grad_norm": 2.7561488151550293, + "learning_rate": 2.1673940949935818e-05, + "loss": 0.6104, + "step": 5633 + }, + { + "epoch": 7.232349165596919, + 
"grad_norm": 3.6391310691833496, + "learning_rate": 2.1677792041078305e-05, + "loss": 0.6223, + "step": 5634 + }, + { + "epoch": 7.2336328626444155, + "grad_norm": 7.499936103820801, + "learning_rate": 2.1681643132220796e-05, + "loss": 0.6167, + "step": 5635 + }, + { + "epoch": 7.2349165596919125, + "grad_norm": 12.788662910461426, + "learning_rate": 2.1685494223363287e-05, + "loss": 0.7001, + "step": 5636 + }, + { + "epoch": 7.2362002567394095, + "grad_norm": 9.797988891601562, + "learning_rate": 2.1689345314505775e-05, + "loss": 0.6563, + "step": 5637 + }, + { + "epoch": 7.2374839537869065, + "grad_norm": 4.024835109710693, + "learning_rate": 2.169319640564827e-05, + "loss": 0.6489, + "step": 5638 + }, + { + "epoch": 7.238767650834403, + "grad_norm": 8.531007766723633, + "learning_rate": 2.169704749679076e-05, + "loss": 0.6763, + "step": 5639 + }, + { + "epoch": 7.2400513478818995, + "grad_norm": 3.4455301761627197, + "learning_rate": 2.1700898587933248e-05, + "loss": 0.6919, + "step": 5640 + }, + { + "epoch": 7.2413350449293965, + "grad_norm": 8.436209678649902, + "learning_rate": 2.170474967907574e-05, + "loss": 0.6532, + "step": 5641 + }, + { + "epoch": 7.2426187419768935, + "grad_norm": 5.662521839141846, + "learning_rate": 2.1708600770218227e-05, + "loss": 0.6593, + "step": 5642 + }, + { + "epoch": 7.2439024390243905, + "grad_norm": 2.3775274753570557, + "learning_rate": 2.1712451861360718e-05, + "loss": 0.6327, + "step": 5643 + }, + { + "epoch": 7.245186136071887, + "grad_norm": 4.506122589111328, + "learning_rate": 2.171630295250321e-05, + "loss": 0.7288, + "step": 5644 + }, + { + "epoch": 7.2464698331193835, + "grad_norm": 6.690408706665039, + "learning_rate": 2.17201540436457e-05, + "loss": 0.691, + "step": 5645 + }, + { + "epoch": 7.2477535301668805, + "grad_norm": 15.112844467163086, + "learning_rate": 2.172400513478819e-05, + "loss": 0.742, + "step": 5646 + }, + { + "epoch": 7.2490372272143775, + "grad_norm": 6.3322272300720215, + "learning_rate": 
2.1727856225930682e-05, + "loss": 0.7134, + "step": 5647 + }, + { + "epoch": 7.2503209242618745, + "grad_norm": 2.9946749210357666, + "learning_rate": 2.173170731707317e-05, + "loss": 0.6946, + "step": 5648 + }, + { + "epoch": 7.251604621309371, + "grad_norm": 8.96556282043457, + "learning_rate": 2.173555840821566e-05, + "loss": 0.723, + "step": 5649 + }, + { + "epoch": 7.2528883183568675, + "grad_norm": 3.662977695465088, + "learning_rate": 2.173940949935815e-05, + "loss": 0.7257, + "step": 5650 + }, + { + "epoch": 7.2541720154043645, + "grad_norm": 5.802520275115967, + "learning_rate": 2.1743260590500643e-05, + "loss": 0.8151, + "step": 5651 + }, + { + "epoch": 7.2554557124518615, + "grad_norm": 2.79021954536438, + "learning_rate": 2.1747111681643134e-05, + "loss": 0.844, + "step": 5652 + }, + { + "epoch": 7.2567394094993585, + "grad_norm": 6.239144802093506, + "learning_rate": 2.1750962772785622e-05, + "loss": 0.8478, + "step": 5653 + }, + { + "epoch": 7.258023106546855, + "grad_norm": 2.900700569152832, + "learning_rate": 2.1754813863928113e-05, + "loss": 0.6144, + "step": 5654 + }, + { + "epoch": 7.2593068035943515, + "grad_norm": 3.5937726497650146, + "learning_rate": 2.1758664955070604e-05, + "loss": 0.5785, + "step": 5655 + }, + { + "epoch": 7.2605905006418485, + "grad_norm": 9.543038368225098, + "learning_rate": 2.1762516046213092e-05, + "loss": 0.6207, + "step": 5656 + }, + { + "epoch": 7.2618741976893455, + "grad_norm": 3.0433692932128906, + "learning_rate": 2.1766367137355586e-05, + "loss": 0.6443, + "step": 5657 + }, + { + "epoch": 7.2631578947368425, + "grad_norm": 2.0428690910339355, + "learning_rate": 2.1770218228498077e-05, + "loss": 0.599, + "step": 5658 + }, + { + "epoch": 7.264441591784339, + "grad_norm": 9.006595611572266, + "learning_rate": 2.1774069319640565e-05, + "loss": 0.5749, + "step": 5659 + }, + { + "epoch": 7.2657252888318355, + "grad_norm": 3.864701271057129, + "learning_rate": 2.1777920410783056e-05, + "loss": 0.5764, + "step": 5660 
+ }, + { + "epoch": 7.2670089858793325, + "grad_norm": 2.482210397720337, + "learning_rate": 2.1781771501925544e-05, + "loss": 0.5985, + "step": 5661 + }, + { + "epoch": 7.2682926829268295, + "grad_norm": 1.9216609001159668, + "learning_rate": 2.1785622593068035e-05, + "loss": 0.6289, + "step": 5662 + }, + { + "epoch": 7.2695763799743265, + "grad_norm": 8.199789047241211, + "learning_rate": 2.178947368421053e-05, + "loss": 0.565, + "step": 5663 + }, + { + "epoch": 7.270860077021823, + "grad_norm": 2.3866336345672607, + "learning_rate": 2.1793324775353017e-05, + "loss": 0.5803, + "step": 5664 + }, + { + "epoch": 7.2721437740693196, + "grad_norm": 5.926215171813965, + "learning_rate": 2.1797175866495508e-05, + "loss": 0.6512, + "step": 5665 + }, + { + "epoch": 7.2734274711168165, + "grad_norm": 4.15081787109375, + "learning_rate": 2.1801026957638e-05, + "loss": 0.5804, + "step": 5666 + }, + { + "epoch": 7.2747111681643135, + "grad_norm": 1.843810796737671, + "learning_rate": 2.1804878048780487e-05, + "loss": 0.6386, + "step": 5667 + }, + { + "epoch": 7.27599486521181, + "grad_norm": 1.969336748123169, + "learning_rate": 2.1808729139922978e-05, + "loss": 0.5736, + "step": 5668 + }, + { + "epoch": 7.277278562259307, + "grad_norm": 4.679925918579102, + "learning_rate": 2.181258023106547e-05, + "loss": 0.6006, + "step": 5669 + }, + { + "epoch": 7.278562259306804, + "grad_norm": 2.097011089324951, + "learning_rate": 2.181643132220796e-05, + "loss": 0.6104, + "step": 5670 + }, + { + "epoch": 7.2798459563543005, + "grad_norm": 2.9973678588867188, + "learning_rate": 2.182028241335045e-05, + "loss": 0.6077, + "step": 5671 + }, + { + "epoch": 7.2811296534017975, + "grad_norm": 4.717432975769043, + "learning_rate": 2.182413350449294e-05, + "loss": 0.6868, + "step": 5672 + }, + { + "epoch": 7.282413350449294, + "grad_norm": 26.250242233276367, + "learning_rate": 2.182798459563543e-05, + "loss": 0.6173, + "step": 5673 + }, + { + "epoch": 7.283697047496791, + "grad_norm": 
7.336633682250977, + "learning_rate": 2.183183568677792e-05, + "loss": 0.5967, + "step": 5674 + }, + { + "epoch": 7.284980744544288, + "grad_norm": 3.271622657775879, + "learning_rate": 2.1835686777920412e-05, + "loss": 0.6267, + "step": 5675 + }, + { + "epoch": 7.2862644415917845, + "grad_norm": 2.6077797412872314, + "learning_rate": 2.1839537869062903e-05, + "loss": 0.6036, + "step": 5676 + }, + { + "epoch": 7.2875481386392815, + "grad_norm": 2.0993459224700928, + "learning_rate": 2.184338896020539e-05, + "loss": 0.586, + "step": 5677 + }, + { + "epoch": 7.288831835686778, + "grad_norm": 7.803252220153809, + "learning_rate": 2.1847240051347882e-05, + "loss": 0.6312, + "step": 5678 + }, + { + "epoch": 7.290115532734275, + "grad_norm": 3.455928325653076, + "learning_rate": 2.1851091142490373e-05, + "loss": 0.612, + "step": 5679 + }, + { + "epoch": 7.291399229781772, + "grad_norm": 1.7921345233917236, + "learning_rate": 2.185494223363286e-05, + "loss": 0.5875, + "step": 5680 + }, + { + "epoch": 7.2926829268292686, + "grad_norm": 3.019160747528076, + "learning_rate": 2.1858793324775355e-05, + "loss": 0.6149, + "step": 5681 + }, + { + "epoch": 7.293966623876765, + "grad_norm": 2.591054916381836, + "learning_rate": 2.1862644415917846e-05, + "loss": 0.6422, + "step": 5682 + }, + { + "epoch": 7.295250320924262, + "grad_norm": 2.2283573150634766, + "learning_rate": 2.1866495507060334e-05, + "loss": 0.63, + "step": 5683 + }, + { + "epoch": 7.296534017971759, + "grad_norm": 8.00842571258545, + "learning_rate": 2.1870346598202825e-05, + "loss": 0.6154, + "step": 5684 + }, + { + "epoch": 7.297817715019256, + "grad_norm": 3.9633629322052, + "learning_rate": 2.1874197689345316e-05, + "loss": 0.6178, + "step": 5685 + }, + { + "epoch": 7.299101412066753, + "grad_norm": 2.8924129009246826, + "learning_rate": 2.1878048780487804e-05, + "loss": 0.5967, + "step": 5686 + }, + { + "epoch": 7.300385109114249, + "grad_norm": 4.240757465362549, + "learning_rate": 2.1881899871630298e-05, + 
"loss": 0.6523, + "step": 5687 + }, + { + "epoch": 7.301668806161746, + "grad_norm": 5.093058109283447, + "learning_rate": 2.1885750962772786e-05, + "loss": 0.6604, + "step": 5688 + }, + { + "epoch": 7.302952503209243, + "grad_norm": 1.9747329950332642, + "learning_rate": 2.1889602053915277e-05, + "loss": 0.6321, + "step": 5689 + }, + { + "epoch": 7.30423620025674, + "grad_norm": 2.0140345096588135, + "learning_rate": 2.1893453145057768e-05, + "loss": 0.5947, + "step": 5690 + }, + { + "epoch": 7.305519897304237, + "grad_norm": 4.312188625335693, + "learning_rate": 2.1897304236200256e-05, + "loss": 0.634, + "step": 5691 + }, + { + "epoch": 7.306803594351733, + "grad_norm": 5.9716386795043945, + "learning_rate": 2.1901155327342747e-05, + "loss": 0.6666, + "step": 5692 + }, + { + "epoch": 7.30808729139923, + "grad_norm": 2.7716622352600098, + "learning_rate": 2.1905006418485238e-05, + "loss": 0.6467, + "step": 5693 + }, + { + "epoch": 7.309370988446727, + "grad_norm": 4.597800254821777, + "learning_rate": 2.190885750962773e-05, + "loss": 0.6924, + "step": 5694 + }, + { + "epoch": 7.310654685494224, + "grad_norm": 8.458308219909668, + "learning_rate": 2.191270860077022e-05, + "loss": 0.7131, + "step": 5695 + }, + { + "epoch": 7.311938382541721, + "grad_norm": 4.727266311645508, + "learning_rate": 2.1916559691912708e-05, + "loss": 0.6486, + "step": 5696 + }, + { + "epoch": 7.313222079589217, + "grad_norm": 2.128171443939209, + "learning_rate": 2.19204107830552e-05, + "loss": 0.72, + "step": 5697 + }, + { + "epoch": 7.314505776636714, + "grad_norm": 4.467230796813965, + "learning_rate": 2.192426187419769e-05, + "loss": 0.6971, + "step": 5698 + }, + { + "epoch": 7.315789473684211, + "grad_norm": 2.7823233604431152, + "learning_rate": 2.1928112965340177e-05, + "loss": 0.7156, + "step": 5699 + }, + { + "epoch": 7.317073170731708, + "grad_norm": 2.89652943611145, + "learning_rate": 2.1931964056482672e-05, + "loss": 0.6771, + "step": 5700 + }, + { + "epoch": 
7.318356867779205, + "grad_norm": 2.2977826595306396, + "learning_rate": 2.1935815147625163e-05, + "loss": 0.7454, + "step": 5701 + }, + { + "epoch": 7.319640564826701, + "grad_norm": 9.921215057373047, + "learning_rate": 2.193966623876765e-05, + "loss": 0.7923, + "step": 5702 + }, + { + "epoch": 7.320924261874198, + "grad_norm": 7.778502464294434, + "learning_rate": 2.194351732991014e-05, + "loss": 0.8223, + "step": 5703 + }, + { + "epoch": 7.322207958921695, + "grad_norm": 4.927448272705078, + "learning_rate": 2.1947368421052633e-05, + "loss": 0.5819, + "step": 5704 + }, + { + "epoch": 7.323491655969192, + "grad_norm": 2.8371176719665527, + "learning_rate": 2.195121951219512e-05, + "loss": 0.575, + "step": 5705 + }, + { + "epoch": 7.324775353016688, + "grad_norm": 1.9132519960403442, + "learning_rate": 2.1955070603337615e-05, + "loss": 0.5681, + "step": 5706 + }, + { + "epoch": 7.326059050064185, + "grad_norm": 3.060948133468628, + "learning_rate": 2.1958921694480103e-05, + "loss": 0.5613, + "step": 5707 + }, + { + "epoch": 7.327342747111682, + "grad_norm": 6.193986892700195, + "learning_rate": 2.1962772785622594e-05, + "loss": 0.6984, + "step": 5708 + }, + { + "epoch": 7.328626444159179, + "grad_norm": 1.7099876403808594, + "learning_rate": 2.1966623876765085e-05, + "loss": 0.5855, + "step": 5709 + }, + { + "epoch": 7.329910141206676, + "grad_norm": 3.1753811836242676, + "learning_rate": 2.1970474967907572e-05, + "loss": 0.632, + "step": 5710 + }, + { + "epoch": 7.331193838254172, + "grad_norm": 5.755321025848389, + "learning_rate": 2.1974326059050063e-05, + "loss": 0.5645, + "step": 5711 + }, + { + "epoch": 7.332477535301669, + "grad_norm": 4.40062141418457, + "learning_rate": 2.1978177150192558e-05, + "loss": 0.5855, + "step": 5712 + }, + { + "epoch": 7.333761232349166, + "grad_norm": 2.760928153991699, + "learning_rate": 2.1982028241335046e-05, + "loss": 0.5792, + "step": 5713 + }, + { + "epoch": 7.335044929396663, + "grad_norm": 2.892646312713623, + 
"learning_rate": 2.1985879332477537e-05, + "loss": 0.6213, + "step": 5714 + }, + { + "epoch": 7.336328626444159, + "grad_norm": 9.106376647949219, + "learning_rate": 2.1989730423620024e-05, + "loss": 0.6243, + "step": 5715 + }, + { + "epoch": 7.337612323491656, + "grad_norm": 9.275087356567383, + "learning_rate": 2.1993581514762515e-05, + "loss": 0.6129, + "step": 5716 + }, + { + "epoch": 7.338896020539153, + "grad_norm": 2.5229251384735107, + "learning_rate": 2.1997432605905007e-05, + "loss": 0.636, + "step": 5717 + }, + { + "epoch": 7.34017971758665, + "grad_norm": 3.000560998916626, + "learning_rate": 2.2001283697047498e-05, + "loss": 0.5865, + "step": 5718 + }, + { + "epoch": 7.341463414634147, + "grad_norm": 3.825441598892212, + "learning_rate": 2.200513478818999e-05, + "loss": 0.6368, + "step": 5719 + }, + { + "epoch": 7.342747111681643, + "grad_norm": 1.959752082824707, + "learning_rate": 2.200898587933248e-05, + "loss": 0.584, + "step": 5720 + }, + { + "epoch": 7.34403080872914, + "grad_norm": 8.850117683410645, + "learning_rate": 2.2012836970474967e-05, + "loss": 0.5872, + "step": 5721 + }, + { + "epoch": 7.345314505776637, + "grad_norm": 3.6745638847351074, + "learning_rate": 2.201668806161746e-05, + "loss": 0.6443, + "step": 5722 + }, + { + "epoch": 7.346598202824134, + "grad_norm": 1.7393637895584106, + "learning_rate": 2.2020539152759946e-05, + "loss": 0.5801, + "step": 5723 + }, + { + "epoch": 7.347881899871631, + "grad_norm": 3.5838162899017334, + "learning_rate": 2.202439024390244e-05, + "loss": 0.622, + "step": 5724 + }, + { + "epoch": 7.349165596919127, + "grad_norm": 6.1770853996276855, + "learning_rate": 2.202824133504493e-05, + "loss": 0.599, + "step": 5725 + }, + { + "epoch": 7.350449293966624, + "grad_norm": 3.8889081478118896, + "learning_rate": 2.203209242618742e-05, + "loss": 0.613, + "step": 5726 + }, + { + "epoch": 7.351732991014121, + "grad_norm": 6.705533027648926, + "learning_rate": 2.203594351732991e-05, + "loss": 0.5873, + "step": 
5727 + }, + { + "epoch": 7.353016688061618, + "grad_norm": 2.5724334716796875, + "learning_rate": 2.20397946084724e-05, + "loss": 0.6466, + "step": 5728 + }, + { + "epoch": 7.354300385109115, + "grad_norm": 3.892472267150879, + "learning_rate": 2.204364569961489e-05, + "loss": 0.6275, + "step": 5729 + }, + { + "epoch": 7.355584082156611, + "grad_norm": 5.4291911125183105, + "learning_rate": 2.2047496790757384e-05, + "loss": 0.5991, + "step": 5730 + }, + { + "epoch": 7.356867779204108, + "grad_norm": 2.6923186779022217, + "learning_rate": 2.2051347881899875e-05, + "loss": 0.611, + "step": 5731 + }, + { + "epoch": 7.358151476251605, + "grad_norm": 5.222226142883301, + "learning_rate": 2.2055198973042362e-05, + "loss": 0.6007, + "step": 5732 + }, + { + "epoch": 7.359435173299102, + "grad_norm": 2.0659570693969727, + "learning_rate": 2.2059050064184853e-05, + "loss": 0.6142, + "step": 5733 + }, + { + "epoch": 7.360718870346599, + "grad_norm": 3.955723285675049, + "learning_rate": 2.206290115532734e-05, + "loss": 0.6794, + "step": 5734 + }, + { + "epoch": 7.362002567394095, + "grad_norm": 4.3831682205200195, + "learning_rate": 2.2066752246469832e-05, + "loss": 0.6075, + "step": 5735 + }, + { + "epoch": 7.363286264441592, + "grad_norm": 4.706425666809082, + "learning_rate": 2.2070603337612327e-05, + "loss": 0.5814, + "step": 5736 + }, + { + "epoch": 7.364569961489089, + "grad_norm": 2.8558764457702637, + "learning_rate": 2.2074454428754814e-05, + "loss": 0.5709, + "step": 5737 + }, + { + "epoch": 7.365853658536586, + "grad_norm": 3.2185842990875244, + "learning_rate": 2.2078305519897305e-05, + "loss": 0.5943, + "step": 5738 + }, + { + "epoch": 7.367137355584082, + "grad_norm": 5.457143783569336, + "learning_rate": 2.2082156611039797e-05, + "loss": 0.5994, + "step": 5739 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 3.0876660346984863, + "learning_rate": 2.2086007702182284e-05, + "loss": 0.6461, + "step": 5740 + }, + { + "epoch": 7.369704749679076, + "grad_norm": 
175.43115234375, + "learning_rate": 2.2089858793324775e-05, + "loss": 0.6538, + "step": 5741 + }, + { + "epoch": 7.370988446726573, + "grad_norm": 2.832580327987671, + "learning_rate": 2.2093709884467266e-05, + "loss": 0.6721, + "step": 5742 + }, + { + "epoch": 7.37227214377407, + "grad_norm": 10.155699729919434, + "learning_rate": 2.2097560975609757e-05, + "loss": 0.6155, + "step": 5743 + }, + { + "epoch": 7.373555840821566, + "grad_norm": 36.694984436035156, + "learning_rate": 2.210141206675225e-05, + "loss": 0.6085, + "step": 5744 + }, + { + "epoch": 7.374839537869063, + "grad_norm": 3.6331722736358643, + "learning_rate": 2.2105263157894736e-05, + "loss": 0.6121, + "step": 5745 + }, + { + "epoch": 7.37612323491656, + "grad_norm": 2.632737636566162, + "learning_rate": 2.2109114249037227e-05, + "loss": 0.6648, + "step": 5746 + }, + { + "epoch": 7.377406931964057, + "grad_norm": 2.3814656734466553, + "learning_rate": 2.2112965340179718e-05, + "loss": 0.636, + "step": 5747 + }, + { + "epoch": 7.378690629011553, + "grad_norm": 10.53480052947998, + "learning_rate": 2.2116816431322206e-05, + "loss": 0.6907, + "step": 5748 + }, + { + "epoch": 7.37997432605905, + "grad_norm": 5.642154216766357, + "learning_rate": 2.21206675224647e-05, + "loss": 0.7486, + "step": 5749 + }, + { + "epoch": 7.381258023106547, + "grad_norm": 2.1912081241607666, + "learning_rate": 2.212451861360719e-05, + "loss": 0.7138, + "step": 5750 + }, + { + "epoch": 7.382541720154044, + "grad_norm": 4.459690093994141, + "learning_rate": 2.212836970474968e-05, + "loss": 0.7539, + "step": 5751 + }, + { + "epoch": 7.383825417201541, + "grad_norm": 4.99705171585083, + "learning_rate": 2.213222079589217e-05, + "loss": 0.8397, + "step": 5752 + }, + { + "epoch": 7.385109114249037, + "grad_norm": 5.8904924392700195, + "learning_rate": 2.2136071887034658e-05, + "loss": 0.8488, + "step": 5753 + }, + { + "epoch": 7.386392811296534, + "grad_norm": 3.609316349029541, + "learning_rate": 2.213992297817715e-05, + 
"loss": 0.5803, + "step": 5754 + }, + { + "epoch": 7.387676508344031, + "grad_norm": 2.204807758331299, + "learning_rate": 2.2143774069319643e-05, + "loss": 0.6302, + "step": 5755 + }, + { + "epoch": 7.388960205391528, + "grad_norm": 2.869009017944336, + "learning_rate": 2.214762516046213e-05, + "loss": 0.5762, + "step": 5756 + }, + { + "epoch": 7.390243902439025, + "grad_norm": 5.509469509124756, + "learning_rate": 2.2151476251604622e-05, + "loss": 0.5765, + "step": 5757 + }, + { + "epoch": 7.391527599486521, + "grad_norm": 7.787851333618164, + "learning_rate": 2.2155327342747113e-05, + "loss": 0.6219, + "step": 5758 + }, + { + "epoch": 7.392811296534018, + "grad_norm": 2.037919282913208, + "learning_rate": 2.21591784338896e-05, + "loss": 0.5524, + "step": 5759 + }, + { + "epoch": 7.394094993581515, + "grad_norm": 2.841897964477539, + "learning_rate": 2.2163029525032092e-05, + "loss": 0.5738, + "step": 5760 + }, + { + "epoch": 7.395378690629012, + "grad_norm": 4.246678829193115, + "learning_rate": 2.2166880616174583e-05, + "loss": 0.5977, + "step": 5761 + }, + { + "epoch": 7.396662387676509, + "grad_norm": 5.22859001159668, + "learning_rate": 2.2170731707317074e-05, + "loss": 0.5549, + "step": 5762 + }, + { + "epoch": 7.397946084724005, + "grad_norm": 2.7665047645568848, + "learning_rate": 2.2174582798459565e-05, + "loss": 0.6518, + "step": 5763 + }, + { + "epoch": 7.399229781771502, + "grad_norm": 6.749922752380371, + "learning_rate": 2.2178433889602053e-05, + "loss": 0.6211, + "step": 5764 + }, + { + "epoch": 7.400513478818999, + "grad_norm": 6.3452019691467285, + "learning_rate": 2.2182284980744544e-05, + "loss": 0.6032, + "step": 5765 + }, + { + "epoch": 7.401797175866496, + "grad_norm": 2.138528347015381, + "learning_rate": 2.2186136071887035e-05, + "loss": 0.6333, + "step": 5766 + }, + { + "epoch": 7.403080872913993, + "grad_norm": 4.923240661621094, + "learning_rate": 2.2189987163029526e-05, + "loss": 0.5832, + "step": 5767 + }, + { + "epoch": 
7.404364569961489, + "grad_norm": 2.95695424079895, + "learning_rate": 2.2193838254172017e-05, + "loss": 0.6086, + "step": 5768 + }, + { + "epoch": 7.405648267008986, + "grad_norm": 3.1013684272766113, + "learning_rate": 2.2197689345314508e-05, + "loss": 0.5775, + "step": 5769 + }, + { + "epoch": 7.406931964056483, + "grad_norm": 5.017786502838135, + "learning_rate": 2.2201540436456996e-05, + "loss": 0.5926, + "step": 5770 + }, + { + "epoch": 7.40821566110398, + "grad_norm": 1.9685002565383911, + "learning_rate": 2.2205391527599487e-05, + "loss": 0.6013, + "step": 5771 + }, + { + "epoch": 7.409499358151476, + "grad_norm": 2.7806270122528076, + "learning_rate": 2.2209242618741975e-05, + "loss": 0.6228, + "step": 5772 + }, + { + "epoch": 7.410783055198973, + "grad_norm": 2.263183355331421, + "learning_rate": 2.221309370988447e-05, + "loss": 0.6181, + "step": 5773 + }, + { + "epoch": 7.41206675224647, + "grad_norm": 3.206589937210083, + "learning_rate": 2.221694480102696e-05, + "loss": 0.5814, + "step": 5774 + }, + { + "epoch": 7.413350449293967, + "grad_norm": 6.438682556152344, + "learning_rate": 2.2220795892169448e-05, + "loss": 0.5915, + "step": 5775 + }, + { + "epoch": 7.414634146341464, + "grad_norm": 1.6950314044952393, + "learning_rate": 2.222464698331194e-05, + "loss": 0.6053, + "step": 5776 + }, + { + "epoch": 7.41591784338896, + "grad_norm": 3.328744649887085, + "learning_rate": 2.222849807445443e-05, + "loss": 0.6128, + "step": 5777 + }, + { + "epoch": 7.417201540436457, + "grad_norm": 2.3106067180633545, + "learning_rate": 2.2232349165596918e-05, + "loss": 0.6459, + "step": 5778 + }, + { + "epoch": 7.418485237483954, + "grad_norm": 2.4076013565063477, + "learning_rate": 2.2236200256739412e-05, + "loss": 0.6066, + "step": 5779 + }, + { + "epoch": 7.419768934531451, + "grad_norm": 2.455970287322998, + "learning_rate": 2.22400513478819e-05, + "loss": 0.6252, + "step": 5780 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 2.6941745281219482, + 
"learning_rate": 2.224390243902439e-05, + "loss": 0.5596, + "step": 5781 + }, + { + "epoch": 7.422336328626444, + "grad_norm": 5.416585922241211, + "learning_rate": 2.2247753530166882e-05, + "loss": 0.6245, + "step": 5782 + }, + { + "epoch": 7.423620025673941, + "grad_norm": 3.0157055854797363, + "learning_rate": 2.225160462130937e-05, + "loss": 0.5947, + "step": 5783 + }, + { + "epoch": 7.424903722721438, + "grad_norm": 8.572805404663086, + "learning_rate": 2.225545571245186e-05, + "loss": 0.6306, + "step": 5784 + }, + { + "epoch": 7.426187419768935, + "grad_norm": 18.452438354492188, + "learning_rate": 2.2259306803594355e-05, + "loss": 0.5993, + "step": 5785 + }, + { + "epoch": 7.427471116816431, + "grad_norm": 7.0974531173706055, + "learning_rate": 2.2263157894736843e-05, + "loss": 0.5903, + "step": 5786 + }, + { + "epoch": 7.428754813863928, + "grad_norm": 5.510228633880615, + "learning_rate": 2.2267008985879334e-05, + "loss": 0.7015, + "step": 5787 + }, + { + "epoch": 7.430038510911425, + "grad_norm": 3.6673781871795654, + "learning_rate": 2.227086007702182e-05, + "loss": 0.7049, + "step": 5788 + }, + { + "epoch": 7.431322207958922, + "grad_norm": 4.641116142272949, + "learning_rate": 2.2274711168164313e-05, + "loss": 0.6343, + "step": 5789 + }, + { + "epoch": 7.432605905006419, + "grad_norm": 2.590665102005005, + "learning_rate": 2.2278562259306804e-05, + "loss": 0.6548, + "step": 5790 + }, + { + "epoch": 7.433889602053915, + "grad_norm": 7.49672794342041, + "learning_rate": 2.2282413350449295e-05, + "loss": 0.598, + "step": 5791 + }, + { + "epoch": 7.435173299101412, + "grad_norm": 6.599700927734375, + "learning_rate": 2.2286264441591786e-05, + "loss": 0.7364, + "step": 5792 + }, + { + "epoch": 7.436456996148909, + "grad_norm": 2.0940210819244385, + "learning_rate": 2.2290115532734277e-05, + "loss": 0.6322, + "step": 5793 + }, + { + "epoch": 7.437740693196406, + "grad_norm": 4.072161674499512, + "learning_rate": 2.2293966623876765e-05, + "loss": 0.7191, + 
"step": 5794 + }, + { + "epoch": 7.439024390243903, + "grad_norm": 2.9484469890594482, + "learning_rate": 2.2297817715019256e-05, + "loss": 0.6995, + "step": 5795 + }, + { + "epoch": 7.440308087291399, + "grad_norm": 1.888145923614502, + "learning_rate": 2.2301668806161747e-05, + "loss": 0.6491, + "step": 5796 + }, + { + "epoch": 7.441591784338896, + "grad_norm": 9.261605262756348, + "learning_rate": 2.2305519897304238e-05, + "loss": 0.7006, + "step": 5797 + }, + { + "epoch": 7.442875481386393, + "grad_norm": 2.329960346221924, + "learning_rate": 2.230937098844673e-05, + "loss": 0.73, + "step": 5798 + }, + { + "epoch": 7.44415917843389, + "grad_norm": 2.5143330097198486, + "learning_rate": 2.2313222079589217e-05, + "loss": 0.6616, + "step": 5799 + }, + { + "epoch": 7.445442875481387, + "grad_norm": 5.028128147125244, + "learning_rate": 2.2317073170731708e-05, + "loss": 0.6892, + "step": 5800 + }, + { + "epoch": 7.446726572528883, + "grad_norm": 4.049410820007324, + "learning_rate": 2.23209242618742e-05, + "loss": 0.7493, + "step": 5801 + }, + { + "epoch": 7.44801026957638, + "grad_norm": 2.9089365005493164, + "learning_rate": 2.2324775353016687e-05, + "loss": 0.7965, + "step": 5802 + }, + { + "epoch": 7.449293966623877, + "grad_norm": 4.8237481117248535, + "learning_rate": 2.2328626444159178e-05, + "loss": 0.9493, + "step": 5803 + }, + { + "epoch": 7.450577663671374, + "grad_norm": 2.9070260524749756, + "learning_rate": 2.2332477535301672e-05, + "loss": 0.5917, + "step": 5804 + }, + { + "epoch": 7.45186136071887, + "grad_norm": 3.6599457263946533, + "learning_rate": 2.233632862644416e-05, + "loss": 0.5723, + "step": 5805 + }, + { + "epoch": 7.453145057766367, + "grad_norm": 1.1837693452835083, + "learning_rate": 2.234017971758665e-05, + "loss": 0.6118, + "step": 5806 + }, + { + "epoch": 7.454428754813864, + "grad_norm": 3.781283378601074, + "learning_rate": 2.234403080872914e-05, + "loss": 0.5871, + "step": 5807 + }, + { + "epoch": 7.455712451861361, + "grad_norm": 
2.258615016937256, + "learning_rate": 2.234788189987163e-05, + "loss": 0.5987, + "step": 5808 + }, + { + "epoch": 7.456996148908858, + "grad_norm": 2.034445285797119, + "learning_rate": 2.235173299101412e-05, + "loss": 0.6108, + "step": 5809 + }, + { + "epoch": 7.458279845956354, + "grad_norm": 2.401388645172119, + "learning_rate": 2.235558408215661e-05, + "loss": 0.6292, + "step": 5810 + }, + { + "epoch": 7.459563543003851, + "grad_norm": 6.300665378570557, + "learning_rate": 2.2359435173299103e-05, + "loss": 0.5879, + "step": 5811 + }, + { + "epoch": 7.460847240051348, + "grad_norm": 1.8537877798080444, + "learning_rate": 2.2363286264441594e-05, + "loss": 0.5933, + "step": 5812 + }, + { + "epoch": 7.462130937098845, + "grad_norm": 1.3455747365951538, + "learning_rate": 2.236713735558408e-05, + "loss": 0.6021, + "step": 5813 + }, + { + "epoch": 7.463414634146342, + "grad_norm": 1.4976146221160889, + "learning_rate": 2.2370988446726573e-05, + "loss": 0.5771, + "step": 5814 + }, + { + "epoch": 7.464698331193838, + "grad_norm": 3.78851318359375, + "learning_rate": 2.2374839537869064e-05, + "loss": 0.6037, + "step": 5815 + }, + { + "epoch": 7.465982028241335, + "grad_norm": 2.11777400970459, + "learning_rate": 2.2378690629011555e-05, + "loss": 0.6073, + "step": 5816 + }, + { + "epoch": 7.467265725288832, + "grad_norm": 12.964131355285645, + "learning_rate": 2.2382541720154046e-05, + "loss": 0.6049, + "step": 5817 + }, + { + "epoch": 7.468549422336329, + "grad_norm": 2.0319156646728516, + "learning_rate": 2.2386392811296533e-05, + "loss": 0.6049, + "step": 5818 + }, + { + "epoch": 7.469833119383825, + "grad_norm": 4.620457172393799, + "learning_rate": 2.2390243902439025e-05, + "loss": 0.6335, + "step": 5819 + }, + { + "epoch": 7.471116816431322, + "grad_norm": 3.8482649326324463, + "learning_rate": 2.2394094993581516e-05, + "loss": 0.5978, + "step": 5820 + }, + { + "epoch": 7.472400513478819, + "grad_norm": 1.945254921913147, + "learning_rate": 2.2397946084724003e-05, 
+ "loss": 0.5702, + "step": 5821 + }, + { + "epoch": 7.473684210526316, + "grad_norm": 4.3837761878967285, + "learning_rate": 2.2401797175866498e-05, + "loss": 0.5876, + "step": 5822 + }, + { + "epoch": 7.474967907573813, + "grad_norm": 1.4271798133850098, + "learning_rate": 2.240564826700899e-05, + "loss": 0.561, + "step": 5823 + }, + { + "epoch": 7.476251604621309, + "grad_norm": 3.8762762546539307, + "learning_rate": 2.2409499358151477e-05, + "loss": 0.6118, + "step": 5824 + }, + { + "epoch": 7.477535301668806, + "grad_norm": 6.5554022789001465, + "learning_rate": 2.2413350449293968e-05, + "loss": 0.5754, + "step": 5825 + }, + { + "epoch": 7.478818998716303, + "grad_norm": 9.877501487731934, + "learning_rate": 2.2417201540436455e-05, + "loss": 0.6163, + "step": 5826 + }, + { + "epoch": 7.4801026957638, + "grad_norm": 3.950389862060547, + "learning_rate": 2.2421052631578946e-05, + "loss": 0.6226, + "step": 5827 + }, + { + "epoch": 7.481386392811297, + "grad_norm": 1.6178779602050781, + "learning_rate": 2.242490372272144e-05, + "loss": 0.5797, + "step": 5828 + }, + { + "epoch": 7.482670089858793, + "grad_norm": 2.8012213706970215, + "learning_rate": 2.242875481386393e-05, + "loss": 0.6079, + "step": 5829 + }, + { + "epoch": 7.48395378690629, + "grad_norm": 6.274683475494385, + "learning_rate": 2.243260590500642e-05, + "loss": 0.5762, + "step": 5830 + }, + { + "epoch": 7.485237483953787, + "grad_norm": 6.0280609130859375, + "learning_rate": 2.243645699614891e-05, + "loss": 0.6174, + "step": 5831 + }, + { + "epoch": 7.486521181001284, + "grad_norm": 2.2895758152008057, + "learning_rate": 2.2440308087291398e-05, + "loss": 0.6548, + "step": 5832 + }, + { + "epoch": 7.487804878048781, + "grad_norm": 4.008669853210449, + "learning_rate": 2.244415917843389e-05, + "loss": 0.6415, + "step": 5833 + }, + { + "epoch": 7.489088575096277, + "grad_norm": 2.784395217895508, + "learning_rate": 2.244801026957638e-05, + "loss": 0.6297, + "step": 5834 + }, + { + "epoch": 
7.490372272143774, + "grad_norm": 2.4784677028656006, + "learning_rate": 2.245186136071887e-05, + "loss": 0.5621, + "step": 5835 + }, + { + "epoch": 7.491655969191271, + "grad_norm": 2.2580831050872803, + "learning_rate": 2.2455712451861363e-05, + "loss": 0.6152, + "step": 5836 + }, + { + "epoch": 7.492939666238768, + "grad_norm": 2.685493230819702, + "learning_rate": 2.245956354300385e-05, + "loss": 0.6596, + "step": 5837 + }, + { + "epoch": 7.494223363286264, + "grad_norm": 1.8990375995635986, + "learning_rate": 2.246341463414634e-05, + "loss": 0.6671, + "step": 5838 + }, + { + "epoch": 7.495507060333761, + "grad_norm": 1.7628669738769531, + "learning_rate": 2.2467265725288832e-05, + "loss": 0.6055, + "step": 5839 + }, + { + "epoch": 7.496790757381258, + "grad_norm": 2.285449504852295, + "learning_rate": 2.2471116816431323e-05, + "loss": 0.638, + "step": 5840 + }, + { + "epoch": 7.498074454428755, + "grad_norm": 2.0850188732147217, + "learning_rate": 2.2474967907573815e-05, + "loss": 0.6573, + "step": 5841 + }, + { + "epoch": 7.499358151476252, + "grad_norm": 7.807222843170166, + "learning_rate": 2.2478818998716306e-05, + "loss": 0.6553, + "step": 5842 + }, + { + "epoch": 7.500641848523748, + "grad_norm": 5.597051620483398, + "learning_rate": 2.2482670089858793e-05, + "loss": 0.7006, + "step": 5843 + }, + { + "epoch": 7.501925545571245, + "grad_norm": 2.918463706970215, + "learning_rate": 2.2486521181001284e-05, + "loss": 0.6519, + "step": 5844 + }, + { + "epoch": 7.503209242618742, + "grad_norm": 10.420697212219238, + "learning_rate": 2.2490372272143772e-05, + "loss": 0.7022, + "step": 5845 + }, + { + "epoch": 7.504492939666239, + "grad_norm": 15.246454238891602, + "learning_rate": 2.2494223363286267e-05, + "loss": 0.6829, + "step": 5846 + }, + { + "epoch": 7.505776636713735, + "grad_norm": 6.122875213623047, + "learning_rate": 2.2498074454428758e-05, + "loss": 0.6883, + "step": 5847 + }, + { + "epoch": 7.507060333761232, + "grad_norm": 4.509777069091797, + 
"learning_rate": 2.2501925545571245e-05, + "loss": 0.7532, + "step": 5848 + }, + { + "epoch": 7.508344030808729, + "grad_norm": 3.26213002204895, + "learning_rate": 2.2505776636713736e-05, + "loss": 0.7301, + "step": 5849 + }, + { + "epoch": 7.509627727856226, + "grad_norm": 3.0910558700561523, + "learning_rate": 2.2509627727856227e-05, + "loss": 0.771, + "step": 5850 + }, + { + "epoch": 7.510911424903723, + "grad_norm": 4.804920673370361, + "learning_rate": 2.2513478818998715e-05, + "loss": 0.7729, + "step": 5851 + }, + { + "epoch": 7.512195121951219, + "grad_norm": 3.638467311859131, + "learning_rate": 2.2517329910141206e-05, + "loss": 0.9034, + "step": 5852 + }, + { + "epoch": 7.513478818998716, + "grad_norm": 3.119279623031616, + "learning_rate": 2.2521181001283697e-05, + "loss": 0.9053, + "step": 5853 + }, + { + "epoch": 7.514762516046213, + "grad_norm": 2.436508893966675, + "learning_rate": 2.2525032092426188e-05, + "loss": 0.6244, + "step": 5854 + }, + { + "epoch": 7.51604621309371, + "grad_norm": 1.5290192365646362, + "learning_rate": 2.252888318356868e-05, + "loss": 0.5826, + "step": 5855 + }, + { + "epoch": 7.517329910141207, + "grad_norm": 2.871331214904785, + "learning_rate": 2.2532734274711167e-05, + "loss": 0.5968, + "step": 5856 + }, + { + "epoch": 7.518613607188703, + "grad_norm": 2.7752859592437744, + "learning_rate": 2.2536585365853658e-05, + "loss": 0.5983, + "step": 5857 + }, + { + "epoch": 7.5198973042362, + "grad_norm": 1.332186222076416, + "learning_rate": 2.254043645699615e-05, + "loss": 0.6067, + "step": 5858 + }, + { + "epoch": 7.521181001283697, + "grad_norm": 4.508073806762695, + "learning_rate": 2.254428754813864e-05, + "loss": 0.5799, + "step": 5859 + }, + { + "epoch": 7.522464698331194, + "grad_norm": 2.275073528289795, + "learning_rate": 2.254813863928113e-05, + "loss": 0.5772, + "step": 5860 + }, + { + "epoch": 7.523748395378691, + "grad_norm": 1.4240301847457886, + "learning_rate": 2.2551989730423622e-05, + "loss": 0.5983, + 
"step": 5861 + }, + { + "epoch": 7.525032092426187, + "grad_norm": 1.9746613502502441, + "learning_rate": 2.255584082156611e-05, + "loss": 0.5576, + "step": 5862 + }, + { + "epoch": 7.526315789473684, + "grad_norm": 2.092801570892334, + "learning_rate": 2.25596919127086e-05, + "loss": 0.5994, + "step": 5863 + }, + { + "epoch": 7.527599486521181, + "grad_norm": 5.472644329071045, + "learning_rate": 2.256354300385109e-05, + "loss": 0.6091, + "step": 5864 + }, + { + "epoch": 7.528883183568678, + "grad_norm": 3.878918170928955, + "learning_rate": 2.2567394094993583e-05, + "loss": 0.6469, + "step": 5865 + }, + { + "epoch": 7.530166880616175, + "grad_norm": 3.0970308780670166, + "learning_rate": 2.2571245186136074e-05, + "loss": 0.6215, + "step": 5866 + }, + { + "epoch": 7.531450577663671, + "grad_norm": 2.241844654083252, + "learning_rate": 2.2575096277278562e-05, + "loss": 0.5695, + "step": 5867 + }, + { + "epoch": 7.532734274711168, + "grad_norm": 3.846773386001587, + "learning_rate": 2.2578947368421053e-05, + "loss": 0.6482, + "step": 5868 + }, + { + "epoch": 7.534017971758665, + "grad_norm": 2.7898073196411133, + "learning_rate": 2.2582798459563544e-05, + "loss": 0.6001, + "step": 5869 + }, + { + "epoch": 7.535301668806162, + "grad_norm": 3.616295576095581, + "learning_rate": 2.2586649550706032e-05, + "loss": 0.5326, + "step": 5870 + }, + { + "epoch": 7.536585365853659, + "grad_norm": 11.57154655456543, + "learning_rate": 2.2590500641848526e-05, + "loss": 0.5501, + "step": 5871 + }, + { + "epoch": 7.537869062901155, + "grad_norm": 3.002262830734253, + "learning_rate": 2.2594351732991014e-05, + "loss": 0.612, + "step": 5872 + }, + { + "epoch": 7.539152759948652, + "grad_norm": 2.2287204265594482, + "learning_rate": 2.2598202824133505e-05, + "loss": 0.5583, + "step": 5873 + }, + { + "epoch": 7.540436456996149, + "grad_norm": 3.3357834815979004, + "learning_rate": 2.2602053915275996e-05, + "loss": 0.5851, + "step": 5874 + }, + { + "epoch": 7.541720154043646, + 
"grad_norm": 2.753173589706421, + "learning_rate": 2.2605905006418484e-05, + "loss": 0.584, + "step": 5875 + }, + { + "epoch": 7.543003851091142, + "grad_norm": 4.067176342010498, + "learning_rate": 2.2609756097560975e-05, + "loss": 0.594, + "step": 5876 + }, + { + "epoch": 7.544287548138639, + "grad_norm": 3.4991343021392822, + "learning_rate": 2.261360718870347e-05, + "loss": 0.5739, + "step": 5877 + }, + { + "epoch": 7.545571245186136, + "grad_norm": 2.8085551261901855, + "learning_rate": 2.2617458279845957e-05, + "loss": 0.609, + "step": 5878 + }, + { + "epoch": 7.546854942233633, + "grad_norm": 1.698873519897461, + "learning_rate": 2.2621309370988448e-05, + "loss": 0.6061, + "step": 5879 + }, + { + "epoch": 7.548138639281129, + "grad_norm": 3.3067498207092285, + "learning_rate": 2.2625160462130936e-05, + "loss": 0.5852, + "step": 5880 + }, + { + "epoch": 7.549422336328626, + "grad_norm": 4.873530864715576, + "learning_rate": 2.2629011553273427e-05, + "loss": 0.5823, + "step": 5881 + }, + { + "epoch": 7.550706033376123, + "grad_norm": 3.255621910095215, + "learning_rate": 2.2632862644415918e-05, + "loss": 0.631, + "step": 5882 + }, + { + "epoch": 7.55198973042362, + "grad_norm": 4.407940864562988, + "learning_rate": 2.263671373555841e-05, + "loss": 0.6491, + "step": 5883 + }, + { + "epoch": 7.553273427471117, + "grad_norm": 2.192495822906494, + "learning_rate": 2.26405648267009e-05, + "loss": 0.595, + "step": 5884 + }, + { + "epoch": 7.554557124518613, + "grad_norm": 4.975985050201416, + "learning_rate": 2.264441591784339e-05, + "loss": 0.5894, + "step": 5885 + }, + { + "epoch": 7.55584082156611, + "grad_norm": 2.2531485557556152, + "learning_rate": 2.264826700898588e-05, + "loss": 0.6006, + "step": 5886 + }, + { + "epoch": 7.557124518613607, + "grad_norm": 1.3841729164123535, + "learning_rate": 2.265211810012837e-05, + "loss": 0.6375, + "step": 5887 + }, + { + "epoch": 7.558408215661104, + "grad_norm": 3.3774526119232178, + "learning_rate": 
2.265596919127086e-05, + "loss": 0.6549, + "step": 5888 + }, + { + "epoch": 7.559691912708601, + "grad_norm": 1.7521060705184937, + "learning_rate": 2.2659820282413352e-05, + "loss": 0.6462, + "step": 5889 + }, + { + "epoch": 7.560975609756097, + "grad_norm": 2.862417697906494, + "learning_rate": 2.2663671373555843e-05, + "loss": 0.6289, + "step": 5890 + }, + { + "epoch": 7.562259306803594, + "grad_norm": 2.877609968185425, + "learning_rate": 2.266752246469833e-05, + "loss": 0.6548, + "step": 5891 + }, + { + "epoch": 7.563543003851091, + "grad_norm": 2.676401376724243, + "learning_rate": 2.2671373555840822e-05, + "loss": 0.5948, + "step": 5892 + }, + { + "epoch": 7.564826700898588, + "grad_norm": 5.367919921875, + "learning_rate": 2.2675224646983313e-05, + "loss": 0.6994, + "step": 5893 + }, + { + "epoch": 7.566110397946085, + "grad_norm": 6.054384708404541, + "learning_rate": 2.26790757381258e-05, + "loss": 0.6997, + "step": 5894 + }, + { + "epoch": 7.567394094993581, + "grad_norm": 4.170044422149658, + "learning_rate": 2.2682926829268295e-05, + "loss": 0.7252, + "step": 5895 + }, + { + "epoch": 7.568677792041078, + "grad_norm": 2.5612356662750244, + "learning_rate": 2.2686777920410786e-05, + "loss": 0.6858, + "step": 5896 + }, + { + "epoch": 7.569961489088575, + "grad_norm": 1.5256810188293457, + "learning_rate": 2.2690629011553274e-05, + "loss": 0.6576, + "step": 5897 + }, + { + "epoch": 7.571245186136072, + "grad_norm": 3.5602076053619385, + "learning_rate": 2.2694480102695765e-05, + "loss": 0.6861, + "step": 5898 + }, + { + "epoch": 7.572528883183569, + "grad_norm": 4.1804938316345215, + "learning_rate": 2.2698331193838253e-05, + "loss": 0.7408, + "step": 5899 + }, + { + "epoch": 7.573812580231065, + "grad_norm": 4.070483207702637, + "learning_rate": 2.2702182284980744e-05, + "loss": 0.7314, + "step": 5900 + }, + { + "epoch": 7.575096277278562, + "grad_norm": 3.0922513008117676, + "learning_rate": 2.2706033376123235e-05, + "loss": 0.7083, + "step": 5901 + }, + 
{ + "epoch": 7.576379974326059, + "grad_norm": 1.8565937280654907, + "learning_rate": 2.2709884467265726e-05, + "loss": 0.8174, + "step": 5902 + }, + { + "epoch": 7.577663671373556, + "grad_norm": 2.7461211681365967, + "learning_rate": 2.2713735558408217e-05, + "loss": 0.818, + "step": 5903 + }, + { + "epoch": 7.578947368421053, + "grad_norm": 1.698683261871338, + "learning_rate": 2.2717586649550708e-05, + "loss": 0.5572, + "step": 5904 + }, + { + "epoch": 7.580231065468549, + "grad_norm": 1.951219916343689, + "learning_rate": 2.2721437740693196e-05, + "loss": 0.597, + "step": 5905 + }, + { + "epoch": 7.581514762516046, + "grad_norm": 1.5593897104263306, + "learning_rate": 2.2725288831835687e-05, + "loss": 0.5639, + "step": 5906 + }, + { + "epoch": 7.582798459563543, + "grad_norm": 2.41334867477417, + "learning_rate": 2.2729139922978178e-05, + "loss": 0.62, + "step": 5907 + }, + { + "epoch": 7.58408215661104, + "grad_norm": 2.076859712600708, + "learning_rate": 2.273299101412067e-05, + "loss": 0.5904, + "step": 5908 + }, + { + "epoch": 7.585365853658536, + "grad_norm": 5.2166900634765625, + "learning_rate": 2.273684210526316e-05, + "loss": 0.5902, + "step": 5909 + }, + { + "epoch": 7.586649550706033, + "grad_norm": 2.179253578186035, + "learning_rate": 2.2740693196405648e-05, + "loss": 0.6224, + "step": 5910 + }, + { + "epoch": 7.58793324775353, + "grad_norm": 1.4424742460250854, + "learning_rate": 2.274454428754814e-05, + "loss": 0.6183, + "step": 5911 + }, + { + "epoch": 7.589216944801027, + "grad_norm": 1.4401267766952515, + "learning_rate": 2.274839537869063e-05, + "loss": 0.5994, + "step": 5912 + }, + { + "epoch": 7.590500641848524, + "grad_norm": 1.6515001058578491, + "learning_rate": 2.2752246469833117e-05, + "loss": 0.6151, + "step": 5913 + }, + { + "epoch": 7.59178433889602, + "grad_norm": 1.7350267171859741, + "learning_rate": 2.2756097560975612e-05, + "loss": 0.5973, + "step": 5914 + }, + { + "epoch": 7.593068035943517, + "grad_norm": 2.776554822921753, 
+ "learning_rate": 2.2759948652118103e-05, + "loss": 0.5926, + "step": 5915 + }, + { + "epoch": 7.594351732991014, + "grad_norm": 1.475827693939209, + "learning_rate": 2.276379974326059e-05, + "loss": 0.5739, + "step": 5916 + }, + { + "epoch": 7.595635430038511, + "grad_norm": 4.775969505310059, + "learning_rate": 2.276765083440308e-05, + "loss": 0.6495, + "step": 5917 + }, + { + "epoch": 7.596919127086007, + "grad_norm": 6.516726493835449, + "learning_rate": 2.277150192554557e-05, + "loss": 0.6076, + "step": 5918 + }, + { + "epoch": 7.598202824133504, + "grad_norm": 1.2756763696670532, + "learning_rate": 2.277535301668806e-05, + "loss": 0.6241, + "step": 5919 + }, + { + "epoch": 7.599486521181001, + "grad_norm": 1.8981828689575195, + "learning_rate": 2.2779204107830555e-05, + "loss": 0.6084, + "step": 5920 + }, + { + "epoch": 7.600770218228498, + "grad_norm": 1.483924388885498, + "learning_rate": 2.2783055198973043e-05, + "loss": 0.6196, + "step": 5921 + }, + { + "epoch": 7.602053915275995, + "grad_norm": 2.457219362258911, + "learning_rate": 2.2786906290115534e-05, + "loss": 0.6413, + "step": 5922 + }, + { + "epoch": 7.603337612323491, + "grad_norm": 2.9353389739990234, + "learning_rate": 2.2790757381258025e-05, + "loss": 0.6079, + "step": 5923 + }, + { + "epoch": 7.604621309370988, + "grad_norm": 1.1179932355880737, + "learning_rate": 2.2794608472400512e-05, + "loss": 0.6201, + "step": 5924 + }, + { + "epoch": 7.605905006418485, + "grad_norm": 5.554969787597656, + "learning_rate": 2.2798459563543003e-05, + "loss": 0.627, + "step": 5925 + }, + { + "epoch": 7.607188703465982, + "grad_norm": 3.423536777496338, + "learning_rate": 2.2802310654685495e-05, + "loss": 0.5935, + "step": 5926 + }, + { + "epoch": 7.608472400513479, + "grad_norm": 1.676068663597107, + "learning_rate": 2.2806161745827986e-05, + "loss": 0.5848, + "step": 5927 + }, + { + "epoch": 7.609756097560975, + "grad_norm": 1.7437498569488525, + "learning_rate": 2.2810012836970477e-05, + "loss": 0.6152, + 
"step": 5928 + }, + { + "epoch": 7.611039794608472, + "grad_norm": 2.9841456413269043, + "learning_rate": 2.2813863928112964e-05, + "loss": 0.661, + "step": 5929 + }, + { + "epoch": 7.612323491655969, + "grad_norm": 1.5537540912628174, + "learning_rate": 2.2817715019255455e-05, + "loss": 0.6317, + "step": 5930 + }, + { + "epoch": 7.613607188703466, + "grad_norm": 2.4899494647979736, + "learning_rate": 2.2821566110397947e-05, + "loss": 0.6081, + "step": 5931 + }, + { + "epoch": 7.614890885750963, + "grad_norm": 3.27211856842041, + "learning_rate": 2.2825417201540438e-05, + "loss": 0.6739, + "step": 5932 + }, + { + "epoch": 7.616174582798459, + "grad_norm": 2.209700584411621, + "learning_rate": 2.282926829268293e-05, + "loss": 0.63, + "step": 5933 + }, + { + "epoch": 7.617458279845956, + "grad_norm": 3.0260348320007324, + "learning_rate": 2.283311938382542e-05, + "loss": 0.6641, + "step": 5934 + }, + { + "epoch": 7.618741976893453, + "grad_norm": 1.4119725227355957, + "learning_rate": 2.2836970474967907e-05, + "loss": 0.5963, + "step": 5935 + }, + { + "epoch": 7.62002567394095, + "grad_norm": 7.129110813140869, + "learning_rate": 2.28408215661104e-05, + "loss": 0.5913, + "step": 5936 + }, + { + "epoch": 7.621309370988447, + "grad_norm": 2.5481178760528564, + "learning_rate": 2.2844672657252886e-05, + "loss": 0.661, + "step": 5937 + }, + { + "epoch": 7.622593068035943, + "grad_norm": 1.9274225234985352, + "learning_rate": 2.284852374839538e-05, + "loss": 0.6301, + "step": 5938 + }, + { + "epoch": 7.62387676508344, + "grad_norm": 6.081809043884277, + "learning_rate": 2.285237483953787e-05, + "loss": 0.6674, + "step": 5939 + }, + { + "epoch": 7.625160462130937, + "grad_norm": 4.107434272766113, + "learning_rate": 2.285622593068036e-05, + "loss": 0.6528, + "step": 5940 + }, + { + "epoch": 7.626444159178434, + "grad_norm": 4.796444892883301, + "learning_rate": 2.286007702182285e-05, + "loss": 0.7169, + "step": 5941 + }, + { + "epoch": 7.62772785622593, + "grad_norm": 
1.7217769622802734, + "learning_rate": 2.286392811296534e-05, + "loss": 0.6017, + "step": 5942 + }, + { + "epoch": 7.629011553273427, + "grad_norm": 2.3549137115478516, + "learning_rate": 2.286777920410783e-05, + "loss": 0.6826, + "step": 5943 + }, + { + "epoch": 7.630295250320924, + "grad_norm": 2.945624828338623, + "learning_rate": 2.2871630295250324e-05, + "loss": 0.6981, + "step": 5944 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 1.8836973905563354, + "learning_rate": 2.287548138639281e-05, + "loss": 0.6682, + "step": 5945 + }, + { + "epoch": 7.632862644415918, + "grad_norm": 4.627108097076416, + "learning_rate": 2.2879332477535302e-05, + "loss": 0.6935, + "step": 5946 + }, + { + "epoch": 7.634146341463414, + "grad_norm": 3.1807053089141846, + "learning_rate": 2.2883183568677793e-05, + "loss": 0.6881, + "step": 5947 + }, + { + "epoch": 7.635430038510911, + "grad_norm": 4.121515274047852, + "learning_rate": 2.288703465982028e-05, + "loss": 0.6089, + "step": 5948 + }, + { + "epoch": 7.636713735558408, + "grad_norm": 1.689919352531433, + "learning_rate": 2.2890885750962772e-05, + "loss": 0.6724, + "step": 5949 + }, + { + "epoch": 7.637997432605905, + "grad_norm": 7.306366920471191, + "learning_rate": 2.2894736842105263e-05, + "loss": 0.6793, + "step": 5950 + }, + { + "epoch": 7.639281129653401, + "grad_norm": 6.785739421844482, + "learning_rate": 2.2898587933247754e-05, + "loss": 0.7078, + "step": 5951 + }, + { + "epoch": 7.640564826700898, + "grad_norm": 3.0853073596954346, + "learning_rate": 2.2902439024390245e-05, + "loss": 0.7953, + "step": 5952 + }, + { + "epoch": 7.641848523748395, + "grad_norm": 6.961120128631592, + "learning_rate": 2.2906290115532737e-05, + "loss": 0.8186, + "step": 5953 + }, + { + "epoch": 7.643132220795892, + "grad_norm": 1.9414219856262207, + "learning_rate": 2.2910141206675224e-05, + "loss": 0.602, + "step": 5954 + }, + { + "epoch": 7.644415917843389, + "grad_norm": 4.425426483154297, + "learning_rate": 2.2913992297817715e-05, 
+ "loss": 0.5716, + "step": 5955 + }, + { + "epoch": 7.645699614890885, + "grad_norm": 2.814244270324707, + "learning_rate": 2.2917843388960203e-05, + "loss": 0.6151, + "step": 5956 + }, + { + "epoch": 7.646983311938382, + "grad_norm": 2.550157308578491, + "learning_rate": 2.2921694480102697e-05, + "loss": 0.6041, + "step": 5957 + }, + { + "epoch": 7.648267008985879, + "grad_norm": 1.623260259628296, + "learning_rate": 2.292554557124519e-05, + "loss": 0.6059, + "step": 5958 + }, + { + "epoch": 7.649550706033376, + "grad_norm": 1.9689778089523315, + "learning_rate": 2.2929396662387676e-05, + "loss": 0.5934, + "step": 5959 + }, + { + "epoch": 7.650834403080873, + "grad_norm": 1.8228116035461426, + "learning_rate": 2.2933247753530167e-05, + "loss": 0.6058, + "step": 5960 + }, + { + "epoch": 7.652118100128369, + "grad_norm": 2.3298261165618896, + "learning_rate": 2.2937098844672658e-05, + "loss": 0.6065, + "step": 5961 + }, + { + "epoch": 7.653401797175866, + "grad_norm": 2.145712375640869, + "learning_rate": 2.2940949935815146e-05, + "loss": 0.6123, + "step": 5962 + }, + { + "epoch": 7.654685494223363, + "grad_norm": 1.305494785308838, + "learning_rate": 2.294480102695764e-05, + "loss": 0.5971, + "step": 5963 + }, + { + "epoch": 7.65596919127086, + "grad_norm": 2.0683414936065674, + "learning_rate": 2.2948652118100128e-05, + "loss": 0.629, + "step": 5964 + }, + { + "epoch": 7.657252888318357, + "grad_norm": 4.289642333984375, + "learning_rate": 2.295250320924262e-05, + "loss": 0.571, + "step": 5965 + }, + { + "epoch": 7.658536585365853, + "grad_norm": 2.646901845932007, + "learning_rate": 2.295635430038511e-05, + "loss": 0.5893, + "step": 5966 + }, + { + "epoch": 7.65982028241335, + "grad_norm": 2.31935977935791, + "learning_rate": 2.2960205391527598e-05, + "loss": 0.5947, + "step": 5967 + }, + { + "epoch": 7.661103979460847, + "grad_norm": 6.1960625648498535, + "learning_rate": 2.296405648267009e-05, + "loss": 0.5893, + "step": 5968 + }, + { + "epoch": 
7.662387676508344, + "grad_norm": 2.9828426837921143, + "learning_rate": 2.2967907573812583e-05, + "loss": 0.6379, + "step": 5969 + }, + { + "epoch": 7.663671373555841, + "grad_norm": 1.5284733772277832, + "learning_rate": 2.297175866495507e-05, + "loss": 0.5998, + "step": 5970 + }, + { + "epoch": 7.664955070603337, + "grad_norm": 9.479731559753418, + "learning_rate": 2.2975609756097562e-05, + "loss": 0.627, + "step": 5971 + }, + { + "epoch": 7.666238767650834, + "grad_norm": 2.2797703742980957, + "learning_rate": 2.2979460847240053e-05, + "loss": 0.6277, + "step": 5972 + }, + { + "epoch": 7.667522464698331, + "grad_norm": 2.037456750869751, + "learning_rate": 2.298331193838254e-05, + "loss": 0.6319, + "step": 5973 + }, + { + "epoch": 7.668806161745828, + "grad_norm": 1.6834349632263184, + "learning_rate": 2.2987163029525032e-05, + "loss": 0.628, + "step": 5974 + }, + { + "epoch": 7.670089858793324, + "grad_norm": 2.199469804763794, + "learning_rate": 2.2991014120667523e-05, + "loss": 0.605, + "step": 5975 + }, + { + "epoch": 7.671373555840821, + "grad_norm": 1.0881702899932861, + "learning_rate": 2.2994865211810014e-05, + "loss": 0.5862, + "step": 5976 + }, + { + "epoch": 7.672657252888318, + "grad_norm": 1.9643750190734863, + "learning_rate": 2.2998716302952505e-05, + "loss": 0.6258, + "step": 5977 + }, + { + "epoch": 7.673940949935815, + "grad_norm": 2.1930854320526123, + "learning_rate": 2.3002567394094993e-05, + "loss": 0.6187, + "step": 5978 + }, + { + "epoch": 7.675224646983312, + "grad_norm": 10.57880687713623, + "learning_rate": 2.3006418485237484e-05, + "loss": 0.5833, + "step": 5979 + }, + { + "epoch": 7.676508344030808, + "grad_norm": 1.5508357286453247, + "learning_rate": 2.3010269576379975e-05, + "loss": 0.6577, + "step": 5980 + }, + { + "epoch": 7.677792041078305, + "grad_norm": 2.2664875984191895, + "learning_rate": 2.3014120667522466e-05, + "loss": 0.642, + "step": 5981 + }, + { + "epoch": 7.679075738125802, + "grad_norm": 2.193931818008423, + 
"learning_rate": 2.3017971758664957e-05, + "loss": 0.6022, + "step": 5982 + }, + { + "epoch": 7.680359435173299, + "grad_norm": 1.6780046224594116, + "learning_rate": 2.3021822849807445e-05, + "loss": 0.6162, + "step": 5983 + }, + { + "epoch": 7.681643132220795, + "grad_norm": 1.7744807004928589, + "learning_rate": 2.3025673940949936e-05, + "loss": 0.6841, + "step": 5984 + }, + { + "epoch": 7.682926829268292, + "grad_norm": 5.330015659332275, + "learning_rate": 2.3029525032092427e-05, + "loss": 0.6273, + "step": 5985 + }, + { + "epoch": 7.684210526315789, + "grad_norm": 1.5794929265975952, + "learning_rate": 2.3033376123234915e-05, + "loss": 0.6959, + "step": 5986 + }, + { + "epoch": 7.685494223363286, + "grad_norm": 1.450798511505127, + "learning_rate": 2.303722721437741e-05, + "loss": 0.6472, + "step": 5987 + }, + { + "epoch": 7.686777920410783, + "grad_norm": 2.138030767440796, + "learning_rate": 2.30410783055199e-05, + "loss": 0.6257, + "step": 5988 + }, + { + "epoch": 7.688061617458279, + "grad_norm": 1.7928136587142944, + "learning_rate": 2.3044929396662388e-05, + "loss": 0.645, + "step": 5989 + }, + { + "epoch": 7.689345314505776, + "grad_norm": 2.3938567638397217, + "learning_rate": 2.304878048780488e-05, + "loss": 0.6336, + "step": 5990 + }, + { + "epoch": 7.690629011553273, + "grad_norm": 2.672452926635742, + "learning_rate": 2.3052631578947367e-05, + "loss": 0.6949, + "step": 5991 + }, + { + "epoch": 7.69191270860077, + "grad_norm": 1.9950408935546875, + "learning_rate": 2.3056482670089858e-05, + "loss": 0.62, + "step": 5992 + }, + { + "epoch": 7.693196405648267, + "grad_norm": 1.664311170578003, + "learning_rate": 2.3060333761232352e-05, + "loss": 0.6111, + "step": 5993 + }, + { + "epoch": 7.694480102695763, + "grad_norm": 1.4445120096206665, + "learning_rate": 2.306418485237484e-05, + "loss": 0.7373, + "step": 5994 + }, + { + "epoch": 7.69576379974326, + "grad_norm": 3.916889190673828, + "learning_rate": 2.306803594351733e-05, + "loss": 0.6369, + 
"step": 5995 + }, + { + "epoch": 7.697047496790757, + "grad_norm": 2.280566453933716, + "learning_rate": 2.3071887034659822e-05, + "loss": 0.7056, + "step": 5996 + }, + { + "epoch": 7.698331193838254, + "grad_norm": 16.41118812561035, + "learning_rate": 2.307573812580231e-05, + "loss": 0.6679, + "step": 5997 + }, + { + "epoch": 7.699614890885751, + "grad_norm": 4.847263336181641, + "learning_rate": 2.30795892169448e-05, + "loss": 0.6766, + "step": 5998 + }, + { + "epoch": 7.700898587933247, + "grad_norm": 1.5855400562286377, + "learning_rate": 2.3083440308087292e-05, + "loss": 0.7581, + "step": 5999 + }, + { + "epoch": 7.702182284980744, + "grad_norm": 2.0737929344177246, + "learning_rate": 2.3087291399229783e-05, + "loss": 0.7118, + "step": 6000 + }, + { + "epoch": 7.702182284980744, + "eval_cer": 0.29826708439396904, + "eval_loss": 0.600639820098877, + "eval_runtime": 14.0686, + "eval_samples_per_second": 69.872, + "eval_steps_per_second": 0.498, + "eval_wer": 0.5580142169227241, + "step": 6000 + }, + { + "epoch": 7.703465982028241, + "grad_norm": 6.83292293548584, + "learning_rate": 2.3091142490372274e-05, + "loss": 0.7768, + "step": 6001 + }, + { + "epoch": 7.704749679075738, + "grad_norm": 2.3460748195648193, + "learning_rate": 2.309499358151476e-05, + "loss": 0.7445, + "step": 6002 + }, + { + "epoch": 7.706033376123235, + "grad_norm": 3.362201452255249, + "learning_rate": 2.3098844672657253e-05, + "loss": 0.9455, + "step": 6003 + }, + { + "epoch": 7.7073170731707314, + "grad_norm": 1.9112989902496338, + "learning_rate": 2.3102695763799744e-05, + "loss": 0.624, + "step": 6004 + }, + { + "epoch": 7.708600770218228, + "grad_norm": 1.1652741432189941, + "learning_rate": 2.310654685494223e-05, + "loss": 0.5566, + "step": 6005 + }, + { + "epoch": 7.709884467265725, + "grad_norm": 1.4116460084915161, + "learning_rate": 2.3110397946084726e-05, + "loss": 0.5863, + "step": 6006 + }, + { + "epoch": 7.711168164313222, + "grad_norm": 1.6576427221298218, + "learning_rate": 
2.3114249037227217e-05, + "loss": 0.6044, + "step": 6007 + }, + { + "epoch": 7.712451861360719, + "grad_norm": 1.5487594604492188, + "learning_rate": 2.3118100128369705e-05, + "loss": 0.5858, + "step": 6008 + }, + { + "epoch": 7.7137355584082155, + "grad_norm": 1.5280044078826904, + "learning_rate": 2.3121951219512196e-05, + "loss": 0.6243, + "step": 6009 + }, + { + "epoch": 7.715019255455712, + "grad_norm": 2.7457668781280518, + "learning_rate": 2.3125802310654683e-05, + "loss": 0.5799, + "step": 6010 + }, + { + "epoch": 7.716302952503209, + "grad_norm": 1.6753928661346436, + "learning_rate": 2.3129653401797175e-05, + "loss": 0.5983, + "step": 6011 + }, + { + "epoch": 7.717586649550706, + "grad_norm": 1.6052052974700928, + "learning_rate": 2.313350449293967e-05, + "loss": 0.6333, + "step": 6012 + }, + { + "epoch": 7.7188703465982025, + "grad_norm": 4.165308952331543, + "learning_rate": 2.3137355584082157e-05, + "loss": 0.6115, + "step": 6013 + }, + { + "epoch": 7.7201540436456995, + "grad_norm": 0.9919355511665344, + "learning_rate": 2.3141206675224648e-05, + "loss": 0.5658, + "step": 6014 + }, + { + "epoch": 7.721437740693196, + "grad_norm": 1.2620570659637451, + "learning_rate": 2.314505776636714e-05, + "loss": 0.6322, + "step": 6015 + }, + { + "epoch": 7.722721437740693, + "grad_norm": 2.1706454753875732, + "learning_rate": 2.3148908857509627e-05, + "loss": 0.5798, + "step": 6016 + }, + { + "epoch": 7.7240051347881895, + "grad_norm": 1.117845892906189, + "learning_rate": 2.3152759948652118e-05, + "loss": 0.5543, + "step": 6017 + }, + { + "epoch": 7.7252888318356865, + "grad_norm": 1.6479606628417969, + "learning_rate": 2.3156611039794612e-05, + "loss": 0.6014, + "step": 6018 + }, + { + "epoch": 7.7265725288831835, + "grad_norm": 3.6243720054626465, + "learning_rate": 2.31604621309371e-05, + "loss": 0.614, + "step": 6019 + }, + { + "epoch": 7.7278562259306804, + "grad_norm": 1.6752082109451294, + "learning_rate": 2.316431322207959e-05, + "loss": 0.5894, + 
"step": 6020 + }, + { + "epoch": 7.729139922978177, + "grad_norm": 1.796798586845398, + "learning_rate": 2.316816431322208e-05, + "loss": 0.5435, + "step": 6021 + }, + { + "epoch": 7.7304236200256735, + "grad_norm": 3.806607484817505, + "learning_rate": 2.317201540436457e-05, + "loss": 0.6355, + "step": 6022 + }, + { + "epoch": 7.7317073170731705, + "grad_norm": 1.4071762561798096, + "learning_rate": 2.317586649550706e-05, + "loss": 0.5812, + "step": 6023 + }, + { + "epoch": 7.7329910141206675, + "grad_norm": 2.757826566696167, + "learning_rate": 2.317971758664955e-05, + "loss": 0.5939, + "step": 6024 + }, + { + "epoch": 7.7342747111681645, + "grad_norm": 1.4263404607772827, + "learning_rate": 2.3183568677792043e-05, + "loss": 0.5737, + "step": 6025 + }, + { + "epoch": 7.735558408215661, + "grad_norm": 3.4167821407318115, + "learning_rate": 2.3187419768934534e-05, + "loss": 0.6164, + "step": 6026 + }, + { + "epoch": 7.7368421052631575, + "grad_norm": 2.651202917098999, + "learning_rate": 2.319127086007702e-05, + "loss": 0.5937, + "step": 6027 + }, + { + "epoch": 7.7381258023106545, + "grad_norm": 1.8843220472335815, + "learning_rate": 2.3195121951219513e-05, + "loss": 0.6316, + "step": 6028 + }, + { + "epoch": 7.7394094993581515, + "grad_norm": 6.893351078033447, + "learning_rate": 2.3198973042362e-05, + "loss": 0.6426, + "step": 6029 + }, + { + "epoch": 7.7406931964056485, + "grad_norm": 3.4261345863342285, + "learning_rate": 2.3202824133504495e-05, + "loss": 0.6252, + "step": 6030 + }, + { + "epoch": 7.741976893453145, + "grad_norm": 1.9758269786834717, + "learning_rate": 2.3206675224646986e-05, + "loss": 0.6182, + "step": 6031 + }, + { + "epoch": 7.7432605905006415, + "grad_norm": 1.3037641048431396, + "learning_rate": 2.3210526315789473e-05, + "loss": 0.618, + "step": 6032 + }, + { + "epoch": 7.7445442875481385, + "grad_norm": 2.374450206756592, + "learning_rate": 2.3214377406931965e-05, + "loss": 0.6169, + "step": 6033 + }, + { + "epoch": 7.7458279845956355, + 
"grad_norm": 1.4998202323913574, + "learning_rate": 2.3218228498074456e-05, + "loss": 0.5952, + "step": 6034 + }, + { + "epoch": 7.7471116816431325, + "grad_norm": 2.2122745513916016, + "learning_rate": 2.3222079589216943e-05, + "loss": 0.6823, + "step": 6035 + }, + { + "epoch": 7.748395378690629, + "grad_norm": 2.4981651306152344, + "learning_rate": 2.3225930680359438e-05, + "loss": 0.6124, + "step": 6036 + }, + { + "epoch": 7.7496790757381255, + "grad_norm": 2.4914984703063965, + "learning_rate": 2.3229781771501925e-05, + "loss": 0.6454, + "step": 6037 + }, + { + "epoch": 7.7509627727856225, + "grad_norm": 3.0414798259735107, + "learning_rate": 2.3233632862644417e-05, + "loss": 0.6798, + "step": 6038 + }, + { + "epoch": 7.7522464698331195, + "grad_norm": 1.7155773639678955, + "learning_rate": 2.3237483953786908e-05, + "loss": 0.6695, + "step": 6039 + }, + { + "epoch": 7.7535301668806165, + "grad_norm": 1.4927219152450562, + "learning_rate": 2.3241335044929395e-05, + "loss": 0.7231, + "step": 6040 + }, + { + "epoch": 7.7548138639281134, + "grad_norm": 4.036815166473389, + "learning_rate": 2.3245186136071886e-05, + "loss": 0.6123, + "step": 6041 + }, + { + "epoch": 7.7560975609756095, + "grad_norm": 13.049478530883789, + "learning_rate": 2.324903722721438e-05, + "loss": 0.6221, + "step": 6042 + }, + { + "epoch": 7.7573812580231065, + "grad_norm": 3.6704485416412354, + "learning_rate": 2.325288831835687e-05, + "loss": 0.6374, + "step": 6043 + }, + { + "epoch": 7.7586649550706035, + "grad_norm": 2.0211644172668457, + "learning_rate": 2.325673940949936e-05, + "loss": 0.659, + "step": 6044 + }, + { + "epoch": 7.7599486521181005, + "grad_norm": 1.5992250442504883, + "learning_rate": 2.326059050064185e-05, + "loss": 0.6886, + "step": 6045 + }, + { + "epoch": 7.761232349165597, + "grad_norm": 4.1588969230651855, + "learning_rate": 2.3264441591784338e-05, + "loss": 0.6462, + "step": 6046 + }, + { + "epoch": 7.7625160462130935, + "grad_norm": 3.5735690593719482, + 
"learning_rate": 2.326829268292683e-05, + "loss": 0.6603, + "step": 6047 + }, + { + "epoch": 7.7637997432605905, + "grad_norm": 1.8565154075622559, + "learning_rate": 2.327214377406932e-05, + "loss": 0.6575, + "step": 6048 + }, + { + "epoch": 7.7650834403080875, + "grad_norm": 6.606872081756592, + "learning_rate": 2.327599486521181e-05, + "loss": 0.6946, + "step": 6049 + }, + { + "epoch": 7.766367137355584, + "grad_norm": 4.263215065002441, + "learning_rate": 2.3279845956354303e-05, + "loss": 0.635, + "step": 6050 + }, + { + "epoch": 7.767650834403081, + "grad_norm": 3.956367015838623, + "learning_rate": 2.328369704749679e-05, + "loss": 0.708, + "step": 6051 + }, + { + "epoch": 7.7689345314505776, + "grad_norm": 3.9425384998321533, + "learning_rate": 2.328754813863928e-05, + "loss": 0.7675, + "step": 6052 + }, + { + "epoch": 7.7702182284980745, + "grad_norm": 20.465341567993164, + "learning_rate": 2.3291399229781772e-05, + "loss": 0.9947, + "step": 6053 + }, + { + "epoch": 7.7715019255455715, + "grad_norm": 3.215034008026123, + "learning_rate": 2.329525032092426e-05, + "loss": 0.5962, + "step": 6054 + }, + { + "epoch": 7.772785622593068, + "grad_norm": 6.745756149291992, + "learning_rate": 2.3299101412066755e-05, + "loss": 0.5904, + "step": 6055 + }, + { + "epoch": 7.774069319640565, + "grad_norm": 3.1767916679382324, + "learning_rate": 2.3302952503209242e-05, + "loss": 0.5986, + "step": 6056 + }, + { + "epoch": 7.775353016688062, + "grad_norm": 3.0681896209716797, + "learning_rate": 2.3306803594351733e-05, + "loss": 0.6212, + "step": 6057 + }, + { + "epoch": 7.7766367137355585, + "grad_norm": 3.2930946350097656, + "learning_rate": 2.3310654685494224e-05, + "loss": 0.6138, + "step": 6058 + }, + { + "epoch": 7.7779204107830555, + "grad_norm": 3.215588331222534, + "learning_rate": 2.3314505776636712e-05, + "loss": 0.6212, + "step": 6059 + }, + { + "epoch": 7.779204107830552, + "grad_norm": 2.084216356277466, + "learning_rate": 2.3318356867779203e-05, + "loss": 
0.6301, + "step": 6060 + }, + { + "epoch": 7.780487804878049, + "grad_norm": 2.0743672847747803, + "learning_rate": 2.3322207958921698e-05, + "loss": 0.5897, + "step": 6061 + }, + { + "epoch": 7.781771501925546, + "grad_norm": 1.7108811140060425, + "learning_rate": 2.3326059050064185e-05, + "loss": 0.6273, + "step": 6062 + }, + { + "epoch": 7.7830551989730425, + "grad_norm": 2.3982253074645996, + "learning_rate": 2.3329910141206676e-05, + "loss": 0.6402, + "step": 6063 + }, + { + "epoch": 7.7843388960205395, + "grad_norm": 3.167083501815796, + "learning_rate": 2.3333761232349167e-05, + "loss": 0.6073, + "step": 6064 + }, + { + "epoch": 7.785622593068036, + "grad_norm": 2.669966459274292, + "learning_rate": 2.3337612323491655e-05, + "loss": 0.5928, + "step": 6065 + }, + { + "epoch": 7.786906290115533, + "grad_norm": 6.776124954223633, + "learning_rate": 2.3341463414634146e-05, + "loss": 0.592, + "step": 6066 + }, + { + "epoch": 7.78818998716303, + "grad_norm": 1.2675453424453735, + "learning_rate": 2.3345314505776637e-05, + "loss": 0.6195, + "step": 6067 + }, + { + "epoch": 7.7894736842105265, + "grad_norm": 2.697730779647827, + "learning_rate": 2.3349165596919128e-05, + "loss": 0.6005, + "step": 6068 + }, + { + "epoch": 7.7907573812580235, + "grad_norm": 1.6013691425323486, + "learning_rate": 2.335301668806162e-05, + "loss": 0.6292, + "step": 6069 + }, + { + "epoch": 7.79204107830552, + "grad_norm": 1.9701488018035889, + "learning_rate": 2.3356867779204107e-05, + "loss": 0.6021, + "step": 6070 + }, + { + "epoch": 7.793324775353017, + "grad_norm": 2.009666681289673, + "learning_rate": 2.3360718870346598e-05, + "loss": 0.5953, + "step": 6071 + }, + { + "epoch": 7.794608472400514, + "grad_norm": 1.8648602962493896, + "learning_rate": 2.336456996148909e-05, + "loss": 0.6303, + "step": 6072 + }, + { + "epoch": 7.7958921694480106, + "grad_norm": 2.43908953666687, + "learning_rate": 2.336842105263158e-05, + "loss": 0.6736, + "step": 6073 + }, + { + "epoch": 
7.7971758664955075, + "grad_norm": 5.490927219390869, + "learning_rate": 2.337227214377407e-05, + "loss": 0.6177, + "step": 6074 + }, + { + "epoch": 7.798459563543004, + "grad_norm": 2.881801128387451, + "learning_rate": 2.337612323491656e-05, + "loss": 0.6222, + "step": 6075 + }, + { + "epoch": 7.799743260590501, + "grad_norm": 2.1768181324005127, + "learning_rate": 2.337997432605905e-05, + "loss": 0.6388, + "step": 6076 + }, + { + "epoch": 7.801026957637998, + "grad_norm": 3.1787126064300537, + "learning_rate": 2.338382541720154e-05, + "loss": 0.6057, + "step": 6077 + }, + { + "epoch": 7.802310654685495, + "grad_norm": 2.053722381591797, + "learning_rate": 2.338767650834403e-05, + "loss": 0.6403, + "step": 6078 + }, + { + "epoch": 7.803594351732991, + "grad_norm": 4.172235012054443, + "learning_rate": 2.3391527599486523e-05, + "loss": 0.6463, + "step": 6079 + }, + { + "epoch": 7.804878048780488, + "grad_norm": 1.9520628452301025, + "learning_rate": 2.3395378690629014e-05, + "loss": 0.6138, + "step": 6080 + }, + { + "epoch": 7.806161745827985, + "grad_norm": 1.7676564455032349, + "learning_rate": 2.3399229781771502e-05, + "loss": 0.5794, + "step": 6081 + }, + { + "epoch": 7.807445442875482, + "grad_norm": 4.654025554656982, + "learning_rate": 2.3403080872913993e-05, + "loss": 0.6813, + "step": 6082 + }, + { + "epoch": 7.808729139922978, + "grad_norm": 2.1294264793395996, + "learning_rate": 2.340693196405648e-05, + "loss": 0.6786, + "step": 6083 + }, + { + "epoch": 7.810012836970475, + "grad_norm": 2.302466630935669, + "learning_rate": 2.3410783055198972e-05, + "loss": 0.673, + "step": 6084 + }, + { + "epoch": 7.811296534017972, + "grad_norm": 2.1428005695343018, + "learning_rate": 2.3414634146341466e-05, + "loss": 0.643, + "step": 6085 + }, + { + "epoch": 7.812580231065469, + "grad_norm": 6.622169494628906, + "learning_rate": 2.3418485237483954e-05, + "loss": 0.6245, + "step": 6086 + }, + { + "epoch": 7.813863928112966, + "grad_norm": 3.9252946376800537, + 
"learning_rate": 2.3422336328626445e-05, + "loss": 0.6517, + "step": 6087 + }, + { + "epoch": 7.815147625160462, + "grad_norm": 5.832432746887207, + "learning_rate": 2.3426187419768936e-05, + "loss": 0.6389, + "step": 6088 + }, + { + "epoch": 7.816431322207959, + "grad_norm": 2.644477367401123, + "learning_rate": 2.3430038510911424e-05, + "loss": 0.6567, + "step": 6089 + }, + { + "epoch": 7.817715019255456, + "grad_norm": 3.3483457565307617, + "learning_rate": 2.3433889602053915e-05, + "loss": 0.669, + "step": 6090 + }, + { + "epoch": 7.818998716302953, + "grad_norm": 3.74495530128479, + "learning_rate": 2.343774069319641e-05, + "loss": 0.6429, + "step": 6091 + }, + { + "epoch": 7.82028241335045, + "grad_norm": 1.917747974395752, + "learning_rate": 2.3441591784338897e-05, + "loss": 0.6861, + "step": 6092 + }, + { + "epoch": 7.821566110397946, + "grad_norm": 4.912720680236816, + "learning_rate": 2.3445442875481388e-05, + "loss": 0.6816, + "step": 6093 + }, + { + "epoch": 7.822849807445443, + "grad_norm": 2.792570114135742, + "learning_rate": 2.3449293966623876e-05, + "loss": 0.6629, + "step": 6094 + }, + { + "epoch": 7.82413350449294, + "grad_norm": 2.9076476097106934, + "learning_rate": 2.3453145057766367e-05, + "loss": 0.6653, + "step": 6095 + }, + { + "epoch": 7.825417201540437, + "grad_norm": 2.274535655975342, + "learning_rate": 2.3456996148908858e-05, + "loss": 0.6509, + "step": 6096 + }, + { + "epoch": 7.826700898587934, + "grad_norm": 2.7285783290863037, + "learning_rate": 2.346084724005135e-05, + "loss": 0.6782, + "step": 6097 + }, + { + "epoch": 7.82798459563543, + "grad_norm": 6.567045211791992, + "learning_rate": 2.346469833119384e-05, + "loss": 0.7139, + "step": 6098 + }, + { + "epoch": 7.829268292682927, + "grad_norm": 2.69741153717041, + "learning_rate": 2.346854942233633e-05, + "loss": 0.6878, + "step": 6099 + }, + { + "epoch": 7.830551989730424, + "grad_norm": 6.494690895080566, + "learning_rate": 2.347240051347882e-05, + "loss": 0.7791, + "step": 
6100 + }, + { + "epoch": 7.831835686777921, + "grad_norm": 1.7164084911346436, + "learning_rate": 2.347625160462131e-05, + "loss": 0.7539, + "step": 6101 + }, + { + "epoch": 7.833119383825418, + "grad_norm": 2.5996463298797607, + "learning_rate": 2.3480102695763798e-05, + "loss": 0.7688, + "step": 6102 + }, + { + "epoch": 7.834403080872914, + "grad_norm": 4.904178142547607, + "learning_rate": 2.3483953786906292e-05, + "loss": 0.9681, + "step": 6103 + }, + { + "epoch": 7.835686777920411, + "grad_norm": 2.7623848915100098, + "learning_rate": 2.3487804878048783e-05, + "loss": 0.6381, + "step": 6104 + }, + { + "epoch": 7.836970474967908, + "grad_norm": 1.400504231452942, + "learning_rate": 2.349165596919127e-05, + "loss": 0.6029, + "step": 6105 + }, + { + "epoch": 7.838254172015405, + "grad_norm": 2.3109920024871826, + "learning_rate": 2.3495507060333762e-05, + "loss": 0.5865, + "step": 6106 + }, + { + "epoch": 7.839537869062902, + "grad_norm": 3.444283962249756, + "learning_rate": 2.3499358151476253e-05, + "loss": 0.64, + "step": 6107 + }, + { + "epoch": 7.840821566110398, + "grad_norm": 2.0576372146606445, + "learning_rate": 2.350320924261874e-05, + "loss": 0.5937, + "step": 6108 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 1.7952638864517212, + "learning_rate": 2.350706033376123e-05, + "loss": 0.6006, + "step": 6109 + }, + { + "epoch": 7.843388960205392, + "grad_norm": 2.480344772338867, + "learning_rate": 2.3510911424903726e-05, + "loss": 0.5704, + "step": 6110 + }, + { + "epoch": 7.844672657252889, + "grad_norm": 1.3463513851165771, + "learning_rate": 2.3514762516046214e-05, + "loss": 0.6166, + "step": 6111 + }, + { + "epoch": 7.845956354300385, + "grad_norm": 1.53445565700531, + "learning_rate": 2.3518613607188705e-05, + "loss": 0.6147, + "step": 6112 + }, + { + "epoch": 7.847240051347882, + "grad_norm": 4.833117961883545, + "learning_rate": 2.3522464698331193e-05, + "loss": 0.603, + "step": 6113 + }, + { + "epoch": 7.848523748395379, + "grad_norm": 
3.0375547409057617, + "learning_rate": 2.3526315789473684e-05, + "loss": 0.6041, + "step": 6114 + }, + { + "epoch": 7.849807445442876, + "grad_norm": 68.21892547607422, + "learning_rate": 2.3530166880616175e-05, + "loss": 0.6629, + "step": 6115 + }, + { + "epoch": 7.851091142490373, + "grad_norm": 2.6204733848571777, + "learning_rate": 2.3534017971758666e-05, + "loss": 0.6281, + "step": 6116 + }, + { + "epoch": 7.852374839537869, + "grad_norm": 1.844322919845581, + "learning_rate": 2.3537869062901157e-05, + "loss": 0.6489, + "step": 6117 + }, + { + "epoch": 7.853658536585366, + "grad_norm": 2.570736885070801, + "learning_rate": 2.3541720154043648e-05, + "loss": 0.584, + "step": 6118 + }, + { + "epoch": 7.854942233632863, + "grad_norm": 2.24080228805542, + "learning_rate": 2.3545571245186136e-05, + "loss": 0.5558, + "step": 6119 + }, + { + "epoch": 7.85622593068036, + "grad_norm": 12.018404960632324, + "learning_rate": 2.3549422336328627e-05, + "loss": 0.6224, + "step": 6120 + }, + { + "epoch": 7.857509627727856, + "grad_norm": 13.4306058883667, + "learning_rate": 2.3553273427471114e-05, + "loss": 0.5899, + "step": 6121 + }, + { + "epoch": 7.858793324775353, + "grad_norm": 1.9872887134552002, + "learning_rate": 2.355712451861361e-05, + "loss": 0.6026, + "step": 6122 + }, + { + "epoch": 7.86007702182285, + "grad_norm": 1.6734663248062134, + "learning_rate": 2.35609756097561e-05, + "loss": 0.6312, + "step": 6123 + }, + { + "epoch": 7.861360718870347, + "grad_norm": 2.0893032550811768, + "learning_rate": 2.3564826700898588e-05, + "loss": 0.626, + "step": 6124 + }, + { + "epoch": 7.862644415917844, + "grad_norm": 1.8754087686538696, + "learning_rate": 2.356867779204108e-05, + "loss": 0.6137, + "step": 6125 + }, + { + "epoch": 7.86392811296534, + "grad_norm": 2.4485137462615967, + "learning_rate": 2.357252888318357e-05, + "loss": 0.6274, + "step": 6126 + }, + { + "epoch": 7.865211810012837, + "grad_norm": 2.110302686691284, + "learning_rate": 2.3576379974326057e-05, + 
"loss": 0.638, + "step": 6127 + }, + { + "epoch": 7.866495507060334, + "grad_norm": 6.414776802062988, + "learning_rate": 2.3580231065468552e-05, + "loss": 0.6238, + "step": 6128 + }, + { + "epoch": 7.867779204107831, + "grad_norm": 2.269850969314575, + "learning_rate": 2.3584082156611043e-05, + "loss": 0.6353, + "step": 6129 + }, + { + "epoch": 7.869062901155328, + "grad_norm": 1.712029218673706, + "learning_rate": 2.358793324775353e-05, + "loss": 0.5985, + "step": 6130 + }, + { + "epoch": 7.870346598202824, + "grad_norm": 3.132864236831665, + "learning_rate": 2.359178433889602e-05, + "loss": 0.5907, + "step": 6131 + }, + { + "epoch": 7.871630295250321, + "grad_norm": 2.6791491508483887, + "learning_rate": 2.359563543003851e-05, + "loss": 0.6358, + "step": 6132 + }, + { + "epoch": 7.872913992297818, + "grad_norm": 6.015817165374756, + "learning_rate": 2.3599486521181e-05, + "loss": 0.6436, + "step": 6133 + }, + { + "epoch": 7.874197689345315, + "grad_norm": 2.006897211074829, + "learning_rate": 2.3603337612323495e-05, + "loss": 0.6604, + "step": 6134 + }, + { + "epoch": 7.875481386392812, + "grad_norm": 1.9932059049606323, + "learning_rate": 2.3607188703465983e-05, + "loss": 0.6119, + "step": 6135 + }, + { + "epoch": 7.876765083440308, + "grad_norm": 3.434171199798584, + "learning_rate": 2.3611039794608474e-05, + "loss": 0.6022, + "step": 6136 + }, + { + "epoch": 7.878048780487805, + "grad_norm": 2.4220011234283447, + "learning_rate": 2.3614890885750965e-05, + "loss": 0.6266, + "step": 6137 + }, + { + "epoch": 7.879332477535302, + "grad_norm": 7.046273708343506, + "learning_rate": 2.3618741976893452e-05, + "loss": 0.5763, + "step": 6138 + }, + { + "epoch": 7.880616174582799, + "grad_norm": 4.841623783111572, + "learning_rate": 2.3622593068035943e-05, + "loss": 0.7237, + "step": 6139 + }, + { + "epoch": 7.881899871630296, + "grad_norm": 2.319629192352295, + "learning_rate": 2.3626444159178435e-05, + "loss": 0.6939, + "step": 6140 + }, + { + "epoch": 
7.883183568677792, + "grad_norm": 1.872241735458374, + "learning_rate": 2.3630295250320926e-05, + "loss": 0.5697, + "step": 6141 + }, + { + "epoch": 7.884467265725289, + "grad_norm": 2.1071324348449707, + "learning_rate": 2.3634146341463417e-05, + "loss": 0.6256, + "step": 6142 + }, + { + "epoch": 7.885750962772786, + "grad_norm": 2.7669215202331543, + "learning_rate": 2.3637997432605904e-05, + "loss": 0.5858, + "step": 6143 + }, + { + "epoch": 7.887034659820283, + "grad_norm": 2.1260571479797363, + "learning_rate": 2.3641848523748395e-05, + "loss": 0.7253, + "step": 6144 + }, + { + "epoch": 7.888318356867779, + "grad_norm": 2.4413654804229736, + "learning_rate": 2.3645699614890887e-05, + "loss": 0.6601, + "step": 6145 + }, + { + "epoch": 7.889602053915276, + "grad_norm": 5.567632675170898, + "learning_rate": 2.3649550706033378e-05, + "loss": 0.642, + "step": 6146 + }, + { + "epoch": 7.890885750962773, + "grad_norm": 5.48069953918457, + "learning_rate": 2.365340179717587e-05, + "loss": 0.6567, + "step": 6147 + }, + { + "epoch": 7.89216944801027, + "grad_norm": 11.105510711669922, + "learning_rate": 2.3657252888318356e-05, + "loss": 0.6557, + "step": 6148 + }, + { + "epoch": 7.893453145057767, + "grad_norm": 5.357926845550537, + "learning_rate": 2.3661103979460847e-05, + "loss": 0.7115, + "step": 6149 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 9.892822265625, + "learning_rate": 2.366495507060334e-05, + "loss": 0.7059, + "step": 6150 + }, + { + "epoch": 7.89602053915276, + "grad_norm": 3.988654136657715, + "learning_rate": 2.3668806161745826e-05, + "loss": 0.7343, + "step": 6151 + }, + { + "epoch": 7.897304236200257, + "grad_norm": 3.3333094120025635, + "learning_rate": 2.367265725288832e-05, + "loss": 0.7703, + "step": 6152 + }, + { + "epoch": 7.898587933247754, + "grad_norm": 2.625706911087036, + "learning_rate": 2.367650834403081e-05, + "loss": 0.8792, + "step": 6153 + }, + { + "epoch": 7.89987163029525, + "grad_norm": 2.8029046058654785, + 
"learning_rate": 2.36803594351733e-05, + "loss": 0.5855, + "step": 6154 + }, + { + "epoch": 7.901155327342747, + "grad_norm": 0.9864883422851562, + "learning_rate": 2.368421052631579e-05, + "loss": 0.5802, + "step": 6155 + }, + { + "epoch": 7.902439024390244, + "grad_norm": 4.096097946166992, + "learning_rate": 2.368806161745828e-05, + "loss": 0.6068, + "step": 6156 + }, + { + "epoch": 7.903722721437741, + "grad_norm": 1.9535247087478638, + "learning_rate": 2.369191270860077e-05, + "loss": 0.6618, + "step": 6157 + }, + { + "epoch": 7.905006418485238, + "grad_norm": 2.0698635578155518, + "learning_rate": 2.369576379974326e-05, + "loss": 0.5951, + "step": 6158 + }, + { + "epoch": 7.906290115532734, + "grad_norm": 4.625611782073975, + "learning_rate": 2.369961489088575e-05, + "loss": 0.6626, + "step": 6159 + }, + { + "epoch": 7.907573812580231, + "grad_norm": 3.09360408782959, + "learning_rate": 2.3703465982028242e-05, + "loss": 0.6389, + "step": 6160 + }, + { + "epoch": 7.908857509627728, + "grad_norm": 3.253371000289917, + "learning_rate": 2.3707317073170733e-05, + "loss": 0.6139, + "step": 6161 + }, + { + "epoch": 7.910141206675225, + "grad_norm": 12.258646965026855, + "learning_rate": 2.371116816431322e-05, + "loss": 0.613, + "step": 6162 + }, + { + "epoch": 7.911424903722722, + "grad_norm": 1.7493821382522583, + "learning_rate": 2.3715019255455712e-05, + "loss": 0.6292, + "step": 6163 + }, + { + "epoch": 7.912708600770218, + "grad_norm": 1.8521924018859863, + "learning_rate": 2.3718870346598203e-05, + "loss": 0.5866, + "step": 6164 + }, + { + "epoch": 7.913992297817715, + "grad_norm": 3.6001009941101074, + "learning_rate": 2.3722721437740694e-05, + "loss": 0.5708, + "step": 6165 + }, + { + "epoch": 7.915275994865212, + "grad_norm": 2.8834753036499023, + "learning_rate": 2.3726572528883185e-05, + "loss": 0.6007, + "step": 6166 + }, + { + "epoch": 7.916559691912709, + "grad_norm": 2.516801357269287, + "learning_rate": 2.3730423620025673e-05, + "loss": 0.5756, + 
"step": 6167 + }, + { + "epoch": 7.917843388960206, + "grad_norm": 1.6025135517120361, + "learning_rate": 2.3734274711168164e-05, + "loss": 0.6524, + "step": 6168 + }, + { + "epoch": 7.919127086007702, + "grad_norm": 1.3942774534225464, + "learning_rate": 2.3738125802310655e-05, + "loss": 0.594, + "step": 6169 + }, + { + "epoch": 7.920410783055199, + "grad_norm": 2.372413158416748, + "learning_rate": 2.3741976893453143e-05, + "loss": 0.5864, + "step": 6170 + }, + { + "epoch": 7.921694480102696, + "grad_norm": 1.604977011680603, + "learning_rate": 2.3745827984595637e-05, + "loss": 0.6219, + "step": 6171 + }, + { + "epoch": 7.922978177150193, + "grad_norm": 20.749767303466797, + "learning_rate": 2.374967907573813e-05, + "loss": 0.594, + "step": 6172 + }, + { + "epoch": 7.92426187419769, + "grad_norm": 2.305088996887207, + "learning_rate": 2.3753530166880616e-05, + "loss": 0.6925, + "step": 6173 + }, + { + "epoch": 7.925545571245186, + "grad_norm": 5.482275485992432, + "learning_rate": 2.3757381258023107e-05, + "loss": 0.6267, + "step": 6174 + }, + { + "epoch": 7.926829268292683, + "grad_norm": 1.2902799844741821, + "learning_rate": 2.3761232349165598e-05, + "loss": 0.6099, + "step": 6175 + }, + { + "epoch": 7.92811296534018, + "grad_norm": 1.8427101373672485, + "learning_rate": 2.3765083440308086e-05, + "loss": 0.6174, + "step": 6176 + }, + { + "epoch": 7.929396662387677, + "grad_norm": 2.136254072189331, + "learning_rate": 2.376893453145058e-05, + "loss": 0.6251, + "step": 6177 + }, + { + "epoch": 7.930680359435174, + "grad_norm": 1.939070224761963, + "learning_rate": 2.3772785622593068e-05, + "loss": 0.6177, + "step": 6178 + }, + { + "epoch": 7.93196405648267, + "grad_norm": 4.22256326675415, + "learning_rate": 2.377663671373556e-05, + "loss": 0.6484, + "step": 6179 + }, + { + "epoch": 7.933247753530167, + "grad_norm": 1.9947974681854248, + "learning_rate": 2.378048780487805e-05, + "loss": 0.6183, + "step": 6180 + }, + { + "epoch": 7.934531450577664, + "grad_norm": 
1.6649523973464966, + "learning_rate": 2.3784338896020538e-05, + "loss": 0.6327, + "step": 6181 + }, + { + "epoch": 7.935815147625161, + "grad_norm": 1.3751680850982666, + "learning_rate": 2.378818998716303e-05, + "loss": 0.5947, + "step": 6182 + }, + { + "epoch": 7.937098844672657, + "grad_norm": 2.844698905944824, + "learning_rate": 2.3792041078305523e-05, + "loss": 0.5548, + "step": 6183 + }, + { + "epoch": 7.938382541720154, + "grad_norm": 1.7270722389221191, + "learning_rate": 2.379589216944801e-05, + "loss": 0.6497, + "step": 6184 + }, + { + "epoch": 7.939666238767651, + "grad_norm": 1.3756657838821411, + "learning_rate": 2.3799743260590502e-05, + "loss": 0.6249, + "step": 6185 + }, + { + "epoch": 7.940949935815148, + "grad_norm": 2.8048319816589355, + "learning_rate": 2.380359435173299e-05, + "loss": 0.6457, + "step": 6186 + }, + { + "epoch": 7.942233632862644, + "grad_norm": 2.0504353046417236, + "learning_rate": 2.380744544287548e-05, + "loss": 0.6541, + "step": 6187 + }, + { + "epoch": 7.943517329910141, + "grad_norm": 3.2680792808532715, + "learning_rate": 2.3811296534017972e-05, + "loss": 0.6283, + "step": 6188 + }, + { + "epoch": 7.944801026957638, + "grad_norm": 2.071652412414551, + "learning_rate": 2.3815147625160463e-05, + "loss": 0.6255, + "step": 6189 + }, + { + "epoch": 7.946084724005135, + "grad_norm": 2.0730173587799072, + "learning_rate": 2.3818998716302954e-05, + "loss": 0.6565, + "step": 6190 + }, + { + "epoch": 7.947368421052632, + "grad_norm": 1.782095193862915, + "learning_rate": 2.3822849807445445e-05, + "loss": 0.6313, + "step": 6191 + }, + { + "epoch": 7.948652118100128, + "grad_norm": 34.77603530883789, + "learning_rate": 2.3826700898587933e-05, + "loss": 0.6382, + "step": 6192 + }, + { + "epoch": 7.949935815147625, + "grad_norm": 2.323399066925049, + "learning_rate": 2.3830551989730424e-05, + "loss": 0.6091, + "step": 6193 + }, + { + "epoch": 7.951219512195122, + "grad_norm": 1.8763320446014404, + "learning_rate": 
2.383440308087291e-05, + "loss": 0.6985, + "step": 6194 + }, + { + "epoch": 7.952503209242619, + "grad_norm": 6.780721664428711, + "learning_rate": 2.3838254172015406e-05, + "loss": 0.6741, + "step": 6195 + }, + { + "epoch": 7.953786906290116, + "grad_norm": 6.8530402183532715, + "learning_rate": 2.3842105263157897e-05, + "loss": 0.6758, + "step": 6196 + }, + { + "epoch": 7.955070603337612, + "grad_norm": 3.666166067123413, + "learning_rate": 2.3845956354300385e-05, + "loss": 0.6384, + "step": 6197 + }, + { + "epoch": 7.956354300385109, + "grad_norm": 5.58335542678833, + "learning_rate": 2.3849807445442876e-05, + "loss": 0.651, + "step": 6198 + }, + { + "epoch": 7.957637997432606, + "grad_norm": 25.99307632446289, + "learning_rate": 2.3853658536585367e-05, + "loss": 0.7358, + "step": 6199 + }, + { + "epoch": 7.958921694480103, + "grad_norm": 7.436834335327148, + "learning_rate": 2.3857509627727855e-05, + "loss": 0.7277, + "step": 6200 + }, + { + "epoch": 7.9602053915276, + "grad_norm": 4.537602424621582, + "learning_rate": 2.386136071887035e-05, + "loss": 0.7095, + "step": 6201 + }, + { + "epoch": 7.961489088575096, + "grad_norm": 4.685937881469727, + "learning_rate": 2.386521181001284e-05, + "loss": 0.7408, + "step": 6202 + }, + { + "epoch": 7.962772785622593, + "grad_norm": 4.103137493133545, + "learning_rate": 2.3869062901155328e-05, + "loss": 0.9455, + "step": 6203 + }, + { + "epoch": 7.96405648267009, + "grad_norm": 3.235252857208252, + "learning_rate": 2.387291399229782e-05, + "loss": 0.5963, + "step": 6204 + }, + { + "epoch": 7.965340179717587, + "grad_norm": 1.730162262916565, + "learning_rate": 2.3876765083440307e-05, + "loss": 0.6189, + "step": 6205 + }, + { + "epoch": 7.966623876765084, + "grad_norm": 1.5587011575698853, + "learning_rate": 2.3880616174582798e-05, + "loss": 0.6064, + "step": 6206 + }, + { + "epoch": 7.96790757381258, + "grad_norm": 3.0412709712982178, + "learning_rate": 2.388446726572529e-05, + "loss": 0.6092, + "step": 6207 + }, + { + 
"epoch": 7.969191270860077, + "grad_norm": 1.929526686668396, + "learning_rate": 2.388831835686778e-05, + "loss": 0.5744, + "step": 6208 + }, + { + "epoch": 7.970474967907574, + "grad_norm": 2.573573112487793, + "learning_rate": 2.389216944801027e-05, + "loss": 0.5762, + "step": 6209 + }, + { + "epoch": 7.971758664955071, + "grad_norm": 2.464062213897705, + "learning_rate": 2.3896020539152762e-05, + "loss": 0.6348, + "step": 6210 + }, + { + "epoch": 7.973042362002568, + "grad_norm": 2.277080535888672, + "learning_rate": 2.389987163029525e-05, + "loss": 0.6305, + "step": 6211 + }, + { + "epoch": 7.974326059050064, + "grad_norm": 1.6244763135910034, + "learning_rate": 2.390372272143774e-05, + "loss": 0.6066, + "step": 6212 + }, + { + "epoch": 7.975609756097561, + "grad_norm": 1.556441068649292, + "learning_rate": 2.390757381258023e-05, + "loss": 0.6616, + "step": 6213 + }, + { + "epoch": 7.976893453145058, + "grad_norm": 3.178328275680542, + "learning_rate": 2.3911424903722723e-05, + "loss": 0.6169, + "step": 6214 + }, + { + "epoch": 7.978177150192555, + "grad_norm": 1.8423422574996948, + "learning_rate": 2.3915275994865214e-05, + "loss": 0.614, + "step": 6215 + }, + { + "epoch": 7.979460847240051, + "grad_norm": 2.0709874629974365, + "learning_rate": 2.39191270860077e-05, + "loss": 0.6287, + "step": 6216 + }, + { + "epoch": 7.980744544287548, + "grad_norm": 1.823858380317688, + "learning_rate": 2.3922978177150193e-05, + "loss": 0.6564, + "step": 6217 + }, + { + "epoch": 7.982028241335045, + "grad_norm": 2.712674856185913, + "learning_rate": 2.3926829268292684e-05, + "loss": 0.6555, + "step": 6218 + }, + { + "epoch": 7.983311938382542, + "grad_norm": 1.9923181533813477, + "learning_rate": 2.393068035943517e-05, + "loss": 0.5916, + "step": 6219 + }, + { + "epoch": 7.984595635430038, + "grad_norm": 2.7040553092956543, + "learning_rate": 2.3934531450577666e-05, + "loss": 0.594, + "step": 6220 + }, + { + "epoch": 7.985879332477535, + "grad_norm": 18.930326461791992, + 
"learning_rate": 2.3938382541720157e-05, + "loss": 0.5825, + "step": 6221 + }, + { + "epoch": 7.987163029525032, + "grad_norm": 1.8149927854537964, + "learning_rate": 2.3942233632862645e-05, + "loss": 0.6227, + "step": 6222 + }, + { + "epoch": 7.988446726572529, + "grad_norm": 4.159711837768555, + "learning_rate": 2.3946084724005136e-05, + "loss": 0.7027, + "step": 6223 + }, + { + "epoch": 7.989730423620026, + "grad_norm": 5.8675456047058105, + "learning_rate": 2.3949935815147623e-05, + "loss": 0.6472, + "step": 6224 + }, + { + "epoch": 7.991014120667522, + "grad_norm": 2.413499355316162, + "learning_rate": 2.3953786906290115e-05, + "loss": 0.6666, + "step": 6225 + }, + { + "epoch": 7.992297817715019, + "grad_norm": 4.335354328155518, + "learning_rate": 2.395763799743261e-05, + "loss": 0.6813, + "step": 6226 + }, + { + "epoch": 7.993581514762516, + "grad_norm": 2.65458345413208, + "learning_rate": 2.3961489088575097e-05, + "loss": 0.6468, + "step": 6227 + }, + { + "epoch": 7.994865211810013, + "grad_norm": 2.407043695449829, + "learning_rate": 2.3965340179717588e-05, + "loss": 0.6709, + "step": 6228 + }, + { + "epoch": 7.99614890885751, + "grad_norm": 2.3941256999969482, + "learning_rate": 2.396919127086008e-05, + "loss": 0.6665, + "step": 6229 + }, + { + "epoch": 7.997432605905006, + "grad_norm": 2.16532039642334, + "learning_rate": 2.3973042362002567e-05, + "loss": 0.7074, + "step": 6230 + }, + { + "epoch": 7.998716302952503, + "grad_norm": 1.9803742170333862, + "learning_rate": 2.3976893453145058e-05, + "loss": 0.7233, + "step": 6231 + }, + { + "epoch": 8.0, + "grad_norm": 6.604814529418945, + "learning_rate": 2.398074454428755e-05, + "loss": 1.0284, + "step": 6232 + }, + { + "epoch": 8.001283697047496, + "grad_norm": 1.3646386861801147, + "learning_rate": 2.398459563543004e-05, + "loss": 0.5858, + "step": 6233 + }, + { + "epoch": 8.002567394094994, + "grad_norm": 1.7437431812286377, + "learning_rate": 2.398844672657253e-05, + "loss": 0.5516, + "step": 6234 + }, 
+ { + "epoch": 8.00385109114249, + "grad_norm": 2.6520256996154785, + "learning_rate": 2.399229781771502e-05, + "loss": 0.5638, + "step": 6235 + }, + { + "epoch": 8.005134788189988, + "grad_norm": 2.0220394134521484, + "learning_rate": 2.399614890885751e-05, + "loss": 0.6044, + "step": 6236 + }, + { + "epoch": 8.006418485237484, + "grad_norm": 3.397958755493164, + "learning_rate": 2.4e-05, + "loss": 0.592, + "step": 6237 + }, + { + "epoch": 8.00770218228498, + "grad_norm": 2.294633388519287, + "learning_rate": 2.400385109114249e-05, + "loss": 0.597, + "step": 6238 + }, + { + "epoch": 8.008985879332478, + "grad_norm": 2.2119476795196533, + "learning_rate": 2.4007702182284983e-05, + "loss": 0.5797, + "step": 6239 + }, + { + "epoch": 8.010269576379974, + "grad_norm": 1.469361424446106, + "learning_rate": 2.401155327342747e-05, + "loss": 0.5368, + "step": 6240 + }, + { + "epoch": 8.011553273427472, + "grad_norm": 1.6755956411361694, + "learning_rate": 2.401540436456996e-05, + "loss": 0.6256, + "step": 6241 + }, + { + "epoch": 8.012836970474968, + "grad_norm": 2.704101800918579, + "learning_rate": 2.4019255455712453e-05, + "loss": 0.5812, + "step": 6242 + }, + { + "epoch": 8.014120667522464, + "grad_norm": 1.6087473630905151, + "learning_rate": 2.402310654685494e-05, + "loss": 0.5822, + "step": 6243 + }, + { + "epoch": 8.015404364569962, + "grad_norm": 1.805041790008545, + "learning_rate": 2.4026957637997435e-05, + "loss": 0.6255, + "step": 6244 + }, + { + "epoch": 8.016688061617458, + "grad_norm": 1.4105372428894043, + "learning_rate": 2.4030808729139926e-05, + "loss": 0.5951, + "step": 6245 + }, + { + "epoch": 8.017971758664956, + "grad_norm": 3.5434978008270264, + "learning_rate": 2.4034659820282413e-05, + "loss": 0.6211, + "step": 6246 + }, + { + "epoch": 8.019255455712452, + "grad_norm": 2.525391101837158, + "learning_rate": 2.4038510911424905e-05, + "loss": 0.6285, + "step": 6247 + }, + { + "epoch": 8.020539152759948, + "grad_norm": 1.6094001531600952, + 
"learning_rate": 2.4042362002567396e-05, + "loss": 0.6062, + "step": 6248 + }, + { + "epoch": 8.021822849807446, + "grad_norm": 2.5175082683563232, + "learning_rate": 2.4046213093709883e-05, + "loss": 0.5786, + "step": 6249 + }, + { + "epoch": 8.023106546854942, + "grad_norm": 1.335640549659729, + "learning_rate": 2.4050064184852378e-05, + "loss": 0.5242, + "step": 6250 + }, + { + "epoch": 8.024390243902438, + "grad_norm": 3.2161030769348145, + "learning_rate": 2.4053915275994865e-05, + "loss": 0.5881, + "step": 6251 + }, + { + "epoch": 8.025673940949936, + "grad_norm": 3.1303277015686035, + "learning_rate": 2.4057766367137357e-05, + "loss": 0.6287, + "step": 6252 + }, + { + "epoch": 8.026957637997432, + "grad_norm": 1.6770442724227905, + "learning_rate": 2.4061617458279848e-05, + "loss": 0.5591, + "step": 6253 + }, + { + "epoch": 8.02824133504493, + "grad_norm": 1.65730619430542, + "learning_rate": 2.4065468549422335e-05, + "loss": 0.5697, + "step": 6254 + }, + { + "epoch": 8.029525032092426, + "grad_norm": 3.5399701595306396, + "learning_rate": 2.4069319640564826e-05, + "loss": 0.5979, + "step": 6255 + }, + { + "epoch": 8.030808729139922, + "grad_norm": 3.124401092529297, + "learning_rate": 2.4073170731707317e-05, + "loss": 0.6375, + "step": 6256 + }, + { + "epoch": 8.03209242618742, + "grad_norm": 1.897018551826477, + "learning_rate": 2.407702182284981e-05, + "loss": 0.558, + "step": 6257 + }, + { + "epoch": 8.033376123234916, + "grad_norm": 2.7375564575195312, + "learning_rate": 2.40808729139923e-05, + "loss": 0.5625, + "step": 6258 + }, + { + "epoch": 8.034659820282414, + "grad_norm": 2.5444719791412354, + "learning_rate": 2.4084724005134787e-05, + "loss": 0.5988, + "step": 6259 + }, + { + "epoch": 8.03594351732991, + "grad_norm": 2.3689115047454834, + "learning_rate": 2.4088575096277278e-05, + "loss": 0.5693, + "step": 6260 + }, + { + "epoch": 8.037227214377406, + "grad_norm": 2.9484264850616455, + "learning_rate": 2.409242618741977e-05, + "loss": 0.6406, + 
"step": 6261 + }, + { + "epoch": 8.038510911424904, + "grad_norm": 10.039155960083008, + "learning_rate": 2.4096277278562257e-05, + "loss": 0.6547, + "step": 6262 + }, + { + "epoch": 8.0397946084724, + "grad_norm": 2.546867609024048, + "learning_rate": 2.410012836970475e-05, + "loss": 0.6277, + "step": 6263 + }, + { + "epoch": 8.041078305519898, + "grad_norm": 1.4452091455459595, + "learning_rate": 2.4103979460847243e-05, + "loss": 0.6055, + "step": 6264 + }, + { + "epoch": 8.042362002567394, + "grad_norm": 3.0905911922454834, + "learning_rate": 2.410783055198973e-05, + "loss": 0.6422, + "step": 6265 + }, + { + "epoch": 8.04364569961489, + "grad_norm": 3.436882495880127, + "learning_rate": 2.411168164313222e-05, + "loss": 0.6242, + "step": 6266 + }, + { + "epoch": 8.044929396662388, + "grad_norm": 1.3530638217926025, + "learning_rate": 2.4115532734274712e-05, + "loss": 0.66, + "step": 6267 + }, + { + "epoch": 8.046213093709884, + "grad_norm": 2.1105308532714844, + "learning_rate": 2.41193838254172e-05, + "loss": 0.6558, + "step": 6268 + }, + { + "epoch": 8.047496790757382, + "grad_norm": 2.706202507019043, + "learning_rate": 2.4123234916559695e-05, + "loss": 0.6394, + "step": 6269 + }, + { + "epoch": 8.048780487804878, + "grad_norm": 2.545724868774414, + "learning_rate": 2.4127086007702182e-05, + "loss": 0.6018, + "step": 6270 + }, + { + "epoch": 8.050064184852374, + "grad_norm": 1.6317353248596191, + "learning_rate": 2.4130937098844673e-05, + "loss": 0.674, + "step": 6271 + }, + { + "epoch": 8.051347881899872, + "grad_norm": 2.5641849040985107, + "learning_rate": 2.4134788189987164e-05, + "loss": 0.6482, + "step": 6272 + }, + { + "epoch": 8.052631578947368, + "grad_norm": 3.7687246799468994, + "learning_rate": 2.4138639281129652e-05, + "loss": 0.62, + "step": 6273 + }, + { + "epoch": 8.053915275994866, + "grad_norm": 3.1644911766052246, + "learning_rate": 2.4142490372272143e-05, + "loss": 0.6734, + "step": 6274 + }, + { + "epoch": 8.055198973042362, + "grad_norm": 
1.5906130075454712, + "learning_rate": 2.4146341463414638e-05, + "loss": 0.6156, + "step": 6275 + }, + { + "epoch": 8.056482670089858, + "grad_norm": 4.576804161071777, + "learning_rate": 2.4150192554557125e-05, + "loss": 0.6228, + "step": 6276 + }, + { + "epoch": 8.057766367137356, + "grad_norm": 4.451948165893555, + "learning_rate": 2.4154043645699616e-05, + "loss": 0.6398, + "step": 6277 + }, + { + "epoch": 8.059050064184852, + "grad_norm": 2.7404048442840576, + "learning_rate": 2.4157894736842104e-05, + "loss": 0.6931, + "step": 6278 + }, + { + "epoch": 8.06033376123235, + "grad_norm": 3.235185146331787, + "learning_rate": 2.4161745827984595e-05, + "loss": 0.713, + "step": 6279 + }, + { + "epoch": 8.061617458279846, + "grad_norm": 4.3360137939453125, + "learning_rate": 2.4165596919127086e-05, + "loss": 0.7659, + "step": 6280 + }, + { + "epoch": 8.062901155327342, + "grad_norm": 11.14806079864502, + "learning_rate": 2.4169448010269577e-05, + "loss": 0.8098, + "step": 6281 + }, + { + "epoch": 8.06418485237484, + "grad_norm": 2.5517845153808594, + "learning_rate": 2.4173299101412068e-05, + "loss": 0.8709, + "step": 6282 + }, + { + "epoch": 8.065468549422336, + "grad_norm": 2.562666177749634, + "learning_rate": 2.417715019255456e-05, + "loss": 0.5683, + "step": 6283 + }, + { + "epoch": 8.066752246469832, + "grad_norm": 1.9954278469085693, + "learning_rate": 2.4181001283697047e-05, + "loss": 0.5592, + "step": 6284 + }, + { + "epoch": 8.06803594351733, + "grad_norm": 1.706905722618103, + "learning_rate": 2.4184852374839538e-05, + "loss": 0.5838, + "step": 6285 + }, + { + "epoch": 8.069319640564826, + "grad_norm": 1.112085223197937, + "learning_rate": 2.4188703465982026e-05, + "loss": 0.5999, + "step": 6286 + }, + { + "epoch": 8.070603337612324, + "grad_norm": 2.581044912338257, + "learning_rate": 2.419255455712452e-05, + "loss": 0.6207, + "step": 6287 + }, + { + "epoch": 8.07188703465982, + "grad_norm": 1.5778703689575195, + "learning_rate": 2.419640564826701e-05, + 
"loss": 0.5771, + "step": 6288 + }, + { + "epoch": 8.073170731707316, + "grad_norm": 1.1835722923278809, + "learning_rate": 2.42002567394095e-05, + "loss": 0.6373, + "step": 6289 + }, + { + "epoch": 8.074454428754814, + "grad_norm": 1.556995153427124, + "learning_rate": 2.420410783055199e-05, + "loss": 0.5682, + "step": 6290 + }, + { + "epoch": 8.07573812580231, + "grad_norm": 2.592424154281616, + "learning_rate": 2.420795892169448e-05, + "loss": 0.5914, + "step": 6291 + }, + { + "epoch": 8.077021822849808, + "grad_norm": 1.7542952299118042, + "learning_rate": 2.421181001283697e-05, + "loss": 0.5712, + "step": 6292 + }, + { + "epoch": 8.078305519897304, + "grad_norm": 1.676992654800415, + "learning_rate": 2.4215661103979463e-05, + "loss": 0.6193, + "step": 6293 + }, + { + "epoch": 8.0795892169448, + "grad_norm": 1.7381482124328613, + "learning_rate": 2.4219512195121954e-05, + "loss": 0.5824, + "step": 6294 + }, + { + "epoch": 8.080872913992298, + "grad_norm": 1.8453776836395264, + "learning_rate": 2.4223363286264442e-05, + "loss": 0.5997, + "step": 6295 + }, + { + "epoch": 8.082156611039794, + "grad_norm": 2.8914313316345215, + "learning_rate": 2.4227214377406933e-05, + "loss": 0.582, + "step": 6296 + }, + { + "epoch": 8.083440308087292, + "grad_norm": 2.9843850135803223, + "learning_rate": 2.423106546854942e-05, + "loss": 0.5545, + "step": 6297 + }, + { + "epoch": 8.084724005134788, + "grad_norm": 2.499103307723999, + "learning_rate": 2.4234916559691912e-05, + "loss": 0.579, + "step": 6298 + }, + { + "epoch": 8.086007702182284, + "grad_norm": 3.4301767349243164, + "learning_rate": 2.4238767650834406e-05, + "loss": 0.6073, + "step": 6299 + }, + { + "epoch": 8.087291399229782, + "grad_norm": 2.088613748550415, + "learning_rate": 2.4242618741976894e-05, + "loss": 0.594, + "step": 6300 + }, + { + "epoch": 8.088575096277278, + "grad_norm": 2.659173011779785, + "learning_rate": 2.4246469833119385e-05, + "loss": 0.5852, + "step": 6301 + }, + { + "epoch": 
8.089858793324776, + "grad_norm": 2.2836647033691406, + "learning_rate": 2.4250320924261876e-05, + "loss": 0.6303, + "step": 6302 + }, + { + "epoch": 8.091142490372272, + "grad_norm": 3.379901647567749, + "learning_rate": 2.4254172015404364e-05, + "loss": 0.5691, + "step": 6303 + }, + { + "epoch": 8.092426187419768, + "grad_norm": 1.6112968921661377, + "learning_rate": 2.4258023106546855e-05, + "loss": 0.6172, + "step": 6304 + }, + { + "epoch": 8.093709884467266, + "grad_norm": 1.4387322664260864, + "learning_rate": 2.4261874197689346e-05, + "loss": 0.5666, + "step": 6305 + }, + { + "epoch": 8.094993581514762, + "grad_norm": 2.4046919345855713, + "learning_rate": 2.4265725288831837e-05, + "loss": 0.5933, + "step": 6306 + }, + { + "epoch": 8.09627727856226, + "grad_norm": 1.9913601875305176, + "learning_rate": 2.4269576379974328e-05, + "loss": 0.562, + "step": 6307 + }, + { + "epoch": 8.097560975609756, + "grad_norm": 2.4989829063415527, + "learning_rate": 2.4273427471116816e-05, + "loss": 0.6404, + "step": 6308 + }, + { + "epoch": 8.098844672657252, + "grad_norm": 2.1655750274658203, + "learning_rate": 2.4277278562259307e-05, + "loss": 0.5851, + "step": 6309 + }, + { + "epoch": 8.10012836970475, + "grad_norm": 3.6092381477355957, + "learning_rate": 2.4281129653401798e-05, + "loss": 0.5793, + "step": 6310 + }, + { + "epoch": 8.101412066752246, + "grad_norm": 2.2918617725372314, + "learning_rate": 2.4284980744544286e-05, + "loss": 0.6426, + "step": 6311 + }, + { + "epoch": 8.102695763799744, + "grad_norm": 7.99707555770874, + "learning_rate": 2.428883183568678e-05, + "loss": 0.5788, + "step": 6312 + }, + { + "epoch": 8.10397946084724, + "grad_norm": 2.820892333984375, + "learning_rate": 2.429268292682927e-05, + "loss": 0.6212, + "step": 6313 + }, + { + "epoch": 8.105263157894736, + "grad_norm": 1.4996137619018555, + "learning_rate": 2.429653401797176e-05, + "loss": 0.6158, + "step": 6314 + }, + { + "epoch": 8.106546854942234, + "grad_norm": 3.684407949447632, + 
"learning_rate": 2.430038510911425e-05, + "loss": 0.6597, + "step": 6315 + }, + { + "epoch": 8.10783055198973, + "grad_norm": 6.965456962585449, + "learning_rate": 2.4304236200256738e-05, + "loss": 0.6338, + "step": 6316 + }, + { + "epoch": 8.109114249037226, + "grad_norm": 2.452847480773926, + "learning_rate": 2.430808729139923e-05, + "loss": 0.6398, + "step": 6317 + }, + { + "epoch": 8.110397946084724, + "grad_norm": 1.9918856620788574, + "learning_rate": 2.4311938382541723e-05, + "loss": 0.6343, + "step": 6318 + }, + { + "epoch": 8.11168164313222, + "grad_norm": 2.3310940265655518, + "learning_rate": 2.431578947368421e-05, + "loss": 0.6758, + "step": 6319 + }, + { + "epoch": 8.112965340179718, + "grad_norm": 2.3481569290161133, + "learning_rate": 2.4319640564826702e-05, + "loss": 0.6699, + "step": 6320 + }, + { + "epoch": 8.114249037227214, + "grad_norm": 2.5964696407318115, + "learning_rate": 2.4323491655969193e-05, + "loss": 0.6267, + "step": 6321 + }, + { + "epoch": 8.11553273427471, + "grad_norm": 5.354891300201416, + "learning_rate": 2.432734274711168e-05, + "loss": 0.6313, + "step": 6322 + }, + { + "epoch": 8.116816431322208, + "grad_norm": 2.7835781574249268, + "learning_rate": 2.433119383825417e-05, + "loss": 0.7104, + "step": 6323 + }, + { + "epoch": 8.118100128369704, + "grad_norm": 2.360197067260742, + "learning_rate": 2.4335044929396663e-05, + "loss": 0.6232, + "step": 6324 + }, + { + "epoch": 8.119383825417202, + "grad_norm": 32.578304290771484, + "learning_rate": 2.4338896020539154e-05, + "loss": 0.6228, + "step": 6325 + }, + { + "epoch": 8.120667522464698, + "grad_norm": 1.8205592632293701, + "learning_rate": 2.4342747111681645e-05, + "loss": 0.6272, + "step": 6326 + }, + { + "epoch": 8.121951219512194, + "grad_norm": 4.14470911026001, + "learning_rate": 2.4346598202824133e-05, + "loss": 0.6748, + "step": 6327 + }, + { + "epoch": 8.123234916559692, + "grad_norm": 1.6321895122528076, + "learning_rate": 2.4350449293966624e-05, + "loss": 0.7321, + 
"step": 6328 + }, + { + "epoch": 8.124518613607188, + "grad_norm": 4.726309776306152, + "learning_rate": 2.4354300385109115e-05, + "loss": 0.7176, + "step": 6329 + }, + { + "epoch": 8.125802310654686, + "grad_norm": 3.682708501815796, + "learning_rate": 2.4358151476251606e-05, + "loss": 0.7027, + "step": 6330 + }, + { + "epoch": 8.127086007702182, + "grad_norm": 2.5758464336395264, + "learning_rate": 2.4362002567394097e-05, + "loss": 0.6712, + "step": 6331 + }, + { + "epoch": 8.128369704749678, + "grad_norm": 2.3063831329345703, + "learning_rate": 2.4365853658536588e-05, + "loss": 0.8084, + "step": 6332 + }, + { + "epoch": 8.129653401797176, + "grad_norm": 15.678607940673828, + "learning_rate": 2.4369704749679076e-05, + "loss": 0.6046, + "step": 6333 + }, + { + "epoch": 8.130937098844672, + "grad_norm": 3.9491777420043945, + "learning_rate": 2.4373555840821567e-05, + "loss": 0.5341, + "step": 6334 + }, + { + "epoch": 8.13222079589217, + "grad_norm": 1.9127707481384277, + "learning_rate": 2.4377406931964054e-05, + "loss": 0.5517, + "step": 6335 + }, + { + "epoch": 8.133504492939666, + "grad_norm": 5.218945026397705, + "learning_rate": 2.438125802310655e-05, + "loss": 0.5511, + "step": 6336 + }, + { + "epoch": 8.134788189987162, + "grad_norm": 2.1675899028778076, + "learning_rate": 2.438510911424904e-05, + "loss": 0.5879, + "step": 6337 + }, + { + "epoch": 8.13607188703466, + "grad_norm": 2.7135510444641113, + "learning_rate": 2.4388960205391528e-05, + "loss": 0.5888, + "step": 6338 + }, + { + "epoch": 8.137355584082156, + "grad_norm": 4.570014953613281, + "learning_rate": 2.439281129653402e-05, + "loss": 0.5787, + "step": 6339 + }, + { + "epoch": 8.138639281129654, + "grad_norm": 2.5326285362243652, + "learning_rate": 2.439666238767651e-05, + "loss": 0.5832, + "step": 6340 + }, + { + "epoch": 8.13992297817715, + "grad_norm": 1.295655608177185, + "learning_rate": 2.4400513478818997e-05, + "loss": 0.5679, + "step": 6341 + }, + { + "epoch": 8.141206675224646, + 
"grad_norm": 1.541727900505066, + "learning_rate": 2.4404364569961492e-05, + "loss": 0.6171, + "step": 6342 + }, + { + "epoch": 8.142490372272144, + "grad_norm": 3.100242853164673, + "learning_rate": 2.440821566110398e-05, + "loss": 0.5525, + "step": 6343 + }, + { + "epoch": 8.14377406931964, + "grad_norm": 2.6355597972869873, + "learning_rate": 2.441206675224647e-05, + "loss": 0.5746, + "step": 6344 + }, + { + "epoch": 8.145057766367138, + "grad_norm": 1.0998939275741577, + "learning_rate": 2.441591784338896e-05, + "loss": 0.5505, + "step": 6345 + }, + { + "epoch": 8.146341463414634, + "grad_norm": 1.6758575439453125, + "learning_rate": 2.441976893453145e-05, + "loss": 0.5785, + "step": 6346 + }, + { + "epoch": 8.14762516046213, + "grad_norm": 2.7144110202789307, + "learning_rate": 2.442362002567394e-05, + "loss": 0.5964, + "step": 6347 + }, + { + "epoch": 8.148908857509628, + "grad_norm": 2.6607465744018555, + "learning_rate": 2.4427471116816435e-05, + "loss": 0.5983, + "step": 6348 + }, + { + "epoch": 8.150192554557124, + "grad_norm": 1.4128221273422241, + "learning_rate": 2.4431322207958923e-05, + "loss": 0.557, + "step": 6349 + }, + { + "epoch": 8.15147625160462, + "grad_norm": 1.802327275276184, + "learning_rate": 2.4435173299101414e-05, + "loss": 0.5855, + "step": 6350 + }, + { + "epoch": 8.152759948652118, + "grad_norm": 12.735929489135742, + "learning_rate": 2.44390243902439e-05, + "loss": 0.5549, + "step": 6351 + }, + { + "epoch": 8.154043645699614, + "grad_norm": 6.271300792694092, + "learning_rate": 2.4442875481386392e-05, + "loss": 0.6688, + "step": 6352 + }, + { + "epoch": 8.155327342747112, + "grad_norm": 2.7289037704467773, + "learning_rate": 2.4446726572528883e-05, + "loss": 0.6444, + "step": 6353 + }, + { + "epoch": 8.156611039794608, + "grad_norm": 1.7567895650863647, + "learning_rate": 2.4450577663671375e-05, + "loss": 0.5663, + "step": 6354 + }, + { + "epoch": 8.157894736842104, + "grad_norm": 2.9975426197052, + "learning_rate": 
2.4454428754813866e-05, + "loss": 0.5968, + "step": 6355 + }, + { + "epoch": 8.159178433889602, + "grad_norm": 2.8767762184143066, + "learning_rate": 2.4458279845956357e-05, + "loss": 0.5939, + "step": 6356 + }, + { + "epoch": 8.160462130937098, + "grad_norm": 3.082435369491577, + "learning_rate": 2.4462130937098844e-05, + "loss": 0.64, + "step": 6357 + }, + { + "epoch": 8.161745827984596, + "grad_norm": 1.2013155221939087, + "learning_rate": 2.4465982028241335e-05, + "loss": 0.6071, + "step": 6358 + }, + { + "epoch": 8.163029525032092, + "grad_norm": 1.9139958620071411, + "learning_rate": 2.4469833119383826e-05, + "loss": 0.6456, + "step": 6359 + }, + { + "epoch": 8.164313222079588, + "grad_norm": 1.9335200786590576, + "learning_rate": 2.4473684210526318e-05, + "loss": 0.58, + "step": 6360 + }, + { + "epoch": 8.165596919127086, + "grad_norm": 1.2143468856811523, + "learning_rate": 2.447753530166881e-05, + "loss": 0.5894, + "step": 6361 + }, + { + "epoch": 8.166880616174582, + "grad_norm": 1.575405478477478, + "learning_rate": 2.4481386392811296e-05, + "loss": 0.6501, + "step": 6362 + }, + { + "epoch": 8.16816431322208, + "grad_norm": 1.3013871908187866, + "learning_rate": 2.4485237483953787e-05, + "loss": 0.6005, + "step": 6363 + }, + { + "epoch": 8.169448010269576, + "grad_norm": 1.7166577577590942, + "learning_rate": 2.448908857509628e-05, + "loss": 0.5819, + "step": 6364 + }, + { + "epoch": 8.170731707317072, + "grad_norm": 2.294785976409912, + "learning_rate": 2.4492939666238766e-05, + "loss": 0.6073, + "step": 6365 + }, + { + "epoch": 8.17201540436457, + "grad_norm": 1.6792353391647339, + "learning_rate": 2.4496790757381257e-05, + "loss": 0.6586, + "step": 6366 + }, + { + "epoch": 8.173299101412066, + "grad_norm": 2.327618360519409, + "learning_rate": 2.450064184852375e-05, + "loss": 0.6117, + "step": 6367 + }, + { + "epoch": 8.174582798459564, + "grad_norm": 2.8439834117889404, + "learning_rate": 2.450449293966624e-05, + "loss": 0.6563, + "step": 6368 + }, + 
{ + "epoch": 8.17586649550706, + "grad_norm": 1.2421001195907593, + "learning_rate": 2.450834403080873e-05, + "loss": 0.6922, + "step": 6369 + }, + { + "epoch": 8.177150192554556, + "grad_norm": 1.3751089572906494, + "learning_rate": 2.4512195121951218e-05, + "loss": 0.6494, + "step": 6370 + }, + { + "epoch": 8.178433889602054, + "grad_norm": 3.0546741485595703, + "learning_rate": 2.451604621309371e-05, + "loss": 0.6282, + "step": 6371 + }, + { + "epoch": 8.17971758664955, + "grad_norm": 4.196930885314941, + "learning_rate": 2.45198973042362e-05, + "loss": 0.6241, + "step": 6372 + }, + { + "epoch": 8.181001283697048, + "grad_norm": 2.485570192337036, + "learning_rate": 2.452374839537869e-05, + "loss": 0.7055, + "step": 6373 + }, + { + "epoch": 8.182284980744544, + "grad_norm": 3.8024814128875732, + "learning_rate": 2.4527599486521182e-05, + "loss": 0.6257, + "step": 6374 + }, + { + "epoch": 8.18356867779204, + "grad_norm": 1.46562922000885, + "learning_rate": 2.4531450577663673e-05, + "loss": 0.6595, + "step": 6375 + }, + { + "epoch": 8.184852374839538, + "grad_norm": 3.497647523880005, + "learning_rate": 2.453530166880616e-05, + "loss": 0.7008, + "step": 6376 + }, + { + "epoch": 8.186136071887034, + "grad_norm": 2.728440761566162, + "learning_rate": 2.4539152759948652e-05, + "loss": 0.6474, + "step": 6377 + }, + { + "epoch": 8.187419768934532, + "grad_norm": 2.9560065269470215, + "learning_rate": 2.4543003851091143e-05, + "loss": 0.6541, + "step": 6378 + }, + { + "epoch": 8.188703465982028, + "grad_norm": 3.5139033794403076, + "learning_rate": 2.4546854942233634e-05, + "loss": 0.7301, + "step": 6379 + }, + { + "epoch": 8.189987163029524, + "grad_norm": 2.1414053440093994, + "learning_rate": 2.4550706033376125e-05, + "loss": 0.7191, + "step": 6380 + }, + { + "epoch": 8.191270860077022, + "grad_norm": 2.126335382461548, + "learning_rate": 2.4554557124518613e-05, + "loss": 0.7892, + "step": 6381 + }, + { + "epoch": 8.192554557124518, + "grad_norm": 
16.717342376708984, + "learning_rate": 2.4558408215661104e-05, + "loss": 0.934, + "step": 6382 + }, + { + "epoch": 8.193838254172016, + "grad_norm": 2.040186643600464, + "learning_rate": 2.4562259306803595e-05, + "loss": 0.5788, + "step": 6383 + }, + { + "epoch": 8.195121951219512, + "grad_norm": 1.46153724193573, + "learning_rate": 2.4566110397946083e-05, + "loss": 0.6012, + "step": 6384 + }, + { + "epoch": 8.196405648267008, + "grad_norm": 1.2965846061706543, + "learning_rate": 2.4569961489088577e-05, + "loss": 0.5859, + "step": 6385 + }, + { + "epoch": 8.197689345314506, + "grad_norm": 2.085536479949951, + "learning_rate": 2.457381258023107e-05, + "loss": 0.5732, + "step": 6386 + }, + { + "epoch": 8.198973042362002, + "grad_norm": 1.0199567079544067, + "learning_rate": 2.4577663671373556e-05, + "loss": 0.5696, + "step": 6387 + }, + { + "epoch": 8.200256739409499, + "grad_norm": 1.0258241891860962, + "learning_rate": 2.4581514762516047e-05, + "loss": 0.5664, + "step": 6388 + }, + { + "epoch": 8.201540436456996, + "grad_norm": 3.5939786434173584, + "learning_rate": 2.4585365853658535e-05, + "loss": 0.6108, + "step": 6389 + }, + { + "epoch": 8.202824133504492, + "grad_norm": 1.70795738697052, + "learning_rate": 2.4589216944801026e-05, + "loss": 0.6125, + "step": 6390 + }, + { + "epoch": 8.20410783055199, + "grad_norm": 5.024095058441162, + "learning_rate": 2.459306803594352e-05, + "loss": 0.6075, + "step": 6391 + }, + { + "epoch": 8.205391527599486, + "grad_norm": 1.7455065250396729, + "learning_rate": 2.4596919127086008e-05, + "loss": 0.6063, + "step": 6392 + }, + { + "epoch": 8.206675224646983, + "grad_norm": 1.2869138717651367, + "learning_rate": 2.46007702182285e-05, + "loss": 0.5581, + "step": 6393 + }, + { + "epoch": 8.20795892169448, + "grad_norm": 2.7756803035736084, + "learning_rate": 2.460462130937099e-05, + "loss": 0.6162, + "step": 6394 + }, + { + "epoch": 8.209242618741976, + "grad_norm": 4.076871395111084, + "learning_rate": 2.4608472400513478e-05, + 
"loss": 0.5803, + "step": 6395 + }, + { + "epoch": 8.210526315789474, + "grad_norm": 3.1365749835968018, + "learning_rate": 2.461232349165597e-05, + "loss": 0.5851, + "step": 6396 + }, + { + "epoch": 8.21181001283697, + "grad_norm": 1.4597582817077637, + "learning_rate": 2.461617458279846e-05, + "loss": 0.6059, + "step": 6397 + }, + { + "epoch": 8.213093709884467, + "grad_norm": 1.895756483078003, + "learning_rate": 2.462002567394095e-05, + "loss": 0.6366, + "step": 6398 + }, + { + "epoch": 8.214377406931964, + "grad_norm": 2.7031307220458984, + "learning_rate": 2.4623876765083442e-05, + "loss": 0.5857, + "step": 6399 + }, + { + "epoch": 8.21566110397946, + "grad_norm": 1.6985589265823364, + "learning_rate": 2.462772785622593e-05, + "loss": 0.5576, + "step": 6400 + }, + { + "epoch": 8.216944801026958, + "grad_norm": 4.428929805755615, + "learning_rate": 2.463157894736842e-05, + "loss": 0.5971, + "step": 6401 + }, + { + "epoch": 8.218228498074454, + "grad_norm": 11.204327583312988, + "learning_rate": 2.4635430038510912e-05, + "loss": 0.5975, + "step": 6402 + }, + { + "epoch": 8.21951219512195, + "grad_norm": 1.8314534425735474, + "learning_rate": 2.4639281129653403e-05, + "loss": 0.6103, + "step": 6403 + }, + { + "epoch": 8.220795892169448, + "grad_norm": 1.2248954772949219, + "learning_rate": 2.4643132220795894e-05, + "loss": 0.644, + "step": 6404 + }, + { + "epoch": 8.222079589216944, + "grad_norm": 2.8591082096099854, + "learning_rate": 2.4646983311938385e-05, + "loss": 0.6351, + "step": 6405 + }, + { + "epoch": 8.223363286264442, + "grad_norm": 1.9398293495178223, + "learning_rate": 2.4650834403080873e-05, + "loss": 0.6139, + "step": 6406 + }, + { + "epoch": 8.224646983311938, + "grad_norm": 1.2543567419052124, + "learning_rate": 2.4654685494223364e-05, + "loss": 0.6204, + "step": 6407 + }, + { + "epoch": 8.225930680359435, + "grad_norm": 2.6161553859710693, + "learning_rate": 2.465853658536585e-05, + "loss": 0.6143, + "step": 6408 + }, + { + "epoch": 
8.227214377406932, + "grad_norm": 18.729122161865234, + "learning_rate": 2.4662387676508346e-05, + "loss": 0.578, + "step": 6409 + }, + { + "epoch": 8.228498074454428, + "grad_norm": 1.6570364236831665, + "learning_rate": 2.4666238767650837e-05, + "loss": 0.6069, + "step": 6410 + }, + { + "epoch": 8.229781771501926, + "grad_norm": 2.8910861015319824, + "learning_rate": 2.4670089858793325e-05, + "loss": 0.6369, + "step": 6411 + }, + { + "epoch": 8.231065468549422, + "grad_norm": 3.3699402809143066, + "learning_rate": 2.4673940949935816e-05, + "loss": 0.6231, + "step": 6412 + }, + { + "epoch": 8.232349165596919, + "grad_norm": 2.5678842067718506, + "learning_rate": 2.4677792041078307e-05, + "loss": 0.6046, + "step": 6413 + }, + { + "epoch": 8.233632862644416, + "grad_norm": 1.5038909912109375, + "learning_rate": 2.4681643132220795e-05, + "loss": 0.6188, + "step": 6414 + }, + { + "epoch": 8.234916559691912, + "grad_norm": 1.0752569437026978, + "learning_rate": 2.4685494223363286e-05, + "loss": 0.6477, + "step": 6415 + }, + { + "epoch": 8.23620025673941, + "grad_norm": 2.508845090866089, + "learning_rate": 2.4689345314505777e-05, + "loss": 0.6388, + "step": 6416 + }, + { + "epoch": 8.237483953786906, + "grad_norm": 1.409893274307251, + "learning_rate": 2.4693196405648268e-05, + "loss": 0.6059, + "step": 6417 + }, + { + "epoch": 8.238767650834403, + "grad_norm": 1.6664468050003052, + "learning_rate": 2.469704749679076e-05, + "loss": 0.5834, + "step": 6418 + }, + { + "epoch": 8.2400513478819, + "grad_norm": 3.27166485786438, + "learning_rate": 2.4700898587933247e-05, + "loss": 0.5512, + "step": 6419 + }, + { + "epoch": 8.241335044929397, + "grad_norm": 1.8829768896102905, + "learning_rate": 2.4704749679075738e-05, + "loss": 0.5755, + "step": 6420 + }, + { + "epoch": 8.242618741976893, + "grad_norm": 2.4135360717773438, + "learning_rate": 2.470860077021823e-05, + "loss": 0.6365, + "step": 6421 + }, + { + "epoch": 8.24390243902439, + "grad_norm": 10.923074722290039, + 
"learning_rate": 2.471245186136072e-05, + "loss": 0.6397, + "step": 6422 + }, + { + "epoch": 8.245186136071887, + "grad_norm": 2.651029586791992, + "learning_rate": 2.471630295250321e-05, + "loss": 0.6054, + "step": 6423 + }, + { + "epoch": 8.246469833119384, + "grad_norm": 7.2750630378723145, + "learning_rate": 2.4720154043645702e-05, + "loss": 0.6947, + "step": 6424 + }, + { + "epoch": 8.24775353016688, + "grad_norm": 32.664981842041016, + "learning_rate": 2.472400513478819e-05, + "loss": 0.6688, + "step": 6425 + }, + { + "epoch": 8.249037227214377, + "grad_norm": 2.257308006286621, + "learning_rate": 2.472785622593068e-05, + "loss": 0.6247, + "step": 6426 + }, + { + "epoch": 8.250320924261874, + "grad_norm": 6.952650547027588, + "learning_rate": 2.473170731707317e-05, + "loss": 0.6987, + "step": 6427 + }, + { + "epoch": 8.25160462130937, + "grad_norm": 5.310196399688721, + "learning_rate": 2.4735558408215663e-05, + "loss": 0.7173, + "step": 6428 + }, + { + "epoch": 8.252888318356868, + "grad_norm": 10.392789840698242, + "learning_rate": 2.4739409499358154e-05, + "loss": 0.6717, + "step": 6429 + }, + { + "epoch": 8.254172015404365, + "grad_norm": 4.526954650878906, + "learning_rate": 2.474326059050064e-05, + "loss": 0.6758, + "step": 6430 + }, + { + "epoch": 8.25545571245186, + "grad_norm": 2.275062322616577, + "learning_rate": 2.4747111681643133e-05, + "loss": 0.7846, + "step": 6431 + }, + { + "epoch": 8.256739409499358, + "grad_norm": 6.655947208404541, + "learning_rate": 2.4750962772785624e-05, + "loss": 0.8574, + "step": 6432 + }, + { + "epoch": 8.258023106546855, + "grad_norm": 4.219980239868164, + "learning_rate": 2.475481386392811e-05, + "loss": 0.6274, + "step": 6433 + }, + { + "epoch": 8.259306803594352, + "grad_norm": 1.9054936170578003, + "learning_rate": 2.4758664955070606e-05, + "loss": 0.5774, + "step": 6434 + }, + { + "epoch": 8.260590500641849, + "grad_norm": 1.070307731628418, + "learning_rate": 2.4762516046213094e-05, + "loss": 0.6, + "step": 
6435 + }, + { + "epoch": 8.261874197689345, + "grad_norm": 2.1717402935028076, + "learning_rate": 2.4766367137355585e-05, + "loss": 0.6195, + "step": 6436 + }, + { + "epoch": 8.263157894736842, + "grad_norm": 4.353585243225098, + "learning_rate": 2.4770218228498076e-05, + "loss": 0.6056, + "step": 6437 + }, + { + "epoch": 8.264441591784339, + "grad_norm": 2.190707206726074, + "learning_rate": 2.4774069319640563e-05, + "loss": 0.5752, + "step": 6438 + }, + { + "epoch": 8.265725288831836, + "grad_norm": 2.0720229148864746, + "learning_rate": 2.4777920410783055e-05, + "loss": 0.5613, + "step": 6439 + }, + { + "epoch": 8.267008985879333, + "grad_norm": 2.4623820781707764, + "learning_rate": 2.478177150192555e-05, + "loss": 0.5442, + "step": 6440 + }, + { + "epoch": 8.268292682926829, + "grad_norm": 3.2345364093780518, + "learning_rate": 2.4785622593068037e-05, + "loss": 0.5765, + "step": 6441 + }, + { + "epoch": 8.269576379974326, + "grad_norm": 11.314675331115723, + "learning_rate": 2.4789473684210528e-05, + "loss": 0.586, + "step": 6442 + }, + { + "epoch": 8.270860077021823, + "grad_norm": 4.765658855438232, + "learning_rate": 2.4793324775353015e-05, + "loss": 0.5899, + "step": 6443 + }, + { + "epoch": 8.27214377406932, + "grad_norm": 2.2557859420776367, + "learning_rate": 2.4797175866495506e-05, + "loss": 0.5798, + "step": 6444 + }, + { + "epoch": 8.273427471116817, + "grad_norm": 1.8644682168960571, + "learning_rate": 2.4801026957637998e-05, + "loss": 0.5636, + "step": 6445 + }, + { + "epoch": 8.274711168164313, + "grad_norm": 2.0303256511688232, + "learning_rate": 2.480487804878049e-05, + "loss": 0.5962, + "step": 6446 + }, + { + "epoch": 8.27599486521181, + "grad_norm": 0.9427006840705872, + "learning_rate": 2.480872913992298e-05, + "loss": 0.5583, + "step": 6447 + }, + { + "epoch": 8.277278562259307, + "grad_norm": 2.313627004623413, + "learning_rate": 2.481258023106547e-05, + "loss": 0.5897, + "step": 6448 + }, + { + "epoch": 8.278562259306804, + "grad_norm": 
1.646113395690918, + "learning_rate": 2.481643132220796e-05, + "loss": 0.5438, + "step": 6449 + }, + { + "epoch": 8.2798459563543, + "grad_norm": 5.182469367980957, + "learning_rate": 2.482028241335045e-05, + "loss": 0.5993, + "step": 6450 + }, + { + "epoch": 8.281129653401797, + "grad_norm": 1.8857274055480957, + "learning_rate": 2.482413350449294e-05, + "loss": 0.598, + "step": 6451 + }, + { + "epoch": 8.282413350449294, + "grad_norm": 2.072220802307129, + "learning_rate": 2.482798459563543e-05, + "loss": 0.5722, + "step": 6452 + }, + { + "epoch": 8.28369704749679, + "grad_norm": 2.1103711128234863, + "learning_rate": 2.4831835686777923e-05, + "loss": 0.6133, + "step": 6453 + }, + { + "epoch": 8.284980744544288, + "grad_norm": 2.6074960231781006, + "learning_rate": 2.483568677792041e-05, + "loss": 0.5888, + "step": 6454 + }, + { + "epoch": 8.286264441591785, + "grad_norm": 1.1243617534637451, + "learning_rate": 2.48395378690629e-05, + "loss": 0.5867, + "step": 6455 + }, + { + "epoch": 8.28754813863928, + "grad_norm": 1.866227388381958, + "learning_rate": 2.4843388960205393e-05, + "loss": 0.5921, + "step": 6456 + }, + { + "epoch": 8.288831835686779, + "grad_norm": 1.590499997138977, + "learning_rate": 2.484724005134788e-05, + "loss": 0.641, + "step": 6457 + }, + { + "epoch": 8.290115532734275, + "grad_norm": 1.9813694953918457, + "learning_rate": 2.4851091142490375e-05, + "loss": 0.5772, + "step": 6458 + }, + { + "epoch": 8.29139922978177, + "grad_norm": 2.1440768241882324, + "learning_rate": 2.4854942233632866e-05, + "loss": 0.6173, + "step": 6459 + }, + { + "epoch": 8.292682926829269, + "grad_norm": 5.247136116027832, + "learning_rate": 2.4858793324775353e-05, + "loss": 0.627, + "step": 6460 + }, + { + "epoch": 8.293966623876765, + "grad_norm": 11.360709190368652, + "learning_rate": 2.4862644415917845e-05, + "loss": 0.5961, + "step": 6461 + }, + { + "epoch": 8.295250320924263, + "grad_norm": 1.6338934898376465, + "learning_rate": 2.4866495507060332e-05, + 
"loss": 0.6259, + "step": 6462 + }, + { + "epoch": 8.296534017971759, + "grad_norm": 5.733043193817139, + "learning_rate": 2.4870346598202823e-05, + "loss": 0.6116, + "step": 6463 + }, + { + "epoch": 8.297817715019255, + "grad_norm": 10.1329927444458, + "learning_rate": 2.4874197689345314e-05, + "loss": 0.5898, + "step": 6464 + }, + { + "epoch": 8.299101412066753, + "grad_norm": 2.910353899002075, + "learning_rate": 2.4878048780487805e-05, + "loss": 0.6043, + "step": 6465 + }, + { + "epoch": 8.300385109114249, + "grad_norm": 2.1800029277801514, + "learning_rate": 2.4881899871630296e-05, + "loss": 0.6392, + "step": 6466 + }, + { + "epoch": 8.301668806161747, + "grad_norm": 2.350156784057617, + "learning_rate": 2.4885750962772788e-05, + "loss": 0.5463, + "step": 6467 + }, + { + "epoch": 8.302952503209243, + "grad_norm": 2.251537561416626, + "learning_rate": 2.4889602053915275e-05, + "loss": 0.6274, + "step": 6468 + }, + { + "epoch": 8.304236200256739, + "grad_norm": 1.4993300437927246, + "learning_rate": 2.4893453145057766e-05, + "loss": 0.627, + "step": 6469 + }, + { + "epoch": 8.305519897304237, + "grad_norm": 3.2248525619506836, + "learning_rate": 2.4897304236200257e-05, + "loss": 0.5883, + "step": 6470 + }, + { + "epoch": 8.306803594351733, + "grad_norm": 2.8525390625, + "learning_rate": 2.490115532734275e-05, + "loss": 0.649, + "step": 6471 + }, + { + "epoch": 8.30808729139923, + "grad_norm": 2.530670404434204, + "learning_rate": 2.490500641848524e-05, + "loss": 0.5845, + "step": 6472 + }, + { + "epoch": 8.309370988446727, + "grad_norm": 2.5737788677215576, + "learning_rate": 2.4908857509627727e-05, + "loss": 0.5901, + "step": 6473 + }, + { + "epoch": 8.310654685494223, + "grad_norm": 2.4439165592193604, + "learning_rate": 2.4912708600770218e-05, + "loss": 0.6639, + "step": 6474 + }, + { + "epoch": 8.31193838254172, + "grad_norm": 2.632662534713745, + "learning_rate": 2.491655969191271e-05, + "loss": 0.6884, + "step": 6475 + }, + { + "epoch": 8.313222079589217, 
+ "grad_norm": 2.7220723628997803, + "learning_rate": 2.4920410783055197e-05, + "loss": 0.6355, + "step": 6476 + }, + { + "epoch": 8.314505776636715, + "grad_norm": 4.234983444213867, + "learning_rate": 2.492426187419769e-05, + "loss": 0.6819, + "step": 6477 + }, + { + "epoch": 8.31578947368421, + "grad_norm": 12.894704818725586, + "learning_rate": 2.4928112965340183e-05, + "loss": 0.7121, + "step": 6478 + }, + { + "epoch": 8.317073170731707, + "grad_norm": 4.169455528259277, + "learning_rate": 2.493196405648267e-05, + "loss": 0.7145, + "step": 6479 + }, + { + "epoch": 8.318356867779205, + "grad_norm": 3.8420844078063965, + "learning_rate": 2.493581514762516e-05, + "loss": 0.8293, + "step": 6480 + }, + { + "epoch": 8.3196405648267, + "grad_norm": 3.1471893787384033, + "learning_rate": 2.493966623876765e-05, + "loss": 0.7801, + "step": 6481 + }, + { + "epoch": 8.320924261874199, + "grad_norm": 2.5935471057891846, + "learning_rate": 2.494351732991014e-05, + "loss": 0.9172, + "step": 6482 + }, + { + "epoch": 8.322207958921695, + "grad_norm": 3.092909336090088, + "learning_rate": 2.4947368421052635e-05, + "loss": 0.5773, + "step": 6483 + }, + { + "epoch": 8.32349165596919, + "grad_norm": 2.8756699562072754, + "learning_rate": 2.4951219512195122e-05, + "loss": 0.5508, + "step": 6484 + }, + { + "epoch": 8.324775353016689, + "grad_norm": 3.3931338787078857, + "learning_rate": 2.4955070603337613e-05, + "loss": 0.5874, + "step": 6485 + }, + { + "epoch": 8.326059050064185, + "grad_norm": 1.674453854560852, + "learning_rate": 2.4958921694480104e-05, + "loss": 0.5674, + "step": 6486 + }, + { + "epoch": 8.327342747111683, + "grad_norm": 2.099856376647949, + "learning_rate": 2.4962772785622592e-05, + "loss": 0.6472, + "step": 6487 + }, + { + "epoch": 8.328626444159179, + "grad_norm": 1.733654260635376, + "learning_rate": 2.4966623876765083e-05, + "loss": 0.5787, + "step": 6488 + }, + { + "epoch": 8.329910141206675, + "grad_norm": 1.475523591041565, + "learning_rate": 
2.4970474967907574e-05, + "loss": 0.596, + "step": 6489 + }, + { + "epoch": 8.331193838254173, + "grad_norm": 3.903568744659424, + "learning_rate": 2.4974326059050065e-05, + "loss": 0.5986, + "step": 6490 + }, + { + "epoch": 8.332477535301669, + "grad_norm": 2.0339109897613525, + "learning_rate": 2.4978177150192556e-05, + "loss": 0.5882, + "step": 6491 + }, + { + "epoch": 8.333761232349165, + "grad_norm": 3.6589250564575195, + "learning_rate": 2.4982028241335044e-05, + "loss": 0.5739, + "step": 6492 + }, + { + "epoch": 8.335044929396663, + "grad_norm": 1.3752226829528809, + "learning_rate": 2.4985879332477535e-05, + "loss": 0.5612, + "step": 6493 + }, + { + "epoch": 8.336328626444159, + "grad_norm": 2.2102432250976562, + "learning_rate": 2.4989730423620026e-05, + "loss": 0.5944, + "step": 6494 + }, + { + "epoch": 8.337612323491657, + "grad_norm": 2.2215256690979004, + "learning_rate": 2.4993581514762517e-05, + "loss": 0.5559, + "step": 6495 + }, + { + "epoch": 8.338896020539153, + "grad_norm": 2.3471786975860596, + "learning_rate": 2.4997432605905008e-05, + "loss": 0.6487, + "step": 6496 + }, + { + "epoch": 8.340179717586649, + "grad_norm": 1.465282678604126, + "learning_rate": 2.50012836970475e-05, + "loss": 0.6194, + "step": 6497 + }, + { + "epoch": 8.341463414634147, + "grad_norm": 2.631593704223633, + "learning_rate": 2.5005134788189987e-05, + "loss": 0.6094, + "step": 6498 + }, + { + "epoch": 8.342747111681643, + "grad_norm": 5.931853294372559, + "learning_rate": 2.5008985879332478e-05, + "loss": 0.6409, + "step": 6499 + }, + { + "epoch": 8.34403080872914, + "grad_norm": 3.3241965770721436, + "learning_rate": 2.5012836970474966e-05, + "loss": 0.5943, + "step": 6500 + }, + { + "epoch": 8.345314505776637, + "grad_norm": 2.0240941047668457, + "learning_rate": 2.501668806161746e-05, + "loss": 0.6071, + "step": 6501 + }, + { + "epoch": 8.346598202824133, + "grad_norm": 3.087322235107422, + "learning_rate": 2.502053915275995e-05, + "loss": 0.5726, + "step": 6502 + 
}, + { + "epoch": 8.34788189987163, + "grad_norm": 2.816192388534546, + "learning_rate": 2.502439024390244e-05, + "loss": 0.5892, + "step": 6503 + }, + { + "epoch": 8.349165596919127, + "grad_norm": 1.6737688779830933, + "learning_rate": 2.502824133504493e-05, + "loss": 0.5888, + "step": 6504 + }, + { + "epoch": 8.350449293966625, + "grad_norm": 3.678213596343994, + "learning_rate": 2.503209242618742e-05, + "loss": 0.6119, + "step": 6505 + }, + { + "epoch": 8.35173299101412, + "grad_norm": 3.546424150466919, + "learning_rate": 2.503594351732991e-05, + "loss": 0.5508, + "step": 6506 + }, + { + "epoch": 8.353016688061617, + "grad_norm": 4.070678234100342, + "learning_rate": 2.5039794608472403e-05, + "loss": 0.5729, + "step": 6507 + }, + { + "epoch": 8.354300385109115, + "grad_norm": 3.500180244445801, + "learning_rate": 2.504364569961489e-05, + "loss": 0.5784, + "step": 6508 + }, + { + "epoch": 8.35558408215661, + "grad_norm": 1.8288644552230835, + "learning_rate": 2.5047496790757382e-05, + "loss": 0.5582, + "step": 6509 + }, + { + "epoch": 8.356867779204109, + "grad_norm": 1.3598508834838867, + "learning_rate": 2.5051347881899873e-05, + "loss": 0.5897, + "step": 6510 + }, + { + "epoch": 8.358151476251605, + "grad_norm": 2.1121509075164795, + "learning_rate": 2.505519897304236e-05, + "loss": 0.6031, + "step": 6511 + }, + { + "epoch": 8.3594351732991, + "grad_norm": 2.37070369720459, + "learning_rate": 2.5059050064184852e-05, + "loss": 0.5754, + "step": 6512 + }, + { + "epoch": 8.360718870346599, + "grad_norm": 1.8207290172576904, + "learning_rate": 2.5062901155327343e-05, + "loss": 0.6094, + "step": 6513 + }, + { + "epoch": 8.362002567394095, + "grad_norm": 1.893588662147522, + "learning_rate": 2.5066752246469834e-05, + "loss": 0.6527, + "step": 6514 + }, + { + "epoch": 8.363286264441593, + "grad_norm": 2.788321018218994, + "learning_rate": 2.5070603337612325e-05, + "loss": 0.7174, + "step": 6515 + }, + { + "epoch": 8.364569961489089, + "grad_norm": 
12.590486526489258, + "learning_rate": 2.5074454428754816e-05, + "loss": 0.6519, + "step": 6516 + }, + { + "epoch": 8.365853658536585, + "grad_norm": 5.961674690246582, + "learning_rate": 2.5078305519897304e-05, + "loss": 0.6638, + "step": 6517 + }, + { + "epoch": 8.367137355584083, + "grad_norm": 1.8367794752120972, + "learning_rate": 2.5082156611039795e-05, + "loss": 0.6767, + "step": 6518 + }, + { + "epoch": 8.368421052631579, + "grad_norm": 9.616555213928223, + "learning_rate": 2.5086007702182283e-05, + "loss": 0.6695, + "step": 6519 + }, + { + "epoch": 8.369704749679077, + "grad_norm": 2.8948402404785156, + "learning_rate": 2.5089858793324777e-05, + "loss": 0.6637, + "step": 6520 + }, + { + "epoch": 8.370988446726573, + "grad_norm": 2.2066051959991455, + "learning_rate": 2.5093709884467268e-05, + "loss": 0.6688, + "step": 6521 + }, + { + "epoch": 8.372272143774069, + "grad_norm": 2.18662691116333, + "learning_rate": 2.5097560975609756e-05, + "loss": 0.6625, + "step": 6522 + }, + { + "epoch": 8.373555840821567, + "grad_norm": 2.1555538177490234, + "learning_rate": 2.5101412066752247e-05, + "loss": 0.6884, + "step": 6523 + }, + { + "epoch": 8.374839537869063, + "grad_norm": 7.104978561401367, + "learning_rate": 2.5105263157894738e-05, + "loss": 0.7044, + "step": 6524 + }, + { + "epoch": 8.376123234916559, + "grad_norm": 5.2768874168396, + "learning_rate": 2.5109114249037226e-05, + "loss": 0.6618, + "step": 6525 + }, + { + "epoch": 8.377406931964057, + "grad_norm": 1.3803468942642212, + "learning_rate": 2.511296534017972e-05, + "loss": 0.6509, + "step": 6526 + }, + { + "epoch": 8.378690629011553, + "grad_norm": 2.5110738277435303, + "learning_rate": 2.5116816431322208e-05, + "loss": 0.6492, + "step": 6527 + }, + { + "epoch": 8.37997432605905, + "grad_norm": 3.152804374694824, + "learning_rate": 2.51206675224647e-05, + "loss": 0.6718, + "step": 6528 + }, + { + "epoch": 8.381258023106547, + "grad_norm": 2.391817569732666, + "learning_rate": 2.512451861360719e-05, + 
"loss": 0.6863, + "step": 6529 + }, + { + "epoch": 8.382541720154043, + "grad_norm": 3.6190004348754883, + "learning_rate": 2.5128369704749678e-05, + "loss": 0.7051, + "step": 6530 + }, + { + "epoch": 8.38382541720154, + "grad_norm": 2.700255870819092, + "learning_rate": 2.513222079589217e-05, + "loss": 0.6943, + "step": 6531 + }, + { + "epoch": 8.385109114249037, + "grad_norm": 2.507521629333496, + "learning_rate": 2.5136071887034663e-05, + "loss": 0.8926, + "step": 6532 + }, + { + "epoch": 8.386392811296535, + "grad_norm": 1.6976094245910645, + "learning_rate": 2.513992297817715e-05, + "loss": 0.5797, + "step": 6533 + }, + { + "epoch": 8.38767650834403, + "grad_norm": 1.7516065835952759, + "learning_rate": 2.5143774069319642e-05, + "loss": 0.5994, + "step": 6534 + }, + { + "epoch": 8.388960205391527, + "grad_norm": 1.4562314748764038, + "learning_rate": 2.5147625160462133e-05, + "loss": 0.6073, + "step": 6535 + }, + { + "epoch": 8.390243902439025, + "grad_norm": 1.4386711120605469, + "learning_rate": 2.515147625160462e-05, + "loss": 0.6048, + "step": 6536 + }, + { + "epoch": 8.39152759948652, + "grad_norm": 2.5956876277923584, + "learning_rate": 2.515532734274711e-05, + "loss": 0.566, + "step": 6537 + }, + { + "epoch": 8.392811296534019, + "grad_norm": 2.0000650882720947, + "learning_rate": 2.5159178433889603e-05, + "loss": 0.6037, + "step": 6538 + }, + { + "epoch": 8.394094993581515, + "grad_norm": 4.356109142303467, + "learning_rate": 2.5163029525032094e-05, + "loss": 0.6003, + "step": 6539 + }, + { + "epoch": 8.39537869062901, + "grad_norm": 1.2735179662704468, + "learning_rate": 2.5166880616174585e-05, + "loss": 0.6188, + "step": 6540 + }, + { + "epoch": 8.396662387676509, + "grad_norm": 1.3230434656143188, + "learning_rate": 2.5170731707317073e-05, + "loss": 0.6028, + "step": 6541 + }, + { + "epoch": 8.397946084724005, + "grad_norm": 1.032386064529419, + "learning_rate": 2.5174582798459564e-05, + "loss": 0.5627, + "step": 6542 + }, + { + "epoch": 
8.399229781771503, + "grad_norm": 2.1743855476379395, + "learning_rate": 2.5178433889602055e-05, + "loss": 0.5836, + "step": 6543 + }, + { + "epoch": 8.400513478818999, + "grad_norm": 4.330018997192383, + "learning_rate": 2.5182284980744546e-05, + "loss": 0.6068, + "step": 6544 + }, + { + "epoch": 8.401797175866495, + "grad_norm": 1.3411017656326294, + "learning_rate": 2.5186136071887037e-05, + "loss": 0.5509, + "step": 6545 + }, + { + "epoch": 8.403080872913993, + "grad_norm": 8.621671676635742, + "learning_rate": 2.5189987163029525e-05, + "loss": 0.6008, + "step": 6546 + }, + { + "epoch": 8.404364569961489, + "grad_norm": 4.1366987228393555, + "learning_rate": 2.5193838254172016e-05, + "loss": 0.5521, + "step": 6547 + }, + { + "epoch": 8.405648267008987, + "grad_norm": 1.1715314388275146, + "learning_rate": 2.5197689345314507e-05, + "loss": 0.6189, + "step": 6548 + }, + { + "epoch": 8.406931964056483, + "grad_norm": 2.143826484680176, + "learning_rate": 2.5201540436456994e-05, + "loss": 0.5527, + "step": 6549 + }, + { + "epoch": 8.408215661103979, + "grad_norm": 1.257581114768982, + "learning_rate": 2.520539152759949e-05, + "loss": 0.6371, + "step": 6550 + }, + { + "epoch": 8.409499358151477, + "grad_norm": 1.622389554977417, + "learning_rate": 2.520924261874198e-05, + "loss": 0.5773, + "step": 6551 + }, + { + "epoch": 8.410783055198973, + "grad_norm": 3.6256484985351562, + "learning_rate": 2.5213093709884468e-05, + "loss": 0.5725, + "step": 6552 + }, + { + "epoch": 8.41206675224647, + "grad_norm": 1.778104543685913, + "learning_rate": 2.521694480102696e-05, + "loss": 0.6181, + "step": 6553 + }, + { + "epoch": 8.413350449293967, + "grad_norm": 7.882443428039551, + "learning_rate": 2.5220795892169446e-05, + "loss": 0.6279, + "step": 6554 + }, + { + "epoch": 8.414634146341463, + "grad_norm": 1.3241056203842163, + "learning_rate": 2.5224646983311937e-05, + "loss": 0.5994, + "step": 6555 + }, + { + "epoch": 8.41591784338896, + "grad_norm": 2.2241175174713135, + 
"learning_rate": 2.5228498074454432e-05, + "loss": 0.6001, + "step": 6556 + }, + { + "epoch": 8.417201540436457, + "grad_norm": 5.013143062591553, + "learning_rate": 2.523234916559692e-05, + "loss": 0.5749, + "step": 6557 + }, + { + "epoch": 8.418485237483953, + "grad_norm": 1.8110437393188477, + "learning_rate": 2.523620025673941e-05, + "loss": 0.5927, + "step": 6558 + }, + { + "epoch": 8.41976893453145, + "grad_norm": 3.003143548965454, + "learning_rate": 2.52400513478819e-05, + "loss": 0.5613, + "step": 6559 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 3.176252603530884, + "learning_rate": 2.524390243902439e-05, + "loss": 0.6041, + "step": 6560 + }, + { + "epoch": 8.422336328626445, + "grad_norm": 3.510908603668213, + "learning_rate": 2.524775353016688e-05, + "loss": 0.6039, + "step": 6561 + }, + { + "epoch": 8.42362002567394, + "grad_norm": 10.727826118469238, + "learning_rate": 2.5251604621309375e-05, + "loss": 0.6133, + "step": 6562 + }, + { + "epoch": 8.424903722721437, + "grad_norm": 1.545540690422058, + "learning_rate": 2.5255455712451863e-05, + "loss": 0.5845, + "step": 6563 + }, + { + "epoch": 8.426187419768935, + "grad_norm": 4.3808064460754395, + "learning_rate": 2.5259306803594354e-05, + "loss": 0.5937, + "step": 6564 + }, + { + "epoch": 8.427471116816431, + "grad_norm": 2.088463544845581, + "learning_rate": 2.526315789473684e-05, + "loss": 0.6552, + "step": 6565 + }, + { + "epoch": 8.428754813863929, + "grad_norm": 2.7013590335845947, + "learning_rate": 2.5267008985879332e-05, + "loss": 0.7193, + "step": 6566 + }, + { + "epoch": 8.430038510911425, + "grad_norm": 6.50105619430542, + "learning_rate": 2.5270860077021823e-05, + "loss": 0.6321, + "step": 6567 + }, + { + "epoch": 8.431322207958921, + "grad_norm": 1.9478480815887451, + "learning_rate": 2.527471116816431e-05, + "loss": 0.6795, + "step": 6568 + }, + { + "epoch": 8.432605905006419, + "grad_norm": 1.7539221048355103, + "learning_rate": 2.5278562259306806e-05, + "loss": 0.604, + "step": 
6569 + }, + { + "epoch": 8.433889602053915, + "grad_norm": 3.605565071105957, + "learning_rate": 2.5282413350449297e-05, + "loss": 0.5867, + "step": 6570 + }, + { + "epoch": 8.435173299101413, + "grad_norm": 4.617795944213867, + "learning_rate": 2.5286264441591784e-05, + "loss": 0.6497, + "step": 6571 + }, + { + "epoch": 8.436456996148909, + "grad_norm": 1.6638391017913818, + "learning_rate": 2.5290115532734275e-05, + "loss": 0.647, + "step": 6572 + }, + { + "epoch": 8.437740693196405, + "grad_norm": 2.3404934406280518, + "learning_rate": 2.5293966623876763e-05, + "loss": 0.6997, + "step": 6573 + }, + { + "epoch": 8.439024390243903, + "grad_norm": 7.5806498527526855, + "learning_rate": 2.5297817715019254e-05, + "loss": 0.7204, + "step": 6574 + }, + { + "epoch": 8.440308087291399, + "grad_norm": 1.7599709033966064, + "learning_rate": 2.530166880616175e-05, + "loss": 0.7325, + "step": 6575 + }, + { + "epoch": 8.441591784338897, + "grad_norm": 2.419926404953003, + "learning_rate": 2.5305519897304236e-05, + "loss": 0.6774, + "step": 6576 + }, + { + "epoch": 8.442875481386393, + "grad_norm": 3.338829278945923, + "learning_rate": 2.5309370988446727e-05, + "loss": 0.6834, + "step": 6577 + }, + { + "epoch": 8.444159178433889, + "grad_norm": 2.8215348720550537, + "learning_rate": 2.531322207958922e-05, + "loss": 0.6723, + "step": 6578 + }, + { + "epoch": 8.445442875481387, + "grad_norm": 7.445374488830566, + "learning_rate": 2.5317073170731706e-05, + "loss": 0.6947, + "step": 6579 + }, + { + "epoch": 8.446726572528883, + "grad_norm": 3.737945795059204, + "learning_rate": 2.5320924261874197e-05, + "loss": 0.7183, + "step": 6580 + }, + { + "epoch": 8.44801026957638, + "grad_norm": 1.7374451160430908, + "learning_rate": 2.532477535301669e-05, + "loss": 0.8335, + "step": 6581 + }, + { + "epoch": 8.449293966623877, + "grad_norm": 2.920222282409668, + "learning_rate": 2.532862644415918e-05, + "loss": 0.8932, + "step": 6582 + }, + { + "epoch": 8.450577663671373, + "grad_norm": 
3.0449118614196777, + "learning_rate": 2.533247753530167e-05, + "loss": 0.5749, + "step": 6583 + }, + { + "epoch": 8.45186136071887, + "grad_norm": 1.3688493967056274, + "learning_rate": 2.5336328626444158e-05, + "loss": 0.5549, + "step": 6584 + }, + { + "epoch": 8.453145057766367, + "grad_norm": 1.8609023094177246, + "learning_rate": 2.534017971758665e-05, + "loss": 0.574, + "step": 6585 + }, + { + "epoch": 8.454428754813865, + "grad_norm": 3.3919448852539062, + "learning_rate": 2.534403080872914e-05, + "loss": 0.5867, + "step": 6586 + }, + { + "epoch": 8.455712451861361, + "grad_norm": 1.6347380876541138, + "learning_rate": 2.534788189987163e-05, + "loss": 0.6201, + "step": 6587 + }, + { + "epoch": 8.456996148908857, + "grad_norm": 1.6414618492126465, + "learning_rate": 2.5351732991014122e-05, + "loss": 0.5598, + "step": 6588 + }, + { + "epoch": 8.458279845956355, + "grad_norm": 3.6802754402160645, + "learning_rate": 2.5355584082156613e-05, + "loss": 0.5891, + "step": 6589 + }, + { + "epoch": 8.459563543003851, + "grad_norm": 1.6051244735717773, + "learning_rate": 2.53594351732991e-05, + "loss": 0.6326, + "step": 6590 + }, + { + "epoch": 8.460847240051347, + "grad_norm": 1.9856606721878052, + "learning_rate": 2.5363286264441592e-05, + "loss": 0.5445, + "step": 6591 + }, + { + "epoch": 8.462130937098845, + "grad_norm": 2.3725199699401855, + "learning_rate": 2.536713735558408e-05, + "loss": 0.6127, + "step": 6592 + }, + { + "epoch": 8.463414634146341, + "grad_norm": 1.528746247291565, + "learning_rate": 2.5370988446726574e-05, + "loss": 0.6513, + "step": 6593 + }, + { + "epoch": 8.464698331193839, + "grad_norm": 4.509158611297607, + "learning_rate": 2.5374839537869065e-05, + "loss": 0.5835, + "step": 6594 + }, + { + "epoch": 8.465982028241335, + "grad_norm": 2.276024341583252, + "learning_rate": 2.5378690629011553e-05, + "loss": 0.5667, + "step": 6595 + }, + { + "epoch": 8.467265725288831, + "grad_norm": 2.5611047744750977, + "learning_rate": 
2.5382541720154044e-05, + "loss": 0.6247, + "step": 6596 + }, + { + "epoch": 8.468549422336329, + "grad_norm": 2.120680332183838, + "learning_rate": 2.5386392811296535e-05, + "loss": 0.6437, + "step": 6597 + }, + { + "epoch": 8.469833119383825, + "grad_norm": 4.90083646774292, + "learning_rate": 2.5390243902439023e-05, + "loss": 0.5913, + "step": 6598 + }, + { + "epoch": 8.471116816431323, + "grad_norm": 4.033287525177002, + "learning_rate": 2.5394094993581517e-05, + "loss": 0.5742, + "step": 6599 + }, + { + "epoch": 8.472400513478819, + "grad_norm": 2.284003734588623, + "learning_rate": 2.5397946084724005e-05, + "loss": 0.6008, + "step": 6600 + }, + { + "epoch": 8.473684210526315, + "grad_norm": 2.1355762481689453, + "learning_rate": 2.5401797175866496e-05, + "loss": 0.5829, + "step": 6601 + }, + { + "epoch": 8.474967907573813, + "grad_norm": 2.085430145263672, + "learning_rate": 2.5405648267008987e-05, + "loss": 0.6403, + "step": 6602 + }, + { + "epoch": 8.476251604621309, + "grad_norm": 1.5159921646118164, + "learning_rate": 2.5409499358151475e-05, + "loss": 0.5921, + "step": 6603 + }, + { + "epoch": 8.477535301668807, + "grad_norm": 5.347773551940918, + "learning_rate": 2.5413350449293966e-05, + "loss": 0.5866, + "step": 6604 + }, + { + "epoch": 8.478818998716303, + "grad_norm": 1.4222227334976196, + "learning_rate": 2.541720154043646e-05, + "loss": 0.5851, + "step": 6605 + }, + { + "epoch": 8.480102695763799, + "grad_norm": 8.991772651672363, + "learning_rate": 2.5421052631578948e-05, + "loss": 0.5889, + "step": 6606 + }, + { + "epoch": 8.481386392811297, + "grad_norm": 2.0297162532806396, + "learning_rate": 2.542490372272144e-05, + "loss": 0.6145, + "step": 6607 + }, + { + "epoch": 8.482670089858793, + "grad_norm": 9.635451316833496, + "learning_rate": 2.542875481386393e-05, + "loss": 0.6083, + "step": 6608 + }, + { + "epoch": 8.48395378690629, + "grad_norm": 1.9871059656143188, + "learning_rate": 2.5432605905006418e-05, + "loss": 0.5964, + "step": 6609 + }, 
+ { + "epoch": 8.485237483953787, + "grad_norm": 6.8375115394592285, + "learning_rate": 2.543645699614891e-05, + "loss": 0.59, + "step": 6610 + }, + { + "epoch": 8.486521181001283, + "grad_norm": 1.2068132162094116, + "learning_rate": 2.54403080872914e-05, + "loss": 0.6178, + "step": 6611 + }, + { + "epoch": 8.487804878048781, + "grad_norm": 1.7397128343582153, + "learning_rate": 2.544415917843389e-05, + "loss": 0.6589, + "step": 6612 + }, + { + "epoch": 8.489088575096277, + "grad_norm": 5.615724563598633, + "learning_rate": 2.5448010269576382e-05, + "loss": 0.6447, + "step": 6613 + }, + { + "epoch": 8.490372272143775, + "grad_norm": 5.163482189178467, + "learning_rate": 2.545186136071887e-05, + "loss": 0.5879, + "step": 6614 + }, + { + "epoch": 8.491655969191271, + "grad_norm": 2.4837586879730225, + "learning_rate": 2.545571245186136e-05, + "loss": 0.6232, + "step": 6615 + }, + { + "epoch": 8.492939666238767, + "grad_norm": 2.5424039363861084, + "learning_rate": 2.5459563543003852e-05, + "loss": 0.6354, + "step": 6616 + }, + { + "epoch": 8.494223363286265, + "grad_norm": 1.7903090715408325, + "learning_rate": 2.5463414634146343e-05, + "loss": 0.6301, + "step": 6617 + }, + { + "epoch": 8.495507060333761, + "grad_norm": 2.1601524353027344, + "learning_rate": 2.5467265725288834e-05, + "loss": 0.6532, + "step": 6618 + }, + { + "epoch": 8.496790757381259, + "grad_norm": 2.9961676597595215, + "learning_rate": 2.5471116816431322e-05, + "loss": 0.625, + "step": 6619 + }, + { + "epoch": 8.498074454428755, + "grad_norm": 2.019098997116089, + "learning_rate": 2.5474967907573813e-05, + "loss": 0.6178, + "step": 6620 + }, + { + "epoch": 8.499358151476251, + "grad_norm": 1.9747142791748047, + "learning_rate": 2.5478818998716304e-05, + "loss": 0.6106, + "step": 6621 + }, + { + "epoch": 8.500641848523749, + "grad_norm": 1.5214636325836182, + "learning_rate": 2.548267008985879e-05, + "loss": 0.6464, + "step": 6622 + }, + { + "epoch": 8.501925545571245, + "grad_norm": 
1.513106107711792, + "learning_rate": 2.5486521181001283e-05, + "loss": 0.6037, + "step": 6623 + }, + { + "epoch": 8.503209242618741, + "grad_norm": 2.240396738052368, + "learning_rate": 2.5490372272143777e-05, + "loss": 0.6268, + "step": 6624 + }, + { + "epoch": 8.504492939666239, + "grad_norm": 4.467405319213867, + "learning_rate": 2.5494223363286265e-05, + "loss": 0.6891, + "step": 6625 + }, + { + "epoch": 8.505776636713735, + "grad_norm": 4.817278861999512, + "learning_rate": 2.5498074454428756e-05, + "loss": 0.7274, + "step": 6626 + }, + { + "epoch": 8.507060333761233, + "grad_norm": 2.2317638397216797, + "learning_rate": 2.5501925545571247e-05, + "loss": 0.7113, + "step": 6627 + }, + { + "epoch": 8.508344030808729, + "grad_norm": 1.815532922744751, + "learning_rate": 2.5505776636713735e-05, + "loss": 0.7216, + "step": 6628 + }, + { + "epoch": 8.509627727856225, + "grad_norm": 8.060799598693848, + "learning_rate": 2.5509627727856226e-05, + "loss": 0.734, + "step": 6629 + }, + { + "epoch": 8.510911424903723, + "grad_norm": 2.135580062866211, + "learning_rate": 2.5513478818998717e-05, + "loss": 0.7381, + "step": 6630 + }, + { + "epoch": 8.512195121951219, + "grad_norm": 3.858602523803711, + "learning_rate": 2.5517329910141208e-05, + "loss": 0.7312, + "step": 6631 + }, + { + "epoch": 8.513478818998717, + "grad_norm": 6.755694389343262, + "learning_rate": 2.55211810012837e-05, + "loss": 0.8704, + "step": 6632 + }, + { + "epoch": 8.514762516046213, + "grad_norm": 1.7332311868667603, + "learning_rate": 2.5525032092426187e-05, + "loss": 0.5747, + "step": 6633 + }, + { + "epoch": 8.51604621309371, + "grad_norm": 1.481484293937683, + "learning_rate": 2.5528883183568678e-05, + "loss": 0.5944, + "step": 6634 + }, + { + "epoch": 8.517329910141207, + "grad_norm": 4.398766994476318, + "learning_rate": 2.553273427471117e-05, + "loss": 0.6018, + "step": 6635 + }, + { + "epoch": 8.518613607188703, + "grad_norm": 3.529968500137329, + "learning_rate": 2.553658536585366e-05, + 
"loss": 0.6483, + "step": 6636 + }, + { + "epoch": 8.519897304236201, + "grad_norm": 3.4176290035247803, + "learning_rate": 2.554043645699615e-05, + "loss": 0.5926, + "step": 6637 + }, + { + "epoch": 8.521181001283697, + "grad_norm": 1.2393646240234375, + "learning_rate": 2.554428754813864e-05, + "loss": 0.5922, + "step": 6638 + }, + { + "epoch": 8.522464698331193, + "grad_norm": 3.3301162719726562, + "learning_rate": 2.554813863928113e-05, + "loss": 0.6098, + "step": 6639 + }, + { + "epoch": 8.523748395378691, + "grad_norm": 1.3032935857772827, + "learning_rate": 2.555198973042362e-05, + "loss": 0.5895, + "step": 6640 + }, + { + "epoch": 8.525032092426187, + "grad_norm": 6.403041839599609, + "learning_rate": 2.555584082156611e-05, + "loss": 0.5618, + "step": 6641 + }, + { + "epoch": 8.526315789473685, + "grad_norm": 5.498316287994385, + "learning_rate": 2.5559691912708603e-05, + "loss": 0.5753, + "step": 6642 + }, + { + "epoch": 8.527599486521181, + "grad_norm": 1.9829597473144531, + "learning_rate": 2.5563543003851094e-05, + "loss": 0.5985, + "step": 6643 + }, + { + "epoch": 8.528883183568677, + "grad_norm": 2.204118013381958, + "learning_rate": 2.556739409499358e-05, + "loss": 0.5849, + "step": 6644 + }, + { + "epoch": 8.530166880616175, + "grad_norm": 0.8824363350868225, + "learning_rate": 2.5571245186136073e-05, + "loss": 0.6101, + "step": 6645 + }, + { + "epoch": 8.531450577663671, + "grad_norm": 1.3468562364578247, + "learning_rate": 2.557509627727856e-05, + "loss": 0.6245, + "step": 6646 + }, + { + "epoch": 8.532734274711169, + "grad_norm": 1.5315663814544678, + "learning_rate": 2.557894736842105e-05, + "loss": 0.6187, + "step": 6647 + }, + { + "epoch": 8.534017971758665, + "grad_norm": 3.3435497283935547, + "learning_rate": 2.5582798459563546e-05, + "loss": 0.5734, + "step": 6648 + }, + { + "epoch": 8.535301668806161, + "grad_norm": 1.9270762205123901, + "learning_rate": 2.5586649550706034e-05, + "loss": 0.5459, + "step": 6649 + }, + { + "epoch": 
8.536585365853659, + "grad_norm": 1.578188180923462, + "learning_rate": 2.5590500641848525e-05, + "loss": 0.6179, + "step": 6650 + }, + { + "epoch": 8.537869062901155, + "grad_norm": 1.6645338535308838, + "learning_rate": 2.5594351732991016e-05, + "loss": 0.5751, + "step": 6651 + }, + { + "epoch": 8.539152759948653, + "grad_norm": 1.4354602098464966, + "learning_rate": 2.5598202824133503e-05, + "loss": 0.5778, + "step": 6652 + }, + { + "epoch": 8.540436456996149, + "grad_norm": 5.618487358093262, + "learning_rate": 2.5602053915275995e-05, + "loss": 0.6293, + "step": 6653 + }, + { + "epoch": 8.541720154043645, + "grad_norm": 2.216883897781372, + "learning_rate": 2.560590500641849e-05, + "loss": 0.5801, + "step": 6654 + }, + { + "epoch": 8.543003851091143, + "grad_norm": 1.9382679462432861, + "learning_rate": 2.5609756097560977e-05, + "loss": 0.573, + "step": 6655 + }, + { + "epoch": 8.544287548138639, + "grad_norm": 1.2583352327346802, + "learning_rate": 2.5613607188703468e-05, + "loss": 0.6336, + "step": 6656 + }, + { + "epoch": 8.545571245186135, + "grad_norm": 3.6445720195770264, + "learning_rate": 2.5617458279845955e-05, + "loss": 0.6458, + "step": 6657 + }, + { + "epoch": 8.546854942233633, + "grad_norm": 2.2555291652679443, + "learning_rate": 2.5621309370988446e-05, + "loss": 0.643, + "step": 6658 + }, + { + "epoch": 8.54813863928113, + "grad_norm": 4.977335453033447, + "learning_rate": 2.5625160462130938e-05, + "loss": 0.6421, + "step": 6659 + }, + { + "epoch": 8.549422336328627, + "grad_norm": 4.280849933624268, + "learning_rate": 2.562901155327343e-05, + "loss": 0.591, + "step": 6660 + }, + { + "epoch": 8.550706033376123, + "grad_norm": 5.07271671295166, + "learning_rate": 2.563286264441592e-05, + "loss": 0.6186, + "step": 6661 + }, + { + "epoch": 8.55198973042362, + "grad_norm": 3.715458631515503, + "learning_rate": 2.563671373555841e-05, + "loss": 0.6213, + "step": 6662 + }, + { + "epoch": 8.553273427471117, + "grad_norm": 2.2207248210906982, + 
"learning_rate": 2.56405648267009e-05, + "loss": 0.5999, + "step": 6663 + }, + { + "epoch": 8.554557124518613, + "grad_norm": 3.5868711471557617, + "learning_rate": 2.564441591784339e-05, + "loss": 0.5744, + "step": 6664 + }, + { + "epoch": 8.555840821566111, + "grad_norm": 3.9206924438476562, + "learning_rate": 2.5648267008985877e-05, + "loss": 0.6515, + "step": 6665 + }, + { + "epoch": 8.557124518613607, + "grad_norm": 2.8446314334869385, + "learning_rate": 2.565211810012837e-05, + "loss": 0.587, + "step": 6666 + }, + { + "epoch": 8.558408215661103, + "grad_norm": 2.371626138687134, + "learning_rate": 2.5655969191270863e-05, + "loss": 0.6649, + "step": 6667 + }, + { + "epoch": 8.559691912708601, + "grad_norm": 2.0413339138031006, + "learning_rate": 2.565982028241335e-05, + "loss": 0.6837, + "step": 6668 + }, + { + "epoch": 8.560975609756097, + "grad_norm": 3.091048002243042, + "learning_rate": 2.566367137355584e-05, + "loss": 0.655, + "step": 6669 + }, + { + "epoch": 8.562259306803595, + "grad_norm": 4.400897979736328, + "learning_rate": 2.5667522464698333e-05, + "loss": 0.6162, + "step": 6670 + }, + { + "epoch": 8.563543003851091, + "grad_norm": 3.7308592796325684, + "learning_rate": 2.567137355584082e-05, + "loss": 0.6114, + "step": 6671 + }, + { + "epoch": 8.564826700898587, + "grad_norm": 3.916264295578003, + "learning_rate": 2.567522464698331e-05, + "loss": 0.6368, + "step": 6672 + }, + { + "epoch": 8.566110397946085, + "grad_norm": 4.6057257652282715, + "learning_rate": 2.5679075738125806e-05, + "loss": 0.6889, + "step": 6673 + }, + { + "epoch": 8.567394094993581, + "grad_norm": 2.119852066040039, + "learning_rate": 2.5682926829268293e-05, + "loss": 0.6777, + "step": 6674 + }, + { + "epoch": 8.568677792041079, + "grad_norm": 2.0373129844665527, + "learning_rate": 2.5686777920410785e-05, + "loss": 0.6601, + "step": 6675 + }, + { + "epoch": 8.569961489088575, + "grad_norm": 2.997331380844116, + "learning_rate": 2.5690629011553272e-05, + "loss": 0.6224, + 
"step": 6676 + }, + { + "epoch": 8.571245186136071, + "grad_norm": 5.4565253257751465, + "learning_rate": 2.5694480102695763e-05, + "loss": 0.6493, + "step": 6677 + }, + { + "epoch": 8.572528883183569, + "grad_norm": 3.133223295211792, + "learning_rate": 2.5698331193838254e-05, + "loss": 0.6655, + "step": 6678 + }, + { + "epoch": 8.573812580231065, + "grad_norm": 2.4002788066864014, + "learning_rate": 2.5702182284980745e-05, + "loss": 0.6463, + "step": 6679 + }, + { + "epoch": 8.575096277278563, + "grad_norm": 4.653738021850586, + "learning_rate": 2.5706033376123236e-05, + "loss": 0.7982, + "step": 6680 + }, + { + "epoch": 8.57637997432606, + "grad_norm": 2.014665126800537, + "learning_rate": 2.5709884467265728e-05, + "loss": 0.7967, + "step": 6681 + }, + { + "epoch": 8.577663671373555, + "grad_norm": 6.483300685882568, + "learning_rate": 2.5713735558408215e-05, + "loss": 0.8985, + "step": 6682 + }, + { + "epoch": 8.578947368421053, + "grad_norm": 2.067574977874756, + "learning_rate": 2.5717586649550706e-05, + "loss": 0.5733, + "step": 6683 + }, + { + "epoch": 8.58023106546855, + "grad_norm": 1.8425759077072144, + "learning_rate": 2.5721437740693194e-05, + "loss": 0.5593, + "step": 6684 + }, + { + "epoch": 8.581514762516047, + "grad_norm": 3.700808048248291, + "learning_rate": 2.572528883183569e-05, + "loss": 0.5998, + "step": 6685 + }, + { + "epoch": 8.582798459563543, + "grad_norm": 2.6813254356384277, + "learning_rate": 2.572913992297818e-05, + "loss": 0.6093, + "step": 6686 + }, + { + "epoch": 8.58408215661104, + "grad_norm": 4.565214157104492, + "learning_rate": 2.5732991014120667e-05, + "loss": 0.5932, + "step": 6687 + }, + { + "epoch": 8.585365853658537, + "grad_norm": 1.6118165254592896, + "learning_rate": 2.5736842105263158e-05, + "loss": 0.5606, + "step": 6688 + }, + { + "epoch": 8.586649550706033, + "grad_norm": 2.4873814582824707, + "learning_rate": 2.574069319640565e-05, + "loss": 0.5911, + "step": 6689 + }, + { + "epoch": 8.58793324775353, + 
"grad_norm": 2.9297268390655518, + "learning_rate": 2.5744544287548137e-05, + "loss": 0.563, + "step": 6690 + }, + { + "epoch": 8.589216944801027, + "grad_norm": 1.9839811325073242, + "learning_rate": 2.574839537869063e-05, + "loss": 0.5765, + "step": 6691 + }, + { + "epoch": 8.590500641848523, + "grad_norm": 1.8627268075942993, + "learning_rate": 2.5752246469833123e-05, + "loss": 0.5643, + "step": 6692 + }, + { + "epoch": 8.591784338896021, + "grad_norm": 2.6691806316375732, + "learning_rate": 2.575609756097561e-05, + "loss": 0.553, + "step": 6693 + }, + { + "epoch": 8.593068035943517, + "grad_norm": 3.6481773853302, + "learning_rate": 2.57599486521181e-05, + "loss": 0.6123, + "step": 6694 + }, + { + "epoch": 8.594351732991013, + "grad_norm": 2.058745861053467, + "learning_rate": 2.576379974326059e-05, + "loss": 0.592, + "step": 6695 + }, + { + "epoch": 8.595635430038511, + "grad_norm": 2.7567265033721924, + "learning_rate": 2.576765083440308e-05, + "loss": 0.5669, + "step": 6696 + }, + { + "epoch": 8.596919127086007, + "grad_norm": 6.330584526062012, + "learning_rate": 2.5771501925545575e-05, + "loss": 0.5567, + "step": 6697 + }, + { + "epoch": 8.598202824133505, + "grad_norm": 1.7594434022903442, + "learning_rate": 2.5775353016688062e-05, + "loss": 0.5792, + "step": 6698 + }, + { + "epoch": 8.599486521181001, + "grad_norm": 3.600659132003784, + "learning_rate": 2.5779204107830553e-05, + "loss": 0.6298, + "step": 6699 + }, + { + "epoch": 8.600770218228497, + "grad_norm": 0.9968865513801575, + "learning_rate": 2.5783055198973044e-05, + "loss": 0.587, + "step": 6700 + }, + { + "epoch": 8.602053915275995, + "grad_norm": 3.622936964035034, + "learning_rate": 2.5786906290115532e-05, + "loss": 0.6047, + "step": 6701 + }, + { + "epoch": 8.603337612323491, + "grad_norm": 6.465371608734131, + "learning_rate": 2.5790757381258023e-05, + "loss": 0.592, + "step": 6702 + }, + { + "epoch": 8.60462130937099, + "grad_norm": 4.257844924926758, + "learning_rate": 
2.5794608472400514e-05, + "loss": 0.6122, + "step": 6703 + }, + { + "epoch": 8.605905006418485, + "grad_norm": 2.2327799797058105, + "learning_rate": 2.5798459563543005e-05, + "loss": 0.5735, + "step": 6704 + }, + { + "epoch": 8.607188703465981, + "grad_norm": 2.0814216136932373, + "learning_rate": 2.5802310654685496e-05, + "loss": 0.6318, + "step": 6705 + }, + { + "epoch": 8.60847240051348, + "grad_norm": 2.2144787311553955, + "learning_rate": 2.5806161745827984e-05, + "loss": 0.6232, + "step": 6706 + }, + { + "epoch": 8.609756097560975, + "grad_norm": 1.8434392213821411, + "learning_rate": 2.5810012836970475e-05, + "loss": 0.6391, + "step": 6707 + }, + { + "epoch": 8.611039794608473, + "grad_norm": 1.8191725015640259, + "learning_rate": 2.5813863928112966e-05, + "loss": 0.6354, + "step": 6708 + }, + { + "epoch": 8.61232349165597, + "grad_norm": 4.357552528381348, + "learning_rate": 2.5817715019255457e-05, + "loss": 0.6045, + "step": 6709 + }, + { + "epoch": 8.613607188703465, + "grad_norm": 14.927083969116211, + "learning_rate": 2.5821566110397948e-05, + "loss": 0.616, + "step": 6710 + }, + { + "epoch": 8.614890885750963, + "grad_norm": 5.4255051612854, + "learning_rate": 2.5825417201540436e-05, + "loss": 0.578, + "step": 6711 + }, + { + "epoch": 8.61617458279846, + "grad_norm": 2.7535412311553955, + "learning_rate": 2.5829268292682927e-05, + "loss": 0.604, + "step": 6712 + }, + { + "epoch": 8.617458279845957, + "grad_norm": 1.7515861988067627, + "learning_rate": 2.5833119383825418e-05, + "loss": 0.6088, + "step": 6713 + }, + { + "epoch": 8.618741976893453, + "grad_norm": 2.0974113941192627, + "learning_rate": 2.5836970474967906e-05, + "loss": 0.5811, + "step": 6714 + }, + { + "epoch": 8.62002567394095, + "grad_norm": 9.49094009399414, + "learning_rate": 2.58408215661104e-05, + "loss": 0.6266, + "step": 6715 + }, + { + "epoch": 8.621309370988447, + "grad_norm": 4.636343479156494, + "learning_rate": 2.584467265725289e-05, + "loss": 0.592, + "step": 6716 + }, + { + 
"epoch": 8.622593068035943, + "grad_norm": 2.5827572345733643, + "learning_rate": 2.584852374839538e-05, + "loss": 0.585, + "step": 6717 + }, + { + "epoch": 8.623876765083441, + "grad_norm": 3.1296961307525635, + "learning_rate": 2.585237483953787e-05, + "loss": 0.6587, + "step": 6718 + }, + { + "epoch": 8.625160462130937, + "grad_norm": 1.5465285778045654, + "learning_rate": 2.585622593068036e-05, + "loss": 0.6309, + "step": 6719 + }, + { + "epoch": 8.626444159178433, + "grad_norm": 2.065617084503174, + "learning_rate": 2.586007702182285e-05, + "loss": 0.6446, + "step": 6720 + }, + { + "epoch": 8.627727856225931, + "grad_norm": 2.5352401733398438, + "learning_rate": 2.586392811296534e-05, + "loss": 0.6069, + "step": 6721 + }, + { + "epoch": 8.629011553273427, + "grad_norm": 2.3316051959991455, + "learning_rate": 2.586777920410783e-05, + "loss": 0.6845, + "step": 6722 + }, + { + "epoch": 8.630295250320923, + "grad_norm": 5.0062689781188965, + "learning_rate": 2.5871630295250322e-05, + "loss": 0.627, + "step": 6723 + }, + { + "epoch": 8.631578947368421, + "grad_norm": 3.703209400177002, + "learning_rate": 2.5875481386392813e-05, + "loss": 0.6787, + "step": 6724 + }, + { + "epoch": 8.632862644415917, + "grad_norm": 2.6208913326263428, + "learning_rate": 2.58793324775353e-05, + "loss": 0.6239, + "step": 6725 + }, + { + "epoch": 8.634146341463415, + "grad_norm": 3.2364559173583984, + "learning_rate": 2.5883183568677792e-05, + "loss": 0.6584, + "step": 6726 + }, + { + "epoch": 8.635430038510911, + "grad_norm": 2.551037549972534, + "learning_rate": 2.5887034659820283e-05, + "loss": 0.7297, + "step": 6727 + }, + { + "epoch": 8.63671373555841, + "grad_norm": 4.333634853363037, + "learning_rate": 2.5890885750962774e-05, + "loss": 0.6647, + "step": 6728 + }, + { + "epoch": 8.637997432605905, + "grad_norm": 7.896062850952148, + "learning_rate": 2.5894736842105265e-05, + "loss": 0.7039, + "step": 6729 + }, + { + "epoch": 8.639281129653401, + "grad_norm": 2.7441654205322266, + 
"learning_rate": 2.5898587933247753e-05, + "loss": 0.6921, + "step": 6730 + }, + { + "epoch": 8.6405648267009, + "grad_norm": 4.136665344238281, + "learning_rate": 2.5902439024390244e-05, + "loss": 0.7722, + "step": 6731 + }, + { + "epoch": 8.641848523748395, + "grad_norm": 12.44482421875, + "learning_rate": 2.5906290115532735e-05, + "loss": 0.8377, + "step": 6732 + }, + { + "epoch": 8.643132220795891, + "grad_norm": 1.6737388372421265, + "learning_rate": 2.5910141206675223e-05, + "loss": 0.5616, + "step": 6733 + }, + { + "epoch": 8.64441591784339, + "grad_norm": 2.123279094696045, + "learning_rate": 2.5913992297817717e-05, + "loss": 0.5961, + "step": 6734 + }, + { + "epoch": 8.645699614890885, + "grad_norm": 2.2129886150360107, + "learning_rate": 2.5917843388960208e-05, + "loss": 0.5977, + "step": 6735 + }, + { + "epoch": 8.646983311938383, + "grad_norm": 1.4656463861465454, + "learning_rate": 2.5921694480102696e-05, + "loss": 0.5896, + "step": 6736 + }, + { + "epoch": 8.64826700898588, + "grad_norm": 1.919859528541565, + "learning_rate": 2.5925545571245187e-05, + "loss": 0.5707, + "step": 6737 + }, + { + "epoch": 8.649550706033375, + "grad_norm": 5.808783054351807, + "learning_rate": 2.5929396662387678e-05, + "loss": 0.5958, + "step": 6738 + }, + { + "epoch": 8.650834403080873, + "grad_norm": 1.5248997211456299, + "learning_rate": 2.5933247753530166e-05, + "loss": 0.6363, + "step": 6739 + }, + { + "epoch": 8.65211810012837, + "grad_norm": 2.433116912841797, + "learning_rate": 2.593709884467266e-05, + "loss": 0.5789, + "step": 6740 + }, + { + "epoch": 8.653401797175867, + "grad_norm": 2.007497549057007, + "learning_rate": 2.5940949935815148e-05, + "loss": 0.6029, + "step": 6741 + }, + { + "epoch": 8.654685494223363, + "grad_norm": 1.2040787935256958, + "learning_rate": 2.594480102695764e-05, + "loss": 0.6526, + "step": 6742 + }, + { + "epoch": 8.65596919127086, + "grad_norm": 2.672607898712158, + "learning_rate": 2.594865211810013e-05, + "loss": 0.5885, + "step": 
6743 + }, + { + "epoch": 8.657252888318357, + "grad_norm": 2.347949981689453, + "learning_rate": 2.5952503209242618e-05, + "loss": 0.6373, + "step": 6744 + }, + { + "epoch": 8.658536585365853, + "grad_norm": 1.9676252603530884, + "learning_rate": 2.595635430038511e-05, + "loss": 0.5846, + "step": 6745 + }, + { + "epoch": 8.659820282413351, + "grad_norm": 3.0344326496124268, + "learning_rate": 2.5960205391527603e-05, + "loss": 0.5948, + "step": 6746 + }, + { + "epoch": 8.661103979460847, + "grad_norm": 1.5599281787872314, + "learning_rate": 2.596405648267009e-05, + "loss": 0.5667, + "step": 6747 + }, + { + "epoch": 8.662387676508343, + "grad_norm": 2.367793321609497, + "learning_rate": 2.5967907573812582e-05, + "loss": 0.5946, + "step": 6748 + }, + { + "epoch": 8.663671373555841, + "grad_norm": 2.3789522647857666, + "learning_rate": 2.597175866495507e-05, + "loss": 0.6092, + "step": 6749 + }, + { + "epoch": 8.664955070603337, + "grad_norm": 2.738154172897339, + "learning_rate": 2.597560975609756e-05, + "loss": 0.5969, + "step": 6750 + }, + { + "epoch": 8.666238767650835, + "grad_norm": 2.6672582626342773, + "learning_rate": 2.597946084724005e-05, + "loss": 0.5651, + "step": 6751 + }, + { + "epoch": 8.667522464698331, + "grad_norm": 1.6761544942855835, + "learning_rate": 2.5983311938382543e-05, + "loss": 0.584, + "step": 6752 + }, + { + "epoch": 8.668806161745827, + "grad_norm": 2.7580971717834473, + "learning_rate": 2.5987163029525034e-05, + "loss": 0.5996, + "step": 6753 + }, + { + "epoch": 8.670089858793325, + "grad_norm": 2.5343005657196045, + "learning_rate": 2.5991014120667525e-05, + "loss": 0.599, + "step": 6754 + }, + { + "epoch": 8.671373555840821, + "grad_norm": 4.267043113708496, + "learning_rate": 2.5994865211810013e-05, + "loss": 0.5843, + "step": 6755 + }, + { + "epoch": 8.672657252888317, + "grad_norm": 5.8472418785095215, + "learning_rate": 2.5998716302952504e-05, + "loss": 0.5709, + "step": 6756 + }, + { + "epoch": 8.673940949935815, + "grad_norm": 
3.1767752170562744, + "learning_rate": 2.600256739409499e-05, + "loss": 0.5876, + "step": 6757 + }, + { + "epoch": 8.675224646983311, + "grad_norm": 6.376208782196045, + "learning_rate": 2.6006418485237486e-05, + "loss": 0.6048, + "step": 6758 + }, + { + "epoch": 8.67650834403081, + "grad_norm": 5.186850547790527, + "learning_rate": 2.6010269576379977e-05, + "loss": 0.5567, + "step": 6759 + }, + { + "epoch": 8.677792041078305, + "grad_norm": 16.80341339111328, + "learning_rate": 2.6014120667522465e-05, + "loss": 0.5635, + "step": 6760 + }, + { + "epoch": 8.679075738125803, + "grad_norm": 1.8649576902389526, + "learning_rate": 2.6017971758664956e-05, + "loss": 0.6181, + "step": 6761 + }, + { + "epoch": 8.6803594351733, + "grad_norm": 3.7334678173065186, + "learning_rate": 2.6021822849807447e-05, + "loss": 0.6333, + "step": 6762 + }, + { + "epoch": 8.681643132220795, + "grad_norm": 1.6146706342697144, + "learning_rate": 2.6025673940949934e-05, + "loss": 0.6309, + "step": 6763 + }, + { + "epoch": 8.682926829268293, + "grad_norm": 2.361929416656494, + "learning_rate": 2.602952503209243e-05, + "loss": 0.5983, + "step": 6764 + }, + { + "epoch": 8.68421052631579, + "grad_norm": 1.543782114982605, + "learning_rate": 2.603337612323492e-05, + "loss": 0.6122, + "step": 6765 + }, + { + "epoch": 8.685494223363285, + "grad_norm": 3.677093267440796, + "learning_rate": 2.6037227214377408e-05, + "loss": 0.6316, + "step": 6766 + }, + { + "epoch": 8.686777920410783, + "grad_norm": 2.3478448390960693, + "learning_rate": 2.60410783055199e-05, + "loss": 0.5801, + "step": 6767 + }, + { + "epoch": 8.68806161745828, + "grad_norm": 5.036169528961182, + "learning_rate": 2.6044929396662386e-05, + "loss": 0.6185, + "step": 6768 + }, + { + "epoch": 8.689345314505777, + "grad_norm": 2.9532670974731445, + "learning_rate": 2.6048780487804877e-05, + "loss": 0.6158, + "step": 6769 + }, + { + "epoch": 8.690629011553273, + "grad_norm": 1.946559190750122, + "learning_rate": 2.605263157894737e-05, + 
"loss": 0.7171, + "step": 6770 + }, + { + "epoch": 8.69191270860077, + "grad_norm": 4.594301700592041, + "learning_rate": 2.605648267008986e-05, + "loss": 0.6984, + "step": 6771 + }, + { + "epoch": 8.693196405648267, + "grad_norm": 2.4794681072235107, + "learning_rate": 2.606033376123235e-05, + "loss": 0.6345, + "step": 6772 + }, + { + "epoch": 8.694480102695763, + "grad_norm": 5.016441822052002, + "learning_rate": 2.606418485237484e-05, + "loss": 0.6671, + "step": 6773 + }, + { + "epoch": 8.695763799743261, + "grad_norm": 2.253514528274536, + "learning_rate": 2.606803594351733e-05, + "loss": 0.6846, + "step": 6774 + }, + { + "epoch": 8.697047496790757, + "grad_norm": 1.7354488372802734, + "learning_rate": 2.607188703465982e-05, + "loss": 0.6672, + "step": 6775 + }, + { + "epoch": 8.698331193838253, + "grad_norm": 11.290484428405762, + "learning_rate": 2.6075738125802308e-05, + "loss": 0.6092, + "step": 6776 + }, + { + "epoch": 8.699614890885751, + "grad_norm": 2.346731424331665, + "learning_rate": 2.6079589216944803e-05, + "loss": 0.6445, + "step": 6777 + }, + { + "epoch": 8.700898587933247, + "grad_norm": 5.311804294586182, + "learning_rate": 2.6083440308087294e-05, + "loss": 0.7101, + "step": 6778 + }, + { + "epoch": 8.702182284980745, + "grad_norm": 3.0617895126342773, + "learning_rate": 2.608729139922978e-05, + "loss": 0.683, + "step": 6779 + }, + { + "epoch": 8.703465982028241, + "grad_norm": 1.9889198541641235, + "learning_rate": 2.6091142490372272e-05, + "loss": 0.758, + "step": 6780 + }, + { + "epoch": 8.704749679075737, + "grad_norm": 3.360219717025757, + "learning_rate": 2.6094993581514763e-05, + "loss": 0.7694, + "step": 6781 + }, + { + "epoch": 8.706033376123235, + "grad_norm": 4.351644992828369, + "learning_rate": 2.609884467265725e-05, + "loss": 0.908, + "step": 6782 + }, + { + "epoch": 8.707317073170731, + "grad_norm": 2.913285255432129, + "learning_rate": 2.6102695763799746e-05, + "loss": 0.6283, + "step": 6783 + }, + { + "epoch": 8.70860077021823, 
+ "grad_norm": 1.5290720462799072, + "learning_rate": 2.6106546854942237e-05, + "loss": 0.5632, + "step": 6784 + }, + { + "epoch": 8.709884467265725, + "grad_norm": 2.5301811695098877, + "learning_rate": 2.6110397946084724e-05, + "loss": 0.5967, + "step": 6785 + }, + { + "epoch": 8.711168164313221, + "grad_norm": 3.1340320110321045, + "learning_rate": 2.6114249037227215e-05, + "loss": 0.661, + "step": 6786 + }, + { + "epoch": 8.71245186136072, + "grad_norm": 1.8556956052780151, + "learning_rate": 2.6118100128369703e-05, + "loss": 0.5892, + "step": 6787 + }, + { + "epoch": 8.713735558408215, + "grad_norm": 1.9099643230438232, + "learning_rate": 2.6121951219512194e-05, + "loss": 0.6006, + "step": 6788 + }, + { + "epoch": 8.715019255455712, + "grad_norm": 1.8747673034667969, + "learning_rate": 2.612580231065469e-05, + "loss": 0.5711, + "step": 6789 + }, + { + "epoch": 8.71630295250321, + "grad_norm": 1.650018572807312, + "learning_rate": 2.6129653401797176e-05, + "loss": 0.5847, + "step": 6790 + }, + { + "epoch": 8.717586649550706, + "grad_norm": 3.5644373893737793, + "learning_rate": 2.6133504492939667e-05, + "loss": 0.5886, + "step": 6791 + }, + { + "epoch": 8.718870346598203, + "grad_norm": 1.8558465242385864, + "learning_rate": 2.613735558408216e-05, + "loss": 0.5608, + "step": 6792 + }, + { + "epoch": 8.7201540436457, + "grad_norm": 2.3016505241394043, + "learning_rate": 2.6141206675224646e-05, + "loss": 0.5743, + "step": 6793 + }, + { + "epoch": 8.721437740693197, + "grad_norm": 1.2404836416244507, + "learning_rate": 2.6145057766367137e-05, + "loss": 0.6296, + "step": 6794 + }, + { + "epoch": 8.722721437740693, + "grad_norm": 1.6790887117385864, + "learning_rate": 2.6148908857509628e-05, + "loss": 0.5688, + "step": 6795 + }, + { + "epoch": 8.72400513478819, + "grad_norm": 4.038029670715332, + "learning_rate": 2.615275994865212e-05, + "loss": 0.5258, + "step": 6796 + }, + { + "epoch": 8.725288831835687, + "grad_norm": 2.964632034301758, + "learning_rate": 
2.615661103979461e-05, + "loss": 0.561, + "step": 6797 + }, + { + "epoch": 8.726572528883183, + "grad_norm": 1.1244248151779175, + "learning_rate": 2.6160462130937098e-05, + "loss": 0.5713, + "step": 6798 + }, + { + "epoch": 8.72785622593068, + "grad_norm": 1.817423701286316, + "learning_rate": 2.616431322207959e-05, + "loss": 0.6101, + "step": 6799 + }, + { + "epoch": 8.729139922978177, + "grad_norm": 1.421067237854004, + "learning_rate": 2.616816431322208e-05, + "loss": 0.5818, + "step": 6800 + }, + { + "epoch": 8.730423620025674, + "grad_norm": 1.5462958812713623, + "learning_rate": 2.617201540436457e-05, + "loss": 0.6331, + "step": 6801 + }, + { + "epoch": 8.731707317073171, + "grad_norm": 1.8760297298431396, + "learning_rate": 2.6175866495507062e-05, + "loss": 0.5913, + "step": 6802 + }, + { + "epoch": 8.732991014120667, + "grad_norm": 1.4835071563720703, + "learning_rate": 2.617971758664955e-05, + "loss": 0.5873, + "step": 6803 + }, + { + "epoch": 8.734274711168164, + "grad_norm": 1.340137243270874, + "learning_rate": 2.618356867779204e-05, + "loss": 0.6228, + "step": 6804 + }, + { + "epoch": 8.735558408215661, + "grad_norm": 2.1420233249664307, + "learning_rate": 2.6187419768934532e-05, + "loss": 0.553, + "step": 6805 + }, + { + "epoch": 8.736842105263158, + "grad_norm": 2.06386137008667, + "learning_rate": 2.619127086007702e-05, + "loss": 0.5597, + "step": 6806 + }, + { + "epoch": 8.738125802310655, + "grad_norm": 3.437293529510498, + "learning_rate": 2.6195121951219514e-05, + "loss": 0.5877, + "step": 6807 + }, + { + "epoch": 8.739409499358151, + "grad_norm": 1.3572471141815186, + "learning_rate": 2.6198973042362005e-05, + "loss": 0.6374, + "step": 6808 + }, + { + "epoch": 8.740693196405648, + "grad_norm": 1.6763734817504883, + "learning_rate": 2.6202824133504493e-05, + "loss": 0.5796, + "step": 6809 + }, + { + "epoch": 8.741976893453145, + "grad_norm": 1.5032035112380981, + "learning_rate": 2.6206675224646984e-05, + "loss": 0.5889, + "step": 6810 + }, + { 
+ "epoch": 8.743260590500642, + "grad_norm": 1.693593144416809, + "learning_rate": 2.6210526315789475e-05, + "loss": 0.596, + "step": 6811 + }, + { + "epoch": 8.74454428754814, + "grad_norm": 3.698575973510742, + "learning_rate": 2.6214377406931963e-05, + "loss": 0.6033, + "step": 6812 + }, + { + "epoch": 8.745827984595635, + "grad_norm": 1.7579585313796997, + "learning_rate": 2.6218228498074457e-05, + "loss": 0.5873, + "step": 6813 + }, + { + "epoch": 8.747111681643132, + "grad_norm": 1.538232684135437, + "learning_rate": 2.6222079589216945e-05, + "loss": 0.6244, + "step": 6814 + }, + { + "epoch": 8.74839537869063, + "grad_norm": 2.469604969024658, + "learning_rate": 2.6225930680359436e-05, + "loss": 0.6457, + "step": 6815 + }, + { + "epoch": 8.749679075738126, + "grad_norm": 2.4778430461883545, + "learning_rate": 2.6229781771501927e-05, + "loss": 0.6012, + "step": 6816 + }, + { + "epoch": 8.750962772785623, + "grad_norm": 2.6896111965179443, + "learning_rate": 2.6233632862644415e-05, + "loss": 0.6539, + "step": 6817 + }, + { + "epoch": 8.75224646983312, + "grad_norm": 1.3799320459365845, + "learning_rate": 2.6237483953786906e-05, + "loss": 0.6028, + "step": 6818 + }, + { + "epoch": 8.753530166880616, + "grad_norm": 2.3694887161254883, + "learning_rate": 2.62413350449294e-05, + "loss": 0.6338, + "step": 6819 + }, + { + "epoch": 8.754813863928113, + "grad_norm": 2.980724811553955, + "learning_rate": 2.6245186136071888e-05, + "loss": 0.6307, + "step": 6820 + }, + { + "epoch": 8.75609756097561, + "grad_norm": 3.16337513923645, + "learning_rate": 2.624903722721438e-05, + "loss": 0.6144, + "step": 6821 + }, + { + "epoch": 8.757381258023106, + "grad_norm": 2.759998083114624, + "learning_rate": 2.6252888318356867e-05, + "loss": 0.6245, + "step": 6822 + }, + { + "epoch": 8.758664955070603, + "grad_norm": 7.425671100616455, + "learning_rate": 2.6256739409499358e-05, + "loss": 0.7254, + "step": 6823 + }, + { + "epoch": 8.7599486521181, + "grad_norm": 1.3351788520812988, + 
"learning_rate": 2.626059050064185e-05, + "loss": 0.6698, + "step": 6824 + }, + { + "epoch": 8.761232349165597, + "grad_norm": 8.074599266052246, + "learning_rate": 2.6264441591784337e-05, + "loss": 0.6674, + "step": 6825 + }, + { + "epoch": 8.762516046213094, + "grad_norm": 1.698808193206787, + "learning_rate": 2.626829268292683e-05, + "loss": 0.6607, + "step": 6826 + }, + { + "epoch": 8.763799743260591, + "grad_norm": 2.6149189472198486, + "learning_rate": 2.6272143774069322e-05, + "loss": 0.725, + "step": 6827 + }, + { + "epoch": 8.765083440308088, + "grad_norm": 2.024470090866089, + "learning_rate": 2.627599486521181e-05, + "loss": 0.671, + "step": 6828 + }, + { + "epoch": 8.766367137355584, + "grad_norm": 6.306472301483154, + "learning_rate": 2.62798459563543e-05, + "loss": 0.6776, + "step": 6829 + }, + { + "epoch": 8.767650834403081, + "grad_norm": 6.929853916168213, + "learning_rate": 2.6283697047496792e-05, + "loss": 0.7072, + "step": 6830 + }, + { + "epoch": 8.768934531450578, + "grad_norm": 2.2344284057617188, + "learning_rate": 2.628754813863928e-05, + "loss": 0.7652, + "step": 6831 + }, + { + "epoch": 8.770218228498074, + "grad_norm": 3.881430149078369, + "learning_rate": 2.6291399229781774e-05, + "loss": 0.905, + "step": 6832 + }, + { + "epoch": 8.771501925545572, + "grad_norm": 3.045642852783203, + "learning_rate": 2.6295250320924262e-05, + "loss": 0.5785, + "step": 6833 + }, + { + "epoch": 8.772785622593068, + "grad_norm": 4.513213157653809, + "learning_rate": 2.6299101412066753e-05, + "loss": 0.61, + "step": 6834 + }, + { + "epoch": 8.774069319640565, + "grad_norm": 2.16414475440979, + "learning_rate": 2.6302952503209244e-05, + "loss": 0.5995, + "step": 6835 + }, + { + "epoch": 8.775353016688062, + "grad_norm": 1.6763086318969727, + "learning_rate": 2.630680359435173e-05, + "loss": 0.5931, + "step": 6836 + }, + { + "epoch": 8.776636713735558, + "grad_norm": 1.6577143669128418, + "learning_rate": 2.6310654685494223e-05, + "loss": 0.5773, + "step": 
6837 + }, + { + "epoch": 8.777920410783056, + "grad_norm": 3.956653594970703, + "learning_rate": 2.6314505776636717e-05, + "loss": 0.5731, + "step": 6838 + }, + { + "epoch": 8.779204107830552, + "grad_norm": 1.8121126890182495, + "learning_rate": 2.6318356867779205e-05, + "loss": 0.5766, + "step": 6839 + }, + { + "epoch": 8.78048780487805, + "grad_norm": 3.1767194271087646, + "learning_rate": 2.6322207958921696e-05, + "loss": 0.565, + "step": 6840 + }, + { + "epoch": 8.781771501925546, + "grad_norm": 2.9507789611816406, + "learning_rate": 2.6326059050064184e-05, + "loss": 0.5925, + "step": 6841 + }, + { + "epoch": 8.783055198973042, + "grad_norm": 5.120240211486816, + "learning_rate": 2.6329910141206675e-05, + "loss": 0.6131, + "step": 6842 + }, + { + "epoch": 8.78433889602054, + "grad_norm": 4.998256683349609, + "learning_rate": 2.6333761232349166e-05, + "loss": 0.6002, + "step": 6843 + }, + { + "epoch": 8.785622593068036, + "grad_norm": 1.671674370765686, + "learning_rate": 2.6337612323491657e-05, + "loss": 0.6275, + "step": 6844 + }, + { + "epoch": 8.786906290115533, + "grad_norm": 2.876020669937134, + "learning_rate": 2.6341463414634148e-05, + "loss": 0.5788, + "step": 6845 + }, + { + "epoch": 8.78818998716303, + "grad_norm": 2.7279999256134033, + "learning_rate": 2.634531450577664e-05, + "loss": 0.5536, + "step": 6846 + }, + { + "epoch": 8.789473684210526, + "grad_norm": 3.2754998207092285, + "learning_rate": 2.6349165596919127e-05, + "loss": 0.5738, + "step": 6847 + }, + { + "epoch": 8.790757381258024, + "grad_norm": 1.1362932920455933, + "learning_rate": 2.6353016688061618e-05, + "loss": 0.5785, + "step": 6848 + }, + { + "epoch": 8.79204107830552, + "grad_norm": 2.8515796661376953, + "learning_rate": 2.6356867779204105e-05, + "loss": 0.5795, + "step": 6849 + }, + { + "epoch": 8.793324775353017, + "grad_norm": 1.510554313659668, + "learning_rate": 2.63607188703466e-05, + "loss": 0.5687, + "step": 6850 + }, + { + "epoch": 8.794608472400514, + "grad_norm": 
1.4298428297042847, + "learning_rate": 2.636456996148909e-05, + "loss": 0.6203, + "step": 6851 + }, + { + "epoch": 8.79589216944801, + "grad_norm": 4.179427146911621, + "learning_rate": 2.636842105263158e-05, + "loss": 0.5728, + "step": 6852 + }, + { + "epoch": 8.797175866495508, + "grad_norm": 1.2206016778945923, + "learning_rate": 2.637227214377407e-05, + "loss": 0.5513, + "step": 6853 + }, + { + "epoch": 8.798459563543004, + "grad_norm": 2.1844890117645264, + "learning_rate": 2.637612323491656e-05, + "loss": 0.5862, + "step": 6854 + }, + { + "epoch": 8.7997432605905, + "grad_norm": 4.724587440490723, + "learning_rate": 2.637997432605905e-05, + "loss": 0.5899, + "step": 6855 + }, + { + "epoch": 8.801026957637998, + "grad_norm": 2.63210129737854, + "learning_rate": 2.6383825417201543e-05, + "loss": 0.5709, + "step": 6856 + }, + { + "epoch": 8.802310654685494, + "grad_norm": 1.470698356628418, + "learning_rate": 2.6387676508344034e-05, + "loss": 0.6023, + "step": 6857 + }, + { + "epoch": 8.803594351732992, + "grad_norm": 1.22795832157135, + "learning_rate": 2.639152759948652e-05, + "loss": 0.5695, + "step": 6858 + }, + { + "epoch": 8.804878048780488, + "grad_norm": 4.486125946044922, + "learning_rate": 2.6395378690629013e-05, + "loss": 0.5968, + "step": 6859 + }, + { + "epoch": 8.806161745827985, + "grad_norm": 5.194540500640869, + "learning_rate": 2.63992297817715e-05, + "loss": 0.5738, + "step": 6860 + }, + { + "epoch": 8.807445442875482, + "grad_norm": 1.4694266319274902, + "learning_rate": 2.640308087291399e-05, + "loss": 0.5685, + "step": 6861 + }, + { + "epoch": 8.808729139922978, + "grad_norm": 1.6133538484573364, + "learning_rate": 2.6406931964056486e-05, + "loss": 0.614, + "step": 6862 + }, + { + "epoch": 8.810012836970476, + "grad_norm": 3.6530845165252686, + "learning_rate": 2.6410783055198974e-05, + "loss": 0.6097, + "step": 6863 + }, + { + "epoch": 8.811296534017972, + "grad_norm": 1.2310540676116943, + "learning_rate": 2.6414634146341465e-05, + 
"loss": 0.6028, + "step": 6864 + }, + { + "epoch": 8.812580231065468, + "grad_norm": 2.414940118789673, + "learning_rate": 2.6418485237483956e-05, + "loss": 0.5796, + "step": 6865 + }, + { + "epoch": 8.813863928112966, + "grad_norm": 1.386333703994751, + "learning_rate": 2.6422336328626443e-05, + "loss": 0.5967, + "step": 6866 + }, + { + "epoch": 8.815147625160462, + "grad_norm": 1.2193639278411865, + "learning_rate": 2.6426187419768935e-05, + "loss": 0.571, + "step": 6867 + }, + { + "epoch": 8.81643132220796, + "grad_norm": 2.7134106159210205, + "learning_rate": 2.6430038510911426e-05, + "loss": 0.636, + "step": 6868 + }, + { + "epoch": 8.817715019255456, + "grad_norm": 2.875662088394165, + "learning_rate": 2.6433889602053917e-05, + "loss": 0.646, + "step": 6869 + }, + { + "epoch": 8.818998716302952, + "grad_norm": 3.252985715866089, + "learning_rate": 2.6437740693196408e-05, + "loss": 0.6346, + "step": 6870 + }, + { + "epoch": 8.82028241335045, + "grad_norm": 1.5396069288253784, + "learning_rate": 2.6441591784338895e-05, + "loss": 0.6239, + "step": 6871 + }, + { + "epoch": 8.821566110397946, + "grad_norm": 4.594776630401611, + "learning_rate": 2.6445442875481386e-05, + "loss": 0.6486, + "step": 6872 + }, + { + "epoch": 8.822849807445444, + "grad_norm": 1.6776621341705322, + "learning_rate": 2.6449293966623878e-05, + "loss": 0.5771, + "step": 6873 + }, + { + "epoch": 8.82413350449294, + "grad_norm": 2.4258267879486084, + "learning_rate": 2.6453145057766365e-05, + "loss": 0.6094, + "step": 6874 + }, + { + "epoch": 8.825417201540436, + "grad_norm": 4.259385585784912, + "learning_rate": 2.645699614890886e-05, + "loss": 0.6587, + "step": 6875 + }, + { + "epoch": 8.826700898587934, + "grad_norm": 3.5214765071868896, + "learning_rate": 2.646084724005135e-05, + "loss": 0.7494, + "step": 6876 + }, + { + "epoch": 8.82798459563543, + "grad_norm": 2.284971237182617, + "learning_rate": 2.646469833119384e-05, + "loss": 0.5986, + "step": 6877 + }, + { + "epoch": 
8.829268292682928, + "grad_norm": 2.0068883895874023, + "learning_rate": 2.646854942233633e-05, + "loss": 0.686, + "step": 6878 + }, + { + "epoch": 8.830551989730424, + "grad_norm": 5.311718463897705, + "learning_rate": 2.6472400513478817e-05, + "loss": 0.7188, + "step": 6879 + }, + { + "epoch": 8.83183568677792, + "grad_norm": 2.6213388442993164, + "learning_rate": 2.6476251604621308e-05, + "loss": 0.7174, + "step": 6880 + }, + { + "epoch": 8.833119383825418, + "grad_norm": 9.175209999084473, + "learning_rate": 2.6480102695763803e-05, + "loss": 0.8138, + "step": 6881 + }, + { + "epoch": 8.834403080872914, + "grad_norm": 3.678527593612671, + "learning_rate": 2.648395378690629e-05, + "loss": 0.8933, + "step": 6882 + }, + { + "epoch": 8.835686777920412, + "grad_norm": 1.8231425285339355, + "learning_rate": 2.648780487804878e-05, + "loss": 0.5795, + "step": 6883 + }, + { + "epoch": 8.836970474967908, + "grad_norm": 1.943257212638855, + "learning_rate": 2.6491655969191273e-05, + "loss": 0.6142, + "step": 6884 + }, + { + "epoch": 8.838254172015404, + "grad_norm": 2.3024067878723145, + "learning_rate": 2.649550706033376e-05, + "loss": 0.5762, + "step": 6885 + }, + { + "epoch": 8.839537869062902, + "grad_norm": 3.113318681716919, + "learning_rate": 2.649935815147625e-05, + "loss": 0.5935, + "step": 6886 + }, + { + "epoch": 8.840821566110398, + "grad_norm": 1.3450840711593628, + "learning_rate": 2.6503209242618742e-05, + "loss": 0.6166, + "step": 6887 + }, + { + "epoch": 8.842105263157894, + "grad_norm": 1.5275769233703613, + "learning_rate": 2.6507060333761233e-05, + "loss": 0.5678, + "step": 6888 + }, + { + "epoch": 8.843388960205392, + "grad_norm": 1.793678641319275, + "learning_rate": 2.6510911424903725e-05, + "loss": 0.568, + "step": 6889 + }, + { + "epoch": 8.844672657252888, + "grad_norm": 2.9638993740081787, + "learning_rate": 2.6514762516046212e-05, + "loss": 0.6151, + "step": 6890 + }, + { + "epoch": 8.845956354300386, + "grad_norm": 3.2900679111480713, + 
"learning_rate": 2.6518613607188703e-05, + "loss": 0.5834, + "step": 6891 + }, + { + "epoch": 8.847240051347882, + "grad_norm": 1.0413618087768555, + "learning_rate": 2.6522464698331194e-05, + "loss": 0.5994, + "step": 6892 + }, + { + "epoch": 8.84852374839538, + "grad_norm": 1.4742040634155273, + "learning_rate": 2.6526315789473685e-05, + "loss": 0.5891, + "step": 6893 + }, + { + "epoch": 8.849807445442876, + "grad_norm": 5.33107852935791, + "learning_rate": 2.6530166880616176e-05, + "loss": 0.6214, + "step": 6894 + }, + { + "epoch": 8.851091142490372, + "grad_norm": 2.0106184482574463, + "learning_rate": 2.6534017971758668e-05, + "loss": 0.629, + "step": 6895 + }, + { + "epoch": 8.85237483953787, + "grad_norm": 1.4055209159851074, + "learning_rate": 2.6537869062901155e-05, + "loss": 0.5999, + "step": 6896 + }, + { + "epoch": 8.853658536585366, + "grad_norm": 3.8378803730010986, + "learning_rate": 2.6541720154043646e-05, + "loss": 0.6046, + "step": 6897 + }, + { + "epoch": 8.854942233632862, + "grad_norm": 4.129585266113281, + "learning_rate": 2.6545571245186134e-05, + "loss": 0.6163, + "step": 6898 + }, + { + "epoch": 8.85622593068036, + "grad_norm": 1.4544095993041992, + "learning_rate": 2.654942233632863e-05, + "loss": 0.5767, + "step": 6899 + }, + { + "epoch": 8.857509627727856, + "grad_norm": 2.207122802734375, + "learning_rate": 2.655327342747112e-05, + "loss": 0.5529, + "step": 6900 + }, + { + "epoch": 8.858793324775354, + "grad_norm": 1.683436393737793, + "learning_rate": 2.6557124518613607e-05, + "loss": 0.6701, + "step": 6901 + }, + { + "epoch": 8.86007702182285, + "grad_norm": 2.4479422569274902, + "learning_rate": 2.6560975609756098e-05, + "loss": 0.6276, + "step": 6902 + }, + { + "epoch": 8.861360718870346, + "grad_norm": 1.2542829513549805, + "learning_rate": 2.656482670089859e-05, + "loss": 0.5396, + "step": 6903 + }, + { + "epoch": 8.862644415917844, + "grad_norm": 1.2355738878250122, + "learning_rate": 2.6568677792041077e-05, + "loss": 0.6367, + 
"step": 6904 + }, + { + "epoch": 8.86392811296534, + "grad_norm": 1.4708958864212036, + "learning_rate": 2.657252888318357e-05, + "loss": 0.6005, + "step": 6905 + }, + { + "epoch": 8.865211810012838, + "grad_norm": 2.2939369678497314, + "learning_rate": 2.657637997432606e-05, + "loss": 0.6014, + "step": 6906 + }, + { + "epoch": 8.866495507060334, + "grad_norm": 2.8999617099761963, + "learning_rate": 2.658023106546855e-05, + "loss": 0.5662, + "step": 6907 + }, + { + "epoch": 8.86777920410783, + "grad_norm": 1.3967989683151245, + "learning_rate": 2.658408215661104e-05, + "loss": 0.5989, + "step": 6908 + }, + { + "epoch": 8.869062901155328, + "grad_norm": 8.05020809173584, + "learning_rate": 2.658793324775353e-05, + "loss": 0.6331, + "step": 6909 + }, + { + "epoch": 8.870346598202824, + "grad_norm": 1.9821100234985352, + "learning_rate": 2.659178433889602e-05, + "loss": 0.5883, + "step": 6910 + }, + { + "epoch": 8.871630295250322, + "grad_norm": 1.7475892305374146, + "learning_rate": 2.6595635430038515e-05, + "loss": 0.5969, + "step": 6911 + }, + { + "epoch": 8.872913992297818, + "grad_norm": 3.429337739944458, + "learning_rate": 2.6599486521181002e-05, + "loss": 0.6449, + "step": 6912 + }, + { + "epoch": 8.874197689345314, + "grad_norm": 3.293765068054199, + "learning_rate": 2.6603337612323493e-05, + "loss": 0.6642, + "step": 6913 + }, + { + "epoch": 8.875481386392812, + "grad_norm": 6.732243537902832, + "learning_rate": 2.660718870346598e-05, + "loss": 0.6124, + "step": 6914 + }, + { + "epoch": 8.876765083440308, + "grad_norm": 1.3345990180969238, + "learning_rate": 2.6611039794608472e-05, + "loss": 0.6471, + "step": 6915 + }, + { + "epoch": 8.878048780487806, + "grad_norm": 6.7521491050720215, + "learning_rate": 2.6614890885750963e-05, + "loss": 0.6047, + "step": 6916 + }, + { + "epoch": 8.879332477535302, + "grad_norm": 4.251494884490967, + "learning_rate": 2.6618741976893454e-05, + "loss": 0.6283, + "step": 6917 + }, + { + "epoch": 8.880616174582798, + 
"grad_norm": 2.740027904510498, + "learning_rate": 2.6622593068035945e-05, + "loss": 0.5543, + "step": 6918 + }, + { + "epoch": 8.881899871630296, + "grad_norm": 2.560122013092041, + "learning_rate": 2.6626444159178436e-05, + "loss": 0.6439, + "step": 6919 + }, + { + "epoch": 8.883183568677792, + "grad_norm": 2.26861310005188, + "learning_rate": 2.6630295250320924e-05, + "loss": 0.627, + "step": 6920 + }, + { + "epoch": 8.88446726572529, + "grad_norm": 2.21134352684021, + "learning_rate": 2.6634146341463415e-05, + "loss": 0.6741, + "step": 6921 + }, + { + "epoch": 8.885750962772786, + "grad_norm": 1.6930451393127441, + "learning_rate": 2.6637997432605906e-05, + "loss": 0.6269, + "step": 6922 + }, + { + "epoch": 8.887034659820282, + "grad_norm": 3.7503209114074707, + "learning_rate": 2.6641848523748397e-05, + "loss": 0.6486, + "step": 6923 + }, + { + "epoch": 8.88831835686778, + "grad_norm": 1.6239631175994873, + "learning_rate": 2.6645699614890888e-05, + "loss": 0.6831, + "step": 6924 + }, + { + "epoch": 8.889602053915276, + "grad_norm": 1.873602032661438, + "learning_rate": 2.6649550706033376e-05, + "loss": 0.6721, + "step": 6925 + }, + { + "epoch": 8.890885750962774, + "grad_norm": 2.198306083679199, + "learning_rate": 2.6653401797175867e-05, + "loss": 0.626, + "step": 6926 + }, + { + "epoch": 8.89216944801027, + "grad_norm": 1.274157166481018, + "learning_rate": 2.6657252888318358e-05, + "loss": 0.6436, + "step": 6927 + }, + { + "epoch": 8.893453145057766, + "grad_norm": 2.24265456199646, + "learning_rate": 2.6661103979460846e-05, + "loss": 0.6657, + "step": 6928 + }, + { + "epoch": 8.894736842105264, + "grad_norm": 5.133227825164795, + "learning_rate": 2.6664955070603337e-05, + "loss": 0.6559, + "step": 6929 + }, + { + "epoch": 8.89602053915276, + "grad_norm": 2.57252836227417, + "learning_rate": 2.666880616174583e-05, + "loss": 0.7433, + "step": 6930 + }, + { + "epoch": 8.897304236200256, + "grad_norm": 2.7925021648406982, + "learning_rate": 
2.667265725288832e-05, + "loss": 0.8035, + "step": 6931 + }, + { + "epoch": 8.898587933247754, + "grad_norm": 3.7102391719818115, + "learning_rate": 2.667650834403081e-05, + "loss": 0.8544, + "step": 6932 + }, + { + "epoch": 8.89987163029525, + "grad_norm": 2.3900530338287354, + "learning_rate": 2.6680359435173298e-05, + "loss": 0.5709, + "step": 6933 + }, + { + "epoch": 8.901155327342748, + "grad_norm": 2.558225154876709, + "learning_rate": 2.668421052631579e-05, + "loss": 0.6058, + "step": 6934 + }, + { + "epoch": 8.902439024390244, + "grad_norm": 1.2808637619018555, + "learning_rate": 2.668806161745828e-05, + "loss": 0.5323, + "step": 6935 + }, + { + "epoch": 8.90372272143774, + "grad_norm": 1.0914065837860107, + "learning_rate": 2.669191270860077e-05, + "loss": 0.5646, + "step": 6936 + }, + { + "epoch": 8.905006418485238, + "grad_norm": 5.116490364074707, + "learning_rate": 2.6695763799743262e-05, + "loss": 0.5633, + "step": 6937 + }, + { + "epoch": 8.906290115532734, + "grad_norm": 1.0139367580413818, + "learning_rate": 2.6699614890885753e-05, + "loss": 0.5398, + "step": 6938 + }, + { + "epoch": 8.907573812580232, + "grad_norm": 2.9907665252685547, + "learning_rate": 2.670346598202824e-05, + "loss": 0.5637, + "step": 6939 + }, + { + "epoch": 8.908857509627728, + "grad_norm": 1.6040964126586914, + "learning_rate": 2.6707317073170732e-05, + "loss": 0.6081, + "step": 6940 + }, + { + "epoch": 8.910141206675224, + "grad_norm": 1.2464909553527832, + "learning_rate": 2.6711168164313223e-05, + "loss": 0.5867, + "step": 6941 + }, + { + "epoch": 8.911424903722722, + "grad_norm": 2.5052809715270996, + "learning_rate": 2.6715019255455714e-05, + "loss": 0.6323, + "step": 6942 + }, + { + "epoch": 8.912708600770218, + "grad_norm": 1.1734509468078613, + "learning_rate": 2.6718870346598205e-05, + "loss": 0.568, + "step": 6943 + }, + { + "epoch": 8.913992297817716, + "grad_norm": 1.4495530128479004, + "learning_rate": 2.6722721437740693e-05, + "loss": 0.5918, + "step": 6944 + 
}, + { + "epoch": 8.915275994865212, + "grad_norm": 3.0856950283050537, + "learning_rate": 2.6726572528883184e-05, + "loss": 0.5809, + "step": 6945 + }, + { + "epoch": 8.916559691912708, + "grad_norm": 3.6034464836120605, + "learning_rate": 2.6730423620025675e-05, + "loss": 0.6133, + "step": 6946 + }, + { + "epoch": 8.917843388960206, + "grad_norm": 1.5774564743041992, + "learning_rate": 2.6734274711168163e-05, + "loss": 0.5886, + "step": 6947 + }, + { + "epoch": 8.919127086007702, + "grad_norm": 2.00205659866333, + "learning_rate": 2.6738125802310657e-05, + "loss": 0.6119, + "step": 6948 + }, + { + "epoch": 8.9204107830552, + "grad_norm": 1.5124142169952393, + "learning_rate": 2.6741976893453148e-05, + "loss": 0.6, + "step": 6949 + }, + { + "epoch": 8.921694480102696, + "grad_norm": 3.1178252696990967, + "learning_rate": 2.6745827984595636e-05, + "loss": 0.5969, + "step": 6950 + }, + { + "epoch": 8.922978177150192, + "grad_norm": 1.1232178211212158, + "learning_rate": 2.6749679075738127e-05, + "loss": 0.5651, + "step": 6951 + }, + { + "epoch": 8.92426187419769, + "grad_norm": 2.19932222366333, + "learning_rate": 2.6753530166880615e-05, + "loss": 0.5929, + "step": 6952 + }, + { + "epoch": 8.925545571245186, + "grad_norm": 1.777349829673767, + "learning_rate": 2.6757381258023106e-05, + "loss": 0.561, + "step": 6953 + }, + { + "epoch": 8.926829268292684, + "grad_norm": 3.244386672973633, + "learning_rate": 2.67612323491656e-05, + "loss": 0.5653, + "step": 6954 + }, + { + "epoch": 8.92811296534018, + "grad_norm": 1.6533195972442627, + "learning_rate": 2.6765083440308088e-05, + "loss": 0.6578, + "step": 6955 + }, + { + "epoch": 8.929396662387676, + "grad_norm": 1.7818659543991089, + "learning_rate": 2.676893453145058e-05, + "loss": 0.5325, + "step": 6956 + }, + { + "epoch": 8.930680359435174, + "grad_norm": 2.767864227294922, + "learning_rate": 2.677278562259307e-05, + "loss": 0.6017, + "step": 6957 + }, + { + "epoch": 8.93196405648267, + "grad_norm": 
2.335951566696167, + "learning_rate": 2.6776636713735558e-05, + "loss": 0.592, + "step": 6958 + }, + { + "epoch": 8.933247753530168, + "grad_norm": 7.377711772918701, + "learning_rate": 2.678048780487805e-05, + "loss": 0.604, + "step": 6959 + }, + { + "epoch": 8.934531450577664, + "grad_norm": 1.759104609489441, + "learning_rate": 2.678433889602054e-05, + "loss": 0.6548, + "step": 6960 + }, + { + "epoch": 8.93581514762516, + "grad_norm": 1.7371524572372437, + "learning_rate": 2.678818998716303e-05, + "loss": 0.6064, + "step": 6961 + }, + { + "epoch": 8.937098844672658, + "grad_norm": 1.6921801567077637, + "learning_rate": 2.6792041078305522e-05, + "loss": 0.629, + "step": 6962 + }, + { + "epoch": 8.938382541720154, + "grad_norm": 1.5643810033798218, + "learning_rate": 2.679589216944801e-05, + "loss": 0.657, + "step": 6963 + }, + { + "epoch": 8.93966623876765, + "grad_norm": 1.83849036693573, + "learning_rate": 2.67997432605905e-05, + "loss": 0.6328, + "step": 6964 + }, + { + "epoch": 8.940949935815148, + "grad_norm": 1.5427950620651245, + "learning_rate": 2.680359435173299e-05, + "loss": 0.6265, + "step": 6965 + }, + { + "epoch": 8.942233632862644, + "grad_norm": 2.68813419342041, + "learning_rate": 2.6807445442875483e-05, + "loss": 0.6195, + "step": 6966 + }, + { + "epoch": 8.943517329910142, + "grad_norm": 1.76303231716156, + "learning_rate": 2.6811296534017974e-05, + "loss": 0.6352, + "step": 6967 + }, + { + "epoch": 8.944801026957638, + "grad_norm": 1.9391381740570068, + "learning_rate": 2.6815147625160465e-05, + "loss": 0.6239, + "step": 6968 + }, + { + "epoch": 8.946084724005134, + "grad_norm": 1.490957498550415, + "learning_rate": 2.6818998716302953e-05, + "loss": 0.6527, + "step": 6969 + }, + { + "epoch": 8.947368421052632, + "grad_norm": 2.4432575702667236, + "learning_rate": 2.6822849807445444e-05, + "loss": 0.6911, + "step": 6970 + }, + { + "epoch": 8.948652118100128, + "grad_norm": 2.8794636726379395, + "learning_rate": 2.682670089858793e-05, + "loss": 
0.6475, + "step": 6971 + }, + { + "epoch": 8.949935815147626, + "grad_norm": 3.684006929397583, + "learning_rate": 2.6830551989730426e-05, + "loss": 0.6439, + "step": 6972 + }, + { + "epoch": 8.951219512195122, + "grad_norm": 2.281583309173584, + "learning_rate": 2.6834403080872917e-05, + "loss": 0.687, + "step": 6973 + }, + { + "epoch": 8.952503209242618, + "grad_norm": 4.239096164703369, + "learning_rate": 2.6838254172015405e-05, + "loss": 0.639, + "step": 6974 + }, + { + "epoch": 8.953786906290116, + "grad_norm": 1.6047731637954712, + "learning_rate": 2.6842105263157896e-05, + "loss": 0.6572, + "step": 6975 + }, + { + "epoch": 8.955070603337612, + "grad_norm": 4.6469950675964355, + "learning_rate": 2.6845956354300387e-05, + "loss": 0.6969, + "step": 6976 + }, + { + "epoch": 8.95635430038511, + "grad_norm": 2.4571828842163086, + "learning_rate": 2.6849807445442874e-05, + "loss": 0.7013, + "step": 6977 + }, + { + "epoch": 8.957637997432606, + "grad_norm": 1.9430054426193237, + "learning_rate": 2.6853658536585365e-05, + "loss": 0.7926, + "step": 6978 + }, + { + "epoch": 8.958921694480102, + "grad_norm": 2.8583569526672363, + "learning_rate": 2.6857509627727856e-05, + "loss": 0.718, + "step": 6979 + }, + { + "epoch": 8.9602053915276, + "grad_norm": 2.4927115440368652, + "learning_rate": 2.6861360718870348e-05, + "loss": 0.7306, + "step": 6980 + }, + { + "epoch": 8.961489088575096, + "grad_norm": 2.1755175590515137, + "learning_rate": 2.686521181001284e-05, + "loss": 0.7708, + "step": 6981 + }, + { + "epoch": 8.962772785622594, + "grad_norm": 3.0035693645477295, + "learning_rate": 2.6869062901155326e-05, + "loss": 0.825, + "step": 6982 + }, + { + "epoch": 8.96405648267009, + "grad_norm": 2.694662570953369, + "learning_rate": 2.6872913992297817e-05, + "loss": 0.5525, + "step": 6983 + }, + { + "epoch": 8.965340179717586, + "grad_norm": 2.229768991470337, + "learning_rate": 2.687676508344031e-05, + "loss": 0.5562, + "step": 6984 + }, + { + "epoch": 8.966623876765084, + 
"grad_norm": 3.108844041824341, + "learning_rate": 2.68806161745828e-05, + "loss": 0.5583, + "step": 6985 + }, + { + "epoch": 8.96790757381258, + "grad_norm": 1.1606484651565552, + "learning_rate": 2.688446726572529e-05, + "loss": 0.6076, + "step": 6986 + }, + { + "epoch": 8.969191270860078, + "grad_norm": 1.773362159729004, + "learning_rate": 2.688831835686778e-05, + "loss": 0.5788, + "step": 6987 + }, + { + "epoch": 8.970474967907574, + "grad_norm": 1.5369006395339966, + "learning_rate": 2.689216944801027e-05, + "loss": 0.5948, + "step": 6988 + }, + { + "epoch": 8.97175866495507, + "grad_norm": 2.2866787910461426, + "learning_rate": 2.689602053915276e-05, + "loss": 0.5966, + "step": 6989 + }, + { + "epoch": 8.973042362002568, + "grad_norm": 6.741832733154297, + "learning_rate": 2.6899871630295248e-05, + "loss": 0.5963, + "step": 6990 + }, + { + "epoch": 8.974326059050064, + "grad_norm": 1.5043971538543701, + "learning_rate": 2.6903722721437743e-05, + "loss": 0.5505, + "step": 6991 + }, + { + "epoch": 8.975609756097562, + "grad_norm": 1.020851731300354, + "learning_rate": 2.6907573812580234e-05, + "loss": 0.553, + "step": 6992 + }, + { + "epoch": 8.976893453145058, + "grad_norm": 1.0665783882141113, + "learning_rate": 2.691142490372272e-05, + "loss": 0.6271, + "step": 6993 + }, + { + "epoch": 8.978177150192554, + "grad_norm": 14.533548355102539, + "learning_rate": 2.6915275994865212e-05, + "loss": 0.5648, + "step": 6994 + }, + { + "epoch": 8.979460847240052, + "grad_norm": 1.7265745401382446, + "learning_rate": 2.6919127086007703e-05, + "loss": 0.568, + "step": 6995 + }, + { + "epoch": 8.980744544287548, + "grad_norm": 1.267020344734192, + "learning_rate": 2.692297817715019e-05, + "loss": 0.5623, + "step": 6996 + }, + { + "epoch": 8.982028241335044, + "grad_norm": 2.069861888885498, + "learning_rate": 2.6926829268292686e-05, + "loss": 0.6023, + "step": 6997 + }, + { + "epoch": 8.983311938382542, + "grad_norm": 2.239697217941284, + "learning_rate": 
2.6930680359435173e-05, + "loss": 0.6073, + "step": 6998 + }, + { + "epoch": 8.984595635430038, + "grad_norm": 2.061026096343994, + "learning_rate": 2.6934531450577664e-05, + "loss": 0.6163, + "step": 6999 + }, + { + "epoch": 8.985879332477536, + "grad_norm": 1.9239107370376587, + "learning_rate": 2.6938382541720155e-05, + "loss": 0.5953, + "step": 7000 + }, + { + "epoch": 8.985879332477536, + "eval_cer": 0.29327393773252397, + "eval_loss": 0.5826528072357178, + "eval_runtime": 13.8246, + "eval_samples_per_second": 71.105, + "eval_steps_per_second": 0.506, + "eval_wer": 0.5371474432469617, + "step": 7000 + }, + { + "epoch": 8.987163029525032, + "grad_norm": 2.823120594024658, + "learning_rate": 2.6942233632862643e-05, + "loss": 0.6056, + "step": 7001 + }, + { + "epoch": 8.988446726572528, + "grad_norm": 1.700293779373169, + "learning_rate": 2.6946084724005134e-05, + "loss": 0.6076, + "step": 7002 + }, + { + "epoch": 8.989730423620026, + "grad_norm": 1.6488654613494873, + "learning_rate": 2.694993581514763e-05, + "loss": 0.5929, + "step": 7003 + }, + { + "epoch": 8.991014120667522, + "grad_norm": 1.6188908815383911, + "learning_rate": 2.6953786906290116e-05, + "loss": 0.6835, + "step": 7004 + }, + { + "epoch": 8.99229781771502, + "grad_norm": 2.269861936569214, + "learning_rate": 2.6957637997432607e-05, + "loss": 0.5989, + "step": 7005 + }, + { + "epoch": 8.993581514762516, + "grad_norm": 4.768495559692383, + "learning_rate": 2.6961489088575095e-05, + "loss": 0.6397, + "step": 7006 + }, + { + "epoch": 8.994865211810012, + "grad_norm": 2.790902614593506, + "learning_rate": 2.6965340179717586e-05, + "loss": 0.686, + "step": 7007 + }, + { + "epoch": 8.99614890885751, + "grad_norm": 2.801098585128784, + "learning_rate": 2.6969191270860077e-05, + "loss": 0.6437, + "step": 7008 + }, + { + "epoch": 8.997432605905006, + "grad_norm": 4.696126461029053, + "learning_rate": 2.6973042362002568e-05, + "loss": 0.6549, + "step": 7009 + }, + { + "epoch": 8.998716302952504, + 
"grad_norm": 1.4377418756484985, + "learning_rate": 2.697689345314506e-05, + "loss": 0.648, + "step": 7010 + }, + { + "epoch": 9.0, + "grad_norm": 4.371504783630371, + "learning_rate": 2.698074454428755e-05, + "loss": 0.9165, + "step": 7011 + }, + { + "epoch": 9.001283697047496, + "grad_norm": 2.7767906188964844, + "learning_rate": 2.6984595635430038e-05, + "loss": 0.5587, + "step": 7012 + }, + { + "epoch": 9.002567394094994, + "grad_norm": 1.4043757915496826, + "learning_rate": 2.698844672657253e-05, + "loss": 0.5624, + "step": 7013 + }, + { + "epoch": 9.00385109114249, + "grad_norm": 2.474468231201172, + "learning_rate": 2.699229781771502e-05, + "loss": 0.5775, + "step": 7014 + }, + { + "epoch": 9.005134788189988, + "grad_norm": 2.211589813232422, + "learning_rate": 2.699614890885751e-05, + "loss": 0.5696, + "step": 7015 + }, + { + "epoch": 9.006418485237484, + "grad_norm": 2.7465946674346924, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.5935, + "step": 7016 + }, + { + "epoch": 9.00770218228498, + "grad_norm": 3.7328848838806152, + "learning_rate": 2.700385109114249e-05, + "loss": 0.5671, + "step": 7017 + }, + { + "epoch": 9.008985879332478, + "grad_norm": 1.9922975301742554, + "learning_rate": 2.700770218228498e-05, + "loss": 0.6349, + "step": 7018 + }, + { + "epoch": 9.010269576379974, + "grad_norm": 1.141190528869629, + "learning_rate": 2.7011553273427472e-05, + "loss": 0.5993, + "step": 7019 + }, + { + "epoch": 9.011553273427472, + "grad_norm": 1.9157644510269165, + "learning_rate": 2.701540436456996e-05, + "loss": 0.6241, + "step": 7020 + }, + { + "epoch": 9.012836970474968, + "grad_norm": 2.713109016418457, + "learning_rate": 2.7019255455712454e-05, + "loss": 0.5809, + "step": 7021 + }, + { + "epoch": 9.014120667522464, + "grad_norm": 1.6872469186782837, + "learning_rate": 2.7023106546854945e-05, + "loss": 0.5619, + "step": 7022 + }, + { + "epoch": 9.015404364569962, + "grad_norm": 2.561661720275879, + "learning_rate": 2.7026957637997433e-05, + 
"loss": 0.6162, + "step": 7023 + }, + { + "epoch": 9.016688061617458, + "grad_norm": 1.7706249952316284, + "learning_rate": 2.7030808729139924e-05, + "loss": 0.5769, + "step": 7024 + }, + { + "epoch": 9.017971758664956, + "grad_norm": 4.778966426849365, + "learning_rate": 2.7034659820282412e-05, + "loss": 0.5965, + "step": 7025 + }, + { + "epoch": 9.019255455712452, + "grad_norm": 2.490100383758545, + "learning_rate": 2.7038510911424903e-05, + "loss": 0.5842, + "step": 7026 + }, + { + "epoch": 9.020539152759948, + "grad_norm": 1.8367422819137573, + "learning_rate": 2.7042362002567394e-05, + "loss": 0.5834, + "step": 7027 + }, + { + "epoch": 9.021822849807446, + "grad_norm": 2.8278632164001465, + "learning_rate": 2.7046213093709885e-05, + "loss": 0.5997, + "step": 7028 + }, + { + "epoch": 9.023106546854942, + "grad_norm": 2.4637560844421387, + "learning_rate": 2.7050064184852376e-05, + "loss": 0.6038, + "step": 7029 + }, + { + "epoch": 9.024390243902438, + "grad_norm": 2.1576077938079834, + "learning_rate": 2.7053915275994867e-05, + "loss": 0.5783, + "step": 7030 + }, + { + "epoch": 9.025673940949936, + "grad_norm": 1.7932060956954956, + "learning_rate": 2.7057766367137355e-05, + "loss": 0.5945, + "step": 7031 + }, + { + "epoch": 9.026957637997432, + "grad_norm": 2.118264675140381, + "learning_rate": 2.7061617458279846e-05, + "loss": 0.5896, + "step": 7032 + }, + { + "epoch": 9.02824133504493, + "grad_norm": 1.8809436559677124, + "learning_rate": 2.7065468549422337e-05, + "loss": 0.5726, + "step": 7033 + }, + { + "epoch": 9.029525032092426, + "grad_norm": 4.86196756362915, + "learning_rate": 2.7069319640564828e-05, + "loss": 0.5375, + "step": 7034 + }, + { + "epoch": 9.030808729139922, + "grad_norm": 4.8392815589904785, + "learning_rate": 2.707317073170732e-05, + "loss": 0.5617, + "step": 7035 + }, + { + "epoch": 9.03209242618742, + "grad_norm": 3.29838228225708, + "learning_rate": 2.7077021822849807e-05, + "loss": 0.6012, + "step": 7036 + }, + { + "epoch": 
9.033376123234916, + "grad_norm": 1.6631689071655273, + "learning_rate": 2.7080872913992298e-05, + "loss": 0.587, + "step": 7037 + }, + { + "epoch": 9.034659820282414, + "grad_norm": 4.029662132263184, + "learning_rate": 2.708472400513479e-05, + "loss": 0.5827, + "step": 7038 + }, + { + "epoch": 9.03594351732991, + "grad_norm": 2.1162068843841553, + "learning_rate": 2.7088575096277277e-05, + "loss": 0.5862, + "step": 7039 + }, + { + "epoch": 9.037227214377406, + "grad_norm": 2.4361186027526855, + "learning_rate": 2.709242618741977e-05, + "loss": 0.5661, + "step": 7040 + }, + { + "epoch": 9.038510911424904, + "grad_norm": 1.9628031253814697, + "learning_rate": 2.7096277278562262e-05, + "loss": 0.5973, + "step": 7041 + }, + { + "epoch": 9.0397946084724, + "grad_norm": 4.169175624847412, + "learning_rate": 2.710012836970475e-05, + "loss": 0.633, + "step": 7042 + }, + { + "epoch": 9.041078305519898, + "grad_norm": 1.692267894744873, + "learning_rate": 2.710397946084724e-05, + "loss": 0.585, + "step": 7043 + }, + { + "epoch": 9.042362002567394, + "grad_norm": 3.8328216075897217, + "learning_rate": 2.710783055198973e-05, + "loss": 0.6165, + "step": 7044 + }, + { + "epoch": 9.04364569961489, + "grad_norm": 4.103415489196777, + "learning_rate": 2.711168164313222e-05, + "loss": 0.6026, + "step": 7045 + }, + { + "epoch": 9.044929396662388, + "grad_norm": 2.0354979038238525, + "learning_rate": 2.7115532734274714e-05, + "loss": 0.6047, + "step": 7046 + }, + { + "epoch": 9.046213093709884, + "grad_norm": 3.073852300643921, + "learning_rate": 2.7119383825417202e-05, + "loss": 0.6434, + "step": 7047 + }, + { + "epoch": 9.047496790757382, + "grad_norm": 5.8871002197265625, + "learning_rate": 2.7123234916559693e-05, + "loss": 0.6765, + "step": 7048 + }, + { + "epoch": 9.048780487804878, + "grad_norm": 3.932652473449707, + "learning_rate": 2.7127086007702184e-05, + "loss": 0.6503, + "step": 7049 + }, + { + "epoch": 9.050064184852374, + "grad_norm": 3.634011745452881, + 
"learning_rate": 2.713093709884467e-05, + "loss": 0.6949, + "step": 7050 + }, + { + "epoch": 9.051347881899872, + "grad_norm": 3.4177699089050293, + "learning_rate": 2.7134788189987163e-05, + "loss": 0.6764, + "step": 7051 + }, + { + "epoch": 9.052631578947368, + "grad_norm": 4.044840335845947, + "learning_rate": 2.7138639281129654e-05, + "loss": 0.6392, + "step": 7052 + }, + { + "epoch": 9.053915275994866, + "grad_norm": 7.667522430419922, + "learning_rate": 2.7142490372272145e-05, + "loss": 0.6606, + "step": 7053 + }, + { + "epoch": 9.055198973042362, + "grad_norm": 2.714897871017456, + "learning_rate": 2.7146341463414636e-05, + "loss": 0.636, + "step": 7054 + }, + { + "epoch": 9.056482670089858, + "grad_norm": 3.9441146850585938, + "learning_rate": 2.7150192554557124e-05, + "loss": 0.652, + "step": 7055 + }, + { + "epoch": 9.057766367137356, + "grad_norm": 3.403592824935913, + "learning_rate": 2.7154043645699615e-05, + "loss": 0.6887, + "step": 7056 + }, + { + "epoch": 9.059050064184852, + "grad_norm": 6.897748947143555, + "learning_rate": 2.7157894736842106e-05, + "loss": 0.7005, + "step": 7057 + }, + { + "epoch": 9.06033376123235, + "grad_norm": 4.547335624694824, + "learning_rate": 2.7161745827984597e-05, + "loss": 0.6993, + "step": 7058 + }, + { + "epoch": 9.061617458279846, + "grad_norm": 20.45985984802246, + "learning_rate": 2.7165596919127088e-05, + "loss": 0.7224, + "step": 7059 + }, + { + "epoch": 9.062901155327342, + "grad_norm": 19.04438018798828, + "learning_rate": 2.716944801026958e-05, + "loss": 0.7823, + "step": 7060 + }, + { + "epoch": 9.06418485237484, + "grad_norm": 5.430585861206055, + "learning_rate": 2.7173299101412067e-05, + "loss": 0.8245, + "step": 7061 + }, + { + "epoch": 9.065468549422336, + "grad_norm": 3.134960412979126, + "learning_rate": 2.7177150192554558e-05, + "loss": 0.581, + "step": 7062 + }, + { + "epoch": 9.066752246469832, + "grad_norm": 3.773379325866699, + "learning_rate": 2.7181001283697045e-05, + "loss": 0.5474, + 
"step": 7063 + }, + { + "epoch": 9.06803594351733, + "grad_norm": 1.8054265975952148, + "learning_rate": 2.718485237483954e-05, + "loss": 0.5316, + "step": 7064 + }, + { + "epoch": 9.069319640564826, + "grad_norm": 1.3526837825775146, + "learning_rate": 2.718870346598203e-05, + "loss": 0.5597, + "step": 7065 + }, + { + "epoch": 9.070603337612324, + "grad_norm": 2.494051218032837, + "learning_rate": 2.719255455712452e-05, + "loss": 0.5325, + "step": 7066 + }, + { + "epoch": 9.07188703465982, + "grad_norm": 1.6443028450012207, + "learning_rate": 2.719640564826701e-05, + "loss": 0.5653, + "step": 7067 + }, + { + "epoch": 9.073170731707316, + "grad_norm": 2.2996695041656494, + "learning_rate": 2.72002567394095e-05, + "loss": 0.5684, + "step": 7068 + }, + { + "epoch": 9.074454428754814, + "grad_norm": 2.3098368644714355, + "learning_rate": 2.720410783055199e-05, + "loss": 0.5841, + "step": 7069 + }, + { + "epoch": 9.07573812580231, + "grad_norm": 1.9296330213546753, + "learning_rate": 2.7207958921694483e-05, + "loss": 0.5408, + "step": 7070 + }, + { + "epoch": 9.077021822849808, + "grad_norm": 4.676167964935303, + "learning_rate": 2.721181001283697e-05, + "loss": 0.5336, + "step": 7071 + }, + { + "epoch": 9.078305519897304, + "grad_norm": 1.2134546041488647, + "learning_rate": 2.721566110397946e-05, + "loss": 0.5424, + "step": 7072 + }, + { + "epoch": 9.0795892169448, + "grad_norm": 8.599533081054688, + "learning_rate": 2.7219512195121953e-05, + "loss": 0.5676, + "step": 7073 + }, + { + "epoch": 9.080872913992298, + "grad_norm": 1.1971920728683472, + "learning_rate": 2.722336328626444e-05, + "loss": 0.5974, + "step": 7074 + }, + { + "epoch": 9.082156611039794, + "grad_norm": 2.198298692703247, + "learning_rate": 2.722721437740693e-05, + "loss": 0.5486, + "step": 7075 + }, + { + "epoch": 9.083440308087292, + "grad_norm": 1.5573170185089111, + "learning_rate": 2.7231065468549423e-05, + "loss": 0.5504, + "step": 7076 + }, + { + "epoch": 9.084724005134788, + "grad_norm": 
7.295200347900391, + "learning_rate": 2.7234916559691914e-05, + "loss": 0.5729, + "step": 7077 + }, + { + "epoch": 9.086007702182284, + "grad_norm": 3.341350793838501, + "learning_rate": 2.7238767650834405e-05, + "loss": 0.5406, + "step": 7078 + }, + { + "epoch": 9.087291399229782, + "grad_norm": 2.4865944385528564, + "learning_rate": 2.7242618741976896e-05, + "loss": 0.5659, + "step": 7079 + }, + { + "epoch": 9.088575096277278, + "grad_norm": 2.879596710205078, + "learning_rate": 2.7246469833119383e-05, + "loss": 0.6002, + "step": 7080 + }, + { + "epoch": 9.089858793324776, + "grad_norm": 3.351588726043701, + "learning_rate": 2.7250320924261875e-05, + "loss": 0.5476, + "step": 7081 + }, + { + "epoch": 9.091142490372272, + "grad_norm": 1.9591681957244873, + "learning_rate": 2.7254172015404362e-05, + "loss": 0.5969, + "step": 7082 + }, + { + "epoch": 9.092426187419768, + "grad_norm": 4.950465679168701, + "learning_rate": 2.7258023106546857e-05, + "loss": 0.5536, + "step": 7083 + }, + { + "epoch": 9.093709884467266, + "grad_norm": 2.069446325302124, + "learning_rate": 2.7261874197689348e-05, + "loss": 0.6166, + "step": 7084 + }, + { + "epoch": 9.094993581514762, + "grad_norm": 2.3425254821777344, + "learning_rate": 2.7265725288831835e-05, + "loss": 0.5869, + "step": 7085 + }, + { + "epoch": 9.09627727856226, + "grad_norm": 1.7063803672790527, + "learning_rate": 2.7269576379974326e-05, + "loss": 0.5748, + "step": 7086 + }, + { + "epoch": 9.097560975609756, + "grad_norm": 3.13088059425354, + "learning_rate": 2.7273427471116818e-05, + "loss": 0.5787, + "step": 7087 + }, + { + "epoch": 9.098844672657252, + "grad_norm": 1.2403448820114136, + "learning_rate": 2.7277278562259305e-05, + "loss": 0.5476, + "step": 7088 + }, + { + "epoch": 9.10012836970475, + "grad_norm": 1.481418251991272, + "learning_rate": 2.72811296534018e-05, + "loss": 0.5654, + "step": 7089 + }, + { + "epoch": 9.101412066752246, + "grad_norm": 6.548892498016357, + "learning_rate": 2.7284980744544287e-05, 
+ "loss": 0.5713, + "step": 7090 + }, + { + "epoch": 9.102695763799744, + "grad_norm": 2.4026126861572266, + "learning_rate": 2.728883183568678e-05, + "loss": 0.531, + "step": 7091 + }, + { + "epoch": 9.10397946084724, + "grad_norm": 3.3797831535339355, + "learning_rate": 2.729268292682927e-05, + "loss": 0.6105, + "step": 7092 + }, + { + "epoch": 9.105263157894736, + "grad_norm": 3.6465494632720947, + "learning_rate": 2.7296534017971757e-05, + "loss": 0.5646, + "step": 7093 + }, + { + "epoch": 9.106546854942234, + "grad_norm": 1.5191975831985474, + "learning_rate": 2.7300385109114248e-05, + "loss": 0.5993, + "step": 7094 + }, + { + "epoch": 9.10783055198973, + "grad_norm": 1.9441356658935547, + "learning_rate": 2.7304236200256743e-05, + "loss": 0.6161, + "step": 7095 + }, + { + "epoch": 9.109114249037226, + "grad_norm": 1.5157251358032227, + "learning_rate": 2.730808729139923e-05, + "loss": 0.5631, + "step": 7096 + }, + { + "epoch": 9.110397946084724, + "grad_norm": 6.367386341094971, + "learning_rate": 2.731193838254172e-05, + "loss": 0.5861, + "step": 7097 + }, + { + "epoch": 9.11168164313222, + "grad_norm": 8.492037773132324, + "learning_rate": 2.7315789473684213e-05, + "loss": 0.5846, + "step": 7098 + }, + { + "epoch": 9.112965340179718, + "grad_norm": 1.926035761833191, + "learning_rate": 2.73196405648267e-05, + "loss": 0.6032, + "step": 7099 + }, + { + "epoch": 9.114249037227214, + "grad_norm": 4.322613716125488, + "learning_rate": 2.732349165596919e-05, + "loss": 0.6041, + "step": 7100 + }, + { + "epoch": 9.11553273427471, + "grad_norm": 3.385305166244507, + "learning_rate": 2.7327342747111682e-05, + "loss": 0.5958, + "step": 7101 + }, + { + "epoch": 9.116816431322208, + "grad_norm": 3.461940288543701, + "learning_rate": 2.7331193838254173e-05, + "loss": 0.6854, + "step": 7102 + }, + { + "epoch": 9.118100128369704, + "grad_norm": 3.6520802974700928, + "learning_rate": 2.7335044929396665e-05, + "loss": 0.579, + "step": 7103 + }, + { + "epoch": 
9.119383825417202, + "grad_norm": 1.2478541135787964, + "learning_rate": 2.7338896020539152e-05, + "loss": 0.5845, + "step": 7104 + }, + { + "epoch": 9.120667522464698, + "grad_norm": 1.9138542413711548, + "learning_rate": 2.7342747111681643e-05, + "loss": 0.6163, + "step": 7105 + }, + { + "epoch": 9.121951219512194, + "grad_norm": 4.370880126953125, + "learning_rate": 2.7346598202824134e-05, + "loss": 0.606, + "step": 7106 + }, + { + "epoch": 9.123234916559692, + "grad_norm": 2.423956871032715, + "learning_rate": 2.7350449293966625e-05, + "loss": 0.6963, + "step": 7107 + }, + { + "epoch": 9.124518613607188, + "grad_norm": 2.282853603363037, + "learning_rate": 2.7354300385109116e-05, + "loss": 0.7037, + "step": 7108 + }, + { + "epoch": 9.125802310654686, + "grad_norm": 1.7877554893493652, + "learning_rate": 2.7358151476251604e-05, + "loss": 0.736, + "step": 7109 + }, + { + "epoch": 9.127086007702182, + "grad_norm": 2.3576674461364746, + "learning_rate": 2.7362002567394095e-05, + "loss": 0.711, + "step": 7110 + }, + { + "epoch": 9.128369704749678, + "grad_norm": 2.601384401321411, + "learning_rate": 2.7365853658536586e-05, + "loss": 0.799, + "step": 7111 + }, + { + "epoch": 9.129653401797176, + "grad_norm": 1.4942810535430908, + "learning_rate": 2.7369704749679074e-05, + "loss": 0.5612, + "step": 7112 + }, + { + "epoch": 9.130937098844672, + "grad_norm": 1.4838777780532837, + "learning_rate": 2.737355584082157e-05, + "loss": 0.5498, + "step": 7113 + }, + { + "epoch": 9.13222079589217, + "grad_norm": 1.1942473649978638, + "learning_rate": 2.737740693196406e-05, + "loss": 0.5336, + "step": 7114 + }, + { + "epoch": 9.133504492939666, + "grad_norm": 1.708796501159668, + "learning_rate": 2.7381258023106547e-05, + "loss": 0.5538, + "step": 7115 + }, + { + "epoch": 9.134788189987162, + "grad_norm": 3.244515895843506, + "learning_rate": 2.7385109114249038e-05, + "loss": 0.5882, + "step": 7116 + }, + { + "epoch": 9.13607188703466, + "grad_norm": 4.166437149047852, + 
"learning_rate": 2.7388960205391526e-05, + "loss": 0.5904, + "step": 7117 + }, + { + "epoch": 9.137355584082156, + "grad_norm": 1.1428042650222778, + "learning_rate": 2.7392811296534017e-05, + "loss": 0.6166, + "step": 7118 + }, + { + "epoch": 9.138639281129654, + "grad_norm": 2.2191271781921387, + "learning_rate": 2.739666238767651e-05, + "loss": 0.543, + "step": 7119 + }, + { + "epoch": 9.13992297817715, + "grad_norm": 2.0699446201324463, + "learning_rate": 2.7400513478819e-05, + "loss": 0.5905, + "step": 7120 + }, + { + "epoch": 9.141206675224646, + "grad_norm": 2.518842935562134, + "learning_rate": 2.740436456996149e-05, + "loss": 0.5759, + "step": 7121 + }, + { + "epoch": 9.142490372272144, + "grad_norm": 2.190720319747925, + "learning_rate": 2.740821566110398e-05, + "loss": 0.5961, + "step": 7122 + }, + { + "epoch": 9.14377406931964, + "grad_norm": 2.676323652267456, + "learning_rate": 2.741206675224647e-05, + "loss": 0.5579, + "step": 7123 + }, + { + "epoch": 9.145057766367138, + "grad_norm": 3.243396759033203, + "learning_rate": 2.741591784338896e-05, + "loss": 0.5864, + "step": 7124 + }, + { + "epoch": 9.146341463414634, + "grad_norm": 2.744258165359497, + "learning_rate": 2.7419768934531455e-05, + "loss": 0.6017, + "step": 7125 + }, + { + "epoch": 9.14762516046213, + "grad_norm": 2.511258125305176, + "learning_rate": 2.7423620025673942e-05, + "loss": 0.6057, + "step": 7126 + }, + { + "epoch": 9.148908857509628, + "grad_norm": 3.002232313156128, + "learning_rate": 2.7427471116816433e-05, + "loss": 0.5976, + "step": 7127 + }, + { + "epoch": 9.150192554557124, + "grad_norm": 3.205162525177002, + "learning_rate": 2.743132220795892e-05, + "loss": 0.565, + "step": 7128 + }, + { + "epoch": 9.15147625160462, + "grad_norm": 1.6450717449188232, + "learning_rate": 2.7435173299101412e-05, + "loss": 0.5578, + "step": 7129 + }, + { + "epoch": 9.152759948652118, + "grad_norm": 26.025789260864258, + "learning_rate": 2.7439024390243903e-05, + "loss": 0.596, + "step": 7130 
+ }, + { + "epoch": 9.154043645699614, + "grad_norm": 2.791281223297119, + "learning_rate": 2.744287548138639e-05, + "loss": 0.5678, + "step": 7131 + }, + { + "epoch": 9.155327342747112, + "grad_norm": 1.113766074180603, + "learning_rate": 2.7446726572528885e-05, + "loss": 0.6037, + "step": 7132 + }, + { + "epoch": 9.156611039794608, + "grad_norm": 2.3799078464508057, + "learning_rate": 2.7450577663671376e-05, + "loss": 0.5992, + "step": 7133 + }, + { + "epoch": 9.157894736842104, + "grad_norm": 2.198244571685791, + "learning_rate": 2.7454428754813864e-05, + "loss": 0.5889, + "step": 7134 + }, + { + "epoch": 9.159178433889602, + "grad_norm": 1.4182368516921997, + "learning_rate": 2.7458279845956355e-05, + "loss": 0.5683, + "step": 7135 + }, + { + "epoch": 9.160462130937098, + "grad_norm": 1.789668321609497, + "learning_rate": 2.7462130937098843e-05, + "loss": 0.5792, + "step": 7136 + }, + { + "epoch": 9.161745827984596, + "grad_norm": 3.5864312648773193, + "learning_rate": 2.7465982028241334e-05, + "loss": 0.5754, + "step": 7137 + }, + { + "epoch": 9.163029525032092, + "grad_norm": 2.1355888843536377, + "learning_rate": 2.7469833119383828e-05, + "loss": 0.594, + "step": 7138 + }, + { + "epoch": 9.164313222079588, + "grad_norm": 1.4069899320602417, + "learning_rate": 2.7473684210526316e-05, + "loss": 0.6111, + "step": 7139 + }, + { + "epoch": 9.165596919127086, + "grad_norm": 1.3825154304504395, + "learning_rate": 2.7477535301668807e-05, + "loss": 0.6303, + "step": 7140 + }, + { + "epoch": 9.166880616174582, + "grad_norm": 2.723634719848633, + "learning_rate": 2.7481386392811298e-05, + "loss": 0.5496, + "step": 7141 + }, + { + "epoch": 9.16816431322208, + "grad_norm": 3.699784994125366, + "learning_rate": 2.7485237483953786e-05, + "loss": 0.6601, + "step": 7142 + }, + { + "epoch": 9.169448010269576, + "grad_norm": 2.2862441539764404, + "learning_rate": 2.7489088575096277e-05, + "loss": 0.5745, + "step": 7143 + }, + { + "epoch": 9.170731707317072, + "grad_norm": 
2.3871138095855713, + "learning_rate": 2.749293966623877e-05, + "loss": 0.6143, + "step": 7144 + }, + { + "epoch": 9.17201540436457, + "grad_norm": 1.332682728767395, + "learning_rate": 2.749679075738126e-05, + "loss": 0.5973, + "step": 7145 + }, + { + "epoch": 9.173299101412066, + "grad_norm": 3.5466420650482178, + "learning_rate": 2.750064184852375e-05, + "loss": 0.631, + "step": 7146 + }, + { + "epoch": 9.174582798459564, + "grad_norm": 4.160435199737549, + "learning_rate": 2.7504492939666238e-05, + "loss": 0.5666, + "step": 7147 + }, + { + "epoch": 9.17586649550706, + "grad_norm": 4.085576057434082, + "learning_rate": 2.750834403080873e-05, + "loss": 0.6516, + "step": 7148 + }, + { + "epoch": 9.177150192554556, + "grad_norm": 2.0382888317108154, + "learning_rate": 2.751219512195122e-05, + "loss": 0.622, + "step": 7149 + }, + { + "epoch": 9.178433889602054, + "grad_norm": 1.8198894262313843, + "learning_rate": 2.751604621309371e-05, + "loss": 0.6221, + "step": 7150 + }, + { + "epoch": 9.17971758664955, + "grad_norm": 25.313610076904297, + "learning_rate": 2.7519897304236202e-05, + "loss": 0.6467, + "step": 7151 + }, + { + "epoch": 9.181001283697048, + "grad_norm": 2.060692071914673, + "learning_rate": 2.7523748395378693e-05, + "loss": 0.6716, + "step": 7152 + }, + { + "epoch": 9.182284980744544, + "grad_norm": 1.751373529434204, + "learning_rate": 2.752759948652118e-05, + "loss": 0.6807, + "step": 7153 + }, + { + "epoch": 9.18356867779204, + "grad_norm": 1.4993901252746582, + "learning_rate": 2.7531450577663672e-05, + "loss": 0.6281, + "step": 7154 + }, + { + "epoch": 9.184852374839538, + "grad_norm": 1.2845560312271118, + "learning_rate": 2.753530166880616e-05, + "loss": 0.6703, + "step": 7155 + }, + { + "epoch": 9.186136071887034, + "grad_norm": 2.203253984451294, + "learning_rate": 2.7539152759948654e-05, + "loss": 0.6399, + "step": 7156 + }, + { + "epoch": 9.187419768934532, + "grad_norm": 2.533278703689575, + "learning_rate": 2.7543003851091145e-05, + 
"loss": 0.6843, + "step": 7157 + }, + { + "epoch": 9.188703465982028, + "grad_norm": 13.809814453125, + "learning_rate": 2.7546854942233633e-05, + "loss": 0.6466, + "step": 7158 + }, + { + "epoch": 9.189987163029524, + "grad_norm": 3.6324732303619385, + "learning_rate": 2.7550706033376124e-05, + "loss": 0.6555, + "step": 7159 + }, + { + "epoch": 9.191270860077022, + "grad_norm": 2.4960079193115234, + "learning_rate": 2.7554557124518615e-05, + "loss": 0.7281, + "step": 7160 + }, + { + "epoch": 9.192554557124518, + "grad_norm": 5.912595748901367, + "learning_rate": 2.7558408215661103e-05, + "loss": 0.8406, + "step": 7161 + }, + { + "epoch": 9.193838254172016, + "grad_norm": 3.0066909790039062, + "learning_rate": 2.7562259306803597e-05, + "loss": 0.6131, + "step": 7162 + }, + { + "epoch": 9.195121951219512, + "grad_norm": 2.8837456703186035, + "learning_rate": 2.7566110397946085e-05, + "loss": 0.5389, + "step": 7163 + }, + { + "epoch": 9.196405648267008, + "grad_norm": 1.5885580778121948, + "learning_rate": 2.7569961489088576e-05, + "loss": 0.5714, + "step": 7164 + }, + { + "epoch": 9.197689345314506, + "grad_norm": 1.14346182346344, + "learning_rate": 2.7573812580231067e-05, + "loss": 0.5857, + "step": 7165 + }, + { + "epoch": 9.198973042362002, + "grad_norm": 1.477245807647705, + "learning_rate": 2.7577663671373555e-05, + "loss": 0.5731, + "step": 7166 + }, + { + "epoch": 9.200256739409499, + "grad_norm": 1.837111234664917, + "learning_rate": 2.7581514762516046e-05, + "loss": 0.5906, + "step": 7167 + }, + { + "epoch": 9.201540436456996, + "grad_norm": 8.218429565429688, + "learning_rate": 2.758536585365854e-05, + "loss": 0.5736, + "step": 7168 + }, + { + "epoch": 9.202824133504492, + "grad_norm": 1.6554930210113525, + "learning_rate": 2.7589216944801028e-05, + "loss": 0.5361, + "step": 7169 + }, + { + "epoch": 9.20410783055199, + "grad_norm": 5.0526123046875, + "learning_rate": 2.759306803594352e-05, + "loss": 0.6099, + "step": 7170 + }, + { + "epoch": 
9.205391527599486, + "grad_norm": 2.505687952041626, + "learning_rate": 2.759691912708601e-05, + "loss": 0.5494, + "step": 7171 + }, + { + "epoch": 9.206675224646983, + "grad_norm": 2.1799261569976807, + "learning_rate": 2.7600770218228498e-05, + "loss": 0.5704, + "step": 7172 + }, + { + "epoch": 9.20795892169448, + "grad_norm": 2.5489394664764404, + "learning_rate": 2.760462130937099e-05, + "loss": 0.5981, + "step": 7173 + }, + { + "epoch": 9.209242618741976, + "grad_norm": 5.570895195007324, + "learning_rate": 2.760847240051348e-05, + "loss": 0.5541, + "step": 7174 + }, + { + "epoch": 9.210526315789474, + "grad_norm": 3.299726724624634, + "learning_rate": 2.761232349165597e-05, + "loss": 0.5833, + "step": 7175 + }, + { + "epoch": 9.21181001283697, + "grad_norm": 8.610556602478027, + "learning_rate": 2.7616174582798462e-05, + "loss": 0.5729, + "step": 7176 + }, + { + "epoch": 9.213093709884467, + "grad_norm": 1.1870604753494263, + "learning_rate": 2.762002567394095e-05, + "loss": 0.5814, + "step": 7177 + }, + { + "epoch": 9.214377406931964, + "grad_norm": 1.2451077699661255, + "learning_rate": 2.762387676508344e-05, + "loss": 0.582, + "step": 7178 + }, + { + "epoch": 9.21566110397946, + "grad_norm": 1.910105586051941, + "learning_rate": 2.762772785622593e-05, + "loss": 0.5683, + "step": 7179 + }, + { + "epoch": 9.216944801026958, + "grad_norm": 1.558854103088379, + "learning_rate": 2.7631578947368423e-05, + "loss": 0.559, + "step": 7180 + }, + { + "epoch": 9.218228498074454, + "grad_norm": 1.4155492782592773, + "learning_rate": 2.7635430038510914e-05, + "loss": 0.5503, + "step": 7181 + }, + { + "epoch": 9.21951219512195, + "grad_norm": 1.0292385816574097, + "learning_rate": 2.76392811296534e-05, + "loss": 0.6205, + "step": 7182 + }, + { + "epoch": 9.220795892169448, + "grad_norm": 2.67655873298645, + "learning_rate": 2.7643132220795893e-05, + "loss": 0.62, + "step": 7183 + }, + { + "epoch": 9.222079589216944, + "grad_norm": 1.04176664352417, + "learning_rate": 
2.7646983311938384e-05, + "loss": 0.5641, + "step": 7184 + }, + { + "epoch": 9.223363286264442, + "grad_norm": 3.653296947479248, + "learning_rate": 2.765083440308087e-05, + "loss": 0.5596, + "step": 7185 + }, + { + "epoch": 9.224646983311938, + "grad_norm": 1.3723649978637695, + "learning_rate": 2.7654685494223362e-05, + "loss": 0.597, + "step": 7186 + }, + { + "epoch": 9.225930680359435, + "grad_norm": 1.361614465713501, + "learning_rate": 2.7658536585365857e-05, + "loss": 0.5622, + "step": 7187 + }, + { + "epoch": 9.227214377406932, + "grad_norm": 1.857444167137146, + "learning_rate": 2.7662387676508345e-05, + "loss": 0.587, + "step": 7188 + }, + { + "epoch": 9.228498074454428, + "grad_norm": 1.254294991493225, + "learning_rate": 2.7666238767650836e-05, + "loss": 0.6132, + "step": 7189 + }, + { + "epoch": 9.229781771501926, + "grad_norm": 1.1065342426300049, + "learning_rate": 2.7670089858793327e-05, + "loss": 0.5731, + "step": 7190 + }, + { + "epoch": 9.231065468549422, + "grad_norm": 1.3404384851455688, + "learning_rate": 2.7673940949935814e-05, + "loss": 0.6012, + "step": 7191 + }, + { + "epoch": 9.232349165596919, + "grad_norm": 2.29003643989563, + "learning_rate": 2.7677792041078305e-05, + "loss": 0.6317, + "step": 7192 + }, + { + "epoch": 9.233632862644416, + "grad_norm": 3.0905404090881348, + "learning_rate": 2.7681643132220796e-05, + "loss": 0.5668, + "step": 7193 + }, + { + "epoch": 9.234916559691912, + "grad_norm": 1.2853527069091797, + "learning_rate": 2.7685494223363288e-05, + "loss": 0.6109, + "step": 7194 + }, + { + "epoch": 9.23620025673941, + "grad_norm": 1.1878011226654053, + "learning_rate": 2.768934531450578e-05, + "loss": 0.6169, + "step": 7195 + }, + { + "epoch": 9.237483953786906, + "grad_norm": 2.6149775981903076, + "learning_rate": 2.7693196405648266e-05, + "loss": 0.6857, + "step": 7196 + }, + { + "epoch": 9.238767650834403, + "grad_norm": 1.0192310810089111, + "learning_rate": 2.7697047496790757e-05, + "loss": 0.6098, + "step": 7197 + 
}, + { + "epoch": 9.2400513478819, + "grad_norm": 1.3674418926239014, + "learning_rate": 2.770089858793325e-05, + "loss": 0.6024, + "step": 7198 + }, + { + "epoch": 9.241335044929397, + "grad_norm": 1.5895010232925415, + "learning_rate": 2.770474967907574e-05, + "loss": 0.6267, + "step": 7199 + }, + { + "epoch": 9.242618741976893, + "grad_norm": 3.0471925735473633, + "learning_rate": 2.770860077021823e-05, + "loss": 0.5887, + "step": 7200 + }, + { + "epoch": 9.24390243902439, + "grad_norm": 5.357485294342041, + "learning_rate": 2.7712451861360718e-05, + "loss": 0.6296, + "step": 7201 + }, + { + "epoch": 9.245186136071887, + "grad_norm": 1.1045186519622803, + "learning_rate": 2.771630295250321e-05, + "loss": 0.5666, + "step": 7202 + }, + { + "epoch": 9.246469833119384, + "grad_norm": 1.32386314868927, + "learning_rate": 2.77201540436457e-05, + "loss": 0.6991, + "step": 7203 + }, + { + "epoch": 9.24775353016688, + "grad_norm": 2.8398478031158447, + "learning_rate": 2.7724005134788188e-05, + "loss": 0.5808, + "step": 7204 + }, + { + "epoch": 9.249037227214377, + "grad_norm": 4.659090518951416, + "learning_rate": 2.7727856225930683e-05, + "loss": 0.6378, + "step": 7205 + }, + { + "epoch": 9.250320924261874, + "grad_norm": 2.6100096702575684, + "learning_rate": 2.7731707317073174e-05, + "loss": 0.6925, + "step": 7206 + }, + { + "epoch": 9.25160462130937, + "grad_norm": 1.934584379196167, + "learning_rate": 2.773555840821566e-05, + "loss": 0.6511, + "step": 7207 + }, + { + "epoch": 9.252888318356868, + "grad_norm": 1.565423607826233, + "learning_rate": 2.7739409499358152e-05, + "loss": 0.6588, + "step": 7208 + }, + { + "epoch": 9.254172015404365, + "grad_norm": 8.882234573364258, + "learning_rate": 2.774326059050064e-05, + "loss": 0.8132, + "step": 7209 + }, + { + "epoch": 9.25545571245186, + "grad_norm": 3.3784170150756836, + "learning_rate": 2.774711168164313e-05, + "loss": 0.74, + "step": 7210 + }, + { + "epoch": 9.256739409499358, + "grad_norm": 3.8123691082000732, + 
"learning_rate": 2.7750962772785626e-05, + "loss": 0.8316, + "step": 7211 + }, + { + "epoch": 9.258023106546855, + "grad_norm": 1.8554387092590332, + "learning_rate": 2.7754813863928113e-05, + "loss": 0.5584, + "step": 7212 + }, + { + "epoch": 9.259306803594352, + "grad_norm": 9.238286018371582, + "learning_rate": 2.7758664955070604e-05, + "loss": 0.5642, + "step": 7213 + }, + { + "epoch": 9.260590500641849, + "grad_norm": 1.7904034852981567, + "learning_rate": 2.7762516046213095e-05, + "loss": 0.5931, + "step": 7214 + }, + { + "epoch": 9.261874197689345, + "grad_norm": 1.5476163625717163, + "learning_rate": 2.7766367137355583e-05, + "loss": 0.5578, + "step": 7215 + }, + { + "epoch": 9.263157894736842, + "grad_norm": 2.1296162605285645, + "learning_rate": 2.7770218228498074e-05, + "loss": 0.5921, + "step": 7216 + }, + { + "epoch": 9.264441591784339, + "grad_norm": 1.6973578929901123, + "learning_rate": 2.777406931964057e-05, + "loss": 0.5708, + "step": 7217 + }, + { + "epoch": 9.265725288831836, + "grad_norm": 1.766574501991272, + "learning_rate": 2.7777920410783056e-05, + "loss": 0.5714, + "step": 7218 + }, + { + "epoch": 9.267008985879333, + "grad_norm": 3.106419563293457, + "learning_rate": 2.7781771501925547e-05, + "loss": 0.593, + "step": 7219 + }, + { + "epoch": 9.268292682926829, + "grad_norm": 1.7830631732940674, + "learning_rate": 2.7785622593068035e-05, + "loss": 0.5778, + "step": 7220 + }, + { + "epoch": 9.269576379974326, + "grad_norm": 1.1127341985702515, + "learning_rate": 2.7789473684210526e-05, + "loss": 0.5896, + "step": 7221 + }, + { + "epoch": 9.270860077021823, + "grad_norm": 1.8366575241088867, + "learning_rate": 2.7793324775353017e-05, + "loss": 0.527, + "step": 7222 + }, + { + "epoch": 9.27214377406932, + "grad_norm": 23.721935272216797, + "learning_rate": 2.7797175866495508e-05, + "loss": 0.612, + "step": 7223 + }, + { + "epoch": 9.273427471116817, + "grad_norm": 5.075152397155762, + "learning_rate": 2.7801026957638e-05, + "loss": 0.6096, + 
"step": 7224 + }, + { + "epoch": 9.274711168164313, + "grad_norm": 2.9460394382476807, + "learning_rate": 2.780487804878049e-05, + "loss": 0.5688, + "step": 7225 + }, + { + "epoch": 9.27599486521181, + "grad_norm": 1.4405736923217773, + "learning_rate": 2.7808729139922978e-05, + "loss": 0.5478, + "step": 7226 + }, + { + "epoch": 9.277278562259307, + "grad_norm": 3.315528631210327, + "learning_rate": 2.781258023106547e-05, + "loss": 0.6406, + "step": 7227 + }, + { + "epoch": 9.278562259306804, + "grad_norm": 2.049231767654419, + "learning_rate": 2.7816431322207957e-05, + "loss": 0.5895, + "step": 7228 + }, + { + "epoch": 9.2798459563543, + "grad_norm": 3.3997042179107666, + "learning_rate": 2.782028241335045e-05, + "loss": 0.5591, + "step": 7229 + }, + { + "epoch": 9.281129653401797, + "grad_norm": 1.5953267812728882, + "learning_rate": 2.7824133504492942e-05, + "loss": 0.6313, + "step": 7230 + }, + { + "epoch": 9.282413350449294, + "grad_norm": 1.7560625076293945, + "learning_rate": 2.782798459563543e-05, + "loss": 0.626, + "step": 7231 + }, + { + "epoch": 9.28369704749679, + "grad_norm": 1.973035216331482, + "learning_rate": 2.783183568677792e-05, + "loss": 0.5718, + "step": 7232 + }, + { + "epoch": 9.284980744544288, + "grad_norm": 1.8138940334320068, + "learning_rate": 2.7835686777920412e-05, + "loss": 0.5691, + "step": 7233 + }, + { + "epoch": 9.286264441591785, + "grad_norm": 1.643674612045288, + "learning_rate": 2.78395378690629e-05, + "loss": 0.5855, + "step": 7234 + }, + { + "epoch": 9.28754813863928, + "grad_norm": 1.8084936141967773, + "learning_rate": 2.784338896020539e-05, + "loss": 0.5469, + "step": 7235 + }, + { + "epoch": 9.288831835686779, + "grad_norm": 2.747246265411377, + "learning_rate": 2.7847240051347885e-05, + "loss": 0.5205, + "step": 7236 + }, + { + "epoch": 9.290115532734275, + "grad_norm": 2.0306763648986816, + "learning_rate": 2.7851091142490373e-05, + "loss": 0.61, + "step": 7237 + }, + { + "epoch": 9.29139922978177, + "grad_norm": 
1.2286548614501953, + "learning_rate": 2.7854942233632864e-05, + "loss": 0.5893, + "step": 7238 + }, + { + "epoch": 9.292682926829269, + "grad_norm": 2.2467081546783447, + "learning_rate": 2.7858793324775352e-05, + "loss": 0.5708, + "step": 7239 + }, + { + "epoch": 9.293966623876765, + "grad_norm": 1.4679346084594727, + "learning_rate": 2.7862644415917843e-05, + "loss": 0.596, + "step": 7240 + }, + { + "epoch": 9.295250320924263, + "grad_norm": 1.2428759336471558, + "learning_rate": 2.7866495507060334e-05, + "loss": 0.6116, + "step": 7241 + }, + { + "epoch": 9.296534017971759, + "grad_norm": 2.826589345932007, + "learning_rate": 2.7870346598202825e-05, + "loss": 0.6265, + "step": 7242 + }, + { + "epoch": 9.297817715019255, + "grad_norm": 4.30914306640625, + "learning_rate": 2.7874197689345316e-05, + "loss": 0.6188, + "step": 7243 + }, + { + "epoch": 9.299101412066753, + "grad_norm": 1.6783027648925781, + "learning_rate": 2.7878048780487807e-05, + "loss": 0.6264, + "step": 7244 + }, + { + "epoch": 9.300385109114249, + "grad_norm": 2.3201711177825928, + "learning_rate": 2.7881899871630295e-05, + "loss": 0.6312, + "step": 7245 + }, + { + "epoch": 9.301668806161747, + "grad_norm": 1.8262715339660645, + "learning_rate": 2.7885750962772786e-05, + "loss": 0.632, + "step": 7246 + }, + { + "epoch": 9.302952503209243, + "grad_norm": 1.0601098537445068, + "learning_rate": 2.7889602053915274e-05, + "loss": 0.6009, + "step": 7247 + }, + { + "epoch": 9.304236200256739, + "grad_norm": 3.425485134124756, + "learning_rate": 2.7893453145057768e-05, + "loss": 0.6443, + "step": 7248 + }, + { + "epoch": 9.305519897304237, + "grad_norm": 1.2711654901504517, + "learning_rate": 2.789730423620026e-05, + "loss": 0.6619, + "step": 7249 + }, + { + "epoch": 9.306803594351733, + "grad_norm": 2.0094363689422607, + "learning_rate": 2.7901155327342747e-05, + "loss": 0.6111, + "step": 7250 + }, + { + "epoch": 9.30808729139923, + "grad_norm": 1.7087862491607666, + "learning_rate": 
2.7905006418485238e-05, + "loss": 0.6041, + "step": 7251 + }, + { + "epoch": 9.309370988446727, + "grad_norm": 1.632084846496582, + "learning_rate": 2.790885750962773e-05, + "loss": 0.6605, + "step": 7252 + }, + { + "epoch": 9.310654685494223, + "grad_norm": 3.640599250793457, + "learning_rate": 2.7912708600770217e-05, + "loss": 0.6386, + "step": 7253 + }, + { + "epoch": 9.31193838254172, + "grad_norm": 1.5478558540344238, + "learning_rate": 2.791655969191271e-05, + "loss": 0.6494, + "step": 7254 + }, + { + "epoch": 9.313222079589217, + "grad_norm": 4.107391834259033, + "learning_rate": 2.7920410783055202e-05, + "loss": 0.6634, + "step": 7255 + }, + { + "epoch": 9.314505776636715, + "grad_norm": 6.010704517364502, + "learning_rate": 2.792426187419769e-05, + "loss": 0.6562, + "step": 7256 + }, + { + "epoch": 9.31578947368421, + "grad_norm": 4.305787086486816, + "learning_rate": 2.792811296534018e-05, + "loss": 0.6651, + "step": 7257 + }, + { + "epoch": 9.317073170731707, + "grad_norm": 10.605380058288574, + "learning_rate": 2.793196405648267e-05, + "loss": 0.6811, + "step": 7258 + }, + { + "epoch": 9.318356867779205, + "grad_norm": 1.5564008951187134, + "learning_rate": 2.793581514762516e-05, + "loss": 0.6573, + "step": 7259 + }, + { + "epoch": 9.3196405648267, + "grad_norm": 8.805770874023438, + "learning_rate": 2.7939666238767654e-05, + "loss": 0.787, + "step": 7260 + }, + { + "epoch": 9.320924261874199, + "grad_norm": 2.7653772830963135, + "learning_rate": 2.7943517329910142e-05, + "loss": 0.8783, + "step": 7261 + }, + { + "epoch": 9.322207958921695, + "grad_norm": 3.7927865982055664, + "learning_rate": 2.7947368421052633e-05, + "loss": 0.5977, + "step": 7262 + }, + { + "epoch": 9.32349165596919, + "grad_norm": 4.074792861938477, + "learning_rate": 2.7951219512195124e-05, + "loss": 0.6231, + "step": 7263 + }, + { + "epoch": 9.324775353016689, + "grad_norm": 1.8486828804016113, + "learning_rate": 2.795507060333761e-05, + "loss": 0.6105, + "step": 7264 + }, + { + 
"epoch": 9.326059050064185, + "grad_norm": 1.1891696453094482, + "learning_rate": 2.7958921694480103e-05, + "loss": 0.6104, + "step": 7265 + }, + { + "epoch": 9.327342747111683, + "grad_norm": 1.1109501123428345, + "learning_rate": 2.7962772785622594e-05, + "loss": 0.5728, + "step": 7266 + }, + { + "epoch": 9.328626444159179, + "grad_norm": 1.727982997894287, + "learning_rate": 2.7966623876765085e-05, + "loss": 0.536, + "step": 7267 + }, + { + "epoch": 9.329910141206675, + "grad_norm": 1.2137314081192017, + "learning_rate": 2.7970474967907576e-05, + "loss": 0.6116, + "step": 7268 + }, + { + "epoch": 9.331193838254173, + "grad_norm": 2.139443874359131, + "learning_rate": 2.7974326059050064e-05, + "loss": 0.565, + "step": 7269 + }, + { + "epoch": 9.332477535301669, + "grad_norm": 2.6631972789764404, + "learning_rate": 2.7978177150192555e-05, + "loss": 0.5471, + "step": 7270 + }, + { + "epoch": 9.333761232349165, + "grad_norm": 4.153201103210449, + "learning_rate": 2.7982028241335046e-05, + "loss": 0.621, + "step": 7271 + }, + { + "epoch": 9.335044929396663, + "grad_norm": 1.4697579145431519, + "learning_rate": 2.7985879332477537e-05, + "loss": 0.5937, + "step": 7272 + }, + { + "epoch": 9.336328626444159, + "grad_norm": 1.3851732015609741, + "learning_rate": 2.7989730423620028e-05, + "loss": 0.5828, + "step": 7273 + }, + { + "epoch": 9.337612323491657, + "grad_norm": 2.4775021076202393, + "learning_rate": 2.7993581514762516e-05, + "loss": 0.5645, + "step": 7274 + }, + { + "epoch": 9.338896020539153, + "grad_norm": 2.198061466217041, + "learning_rate": 2.7997432605905007e-05, + "loss": 0.5703, + "step": 7275 + }, + { + "epoch": 9.340179717586649, + "grad_norm": 1.8232736587524414, + "learning_rate": 2.8001283697047498e-05, + "loss": 0.5715, + "step": 7276 + }, + { + "epoch": 9.341463414634147, + "grad_norm": 1.1655598878860474, + "learning_rate": 2.8005134788189985e-05, + "loss": 0.6298, + "step": 7277 + }, + { + "epoch": 9.342747111681643, + "grad_norm": 
2.2020325660705566, + "learning_rate": 2.800898587933248e-05, + "loss": 0.5304, + "step": 7278 + }, + { + "epoch": 9.34403080872914, + "grad_norm": 1.2245275974273682, + "learning_rate": 2.801283697047497e-05, + "loss": 0.5784, + "step": 7279 + }, + { + "epoch": 9.345314505776637, + "grad_norm": 1.7912994623184204, + "learning_rate": 2.801668806161746e-05, + "loss": 0.5761, + "step": 7280 + }, + { + "epoch": 9.346598202824133, + "grad_norm": 2.1909666061401367, + "learning_rate": 2.802053915275995e-05, + "loss": 0.5672, + "step": 7281 + }, + { + "epoch": 9.34788189987163, + "grad_norm": 1.8913211822509766, + "learning_rate": 2.802439024390244e-05, + "loss": 0.6265, + "step": 7282 + }, + { + "epoch": 9.349165596919127, + "grad_norm": 3.426027536392212, + "learning_rate": 2.802824133504493e-05, + "loss": 0.6075, + "step": 7283 + }, + { + "epoch": 9.350449293966625, + "grad_norm": 1.219950556755066, + "learning_rate": 2.803209242618742e-05, + "loss": 0.5725, + "step": 7284 + }, + { + "epoch": 9.35173299101412, + "grad_norm": 2.1174938678741455, + "learning_rate": 2.803594351732991e-05, + "loss": 0.562, + "step": 7285 + }, + { + "epoch": 9.353016688061617, + "grad_norm": 1.589564323425293, + "learning_rate": 2.80397946084724e-05, + "loss": 0.641, + "step": 7286 + }, + { + "epoch": 9.354300385109115, + "grad_norm": 3.9109389781951904, + "learning_rate": 2.8043645699614893e-05, + "loss": 0.5674, + "step": 7287 + }, + { + "epoch": 9.35558408215661, + "grad_norm": 1.2593196630477905, + "learning_rate": 2.804749679075738e-05, + "loss": 0.5757, + "step": 7288 + }, + { + "epoch": 9.356867779204109, + "grad_norm": 2.083641290664673, + "learning_rate": 2.805134788189987e-05, + "loss": 0.6265, + "step": 7289 + }, + { + "epoch": 9.358151476251605, + "grad_norm": 2.266590118408203, + "learning_rate": 2.8055198973042363e-05, + "loss": 0.6037, + "step": 7290 + }, + { + "epoch": 9.3594351732991, + "grad_norm": 1.9680455923080444, + "learning_rate": 2.8059050064184854e-05, + "loss": 
0.6289, + "step": 7291 + }, + { + "epoch": 9.360718870346599, + "grad_norm": 2.1949286460876465, + "learning_rate": 2.8062901155327345e-05, + "loss": 0.5806, + "step": 7292 + }, + { + "epoch": 9.362002567394095, + "grad_norm": 1.3502309322357178, + "learning_rate": 2.8066752246469832e-05, + "loss": 0.6473, + "step": 7293 + }, + { + "epoch": 9.363286264441593, + "grad_norm": 11.38492202758789, + "learning_rate": 2.8070603337612323e-05, + "loss": 0.6423, + "step": 7294 + }, + { + "epoch": 9.364569961489089, + "grad_norm": 6.973803997039795, + "learning_rate": 2.8074454428754815e-05, + "loss": 0.6242, + "step": 7295 + }, + { + "epoch": 9.365853658536585, + "grad_norm": 2.3265342712402344, + "learning_rate": 2.8078305519897302e-05, + "loss": 0.6723, + "step": 7296 + }, + { + "epoch": 9.367137355584083, + "grad_norm": 1.578226923942566, + "learning_rate": 2.8082156611039797e-05, + "loss": 0.6284, + "step": 7297 + }, + { + "epoch": 9.368421052631579, + "grad_norm": 1.2774732112884521, + "learning_rate": 2.8086007702182288e-05, + "loss": 0.5913, + "step": 7298 + }, + { + "epoch": 9.369704749679077, + "grad_norm": 5.500885486602783, + "learning_rate": 2.8089858793324775e-05, + "loss": 0.6514, + "step": 7299 + }, + { + "epoch": 9.370988446726573, + "grad_norm": 1.3805150985717773, + "learning_rate": 2.8093709884467266e-05, + "loss": 0.6549, + "step": 7300 + }, + { + "epoch": 9.372272143774069, + "grad_norm": 2.6705408096313477, + "learning_rate": 2.8097560975609758e-05, + "loss": 0.5824, + "step": 7301 + }, + { + "epoch": 9.373555840821567, + "grad_norm": 1.6617368459701538, + "learning_rate": 2.8101412066752245e-05, + "loss": 0.6448, + "step": 7302 + }, + { + "epoch": 9.374839537869063, + "grad_norm": 1.6131898164749146, + "learning_rate": 2.810526315789474e-05, + "loss": 0.6061, + "step": 7303 + }, + { + "epoch": 9.376123234916559, + "grad_norm": 2.7623612880706787, + "learning_rate": 2.8109114249037227e-05, + "loss": 0.6424, + "step": 7304 + }, + { + "epoch": 
9.377406931964057, + "grad_norm": 3.323519229888916, + "learning_rate": 2.811296534017972e-05, + "loss": 0.5607, + "step": 7305 + }, + { + "epoch": 9.378690629011553, + "grad_norm": 2.070312261581421, + "learning_rate": 2.811681643132221e-05, + "loss": 0.6556, + "step": 7306 + }, + { + "epoch": 9.37997432605905, + "grad_norm": 2.6531224250793457, + "learning_rate": 2.8120667522464697e-05, + "loss": 0.6843, + "step": 7307 + }, + { + "epoch": 9.381258023106547, + "grad_norm": 1.9519563913345337, + "learning_rate": 2.8124518613607188e-05, + "loss": 0.6581, + "step": 7308 + }, + { + "epoch": 9.382541720154043, + "grad_norm": 3.124575138092041, + "learning_rate": 2.8128369704749683e-05, + "loss": 0.6689, + "step": 7309 + }, + { + "epoch": 9.38382541720154, + "grad_norm": 1.6624736785888672, + "learning_rate": 2.813222079589217e-05, + "loss": 0.7679, + "step": 7310 + }, + { + "epoch": 9.385109114249037, + "grad_norm": 1.6780222654342651, + "learning_rate": 2.813607188703466e-05, + "loss": 0.8377, + "step": 7311 + }, + { + "epoch": 9.386392811296535, + "grad_norm": 1.1892900466918945, + "learning_rate": 2.813992297817715e-05, + "loss": 0.5909, + "step": 7312 + }, + { + "epoch": 9.38767650834403, + "grad_norm": 1.8927936553955078, + "learning_rate": 2.814377406931964e-05, + "loss": 0.5657, + "step": 7313 + }, + { + "epoch": 9.388960205391527, + "grad_norm": 1.6756850481033325, + "learning_rate": 2.814762516046213e-05, + "loss": 0.5699, + "step": 7314 + }, + { + "epoch": 9.390243902439025, + "grad_norm": 0.9743697047233582, + "learning_rate": 2.8151476251604622e-05, + "loss": 0.598, + "step": 7315 + }, + { + "epoch": 9.39152759948652, + "grad_norm": 1.166877269744873, + "learning_rate": 2.8155327342747113e-05, + "loss": 0.6042, + "step": 7316 + }, + { + "epoch": 9.392811296534019, + "grad_norm": 4.882363319396973, + "learning_rate": 2.8159178433889605e-05, + "loss": 0.5625, + "step": 7317 + }, + { + "epoch": 9.394094993581515, + "grad_norm": 1.1143673658370972, + 
"learning_rate": 2.8163029525032092e-05, + "loss": 0.6032, + "step": 7318 + }, + { + "epoch": 9.39537869062901, + "grad_norm": 1.494530439376831, + "learning_rate": 2.8166880616174583e-05, + "loss": 0.5438, + "step": 7319 + }, + { + "epoch": 9.396662387676509, + "grad_norm": 1.4338245391845703, + "learning_rate": 2.817073170731707e-05, + "loss": 0.5653, + "step": 7320 + }, + { + "epoch": 9.397946084724005, + "grad_norm": 1.0561332702636719, + "learning_rate": 2.8174582798459565e-05, + "loss": 0.6091, + "step": 7321 + }, + { + "epoch": 9.399229781771503, + "grad_norm": 2.6029138565063477, + "learning_rate": 2.8178433889602056e-05, + "loss": 0.5783, + "step": 7322 + }, + { + "epoch": 9.400513478818999, + "grad_norm": 7.969789505004883, + "learning_rate": 2.8182284980744544e-05, + "loss": 0.6138, + "step": 7323 + }, + { + "epoch": 9.401797175866495, + "grad_norm": 1.8467649221420288, + "learning_rate": 2.8186136071887035e-05, + "loss": 0.6148, + "step": 7324 + }, + { + "epoch": 9.403080872913993, + "grad_norm": 3.7454071044921875, + "learning_rate": 2.8189987163029526e-05, + "loss": 0.606, + "step": 7325 + }, + { + "epoch": 9.404364569961489, + "grad_norm": 1.406982421875, + "learning_rate": 2.8193838254172014e-05, + "loss": 0.6119, + "step": 7326 + }, + { + "epoch": 9.405648267008987, + "grad_norm": 1.384451985359192, + "learning_rate": 2.819768934531451e-05, + "loss": 0.6061, + "step": 7327 + }, + { + "epoch": 9.406931964056483, + "grad_norm": 1.2406712770462036, + "learning_rate": 2.8201540436457e-05, + "loss": 0.5292, + "step": 7328 + }, + { + "epoch": 9.408215661103979, + "grad_norm": 2.0645482540130615, + "learning_rate": 2.8205391527599487e-05, + "loss": 0.5526, + "step": 7329 + }, + { + "epoch": 9.409499358151477, + "grad_norm": 1.5562351942062378, + "learning_rate": 2.8209242618741978e-05, + "loss": 0.6058, + "step": 7330 + }, + { + "epoch": 9.410783055198973, + "grad_norm": 14.124509811401367, + "learning_rate": 2.8213093709884466e-05, + "loss": 0.6337, + 
"step": 7331 + }, + { + "epoch": 9.41206675224647, + "grad_norm": 1.5728514194488525, + "learning_rate": 2.8216944801026957e-05, + "loss": 0.5674, + "step": 7332 + }, + { + "epoch": 9.413350449293967, + "grad_norm": 1.2324109077453613, + "learning_rate": 2.8220795892169448e-05, + "loss": 0.6256, + "step": 7333 + }, + { + "epoch": 9.414634146341463, + "grad_norm": 1.6099612712860107, + "learning_rate": 2.822464698331194e-05, + "loss": 0.6074, + "step": 7334 + }, + { + "epoch": 9.41591784338896, + "grad_norm": 2.3873462677001953, + "learning_rate": 2.822849807445443e-05, + "loss": 0.5702, + "step": 7335 + }, + { + "epoch": 9.417201540436457, + "grad_norm": 1.7045670747756958, + "learning_rate": 2.823234916559692e-05, + "loss": 0.6394, + "step": 7336 + }, + { + "epoch": 9.418485237483953, + "grad_norm": 1.2937626838684082, + "learning_rate": 2.823620025673941e-05, + "loss": 0.5582, + "step": 7337 + }, + { + "epoch": 9.41976893453145, + "grad_norm": 2.872790575027466, + "learning_rate": 2.82400513478819e-05, + "loss": 0.6084, + "step": 7338 + }, + { + "epoch": 9.421052631578947, + "grad_norm": 1.3479185104370117, + "learning_rate": 2.8243902439024388e-05, + "loss": 0.6416, + "step": 7339 + }, + { + "epoch": 9.422336328626445, + "grad_norm": 1.1920708417892456, + "learning_rate": 2.8247753530166882e-05, + "loss": 0.5954, + "step": 7340 + }, + { + "epoch": 9.42362002567394, + "grad_norm": 1.3143316507339478, + "learning_rate": 2.8251604621309373e-05, + "loss": 0.6235, + "step": 7341 + }, + { + "epoch": 9.424903722721437, + "grad_norm": 1.4822198152542114, + "learning_rate": 2.825545571245186e-05, + "loss": 0.579, + "step": 7342 + }, + { + "epoch": 9.426187419768935, + "grad_norm": 1.0835895538330078, + "learning_rate": 2.8259306803594352e-05, + "loss": 0.5749, + "step": 7343 + }, + { + "epoch": 9.427471116816431, + "grad_norm": 2.3813791275024414, + "learning_rate": 2.8263157894736843e-05, + "loss": 0.5446, + "step": 7344 + }, + { + "epoch": 9.428754813863929, + 
"grad_norm": 1.8391659259796143, + "learning_rate": 2.826700898587933e-05, + "loss": 0.6331, + "step": 7345 + }, + { + "epoch": 9.430038510911425, + "grad_norm": 5.276408672332764, + "learning_rate": 2.8270860077021825e-05, + "loss": 0.5738, + "step": 7346 + }, + { + "epoch": 9.431322207958921, + "grad_norm": 3.730555772781372, + "learning_rate": 2.8274711168164316e-05, + "loss": 0.5591, + "step": 7347 + }, + { + "epoch": 9.432605905006419, + "grad_norm": 1.9607181549072266, + "learning_rate": 2.8278562259306804e-05, + "loss": 0.6168, + "step": 7348 + }, + { + "epoch": 9.433889602053915, + "grad_norm": 1.5693542957305908, + "learning_rate": 2.8282413350449295e-05, + "loss": 0.6038, + "step": 7349 + }, + { + "epoch": 9.435173299101413, + "grad_norm": 2.4255545139312744, + "learning_rate": 2.8286264441591783e-05, + "loss": 0.6636, + "step": 7350 + }, + { + "epoch": 9.436456996148909, + "grad_norm": 2.0137627124786377, + "learning_rate": 2.8290115532734274e-05, + "loss": 0.6077, + "step": 7351 + }, + { + "epoch": 9.437740693196405, + "grad_norm": 2.5281970500946045, + "learning_rate": 2.8293966623876768e-05, + "loss": 0.6194, + "step": 7352 + }, + { + "epoch": 9.439024390243903, + "grad_norm": 2.144260883331299, + "learning_rate": 2.8297817715019256e-05, + "loss": 0.6229, + "step": 7353 + }, + { + "epoch": 9.440308087291399, + "grad_norm": 1.2362968921661377, + "learning_rate": 2.8301668806161747e-05, + "loss": 0.6328, + "step": 7354 + }, + { + "epoch": 9.441591784338897, + "grad_norm": 1.7355375289916992, + "learning_rate": 2.8305519897304238e-05, + "loss": 0.6381, + "step": 7355 + }, + { + "epoch": 9.442875481386393, + "grad_norm": 5.789806842803955, + "learning_rate": 2.8309370988446726e-05, + "loss": 0.6466, + "step": 7356 + }, + { + "epoch": 9.444159178433889, + "grad_norm": 2.1303627490997314, + "learning_rate": 2.8313222079589217e-05, + "loss": 0.6622, + "step": 7357 + }, + { + "epoch": 9.445442875481387, + "grad_norm": 2.0538337230682373, + "learning_rate": 
2.8317073170731708e-05, + "loss": 0.6552, + "step": 7358 + }, + { + "epoch": 9.446726572528883, + "grad_norm": 2.734950065612793, + "learning_rate": 2.83209242618742e-05, + "loss": 0.7863, + "step": 7359 + }, + { + "epoch": 9.44801026957638, + "grad_norm": 2.6157782077789307, + "learning_rate": 2.832477535301669e-05, + "loss": 0.6971, + "step": 7360 + }, + { + "epoch": 9.449293966623877, + "grad_norm": 1.7390707731246948, + "learning_rate": 2.8328626444159178e-05, + "loss": 0.8103, + "step": 7361 + }, + { + "epoch": 9.450577663671373, + "grad_norm": 2.3535280227661133, + "learning_rate": 2.833247753530167e-05, + "loss": 0.552, + "step": 7362 + }, + { + "epoch": 9.45186136071887, + "grad_norm": 1.3782763481140137, + "learning_rate": 2.833632862644416e-05, + "loss": 0.5569, + "step": 7363 + }, + { + "epoch": 9.453145057766367, + "grad_norm": 1.2390522956848145, + "learning_rate": 2.834017971758665e-05, + "loss": 0.6105, + "step": 7364 + }, + { + "epoch": 9.454428754813865, + "grad_norm": 1.0775843858718872, + "learning_rate": 2.8344030808729142e-05, + "loss": 0.6292, + "step": 7365 + }, + { + "epoch": 9.455712451861361, + "grad_norm": 1.44100821018219, + "learning_rate": 2.834788189987163e-05, + "loss": 0.5789, + "step": 7366 + }, + { + "epoch": 9.456996148908857, + "grad_norm": 2.5249855518341064, + "learning_rate": 2.835173299101412e-05, + "loss": 0.5648, + "step": 7367 + }, + { + "epoch": 9.458279845956355, + "grad_norm": 5.695305347442627, + "learning_rate": 2.8355584082156612e-05, + "loss": 0.5744, + "step": 7368 + }, + { + "epoch": 9.459563543003851, + "grad_norm": 3.2860381603240967, + "learning_rate": 2.83594351732991e-05, + "loss": 0.6062, + "step": 7369 + }, + { + "epoch": 9.460847240051347, + "grad_norm": 1.5653092861175537, + "learning_rate": 2.8363286264441594e-05, + "loss": 0.5745, + "step": 7370 + }, + { + "epoch": 9.462130937098845, + "grad_norm": 1.3617030382156372, + "learning_rate": 2.8367137355584085e-05, + "loss": 0.6174, + "step": 7371 + }, + { 
+ "epoch": 9.463414634146341, + "grad_norm": 19.81918716430664, + "learning_rate": 2.8370988446726573e-05, + "loss": 0.5733, + "step": 7372 + }, + { + "epoch": 9.464698331193839, + "grad_norm": 1.4001145362854004, + "learning_rate": 2.8374839537869064e-05, + "loss": 0.5766, + "step": 7373 + }, + { + "epoch": 9.465982028241335, + "grad_norm": 1.4383152723312378, + "learning_rate": 2.8378690629011555e-05, + "loss": 0.542, + "step": 7374 + }, + { + "epoch": 9.467265725288831, + "grad_norm": 1.4637587070465088, + "learning_rate": 2.8382541720154043e-05, + "loss": 0.566, + "step": 7375 + }, + { + "epoch": 9.468549422336329, + "grad_norm": 1.4850956201553345, + "learning_rate": 2.8386392811296537e-05, + "loss": 0.5741, + "step": 7376 + }, + { + "epoch": 9.469833119383825, + "grad_norm": 1.3891726732254028, + "learning_rate": 2.8390243902439025e-05, + "loss": 0.5235, + "step": 7377 + }, + { + "epoch": 9.471116816431323, + "grad_norm": 1.485652208328247, + "learning_rate": 2.8394094993581516e-05, + "loss": 0.6003, + "step": 7378 + }, + { + "epoch": 9.472400513478819, + "grad_norm": 1.5562512874603271, + "learning_rate": 2.8397946084724007e-05, + "loss": 0.5718, + "step": 7379 + }, + { + "epoch": 9.473684210526315, + "grad_norm": 2.5516858100891113, + "learning_rate": 2.8401797175866495e-05, + "loss": 0.5527, + "step": 7380 + }, + { + "epoch": 9.474967907573813, + "grad_norm": 14.26940631866455, + "learning_rate": 2.8405648267008986e-05, + "loss": 0.5929, + "step": 7381 + }, + { + "epoch": 9.476251604621309, + "grad_norm": 1.6638952493667603, + "learning_rate": 2.840949935815148e-05, + "loss": 0.607, + "step": 7382 + }, + { + "epoch": 9.477535301668807, + "grad_norm": 3.7032597064971924, + "learning_rate": 2.8413350449293968e-05, + "loss": 0.6087, + "step": 7383 + }, + { + "epoch": 9.478818998716303, + "grad_norm": 5.341461181640625, + "learning_rate": 2.841720154043646e-05, + "loss": 0.6017, + "step": 7384 + }, + { + "epoch": 9.480102695763799, + "grad_norm": 
2.7551677227020264, + "learning_rate": 2.8421052631578946e-05, + "loss": 0.6084, + "step": 7385 + }, + { + "epoch": 9.481386392811297, + "grad_norm": 1.7584964036941528, + "learning_rate": 2.8424903722721438e-05, + "loss": 0.5833, + "step": 7386 + }, + { + "epoch": 9.482670089858793, + "grad_norm": 27.69982147216797, + "learning_rate": 2.842875481386393e-05, + "loss": 0.6157, + "step": 7387 + }, + { + "epoch": 9.48395378690629, + "grad_norm": 2.1949896812438965, + "learning_rate": 2.8432605905006416e-05, + "loss": 0.6468, + "step": 7388 + }, + { + "epoch": 9.485237483953787, + "grad_norm": 2.805342435836792, + "learning_rate": 2.843645699614891e-05, + "loss": 0.5685, + "step": 7389 + }, + { + "epoch": 9.486521181001283, + "grad_norm": 1.7923475503921509, + "learning_rate": 2.8440308087291402e-05, + "loss": 0.5721, + "step": 7390 + }, + { + "epoch": 9.487804878048781, + "grad_norm": 2.2741270065307617, + "learning_rate": 2.844415917843389e-05, + "loss": 0.6124, + "step": 7391 + }, + { + "epoch": 9.489088575096277, + "grad_norm": 1.1916033029556274, + "learning_rate": 2.844801026957638e-05, + "loss": 0.5767, + "step": 7392 + }, + { + "epoch": 9.490372272143775, + "grad_norm": 4.355412483215332, + "learning_rate": 2.845186136071887e-05, + "loss": 0.5862, + "step": 7393 + }, + { + "epoch": 9.491655969191271, + "grad_norm": 1.6453986167907715, + "learning_rate": 2.845571245186136e-05, + "loss": 0.5653, + "step": 7394 + }, + { + "epoch": 9.492939666238767, + "grad_norm": 6.429062366485596, + "learning_rate": 2.8459563543003854e-05, + "loss": 0.5512, + "step": 7395 + }, + { + "epoch": 9.494223363286265, + "grad_norm": 1.8161792755126953, + "learning_rate": 2.846341463414634e-05, + "loss": 0.5694, + "step": 7396 + }, + { + "epoch": 9.495507060333761, + "grad_norm": 1.1045429706573486, + "learning_rate": 2.8467265725288833e-05, + "loss": 0.6057, + "step": 7397 + }, + { + "epoch": 9.496790757381259, + "grad_norm": 2.7988076210021973, + "learning_rate": 
2.8471116816431324e-05, + "loss": 0.575, + "step": 7398 + }, + { + "epoch": 9.498074454428755, + "grad_norm": 3.288313388824463, + "learning_rate": 2.847496790757381e-05, + "loss": 0.6398, + "step": 7399 + }, + { + "epoch": 9.499358151476251, + "grad_norm": 1.4442218542099, + "learning_rate": 2.8478818998716302e-05, + "loss": 0.571, + "step": 7400 + }, + { + "epoch": 9.500641848523749, + "grad_norm": 2.266737222671509, + "learning_rate": 2.8482670089858797e-05, + "loss": 0.6121, + "step": 7401 + }, + { + "epoch": 9.501925545571245, + "grad_norm": 3.069326162338257, + "learning_rate": 2.8486521181001285e-05, + "loss": 0.6297, + "step": 7402 + }, + { + "epoch": 9.503209242618741, + "grad_norm": 2.5011844635009766, + "learning_rate": 2.8490372272143776e-05, + "loss": 0.6444, + "step": 7403 + }, + { + "epoch": 9.504492939666239, + "grad_norm": 2.195733070373535, + "learning_rate": 2.8494223363286263e-05, + "loss": 0.6589, + "step": 7404 + }, + { + "epoch": 9.505776636713735, + "grad_norm": 4.994268417358398, + "learning_rate": 2.8498074454428754e-05, + "loss": 0.6214, + "step": 7405 + }, + { + "epoch": 9.507060333761233, + "grad_norm": 3.1443982124328613, + "learning_rate": 2.8501925545571245e-05, + "loss": 0.6851, + "step": 7406 + }, + { + "epoch": 9.508344030808729, + "grad_norm": 2.5632450580596924, + "learning_rate": 2.8505776636713736e-05, + "loss": 0.6557, + "step": 7407 + }, + { + "epoch": 9.509627727856225, + "grad_norm": 4.7073259353637695, + "learning_rate": 2.8509627727856228e-05, + "loss": 0.6474, + "step": 7408 + }, + { + "epoch": 9.510911424903723, + "grad_norm": 3.741757392883301, + "learning_rate": 2.851347881899872e-05, + "loss": 0.6726, + "step": 7409 + }, + { + "epoch": 9.512195121951219, + "grad_norm": 2.1231350898742676, + "learning_rate": 2.8517329910141206e-05, + "loss": 0.7612, + "step": 7410 + }, + { + "epoch": 9.513478818998717, + "grad_norm": 2.619237184524536, + "learning_rate": 2.8521181001283697e-05, + "loss": 0.8611, + "step": 7411 + }, + 
{ + "epoch": 9.514762516046213, + "grad_norm": 4.615137100219727, + "learning_rate": 2.8525032092426185e-05, + "loss": 0.5446, + "step": 7412 + }, + { + "epoch": 9.51604621309371, + "grad_norm": 1.2803924083709717, + "learning_rate": 2.852888318356868e-05, + "loss": 0.5558, + "step": 7413 + }, + { + "epoch": 9.517329910141207, + "grad_norm": 2.705687999725342, + "learning_rate": 2.853273427471117e-05, + "loss": 0.5724, + "step": 7414 + }, + { + "epoch": 9.518613607188703, + "grad_norm": 1.104994535446167, + "learning_rate": 2.8536585365853658e-05, + "loss": 0.5644, + "step": 7415 + }, + { + "epoch": 9.519897304236201, + "grad_norm": 1.9327600002288818, + "learning_rate": 2.854043645699615e-05, + "loss": 0.6069, + "step": 7416 + }, + { + "epoch": 9.521181001283697, + "grad_norm": 2.855649709701538, + "learning_rate": 2.854428754813864e-05, + "loss": 0.5365, + "step": 7417 + }, + { + "epoch": 9.522464698331193, + "grad_norm": 2.5198280811309814, + "learning_rate": 2.8548138639281128e-05, + "loss": 0.5777, + "step": 7418 + }, + { + "epoch": 9.523748395378691, + "grad_norm": 2.6860594749450684, + "learning_rate": 2.8551989730423623e-05, + "loss": 0.5936, + "step": 7419 + }, + { + "epoch": 9.525032092426187, + "grad_norm": 0.8097996115684509, + "learning_rate": 2.8555840821566114e-05, + "loss": 0.5726, + "step": 7420 + }, + { + "epoch": 9.526315789473685, + "grad_norm": 1.0884578227996826, + "learning_rate": 2.85596919127086e-05, + "loss": 0.5683, + "step": 7421 + }, + { + "epoch": 9.527599486521181, + "grad_norm": 1.9580135345458984, + "learning_rate": 2.8563543003851092e-05, + "loss": 0.5552, + "step": 7422 + }, + { + "epoch": 9.528883183568677, + "grad_norm": 2.4919843673706055, + "learning_rate": 2.856739409499358e-05, + "loss": 0.6126, + "step": 7423 + }, + { + "epoch": 9.530166880616175, + "grad_norm": 2.125229597091675, + "learning_rate": 2.857124518613607e-05, + "loss": 0.6163, + "step": 7424 + }, + { + "epoch": 9.531450577663671, + "grad_norm": 
1.1462340354919434, + "learning_rate": 2.8575096277278566e-05, + "loss": 0.5732, + "step": 7425 + }, + { + "epoch": 9.532734274711169, + "grad_norm": 0.9003100991249084, + "learning_rate": 2.8578947368421053e-05, + "loss": 0.5481, + "step": 7426 + }, + { + "epoch": 9.534017971758665, + "grad_norm": 1.5745728015899658, + "learning_rate": 2.8582798459563544e-05, + "loss": 0.5818, + "step": 7427 + }, + { + "epoch": 9.535301668806161, + "grad_norm": 1.8527544736862183, + "learning_rate": 2.8586649550706035e-05, + "loss": 0.5256, + "step": 7428 + }, + { + "epoch": 9.536585365853659, + "grad_norm": 3.7941977977752686, + "learning_rate": 2.8590500641848523e-05, + "loss": 0.5282, + "step": 7429 + }, + { + "epoch": 9.537869062901155, + "grad_norm": 1.1779918670654297, + "learning_rate": 2.8594351732991014e-05, + "loss": 0.5536, + "step": 7430 + }, + { + "epoch": 9.539152759948653, + "grad_norm": 2.7725954055786133, + "learning_rate": 2.8598202824133505e-05, + "loss": 0.572, + "step": 7431 + }, + { + "epoch": 9.540436456996149, + "grad_norm": 2.475558042526245, + "learning_rate": 2.8602053915275996e-05, + "loss": 0.5296, + "step": 7432 + }, + { + "epoch": 9.541720154043645, + "grad_norm": 2.151434898376465, + "learning_rate": 2.8605905006418487e-05, + "loss": 0.5635, + "step": 7433 + }, + { + "epoch": 9.543003851091143, + "grad_norm": 4.937005043029785, + "learning_rate": 2.8609756097560975e-05, + "loss": 0.5286, + "step": 7434 + }, + { + "epoch": 9.544287548138639, + "grad_norm": 2.4676883220672607, + "learning_rate": 2.8613607188703466e-05, + "loss": 0.535, + "step": 7435 + }, + { + "epoch": 9.545571245186135, + "grad_norm": 1.7277873754501343, + "learning_rate": 2.8617458279845957e-05, + "loss": 0.5885, + "step": 7436 + }, + { + "epoch": 9.546854942233633, + "grad_norm": 1.3617573976516724, + "learning_rate": 2.8621309370988448e-05, + "loss": 0.5827, + "step": 7437 + }, + { + "epoch": 9.54813863928113, + "grad_norm": 4.366010665893555, + "learning_rate": 
2.862516046213094e-05, + "loss": 0.6009, + "step": 7438 + }, + { + "epoch": 9.549422336328627, + "grad_norm": 1.5130022764205933, + "learning_rate": 2.862901155327343e-05, + "loss": 0.617, + "step": 7439 + }, + { + "epoch": 9.550706033376123, + "grad_norm": 2.4723989963531494, + "learning_rate": 2.8632862644415918e-05, + "loss": 0.6296, + "step": 7440 + }, + { + "epoch": 9.55198973042362, + "grad_norm": 4.509926795959473, + "learning_rate": 2.863671373555841e-05, + "loss": 0.5557, + "step": 7441 + }, + { + "epoch": 9.553273427471117, + "grad_norm": 1.794732689857483, + "learning_rate": 2.8640564826700897e-05, + "loss": 0.5723, + "step": 7442 + }, + { + "epoch": 9.554557124518613, + "grad_norm": 10.025986671447754, + "learning_rate": 2.8644415917843388e-05, + "loss": 0.5949, + "step": 7443 + }, + { + "epoch": 9.555840821566111, + "grad_norm": 4.745639324188232, + "learning_rate": 2.8648267008985882e-05, + "loss": 0.6493, + "step": 7444 + }, + { + "epoch": 9.557124518613607, + "grad_norm": 1.804924726486206, + "learning_rate": 2.865211810012837e-05, + "loss": 0.6354, + "step": 7445 + }, + { + "epoch": 9.558408215661103, + "grad_norm": 1.4586553573608398, + "learning_rate": 2.865596919127086e-05, + "loss": 0.6722, + "step": 7446 + }, + { + "epoch": 9.559691912708601, + "grad_norm": 2.6234824657440186, + "learning_rate": 2.8659820282413352e-05, + "loss": 0.6178, + "step": 7447 + }, + { + "epoch": 9.560975609756097, + "grad_norm": 4.309379577636719, + "learning_rate": 2.866367137355584e-05, + "loss": 0.6248, + "step": 7448 + }, + { + "epoch": 9.562259306803595, + "grad_norm": 10.540453910827637, + "learning_rate": 2.866752246469833e-05, + "loss": 0.6992, + "step": 7449 + }, + { + "epoch": 9.563543003851091, + "grad_norm": 2.183896064758301, + "learning_rate": 2.8671373555840822e-05, + "loss": 0.572, + "step": 7450 + }, + { + "epoch": 9.564826700898587, + "grad_norm": 1.936349630355835, + "learning_rate": 2.8675224646983313e-05, + "loss": 0.6164, + "step": 7451 + }, + { 
+ "epoch": 9.566110397946085, + "grad_norm": 2.078641414642334, + "learning_rate": 2.8679075738125804e-05, + "loss": 0.6465, + "step": 7452 + }, + { + "epoch": 9.567394094993581, + "grad_norm": 5.52513313293457, + "learning_rate": 2.8682926829268292e-05, + "loss": 0.6514, + "step": 7453 + }, + { + "epoch": 9.568677792041079, + "grad_norm": 4.445849895477295, + "learning_rate": 2.8686777920410783e-05, + "loss": 0.6332, + "step": 7454 + }, + { + "epoch": 9.569961489088575, + "grad_norm": 1.925753116607666, + "learning_rate": 2.8690629011553274e-05, + "loss": 0.6691, + "step": 7455 + }, + { + "epoch": 9.571245186136071, + "grad_norm": 6.54870080947876, + "learning_rate": 2.8694480102695765e-05, + "loss": 0.6384, + "step": 7456 + }, + { + "epoch": 9.572528883183569, + "grad_norm": 8.071739196777344, + "learning_rate": 2.8698331193838256e-05, + "loss": 0.6529, + "step": 7457 + }, + { + "epoch": 9.573812580231065, + "grad_norm": 3.6226675510406494, + "learning_rate": 2.8702182284980747e-05, + "loss": 0.6439, + "step": 7458 + }, + { + "epoch": 9.575096277278563, + "grad_norm": 2.9893319606781006, + "learning_rate": 2.8706033376123235e-05, + "loss": 0.6868, + "step": 7459 + }, + { + "epoch": 9.57637997432606, + "grad_norm": 3.719310998916626, + "learning_rate": 2.8709884467265726e-05, + "loss": 0.6651, + "step": 7460 + }, + { + "epoch": 9.577663671373555, + "grad_norm": 2.8794896602630615, + "learning_rate": 2.8713735558408214e-05, + "loss": 0.8664, + "step": 7461 + }, + { + "epoch": 9.578947368421053, + "grad_norm": 2.4580540657043457, + "learning_rate": 2.8717586649550708e-05, + "loss": 0.5616, + "step": 7462 + }, + { + "epoch": 9.58023106546855, + "grad_norm": 1.9266711473464966, + "learning_rate": 2.87214377406932e-05, + "loss": 0.5713, + "step": 7463 + }, + { + "epoch": 9.581514762516047, + "grad_norm": 1.494003176689148, + "learning_rate": 2.8725288831835687e-05, + "loss": 0.5744, + "step": 7464 + }, + { + "epoch": 9.582798459563543, + "grad_norm": 
1.4545856714248657, + "learning_rate": 2.8729139922978178e-05, + "loss": 0.6126, + "step": 7465 + }, + { + "epoch": 9.58408215661104, + "grad_norm": 2.187311887741089, + "learning_rate": 2.873299101412067e-05, + "loss": 0.59, + "step": 7466 + }, + { + "epoch": 9.585365853658537, + "grad_norm": 3.8943843841552734, + "learning_rate": 2.8736842105263157e-05, + "loss": 0.6105, + "step": 7467 + }, + { + "epoch": 9.586649550706033, + "grad_norm": 0.9273067116737366, + "learning_rate": 2.874069319640565e-05, + "loss": 0.56, + "step": 7468 + }, + { + "epoch": 9.58793324775353, + "grad_norm": 2.349015474319458, + "learning_rate": 2.874454428754814e-05, + "loss": 0.5446, + "step": 7469 + }, + { + "epoch": 9.589216944801027, + "grad_norm": 0.8245199918746948, + "learning_rate": 2.874839537869063e-05, + "loss": 0.5623, + "step": 7470 + }, + { + "epoch": 9.590500641848523, + "grad_norm": 1.8155924081802368, + "learning_rate": 2.875224646983312e-05, + "loss": 0.5647, + "step": 7471 + }, + { + "epoch": 9.591784338896021, + "grad_norm": 7.9835309982299805, + "learning_rate": 2.875609756097561e-05, + "loss": 0.6095, + "step": 7472 + }, + { + "epoch": 9.593068035943517, + "grad_norm": 1.533265233039856, + "learning_rate": 2.87599486521181e-05, + "loss": 0.6107, + "step": 7473 + }, + { + "epoch": 9.594351732991013, + "grad_norm": 7.713414192199707, + "learning_rate": 2.8763799743260594e-05, + "loss": 0.5435, + "step": 7474 + }, + { + "epoch": 9.595635430038511, + "grad_norm": 1.174123764038086, + "learning_rate": 2.8767650834403082e-05, + "loss": 0.6328, + "step": 7475 + }, + { + "epoch": 9.596919127086007, + "grad_norm": 3.8512604236602783, + "learning_rate": 2.8771501925545573e-05, + "loss": 0.532, + "step": 7476 + }, + { + "epoch": 9.598202824133505, + "grad_norm": 1.8198260068893433, + "learning_rate": 2.877535301668806e-05, + "loss": 0.5703, + "step": 7477 + }, + { + "epoch": 9.599486521181001, + "grad_norm": 1.1401642560958862, + "learning_rate": 2.877920410783055e-05, + 
"loss": 0.5849, + "step": 7478 + }, + { + "epoch": 9.600770218228497, + "grad_norm": 3.7112174034118652, + "learning_rate": 2.8783055198973043e-05, + "loss": 0.5957, + "step": 7479 + }, + { + "epoch": 9.602053915275995, + "grad_norm": 1.3226906061172485, + "learning_rate": 2.8786906290115534e-05, + "loss": 0.5724, + "step": 7480 + }, + { + "epoch": 9.603337612323491, + "grad_norm": 3.3548433780670166, + "learning_rate": 2.8790757381258025e-05, + "loss": 0.583, + "step": 7481 + }, + { + "epoch": 9.60462130937099, + "grad_norm": 1.3569276332855225, + "learning_rate": 2.8794608472400516e-05, + "loss": 0.5775, + "step": 7482 + }, + { + "epoch": 9.605905006418485, + "grad_norm": 1.2581175565719604, + "learning_rate": 2.8798459563543004e-05, + "loss": 0.5981, + "step": 7483 + }, + { + "epoch": 9.607188703465981, + "grad_norm": 1.8401018381118774, + "learning_rate": 2.8802310654685495e-05, + "loss": 0.5589, + "step": 7484 + }, + { + "epoch": 9.60847240051348, + "grad_norm": 1.9402296543121338, + "learning_rate": 2.8806161745827986e-05, + "loss": 0.5654, + "step": 7485 + }, + { + "epoch": 9.609756097560975, + "grad_norm": 1.6424696445465088, + "learning_rate": 2.8810012836970477e-05, + "loss": 0.5871, + "step": 7486 + }, + { + "epoch": 9.611039794608473, + "grad_norm": 1.3144515752792358, + "learning_rate": 2.8813863928112968e-05, + "loss": 0.5863, + "step": 7487 + }, + { + "epoch": 9.61232349165597, + "grad_norm": 11.15943431854248, + "learning_rate": 2.8817715019255456e-05, + "loss": 0.569, + "step": 7488 + }, + { + "epoch": 9.613607188703465, + "grad_norm": 1.2140620946884155, + "learning_rate": 2.8821566110397947e-05, + "loss": 0.5754, + "step": 7489 + }, + { + "epoch": 9.614890885750963, + "grad_norm": 6.5595502853393555, + "learning_rate": 2.8825417201540438e-05, + "loss": 0.5824, + "step": 7490 + }, + { + "epoch": 9.61617458279846, + "grad_norm": 1.1735512018203735, + "learning_rate": 2.8829268292682925e-05, + "loss": 0.6394, + "step": 7491 + }, + { + "epoch": 
9.617458279845957, + "grad_norm": 2.4172723293304443, + "learning_rate": 2.8833119383825416e-05, + "loss": 0.6004, + "step": 7492 + }, + { + "epoch": 9.618741976893453, + "grad_norm": 0.9471763372421265, + "learning_rate": 2.883697047496791e-05, + "loss": 0.6172, + "step": 7493 + }, + { + "epoch": 9.62002567394095, + "grad_norm": 1.3193501234054565, + "learning_rate": 2.88408215661104e-05, + "loss": 0.6162, + "step": 7494 + }, + { + "epoch": 9.621309370988447, + "grad_norm": 2.337374687194824, + "learning_rate": 2.884467265725289e-05, + "loss": 0.5707, + "step": 7495 + }, + { + "epoch": 9.622593068035943, + "grad_norm": 4.890906810760498, + "learning_rate": 2.8848523748395377e-05, + "loss": 0.6181, + "step": 7496 + }, + { + "epoch": 9.623876765083441, + "grad_norm": 1.4448436498641968, + "learning_rate": 2.885237483953787e-05, + "loss": 0.5523, + "step": 7497 + }, + { + "epoch": 9.625160462130937, + "grad_norm": 10.259325981140137, + "learning_rate": 2.885622593068036e-05, + "loss": 0.6009, + "step": 7498 + }, + { + "epoch": 9.626444159178433, + "grad_norm": 1.2634741067886353, + "learning_rate": 2.886007702182285e-05, + "loss": 0.5574, + "step": 7499 + }, + { + "epoch": 9.627727856225931, + "grad_norm": 6.846724033355713, + "learning_rate": 2.886392811296534e-05, + "loss": 0.6387, + "step": 7500 + }, + { + "epoch": 9.629011553273427, + "grad_norm": 1.0786972045898438, + "learning_rate": 2.8867779204107833e-05, + "loss": 0.617, + "step": 7501 + }, + { + "epoch": 9.630295250320923, + "grad_norm": 1.8734084367752075, + "learning_rate": 2.887163029525032e-05, + "loss": 0.6288, + "step": 7502 + }, + { + "epoch": 9.631578947368421, + "grad_norm": 2.3589117527008057, + "learning_rate": 2.887548138639281e-05, + "loss": 0.6386, + "step": 7503 + }, + { + "epoch": 9.632862644415917, + "grad_norm": 11.554356575012207, + "learning_rate": 2.8879332477535303e-05, + "loss": 0.5823, + "step": 7504 + }, + { + "epoch": 9.634146341463415, + "grad_norm": 5.474252223968506, + 
"learning_rate": 2.8883183568677794e-05, + "loss": 0.6117, + "step": 7505 + }, + { + "epoch": 9.635430038510911, + "grad_norm": 4.705224990844727, + "learning_rate": 2.8887034659820285e-05, + "loss": 0.6446, + "step": 7506 + }, + { + "epoch": 9.63671373555841, + "grad_norm": 9.545845985412598, + "learning_rate": 2.8890885750962772e-05, + "loss": 0.6632, + "step": 7507 + }, + { + "epoch": 9.637997432605905, + "grad_norm": 2.1064352989196777, + "learning_rate": 2.8894736842105263e-05, + "loss": 0.693, + "step": 7508 + }, + { + "epoch": 9.639281129653401, + "grad_norm": 1.9070407152175903, + "learning_rate": 2.8898587933247755e-05, + "loss": 0.6772, + "step": 7509 + }, + { + "epoch": 9.6405648267009, + "grad_norm": 1.2770593166351318, + "learning_rate": 2.8902439024390242e-05, + "loss": 0.6922, + "step": 7510 + }, + { + "epoch": 9.641848523748395, + "grad_norm": 3.511870861053467, + "learning_rate": 2.8906290115532737e-05, + "loss": 0.8275, + "step": 7511 + }, + { + "epoch": 9.643132220795891, + "grad_norm": 1.5495898723602295, + "learning_rate": 2.8910141206675228e-05, + "loss": 0.5843, + "step": 7512 + }, + { + "epoch": 9.64441591784339, + "grad_norm": 3.580491542816162, + "learning_rate": 2.8913992297817715e-05, + "loss": 0.5493, + "step": 7513 + }, + { + "epoch": 9.645699614890885, + "grad_norm": 1.4051343202590942, + "learning_rate": 2.8917843388960206e-05, + "loss": 0.552, + "step": 7514 + }, + { + "epoch": 9.646983311938383, + "grad_norm": 3.9680089950561523, + "learning_rate": 2.8921694480102694e-05, + "loss": 0.5801, + "step": 7515 + }, + { + "epoch": 9.64826700898588, + "grad_norm": 2.4636662006378174, + "learning_rate": 2.8925545571245185e-05, + "loss": 0.5423, + "step": 7516 + }, + { + "epoch": 9.649550706033375, + "grad_norm": 2.5450470447540283, + "learning_rate": 2.892939666238768e-05, + "loss": 0.5258, + "step": 7517 + }, + { + "epoch": 9.650834403080873, + "grad_norm": 38.30866241455078, + "learning_rate": 2.8933247753530167e-05, + "loss": 0.5848, + 
"step": 7518 + }, + { + "epoch": 9.65211810012837, + "grad_norm": 1.4406235218048096, + "learning_rate": 2.893709884467266e-05, + "loss": 0.5487, + "step": 7519 + }, + { + "epoch": 9.653401797175867, + "grad_norm": 1.948838472366333, + "learning_rate": 2.894094993581515e-05, + "loss": 0.548, + "step": 7520 + }, + { + "epoch": 9.654685494223363, + "grad_norm": 1.2894542217254639, + "learning_rate": 2.8944801026957637e-05, + "loss": 0.6188, + "step": 7521 + }, + { + "epoch": 9.65596919127086, + "grad_norm": 1.516152024269104, + "learning_rate": 2.8948652118100128e-05, + "loss": 0.5887, + "step": 7522 + }, + { + "epoch": 9.657252888318357, + "grad_norm": 2.088712215423584, + "learning_rate": 2.895250320924262e-05, + "loss": 0.5835, + "step": 7523 + }, + { + "epoch": 9.658536585365853, + "grad_norm": 1.5085866451263428, + "learning_rate": 2.895635430038511e-05, + "loss": 0.604, + "step": 7524 + }, + { + "epoch": 9.659820282413351, + "grad_norm": 7.845776081085205, + "learning_rate": 2.89602053915276e-05, + "loss": 0.5621, + "step": 7525 + }, + { + "epoch": 9.661103979460847, + "grad_norm": 2.0412096977233887, + "learning_rate": 2.896405648267009e-05, + "loss": 0.6377, + "step": 7526 + }, + { + "epoch": 9.662387676508343, + "grad_norm": 1.0943180322647095, + "learning_rate": 2.896790757381258e-05, + "loss": 0.5965, + "step": 7527 + }, + { + "epoch": 9.663671373555841, + "grad_norm": 1.2936432361602783, + "learning_rate": 2.897175866495507e-05, + "loss": 0.573, + "step": 7528 + }, + { + "epoch": 9.664955070603337, + "grad_norm": 1.51963472366333, + "learning_rate": 2.8975609756097562e-05, + "loss": 0.5635, + "step": 7529 + }, + { + "epoch": 9.666238767650835, + "grad_norm": 1.127040147781372, + "learning_rate": 2.8979460847240053e-05, + "loss": 0.5265, + "step": 7530 + }, + { + "epoch": 9.667522464698331, + "grad_norm": 3.471254587173462, + "learning_rate": 2.8983311938382545e-05, + "loss": 0.5588, + "step": 7531 + }, + { + "epoch": 9.668806161745827, + "grad_norm": 
2.5202250480651855, + "learning_rate": 2.8987163029525032e-05, + "loss": 0.5391, + "step": 7532 + }, + { + "epoch": 9.670089858793325, + "grad_norm": 1.1465705633163452, + "learning_rate": 2.8991014120667523e-05, + "loss": 0.5688, + "step": 7533 + }, + { + "epoch": 9.671373555840821, + "grad_norm": 1.1056466102600098, + "learning_rate": 2.899486521181001e-05, + "loss": 0.6109, + "step": 7534 + }, + { + "epoch": 9.672657252888317, + "grad_norm": 1.8872556686401367, + "learning_rate": 2.8998716302952505e-05, + "loss": 0.5803, + "step": 7535 + }, + { + "epoch": 9.673940949935815, + "grad_norm": 2.5133564472198486, + "learning_rate": 2.9002567394094996e-05, + "loss": 0.6172, + "step": 7536 + }, + { + "epoch": 9.675224646983311, + "grad_norm": 2.0213112831115723, + "learning_rate": 2.9006418485237484e-05, + "loss": 0.6121, + "step": 7537 + }, + { + "epoch": 9.67650834403081, + "grad_norm": 3.431143045425415, + "learning_rate": 2.9010269576379975e-05, + "loss": 0.5522, + "step": 7538 + }, + { + "epoch": 9.677792041078305, + "grad_norm": 1.3957265615463257, + "learning_rate": 2.9014120667522466e-05, + "loss": 0.5847, + "step": 7539 + }, + { + "epoch": 9.679075738125803, + "grad_norm": 1.69337797164917, + "learning_rate": 2.9017971758664954e-05, + "loss": 0.556, + "step": 7540 + }, + { + "epoch": 9.6803594351733, + "grad_norm": 2.699404239654541, + "learning_rate": 2.9021822849807445e-05, + "loss": 0.589, + "step": 7541 + }, + { + "epoch": 9.681643132220795, + "grad_norm": 3.2023866176605225, + "learning_rate": 2.9025673940949936e-05, + "loss": 0.5441, + "step": 7542 + }, + { + "epoch": 9.682926829268293, + "grad_norm": 3.4578568935394287, + "learning_rate": 2.9029525032092427e-05, + "loss": 0.511, + "step": 7543 + }, + { + "epoch": 9.68421052631579, + "grad_norm": 2.3963887691497803, + "learning_rate": 2.9033376123234918e-05, + "loss": 0.6064, + "step": 7544 + }, + { + "epoch": 9.685494223363285, + "grad_norm": 1.9851492643356323, + "learning_rate": 
2.9037227214377406e-05, + "loss": 0.5961, + "step": 7545 + }, + { + "epoch": 9.686777920410783, + "grad_norm": 2.201003074645996, + "learning_rate": 2.9041078305519897e-05, + "loss": 0.5944, + "step": 7546 + }, + { + "epoch": 9.68806161745828, + "grad_norm": 3.355062961578369, + "learning_rate": 2.9044929396662388e-05, + "loss": 0.5958, + "step": 7547 + }, + { + "epoch": 9.689345314505777, + "grad_norm": 2.0099754333496094, + "learning_rate": 2.904878048780488e-05, + "loss": 0.5839, + "step": 7548 + }, + { + "epoch": 9.690629011553273, + "grad_norm": 2.885126829147339, + "learning_rate": 2.905263157894737e-05, + "loss": 0.6424, + "step": 7549 + }, + { + "epoch": 9.69191270860077, + "grad_norm": 2.855607271194458, + "learning_rate": 2.905648267008986e-05, + "loss": 0.6204, + "step": 7550 + }, + { + "epoch": 9.693196405648267, + "grad_norm": 2.5407373905181885, + "learning_rate": 2.906033376123235e-05, + "loss": 0.5689, + "step": 7551 + }, + { + "epoch": 9.694480102695763, + "grad_norm": 2.6723055839538574, + "learning_rate": 2.906418485237484e-05, + "loss": 0.6641, + "step": 7552 + }, + { + "epoch": 9.695763799743261, + "grad_norm": 1.6196341514587402, + "learning_rate": 2.9068035943517328e-05, + "loss": 0.6755, + "step": 7553 + }, + { + "epoch": 9.697047496790757, + "grad_norm": 3.100376844406128, + "learning_rate": 2.9071887034659822e-05, + "loss": 0.6273, + "step": 7554 + }, + { + "epoch": 9.698331193838253, + "grad_norm": 2.714604616165161, + "learning_rate": 2.9075738125802313e-05, + "loss": 0.6288, + "step": 7555 + }, + { + "epoch": 9.699614890885751, + "grad_norm": 4.737906455993652, + "learning_rate": 2.90795892169448e-05, + "loss": 0.6418, + "step": 7556 + }, + { + "epoch": 9.700898587933247, + "grad_norm": 2.263519048690796, + "learning_rate": 2.9083440308087292e-05, + "loss": 0.6677, + "step": 7557 + }, + { + "epoch": 9.702182284980745, + "grad_norm": 2.056413173675537, + "learning_rate": 2.9087291399229783e-05, + "loss": 0.6745, + "step": 7558 + }, + { + 
"epoch": 9.703465982028241, + "grad_norm": 10.348434448242188, + "learning_rate": 2.909114249037227e-05, + "loss": 0.6921, + "step": 7559 + }, + { + "epoch": 9.704749679075737, + "grad_norm": 2.3383495807647705, + "learning_rate": 2.9094993581514765e-05, + "loss": 0.7182, + "step": 7560 + }, + { + "epoch": 9.706033376123235, + "grad_norm": 6.09689474105835, + "learning_rate": 2.9098844672657253e-05, + "loss": 0.8718, + "step": 7561 + }, + { + "epoch": 9.707317073170731, + "grad_norm": 3.562337636947632, + "learning_rate": 2.9102695763799744e-05, + "loss": 0.5316, + "step": 7562 + }, + { + "epoch": 9.70860077021823, + "grad_norm": 1.5148613452911377, + "learning_rate": 2.9106546854942235e-05, + "loss": 0.5391, + "step": 7563 + }, + { + "epoch": 9.709884467265725, + "grad_norm": 3.3386435508728027, + "learning_rate": 2.9110397946084723e-05, + "loss": 0.55, + "step": 7564 + }, + { + "epoch": 9.711168164313221, + "grad_norm": 1.1590752601623535, + "learning_rate": 2.9114249037227214e-05, + "loss": 0.5345, + "step": 7565 + }, + { + "epoch": 9.71245186136072, + "grad_norm": 0.9454651474952698, + "learning_rate": 2.9118100128369708e-05, + "loss": 0.5284, + "step": 7566 + }, + { + "epoch": 9.713735558408215, + "grad_norm": 1.6779310703277588, + "learning_rate": 2.9121951219512196e-05, + "loss": 0.5869, + "step": 7567 + }, + { + "epoch": 9.715019255455712, + "grad_norm": 2.577742338180542, + "learning_rate": 2.9125802310654687e-05, + "loss": 0.5526, + "step": 7568 + }, + { + "epoch": 9.71630295250321, + "grad_norm": 1.3604168891906738, + "learning_rate": 2.9129653401797175e-05, + "loss": 0.553, + "step": 7569 + }, + { + "epoch": 9.717586649550706, + "grad_norm": 1.4216986894607544, + "learning_rate": 2.9133504492939666e-05, + "loss": 0.5441, + "step": 7570 + }, + { + "epoch": 9.718870346598203, + "grad_norm": 2.174921751022339, + "learning_rate": 2.9137355584082157e-05, + "loss": 0.573, + "step": 7571 + }, + { + "epoch": 9.7201540436457, + "grad_norm": 1.0831321477890015, + 
"learning_rate": 2.9141206675224648e-05, + "loss": 0.5923, + "step": 7572 + }, + { + "epoch": 9.721437740693197, + "grad_norm": 1.6290905475616455, + "learning_rate": 2.914505776636714e-05, + "loss": 0.5508, + "step": 7573 + }, + { + "epoch": 9.722721437740693, + "grad_norm": 3.425396203994751, + "learning_rate": 2.914890885750963e-05, + "loss": 0.5884, + "step": 7574 + }, + { + "epoch": 9.72400513478819, + "grad_norm": 1.4862034320831299, + "learning_rate": 2.9152759948652118e-05, + "loss": 0.541, + "step": 7575 + }, + { + "epoch": 9.725288831835687, + "grad_norm": 2.393937110900879, + "learning_rate": 2.915661103979461e-05, + "loss": 0.5616, + "step": 7576 + }, + { + "epoch": 9.726572528883183, + "grad_norm": 2.0163772106170654, + "learning_rate": 2.91604621309371e-05, + "loss": 0.5876, + "step": 7577 + }, + { + "epoch": 9.72785622593068, + "grad_norm": 2.53151273727417, + "learning_rate": 2.916431322207959e-05, + "loss": 0.5545, + "step": 7578 + }, + { + "epoch": 9.729139922978177, + "grad_norm": 2.914320468902588, + "learning_rate": 2.9168164313222082e-05, + "loss": 0.597, + "step": 7579 + }, + { + "epoch": 9.730423620025674, + "grad_norm": 1.7314157485961914, + "learning_rate": 2.917201540436457e-05, + "loss": 0.6047, + "step": 7580 + }, + { + "epoch": 9.731707317073171, + "grad_norm": 1.6328026056289673, + "learning_rate": 2.917586649550706e-05, + "loss": 0.611, + "step": 7581 + }, + { + "epoch": 9.732991014120667, + "grad_norm": 1.5556137561798096, + "learning_rate": 2.9179717586649552e-05, + "loss": 0.5788, + "step": 7582 + }, + { + "epoch": 9.734274711168164, + "grad_norm": 8.046784400939941, + "learning_rate": 2.918356867779204e-05, + "loss": 0.5924, + "step": 7583 + }, + { + "epoch": 9.735558408215661, + "grad_norm": 1.5771722793579102, + "learning_rate": 2.9187419768934534e-05, + "loss": 0.5988, + "step": 7584 + }, + { + "epoch": 9.736842105263158, + "grad_norm": 2.4112908840179443, + "learning_rate": 2.9191270860077025e-05, + "loss": 0.5621, + "step": 
7585 + }, + { + "epoch": 9.738125802310655, + "grad_norm": 2.4516360759735107, + "learning_rate": 2.9195121951219513e-05, + "loss": 0.6067, + "step": 7586 + }, + { + "epoch": 9.739409499358151, + "grad_norm": 5.777984619140625, + "learning_rate": 2.9198973042362004e-05, + "loss": 0.6417, + "step": 7587 + }, + { + "epoch": 9.740693196405648, + "grad_norm": 2.3584177494049072, + "learning_rate": 2.920282413350449e-05, + "loss": 0.5941, + "step": 7588 + }, + { + "epoch": 9.741976893453145, + "grad_norm": 1.3757843971252441, + "learning_rate": 2.9206675224646983e-05, + "loss": 0.5026, + "step": 7589 + }, + { + "epoch": 9.743260590500642, + "grad_norm": 3.2073731422424316, + "learning_rate": 2.9210526315789474e-05, + "loss": 0.6152, + "step": 7590 + }, + { + "epoch": 9.74454428754814, + "grad_norm": 2.1311593055725098, + "learning_rate": 2.9214377406931965e-05, + "loss": 0.5985, + "step": 7591 + }, + { + "epoch": 9.745827984595635, + "grad_norm": 3.524055004119873, + "learning_rate": 2.9218228498074456e-05, + "loss": 0.5888, + "step": 7592 + }, + { + "epoch": 9.747111681643132, + "grad_norm": 2.420742988586426, + "learning_rate": 2.9222079589216947e-05, + "loss": 0.6094, + "step": 7593 + }, + { + "epoch": 9.74839537869063, + "grad_norm": 2.0969178676605225, + "learning_rate": 2.9225930680359435e-05, + "loss": 0.6044, + "step": 7594 + }, + { + "epoch": 9.749679075738126, + "grad_norm": 4.144533157348633, + "learning_rate": 2.9229781771501926e-05, + "loss": 0.607, + "step": 7595 + }, + { + "epoch": 9.750962772785623, + "grad_norm": 1.883076786994934, + "learning_rate": 2.9233632862644417e-05, + "loss": 0.5646, + "step": 7596 + }, + { + "epoch": 9.75224646983312, + "grad_norm": 2.9105095863342285, + "learning_rate": 2.9237483953786908e-05, + "loss": 0.5977, + "step": 7597 + }, + { + "epoch": 9.753530166880616, + "grad_norm": 2.2902750968933105, + "learning_rate": 2.92413350449294e-05, + "loss": 0.6757, + "step": 7598 + }, + { + "epoch": 9.754813863928113, + "grad_norm": 
3.2084708213806152, + "learning_rate": 2.9245186136071886e-05, + "loss": 0.6088, + "step": 7599 + }, + { + "epoch": 9.75609756097561, + "grad_norm": 3.8483970165252686, + "learning_rate": 2.9249037227214378e-05, + "loss": 0.5641, + "step": 7600 + }, + { + "epoch": 9.757381258023106, + "grad_norm": 4.549903869628906, + "learning_rate": 2.925288831835687e-05, + "loss": 0.6035, + "step": 7601 + }, + { + "epoch": 9.758664955070603, + "grad_norm": 3.8758974075317383, + "learning_rate": 2.9256739409499356e-05, + "loss": 0.5877, + "step": 7602 + }, + { + "epoch": 9.7599486521181, + "grad_norm": 3.267835855484009, + "learning_rate": 2.926059050064185e-05, + "loss": 0.5801, + "step": 7603 + }, + { + "epoch": 9.761232349165597, + "grad_norm": 2.846135377883911, + "learning_rate": 2.9264441591784342e-05, + "loss": 0.6774, + "step": 7604 + }, + { + "epoch": 9.762516046213094, + "grad_norm": 5.752248764038086, + "learning_rate": 2.926829268292683e-05, + "loss": 0.6526, + "step": 7605 + }, + { + "epoch": 9.763799743260591, + "grad_norm": 3.357668399810791, + "learning_rate": 2.927214377406932e-05, + "loss": 0.6497, + "step": 7606 + }, + { + "epoch": 9.765083440308088, + "grad_norm": 3.875337600708008, + "learning_rate": 2.9275994865211808e-05, + "loss": 0.6521, + "step": 7607 + }, + { + "epoch": 9.766367137355584, + "grad_norm": 1.8715870380401611, + "learning_rate": 2.92798459563543e-05, + "loss": 0.6818, + "step": 7608 + }, + { + "epoch": 9.767650834403081, + "grad_norm": 2.637373208999634, + "learning_rate": 2.9283697047496794e-05, + "loss": 0.7382, + "step": 7609 + }, + { + "epoch": 9.768934531450578, + "grad_norm": 1.5111489295959473, + "learning_rate": 2.928754813863928e-05, + "loss": 0.7478, + "step": 7610 + }, + { + "epoch": 9.770218228498074, + "grad_norm": 2.580763578414917, + "learning_rate": 2.9291399229781773e-05, + "loss": 0.9414, + "step": 7611 + }, + { + "epoch": 9.771501925545572, + "grad_norm": 1.728129267692566, + "learning_rate": 2.9295250320924264e-05, + 
"loss": 0.5456, + "step": 7612 + }, + { + "epoch": 9.772785622593068, + "grad_norm": 2.812358856201172, + "learning_rate": 2.929910141206675e-05, + "loss": 0.5342, + "step": 7613 + }, + { + "epoch": 9.774069319640565, + "grad_norm": 6.633712291717529, + "learning_rate": 2.9302952503209242e-05, + "loss": 0.5849, + "step": 7614 + }, + { + "epoch": 9.775353016688062, + "grad_norm": 2.2682034969329834, + "learning_rate": 2.9306803594351737e-05, + "loss": 0.5901, + "step": 7615 + }, + { + "epoch": 9.776636713735558, + "grad_norm": 1.2562298774719238, + "learning_rate": 2.9310654685494225e-05, + "loss": 0.5873, + "step": 7616 + }, + { + "epoch": 9.777920410783056, + "grad_norm": 1.6220861673355103, + "learning_rate": 2.9314505776636716e-05, + "loss": 0.5408, + "step": 7617 + }, + { + "epoch": 9.779204107830552, + "grad_norm": 1.547340989112854, + "learning_rate": 2.9318356867779203e-05, + "loss": 0.5783, + "step": 7618 + }, + { + "epoch": 9.78048780487805, + "grad_norm": 2.334960699081421, + "learning_rate": 2.9322207958921694e-05, + "loss": 0.581, + "step": 7619 + }, + { + "epoch": 9.781771501925546, + "grad_norm": 1.560645580291748, + "learning_rate": 2.9326059050064185e-05, + "loss": 0.5491, + "step": 7620 + }, + { + "epoch": 9.783055198973042, + "grad_norm": 5.299452304840088, + "learning_rate": 2.9329910141206676e-05, + "loss": 0.6228, + "step": 7621 + }, + { + "epoch": 9.78433889602054, + "grad_norm": 1.2317554950714111, + "learning_rate": 2.9333761232349168e-05, + "loss": 0.5416, + "step": 7622 + }, + { + "epoch": 9.785622593068036, + "grad_norm": 2.797990560531616, + "learning_rate": 2.933761232349166e-05, + "loss": 0.5505, + "step": 7623 + }, + { + "epoch": 9.786906290115533, + "grad_norm": 3.883638858795166, + "learning_rate": 2.9341463414634146e-05, + "loss": 0.5412, + "step": 7624 + }, + { + "epoch": 9.78818998716303, + "grad_norm": 1.4718095064163208, + "learning_rate": 2.9345314505776637e-05, + "loss": 0.5953, + "step": 7625 + }, + { + "epoch": 
9.789473684210526, + "grad_norm": 0.9851099848747253, + "learning_rate": 2.9349165596919125e-05, + "loss": 0.6048, + "step": 7626 + }, + { + "epoch": 9.790757381258024, + "grad_norm": 1.7296513319015503, + "learning_rate": 2.935301668806162e-05, + "loss": 0.5699, + "step": 7627 + }, + { + "epoch": 9.79204107830552, + "grad_norm": 2.972668170928955, + "learning_rate": 2.935686777920411e-05, + "loss": 0.6096, + "step": 7628 + }, + { + "epoch": 9.793324775353017, + "grad_norm": 3.020993232727051, + "learning_rate": 2.9360718870346598e-05, + "loss": 0.5892, + "step": 7629 + }, + { + "epoch": 9.794608472400514, + "grad_norm": 3.1122400760650635, + "learning_rate": 2.936456996148909e-05, + "loss": 0.545, + "step": 7630 + }, + { + "epoch": 9.79589216944801, + "grad_norm": 2.07550048828125, + "learning_rate": 2.936842105263158e-05, + "loss": 0.5878, + "step": 7631 + }, + { + "epoch": 9.797175866495508, + "grad_norm": 14.881814956665039, + "learning_rate": 2.9372272143774068e-05, + "loss": 0.5591, + "step": 7632 + }, + { + "epoch": 9.798459563543004, + "grad_norm": 1.295261263847351, + "learning_rate": 2.9376123234916563e-05, + "loss": 0.5726, + "step": 7633 + }, + { + "epoch": 9.7997432605905, + "grad_norm": 2.259538173675537, + "learning_rate": 2.937997432605905e-05, + "loss": 0.5833, + "step": 7634 + }, + { + "epoch": 9.801026957637998, + "grad_norm": 1.631440281867981, + "learning_rate": 2.938382541720154e-05, + "loss": 0.5748, + "step": 7635 + }, + { + "epoch": 9.802310654685494, + "grad_norm": 2.3660032749176025, + "learning_rate": 2.9387676508344032e-05, + "loss": 0.5438, + "step": 7636 + }, + { + "epoch": 9.803594351732992, + "grad_norm": 6.251654624938965, + "learning_rate": 2.939152759948652e-05, + "loss": 0.5474, + "step": 7637 + }, + { + "epoch": 9.804878048780488, + "grad_norm": 3.022718667984009, + "learning_rate": 2.939537869062901e-05, + "loss": 0.5962, + "step": 7638 + }, + { + "epoch": 9.806161745827985, + "grad_norm": 1.9733787775039673, + 
"learning_rate": 2.9399229781771506e-05, + "loss": 0.5339, + "step": 7639 + }, + { + "epoch": 9.807445442875482, + "grad_norm": 2.067906618118286, + "learning_rate": 2.9403080872913993e-05, + "loss": 0.6116, + "step": 7640 + }, + { + "epoch": 9.808729139922978, + "grad_norm": 4.942114353179932, + "learning_rate": 2.9406931964056484e-05, + "loss": 0.5998, + "step": 7641 + }, + { + "epoch": 9.810012836970476, + "grad_norm": 1.915601372718811, + "learning_rate": 2.9410783055198975e-05, + "loss": 0.5972, + "step": 7642 + }, + { + "epoch": 9.811296534017972, + "grad_norm": 5.560861587524414, + "learning_rate": 2.9414634146341463e-05, + "loss": 0.5734, + "step": 7643 + }, + { + "epoch": 9.812580231065468, + "grad_norm": 1.3689135313034058, + "learning_rate": 2.9418485237483954e-05, + "loss": 0.5857, + "step": 7644 + }, + { + "epoch": 9.813863928112966, + "grad_norm": 6.959369659423828, + "learning_rate": 2.9422336328626442e-05, + "loss": 0.5689, + "step": 7645 + }, + { + "epoch": 9.815147625160462, + "grad_norm": 3.413853168487549, + "learning_rate": 2.9426187419768936e-05, + "loss": 0.5813, + "step": 7646 + }, + { + "epoch": 9.81643132220796, + "grad_norm": 3.457515239715576, + "learning_rate": 2.9430038510911427e-05, + "loss": 0.6042, + "step": 7647 + }, + { + "epoch": 9.817715019255456, + "grad_norm": 15.390674591064453, + "learning_rate": 2.9433889602053915e-05, + "loss": 0.6205, + "step": 7648 + }, + { + "epoch": 9.818998716302952, + "grad_norm": 2.184375047683716, + "learning_rate": 2.9437740693196406e-05, + "loss": 0.6167, + "step": 7649 + }, + { + "epoch": 9.82028241335045, + "grad_norm": 2.2367067337036133, + "learning_rate": 2.9441591784338897e-05, + "loss": 0.6355, + "step": 7650 + }, + { + "epoch": 9.821566110397946, + "grad_norm": 4.338662624359131, + "learning_rate": 2.9445442875481385e-05, + "loss": 0.6413, + "step": 7651 + }, + { + "epoch": 9.822849807445444, + "grad_norm": 1.4378489255905151, + "learning_rate": 2.944929396662388e-05, + "loss": 0.6551, + 
"step": 7652 + }, + { + "epoch": 9.82413350449294, + "grad_norm": 3.9375159740448, + "learning_rate": 2.9453145057766367e-05, + "loss": 0.6331, + "step": 7653 + }, + { + "epoch": 9.825417201540436, + "grad_norm": 1.8004589080810547, + "learning_rate": 2.9456996148908858e-05, + "loss": 0.6479, + "step": 7654 + }, + { + "epoch": 9.826700898587934, + "grad_norm": 1.8550629615783691, + "learning_rate": 2.946084724005135e-05, + "loss": 0.6367, + "step": 7655 + }, + { + "epoch": 9.82798459563543, + "grad_norm": 2.0537328720092773, + "learning_rate": 2.9464698331193837e-05, + "loss": 0.6129, + "step": 7656 + }, + { + "epoch": 9.829268292682928, + "grad_norm": 3.5870542526245117, + "learning_rate": 2.9468549422336328e-05, + "loss": 0.6502, + "step": 7657 + }, + { + "epoch": 9.830551989730424, + "grad_norm": 8.584651947021484, + "learning_rate": 2.9472400513478822e-05, + "loss": 0.6519, + "step": 7658 + }, + { + "epoch": 9.83183568677792, + "grad_norm": 2.657651662826538, + "learning_rate": 2.947625160462131e-05, + "loss": 0.6732, + "step": 7659 + }, + { + "epoch": 9.833119383825418, + "grad_norm": 6.564329624176025, + "learning_rate": 2.94801026957638e-05, + "loss": 0.715, + "step": 7660 + }, + { + "epoch": 9.834403080872914, + "grad_norm": 3.2525434494018555, + "learning_rate": 2.9483953786906292e-05, + "loss": 0.8318, + "step": 7661 + }, + { + "epoch": 9.835686777920412, + "grad_norm": 2.2389297485351562, + "learning_rate": 2.948780487804878e-05, + "loss": 0.6006, + "step": 7662 + }, + { + "epoch": 9.836970474967908, + "grad_norm": 3.7488248348236084, + "learning_rate": 2.949165596919127e-05, + "loss": 0.5576, + "step": 7663 + }, + { + "epoch": 9.838254172015404, + "grad_norm": 2.163660764694214, + "learning_rate": 2.9495507060333762e-05, + "loss": 0.5771, + "step": 7664 + }, + { + "epoch": 9.839537869062902, + "grad_norm": 1.4755291938781738, + "learning_rate": 2.9499358151476253e-05, + "loss": 0.5914, + "step": 7665 + }, + { + "epoch": 9.840821566110398, + "grad_norm": 
1.5871554613113403, + "learning_rate": 2.9503209242618744e-05, + "loss": 0.5502, + "step": 7666 + }, + { + "epoch": 9.842105263157894, + "grad_norm": 3.148895263671875, + "learning_rate": 2.9507060333761232e-05, + "loss": 0.5692, + "step": 7667 + }, + { + "epoch": 9.843388960205392, + "grad_norm": 1.2863221168518066, + "learning_rate": 2.9510911424903723e-05, + "loss": 0.5777, + "step": 7668 + }, + { + "epoch": 9.844672657252888, + "grad_norm": 3.741136312484741, + "learning_rate": 2.9514762516046214e-05, + "loss": 0.6172, + "step": 7669 + }, + { + "epoch": 9.845956354300386, + "grad_norm": 2.7623391151428223, + "learning_rate": 2.9518613607188705e-05, + "loss": 0.5979, + "step": 7670 + }, + { + "epoch": 9.847240051347882, + "grad_norm": 2.458353281021118, + "learning_rate": 2.9522464698331196e-05, + "loss": 0.5814, + "step": 7671 + }, + { + "epoch": 9.84852374839538, + "grad_norm": 3.342623472213745, + "learning_rate": 2.9526315789473684e-05, + "loss": 0.5292, + "step": 7672 + }, + { + "epoch": 9.849807445442876, + "grad_norm": 4.04233980178833, + "learning_rate": 2.9530166880616175e-05, + "loss": 0.5883, + "step": 7673 + }, + { + "epoch": 9.851091142490372, + "grad_norm": 3.0651285648345947, + "learning_rate": 2.9534017971758666e-05, + "loss": 0.5721, + "step": 7674 + }, + { + "epoch": 9.85237483953787, + "grad_norm": 1.9654895067214966, + "learning_rate": 2.9537869062901154e-05, + "loss": 0.5845, + "step": 7675 + }, + { + "epoch": 9.853658536585366, + "grad_norm": 3.7194700241088867, + "learning_rate": 2.9541720154043648e-05, + "loss": 0.5552, + "step": 7676 + }, + { + "epoch": 9.854942233632862, + "grad_norm": 5.04015588760376, + "learning_rate": 2.954557124518614e-05, + "loss": 0.6022, + "step": 7677 + }, + { + "epoch": 9.85622593068036, + "grad_norm": 2.3338944911956787, + "learning_rate": 2.9549422336328627e-05, + "loss": 0.5429, + "step": 7678 + }, + { + "epoch": 9.857509627727856, + "grad_norm": 3.4646995067596436, + "learning_rate": 
2.9553273427471118e-05, + "loss": 0.5541, + "step": 7679 + }, + { + "epoch": 9.858793324775354, + "grad_norm": 1.4408788681030273, + "learning_rate": 2.9557124518613606e-05, + "loss": 0.5472, + "step": 7680 + }, + { + "epoch": 9.86007702182285, + "grad_norm": 1.8001376390457153, + "learning_rate": 2.9560975609756097e-05, + "loss": 0.5983, + "step": 7681 + }, + { + "epoch": 9.861360718870346, + "grad_norm": 2.6836442947387695, + "learning_rate": 2.956482670089859e-05, + "loss": 0.5551, + "step": 7682 + }, + { + "epoch": 9.862644415917844, + "grad_norm": 8.827695846557617, + "learning_rate": 2.956867779204108e-05, + "loss": 0.5195, + "step": 7683 + }, + { + "epoch": 9.86392811296534, + "grad_norm": 6.889891624450684, + "learning_rate": 2.957252888318357e-05, + "loss": 0.5334, + "step": 7684 + }, + { + "epoch": 9.865211810012838, + "grad_norm": 1.2725844383239746, + "learning_rate": 2.957637997432606e-05, + "loss": 0.5696, + "step": 7685 + }, + { + "epoch": 9.866495507060334, + "grad_norm": 2.184451103210449, + "learning_rate": 2.958023106546855e-05, + "loss": 0.5576, + "step": 7686 + }, + { + "epoch": 9.86777920410783, + "grad_norm": 8.96898078918457, + "learning_rate": 2.958408215661104e-05, + "loss": 0.577, + "step": 7687 + }, + { + "epoch": 9.869062901155328, + "grad_norm": 2.5694336891174316, + "learning_rate": 2.9587933247753534e-05, + "loss": 0.6069, + "step": 7688 + }, + { + "epoch": 9.870346598202824, + "grad_norm": 2.2404067516326904, + "learning_rate": 2.9591784338896022e-05, + "loss": 0.5977, + "step": 7689 + }, + { + "epoch": 9.871630295250322, + "grad_norm": 3.3980519771575928, + "learning_rate": 2.9595635430038513e-05, + "loss": 0.6448, + "step": 7690 + }, + { + "epoch": 9.872913992297818, + "grad_norm": 3.8248064517974854, + "learning_rate": 2.9599486521181e-05, + "loss": 0.6043, + "step": 7691 + }, + { + "epoch": 9.874197689345314, + "grad_norm": 2.7840497493743896, + "learning_rate": 2.960333761232349e-05, + "loss": 0.5988, + "step": 7692 + }, + { + 
"epoch": 9.875481386392812, + "grad_norm": 9.785523414611816, + "learning_rate": 2.9607188703465983e-05, + "loss": 0.5506, + "step": 7693 + }, + { + "epoch": 9.876765083440308, + "grad_norm": 5.554495811462402, + "learning_rate": 2.9611039794608474e-05, + "loss": 0.6344, + "step": 7694 + }, + { + "epoch": 9.878048780487806, + "grad_norm": 3.436474561691284, + "learning_rate": 2.9614890885750965e-05, + "loss": 0.6484, + "step": 7695 + }, + { + "epoch": 9.879332477535302, + "grad_norm": 3.096992015838623, + "learning_rate": 2.9618741976893456e-05, + "loss": 0.5955, + "step": 7696 + }, + { + "epoch": 9.880616174582798, + "grad_norm": 2.7166974544525146, + "learning_rate": 2.9622593068035944e-05, + "loss": 0.571, + "step": 7697 + }, + { + "epoch": 9.881899871630296, + "grad_norm": 1.881203532218933, + "learning_rate": 2.9626444159178435e-05, + "loss": 0.6118, + "step": 7698 + }, + { + "epoch": 9.883183568677792, + "grad_norm": 2.2594621181488037, + "learning_rate": 2.9630295250320922e-05, + "loss": 0.6035, + "step": 7699 + }, + { + "epoch": 9.88446726572529, + "grad_norm": 1.2778561115264893, + "learning_rate": 2.9634146341463413e-05, + "loss": 0.5835, + "step": 7700 + }, + { + "epoch": 9.885750962772786, + "grad_norm": 2.9729976654052734, + "learning_rate": 2.9637997432605908e-05, + "loss": 0.6273, + "step": 7701 + }, + { + "epoch": 9.887034659820282, + "grad_norm": 2.209608554840088, + "learning_rate": 2.9641848523748396e-05, + "loss": 0.5788, + "step": 7702 + }, + { + "epoch": 9.88831835686778, + "grad_norm": 4.740476608276367, + "learning_rate": 2.9645699614890887e-05, + "loss": 0.6755, + "step": 7703 + }, + { + "epoch": 9.889602053915276, + "grad_norm": 2.029803991317749, + "learning_rate": 2.9649550706033378e-05, + "loss": 0.6338, + "step": 7704 + }, + { + "epoch": 9.890885750962774, + "grad_norm": 1.7625733613967896, + "learning_rate": 2.9653401797175865e-05, + "loss": 0.6053, + "step": 7705 + }, + { + "epoch": 9.89216944801027, + "grad_norm": 
2.0386483669281006, + "learning_rate": 2.9657252888318356e-05, + "loss": 0.6312, + "step": 7706 + }, + { + "epoch": 9.893453145057766, + "grad_norm": 2.6968657970428467, + "learning_rate": 2.966110397946085e-05, + "loss": 0.6471, + "step": 7707 + }, + { + "epoch": 9.894736842105264, + "grad_norm": 2.6176791191101074, + "learning_rate": 2.966495507060334e-05, + "loss": 0.7027, + "step": 7708 + }, + { + "epoch": 9.89602053915276, + "grad_norm": 2.021726369857788, + "learning_rate": 2.966880616174583e-05, + "loss": 0.6944, + "step": 7709 + }, + { + "epoch": 9.897304236200256, + "grad_norm": 10.743870735168457, + "learning_rate": 2.9672657252888317e-05, + "loss": 0.7545, + "step": 7710 + }, + { + "epoch": 9.898587933247754, + "grad_norm": 10.48861312866211, + "learning_rate": 2.967650834403081e-05, + "loss": 0.7904, + "step": 7711 + }, + { + "epoch": 9.89987163029525, + "grad_norm": 2.4230899810791016, + "learning_rate": 2.96803594351733e-05, + "loss": 0.5443, + "step": 7712 + }, + { + "epoch": 9.901155327342748, + "grad_norm": 2.5349931716918945, + "learning_rate": 2.968421052631579e-05, + "loss": 0.6087, + "step": 7713 + }, + { + "epoch": 9.902439024390244, + "grad_norm": 3.395841360092163, + "learning_rate": 2.968806161745828e-05, + "loss": 0.5733, + "step": 7714 + }, + { + "epoch": 9.90372272143774, + "grad_norm": 3.205355405807495, + "learning_rate": 2.9691912708600773e-05, + "loss": 0.5598, + "step": 7715 + }, + { + "epoch": 9.905006418485238, + "grad_norm": 1.521173119544983, + "learning_rate": 2.969576379974326e-05, + "loss": 0.5276, + "step": 7716 + }, + { + "epoch": 9.906290115532734, + "grad_norm": 1.3933414220809937, + "learning_rate": 2.969961489088575e-05, + "loss": 0.5746, + "step": 7717 + }, + { + "epoch": 9.907573812580232, + "grad_norm": 2.0836093425750732, + "learning_rate": 2.970346598202824e-05, + "loss": 0.5552, + "step": 7718 + }, + { + "epoch": 9.908857509627728, + "grad_norm": 2.11686635017395, + "learning_rate": 2.9707317073170734e-05, + 
"loss": 0.5634, + "step": 7719 + }, + { + "epoch": 9.910141206675224, + "grad_norm": 2.0808093547821045, + "learning_rate": 2.9711168164313225e-05, + "loss": 0.5711, + "step": 7720 + }, + { + "epoch": 9.911424903722722, + "grad_norm": 1.7681759595870972, + "learning_rate": 2.9715019255455712e-05, + "loss": 0.5595, + "step": 7721 + }, + { + "epoch": 9.912708600770218, + "grad_norm": 1.2186052799224854, + "learning_rate": 2.9718870346598203e-05, + "loss": 0.5534, + "step": 7722 + }, + { + "epoch": 9.913992297817716, + "grad_norm": 2.7283172607421875, + "learning_rate": 2.9722721437740695e-05, + "loss": 0.5788, + "step": 7723 + }, + { + "epoch": 9.915275994865212, + "grad_norm": 6.994935512542725, + "learning_rate": 2.9726572528883182e-05, + "loss": 0.5195, + "step": 7724 + }, + { + "epoch": 9.916559691912708, + "grad_norm": 1.8434957265853882, + "learning_rate": 2.9730423620025677e-05, + "loss": 0.5627, + "step": 7725 + }, + { + "epoch": 9.917843388960206, + "grad_norm": 2.649968147277832, + "learning_rate": 2.9734274711168164e-05, + "loss": 0.5553, + "step": 7726 + }, + { + "epoch": 9.919127086007702, + "grad_norm": 3.626981496810913, + "learning_rate": 2.9738125802310655e-05, + "loss": 0.5525, + "step": 7727 + }, + { + "epoch": 9.9204107830552, + "grad_norm": 1.1962920427322388, + "learning_rate": 2.9741976893453146e-05, + "loss": 0.6008, + "step": 7728 + }, + { + "epoch": 9.921694480102696, + "grad_norm": 1.9475985765457153, + "learning_rate": 2.9745827984595634e-05, + "loss": 0.5732, + "step": 7729 + }, + { + "epoch": 9.922978177150192, + "grad_norm": 2.9871561527252197, + "learning_rate": 2.9749679075738125e-05, + "loss": 0.6044, + "step": 7730 + }, + { + "epoch": 9.92426187419769, + "grad_norm": 1.727623701095581, + "learning_rate": 2.975353016688062e-05, + "loss": 0.5882, + "step": 7731 + }, + { + "epoch": 9.925545571245186, + "grad_norm": 1.1280643939971924, + "learning_rate": 2.9757381258023107e-05, + "loss": 0.5465, + "step": 7732 + }, + { + "epoch": 
9.926829268292684, + "grad_norm": 6.750319480895996, + "learning_rate": 2.97612323491656e-05, + "loss": 0.5279, + "step": 7733 + }, + { + "epoch": 9.92811296534018, + "grad_norm": 2.804675579071045, + "learning_rate": 2.976508344030809e-05, + "loss": 0.5533, + "step": 7734 + }, + { + "epoch": 9.929396662387676, + "grad_norm": 2.689100742340088, + "learning_rate": 2.9768934531450577e-05, + "loss": 0.5658, + "step": 7735 + }, + { + "epoch": 9.930680359435174, + "grad_norm": 2.898150682449341, + "learning_rate": 2.9772785622593068e-05, + "loss": 0.6168, + "step": 7736 + }, + { + "epoch": 9.93196405648267, + "grad_norm": 2.154154062271118, + "learning_rate": 2.977663671373556e-05, + "loss": 0.5644, + "step": 7737 + }, + { + "epoch": 9.933247753530168, + "grad_norm": 3.8174731731414795, + "learning_rate": 2.978048780487805e-05, + "loss": 0.573, + "step": 7738 + }, + { + "epoch": 9.934531450577664, + "grad_norm": 1.1804476976394653, + "learning_rate": 2.978433889602054e-05, + "loss": 0.5838, + "step": 7739 + }, + { + "epoch": 9.93581514762516, + "grad_norm": 5.256398677825928, + "learning_rate": 2.978818998716303e-05, + "loss": 0.5959, + "step": 7740 + }, + { + "epoch": 9.937098844672658, + "grad_norm": 2.643648862838745, + "learning_rate": 2.979204107830552e-05, + "loss": 0.6361, + "step": 7741 + }, + { + "epoch": 9.938382541720154, + "grad_norm": 2.794529914855957, + "learning_rate": 2.979589216944801e-05, + "loss": 0.5764, + "step": 7742 + }, + { + "epoch": 9.93966623876765, + "grad_norm": 1.5229392051696777, + "learning_rate": 2.9799743260590502e-05, + "loss": 0.5662, + "step": 7743 + }, + { + "epoch": 9.940949935815148, + "grad_norm": 3.86260724067688, + "learning_rate": 2.9803594351732993e-05, + "loss": 0.6265, + "step": 7744 + }, + { + "epoch": 9.942233632862644, + "grad_norm": 2.3297715187072754, + "learning_rate": 2.980744544287548e-05, + "loss": 0.6293, + "step": 7745 + }, + { + "epoch": 9.943517329910142, + "grad_norm": 1.8780913352966309, + "learning_rate": 
2.9811296534017972e-05, + "loss": 0.6177, + "step": 7746 + }, + { + "epoch": 9.944801026957638, + "grad_norm": 5.289205551147461, + "learning_rate": 2.9815147625160463e-05, + "loss": 0.5791, + "step": 7747 + }, + { + "epoch": 9.946084724005134, + "grad_norm": 2.5425407886505127, + "learning_rate": 2.981899871630295e-05, + "loss": 0.5838, + "step": 7748 + }, + { + "epoch": 9.947368421052632, + "grad_norm": 2.668177843093872, + "learning_rate": 2.9822849807445442e-05, + "loss": 0.662, + "step": 7749 + }, + { + "epoch": 9.948652118100128, + "grad_norm": 5.419576168060303, + "learning_rate": 2.9826700898587936e-05, + "loss": 0.6203, + "step": 7750 + }, + { + "epoch": 9.949935815147626, + "grad_norm": 2.2290918827056885, + "learning_rate": 2.9830551989730424e-05, + "loss": 0.6018, + "step": 7751 + }, + { + "epoch": 9.951219512195122, + "grad_norm": 2.4535462856292725, + "learning_rate": 2.9834403080872915e-05, + "loss": 0.6841, + "step": 7752 + }, + { + "epoch": 9.952503209242618, + "grad_norm": 2.9699976444244385, + "learning_rate": 2.9838254172015406e-05, + "loss": 0.6364, + "step": 7753 + }, + { + "epoch": 9.953786906290116, + "grad_norm": 1.720250129699707, + "learning_rate": 2.9842105263157894e-05, + "loss": 0.6759, + "step": 7754 + }, + { + "epoch": 9.955070603337612, + "grad_norm": 1.5080440044403076, + "learning_rate": 2.9845956354300385e-05, + "loss": 0.689, + "step": 7755 + }, + { + "epoch": 9.95635430038511, + "grad_norm": 7.891659736633301, + "learning_rate": 2.9849807445442876e-05, + "loss": 0.6279, + "step": 7756 + }, + { + "epoch": 9.957637997432606, + "grad_norm": 4.609415531158447, + "learning_rate": 2.9853658536585367e-05, + "loss": 0.7004, + "step": 7757 + }, + { + "epoch": 9.958921694480102, + "grad_norm": 3.5119290351867676, + "learning_rate": 2.9857509627727858e-05, + "loss": 0.7467, + "step": 7758 + }, + { + "epoch": 9.9602053915276, + "grad_norm": 2.55671763420105, + "learning_rate": 2.9861360718870346e-05, + "loss": 0.6829, + "step": 7759 + }, + 
{ + "epoch": 9.961489088575096, + "grad_norm": 2.398125171661377, + "learning_rate": 2.9865211810012837e-05, + "loss": 0.8017, + "step": 7760 + }, + { + "epoch": 9.962772785622594, + "grad_norm": 3.1394565105438232, + "learning_rate": 2.9869062901155328e-05, + "loss": 0.913, + "step": 7761 + }, + { + "epoch": 9.96405648267009, + "grad_norm": 1.5297143459320068, + "learning_rate": 2.987291399229782e-05, + "loss": 0.5001, + "step": 7762 + }, + { + "epoch": 9.965340179717586, + "grad_norm": 4.165538787841797, + "learning_rate": 2.987676508344031e-05, + "loss": 0.5442, + "step": 7763 + }, + { + "epoch": 9.966623876765084, + "grad_norm": 2.1164438724517822, + "learning_rate": 2.9880616174582798e-05, + "loss": 0.5817, + "step": 7764 + }, + { + "epoch": 9.96790757381258, + "grad_norm": 1.7171416282653809, + "learning_rate": 2.988446726572529e-05, + "loss": 0.5643, + "step": 7765 + }, + { + "epoch": 9.969191270860078, + "grad_norm": 1.1460576057434082, + "learning_rate": 2.988831835686778e-05, + "loss": 0.556, + "step": 7766 + }, + { + "epoch": 9.970474967907574, + "grad_norm": 1.129912257194519, + "learning_rate": 2.9892169448010268e-05, + "loss": 0.5701, + "step": 7767 + }, + { + "epoch": 9.97175866495507, + "grad_norm": 0.912793755531311, + "learning_rate": 2.9896020539152762e-05, + "loss": 0.582, + "step": 7768 + }, + { + "epoch": 9.973042362002568, + "grad_norm": 1.692897915840149, + "learning_rate": 2.9899871630295253e-05, + "loss": 0.5259, + "step": 7769 + }, + { + "epoch": 9.974326059050064, + "grad_norm": 2.155076742172241, + "learning_rate": 2.990372272143774e-05, + "loss": 0.652, + "step": 7770 + }, + { + "epoch": 9.975609756097562, + "grad_norm": 1.1680952310562134, + "learning_rate": 2.9907573812580232e-05, + "loss": 0.5673, + "step": 7771 + }, + { + "epoch": 9.976893453145058, + "grad_norm": 1.5250312089920044, + "learning_rate": 2.991142490372272e-05, + "loss": 0.5743, + "step": 7772 + }, + { + "epoch": 9.978177150192554, + "grad_norm": 1.0559189319610596, + 
"learning_rate": 2.991527599486521e-05, + "loss": 0.5945, + "step": 7773 + }, + { + "epoch": 9.979460847240052, + "grad_norm": 2.9482007026672363, + "learning_rate": 2.9919127086007705e-05, + "loss": 0.6113, + "step": 7774 + }, + { + "epoch": 9.980744544287548, + "grad_norm": 2.112234115600586, + "learning_rate": 2.9922978177150193e-05, + "loss": 0.563, + "step": 7775 + }, + { + "epoch": 9.982028241335044, + "grad_norm": 8.536846160888672, + "learning_rate": 2.9926829268292684e-05, + "loss": 0.5787, + "step": 7776 + }, + { + "epoch": 9.983311938382542, + "grad_norm": 2.0005064010620117, + "learning_rate": 2.9930680359435175e-05, + "loss": 0.5489, + "step": 7777 + }, + { + "epoch": 9.984595635430038, + "grad_norm": 1.3847967386245728, + "learning_rate": 2.9934531450577663e-05, + "loss": 0.6007, + "step": 7778 + }, + { + "epoch": 9.985879332477536, + "grad_norm": 5.818824291229248, + "learning_rate": 2.9938382541720154e-05, + "loss": 0.5669, + "step": 7779 + }, + { + "epoch": 9.987163029525032, + "grad_norm": 1.137098789215088, + "learning_rate": 2.9942233632862648e-05, + "loss": 0.5766, + "step": 7780 + }, + { + "epoch": 9.988446726572528, + "grad_norm": 1.4699032306671143, + "learning_rate": 2.9946084724005136e-05, + "loss": 0.6007, + "step": 7781 + }, + { + "epoch": 9.989730423620026, + "grad_norm": 1.2485425472259521, + "learning_rate": 2.9949935815147627e-05, + "loss": 0.6272, + "step": 7782 + }, + { + "epoch": 9.991014120667522, + "grad_norm": 2.011320114135742, + "learning_rate": 2.9953786906290115e-05, + "loss": 0.6366, + "step": 7783 + }, + { + "epoch": 9.99229781771502, + "grad_norm": 2.4638671875, + "learning_rate": 2.9957637997432606e-05, + "loss": 0.5614, + "step": 7784 + }, + { + "epoch": 9.993581514762516, + "grad_norm": 2.8838069438934326, + "learning_rate": 2.9961489088575097e-05, + "loss": 0.6753, + "step": 7785 + }, + { + "epoch": 9.994865211810012, + "grad_norm": 2.740746259689331, + "learning_rate": 2.9965340179717588e-05, + "loss": 0.6084, + 
"step": 7786 + }, + { + "epoch": 9.99614890885751, + "grad_norm": 1.541233777999878, + "learning_rate": 2.996919127086008e-05, + "loss": 0.6212, + "step": 7787 + }, + { + "epoch": 9.997432605905006, + "grad_norm": 3.0357072353363037, + "learning_rate": 2.997304236200257e-05, + "loss": 0.6498, + "step": 7788 + }, + { + "epoch": 9.998716302952504, + "grad_norm": 2.8359224796295166, + "learning_rate": 2.9976893453145058e-05, + "loss": 0.6829, + "step": 7789 + }, + { + "epoch": 10.0, + "grad_norm": 3.291982650756836, + "learning_rate": 2.998074454428755e-05, + "loss": 0.8202, + "step": 7790 + }, + { + "epoch": 10.001283697047496, + "grad_norm": 2.3322372436523438, + "learning_rate": 2.9984595635430036e-05, + "loss": 0.5678, + "step": 7791 + }, + { + "epoch": 10.002567394094994, + "grad_norm": 2.169097900390625, + "learning_rate": 2.998844672657253e-05, + "loss": 0.5472, + "step": 7792 + }, + { + "epoch": 10.00385109114249, + "grad_norm": 4.580654621124268, + "learning_rate": 2.9992297817715022e-05, + "loss": 0.588, + "step": 7793 + }, + { + "epoch": 10.005134788189988, + "grad_norm": 2.121530771255493, + "learning_rate": 2.999614890885751e-05, + "loss": 0.5914, + "step": 7794 + }, + { + "epoch": 10.006418485237484, + "grad_norm": 2.1794021129608154, + "learning_rate": 3e-05, + "loss": 0.5668, + "step": 7795 + }, + { + "epoch": 10.00770218228498, + "grad_norm": 1.1667276620864868, + "learning_rate": 2.9999572100984166e-05, + "loss": 0.561, + "step": 7796 + }, + { + "epoch": 10.008985879332478, + "grad_norm": 2.2542762756347656, + "learning_rate": 2.9999144201968338e-05, + "loss": 0.5509, + "step": 7797 + }, + { + "epoch": 10.010269576379974, + "grad_norm": 2.13498592376709, + "learning_rate": 2.9998716302952503e-05, + "loss": 0.5878, + "step": 7798 + }, + { + "epoch": 10.011553273427472, + "grad_norm": 3.309577703475952, + "learning_rate": 2.999828840393667e-05, + "loss": 0.5669, + "step": 7799 + }, + { + "epoch": 10.012836970474968, + "grad_norm": 1.7775945663452148, + 
"learning_rate": 2.999786050492084e-05, + "loss": 0.5768, + "step": 7800 + }, + { + "epoch": 10.014120667522464, + "grad_norm": 4.72900390625, + "learning_rate": 2.9997432605905008e-05, + "loss": 0.5902, + "step": 7801 + }, + { + "epoch": 10.015404364569962, + "grad_norm": 1.1097861528396606, + "learning_rate": 2.9997004706889176e-05, + "loss": 0.5556, + "step": 7802 + }, + { + "epoch": 10.016688061617458, + "grad_norm": 3.7645087242126465, + "learning_rate": 2.999657680787334e-05, + "loss": 0.5107, + "step": 7803 + }, + { + "epoch": 10.017971758664956, + "grad_norm": 0.8561379313468933, + "learning_rate": 2.999614890885751e-05, + "loss": 0.5132, + "step": 7804 + }, + { + "epoch": 10.019255455712452, + "grad_norm": 1.4011077880859375, + "learning_rate": 2.9995721009841678e-05, + "loss": 0.5804, + "step": 7805 + }, + { + "epoch": 10.020539152759948, + "grad_norm": 2.321462392807007, + "learning_rate": 2.9995293110825847e-05, + "loss": 0.5765, + "step": 7806 + }, + { + "epoch": 10.021822849807446, + "grad_norm": 2.812330961227417, + "learning_rate": 2.999486521181001e-05, + "loss": 0.5345, + "step": 7807 + }, + { + "epoch": 10.023106546854942, + "grad_norm": 2.466677188873291, + "learning_rate": 2.9994437312794183e-05, + "loss": 0.5404, + "step": 7808 + }, + { + "epoch": 10.024390243902438, + "grad_norm": 2.507847309112549, + "learning_rate": 2.999400941377835e-05, + "loss": 0.5653, + "step": 7809 + }, + { + "epoch": 10.025673940949936, + "grad_norm": 2.1165096759796143, + "learning_rate": 2.9993581514762517e-05, + "loss": 0.5779, + "step": 7810 + }, + { + "epoch": 10.026957637997432, + "grad_norm": 10.151068687438965, + "learning_rate": 2.9993153615746685e-05, + "loss": 0.5554, + "step": 7811 + }, + { + "epoch": 10.02824133504493, + "grad_norm": 4.4770097732543945, + "learning_rate": 2.999272571673085e-05, + "loss": 0.559, + "step": 7812 + }, + { + "epoch": 10.029525032092426, + "grad_norm": 1.6853535175323486, + "learning_rate": 2.9992297817715022e-05, + "loss": 
0.5637, + "step": 7813 + }, + { + "epoch": 10.030808729139922, + "grad_norm": 3.8897604942321777, + "learning_rate": 2.9991869918699187e-05, + "loss": 0.5343, + "step": 7814 + }, + { + "epoch": 10.03209242618742, + "grad_norm": 1.5243784189224243, + "learning_rate": 2.9991442019683355e-05, + "loss": 0.5749, + "step": 7815 + }, + { + "epoch": 10.033376123234916, + "grad_norm": 1.782270073890686, + "learning_rate": 2.9991014120667524e-05, + "loss": 0.5929, + "step": 7816 + }, + { + "epoch": 10.034659820282414, + "grad_norm": 1.860990047454834, + "learning_rate": 2.999058622165169e-05, + "loss": 0.5883, + "step": 7817 + }, + { + "epoch": 10.03594351732991, + "grad_norm": 2.429457664489746, + "learning_rate": 2.999015832263586e-05, + "loss": 0.5278, + "step": 7818 + }, + { + "epoch": 10.037227214377406, + "grad_norm": 2.003890037536621, + "learning_rate": 2.9989730423620026e-05, + "loss": 0.5825, + "step": 7819 + }, + { + "epoch": 10.038510911424904, + "grad_norm": 4.560482501983643, + "learning_rate": 2.9989302524604194e-05, + "loss": 0.5668, + "step": 7820 + }, + { + "epoch": 10.0397946084724, + "grad_norm": 1.591629147529602, + "learning_rate": 2.9988874625588363e-05, + "loss": 0.5731, + "step": 7821 + }, + { + "epoch": 10.041078305519898, + "grad_norm": 5.579545497894287, + "learning_rate": 2.998844672657253e-05, + "loss": 0.547, + "step": 7822 + }, + { + "epoch": 10.042362002567394, + "grad_norm": 5.67046594619751, + "learning_rate": 2.9988018827556696e-05, + "loss": 0.5948, + "step": 7823 + }, + { + "epoch": 10.04364569961489, + "grad_norm": 12.143543243408203, + "learning_rate": 2.9987590928540864e-05, + "loss": 0.5972, + "step": 7824 + }, + { + "epoch": 10.044929396662388, + "grad_norm": 1.7965720891952515, + "learning_rate": 2.9987163029525033e-05, + "loss": 0.5706, + "step": 7825 + }, + { + "epoch": 10.046213093709884, + "grad_norm": 2.124732494354248, + "learning_rate": 2.99867351305092e-05, + "loss": 0.5852, + "step": 7826 + }, + { + "epoch": 
10.047496790757382, + "grad_norm": 6.59039306640625, + "learning_rate": 2.998630723149337e-05, + "loss": 0.5985, + "step": 7827 + }, + { + "epoch": 10.048780487804878, + "grad_norm": 3.5329349040985107, + "learning_rate": 2.9985879332477535e-05, + "loss": 0.577, + "step": 7828 + }, + { + "epoch": 10.050064184852374, + "grad_norm": 6.928347587585449, + "learning_rate": 2.9985451433461703e-05, + "loss": 0.6657, + "step": 7829 + }, + { + "epoch": 10.051347881899872, + "grad_norm": 3.444780111312866, + "learning_rate": 2.998502353444587e-05, + "loss": 0.6123, + "step": 7830 + }, + { + "epoch": 10.052631578947368, + "grad_norm": 10.554981231689453, + "learning_rate": 2.9984595635430036e-05, + "loss": 0.5985, + "step": 7831 + }, + { + "epoch": 10.053915275994866, + "grad_norm": 3.8222224712371826, + "learning_rate": 2.9984167736414208e-05, + "loss": 0.6342, + "step": 7832 + }, + { + "epoch": 10.055198973042362, + "grad_norm": 1.8044511079788208, + "learning_rate": 2.9983739837398373e-05, + "loss": 0.6402, + "step": 7833 + }, + { + "epoch": 10.056482670089858, + "grad_norm": 11.855904579162598, + "learning_rate": 2.9983311938382545e-05, + "loss": 0.6397, + "step": 7834 + }, + { + "epoch": 10.057766367137356, + "grad_norm": 3.9783804416656494, + "learning_rate": 2.998288403936671e-05, + "loss": 0.7, + "step": 7835 + }, + { + "epoch": 10.059050064184852, + "grad_norm": 6.672181606292725, + "learning_rate": 2.9982456140350875e-05, + "loss": 0.702, + "step": 7836 + }, + { + "epoch": 10.06033376123235, + "grad_norm": 2.8635332584381104, + "learning_rate": 2.9982028241335047e-05, + "loss": 0.6793, + "step": 7837 + }, + { + "epoch": 10.061617458279846, + "grad_norm": 2.504584312438965, + "learning_rate": 2.9981600342319212e-05, + "loss": 0.7079, + "step": 7838 + }, + { + "epoch": 10.062901155327342, + "grad_norm": 8.078412055969238, + "learning_rate": 2.998117244330338e-05, + "loss": 0.7245, + "step": 7839 + }, + { + "epoch": 10.06418485237484, + "grad_norm": 3.650764226913452, 
+ "learning_rate": 2.998074454428755e-05, + "loss": 0.7558, + "step": 7840 + }, + { + "epoch": 10.065468549422336, + "grad_norm": 6.865694046020508, + "learning_rate": 2.9980316645271717e-05, + "loss": 0.5514, + "step": 7841 + }, + { + "epoch": 10.066752246469832, + "grad_norm": 7.122686386108398, + "learning_rate": 2.9979888746255886e-05, + "loss": 0.5533, + "step": 7842 + }, + { + "epoch": 10.06803594351733, + "grad_norm": 6.210193634033203, + "learning_rate": 2.997946084724005e-05, + "loss": 0.5399, + "step": 7843 + }, + { + "epoch": 10.069319640564826, + "grad_norm": 1.2991199493408203, + "learning_rate": 2.997903294822422e-05, + "loss": 0.5721, + "step": 7844 + }, + { + "epoch": 10.070603337612324, + "grad_norm": 2.232983112335205, + "learning_rate": 2.9978605049208387e-05, + "loss": 0.5817, + "step": 7845 + }, + { + "epoch": 10.07188703465982, + "grad_norm": 1.6938451528549194, + "learning_rate": 2.9978177150192556e-05, + "loss": 0.5529, + "step": 7846 + }, + { + "epoch": 10.073170731707316, + "grad_norm": 6.160478115081787, + "learning_rate": 2.997774925117672e-05, + "loss": 0.5865, + "step": 7847 + }, + { + "epoch": 10.074454428754814, + "grad_norm": 1.7921596765518188, + "learning_rate": 2.9977321352160893e-05, + "loss": 0.576, + "step": 7848 + }, + { + "epoch": 10.07573812580231, + "grad_norm": 2.29055118560791, + "learning_rate": 2.9976893453145058e-05, + "loss": 0.5493, + "step": 7849 + }, + { + "epoch": 10.077021822849808, + "grad_norm": 3.5314207077026367, + "learning_rate": 2.9976465554129226e-05, + "loss": 0.6413, + "step": 7850 + }, + { + "epoch": 10.078305519897304, + "grad_norm": 11.32669734954834, + "learning_rate": 2.9976037655113395e-05, + "loss": 0.5734, + "step": 7851 + }, + { + "epoch": 10.0795892169448, + "grad_norm": 1.546897292137146, + "learning_rate": 2.997560975609756e-05, + "loss": 0.578, + "step": 7852 + }, + { + "epoch": 10.080872913992298, + "grad_norm": 2.4250693321228027, + "learning_rate": 2.997518185708173e-05, + "loss": 
0.5509, + "step": 7853 + }, + { + "epoch": 10.082156611039794, + "grad_norm": 4.373269557952881, + "learning_rate": 2.9974753958065896e-05, + "loss": 0.5631, + "step": 7854 + }, + { + "epoch": 10.083440308087292, + "grad_norm": 1.6975284814834595, + "learning_rate": 2.9974326059050065e-05, + "loss": 0.5575, + "step": 7855 + }, + { + "epoch": 10.084724005134788, + "grad_norm": 4.260570049285889, + "learning_rate": 2.9973898160034233e-05, + "loss": 0.6079, + "step": 7856 + }, + { + "epoch": 10.086007702182284, + "grad_norm": 2.5838751792907715, + "learning_rate": 2.9973470261018398e-05, + "loss": 0.5804, + "step": 7857 + }, + { + "epoch": 10.087291399229782, + "grad_norm": 1.574514389038086, + "learning_rate": 2.997304236200257e-05, + "loss": 0.569, + "step": 7858 + }, + { + "epoch": 10.088575096277278, + "grad_norm": 8.952038764953613, + "learning_rate": 2.9972614462986735e-05, + "loss": 0.5943, + "step": 7859 + }, + { + "epoch": 10.089858793324776, + "grad_norm": 2.555492877960205, + "learning_rate": 2.9972186563970903e-05, + "loss": 0.5578, + "step": 7860 + }, + { + "epoch": 10.091142490372272, + "grad_norm": 3.8422844409942627, + "learning_rate": 2.9971758664955072e-05, + "loss": 0.565, + "step": 7861 + }, + { + "epoch": 10.092426187419768, + "grad_norm": 1.6006994247436523, + "learning_rate": 2.997133076593924e-05, + "loss": 0.5506, + "step": 7862 + }, + { + "epoch": 10.093709884467266, + "grad_norm": 2.1756348609924316, + "learning_rate": 2.9970902866923405e-05, + "loss": 0.5927, + "step": 7863 + }, + { + "epoch": 10.094993581514762, + "grad_norm": 2.5868194103240967, + "learning_rate": 2.9970474967907574e-05, + "loss": 0.5511, + "step": 7864 + }, + { + "epoch": 10.09627727856226, + "grad_norm": 1.3032792806625366, + "learning_rate": 2.9970047068891742e-05, + "loss": 0.5663, + "step": 7865 + }, + { + "epoch": 10.097560975609756, + "grad_norm": 2.2713887691497803, + "learning_rate": 2.996961916987591e-05, + "loss": 0.5689, + "step": 7866 + }, + { + "epoch": 
10.098844672657252, + "grad_norm": 9.27942943572998, + "learning_rate": 2.996919127086008e-05, + "loss": 0.5354, + "step": 7867 + }, + { + "epoch": 10.10012836970475, + "grad_norm": 2.2633352279663086, + "learning_rate": 2.9968763371844244e-05, + "loss": 0.578, + "step": 7868 + }, + { + "epoch": 10.101412066752246, + "grad_norm": 2.4943056106567383, + "learning_rate": 2.9968335472828416e-05, + "loss": 0.5856, + "step": 7869 + }, + { + "epoch": 10.102695763799744, + "grad_norm": 2.32142972946167, + "learning_rate": 2.996790757381258e-05, + "loss": 0.5732, + "step": 7870 + }, + { + "epoch": 10.10397946084724, + "grad_norm": 3.0262670516967773, + "learning_rate": 2.9967479674796746e-05, + "loss": 0.5886, + "step": 7871 + }, + { + "epoch": 10.105263157894736, + "grad_norm": 3.3262312412261963, + "learning_rate": 2.9967051775780918e-05, + "loss": 0.6215, + "step": 7872 + }, + { + "epoch": 10.106546854942234, + "grad_norm": 2.0880603790283203, + "learning_rate": 2.9966623876765083e-05, + "loss": 0.6177, + "step": 7873 + }, + { + "epoch": 10.10783055198973, + "grad_norm": 5.4937567710876465, + "learning_rate": 2.9966195977749254e-05, + "loss": 0.597, + "step": 7874 + }, + { + "epoch": 10.109114249037226, + "grad_norm": 1.6034519672393799, + "learning_rate": 2.996576807873342e-05, + "loss": 0.6096, + "step": 7875 + }, + { + "epoch": 10.110397946084724, + "grad_norm": 4.562175750732422, + "learning_rate": 2.9965340179717588e-05, + "loss": 0.5933, + "step": 7876 + }, + { + "epoch": 10.11168164313222, + "grad_norm": 2.1998634338378906, + "learning_rate": 2.9964912280701756e-05, + "loss": 0.5984, + "step": 7877 + }, + { + "epoch": 10.112965340179718, + "grad_norm": 4.700026988983154, + "learning_rate": 2.996448438168592e-05, + "loss": 0.6441, + "step": 7878 + }, + { + "epoch": 10.114249037227214, + "grad_norm": 2.5387606620788574, + "learning_rate": 2.996405648267009e-05, + "loss": 0.5922, + "step": 7879 + }, + { + "epoch": 10.11553273427471, + "grad_norm": 4.162777423858643, 
+ "learning_rate": 2.9963628583654258e-05, + "loss": 0.5748, + "step": 7880 + }, + { + "epoch": 10.116816431322208, + "grad_norm": 24.77593421936035, + "learning_rate": 2.9963200684638427e-05, + "loss": 0.6563, + "step": 7881 + }, + { + "epoch": 10.118100128369704, + "grad_norm": 4.260110378265381, + "learning_rate": 2.9962772785622595e-05, + "loss": 0.6161, + "step": 7882 + }, + { + "epoch": 10.119383825417202, + "grad_norm": 2.225022315979004, + "learning_rate": 2.9962344886606763e-05, + "loss": 0.6563, + "step": 7883 + }, + { + "epoch": 10.120667522464698, + "grad_norm": 1.4790747165679932, + "learning_rate": 2.996191698759093e-05, + "loss": 0.6892, + "step": 7884 + }, + { + "epoch": 10.121951219512194, + "grad_norm": 2.6326775550842285, + "learning_rate": 2.9961489088575097e-05, + "loss": 0.6291, + "step": 7885 + }, + { + "epoch": 10.123234916559692, + "grad_norm": 1.9142216444015503, + "learning_rate": 2.9961061189559265e-05, + "loss": 0.6234, + "step": 7886 + }, + { + "epoch": 10.124518613607188, + "grad_norm": 7.166949272155762, + "learning_rate": 2.996063329054343e-05, + "loss": 0.6127, + "step": 7887 + }, + { + "epoch": 10.125802310654686, + "grad_norm": 5.565463542938232, + "learning_rate": 2.9960205391527602e-05, + "loss": 0.6869, + "step": 7888 + }, + { + "epoch": 10.127086007702182, + "grad_norm": 3.6965723037719727, + "learning_rate": 2.9959777492511767e-05, + "loss": 0.7292, + "step": 7889 + }, + { + "epoch": 10.128369704749678, + "grad_norm": 3.2989094257354736, + "learning_rate": 2.9959349593495936e-05, + "loss": 0.8417, + "step": 7890 + }, + { + "epoch": 10.129653401797176, + "grad_norm": 1.59934401512146, + "learning_rate": 2.9958921694480104e-05, + "loss": 0.5834, + "step": 7891 + }, + { + "epoch": 10.130937098844672, + "grad_norm": 3.535745859146118, + "learning_rate": 2.995849379546427e-05, + "loss": 0.5492, + "step": 7892 + }, + { + "epoch": 10.13222079589217, + "grad_norm": 1.52987539768219, + "learning_rate": 2.995806589644844e-05, + 
"loss": 0.5787, + "step": 7893 + }, + { + "epoch": 10.133504492939666, + "grad_norm": 1.0921787023544312, + "learning_rate": 2.9957637997432606e-05, + "loss": 0.5542, + "step": 7894 + }, + { + "epoch": 10.134788189987162, + "grad_norm": 1.8663004636764526, + "learning_rate": 2.9957210098416774e-05, + "loss": 0.5479, + "step": 7895 + }, + { + "epoch": 10.13607188703466, + "grad_norm": 2.0010647773742676, + "learning_rate": 2.9956782199400943e-05, + "loss": 0.5336, + "step": 7896 + }, + { + "epoch": 10.137355584082156, + "grad_norm": 11.946354866027832, + "learning_rate": 2.9956354300385108e-05, + "loss": 0.5265, + "step": 7897 + }, + { + "epoch": 10.138639281129654, + "grad_norm": 1.2892552614212036, + "learning_rate": 2.995592640136928e-05, + "loss": 0.584, + "step": 7898 + }, + { + "epoch": 10.13992297817715, + "grad_norm": 2.0759682655334473, + "learning_rate": 2.9955498502353444e-05, + "loss": 0.5134, + "step": 7899 + }, + { + "epoch": 10.141206675224646, + "grad_norm": 0.9545280337333679, + "learning_rate": 2.9955070603337613e-05, + "loss": 0.6046, + "step": 7900 + }, + { + "epoch": 10.142490372272144, + "grad_norm": 1.5596755743026733, + "learning_rate": 2.995464270432178e-05, + "loss": 0.5524, + "step": 7901 + }, + { + "epoch": 10.14377406931964, + "grad_norm": 1.2435542345046997, + "learning_rate": 2.995421480530595e-05, + "loss": 0.628, + "step": 7902 + }, + { + "epoch": 10.145057766367138, + "grad_norm": 2.1526691913604736, + "learning_rate": 2.9953786906290115e-05, + "loss": 0.5131, + "step": 7903 + }, + { + "epoch": 10.146341463414634, + "grad_norm": 0.9993299841880798, + "learning_rate": 2.9953359007274283e-05, + "loss": 0.5735, + "step": 7904 + }, + { + "epoch": 10.14762516046213, + "grad_norm": 1.2099226713180542, + "learning_rate": 2.995293110825845e-05, + "loss": 0.5461, + "step": 7905 + }, + { + "epoch": 10.148908857509628, + "grad_norm": 1.1432639360427856, + "learning_rate": 2.995250320924262e-05, + "loss": 0.535, + "step": 7906 + }, + { + 
"epoch": 10.150192554557124, + "grad_norm": 5.371866226196289, + "learning_rate": 2.995207531022679e-05, + "loss": 0.5629, + "step": 7907 + }, + { + "epoch": 10.15147625160462, + "grad_norm": 3.7853899002075195, + "learning_rate": 2.9951647411210953e-05, + "loss": 0.5336, + "step": 7908 + }, + { + "epoch": 10.152759948652118, + "grad_norm": 1.1038832664489746, + "learning_rate": 2.9951219512195125e-05, + "loss": 0.5207, + "step": 7909 + }, + { + "epoch": 10.154043645699614, + "grad_norm": 1.958189606666565, + "learning_rate": 2.995079161317929e-05, + "loss": 0.5742, + "step": 7910 + }, + { + "epoch": 10.155327342747112, + "grad_norm": 1.7665250301361084, + "learning_rate": 2.9950363714163455e-05, + "loss": 0.5583, + "step": 7911 + }, + { + "epoch": 10.156611039794608, + "grad_norm": 1.7594808340072632, + "learning_rate": 2.9949935815147627e-05, + "loss": 0.6032, + "step": 7912 + }, + { + "epoch": 10.157894736842104, + "grad_norm": 1.8056889772415161, + "learning_rate": 2.9949507916131792e-05, + "loss": 0.5453, + "step": 7913 + }, + { + "epoch": 10.159178433889602, + "grad_norm": 1.3343323469161987, + "learning_rate": 2.9949080017115964e-05, + "loss": 0.641, + "step": 7914 + }, + { + "epoch": 10.160462130937098, + "grad_norm": 7.876972675323486, + "learning_rate": 2.994865211810013e-05, + "loss": 0.5527, + "step": 7915 + }, + { + "epoch": 10.161745827984596, + "grad_norm": 2.3295936584472656, + "learning_rate": 2.9948224219084297e-05, + "loss": 0.5652, + "step": 7916 + }, + { + "epoch": 10.163029525032092, + "grad_norm": 1.910017490386963, + "learning_rate": 2.9947796320068466e-05, + "loss": 0.5685, + "step": 7917 + }, + { + "epoch": 10.164313222079588, + "grad_norm": 5.912302494049072, + "learning_rate": 2.994736842105263e-05, + "loss": 0.5589, + "step": 7918 + }, + { + "epoch": 10.165596919127086, + "grad_norm": 2.4982097148895264, + "learning_rate": 2.99469405220368e-05, + "loss": 0.6088, + "step": 7919 + }, + { + "epoch": 10.166880616174582, + "grad_norm": 
2.045534372329712, + "learning_rate": 2.9946512623020968e-05, + "loss": 0.5335, + "step": 7920 + }, + { + "epoch": 10.16816431322208, + "grad_norm": 3.7617239952087402, + "learning_rate": 2.9946084724005136e-05, + "loss": 0.641, + "step": 7921 + }, + { + "epoch": 10.169448010269576, + "grad_norm": 1.9639281034469604, + "learning_rate": 2.9945656824989304e-05, + "loss": 0.5743, + "step": 7922 + }, + { + "epoch": 10.170731707317072, + "grad_norm": 2.7520811557769775, + "learning_rate": 2.9945228925973473e-05, + "loss": 0.6094, + "step": 7923 + }, + { + "epoch": 10.17201540436457, + "grad_norm": 1.5354721546173096, + "learning_rate": 2.9944801026957638e-05, + "loss": 0.5897, + "step": 7924 + }, + { + "epoch": 10.173299101412066, + "grad_norm": 2.4855709075927734, + "learning_rate": 2.9944373127941806e-05, + "loss": 0.5624, + "step": 7925 + }, + { + "epoch": 10.174582798459564, + "grad_norm": 2.97749662399292, + "learning_rate": 2.9943945228925975e-05, + "loss": 0.6066, + "step": 7926 + }, + { + "epoch": 10.17586649550706, + "grad_norm": 7.225115776062012, + "learning_rate": 2.994351732991014e-05, + "loss": 0.5816, + "step": 7927 + }, + { + "epoch": 10.177150192554556, + "grad_norm": 2.749635696411133, + "learning_rate": 2.994308943089431e-05, + "loss": 0.6183, + "step": 7928 + }, + { + "epoch": 10.178433889602054, + "grad_norm": 18.22191619873047, + "learning_rate": 2.9942661531878476e-05, + "loss": 0.5937, + "step": 7929 + }, + { + "epoch": 10.17971758664955, + "grad_norm": 3.7481019496917725, + "learning_rate": 2.9942233632862648e-05, + "loss": 0.6271, + "step": 7930 + }, + { + "epoch": 10.181001283697048, + "grad_norm": 1.3330544233322144, + "learning_rate": 2.9941805733846813e-05, + "loss": 0.6834, + "step": 7931 + }, + { + "epoch": 10.182284980744544, + "grad_norm": 1.9822949171066284, + "learning_rate": 2.9941377834830978e-05, + "loss": 0.6502, + "step": 7932 + }, + { + "epoch": 10.18356867779204, + "grad_norm": 5.213038921356201, + "learning_rate": 
2.994094993581515e-05, + "loss": 0.6017, + "step": 7933 + }, + { + "epoch": 10.184852374839538, + "grad_norm": 4.576151371002197, + "learning_rate": 2.9940522036799315e-05, + "loss": 0.6849, + "step": 7934 + }, + { + "epoch": 10.186136071887034, + "grad_norm": 1.8661235570907593, + "learning_rate": 2.9940094137783484e-05, + "loss": 0.6697, + "step": 7935 + }, + { + "epoch": 10.187419768934532, + "grad_norm": 4.121037483215332, + "learning_rate": 2.9939666238767652e-05, + "loss": 0.6386, + "step": 7936 + }, + { + "epoch": 10.188703465982028, + "grad_norm": 2.102137565612793, + "learning_rate": 2.993923833975182e-05, + "loss": 0.7114, + "step": 7937 + }, + { + "epoch": 10.189987163029524, + "grad_norm": 3.580329656600952, + "learning_rate": 2.993881044073599e-05, + "loss": 0.7075, + "step": 7938 + }, + { + "epoch": 10.191270860077022, + "grad_norm": 2.3254425525665283, + "learning_rate": 2.9938382541720154e-05, + "loss": 0.721, + "step": 7939 + }, + { + "epoch": 10.192554557124518, + "grad_norm": 2.6515369415283203, + "learning_rate": 2.9937954642704322e-05, + "loss": 0.7946, + "step": 7940 + }, + { + "epoch": 10.193838254172016, + "grad_norm": 2.574151039123535, + "learning_rate": 2.993752674368849e-05, + "loss": 0.5421, + "step": 7941 + }, + { + "epoch": 10.195121951219512, + "grad_norm": 2.1676220893859863, + "learning_rate": 2.993709884467266e-05, + "loss": 0.5491, + "step": 7942 + }, + { + "epoch": 10.196405648267008, + "grad_norm": 2.5855658054351807, + "learning_rate": 2.9936670945656824e-05, + "loss": 0.5309, + "step": 7943 + }, + { + "epoch": 10.197689345314506, + "grad_norm": 1.3612004518508911, + "learning_rate": 2.9936243046640996e-05, + "loss": 0.5822, + "step": 7944 + }, + { + "epoch": 10.198973042362002, + "grad_norm": 4.880624294281006, + "learning_rate": 2.993581514762516e-05, + "loss": 0.5246, + "step": 7945 + }, + { + "epoch": 10.200256739409499, + "grad_norm": 1.5410887002944946, + "learning_rate": 2.993538724860933e-05, + "loss": 0.5526, + 
"step": 7946 + }, + { + "epoch": 10.201540436456996, + "grad_norm": 1.1887645721435547, + "learning_rate": 2.9934959349593498e-05, + "loss": 0.5866, + "step": 7947 + }, + { + "epoch": 10.202824133504492, + "grad_norm": 1.8020684719085693, + "learning_rate": 2.9934531450577663e-05, + "loss": 0.5672, + "step": 7948 + }, + { + "epoch": 10.20410783055199, + "grad_norm": 1.5701525211334229, + "learning_rate": 2.9934103551561835e-05, + "loss": 0.5501, + "step": 7949 + }, + { + "epoch": 10.205391527599486, + "grad_norm": 2.28568696975708, + "learning_rate": 2.9933675652546e-05, + "loss": 0.583, + "step": 7950 + }, + { + "epoch": 10.206675224646983, + "grad_norm": 3.238154172897339, + "learning_rate": 2.9933247753530168e-05, + "loss": 0.5881, + "step": 7951 + }, + { + "epoch": 10.20795892169448, + "grad_norm": 1.8169188499450684, + "learning_rate": 2.9932819854514336e-05, + "loss": 0.541, + "step": 7952 + }, + { + "epoch": 10.209242618741976, + "grad_norm": 1.9614115953445435, + "learning_rate": 2.99323919554985e-05, + "loss": 0.6072, + "step": 7953 + }, + { + "epoch": 10.210526315789474, + "grad_norm": 2.973802328109741, + "learning_rate": 2.9931964056482673e-05, + "loss": 0.5865, + "step": 7954 + }, + { + "epoch": 10.21181001283697, + "grad_norm": 1.66524076461792, + "learning_rate": 2.9931536157466838e-05, + "loss": 0.5092, + "step": 7955 + }, + { + "epoch": 10.213093709884467, + "grad_norm": 6.49181604385376, + "learning_rate": 2.9931108258451007e-05, + "loss": 0.563, + "step": 7956 + }, + { + "epoch": 10.214377406931964, + "grad_norm": 1.4773542881011963, + "learning_rate": 2.9930680359435175e-05, + "loss": 0.5752, + "step": 7957 + }, + { + "epoch": 10.21566110397946, + "grad_norm": 2.075997829437256, + "learning_rate": 2.993025246041934e-05, + "loss": 0.5371, + "step": 7958 + }, + { + "epoch": 10.216944801026958, + "grad_norm": 4.184356212615967, + "learning_rate": 2.992982456140351e-05, + "loss": 0.567, + "step": 7959 + }, + { + "epoch": 10.218228498074454, + 
"grad_norm": 1.1503671407699585, + "learning_rate": 2.9929396662387677e-05, + "loss": 0.5497, + "step": 7960 + }, + { + "epoch": 10.21951219512195, + "grad_norm": 2.315624713897705, + "learning_rate": 2.9928968763371845e-05, + "loss": 0.6184, + "step": 7961 + }, + { + "epoch": 10.220795892169448, + "grad_norm": 1.034178376197815, + "learning_rate": 2.9928540864356014e-05, + "loss": 0.5681, + "step": 7962 + }, + { + "epoch": 10.222079589216944, + "grad_norm": 1.5122063159942627, + "learning_rate": 2.9928112965340182e-05, + "loss": 0.5798, + "step": 7963 + }, + { + "epoch": 10.223363286264442, + "grad_norm": 1.4169390201568604, + "learning_rate": 2.9927685066324347e-05, + "loss": 0.566, + "step": 7964 + }, + { + "epoch": 10.224646983311938, + "grad_norm": 3.049297571182251, + "learning_rate": 2.9927257167308516e-05, + "loss": 0.6228, + "step": 7965 + }, + { + "epoch": 10.225930680359435, + "grad_norm": 2.810274839401245, + "learning_rate": 2.9926829268292684e-05, + "loss": 0.5723, + "step": 7966 + }, + { + "epoch": 10.227214377406932, + "grad_norm": 2.2327497005462646, + "learning_rate": 2.992640136927685e-05, + "loss": 0.5834, + "step": 7967 + }, + { + "epoch": 10.228498074454428, + "grad_norm": 1.5624972581863403, + "learning_rate": 2.992597347026102e-05, + "loss": 0.5799, + "step": 7968 + }, + { + "epoch": 10.229781771501926, + "grad_norm": 9.483297348022461, + "learning_rate": 2.9925545571245186e-05, + "loss": 0.5687, + "step": 7969 + }, + { + "epoch": 10.231065468549422, + "grad_norm": 2.8453831672668457, + "learning_rate": 2.9925117672229358e-05, + "loss": 0.6123, + "step": 7970 + }, + { + "epoch": 10.232349165596919, + "grad_norm": 5.991688251495361, + "learning_rate": 2.9924689773213523e-05, + "loss": 0.5583, + "step": 7971 + }, + { + "epoch": 10.233632862644416, + "grad_norm": 1.395281195640564, + "learning_rate": 2.9924261874197688e-05, + "loss": 0.622, + "step": 7972 + }, + { + "epoch": 10.234916559691912, + "grad_norm": 2.502967596054077, + 
"learning_rate": 2.992383397518186e-05, + "loss": 0.6089, + "step": 7973 + }, + { + "epoch": 10.23620025673941, + "grad_norm": 1.8497828245162964, + "learning_rate": 2.9923406076166024e-05, + "loss": 0.65, + "step": 7974 + }, + { + "epoch": 10.237483953786906, + "grad_norm": 2.3702728748321533, + "learning_rate": 2.9922978177150193e-05, + "loss": 0.5827, + "step": 7975 + }, + { + "epoch": 10.238767650834403, + "grad_norm": 1.8593698740005493, + "learning_rate": 2.992255027813436e-05, + "loss": 0.5834, + "step": 7976 + }, + { + "epoch": 10.2400513478819, + "grad_norm": 2.0435659885406494, + "learning_rate": 2.992212237911853e-05, + "loss": 0.6288, + "step": 7977 + }, + { + "epoch": 10.241335044929397, + "grad_norm": 2.920638084411621, + "learning_rate": 2.9921694480102698e-05, + "loss": 0.5707, + "step": 7978 + }, + { + "epoch": 10.242618741976893, + "grad_norm": 3.0579731464385986, + "learning_rate": 2.9921266581086863e-05, + "loss": 0.596, + "step": 7979 + }, + { + "epoch": 10.24390243902439, + "grad_norm": 3.291073799133301, + "learning_rate": 2.992083868207103e-05, + "loss": 0.5924, + "step": 7980 + }, + { + "epoch": 10.245186136071887, + "grad_norm": 1.922532081604004, + "learning_rate": 2.99204107830552e-05, + "loss": 0.6761, + "step": 7981 + }, + { + "epoch": 10.246469833119384, + "grad_norm": 4.153581142425537, + "learning_rate": 2.991998288403937e-05, + "loss": 0.6089, + "step": 7982 + }, + { + "epoch": 10.24775353016688, + "grad_norm": 2.2439234256744385, + "learning_rate": 2.9919554985023533e-05, + "loss": 0.6042, + "step": 7983 + }, + { + "epoch": 10.249037227214377, + "grad_norm": 2.2058115005493164, + "learning_rate": 2.9919127086007705e-05, + "loss": 0.6238, + "step": 7984 + }, + { + "epoch": 10.250320924261874, + "grad_norm": 2.91156005859375, + "learning_rate": 2.991869918699187e-05, + "loss": 0.6664, + "step": 7985 + }, + { + "epoch": 10.25160462130937, + "grad_norm": 3.3958828449249268, + "learning_rate": 2.991827128797604e-05, + "loss": 0.6371, + 
"step": 7986 + }, + { + "epoch": 10.252888318356868, + "grad_norm": 6.037657737731934, + "learning_rate": 2.9917843388960207e-05, + "loss": 0.7088, + "step": 7987 + }, + { + "epoch": 10.254172015404365, + "grad_norm": 2.4369921684265137, + "learning_rate": 2.9917415489944372e-05, + "loss": 0.6791, + "step": 7988 + }, + { + "epoch": 10.25545571245186, + "grad_norm": 3.2881534099578857, + "learning_rate": 2.9916987590928544e-05, + "loss": 0.866, + "step": 7989 + }, + { + "epoch": 10.256739409499358, + "grad_norm": 1.7530529499053955, + "learning_rate": 2.991655969191271e-05, + "loss": 0.8249, + "step": 7990 + }, + { + "epoch": 10.258023106546855, + "grad_norm": 1.4782321453094482, + "learning_rate": 2.9916131792896877e-05, + "loss": 0.5732, + "step": 7991 + }, + { + "epoch": 10.259306803594352, + "grad_norm": 2.105032205581665, + "learning_rate": 2.9915703893881046e-05, + "loss": 0.5577, + "step": 7992 + }, + { + "epoch": 10.260590500641849, + "grad_norm": 2.9680604934692383, + "learning_rate": 2.991527599486521e-05, + "loss": 0.5878, + "step": 7993 + }, + { + "epoch": 10.261874197689345, + "grad_norm": 2.7221641540527344, + "learning_rate": 2.9914848095849383e-05, + "loss": 0.5827, + "step": 7994 + }, + { + "epoch": 10.263157894736842, + "grad_norm": 0.9597245454788208, + "learning_rate": 2.9914420196833548e-05, + "loss": 0.5846, + "step": 7995 + }, + { + "epoch": 10.264441591784339, + "grad_norm": 2.2599570751190186, + "learning_rate": 2.9913992297817716e-05, + "loss": 0.5813, + "step": 7996 + }, + { + "epoch": 10.265725288831836, + "grad_norm": 1.5909711122512817, + "learning_rate": 2.9913564398801884e-05, + "loss": 0.5706, + "step": 7997 + }, + { + "epoch": 10.267008985879333, + "grad_norm": 1.2576746940612793, + "learning_rate": 2.9913136499786053e-05, + "loss": 0.5407, + "step": 7998 + }, + { + "epoch": 10.268292682926829, + "grad_norm": 2.399996757507324, + "learning_rate": 2.9912708600770218e-05, + "loss": 0.5806, + "step": 7999 + }, + { + "epoch": 
10.269576379974326, + "grad_norm": 1.3222578763961792, + "learning_rate": 2.9912280701754386e-05, + "loss": 0.5799, + "step": 8000 + }, + { + "epoch": 10.269576379974326, + "eval_cer": 0.31026042686508715, + "eval_loss": 0.5903372764587402, + "eval_runtime": 14.2461, + "eval_samples_per_second": 69.001, + "eval_steps_per_second": 0.491, + "eval_wer": 0.5309562027058015, + "step": 8000 + }, + { + "epoch": 10.270860077021823, + "grad_norm": 2.7623231410980225, + "learning_rate": 2.9911852802738555e-05, + "loss": 0.5616, + "step": 8001 + }, + { + "epoch": 10.27214377406932, + "grad_norm": 1.5522409677505493, + "learning_rate": 2.991142490372272e-05, + "loss": 0.6161, + "step": 8002 + }, + { + "epoch": 10.273427471116817, + "grad_norm": 1.9504414796829224, + "learning_rate": 2.991099700470689e-05, + "loss": 0.586, + "step": 8003 + }, + { + "epoch": 10.274711168164313, + "grad_norm": 1.8507404327392578, + "learning_rate": 2.9910569105691057e-05, + "loss": 0.5922, + "step": 8004 + }, + { + "epoch": 10.27599486521181, + "grad_norm": 1.540136694908142, + "learning_rate": 2.991014120667523e-05, + "loss": 0.586, + "step": 8005 + }, + { + "epoch": 10.277278562259307, + "grad_norm": 2.828375816345215, + "learning_rate": 2.9909713307659393e-05, + "loss": 0.5622, + "step": 8006 + }, + { + "epoch": 10.278562259306804, + "grad_norm": 1.4557381868362427, + "learning_rate": 2.990928540864356e-05, + "loss": 0.5817, + "step": 8007 + }, + { + "epoch": 10.2798459563543, + "grad_norm": 4.105254650115967, + "learning_rate": 2.990885750962773e-05, + "loss": 0.579, + "step": 8008 + }, + { + "epoch": 10.281129653401797, + "grad_norm": 1.5875933170318604, + "learning_rate": 2.9908429610611895e-05, + "loss": 0.5569, + "step": 8009 + }, + { + "epoch": 10.282413350449294, + "grad_norm": 1.9296358823776245, + "learning_rate": 2.9908001711596064e-05, + "loss": 0.5871, + "step": 8010 + }, + { + "epoch": 10.28369704749679, + "grad_norm": 1.0946860313415527, + "learning_rate": 2.9907573812580232e-05, 
+ "loss": 0.5711, + "step": 8011 + }, + { + "epoch": 10.284980744544288, + "grad_norm": 3.4097256660461426, + "learning_rate": 2.99071459135644e-05, + "loss": 0.5586, + "step": 8012 + }, + { + "epoch": 10.286264441591785, + "grad_norm": 4.485118389129639, + "learning_rate": 2.990671801454857e-05, + "loss": 0.5653, + "step": 8013 + }, + { + "epoch": 10.28754813863928, + "grad_norm": 4.1788716316223145, + "learning_rate": 2.9906290115532734e-05, + "loss": 0.6046, + "step": 8014 + }, + { + "epoch": 10.288831835686779, + "grad_norm": 2.0179028511047363, + "learning_rate": 2.9905862216516902e-05, + "loss": 0.5734, + "step": 8015 + }, + { + "epoch": 10.290115532734275, + "grad_norm": 1.5892220735549927, + "learning_rate": 2.990543431750107e-05, + "loss": 0.6024, + "step": 8016 + }, + { + "epoch": 10.29139922978177, + "grad_norm": 2.207704782485962, + "learning_rate": 2.990500641848524e-05, + "loss": 0.5491, + "step": 8017 + }, + { + "epoch": 10.292682926829269, + "grad_norm": 2.5607380867004395, + "learning_rate": 2.9904578519469404e-05, + "loss": 0.5657, + "step": 8018 + }, + { + "epoch": 10.293966623876765, + "grad_norm": 1.4802864789962769, + "learning_rate": 2.9904150620453573e-05, + "loss": 0.5747, + "step": 8019 + }, + { + "epoch": 10.295250320924263, + "grad_norm": 1.4979679584503174, + "learning_rate": 2.990372272143774e-05, + "loss": 0.5501, + "step": 8020 + }, + { + "epoch": 10.296534017971759, + "grad_norm": 5.501384735107422, + "learning_rate": 2.990329482242191e-05, + "loss": 0.6076, + "step": 8021 + }, + { + "epoch": 10.297817715019255, + "grad_norm": 1.8016738891601562, + "learning_rate": 2.9902866923406078e-05, + "loss": 0.5744, + "step": 8022 + }, + { + "epoch": 10.299101412066753, + "grad_norm": 4.216931343078613, + "learning_rate": 2.9902439024390243e-05, + "loss": 0.5937, + "step": 8023 + }, + { + "epoch": 10.300385109114249, + "grad_norm": 5.316683769226074, + "learning_rate": 2.9902011125374415e-05, + "loss": 0.6229, + "step": 8024 + }, + { + 
"epoch": 10.301668806161747, + "grad_norm": 1.786041498184204, + "learning_rate": 2.990158322635858e-05, + "loss": 0.5743, + "step": 8025 + }, + { + "epoch": 10.302952503209243, + "grad_norm": 1.569594383239746, + "learning_rate": 2.9901155327342745e-05, + "loss": 0.6142, + "step": 8026 + }, + { + "epoch": 10.304236200256739, + "grad_norm": 1.8650394678115845, + "learning_rate": 2.9900727428326916e-05, + "loss": 0.6227, + "step": 8027 + }, + { + "epoch": 10.305519897304237, + "grad_norm": 2.6123201847076416, + "learning_rate": 2.990029952931108e-05, + "loss": 0.5861, + "step": 8028 + }, + { + "epoch": 10.306803594351733, + "grad_norm": 1.8202483654022217, + "learning_rate": 2.9899871630295253e-05, + "loss": 0.6673, + "step": 8029 + }, + { + "epoch": 10.30808729139923, + "grad_norm": 1.1605957746505737, + "learning_rate": 2.9899443731279418e-05, + "loss": 0.6116, + "step": 8030 + }, + { + "epoch": 10.309370988446727, + "grad_norm": 1.7154638767242432, + "learning_rate": 2.9899015832263587e-05, + "loss": 0.6408, + "step": 8031 + }, + { + "epoch": 10.310654685494223, + "grad_norm": 1.5428273677825928, + "learning_rate": 2.9898587933247755e-05, + "loss": 0.6481, + "step": 8032 + }, + { + "epoch": 10.31193838254172, + "grad_norm": 2.1653666496276855, + "learning_rate": 2.989816003423192e-05, + "loss": 0.6398, + "step": 8033 + }, + { + "epoch": 10.313222079589217, + "grad_norm": 0.9693167805671692, + "learning_rate": 2.989773213521609e-05, + "loss": 0.5867, + "step": 8034 + }, + { + "epoch": 10.314505776636715, + "grad_norm": 1.3643549680709839, + "learning_rate": 2.9897304236200257e-05, + "loss": 0.6037, + "step": 8035 + }, + { + "epoch": 10.31578947368421, + "grad_norm": 2.2282767295837402, + "learning_rate": 2.9896876337184425e-05, + "loss": 0.6391, + "step": 8036 + }, + { + "epoch": 10.317073170731707, + "grad_norm": 5.888067245483398, + "learning_rate": 2.9896448438168594e-05, + "loss": 0.6839, + "step": 8037 + }, + { + "epoch": 10.318356867779205, + "grad_norm": 
2.7501649856567383, + "learning_rate": 2.9896020539152762e-05, + "loss": 0.7271, + "step": 8038 + }, + { + "epoch": 10.3196405648267, + "grad_norm": 3.220999002456665, + "learning_rate": 2.9895592640136927e-05, + "loss": 0.7105, + "step": 8039 + }, + { + "epoch": 10.320924261874199, + "grad_norm": 5.737166404724121, + "learning_rate": 2.9895164741121096e-05, + "loss": 0.8068, + "step": 8040 + }, + { + "epoch": 10.322207958921695, + "grad_norm": 2.8022308349609375, + "learning_rate": 2.9894736842105264e-05, + "loss": 0.5767, + "step": 8041 + }, + { + "epoch": 10.32349165596919, + "grad_norm": 3.571486711502075, + "learning_rate": 2.989430894308943e-05, + "loss": 0.5543, + "step": 8042 + }, + { + "epoch": 10.324775353016689, + "grad_norm": 1.5219937562942505, + "learning_rate": 2.98938810440736e-05, + "loss": 0.5933, + "step": 8043 + }, + { + "epoch": 10.326059050064185, + "grad_norm": 1.8901126384735107, + "learning_rate": 2.9893453145057766e-05, + "loss": 0.6133, + "step": 8044 + }, + { + "epoch": 10.327342747111683, + "grad_norm": 3.3182828426361084, + "learning_rate": 2.9893025246041938e-05, + "loss": 0.5814, + "step": 8045 + }, + { + "epoch": 10.328626444159179, + "grad_norm": 2.0889971256256104, + "learning_rate": 2.9892597347026103e-05, + "loss": 0.5823, + "step": 8046 + }, + { + "epoch": 10.329910141206675, + "grad_norm": 1.6485737562179565, + "learning_rate": 2.9892169448010268e-05, + "loss": 0.5444, + "step": 8047 + }, + { + "epoch": 10.331193838254173, + "grad_norm": 2.9728784561157227, + "learning_rate": 2.989174154899444e-05, + "loss": 0.5621, + "step": 8048 + }, + { + "epoch": 10.332477535301669, + "grad_norm": 1.9879082441329956, + "learning_rate": 2.9891313649978605e-05, + "loss": 0.5674, + "step": 8049 + }, + { + "epoch": 10.333761232349165, + "grad_norm": 5.530394554138184, + "learning_rate": 2.9890885750962773e-05, + "loss": 0.5681, + "step": 8050 + }, + { + "epoch": 10.335044929396663, + "grad_norm": 1.3650444746017456, + "learning_rate": 
2.989045785194694e-05, + "loss": 0.5176, + "step": 8051 + }, + { + "epoch": 10.336328626444159, + "grad_norm": 0.8467490077018738, + "learning_rate": 2.989002995293111e-05, + "loss": 0.5503, + "step": 8052 + }, + { + "epoch": 10.337612323491657, + "grad_norm": 4.402980804443359, + "learning_rate": 2.9889602053915278e-05, + "loss": 0.5516, + "step": 8053 + }, + { + "epoch": 10.338896020539153, + "grad_norm": 1.6617966890335083, + "learning_rate": 2.9889174154899443e-05, + "loss": 0.5743, + "step": 8054 + }, + { + "epoch": 10.340179717586649, + "grad_norm": 2.664321184158325, + "learning_rate": 2.988874625588361e-05, + "loss": 0.638, + "step": 8055 + }, + { + "epoch": 10.341463414634147, + "grad_norm": 1.6459609270095825, + "learning_rate": 2.988831835686778e-05, + "loss": 0.5422, + "step": 8056 + }, + { + "epoch": 10.342747111681643, + "grad_norm": 1.3915797472000122, + "learning_rate": 2.988789045785195e-05, + "loss": 0.5713, + "step": 8057 + }, + { + "epoch": 10.34403080872914, + "grad_norm": 2.689401626586914, + "learning_rate": 2.9887462558836113e-05, + "loss": 0.5467, + "step": 8058 + }, + { + "epoch": 10.345314505776637, + "grad_norm": 1.7682440280914307, + "learning_rate": 2.9887034659820285e-05, + "loss": 0.5837, + "step": 8059 + }, + { + "epoch": 10.346598202824133, + "grad_norm": 1.7339364290237427, + "learning_rate": 2.988660676080445e-05, + "loss": 0.5556, + "step": 8060 + }, + { + "epoch": 10.34788189987163, + "grad_norm": 3.6081576347351074, + "learning_rate": 2.988617886178862e-05, + "loss": 0.5656, + "step": 8061 + }, + { + "epoch": 10.349165596919127, + "grad_norm": 1.95755934715271, + "learning_rate": 2.9885750962772787e-05, + "loss": 0.5568, + "step": 8062 + }, + { + "epoch": 10.350449293966625, + "grad_norm": 1.9817806482315063, + "learning_rate": 2.9885323063756952e-05, + "loss": 0.5818, + "step": 8063 + }, + { + "epoch": 10.35173299101412, + "grad_norm": 2.2233030796051025, + "learning_rate": 2.9884895164741124e-05, + "loss": 0.5477, + "step": 
8064 + }, + { + "epoch": 10.353016688061617, + "grad_norm": 1.602465271949768, + "learning_rate": 2.988446726572529e-05, + "loss": 0.6047, + "step": 8065 + }, + { + "epoch": 10.354300385109115, + "grad_norm": 3.0976693630218506, + "learning_rate": 2.9884039366709457e-05, + "loss": 0.5983, + "step": 8066 + }, + { + "epoch": 10.35558408215661, + "grad_norm": 37.8655891418457, + "learning_rate": 2.9883611467693626e-05, + "loss": 0.5571, + "step": 8067 + }, + { + "epoch": 10.356867779204109, + "grad_norm": 1.472129464149475, + "learning_rate": 2.988318356867779e-05, + "loss": 0.5448, + "step": 8068 + }, + { + "epoch": 10.358151476251605, + "grad_norm": 1.6258742809295654, + "learning_rate": 2.9882755669661963e-05, + "loss": 0.603, + "step": 8069 + }, + { + "epoch": 10.3594351732991, + "grad_norm": 1.8450994491577148, + "learning_rate": 2.9882327770646128e-05, + "loss": 0.5365, + "step": 8070 + }, + { + "epoch": 10.360718870346599, + "grad_norm": 1.6038541793823242, + "learning_rate": 2.9881899871630296e-05, + "loss": 0.5676, + "step": 8071 + }, + { + "epoch": 10.362002567394095, + "grad_norm": 4.347499370574951, + "learning_rate": 2.9881471972614464e-05, + "loss": 0.5895, + "step": 8072 + }, + { + "epoch": 10.363286264441593, + "grad_norm": 4.6467108726501465, + "learning_rate": 2.9881044073598633e-05, + "loss": 0.5749, + "step": 8073 + }, + { + "epoch": 10.364569961489089, + "grad_norm": 2.894928455352783, + "learning_rate": 2.9880616174582798e-05, + "loss": 0.6225, + "step": 8074 + }, + { + "epoch": 10.365853658536585, + "grad_norm": 1.443547010421753, + "learning_rate": 2.9880188275566966e-05, + "loss": 0.6239, + "step": 8075 + }, + { + "epoch": 10.367137355584083, + "grad_norm": 1.2525216341018677, + "learning_rate": 2.9879760376551135e-05, + "loss": 0.5988, + "step": 8076 + }, + { + "epoch": 10.368421052631579, + "grad_norm": 2.549089193344116, + "learning_rate": 2.9879332477535303e-05, + "loss": 0.5986, + "step": 8077 + }, + { + "epoch": 10.369704749679077, + 
"grad_norm": 4.155922889709473, + "learning_rate": 2.987890457851947e-05, + "loss": 0.5191, + "step": 8078 + }, + { + "epoch": 10.370988446726573, + "grad_norm": 3.636923313140869, + "learning_rate": 2.9878476679503637e-05, + "loss": 0.6165, + "step": 8079 + }, + { + "epoch": 10.372272143774069, + "grad_norm": 2.711668014526367, + "learning_rate": 2.9878048780487805e-05, + "loss": 0.6292, + "step": 8080 + }, + { + "epoch": 10.373555840821567, + "grad_norm": 4.970777988433838, + "learning_rate": 2.9877620881471973e-05, + "loss": 0.5992, + "step": 8081 + }, + { + "epoch": 10.374839537869063, + "grad_norm": 1.959230661392212, + "learning_rate": 2.987719298245614e-05, + "loss": 0.6358, + "step": 8082 + }, + { + "epoch": 10.376123234916559, + "grad_norm": 2.6841673851013184, + "learning_rate": 2.987676508344031e-05, + "loss": 0.5749, + "step": 8083 + }, + { + "epoch": 10.377406931964057, + "grad_norm": 2.166226863861084, + "learning_rate": 2.9876337184424475e-05, + "loss": 0.5881, + "step": 8084 + }, + { + "epoch": 10.378690629011553, + "grad_norm": 3.6664369106292725, + "learning_rate": 2.9875909285408647e-05, + "loss": 0.6245, + "step": 8085 + }, + { + "epoch": 10.37997432605905, + "grad_norm": 2.9168872833251953, + "learning_rate": 2.9875481386392812e-05, + "loss": 0.6375, + "step": 8086 + }, + { + "epoch": 10.381258023106547, + "grad_norm": 2.002855062484741, + "learning_rate": 2.9875053487376977e-05, + "loss": 0.6294, + "step": 8087 + }, + { + "epoch": 10.382541720154043, + "grad_norm": 1.8464926481246948, + "learning_rate": 2.987462558836115e-05, + "loss": 0.6685, + "step": 8088 + }, + { + "epoch": 10.38382541720154, + "grad_norm": 2.4992058277130127, + "learning_rate": 2.9874197689345314e-05, + "loss": 0.6704, + "step": 8089 + }, + { + "epoch": 10.385109114249037, + "grad_norm": 3.0584816932678223, + "learning_rate": 2.9873769790329482e-05, + "loss": 0.8431, + "step": 8090 + }, + { + "epoch": 10.386392811296535, + "grad_norm": 1.5086008310317993, + 
"learning_rate": 2.987334189131365e-05, + "loss": 0.5444, + "step": 8091 + }, + { + "epoch": 10.38767650834403, + "grad_norm": 1.7353837490081787, + "learning_rate": 2.987291399229782e-05, + "loss": 0.4986, + "step": 8092 + }, + { + "epoch": 10.388960205391527, + "grad_norm": 0.9804710745811462, + "learning_rate": 2.9872486093281988e-05, + "loss": 0.5449, + "step": 8093 + }, + { + "epoch": 10.390243902439025, + "grad_norm": 0.9125864505767822, + "learning_rate": 2.9872058194266153e-05, + "loss": 0.5529, + "step": 8094 + }, + { + "epoch": 10.39152759948652, + "grad_norm": 1.9083303213119507, + "learning_rate": 2.987163029525032e-05, + "loss": 0.518, + "step": 8095 + }, + { + "epoch": 10.392811296534019, + "grad_norm": 1.8743259906768799, + "learning_rate": 2.987120239623449e-05, + "loss": 0.5612, + "step": 8096 + }, + { + "epoch": 10.394094993581515, + "grad_norm": 13.161883354187012, + "learning_rate": 2.9870774497218658e-05, + "loss": 0.5492, + "step": 8097 + }, + { + "epoch": 10.39537869062901, + "grad_norm": 7.526561737060547, + "learning_rate": 2.9870346598202823e-05, + "loss": 0.5734, + "step": 8098 + }, + { + "epoch": 10.396662387676509, + "grad_norm": 4.834844589233398, + "learning_rate": 2.9869918699186995e-05, + "loss": 0.5834, + "step": 8099 + }, + { + "epoch": 10.397946084724005, + "grad_norm": 2.808925151824951, + "learning_rate": 2.986949080017116e-05, + "loss": 0.5687, + "step": 8100 + }, + { + "epoch": 10.399229781771503, + "grad_norm": 3.031048536300659, + "learning_rate": 2.9869062901155328e-05, + "loss": 0.5669, + "step": 8101 + }, + { + "epoch": 10.400513478818999, + "grad_norm": 1.4276243448257446, + "learning_rate": 2.9868635002139496e-05, + "loss": 0.5746, + "step": 8102 + }, + { + "epoch": 10.401797175866495, + "grad_norm": 1.5123589038848877, + "learning_rate": 2.986820710312366e-05, + "loss": 0.5589, + "step": 8103 + }, + { + "epoch": 10.403080872913993, + "grad_norm": 4.578408718109131, + "learning_rate": 2.9867779204107833e-05, + "loss": 
0.583, + "step": 8104 + }, + { + "epoch": 10.404364569961489, + "grad_norm": 2.8406314849853516, + "learning_rate": 2.9867351305092e-05, + "loss": 0.5535, + "step": 8105 + }, + { + "epoch": 10.405648267008987, + "grad_norm": 6.6888346672058105, + "learning_rate": 2.9866923406076167e-05, + "loss": 0.5947, + "step": 8106 + }, + { + "epoch": 10.406931964056483, + "grad_norm": 5.468329429626465, + "learning_rate": 2.9866495507060335e-05, + "loss": 0.6024, + "step": 8107 + }, + { + "epoch": 10.408215661103979, + "grad_norm": 1.817865252494812, + "learning_rate": 2.98660676080445e-05, + "loss": 0.5518, + "step": 8108 + }, + { + "epoch": 10.409499358151477, + "grad_norm": 1.865487813949585, + "learning_rate": 2.9865639709028672e-05, + "loss": 0.5653, + "step": 8109 + }, + { + "epoch": 10.410783055198973, + "grad_norm": 2.248302459716797, + "learning_rate": 2.9865211810012837e-05, + "loss": 0.6097, + "step": 8110 + }, + { + "epoch": 10.41206675224647, + "grad_norm": 2.9984264373779297, + "learning_rate": 2.9864783910997005e-05, + "loss": 0.5407, + "step": 8111 + }, + { + "epoch": 10.413350449293967, + "grad_norm": 2.9384000301361084, + "learning_rate": 2.9864356011981174e-05, + "loss": 0.5995, + "step": 8112 + }, + { + "epoch": 10.414634146341463, + "grad_norm": 11.338271141052246, + "learning_rate": 2.9863928112965342e-05, + "loss": 0.6036, + "step": 8113 + }, + { + "epoch": 10.41591784338896, + "grad_norm": 1.4983301162719727, + "learning_rate": 2.9863500213949507e-05, + "loss": 0.5499, + "step": 8114 + }, + { + "epoch": 10.417201540436457, + "grad_norm": 2.7834465503692627, + "learning_rate": 2.9863072314933676e-05, + "loss": 0.553, + "step": 8115 + }, + { + "epoch": 10.418485237483953, + "grad_norm": 8.241930961608887, + "learning_rate": 2.9862644415917844e-05, + "loss": 0.5885, + "step": 8116 + }, + { + "epoch": 10.41976893453145, + "grad_norm": 1.0080665349960327, + "learning_rate": 2.9862216516902013e-05, + "loss": 0.5765, + "step": 8117 + }, + { + "epoch": 
10.421052631578947, + "grad_norm": 0.8324804902076721, + "learning_rate": 2.986178861788618e-05, + "loss": 0.5483, + "step": 8118 + }, + { + "epoch": 10.422336328626445, + "grad_norm": 1.3383874893188477, + "learning_rate": 2.9861360718870346e-05, + "loss": 0.577, + "step": 8119 + }, + { + "epoch": 10.42362002567394, + "grad_norm": 3.169757843017578, + "learning_rate": 2.9860932819854518e-05, + "loss": 0.6336, + "step": 8120 + }, + { + "epoch": 10.424903722721437, + "grad_norm": 3.760767698287964, + "learning_rate": 2.9860504920838683e-05, + "loss": 0.6143, + "step": 8121 + }, + { + "epoch": 10.426187419768935, + "grad_norm": 1.8493478298187256, + "learning_rate": 2.9860077021822848e-05, + "loss": 0.6082, + "step": 8122 + }, + { + "epoch": 10.427471116816431, + "grad_norm": 1.4063153266906738, + "learning_rate": 2.985964912280702e-05, + "loss": 0.5924, + "step": 8123 + }, + { + "epoch": 10.428754813863929, + "grad_norm": 10.104106903076172, + "learning_rate": 2.9859221223791185e-05, + "loss": 0.5961, + "step": 8124 + }, + { + "epoch": 10.430038510911425, + "grad_norm": 2.8268346786499023, + "learning_rate": 2.9858793324775356e-05, + "loss": 0.6284, + "step": 8125 + }, + { + "epoch": 10.431322207958921, + "grad_norm": 1.5588897466659546, + "learning_rate": 2.985836542575952e-05, + "loss": 0.6047, + "step": 8126 + }, + { + "epoch": 10.432605905006419, + "grad_norm": 4.142857074737549, + "learning_rate": 2.985793752674369e-05, + "loss": 0.6095, + "step": 8127 + }, + { + "epoch": 10.433889602053915, + "grad_norm": 1.1956669092178345, + "learning_rate": 2.9857509627727858e-05, + "loss": 0.5998, + "step": 8128 + }, + { + "epoch": 10.435173299101413, + "grad_norm": 1.4033172130584717, + "learning_rate": 2.9857081728712023e-05, + "loss": 0.618, + "step": 8129 + }, + { + "epoch": 10.436456996148909, + "grad_norm": 1.85698401927948, + "learning_rate": 2.9856653829696192e-05, + "loss": 0.5589, + "step": 8130 + }, + { + "epoch": 10.437740693196405, + "grad_norm": 
1.42831552028656, + "learning_rate": 2.985622593068036e-05, + "loss": 0.5419, + "step": 8131 + }, + { + "epoch": 10.439024390243903, + "grad_norm": 3.1970267295837402, + "learning_rate": 2.985579803166453e-05, + "loss": 0.6508, + "step": 8132 + }, + { + "epoch": 10.440308087291399, + "grad_norm": 1.8344039916992188, + "learning_rate": 2.9855370132648697e-05, + "loss": 0.6537, + "step": 8133 + }, + { + "epoch": 10.441591784338897, + "grad_norm": 1.9044233560562134, + "learning_rate": 2.9854942233632865e-05, + "loss": 0.607, + "step": 8134 + }, + { + "epoch": 10.442875481386393, + "grad_norm": 5.428215980529785, + "learning_rate": 2.985451433461703e-05, + "loss": 0.6341, + "step": 8135 + }, + { + "epoch": 10.444159178433889, + "grad_norm": 2.8215770721435547, + "learning_rate": 2.98540864356012e-05, + "loss": 0.6503, + "step": 8136 + }, + { + "epoch": 10.445442875481387, + "grad_norm": 1.5934809446334839, + "learning_rate": 2.9853658536585367e-05, + "loss": 0.6049, + "step": 8137 + }, + { + "epoch": 10.446726572528883, + "grad_norm": 3.683335304260254, + "learning_rate": 2.9853230637569532e-05, + "loss": 0.6632, + "step": 8138 + }, + { + "epoch": 10.44801026957638, + "grad_norm": 23.083293914794922, + "learning_rate": 2.9852802738553704e-05, + "loss": 0.7498, + "step": 8139 + }, + { + "epoch": 10.449293966623877, + "grad_norm": 2.577195882797241, + "learning_rate": 2.985237483953787e-05, + "loss": 0.8272, + "step": 8140 + }, + { + "epoch": 10.450577663671373, + "grad_norm": 3.127744436264038, + "learning_rate": 2.9851946940522037e-05, + "loss": 0.5416, + "step": 8141 + }, + { + "epoch": 10.45186136071887, + "grad_norm": 2.148127555847168, + "learning_rate": 2.9851519041506206e-05, + "loss": 0.5539, + "step": 8142 + }, + { + "epoch": 10.453145057766367, + "grad_norm": 2.2173094749450684, + "learning_rate": 2.985109114249037e-05, + "loss": 0.603, + "step": 8143 + }, + { + "epoch": 10.454428754813865, + "grad_norm": 1.6229606866836548, + "learning_rate": 
2.9850663243474543e-05, + "loss": 0.5603, + "step": 8144 + }, + { + "epoch": 10.455712451861361, + "grad_norm": 0.883333146572113, + "learning_rate": 2.9850235344458708e-05, + "loss": 0.5541, + "step": 8145 + }, + { + "epoch": 10.456996148908857, + "grad_norm": 1.2137188911437988, + "learning_rate": 2.9849807445442876e-05, + "loss": 0.5743, + "step": 8146 + }, + { + "epoch": 10.458279845956355, + "grad_norm": 2.2518584728240967, + "learning_rate": 2.9849379546427045e-05, + "loss": 0.5755, + "step": 8147 + }, + { + "epoch": 10.459563543003851, + "grad_norm": 1.6335643529891968, + "learning_rate": 2.984895164741121e-05, + "loss": 0.5516, + "step": 8148 + }, + { + "epoch": 10.460847240051347, + "grad_norm": 2.0595216751098633, + "learning_rate": 2.984852374839538e-05, + "loss": 0.5526, + "step": 8149 + }, + { + "epoch": 10.462130937098845, + "grad_norm": 1.0603996515274048, + "learning_rate": 2.9848095849379546e-05, + "loss": 0.542, + "step": 8150 + }, + { + "epoch": 10.463414634146341, + "grad_norm": 0.8561428189277649, + "learning_rate": 2.9847667950363715e-05, + "loss": 0.5431, + "step": 8151 + }, + { + "epoch": 10.464698331193839, + "grad_norm": 1.7541794776916504, + "learning_rate": 2.9847240051347883e-05, + "loss": 0.5607, + "step": 8152 + }, + { + "epoch": 10.465982028241335, + "grad_norm": 2.161956548690796, + "learning_rate": 2.984681215233205e-05, + "loss": 0.5585, + "step": 8153 + }, + { + "epoch": 10.467265725288831, + "grad_norm": 1.4051655530929565, + "learning_rate": 2.9846384253316217e-05, + "loss": 0.5615, + "step": 8154 + }, + { + "epoch": 10.468549422336329, + "grad_norm": 0.9188032150268555, + "learning_rate": 2.9845956354300385e-05, + "loss": 0.5396, + "step": 8155 + }, + { + "epoch": 10.469833119383825, + "grad_norm": 1.3122061491012573, + "learning_rate": 2.9845528455284553e-05, + "loss": 0.6026, + "step": 8156 + }, + { + "epoch": 10.471116816431323, + "grad_norm": 3.789307117462158, + "learning_rate": 2.9845100556268722e-05, + "loss": 0.6426, + 
"step": 8157 + }, + { + "epoch": 10.472400513478819, + "grad_norm": 3.0354952812194824, + "learning_rate": 2.984467265725289e-05, + "loss": 0.5783, + "step": 8158 + }, + { + "epoch": 10.473684210526315, + "grad_norm": 2.118199348449707, + "learning_rate": 2.9844244758237055e-05, + "loss": 0.5726, + "step": 8159 + }, + { + "epoch": 10.474967907573813, + "grad_norm": 2.0850725173950195, + "learning_rate": 2.9843816859221227e-05, + "loss": 0.5452, + "step": 8160 + }, + { + "epoch": 10.476251604621309, + "grad_norm": 1.7494895458221436, + "learning_rate": 2.9843388960205392e-05, + "loss": 0.5753, + "step": 8161 + }, + { + "epoch": 10.477535301668807, + "grad_norm": 1.924140453338623, + "learning_rate": 2.9842961061189557e-05, + "loss": 0.5706, + "step": 8162 + }, + { + "epoch": 10.478818998716303, + "grad_norm": 2.8878731727600098, + "learning_rate": 2.984253316217373e-05, + "loss": 0.5842, + "step": 8163 + }, + { + "epoch": 10.480102695763799, + "grad_norm": 1.5170583724975586, + "learning_rate": 2.9842105263157894e-05, + "loss": 0.5702, + "step": 8164 + }, + { + "epoch": 10.481386392811297, + "grad_norm": 1.2878919839859009, + "learning_rate": 2.9841677364142066e-05, + "loss": 0.5377, + "step": 8165 + }, + { + "epoch": 10.482670089858793, + "grad_norm": 2.0681915283203125, + "learning_rate": 2.984124946512623e-05, + "loss": 0.6053, + "step": 8166 + }, + { + "epoch": 10.48395378690629, + "grad_norm": 1.539642333984375, + "learning_rate": 2.98408215661104e-05, + "loss": 0.5726, + "step": 8167 + }, + { + "epoch": 10.485237483953787, + "grad_norm": 5.2145185470581055, + "learning_rate": 2.9840393667094568e-05, + "loss": 0.5695, + "step": 8168 + }, + { + "epoch": 10.486521181001283, + "grad_norm": 1.7772547006607056, + "learning_rate": 2.9839965768078733e-05, + "loss": 0.5847, + "step": 8169 + }, + { + "epoch": 10.487804878048781, + "grad_norm": 1.5451653003692627, + "learning_rate": 2.98395378690629e-05, + "loss": 0.5832, + "step": 8170 + }, + { + "epoch": 
10.489088575096277, + "grad_norm": 2.352410078048706, + "learning_rate": 2.983910997004707e-05, + "loss": 0.6397, + "step": 8171 + }, + { + "epoch": 10.490372272143775, + "grad_norm": 1.412949562072754, + "learning_rate": 2.9838682071031238e-05, + "loss": 0.5723, + "step": 8172 + }, + { + "epoch": 10.491655969191271, + "grad_norm": 1.262287974357605, + "learning_rate": 2.9838254172015406e-05, + "loss": 0.6284, + "step": 8173 + }, + { + "epoch": 10.492939666238767, + "grad_norm": 1.154021978378296, + "learning_rate": 2.9837826272999575e-05, + "loss": 0.6386, + "step": 8174 + }, + { + "epoch": 10.494223363286265, + "grad_norm": 2.816239833831787, + "learning_rate": 2.983739837398374e-05, + "loss": 0.5919, + "step": 8175 + }, + { + "epoch": 10.495507060333761, + "grad_norm": 3.4286513328552246, + "learning_rate": 2.9836970474967908e-05, + "loss": 0.5737, + "step": 8176 + }, + { + "epoch": 10.496790757381259, + "grad_norm": 0.9800030589103699, + "learning_rate": 2.9836542575952077e-05, + "loss": 0.613, + "step": 8177 + }, + { + "epoch": 10.498074454428755, + "grad_norm": 1.501816987991333, + "learning_rate": 2.983611467693624e-05, + "loss": 0.6351, + "step": 8178 + }, + { + "epoch": 10.499358151476251, + "grad_norm": 2.2793939113616943, + "learning_rate": 2.9835686777920413e-05, + "loss": 0.5963, + "step": 8179 + }, + { + "epoch": 10.500641848523749, + "grad_norm": 1.3568907976150513, + "learning_rate": 2.983525887890458e-05, + "loss": 0.6302, + "step": 8180 + }, + { + "epoch": 10.501925545571245, + "grad_norm": 2.3090548515319824, + "learning_rate": 2.983483097988875e-05, + "loss": 0.603, + "step": 8181 + }, + { + "epoch": 10.503209242618741, + "grad_norm": 2.389230489730835, + "learning_rate": 2.9834403080872915e-05, + "loss": 0.6093, + "step": 8182 + }, + { + "epoch": 10.504492939666239, + "grad_norm": 2.5670766830444336, + "learning_rate": 2.983397518185708e-05, + "loss": 0.6908, + "step": 8183 + }, + { + "epoch": 10.505776636713735, + "grad_norm": 
1.643941044807434, + "learning_rate": 2.9833547282841252e-05, + "loss": 0.6312, + "step": 8184 + }, + { + "epoch": 10.507060333761233, + "grad_norm": 3.4562807083129883, + "learning_rate": 2.9833119383825417e-05, + "loss": 0.6609, + "step": 8185 + }, + { + "epoch": 10.508344030808729, + "grad_norm": 1.7108293771743774, + "learning_rate": 2.9832691484809585e-05, + "loss": 0.6174, + "step": 8186 + }, + { + "epoch": 10.509627727856225, + "grad_norm": 6.390148639678955, + "learning_rate": 2.9832263585793754e-05, + "loss": 0.6695, + "step": 8187 + }, + { + "epoch": 10.510911424903723, + "grad_norm": 2.4320967197418213, + "learning_rate": 2.9831835686777922e-05, + "loss": 0.6763, + "step": 8188 + }, + { + "epoch": 10.512195121951219, + "grad_norm": 2.76669979095459, + "learning_rate": 2.983140778776209e-05, + "loss": 0.7109, + "step": 8189 + }, + { + "epoch": 10.513478818998717, + "grad_norm": 4.718850612640381, + "learning_rate": 2.9830979888746256e-05, + "loss": 0.8457, + "step": 8190 + }, + { + "epoch": 10.514762516046213, + "grad_norm": 1.5143725872039795, + "learning_rate": 2.9830551989730424e-05, + "loss": 0.5493, + "step": 8191 + }, + { + "epoch": 10.51604621309371, + "grad_norm": 3.302055835723877, + "learning_rate": 2.9830124090714593e-05, + "loss": 0.5412, + "step": 8192 + }, + { + "epoch": 10.517329910141207, + "grad_norm": 2.352090358734131, + "learning_rate": 2.982969619169876e-05, + "loss": 0.5357, + "step": 8193 + }, + { + "epoch": 10.518613607188703, + "grad_norm": 3.478477716445923, + "learning_rate": 2.9829268292682926e-05, + "loss": 0.5363, + "step": 8194 + }, + { + "epoch": 10.519897304236201, + "grad_norm": 1.3922646045684814, + "learning_rate": 2.9828840393667098e-05, + "loss": 0.5725, + "step": 8195 + }, + { + "epoch": 10.521181001283697, + "grad_norm": 1.5412118434906006, + "learning_rate": 2.9828412494651263e-05, + "loss": 0.5406, + "step": 8196 + }, + { + "epoch": 10.522464698331193, + "grad_norm": 1.1230440139770508, + "learning_rate": 
2.982798459563543e-05, + "loss": 0.5448, + "step": 8197 + }, + { + "epoch": 10.523748395378691, + "grad_norm": 3.3479013442993164, + "learning_rate": 2.98275566966196e-05, + "loss": 0.5624, + "step": 8198 + }, + { + "epoch": 10.525032092426187, + "grad_norm": 1.7868428230285645, + "learning_rate": 2.9827128797603765e-05, + "loss": 0.5666, + "step": 8199 + }, + { + "epoch": 10.526315789473685, + "grad_norm": 3.3789222240448, + "learning_rate": 2.9826700898587936e-05, + "loss": 0.5521, + "step": 8200 + }, + { + "epoch": 10.527599486521181, + "grad_norm": 1.9537642002105713, + "learning_rate": 2.98262729995721e-05, + "loss": 0.6031, + "step": 8201 + }, + { + "epoch": 10.528883183568677, + "grad_norm": 1.8358675241470337, + "learning_rate": 2.982584510055627e-05, + "loss": 0.586, + "step": 8202 + }, + { + "epoch": 10.530166880616175, + "grad_norm": 1.9595043659210205, + "learning_rate": 2.982541720154044e-05, + "loss": 0.5431, + "step": 8203 + }, + { + "epoch": 10.531450577663671, + "grad_norm": 4.249312400817871, + "learning_rate": 2.9824989302524603e-05, + "loss": 0.6043, + "step": 8204 + }, + { + "epoch": 10.532734274711169, + "grad_norm": 1.8185003995895386, + "learning_rate": 2.9824561403508772e-05, + "loss": 0.5792, + "step": 8205 + }, + { + "epoch": 10.534017971758665, + "grad_norm": 1.0111074447631836, + "learning_rate": 2.982413350449294e-05, + "loss": 0.5194, + "step": 8206 + }, + { + "epoch": 10.535301668806161, + "grad_norm": 3.4852519035339355, + "learning_rate": 2.982370560547711e-05, + "loss": 0.5831, + "step": 8207 + }, + { + "epoch": 10.536585365853659, + "grad_norm": 1.1595711708068848, + "learning_rate": 2.9823277706461277e-05, + "loss": 0.5511, + "step": 8208 + }, + { + "epoch": 10.537869062901155, + "grad_norm": 1.8670698404312134, + "learning_rate": 2.9822849807445442e-05, + "loss": 0.5824, + "step": 8209 + }, + { + "epoch": 10.539152759948653, + "grad_norm": 1.9737694263458252, + "learning_rate": 2.982242190842961e-05, + "loss": 0.5285, + "step": 
8210 + }, + { + "epoch": 10.540436456996149, + "grad_norm": 1.368362307548523, + "learning_rate": 2.982199400941378e-05, + "loss": 0.5303, + "step": 8211 + }, + { + "epoch": 10.541720154043645, + "grad_norm": 2.795543909072876, + "learning_rate": 2.9821566110397947e-05, + "loss": 0.5492, + "step": 8212 + }, + { + "epoch": 10.543003851091143, + "grad_norm": 1.0939817428588867, + "learning_rate": 2.9821138211382112e-05, + "loss": 0.5513, + "step": 8213 + }, + { + "epoch": 10.544287548138639, + "grad_norm": 2.4725217819213867, + "learning_rate": 2.9820710312366284e-05, + "loss": 0.5833, + "step": 8214 + }, + { + "epoch": 10.545571245186135, + "grad_norm": 1.4899924993515015, + "learning_rate": 2.982028241335045e-05, + "loss": 0.595, + "step": 8215 + }, + { + "epoch": 10.546854942233633, + "grad_norm": 1.6153597831726074, + "learning_rate": 2.9819854514334618e-05, + "loss": 0.5978, + "step": 8216 + }, + { + "epoch": 10.54813863928113, + "grad_norm": 1.1485682725906372, + "learning_rate": 2.9819426615318786e-05, + "loss": 0.6094, + "step": 8217 + }, + { + "epoch": 10.549422336328627, + "grad_norm": 1.5665003061294556, + "learning_rate": 2.981899871630295e-05, + "loss": 0.5864, + "step": 8218 + }, + { + "epoch": 10.550706033376123, + "grad_norm": 3.3055431842803955, + "learning_rate": 2.9818570817287123e-05, + "loss": 0.587, + "step": 8219 + }, + { + "epoch": 10.55198973042362, + "grad_norm": 2.063412666320801, + "learning_rate": 2.9818142918271288e-05, + "loss": 0.5747, + "step": 8220 + }, + { + "epoch": 10.553273427471117, + "grad_norm": 2.4140679836273193, + "learning_rate": 2.9817715019255456e-05, + "loss": 0.5673, + "step": 8221 + }, + { + "epoch": 10.554557124518613, + "grad_norm": 0.8976830840110779, + "learning_rate": 2.9817287120239625e-05, + "loss": 0.5851, + "step": 8222 + }, + { + "epoch": 10.555840821566111, + "grad_norm": 2.9000375270843506, + "learning_rate": 2.981685922122379e-05, + "loss": 0.625, + "step": 8223 + }, + { + "epoch": 10.557124518613607, + 
"grad_norm": 1.751625895500183, + "learning_rate": 2.981643132220796e-05, + "loss": 0.6083, + "step": 8224 + }, + { + "epoch": 10.558408215661103, + "grad_norm": 1.2360718250274658, + "learning_rate": 2.9816003423192126e-05, + "loss": 0.5865, + "step": 8225 + }, + { + "epoch": 10.559691912708601, + "grad_norm": 8.052068710327148, + "learning_rate": 2.9815575524176295e-05, + "loss": 0.5642, + "step": 8226 + }, + { + "epoch": 10.560975609756097, + "grad_norm": 4.467218399047852, + "learning_rate": 2.9815147625160463e-05, + "loss": 0.6273, + "step": 8227 + }, + { + "epoch": 10.562259306803595, + "grad_norm": 4.177084445953369, + "learning_rate": 2.981471972614463e-05, + "loss": 0.6058, + "step": 8228 + }, + { + "epoch": 10.563543003851091, + "grad_norm": 2.8975298404693604, + "learning_rate": 2.9814291827128797e-05, + "loss": 0.596, + "step": 8229 + }, + { + "epoch": 10.564826700898587, + "grad_norm": 2.6602814197540283, + "learning_rate": 2.9813863928112965e-05, + "loss": 0.593, + "step": 8230 + }, + { + "epoch": 10.566110397946085, + "grad_norm": 2.9669411182403564, + "learning_rate": 2.9813436029097134e-05, + "loss": 0.6659, + "step": 8231 + }, + { + "epoch": 10.567394094993581, + "grad_norm": 1.9488439559936523, + "learning_rate": 2.9813008130081302e-05, + "loss": 0.618, + "step": 8232 + }, + { + "epoch": 10.568677792041079, + "grad_norm": 3.3879644870758057, + "learning_rate": 2.981258023106547e-05, + "loss": 0.6388, + "step": 8233 + }, + { + "epoch": 10.569961489088575, + "grad_norm": 4.106447696685791, + "learning_rate": 2.9812152332049635e-05, + "loss": 0.6692, + "step": 8234 + }, + { + "epoch": 10.571245186136071, + "grad_norm": 2.3561952114105225, + "learning_rate": 2.9811724433033807e-05, + "loss": 0.6576, + "step": 8235 + }, + { + "epoch": 10.572528883183569, + "grad_norm": 2.735276937484741, + "learning_rate": 2.9811296534017972e-05, + "loss": 0.6421, + "step": 8236 + }, + { + "epoch": 10.573812580231065, + "grad_norm": 2.2589259147644043, + 
"learning_rate": 2.9810868635002137e-05, + "loss": 0.6292, + "step": 8237 + }, + { + "epoch": 10.575096277278563, + "grad_norm": 3.7057740688323975, + "learning_rate": 2.981044073598631e-05, + "loss": 0.6993, + "step": 8238 + }, + { + "epoch": 10.57637997432606, + "grad_norm": 2.702697515487671, + "learning_rate": 2.9810012836970474e-05, + "loss": 0.6819, + "step": 8239 + }, + { + "epoch": 10.577663671373555, + "grad_norm": 2.9163010120391846, + "learning_rate": 2.9809584937954646e-05, + "loss": 0.8246, + "step": 8240 + }, + { + "epoch": 10.578947368421053, + "grad_norm": 2.704259157180786, + "learning_rate": 2.980915703893881e-05, + "loss": 0.5465, + "step": 8241 + }, + { + "epoch": 10.58023106546855, + "grad_norm": 1.7002429962158203, + "learning_rate": 2.980872913992298e-05, + "loss": 0.583, + "step": 8242 + }, + { + "epoch": 10.581514762516047, + "grad_norm": 2.770017623901367, + "learning_rate": 2.9808301240907148e-05, + "loss": 0.5777, + "step": 8243 + }, + { + "epoch": 10.582798459563543, + "grad_norm": 2.8674325942993164, + "learning_rate": 2.9807873341891313e-05, + "loss": 0.5269, + "step": 8244 + }, + { + "epoch": 10.58408215661104, + "grad_norm": 2.780228614807129, + "learning_rate": 2.980744544287548e-05, + "loss": 0.6204, + "step": 8245 + }, + { + "epoch": 10.585365853658537, + "grad_norm": 1.6847480535507202, + "learning_rate": 2.980701754385965e-05, + "loss": 0.5261, + "step": 8246 + }, + { + "epoch": 10.586649550706033, + "grad_norm": 1.554937481880188, + "learning_rate": 2.9806589644843818e-05, + "loss": 0.5809, + "step": 8247 + }, + { + "epoch": 10.58793324775353, + "grad_norm": 1.4871716499328613, + "learning_rate": 2.9806161745827986e-05, + "loss": 0.5508, + "step": 8248 + }, + { + "epoch": 10.589216944801027, + "grad_norm": 2.879518508911133, + "learning_rate": 2.9805733846812155e-05, + "loss": 0.5462, + "step": 8249 + }, + { + "epoch": 10.590500641848523, + "grad_norm": 3.1218209266662598, + "learning_rate": 2.980530594779632e-05, + "loss": 
0.5412, + "step": 8250 + }, + { + "epoch": 10.591784338896021, + "grad_norm": 1.5125224590301514, + "learning_rate": 2.9804878048780488e-05, + "loss": 0.5205, + "step": 8251 + }, + { + "epoch": 10.593068035943517, + "grad_norm": 3.6585299968719482, + "learning_rate": 2.9804450149764657e-05, + "loss": 0.5974, + "step": 8252 + }, + { + "epoch": 10.594351732991013, + "grad_norm": 1.2134395837783813, + "learning_rate": 2.980402225074882e-05, + "loss": 0.5192, + "step": 8253 + }, + { + "epoch": 10.595635430038511, + "grad_norm": 4.6921000480651855, + "learning_rate": 2.9803594351732993e-05, + "loss": 0.5531, + "step": 8254 + }, + { + "epoch": 10.596919127086007, + "grad_norm": 5.457936763763428, + "learning_rate": 2.980316645271716e-05, + "loss": 0.5769, + "step": 8255 + }, + { + "epoch": 10.598202824133505, + "grad_norm": 2.008594036102295, + "learning_rate": 2.980273855370133e-05, + "loss": 0.5596, + "step": 8256 + }, + { + "epoch": 10.599486521181001, + "grad_norm": 17.17001724243164, + "learning_rate": 2.9802310654685495e-05, + "loss": 0.5149, + "step": 8257 + }, + { + "epoch": 10.600770218228497, + "grad_norm": 3.466803550720215, + "learning_rate": 2.980188275566966e-05, + "loss": 0.5892, + "step": 8258 + }, + { + "epoch": 10.602053915275995, + "grad_norm": 1.1279048919677734, + "learning_rate": 2.9801454856653832e-05, + "loss": 0.5752, + "step": 8259 + }, + { + "epoch": 10.603337612323491, + "grad_norm": 1.1190606355667114, + "learning_rate": 2.9801026957637997e-05, + "loss": 0.5478, + "step": 8260 + }, + { + "epoch": 10.60462130937099, + "grad_norm": 4.5618157386779785, + "learning_rate": 2.9800599058622166e-05, + "loss": 0.5906, + "step": 8261 + }, + { + "epoch": 10.605905006418485, + "grad_norm": 2.516662359237671, + "learning_rate": 2.9800171159606334e-05, + "loss": 0.5856, + "step": 8262 + }, + { + "epoch": 10.607188703465981, + "grad_norm": 2.0869269371032715, + "learning_rate": 2.9799743260590502e-05, + "loss": 0.5784, + "step": 8263 + }, + { + "epoch": 
10.60847240051348, + "grad_norm": 3.042435884475708, + "learning_rate": 2.979931536157467e-05, + "loss": 0.5391, + "step": 8264 + }, + { + "epoch": 10.609756097560975, + "grad_norm": 2.4730782508850098, + "learning_rate": 2.9798887462558836e-05, + "loss": 0.5991, + "step": 8265 + }, + { + "epoch": 10.611039794608473, + "grad_norm": 2.386530876159668, + "learning_rate": 2.9798459563543004e-05, + "loss": 0.5659, + "step": 8266 + }, + { + "epoch": 10.61232349165597, + "grad_norm": 2.279752492904663, + "learning_rate": 2.9798031664527173e-05, + "loss": 0.5884, + "step": 8267 + }, + { + "epoch": 10.613607188703465, + "grad_norm": 2.4978652000427246, + "learning_rate": 2.979760376551134e-05, + "loss": 0.5367, + "step": 8268 + }, + { + "epoch": 10.614890885750963, + "grad_norm": 6.154351234436035, + "learning_rate": 2.9797175866495506e-05, + "loss": 0.5899, + "step": 8269 + }, + { + "epoch": 10.61617458279846, + "grad_norm": 2.5572967529296875, + "learning_rate": 2.9796747967479674e-05, + "loss": 0.6814, + "step": 8270 + }, + { + "epoch": 10.617458279845957, + "grad_norm": 5.098578453063965, + "learning_rate": 2.9796320068463843e-05, + "loss": 0.6125, + "step": 8271 + }, + { + "epoch": 10.618741976893453, + "grad_norm": 2.1448583602905273, + "learning_rate": 2.979589216944801e-05, + "loss": 0.6108, + "step": 8272 + }, + { + "epoch": 10.62002567394095, + "grad_norm": 1.4369373321533203, + "learning_rate": 2.979546427043218e-05, + "loss": 0.5827, + "step": 8273 + }, + { + "epoch": 10.621309370988447, + "grad_norm": 7.725733757019043, + "learning_rate": 2.9795036371416345e-05, + "loss": 0.6289, + "step": 8274 + }, + { + "epoch": 10.622593068035943, + "grad_norm": 2.0286471843719482, + "learning_rate": 2.9794608472400517e-05, + "loss": 0.5765, + "step": 8275 + }, + { + "epoch": 10.623876765083441, + "grad_norm": 8.147302627563477, + "learning_rate": 2.979418057338468e-05, + "loss": 0.619, + "step": 8276 + }, + { + "epoch": 10.625160462130937, + "grad_norm": 5.332424163818359, 
+ "learning_rate": 2.9793752674368847e-05, + "loss": 0.5929, + "step": 8277 + }, + { + "epoch": 10.626444159178433, + "grad_norm": 1.9664514064788818, + "learning_rate": 2.979332477535302e-05, + "loss": 0.6358, + "step": 8278 + }, + { + "epoch": 10.627727856225931, + "grad_norm": 1.7330172061920166, + "learning_rate": 2.9792896876337183e-05, + "loss": 0.5679, + "step": 8279 + }, + { + "epoch": 10.629011553273427, + "grad_norm": 2.1967689990997314, + "learning_rate": 2.9792468977321355e-05, + "loss": 0.5776, + "step": 8280 + }, + { + "epoch": 10.630295250320923, + "grad_norm": 1.4458062648773193, + "learning_rate": 2.979204107830552e-05, + "loss": 0.6113, + "step": 8281 + }, + { + "epoch": 10.631578947368421, + "grad_norm": 2.485443353652954, + "learning_rate": 2.979161317928969e-05, + "loss": 0.5988, + "step": 8282 + }, + { + "epoch": 10.632862644415917, + "grad_norm": 2.7144973278045654, + "learning_rate": 2.9791185280273857e-05, + "loss": 0.5989, + "step": 8283 + }, + { + "epoch": 10.634146341463415, + "grad_norm": 1.4904310703277588, + "learning_rate": 2.9790757381258022e-05, + "loss": 0.6058, + "step": 8284 + }, + { + "epoch": 10.635430038510911, + "grad_norm": 2.3679327964782715, + "learning_rate": 2.979032948224219e-05, + "loss": 0.6442, + "step": 8285 + }, + { + "epoch": 10.63671373555841, + "grad_norm": 1.9044631719589233, + "learning_rate": 2.978990158322636e-05, + "loss": 0.6333, + "step": 8286 + }, + { + "epoch": 10.637997432605905, + "grad_norm": 2.8049468994140625, + "learning_rate": 2.9789473684210527e-05, + "loss": 0.6605, + "step": 8287 + }, + { + "epoch": 10.639281129653401, + "grad_norm": 6.669823169708252, + "learning_rate": 2.9789045785194696e-05, + "loss": 0.6567, + "step": 8288 + }, + { + "epoch": 10.6405648267009, + "grad_norm": 2.0673773288726807, + "learning_rate": 2.9788617886178864e-05, + "loss": 0.7249, + "step": 8289 + }, + { + "epoch": 10.641848523748395, + "grad_norm": 2.953158378601074, + "learning_rate": 2.978818998716303e-05, + 
"loss": 0.8095, + "step": 8290 + }, + { + "epoch": 10.643132220795891, + "grad_norm": 1.7780213356018066, + "learning_rate": 2.9787762088147198e-05, + "loss": 0.6096, + "step": 8291 + }, + { + "epoch": 10.64441591784339, + "grad_norm": 1.280380368232727, + "learning_rate": 2.9787334189131366e-05, + "loss": 0.5113, + "step": 8292 + }, + { + "epoch": 10.645699614890885, + "grad_norm": 1.3816447257995605, + "learning_rate": 2.978690629011553e-05, + "loss": 0.5512, + "step": 8293 + }, + { + "epoch": 10.646983311938383, + "grad_norm": 1.7765480279922485, + "learning_rate": 2.9786478391099703e-05, + "loss": 0.577, + "step": 8294 + }, + { + "epoch": 10.64826700898588, + "grad_norm": 1.2902251482009888, + "learning_rate": 2.9786050492083868e-05, + "loss": 0.5867, + "step": 8295 + }, + { + "epoch": 10.649550706033375, + "grad_norm": 2.0027480125427246, + "learning_rate": 2.978562259306804e-05, + "loss": 0.5728, + "step": 8296 + }, + { + "epoch": 10.650834403080873, + "grad_norm": 1.9682096242904663, + "learning_rate": 2.9785194694052205e-05, + "loss": 0.6272, + "step": 8297 + }, + { + "epoch": 10.65211810012837, + "grad_norm": 3.9554250240325928, + "learning_rate": 2.978476679503637e-05, + "loss": 0.5654, + "step": 8298 + }, + { + "epoch": 10.653401797175867, + "grad_norm": 2.000824213027954, + "learning_rate": 2.978433889602054e-05, + "loss": 0.576, + "step": 8299 + }, + { + "epoch": 10.654685494223363, + "grad_norm": 1.5244115591049194, + "learning_rate": 2.9783910997004706e-05, + "loss": 0.5728, + "step": 8300 + }, + { + "epoch": 10.65596919127086, + "grad_norm": 1.4169775247573853, + "learning_rate": 2.9783483097988875e-05, + "loss": 0.5269, + "step": 8301 + }, + { + "epoch": 10.657252888318357, + "grad_norm": 2.0352208614349365, + "learning_rate": 2.9783055198973043e-05, + "loss": 0.5706, + "step": 8302 + }, + { + "epoch": 10.658536585365853, + "grad_norm": 2.621589183807373, + "learning_rate": 2.9782627299957212e-05, + "loss": 0.5944, + "step": 8303 + }, + { + 
"epoch": 10.659820282413351, + "grad_norm": 1.5867189168930054, + "learning_rate": 2.978219940094138e-05, + "loss": 0.5526, + "step": 8304 + }, + { + "epoch": 10.661103979460847, + "grad_norm": 1.8233246803283691, + "learning_rate": 2.9781771501925545e-05, + "loss": 0.5524, + "step": 8305 + }, + { + "epoch": 10.662387676508343, + "grad_norm": 1.583677053451538, + "learning_rate": 2.9781343602909714e-05, + "loss": 0.6125, + "step": 8306 + }, + { + "epoch": 10.663671373555841, + "grad_norm": 1.309335708618164, + "learning_rate": 2.9780915703893882e-05, + "loss": 0.5509, + "step": 8307 + }, + { + "epoch": 10.664955070603337, + "grad_norm": 2.819226026535034, + "learning_rate": 2.978048780487805e-05, + "loss": 0.5364, + "step": 8308 + }, + { + "epoch": 10.666238767650835, + "grad_norm": 4.116434574127197, + "learning_rate": 2.9780059905862215e-05, + "loss": 0.5892, + "step": 8309 + }, + { + "epoch": 10.667522464698331, + "grad_norm": 3.3414835929870605, + "learning_rate": 2.9779632006846387e-05, + "loss": 0.5796, + "step": 8310 + }, + { + "epoch": 10.668806161745827, + "grad_norm": 1.6892818212509155, + "learning_rate": 2.9779204107830552e-05, + "loss": 0.5913, + "step": 8311 + }, + { + "epoch": 10.670089858793325, + "grad_norm": 1.5955226421356201, + "learning_rate": 2.977877620881472e-05, + "loss": 0.5359, + "step": 8312 + }, + { + "epoch": 10.671373555840821, + "grad_norm": 3.3690693378448486, + "learning_rate": 2.977834830979889e-05, + "loss": 0.5581, + "step": 8313 + }, + { + "epoch": 10.672657252888317, + "grad_norm": 1.6092580556869507, + "learning_rate": 2.9777920410783054e-05, + "loss": 0.5954, + "step": 8314 + }, + { + "epoch": 10.673940949935815, + "grad_norm": 1.8276472091674805, + "learning_rate": 2.9777492511767226e-05, + "loss": 0.572, + "step": 8315 + }, + { + "epoch": 10.675224646983311, + "grad_norm": 1.4462521076202393, + "learning_rate": 2.977706461275139e-05, + "loss": 0.5865, + "step": 8316 + }, + { + "epoch": 10.67650834403081, + "grad_norm": 
4.064497470855713, + "learning_rate": 2.977663671373556e-05, + "loss": 0.606, + "step": 8317 + }, + { + "epoch": 10.677792041078305, + "grad_norm": 1.5806634426116943, + "learning_rate": 2.9776208814719728e-05, + "loss": 0.5862, + "step": 8318 + }, + { + "epoch": 10.679075738125803, + "grad_norm": 1.474338173866272, + "learning_rate": 2.9775780915703893e-05, + "loss": 0.5829, + "step": 8319 + }, + { + "epoch": 10.6803594351733, + "grad_norm": 1.31576669216156, + "learning_rate": 2.9775353016688065e-05, + "loss": 0.5524, + "step": 8320 + }, + { + "epoch": 10.681643132220795, + "grad_norm": 1.613953709602356, + "learning_rate": 2.977492511767223e-05, + "loss": 0.6243, + "step": 8321 + }, + { + "epoch": 10.682926829268293, + "grad_norm": 1.7774896621704102, + "learning_rate": 2.9774497218656398e-05, + "loss": 0.5794, + "step": 8322 + }, + { + "epoch": 10.68421052631579, + "grad_norm": 1.7750210762023926, + "learning_rate": 2.9774069319640566e-05, + "loss": 0.5969, + "step": 8323 + }, + { + "epoch": 10.685494223363285, + "grad_norm": 3.310584545135498, + "learning_rate": 2.9773641420624735e-05, + "loss": 0.5377, + "step": 8324 + }, + { + "epoch": 10.686777920410783, + "grad_norm": 1.8288562297821045, + "learning_rate": 2.97732135216089e-05, + "loss": 0.588, + "step": 8325 + }, + { + "epoch": 10.68806161745828, + "grad_norm": 2.060612916946411, + "learning_rate": 2.9772785622593068e-05, + "loss": 0.576, + "step": 8326 + }, + { + "epoch": 10.689345314505777, + "grad_norm": 3.0318567752838135, + "learning_rate": 2.9772357723577237e-05, + "loss": 0.6839, + "step": 8327 + }, + { + "epoch": 10.690629011553273, + "grad_norm": 1.4424389600753784, + "learning_rate": 2.9771929824561405e-05, + "loss": 0.6162, + "step": 8328 + }, + { + "epoch": 10.69191270860077, + "grad_norm": 1.5953869819641113, + "learning_rate": 2.9771501925545573e-05, + "loss": 0.6244, + "step": 8329 + }, + { + "epoch": 10.693196405648267, + "grad_norm": 1.7845457792282104, + "learning_rate": 
2.977107402652974e-05, + "loss": 0.6198, + "step": 8330 + }, + { + "epoch": 10.694480102695763, + "grad_norm": 1.7170931100845337, + "learning_rate": 2.9770646127513907e-05, + "loss": 0.5806, + "step": 8331 + }, + { + "epoch": 10.695763799743261, + "grad_norm": 1.321736216545105, + "learning_rate": 2.9770218228498075e-05, + "loss": 0.5787, + "step": 8332 + }, + { + "epoch": 10.697047496790757, + "grad_norm": 1.3897325992584229, + "learning_rate": 2.976979032948224e-05, + "loss": 0.6554, + "step": 8333 + }, + { + "epoch": 10.698331193838253, + "grad_norm": 3.903073310852051, + "learning_rate": 2.9769362430466412e-05, + "loss": 0.5788, + "step": 8334 + }, + { + "epoch": 10.699614890885751, + "grad_norm": 1.5167738199234009, + "learning_rate": 2.9768934531450577e-05, + "loss": 0.6204, + "step": 8335 + }, + { + "epoch": 10.700898587933247, + "grad_norm": 2.2609379291534424, + "learning_rate": 2.976850663243475e-05, + "loss": 0.6867, + "step": 8336 + }, + { + "epoch": 10.702182284980745, + "grad_norm": 4.469411373138428, + "learning_rate": 2.9768078733418914e-05, + "loss": 0.6431, + "step": 8337 + }, + { + "epoch": 10.703465982028241, + "grad_norm": 2.6038098335266113, + "learning_rate": 2.976765083440308e-05, + "loss": 0.6811, + "step": 8338 + }, + { + "epoch": 10.704749679075737, + "grad_norm": 3.9199962615966797, + "learning_rate": 2.976722293538725e-05, + "loss": 0.7304, + "step": 8339 + }, + { + "epoch": 10.706033376123235, + "grad_norm": 3.1841626167297363, + "learning_rate": 2.9766795036371416e-05, + "loss": 0.8184, + "step": 8340 + }, + { + "epoch": 10.707317073170731, + "grad_norm": 1.543274998664856, + "learning_rate": 2.9766367137355584e-05, + "loss": 0.5797, + "step": 8341 + }, + { + "epoch": 10.70860077021823, + "grad_norm": 0.9987638592720032, + "learning_rate": 2.9765939238339753e-05, + "loss": 0.5621, + "step": 8342 + }, + { + "epoch": 10.709884467265725, + "grad_norm": 1.39008367061615, + "learning_rate": 2.976551133932392e-05, + "loss": 0.5604, + 
"step": 8343 + }, + { + "epoch": 10.711168164313221, + "grad_norm": 1.987013816833496, + "learning_rate": 2.976508344030809e-05, + "loss": 0.5778, + "step": 8344 + }, + { + "epoch": 10.71245186136072, + "grad_norm": 4.245272636413574, + "learning_rate": 2.9764655541292255e-05, + "loss": 0.5416, + "step": 8345 + }, + { + "epoch": 10.713735558408215, + "grad_norm": 1.1563996076583862, + "learning_rate": 2.9764227642276423e-05, + "loss": 0.5633, + "step": 8346 + }, + { + "epoch": 10.715019255455712, + "grad_norm": 1.7561758756637573, + "learning_rate": 2.976379974326059e-05, + "loss": 0.5545, + "step": 8347 + }, + { + "epoch": 10.71630295250321, + "grad_norm": 0.9266384840011597, + "learning_rate": 2.976337184424476e-05, + "loss": 0.5841, + "step": 8348 + }, + { + "epoch": 10.717586649550706, + "grad_norm": 1.1199333667755127, + "learning_rate": 2.9762943945228925e-05, + "loss": 0.5214, + "step": 8349 + }, + { + "epoch": 10.718870346598203, + "grad_norm": 1.1725424528121948, + "learning_rate": 2.9762516046213097e-05, + "loss": 0.6147, + "step": 8350 + }, + { + "epoch": 10.7201540436457, + "grad_norm": 2.574521780014038, + "learning_rate": 2.976208814719726e-05, + "loss": 0.5892, + "step": 8351 + }, + { + "epoch": 10.721437740693197, + "grad_norm": 1.5159229040145874, + "learning_rate": 2.976166024818143e-05, + "loss": 0.5564, + "step": 8352 + }, + { + "epoch": 10.722721437740693, + "grad_norm": 2.380215644836426, + "learning_rate": 2.97612323491656e-05, + "loss": 0.55, + "step": 8353 + }, + { + "epoch": 10.72400513478819, + "grad_norm": 1.4466063976287842, + "learning_rate": 2.9760804450149763e-05, + "loss": 0.5979, + "step": 8354 + }, + { + "epoch": 10.725288831835687, + "grad_norm": 0.931843101978302, + "learning_rate": 2.9760376551133935e-05, + "loss": 0.5237, + "step": 8355 + }, + { + "epoch": 10.726572528883183, + "grad_norm": 1.543905258178711, + "learning_rate": 2.97599486521181e-05, + "loss": 0.5544, + "step": 8356 + }, + { + "epoch": 10.72785622593068, + 
"grad_norm": 1.7068471908569336, + "learning_rate": 2.975952075310227e-05, + "loss": 0.5252, + "step": 8357 + }, + { + "epoch": 10.729139922978177, + "grad_norm": 1.4171749353408813, + "learning_rate": 2.9759092854086437e-05, + "loss": 0.5478, + "step": 8358 + }, + { + "epoch": 10.730423620025674, + "grad_norm": 5.344094753265381, + "learning_rate": 2.9758664955070602e-05, + "loss": 0.5405, + "step": 8359 + }, + { + "epoch": 10.731707317073171, + "grad_norm": 1.1190804243087769, + "learning_rate": 2.9758237056054774e-05, + "loss": 0.572, + "step": 8360 + }, + { + "epoch": 10.732991014120667, + "grad_norm": 2.206756114959717, + "learning_rate": 2.975780915703894e-05, + "loss": 0.5542, + "step": 8361 + }, + { + "epoch": 10.734274711168164, + "grad_norm": 3.956267833709717, + "learning_rate": 2.9757381258023107e-05, + "loss": 0.5449, + "step": 8362 + }, + { + "epoch": 10.735558408215661, + "grad_norm": 1.8614187240600586, + "learning_rate": 2.9756953359007276e-05, + "loss": 0.5295, + "step": 8363 + }, + { + "epoch": 10.736842105263158, + "grad_norm": 2.7800660133361816, + "learning_rate": 2.9756525459991444e-05, + "loss": 0.5445, + "step": 8364 + }, + { + "epoch": 10.738125802310655, + "grad_norm": 1.907910943031311, + "learning_rate": 2.975609756097561e-05, + "loss": 0.5782, + "step": 8365 + }, + { + "epoch": 10.739409499358151, + "grad_norm": 19.003562927246094, + "learning_rate": 2.9755669661959778e-05, + "loss": 0.5658, + "step": 8366 + }, + { + "epoch": 10.740693196405648, + "grad_norm": 1.5886917114257812, + "learning_rate": 2.9755241762943946e-05, + "loss": 0.5552, + "step": 8367 + }, + { + "epoch": 10.741976893453145, + "grad_norm": 5.796630859375, + "learning_rate": 2.9754813863928114e-05, + "loss": 0.5685, + "step": 8368 + }, + { + "epoch": 10.743260590500642, + "grad_norm": 1.2104430198669434, + "learning_rate": 2.9754385964912283e-05, + "loss": 0.5945, + "step": 8369 + }, + { + "epoch": 10.74454428754814, + "grad_norm": 1.733641266822815, + 
"learning_rate": 2.9753958065896448e-05, + "loss": 0.5977, + "step": 8370 + }, + { + "epoch": 10.745827984595635, + "grad_norm": 10.008526802062988, + "learning_rate": 2.975353016688062e-05, + "loss": 0.5427, + "step": 8371 + }, + { + "epoch": 10.747111681643132, + "grad_norm": 4.860391139984131, + "learning_rate": 2.9753102267864785e-05, + "loss": 0.583, + "step": 8372 + }, + { + "epoch": 10.74839537869063, + "grad_norm": 3.3717246055603027, + "learning_rate": 2.975267436884895e-05, + "loss": 0.5764, + "step": 8373 + }, + { + "epoch": 10.749679075738126, + "grad_norm": 2.733750104904175, + "learning_rate": 2.975224646983312e-05, + "loss": 0.5737, + "step": 8374 + }, + { + "epoch": 10.750962772785623, + "grad_norm": 3.1473517417907715, + "learning_rate": 2.9751818570817287e-05, + "loss": 0.599, + "step": 8375 + }, + { + "epoch": 10.75224646983312, + "grad_norm": 4.037353992462158, + "learning_rate": 2.975139067180146e-05, + "loss": 0.5819, + "step": 8376 + }, + { + "epoch": 10.753530166880616, + "grad_norm": 3.233990430831909, + "learning_rate": 2.9750962772785623e-05, + "loss": 0.5917, + "step": 8377 + }, + { + "epoch": 10.754813863928113, + "grad_norm": 1.4648561477661133, + "learning_rate": 2.9750534873769792e-05, + "loss": 0.605, + "step": 8378 + }, + { + "epoch": 10.75609756097561, + "grad_norm": 16.012521743774414, + "learning_rate": 2.975010697475396e-05, + "loss": 0.6311, + "step": 8379 + }, + { + "epoch": 10.757381258023106, + "grad_norm": 2.7132887840270996, + "learning_rate": 2.9749679075738125e-05, + "loss": 0.6251, + "step": 8380 + }, + { + "epoch": 10.758664955070603, + "grad_norm": 2.8680922985076904, + "learning_rate": 2.9749251176722294e-05, + "loss": 0.5624, + "step": 8381 + }, + { + "epoch": 10.7599486521181, + "grad_norm": 2.0195581912994385, + "learning_rate": 2.9748823277706462e-05, + "loss": 0.5942, + "step": 8382 + }, + { + "epoch": 10.761232349165597, + "grad_norm": 2.022737503051758, + "learning_rate": 2.974839537869063e-05, + "loss": 
0.6015, + "step": 8383 + }, + { + "epoch": 10.762516046213094, + "grad_norm": 7.606781005859375, + "learning_rate": 2.97479674796748e-05, + "loss": 0.5636, + "step": 8384 + }, + { + "epoch": 10.763799743260591, + "grad_norm": 1.8766413927078247, + "learning_rate": 2.9747539580658967e-05, + "loss": 0.5761, + "step": 8385 + }, + { + "epoch": 10.765083440308088, + "grad_norm": 2.5594403743743896, + "learning_rate": 2.9747111681643132e-05, + "loss": 0.7119, + "step": 8386 + }, + { + "epoch": 10.766367137355584, + "grad_norm": 11.926742553710938, + "learning_rate": 2.97466837826273e-05, + "loss": 0.6531, + "step": 8387 + }, + { + "epoch": 10.767650834403081, + "grad_norm": 6.396458625793457, + "learning_rate": 2.974625588361147e-05, + "loss": 0.6158, + "step": 8388 + }, + { + "epoch": 10.768934531450578, + "grad_norm": 3.8468308448791504, + "learning_rate": 2.9745827984595634e-05, + "loss": 0.7061, + "step": 8389 + }, + { + "epoch": 10.770218228498074, + "grad_norm": 7.178188800811768, + "learning_rate": 2.9745400085579806e-05, + "loss": 0.8743, + "step": 8390 + }, + { + "epoch": 10.771501925545572, + "grad_norm": 1.4400789737701416, + "learning_rate": 2.974497218656397e-05, + "loss": 0.5283, + "step": 8391 + }, + { + "epoch": 10.772785622593068, + "grad_norm": 7.396529197692871, + "learning_rate": 2.974454428754814e-05, + "loss": 0.5636, + "step": 8392 + }, + { + "epoch": 10.774069319640565, + "grad_norm": 2.7571640014648438, + "learning_rate": 2.9744116388532308e-05, + "loss": 0.5867, + "step": 8393 + }, + { + "epoch": 10.775353016688062, + "grad_norm": 2.8197181224823, + "learning_rate": 2.9743688489516473e-05, + "loss": 0.5611, + "step": 8394 + }, + { + "epoch": 10.776636713735558, + "grad_norm": 4.092532157897949, + "learning_rate": 2.9743260590500645e-05, + "loss": 0.5558, + "step": 8395 + }, + { + "epoch": 10.777920410783056, + "grad_norm": 1.6314681768417358, + "learning_rate": 2.974283269148481e-05, + "loss": 0.5511, + "step": 8396 + }, + { + "epoch": 
10.779204107830552, + "grad_norm": 1.5502843856811523, + "learning_rate": 2.9742404792468978e-05, + "loss": 0.5889, + "step": 8397 + }, + { + "epoch": 10.78048780487805, + "grad_norm": 2.176866054534912, + "learning_rate": 2.9741976893453146e-05, + "loss": 0.558, + "step": 8398 + }, + { + "epoch": 10.781771501925546, + "grad_norm": 1.5690664052963257, + "learning_rate": 2.974154899443731e-05, + "loss": 0.5495, + "step": 8399 + }, + { + "epoch": 10.783055198973042, + "grad_norm": 1.7676506042480469, + "learning_rate": 2.9741121095421483e-05, + "loss": 0.582, + "step": 8400 + }, + { + "epoch": 10.78433889602054, + "grad_norm": 1.4046751260757446, + "learning_rate": 2.974069319640565e-05, + "loss": 0.5757, + "step": 8401 + }, + { + "epoch": 10.785622593068036, + "grad_norm": 2.5550103187561035, + "learning_rate": 2.9740265297389817e-05, + "loss": 0.5169, + "step": 8402 + }, + { + "epoch": 10.786906290115533, + "grad_norm": 2.714611291885376, + "learning_rate": 2.9739837398373985e-05, + "loss": 0.5766, + "step": 8403 + }, + { + "epoch": 10.78818998716303, + "grad_norm": 1.969034194946289, + "learning_rate": 2.9739409499358154e-05, + "loss": 0.5431, + "step": 8404 + }, + { + "epoch": 10.789473684210526, + "grad_norm": 1.8011521100997925, + "learning_rate": 2.973898160034232e-05, + "loss": 0.5457, + "step": 8405 + }, + { + "epoch": 10.790757381258024, + "grad_norm": 3.920494556427002, + "learning_rate": 2.9738553701326487e-05, + "loss": 0.5538, + "step": 8406 + }, + { + "epoch": 10.79204107830552, + "grad_norm": 3.6628870964050293, + "learning_rate": 2.9738125802310655e-05, + "loss": 0.5732, + "step": 8407 + }, + { + "epoch": 10.793324775353017, + "grad_norm": 1.5575255155563354, + "learning_rate": 2.973769790329482e-05, + "loss": 0.5668, + "step": 8408 + }, + { + "epoch": 10.794608472400514, + "grad_norm": 4.303025722503662, + "learning_rate": 2.9737270004278992e-05, + "loss": 0.573, + "step": 8409 + }, + { + "epoch": 10.79589216944801, + "grad_norm": 1.189267635345459, 
+ "learning_rate": 2.9736842105263157e-05, + "loss": 0.5416, + "step": 8410 + }, + { + "epoch": 10.797175866495508, + "grad_norm": 2.6090123653411865, + "learning_rate": 2.973641420624733e-05, + "loss": 0.5694, + "step": 8411 + }, + { + "epoch": 10.798459563543004, + "grad_norm": 1.602265477180481, + "learning_rate": 2.9735986307231494e-05, + "loss": 0.5598, + "step": 8412 + }, + { + "epoch": 10.7997432605905, + "grad_norm": 1.1750268936157227, + "learning_rate": 2.973555840821566e-05, + "loss": 0.5131, + "step": 8413 + }, + { + "epoch": 10.801026957637998, + "grad_norm": 1.7771201133728027, + "learning_rate": 2.973513050919983e-05, + "loss": 0.4993, + "step": 8414 + }, + { + "epoch": 10.802310654685494, + "grad_norm": 4.595146656036377, + "learning_rate": 2.9734702610183996e-05, + "loss": 0.5493, + "step": 8415 + }, + { + "epoch": 10.803594351732992, + "grad_norm": 1.6114611625671387, + "learning_rate": 2.9734274711168164e-05, + "loss": 0.5364, + "step": 8416 + }, + { + "epoch": 10.804878048780488, + "grad_norm": 1.2752888202667236, + "learning_rate": 2.9733846812152333e-05, + "loss": 0.6423, + "step": 8417 + }, + { + "epoch": 10.806161745827985, + "grad_norm": 2.7281010150909424, + "learning_rate": 2.97334189131365e-05, + "loss": 0.5844, + "step": 8418 + }, + { + "epoch": 10.807445442875482, + "grad_norm": 1.350009799003601, + "learning_rate": 2.973299101412067e-05, + "loss": 0.5592, + "step": 8419 + }, + { + "epoch": 10.808729139922978, + "grad_norm": 1.0770567655563354, + "learning_rate": 2.9732563115104835e-05, + "loss": 0.5623, + "step": 8420 + }, + { + "epoch": 10.810012836970476, + "grad_norm": 7.027641296386719, + "learning_rate": 2.9732135216089003e-05, + "loss": 0.5708, + "step": 8421 + }, + { + "epoch": 10.811296534017972, + "grad_norm": 2.095461130142212, + "learning_rate": 2.973170731707317e-05, + "loss": 0.5433, + "step": 8422 + }, + { + "epoch": 10.812580231065468, + "grad_norm": 13.605138778686523, + "learning_rate": 2.973127941805734e-05, + 
"loss": 0.5942, + "step": 8423 + }, + { + "epoch": 10.813863928112966, + "grad_norm": 2.802309989929199, + "learning_rate": 2.9730851519041505e-05, + "loss": 0.5637, + "step": 8424 + }, + { + "epoch": 10.815147625160462, + "grad_norm": 1.6252893209457397, + "learning_rate": 2.9730423620025677e-05, + "loss": 0.5581, + "step": 8425 + }, + { + "epoch": 10.81643132220796, + "grad_norm": 1.6393601894378662, + "learning_rate": 2.972999572100984e-05, + "loss": 0.5645, + "step": 8426 + }, + { + "epoch": 10.817715019255456, + "grad_norm": 7.0067338943481445, + "learning_rate": 2.972956782199401e-05, + "loss": 0.5773, + "step": 8427 + }, + { + "epoch": 10.818998716302952, + "grad_norm": 1.9504603147506714, + "learning_rate": 2.972913992297818e-05, + "loss": 0.5947, + "step": 8428 + }, + { + "epoch": 10.82028241335045, + "grad_norm": 1.642313838005066, + "learning_rate": 2.9728712023962344e-05, + "loss": 0.5839, + "step": 8429 + }, + { + "epoch": 10.821566110397946, + "grad_norm": 2.2592716217041016, + "learning_rate": 2.9728284124946515e-05, + "loss": 0.6267, + "step": 8430 + }, + { + "epoch": 10.822849807445444, + "grad_norm": 5.691397666931152, + "learning_rate": 2.972785622593068e-05, + "loss": 0.5644, + "step": 8431 + }, + { + "epoch": 10.82413350449294, + "grad_norm": 1.9076051712036133, + "learning_rate": 2.972742832691485e-05, + "loss": 0.5937, + "step": 8432 + }, + { + "epoch": 10.825417201540436, + "grad_norm": 2.3443379402160645, + "learning_rate": 2.9727000427899017e-05, + "loss": 0.6313, + "step": 8433 + }, + { + "epoch": 10.826700898587934, + "grad_norm": 2.5076210498809814, + "learning_rate": 2.9726572528883182e-05, + "loss": 0.6484, + "step": 8434 + }, + { + "epoch": 10.82798459563543, + "grad_norm": 1.474199652671814, + "learning_rate": 2.9726144629867354e-05, + "loss": 0.6687, + "step": 8435 + }, + { + "epoch": 10.829268292682928, + "grad_norm": 2.601379632949829, + "learning_rate": 2.972571673085152e-05, + "loss": 0.6719, + "step": 8436 + }, + { + "epoch": 
10.830551989730424, + "grad_norm": 1.5112476348876953, + "learning_rate": 2.9725288831835687e-05, + "loss": 0.6262, + "step": 8437 + }, + { + "epoch": 10.83183568677792, + "grad_norm": 2.5556352138519287, + "learning_rate": 2.9724860932819856e-05, + "loss": 0.6847, + "step": 8438 + }, + { + "epoch": 10.833119383825418, + "grad_norm": 2.6237664222717285, + "learning_rate": 2.9724433033804024e-05, + "loss": 0.7206, + "step": 8439 + }, + { + "epoch": 10.834403080872914, + "grad_norm": 2.2070343494415283, + "learning_rate": 2.972400513478819e-05, + "loss": 0.7642, + "step": 8440 + }, + { + "epoch": 10.835686777920412, + "grad_norm": 1.2869268655776978, + "learning_rate": 2.9723577235772358e-05, + "loss": 0.54, + "step": 8441 + }, + { + "epoch": 10.836970474967908, + "grad_norm": 2.3037209510803223, + "learning_rate": 2.9723149336756526e-05, + "loss": 0.5455, + "step": 8442 + }, + { + "epoch": 10.838254172015404, + "grad_norm": 1.4868922233581543, + "learning_rate": 2.9722721437740695e-05, + "loss": 0.5993, + "step": 8443 + }, + { + "epoch": 10.839537869062902, + "grad_norm": 1.3850572109222412, + "learning_rate": 2.9722293538724863e-05, + "loss": 0.5672, + "step": 8444 + }, + { + "epoch": 10.840821566110398, + "grad_norm": 1.5646988153457642, + "learning_rate": 2.9721865639709028e-05, + "loss": 0.5357, + "step": 8445 + }, + { + "epoch": 10.842105263157894, + "grad_norm": 2.0660042762756348, + "learning_rate": 2.97214377406932e-05, + "loss": 0.5634, + "step": 8446 + }, + { + "epoch": 10.843388960205392, + "grad_norm": 3.4059340953826904, + "learning_rate": 2.9721009841677365e-05, + "loss": 0.5037, + "step": 8447 + }, + { + "epoch": 10.844672657252888, + "grad_norm": 1.7610958814620972, + "learning_rate": 2.972058194266153e-05, + "loss": 0.5699, + "step": 8448 + }, + { + "epoch": 10.845956354300386, + "grad_norm": 2.6168599128723145, + "learning_rate": 2.97201540436457e-05, + "loss": 0.483, + "step": 8449 + }, + { + "epoch": 10.847240051347882, + "grad_norm": 
2.2620580196380615, + "learning_rate": 2.9719726144629867e-05, + "loss": 0.5809, + "step": 8450 + }, + { + "epoch": 10.84852374839538, + "grad_norm": 1.6263033151626587, + "learning_rate": 2.971929824561404e-05, + "loss": 0.5768, + "step": 8451 + }, + { + "epoch": 10.849807445442876, + "grad_norm": 1.7850728034973145, + "learning_rate": 2.9718870346598203e-05, + "loss": 0.601, + "step": 8452 + }, + { + "epoch": 10.851091142490372, + "grad_norm": 1.942370057106018, + "learning_rate": 2.9718442447582372e-05, + "loss": 0.5679, + "step": 8453 + }, + { + "epoch": 10.85237483953787, + "grad_norm": 1.6845017671585083, + "learning_rate": 2.971801454856654e-05, + "loss": 0.5714, + "step": 8454 + }, + { + "epoch": 10.853658536585366, + "grad_norm": 2.0845305919647217, + "learning_rate": 2.9717586649550705e-05, + "loss": 0.5915, + "step": 8455 + }, + { + "epoch": 10.854942233632862, + "grad_norm": 10.446776390075684, + "learning_rate": 2.9717158750534874e-05, + "loss": 0.5772, + "step": 8456 + }, + { + "epoch": 10.85622593068036, + "grad_norm": 3.0778210163116455, + "learning_rate": 2.9716730851519042e-05, + "loss": 0.6023, + "step": 8457 + }, + { + "epoch": 10.857509627727856, + "grad_norm": 1.5664820671081543, + "learning_rate": 2.971630295250321e-05, + "loss": 0.5399, + "step": 8458 + }, + { + "epoch": 10.858793324775354, + "grad_norm": 4.507162094116211, + "learning_rate": 2.971587505348738e-05, + "loss": 0.5714, + "step": 8459 + }, + { + "epoch": 10.86007702182285, + "grad_norm": 1.8731515407562256, + "learning_rate": 2.9715447154471544e-05, + "loss": 0.5238, + "step": 8460 + }, + { + "epoch": 10.861360718870346, + "grad_norm": 1.6043652296066284, + "learning_rate": 2.9715019255455712e-05, + "loss": 0.593, + "step": 8461 + }, + { + "epoch": 10.862644415917844, + "grad_norm": 6.8703389167785645, + "learning_rate": 2.971459135643988e-05, + "loss": 0.5787, + "step": 8462 + }, + { + "epoch": 10.86392811296534, + "grad_norm": 1.5090988874435425, + "learning_rate": 
2.971416345742405e-05, + "loss": 0.508, + "step": 8463 + }, + { + "epoch": 10.865211810012838, + "grad_norm": 1.3542718887329102, + "learning_rate": 2.9713735558408214e-05, + "loss": 0.5574, + "step": 8464 + }, + { + "epoch": 10.866495507060334, + "grad_norm": 3.3701651096343994, + "learning_rate": 2.9713307659392386e-05, + "loss": 0.5522, + "step": 8465 + }, + { + "epoch": 10.86777920410783, + "grad_norm": 2.019657611846924, + "learning_rate": 2.971287976037655e-05, + "loss": 0.5453, + "step": 8466 + }, + { + "epoch": 10.869062901155328, + "grad_norm": 3.1282765865325928, + "learning_rate": 2.971245186136072e-05, + "loss": 0.5447, + "step": 8467 + }, + { + "epoch": 10.870346598202824, + "grad_norm": 3.038999080657959, + "learning_rate": 2.9712023962344888e-05, + "loss": 0.5659, + "step": 8468 + }, + { + "epoch": 10.871630295250322, + "grad_norm": 2.0292093753814697, + "learning_rate": 2.9711596063329053e-05, + "loss": 0.6023, + "step": 8469 + }, + { + "epoch": 10.872913992297818, + "grad_norm": 2.689160108566284, + "learning_rate": 2.9711168164313225e-05, + "loss": 0.5966, + "step": 8470 + }, + { + "epoch": 10.874197689345314, + "grad_norm": 2.221039056777954, + "learning_rate": 2.971074026529739e-05, + "loss": 0.5883, + "step": 8471 + }, + { + "epoch": 10.875481386392812, + "grad_norm": 1.8027640581130981, + "learning_rate": 2.9710312366281558e-05, + "loss": 0.5471, + "step": 8472 + }, + { + "epoch": 10.876765083440308, + "grad_norm": 1.9126149415969849, + "learning_rate": 2.9709884467265727e-05, + "loss": 0.6421, + "step": 8473 + }, + { + "epoch": 10.878048780487806, + "grad_norm": 1.5234663486480713, + "learning_rate": 2.970945656824989e-05, + "loss": 0.5817, + "step": 8474 + }, + { + "epoch": 10.879332477535302, + "grad_norm": 2.0821893215179443, + "learning_rate": 2.9709028669234063e-05, + "loss": 0.5827, + "step": 8475 + }, + { + "epoch": 10.880616174582798, + "grad_norm": 2.6148407459259033, + "learning_rate": 2.970860077021823e-05, + "loss": 0.5985, + 
"step": 8476 + }, + { + "epoch": 10.881899871630296, + "grad_norm": 2.2796449661254883, + "learning_rate": 2.9708172871202397e-05, + "loss": 0.6201, + "step": 8477 + }, + { + "epoch": 10.883183568677792, + "grad_norm": 5.25133752822876, + "learning_rate": 2.9707744972186565e-05, + "loss": 0.5927, + "step": 8478 + }, + { + "epoch": 10.88446726572529, + "grad_norm": 4.348674297332764, + "learning_rate": 2.9707317073170734e-05, + "loss": 0.5974, + "step": 8479 + }, + { + "epoch": 10.885750962772786, + "grad_norm": 1.9566413164138794, + "learning_rate": 2.97068891741549e-05, + "loss": 0.596, + "step": 8480 + }, + { + "epoch": 10.887034659820282, + "grad_norm": 2.077775478363037, + "learning_rate": 2.9706461275139067e-05, + "loss": 0.6126, + "step": 8481 + }, + { + "epoch": 10.88831835686778, + "grad_norm": 4.059676170349121, + "learning_rate": 2.9706033376123235e-05, + "loss": 0.6543, + "step": 8482 + }, + { + "epoch": 10.889602053915276, + "grad_norm": 7.320502281188965, + "learning_rate": 2.9705605477107404e-05, + "loss": 0.5671, + "step": 8483 + }, + { + "epoch": 10.890885750962774, + "grad_norm": 6.932172775268555, + "learning_rate": 2.9705177578091572e-05, + "loss": 0.5988, + "step": 8484 + }, + { + "epoch": 10.89216944801027, + "grad_norm": 3.3869643211364746, + "learning_rate": 2.9704749679075737e-05, + "loss": 0.633, + "step": 8485 + }, + { + "epoch": 10.893453145057766, + "grad_norm": 1.940058946609497, + "learning_rate": 2.970432178005991e-05, + "loss": 0.62, + "step": 8486 + }, + { + "epoch": 10.894736842105264, + "grad_norm": 1.278308629989624, + "learning_rate": 2.9703893881044074e-05, + "loss": 0.6731, + "step": 8487 + }, + { + "epoch": 10.89602053915276, + "grad_norm": 1.7955987453460693, + "learning_rate": 2.970346598202824e-05, + "loss": 0.7676, + "step": 8488 + }, + { + "epoch": 10.897304236200256, + "grad_norm": 2.7483553886413574, + "learning_rate": 2.970303808301241e-05, + "loss": 0.7287, + "step": 8489 + }, + { + "epoch": 10.898587933247754, + 
"grad_norm": 4.977984428405762, + "learning_rate": 2.9702610183996576e-05, + "loss": 0.9125, + "step": 8490 + }, + { + "epoch": 10.89987163029525, + "grad_norm": 1.4454643726348877, + "learning_rate": 2.9702182284980748e-05, + "loss": 0.5132, + "step": 8491 + }, + { + "epoch": 10.901155327342748, + "grad_norm": 1.2802035808563232, + "learning_rate": 2.9701754385964913e-05, + "loss": 0.5516, + "step": 8492 + }, + { + "epoch": 10.902439024390244, + "grad_norm": 4.256333827972412, + "learning_rate": 2.970132648694908e-05, + "loss": 0.5224, + "step": 8493 + }, + { + "epoch": 10.90372272143774, + "grad_norm": 1.9762952327728271, + "learning_rate": 2.970089858793325e-05, + "loss": 0.558, + "step": 8494 + }, + { + "epoch": 10.905006418485238, + "grad_norm": 1.515770673751831, + "learning_rate": 2.9700470688917415e-05, + "loss": 0.5727, + "step": 8495 + }, + { + "epoch": 10.906290115532734, + "grad_norm": 3.593242645263672, + "learning_rate": 2.9700042789901583e-05, + "loss": 0.5734, + "step": 8496 + }, + { + "epoch": 10.907573812580232, + "grad_norm": 1.4856661558151245, + "learning_rate": 2.969961489088575e-05, + "loss": 0.5437, + "step": 8497 + }, + { + "epoch": 10.908857509627728, + "grad_norm": 2.007547378540039, + "learning_rate": 2.969918699186992e-05, + "loss": 0.5927, + "step": 8498 + }, + { + "epoch": 10.910141206675224, + "grad_norm": 1.0538887977600098, + "learning_rate": 2.9698759092854088e-05, + "loss": 0.5705, + "step": 8499 + }, + { + "epoch": 10.911424903722722, + "grad_norm": 3.8736870288848877, + "learning_rate": 2.9698331193838257e-05, + "loss": 0.5663, + "step": 8500 + }, + { + "epoch": 10.912708600770218, + "grad_norm": 1.649274468421936, + "learning_rate": 2.9697903294822422e-05, + "loss": 0.5641, + "step": 8501 + }, + { + "epoch": 10.913992297817716, + "grad_norm": 3.304868698120117, + "learning_rate": 2.969747539580659e-05, + "loss": 0.5733, + "step": 8502 + }, + { + "epoch": 10.915275994865212, + "grad_norm": 0.9616047143936157, + "learning_rate": 
2.969704749679076e-05, + "loss": 0.578, + "step": 8503 + }, + { + "epoch": 10.916559691912708, + "grad_norm": 1.6350455284118652, + "learning_rate": 2.9696619597774924e-05, + "loss": 0.5679, + "step": 8504 + }, + { + "epoch": 10.917843388960206, + "grad_norm": 2.4389376640319824, + "learning_rate": 2.9696191698759095e-05, + "loss": 0.5213, + "step": 8505 + }, + { + "epoch": 10.919127086007702, + "grad_norm": 1.393726110458374, + "learning_rate": 2.969576379974326e-05, + "loss": 0.546, + "step": 8506 + }, + { + "epoch": 10.9204107830552, + "grad_norm": 1.532312035560608, + "learning_rate": 2.9695335900727432e-05, + "loss": 0.5234, + "step": 8507 + }, + { + "epoch": 10.921694480102696, + "grad_norm": 2.2577154636383057, + "learning_rate": 2.9694908001711597e-05, + "loss": 0.5751, + "step": 8508 + }, + { + "epoch": 10.922978177150192, + "grad_norm": 4.488059043884277, + "learning_rate": 2.9694480102695762e-05, + "loss": 0.5705, + "step": 8509 + }, + { + "epoch": 10.92426187419769, + "grad_norm": 1.717396855354309, + "learning_rate": 2.9694052203679934e-05, + "loss": 0.5241, + "step": 8510 + }, + { + "epoch": 10.925545571245186, + "grad_norm": 3.974602460861206, + "learning_rate": 2.96936243046641e-05, + "loss": 0.5945, + "step": 8511 + }, + { + "epoch": 10.926829268292684, + "grad_norm": 3.1125903129577637, + "learning_rate": 2.9693196405648267e-05, + "loss": 0.5801, + "step": 8512 + }, + { + "epoch": 10.92811296534018, + "grad_norm": 1.985978603363037, + "learning_rate": 2.9692768506632436e-05, + "loss": 0.5707, + "step": 8513 + }, + { + "epoch": 10.929396662387676, + "grad_norm": 7.754435062408447, + "learning_rate": 2.9692340607616604e-05, + "loss": 0.5963, + "step": 8514 + }, + { + "epoch": 10.930680359435174, + "grad_norm": 1.6345889568328857, + "learning_rate": 2.9691912708600773e-05, + "loss": 0.6239, + "step": 8515 + }, + { + "epoch": 10.93196405648267, + "grad_norm": 1.5476454496383667, + "learning_rate": 2.9691484809584938e-05, + "loss": 0.5381, + "step": 
8516 + }, + { + "epoch": 10.933247753530168, + "grad_norm": 1.7316142320632935, + "learning_rate": 2.9691056910569106e-05, + "loss": 0.563, + "step": 8517 + }, + { + "epoch": 10.934531450577664, + "grad_norm": 2.0030295848846436, + "learning_rate": 2.9690629011553275e-05, + "loss": 0.5343, + "step": 8518 + }, + { + "epoch": 10.93581514762516, + "grad_norm": 2.6779794692993164, + "learning_rate": 2.9690201112537443e-05, + "loss": 0.5572, + "step": 8519 + }, + { + "epoch": 10.937098844672658, + "grad_norm": 1.317755937576294, + "learning_rate": 2.9689773213521608e-05, + "loss": 0.5737, + "step": 8520 + }, + { + "epoch": 10.938382541720154, + "grad_norm": 1.7982478141784668, + "learning_rate": 2.9689345314505776e-05, + "loss": 0.5493, + "step": 8521 + }, + { + "epoch": 10.93966623876765, + "grad_norm": 1.6308609247207642, + "learning_rate": 2.9688917415489945e-05, + "loss": 0.5104, + "step": 8522 + }, + { + "epoch": 10.940949935815148, + "grad_norm": 4.011989116668701, + "learning_rate": 2.9688489516474113e-05, + "loss": 0.5865, + "step": 8523 + }, + { + "epoch": 10.942233632862644, + "grad_norm": 1.4731312990188599, + "learning_rate": 2.968806161745828e-05, + "loss": 0.5272, + "step": 8524 + }, + { + "epoch": 10.943517329910142, + "grad_norm": 3.6135506629943848, + "learning_rate": 2.9687633718442447e-05, + "loss": 0.5948, + "step": 8525 + }, + { + "epoch": 10.944801026957638, + "grad_norm": 1.7859621047973633, + "learning_rate": 2.968720581942662e-05, + "loss": 0.5929, + "step": 8526 + }, + { + "epoch": 10.946084724005134, + "grad_norm": 4.206851482391357, + "learning_rate": 2.9686777920410783e-05, + "loss": 0.6031, + "step": 8527 + }, + { + "epoch": 10.947368421052632, + "grad_norm": 3.71649432182312, + "learning_rate": 2.968635002139495e-05, + "loss": 0.6032, + "step": 8528 + }, + { + "epoch": 10.948652118100128, + "grad_norm": 2.001253366470337, + "learning_rate": 2.968592212237912e-05, + "loss": 0.5716, + "step": 8529 + }, + { + "epoch": 10.949935815147626, + 
"grad_norm": 3.6916701793670654, + "learning_rate": 2.9685494223363285e-05, + "loss": 0.6028, + "step": 8530 + }, + { + "epoch": 10.951219512195122, + "grad_norm": 3.167480230331421, + "learning_rate": 2.9685066324347457e-05, + "loss": 0.5956, + "step": 8531 + }, + { + "epoch": 10.952503209242618, + "grad_norm": 1.6652333736419678, + "learning_rate": 2.9684638425331622e-05, + "loss": 0.5804, + "step": 8532 + }, + { + "epoch": 10.953786906290116, + "grad_norm": 1.7780088186264038, + "learning_rate": 2.968421052631579e-05, + "loss": 0.6061, + "step": 8533 + }, + { + "epoch": 10.955070603337612, + "grad_norm": 2.654712438583374, + "learning_rate": 2.968378262729996e-05, + "loss": 0.6281, + "step": 8534 + }, + { + "epoch": 10.95635430038511, + "grad_norm": 3.57722806930542, + "learning_rate": 2.9683354728284124e-05, + "loss": 0.675, + "step": 8535 + }, + { + "epoch": 10.957637997432606, + "grad_norm": 1.4246045351028442, + "learning_rate": 2.9682926829268292e-05, + "loss": 0.6025, + "step": 8536 + }, + { + "epoch": 10.958921694480102, + "grad_norm": 2.7097623348236084, + "learning_rate": 2.968249893025246e-05, + "loss": 0.7173, + "step": 8537 + }, + { + "epoch": 10.9602053915276, + "grad_norm": 5.561372756958008, + "learning_rate": 2.968207103123663e-05, + "loss": 0.6208, + "step": 8538 + }, + { + "epoch": 10.961489088575096, + "grad_norm": 2.3990519046783447, + "learning_rate": 2.9681643132220798e-05, + "loss": 0.727, + "step": 8539 + }, + { + "epoch": 10.962772785622594, + "grad_norm": 2.395203113555908, + "learning_rate": 2.9681215233204966e-05, + "loss": 0.8455, + "step": 8540 + }, + { + "epoch": 10.96405648267009, + "grad_norm": 1.4537984132766724, + "learning_rate": 2.968078733418913e-05, + "loss": 0.568, + "step": 8541 + }, + { + "epoch": 10.965340179717586, + "grad_norm": 4.008123874664307, + "learning_rate": 2.96803594351733e-05, + "loss": 0.5759, + "step": 8542 + }, + { + "epoch": 10.966623876765084, + "grad_norm": 1.5204176902770996, + "learning_rate": 
2.9679931536157468e-05, + "loss": 0.5664, + "step": 8543 + }, + { + "epoch": 10.96790757381258, + "grad_norm": 1.0555717945098877, + "learning_rate": 2.9679503637141633e-05, + "loss": 0.5333, + "step": 8544 + }, + { + "epoch": 10.969191270860078, + "grad_norm": 3.195934772491455, + "learning_rate": 2.9679075738125805e-05, + "loss": 0.5438, + "step": 8545 + }, + { + "epoch": 10.970474967907574, + "grad_norm": 1.9135290384292603, + "learning_rate": 2.967864783910997e-05, + "loss": 0.5402, + "step": 8546 + }, + { + "epoch": 10.97175866495507, + "grad_norm": 1.3743404150009155, + "learning_rate": 2.967821994009414e-05, + "loss": 0.5516, + "step": 8547 + }, + { + "epoch": 10.973042362002568, + "grad_norm": 1.5686269998550415, + "learning_rate": 2.9677792041078307e-05, + "loss": 0.561, + "step": 8548 + }, + { + "epoch": 10.974326059050064, + "grad_norm": 1.1636816263198853, + "learning_rate": 2.967736414206247e-05, + "loss": 0.5714, + "step": 8549 + }, + { + "epoch": 10.975609756097562, + "grad_norm": 1.1972390413284302, + "learning_rate": 2.9676936243046643e-05, + "loss": 0.5218, + "step": 8550 + }, + { + "epoch": 10.976893453145058, + "grad_norm": 3.16988468170166, + "learning_rate": 2.967650834403081e-05, + "loss": 0.6018, + "step": 8551 + }, + { + "epoch": 10.978177150192554, + "grad_norm": 1.1684681177139282, + "learning_rate": 2.9676080445014977e-05, + "loss": 0.5585, + "step": 8552 + }, + { + "epoch": 10.979460847240052, + "grad_norm": 1.4215651750564575, + "learning_rate": 2.9675652545999145e-05, + "loss": 0.5781, + "step": 8553 + }, + { + "epoch": 10.980744544287548, + "grad_norm": 2.104153633117676, + "learning_rate": 2.9675224646983314e-05, + "loss": 0.5782, + "step": 8554 + }, + { + "epoch": 10.982028241335044, + "grad_norm": 2.5567750930786133, + "learning_rate": 2.9674796747967482e-05, + "loss": 0.5516, + "step": 8555 + }, + { + "epoch": 10.983311938382542, + "grad_norm": 3.9561688899993896, + "learning_rate": 2.9674368848951647e-05, + "loss": 0.5922, + 
"step": 8556 + }, + { + "epoch": 10.984595635430038, + "grad_norm": 2.172687292098999, + "learning_rate": 2.9673940949935816e-05, + "loss": 0.5684, + "step": 8557 + }, + { + "epoch": 10.985879332477536, + "grad_norm": 1.704567313194275, + "learning_rate": 2.9673513050919984e-05, + "loss": 0.5689, + "step": 8558 + }, + { + "epoch": 10.987163029525032, + "grad_norm": 4.539129257202148, + "learning_rate": 2.9673085151904152e-05, + "loss": 0.5787, + "step": 8559 + }, + { + "epoch": 10.988446726572528, + "grad_norm": 1.852734923362732, + "learning_rate": 2.9672657252888317e-05, + "loss": 0.6282, + "step": 8560 + }, + { + "epoch": 10.989730423620026, + "grad_norm": 2.675172805786133, + "learning_rate": 2.967222935387249e-05, + "loss": 0.5668, + "step": 8561 + }, + { + "epoch": 10.991014120667522, + "grad_norm": 2.6914026737213135, + "learning_rate": 2.9671801454856654e-05, + "loss": 0.5622, + "step": 8562 + }, + { + "epoch": 10.99229781771502, + "grad_norm": 1.5746749639511108, + "learning_rate": 2.9671373555840823e-05, + "loss": 0.6189, + "step": 8563 + }, + { + "epoch": 10.993581514762516, + "grad_norm": 2.8081982135772705, + "learning_rate": 2.967094565682499e-05, + "loss": 0.6246, + "step": 8564 + }, + { + "epoch": 10.994865211810012, + "grad_norm": 3.0254950523376465, + "learning_rate": 2.9670517757809156e-05, + "loss": 0.6457, + "step": 8565 + }, + { + "epoch": 10.99614890885751, + "grad_norm": 2.5608348846435547, + "learning_rate": 2.9670089858793328e-05, + "loss": 0.6161, + "step": 8566 + }, + { + "epoch": 10.997432605905006, + "grad_norm": 2.2211780548095703, + "learning_rate": 2.9669661959777493e-05, + "loss": 0.6003, + "step": 8567 + }, + { + "epoch": 10.998716302952504, + "grad_norm": 5.165369510650635, + "learning_rate": 2.966923406076166e-05, + "loss": 0.7167, + "step": 8568 + }, + { + "epoch": 11.0, + "grad_norm": 3.6669304370880127, + "learning_rate": 2.966880616174583e-05, + "loss": 0.8057, + "step": 8569 + }, + { + "epoch": 11.001283697047496, + 
"grad_norm": 1.1892154216766357, + "learning_rate": 2.9668378262729995e-05, + "loss": 0.5475, + "step": 8570 + }, + { + "epoch": 11.002567394094994, + "grad_norm": 1.5319843292236328, + "learning_rate": 2.9667950363714167e-05, + "loss": 0.5145, + "step": 8571 + }, + { + "epoch": 11.00385109114249, + "grad_norm": 2.3662912845611572, + "learning_rate": 2.966752246469833e-05, + "loss": 0.5596, + "step": 8572 + }, + { + "epoch": 11.005134788189988, + "grad_norm": 1.541764259338379, + "learning_rate": 2.96670945656825e-05, + "loss": 0.5225, + "step": 8573 + }, + { + "epoch": 11.006418485237484, + "grad_norm": 1.2376567125320435, + "learning_rate": 2.966666666666667e-05, + "loss": 0.5314, + "step": 8574 + }, + { + "epoch": 11.00770218228498, + "grad_norm": 1.1614588499069214, + "learning_rate": 2.9666238767650837e-05, + "loss": 0.5161, + "step": 8575 + }, + { + "epoch": 11.008985879332478, + "grad_norm": 2.039641857147217, + "learning_rate": 2.9665810868635002e-05, + "loss": 0.5597, + "step": 8576 + }, + { + "epoch": 11.010269576379974, + "grad_norm": 1.4833791255950928, + "learning_rate": 2.966538296961917e-05, + "loss": 0.5577, + "step": 8577 + }, + { + "epoch": 11.011553273427472, + "grad_norm": 1.5173420906066895, + "learning_rate": 2.966495507060334e-05, + "loss": 0.532, + "step": 8578 + }, + { + "epoch": 11.012836970474968, + "grad_norm": 1.0747170448303223, + "learning_rate": 2.9664527171587507e-05, + "loss": 0.5555, + "step": 8579 + }, + { + "epoch": 11.014120667522464, + "grad_norm": 2.590625286102295, + "learning_rate": 2.9664099272571675e-05, + "loss": 0.5648, + "step": 8580 + }, + { + "epoch": 11.015404364569962, + "grad_norm": 2.1242117881774902, + "learning_rate": 2.966367137355584e-05, + "loss": 0.5159, + "step": 8581 + }, + { + "epoch": 11.016688061617458, + "grad_norm": 1.8722811937332153, + "learning_rate": 2.966324347454001e-05, + "loss": 0.5533, + "step": 8582 + }, + { + "epoch": 11.017971758664956, + "grad_norm": 1.5678722858428955, + 
"learning_rate": 2.9662815575524177e-05, + "loss": 0.5328, + "step": 8583 + }, + { + "epoch": 11.019255455712452, + "grad_norm": 1.6650067567825317, + "learning_rate": 2.9662387676508342e-05, + "loss": 0.5077, + "step": 8584 + }, + { + "epoch": 11.020539152759948, + "grad_norm": 27.077255249023438, + "learning_rate": 2.9661959777492514e-05, + "loss": 0.5844, + "step": 8585 + }, + { + "epoch": 11.021822849807446, + "grad_norm": 1.1639775037765503, + "learning_rate": 2.966153187847668e-05, + "loss": 0.6028, + "step": 8586 + }, + { + "epoch": 11.023106546854942, + "grad_norm": 1.3899257183074951, + "learning_rate": 2.966110397946085e-05, + "loss": 0.5284, + "step": 8587 + }, + { + "epoch": 11.024390243902438, + "grad_norm": 2.6164631843566895, + "learning_rate": 2.9660676080445016e-05, + "loss": 0.5464, + "step": 8588 + }, + { + "epoch": 11.025673940949936, + "grad_norm": 1.0521341562271118, + "learning_rate": 2.966024818142918e-05, + "loss": 0.5295, + "step": 8589 + }, + { + "epoch": 11.026957637997432, + "grad_norm": 1.4870014190673828, + "learning_rate": 2.9659820282413353e-05, + "loss": 0.5723, + "step": 8590 + }, + { + "epoch": 11.02824133504493, + "grad_norm": 3.947341203689575, + "learning_rate": 2.9659392383397518e-05, + "loss": 0.5248, + "step": 8591 + }, + { + "epoch": 11.029525032092426, + "grad_norm": 2.0798962116241455, + "learning_rate": 2.9658964484381686e-05, + "loss": 0.5456, + "step": 8592 + }, + { + "epoch": 11.030808729139922, + "grad_norm": 5.2287373542785645, + "learning_rate": 2.9658536585365855e-05, + "loss": 0.514, + "step": 8593 + }, + { + "epoch": 11.03209242618742, + "grad_norm": 3.227015256881714, + "learning_rate": 2.9658108686350023e-05, + "loss": 0.5489, + "step": 8594 + }, + { + "epoch": 11.033376123234916, + "grad_norm": 1.2861337661743164, + "learning_rate": 2.965768078733419e-05, + "loss": 0.6036, + "step": 8595 + }, + { + "epoch": 11.034659820282414, + "grad_norm": 1.3111519813537598, + "learning_rate": 2.9657252888318356e-05, + 
"loss": 0.557, + "step": 8596 + }, + { + "epoch": 11.03594351732991, + "grad_norm": 1.2193188667297363, + "learning_rate": 2.9656824989302525e-05, + "loss": 0.5412, + "step": 8597 + }, + { + "epoch": 11.037227214377406, + "grad_norm": 1.7122762203216553, + "learning_rate": 2.9656397090286693e-05, + "loss": 0.526, + "step": 8598 + }, + { + "epoch": 11.038510911424904, + "grad_norm": 1.842802882194519, + "learning_rate": 2.9655969191270862e-05, + "loss": 0.5926, + "step": 8599 + }, + { + "epoch": 11.0397946084724, + "grad_norm": 1.7573570013046265, + "learning_rate": 2.9655541292255027e-05, + "loss": 0.5404, + "step": 8600 + }, + { + "epoch": 11.041078305519898, + "grad_norm": 2.6225249767303467, + "learning_rate": 2.96551133932392e-05, + "loss": 0.5237, + "step": 8601 + }, + { + "epoch": 11.042362002567394, + "grad_norm": 1.0848647356033325, + "learning_rate": 2.9654685494223364e-05, + "loss": 0.5479, + "step": 8602 + }, + { + "epoch": 11.04364569961489, + "grad_norm": 2.3523166179656982, + "learning_rate": 2.9654257595207532e-05, + "loss": 0.5944, + "step": 8603 + }, + { + "epoch": 11.044929396662388, + "grad_norm": 2.9355762004852295, + "learning_rate": 2.96538296961917e-05, + "loss": 0.5486, + "step": 8604 + }, + { + "epoch": 11.046213093709884, + "grad_norm": 1.2065171003341675, + "learning_rate": 2.9653401797175865e-05, + "loss": 0.5685, + "step": 8605 + }, + { + "epoch": 11.047496790757382, + "grad_norm": 2.1890809535980225, + "learning_rate": 2.9652973898160037e-05, + "loss": 0.5757, + "step": 8606 + }, + { + "epoch": 11.048780487804878, + "grad_norm": 4.3867573738098145, + "learning_rate": 2.9652545999144202e-05, + "loss": 0.5531, + "step": 8607 + }, + { + "epoch": 11.050064184852374, + "grad_norm": 1.355055809020996, + "learning_rate": 2.965211810012837e-05, + "loss": 0.5787, + "step": 8608 + }, + { + "epoch": 11.051347881899872, + "grad_norm": 1.8579846620559692, + "learning_rate": 2.965169020111254e-05, + "loss": 0.5503, + "step": 8609 + }, + { + "epoch": 
11.052631578947368, + "grad_norm": 3.0237488746643066, + "learning_rate": 2.9651262302096704e-05, + "loss": 0.6198, + "step": 8610 + }, + { + "epoch": 11.053915275994866, + "grad_norm": 2.9409282207489014, + "learning_rate": 2.9650834403080872e-05, + "loss": 0.6596, + "step": 8611 + }, + { + "epoch": 11.055198973042362, + "grad_norm": 2.2980635166168213, + "learning_rate": 2.965040650406504e-05, + "loss": 0.6003, + "step": 8612 + }, + { + "epoch": 11.056482670089858, + "grad_norm": 2.278310775756836, + "learning_rate": 2.964997860504921e-05, + "loss": 0.5847, + "step": 8613 + }, + { + "epoch": 11.057766367137356, + "grad_norm": 2.3825159072875977, + "learning_rate": 2.9649550706033378e-05, + "loss": 0.5971, + "step": 8614 + }, + { + "epoch": 11.059050064184852, + "grad_norm": 2.6809451580047607, + "learning_rate": 2.9649122807017546e-05, + "loss": 0.5885, + "step": 8615 + }, + { + "epoch": 11.06033376123235, + "grad_norm": 2.0249149799346924, + "learning_rate": 2.964869490800171e-05, + "loss": 0.6365, + "step": 8616 + }, + { + "epoch": 11.061617458279846, + "grad_norm": 3.979377508163452, + "learning_rate": 2.964826700898588e-05, + "loss": 0.5981, + "step": 8617 + }, + { + "epoch": 11.062901155327342, + "grad_norm": 2.6563496589660645, + "learning_rate": 2.9647839109970048e-05, + "loss": 0.7029, + "step": 8618 + }, + { + "epoch": 11.06418485237484, + "grad_norm": 3.985651969909668, + "learning_rate": 2.9647411210954213e-05, + "loss": 0.822, + "step": 8619 + }, + { + "epoch": 11.065468549422336, + "grad_norm": 0.932644248008728, + "learning_rate": 2.9646983311938385e-05, + "loss": 0.5051, + "step": 8620 + }, + { + "epoch": 11.066752246469832, + "grad_norm": 3.2242166996002197, + "learning_rate": 2.964655541292255e-05, + "loss": 0.5298, + "step": 8621 + }, + { + "epoch": 11.06803594351733, + "grad_norm": 1.5647015571594238, + "learning_rate": 2.964612751390672e-05, + "loss": 0.5528, + "step": 8622 + }, + { + "epoch": 11.069319640564826, + "grad_norm": 
1.9987504482269287, + "learning_rate": 2.9645699614890887e-05, + "loss": 0.5529, + "step": 8623 + }, + { + "epoch": 11.070603337612324, + "grad_norm": 2.373969078063965, + "learning_rate": 2.964527171587505e-05, + "loss": 0.5509, + "step": 8624 + }, + { + "epoch": 11.07188703465982, + "grad_norm": 2.1654536724090576, + "learning_rate": 2.9644843816859223e-05, + "loss": 0.5659, + "step": 8625 + }, + { + "epoch": 11.073170731707316, + "grad_norm": 1.4392931461334229, + "learning_rate": 2.964441591784339e-05, + "loss": 0.5366, + "step": 8626 + }, + { + "epoch": 11.074454428754814, + "grad_norm": 1.5110360383987427, + "learning_rate": 2.9643988018827557e-05, + "loss": 0.5322, + "step": 8627 + }, + { + "epoch": 11.07573812580231, + "grad_norm": 2.4981820583343506, + "learning_rate": 2.9643560119811725e-05, + "loss": 0.571, + "step": 8628 + }, + { + "epoch": 11.077021822849808, + "grad_norm": 1.8383439779281616, + "learning_rate": 2.9643132220795894e-05, + "loss": 0.5837, + "step": 8629 + }, + { + "epoch": 11.078305519897304, + "grad_norm": 3.303847551345825, + "learning_rate": 2.9642704321780062e-05, + "loss": 0.5457, + "step": 8630 + }, + { + "epoch": 11.0795892169448, + "grad_norm": 1.307942271232605, + "learning_rate": 2.9642276422764227e-05, + "loss": 0.538, + "step": 8631 + }, + { + "epoch": 11.080872913992298, + "grad_norm": 9.519554138183594, + "learning_rate": 2.9641848523748396e-05, + "loss": 0.545, + "step": 8632 + }, + { + "epoch": 11.082156611039794, + "grad_norm": 3.284416675567627, + "learning_rate": 2.9641420624732564e-05, + "loss": 0.52, + "step": 8633 + }, + { + "epoch": 11.083440308087292, + "grad_norm": 2.496885299682617, + "learning_rate": 2.9640992725716732e-05, + "loss": 0.6168, + "step": 8634 + }, + { + "epoch": 11.084724005134788, + "grad_norm": 5.274321556091309, + "learning_rate": 2.9640564826700897e-05, + "loss": 0.5359, + "step": 8635 + }, + { + "epoch": 11.086007702182284, + "grad_norm": 2.877185583114624, + "learning_rate": 
2.964013692768507e-05, + "loss": 0.5577, + "step": 8636 + }, + { + "epoch": 11.087291399229782, + "grad_norm": 4.366084575653076, + "learning_rate": 2.9639709028669234e-05, + "loss": 0.5237, + "step": 8637 + }, + { + "epoch": 11.088575096277278, + "grad_norm": 1.1624175310134888, + "learning_rate": 2.9639281129653403e-05, + "loss": 0.5565, + "step": 8638 + }, + { + "epoch": 11.089858793324776, + "grad_norm": 6.891331195831299, + "learning_rate": 2.963885323063757e-05, + "loss": 0.587, + "step": 8639 + }, + { + "epoch": 11.091142490372272, + "grad_norm": 3.6289310455322266, + "learning_rate": 2.9638425331621736e-05, + "loss": 0.5179, + "step": 8640 + }, + { + "epoch": 11.092426187419768, + "grad_norm": 3.3137195110321045, + "learning_rate": 2.9637997432605908e-05, + "loss": 0.563, + "step": 8641 + }, + { + "epoch": 11.093709884467266, + "grad_norm": 4.19423246383667, + "learning_rate": 2.9637569533590073e-05, + "loss": 0.581, + "step": 8642 + }, + { + "epoch": 11.094993581514762, + "grad_norm": 2.9055938720703125, + "learning_rate": 2.9637141634574238e-05, + "loss": 0.5575, + "step": 8643 + }, + { + "epoch": 11.09627727856226, + "grad_norm": 1.3121933937072754, + "learning_rate": 2.963671373555841e-05, + "loss": 0.5847, + "step": 8644 + }, + { + "epoch": 11.097560975609756, + "grad_norm": 1.9849541187286377, + "learning_rate": 2.9636285836542575e-05, + "loss": 0.5454, + "step": 8645 + }, + { + "epoch": 11.098844672657252, + "grad_norm": 1.9915344715118408, + "learning_rate": 2.9635857937526747e-05, + "loss": 0.5518, + "step": 8646 + }, + { + "epoch": 11.10012836970475, + "grad_norm": 3.223275899887085, + "learning_rate": 2.963543003851091e-05, + "loss": 0.5428, + "step": 8647 + }, + { + "epoch": 11.101412066752246, + "grad_norm": 1.4243210554122925, + "learning_rate": 2.963500213949508e-05, + "loss": 0.5679, + "step": 8648 + }, + { + "epoch": 11.102695763799744, + "grad_norm": 1.2481420040130615, + "learning_rate": 2.963457424047925e-05, + "loss": 0.6049, + "step": 
8649 + }, + { + "epoch": 11.10397946084724, + "grad_norm": 3.174309015274048, + "learning_rate": 2.9634146341463413e-05, + "loss": 0.5876, + "step": 8650 + }, + { + "epoch": 11.105263157894736, + "grad_norm": 1.9612644910812378, + "learning_rate": 2.9633718442447582e-05, + "loss": 0.5639, + "step": 8651 + }, + { + "epoch": 11.106546854942234, + "grad_norm": 1.7976902723312378, + "learning_rate": 2.963329054343175e-05, + "loss": 0.6406, + "step": 8652 + }, + { + "epoch": 11.10783055198973, + "grad_norm": 4.070161819458008, + "learning_rate": 2.963286264441592e-05, + "loss": 0.5892, + "step": 8653 + }, + { + "epoch": 11.109114249037226, + "grad_norm": 1.6915584802627563, + "learning_rate": 2.9632434745400087e-05, + "loss": 0.5529, + "step": 8654 + }, + { + "epoch": 11.110397946084724, + "grad_norm": 2.94775652885437, + "learning_rate": 2.9632006846384256e-05, + "loss": 0.5789, + "step": 8655 + }, + { + "epoch": 11.11168164313222, + "grad_norm": 3.0415456295013428, + "learning_rate": 2.963157894736842e-05, + "loss": 0.6879, + "step": 8656 + }, + { + "epoch": 11.112965340179718, + "grad_norm": 3.274430751800537, + "learning_rate": 2.963115104835259e-05, + "loss": 0.5622, + "step": 8657 + }, + { + "epoch": 11.114249037227214, + "grad_norm": 2.8460748195648193, + "learning_rate": 2.9630723149336757e-05, + "loss": 0.6262, + "step": 8658 + }, + { + "epoch": 11.11553273427471, + "grad_norm": 2.3529560565948486, + "learning_rate": 2.9630295250320922e-05, + "loss": 0.5919, + "step": 8659 + }, + { + "epoch": 11.116816431322208, + "grad_norm": 3.3168511390686035, + "learning_rate": 2.9629867351305094e-05, + "loss": 0.6552, + "step": 8660 + }, + { + "epoch": 11.118100128369704, + "grad_norm": 3.5624287128448486, + "learning_rate": 2.962943945228926e-05, + "loss": 0.6334, + "step": 8661 + }, + { + "epoch": 11.119383825417202, + "grad_norm": 2.5505921840667725, + "learning_rate": 2.962901155327343e-05, + "loss": 0.5867, + "step": 8662 + }, + { + "epoch": 11.120667522464698, + 
"grad_norm": 2.364163875579834, + "learning_rate": 2.9628583654257596e-05, + "loss": 0.6279, + "step": 8663 + }, + { + "epoch": 11.121951219512194, + "grad_norm": 1.7666953802108765, + "learning_rate": 2.962815575524176e-05, + "loss": 0.5942, + "step": 8664 + }, + { + "epoch": 11.123234916559692, + "grad_norm": 1.645846962928772, + "learning_rate": 2.9627727856225933e-05, + "loss": 0.625, + "step": 8665 + }, + { + "epoch": 11.124518613607188, + "grad_norm": 2.264536142349243, + "learning_rate": 2.9627299957210098e-05, + "loss": 0.6232, + "step": 8666 + }, + { + "epoch": 11.125802310654686, + "grad_norm": 5.888008117675781, + "learning_rate": 2.9626872058194266e-05, + "loss": 0.6818, + "step": 8667 + }, + { + "epoch": 11.127086007702182, + "grad_norm": 4.109063148498535, + "learning_rate": 2.9626444159178435e-05, + "loss": 0.7209, + "step": 8668 + }, + { + "epoch": 11.128369704749678, + "grad_norm": 2.169084310531616, + "learning_rate": 2.9626016260162603e-05, + "loss": 0.8266, + "step": 8669 + }, + { + "epoch": 11.129653401797176, + "grad_norm": 4.2549824714660645, + "learning_rate": 2.962558836114677e-05, + "loss": 0.5318, + "step": 8670 + }, + { + "epoch": 11.130937098844672, + "grad_norm": 1.5091297626495361, + "learning_rate": 2.9625160462130937e-05, + "loss": 0.5456, + "step": 8671 + }, + { + "epoch": 11.13222079589217, + "grad_norm": 1.715837001800537, + "learning_rate": 2.9624732563115105e-05, + "loss": 0.5658, + "step": 8672 + }, + { + "epoch": 11.133504492939666, + "grad_norm": 0.8235031962394714, + "learning_rate": 2.9624304664099273e-05, + "loss": 0.5456, + "step": 8673 + }, + { + "epoch": 11.134788189987162, + "grad_norm": 1.4083298444747925, + "learning_rate": 2.9623876765083442e-05, + "loss": 0.5397, + "step": 8674 + }, + { + "epoch": 11.13607188703466, + "grad_norm": 3.049839496612549, + "learning_rate": 2.9623448866067607e-05, + "loss": 0.5356, + "step": 8675 + }, + { + "epoch": 11.137355584082156, + "grad_norm": 16.62083625793457, + 
"learning_rate": 2.962302096705178e-05, + "loss": 0.5582, + "step": 8676 + }, + { + "epoch": 11.138639281129654, + "grad_norm": 1.0076911449432373, + "learning_rate": 2.9622593068035944e-05, + "loss": 0.5421, + "step": 8677 + }, + { + "epoch": 11.13992297817715, + "grad_norm": 2.113797187805176, + "learning_rate": 2.9622165169020112e-05, + "loss": 0.5761, + "step": 8678 + }, + { + "epoch": 11.141206675224646, + "grad_norm": 2.318969488143921, + "learning_rate": 2.962173727000428e-05, + "loss": 0.56, + "step": 8679 + }, + { + "epoch": 11.142490372272144, + "grad_norm": 2.0826473236083984, + "learning_rate": 2.9621309370988445e-05, + "loss": 0.6094, + "step": 8680 + }, + { + "epoch": 11.14377406931964, + "grad_norm": 1.9010282754898071, + "learning_rate": 2.9620881471972617e-05, + "loss": 0.5848, + "step": 8681 + }, + { + "epoch": 11.145057766367138, + "grad_norm": 1.2002233266830444, + "learning_rate": 2.9620453572956782e-05, + "loss": 0.5415, + "step": 8682 + }, + { + "epoch": 11.146341463414634, + "grad_norm": 1.863533854484558, + "learning_rate": 2.962002567394095e-05, + "loss": 0.5326, + "step": 8683 + }, + { + "epoch": 11.14762516046213, + "grad_norm": 1.8378139734268188, + "learning_rate": 2.961959777492512e-05, + "loss": 0.5078, + "step": 8684 + }, + { + "epoch": 11.148908857509628, + "grad_norm": 1.5378198623657227, + "learning_rate": 2.9619169875909284e-05, + "loss": 0.5733, + "step": 8685 + }, + { + "epoch": 11.150192554557124, + "grad_norm": 2.1074962615966797, + "learning_rate": 2.9618741976893456e-05, + "loss": 0.5729, + "step": 8686 + }, + { + "epoch": 11.15147625160462, + "grad_norm": 2.1021265983581543, + "learning_rate": 2.961831407787762e-05, + "loss": 0.5625, + "step": 8687 + }, + { + "epoch": 11.152759948652118, + "grad_norm": 1.6781113147735596, + "learning_rate": 2.961788617886179e-05, + "loss": 0.5487, + "step": 8688 + }, + { + "epoch": 11.154043645699614, + "grad_norm": 8.955901145935059, + "learning_rate": 2.9617458279845958e-05, + "loss": 
0.5459, + "step": 8689 + }, + { + "epoch": 11.155327342747112, + "grad_norm": 3.1409337520599365, + "learning_rate": 2.9617030380830126e-05, + "loss": 0.61, + "step": 8690 + }, + { + "epoch": 11.156611039794608, + "grad_norm": 1.2895146608352661, + "learning_rate": 2.961660248181429e-05, + "loss": 0.5481, + "step": 8691 + }, + { + "epoch": 11.157894736842104, + "grad_norm": 1.1343168020248413, + "learning_rate": 2.961617458279846e-05, + "loss": 0.5579, + "step": 8692 + }, + { + "epoch": 11.159178433889602, + "grad_norm": 0.8629897832870483, + "learning_rate": 2.9615746683782628e-05, + "loss": 0.5119, + "step": 8693 + }, + { + "epoch": 11.160462130937098, + "grad_norm": 3.909914493560791, + "learning_rate": 2.9615318784766796e-05, + "loss": 0.5847, + "step": 8694 + }, + { + "epoch": 11.161745827984596, + "grad_norm": 1.8187534809112549, + "learning_rate": 2.9614890885750965e-05, + "loss": 0.575, + "step": 8695 + }, + { + "epoch": 11.163029525032092, + "grad_norm": 0.9524403810501099, + "learning_rate": 2.961446298673513e-05, + "loss": 0.4989, + "step": 8696 + }, + { + "epoch": 11.164313222079588, + "grad_norm": 2.0361714363098145, + "learning_rate": 2.96140350877193e-05, + "loss": 0.5425, + "step": 8697 + }, + { + "epoch": 11.165596919127086, + "grad_norm": 4.810928821563721, + "learning_rate": 2.9613607188703467e-05, + "loss": 0.571, + "step": 8698 + }, + { + "epoch": 11.166880616174582, + "grad_norm": 1.796561360359192, + "learning_rate": 2.9613179289687632e-05, + "loss": 0.5806, + "step": 8699 + }, + { + "epoch": 11.16816431322208, + "grad_norm": 2.2313995361328125, + "learning_rate": 2.9612751390671804e-05, + "loss": 0.6055, + "step": 8700 + }, + { + "epoch": 11.169448010269576, + "grad_norm": 1.4386104345321655, + "learning_rate": 2.961232349165597e-05, + "loss": 0.5857, + "step": 8701 + }, + { + "epoch": 11.170731707317072, + "grad_norm": 2.0007195472717285, + "learning_rate": 2.961189559264014e-05, + "loss": 0.5544, + "step": 8702 + }, + { + "epoch": 
11.17201540436457, + "grad_norm": 1.3037898540496826, + "learning_rate": 2.9611467693624305e-05, + "loss": 0.6048, + "step": 8703 + }, + { + "epoch": 11.173299101412066, + "grad_norm": 1.2609577178955078, + "learning_rate": 2.9611039794608474e-05, + "loss": 0.527, + "step": 8704 + }, + { + "epoch": 11.174582798459564, + "grad_norm": 1.760265588760376, + "learning_rate": 2.9610611895592642e-05, + "loss": 0.5617, + "step": 8705 + }, + { + "epoch": 11.17586649550706, + "grad_norm": 1.4497921466827393, + "learning_rate": 2.9610183996576807e-05, + "loss": 0.5863, + "step": 8706 + }, + { + "epoch": 11.177150192554556, + "grad_norm": 1.7032530307769775, + "learning_rate": 2.9609756097560976e-05, + "loss": 0.6296, + "step": 8707 + }, + { + "epoch": 11.178433889602054, + "grad_norm": 1.910943627357483, + "learning_rate": 2.9609328198545144e-05, + "loss": 0.5791, + "step": 8708 + }, + { + "epoch": 11.17971758664955, + "grad_norm": 1.9815632104873657, + "learning_rate": 2.9608900299529312e-05, + "loss": 0.5326, + "step": 8709 + }, + { + "epoch": 11.181001283697048, + "grad_norm": 5.024496555328369, + "learning_rate": 2.960847240051348e-05, + "loss": 0.5643, + "step": 8710 + }, + { + "epoch": 11.182284980744544, + "grad_norm": 1.3498578071594238, + "learning_rate": 2.9608044501497646e-05, + "loss": 0.5989, + "step": 8711 + }, + { + "epoch": 11.18356867779204, + "grad_norm": 1.6781631708145142, + "learning_rate": 2.9607616602481814e-05, + "loss": 0.6228, + "step": 8712 + }, + { + "epoch": 11.184852374839538, + "grad_norm": 7.632567882537842, + "learning_rate": 2.9607188703465983e-05, + "loss": 0.5787, + "step": 8713 + }, + { + "epoch": 11.186136071887034, + "grad_norm": 1.5214262008666992, + "learning_rate": 2.960676080445015e-05, + "loss": 0.6012, + "step": 8714 + }, + { + "epoch": 11.187419768934532, + "grad_norm": 1.5584912300109863, + "learning_rate": 2.9606332905434316e-05, + "loss": 0.5887, + "step": 8715 + }, + { + "epoch": 11.188703465982028, + "grad_norm": 
3.128624200820923, + "learning_rate": 2.9605905006418488e-05, + "loss": 0.6155, + "step": 8716 + }, + { + "epoch": 11.189987163029524, + "grad_norm": 3.5312583446502686, + "learning_rate": 2.9605477107402653e-05, + "loss": 0.7106, + "step": 8717 + }, + { + "epoch": 11.191270860077022, + "grad_norm": 6.264497756958008, + "learning_rate": 2.960504920838682e-05, + "loss": 0.6909, + "step": 8718 + }, + { + "epoch": 11.192554557124518, + "grad_norm": 3.7712666988372803, + "learning_rate": 2.960462130937099e-05, + "loss": 0.8758, + "step": 8719 + }, + { + "epoch": 11.193838254172016, + "grad_norm": 2.4854636192321777, + "learning_rate": 2.9604193410355155e-05, + "loss": 0.5504, + "step": 8720 + }, + { + "epoch": 11.195121951219512, + "grad_norm": 2.576007127761841, + "learning_rate": 2.9603765511339327e-05, + "loss": 0.5301, + "step": 8721 + }, + { + "epoch": 11.196405648267008, + "grad_norm": 2.3592822551727295, + "learning_rate": 2.960333761232349e-05, + "loss": 0.5323, + "step": 8722 + }, + { + "epoch": 11.197689345314506, + "grad_norm": 1.6412689685821533, + "learning_rate": 2.960290971330766e-05, + "loss": 0.5491, + "step": 8723 + }, + { + "epoch": 11.198973042362002, + "grad_norm": 1.1285043954849243, + "learning_rate": 2.960248181429183e-05, + "loss": 0.571, + "step": 8724 + }, + { + "epoch": 11.200256739409499, + "grad_norm": 2.2053163051605225, + "learning_rate": 2.9602053915275994e-05, + "loss": 0.5543, + "step": 8725 + }, + { + "epoch": 11.201540436456996, + "grad_norm": 2.0069422721862793, + "learning_rate": 2.9601626016260165e-05, + "loss": 0.5402, + "step": 8726 + }, + { + "epoch": 11.202824133504492, + "grad_norm": 1.8175208568572998, + "learning_rate": 2.960119811724433e-05, + "loss": 0.5409, + "step": 8727 + }, + { + "epoch": 11.20410783055199, + "grad_norm": 2.338301420211792, + "learning_rate": 2.96007702182285e-05, + "loss": 0.5466, + "step": 8728 + }, + { + "epoch": 11.205391527599486, + "grad_norm": 2.1319570541381836, + "learning_rate": 
2.9600342319212667e-05, + "loss": 0.549, + "step": 8729 + }, + { + "epoch": 11.206675224646983, + "grad_norm": 7.652103424072266, + "learning_rate": 2.9599914420196836e-05, + "loss": 0.5826, + "step": 8730 + }, + { + "epoch": 11.20795892169448, + "grad_norm": 1.2453464269638062, + "learning_rate": 2.9599486521181e-05, + "loss": 0.5622, + "step": 8731 + }, + { + "epoch": 11.209242618741976, + "grad_norm": 1.9590647220611572, + "learning_rate": 2.959905862216517e-05, + "loss": 0.5147, + "step": 8732 + }, + { + "epoch": 11.210526315789474, + "grad_norm": 4.36486291885376, + "learning_rate": 2.9598630723149337e-05, + "loss": 0.548, + "step": 8733 + }, + { + "epoch": 11.21181001283697, + "grad_norm": 1.3654234409332275, + "learning_rate": 2.9598202824133506e-05, + "loss": 0.5627, + "step": 8734 + }, + { + "epoch": 11.213093709884467, + "grad_norm": 0.9999096989631653, + "learning_rate": 2.9597774925117674e-05, + "loss": 0.5341, + "step": 8735 + }, + { + "epoch": 11.214377406931964, + "grad_norm": 1.7445930242538452, + "learning_rate": 2.959734702610184e-05, + "loss": 0.523, + "step": 8736 + }, + { + "epoch": 11.21566110397946, + "grad_norm": 5.048068046569824, + "learning_rate": 2.959691912708601e-05, + "loss": 0.5034, + "step": 8737 + }, + { + "epoch": 11.216944801026958, + "grad_norm": 1.9181350469589233, + "learning_rate": 2.9596491228070176e-05, + "loss": 0.5355, + "step": 8738 + }, + { + "epoch": 11.218228498074454, + "grad_norm": 1.1050047874450684, + "learning_rate": 2.959606332905434e-05, + "loss": 0.6043, + "step": 8739 + }, + { + "epoch": 11.21951219512195, + "grad_norm": 3.0557668209075928, + "learning_rate": 2.9595635430038513e-05, + "loss": 0.5286, + "step": 8740 + }, + { + "epoch": 11.220795892169448, + "grad_norm": 2.1697885990142822, + "learning_rate": 2.9595207531022678e-05, + "loss": 0.5673, + "step": 8741 + }, + { + "epoch": 11.222079589216944, + "grad_norm": 14.422195434570312, + "learning_rate": 2.959477963200685e-05, + "loss": 0.5531, + "step": 
8742 + }, + { + "epoch": 11.223363286264442, + "grad_norm": 1.2484217882156372, + "learning_rate": 2.9594351732991015e-05, + "loss": 0.5469, + "step": 8743 + }, + { + "epoch": 11.224646983311938, + "grad_norm": 2.199424982070923, + "learning_rate": 2.9593923833975183e-05, + "loss": 0.552, + "step": 8744 + }, + { + "epoch": 11.225930680359435, + "grad_norm": 2.432512044906616, + "learning_rate": 2.959349593495935e-05, + "loss": 0.5561, + "step": 8745 + }, + { + "epoch": 11.227214377406932, + "grad_norm": 5.285175323486328, + "learning_rate": 2.9593068035943517e-05, + "loss": 0.6023, + "step": 8746 + }, + { + "epoch": 11.228498074454428, + "grad_norm": 2.9964420795440674, + "learning_rate": 2.9592640136927685e-05, + "loss": 0.5576, + "step": 8747 + }, + { + "epoch": 11.229781771501926, + "grad_norm": 3.0909600257873535, + "learning_rate": 2.9592212237911853e-05, + "loss": 0.5673, + "step": 8748 + }, + { + "epoch": 11.231065468549422, + "grad_norm": 6.638199329376221, + "learning_rate": 2.9591784338896022e-05, + "loss": 0.564, + "step": 8749 + }, + { + "epoch": 11.232349165596919, + "grad_norm": 2.051743268966675, + "learning_rate": 2.959135643988019e-05, + "loss": 0.5364, + "step": 8750 + }, + { + "epoch": 11.233632862644416, + "grad_norm": 1.875028133392334, + "learning_rate": 2.959092854086436e-05, + "loss": 0.5691, + "step": 8751 + }, + { + "epoch": 11.234916559691912, + "grad_norm": 1.07272207736969, + "learning_rate": 2.9590500641848524e-05, + "loss": 0.5849, + "step": 8752 + }, + { + "epoch": 11.23620025673941, + "grad_norm": 1.7577465772628784, + "learning_rate": 2.9590072742832692e-05, + "loss": 0.5361, + "step": 8753 + }, + { + "epoch": 11.237483953786906, + "grad_norm": 6.8414483070373535, + "learning_rate": 2.958964484381686e-05, + "loss": 0.5757, + "step": 8754 + }, + { + "epoch": 11.238767650834403, + "grad_norm": 1.6976816654205322, + "learning_rate": 2.9589216944801026e-05, + "loss": 0.6091, + "step": 8755 + }, + { + "epoch": 11.2400513478819, + 
"grad_norm": 1.078710675239563, + "learning_rate": 2.9588789045785197e-05, + "loss": 0.5675, + "step": 8756 + }, + { + "epoch": 11.241335044929397, + "grad_norm": 1.4669991731643677, + "learning_rate": 2.9588361146769362e-05, + "loss": 0.577, + "step": 8757 + }, + { + "epoch": 11.242618741976893, + "grad_norm": 2.2956430912017822, + "learning_rate": 2.9587933247753534e-05, + "loss": 0.619, + "step": 8758 + }, + { + "epoch": 11.24390243902439, + "grad_norm": 3.4573445320129395, + "learning_rate": 2.95875053487377e-05, + "loss": 0.5843, + "step": 8759 + }, + { + "epoch": 11.245186136071887, + "grad_norm": 3.805330514907837, + "learning_rate": 2.9587077449721864e-05, + "loss": 0.6098, + "step": 8760 + }, + { + "epoch": 11.246469833119384, + "grad_norm": 4.8036603927612305, + "learning_rate": 2.9586649550706036e-05, + "loss": 0.6168, + "step": 8761 + }, + { + "epoch": 11.24775353016688, + "grad_norm": 2.529400110244751, + "learning_rate": 2.95862216516902e-05, + "loss": 0.6045, + "step": 8762 + }, + { + "epoch": 11.249037227214377, + "grad_norm": 1.8870669603347778, + "learning_rate": 2.958579375267437e-05, + "loss": 0.5993, + "step": 8763 + }, + { + "epoch": 11.250320924261874, + "grad_norm": 2.6951146125793457, + "learning_rate": 2.9585365853658538e-05, + "loss": 0.621, + "step": 8764 + }, + { + "epoch": 11.25160462130937, + "grad_norm": 2.314485788345337, + "learning_rate": 2.9584937954642706e-05, + "loss": 0.585, + "step": 8765 + }, + { + "epoch": 11.252888318356868, + "grad_norm": 10.007341384887695, + "learning_rate": 2.9584510055626875e-05, + "loss": 0.6595, + "step": 8766 + }, + { + "epoch": 11.254172015404365, + "grad_norm": 7.186239242553711, + "learning_rate": 2.958408215661104e-05, + "loss": 0.6819, + "step": 8767 + }, + { + "epoch": 11.25545571245186, + "grad_norm": 3.815166473388672, + "learning_rate": 2.9583654257595208e-05, + "loss": 0.6756, + "step": 8768 + }, + { + "epoch": 11.256739409499358, + "grad_norm": 3.1462841033935547, + "learning_rate": 
2.9583226358579377e-05, + "loss": 0.885, + "step": 8769 + }, + { + "epoch": 11.258023106546855, + "grad_norm": 2.265711784362793, + "learning_rate": 2.9582798459563545e-05, + "loss": 0.5544, + "step": 8770 + }, + { + "epoch": 11.259306803594352, + "grad_norm": 1.3630191087722778, + "learning_rate": 2.958237056054771e-05, + "loss": 0.56, + "step": 8771 + }, + { + "epoch": 11.260590500641849, + "grad_norm": 1.0127339363098145, + "learning_rate": 2.958194266153188e-05, + "loss": 0.5089, + "step": 8772 + }, + { + "epoch": 11.261874197689345, + "grad_norm": 3.2688939571380615, + "learning_rate": 2.9581514762516047e-05, + "loss": 0.5692, + "step": 8773 + }, + { + "epoch": 11.263157894736842, + "grad_norm": 1.6397802829742432, + "learning_rate": 2.9581086863500215e-05, + "loss": 0.5846, + "step": 8774 + }, + { + "epoch": 11.264441591784339, + "grad_norm": 1.9147440195083618, + "learning_rate": 2.9580658964484384e-05, + "loss": 0.5361, + "step": 8775 + }, + { + "epoch": 11.265725288831836, + "grad_norm": 1.2638744115829468, + "learning_rate": 2.958023106546855e-05, + "loss": 0.5889, + "step": 8776 + }, + { + "epoch": 11.267008985879333, + "grad_norm": 1.7453893423080444, + "learning_rate": 2.957980316645272e-05, + "loss": 0.5588, + "step": 8777 + }, + { + "epoch": 11.268292682926829, + "grad_norm": 6.2538042068481445, + "learning_rate": 2.9579375267436885e-05, + "loss": 0.5419, + "step": 8778 + }, + { + "epoch": 11.269576379974326, + "grad_norm": 1.4811832904815674, + "learning_rate": 2.957894736842105e-05, + "loss": 0.5446, + "step": 8779 + }, + { + "epoch": 11.270860077021823, + "grad_norm": 1.3200781345367432, + "learning_rate": 2.9578519469405222e-05, + "loss": 0.5526, + "step": 8780 + }, + { + "epoch": 11.27214377406932, + "grad_norm": 2.8329341411590576, + "learning_rate": 2.9578091570389387e-05, + "loss": 0.5818, + "step": 8781 + }, + { + "epoch": 11.273427471116817, + "grad_norm": 2.594564199447632, + "learning_rate": 2.957766367137356e-05, + "loss": 0.5706, + 
"step": 8782 + }, + { + "epoch": 11.274711168164313, + "grad_norm": 2.1760342121124268, + "learning_rate": 2.9577235772357724e-05, + "loss": 0.5645, + "step": 8783 + }, + { + "epoch": 11.27599486521181, + "grad_norm": 2.6673672199249268, + "learning_rate": 2.9576807873341893e-05, + "loss": 0.5039, + "step": 8784 + }, + { + "epoch": 11.277278562259307, + "grad_norm": 2.2211754322052, + "learning_rate": 2.957637997432606e-05, + "loss": 0.5253, + "step": 8785 + }, + { + "epoch": 11.278562259306804, + "grad_norm": 2.0810210704803467, + "learning_rate": 2.9575952075310226e-05, + "loss": 0.5313, + "step": 8786 + }, + { + "epoch": 11.2798459563543, + "grad_norm": 1.6744095087051392, + "learning_rate": 2.9575524176294394e-05, + "loss": 0.525, + "step": 8787 + }, + { + "epoch": 11.281129653401797, + "grad_norm": 3.3667056560516357, + "learning_rate": 2.9575096277278563e-05, + "loss": 0.5681, + "step": 8788 + }, + { + "epoch": 11.282413350449294, + "grad_norm": 1.413514494895935, + "learning_rate": 2.957466837826273e-05, + "loss": 0.5367, + "step": 8789 + }, + { + "epoch": 11.28369704749679, + "grad_norm": 1.3840296268463135, + "learning_rate": 2.95742404792469e-05, + "loss": 0.5554, + "step": 8790 + }, + { + "epoch": 11.284980744544288, + "grad_norm": 2.1761136054992676, + "learning_rate": 2.9573812580231068e-05, + "loss": 0.5543, + "step": 8791 + }, + { + "epoch": 11.286264441591785, + "grad_norm": 2.549173355102539, + "learning_rate": 2.9573384681215233e-05, + "loss": 0.5342, + "step": 8792 + }, + { + "epoch": 11.28754813863928, + "grad_norm": 2.0557174682617188, + "learning_rate": 2.95729567821994e-05, + "loss": 0.5299, + "step": 8793 + }, + { + "epoch": 11.288831835686779, + "grad_norm": 4.1172566413879395, + "learning_rate": 2.957252888318357e-05, + "loss": 0.5262, + "step": 8794 + }, + { + "epoch": 11.290115532734275, + "grad_norm": 1.9005417823791504, + "learning_rate": 2.9572100984167735e-05, + "loss": 0.5826, + "step": 8795 + }, + { + "epoch": 11.29139922978177, + 
"grad_norm": 1.0134817361831665, + "learning_rate": 2.9571673085151907e-05, + "loss": 0.5273, + "step": 8796 + }, + { + "epoch": 11.292682926829269, + "grad_norm": 2.1841959953308105, + "learning_rate": 2.9571245186136072e-05, + "loss": 0.5094, + "step": 8797 + }, + { + "epoch": 11.293966623876765, + "grad_norm": 1.0608488321304321, + "learning_rate": 2.9570817287120244e-05, + "loss": 0.5311, + "step": 8798 + }, + { + "epoch": 11.295250320924263, + "grad_norm": 1.498622179031372, + "learning_rate": 2.957038938810441e-05, + "loss": 0.5677, + "step": 8799 + }, + { + "epoch": 11.296534017971759, + "grad_norm": 1.5644639730453491, + "learning_rate": 2.9569961489088574e-05, + "loss": 0.5301, + "step": 8800 + }, + { + "epoch": 11.297817715019255, + "grad_norm": 0.9738709330558777, + "learning_rate": 2.9569533590072745e-05, + "loss": 0.5888, + "step": 8801 + }, + { + "epoch": 11.299101412066753, + "grad_norm": 0.9857414960861206, + "learning_rate": 2.956910569105691e-05, + "loss": 0.5627, + "step": 8802 + }, + { + "epoch": 11.300385109114249, + "grad_norm": 0.9848891496658325, + "learning_rate": 2.956867779204108e-05, + "loss": 0.5593, + "step": 8803 + }, + { + "epoch": 11.301668806161747, + "grad_norm": 1.9303890466690063, + "learning_rate": 2.9568249893025247e-05, + "loss": 0.6368, + "step": 8804 + }, + { + "epoch": 11.302952503209243, + "grad_norm": 2.000962734222412, + "learning_rate": 2.9567821994009416e-05, + "loss": 0.5782, + "step": 8805 + }, + { + "epoch": 11.304236200256739, + "grad_norm": 3.1743221282958984, + "learning_rate": 2.9567394094993584e-05, + "loss": 0.5768, + "step": 8806 + }, + { + "epoch": 11.305519897304237, + "grad_norm": 1.2086822986602783, + "learning_rate": 2.956696619597775e-05, + "loss": 0.6045, + "step": 8807 + }, + { + "epoch": 11.306803594351733, + "grad_norm": 1.8945585489273071, + "learning_rate": 2.9566538296961917e-05, + "loss": 0.5957, + "step": 8808 + }, + { + "epoch": 11.30808729139923, + "grad_norm": 1.4476563930511475, + 
"learning_rate": 2.9566110397946086e-05, + "loss": 0.5845, + "step": 8809 + }, + { + "epoch": 11.309370988446727, + "grad_norm": 5.267148494720459, + "learning_rate": 2.9565682498930254e-05, + "loss": 0.6121, + "step": 8810 + }, + { + "epoch": 11.310654685494223, + "grad_norm": 1.409148097038269, + "learning_rate": 2.956525459991442e-05, + "loss": 0.6065, + "step": 8811 + }, + { + "epoch": 11.31193838254172, + "grad_norm": 1.3081821203231812, + "learning_rate": 2.956482670089859e-05, + "loss": 0.6209, + "step": 8812 + }, + { + "epoch": 11.313222079589217, + "grad_norm": 1.6351038217544556, + "learning_rate": 2.9564398801882756e-05, + "loss": 0.5398, + "step": 8813 + }, + { + "epoch": 11.314505776636715, + "grad_norm": 2.197173833847046, + "learning_rate": 2.956397090286692e-05, + "loss": 0.6255, + "step": 8814 + }, + { + "epoch": 11.31578947368421, + "grad_norm": 1.9237661361694336, + "learning_rate": 2.9563543003851093e-05, + "loss": 0.6568, + "step": 8815 + }, + { + "epoch": 11.317073170731707, + "grad_norm": 38.9892463684082, + "learning_rate": 2.9563115104835258e-05, + "loss": 0.591, + "step": 8816 + }, + { + "epoch": 11.318356867779205, + "grad_norm": 3.358149528503418, + "learning_rate": 2.956268720581943e-05, + "loss": 0.6575, + "step": 8817 + }, + { + "epoch": 11.3196405648267, + "grad_norm": 2.8930234909057617, + "learning_rate": 2.9562259306803595e-05, + "loss": 0.6609, + "step": 8818 + }, + { + "epoch": 11.320924261874199, + "grad_norm": 2.80830454826355, + "learning_rate": 2.9561831407787763e-05, + "loss": 0.7913, + "step": 8819 + }, + { + "epoch": 11.322207958921695, + "grad_norm": 1.4707072973251343, + "learning_rate": 2.956140350877193e-05, + "loss": 0.5162, + "step": 8820 + }, + { + "epoch": 11.32349165596919, + "grad_norm": 1.1339478492736816, + "learning_rate": 2.9560975609756097e-05, + "loss": 0.5363, + "step": 8821 + }, + { + "epoch": 11.324775353016689, + "grad_norm": 1.358262538909912, + "learning_rate": 2.9560547710740265e-05, + "loss": 
0.5437, + "step": 8822 + }, + { + "epoch": 11.326059050064185, + "grad_norm": 1.0349417924880981, + "learning_rate": 2.9560119811724433e-05, + "loss": 0.569, + "step": 8823 + }, + { + "epoch": 11.327342747111683, + "grad_norm": 1.244242787361145, + "learning_rate": 2.9559691912708602e-05, + "loss": 0.5267, + "step": 8824 + }, + { + "epoch": 11.328626444159179, + "grad_norm": 1.758273720741272, + "learning_rate": 2.955926401369277e-05, + "loss": 0.5219, + "step": 8825 + }, + { + "epoch": 11.329910141206675, + "grad_norm": 3.0668671131134033, + "learning_rate": 2.955883611467694e-05, + "loss": 0.5257, + "step": 8826 + }, + { + "epoch": 11.331193838254173, + "grad_norm": 1.549461841583252, + "learning_rate": 2.9558408215661104e-05, + "loss": 0.5822, + "step": 8827 + }, + { + "epoch": 11.332477535301669, + "grad_norm": 1.1678286790847778, + "learning_rate": 2.9557980316645272e-05, + "loss": 0.5228, + "step": 8828 + }, + { + "epoch": 11.333761232349165, + "grad_norm": 1.241584300994873, + "learning_rate": 2.955755241762944e-05, + "loss": 0.5564, + "step": 8829 + }, + { + "epoch": 11.335044929396663, + "grad_norm": 2.4867234230041504, + "learning_rate": 2.9557124518613606e-05, + "loss": 0.5589, + "step": 8830 + }, + { + "epoch": 11.336328626444159, + "grad_norm": 1.1115165948867798, + "learning_rate": 2.9556696619597777e-05, + "loss": 0.5664, + "step": 8831 + }, + { + "epoch": 11.337612323491657, + "grad_norm": 2.221142530441284, + "learning_rate": 2.9556268720581942e-05, + "loss": 0.594, + "step": 8832 + }, + { + "epoch": 11.338896020539153, + "grad_norm": 1.8332089185714722, + "learning_rate": 2.955584082156611e-05, + "loss": 0.5658, + "step": 8833 + }, + { + "epoch": 11.340179717586649, + "grad_norm": 0.8832342028617859, + "learning_rate": 2.955541292255028e-05, + "loss": 0.5515, + "step": 8834 + }, + { + "epoch": 11.341463414634147, + "grad_norm": 2.9711766242980957, + "learning_rate": 2.9554985023534444e-05, + "loss": 0.5497, + "step": 8835 + }, + { + "epoch": 
11.342747111681643, + "grad_norm": 1.2797508239746094, + "learning_rate": 2.9554557124518616e-05, + "loss": 0.4857, + "step": 8836 + }, + { + "epoch": 11.34403080872914, + "grad_norm": 1.014487862586975, + "learning_rate": 2.955412922550278e-05, + "loss": 0.5154, + "step": 8837 + }, + { + "epoch": 11.345314505776637, + "grad_norm": 1.6176501512527466, + "learning_rate": 2.955370132648695e-05, + "loss": 0.5316, + "step": 8838 + }, + { + "epoch": 11.346598202824133, + "grad_norm": 2.1416594982147217, + "learning_rate": 2.9553273427471118e-05, + "loss": 0.535, + "step": 8839 + }, + { + "epoch": 11.34788189987163, + "grad_norm": 1.3362455368041992, + "learning_rate": 2.9552845528455283e-05, + "loss": 0.5136, + "step": 8840 + }, + { + "epoch": 11.349165596919127, + "grad_norm": 0.950724720954895, + "learning_rate": 2.9552417629439455e-05, + "loss": 0.5397, + "step": 8841 + }, + { + "epoch": 11.350449293966625, + "grad_norm": 1.4312868118286133, + "learning_rate": 2.955198973042362e-05, + "loss": 0.5785, + "step": 8842 + }, + { + "epoch": 11.35173299101412, + "grad_norm": 2.0637857913970947, + "learning_rate": 2.9551561831407788e-05, + "loss": 0.529, + "step": 8843 + }, + { + "epoch": 11.353016688061617, + "grad_norm": 1.4637023210525513, + "learning_rate": 2.9551133932391957e-05, + "loss": 0.5393, + "step": 8844 + }, + { + "epoch": 11.354300385109115, + "grad_norm": 2.3036916255950928, + "learning_rate": 2.9550706033376125e-05, + "loss": 0.5672, + "step": 8845 + }, + { + "epoch": 11.35558408215661, + "grad_norm": 2.6382224559783936, + "learning_rate": 2.955027813436029e-05, + "loss": 0.5773, + "step": 8846 + }, + { + "epoch": 11.356867779204109, + "grad_norm": 1.7957704067230225, + "learning_rate": 2.954985023534446e-05, + "loss": 0.5692, + "step": 8847 + }, + { + "epoch": 11.358151476251605, + "grad_norm": 1.0274449586868286, + "learning_rate": 2.9549422336328627e-05, + "loss": 0.5791, + "step": 8848 + }, + { + "epoch": 11.3594351732991, + "grad_norm": 
2.0401692390441895, + "learning_rate": 2.9548994437312795e-05, + "loss": 0.5563, + "step": 8849 + }, + { + "epoch": 11.360718870346599, + "grad_norm": 1.634948492050171, + "learning_rate": 2.9548566538296964e-05, + "loss": 0.572, + "step": 8850 + }, + { + "epoch": 11.362002567394095, + "grad_norm": 5.2538228034973145, + "learning_rate": 2.954813863928113e-05, + "loss": 0.5723, + "step": 8851 + }, + { + "epoch": 11.363286264441593, + "grad_norm": 1.301279067993164, + "learning_rate": 2.95477107402653e-05, + "loss": 0.5689, + "step": 8852 + }, + { + "epoch": 11.364569961489089, + "grad_norm": 1.5714325904846191, + "learning_rate": 2.9547282841249466e-05, + "loss": 0.5644, + "step": 8853 + }, + { + "epoch": 11.365853658536585, + "grad_norm": 1.560439109802246, + "learning_rate": 2.954685494223363e-05, + "loss": 0.5605, + "step": 8854 + }, + { + "epoch": 11.367137355584083, + "grad_norm": 3.084505558013916, + "learning_rate": 2.9546427043217802e-05, + "loss": 0.6668, + "step": 8855 + }, + { + "epoch": 11.368421052631579, + "grad_norm": 2.1388626098632812, + "learning_rate": 2.9545999144201967e-05, + "loss": 0.5747, + "step": 8856 + }, + { + "epoch": 11.369704749679077, + "grad_norm": 2.505582332611084, + "learning_rate": 2.954557124518614e-05, + "loss": 0.5811, + "step": 8857 + }, + { + "epoch": 11.370988446726573, + "grad_norm": 1.5312448740005493, + "learning_rate": 2.9545143346170304e-05, + "loss": 0.5771, + "step": 8858 + }, + { + "epoch": 11.372272143774069, + "grad_norm": 1.0579792261123657, + "learning_rate": 2.9544715447154473e-05, + "loss": 0.661, + "step": 8859 + }, + { + "epoch": 11.373555840821567, + "grad_norm": 1.5242817401885986, + "learning_rate": 2.954428754813864e-05, + "loss": 0.6242, + "step": 8860 + }, + { + "epoch": 11.374839537869063, + "grad_norm": 1.1145886182785034, + "learning_rate": 2.9543859649122806e-05, + "loss": 0.6112, + "step": 8861 + }, + { + "epoch": 11.376123234916559, + "grad_norm": 2.4351909160614014, + "learning_rate": 
2.9543431750106974e-05, + "loss": 0.5936, + "step": 8862 + }, + { + "epoch": 11.377406931964057, + "grad_norm": 3.388918161392212, + "learning_rate": 2.9543003851091143e-05, + "loss": 0.6303, + "step": 8863 + }, + { + "epoch": 11.378690629011553, + "grad_norm": 3.800933599472046, + "learning_rate": 2.954257595207531e-05, + "loss": 0.6057, + "step": 8864 + }, + { + "epoch": 11.37997432605905, + "grad_norm": 1.267171859741211, + "learning_rate": 2.954214805305948e-05, + "loss": 0.6482, + "step": 8865 + }, + { + "epoch": 11.381258023106547, + "grad_norm": 2.2159152030944824, + "learning_rate": 2.9541720154043648e-05, + "loss": 0.5944, + "step": 8866 + }, + { + "epoch": 11.382541720154043, + "grad_norm": 6.397982597351074, + "learning_rate": 2.9541292255027813e-05, + "loss": 0.6393, + "step": 8867 + }, + { + "epoch": 11.38382541720154, + "grad_norm": 1.8132580518722534, + "learning_rate": 2.954086435601198e-05, + "loss": 0.6682, + "step": 8868 + }, + { + "epoch": 11.385109114249037, + "grad_norm": 2.546722412109375, + "learning_rate": 2.954043645699615e-05, + "loss": 0.8211, + "step": 8869 + }, + { + "epoch": 11.386392811296535, + "grad_norm": 1.1056078672409058, + "learning_rate": 2.9540008557980315e-05, + "loss": 0.4883, + "step": 8870 + }, + { + "epoch": 11.38767650834403, + "grad_norm": 1.1703153848648071, + "learning_rate": 2.9539580658964487e-05, + "loss": 0.5181, + "step": 8871 + }, + { + "epoch": 11.388960205391527, + "grad_norm": 1.2378993034362793, + "learning_rate": 2.9539152759948652e-05, + "loss": 0.5603, + "step": 8872 + }, + { + "epoch": 11.390243902439025, + "grad_norm": 1.9318218231201172, + "learning_rate": 2.9538724860932824e-05, + "loss": 0.5778, + "step": 8873 + }, + { + "epoch": 11.39152759948652, + "grad_norm": 1.5412349700927734, + "learning_rate": 2.953829696191699e-05, + "loss": 0.5423, + "step": 8874 + }, + { + "epoch": 11.392811296534019, + "grad_norm": 1.1241265535354614, + "learning_rate": 2.9537869062901154e-05, + "loss": 0.517, + "step": 
8875 + }, + { + "epoch": 11.394094993581515, + "grad_norm": 0.8445144295692444, + "learning_rate": 2.9537441163885325e-05, + "loss": 0.5409, + "step": 8876 + }, + { + "epoch": 11.39537869062901, + "grad_norm": 1.3639270067214966, + "learning_rate": 2.953701326486949e-05, + "loss": 0.5944, + "step": 8877 + }, + { + "epoch": 11.396662387676509, + "grad_norm": 0.799203634262085, + "learning_rate": 2.953658536585366e-05, + "loss": 0.5247, + "step": 8878 + }, + { + "epoch": 11.397946084724005, + "grad_norm": 2.606351375579834, + "learning_rate": 2.9536157466837827e-05, + "loss": 0.5878, + "step": 8879 + }, + { + "epoch": 11.399229781771503, + "grad_norm": 2.4943695068359375, + "learning_rate": 2.9535729567821996e-05, + "loss": 0.5519, + "step": 8880 + }, + { + "epoch": 11.400513478818999, + "grad_norm": 2.1879186630249023, + "learning_rate": 2.9535301668806164e-05, + "loss": 0.5687, + "step": 8881 + }, + { + "epoch": 11.401797175866495, + "grad_norm": 1.6792709827423096, + "learning_rate": 2.953487376979033e-05, + "loss": 0.5566, + "step": 8882 + }, + { + "epoch": 11.403080872913993, + "grad_norm": 1.6049474477767944, + "learning_rate": 2.9534445870774498e-05, + "loss": 0.5602, + "step": 8883 + }, + { + "epoch": 11.404364569961489, + "grad_norm": 2.291506052017212, + "learning_rate": 2.9534017971758666e-05, + "loss": 0.5376, + "step": 8884 + }, + { + "epoch": 11.405648267008987, + "grad_norm": 1.2099052667617798, + "learning_rate": 2.9533590072742834e-05, + "loss": 0.5961, + "step": 8885 + }, + { + "epoch": 11.406931964056483, + "grad_norm": 1.6706563234329224, + "learning_rate": 2.9533162173727e-05, + "loss": 0.5071, + "step": 8886 + }, + { + "epoch": 11.408215661103979, + "grad_norm": 1.1260700225830078, + "learning_rate": 2.953273427471117e-05, + "loss": 0.5827, + "step": 8887 + }, + { + "epoch": 11.409499358151477, + "grad_norm": 3.224712610244751, + "learning_rate": 2.9532306375695336e-05, + "loss": 0.5716, + "step": 8888 + }, + { + "epoch": 11.410783055198973, + 
"grad_norm": 1.0920053720474243, + "learning_rate": 2.9531878476679505e-05, + "loss": 0.5671, + "step": 8889 + }, + { + "epoch": 11.41206675224647, + "grad_norm": 1.5441796779632568, + "learning_rate": 2.9531450577663673e-05, + "loss": 0.6038, + "step": 8890 + }, + { + "epoch": 11.413350449293967, + "grad_norm": 1.457746982574463, + "learning_rate": 2.9531022678647838e-05, + "loss": 0.5686, + "step": 8891 + }, + { + "epoch": 11.414634146341463, + "grad_norm": 2.4976794719696045, + "learning_rate": 2.953059477963201e-05, + "loss": 0.5212, + "step": 8892 + }, + { + "epoch": 11.41591784338896, + "grad_norm": 1.6622097492218018, + "learning_rate": 2.9530166880616175e-05, + "loss": 0.5259, + "step": 8893 + }, + { + "epoch": 11.417201540436457, + "grad_norm": 1.0518522262573242, + "learning_rate": 2.952973898160034e-05, + "loss": 0.6089, + "step": 8894 + }, + { + "epoch": 11.418485237483953, + "grad_norm": 2.1660773754119873, + "learning_rate": 2.952931108258451e-05, + "loss": 0.5689, + "step": 8895 + }, + { + "epoch": 11.41976893453145, + "grad_norm": 1.2824324369430542, + "learning_rate": 2.9528883183568677e-05, + "loss": 0.5405, + "step": 8896 + }, + { + "epoch": 11.421052631578947, + "grad_norm": 1.9729608297348022, + "learning_rate": 2.952845528455285e-05, + "loss": 0.5701, + "step": 8897 + }, + { + "epoch": 11.422336328626445, + "grad_norm": 1.6607933044433594, + "learning_rate": 2.9528027385537014e-05, + "loss": 0.5381, + "step": 8898 + }, + { + "epoch": 11.42362002567394, + "grad_norm": 3.1026837825775146, + "learning_rate": 2.9527599486521182e-05, + "loss": 0.6585, + "step": 8899 + }, + { + "epoch": 11.424903722721437, + "grad_norm": 1.0542540550231934, + "learning_rate": 2.952717158750535e-05, + "loss": 0.5385, + "step": 8900 + }, + { + "epoch": 11.426187419768935, + "grad_norm": 3.269068956375122, + "learning_rate": 2.9526743688489515e-05, + "loss": 0.5994, + "step": 8901 + }, + { + "epoch": 11.427471116816431, + "grad_norm": 2.2719669342041016, + 
"learning_rate": 2.9526315789473684e-05, + "loss": 0.5992, + "step": 8902 + }, + { + "epoch": 11.428754813863929, + "grad_norm": 2.344066858291626, + "learning_rate": 2.9525887890457852e-05, + "loss": 0.5691, + "step": 8903 + }, + { + "epoch": 11.430038510911425, + "grad_norm": 1.9803848266601562, + "learning_rate": 2.952545999144202e-05, + "loss": 0.6306, + "step": 8904 + }, + { + "epoch": 11.431322207958921, + "grad_norm": 2.581677198410034, + "learning_rate": 2.952503209242619e-05, + "loss": 0.5764, + "step": 8905 + }, + { + "epoch": 11.432605905006419, + "grad_norm": 2.763429880142212, + "learning_rate": 2.9524604193410357e-05, + "loss": 0.5857, + "step": 8906 + }, + { + "epoch": 11.433889602053915, + "grad_norm": 1.0633559226989746, + "learning_rate": 2.9524176294394522e-05, + "loss": 0.5122, + "step": 8907 + }, + { + "epoch": 11.435173299101413, + "grad_norm": 1.392324447631836, + "learning_rate": 2.952374839537869e-05, + "loss": 0.5374, + "step": 8908 + }, + { + "epoch": 11.436456996148909, + "grad_norm": 1.5613797903060913, + "learning_rate": 2.952332049636286e-05, + "loss": 0.5625, + "step": 8909 + }, + { + "epoch": 11.437740693196405, + "grad_norm": 2.2330269813537598, + "learning_rate": 2.9522892597347024e-05, + "loss": 0.6009, + "step": 8910 + }, + { + "epoch": 11.439024390243903, + "grad_norm": 2.082792043685913, + "learning_rate": 2.9522464698331196e-05, + "loss": 0.6507, + "step": 8911 + }, + { + "epoch": 11.440308087291399, + "grad_norm": 2.538665294647217, + "learning_rate": 2.952203679931536e-05, + "loss": 0.6047, + "step": 8912 + }, + { + "epoch": 11.441591784338897, + "grad_norm": 2.1959164142608643, + "learning_rate": 2.9521608900299533e-05, + "loss": 0.586, + "step": 8913 + }, + { + "epoch": 11.442875481386393, + "grad_norm": 4.998832702636719, + "learning_rate": 2.9521181001283698e-05, + "loss": 0.6005, + "step": 8914 + }, + { + "epoch": 11.444159178433889, + "grad_norm": 2.3835809230804443, + "learning_rate": 2.9520753102267863e-05, + 
"loss": 0.6024, + "step": 8915 + }, + { + "epoch": 11.445442875481387, + "grad_norm": 3.831240177154541, + "learning_rate": 2.9520325203252035e-05, + "loss": 0.6532, + "step": 8916 + }, + { + "epoch": 11.446726572528883, + "grad_norm": 3.8279411792755127, + "learning_rate": 2.95198973042362e-05, + "loss": 0.6717, + "step": 8917 + }, + { + "epoch": 11.44801026957638, + "grad_norm": 1.9131054878234863, + "learning_rate": 2.9519469405220368e-05, + "loss": 0.7647, + "step": 8918 + }, + { + "epoch": 11.449293966623877, + "grad_norm": 2.4581050872802734, + "learning_rate": 2.9519041506204537e-05, + "loss": 0.8696, + "step": 8919 + }, + { + "epoch": 11.450577663671373, + "grad_norm": 1.9109638929367065, + "learning_rate": 2.9518613607188705e-05, + "loss": 0.5374, + "step": 8920 + }, + { + "epoch": 11.45186136071887, + "grad_norm": 1.0838491916656494, + "learning_rate": 2.9518185708172873e-05, + "loss": 0.5532, + "step": 8921 + }, + { + "epoch": 11.453145057766367, + "grad_norm": 1.505144715309143, + "learning_rate": 2.951775780915704e-05, + "loss": 0.5569, + "step": 8922 + }, + { + "epoch": 11.454428754813865, + "grad_norm": 2.1284093856811523, + "learning_rate": 2.9517329910141207e-05, + "loss": 0.5514, + "step": 8923 + }, + { + "epoch": 11.455712451861361, + "grad_norm": 1.797025203704834, + "learning_rate": 2.9516902011125375e-05, + "loss": 0.5068, + "step": 8924 + }, + { + "epoch": 11.456996148908857, + "grad_norm": 1.294865369796753, + "learning_rate": 2.9516474112109544e-05, + "loss": 0.546, + "step": 8925 + }, + { + "epoch": 11.458279845956355, + "grad_norm": 1.430886149406433, + "learning_rate": 2.951604621309371e-05, + "loss": 0.5225, + "step": 8926 + }, + { + "epoch": 11.459563543003851, + "grad_norm": 0.8674275875091553, + "learning_rate": 2.951561831407788e-05, + "loss": 0.5888, + "step": 8927 + }, + { + "epoch": 11.460847240051347, + "grad_norm": 1.8671857118606567, + "learning_rate": 2.9515190415062046e-05, + "loss": 0.5007, + "step": 8928 + }, + { + 
"epoch": 11.462130937098845, + "grad_norm": 1.915724277496338, + "learning_rate": 2.9514762516046214e-05, + "loss": 0.5671, + "step": 8929 + }, + { + "epoch": 11.463414634146341, + "grad_norm": 2.793694496154785, + "learning_rate": 2.9514334617030382e-05, + "loss": 0.5017, + "step": 8930 + }, + { + "epoch": 11.464698331193839, + "grad_norm": 1.3779728412628174, + "learning_rate": 2.9513906718014547e-05, + "loss": 0.5519, + "step": 8931 + }, + { + "epoch": 11.465982028241335, + "grad_norm": 2.601778984069824, + "learning_rate": 2.951347881899872e-05, + "loss": 0.5801, + "step": 8932 + }, + { + "epoch": 11.467265725288831, + "grad_norm": 1.9291130304336548, + "learning_rate": 2.9513050919982884e-05, + "loss": 0.5215, + "step": 8933 + }, + { + "epoch": 11.468549422336329, + "grad_norm": 6.863522052764893, + "learning_rate": 2.9512623020967053e-05, + "loss": 0.5214, + "step": 8934 + }, + { + "epoch": 11.469833119383825, + "grad_norm": 1.598815679550171, + "learning_rate": 2.951219512195122e-05, + "loss": 0.5343, + "step": 8935 + }, + { + "epoch": 11.471116816431323, + "grad_norm": 2.0728161334991455, + "learning_rate": 2.9511767222935386e-05, + "loss": 0.5454, + "step": 8936 + }, + { + "epoch": 11.472400513478819, + "grad_norm": 0.9985893368721008, + "learning_rate": 2.9511339323919558e-05, + "loss": 0.5185, + "step": 8937 + }, + { + "epoch": 11.473684210526315, + "grad_norm": 2.5278263092041016, + "learning_rate": 2.9510911424903723e-05, + "loss": 0.5398, + "step": 8938 + }, + { + "epoch": 11.474967907573813, + "grad_norm": 1.3628993034362793, + "learning_rate": 2.951048352588789e-05, + "loss": 0.5566, + "step": 8939 + }, + { + "epoch": 11.476251604621309, + "grad_norm": 1.5271470546722412, + "learning_rate": 2.951005562687206e-05, + "loss": 0.5455, + "step": 8940 + }, + { + "epoch": 11.477535301668807, + "grad_norm": 1.682679533958435, + "learning_rate": 2.9509627727856228e-05, + "loss": 0.5802, + "step": 8941 + }, + { + "epoch": 11.478818998716303, + "grad_norm": 
5.034493446350098, + "learning_rate": 2.9509199828840393e-05, + "loss": 0.5683, + "step": 8942 + }, + { + "epoch": 11.480102695763799, + "grad_norm": 2.48030948638916, + "learning_rate": 2.950877192982456e-05, + "loss": 0.605, + "step": 8943 + }, + { + "epoch": 11.481386392811297, + "grad_norm": 2.5547144412994385, + "learning_rate": 2.950834403080873e-05, + "loss": 0.5284, + "step": 8944 + }, + { + "epoch": 11.482670089858793, + "grad_norm": 1.8848254680633545, + "learning_rate": 2.95079161317929e-05, + "loss": 0.6094, + "step": 8945 + }, + { + "epoch": 11.48395378690629, + "grad_norm": 1.8101530075073242, + "learning_rate": 2.9507488232777067e-05, + "loss": 0.549, + "step": 8946 + }, + { + "epoch": 11.485237483953787, + "grad_norm": 1.4013851881027222, + "learning_rate": 2.9507060333761232e-05, + "loss": 0.5677, + "step": 8947 + }, + { + "epoch": 11.486521181001283, + "grad_norm": 1.6874173879623413, + "learning_rate": 2.9506632434745404e-05, + "loss": 0.5104, + "step": 8948 + }, + { + "epoch": 11.487804878048781, + "grad_norm": 1.9604153633117676, + "learning_rate": 2.950620453572957e-05, + "loss": 0.5256, + "step": 8949 + }, + { + "epoch": 11.489088575096277, + "grad_norm": 2.0115301609039307, + "learning_rate": 2.9505776636713734e-05, + "loss": 0.5512, + "step": 8950 + }, + { + "epoch": 11.490372272143775, + "grad_norm": 1.3068124055862427, + "learning_rate": 2.9505348737697905e-05, + "loss": 0.5377, + "step": 8951 + }, + { + "epoch": 11.491655969191271, + "grad_norm": 1.5968782901763916, + "learning_rate": 2.950492083868207e-05, + "loss": 0.6148, + "step": 8952 + }, + { + "epoch": 11.492939666238767, + "grad_norm": 2.048978090286255, + "learning_rate": 2.9504492939666242e-05, + "loss": 0.6184, + "step": 8953 + }, + { + "epoch": 11.494223363286265, + "grad_norm": 1.3669768571853638, + "learning_rate": 2.9504065040650407e-05, + "loss": 0.566, + "step": 8954 + }, + { + "epoch": 11.495507060333761, + "grad_norm": 3.7883098125457764, + "learning_rate": 
2.9503637141634572e-05, + "loss": 0.5276, + "step": 8955 + }, + { + "epoch": 11.496790757381259, + "grad_norm": 2.491765022277832, + "learning_rate": 2.9503209242618744e-05, + "loss": 0.5954, + "step": 8956 + }, + { + "epoch": 11.498074454428755, + "grad_norm": 1.4548736810684204, + "learning_rate": 2.950278134360291e-05, + "loss": 0.5781, + "step": 8957 + }, + { + "epoch": 11.499358151476251, + "grad_norm": 1.8743963241577148, + "learning_rate": 2.9502353444587078e-05, + "loss": 0.5943, + "step": 8958 + }, + { + "epoch": 11.500641848523749, + "grad_norm": 1.3724136352539062, + "learning_rate": 2.9501925545571246e-05, + "loss": 0.5916, + "step": 8959 + }, + { + "epoch": 11.501925545571245, + "grad_norm": 2.370741605758667, + "learning_rate": 2.9501497646555414e-05, + "loss": 0.5838, + "step": 8960 + }, + { + "epoch": 11.503209242618741, + "grad_norm": 2.4314582347869873, + "learning_rate": 2.9501069747539583e-05, + "loss": 0.6274, + "step": 8961 + }, + { + "epoch": 11.504492939666239, + "grad_norm": 4.127711772918701, + "learning_rate": 2.9500641848523748e-05, + "loss": 0.607, + "step": 8962 + }, + { + "epoch": 11.505776636713735, + "grad_norm": 2.1331264972686768, + "learning_rate": 2.9500213949507916e-05, + "loss": 0.649, + "step": 8963 + }, + { + "epoch": 11.507060333761233, + "grad_norm": 1.50307297706604, + "learning_rate": 2.9499786050492085e-05, + "loss": 0.6278, + "step": 8964 + }, + { + "epoch": 11.508344030808729, + "grad_norm": 1.7029106616973877, + "learning_rate": 2.9499358151476253e-05, + "loss": 0.6357, + "step": 8965 + }, + { + "epoch": 11.509627727856225, + "grad_norm": 1.7426584959030151, + "learning_rate": 2.9498930252460418e-05, + "loss": 0.6188, + "step": 8966 + }, + { + "epoch": 11.510911424903723, + "grad_norm": 2.7471470832824707, + "learning_rate": 2.949850235344459e-05, + "loss": 0.6488, + "step": 8967 + }, + { + "epoch": 11.512195121951219, + "grad_norm": 1.7666338682174683, + "learning_rate": 2.9498074454428755e-05, + "loss": 0.7151, + 
"step": 8968 + }, + { + "epoch": 11.513478818998717, + "grad_norm": 3.5562474727630615, + "learning_rate": 2.9497646555412923e-05, + "loss": 0.7459, + "step": 8969 + }, + { + "epoch": 11.514762516046213, + "grad_norm": 1.0743770599365234, + "learning_rate": 2.9497218656397092e-05, + "loss": 0.5438, + "step": 8970 + }, + { + "epoch": 11.51604621309371, + "grad_norm": 2.459676742553711, + "learning_rate": 2.9496790757381257e-05, + "loss": 0.5202, + "step": 8971 + }, + { + "epoch": 11.517329910141207, + "grad_norm": 4.171727180480957, + "learning_rate": 2.949636285836543e-05, + "loss": 0.5195, + "step": 8972 + }, + { + "epoch": 11.518613607188703, + "grad_norm": 3.9906985759735107, + "learning_rate": 2.9495934959349594e-05, + "loss": 0.5704, + "step": 8973 + }, + { + "epoch": 11.519897304236201, + "grad_norm": 1.3443161249160767, + "learning_rate": 2.9495507060333762e-05, + "loss": 0.5613, + "step": 8974 + }, + { + "epoch": 11.521181001283697, + "grad_norm": 2.1236839294433594, + "learning_rate": 2.949507916131793e-05, + "loss": 0.5425, + "step": 8975 + }, + { + "epoch": 11.522464698331193, + "grad_norm": 1.8552130460739136, + "learning_rate": 2.9494651262302095e-05, + "loss": 0.5395, + "step": 8976 + }, + { + "epoch": 11.523748395378691, + "grad_norm": 1.9106334447860718, + "learning_rate": 2.9494223363286267e-05, + "loss": 0.5532, + "step": 8977 + }, + { + "epoch": 11.525032092426187, + "grad_norm": 1.6452761888504028, + "learning_rate": 2.9493795464270432e-05, + "loss": 0.5564, + "step": 8978 + }, + { + "epoch": 11.526315789473685, + "grad_norm": 0.8134541511535645, + "learning_rate": 2.94933675652546e-05, + "loss": 0.5049, + "step": 8979 + }, + { + "epoch": 11.527599486521181, + "grad_norm": 2.410970449447632, + "learning_rate": 2.949293966623877e-05, + "loss": 0.5084, + "step": 8980 + }, + { + "epoch": 11.528883183568677, + "grad_norm": 2.520036220550537, + "learning_rate": 2.9492511767222938e-05, + "loss": 0.571, + "step": 8981 + }, + { + "epoch": 
11.530166880616175, + "grad_norm": 1.162657380104065, + "learning_rate": 2.9492083868207103e-05, + "loss": 0.5104, + "step": 8982 + }, + { + "epoch": 11.531450577663671, + "grad_norm": 3.766713857650757, + "learning_rate": 2.949165596919127e-05, + "loss": 0.5575, + "step": 8983 + }, + { + "epoch": 11.532734274711169, + "grad_norm": 1.2751799821853638, + "learning_rate": 2.949122807017544e-05, + "loss": 0.5593, + "step": 8984 + }, + { + "epoch": 11.534017971758665, + "grad_norm": 1.0062472820281982, + "learning_rate": 2.9490800171159608e-05, + "loss": 0.5579, + "step": 8985 + }, + { + "epoch": 11.535301668806161, + "grad_norm": 1.2219572067260742, + "learning_rate": 2.9490372272143776e-05, + "loss": 0.5473, + "step": 8986 + }, + { + "epoch": 11.536585365853659, + "grad_norm": 1.316997766494751, + "learning_rate": 2.948994437312794e-05, + "loss": 0.5142, + "step": 8987 + }, + { + "epoch": 11.537869062901155, + "grad_norm": 1.5540781021118164, + "learning_rate": 2.9489516474112113e-05, + "loss": 0.5208, + "step": 8988 + }, + { + "epoch": 11.539152759948653, + "grad_norm": 2.834766149520874, + "learning_rate": 2.9489088575096278e-05, + "loss": 0.5349, + "step": 8989 + }, + { + "epoch": 11.540436456996149, + "grad_norm": 2.042156457901001, + "learning_rate": 2.9488660676080443e-05, + "loss": 0.5258, + "step": 8990 + }, + { + "epoch": 11.541720154043645, + "grad_norm": 1.6650149822235107, + "learning_rate": 2.9488232777064615e-05, + "loss": 0.5535, + "step": 8991 + }, + { + "epoch": 11.543003851091143, + "grad_norm": 2.3354523181915283, + "learning_rate": 2.948780487804878e-05, + "loss": 0.5312, + "step": 8992 + }, + { + "epoch": 11.544287548138639, + "grad_norm": 1.7145479917526245, + "learning_rate": 2.948737697903295e-05, + "loss": 0.5923, + "step": 8993 + }, + { + "epoch": 11.545571245186135, + "grad_norm": 4.739439010620117, + "learning_rate": 2.9486949080017117e-05, + "loss": 0.5836, + "step": 8994 + }, + { + "epoch": 11.546854942233633, + "grad_norm": 
3.467306137084961, + "learning_rate": 2.9486521181001285e-05, + "loss": 0.5422, + "step": 8995 + }, + { + "epoch": 11.54813863928113, + "grad_norm": 1.805680751800537, + "learning_rate": 2.9486093281985454e-05, + "loss": 0.4815, + "step": 8996 + }, + { + "epoch": 11.549422336328627, + "grad_norm": 2.86256742477417, + "learning_rate": 2.948566538296962e-05, + "loss": 0.55, + "step": 8997 + }, + { + "epoch": 11.550706033376123, + "grad_norm": 2.5573456287384033, + "learning_rate": 2.9485237483953787e-05, + "loss": 0.5814, + "step": 8998 + }, + { + "epoch": 11.55198973042362, + "grad_norm": 1.2024286985397339, + "learning_rate": 2.9484809584937955e-05, + "loss": 0.5439, + "step": 8999 + }, + { + "epoch": 11.553273427471117, + "grad_norm": 1.1799920797348022, + "learning_rate": 2.9484381685922124e-05, + "loss": 0.5432, + "step": 9000 + }, + { + "epoch": 11.553273427471117, + "eval_cer": 0.28908850597219504, + "eval_loss": 0.55454021692276, + "eval_runtime": 13.5791, + "eval_samples_per_second": 72.391, + "eval_steps_per_second": 0.515, + "eval_wer": 0.5284338454482916, + "step": 9000 + }, + { + "epoch": 11.554557124518613, + "grad_norm": 2.9384920597076416, + "learning_rate": 2.9483953786906292e-05, + "loss": 0.5343, + "step": 9001 + }, + { + "epoch": 11.555840821566111, + "grad_norm": 2.2652645111083984, + "learning_rate": 2.948352588789046e-05, + "loss": 0.5809, + "step": 9002 + }, + { + "epoch": 11.557124518613607, + "grad_norm": 1.9545001983642578, + "learning_rate": 2.9483097988874626e-05, + "loss": 0.5803, + "step": 9003 + }, + { + "epoch": 11.558408215661103, + "grad_norm": 2.4073150157928467, + "learning_rate": 2.9482670089858794e-05, + "loss": 0.5562, + "step": 9004 + }, + { + "epoch": 11.559691912708601, + "grad_norm": 1.6845571994781494, + "learning_rate": 2.9482242190842962e-05, + "loss": 0.5901, + "step": 9005 + }, + { + "epoch": 11.560975609756097, + "grad_norm": 2.8955166339874268, + "learning_rate": 2.9481814291827127e-05, + "loss": 0.6689, + "step": 
9006 + }, + { + "epoch": 11.562259306803595, + "grad_norm": 2.599673271179199, + "learning_rate": 2.94813863928113e-05, + "loss": 0.6255, + "step": 9007 + }, + { + "epoch": 11.563543003851091, + "grad_norm": 1.5303092002868652, + "learning_rate": 2.9480958493795464e-05, + "loss": 0.6063, + "step": 9008 + }, + { + "epoch": 11.564826700898587, + "grad_norm": 1.2026550769805908, + "learning_rate": 2.9480530594779636e-05, + "loss": 0.5637, + "step": 9009 + }, + { + "epoch": 11.566110397946085, + "grad_norm": 5.099169731140137, + "learning_rate": 2.94801026957638e-05, + "loss": 0.5503, + "step": 9010 + }, + { + "epoch": 11.567394094993581, + "grad_norm": 1.7042956352233887, + "learning_rate": 2.9479674796747966e-05, + "loss": 0.5801, + "step": 9011 + }, + { + "epoch": 11.568677792041079, + "grad_norm": 2.8014447689056396, + "learning_rate": 2.9479246897732138e-05, + "loss": 0.6414, + "step": 9012 + }, + { + "epoch": 11.569961489088575, + "grad_norm": 1.3267900943756104, + "learning_rate": 2.9478818998716303e-05, + "loss": 0.5637, + "step": 9013 + }, + { + "epoch": 11.571245186136071, + "grad_norm": 1.951920747756958, + "learning_rate": 2.947839109970047e-05, + "loss": 0.5629, + "step": 9014 + }, + { + "epoch": 11.572528883183569, + "grad_norm": 2.2004027366638184, + "learning_rate": 2.947796320068464e-05, + "loss": 0.649, + "step": 9015 + }, + { + "epoch": 11.573812580231065, + "grad_norm": 2.5580198764801025, + "learning_rate": 2.9477535301668805e-05, + "loss": 0.6063, + "step": 9016 + }, + { + "epoch": 11.575096277278563, + "grad_norm": 3.7606139183044434, + "learning_rate": 2.9477107402652973e-05, + "loss": 0.6284, + "step": 9017 + }, + { + "epoch": 11.57637997432606, + "grad_norm": 1.891901969909668, + "learning_rate": 2.947667950363714e-05, + "loss": 0.7781, + "step": 9018 + }, + { + "epoch": 11.577663671373555, + "grad_norm": 4.003821849822998, + "learning_rate": 2.947625160462131e-05, + "loss": 0.7766, + "step": 9019 + }, + { + "epoch": 11.578947368421053, + 
"grad_norm": 1.255317211151123, + "learning_rate": 2.947582370560548e-05, + "loss": 0.5014, + "step": 9020 + }, + { + "epoch": 11.58023106546855, + "grad_norm": 2.0523481369018555, + "learning_rate": 2.9475395806589647e-05, + "loss": 0.5228, + "step": 9021 + }, + { + "epoch": 11.581514762516047, + "grad_norm": 2.8946046829223633, + "learning_rate": 2.9474967907573812e-05, + "loss": 0.5285, + "step": 9022 + }, + { + "epoch": 11.582798459563543, + "grad_norm": 1.143216609954834, + "learning_rate": 2.947454000855798e-05, + "loss": 0.5394, + "step": 9023 + }, + { + "epoch": 11.58408215661104, + "grad_norm": 4.251554489135742, + "learning_rate": 2.947411210954215e-05, + "loss": 0.559, + "step": 9024 + }, + { + "epoch": 11.585365853658537, + "grad_norm": 1.595025897026062, + "learning_rate": 2.9473684210526314e-05, + "loss": 0.5241, + "step": 9025 + }, + { + "epoch": 11.586649550706033, + "grad_norm": 1.4908411502838135, + "learning_rate": 2.9473256311510486e-05, + "loss": 0.4894, + "step": 9026 + }, + { + "epoch": 11.58793324775353, + "grad_norm": 1.770818829536438, + "learning_rate": 2.947282841249465e-05, + "loss": 0.5446, + "step": 9027 + }, + { + "epoch": 11.589216944801027, + "grad_norm": 1.717116117477417, + "learning_rate": 2.9472400513478822e-05, + "loss": 0.5146, + "step": 9028 + }, + { + "epoch": 11.590500641848523, + "grad_norm": 1.270545482635498, + "learning_rate": 2.9471972614462987e-05, + "loss": 0.5349, + "step": 9029 + }, + { + "epoch": 11.591784338896021, + "grad_norm": 2.7202627658843994, + "learning_rate": 2.9471544715447152e-05, + "loss": 0.4955, + "step": 9030 + }, + { + "epoch": 11.593068035943517, + "grad_norm": 2.327608108520508, + "learning_rate": 2.9471116816431324e-05, + "loss": 0.5516, + "step": 9031 + }, + { + "epoch": 11.594351732991013, + "grad_norm": 1.4225962162017822, + "learning_rate": 2.947068891741549e-05, + "loss": 0.5217, + "step": 9032 + }, + { + "epoch": 11.595635430038511, + "grad_norm": 2.8465662002563477, + "learning_rate": 
2.9470261018399658e-05, + "loss": 0.5969, + "step": 9033 + }, + { + "epoch": 11.596919127086007, + "grad_norm": 1.8158599138259888, + "learning_rate": 2.9469833119383826e-05, + "loss": 0.5384, + "step": 9034 + }, + { + "epoch": 11.598202824133505, + "grad_norm": 1.7750636339187622, + "learning_rate": 2.9469405220367994e-05, + "loss": 0.5666, + "step": 9035 + }, + { + "epoch": 11.599486521181001, + "grad_norm": 2.117993116378784, + "learning_rate": 2.9468977321352163e-05, + "loss": 0.5297, + "step": 9036 + }, + { + "epoch": 11.600770218228497, + "grad_norm": 1.7855772972106934, + "learning_rate": 2.9468549422336328e-05, + "loss": 0.569, + "step": 9037 + }, + { + "epoch": 11.602053915275995, + "grad_norm": 1.3213750123977661, + "learning_rate": 2.9468121523320496e-05, + "loss": 0.5582, + "step": 9038 + }, + { + "epoch": 11.603337612323491, + "grad_norm": 2.176754951477051, + "learning_rate": 2.9467693624304665e-05, + "loss": 0.5718, + "step": 9039 + }, + { + "epoch": 11.60462130937099, + "grad_norm": 1.5104799270629883, + "learning_rate": 2.9467265725288833e-05, + "loss": 0.5433, + "step": 9040 + }, + { + "epoch": 11.605905006418485, + "grad_norm": 3.310704231262207, + "learning_rate": 2.9466837826272998e-05, + "loss": 0.5634, + "step": 9041 + }, + { + "epoch": 11.607188703465981, + "grad_norm": 1.2628154754638672, + "learning_rate": 2.946640992725717e-05, + "loss": 0.5686, + "step": 9042 + }, + { + "epoch": 11.60847240051348, + "grad_norm": 1.9854142665863037, + "learning_rate": 2.9465982028241335e-05, + "loss": 0.5472, + "step": 9043 + }, + { + "epoch": 11.609756097560975, + "grad_norm": 1.1918991804122925, + "learning_rate": 2.9465554129225503e-05, + "loss": 0.5415, + "step": 9044 + }, + { + "epoch": 11.611039794608473, + "grad_norm": 1.610421895980835, + "learning_rate": 2.9465126230209672e-05, + "loss": 0.5746, + "step": 9045 + }, + { + "epoch": 11.61232349165597, + "grad_norm": 2.790677309036255, + "learning_rate": 2.9464698331193837e-05, + "loss": 0.5463, + 
"step": 9046 + }, + { + "epoch": 11.613607188703465, + "grad_norm": 1.2230008840560913, + "learning_rate": 2.946427043217801e-05, + "loss": 0.5804, + "step": 9047 + }, + { + "epoch": 11.614890885750963, + "grad_norm": 1.8941123485565186, + "learning_rate": 2.9463842533162174e-05, + "loss": 0.5239, + "step": 9048 + }, + { + "epoch": 11.61617458279846, + "grad_norm": 1.7595937252044678, + "learning_rate": 2.9463414634146342e-05, + "loss": 0.5725, + "step": 9049 + }, + { + "epoch": 11.617458279845957, + "grad_norm": 5.406755447387695, + "learning_rate": 2.946298673513051e-05, + "loss": 0.5811, + "step": 9050 + }, + { + "epoch": 11.618741976893453, + "grad_norm": 1.1761921644210815, + "learning_rate": 2.9462558836114676e-05, + "loss": 0.6044, + "step": 9051 + }, + { + "epoch": 11.62002567394095, + "grad_norm": 1.7346587181091309, + "learning_rate": 2.9462130937098847e-05, + "loss": 0.5231, + "step": 9052 + }, + { + "epoch": 11.621309370988447, + "grad_norm": 1.054989218711853, + "learning_rate": 2.9461703038083012e-05, + "loss": 0.5666, + "step": 9053 + }, + { + "epoch": 11.622593068035943, + "grad_norm": 1.7750720977783203, + "learning_rate": 2.946127513906718e-05, + "loss": 0.5845, + "step": 9054 + }, + { + "epoch": 11.623876765083441, + "grad_norm": 1.2942508459091187, + "learning_rate": 2.946084724005135e-05, + "loss": 0.5936, + "step": 9055 + }, + { + "epoch": 11.625160462130937, + "grad_norm": 5.240456581115723, + "learning_rate": 2.9460419341035518e-05, + "loss": 0.6027, + "step": 9056 + }, + { + "epoch": 11.626444159178433, + "grad_norm": 1.2764829397201538, + "learning_rate": 2.9459991442019683e-05, + "loss": 0.5824, + "step": 9057 + }, + { + "epoch": 11.627727856225931, + "grad_norm": 1.0119541883468628, + "learning_rate": 2.945956354300385e-05, + "loss": 0.5447, + "step": 9058 + }, + { + "epoch": 11.629011553273427, + "grad_norm": 1.7791606187820435, + "learning_rate": 2.945913564398802e-05, + "loss": 0.5695, + "step": 9059 + }, + { + "epoch": 
11.630295250320923, + "grad_norm": 1.3249456882476807, + "learning_rate": 2.9458707744972188e-05, + "loss": 0.6405, + "step": 9060 + }, + { + "epoch": 11.631578947368421, + "grad_norm": 2.1848044395446777, + "learning_rate": 2.9458279845956356e-05, + "loss": 0.5813, + "step": 9061 + }, + { + "epoch": 11.632862644415917, + "grad_norm": 1.77434241771698, + "learning_rate": 2.945785194694052e-05, + "loss": 0.5296, + "step": 9062 + }, + { + "epoch": 11.634146341463415, + "grad_norm": 1.3139392137527466, + "learning_rate": 2.9457424047924693e-05, + "loss": 0.5957, + "step": 9063 + }, + { + "epoch": 11.635430038510911, + "grad_norm": 2.5907230377197266, + "learning_rate": 2.9456996148908858e-05, + "loss": 0.6184, + "step": 9064 + }, + { + "epoch": 11.63671373555841, + "grad_norm": 2.108983039855957, + "learning_rate": 2.9456568249893023e-05, + "loss": 0.6174, + "step": 9065 + }, + { + "epoch": 11.637997432605905, + "grad_norm": 3.916691303253174, + "learning_rate": 2.9456140350877195e-05, + "loss": 0.6151, + "step": 9066 + }, + { + "epoch": 11.639281129653401, + "grad_norm": 2.6407363414764404, + "learning_rate": 2.945571245186136e-05, + "loss": 0.6547, + "step": 9067 + }, + { + "epoch": 11.6405648267009, + "grad_norm": 3.375235080718994, + "learning_rate": 2.9455284552845532e-05, + "loss": 0.741, + "step": 9068 + }, + { + "epoch": 11.641848523748395, + "grad_norm": 2.9788777828216553, + "learning_rate": 2.9454856653829697e-05, + "loss": 0.7063, + "step": 9069 + }, + { + "epoch": 11.643132220795891, + "grad_norm": 1.4477380514144897, + "learning_rate": 2.9454428754813865e-05, + "loss": 0.5364, + "step": 9070 + }, + { + "epoch": 11.64441591784339, + "grad_norm": 2.6734869480133057, + "learning_rate": 2.9454000855798034e-05, + "loss": 0.5098, + "step": 9071 + }, + { + "epoch": 11.645699614890885, + "grad_norm": 1.2220293283462524, + "learning_rate": 2.94535729567822e-05, + "loss": 0.5479, + "step": 9072 + }, + { + "epoch": 11.646983311938383, + "grad_norm": 
0.8508923649787903, + "learning_rate": 2.9453145057766367e-05, + "loss": 0.5574, + "step": 9073 + }, + { + "epoch": 11.64826700898588, + "grad_norm": 0.9953225255012512, + "learning_rate": 2.9452717158750535e-05, + "loss": 0.5346, + "step": 9074 + }, + { + "epoch": 11.649550706033375, + "grad_norm": 1.2868425846099854, + "learning_rate": 2.9452289259734704e-05, + "loss": 0.5224, + "step": 9075 + }, + { + "epoch": 11.650834403080873, + "grad_norm": 2.042193651199341, + "learning_rate": 2.9451861360718872e-05, + "loss": 0.5815, + "step": 9076 + }, + { + "epoch": 11.65211810012837, + "grad_norm": 1.6015982627868652, + "learning_rate": 2.9451433461703037e-05, + "loss": 0.57, + "step": 9077 + }, + { + "epoch": 11.653401797175867, + "grad_norm": 1.789239764213562, + "learning_rate": 2.9451005562687206e-05, + "loss": 0.5518, + "step": 9078 + }, + { + "epoch": 11.654685494223363, + "grad_norm": 1.4356515407562256, + "learning_rate": 2.9450577663671374e-05, + "loss": 0.5527, + "step": 9079 + }, + { + "epoch": 11.65596919127086, + "grad_norm": 1.714721918106079, + "learning_rate": 2.9450149764655543e-05, + "loss": 0.5304, + "step": 9080 + }, + { + "epoch": 11.657252888318357, + "grad_norm": 3.8221402168273926, + "learning_rate": 2.9449721865639708e-05, + "loss": 0.5447, + "step": 9081 + }, + { + "epoch": 11.658536585365853, + "grad_norm": 3.1283082962036133, + "learning_rate": 2.944929396662388e-05, + "loss": 0.5765, + "step": 9082 + }, + { + "epoch": 11.659820282413351, + "grad_norm": 1.7301241159439087, + "learning_rate": 2.9448866067608044e-05, + "loss": 0.5307, + "step": 9083 + }, + { + "epoch": 11.661103979460847, + "grad_norm": 1.3966654539108276, + "learning_rate": 2.9448438168592213e-05, + "loss": 0.584, + "step": 9084 + }, + { + "epoch": 11.662387676508343, + "grad_norm": 1.7287997007369995, + "learning_rate": 2.944801026957638e-05, + "loss": 0.5309, + "step": 9085 + }, + { + "epoch": 11.663671373555841, + "grad_norm": 5.675223350524902, + "learning_rate": 
2.9447582370560546e-05, + "loss": 0.5962, + "step": 9086 + }, + { + "epoch": 11.664955070603337, + "grad_norm": 1.6196434497833252, + "learning_rate": 2.9447154471544718e-05, + "loss": 0.5575, + "step": 9087 + }, + { + "epoch": 11.666238767650835, + "grad_norm": 1.5636247396469116, + "learning_rate": 2.9446726572528883e-05, + "loss": 0.5972, + "step": 9088 + }, + { + "epoch": 11.667522464698331, + "grad_norm": 1.291168212890625, + "learning_rate": 2.944629867351305e-05, + "loss": 0.5079, + "step": 9089 + }, + { + "epoch": 11.668806161745827, + "grad_norm": 1.3954483270645142, + "learning_rate": 2.944587077449722e-05, + "loss": 0.5157, + "step": 9090 + }, + { + "epoch": 11.670089858793325, + "grad_norm": 1.9863373041152954, + "learning_rate": 2.9445442875481385e-05, + "loss": 0.5311, + "step": 9091 + }, + { + "epoch": 11.671373555840821, + "grad_norm": 1.6126837730407715, + "learning_rate": 2.9445014976465557e-05, + "loss": 0.5233, + "step": 9092 + }, + { + "epoch": 11.672657252888317, + "grad_norm": 3.5200469493865967, + "learning_rate": 2.9444587077449722e-05, + "loss": 0.5301, + "step": 9093 + }, + { + "epoch": 11.673940949935815, + "grad_norm": 1.397765874862671, + "learning_rate": 2.944415917843389e-05, + "loss": 0.5575, + "step": 9094 + }, + { + "epoch": 11.675224646983311, + "grad_norm": 2.617403268814087, + "learning_rate": 2.944373127941806e-05, + "loss": 0.507, + "step": 9095 + }, + { + "epoch": 11.67650834403081, + "grad_norm": 1.4548592567443848, + "learning_rate": 2.9443303380402227e-05, + "loss": 0.5522, + "step": 9096 + }, + { + "epoch": 11.677792041078305, + "grad_norm": 2.2817726135253906, + "learning_rate": 2.9442875481386392e-05, + "loss": 0.5556, + "step": 9097 + }, + { + "epoch": 11.679075738125803, + "grad_norm": 1.616161823272705, + "learning_rate": 2.944244758237056e-05, + "loss": 0.5377, + "step": 9098 + }, + { + "epoch": 11.6803594351733, + "grad_norm": 1.2124581336975098, + "learning_rate": 2.944201968335473e-05, + "loss": 0.5227, + 
"step": 9099 + }, + { + "epoch": 11.681643132220795, + "grad_norm": 1.9527168273925781, + "learning_rate": 2.9441591784338897e-05, + "loss": 0.5561, + "step": 9100 + }, + { + "epoch": 11.682926829268293, + "grad_norm": 1.971959114074707, + "learning_rate": 2.9441163885323066e-05, + "loss": 0.5259, + "step": 9101 + }, + { + "epoch": 11.68421052631579, + "grad_norm": 6.375008583068848, + "learning_rate": 2.944073598630723e-05, + "loss": 0.5628, + "step": 9102 + }, + { + "epoch": 11.685494223363285, + "grad_norm": 1.5943214893341064, + "learning_rate": 2.9440308087291402e-05, + "loss": 0.6005, + "step": 9103 + }, + { + "epoch": 11.686777920410783, + "grad_norm": 5.962901592254639, + "learning_rate": 2.9439880188275567e-05, + "loss": 0.5279, + "step": 9104 + }, + { + "epoch": 11.68806161745828, + "grad_norm": 2.3186376094818115, + "learning_rate": 2.9439452289259732e-05, + "loss": 0.6002, + "step": 9105 + }, + { + "epoch": 11.689345314505777, + "grad_norm": 6.5000386238098145, + "learning_rate": 2.9439024390243904e-05, + "loss": 0.5917, + "step": 9106 + }, + { + "epoch": 11.690629011553273, + "grad_norm": 4.171356201171875, + "learning_rate": 2.943859649122807e-05, + "loss": 0.5867, + "step": 9107 + }, + { + "epoch": 11.69191270860077, + "grad_norm": 1.7650525569915771, + "learning_rate": 2.943816859221224e-05, + "loss": 0.573, + "step": 9108 + }, + { + "epoch": 11.693196405648267, + "grad_norm": 6.270993232727051, + "learning_rate": 2.9437740693196406e-05, + "loss": 0.6354, + "step": 9109 + }, + { + "epoch": 11.694480102695763, + "grad_norm": 1.8454792499542236, + "learning_rate": 2.9437312794180575e-05, + "loss": 0.5669, + "step": 9110 + }, + { + "epoch": 11.695763799743261, + "grad_norm": 1.3603148460388184, + "learning_rate": 2.9436884895164743e-05, + "loss": 0.5507, + "step": 9111 + }, + { + "epoch": 11.697047496790757, + "grad_norm": 3.1324944496154785, + "learning_rate": 2.9436456996148908e-05, + "loss": 0.5672, + "step": 9112 + }, + { + "epoch": 
11.698331193838253, + "grad_norm": 1.5216033458709717, + "learning_rate": 2.9436029097133076e-05, + "loss": 0.6173, + "step": 9113 + }, + { + "epoch": 11.699614890885751, + "grad_norm": 26.2543888092041, + "learning_rate": 2.9435601198117245e-05, + "loss": 0.6433, + "step": 9114 + }, + { + "epoch": 11.700898587933247, + "grad_norm": 3.3668501377105713, + "learning_rate": 2.9435173299101413e-05, + "loss": 0.6167, + "step": 9115 + }, + { + "epoch": 11.702182284980745, + "grad_norm": 2.3218064308166504, + "learning_rate": 2.943474540008558e-05, + "loss": 0.6102, + "step": 9116 + }, + { + "epoch": 11.703465982028241, + "grad_norm": 6.247335910797119, + "learning_rate": 2.943431750106975e-05, + "loss": 0.667, + "step": 9117 + }, + { + "epoch": 11.704749679075737, + "grad_norm": 3.354703903198242, + "learning_rate": 2.9433889602053915e-05, + "loss": 0.6719, + "step": 9118 + }, + { + "epoch": 11.706033376123235, + "grad_norm": 7.650423049926758, + "learning_rate": 2.9433461703038083e-05, + "loss": 0.766, + "step": 9119 + }, + { + "epoch": 11.707317073170731, + "grad_norm": 1.504686951637268, + "learning_rate": 2.9433033804022252e-05, + "loss": 0.5353, + "step": 9120 + }, + { + "epoch": 11.70860077021823, + "grad_norm": 1.8832473754882812, + "learning_rate": 2.9432605905006417e-05, + "loss": 0.531, + "step": 9121 + }, + { + "epoch": 11.709884467265725, + "grad_norm": 1.4613003730773926, + "learning_rate": 2.943217800599059e-05, + "loss": 0.579, + "step": 9122 + }, + { + "epoch": 11.711168164313221, + "grad_norm": 3.2470312118530273, + "learning_rate": 2.9431750106974754e-05, + "loss": 0.5211, + "step": 9123 + }, + { + "epoch": 11.71245186136072, + "grad_norm": 1.1635758876800537, + "learning_rate": 2.9431322207958926e-05, + "loss": 0.5508, + "step": 9124 + }, + { + "epoch": 11.713735558408215, + "grad_norm": 0.8712024092674255, + "learning_rate": 2.943089430894309e-05, + "loss": 0.5623, + "step": 9125 + }, + { + "epoch": 11.715019255455712, + "grad_norm": 
1.6797162294387817, + "learning_rate": 2.9430466409927256e-05, + "loss": 0.5656, + "step": 9126 + }, + { + "epoch": 11.71630295250321, + "grad_norm": 1.2000555992126465, + "learning_rate": 2.9430038510911427e-05, + "loss": 0.5297, + "step": 9127 + }, + { + "epoch": 11.717586649550706, + "grad_norm": 1.9060580730438232, + "learning_rate": 2.9429610611895592e-05, + "loss": 0.5105, + "step": 9128 + }, + { + "epoch": 11.718870346598203, + "grad_norm": 2.239426851272583, + "learning_rate": 2.942918271287976e-05, + "loss": 0.5507, + "step": 9129 + }, + { + "epoch": 11.7201540436457, + "grad_norm": 2.6246626377105713, + "learning_rate": 2.942875481386393e-05, + "loss": 0.5603, + "step": 9130 + }, + { + "epoch": 11.721437740693197, + "grad_norm": 1.727858543395996, + "learning_rate": 2.9428326914848098e-05, + "loss": 0.5318, + "step": 9131 + }, + { + "epoch": 11.722721437740693, + "grad_norm": 1.1784768104553223, + "learning_rate": 2.9427899015832266e-05, + "loss": 0.567, + "step": 9132 + }, + { + "epoch": 11.72400513478819, + "grad_norm": 2.043618679046631, + "learning_rate": 2.942747111681643e-05, + "loss": 0.5608, + "step": 9133 + }, + { + "epoch": 11.725288831835687, + "grad_norm": 0.963323175907135, + "learning_rate": 2.94270432178006e-05, + "loss": 0.5421, + "step": 9134 + }, + { + "epoch": 11.726572528883183, + "grad_norm": 2.520902156829834, + "learning_rate": 2.9426615318784768e-05, + "loss": 0.5889, + "step": 9135 + }, + { + "epoch": 11.72785622593068, + "grad_norm": 2.343170404434204, + "learning_rate": 2.9426187419768936e-05, + "loss": 0.4998, + "step": 9136 + }, + { + "epoch": 11.729139922978177, + "grad_norm": 3.0263311862945557, + "learning_rate": 2.94257595207531e-05, + "loss": 0.5365, + "step": 9137 + }, + { + "epoch": 11.730423620025674, + "grad_norm": 1.5715656280517578, + "learning_rate": 2.9425331621737273e-05, + "loss": 0.5514, + "step": 9138 + }, + { + "epoch": 11.731707317073171, + "grad_norm": 1.4313470125198364, + "learning_rate": 
2.9424903722721438e-05, + "loss": 0.5675, + "step": 9139 + }, + { + "epoch": 11.732991014120667, + "grad_norm": 2.0882863998413086, + "learning_rate": 2.9424475823705607e-05, + "loss": 0.5276, + "step": 9140 + }, + { + "epoch": 11.734274711168164, + "grad_norm": 1.259789228439331, + "learning_rate": 2.9424047924689775e-05, + "loss": 0.5141, + "step": 9141 + }, + { + "epoch": 11.735558408215661, + "grad_norm": 1.3134657144546509, + "learning_rate": 2.942362002567394e-05, + "loss": 0.5537, + "step": 9142 + }, + { + "epoch": 11.736842105263158, + "grad_norm": 1.833823323249817, + "learning_rate": 2.9423192126658112e-05, + "loss": 0.5287, + "step": 9143 + }, + { + "epoch": 11.738125802310655, + "grad_norm": 1.7786095142364502, + "learning_rate": 2.9422764227642277e-05, + "loss": 0.5447, + "step": 9144 + }, + { + "epoch": 11.739409499358151, + "grad_norm": 1.7519391775131226, + "learning_rate": 2.9422336328626442e-05, + "loss": 0.6049, + "step": 9145 + }, + { + "epoch": 11.740693196405648, + "grad_norm": 3.494866371154785, + "learning_rate": 2.9421908429610614e-05, + "loss": 0.5494, + "step": 9146 + }, + { + "epoch": 11.741976893453145, + "grad_norm": 11.722095489501953, + "learning_rate": 2.942148053059478e-05, + "loss": 0.6101, + "step": 9147 + }, + { + "epoch": 11.743260590500642, + "grad_norm": 1.8408981561660767, + "learning_rate": 2.942105263157895e-05, + "loss": 0.5755, + "step": 9148 + }, + { + "epoch": 11.74454428754814, + "grad_norm": 3.630185842514038, + "learning_rate": 2.9420624732563115e-05, + "loss": 0.5813, + "step": 9149 + }, + { + "epoch": 11.745827984595635, + "grad_norm": 1.6370831727981567, + "learning_rate": 2.9420196833547284e-05, + "loss": 0.525, + "step": 9150 + }, + { + "epoch": 11.747111681643132, + "grad_norm": 2.2196531295776367, + "learning_rate": 2.9419768934531452e-05, + "loss": 0.6059, + "step": 9151 + }, + { + "epoch": 11.74839537869063, + "grad_norm": 1.4248325824737549, + "learning_rate": 2.9419341035515617e-05, + "loss": 0.5337, + 
"step": 9152 + }, + { + "epoch": 11.749679075738126, + "grad_norm": 5.011711597442627, + "learning_rate": 2.9418913136499786e-05, + "loss": 0.5455, + "step": 9153 + }, + { + "epoch": 11.750962772785623, + "grad_norm": 1.6928813457489014, + "learning_rate": 2.9418485237483954e-05, + "loss": 0.555, + "step": 9154 + }, + { + "epoch": 11.75224646983312, + "grad_norm": 3.1629085540771484, + "learning_rate": 2.9418057338468123e-05, + "loss": 0.5279, + "step": 9155 + }, + { + "epoch": 11.753530166880616, + "grad_norm": 1.5579389333724976, + "learning_rate": 2.941762943945229e-05, + "loss": 0.5947, + "step": 9156 + }, + { + "epoch": 11.754813863928113, + "grad_norm": 3.2393851280212402, + "learning_rate": 2.941720154043646e-05, + "loss": 0.523, + "step": 9157 + }, + { + "epoch": 11.75609756097561, + "grad_norm": 1.46394681930542, + "learning_rate": 2.9416773641420624e-05, + "loss": 0.6338, + "step": 9158 + }, + { + "epoch": 11.757381258023106, + "grad_norm": 4.439261436462402, + "learning_rate": 2.9416345742404793e-05, + "loss": 0.5476, + "step": 9159 + }, + { + "epoch": 11.758664955070603, + "grad_norm": 2.6096746921539307, + "learning_rate": 2.941591784338896e-05, + "loss": 0.6362, + "step": 9160 + }, + { + "epoch": 11.7599486521181, + "grad_norm": 1.0464972257614136, + "learning_rate": 2.9415489944373126e-05, + "loss": 0.6528, + "step": 9161 + }, + { + "epoch": 11.761232349165597, + "grad_norm": 1.386361002922058, + "learning_rate": 2.9415062045357298e-05, + "loss": 0.6072, + "step": 9162 + }, + { + "epoch": 11.762516046213094, + "grad_norm": 1.3172014951705933, + "learning_rate": 2.9414634146341463e-05, + "loss": 0.5682, + "step": 9163 + }, + { + "epoch": 11.763799743260591, + "grad_norm": 6.725571632385254, + "learning_rate": 2.9414206247325635e-05, + "loss": 0.6516, + "step": 9164 + }, + { + "epoch": 11.765083440308088, + "grad_norm": 2.3568263053894043, + "learning_rate": 2.94137783483098e-05, + "loss": 0.657, + "step": 9165 + }, + { + "epoch": 11.766367137355584, + 
"grad_norm": 1.587090015411377, + "learning_rate": 2.9413350449293965e-05, + "loss": 0.6345, + "step": 9166 + }, + { + "epoch": 11.767650834403081, + "grad_norm": 6.443321228027344, + "learning_rate": 2.9412922550278137e-05, + "loss": 0.6898, + "step": 9167 + }, + { + "epoch": 11.768934531450578, + "grad_norm": 1.374494194984436, + "learning_rate": 2.9412494651262302e-05, + "loss": 0.7362, + "step": 9168 + }, + { + "epoch": 11.770218228498074, + "grad_norm": 2.4214813709259033, + "learning_rate": 2.941206675224647e-05, + "loss": 0.7714, + "step": 9169 + }, + { + "epoch": 11.771501925545572, + "grad_norm": 1.3518568277359009, + "learning_rate": 2.941163885323064e-05, + "loss": 0.5608, + "step": 9170 + }, + { + "epoch": 11.772785622593068, + "grad_norm": 1.2563419342041016, + "learning_rate": 2.9411210954214807e-05, + "loss": 0.5331, + "step": 9171 + }, + { + "epoch": 11.774069319640565, + "grad_norm": 1.4395408630371094, + "learning_rate": 2.9410783055198975e-05, + "loss": 0.4958, + "step": 9172 + }, + { + "epoch": 11.775353016688062, + "grad_norm": 1.514187216758728, + "learning_rate": 2.941035515618314e-05, + "loss": 0.5265, + "step": 9173 + }, + { + "epoch": 11.776636713735558, + "grad_norm": 2.0929558277130127, + "learning_rate": 2.940992725716731e-05, + "loss": 0.5287, + "step": 9174 + }, + { + "epoch": 11.777920410783056, + "grad_norm": 4.639008522033691, + "learning_rate": 2.9409499358151477e-05, + "loss": 0.5734, + "step": 9175 + }, + { + "epoch": 11.779204107830552, + "grad_norm": 0.9504207372665405, + "learning_rate": 2.9409071459135646e-05, + "loss": 0.5015, + "step": 9176 + }, + { + "epoch": 11.78048780487805, + "grad_norm": 1.5510058403015137, + "learning_rate": 2.940864356011981e-05, + "loss": 0.5501, + "step": 9177 + }, + { + "epoch": 11.781771501925546, + "grad_norm": 2.613457441329956, + "learning_rate": 2.9408215661103982e-05, + "loss": 0.57, + "step": 9178 + }, + { + "epoch": 11.783055198973042, + "grad_norm": 1.0174453258514404, + 
"learning_rate": 2.9407787762088148e-05, + "loss": 0.5735, + "step": 9179 + }, + { + "epoch": 11.78433889602054, + "grad_norm": 4.500221252441406, + "learning_rate": 2.9407359863072316e-05, + "loss": 0.5492, + "step": 9180 + }, + { + "epoch": 11.785622593068036, + "grad_norm": 1.8536323308944702, + "learning_rate": 2.9406931964056484e-05, + "loss": 0.5598, + "step": 9181 + }, + { + "epoch": 11.786906290115533, + "grad_norm": 1.358009696006775, + "learning_rate": 2.940650406504065e-05, + "loss": 0.5456, + "step": 9182 + }, + { + "epoch": 11.78818998716303, + "grad_norm": 4.634948253631592, + "learning_rate": 2.940607616602482e-05, + "loss": 0.5595, + "step": 9183 + }, + { + "epoch": 11.789473684210526, + "grad_norm": 1.1080917119979858, + "learning_rate": 2.9405648267008986e-05, + "loss": 0.5373, + "step": 9184 + }, + { + "epoch": 11.790757381258024, + "grad_norm": 1.543667197227478, + "learning_rate": 2.9405220367993155e-05, + "loss": 0.5634, + "step": 9185 + }, + { + "epoch": 11.79204107830552, + "grad_norm": 2.7671749591827393, + "learning_rate": 2.9404792468977323e-05, + "loss": 0.5744, + "step": 9186 + }, + { + "epoch": 11.793324775353017, + "grad_norm": 3.486420154571533, + "learning_rate": 2.9404364569961488e-05, + "loss": 0.558, + "step": 9187 + }, + { + "epoch": 11.794608472400514, + "grad_norm": 1.0252004861831665, + "learning_rate": 2.940393667094566e-05, + "loss": 0.536, + "step": 9188 + }, + { + "epoch": 11.79589216944801, + "grad_norm": 1.5703598260879517, + "learning_rate": 2.9403508771929825e-05, + "loss": 0.5555, + "step": 9189 + }, + { + "epoch": 11.797175866495508, + "grad_norm": 2.6450369358062744, + "learning_rate": 2.9403080872913993e-05, + "loss": 0.5644, + "step": 9190 + }, + { + "epoch": 11.798459563543004, + "grad_norm": 1.7107274532318115, + "learning_rate": 2.940265297389816e-05, + "loss": 0.5339, + "step": 9191 + }, + { + "epoch": 11.7997432605905, + "grad_norm": 1.8330552577972412, + "learning_rate": 2.940222507488233e-05, + "loss": 
0.5272, + "step": 9192 + }, + { + "epoch": 11.801026957637998, + "grad_norm": 1.2551462650299072, + "learning_rate": 2.9401797175866495e-05, + "loss": 0.5159, + "step": 9193 + }, + { + "epoch": 11.802310654685494, + "grad_norm": 1.088010311126709, + "learning_rate": 2.9401369276850664e-05, + "loss": 0.5312, + "step": 9194 + }, + { + "epoch": 11.803594351732992, + "grad_norm": 1.066941261291504, + "learning_rate": 2.9400941377834832e-05, + "loss": 0.5317, + "step": 9195 + }, + { + "epoch": 11.804878048780488, + "grad_norm": 1.2012437582015991, + "learning_rate": 2.9400513478819e-05, + "loss": 0.5818, + "step": 9196 + }, + { + "epoch": 11.806161745827985, + "grad_norm": 0.9822961091995239, + "learning_rate": 2.940008557980317e-05, + "loss": 0.5554, + "step": 9197 + }, + { + "epoch": 11.807445442875482, + "grad_norm": 13.93772029876709, + "learning_rate": 2.9399657680787334e-05, + "loss": 0.5622, + "step": 9198 + }, + { + "epoch": 11.808729139922978, + "grad_norm": 1.4151909351348877, + "learning_rate": 2.9399229781771506e-05, + "loss": 0.6073, + "step": 9199 + }, + { + "epoch": 11.810012836970476, + "grad_norm": 1.5470861196517944, + "learning_rate": 2.939880188275567e-05, + "loss": 0.5685, + "step": 9200 + }, + { + "epoch": 11.811296534017972, + "grad_norm": 3.271653413772583, + "learning_rate": 2.9398373983739836e-05, + "loss": 0.5558, + "step": 9201 + }, + { + "epoch": 11.812580231065468, + "grad_norm": 1.3684881925582886, + "learning_rate": 2.9397946084724007e-05, + "loss": 0.5731, + "step": 9202 + }, + { + "epoch": 11.813863928112966, + "grad_norm": 1.1378196477890015, + "learning_rate": 2.9397518185708172e-05, + "loss": 0.5996, + "step": 9203 + }, + { + "epoch": 11.815147625160462, + "grad_norm": 1.0285159349441528, + "learning_rate": 2.9397090286692344e-05, + "loss": 0.563, + "step": 9204 + }, + { + "epoch": 11.81643132220796, + "grad_norm": 2.6470446586608887, + "learning_rate": 2.939666238767651e-05, + "loss": 0.4907, + "step": 9205 + }, + { + "epoch": 
11.817715019255456, + "grad_norm": 1.809354305267334, + "learning_rate": 2.9396234488660674e-05, + "loss": 0.6224, + "step": 9206 + }, + { + "epoch": 11.818998716302952, + "grad_norm": 3.6524498462677, + "learning_rate": 2.9395806589644846e-05, + "loss": 0.5801, + "step": 9207 + }, + { + "epoch": 11.82028241335045, + "grad_norm": 1.3651460409164429, + "learning_rate": 2.939537869062901e-05, + "loss": 0.6444, + "step": 9208 + }, + { + "epoch": 11.821566110397946, + "grad_norm": 1.4438564777374268, + "learning_rate": 2.939495079161318e-05, + "loss": 0.5954, + "step": 9209 + }, + { + "epoch": 11.822849807445444, + "grad_norm": 5.000957489013672, + "learning_rate": 2.9394522892597348e-05, + "loss": 0.5862, + "step": 9210 + }, + { + "epoch": 11.82413350449294, + "grad_norm": 2.163654327392578, + "learning_rate": 2.9394094993581516e-05, + "loss": 0.6416, + "step": 9211 + }, + { + "epoch": 11.825417201540436, + "grad_norm": 1.6608388423919678, + "learning_rate": 2.9393667094565685e-05, + "loss": 0.5821, + "step": 9212 + }, + { + "epoch": 11.826700898587934, + "grad_norm": 1.9809596538543701, + "learning_rate": 2.939323919554985e-05, + "loss": 0.582, + "step": 9213 + }, + { + "epoch": 11.82798459563543, + "grad_norm": 2.7901980876922607, + "learning_rate": 2.9392811296534018e-05, + "loss": 0.6449, + "step": 9214 + }, + { + "epoch": 11.829268292682928, + "grad_norm": 1.788388729095459, + "learning_rate": 2.9392383397518187e-05, + "loss": 0.6159, + "step": 9215 + }, + { + "epoch": 11.830551989730424, + "grad_norm": 3.942462682723999, + "learning_rate": 2.9391955498502355e-05, + "loss": 0.6809, + "step": 9216 + }, + { + "epoch": 11.83183568677792, + "grad_norm": 3.0422921180725098, + "learning_rate": 2.939152759948652e-05, + "loss": 0.691, + "step": 9217 + }, + { + "epoch": 11.833119383825418, + "grad_norm": 1.9438438415527344, + "learning_rate": 2.9391099700470692e-05, + "loss": 0.7037, + "step": 9218 + }, + { + "epoch": 11.834403080872914, + "grad_norm": 55.3436164855957, + 
"learning_rate": 2.9390671801454857e-05, + "loss": 0.7533, + "step": 9219 + }, + { + "epoch": 11.835686777920412, + "grad_norm": 1.3184105157852173, + "learning_rate": 2.9390243902439022e-05, + "loss": 0.5433, + "step": 9220 + }, + { + "epoch": 11.836970474967908, + "grad_norm": 1.3399298191070557, + "learning_rate": 2.9389816003423194e-05, + "loss": 0.5691, + "step": 9221 + }, + { + "epoch": 11.838254172015404, + "grad_norm": 0.8229213953018188, + "learning_rate": 2.938938810440736e-05, + "loss": 0.5455, + "step": 9222 + }, + { + "epoch": 11.839537869062902, + "grad_norm": 3.0855350494384766, + "learning_rate": 2.938896020539153e-05, + "loss": 0.5133, + "step": 9223 + }, + { + "epoch": 11.840821566110398, + "grad_norm": 1.3432278633117676, + "learning_rate": 2.9388532306375696e-05, + "loss": 0.5524, + "step": 9224 + }, + { + "epoch": 11.842105263157894, + "grad_norm": 1.538766622543335, + "learning_rate": 2.9388104407359864e-05, + "loss": 0.529, + "step": 9225 + }, + { + "epoch": 11.843388960205392, + "grad_norm": 0.9615644812583923, + "learning_rate": 2.9387676508344032e-05, + "loss": 0.6023, + "step": 9226 + }, + { + "epoch": 11.844672657252888, + "grad_norm": 1.5863487720489502, + "learning_rate": 2.9387248609328197e-05, + "loss": 0.5392, + "step": 9227 + }, + { + "epoch": 11.845956354300386, + "grad_norm": 1.522337555885315, + "learning_rate": 2.9386820710312366e-05, + "loss": 0.5551, + "step": 9228 + }, + { + "epoch": 11.847240051347882, + "grad_norm": 1.7499868869781494, + "learning_rate": 2.9386392811296534e-05, + "loss": 0.5554, + "step": 9229 + }, + { + "epoch": 11.84852374839538, + "grad_norm": 2.3804550170898438, + "learning_rate": 2.9385964912280703e-05, + "loss": 0.5407, + "step": 9230 + }, + { + "epoch": 11.849807445442876, + "grad_norm": 2.0914971828460693, + "learning_rate": 2.938553701326487e-05, + "loss": 0.6021, + "step": 9231 + }, + { + "epoch": 11.851091142490372, + "grad_norm": 1.3572360277175903, + "learning_rate": 2.938510911424904e-05, + 
"loss": 0.5481, + "step": 9232 + }, + { + "epoch": 11.85237483953787, + "grad_norm": 1.3801642656326294, + "learning_rate": 2.9384681215233204e-05, + "loss": 0.5183, + "step": 9233 + }, + { + "epoch": 11.853658536585366, + "grad_norm": 1.042659878730774, + "learning_rate": 2.9384253316217373e-05, + "loss": 0.5111, + "step": 9234 + }, + { + "epoch": 11.854942233632862, + "grad_norm": 1.366580605506897, + "learning_rate": 2.938382541720154e-05, + "loss": 0.5538, + "step": 9235 + }, + { + "epoch": 11.85622593068036, + "grad_norm": 13.156213760375977, + "learning_rate": 2.9383397518185706e-05, + "loss": 0.5363, + "step": 9236 + }, + { + "epoch": 11.857509627727856, + "grad_norm": 1.4723739624023438, + "learning_rate": 2.9382969619169878e-05, + "loss": 0.5316, + "step": 9237 + }, + { + "epoch": 11.858793324775354, + "grad_norm": 1.7588391304016113, + "learning_rate": 2.9382541720154043e-05, + "loss": 0.5126, + "step": 9238 + }, + { + "epoch": 11.86007702182285, + "grad_norm": 1.4518846273422241, + "learning_rate": 2.9382113821138215e-05, + "loss": 0.5694, + "step": 9239 + }, + { + "epoch": 11.861360718870346, + "grad_norm": 0.9711573719978333, + "learning_rate": 2.938168592212238e-05, + "loss": 0.5612, + "step": 9240 + }, + { + "epoch": 11.862644415917844, + "grad_norm": 1.2249382734298706, + "learning_rate": 2.9381258023106545e-05, + "loss": 0.548, + "step": 9241 + }, + { + "epoch": 11.86392811296534, + "grad_norm": 1.475899577140808, + "learning_rate": 2.9380830124090717e-05, + "loss": 0.549, + "step": 9242 + }, + { + "epoch": 11.865211810012838, + "grad_norm": 1.2300390005111694, + "learning_rate": 2.9380402225074882e-05, + "loss": 0.5676, + "step": 9243 + }, + { + "epoch": 11.866495507060334, + "grad_norm": 1.0211269855499268, + "learning_rate": 2.937997432605905e-05, + "loss": 0.5853, + "step": 9244 + }, + { + "epoch": 11.86777920410783, + "grad_norm": 2.037879228591919, + "learning_rate": 2.937954642704322e-05, + "loss": 0.5275, + "step": 9245 + }, + { + "epoch": 
11.869062901155328, + "grad_norm": 2.5136160850524902, + "learning_rate": 2.9379118528027387e-05, + "loss": 0.5614, + "step": 9246 + }, + { + "epoch": 11.870346598202824, + "grad_norm": 2.5896894931793213, + "learning_rate": 2.9378690629011555e-05, + "loss": 0.5879, + "step": 9247 + }, + { + "epoch": 11.871630295250322, + "grad_norm": 2.058833360671997, + "learning_rate": 2.937826272999572e-05, + "loss": 0.5568, + "step": 9248 + }, + { + "epoch": 11.872913992297818, + "grad_norm": 1.2072852849960327, + "learning_rate": 2.937783483097989e-05, + "loss": 0.5644, + "step": 9249 + }, + { + "epoch": 11.874197689345314, + "grad_norm": 2.4042937755584717, + "learning_rate": 2.9377406931964057e-05, + "loss": 0.5683, + "step": 9250 + }, + { + "epoch": 11.875481386392812, + "grad_norm": 3.8280935287475586, + "learning_rate": 2.9376979032948226e-05, + "loss": 0.5776, + "step": 9251 + }, + { + "epoch": 11.876765083440308, + "grad_norm": 1.3522440195083618, + "learning_rate": 2.937655113393239e-05, + "loss": 0.5529, + "step": 9252 + }, + { + "epoch": 11.878048780487806, + "grad_norm": 1.9430160522460938, + "learning_rate": 2.9376123234916563e-05, + "loss": 0.601, + "step": 9253 + }, + { + "epoch": 11.879332477535302, + "grad_norm": 1.4636995792388916, + "learning_rate": 2.9375695335900728e-05, + "loss": 0.5518, + "step": 9254 + }, + { + "epoch": 11.880616174582798, + "grad_norm": 2.075993776321411, + "learning_rate": 2.9375267436884896e-05, + "loss": 0.5601, + "step": 9255 + }, + { + "epoch": 11.881899871630296, + "grad_norm": 1.5091317892074585, + "learning_rate": 2.9374839537869064e-05, + "loss": 0.5655, + "step": 9256 + }, + { + "epoch": 11.883183568677792, + "grad_norm": 1.3000391721725464, + "learning_rate": 2.937441163885323e-05, + "loss": 0.5574, + "step": 9257 + }, + { + "epoch": 11.88446726572529, + "grad_norm": 1.410723090171814, + "learning_rate": 2.93739837398374e-05, + "loss": 0.5826, + "step": 9258 + }, + { + "epoch": 11.885750962772786, + "grad_norm": 
1.640979290008545, + "learning_rate": 2.9373555840821566e-05, + "loss": 0.5711, + "step": 9259 + }, + { + "epoch": 11.887034659820282, + "grad_norm": 2.1978354454040527, + "learning_rate": 2.9373127941805735e-05, + "loss": 0.547, + "step": 9260 + }, + { + "epoch": 11.88831835686778, + "grad_norm": 3.9449939727783203, + "learning_rate": 2.9372700042789903e-05, + "loss": 0.559, + "step": 9261 + }, + { + "epoch": 11.889602053915276, + "grad_norm": 5.413078784942627, + "learning_rate": 2.9372272143774068e-05, + "loss": 0.5642, + "step": 9262 + }, + { + "epoch": 11.890885750962774, + "grad_norm": 1.4464454650878906, + "learning_rate": 2.937184424475824e-05, + "loss": 0.6119, + "step": 9263 + }, + { + "epoch": 11.89216944801027, + "grad_norm": 1.978127121925354, + "learning_rate": 2.9371416345742405e-05, + "loss": 0.6628, + "step": 9264 + }, + { + "epoch": 11.893453145057766, + "grad_norm": 2.9461729526519775, + "learning_rate": 2.9370988446726573e-05, + "loss": 0.6629, + "step": 9265 + }, + { + "epoch": 11.894736842105264, + "grad_norm": 2.010638475418091, + "learning_rate": 2.9370560547710742e-05, + "loss": 0.64, + "step": 9266 + }, + { + "epoch": 11.89602053915276, + "grad_norm": 5.218266487121582, + "learning_rate": 2.9370132648694907e-05, + "loss": 0.7082, + "step": 9267 + }, + { + "epoch": 11.897304236200256, + "grad_norm": 7.041507244110107, + "learning_rate": 2.9369704749679075e-05, + "loss": 0.6955, + "step": 9268 + }, + { + "epoch": 11.898587933247754, + "grad_norm": 24.50853157043457, + "learning_rate": 2.9369276850663244e-05, + "loss": 0.8559, + "step": 9269 + }, + { + "epoch": 11.89987163029525, + "grad_norm": 1.7230656147003174, + "learning_rate": 2.9368848951647412e-05, + "loss": 0.5394, + "step": 9270 + }, + { + "epoch": 11.901155327342748, + "grad_norm": 2.2964890003204346, + "learning_rate": 2.936842105263158e-05, + "loss": 0.538, + "step": 9271 + }, + { + "epoch": 11.902439024390244, + "grad_norm": 1.2182776927947998, + "learning_rate": 
2.936799315361575e-05, + "loss": 0.566, + "step": 9272 + }, + { + "epoch": 11.90372272143774, + "grad_norm": 1.390173077583313, + "learning_rate": 2.9367565254599914e-05, + "loss": 0.5467, + "step": 9273 + }, + { + "epoch": 11.905006418485238, + "grad_norm": 0.7446790933609009, + "learning_rate": 2.9367137355584082e-05, + "loss": 0.5524, + "step": 9274 + }, + { + "epoch": 11.906290115532734, + "grad_norm": 1.1333669424057007, + "learning_rate": 2.936670945656825e-05, + "loss": 0.5789, + "step": 9275 + }, + { + "epoch": 11.907573812580232, + "grad_norm": 1.2666441202163696, + "learning_rate": 2.9366281557552416e-05, + "loss": 0.516, + "step": 9276 + }, + { + "epoch": 11.908857509627728, + "grad_norm": 1.2285382747650146, + "learning_rate": 2.9365853658536587e-05, + "loss": 0.5502, + "step": 9277 + }, + { + "epoch": 11.910141206675224, + "grad_norm": 1.0616059303283691, + "learning_rate": 2.9365425759520753e-05, + "loss": 0.5463, + "step": 9278 + }, + { + "epoch": 11.911424903722722, + "grad_norm": 2.0897600650787354, + "learning_rate": 2.9364997860504924e-05, + "loss": 0.5651, + "step": 9279 + }, + { + "epoch": 11.912708600770218, + "grad_norm": 1.412358283996582, + "learning_rate": 2.936456996148909e-05, + "loss": 0.4955, + "step": 9280 + }, + { + "epoch": 11.913992297817716, + "grad_norm": 9.013694763183594, + "learning_rate": 2.9364142062473254e-05, + "loss": 0.5191, + "step": 9281 + }, + { + "epoch": 11.915275994865212, + "grad_norm": 0.9061340093612671, + "learning_rate": 2.9363714163457426e-05, + "loss": 0.529, + "step": 9282 + }, + { + "epoch": 11.916559691912708, + "grad_norm": 1.8011517524719238, + "learning_rate": 2.936328626444159e-05, + "loss": 0.5796, + "step": 9283 + }, + { + "epoch": 11.917843388960206, + "grad_norm": 1.6120492219924927, + "learning_rate": 2.936285836542576e-05, + "loss": 0.5293, + "step": 9284 + }, + { + "epoch": 11.919127086007702, + "grad_norm": 1.6782219409942627, + "learning_rate": 2.9362430466409928e-05, + "loss": 0.5202, + 
"step": 9285 + }, + { + "epoch": 11.9204107830552, + "grad_norm": 0.9813486337661743, + "learning_rate": 2.9362002567394096e-05, + "loss": 0.5623, + "step": 9286 + }, + { + "epoch": 11.921694480102696, + "grad_norm": 1.3866724967956543, + "learning_rate": 2.9361574668378265e-05, + "loss": 0.5845, + "step": 9287 + }, + { + "epoch": 11.922978177150192, + "grad_norm": 1.0503867864608765, + "learning_rate": 2.936114676936243e-05, + "loss": 0.5164, + "step": 9288 + }, + { + "epoch": 11.92426187419769, + "grad_norm": 1.2786413431167603, + "learning_rate": 2.9360718870346598e-05, + "loss": 0.4966, + "step": 9289 + }, + { + "epoch": 11.925545571245186, + "grad_norm": 1.182281255722046, + "learning_rate": 2.9360290971330767e-05, + "loss": 0.5071, + "step": 9290 + }, + { + "epoch": 11.926829268292684, + "grad_norm": 2.2763075828552246, + "learning_rate": 2.9359863072314935e-05, + "loss": 0.5585, + "step": 9291 + }, + { + "epoch": 11.92811296534018, + "grad_norm": 1.343409538269043, + "learning_rate": 2.93594351732991e-05, + "loss": 0.5773, + "step": 9292 + }, + { + "epoch": 11.929396662387676, + "grad_norm": 2.2930703163146973, + "learning_rate": 2.9359007274283272e-05, + "loss": 0.5068, + "step": 9293 + }, + { + "epoch": 11.930680359435174, + "grad_norm": 1.6905137300491333, + "learning_rate": 2.9358579375267437e-05, + "loss": 0.5871, + "step": 9294 + }, + { + "epoch": 11.93196405648267, + "grad_norm": 1.7131247520446777, + "learning_rate": 2.9358151476251605e-05, + "loss": 0.572, + "step": 9295 + }, + { + "epoch": 11.933247753530168, + "grad_norm": 1.3799786567687988, + "learning_rate": 2.9357723577235774e-05, + "loss": 0.5278, + "step": 9296 + }, + { + "epoch": 11.934531450577664, + "grad_norm": 1.7990624904632568, + "learning_rate": 2.935729567821994e-05, + "loss": 0.513, + "step": 9297 + }, + { + "epoch": 11.93581514762516, + "grad_norm": 3.064232110977173, + "learning_rate": 2.935686777920411e-05, + "loss": 0.5233, + "step": 9298 + }, + { + "epoch": 11.937098844672658, 
+ "grad_norm": 1.389638900756836, + "learning_rate": 2.9356439880188276e-05, + "loss": 0.5979, + "step": 9299 + }, + { + "epoch": 11.938382541720154, + "grad_norm": 1.2847895622253418, + "learning_rate": 2.9356011981172444e-05, + "loss": 0.552, + "step": 9300 + }, + { + "epoch": 11.93966623876765, + "grad_norm": 2.0199427604675293, + "learning_rate": 2.9355584082156612e-05, + "loss": 0.5346, + "step": 9301 + }, + { + "epoch": 11.940949935815148, + "grad_norm": 1.7154481410980225, + "learning_rate": 2.9355156183140777e-05, + "loss": 0.596, + "step": 9302 + }, + { + "epoch": 11.942233632862644, + "grad_norm": 1.7946630716323853, + "learning_rate": 2.935472828412495e-05, + "loss": 0.5635, + "step": 9303 + }, + { + "epoch": 11.943517329910142, + "grad_norm": 2.2276129722595215, + "learning_rate": 2.9354300385109114e-05, + "loss": 0.5881, + "step": 9304 + }, + { + "epoch": 11.944801026957638, + "grad_norm": 2.1478006839752197, + "learning_rate": 2.9353872486093283e-05, + "loss": 0.5558, + "step": 9305 + }, + { + "epoch": 11.946084724005134, + "grad_norm": 1.6068178415298462, + "learning_rate": 2.935344458707745e-05, + "loss": 0.5312, + "step": 9306 + }, + { + "epoch": 11.947368421052632, + "grad_norm": 1.4955294132232666, + "learning_rate": 2.935301668806162e-05, + "loss": 0.5792, + "step": 9307 + }, + { + "epoch": 11.948652118100128, + "grad_norm": 1.2284553050994873, + "learning_rate": 2.9352588789045785e-05, + "loss": 0.5714, + "step": 9308 + }, + { + "epoch": 11.949935815147626, + "grad_norm": 1.5355005264282227, + "learning_rate": 2.9352160890029953e-05, + "loss": 0.5472, + "step": 9309 + }, + { + "epoch": 11.951219512195122, + "grad_norm": 1.45120370388031, + "learning_rate": 2.935173299101412e-05, + "loss": 0.6485, + "step": 9310 + }, + { + "epoch": 11.952503209242618, + "grad_norm": 1.6403065919876099, + "learning_rate": 2.935130509199829e-05, + "loss": 0.5494, + "step": 9311 + }, + { + "epoch": 11.953786906290116, + "grad_norm": 3.2783455848693848, + 
"learning_rate": 2.9350877192982458e-05, + "loss": 0.6107, + "step": 9312 + }, + { + "epoch": 11.955070603337612, + "grad_norm": 1.312322974205017, + "learning_rate": 2.9350449293966623e-05, + "loss": 0.5838, + "step": 9313 + }, + { + "epoch": 11.95635430038511, + "grad_norm": 1.9406031370162964, + "learning_rate": 2.9350021394950795e-05, + "loss": 0.6306, + "step": 9314 + }, + { + "epoch": 11.957637997432606, + "grad_norm": 3.468190908432007, + "learning_rate": 2.934959349593496e-05, + "loss": 0.6918, + "step": 9315 + }, + { + "epoch": 11.958921694480102, + "grad_norm": 1.470894694328308, + "learning_rate": 2.9349165596919125e-05, + "loss": 0.6795, + "step": 9316 + }, + { + "epoch": 11.9602053915276, + "grad_norm": 2.1931324005126953, + "learning_rate": 2.9348737697903297e-05, + "loss": 0.6575, + "step": 9317 + }, + { + "epoch": 11.961489088575096, + "grad_norm": 3.3051934242248535, + "learning_rate": 2.9348309798887462e-05, + "loss": 0.7215, + "step": 9318 + }, + { + "epoch": 11.962772785622594, + "grad_norm": 2.6057939529418945, + "learning_rate": 2.9347881899871634e-05, + "loss": 0.8026, + "step": 9319 + }, + { + "epoch": 11.96405648267009, + "grad_norm": 1.3880802392959595, + "learning_rate": 2.93474540008558e-05, + "loss": 0.5285, + "step": 9320 + }, + { + "epoch": 11.965340179717586, + "grad_norm": 7.528379440307617, + "learning_rate": 2.9347026101839967e-05, + "loss": 0.5311, + "step": 9321 + }, + { + "epoch": 11.966623876765084, + "grad_norm": 2.967813491821289, + "learning_rate": 2.9346598202824136e-05, + "loss": 0.561, + "step": 9322 + }, + { + "epoch": 11.96790757381258, + "grad_norm": 1.6721614599227905, + "learning_rate": 2.93461703038083e-05, + "loss": 0.5281, + "step": 9323 + }, + { + "epoch": 11.969191270860078, + "grad_norm": 1.228755235671997, + "learning_rate": 2.934574240479247e-05, + "loss": 0.5436, + "step": 9324 + }, + { + "epoch": 11.970474967907574, + "grad_norm": 0.987882673740387, + "learning_rate": 2.9345314505776637e-05, + "loss": 
0.5416, + "step": 9325 + }, + { + "epoch": 11.97175866495507, + "grad_norm": 1.5185227394104004, + "learning_rate": 2.9344886606760806e-05, + "loss": 0.5048, + "step": 9326 + }, + { + "epoch": 11.973042362002568, + "grad_norm": 1.7164161205291748, + "learning_rate": 2.9344458707744974e-05, + "loss": 0.4965, + "step": 9327 + }, + { + "epoch": 11.974326059050064, + "grad_norm": 1.8575217723846436, + "learning_rate": 2.934403080872914e-05, + "loss": 0.572, + "step": 9328 + }, + { + "epoch": 11.975609756097562, + "grad_norm": 21.66748809814453, + "learning_rate": 2.9343602909713308e-05, + "loss": 0.5676, + "step": 9329 + }, + { + "epoch": 11.976893453145058, + "grad_norm": 2.6204535961151123, + "learning_rate": 2.9343175010697476e-05, + "loss": 0.5184, + "step": 9330 + }, + { + "epoch": 11.978177150192554, + "grad_norm": 1.8491063117980957, + "learning_rate": 2.9342747111681644e-05, + "loss": 0.5584, + "step": 9331 + }, + { + "epoch": 11.979460847240052, + "grad_norm": 1.391151785850525, + "learning_rate": 2.934231921266581e-05, + "loss": 0.5904, + "step": 9332 + }, + { + "epoch": 11.980744544287548, + "grad_norm": 2.6840708255767822, + "learning_rate": 2.934189131364998e-05, + "loss": 0.5832, + "step": 9333 + }, + { + "epoch": 11.982028241335044, + "grad_norm": 2.8405709266662598, + "learning_rate": 2.9341463414634146e-05, + "loss": 0.5901, + "step": 9334 + }, + { + "epoch": 11.983311938382542, + "grad_norm": 5.136682033538818, + "learning_rate": 2.9341035515618315e-05, + "loss": 0.5314, + "step": 9335 + }, + { + "epoch": 11.984595635430038, + "grad_norm": 4.778032302856445, + "learning_rate": 2.9340607616602483e-05, + "loss": 0.5315, + "step": 9336 + }, + { + "epoch": 11.985879332477536, + "grad_norm": 2.837087869644165, + "learning_rate": 2.9340179717586648e-05, + "loss": 0.5554, + "step": 9337 + }, + { + "epoch": 11.987163029525032, + "grad_norm": 2.0428857803344727, + "learning_rate": 2.933975181857082e-05, + "loss": 0.5932, + "step": 9338 + }, + { + "epoch": 
11.988446726572528, + "grad_norm": 1.7290235757827759, + "learning_rate": 2.9339323919554985e-05, + "loss": 0.5692, + "step": 9339 + }, + { + "epoch": 11.989730423620026, + "grad_norm": 1.7815347909927368, + "learning_rate": 2.9338896020539153e-05, + "loss": 0.5813, + "step": 9340 + }, + { + "epoch": 11.991014120667522, + "grad_norm": 1.7570686340332031, + "learning_rate": 2.9338468121523322e-05, + "loss": 0.6055, + "step": 9341 + }, + { + "epoch": 11.99229781771502, + "grad_norm": 3.9154505729675293, + "learning_rate": 2.9338040222507487e-05, + "loss": 0.5485, + "step": 9342 + }, + { + "epoch": 11.993581514762516, + "grad_norm": 3.3257784843444824, + "learning_rate": 2.933761232349166e-05, + "loss": 0.6178, + "step": 9343 + }, + { + "epoch": 11.994865211810012, + "grad_norm": 2.100433111190796, + "learning_rate": 2.9337184424475824e-05, + "loss": 0.6345, + "step": 9344 + }, + { + "epoch": 11.99614890885751, + "grad_norm": 7.9456868171691895, + "learning_rate": 2.9336756525459992e-05, + "loss": 0.6045, + "step": 9345 + }, + { + "epoch": 11.997432605905006, + "grad_norm": 1.5846465826034546, + "learning_rate": 2.933632862644416e-05, + "loss": 0.6269, + "step": 9346 + }, + { + "epoch": 11.998716302952504, + "grad_norm": 2.565366268157959, + "learning_rate": 2.933590072742833e-05, + "loss": 0.5945, + "step": 9347 + }, + { + "epoch": 12.0, + "grad_norm": 1.9751460552215576, + "learning_rate": 2.9335472828412494e-05, + "loss": 0.8338, + "step": 9348 + }, + { + "epoch": 12.001283697047496, + "grad_norm": 1.8139598369598389, + "learning_rate": 2.9335044929396662e-05, + "loss": 0.4842, + "step": 9349 + }, + { + "epoch": 12.002567394094994, + "grad_norm": 2.0838849544525146, + "learning_rate": 2.933461703038083e-05, + "loss": 0.5299, + "step": 9350 + }, + { + "epoch": 12.00385109114249, + "grad_norm": 2.4320685863494873, + "learning_rate": 2.9334189131365e-05, + "loss": 0.5109, + "step": 9351 + }, + { + "epoch": 12.005134788189988, + "grad_norm": 1.205540657043457, + 
"learning_rate": 2.9333761232349168e-05, + "loss": 0.5342, + "step": 9352 + }, + { + "epoch": 12.006418485237484, + "grad_norm": 7.3423285484313965, + "learning_rate": 2.9333333333333333e-05, + "loss": 0.524, + "step": 9353 + }, + { + "epoch": 12.00770218228498, + "grad_norm": 2.508540153503418, + "learning_rate": 2.9332905434317504e-05, + "loss": 0.5663, + "step": 9354 + }, + { + "epoch": 12.008985879332478, + "grad_norm": 6.189655780792236, + "learning_rate": 2.933247753530167e-05, + "loss": 0.4876, + "step": 9355 + }, + { + "epoch": 12.010269576379974, + "grad_norm": 1.9719873666763306, + "learning_rate": 2.9332049636285834e-05, + "loss": 0.5233, + "step": 9356 + }, + { + "epoch": 12.011553273427472, + "grad_norm": 9.981634140014648, + "learning_rate": 2.9331621737270006e-05, + "loss": 0.5243, + "step": 9357 + }, + { + "epoch": 12.012836970474968, + "grad_norm": 1.74155592918396, + "learning_rate": 2.933119383825417e-05, + "loss": 0.535, + "step": 9358 + }, + { + "epoch": 12.014120667522464, + "grad_norm": 1.042065143585205, + "learning_rate": 2.9330765939238343e-05, + "loss": 0.5051, + "step": 9359 + }, + { + "epoch": 12.015404364569962, + "grad_norm": 1.250949501991272, + "learning_rate": 2.9330338040222508e-05, + "loss": 0.5109, + "step": 9360 + }, + { + "epoch": 12.016688061617458, + "grad_norm": 0.8532597422599792, + "learning_rate": 2.9329910141206676e-05, + "loss": 0.5088, + "step": 9361 + }, + { + "epoch": 12.017971758664956, + "grad_norm": 2.439992666244507, + "learning_rate": 2.9329482242190845e-05, + "loss": 0.5051, + "step": 9362 + }, + { + "epoch": 12.019255455712452, + "grad_norm": 1.7378605604171753, + "learning_rate": 2.932905434317501e-05, + "loss": 0.5554, + "step": 9363 + }, + { + "epoch": 12.020539152759948, + "grad_norm": 1.4312626123428345, + "learning_rate": 2.932862644415918e-05, + "loss": 0.5149, + "step": 9364 + }, + { + "epoch": 12.021822849807446, + "grad_norm": 1.172221302986145, + "learning_rate": 2.9328198545143347e-05, + "loss": 
0.4996, + "step": 9365 + }, + { + "epoch": 12.023106546854942, + "grad_norm": 1.3087478876113892, + "learning_rate": 2.9327770646127515e-05, + "loss": 0.5224, + "step": 9366 + }, + { + "epoch": 12.024390243902438, + "grad_norm": 0.9989596605300903, + "learning_rate": 2.9327342747111684e-05, + "loss": 0.5088, + "step": 9367 + }, + { + "epoch": 12.025673940949936, + "grad_norm": 2.0743536949157715, + "learning_rate": 2.9326914848095852e-05, + "loss": 0.5564, + "step": 9368 + }, + { + "epoch": 12.026957637997432, + "grad_norm": 1.2798254489898682, + "learning_rate": 2.9326486949080017e-05, + "loss": 0.5253, + "step": 9369 + }, + { + "epoch": 12.02824133504493, + "grad_norm": 2.1157968044281006, + "learning_rate": 2.9326059050064185e-05, + "loss": 0.5729, + "step": 9370 + }, + { + "epoch": 12.029525032092426, + "grad_norm": 3.4828438758850098, + "learning_rate": 2.9325631151048354e-05, + "loss": 0.5387, + "step": 9371 + }, + { + "epoch": 12.030808729139922, + "grad_norm": 1.520715594291687, + "learning_rate": 2.932520325203252e-05, + "loss": 0.5766, + "step": 9372 + }, + { + "epoch": 12.03209242618742, + "grad_norm": 1.6311308145523071, + "learning_rate": 2.932477535301669e-05, + "loss": 0.5421, + "step": 9373 + }, + { + "epoch": 12.033376123234916, + "grad_norm": 2.023991584777832, + "learning_rate": 2.9324347454000856e-05, + "loss": 0.5244, + "step": 9374 + }, + { + "epoch": 12.034659820282414, + "grad_norm": 9.224623680114746, + "learning_rate": 2.9323919554985027e-05, + "loss": 0.547, + "step": 9375 + }, + { + "epoch": 12.03594351732991, + "grad_norm": 1.8217697143554688, + "learning_rate": 2.9323491655969192e-05, + "loss": 0.5551, + "step": 9376 + }, + { + "epoch": 12.037227214377406, + "grad_norm": 1.2820061445236206, + "learning_rate": 2.9323063756953358e-05, + "loss": 0.5419, + "step": 9377 + }, + { + "epoch": 12.038510911424904, + "grad_norm": 2.7593741416931152, + "learning_rate": 2.932263585793753e-05, + "loss": 0.5454, + "step": 9378 + }, + { + "epoch": 
12.0397946084724, + "grad_norm": 2.4570178985595703, + "learning_rate": 2.9322207958921694e-05, + "loss": 0.5685, + "step": 9379 + }, + { + "epoch": 12.041078305519898, + "grad_norm": 1.4533336162567139, + "learning_rate": 2.9321780059905863e-05, + "loss": 0.5376, + "step": 9380 + }, + { + "epoch": 12.042362002567394, + "grad_norm": 2.1978960037231445, + "learning_rate": 2.932135216089003e-05, + "loss": 0.552, + "step": 9381 + }, + { + "epoch": 12.04364569961489, + "grad_norm": 1.836832880973816, + "learning_rate": 2.93209242618742e-05, + "loss": 0.5208, + "step": 9382 + }, + { + "epoch": 12.044929396662388, + "grad_norm": 1.9989891052246094, + "learning_rate": 2.9320496362858368e-05, + "loss": 0.5674, + "step": 9383 + }, + { + "epoch": 12.046213093709884, + "grad_norm": 1.816207766532898, + "learning_rate": 2.9320068463842533e-05, + "loss": 0.5178, + "step": 9384 + }, + { + "epoch": 12.047496790757382, + "grad_norm": 1.9800620079040527, + "learning_rate": 2.93196405648267e-05, + "loss": 0.5553, + "step": 9385 + }, + { + "epoch": 12.048780487804878, + "grad_norm": 1.0779820680618286, + "learning_rate": 2.931921266581087e-05, + "loss": 0.5963, + "step": 9386 + }, + { + "epoch": 12.050064184852374, + "grad_norm": 5.320992469787598, + "learning_rate": 2.9318784766795038e-05, + "loss": 0.5916, + "step": 9387 + }, + { + "epoch": 12.051347881899872, + "grad_norm": 1.57945716381073, + "learning_rate": 2.9318356867779203e-05, + "loss": 0.512, + "step": 9388 + }, + { + "epoch": 12.052631578947368, + "grad_norm": 1.6562355756759644, + "learning_rate": 2.931792896876337e-05, + "loss": 0.5685, + "step": 9389 + }, + { + "epoch": 12.053915275994866, + "grad_norm": 1.96412193775177, + "learning_rate": 2.931750106974754e-05, + "loss": 0.6691, + "step": 9390 + }, + { + "epoch": 12.055198973042362, + "grad_norm": 1.4867385625839233, + "learning_rate": 2.931707317073171e-05, + "loss": 0.6595, + "step": 9391 + }, + { + "epoch": 12.056482670089858, + "grad_norm": 2.164433240890503, + 
"learning_rate": 2.9316645271715877e-05, + "loss": 0.5344, + "step": 9392 + }, + { + "epoch": 12.057766367137356, + "grad_norm": 1.665484070777893, + "learning_rate": 2.9316217372700042e-05, + "loss": 0.6116, + "step": 9393 + }, + { + "epoch": 12.059050064184852, + "grad_norm": 2.319591522216797, + "learning_rate": 2.9315789473684214e-05, + "loss": 0.6301, + "step": 9394 + }, + { + "epoch": 12.06033376123235, + "grad_norm": 5.436972141265869, + "learning_rate": 2.931536157466838e-05, + "loss": 0.6601, + "step": 9395 + }, + { + "epoch": 12.061617458279846, + "grad_norm": 2.2461116313934326, + "learning_rate": 2.9314933675652544e-05, + "loss": 0.7228, + "step": 9396 + }, + { + "epoch": 12.062901155327342, + "grad_norm": 1.3686989545822144, + "learning_rate": 2.9314505776636716e-05, + "loss": 0.6685, + "step": 9397 + }, + { + "epoch": 12.06418485237484, + "grad_norm": 12.459151268005371, + "learning_rate": 2.931407787762088e-05, + "loss": 0.7908, + "step": 9398 + }, + { + "epoch": 12.065468549422336, + "grad_norm": 2.4134719371795654, + "learning_rate": 2.9313649978605052e-05, + "loss": 0.5482, + "step": 9399 + }, + { + "epoch": 12.066752246469832, + "grad_norm": 1.6156600713729858, + "learning_rate": 2.9313222079589217e-05, + "loss": 0.532, + "step": 9400 + }, + { + "epoch": 12.06803594351733, + "grad_norm": 1.7348519563674927, + "learning_rate": 2.9312794180573386e-05, + "loss": 0.5172, + "step": 9401 + }, + { + "epoch": 12.069319640564826, + "grad_norm": 1.4553166627883911, + "learning_rate": 2.9312366281557554e-05, + "loss": 0.5606, + "step": 9402 + }, + { + "epoch": 12.070603337612324, + "grad_norm": 1.1531620025634766, + "learning_rate": 2.931193838254172e-05, + "loss": 0.5541, + "step": 9403 + }, + { + "epoch": 12.07188703465982, + "grad_norm": 1.3391282558441162, + "learning_rate": 2.9311510483525888e-05, + "loss": 0.5008, + "step": 9404 + }, + { + "epoch": 12.073170731707316, + "grad_norm": 1.7616208791732788, + "learning_rate": 2.9311082584510056e-05, + 
"loss": 0.5223, + "step": 9405 + }, + { + "epoch": 12.074454428754814, + "grad_norm": 3.8030757904052734, + "learning_rate": 2.9310654685494225e-05, + "loss": 0.5325, + "step": 9406 + }, + { + "epoch": 12.07573812580231, + "grad_norm": 1.445125699043274, + "learning_rate": 2.9310226786478393e-05, + "loss": 0.5867, + "step": 9407 + }, + { + "epoch": 12.077021822849808, + "grad_norm": 1.261530876159668, + "learning_rate": 2.930979888746256e-05, + "loss": 0.5222, + "step": 9408 + }, + { + "epoch": 12.078305519897304, + "grad_norm": 1.2375723123550415, + "learning_rate": 2.9309370988446726e-05, + "loss": 0.5268, + "step": 9409 + }, + { + "epoch": 12.0795892169448, + "grad_norm": 3.41178560256958, + "learning_rate": 2.9308943089430895e-05, + "loss": 0.5359, + "step": 9410 + }, + { + "epoch": 12.080872913992298, + "grad_norm": 1.733343243598938, + "learning_rate": 2.9308515190415063e-05, + "loss": 0.5421, + "step": 9411 + }, + { + "epoch": 12.082156611039794, + "grad_norm": 2.710247755050659, + "learning_rate": 2.9308087291399228e-05, + "loss": 0.509, + "step": 9412 + }, + { + "epoch": 12.083440308087292, + "grad_norm": 2.029418468475342, + "learning_rate": 2.93076593923834e-05, + "loss": 0.5263, + "step": 9413 + }, + { + "epoch": 12.084724005134788, + "grad_norm": 2.8986620903015137, + "learning_rate": 2.9307231493367565e-05, + "loss": 0.5348, + "step": 9414 + }, + { + "epoch": 12.086007702182284, + "grad_norm": 1.850986123085022, + "learning_rate": 2.9306803594351737e-05, + "loss": 0.5432, + "step": 9415 + }, + { + "epoch": 12.087291399229782, + "grad_norm": 1.3873111009597778, + "learning_rate": 2.9306375695335902e-05, + "loss": 0.4843, + "step": 9416 + }, + { + "epoch": 12.088575096277278, + "grad_norm": 1.2417747974395752, + "learning_rate": 2.9305947796320067e-05, + "loss": 0.5402, + "step": 9417 + }, + { + "epoch": 12.089858793324776, + "grad_norm": 3.9618654251098633, + "learning_rate": 2.930551989730424e-05, + "loss": 0.5009, + "step": 9418 + }, + { + "epoch": 
12.091142490372272, + "grad_norm": 1.2375205755233765, + "learning_rate": 2.9305091998288404e-05, + "loss": 0.5363, + "step": 9419 + }, + { + "epoch": 12.092426187419768, + "grad_norm": 1.164096474647522, + "learning_rate": 2.9304664099272572e-05, + "loss": 0.5594, + "step": 9420 + }, + { + "epoch": 12.093709884467266, + "grad_norm": 4.832309246063232, + "learning_rate": 2.930423620025674e-05, + "loss": 0.5167, + "step": 9421 + }, + { + "epoch": 12.094993581514762, + "grad_norm": 1.850526213645935, + "learning_rate": 2.930380830124091e-05, + "loss": 0.5471, + "step": 9422 + }, + { + "epoch": 12.09627727856226, + "grad_norm": 1.4457684755325317, + "learning_rate": 2.9303380402225074e-05, + "loss": 0.5447, + "step": 9423 + }, + { + "epoch": 12.097560975609756, + "grad_norm": 2.767956018447876, + "learning_rate": 2.9302952503209242e-05, + "loss": 0.5138, + "step": 9424 + }, + { + "epoch": 12.098844672657252, + "grad_norm": 2.598691701889038, + "learning_rate": 2.930252460419341e-05, + "loss": 0.5057, + "step": 9425 + }, + { + "epoch": 12.10012836970475, + "grad_norm": 1.3260676860809326, + "learning_rate": 2.930209670517758e-05, + "loss": 0.5541, + "step": 9426 + }, + { + "epoch": 12.101412066752246, + "grad_norm": 1.4959602355957031, + "learning_rate": 2.9301668806161748e-05, + "loss": 0.5404, + "step": 9427 + }, + { + "epoch": 12.102695763799744, + "grad_norm": 2.7183403968811035, + "learning_rate": 2.9301240907145913e-05, + "loss": 0.565, + "step": 9428 + }, + { + "epoch": 12.10397946084724, + "grad_norm": 2.519015073776245, + "learning_rate": 2.9300813008130084e-05, + "loss": 0.634, + "step": 9429 + }, + { + "epoch": 12.105263157894736, + "grad_norm": 1.6348329782485962, + "learning_rate": 2.930038510911425e-05, + "loss": 0.525, + "step": 9430 + }, + { + "epoch": 12.106546854942234, + "grad_norm": 1.1135892868041992, + "learning_rate": 2.9299957210098414e-05, + "loss": 0.6209, + "step": 9431 + }, + { + "epoch": 12.10783055198973, + "grad_norm": 1.0971479415893555, 
+ "learning_rate": 2.9299529311082586e-05, + "loss": 0.5392, + "step": 9432 + }, + { + "epoch": 12.109114249037226, + "grad_norm": 2.402510166168213, + "learning_rate": 2.929910141206675e-05, + "loss": 0.5551, + "step": 9433 + }, + { + "epoch": 12.110397946084724, + "grad_norm": 3.8173182010650635, + "learning_rate": 2.9298673513050923e-05, + "loss": 0.5575, + "step": 9434 + }, + { + "epoch": 12.11168164313222, + "grad_norm": 3.5824215412139893, + "learning_rate": 2.9298245614035088e-05, + "loss": 0.607, + "step": 9435 + }, + { + "epoch": 12.112965340179718, + "grad_norm": 3.7203872203826904, + "learning_rate": 2.9297817715019257e-05, + "loss": 0.6327, + "step": 9436 + }, + { + "epoch": 12.114249037227214, + "grad_norm": 2.252850294113159, + "learning_rate": 2.9297389816003425e-05, + "loss": 0.565, + "step": 9437 + }, + { + "epoch": 12.11553273427471, + "grad_norm": 2.1191399097442627, + "learning_rate": 2.929696191698759e-05, + "loss": 0.5452, + "step": 9438 + }, + { + "epoch": 12.116816431322208, + "grad_norm": 1.7075999975204468, + "learning_rate": 2.929653401797176e-05, + "loss": 0.6067, + "step": 9439 + }, + { + "epoch": 12.118100128369704, + "grad_norm": 1.7579362392425537, + "learning_rate": 2.9296106118955927e-05, + "loss": 0.5958, + "step": 9440 + }, + { + "epoch": 12.119383825417202, + "grad_norm": 1.9639995098114014, + "learning_rate": 2.9295678219940095e-05, + "loss": 0.5504, + "step": 9441 + }, + { + "epoch": 12.120667522464698, + "grad_norm": 2.701066732406616, + "learning_rate": 2.9295250320924264e-05, + "loss": 0.56, + "step": 9442 + }, + { + "epoch": 12.121951219512194, + "grad_norm": 2.3895301818847656, + "learning_rate": 2.9294822421908432e-05, + "loss": 0.569, + "step": 9443 + }, + { + "epoch": 12.123234916559692, + "grad_norm": 1.8798620700836182, + "learning_rate": 2.9294394522892597e-05, + "loss": 0.6616, + "step": 9444 + }, + { + "epoch": 12.124518613607188, + "grad_norm": 3.9942121505737305, + "learning_rate": 2.9293966623876765e-05, + 
"loss": 0.632, + "step": 9445 + }, + { + "epoch": 12.125802310654686, + "grad_norm": 4.337707042694092, + "learning_rate": 2.9293538724860934e-05, + "loss": 0.6379, + "step": 9446 + }, + { + "epoch": 12.127086007702182, + "grad_norm": 16.085872650146484, + "learning_rate": 2.92931108258451e-05, + "loss": 0.6506, + "step": 9447 + }, + { + "epoch": 12.128369704749678, + "grad_norm": 3.9121899604797363, + "learning_rate": 2.929268292682927e-05, + "loss": 0.8593, + "step": 9448 + }, + { + "epoch": 12.129653401797176, + "grad_norm": 1.3580013513565063, + "learning_rate": 2.9292255027813436e-05, + "loss": 0.51, + "step": 9449 + }, + { + "epoch": 12.130937098844672, + "grad_norm": 1.807686448097229, + "learning_rate": 2.9291827128797604e-05, + "loss": 0.4989, + "step": 9450 + }, + { + "epoch": 12.13222079589217, + "grad_norm": 1.4922432899475098, + "learning_rate": 2.9291399229781773e-05, + "loss": 0.5735, + "step": 9451 + }, + { + "epoch": 12.133504492939666, + "grad_norm": 1.9478477239608765, + "learning_rate": 2.9290971330765938e-05, + "loss": 0.5605, + "step": 9452 + }, + { + "epoch": 12.134788189987162, + "grad_norm": 1.9406828880310059, + "learning_rate": 2.929054343175011e-05, + "loss": 0.5229, + "step": 9453 + }, + { + "epoch": 12.13607188703466, + "grad_norm": 2.1669511795043945, + "learning_rate": 2.9290115532734274e-05, + "loss": 0.484, + "step": 9454 + }, + { + "epoch": 12.137355584082156, + "grad_norm": 1.6614655256271362, + "learning_rate": 2.9289687633718443e-05, + "loss": 0.5763, + "step": 9455 + }, + { + "epoch": 12.138639281129654, + "grad_norm": 1.8339529037475586, + "learning_rate": 2.928925973470261e-05, + "loss": 0.4947, + "step": 9456 + }, + { + "epoch": 12.13992297817715, + "grad_norm": 6.125359535217285, + "learning_rate": 2.9288831835686776e-05, + "loss": 0.5394, + "step": 9457 + }, + { + "epoch": 12.141206675224646, + "grad_norm": 1.989394187927246, + "learning_rate": 2.9288403936670948e-05, + "loss": 0.504, + "step": 9458 + }, + { + "epoch": 
12.142490372272144, + "grad_norm": 1.495862603187561, + "learning_rate": 2.9287976037655113e-05, + "loss": 0.5251, + "step": 9459 + }, + { + "epoch": 12.14377406931964, + "grad_norm": 1.0649714469909668, + "learning_rate": 2.928754813863928e-05, + "loss": 0.5404, + "step": 9460 + }, + { + "epoch": 12.145057766367138, + "grad_norm": 1.5024304389953613, + "learning_rate": 2.928712023962345e-05, + "loss": 0.5334, + "step": 9461 + }, + { + "epoch": 12.146341463414634, + "grad_norm": 1.9360307455062866, + "learning_rate": 2.9286692340607618e-05, + "loss": 0.5378, + "step": 9462 + }, + { + "epoch": 12.14762516046213, + "grad_norm": 1.6357789039611816, + "learning_rate": 2.9286264441591783e-05, + "loss": 0.5365, + "step": 9463 + }, + { + "epoch": 12.148908857509628, + "grad_norm": 5.018376350402832, + "learning_rate": 2.9285836542575952e-05, + "loss": 0.5402, + "step": 9464 + }, + { + "epoch": 12.150192554557124, + "grad_norm": 1.3661154508590698, + "learning_rate": 2.928540864356012e-05, + "loss": 0.5613, + "step": 9465 + }, + { + "epoch": 12.15147625160462, + "grad_norm": 2.155402183532715, + "learning_rate": 2.928498074454429e-05, + "loss": 0.5335, + "step": 9466 + }, + { + "epoch": 12.152759948652118, + "grad_norm": 2.068105936050415, + "learning_rate": 2.9284552845528457e-05, + "loss": 0.5355, + "step": 9467 + }, + { + "epoch": 12.154043645699614, + "grad_norm": 1.5198451280593872, + "learning_rate": 2.9284124946512622e-05, + "loss": 0.5556, + "step": 9468 + }, + { + "epoch": 12.155327342747112, + "grad_norm": 2.6000263690948486, + "learning_rate": 2.9283697047496794e-05, + "loss": 0.5438, + "step": 9469 + }, + { + "epoch": 12.156611039794608, + "grad_norm": 2.976895809173584, + "learning_rate": 2.928326914848096e-05, + "loss": 0.5529, + "step": 9470 + }, + { + "epoch": 12.157894736842104, + "grad_norm": 2.481208086013794, + "learning_rate": 2.9282841249465124e-05, + "loss": 0.5372, + "step": 9471 + }, + { + "epoch": 12.159178433889602, + "grad_norm": 
2.9185631275177, + "learning_rate": 2.9282413350449296e-05, + "loss": 0.5049, + "step": 9472 + }, + { + "epoch": 12.160462130937098, + "grad_norm": 1.4060291051864624, + "learning_rate": 2.928198545143346e-05, + "loss": 0.508, + "step": 9473 + }, + { + "epoch": 12.161745827984596, + "grad_norm": 3.233177423477173, + "learning_rate": 2.9281557552417632e-05, + "loss": 0.5306, + "step": 9474 + }, + { + "epoch": 12.163029525032092, + "grad_norm": 1.7026618719100952, + "learning_rate": 2.9281129653401797e-05, + "loss": 0.5066, + "step": 9475 + }, + { + "epoch": 12.164313222079588, + "grad_norm": 1.7545520067214966, + "learning_rate": 2.9280701754385966e-05, + "loss": 0.5908, + "step": 9476 + }, + { + "epoch": 12.165596919127086, + "grad_norm": 3.320605516433716, + "learning_rate": 2.9280273855370134e-05, + "loss": 0.539, + "step": 9477 + }, + { + "epoch": 12.166880616174582, + "grad_norm": 1.0005027055740356, + "learning_rate": 2.92798459563543e-05, + "loss": 0.5773, + "step": 9478 + }, + { + "epoch": 12.16816431322208, + "grad_norm": 2.4123876094818115, + "learning_rate": 2.9279418057338468e-05, + "loss": 0.5481, + "step": 9479 + }, + { + "epoch": 12.169448010269576, + "grad_norm": 1.5253181457519531, + "learning_rate": 2.9278990158322636e-05, + "loss": 0.5578, + "step": 9480 + }, + { + "epoch": 12.170731707317072, + "grad_norm": 1.5574756860733032, + "learning_rate": 2.9278562259306805e-05, + "loss": 0.5673, + "step": 9481 + }, + { + "epoch": 12.17201540436457, + "grad_norm": 1.5803042650222778, + "learning_rate": 2.9278134360290973e-05, + "loss": 0.5378, + "step": 9482 + }, + { + "epoch": 12.173299101412066, + "grad_norm": 1.8547810316085815, + "learning_rate": 2.927770646127514e-05, + "loss": 0.5511, + "step": 9483 + }, + { + "epoch": 12.174582798459564, + "grad_norm": 1.2166377305984497, + "learning_rate": 2.9277278562259306e-05, + "loss": 0.5467, + "step": 9484 + }, + { + "epoch": 12.17586649550706, + "grad_norm": 1.6480991840362549, + "learning_rate": 
2.9276850663243475e-05, + "loss": 0.5703, + "step": 9485 + }, + { + "epoch": 12.177150192554556, + "grad_norm": 1.710998296737671, + "learning_rate": 2.9276422764227643e-05, + "loss": 0.5622, + "step": 9486 + }, + { + "epoch": 12.178433889602054, + "grad_norm": 2.700348138809204, + "learning_rate": 2.9275994865211808e-05, + "loss": 0.5317, + "step": 9487 + }, + { + "epoch": 12.17971758664955, + "grad_norm": 5.302177906036377, + "learning_rate": 2.927556696619598e-05, + "loss": 0.514, + "step": 9488 + }, + { + "epoch": 12.181001283697048, + "grad_norm": 2.802624464035034, + "learning_rate": 2.9275139067180145e-05, + "loss": 0.5949, + "step": 9489 + }, + { + "epoch": 12.182284980744544, + "grad_norm": 6.023795127868652, + "learning_rate": 2.9274711168164317e-05, + "loss": 0.6672, + "step": 9490 + }, + { + "epoch": 12.18356867779204, + "grad_norm": 1.636231541633606, + "learning_rate": 2.9274283269148482e-05, + "loss": 0.5046, + "step": 9491 + }, + { + "epoch": 12.184852374839538, + "grad_norm": 1.9245628118515015, + "learning_rate": 2.9273855370132647e-05, + "loss": 0.5769, + "step": 9492 + }, + { + "epoch": 12.186136071887034, + "grad_norm": 2.4642951488494873, + "learning_rate": 2.927342747111682e-05, + "loss": 0.5656, + "step": 9493 + }, + { + "epoch": 12.187419768934532, + "grad_norm": 2.552708864212036, + "learning_rate": 2.9272999572100984e-05, + "loss": 0.6725, + "step": 9494 + }, + { + "epoch": 12.188703465982028, + "grad_norm": 2.6187870502471924, + "learning_rate": 2.9272571673085152e-05, + "loss": 0.6255, + "step": 9495 + }, + { + "epoch": 12.189987163029524, + "grad_norm": 2.207157850265503, + "learning_rate": 2.927214377406932e-05, + "loss": 0.6984, + "step": 9496 + }, + { + "epoch": 12.191270860077022, + "grad_norm": 4.554255485534668, + "learning_rate": 2.927171587505349e-05, + "loss": 0.6724, + "step": 9497 + }, + { + "epoch": 12.192554557124518, + "grad_norm": 5.049839496612549, + "learning_rate": 2.9271287976037657e-05, + "loss": 0.7904, + "step": 
9498 + }, + { + "epoch": 12.193838254172016, + "grad_norm": 3.310798168182373, + "learning_rate": 2.9270860077021822e-05, + "loss": 0.5091, + "step": 9499 + }, + { + "epoch": 12.195121951219512, + "grad_norm": 1.1688997745513916, + "learning_rate": 2.927043217800599e-05, + "loss": 0.5165, + "step": 9500 + }, + { + "epoch": 12.196405648267008, + "grad_norm": 1.286916971206665, + "learning_rate": 2.927000427899016e-05, + "loss": 0.534, + "step": 9501 + }, + { + "epoch": 12.197689345314506, + "grad_norm": 1.0855116844177246, + "learning_rate": 2.9269576379974328e-05, + "loss": 0.5082, + "step": 9502 + }, + { + "epoch": 12.198973042362002, + "grad_norm": 1.3335224390029907, + "learning_rate": 2.9269148480958493e-05, + "loss": 0.5311, + "step": 9503 + }, + { + "epoch": 12.200256739409499, + "grad_norm": 1.4381831884384155, + "learning_rate": 2.9268720581942664e-05, + "loss": 0.5186, + "step": 9504 + }, + { + "epoch": 12.201540436456996, + "grad_norm": 3.6500918865203857, + "learning_rate": 2.926829268292683e-05, + "loss": 0.529, + "step": 9505 + }, + { + "epoch": 12.202824133504492, + "grad_norm": 2.0446019172668457, + "learning_rate": 2.9267864783910998e-05, + "loss": 0.5244, + "step": 9506 + }, + { + "epoch": 12.20410783055199, + "grad_norm": 2.3136277198791504, + "learning_rate": 2.9267436884895166e-05, + "loss": 0.5393, + "step": 9507 + }, + { + "epoch": 12.205391527599486, + "grad_norm": 1.2343461513519287, + "learning_rate": 2.926700898587933e-05, + "loss": 0.5484, + "step": 9508 + }, + { + "epoch": 12.206675224646983, + "grad_norm": 0.9095605611801147, + "learning_rate": 2.9266581086863503e-05, + "loss": 0.5307, + "step": 9509 + }, + { + "epoch": 12.20795892169448, + "grad_norm": 2.1086578369140625, + "learning_rate": 2.9266153187847668e-05, + "loss": 0.5072, + "step": 9510 + }, + { + "epoch": 12.209242618741976, + "grad_norm": 1.2807731628417969, + "learning_rate": 2.9265725288831837e-05, + "loss": 0.5285, + "step": 9511 + }, + { + "epoch": 12.210526315789474, + 
"grad_norm": 1.9413138628005981, + "learning_rate": 2.9265297389816005e-05, + "loss": 0.5069, + "step": 9512 + }, + { + "epoch": 12.21181001283697, + "grad_norm": 1.7107731103897095, + "learning_rate": 2.926486949080017e-05, + "loss": 0.5387, + "step": 9513 + }, + { + "epoch": 12.213093709884467, + "grad_norm": 2.0898020267486572, + "learning_rate": 2.9264441591784342e-05, + "loss": 0.5101, + "step": 9514 + }, + { + "epoch": 12.214377406931964, + "grad_norm": 1.7933112382888794, + "learning_rate": 2.9264013692768507e-05, + "loss": 0.5226, + "step": 9515 + }, + { + "epoch": 12.21566110397946, + "grad_norm": 2.1087896823883057, + "learning_rate": 2.9263585793752675e-05, + "loss": 0.5309, + "step": 9516 + }, + { + "epoch": 12.216944801026958, + "grad_norm": 1.3791203498840332, + "learning_rate": 2.9263157894736844e-05, + "loss": 0.5331, + "step": 9517 + }, + { + "epoch": 12.218228498074454, + "grad_norm": 6.642542362213135, + "learning_rate": 2.926272999572101e-05, + "loss": 0.5557, + "step": 9518 + }, + { + "epoch": 12.21951219512195, + "grad_norm": 1.713731050491333, + "learning_rate": 2.9262302096705177e-05, + "loss": 0.5233, + "step": 9519 + }, + { + "epoch": 12.220795892169448, + "grad_norm": 3.3223488330841064, + "learning_rate": 2.9261874197689346e-05, + "loss": 0.5257, + "step": 9520 + }, + { + "epoch": 12.222079589216944, + "grad_norm": 1.8429945707321167, + "learning_rate": 2.9261446298673514e-05, + "loss": 0.5574, + "step": 9521 + }, + { + "epoch": 12.223363286264442, + "grad_norm": 4.312039852142334, + "learning_rate": 2.9261018399657682e-05, + "loss": 0.5532, + "step": 9522 + }, + { + "epoch": 12.224646983311938, + "grad_norm": 1.975700855255127, + "learning_rate": 2.926059050064185e-05, + "loss": 0.5583, + "step": 9523 + }, + { + "epoch": 12.225930680359435, + "grad_norm": 23.01015853881836, + "learning_rate": 2.9260162601626016e-05, + "loss": 0.5579, + "step": 9524 + }, + { + "epoch": 12.227214377406932, + "grad_norm": 1.597063660621643, + 
"learning_rate": 2.9259734702610184e-05, + "loss": 0.5413, + "step": 9525 + }, + { + "epoch": 12.228498074454428, + "grad_norm": 1.1534361839294434, + "learning_rate": 2.9259306803594353e-05, + "loss": 0.5149, + "step": 9526 + }, + { + "epoch": 12.229781771501926, + "grad_norm": 1.9022040367126465, + "learning_rate": 2.9258878904578518e-05, + "loss": 0.5929, + "step": 9527 + }, + { + "epoch": 12.231065468549422, + "grad_norm": 1.8806315660476685, + "learning_rate": 2.925845100556269e-05, + "loss": 0.5447, + "step": 9528 + }, + { + "epoch": 12.232349165596919, + "grad_norm": 1.6034380197525024, + "learning_rate": 2.9258023106546854e-05, + "loss": 0.5625, + "step": 9529 + }, + { + "epoch": 12.233632862644416, + "grad_norm": 1.2811719179153442, + "learning_rate": 2.9257595207531026e-05, + "loss": 0.5524, + "step": 9530 + }, + { + "epoch": 12.234916559691912, + "grad_norm": 2.229039192199707, + "learning_rate": 2.925716730851519e-05, + "loss": 0.5364, + "step": 9531 + }, + { + "epoch": 12.23620025673941, + "grad_norm": 1.3839342594146729, + "learning_rate": 2.9256739409499356e-05, + "loss": 0.561, + "step": 9532 + }, + { + "epoch": 12.237483953786906, + "grad_norm": 1.2807518243789673, + "learning_rate": 2.9256311510483528e-05, + "loss": 0.5562, + "step": 9533 + }, + { + "epoch": 12.238767650834403, + "grad_norm": 3.380388021469116, + "learning_rate": 2.9255883611467693e-05, + "loss": 0.6172, + "step": 9534 + }, + { + "epoch": 12.2400513478819, + "grad_norm": 1.562248706817627, + "learning_rate": 2.925545571245186e-05, + "loss": 0.5853, + "step": 9535 + }, + { + "epoch": 12.241335044929397, + "grad_norm": 1.4526830911636353, + "learning_rate": 2.925502781343603e-05, + "loss": 0.5448, + "step": 9536 + }, + { + "epoch": 12.242618741976893, + "grad_norm": 1.2358506917953491, + "learning_rate": 2.92545999144202e-05, + "loss": 0.563, + "step": 9537 + }, + { + "epoch": 12.24390243902439, + "grad_norm": 1.7524058818817139, + "learning_rate": 2.9254172015404367e-05, + "loss": 
0.5521, + "step": 9538 + }, + { + "epoch": 12.245186136071887, + "grad_norm": 4.576063632965088, + "learning_rate": 2.9253744116388532e-05, + "loss": 0.5543, + "step": 9539 + }, + { + "epoch": 12.246469833119384, + "grad_norm": 2.7127327919006348, + "learning_rate": 2.92533162173727e-05, + "loss": 0.6218, + "step": 9540 + }, + { + "epoch": 12.24775353016688, + "grad_norm": 2.8531646728515625, + "learning_rate": 2.925288831835687e-05, + "loss": 0.6111, + "step": 9541 + }, + { + "epoch": 12.249037227214377, + "grad_norm": 2.1626453399658203, + "learning_rate": 2.9252460419341037e-05, + "loss": 0.5629, + "step": 9542 + }, + { + "epoch": 12.250320924261874, + "grad_norm": 3.7984812259674072, + "learning_rate": 2.9252032520325202e-05, + "loss": 0.5259, + "step": 9543 + }, + { + "epoch": 12.25160462130937, + "grad_norm": 1.2672678232192993, + "learning_rate": 2.9251604621309374e-05, + "loss": 0.587, + "step": 9544 + }, + { + "epoch": 12.252888318356868, + "grad_norm": 2.047672986984253, + "learning_rate": 2.925117672229354e-05, + "loss": 0.6498, + "step": 9545 + }, + { + "epoch": 12.254172015404365, + "grad_norm": 4.771602630615234, + "learning_rate": 2.9250748823277707e-05, + "loss": 0.6157, + "step": 9546 + }, + { + "epoch": 12.25545571245186, + "grad_norm": 2.484560966491699, + "learning_rate": 2.9250320924261876e-05, + "loss": 0.6624, + "step": 9547 + }, + { + "epoch": 12.256739409499358, + "grad_norm": 2.6817660331726074, + "learning_rate": 2.924989302524604e-05, + "loss": 0.8265, + "step": 9548 + }, + { + "epoch": 12.258023106546855, + "grad_norm": 1.4843698740005493, + "learning_rate": 2.9249465126230213e-05, + "loss": 0.5019, + "step": 9549 + }, + { + "epoch": 12.259306803594352, + "grad_norm": 2.6005117893218994, + "learning_rate": 2.9249037227214378e-05, + "loss": 0.5486, + "step": 9550 + }, + { + "epoch": 12.260590500641849, + "grad_norm": 5.9481048583984375, + "learning_rate": 2.9248609328198546e-05, + "loss": 0.5666, + "step": 9551 + }, + { + "epoch": 
12.261874197689345, + "grad_norm": 1.573057770729065, + "learning_rate": 2.9248181429182714e-05, + "loss": 0.5329, + "step": 9552 + }, + { + "epoch": 12.263157894736842, + "grad_norm": 1.1790846586227417, + "learning_rate": 2.924775353016688e-05, + "loss": 0.5419, + "step": 9553 + }, + { + "epoch": 12.264441591784339, + "grad_norm": 1.6116865873336792, + "learning_rate": 2.924732563115105e-05, + "loss": 0.5377, + "step": 9554 + }, + { + "epoch": 12.265725288831836, + "grad_norm": 1.4427921772003174, + "learning_rate": 2.9246897732135216e-05, + "loss": 0.5053, + "step": 9555 + }, + { + "epoch": 12.267008985879333, + "grad_norm": 1.8052120208740234, + "learning_rate": 2.9246469833119385e-05, + "loss": 0.5606, + "step": 9556 + }, + { + "epoch": 12.268292682926829, + "grad_norm": 1.546272873878479, + "learning_rate": 2.9246041934103553e-05, + "loss": 0.5268, + "step": 9557 + }, + { + "epoch": 12.269576379974326, + "grad_norm": 1.3028124570846558, + "learning_rate": 2.924561403508772e-05, + "loss": 0.5412, + "step": 9558 + }, + { + "epoch": 12.270860077021823, + "grad_norm": 1.528863787651062, + "learning_rate": 2.9245186136071886e-05, + "loss": 0.5224, + "step": 9559 + }, + { + "epoch": 12.27214377406932, + "grad_norm": 1.228476881980896, + "learning_rate": 2.9244758237056055e-05, + "loss": 0.5669, + "step": 9560 + }, + { + "epoch": 12.273427471116817, + "grad_norm": 5.209446907043457, + "learning_rate": 2.9244330338040223e-05, + "loss": 0.5628, + "step": 9561 + }, + { + "epoch": 12.274711168164313, + "grad_norm": 2.71693754196167, + "learning_rate": 2.9243902439024392e-05, + "loss": 0.5345, + "step": 9562 + }, + { + "epoch": 12.27599486521181, + "grad_norm": 1.5795185565948486, + "learning_rate": 2.924347454000856e-05, + "loss": 0.5442, + "step": 9563 + }, + { + "epoch": 12.277278562259307, + "grad_norm": 1.4401254653930664, + "learning_rate": 2.9243046640992725e-05, + "loss": 0.5566, + "step": 9564 + }, + { + "epoch": 12.278562259306804, + "grad_norm": 
7.073253631591797, + "learning_rate": 2.9242618741976897e-05, + "loss": 0.5566, + "step": 9565 + }, + { + "epoch": 12.2798459563543, + "grad_norm": 1.5889719724655151, + "learning_rate": 2.9242190842961062e-05, + "loss": 0.5268, + "step": 9566 + }, + { + "epoch": 12.281129653401797, + "grad_norm": 1.2884224653244019, + "learning_rate": 2.9241762943945227e-05, + "loss": 0.523, + "step": 9567 + }, + { + "epoch": 12.282413350449294, + "grad_norm": 1.8837647438049316, + "learning_rate": 2.92413350449294e-05, + "loss": 0.5356, + "step": 9568 + }, + { + "epoch": 12.28369704749679, + "grad_norm": 1.904666781425476, + "learning_rate": 2.9240907145913564e-05, + "loss": 0.5219, + "step": 9569 + }, + { + "epoch": 12.284980744544288, + "grad_norm": 1.5401555299758911, + "learning_rate": 2.9240479246897736e-05, + "loss": 0.5713, + "step": 9570 + }, + { + "epoch": 12.286264441591785, + "grad_norm": 1.2685531377792358, + "learning_rate": 2.92400513478819e-05, + "loss": 0.5157, + "step": 9571 + }, + { + "epoch": 12.28754813863928, + "grad_norm": 1.3792128562927246, + "learning_rate": 2.923962344886607e-05, + "loss": 0.514, + "step": 9572 + }, + { + "epoch": 12.288831835686779, + "grad_norm": 4.083863735198975, + "learning_rate": 2.9239195549850237e-05, + "loss": 0.5591, + "step": 9573 + }, + { + "epoch": 12.290115532734275, + "grad_norm": 2.1628541946411133, + "learning_rate": 2.9238767650834402e-05, + "loss": 0.5599, + "step": 9574 + }, + { + "epoch": 12.29139922978177, + "grad_norm": 1.555626392364502, + "learning_rate": 2.923833975181857e-05, + "loss": 0.5367, + "step": 9575 + }, + { + "epoch": 12.292682926829269, + "grad_norm": 3.4482388496398926, + "learning_rate": 2.923791185280274e-05, + "loss": 0.5205, + "step": 9576 + }, + { + "epoch": 12.293966623876765, + "grad_norm": 2.2305355072021484, + "learning_rate": 2.9237483953786908e-05, + "loss": 0.595, + "step": 9577 + }, + { + "epoch": 12.295250320924263, + "grad_norm": 1.453321933746338, + "learning_rate": 
2.9237056054771076e-05, + "loss": 0.5721, + "step": 9578 + }, + { + "epoch": 12.296534017971759, + "grad_norm": 1.3992375135421753, + "learning_rate": 2.923662815575524e-05, + "loss": 0.5227, + "step": 9579 + }, + { + "epoch": 12.297817715019255, + "grad_norm": 2.999220132827759, + "learning_rate": 2.923620025673941e-05, + "loss": 0.5605, + "step": 9580 + }, + { + "epoch": 12.299101412066753, + "grad_norm": 3.579501152038574, + "learning_rate": 2.9235772357723578e-05, + "loss": 0.5564, + "step": 9581 + }, + { + "epoch": 12.300385109114249, + "grad_norm": 4.356724739074707, + "learning_rate": 2.9235344458707746e-05, + "loss": 0.5506, + "step": 9582 + }, + { + "epoch": 12.301668806161747, + "grad_norm": 1.1380730867385864, + "learning_rate": 2.923491655969191e-05, + "loss": 0.5787, + "step": 9583 + }, + { + "epoch": 12.302952503209243, + "grad_norm": 1.4143035411834717, + "learning_rate": 2.9234488660676083e-05, + "loss": 0.5318, + "step": 9584 + }, + { + "epoch": 12.304236200256739, + "grad_norm": 1.4734234809875488, + "learning_rate": 2.9234060761660248e-05, + "loss": 0.5697, + "step": 9585 + }, + { + "epoch": 12.305519897304237, + "grad_norm": 1.3392006158828735, + "learning_rate": 2.9233632862644417e-05, + "loss": 0.5666, + "step": 9586 + }, + { + "epoch": 12.306803594351733, + "grad_norm": 4.12023401260376, + "learning_rate": 2.9233204963628585e-05, + "loss": 0.563, + "step": 9587 + }, + { + "epoch": 12.30808729139923, + "grad_norm": 1.926470160484314, + "learning_rate": 2.923277706461275e-05, + "loss": 0.5839, + "step": 9588 + }, + { + "epoch": 12.309370988446727, + "grad_norm": 5.588286876678467, + "learning_rate": 2.9232349165596922e-05, + "loss": 0.6107, + "step": 9589 + }, + { + "epoch": 12.310654685494223, + "grad_norm": 5.037435531616211, + "learning_rate": 2.9231921266581087e-05, + "loss": 0.5791, + "step": 9590 + }, + { + "epoch": 12.31193838254172, + "grad_norm": 4.58099889755249, + "learning_rate": 2.9231493367565255e-05, + "loss": 0.5584, + "step": 
9591 + }, + { + "epoch": 12.313222079589217, + "grad_norm": 1.1619287729263306, + "learning_rate": 2.9231065468549424e-05, + "loss": 0.5993, + "step": 9592 + }, + { + "epoch": 12.314505776636715, + "grad_norm": 1.4414563179016113, + "learning_rate": 2.923063756953359e-05, + "loss": 0.5584, + "step": 9593 + }, + { + "epoch": 12.31578947368421, + "grad_norm": 3.4215331077575684, + "learning_rate": 2.923020967051776e-05, + "loss": 0.5965, + "step": 9594 + }, + { + "epoch": 12.317073170731707, + "grad_norm": 3.613844633102417, + "learning_rate": 2.9229781771501926e-05, + "loss": 0.6909, + "step": 9595 + }, + { + "epoch": 12.318356867779205, + "grad_norm": 3.7599058151245117, + "learning_rate": 2.9229353872486094e-05, + "loss": 0.6848, + "step": 9596 + }, + { + "epoch": 12.3196405648267, + "grad_norm": 2.485538959503174, + "learning_rate": 2.9228925973470262e-05, + "loss": 0.7134, + "step": 9597 + }, + { + "epoch": 12.320924261874199, + "grad_norm": 2.5567216873168945, + "learning_rate": 2.922849807445443e-05, + "loss": 0.7279, + "step": 9598 + }, + { + "epoch": 12.322207958921695, + "grad_norm": 14.034089088439941, + "learning_rate": 2.9228070175438596e-05, + "loss": 0.5324, + "step": 9599 + }, + { + "epoch": 12.32349165596919, + "grad_norm": 4.00920295715332, + "learning_rate": 2.9227642276422764e-05, + "loss": 0.5289, + "step": 9600 + }, + { + "epoch": 12.324775353016689, + "grad_norm": 1.2710282802581787, + "learning_rate": 2.9227214377406933e-05, + "loss": 0.5277, + "step": 9601 + }, + { + "epoch": 12.326059050064185, + "grad_norm": 1.3956670761108398, + "learning_rate": 2.92267864783911e-05, + "loss": 0.5301, + "step": 9602 + }, + { + "epoch": 12.327342747111683, + "grad_norm": 1.6909056901931763, + "learning_rate": 2.922635857937527e-05, + "loss": 0.5219, + "step": 9603 + }, + { + "epoch": 12.328626444159179, + "grad_norm": 1.506733775138855, + "learning_rate": 2.9225930680359435e-05, + "loss": 0.5337, + "step": 9604 + }, + { + "epoch": 12.329910141206675, + 
"grad_norm": 1.200400948524475, + "learning_rate": 2.9225502781343606e-05, + "loss": 0.5241, + "step": 9605 + }, + { + "epoch": 12.331193838254173, + "grad_norm": 1.496623158454895, + "learning_rate": 2.922507488232777e-05, + "loss": 0.5052, + "step": 9606 + }, + { + "epoch": 12.332477535301669, + "grad_norm": 1.214972972869873, + "learning_rate": 2.9224646983311936e-05, + "loss": 0.5297, + "step": 9607 + }, + { + "epoch": 12.333761232349165, + "grad_norm": 1.5385980606079102, + "learning_rate": 2.9224219084296108e-05, + "loss": 0.5471, + "step": 9608 + }, + { + "epoch": 12.335044929396663, + "grad_norm": 1.5377472639083862, + "learning_rate": 2.9223791185280273e-05, + "loss": 0.5319, + "step": 9609 + }, + { + "epoch": 12.336328626444159, + "grad_norm": 1.2332676649093628, + "learning_rate": 2.9223363286264445e-05, + "loss": 0.5413, + "step": 9610 + }, + { + "epoch": 12.337612323491657, + "grad_norm": 2.1983160972595215, + "learning_rate": 2.922293538724861e-05, + "loss": 0.5418, + "step": 9611 + }, + { + "epoch": 12.338896020539153, + "grad_norm": 1.1585824489593506, + "learning_rate": 2.922250748823278e-05, + "loss": 0.5071, + "step": 9612 + }, + { + "epoch": 12.340179717586649, + "grad_norm": 2.1463358402252197, + "learning_rate": 2.9222079589216947e-05, + "loss": 0.5162, + "step": 9613 + }, + { + "epoch": 12.341463414634147, + "grad_norm": 1.8810501098632812, + "learning_rate": 2.9221651690201112e-05, + "loss": 0.5532, + "step": 9614 + }, + { + "epoch": 12.342747111681643, + "grad_norm": 2.190518379211426, + "learning_rate": 2.922122379118528e-05, + "loss": 0.5094, + "step": 9615 + }, + { + "epoch": 12.34403080872914, + "grad_norm": 2.126399040222168, + "learning_rate": 2.922079589216945e-05, + "loss": 0.5235, + "step": 9616 + }, + { + "epoch": 12.345314505776637, + "grad_norm": 1.8599811792373657, + "learning_rate": 2.9220367993153617e-05, + "loss": 0.5085, + "step": 9617 + }, + { + "epoch": 12.346598202824133, + "grad_norm": 1.1903057098388672, + 
"learning_rate": 2.9219940094137786e-05, + "loss": 0.5334, + "step": 9618 + }, + { + "epoch": 12.34788189987163, + "grad_norm": 13.176106452941895, + "learning_rate": 2.9219512195121954e-05, + "loss": 0.5586, + "step": 9619 + }, + { + "epoch": 12.349165596919127, + "grad_norm": 5.024960041046143, + "learning_rate": 2.921908429610612e-05, + "loss": 0.5482, + "step": 9620 + }, + { + "epoch": 12.350449293966625, + "grad_norm": 0.7814184427261353, + "learning_rate": 2.9218656397090287e-05, + "loss": 0.5285, + "step": 9621 + }, + { + "epoch": 12.35173299101412, + "grad_norm": 1.0335458517074585, + "learning_rate": 2.9218228498074456e-05, + "loss": 0.5287, + "step": 9622 + }, + { + "epoch": 12.353016688061617, + "grad_norm": 2.574343681335449, + "learning_rate": 2.921780059905862e-05, + "loss": 0.5095, + "step": 9623 + }, + { + "epoch": 12.354300385109115, + "grad_norm": 32.45996856689453, + "learning_rate": 2.9217372700042793e-05, + "loss": 0.5501, + "step": 9624 + }, + { + "epoch": 12.35558408215661, + "grad_norm": 1.2581173181533813, + "learning_rate": 2.9216944801026958e-05, + "loss": 0.5693, + "step": 9625 + }, + { + "epoch": 12.356867779204109, + "grad_norm": 1.6830673217773438, + "learning_rate": 2.9216516902011126e-05, + "loss": 0.5623, + "step": 9626 + }, + { + "epoch": 12.358151476251605, + "grad_norm": 1.2830778360366821, + "learning_rate": 2.9216089002995294e-05, + "loss": 0.5254, + "step": 9627 + }, + { + "epoch": 12.3594351732991, + "grad_norm": 1.3634915351867676, + "learning_rate": 2.921566110397946e-05, + "loss": 0.4632, + "step": 9628 + }, + { + "epoch": 12.360718870346599, + "grad_norm": 1.2604299783706665, + "learning_rate": 2.921523320496363e-05, + "loss": 0.5119, + "step": 9629 + }, + { + "epoch": 12.362002567394095, + "grad_norm": 1.3619822263717651, + "learning_rate": 2.9214805305947796e-05, + "loss": 0.5403, + "step": 9630 + }, + { + "epoch": 12.363286264441593, + "grad_norm": 1.6057837009429932, + "learning_rate": 2.9214377406931965e-05, + 
"loss": 0.607, + "step": 9631 + }, + { + "epoch": 12.364569961489089, + "grad_norm": 1.3515026569366455, + "learning_rate": 2.9213949507916133e-05, + "loss": 0.5976, + "step": 9632 + }, + { + "epoch": 12.365853658536585, + "grad_norm": 2.4717419147491455, + "learning_rate": 2.92135216089003e-05, + "loss": 0.5741, + "step": 9633 + }, + { + "epoch": 12.367137355584083, + "grad_norm": 2.657926559448242, + "learning_rate": 2.9213093709884467e-05, + "loss": 0.5564, + "step": 9634 + }, + { + "epoch": 12.368421052631579, + "grad_norm": 2.261498212814331, + "learning_rate": 2.9212665810868635e-05, + "loss": 0.6149, + "step": 9635 + }, + { + "epoch": 12.369704749679077, + "grad_norm": 2.397517681121826, + "learning_rate": 2.9212237911852803e-05, + "loss": 0.614, + "step": 9636 + }, + { + "epoch": 12.370988446726573, + "grad_norm": 1.8365626335144043, + "learning_rate": 2.9211810012836972e-05, + "loss": 0.6102, + "step": 9637 + }, + { + "epoch": 12.372272143774069, + "grad_norm": 1.9833135604858398, + "learning_rate": 2.921138211382114e-05, + "loss": 0.5837, + "step": 9638 + }, + { + "epoch": 12.373555840821567, + "grad_norm": 2.340311288833618, + "learning_rate": 2.9210954214805305e-05, + "loss": 0.6102, + "step": 9639 + }, + { + "epoch": 12.374839537869063, + "grad_norm": 2.345979928970337, + "learning_rate": 2.9210526315789474e-05, + "loss": 0.5903, + "step": 9640 + }, + { + "epoch": 12.376123234916559, + "grad_norm": 1.6805899143218994, + "learning_rate": 2.9210098416773642e-05, + "loss": 0.5954, + "step": 9641 + }, + { + "epoch": 12.377406931964057, + "grad_norm": 4.089231491088867, + "learning_rate": 2.9209670517757807e-05, + "loss": 0.5742, + "step": 9642 + }, + { + "epoch": 12.378690629011553, + "grad_norm": 2.488248109817505, + "learning_rate": 2.920924261874198e-05, + "loss": 0.5787, + "step": 9643 + }, + { + "epoch": 12.37997432605905, + "grad_norm": 1.687269687652588, + "learning_rate": 2.9208814719726144e-05, + "loss": 0.6049, + "step": 9644 + }, + { + "epoch": 
12.381258023106547, + "grad_norm": 1.903424859046936, + "learning_rate": 2.9208386820710316e-05, + "loss": 0.583, + "step": 9645 + }, + { + "epoch": 12.382541720154043, + "grad_norm": 4.345402717590332, + "learning_rate": 2.920795892169448e-05, + "loss": 0.6061, + "step": 9646 + }, + { + "epoch": 12.38382541720154, + "grad_norm": 1.9553959369659424, + "learning_rate": 2.9207531022678646e-05, + "loss": 0.6298, + "step": 9647 + }, + { + "epoch": 12.385109114249037, + "grad_norm": 5.38988733291626, + "learning_rate": 2.9207103123662818e-05, + "loss": 0.7464, + "step": 9648 + }, + { + "epoch": 12.386392811296535, + "grad_norm": 3.1077041625976562, + "learning_rate": 2.9206675224646983e-05, + "loss": 0.5622, + "step": 9649 + }, + { + "epoch": 12.38767650834403, + "grad_norm": 1.1195399761199951, + "learning_rate": 2.920624732563115e-05, + "loss": 0.4806, + "step": 9650 + }, + { + "epoch": 12.388960205391527, + "grad_norm": 2.190765142440796, + "learning_rate": 2.920581942661532e-05, + "loss": 0.5348, + "step": 9651 + }, + { + "epoch": 12.390243902439025, + "grad_norm": 2.239164113998413, + "learning_rate": 2.9205391527599488e-05, + "loss": 0.5379, + "step": 9652 + }, + { + "epoch": 12.39152759948652, + "grad_norm": 2.16066575050354, + "learning_rate": 2.9204963628583656e-05, + "loss": 0.5804, + "step": 9653 + }, + { + "epoch": 12.392811296534019, + "grad_norm": 3.584196090698242, + "learning_rate": 2.920453572956782e-05, + "loss": 0.5323, + "step": 9654 + }, + { + "epoch": 12.394094993581515, + "grad_norm": 1.5119026899337769, + "learning_rate": 2.920410783055199e-05, + "loss": 0.5121, + "step": 9655 + }, + { + "epoch": 12.39537869062901, + "grad_norm": 2.0587711334228516, + "learning_rate": 2.9203679931536158e-05, + "loss": 0.5287, + "step": 9656 + }, + { + "epoch": 12.396662387676509, + "grad_norm": 1.2875350713729858, + "learning_rate": 2.9203252032520326e-05, + "loss": 0.5184, + "step": 9657 + }, + { + "epoch": 12.397946084724005, + "grad_norm": 1.7295502424240112, 
+ "learning_rate": 2.920282413350449e-05, + "loss": 0.5599, + "step": 9658 + }, + { + "epoch": 12.399229781771503, + "grad_norm": 0.9111841917037964, + "learning_rate": 2.9202396234488663e-05, + "loss": 0.5338, + "step": 9659 + }, + { + "epoch": 12.400513478818999, + "grad_norm": 1.578413486480713, + "learning_rate": 2.9201968335472828e-05, + "loss": 0.5193, + "step": 9660 + }, + { + "epoch": 12.401797175866495, + "grad_norm": 2.6450235843658447, + "learning_rate": 2.9201540436456997e-05, + "loss": 0.522, + "step": 9661 + }, + { + "epoch": 12.403080872913993, + "grad_norm": 2.812169313430786, + "learning_rate": 2.9201112537441165e-05, + "loss": 0.5165, + "step": 9662 + }, + { + "epoch": 12.404364569961489, + "grad_norm": 2.4519660472869873, + "learning_rate": 2.920068463842533e-05, + "loss": 0.5457, + "step": 9663 + }, + { + "epoch": 12.405648267008987, + "grad_norm": 3.7011513710021973, + "learning_rate": 2.9200256739409502e-05, + "loss": 0.5217, + "step": 9664 + }, + { + "epoch": 12.406931964056483, + "grad_norm": 1.6244471073150635, + "learning_rate": 2.9199828840393667e-05, + "loss": 0.5652, + "step": 9665 + }, + { + "epoch": 12.408215661103979, + "grad_norm": 2.673445463180542, + "learning_rate": 2.9199400941377835e-05, + "loss": 0.5114, + "step": 9666 + }, + { + "epoch": 12.409499358151477, + "grad_norm": 2.021254777908325, + "learning_rate": 2.9198973042362004e-05, + "loss": 0.5679, + "step": 9667 + }, + { + "epoch": 12.410783055198973, + "grad_norm": 1.7167482376098633, + "learning_rate": 2.919854514334617e-05, + "loss": 0.5165, + "step": 9668 + }, + { + "epoch": 12.41206675224647, + "grad_norm": 1.99867582321167, + "learning_rate": 2.919811724433034e-05, + "loss": 0.5186, + "step": 9669 + }, + { + "epoch": 12.413350449293967, + "grad_norm": 1.5340099334716797, + "learning_rate": 2.9197689345314506e-05, + "loss": 0.532, + "step": 9670 + }, + { + "epoch": 12.414634146341463, + "grad_norm": 1.499955415725708, + "learning_rate": 2.9197261446298674e-05, + 
"loss": 0.5312, + "step": 9671 + }, + { + "epoch": 12.41591784338896, + "grad_norm": 1.288274884223938, + "learning_rate": 2.9196833547282842e-05, + "loss": 0.5323, + "step": 9672 + }, + { + "epoch": 12.417201540436457, + "grad_norm": 3.2504541873931885, + "learning_rate": 2.919640564826701e-05, + "loss": 0.5407, + "step": 9673 + }, + { + "epoch": 12.418485237483953, + "grad_norm": 2.3676657676696777, + "learning_rate": 2.9195977749251176e-05, + "loss": 0.5197, + "step": 9674 + }, + { + "epoch": 12.41976893453145, + "grad_norm": 1.8973361253738403, + "learning_rate": 2.9195549850235344e-05, + "loss": 0.4992, + "step": 9675 + }, + { + "epoch": 12.421052631578947, + "grad_norm": 1.3528329133987427, + "learning_rate": 2.9195121951219513e-05, + "loss": 0.5366, + "step": 9676 + }, + { + "epoch": 12.422336328626445, + "grad_norm": 2.10129976272583, + "learning_rate": 2.919469405220368e-05, + "loss": 0.5281, + "step": 9677 + }, + { + "epoch": 12.42362002567394, + "grad_norm": 1.6668152809143066, + "learning_rate": 2.919426615318785e-05, + "loss": 0.5485, + "step": 9678 + }, + { + "epoch": 12.424903722721437, + "grad_norm": 1.6635745763778687, + "learning_rate": 2.9193838254172015e-05, + "loss": 0.5514, + "step": 9679 + }, + { + "epoch": 12.426187419768935, + "grad_norm": 1.5927307605743408, + "learning_rate": 2.9193410355156186e-05, + "loss": 0.5185, + "step": 9680 + }, + { + "epoch": 12.427471116816431, + "grad_norm": 1.9529956579208374, + "learning_rate": 2.919298245614035e-05, + "loss": 0.5592, + "step": 9681 + }, + { + "epoch": 12.428754813863929, + "grad_norm": 1.129016637802124, + "learning_rate": 2.9192554557124516e-05, + "loss": 0.5922, + "step": 9682 + }, + { + "epoch": 12.430038510911425, + "grad_norm": 3.971999406814575, + "learning_rate": 2.9192126658108688e-05, + "loss": 0.5569, + "step": 9683 + }, + { + "epoch": 12.431322207958921, + "grad_norm": 2.657672166824341, + "learning_rate": 2.9191698759092853e-05, + "loss": 0.5745, + "step": 9684 + }, + { + 
"epoch": 12.432605905006419, + "grad_norm": 2.427551031112671, + "learning_rate": 2.9191270860077025e-05, + "loss": 0.5498, + "step": 9685 + }, + { + "epoch": 12.433889602053915, + "grad_norm": 4.407980918884277, + "learning_rate": 2.919084296106119e-05, + "loss": 0.5511, + "step": 9686 + }, + { + "epoch": 12.435173299101413, + "grad_norm": 4.212848663330078, + "learning_rate": 2.919041506204536e-05, + "loss": 0.5542, + "step": 9687 + }, + { + "epoch": 12.436456996148909, + "grad_norm": 5.571094036102295, + "learning_rate": 2.9189987163029527e-05, + "loss": 0.5949, + "step": 9688 + }, + { + "epoch": 12.437740693196405, + "grad_norm": 1.5206762552261353, + "learning_rate": 2.9189559264013692e-05, + "loss": 0.541, + "step": 9689 + }, + { + "epoch": 12.439024390243903, + "grad_norm": 2.277982473373413, + "learning_rate": 2.918913136499786e-05, + "loss": 0.5672, + "step": 9690 + }, + { + "epoch": 12.440308087291399, + "grad_norm": 3.3972887992858887, + "learning_rate": 2.918870346598203e-05, + "loss": 0.6809, + "step": 9691 + }, + { + "epoch": 12.441591784338897, + "grad_norm": 1.8717927932739258, + "learning_rate": 2.9188275566966197e-05, + "loss": 0.5908, + "step": 9692 + }, + { + "epoch": 12.442875481386393, + "grad_norm": 2.0774807929992676, + "learning_rate": 2.9187847667950366e-05, + "loss": 0.5884, + "step": 9693 + }, + { + "epoch": 12.444159178433889, + "grad_norm": 2.3562872409820557, + "learning_rate": 2.9187419768934534e-05, + "loss": 0.6253, + "step": 9694 + }, + { + "epoch": 12.445442875481387, + "grad_norm": 2.645443916320801, + "learning_rate": 2.91869918699187e-05, + "loss": 0.6011, + "step": 9695 + }, + { + "epoch": 12.446726572528883, + "grad_norm": 2.5102858543395996, + "learning_rate": 2.9186563970902867e-05, + "loss": 0.6949, + "step": 9696 + }, + { + "epoch": 12.44801026957638, + "grad_norm": 3.602177143096924, + "learning_rate": 2.9186136071887036e-05, + "loss": 0.7126, + "step": 9697 + }, + { + "epoch": 12.449293966623877, + "grad_norm": 
4.048396587371826, + "learning_rate": 2.91857081728712e-05, + "loss": 0.7952, + "step": 9698 + }, + { + "epoch": 12.450577663671373, + "grad_norm": 1.8446621894836426, + "learning_rate": 2.9185280273855373e-05, + "loss": 0.5297, + "step": 9699 + }, + { + "epoch": 12.45186136071887, + "grad_norm": 1.057846188545227, + "learning_rate": 2.9184852374839538e-05, + "loss": 0.5286, + "step": 9700 + }, + { + "epoch": 12.453145057766367, + "grad_norm": 1.1860071420669556, + "learning_rate": 2.9184424475823706e-05, + "loss": 0.5277, + "step": 9701 + }, + { + "epoch": 12.454428754813865, + "grad_norm": 1.0800327062606812, + "learning_rate": 2.9183996576807874e-05, + "loss": 0.561, + "step": 9702 + }, + { + "epoch": 12.455712451861361, + "grad_norm": 1.0590656995773315, + "learning_rate": 2.918356867779204e-05, + "loss": 0.5403, + "step": 9703 + }, + { + "epoch": 12.456996148908857, + "grad_norm": 0.9783191680908203, + "learning_rate": 2.918314077877621e-05, + "loss": 0.5392, + "step": 9704 + }, + { + "epoch": 12.458279845956355, + "grad_norm": 3.709444522857666, + "learning_rate": 2.9182712879760376e-05, + "loss": 0.5668, + "step": 9705 + }, + { + "epoch": 12.459563543003851, + "grad_norm": 3.6171936988830566, + "learning_rate": 2.9182284980744545e-05, + "loss": 0.5885, + "step": 9706 + }, + { + "epoch": 12.460847240051347, + "grad_norm": 2.3965938091278076, + "learning_rate": 2.9181857081728713e-05, + "loss": 0.4881, + "step": 9707 + }, + { + "epoch": 12.462130937098845, + "grad_norm": 1.130253791809082, + "learning_rate": 2.9181429182712878e-05, + "loss": 0.601, + "step": 9708 + }, + { + "epoch": 12.463414634146341, + "grad_norm": 1.0254175662994385, + "learning_rate": 2.918100128369705e-05, + "loss": 0.523, + "step": 9709 + }, + { + "epoch": 12.464698331193839, + "grad_norm": 1.2617192268371582, + "learning_rate": 2.9180573384681215e-05, + "loss": 0.5457, + "step": 9710 + }, + { + "epoch": 12.465982028241335, + "grad_norm": 2.4247756004333496, + "learning_rate": 
2.9180145485665383e-05, + "loss": 0.4961, + "step": 9711 + }, + { + "epoch": 12.467265725288831, + "grad_norm": 1.0794963836669922, + "learning_rate": 2.9179717586649552e-05, + "loss": 0.5332, + "step": 9712 + }, + { + "epoch": 12.468549422336329, + "grad_norm": 1.0839229822158813, + "learning_rate": 2.917928968763372e-05, + "loss": 0.5286, + "step": 9713 + }, + { + "epoch": 12.469833119383825, + "grad_norm": 1.0200895071029663, + "learning_rate": 2.9178861788617885e-05, + "loss": 0.5609, + "step": 9714 + }, + { + "epoch": 12.471116816431323, + "grad_norm": 1.6453542709350586, + "learning_rate": 2.9178433889602054e-05, + "loss": 0.5235, + "step": 9715 + }, + { + "epoch": 12.472400513478819, + "grad_norm": 1.4265438318252563, + "learning_rate": 2.9178005990586222e-05, + "loss": 0.5312, + "step": 9716 + }, + { + "epoch": 12.473684210526315, + "grad_norm": 10.022527694702148, + "learning_rate": 2.917757809157039e-05, + "loss": 0.5366, + "step": 9717 + }, + { + "epoch": 12.474967907573813, + "grad_norm": 1.4432810544967651, + "learning_rate": 2.917715019255456e-05, + "loss": 0.544, + "step": 9718 + }, + { + "epoch": 12.476251604621309, + "grad_norm": 1.2228513956069946, + "learning_rate": 2.9176722293538724e-05, + "loss": 0.5538, + "step": 9719 + }, + { + "epoch": 12.477535301668807, + "grad_norm": 2.3923609256744385, + "learning_rate": 2.9176294394522896e-05, + "loss": 0.4896, + "step": 9720 + }, + { + "epoch": 12.478818998716303, + "grad_norm": 1.4460071325302124, + "learning_rate": 2.917586649550706e-05, + "loss": 0.4895, + "step": 9721 + }, + { + "epoch": 12.480102695763799, + "grad_norm": 1.5441005229949951, + "learning_rate": 2.9175438596491226e-05, + "loss": 0.512, + "step": 9722 + }, + { + "epoch": 12.481386392811297, + "grad_norm": 1.6813561916351318, + "learning_rate": 2.9175010697475398e-05, + "loss": 0.5388, + "step": 9723 + }, + { + "epoch": 12.482670089858793, + "grad_norm": 1.4204803705215454, + "learning_rate": 2.9174582798459563e-05, + "loss": 0.589, + 
"step": 9724 + }, + { + "epoch": 12.48395378690629, + "grad_norm": 1.809439778327942, + "learning_rate": 2.9174154899443734e-05, + "loss": 0.5628, + "step": 9725 + }, + { + "epoch": 12.485237483953787, + "grad_norm": 2.4249136447906494, + "learning_rate": 2.91737270004279e-05, + "loss": 0.5525, + "step": 9726 + }, + { + "epoch": 12.486521181001283, + "grad_norm": 3.1525368690490723, + "learning_rate": 2.9173299101412068e-05, + "loss": 0.573, + "step": 9727 + }, + { + "epoch": 12.487804878048781, + "grad_norm": 1.3913627862930298, + "learning_rate": 2.9172871202396236e-05, + "loss": 0.5619, + "step": 9728 + }, + { + "epoch": 12.489088575096277, + "grad_norm": 2.712439775466919, + "learning_rate": 2.91724433033804e-05, + "loss": 0.5947, + "step": 9729 + }, + { + "epoch": 12.490372272143775, + "grad_norm": 2.6820859909057617, + "learning_rate": 2.917201540436457e-05, + "loss": 0.5539, + "step": 9730 + }, + { + "epoch": 12.491655969191271, + "grad_norm": 3.3410708904266357, + "learning_rate": 2.9171587505348738e-05, + "loss": 0.5849, + "step": 9731 + }, + { + "epoch": 12.492939666238767, + "grad_norm": 3.509561777114868, + "learning_rate": 2.9171159606332907e-05, + "loss": 0.5669, + "step": 9732 + }, + { + "epoch": 12.494223363286265, + "grad_norm": 1.8645122051239014, + "learning_rate": 2.9170731707317075e-05, + "loss": 0.5545, + "step": 9733 + }, + { + "epoch": 12.495507060333761, + "grad_norm": 2.313883066177368, + "learning_rate": 2.9170303808301243e-05, + "loss": 0.5762, + "step": 9734 + }, + { + "epoch": 12.496790757381259, + "grad_norm": 2.4816837310791016, + "learning_rate": 2.916987590928541e-05, + "loss": 0.5543, + "step": 9735 + }, + { + "epoch": 12.498074454428755, + "grad_norm": 4.8996381759643555, + "learning_rate": 2.9169448010269577e-05, + "loss": 0.5394, + "step": 9736 + }, + { + "epoch": 12.499358151476251, + "grad_norm": 5.682677745819092, + "learning_rate": 2.9169020111253745e-05, + "loss": 0.5737, + "step": 9737 + }, + { + "epoch": 
12.500641848523749, + "grad_norm": 1.988006830215454, + "learning_rate": 2.916859221223791e-05, + "loss": 0.5702, + "step": 9738 + }, + { + "epoch": 12.501925545571245, + "grad_norm": 2.1546387672424316, + "learning_rate": 2.9168164313222082e-05, + "loss": 0.6018, + "step": 9739 + }, + { + "epoch": 12.503209242618741, + "grad_norm": 2.9332900047302246, + "learning_rate": 2.9167736414206247e-05, + "loss": 0.5948, + "step": 9740 + }, + { + "epoch": 12.504492939666239, + "grad_norm": 2.506845235824585, + "learning_rate": 2.916730851519042e-05, + "loss": 0.5716, + "step": 9741 + }, + { + "epoch": 12.505776636713735, + "grad_norm": 1.4004933834075928, + "learning_rate": 2.9166880616174584e-05, + "loss": 0.5658, + "step": 9742 + }, + { + "epoch": 12.507060333761233, + "grad_norm": 1.2363568544387817, + "learning_rate": 2.916645271715875e-05, + "loss": 0.6397, + "step": 9743 + }, + { + "epoch": 12.508344030808729, + "grad_norm": 1.727818489074707, + "learning_rate": 2.916602481814292e-05, + "loss": 0.6112, + "step": 9744 + }, + { + "epoch": 12.509627727856225, + "grad_norm": 1.3850722312927246, + "learning_rate": 2.9165596919127086e-05, + "loss": 0.6197, + "step": 9745 + }, + { + "epoch": 12.510911424903723, + "grad_norm": 3.1149938106536865, + "learning_rate": 2.9165169020111254e-05, + "loss": 0.6618, + "step": 9746 + }, + { + "epoch": 12.512195121951219, + "grad_norm": 2.816072463989258, + "learning_rate": 2.9164741121095423e-05, + "loss": 0.6811, + "step": 9747 + }, + { + "epoch": 12.513478818998717, + "grad_norm": 4.578885078430176, + "learning_rate": 2.916431322207959e-05, + "loss": 0.7884, + "step": 9748 + }, + { + "epoch": 12.514762516046213, + "grad_norm": 1.3948746919631958, + "learning_rate": 2.916388532306376e-05, + "loss": 0.5249, + "step": 9749 + }, + { + "epoch": 12.51604621309371, + "grad_norm": 1.0799152851104736, + "learning_rate": 2.9163457424047924e-05, + "loss": 0.5466, + "step": 9750 + }, + { + "epoch": 12.517329910141207, + "grad_norm": 
1.9821138381958008, + "learning_rate": 2.9163029525032093e-05, + "loss": 0.5694, + "step": 9751 + }, + { + "epoch": 12.518613607188703, + "grad_norm": 1.6221188306808472, + "learning_rate": 2.916260162601626e-05, + "loss": 0.5843, + "step": 9752 + }, + { + "epoch": 12.519897304236201, + "grad_norm": 1.1154143810272217, + "learning_rate": 2.916217372700043e-05, + "loss": 0.5409, + "step": 9753 + }, + { + "epoch": 12.521181001283697, + "grad_norm": 3.108567476272583, + "learning_rate": 2.9161745827984595e-05, + "loss": 0.5235, + "step": 9754 + }, + { + "epoch": 12.522464698331193, + "grad_norm": 1.1051193475723267, + "learning_rate": 2.9161317928968766e-05, + "loss": 0.5925, + "step": 9755 + }, + { + "epoch": 12.523748395378691, + "grad_norm": 1.901556372642517, + "learning_rate": 2.916089002995293e-05, + "loss": 0.5006, + "step": 9756 + }, + { + "epoch": 12.525032092426187, + "grad_norm": 12.291058540344238, + "learning_rate": 2.91604621309371e-05, + "loss": 0.5562, + "step": 9757 + }, + { + "epoch": 12.526315789473685, + "grad_norm": 3.522926092147827, + "learning_rate": 2.9160034231921268e-05, + "loss": 0.543, + "step": 9758 + }, + { + "epoch": 12.527599486521181, + "grad_norm": 1.2344417572021484, + "learning_rate": 2.9159606332905433e-05, + "loss": 0.5821, + "step": 9759 + }, + { + "epoch": 12.528883183568677, + "grad_norm": 1.452565312385559, + "learning_rate": 2.9159178433889605e-05, + "loss": 0.5778, + "step": 9760 + }, + { + "epoch": 12.530166880616175, + "grad_norm": 1.1239769458770752, + "learning_rate": 2.915875053487377e-05, + "loss": 0.5315, + "step": 9761 + }, + { + "epoch": 12.531450577663671, + "grad_norm": 2.0150020122528076, + "learning_rate": 2.915832263585794e-05, + "loss": 0.5811, + "step": 9762 + }, + { + "epoch": 12.532734274711169, + "grad_norm": 1.4228870868682861, + "learning_rate": 2.9157894736842107e-05, + "loss": 0.5443, + "step": 9763 + }, + { + "epoch": 12.534017971758665, + "grad_norm": 2.241729259490967, + "learning_rate": 
2.9157466837826272e-05, + "loss": 0.5688, + "step": 9764 + }, + { + "epoch": 12.535301668806161, + "grad_norm": 1.3834995031356812, + "learning_rate": 2.9157038938810444e-05, + "loss": 0.573, + "step": 9765 + }, + { + "epoch": 12.536585365853659, + "grad_norm": 2.913369655609131, + "learning_rate": 2.915661103979461e-05, + "loss": 0.5565, + "step": 9766 + }, + { + "epoch": 12.537869062901155, + "grad_norm": 2.253702402114868, + "learning_rate": 2.9156183140778777e-05, + "loss": 0.5632, + "step": 9767 + }, + { + "epoch": 12.539152759948653, + "grad_norm": 4.142477989196777, + "learning_rate": 2.9155755241762946e-05, + "loss": 0.5646, + "step": 9768 + }, + { + "epoch": 12.540436456996149, + "grad_norm": 1.3738824129104614, + "learning_rate": 2.915532734274711e-05, + "loss": 0.5601, + "step": 9769 + }, + { + "epoch": 12.541720154043645, + "grad_norm": 1.6125879287719727, + "learning_rate": 2.915489944373128e-05, + "loss": 0.5213, + "step": 9770 + }, + { + "epoch": 12.543003851091143, + "grad_norm": 1.9180054664611816, + "learning_rate": 2.9154471544715447e-05, + "loss": 0.5669, + "step": 9771 + }, + { + "epoch": 12.544287548138639, + "grad_norm": 4.951095104217529, + "learning_rate": 2.9154043645699616e-05, + "loss": 0.5821, + "step": 9772 + }, + { + "epoch": 12.545571245186135, + "grad_norm": 1.568479061126709, + "learning_rate": 2.9153615746683784e-05, + "loss": 0.5357, + "step": 9773 + }, + { + "epoch": 12.546854942233633, + "grad_norm": 2.2177059650421143, + "learning_rate": 2.9153187847667953e-05, + "loss": 0.6419, + "step": 9774 + }, + { + "epoch": 12.54813863928113, + "grad_norm": 1.7610039710998535, + "learning_rate": 2.9152759948652118e-05, + "loss": 0.5592, + "step": 9775 + }, + { + "epoch": 12.549422336328627, + "grad_norm": 2.803699254989624, + "learning_rate": 2.9152332049636286e-05, + "loss": 0.4979, + "step": 9776 + }, + { + "epoch": 12.550706033376123, + "grad_norm": 3.485243082046509, + "learning_rate": 2.9151904150620455e-05, + "loss": 0.5531, + 
"step": 9777 + }, + { + "epoch": 12.55198973042362, + "grad_norm": 1.0804308652877808, + "learning_rate": 2.915147625160462e-05, + "loss": 0.5644, + "step": 9778 + }, + { + "epoch": 12.553273427471117, + "grad_norm": 1.996258020401001, + "learning_rate": 2.915104835258879e-05, + "loss": 0.5533, + "step": 9779 + }, + { + "epoch": 12.554557124518613, + "grad_norm": 2.9716269969940186, + "learning_rate": 2.9150620453572956e-05, + "loss": 0.5441, + "step": 9780 + }, + { + "epoch": 12.555840821566111, + "grad_norm": 1.0169644355773926, + "learning_rate": 2.9150192554557128e-05, + "loss": 0.5632, + "step": 9781 + }, + { + "epoch": 12.557124518613607, + "grad_norm": 1.5111589431762695, + "learning_rate": 2.9149764655541293e-05, + "loss": 0.5591, + "step": 9782 + }, + { + "epoch": 12.558408215661103, + "grad_norm": 1.7532466650009155, + "learning_rate": 2.9149336756525458e-05, + "loss": 0.5846, + "step": 9783 + }, + { + "epoch": 12.559691912708601, + "grad_norm": 1.7765446901321411, + "learning_rate": 2.914890885750963e-05, + "loss": 0.5384, + "step": 9784 + }, + { + "epoch": 12.560975609756097, + "grad_norm": 1.3023914098739624, + "learning_rate": 2.9148480958493795e-05, + "loss": 0.5382, + "step": 9785 + }, + { + "epoch": 12.562259306803595, + "grad_norm": 1.3590766191482544, + "learning_rate": 2.9148053059477963e-05, + "loss": 0.5695, + "step": 9786 + }, + { + "epoch": 12.563543003851091, + "grad_norm": 3.9753196239471436, + "learning_rate": 2.9147625160462132e-05, + "loss": 0.5987, + "step": 9787 + }, + { + "epoch": 12.564826700898587, + "grad_norm": 3.939678907394409, + "learning_rate": 2.91471972614463e-05, + "loss": 0.5813, + "step": 9788 + }, + { + "epoch": 12.566110397946085, + "grad_norm": 1.5653349161148071, + "learning_rate": 2.914676936243047e-05, + "loss": 0.6163, + "step": 9789 + }, + { + "epoch": 12.567394094993581, + "grad_norm": 1.9527945518493652, + "learning_rate": 2.9146341463414634e-05, + "loss": 0.5498, + "step": 9790 + }, + { + "epoch": 
12.568677792041079, + "grad_norm": 1.403991937637329, + "learning_rate": 2.9145913564398802e-05, + "loss": 0.5623, + "step": 9791 + }, + { + "epoch": 12.569961489088575, + "grad_norm": 2.9429500102996826, + "learning_rate": 2.914548566538297e-05, + "loss": 0.6567, + "step": 9792 + }, + { + "epoch": 12.571245186136071, + "grad_norm": 2.1819331645965576, + "learning_rate": 2.914505776636714e-05, + "loss": 0.5637, + "step": 9793 + }, + { + "epoch": 12.572528883183569, + "grad_norm": 2.193571090698242, + "learning_rate": 2.9144629867351304e-05, + "loss": 0.6281, + "step": 9794 + }, + { + "epoch": 12.573812580231065, + "grad_norm": 1.9647796154022217, + "learning_rate": 2.9144201968335476e-05, + "loss": 0.6054, + "step": 9795 + }, + { + "epoch": 12.575096277278563, + "grad_norm": 1.5610162019729614, + "learning_rate": 2.914377406931964e-05, + "loss": 0.6583, + "step": 9796 + }, + { + "epoch": 12.57637997432606, + "grad_norm": 1.6912368535995483, + "learning_rate": 2.914334617030381e-05, + "loss": 0.7256, + "step": 9797 + }, + { + "epoch": 12.577663671373555, + "grad_norm": 4.5219221115112305, + "learning_rate": 2.9142918271287978e-05, + "loss": 0.7937, + "step": 9798 + }, + { + "epoch": 12.578947368421053, + "grad_norm": 1.1063157320022583, + "learning_rate": 2.9142490372272143e-05, + "loss": 0.5507, + "step": 9799 + }, + { + "epoch": 12.58023106546855, + "grad_norm": 1.1751108169555664, + "learning_rate": 2.9142062473256314e-05, + "loss": 0.5614, + "step": 9800 + }, + { + "epoch": 12.581514762516047, + "grad_norm": 1.9196465015411377, + "learning_rate": 2.914163457424048e-05, + "loss": 0.5517, + "step": 9801 + }, + { + "epoch": 12.582798459563543, + "grad_norm": 0.823902428150177, + "learning_rate": 2.9141206675224648e-05, + "loss": 0.4985, + "step": 9802 + }, + { + "epoch": 12.58408215661104, + "grad_norm": 1.0114140510559082, + "learning_rate": 2.9140778776208816e-05, + "loss": 0.5161, + "step": 9803 + }, + { + "epoch": 12.585365853658537, + "grad_norm": 
1.9035524129867554, + "learning_rate": 2.914035087719298e-05, + "loss": 0.5287, + "step": 9804 + }, + { + "epoch": 12.586649550706033, + "grad_norm": 0.981289267539978, + "learning_rate": 2.9139922978177153e-05, + "loss": 0.5602, + "step": 9805 + }, + { + "epoch": 12.58793324775353, + "grad_norm": 0.7150043845176697, + "learning_rate": 2.9139495079161318e-05, + "loss": 0.5016, + "step": 9806 + }, + { + "epoch": 12.589216944801027, + "grad_norm": 1.5895307064056396, + "learning_rate": 2.9139067180145487e-05, + "loss": 0.5554, + "step": 9807 + }, + { + "epoch": 12.590500641848523, + "grad_norm": 0.9646741151809692, + "learning_rate": 2.9138639281129655e-05, + "loss": 0.516, + "step": 9808 + }, + { + "epoch": 12.591784338896021, + "grad_norm": 1.2362219095230103, + "learning_rate": 2.9138211382113823e-05, + "loss": 0.536, + "step": 9809 + }, + { + "epoch": 12.593068035943517, + "grad_norm": 1.7818629741668701, + "learning_rate": 2.913778348309799e-05, + "loss": 0.568, + "step": 9810 + }, + { + "epoch": 12.594351732991013, + "grad_norm": 0.8691895008087158, + "learning_rate": 2.9137355584082157e-05, + "loss": 0.5202, + "step": 9811 + }, + { + "epoch": 12.595635430038511, + "grad_norm": 2.557978868484497, + "learning_rate": 2.9136927685066325e-05, + "loss": 0.5274, + "step": 9812 + }, + { + "epoch": 12.596919127086007, + "grad_norm": 2.093129873275757, + "learning_rate": 2.9136499786050494e-05, + "loss": 0.5473, + "step": 9813 + }, + { + "epoch": 12.598202824133505, + "grad_norm": 2.073493480682373, + "learning_rate": 2.9136071887034662e-05, + "loss": 0.5545, + "step": 9814 + }, + { + "epoch": 12.599486521181001, + "grad_norm": 1.5232443809509277, + "learning_rate": 2.9135643988018827e-05, + "loss": 0.5324, + "step": 9815 + }, + { + "epoch": 12.600770218228497, + "grad_norm": 2.3420214653015137, + "learning_rate": 2.9135216089003e-05, + "loss": 0.5456, + "step": 9816 + }, + { + "epoch": 12.602053915275995, + "grad_norm": 1.5651847124099731, + "learning_rate": 
2.9134788189987164e-05, + "loss": 0.5551, + "step": 9817 + }, + { + "epoch": 12.603337612323491, + "grad_norm": 1.903252363204956, + "learning_rate": 2.913436029097133e-05, + "loss": 0.5205, + "step": 9818 + }, + { + "epoch": 12.60462130937099, + "grad_norm": 1.203879952430725, + "learning_rate": 2.91339323919555e-05, + "loss": 0.4969, + "step": 9819 + }, + { + "epoch": 12.605905006418485, + "grad_norm": 1.1215256452560425, + "learning_rate": 2.9133504492939666e-05, + "loss": 0.5046, + "step": 9820 + }, + { + "epoch": 12.607188703465981, + "grad_norm": 1.3271945714950562, + "learning_rate": 2.9133076593923838e-05, + "loss": 0.5968, + "step": 9821 + }, + { + "epoch": 12.60847240051348, + "grad_norm": 1.10805082321167, + "learning_rate": 2.9132648694908003e-05, + "loss": 0.5604, + "step": 9822 + }, + { + "epoch": 12.609756097560975, + "grad_norm": 1.1721491813659668, + "learning_rate": 2.913222079589217e-05, + "loss": 0.5688, + "step": 9823 + }, + { + "epoch": 12.611039794608473, + "grad_norm": 1.0538026094436646, + "learning_rate": 2.913179289687634e-05, + "loss": 0.5471, + "step": 9824 + }, + { + "epoch": 12.61232349165597, + "grad_norm": 3.071180582046509, + "learning_rate": 2.9131364997860504e-05, + "loss": 0.5157, + "step": 9825 + }, + { + "epoch": 12.613607188703465, + "grad_norm": 1.2898783683776855, + "learning_rate": 2.9130937098844673e-05, + "loss": 0.5761, + "step": 9826 + }, + { + "epoch": 12.614890885750963, + "grad_norm": 1.4004398584365845, + "learning_rate": 2.913050919982884e-05, + "loss": 0.5523, + "step": 9827 + }, + { + "epoch": 12.61617458279846, + "grad_norm": 1.7236740589141846, + "learning_rate": 2.913008130081301e-05, + "loss": 0.5624, + "step": 9828 + }, + { + "epoch": 12.617458279845957, + "grad_norm": 1.334903359413147, + "learning_rate": 2.9129653401797175e-05, + "loss": 0.5357, + "step": 9829 + }, + { + "epoch": 12.618741976893453, + "grad_norm": 1.2954365015029907, + "learning_rate": 2.9129225502781343e-05, + "loss": 0.5233, + "step": 
9830 + }, + { + "epoch": 12.62002567394095, + "grad_norm": 1.7053076028823853, + "learning_rate": 2.912879760376551e-05, + "loss": 0.5332, + "step": 9831 + }, + { + "epoch": 12.621309370988447, + "grad_norm": 7.21295690536499, + "learning_rate": 2.912836970474968e-05, + "loss": 0.5702, + "step": 9832 + }, + { + "epoch": 12.622593068035943, + "grad_norm": 0.9783334732055664, + "learning_rate": 2.912794180573385e-05, + "loss": 0.5843, + "step": 9833 + }, + { + "epoch": 12.623876765083441, + "grad_norm": 1.010668396949768, + "learning_rate": 2.9127513906718013e-05, + "loss": 0.5203, + "step": 9834 + }, + { + "epoch": 12.625160462130937, + "grad_norm": 1.1640725135803223, + "learning_rate": 2.9127086007702185e-05, + "loss": 0.5368, + "step": 9835 + }, + { + "epoch": 12.626444159178433, + "grad_norm": 3.1619627475738525, + "learning_rate": 2.912665810868635e-05, + "loss": 0.6017, + "step": 9836 + }, + { + "epoch": 12.627727856225931, + "grad_norm": 2.5044424533843994, + "learning_rate": 2.9126230209670515e-05, + "loss": 0.5231, + "step": 9837 + }, + { + "epoch": 12.629011553273427, + "grad_norm": 2.4486987590789795, + "learning_rate": 2.9125802310654687e-05, + "loss": 0.5849, + "step": 9838 + }, + { + "epoch": 12.630295250320923, + "grad_norm": 3.329481601715088, + "learning_rate": 2.9125374411638852e-05, + "loss": 0.6105, + "step": 9839 + }, + { + "epoch": 12.631578947368421, + "grad_norm": 3.542973756790161, + "learning_rate": 2.9124946512623024e-05, + "loss": 0.5747, + "step": 9840 + }, + { + "epoch": 12.632862644415917, + "grad_norm": 1.4452069997787476, + "learning_rate": 2.912451861360719e-05, + "loss": 0.5822, + "step": 9841 + }, + { + "epoch": 12.634146341463415, + "grad_norm": 1.4541614055633545, + "learning_rate": 2.9124090714591357e-05, + "loss": 0.5961, + "step": 9842 + }, + { + "epoch": 12.635430038510911, + "grad_norm": 3.2965283393859863, + "learning_rate": 2.9123662815575526e-05, + "loss": 0.6661, + "step": 9843 + }, + { + "epoch": 12.63671373555841, + 
"grad_norm": 6.2873616218566895, + "learning_rate": 2.912323491655969e-05, + "loss": 0.6586, + "step": 9844 + }, + { + "epoch": 12.637997432605905, + "grad_norm": 1.3560236692428589, + "learning_rate": 2.912280701754386e-05, + "loss": 0.63, + "step": 9845 + }, + { + "epoch": 12.639281129653401, + "grad_norm": 3.526845932006836, + "learning_rate": 2.9122379118528028e-05, + "loss": 0.6281, + "step": 9846 + }, + { + "epoch": 12.6405648267009, + "grad_norm": 2.108487606048584, + "learning_rate": 2.9121951219512196e-05, + "loss": 0.6613, + "step": 9847 + }, + { + "epoch": 12.641848523748395, + "grad_norm": 1.9879671335220337, + "learning_rate": 2.9121523320496364e-05, + "loss": 0.8013, + "step": 9848 + }, + { + "epoch": 12.643132220795891, + "grad_norm": 1.2790731191635132, + "learning_rate": 2.9121095421480533e-05, + "loss": 0.5188, + "step": 9849 + }, + { + "epoch": 12.64441591784339, + "grad_norm": 1.4097083806991577, + "learning_rate": 2.9120667522464698e-05, + "loss": 0.5171, + "step": 9850 + }, + { + "epoch": 12.645699614890885, + "grad_norm": 1.6101727485656738, + "learning_rate": 2.9120239623448866e-05, + "loss": 0.5444, + "step": 9851 + }, + { + "epoch": 12.646983311938383, + "grad_norm": 1.2844852209091187, + "learning_rate": 2.9119811724433035e-05, + "loss": 0.5319, + "step": 9852 + }, + { + "epoch": 12.64826700898588, + "grad_norm": 1.278540849685669, + "learning_rate": 2.91193838254172e-05, + "loss": 0.5941, + "step": 9853 + }, + { + "epoch": 12.649550706033375, + "grad_norm": 3.770631790161133, + "learning_rate": 2.911895592640137e-05, + "loss": 0.5476, + "step": 9854 + }, + { + "epoch": 12.650834403080873, + "grad_norm": 3.6638214588165283, + "learning_rate": 2.9118528027385536e-05, + "loss": 0.5419, + "step": 9855 + }, + { + "epoch": 12.65211810012837, + "grad_norm": 2.705928325653076, + "learning_rate": 2.9118100128369708e-05, + "loss": 0.5169, + "step": 9856 + }, + { + "epoch": 12.653401797175867, + "grad_norm": 1.4573841094970703, + "learning_rate": 
2.9117672229353873e-05, + "loss": 0.5562, + "step": 9857 + }, + { + "epoch": 12.654685494223363, + "grad_norm": 1.2122091054916382, + "learning_rate": 2.911724433033804e-05, + "loss": 0.4965, + "step": 9858 + }, + { + "epoch": 12.65596919127086, + "grad_norm": 1.1418601274490356, + "learning_rate": 2.911681643132221e-05, + "loss": 0.5798, + "step": 9859 + }, + { + "epoch": 12.657252888318357, + "grad_norm": 3.615086555480957, + "learning_rate": 2.9116388532306375e-05, + "loss": 0.5743, + "step": 9860 + }, + { + "epoch": 12.658536585365853, + "grad_norm": 5.176105499267578, + "learning_rate": 2.9115960633290544e-05, + "loss": 0.5282, + "step": 9861 + }, + { + "epoch": 12.659820282413351, + "grad_norm": 2.0120949745178223, + "learning_rate": 2.9115532734274712e-05, + "loss": 0.5512, + "step": 9862 + }, + { + "epoch": 12.661103979460847, + "grad_norm": 2.435513734817505, + "learning_rate": 2.911510483525888e-05, + "loss": 0.5478, + "step": 9863 + }, + { + "epoch": 12.662387676508343, + "grad_norm": 1.2786929607391357, + "learning_rate": 2.911467693624305e-05, + "loss": 0.5706, + "step": 9864 + }, + { + "epoch": 12.663671373555841, + "grad_norm": 2.6584291458129883, + "learning_rate": 2.9114249037227214e-05, + "loss": 0.5337, + "step": 9865 + }, + { + "epoch": 12.664955070603337, + "grad_norm": 1.0891674757003784, + "learning_rate": 2.9113821138211382e-05, + "loss": 0.4829, + "step": 9866 + }, + { + "epoch": 12.666238767650835, + "grad_norm": 1.9561885595321655, + "learning_rate": 2.911339323919555e-05, + "loss": 0.5432, + "step": 9867 + }, + { + "epoch": 12.667522464698331, + "grad_norm": 3.225677251815796, + "learning_rate": 2.911296534017972e-05, + "loss": 0.5137, + "step": 9868 + }, + { + "epoch": 12.668806161745827, + "grad_norm": 1.177063226699829, + "learning_rate": 2.9112537441163884e-05, + "loss": 0.5468, + "step": 9869 + }, + { + "epoch": 12.670089858793325, + "grad_norm": 2.075482130050659, + "learning_rate": 2.9112109542148056e-05, + "loss": 0.5337, + 
"step": 9870 + }, + { + "epoch": 12.671373555840821, + "grad_norm": 1.8756963014602661, + "learning_rate": 2.911168164313222e-05, + "loss": 0.5321, + "step": 9871 + }, + { + "epoch": 12.672657252888317, + "grad_norm": 2.6849591732025146, + "learning_rate": 2.911125374411639e-05, + "loss": 0.5223, + "step": 9872 + }, + { + "epoch": 12.673940949935815, + "grad_norm": 1.0906935930252075, + "learning_rate": 2.9110825845100558e-05, + "loss": 0.5747, + "step": 9873 + }, + { + "epoch": 12.675224646983311, + "grad_norm": 1.9242768287658691, + "learning_rate": 2.9110397946084723e-05, + "loss": 0.5395, + "step": 9874 + }, + { + "epoch": 12.67650834403081, + "grad_norm": 1.8309203386306763, + "learning_rate": 2.9109970047068895e-05, + "loss": 0.5524, + "step": 9875 + }, + { + "epoch": 12.677792041078305, + "grad_norm": 2.1002793312072754, + "learning_rate": 2.910954214805306e-05, + "loss": 0.5201, + "step": 9876 + }, + { + "epoch": 12.679075738125803, + "grad_norm": 1.3506766557693481, + "learning_rate": 2.9109114249037228e-05, + "loss": 0.5518, + "step": 9877 + }, + { + "epoch": 12.6803594351733, + "grad_norm": 1.0008857250213623, + "learning_rate": 2.9108686350021396e-05, + "loss": 0.5657, + "step": 9878 + }, + { + "epoch": 12.681643132220795, + "grad_norm": 2.8765721321105957, + "learning_rate": 2.910825845100556e-05, + "loss": 0.5502, + "step": 9879 + }, + { + "epoch": 12.682926829268293, + "grad_norm": 6.234247207641602, + "learning_rate": 2.9107830551989733e-05, + "loss": 0.5499, + "step": 9880 + }, + { + "epoch": 12.68421052631579, + "grad_norm": 1.3583550453186035, + "learning_rate": 2.9107402652973898e-05, + "loss": 0.581, + "step": 9881 + }, + { + "epoch": 12.685494223363285, + "grad_norm": 3.1342148780822754, + "learning_rate": 2.9106974753958067e-05, + "loss": 0.5882, + "step": 9882 + }, + { + "epoch": 12.686777920410783, + "grad_norm": 1.03035306930542, + "learning_rate": 2.9106546854942235e-05, + "loss": 0.563, + "step": 9883 + }, + { + "epoch": 
12.68806161745828, + "grad_norm": 1.7155042886734009, + "learning_rate": 2.9106118955926403e-05, + "loss": 0.5931, + "step": 9884 + }, + { + "epoch": 12.689345314505777, + "grad_norm": 2.4593799114227295, + "learning_rate": 2.910569105691057e-05, + "loss": 0.5558, + "step": 9885 + }, + { + "epoch": 12.690629011553273, + "grad_norm": 1.3966926336288452, + "learning_rate": 2.9105263157894737e-05, + "loss": 0.5271, + "step": 9886 + }, + { + "epoch": 12.69191270860077, + "grad_norm": 4.540501117706299, + "learning_rate": 2.9104835258878905e-05, + "loss": 0.5569, + "step": 9887 + }, + { + "epoch": 12.693196405648267, + "grad_norm": 2.6033854484558105, + "learning_rate": 2.9104407359863074e-05, + "loss": 0.6179, + "step": 9888 + }, + { + "epoch": 12.694480102695763, + "grad_norm": 7.905585289001465, + "learning_rate": 2.9103979460847242e-05, + "loss": 0.5855, + "step": 9889 + }, + { + "epoch": 12.695763799743261, + "grad_norm": 2.3372421264648438, + "learning_rate": 2.9103551561831407e-05, + "loss": 0.5617, + "step": 9890 + }, + { + "epoch": 12.697047496790757, + "grad_norm": 1.5903109312057495, + "learning_rate": 2.9103123662815576e-05, + "loss": 0.5057, + "step": 9891 + }, + { + "epoch": 12.698331193838253, + "grad_norm": 1.679081916809082, + "learning_rate": 2.9102695763799744e-05, + "loss": 0.5726, + "step": 9892 + }, + { + "epoch": 12.699614890885751, + "grad_norm": 1.9279708862304688, + "learning_rate": 2.910226786478391e-05, + "loss": 0.6046, + "step": 9893 + }, + { + "epoch": 12.700898587933247, + "grad_norm": 5.738786697387695, + "learning_rate": 2.910183996576808e-05, + "loss": 0.6173, + "step": 9894 + }, + { + "epoch": 12.702182284980745, + "grad_norm": 2.1571733951568604, + "learning_rate": 2.9101412066752246e-05, + "loss": 0.5989, + "step": 9895 + }, + { + "epoch": 12.703465982028241, + "grad_norm": 5.405139923095703, + "learning_rate": 2.9100984167736418e-05, + "loss": 0.6139, + "step": 9896 + }, + { + "epoch": 12.704749679075737, + "grad_norm": 
2.012829542160034, + "learning_rate": 2.9100556268720583e-05, + "loss": 0.5955, + "step": 9897 + }, + { + "epoch": 12.706033376123235, + "grad_norm": 2.0224246978759766, + "learning_rate": 2.9100128369704748e-05, + "loss": 0.7403, + "step": 9898 + }, + { + "epoch": 12.707317073170731, + "grad_norm": 1.7646815776824951, + "learning_rate": 2.909970047068892e-05, + "loss": 0.5106, + "step": 9899 + }, + { + "epoch": 12.70860077021823, + "grad_norm": 1.7374991178512573, + "learning_rate": 2.9099272571673084e-05, + "loss": 0.5127, + "step": 9900 + }, + { + "epoch": 12.709884467265725, + "grad_norm": 1.4453405141830444, + "learning_rate": 2.9098844672657253e-05, + "loss": 0.5194, + "step": 9901 + }, + { + "epoch": 12.711168164313221, + "grad_norm": 0.8223535418510437, + "learning_rate": 2.909841677364142e-05, + "loss": 0.5063, + "step": 9902 + }, + { + "epoch": 12.71245186136072, + "grad_norm": 1.5389589071273804, + "learning_rate": 2.909798887462559e-05, + "loss": 0.5016, + "step": 9903 + }, + { + "epoch": 12.713735558408215, + "grad_norm": 1.4768539667129517, + "learning_rate": 2.9097560975609758e-05, + "loss": 0.5533, + "step": 9904 + }, + { + "epoch": 12.715019255455712, + "grad_norm": 2.3264620304107666, + "learning_rate": 2.9097133076593923e-05, + "loss": 0.4928, + "step": 9905 + }, + { + "epoch": 12.71630295250321, + "grad_norm": 82.39496612548828, + "learning_rate": 2.909670517757809e-05, + "loss": 0.5246, + "step": 9906 + }, + { + "epoch": 12.717586649550706, + "grad_norm": 5.725306510925293, + "learning_rate": 2.909627727856226e-05, + "loss": 0.5568, + "step": 9907 + }, + { + "epoch": 12.718870346598203, + "grad_norm": 2.617417573928833, + "learning_rate": 2.909584937954643e-05, + "loss": 0.5271, + "step": 9908 + }, + { + "epoch": 12.7201540436457, + "grad_norm": 2.764084577560425, + "learning_rate": 2.9095421480530593e-05, + "loss": 0.5397, + "step": 9909 + }, + { + "epoch": 12.721437740693197, + "grad_norm": 1.5942407846450806, + "learning_rate": 
2.9094993581514765e-05, + "loss": 0.5292, + "step": 9910 + }, + { + "epoch": 12.722721437740693, + "grad_norm": 1.6053168773651123, + "learning_rate": 2.909456568249893e-05, + "loss": 0.5748, + "step": 9911 + }, + { + "epoch": 12.72400513478819, + "grad_norm": 1.4837566614151, + "learning_rate": 2.90941377834831e-05, + "loss": 0.514, + "step": 9912 + }, + { + "epoch": 12.725288831835687, + "grad_norm": 1.2648855447769165, + "learning_rate": 2.9093709884467267e-05, + "loss": 0.4994, + "step": 9913 + }, + { + "epoch": 12.726572528883183, + "grad_norm": 4.375746726989746, + "learning_rate": 2.9093281985451432e-05, + "loss": 0.5389, + "step": 9914 + }, + { + "epoch": 12.72785622593068, + "grad_norm": 1.1488027572631836, + "learning_rate": 2.9092854086435604e-05, + "loss": 0.4802, + "step": 9915 + }, + { + "epoch": 12.729139922978177, + "grad_norm": 1.493222713470459, + "learning_rate": 2.909242618741977e-05, + "loss": 0.5074, + "step": 9916 + }, + { + "epoch": 12.730423620025674, + "grad_norm": 2.522238254547119, + "learning_rate": 2.9091998288403937e-05, + "loss": 0.5229, + "step": 9917 + }, + { + "epoch": 12.731707317073171, + "grad_norm": 1.9602012634277344, + "learning_rate": 2.9091570389388106e-05, + "loss": 0.499, + "step": 9918 + }, + { + "epoch": 12.732991014120667, + "grad_norm": 1.1854426860809326, + "learning_rate": 2.909114249037227e-05, + "loss": 0.5547, + "step": 9919 + }, + { + "epoch": 12.734274711168164, + "grad_norm": 1.5938427448272705, + "learning_rate": 2.9090714591356443e-05, + "loss": 0.5403, + "step": 9920 + }, + { + "epoch": 12.735558408215661, + "grad_norm": 1.4009979963302612, + "learning_rate": 2.9090286692340608e-05, + "loss": 0.5403, + "step": 9921 + }, + { + "epoch": 12.736842105263158, + "grad_norm": 1.202332615852356, + "learning_rate": 2.9089858793324776e-05, + "loss": 0.548, + "step": 9922 + }, + { + "epoch": 12.738125802310655, + "grad_norm": 1.5275545120239258, + "learning_rate": 2.9089430894308944e-05, + "loss": 0.5441, + "step": 
9923 + }, + { + "epoch": 12.739409499358151, + "grad_norm": 1.6878167390823364, + "learning_rate": 2.9089002995293113e-05, + "loss": 0.5624, + "step": 9924 + }, + { + "epoch": 12.740693196405648, + "grad_norm": 3.6214444637298584, + "learning_rate": 2.9088575096277278e-05, + "loss": 0.5596, + "step": 9925 + }, + { + "epoch": 12.741976893453145, + "grad_norm": 1.6898174285888672, + "learning_rate": 2.9088147197261446e-05, + "loss": 0.5303, + "step": 9926 + }, + { + "epoch": 12.743260590500642, + "grad_norm": 4.722596168518066, + "learning_rate": 2.9087719298245615e-05, + "loss": 0.5879, + "step": 9927 + }, + { + "epoch": 12.74454428754814, + "grad_norm": 1.9631633758544922, + "learning_rate": 2.9087291399229783e-05, + "loss": 0.5523, + "step": 9928 + }, + { + "epoch": 12.745827984595635, + "grad_norm": 1.7433916330337524, + "learning_rate": 2.908686350021395e-05, + "loss": 0.5349, + "step": 9929 + }, + { + "epoch": 12.747111681643132, + "grad_norm": 1.4310013055801392, + "learning_rate": 2.9086435601198117e-05, + "loss": 0.5106, + "step": 9930 + }, + { + "epoch": 12.74839537869063, + "grad_norm": 2.943303346633911, + "learning_rate": 2.908600770218229e-05, + "loss": 0.5328, + "step": 9931 + }, + { + "epoch": 12.749679075738126, + "grad_norm": 3.72552752494812, + "learning_rate": 2.9085579803166453e-05, + "loss": 0.554, + "step": 9932 + }, + { + "epoch": 12.750962772785623, + "grad_norm": 2.127211093902588, + "learning_rate": 2.908515190415062e-05, + "loss": 0.5937, + "step": 9933 + }, + { + "epoch": 12.75224646983312, + "grad_norm": 1.2912412881851196, + "learning_rate": 2.908472400513479e-05, + "loss": 0.5129, + "step": 9934 + }, + { + "epoch": 12.753530166880616, + "grad_norm": 1.8106763362884521, + "learning_rate": 2.9084296106118955e-05, + "loss": 0.5802, + "step": 9935 + }, + { + "epoch": 12.754813863928113, + "grad_norm": 3.9530041217803955, + "learning_rate": 2.9083868207103127e-05, + "loss": 0.5214, + "step": 9936 + }, + { + "epoch": 12.75609756097561, + 
"grad_norm": 3.5107486248016357, + "learning_rate": 2.9083440308087292e-05, + "loss": 0.5573, + "step": 9937 + }, + { + "epoch": 12.757381258023106, + "grad_norm": 3.107779026031494, + "learning_rate": 2.908301240907146e-05, + "loss": 0.6066, + "step": 9938 + }, + { + "epoch": 12.758664955070603, + "grad_norm": 2.7912163734436035, + "learning_rate": 2.908258451005563e-05, + "loss": 0.5756, + "step": 9939 + }, + { + "epoch": 12.7599486521181, + "grad_norm": 2.273197889328003, + "learning_rate": 2.9082156611039794e-05, + "loss": 0.5876, + "step": 9940 + }, + { + "epoch": 12.761232349165597, + "grad_norm": 2.1829652786254883, + "learning_rate": 2.9081728712023962e-05, + "loss": 0.6395, + "step": 9941 + }, + { + "epoch": 12.762516046213094, + "grad_norm": 1.415717363357544, + "learning_rate": 2.908130081300813e-05, + "loss": 0.59, + "step": 9942 + }, + { + "epoch": 12.763799743260591, + "grad_norm": 2.855527877807617, + "learning_rate": 2.90808729139923e-05, + "loss": 0.6101, + "step": 9943 + }, + { + "epoch": 12.765083440308088, + "grad_norm": 5.150649547576904, + "learning_rate": 2.9080445014976468e-05, + "loss": 0.6054, + "step": 9944 + }, + { + "epoch": 12.766367137355584, + "grad_norm": 9.956789016723633, + "learning_rate": 2.9080017115960636e-05, + "loss": 0.5608, + "step": 9945 + }, + { + "epoch": 12.767650834403081, + "grad_norm": 2.6423141956329346, + "learning_rate": 2.90795892169448e-05, + "loss": 0.63, + "step": 9946 + }, + { + "epoch": 12.768934531450578, + "grad_norm": 4.472597122192383, + "learning_rate": 2.907916131792897e-05, + "loss": 0.6826, + "step": 9947 + }, + { + "epoch": 12.770218228498074, + "grad_norm": 2.3420422077178955, + "learning_rate": 2.9078733418913138e-05, + "loss": 0.7803, + "step": 9948 + }, + { + "epoch": 12.771501925545572, + "grad_norm": 1.632630705833435, + "learning_rate": 2.9078305519897303e-05, + "loss": 0.5247, + "step": 9949 + }, + { + "epoch": 12.772785622593068, + "grad_norm": 3.544916868209839, + "learning_rate": 
2.9077877620881475e-05, + "loss": 0.5234, + "step": 9950 + }, + { + "epoch": 12.774069319640565, + "grad_norm": 2.026320219039917, + "learning_rate": 2.907744972186564e-05, + "loss": 0.5103, + "step": 9951 + }, + { + "epoch": 12.775353016688062, + "grad_norm": 1.5161261558532715, + "learning_rate": 2.9077021822849808e-05, + "loss": 0.5519, + "step": 9952 + }, + { + "epoch": 12.776636713735558, + "grad_norm": 0.9760687351226807, + "learning_rate": 2.9076593923833976e-05, + "loss": 0.5278, + "step": 9953 + }, + { + "epoch": 12.777920410783056, + "grad_norm": 1.1118630170822144, + "learning_rate": 2.907616602481814e-05, + "loss": 0.5418, + "step": 9954 + }, + { + "epoch": 12.779204107830552, + "grad_norm": 1.8736536502838135, + "learning_rate": 2.9075738125802313e-05, + "loss": 0.5764, + "step": 9955 + }, + { + "epoch": 12.78048780487805, + "grad_norm": 1.8611979484558105, + "learning_rate": 2.9075310226786478e-05, + "loss": 0.6072, + "step": 9956 + }, + { + "epoch": 12.781771501925546, + "grad_norm": 1.5070512294769287, + "learning_rate": 2.9074882327770647e-05, + "loss": 0.5849, + "step": 9957 + }, + { + "epoch": 12.783055198973042, + "grad_norm": 3.1976735591888428, + "learning_rate": 2.9074454428754815e-05, + "loss": 0.5428, + "step": 9958 + }, + { + "epoch": 12.78433889602054, + "grad_norm": 1.2086366415023804, + "learning_rate": 2.907402652973898e-05, + "loss": 0.5057, + "step": 9959 + }, + { + "epoch": 12.785622593068036, + "grad_norm": 0.8802750110626221, + "learning_rate": 2.9073598630723152e-05, + "loss": 0.5441, + "step": 9960 + }, + { + "epoch": 12.786906290115533, + "grad_norm": 2.0542314052581787, + "learning_rate": 2.9073170731707317e-05, + "loss": 0.5292, + "step": 9961 + }, + { + "epoch": 12.78818998716303, + "grad_norm": 2.8117916584014893, + "learning_rate": 2.9072742832691485e-05, + "loss": 0.6013, + "step": 9962 + }, + { + "epoch": 12.789473684210526, + "grad_norm": 4.763435363769531, + "learning_rate": 2.9072314933675654e-05, + "loss": 0.4969, + 
"step": 9963 + }, + { + "epoch": 12.790757381258024, + "grad_norm": 1.1295428276062012, + "learning_rate": 2.9071887034659822e-05, + "loss": 0.5219, + "step": 9964 + }, + { + "epoch": 12.79204107830552, + "grad_norm": 9.408763885498047, + "learning_rate": 2.9071459135643987e-05, + "loss": 0.5366, + "step": 9965 + }, + { + "epoch": 12.793324775353017, + "grad_norm": 1.3414793014526367, + "learning_rate": 2.9071031236628156e-05, + "loss": 0.5774, + "step": 9966 + }, + { + "epoch": 12.794608472400514, + "grad_norm": 1.7442514896392822, + "learning_rate": 2.9070603337612324e-05, + "loss": 0.505, + "step": 9967 + }, + { + "epoch": 12.79589216944801, + "grad_norm": 2.0397932529449463, + "learning_rate": 2.9070175438596492e-05, + "loss": 0.5463, + "step": 9968 + }, + { + "epoch": 12.797175866495508, + "grad_norm": 3.1315486431121826, + "learning_rate": 2.906974753958066e-05, + "loss": 0.5288, + "step": 9969 + }, + { + "epoch": 12.798459563543004, + "grad_norm": 4.665079593658447, + "learning_rate": 2.9069319640564826e-05, + "loss": 0.5935, + "step": 9970 + }, + { + "epoch": 12.7997432605905, + "grad_norm": 3.6450467109680176, + "learning_rate": 2.9068891741548998e-05, + "loss": 0.5125, + "step": 9971 + }, + { + "epoch": 12.801026957637998, + "grad_norm": 1.1353027820587158, + "learning_rate": 2.9068463842533163e-05, + "loss": 0.5276, + "step": 9972 + }, + { + "epoch": 12.802310654685494, + "grad_norm": 3.0697543621063232, + "learning_rate": 2.9068035943517328e-05, + "loss": 0.5483, + "step": 9973 + }, + { + "epoch": 12.803594351732992, + "grad_norm": 3.3623976707458496, + "learning_rate": 2.90676080445015e-05, + "loss": 0.5249, + "step": 9974 + }, + { + "epoch": 12.804878048780488, + "grad_norm": 3.3071272373199463, + "learning_rate": 2.9067180145485665e-05, + "loss": 0.5545, + "step": 9975 + }, + { + "epoch": 12.806161745827985, + "grad_norm": 2.0765621662139893, + "learning_rate": 2.9066752246469836e-05, + "loss": 0.5582, + "step": 9976 + }, + { + "epoch": 
12.807445442875482, + "grad_norm": 1.0432965755462646, + "learning_rate": 2.9066324347454e-05, + "loss": 0.5069, + "step": 9977 + }, + { + "epoch": 12.808729139922978, + "grad_norm": 1.2842793464660645, + "learning_rate": 2.906589644843817e-05, + "loss": 0.5134, + "step": 9978 + }, + { + "epoch": 12.810012836970476, + "grad_norm": 1.1822627782821655, + "learning_rate": 2.9065468549422338e-05, + "loss": 0.5829, + "step": 9979 + }, + { + "epoch": 12.811296534017972, + "grad_norm": 1.1364959478378296, + "learning_rate": 2.9065040650406503e-05, + "loss": 0.5867, + "step": 9980 + }, + { + "epoch": 12.812580231065468, + "grad_norm": 1.2113720178604126, + "learning_rate": 2.906461275139067e-05, + "loss": 0.5603, + "step": 9981 + }, + { + "epoch": 12.813863928112966, + "grad_norm": 1.972833275794983, + "learning_rate": 2.906418485237484e-05, + "loss": 0.5317, + "step": 9982 + }, + { + "epoch": 12.815147625160462, + "grad_norm": 3.35062837600708, + "learning_rate": 2.906375695335901e-05, + "loss": 0.5497, + "step": 9983 + }, + { + "epoch": 12.81643132220796, + "grad_norm": 1.8670600652694702, + "learning_rate": 2.9063329054343177e-05, + "loss": 0.5285, + "step": 9984 + }, + { + "epoch": 12.817715019255456, + "grad_norm": 2.797954559326172, + "learning_rate": 2.9062901155327345e-05, + "loss": 0.5507, + "step": 9985 + }, + { + "epoch": 12.818998716302952, + "grad_norm": 1.1899360418319702, + "learning_rate": 2.906247325631151e-05, + "loss": 0.5508, + "step": 9986 + }, + { + "epoch": 12.82028241335045, + "grad_norm": 3.0388588905334473, + "learning_rate": 2.906204535729568e-05, + "loss": 0.5336, + "step": 9987 + }, + { + "epoch": 12.821566110397946, + "grad_norm": 16.541744232177734, + "learning_rate": 2.9061617458279847e-05, + "loss": 0.5692, + "step": 9988 + }, + { + "epoch": 12.822849807445444, + "grad_norm": 6.418515682220459, + "learning_rate": 2.9061189559264012e-05, + "loss": 0.57, + "step": 9989 + }, + { + "epoch": 12.82413350449294, + "grad_norm": 1.9208139181137085, 
+ "learning_rate": 2.9060761660248184e-05, + "loss": 0.6165, + "step": 9990 + }, + { + "epoch": 12.825417201540436, + "grad_norm": 1.6118370294570923, + "learning_rate": 2.906033376123235e-05, + "loss": 0.5812, + "step": 9991 + }, + { + "epoch": 12.826700898587934, + "grad_norm": 2.5122148990631104, + "learning_rate": 2.905990586221652e-05, + "loss": 0.5665, + "step": 9992 + }, + { + "epoch": 12.82798459563543, + "grad_norm": 7.669635772705078, + "learning_rate": 2.9059477963200686e-05, + "loss": 0.587, + "step": 9993 + }, + { + "epoch": 12.829268292682928, + "grad_norm": 1.6185964345932007, + "learning_rate": 2.905905006418485e-05, + "loss": 0.5831, + "step": 9994 + }, + { + "epoch": 12.830551989730424, + "grad_norm": 3.8938727378845215, + "learning_rate": 2.9058622165169023e-05, + "loss": 0.6303, + "step": 9995 + }, + { + "epoch": 12.83183568677792, + "grad_norm": 12.943095207214355, + "learning_rate": 2.9058194266153188e-05, + "loss": 0.6622, + "step": 9996 + }, + { + "epoch": 12.833119383825418, + "grad_norm": 2.7880468368530273, + "learning_rate": 2.9057766367137356e-05, + "loss": 0.7476, + "step": 9997 + }, + { + "epoch": 12.834403080872914, + "grad_norm": 4.067072868347168, + "learning_rate": 2.9057338468121524e-05, + "loss": 0.7996, + "step": 9998 + }, + { + "epoch": 12.835686777920412, + "grad_norm": 1.07874596118927, + "learning_rate": 2.9056910569105693e-05, + "loss": 0.5121, + "step": 9999 + }, + { + "epoch": 12.836970474967908, + "grad_norm": 2.436324119567871, + "learning_rate": 2.905648267008986e-05, + "loss": 0.5047, + "step": 10000 + }, + { + "epoch": 12.836970474967908, + "eval_cer": 0.29138926962991973, + "eval_loss": 0.5422592759132385, + "eval_runtime": 13.6652, + "eval_samples_per_second": 71.935, + "eval_steps_per_second": 0.512, + "eval_wer": 0.5197202476496217, + "step": 10000 + }, + { + "epoch": 12.838254172015404, + "grad_norm": 1.7507082223892212, + "learning_rate": 2.9056054771074026e-05, + "loss": 0.5073, + "step": 10001 + }, + { + 
"epoch": 12.839537869062902, + "grad_norm": 5.078795909881592, + "learning_rate": 2.9055626872058195e-05, + "loss": 0.5344, + "step": 10002 + }, + { + "epoch": 12.840821566110398, + "grad_norm": 2.5769524574279785, + "learning_rate": 2.9055198973042363e-05, + "loss": 0.512, + "step": 10003 + }, + { + "epoch": 12.842105263157894, + "grad_norm": 1.9224584102630615, + "learning_rate": 2.905477107402653e-05, + "loss": 0.501, + "step": 10004 + }, + { + "epoch": 12.843388960205392, + "grad_norm": 3.0099174976348877, + "learning_rate": 2.9054343175010697e-05, + "loss": 0.5737, + "step": 10005 + }, + { + "epoch": 12.844672657252888, + "grad_norm": 1.8674179315567017, + "learning_rate": 2.905391527599487e-05, + "loss": 0.5438, + "step": 10006 + }, + { + "epoch": 12.845956354300386, + "grad_norm": 2.3001511096954346, + "learning_rate": 2.9053487376979033e-05, + "loss": 0.537, + "step": 10007 + }, + { + "epoch": 12.847240051347882, + "grad_norm": 4.534532070159912, + "learning_rate": 2.9053059477963202e-05, + "loss": 0.5826, + "step": 10008 + }, + { + "epoch": 12.84852374839538, + "grad_norm": 2.6724581718444824, + "learning_rate": 2.905263157894737e-05, + "loss": 0.5231, + "step": 10009 + }, + { + "epoch": 12.849807445442876, + "grad_norm": 1.2574824094772339, + "learning_rate": 2.9052203679931535e-05, + "loss": 0.5413, + "step": 10010 + }, + { + "epoch": 12.851091142490372, + "grad_norm": 1.4348106384277344, + "learning_rate": 2.9051775780915707e-05, + "loss": 0.5194, + "step": 10011 + }, + { + "epoch": 12.85237483953787, + "grad_norm": 2.6584513187408447, + "learning_rate": 2.9051347881899872e-05, + "loss": 0.5425, + "step": 10012 + }, + { + "epoch": 12.853658536585366, + "grad_norm": 2.6938135623931885, + "learning_rate": 2.905091998288404e-05, + "loss": 0.5607, + "step": 10013 + }, + { + "epoch": 12.854942233632862, + "grad_norm": 1.681818962097168, + "learning_rate": 2.905049208386821e-05, + "loss": 0.5114, + "step": 10014 + }, + { + "epoch": 12.85622593068036, + 
"grad_norm": 1.831701636314392, + "learning_rate": 2.9050064184852374e-05, + "loss": 0.52, + "step": 10015 + }, + { + "epoch": 12.857509627727856, + "grad_norm": 3.0458109378814697, + "learning_rate": 2.9049636285836546e-05, + "loss": 0.5107, + "step": 10016 + }, + { + "epoch": 12.858793324775354, + "grad_norm": 1.9059815406799316, + "learning_rate": 2.904920838682071e-05, + "loss": 0.5178, + "step": 10017 + }, + { + "epoch": 12.86007702182285, + "grad_norm": 2.9536805152893066, + "learning_rate": 2.904878048780488e-05, + "loss": 0.5037, + "step": 10018 + }, + { + "epoch": 12.861360718870346, + "grad_norm": 5.140941619873047, + "learning_rate": 2.9048352588789048e-05, + "loss": 0.5242, + "step": 10019 + }, + { + "epoch": 12.862644415917844, + "grad_norm": 13.175456047058105, + "learning_rate": 2.9047924689773213e-05, + "loss": 0.5305, + "step": 10020 + }, + { + "epoch": 12.86392811296534, + "grad_norm": 7.538053512573242, + "learning_rate": 2.904749679075738e-05, + "loss": 0.5542, + "step": 10021 + }, + { + "epoch": 12.865211810012838, + "grad_norm": 7.126104831695557, + "learning_rate": 2.904706889174155e-05, + "loss": 0.515, + "step": 10022 + }, + { + "epoch": 12.866495507060334, + "grad_norm": 5.460160732269287, + "learning_rate": 2.9046640992725718e-05, + "loss": 0.5299, + "step": 10023 + }, + { + "epoch": 12.86777920410783, + "grad_norm": 2.619340419769287, + "learning_rate": 2.9046213093709883e-05, + "loss": 0.5466, + "step": 10024 + }, + { + "epoch": 12.869062901155328, + "grad_norm": 1.7011877298355103, + "learning_rate": 2.9045785194694055e-05, + "loss": 0.5373, + "step": 10025 + }, + { + "epoch": 12.870346598202824, + "grad_norm": 1.1039377450942993, + "learning_rate": 2.904535729567822e-05, + "loss": 0.4685, + "step": 10026 + }, + { + "epoch": 12.871630295250322, + "grad_norm": 7.426340579986572, + "learning_rate": 2.9044929396662388e-05, + "loss": 0.5319, + "step": 10027 + }, + { + "epoch": 12.872913992297818, + "grad_norm": 1.5735223293304443, + 
"learning_rate": 2.9044501497646556e-05, + "loss": 0.5495, + "step": 10028 + }, + { + "epoch": 12.874197689345314, + "grad_norm": 2.3846046924591064, + "learning_rate": 2.904407359863072e-05, + "loss": 0.5203, + "step": 10029 + }, + { + "epoch": 12.875481386392812, + "grad_norm": 3.974691867828369, + "learning_rate": 2.9043645699614893e-05, + "loss": 0.605, + "step": 10030 + }, + { + "epoch": 12.876765083440308, + "grad_norm": 70.38166046142578, + "learning_rate": 2.904321780059906e-05, + "loss": 0.6252, + "step": 10031 + }, + { + "epoch": 12.878048780487806, + "grad_norm": 2.0622451305389404, + "learning_rate": 2.9042789901583227e-05, + "loss": 0.5716, + "step": 10032 + }, + { + "epoch": 12.879332477535302, + "grad_norm": 2.671604633331299, + "learning_rate": 2.9042362002567395e-05, + "loss": 0.5707, + "step": 10033 + }, + { + "epoch": 12.880616174582798, + "grad_norm": 1.8239048719406128, + "learning_rate": 2.904193410355156e-05, + "loss": 0.5376, + "step": 10034 + }, + { + "epoch": 12.881899871630296, + "grad_norm": 1.0325870513916016, + "learning_rate": 2.9041506204535732e-05, + "loss": 0.6145, + "step": 10035 + }, + { + "epoch": 12.883183568677792, + "grad_norm": 1.8856481313705444, + "learning_rate": 2.9041078305519897e-05, + "loss": 0.6259, + "step": 10036 + }, + { + "epoch": 12.88446726572529, + "grad_norm": 1.6942425966262817, + "learning_rate": 2.9040650406504065e-05, + "loss": 0.5638, + "step": 10037 + }, + { + "epoch": 12.885750962772786, + "grad_norm": 2.0772147178649902, + "learning_rate": 2.9040222507488234e-05, + "loss": 0.5892, + "step": 10038 + }, + { + "epoch": 12.887034659820282, + "grad_norm": 2.6226680278778076, + "learning_rate": 2.9039794608472402e-05, + "loss": 0.5895, + "step": 10039 + }, + { + "epoch": 12.88831835686778, + "grad_norm": 1.3246197700500488, + "learning_rate": 2.9039366709456567e-05, + "loss": 0.5627, + "step": 10040 + }, + { + "epoch": 12.889602053915276, + "grad_norm": 1.8367712497711182, + "learning_rate": 
2.9038938810440736e-05, + "loss": 0.6035, + "step": 10041 + }, + { + "epoch": 12.890885750962774, + "grad_norm": 6.531193733215332, + "learning_rate": 2.9038510911424904e-05, + "loss": 0.5677, + "step": 10042 + }, + { + "epoch": 12.89216944801027, + "grad_norm": 3.6095774173736572, + "learning_rate": 2.9038083012409073e-05, + "loss": 0.5953, + "step": 10043 + }, + { + "epoch": 12.893453145057766, + "grad_norm": 2.7294764518737793, + "learning_rate": 2.903765511339324e-05, + "loss": 0.5953, + "step": 10044 + }, + { + "epoch": 12.894736842105264, + "grad_norm": 1.386037826538086, + "learning_rate": 2.9037227214377406e-05, + "loss": 0.6177, + "step": 10045 + }, + { + "epoch": 12.89602053915276, + "grad_norm": 5.206795692443848, + "learning_rate": 2.9036799315361578e-05, + "loss": 0.6492, + "step": 10046 + }, + { + "epoch": 12.897304236200256, + "grad_norm": 2.3114535808563232, + "learning_rate": 2.9036371416345743e-05, + "loss": 0.7376, + "step": 10047 + }, + { + "epoch": 12.898587933247754, + "grad_norm": 4.815097808837891, + "learning_rate": 2.9035943517329908e-05, + "loss": 0.7061, + "step": 10048 + }, + { + "epoch": 12.89987163029525, + "grad_norm": 1.2690138816833496, + "learning_rate": 2.903551561831408e-05, + "loss": 0.5059, + "step": 10049 + }, + { + "epoch": 12.901155327342748, + "grad_norm": 0.9859081506729126, + "learning_rate": 2.9035087719298245e-05, + "loss": 0.49, + "step": 10050 + }, + { + "epoch": 12.902439024390244, + "grad_norm": 1.41612708568573, + "learning_rate": 2.9034659820282416e-05, + "loss": 0.5041, + "step": 10051 + }, + { + "epoch": 12.90372272143774, + "grad_norm": 2.1070363521575928, + "learning_rate": 2.903423192126658e-05, + "loss": 0.5408, + "step": 10052 + }, + { + "epoch": 12.905006418485238, + "grad_norm": 2.0167219638824463, + "learning_rate": 2.903380402225075e-05, + "loss": 0.5377, + "step": 10053 + }, + { + "epoch": 12.906290115532734, + "grad_norm": 3.0588860511779785, + "learning_rate": 2.9033376123234918e-05, + "loss": 
0.4742, + "step": 10054 + }, + { + "epoch": 12.907573812580232, + "grad_norm": 1.2264443635940552, + "learning_rate": 2.9032948224219083e-05, + "loss": 0.5277, + "step": 10055 + }, + { + "epoch": 12.908857509627728, + "grad_norm": 1.4590067863464355, + "learning_rate": 2.9032520325203252e-05, + "loss": 0.5412, + "step": 10056 + }, + { + "epoch": 12.910141206675224, + "grad_norm": 0.9610728621482849, + "learning_rate": 2.903209242618742e-05, + "loss": 0.5085, + "step": 10057 + }, + { + "epoch": 12.911424903722722, + "grad_norm": 6.4997477531433105, + "learning_rate": 2.903166452717159e-05, + "loss": 0.5319, + "step": 10058 + }, + { + "epoch": 12.912708600770218, + "grad_norm": 2.8415019512176514, + "learning_rate": 2.9031236628155757e-05, + "loss": 0.5077, + "step": 10059 + }, + { + "epoch": 12.913992297817716, + "grad_norm": 1.4525671005249023, + "learning_rate": 2.9030808729139925e-05, + "loss": 0.592, + "step": 10060 + }, + { + "epoch": 12.915275994865212, + "grad_norm": 1.074711799621582, + "learning_rate": 2.903038083012409e-05, + "loss": 0.4994, + "step": 10061 + }, + { + "epoch": 12.916559691912708, + "grad_norm": 3.5421948432922363, + "learning_rate": 2.902995293110826e-05, + "loss": 0.5486, + "step": 10062 + }, + { + "epoch": 12.917843388960206, + "grad_norm": 3.1796677112579346, + "learning_rate": 2.9029525032092427e-05, + "loss": 0.5151, + "step": 10063 + }, + { + "epoch": 12.919127086007702, + "grad_norm": 0.989539623260498, + "learning_rate": 2.9029097133076592e-05, + "loss": 0.5523, + "step": 10064 + }, + { + "epoch": 12.9204107830552, + "grad_norm": 1.3603894710540771, + "learning_rate": 2.9028669234060764e-05, + "loss": 0.5476, + "step": 10065 + }, + { + "epoch": 12.921694480102696, + "grad_norm": 2.8306257724761963, + "learning_rate": 2.902824133504493e-05, + "loss": 0.5326, + "step": 10066 + }, + { + "epoch": 12.922978177150192, + "grad_norm": 2.1796586513519287, + "learning_rate": 2.90278134360291e-05, + "loss": 0.5566, + "step": 10067 + }, + { + 
"epoch": 12.92426187419769, + "grad_norm": 2.445023536682129, + "learning_rate": 2.9027385537013266e-05, + "loss": 0.5306, + "step": 10068 + }, + { + "epoch": 12.925545571245186, + "grad_norm": 1.8948203325271606, + "learning_rate": 2.902695763799743e-05, + "loss": 0.5511, + "step": 10069 + }, + { + "epoch": 12.926829268292684, + "grad_norm": 2.676365852355957, + "learning_rate": 2.9026529738981603e-05, + "loss": 0.4768, + "step": 10070 + }, + { + "epoch": 12.92811296534018, + "grad_norm": 2.953214406967163, + "learning_rate": 2.9026101839965768e-05, + "loss": 0.4971, + "step": 10071 + }, + { + "epoch": 12.929396662387676, + "grad_norm": 7.43635892868042, + "learning_rate": 2.9025673940949936e-05, + "loss": 0.5266, + "step": 10072 + }, + { + "epoch": 12.930680359435174, + "grad_norm": 1.5006086826324463, + "learning_rate": 2.9025246041934105e-05, + "loss": 0.53, + "step": 10073 + }, + { + "epoch": 12.93196405648267, + "grad_norm": 1.3674448728561401, + "learning_rate": 2.9024818142918273e-05, + "loss": 0.5348, + "step": 10074 + }, + { + "epoch": 12.933247753530168, + "grad_norm": 1.2390037775039673, + "learning_rate": 2.902439024390244e-05, + "loss": 0.5298, + "step": 10075 + }, + { + "epoch": 12.934531450577664, + "grad_norm": 5.840522289276123, + "learning_rate": 2.9023962344886606e-05, + "loss": 0.5549, + "step": 10076 + }, + { + "epoch": 12.93581514762516, + "grad_norm": 1.2818578481674194, + "learning_rate": 2.9023534445870775e-05, + "loss": 0.5611, + "step": 10077 + }, + { + "epoch": 12.937098844672658, + "grad_norm": 2.699026346206665, + "learning_rate": 2.9023106546854943e-05, + "loss": 0.5146, + "step": 10078 + }, + { + "epoch": 12.938382541720154, + "grad_norm": 1.7225418090820312, + "learning_rate": 2.902267864783911e-05, + "loss": 0.5574, + "step": 10079 + }, + { + "epoch": 12.93966623876765, + "grad_norm": 2.4412930011749268, + "learning_rate": 2.9022250748823277e-05, + "loss": 0.5166, + "step": 10080 + }, + { + "epoch": 12.940949935815148, + 
"grad_norm": 1.3791109323501587, + "learning_rate": 2.9021822849807445e-05, + "loss": 0.5504, + "step": 10081 + }, + { + "epoch": 12.942233632862644, + "grad_norm": 1.0416544675827026, + "learning_rate": 2.9021394950791613e-05, + "loss": 0.5702, + "step": 10082 + }, + { + "epoch": 12.943517329910142, + "grad_norm": 1.6929295063018799, + "learning_rate": 2.9020967051775782e-05, + "loss": 0.5306, + "step": 10083 + }, + { + "epoch": 12.944801026957638, + "grad_norm": 2.9926559925079346, + "learning_rate": 2.902053915275995e-05, + "loss": 0.5693, + "step": 10084 + }, + { + "epoch": 12.946084724005134, + "grad_norm": 0.9181146621704102, + "learning_rate": 2.9020111253744115e-05, + "loss": 0.5563, + "step": 10085 + }, + { + "epoch": 12.947368421052632, + "grad_norm": 3.643404722213745, + "learning_rate": 2.9019683354728287e-05, + "loss": 0.5645, + "step": 10086 + }, + { + "epoch": 12.948652118100128, + "grad_norm": 1.3602854013442993, + "learning_rate": 2.9019255455712452e-05, + "loss": 0.5429, + "step": 10087 + }, + { + "epoch": 12.949935815147626, + "grad_norm": 1.9356181621551514, + "learning_rate": 2.9018827556696617e-05, + "loss": 0.5272, + "step": 10088 + }, + { + "epoch": 12.951219512195122, + "grad_norm": 1.6229817867279053, + "learning_rate": 2.901839965768079e-05, + "loss": 0.5546, + "step": 10089 + }, + { + "epoch": 12.952503209242618, + "grad_norm": 2.6153318881988525, + "learning_rate": 2.9017971758664954e-05, + "loss": 0.5279, + "step": 10090 + }, + { + "epoch": 12.953786906290116, + "grad_norm": 6.6273064613342285, + "learning_rate": 2.9017543859649126e-05, + "loss": 0.5765, + "step": 10091 + }, + { + "epoch": 12.955070603337612, + "grad_norm": 4.649940013885498, + "learning_rate": 2.901711596063329e-05, + "loss": 0.6134, + "step": 10092 + }, + { + "epoch": 12.95635430038511, + "grad_norm": 4.25803804397583, + "learning_rate": 2.901668806161746e-05, + "loss": 0.5969, + "step": 10093 + }, + { + "epoch": 12.957637997432606, + "grad_norm": 2.490831136703491, 
+ "learning_rate": 2.9016260162601628e-05, + "loss": 0.5889, + "step": 10094 + }, + { + "epoch": 12.958921694480102, + "grad_norm": 2.3936474323272705, + "learning_rate": 2.9015832263585793e-05, + "loss": 0.6208, + "step": 10095 + }, + { + "epoch": 12.9602053915276, + "grad_norm": 1.7316945791244507, + "learning_rate": 2.901540436456996e-05, + "loss": 0.6341, + "step": 10096 + }, + { + "epoch": 12.961489088575096, + "grad_norm": 2.072077512741089, + "learning_rate": 2.901497646555413e-05, + "loss": 0.7088, + "step": 10097 + }, + { + "epoch": 12.962772785622594, + "grad_norm": 4.651294708251953, + "learning_rate": 2.9014548566538298e-05, + "loss": 0.8185, + "step": 10098 + }, + { + "epoch": 12.96405648267009, + "grad_norm": 1.4809225797653198, + "learning_rate": 2.9014120667522466e-05, + "loss": 0.5033, + "step": 10099 + }, + { + "epoch": 12.965340179717586, + "grad_norm": 1.0308667421340942, + "learning_rate": 2.9013692768506635e-05, + "loss": 0.5131, + "step": 10100 + }, + { + "epoch": 12.966623876765084, + "grad_norm": 2.3111913204193115, + "learning_rate": 2.90132648694908e-05, + "loss": 0.5249, + "step": 10101 + }, + { + "epoch": 12.96790757381258, + "grad_norm": 1.5136260986328125, + "learning_rate": 2.9012836970474968e-05, + "loss": 0.5215, + "step": 10102 + }, + { + "epoch": 12.969191270860078, + "grad_norm": 1.1814016103744507, + "learning_rate": 2.9012409071459137e-05, + "loss": 0.4932, + "step": 10103 + }, + { + "epoch": 12.970474967907574, + "grad_norm": 0.9406662583351135, + "learning_rate": 2.90119811724433e-05, + "loss": 0.5456, + "step": 10104 + }, + { + "epoch": 12.97175866495507, + "grad_norm": 3.3602678775787354, + "learning_rate": 2.9011553273427473e-05, + "loss": 0.5396, + "step": 10105 + }, + { + "epoch": 12.973042362002568, + "grad_norm": 2.8504269123077393, + "learning_rate": 2.901112537441164e-05, + "loss": 0.5276, + "step": 10106 + }, + { + "epoch": 12.974326059050064, + "grad_norm": 1.0049173831939697, + "learning_rate": 
2.901069747539581e-05, + "loss": 0.5261, + "step": 10107 + }, + { + "epoch": 12.975609756097562, + "grad_norm": 1.559016466140747, + "learning_rate": 2.9010269576379975e-05, + "loss": 0.6055, + "step": 10108 + }, + { + "epoch": 12.976893453145058, + "grad_norm": 1.5728546380996704, + "learning_rate": 2.900984167736414e-05, + "loss": 0.5251, + "step": 10109 + }, + { + "epoch": 12.978177150192554, + "grad_norm": 1.6100902557373047, + "learning_rate": 2.9009413778348312e-05, + "loss": 0.5741, + "step": 10110 + }, + { + "epoch": 12.979460847240052, + "grad_norm": 1.3384788036346436, + "learning_rate": 2.9008985879332477e-05, + "loss": 0.5879, + "step": 10111 + }, + { + "epoch": 12.980744544287548, + "grad_norm": 4.451927185058594, + "learning_rate": 2.9008557980316645e-05, + "loss": 0.518, + "step": 10112 + }, + { + "epoch": 12.982028241335044, + "grad_norm": 2.390418767929077, + "learning_rate": 2.9008130081300814e-05, + "loss": 0.5337, + "step": 10113 + }, + { + "epoch": 12.983311938382542, + "grad_norm": 1.1462904214859009, + "learning_rate": 2.9007702182284982e-05, + "loss": 0.5322, + "step": 10114 + }, + { + "epoch": 12.984595635430038, + "grad_norm": 1.616537094116211, + "learning_rate": 2.900727428326915e-05, + "loss": 0.5523, + "step": 10115 + }, + { + "epoch": 12.985879332477536, + "grad_norm": 1.0428820848464966, + "learning_rate": 2.9006846384253316e-05, + "loss": 0.64, + "step": 10116 + }, + { + "epoch": 12.987163029525032, + "grad_norm": 1.6379473209381104, + "learning_rate": 2.9006418485237484e-05, + "loss": 0.5593, + "step": 10117 + }, + { + "epoch": 12.988446726572528, + "grad_norm": 2.9291484355926514, + "learning_rate": 2.9005990586221653e-05, + "loss": 0.5672, + "step": 10118 + }, + { + "epoch": 12.989730423620026, + "grad_norm": 2.9036545753479004, + "learning_rate": 2.900556268720582e-05, + "loss": 0.56, + "step": 10119 + }, + { + "epoch": 12.991014120667522, + "grad_norm": 2.8475122451782227, + "learning_rate": 2.9005134788189986e-05, + "loss": 
0.5823, + "step": 10120 + }, + { + "epoch": 12.99229781771502, + "grad_norm": 1.674302339553833, + "learning_rate": 2.9004706889174158e-05, + "loss": 0.5996, + "step": 10121 + }, + { + "epoch": 12.993581514762516, + "grad_norm": 2.470494270324707, + "learning_rate": 2.9004278990158323e-05, + "loss": 0.6067, + "step": 10122 + }, + { + "epoch": 12.994865211810012, + "grad_norm": 1.6811197996139526, + "learning_rate": 2.900385109114249e-05, + "loss": 0.5566, + "step": 10123 + }, + { + "epoch": 12.99614890885751, + "grad_norm": 1.4466307163238525, + "learning_rate": 2.900342319212666e-05, + "loss": 0.6169, + "step": 10124 + }, + { + "epoch": 12.997432605905006, + "grad_norm": 1.5671364068984985, + "learning_rate": 2.9002995293110825e-05, + "loss": 0.6262, + "step": 10125 + }, + { + "epoch": 12.998716302952504, + "grad_norm": 2.974026679992676, + "learning_rate": 2.9002567394094996e-05, + "loss": 0.6174, + "step": 10126 + }, + { + "epoch": 13.0, + "grad_norm": 4.413461208343506, + "learning_rate": 2.900213949507916e-05, + "loss": 0.7152, + "step": 10127 + }, + { + "epoch": 13.001283697047496, + "grad_norm": 1.7490261793136597, + "learning_rate": 2.900171159606333e-05, + "loss": 0.4625, + "step": 10128 + }, + { + "epoch": 13.002567394094994, + "grad_norm": 5.490370273590088, + "learning_rate": 2.90012836970475e-05, + "loss": 0.4718, + "step": 10129 + }, + { + "epoch": 13.00385109114249, + "grad_norm": 1.1969541311264038, + "learning_rate": 2.9000855798031663e-05, + "loss": 0.4978, + "step": 10130 + }, + { + "epoch": 13.005134788189988, + "grad_norm": 1.420733094215393, + "learning_rate": 2.9000427899015835e-05, + "loss": 0.5387, + "step": 10131 + }, + { + "epoch": 13.006418485237484, + "grad_norm": 1.3977705240249634, + "learning_rate": 2.9e-05, + "loss": 0.5089, + "step": 10132 + }, + { + "epoch": 13.00770218228498, + "grad_norm": 1.8465739488601685, + "learning_rate": 2.899957210098417e-05, + "loss": 0.5161, + "step": 10133 + }, + { + "epoch": 13.008985879332478, + 
"grad_norm": 1.8204807043075562, + "learning_rate": 2.8999144201968337e-05, + "loss": 0.5506, + "step": 10134 + }, + { + "epoch": 13.010269576379974, + "grad_norm": 2.15506911277771, + "learning_rate": 2.8998716302952505e-05, + "loss": 0.5472, + "step": 10135 + }, + { + "epoch": 13.011553273427472, + "grad_norm": 1.1416857242584229, + "learning_rate": 2.899828840393667e-05, + "loss": 0.536, + "step": 10136 + }, + { + "epoch": 13.012836970474968, + "grad_norm": 2.290778875350952, + "learning_rate": 2.899786050492084e-05, + "loss": 0.4854, + "step": 10137 + }, + { + "epoch": 13.014120667522464, + "grad_norm": 1.3509814739227295, + "learning_rate": 2.8997432605905007e-05, + "loss": 0.547, + "step": 10138 + }, + { + "epoch": 13.015404364569962, + "grad_norm": 0.9029693603515625, + "learning_rate": 2.8997004706889176e-05, + "loss": 0.494, + "step": 10139 + }, + { + "epoch": 13.016688061617458, + "grad_norm": 1.740221619606018, + "learning_rate": 2.8996576807873344e-05, + "loss": 0.5173, + "step": 10140 + }, + { + "epoch": 13.017971758664956, + "grad_norm": 2.1724131107330322, + "learning_rate": 2.899614890885751e-05, + "loss": 0.5342, + "step": 10141 + }, + { + "epoch": 13.019255455712452, + "grad_norm": 3.9000651836395264, + "learning_rate": 2.8995721009841678e-05, + "loss": 0.5217, + "step": 10142 + }, + { + "epoch": 13.020539152759948, + "grad_norm": 2.8651821613311768, + "learning_rate": 2.8995293110825846e-05, + "loss": 0.5068, + "step": 10143 + }, + { + "epoch": 13.021822849807446, + "grad_norm": 1.1689165830612183, + "learning_rate": 2.899486521181001e-05, + "loss": 0.4956, + "step": 10144 + }, + { + "epoch": 13.023106546854942, + "grad_norm": 1.838078498840332, + "learning_rate": 2.8994437312794183e-05, + "loss": 0.5318, + "step": 10145 + }, + { + "epoch": 13.024390243902438, + "grad_norm": 1.5165326595306396, + "learning_rate": 2.8994009413778348e-05, + "loss": 0.5295, + "step": 10146 + }, + { + "epoch": 13.025673940949936, + "grad_norm": 1.1548798084259033, + 
"learning_rate": 2.899358151476252e-05, + "loss": 0.4894, + "step": 10147 + }, + { + "epoch": 13.026957637997432, + "grad_norm": 1.1578574180603027, + "learning_rate": 2.8993153615746685e-05, + "loss": 0.5252, + "step": 10148 + }, + { + "epoch": 13.02824133504493, + "grad_norm": 2.7028417587280273, + "learning_rate": 2.899272571673085e-05, + "loss": 0.5151, + "step": 10149 + }, + { + "epoch": 13.029525032092426, + "grad_norm": 1.611697793006897, + "learning_rate": 2.899229781771502e-05, + "loss": 0.538, + "step": 10150 + }, + { + "epoch": 13.030808729139922, + "grad_norm": 2.218796968460083, + "learning_rate": 2.8991869918699186e-05, + "loss": 0.5279, + "step": 10151 + }, + { + "epoch": 13.03209242618742, + "grad_norm": 3.9823055267333984, + "learning_rate": 2.8991442019683355e-05, + "loss": 0.5675, + "step": 10152 + }, + { + "epoch": 13.033376123234916, + "grad_norm": 2.9814136028289795, + "learning_rate": 2.8991014120667523e-05, + "loss": 0.596, + "step": 10153 + }, + { + "epoch": 13.034659820282414, + "grad_norm": 1.6424555778503418, + "learning_rate": 2.899058622165169e-05, + "loss": 0.5183, + "step": 10154 + }, + { + "epoch": 13.03594351732991, + "grad_norm": 2.3279099464416504, + "learning_rate": 2.899015832263586e-05, + "loss": 0.5371, + "step": 10155 + }, + { + "epoch": 13.037227214377406, + "grad_norm": 1.3011208772659302, + "learning_rate": 2.8989730423620025e-05, + "loss": 0.5146, + "step": 10156 + }, + { + "epoch": 13.038510911424904, + "grad_norm": 1.2722176313400269, + "learning_rate": 2.8989302524604194e-05, + "loss": 0.5487, + "step": 10157 + }, + { + "epoch": 13.0397946084724, + "grad_norm": 5.585996627807617, + "learning_rate": 2.8988874625588362e-05, + "loss": 0.5329, + "step": 10158 + }, + { + "epoch": 13.041078305519898, + "grad_norm": 2.151096820831299, + "learning_rate": 2.898844672657253e-05, + "loss": 0.538, + "step": 10159 + }, + { + "epoch": 13.042362002567394, + "grad_norm": 1.665557861328125, + "learning_rate": 2.8988018827556695e-05, + 
"loss": 0.6005, + "step": 10160 + }, + { + "epoch": 13.04364569961489, + "grad_norm": 1.7725213766098022, + "learning_rate": 2.8987590928540867e-05, + "loss": 0.575, + "step": 10161 + }, + { + "epoch": 13.044929396662388, + "grad_norm": 2.364687204360962, + "learning_rate": 2.8987163029525032e-05, + "loss": 0.48, + "step": 10162 + }, + { + "epoch": 13.046213093709884, + "grad_norm": 1.6301307678222656, + "learning_rate": 2.89867351305092e-05, + "loss": 0.5766, + "step": 10163 + }, + { + "epoch": 13.047496790757382, + "grad_norm": 2.014991283416748, + "learning_rate": 2.898630723149337e-05, + "loss": 0.5732, + "step": 10164 + }, + { + "epoch": 13.048780487804878, + "grad_norm": 1.7351937294006348, + "learning_rate": 2.8985879332477534e-05, + "loss": 0.5332, + "step": 10165 + }, + { + "epoch": 13.050064184852374, + "grad_norm": 1.4916399717330933, + "learning_rate": 2.8985451433461706e-05, + "loss": 0.5113, + "step": 10166 + }, + { + "epoch": 13.051347881899872, + "grad_norm": 2.635155439376831, + "learning_rate": 2.898502353444587e-05, + "loss": 0.5945, + "step": 10167 + }, + { + "epoch": 13.052631578947368, + "grad_norm": 1.5496705770492554, + "learning_rate": 2.898459563543004e-05, + "loss": 0.5537, + "step": 10168 + }, + { + "epoch": 13.053915275994866, + "grad_norm": 5.540354251861572, + "learning_rate": 2.8984167736414208e-05, + "loss": 0.5459, + "step": 10169 + }, + { + "epoch": 13.055198973042362, + "grad_norm": 2.1422855854034424, + "learning_rate": 2.8983739837398373e-05, + "loss": 0.5113, + "step": 10170 + }, + { + "epoch": 13.056482670089858, + "grad_norm": 2.389526605606079, + "learning_rate": 2.8983311938382545e-05, + "loss": 0.5273, + "step": 10171 + }, + { + "epoch": 13.057766367137356, + "grad_norm": 1.5939605236053467, + "learning_rate": 2.898288403936671e-05, + "loss": 0.5818, + "step": 10172 + }, + { + "epoch": 13.059050064184852, + "grad_norm": 2.0603384971618652, + "learning_rate": 2.8982456140350878e-05, + "loss": 0.6134, + "step": 10173 + }, + 
{ + "epoch": 13.06033376123235, + "grad_norm": 2.526425361633301, + "learning_rate": 2.8982028241335046e-05, + "loss": 0.6406, + "step": 10174 + }, + { + "epoch": 13.061617458279846, + "grad_norm": 2.509079694747925, + "learning_rate": 2.8981600342319215e-05, + "loss": 0.6648, + "step": 10175 + }, + { + "epoch": 13.062901155327342, + "grad_norm": 6.131321907043457, + "learning_rate": 2.898117244330338e-05, + "loss": 0.6344, + "step": 10176 + }, + { + "epoch": 13.06418485237484, + "grad_norm": 3.32222580909729, + "learning_rate": 2.8980744544287548e-05, + "loss": 0.7812, + "step": 10177 + }, + { + "epoch": 13.065468549422336, + "grad_norm": 1.2907732725143433, + "learning_rate": 2.8980316645271717e-05, + "loss": 0.5384, + "step": 10178 + }, + { + "epoch": 13.066752246469832, + "grad_norm": 1.6908845901489258, + "learning_rate": 2.8979888746255885e-05, + "loss": 0.5168, + "step": 10179 + }, + { + "epoch": 13.06803594351733, + "grad_norm": 1.427909255027771, + "learning_rate": 2.8979460847240053e-05, + "loss": 0.5414, + "step": 10180 + }, + { + "epoch": 13.069319640564826, + "grad_norm": 2.861126661300659, + "learning_rate": 2.897903294822422e-05, + "loss": 0.517, + "step": 10181 + }, + { + "epoch": 13.070603337612324, + "grad_norm": 1.174121618270874, + "learning_rate": 2.897860504920839e-05, + "loss": 0.5069, + "step": 10182 + }, + { + "epoch": 13.07188703465982, + "grad_norm": 2.121952533721924, + "learning_rate": 2.8978177150192555e-05, + "loss": 0.5064, + "step": 10183 + }, + { + "epoch": 13.073170731707316, + "grad_norm": 8.309317588806152, + "learning_rate": 2.897774925117672e-05, + "loss": 0.5388, + "step": 10184 + }, + { + "epoch": 13.074454428754814, + "grad_norm": 1.193467140197754, + "learning_rate": 2.8977321352160892e-05, + "loss": 0.4858, + "step": 10185 + }, + { + "epoch": 13.07573812580231, + "grad_norm": 1.6002391576766968, + "learning_rate": 2.8976893453145057e-05, + "loss": 0.5474, + "step": 10186 + }, + { + "epoch": 13.077021822849808, + 
"grad_norm": 1.1012356281280518, + "learning_rate": 2.897646555412923e-05, + "loss": 0.5189, + "step": 10187 + }, + { + "epoch": 13.078305519897304, + "grad_norm": 2.0046775341033936, + "learning_rate": 2.8976037655113394e-05, + "loss": 0.5898, + "step": 10188 + }, + { + "epoch": 13.0795892169448, + "grad_norm": 1.2271846532821655, + "learning_rate": 2.8975609756097562e-05, + "loss": 0.5065, + "step": 10189 + }, + { + "epoch": 13.080872913992298, + "grad_norm": 1.3611818552017212, + "learning_rate": 2.897518185708173e-05, + "loss": 0.5185, + "step": 10190 + }, + { + "epoch": 13.082156611039794, + "grad_norm": 8.116425514221191, + "learning_rate": 2.8974753958065896e-05, + "loss": 0.541, + "step": 10191 + }, + { + "epoch": 13.083440308087292, + "grad_norm": 2.2311840057373047, + "learning_rate": 2.8974326059050064e-05, + "loss": 0.5308, + "step": 10192 + }, + { + "epoch": 13.084724005134788, + "grad_norm": 1.3078029155731201, + "learning_rate": 2.8973898160034233e-05, + "loss": 0.5184, + "step": 10193 + }, + { + "epoch": 13.086007702182284, + "grad_norm": 0.9005863666534424, + "learning_rate": 2.89734702610184e-05, + "loss": 0.528, + "step": 10194 + }, + { + "epoch": 13.087291399229782, + "grad_norm": 2.989384412765503, + "learning_rate": 2.897304236200257e-05, + "loss": 0.5217, + "step": 10195 + }, + { + "epoch": 13.088575096277278, + "grad_norm": 1.0507841110229492, + "learning_rate": 2.8972614462986738e-05, + "loss": 0.4985, + "step": 10196 + }, + { + "epoch": 13.089858793324776, + "grad_norm": 1.3151198625564575, + "learning_rate": 2.8972186563970903e-05, + "loss": 0.4964, + "step": 10197 + }, + { + "epoch": 13.091142490372272, + "grad_norm": 1.0723565816879272, + "learning_rate": 2.897175866495507e-05, + "loss": 0.5076, + "step": 10198 + }, + { + "epoch": 13.092426187419768, + "grad_norm": 0.950162947177887, + "learning_rate": 2.897133076593924e-05, + "loss": 0.5159, + "step": 10199 + }, + { + "epoch": 13.093709884467266, + "grad_norm": 2.0601677894592285, + 
"learning_rate": 2.8970902866923405e-05, + "loss": 0.4902, + "step": 10200 + }, + { + "epoch": 13.094993581514762, + "grad_norm": 1.7805765867233276, + "learning_rate": 2.8970474967907577e-05, + "loss": 0.5064, + "step": 10201 + }, + { + "epoch": 13.09627727856226, + "grad_norm": 1.465834617614746, + "learning_rate": 2.897004706889174e-05, + "loss": 0.5585, + "step": 10202 + }, + { + "epoch": 13.097560975609756, + "grad_norm": 3.3013439178466797, + "learning_rate": 2.896961916987591e-05, + "loss": 0.518, + "step": 10203 + }, + { + "epoch": 13.098844672657252, + "grad_norm": 1.4831126928329468, + "learning_rate": 2.896919127086008e-05, + "loss": 0.5196, + "step": 10204 + }, + { + "epoch": 13.10012836970475, + "grad_norm": 0.9083868265151978, + "learning_rate": 2.8968763371844243e-05, + "loss": 0.5032, + "step": 10205 + }, + { + "epoch": 13.101412066752246, + "grad_norm": 3.353079080581665, + "learning_rate": 2.8968335472828415e-05, + "loss": 0.5872, + "step": 10206 + }, + { + "epoch": 13.102695763799744, + "grad_norm": 2.0571701526641846, + "learning_rate": 2.896790757381258e-05, + "loss": 0.5416, + "step": 10207 + }, + { + "epoch": 13.10397946084724, + "grad_norm": 1.5171054601669312, + "learning_rate": 2.896747967479675e-05, + "loss": 0.5736, + "step": 10208 + }, + { + "epoch": 13.105263157894736, + "grad_norm": 1.0105350017547607, + "learning_rate": 2.8967051775780917e-05, + "loss": 0.5526, + "step": 10209 + }, + { + "epoch": 13.106546854942234, + "grad_norm": 1.0964304208755493, + "learning_rate": 2.8966623876765082e-05, + "loss": 0.5429, + "step": 10210 + }, + { + "epoch": 13.10783055198973, + "grad_norm": 1.0998178720474243, + "learning_rate": 2.8966195977749254e-05, + "loss": 0.5306, + "step": 10211 + }, + { + "epoch": 13.109114249037226, + "grad_norm": 1.1078109741210938, + "learning_rate": 2.896576807873342e-05, + "loss": 0.5562, + "step": 10212 + }, + { + "epoch": 13.110397946084724, + "grad_norm": 0.9570335149765015, + "learning_rate": 
2.8965340179717587e-05, + "loss": 0.5524, + "step": 10213 + }, + { + "epoch": 13.11168164313222, + "grad_norm": 2.002812385559082, + "learning_rate": 2.8964912280701756e-05, + "loss": 0.5699, + "step": 10214 + }, + { + "epoch": 13.112965340179718, + "grad_norm": 1.1007106304168701, + "learning_rate": 2.8964484381685924e-05, + "loss": 0.5766, + "step": 10215 + }, + { + "epoch": 13.114249037227214, + "grad_norm": 3.766975164413452, + "learning_rate": 2.896405648267009e-05, + "loss": 0.5299, + "step": 10216 + }, + { + "epoch": 13.11553273427471, + "grad_norm": 1.7904317378997803, + "learning_rate": 2.8963628583654258e-05, + "loss": 0.5372, + "step": 10217 + }, + { + "epoch": 13.116816431322208, + "grad_norm": 1.9797749519348145, + "learning_rate": 2.8963200684638426e-05, + "loss": 0.5435, + "step": 10218 + }, + { + "epoch": 13.118100128369704, + "grad_norm": 1.7822805643081665, + "learning_rate": 2.8962772785622594e-05, + "loss": 0.596, + "step": 10219 + }, + { + "epoch": 13.119383825417202, + "grad_norm": 0.9353850483894348, + "learning_rate": 2.8962344886606763e-05, + "loss": 0.5723, + "step": 10220 + }, + { + "epoch": 13.120667522464698, + "grad_norm": 1.6838877201080322, + "learning_rate": 2.8961916987590928e-05, + "loss": 0.6273, + "step": 10221 + }, + { + "epoch": 13.121951219512194, + "grad_norm": 1.2812573909759521, + "learning_rate": 2.89614890885751e-05, + "loss": 0.5744, + "step": 10222 + }, + { + "epoch": 13.123234916559692, + "grad_norm": 4.33425235748291, + "learning_rate": 2.8961061189559265e-05, + "loss": 0.6666, + "step": 10223 + }, + { + "epoch": 13.124518613607188, + "grad_norm": 2.1470754146575928, + "learning_rate": 2.896063329054343e-05, + "loss": 0.5799, + "step": 10224 + }, + { + "epoch": 13.125802310654686, + "grad_norm": 2.9828341007232666, + "learning_rate": 2.89602053915276e-05, + "loss": 0.6177, + "step": 10225 + }, + { + "epoch": 13.127086007702182, + "grad_norm": 5.205046653747559, + "learning_rate": 2.8959777492511767e-05, + "loss": 
0.6932, + "step": 10226 + }, + { + "epoch": 13.128369704749678, + "grad_norm": 1.9888592958450317, + "learning_rate": 2.8959349593495935e-05, + "loss": 0.7225, + "step": 10227 + }, + { + "epoch": 13.129653401797176, + "grad_norm": 1.295658826828003, + "learning_rate": 2.8958921694480103e-05, + "loss": 0.523, + "step": 10228 + }, + { + "epoch": 13.130937098844672, + "grad_norm": 1.9226603507995605, + "learning_rate": 2.8958493795464272e-05, + "loss": 0.5124, + "step": 10229 + }, + { + "epoch": 13.13222079589217, + "grad_norm": 1.6453219652175903, + "learning_rate": 2.895806589644844e-05, + "loss": 0.5234, + "step": 10230 + }, + { + "epoch": 13.133504492939666, + "grad_norm": 0.9976353645324707, + "learning_rate": 2.8957637997432605e-05, + "loss": 0.5315, + "step": 10231 + }, + { + "epoch": 13.134788189987162, + "grad_norm": 1.029219150543213, + "learning_rate": 2.8957210098416774e-05, + "loss": 0.5251, + "step": 10232 + }, + { + "epoch": 13.13607188703466, + "grad_norm": 1.6096850633621216, + "learning_rate": 2.8956782199400942e-05, + "loss": 0.4892, + "step": 10233 + }, + { + "epoch": 13.137355584082156, + "grad_norm": 1.0156724452972412, + "learning_rate": 2.895635430038511e-05, + "loss": 0.5309, + "step": 10234 + }, + { + "epoch": 13.138639281129654, + "grad_norm": 1.620633840560913, + "learning_rate": 2.8955926401369275e-05, + "loss": 0.577, + "step": 10235 + }, + { + "epoch": 13.13992297817715, + "grad_norm": 1.3250865936279297, + "learning_rate": 2.8955498502353447e-05, + "loss": 0.5148, + "step": 10236 + }, + { + "epoch": 13.141206675224646, + "grad_norm": 1.391610026359558, + "learning_rate": 2.8955070603337612e-05, + "loss": 0.5586, + "step": 10237 + }, + { + "epoch": 13.142490372272144, + "grad_norm": 0.98138028383255, + "learning_rate": 2.895464270432178e-05, + "loss": 0.509, + "step": 10238 + }, + { + "epoch": 13.14377406931964, + "grad_norm": 1.5953117609024048, + "learning_rate": 2.895421480530595e-05, + "loss": 0.5056, + "step": 10239 + }, + { + 
"epoch": 13.145057766367138, + "grad_norm": 0.7841355204582214, + "learning_rate": 2.8953786906290114e-05, + "loss": 0.5296, + "step": 10240 + }, + { + "epoch": 13.146341463414634, + "grad_norm": 2.4557712078094482, + "learning_rate": 2.8953359007274286e-05, + "loss": 0.5205, + "step": 10241 + }, + { + "epoch": 13.14762516046213, + "grad_norm": 1.1526740789413452, + "learning_rate": 2.895293110825845e-05, + "loss": 0.5096, + "step": 10242 + }, + { + "epoch": 13.148908857509628, + "grad_norm": 1.324180245399475, + "learning_rate": 2.895250320924262e-05, + "loss": 0.5395, + "step": 10243 + }, + { + "epoch": 13.150192554557124, + "grad_norm": 1.6453888416290283, + "learning_rate": 2.8952075310226788e-05, + "loss": 0.5315, + "step": 10244 + }, + { + "epoch": 13.15147625160462, + "grad_norm": 1.3323606252670288, + "learning_rate": 2.8951647411210953e-05, + "loss": 0.5223, + "step": 10245 + }, + { + "epoch": 13.152759948652118, + "grad_norm": 1.555378794670105, + "learning_rate": 2.8951219512195125e-05, + "loss": 0.5698, + "step": 10246 + }, + { + "epoch": 13.154043645699614, + "grad_norm": 2.0699310302734375, + "learning_rate": 2.895079161317929e-05, + "loss": 0.5175, + "step": 10247 + }, + { + "epoch": 13.155327342747112, + "grad_norm": 1.4823609590530396, + "learning_rate": 2.8950363714163458e-05, + "loss": 0.5028, + "step": 10248 + }, + { + "epoch": 13.156611039794608, + "grad_norm": 1.3895432949066162, + "learning_rate": 2.8949935815147626e-05, + "loss": 0.5541, + "step": 10249 + }, + { + "epoch": 13.157894736842104, + "grad_norm": 1.7029411792755127, + "learning_rate": 2.8949507916131795e-05, + "loss": 0.5239, + "step": 10250 + }, + { + "epoch": 13.159178433889602, + "grad_norm": 1.0970691442489624, + "learning_rate": 2.894908001711596e-05, + "loss": 0.5619, + "step": 10251 + }, + { + "epoch": 13.160462130937098, + "grad_norm": 5.0358734130859375, + "learning_rate": 2.8948652118100128e-05, + "loss": 0.534, + "step": 10252 + }, + { + "epoch": 13.161745827984596, + 
"grad_norm": 1.929792881011963, + "learning_rate": 2.8948224219084297e-05, + "loss": 0.4944, + "step": 10253 + }, + { + "epoch": 13.163029525032092, + "grad_norm": 2.5417520999908447, + "learning_rate": 2.8947796320068465e-05, + "loss": 0.4859, + "step": 10254 + }, + { + "epoch": 13.164313222079588, + "grad_norm": 1.7990307807922363, + "learning_rate": 2.8947368421052634e-05, + "loss": 0.5506, + "step": 10255 + }, + { + "epoch": 13.165596919127086, + "grad_norm": 1.3364437818527222, + "learning_rate": 2.89469405220368e-05, + "loss": 0.4945, + "step": 10256 + }, + { + "epoch": 13.166880616174582, + "grad_norm": 2.0935146808624268, + "learning_rate": 2.894651262302097e-05, + "loss": 0.5139, + "step": 10257 + }, + { + "epoch": 13.16816431322208, + "grad_norm": 0.9442129731178284, + "learning_rate": 2.8946084724005135e-05, + "loss": 0.5106, + "step": 10258 + }, + { + "epoch": 13.169448010269576, + "grad_norm": 2.93131685256958, + "learning_rate": 2.89456568249893e-05, + "loss": 0.5149, + "step": 10259 + }, + { + "epoch": 13.170731707317072, + "grad_norm": 1.2068575620651245, + "learning_rate": 2.8945228925973472e-05, + "loss": 0.5146, + "step": 10260 + }, + { + "epoch": 13.17201540436457, + "grad_norm": 2.657189130783081, + "learning_rate": 2.8944801026957637e-05, + "loss": 0.5863, + "step": 10261 + }, + { + "epoch": 13.173299101412066, + "grad_norm": 1.9521434307098389, + "learning_rate": 2.894437312794181e-05, + "loss": 0.5566, + "step": 10262 + }, + { + "epoch": 13.174582798459564, + "grad_norm": 2.153616189956665, + "learning_rate": 2.8943945228925974e-05, + "loss": 0.5807, + "step": 10263 + }, + { + "epoch": 13.17586649550706, + "grad_norm": 2.7859976291656494, + "learning_rate": 2.8943517329910142e-05, + "loss": 0.5588, + "step": 10264 + }, + { + "epoch": 13.177150192554556, + "grad_norm": 1.0966110229492188, + "learning_rate": 2.894308943089431e-05, + "loss": 0.5578, + "step": 10265 + }, + { + "epoch": 13.178433889602054, + "grad_norm": 1.4716811180114746, + 
"learning_rate": 2.8942661531878476e-05, + "loss": 0.5902, + "step": 10266 + }, + { + "epoch": 13.17971758664955, + "grad_norm": 1.7467085123062134, + "learning_rate": 2.8942233632862644e-05, + "loss": 0.5543, + "step": 10267 + }, + { + "epoch": 13.181001283697048, + "grad_norm": 1.252799391746521, + "learning_rate": 2.8941805733846813e-05, + "loss": 0.5237, + "step": 10268 + }, + { + "epoch": 13.182284980744544, + "grad_norm": 1.1309313774108887, + "learning_rate": 2.894137783483098e-05, + "loss": 0.6085, + "step": 10269 + }, + { + "epoch": 13.18356867779204, + "grad_norm": 1.458277940750122, + "learning_rate": 2.894094993581515e-05, + "loss": 0.5988, + "step": 10270 + }, + { + "epoch": 13.184852374839538, + "grad_norm": 1.397185206413269, + "learning_rate": 2.8940522036799315e-05, + "loss": 0.6129, + "step": 10271 + }, + { + "epoch": 13.186136071887034, + "grad_norm": 1.1236553192138672, + "learning_rate": 2.8940094137783483e-05, + "loss": 0.5391, + "step": 10272 + }, + { + "epoch": 13.187419768934532, + "grad_norm": 1.8952375650405884, + "learning_rate": 2.893966623876765e-05, + "loss": 0.6376, + "step": 10273 + }, + { + "epoch": 13.188703465982028, + "grad_norm": 2.206989288330078, + "learning_rate": 2.893923833975182e-05, + "loss": 0.6251, + "step": 10274 + }, + { + "epoch": 13.189987163029524, + "grad_norm": 4.302009105682373, + "learning_rate": 2.8938810440735985e-05, + "loss": 0.5636, + "step": 10275 + }, + { + "epoch": 13.191270860077022, + "grad_norm": 2.9451022148132324, + "learning_rate": 2.8938382541720157e-05, + "loss": 0.664, + "step": 10276 + }, + { + "epoch": 13.192554557124518, + "grad_norm": 2.39372181892395, + "learning_rate": 2.893795464270432e-05, + "loss": 0.8114, + "step": 10277 + }, + { + "epoch": 13.193838254172016, + "grad_norm": 1.1564191579818726, + "learning_rate": 2.893752674368849e-05, + "loss": 0.4876, + "step": 10278 + }, + { + "epoch": 13.195121951219512, + "grad_norm": 1.2633620500564575, + "learning_rate": 2.893709884467266e-05, 
+ "loss": 0.5184, + "step": 10279 + }, + { + "epoch": 13.196405648267008, + "grad_norm": 2.3437836170196533, + "learning_rate": 2.8936670945656823e-05, + "loss": 0.4904, + "step": 10280 + }, + { + "epoch": 13.197689345314506, + "grad_norm": 2.901035785675049, + "learning_rate": 2.8936243046640995e-05, + "loss": 0.4982, + "step": 10281 + }, + { + "epoch": 13.198973042362002, + "grad_norm": 1.8670469522476196, + "learning_rate": 2.893581514762516e-05, + "loss": 0.4782, + "step": 10282 + }, + { + "epoch": 13.200256739409499, + "grad_norm": 1.6725404262542725, + "learning_rate": 2.893538724860933e-05, + "loss": 0.5146, + "step": 10283 + }, + { + "epoch": 13.201540436456996, + "grad_norm": 1.0608144998550415, + "learning_rate": 2.8934959349593497e-05, + "loss": 0.5297, + "step": 10284 + }, + { + "epoch": 13.202824133504492, + "grad_norm": 2.9004409313201904, + "learning_rate": 2.8934531450577662e-05, + "loss": 0.51, + "step": 10285 + }, + { + "epoch": 13.20410783055199, + "grad_norm": 0.8661898970603943, + "learning_rate": 2.8934103551561834e-05, + "loss": 0.519, + "step": 10286 + }, + { + "epoch": 13.205391527599486, + "grad_norm": 1.4800890684127808, + "learning_rate": 2.8933675652546e-05, + "loss": 0.5411, + "step": 10287 + }, + { + "epoch": 13.206675224646983, + "grad_norm": 4.61477518081665, + "learning_rate": 2.8933247753530167e-05, + "loss": 0.5212, + "step": 10288 + }, + { + "epoch": 13.20795892169448, + "grad_norm": 1.442790150642395, + "learning_rate": 2.8932819854514336e-05, + "loss": 0.5855, + "step": 10289 + }, + { + "epoch": 13.209242618741976, + "grad_norm": 2.4708011150360107, + "learning_rate": 2.8932391955498504e-05, + "loss": 0.518, + "step": 10290 + }, + { + "epoch": 13.210526315789474, + "grad_norm": 2.0384392738342285, + "learning_rate": 2.893196405648267e-05, + "loss": 0.5347, + "step": 10291 + }, + { + "epoch": 13.21181001283697, + "grad_norm": 2.5917437076568604, + "learning_rate": 2.8931536157466838e-05, + "loss": 0.5238, + "step": 10292 + }, + 
{ + "epoch": 13.213093709884467, + "grad_norm": 0.9920194149017334, + "learning_rate": 2.8931108258451006e-05, + "loss": 0.5611, + "step": 10293 + }, + { + "epoch": 13.214377406931964, + "grad_norm": 1.6199535131454468, + "learning_rate": 2.8930680359435174e-05, + "loss": 0.4992, + "step": 10294 + }, + { + "epoch": 13.21566110397946, + "grad_norm": 1.7829736471176147, + "learning_rate": 2.8930252460419343e-05, + "loss": 0.5516, + "step": 10295 + }, + { + "epoch": 13.216944801026958, + "grad_norm": 1.3733192682266235, + "learning_rate": 2.8929824561403508e-05, + "loss": 0.5434, + "step": 10296 + }, + { + "epoch": 13.218228498074454, + "grad_norm": 0.8936635255813599, + "learning_rate": 2.892939666238768e-05, + "loss": 0.5446, + "step": 10297 + }, + { + "epoch": 13.21951219512195, + "grad_norm": 2.2736353874206543, + "learning_rate": 2.8928968763371845e-05, + "loss": 0.5033, + "step": 10298 + }, + { + "epoch": 13.220795892169448, + "grad_norm": 1.4360202550888062, + "learning_rate": 2.892854086435601e-05, + "loss": 0.5525, + "step": 10299 + }, + { + "epoch": 13.222079589216944, + "grad_norm": 1.9950252771377563, + "learning_rate": 2.892811296534018e-05, + "loss": 0.5397, + "step": 10300 + }, + { + "epoch": 13.223363286264442, + "grad_norm": 1.400839924812317, + "learning_rate": 2.8927685066324347e-05, + "loss": 0.4978, + "step": 10301 + }, + { + "epoch": 13.224646983311938, + "grad_norm": 1.1666756868362427, + "learning_rate": 2.892725716730852e-05, + "loss": 0.5673, + "step": 10302 + }, + { + "epoch": 13.225930680359435, + "grad_norm": 4.761521339416504, + "learning_rate": 2.8926829268292683e-05, + "loss": 0.5589, + "step": 10303 + }, + { + "epoch": 13.227214377406932, + "grad_norm": 1.735360860824585, + "learning_rate": 2.8926401369276852e-05, + "loss": 0.5276, + "step": 10304 + }, + { + "epoch": 13.228498074454428, + "grad_norm": 1.1487892866134644, + "learning_rate": 2.892597347026102e-05, + "loss": 0.541, + "step": 10305 + }, + { + "epoch": 13.229781771501926, + 
"grad_norm": 2.2663683891296387, + "learning_rate": 2.8925545571245185e-05, + "loss": 0.5344, + "step": 10306 + }, + { + "epoch": 13.231065468549422, + "grad_norm": 1.8784899711608887, + "learning_rate": 2.8925117672229354e-05, + "loss": 0.5729, + "step": 10307 + }, + { + "epoch": 13.232349165596919, + "grad_norm": 1.1663975715637207, + "learning_rate": 2.8924689773213522e-05, + "loss": 0.5677, + "step": 10308 + }, + { + "epoch": 13.233632862644416, + "grad_norm": 1.4526185989379883, + "learning_rate": 2.892426187419769e-05, + "loss": 0.5076, + "step": 10309 + }, + { + "epoch": 13.234916559691912, + "grad_norm": 1.0798627138137817, + "learning_rate": 2.892383397518186e-05, + "loss": 0.5423, + "step": 10310 + }, + { + "epoch": 13.23620025673941, + "grad_norm": 1.371980905532837, + "learning_rate": 2.8923406076166027e-05, + "loss": 0.5121, + "step": 10311 + }, + { + "epoch": 13.237483953786906, + "grad_norm": 1.2019156217575073, + "learning_rate": 2.8922978177150192e-05, + "loss": 0.5534, + "step": 10312 + }, + { + "epoch": 13.238767650834403, + "grad_norm": 2.348980665206909, + "learning_rate": 2.892255027813436e-05, + "loss": 0.5593, + "step": 10313 + }, + { + "epoch": 13.2400513478819, + "grad_norm": 2.0577526092529297, + "learning_rate": 2.892212237911853e-05, + "loss": 0.5545, + "step": 10314 + }, + { + "epoch": 13.241335044929397, + "grad_norm": 2.052786350250244, + "learning_rate": 2.8921694480102694e-05, + "loss": 0.5566, + "step": 10315 + }, + { + "epoch": 13.242618741976893, + "grad_norm": 1.0052365064620972, + "learning_rate": 2.8921266581086866e-05, + "loss": 0.5214, + "step": 10316 + }, + { + "epoch": 13.24390243902439, + "grad_norm": 1.0723258256912231, + "learning_rate": 2.892083868207103e-05, + "loss": 0.5755, + "step": 10317 + }, + { + "epoch": 13.245186136071887, + "grad_norm": 2.6194658279418945, + "learning_rate": 2.8920410783055203e-05, + "loss": 0.4864, + "step": 10318 + }, + { + "epoch": 13.246469833119384, + "grad_norm": 1.9401365518569946, + 
"learning_rate": 2.8919982884039368e-05, + "loss": 0.5171, + "step": 10319 + }, + { + "epoch": 13.24775353016688, + "grad_norm": 1.283240795135498, + "learning_rate": 2.8919554985023533e-05, + "loss": 0.5924, + "step": 10320 + }, + { + "epoch": 13.249037227214377, + "grad_norm": 1.0838671922683716, + "learning_rate": 2.8919127086007705e-05, + "loss": 0.5444, + "step": 10321 + }, + { + "epoch": 13.250320924261874, + "grad_norm": 2.210191488265991, + "learning_rate": 2.891869918699187e-05, + "loss": 0.5937, + "step": 10322 + }, + { + "epoch": 13.25160462130937, + "grad_norm": 1.8816593885421753, + "learning_rate": 2.8918271287976038e-05, + "loss": 0.5316, + "step": 10323 + }, + { + "epoch": 13.252888318356868, + "grad_norm": 1.2761224508285522, + "learning_rate": 2.8917843388960206e-05, + "loss": 0.6193, + "step": 10324 + }, + { + "epoch": 13.254172015404365, + "grad_norm": 3.30289363861084, + "learning_rate": 2.8917415489944375e-05, + "loss": 0.6762, + "step": 10325 + }, + { + "epoch": 13.25545571245186, + "grad_norm": 3.6903469562530518, + "learning_rate": 2.8916987590928543e-05, + "loss": 0.6566, + "step": 10326 + }, + { + "epoch": 13.256739409499358, + "grad_norm": 3.5426831245422363, + "learning_rate": 2.891655969191271e-05, + "loss": 0.7518, + "step": 10327 + }, + { + "epoch": 13.258023106546855, + "grad_norm": 2.0960659980773926, + "learning_rate": 2.8916131792896877e-05, + "loss": 0.4977, + "step": 10328 + }, + { + "epoch": 13.259306803594352, + "grad_norm": 0.903885543346405, + "learning_rate": 2.8915703893881045e-05, + "loss": 0.5388, + "step": 10329 + }, + { + "epoch": 13.260590500641849, + "grad_norm": 1.906296730041504, + "learning_rate": 2.8915275994865214e-05, + "loss": 0.5409, + "step": 10330 + }, + { + "epoch": 13.261874197689345, + "grad_norm": 2.2475991249084473, + "learning_rate": 2.891484809584938e-05, + "loss": 0.5209, + "step": 10331 + }, + { + "epoch": 13.263157894736842, + "grad_norm": 2.3700687885284424, + "learning_rate": 
2.8914420196833547e-05, + "loss": 0.5176, + "step": 10332 + }, + { + "epoch": 13.264441591784339, + "grad_norm": 1.112237811088562, + "learning_rate": 2.8913992297817715e-05, + "loss": 0.503, + "step": 10333 + }, + { + "epoch": 13.265725288831836, + "grad_norm": 1.9883337020874023, + "learning_rate": 2.8913564398801884e-05, + "loss": 0.537, + "step": 10334 + }, + { + "epoch": 13.267008985879333, + "grad_norm": 10.869194984436035, + "learning_rate": 2.8913136499786052e-05, + "loss": 0.5044, + "step": 10335 + }, + { + "epoch": 13.268292682926829, + "grad_norm": 0.8620529174804688, + "learning_rate": 2.8912708600770217e-05, + "loss": 0.5229, + "step": 10336 + }, + { + "epoch": 13.269576379974326, + "grad_norm": 0.9705656170845032, + "learning_rate": 2.891228070175439e-05, + "loss": 0.54, + "step": 10337 + }, + { + "epoch": 13.270860077021823, + "grad_norm": 2.2157726287841797, + "learning_rate": 2.8911852802738554e-05, + "loss": 0.5433, + "step": 10338 + }, + { + "epoch": 13.27214377406932, + "grad_norm": 1.8773292303085327, + "learning_rate": 2.891142490372272e-05, + "loss": 0.532, + "step": 10339 + }, + { + "epoch": 13.273427471116817, + "grad_norm": 2.592620372772217, + "learning_rate": 2.891099700470689e-05, + "loss": 0.5473, + "step": 10340 + }, + { + "epoch": 13.274711168164313, + "grad_norm": 0.8182120323181152, + "learning_rate": 2.8910569105691056e-05, + "loss": 0.5215, + "step": 10341 + }, + { + "epoch": 13.27599486521181, + "grad_norm": 1.3955847024917603, + "learning_rate": 2.8910141206675228e-05, + "loss": 0.4847, + "step": 10342 + }, + { + "epoch": 13.277278562259307, + "grad_norm": 1.4254862070083618, + "learning_rate": 2.8909713307659393e-05, + "loss": 0.5076, + "step": 10343 + }, + { + "epoch": 13.278562259306804, + "grad_norm": 1.5037258863449097, + "learning_rate": 2.890928540864356e-05, + "loss": 0.5591, + "step": 10344 + }, + { + "epoch": 13.2798459563543, + "grad_norm": 0.8927475810050964, + "learning_rate": 2.890885750962773e-05, + "loss": 
0.4722, + "step": 10345 + }, + { + "epoch": 13.281129653401797, + "grad_norm": 1.2431979179382324, + "learning_rate": 2.8908429610611895e-05, + "loss": 0.5215, + "step": 10346 + }, + { + "epoch": 13.282413350449294, + "grad_norm": 1.4626721143722534, + "learning_rate": 2.8908001711596063e-05, + "loss": 0.5177, + "step": 10347 + }, + { + "epoch": 13.28369704749679, + "grad_norm": 0.8226361274719238, + "learning_rate": 2.890757381258023e-05, + "loss": 0.5041, + "step": 10348 + }, + { + "epoch": 13.284980744544288, + "grad_norm": 1.105648159980774, + "learning_rate": 2.89071459135644e-05, + "loss": 0.5769, + "step": 10349 + }, + { + "epoch": 13.286264441591785, + "grad_norm": 1.932669997215271, + "learning_rate": 2.8906718014548568e-05, + "loss": 0.5409, + "step": 10350 + }, + { + "epoch": 13.28754813863928, + "grad_norm": 1.101173758506775, + "learning_rate": 2.8906290115532737e-05, + "loss": 0.5352, + "step": 10351 + }, + { + "epoch": 13.288831835686779, + "grad_norm": 1.2291208505630493, + "learning_rate": 2.89058622165169e-05, + "loss": 0.5656, + "step": 10352 + }, + { + "epoch": 13.290115532734275, + "grad_norm": 1.1657390594482422, + "learning_rate": 2.890543431750107e-05, + "loss": 0.5374, + "step": 10353 + }, + { + "epoch": 13.29139922978177, + "grad_norm": 1.2453981637954712, + "learning_rate": 2.890500641848524e-05, + "loss": 0.4908, + "step": 10354 + }, + { + "epoch": 13.292682926829269, + "grad_norm": 1.7861381769180298, + "learning_rate": 2.8904578519469404e-05, + "loss": 0.5855, + "step": 10355 + }, + { + "epoch": 13.293966623876765, + "grad_norm": 1.297042727470398, + "learning_rate": 2.8904150620453575e-05, + "loss": 0.5223, + "step": 10356 + }, + { + "epoch": 13.295250320924263, + "grad_norm": 5.218934535980225, + "learning_rate": 2.890372272143774e-05, + "loss": 0.5291, + "step": 10357 + }, + { + "epoch": 13.296534017971759, + "grad_norm": 2.134754180908203, + "learning_rate": 2.8903294822421912e-05, + "loss": 0.5437, + "step": 10358 + }, + { + 
"epoch": 13.297817715019255, + "grad_norm": 9.469324111938477, + "learning_rate": 2.8902866923406077e-05, + "loss": 0.5423, + "step": 10359 + }, + { + "epoch": 13.299101412066753, + "grad_norm": 1.2259269952774048, + "learning_rate": 2.8902439024390242e-05, + "loss": 0.6178, + "step": 10360 + }, + { + "epoch": 13.300385109114249, + "grad_norm": 1.0998680591583252, + "learning_rate": 2.8902011125374414e-05, + "loss": 0.5216, + "step": 10361 + }, + { + "epoch": 13.301668806161747, + "grad_norm": 1.3019813299179077, + "learning_rate": 2.890158322635858e-05, + "loss": 0.5531, + "step": 10362 + }, + { + "epoch": 13.302952503209243, + "grad_norm": 1.159065842628479, + "learning_rate": 2.8901155327342747e-05, + "loss": 0.496, + "step": 10363 + }, + { + "epoch": 13.304236200256739, + "grad_norm": 1.4374172687530518, + "learning_rate": 2.8900727428326916e-05, + "loss": 0.5141, + "step": 10364 + }, + { + "epoch": 13.305519897304237, + "grad_norm": 5.285946369171143, + "learning_rate": 2.8900299529311084e-05, + "loss": 0.5999, + "step": 10365 + }, + { + "epoch": 13.306803594351733, + "grad_norm": 2.122650384902954, + "learning_rate": 2.8899871630295253e-05, + "loss": 0.5435, + "step": 10366 + }, + { + "epoch": 13.30808729139923, + "grad_norm": 2.1692988872528076, + "learning_rate": 2.8899443731279418e-05, + "loss": 0.5795, + "step": 10367 + }, + { + "epoch": 13.309370988446727, + "grad_norm": 2.660881996154785, + "learning_rate": 2.8899015832263586e-05, + "loss": 0.5828, + "step": 10368 + }, + { + "epoch": 13.310654685494223, + "grad_norm": 1.5675424337387085, + "learning_rate": 2.8898587933247755e-05, + "loss": 0.5481, + "step": 10369 + }, + { + "epoch": 13.31193838254172, + "grad_norm": 1.2689661979675293, + "learning_rate": 2.8898160034231923e-05, + "loss": 0.5492, + "step": 10370 + }, + { + "epoch": 13.313222079589217, + "grad_norm": 1.2201170921325684, + "learning_rate": 2.8897732135216088e-05, + "loss": 0.5802, + "step": 10371 + }, + { + "epoch": 13.314505776636715, + 
"grad_norm": 1.269693374633789, + "learning_rate": 2.889730423620026e-05, + "loss": 0.5867, + "step": 10372 + }, + { + "epoch": 13.31578947368421, + "grad_norm": 10.51960563659668, + "learning_rate": 2.8896876337184425e-05, + "loss": 0.5945, + "step": 10373 + }, + { + "epoch": 13.317073170731707, + "grad_norm": 1.3285325765609741, + "learning_rate": 2.8896448438168593e-05, + "loss": 0.61, + "step": 10374 + }, + { + "epoch": 13.318356867779205, + "grad_norm": 1.6000285148620605, + "learning_rate": 2.889602053915276e-05, + "loss": 0.6572, + "step": 10375 + }, + { + "epoch": 13.3196405648267, + "grad_norm": 2.7764716148376465, + "learning_rate": 2.8895592640136927e-05, + "loss": 0.6599, + "step": 10376 + }, + { + "epoch": 13.320924261874199, + "grad_norm": 2.9500043392181396, + "learning_rate": 2.88951647411211e-05, + "loss": 0.7462, + "step": 10377 + }, + { + "epoch": 13.322207958921695, + "grad_norm": 1.310712456703186, + "learning_rate": 2.8894736842105263e-05, + "loss": 0.5032, + "step": 10378 + }, + { + "epoch": 13.32349165596919, + "grad_norm": 1.1791971921920776, + "learning_rate": 2.8894308943089432e-05, + "loss": 0.4932, + "step": 10379 + }, + { + "epoch": 13.324775353016689, + "grad_norm": 0.8542216420173645, + "learning_rate": 2.88938810440736e-05, + "loss": 0.4838, + "step": 10380 + }, + { + "epoch": 13.326059050064185, + "grad_norm": 0.943986177444458, + "learning_rate": 2.8893453145057765e-05, + "loss": 0.5177, + "step": 10381 + }, + { + "epoch": 13.327342747111683, + "grad_norm": 2.3639445304870605, + "learning_rate": 2.8893025246041937e-05, + "loss": 0.5235, + "step": 10382 + }, + { + "epoch": 13.328626444159179, + "grad_norm": 0.7853935360908508, + "learning_rate": 2.8892597347026102e-05, + "loss": 0.5378, + "step": 10383 + }, + { + "epoch": 13.329910141206675, + "grad_norm": 1.8394955396652222, + "learning_rate": 2.889216944801027e-05, + "loss": 0.5467, + "step": 10384 + }, + { + "epoch": 13.331193838254173, + "grad_norm": 1.003924012184143, + 
"learning_rate": 2.889174154899444e-05, + "loss": 0.5042, + "step": 10385 + }, + { + "epoch": 13.332477535301669, + "grad_norm": 1.3343353271484375, + "learning_rate": 2.8891313649978607e-05, + "loss": 0.5258, + "step": 10386 + }, + { + "epoch": 13.333761232349165, + "grad_norm": 2.258004903793335, + "learning_rate": 2.8890885750962772e-05, + "loss": 0.5308, + "step": 10387 + }, + { + "epoch": 13.335044929396663, + "grad_norm": 1.0508931875228882, + "learning_rate": 2.889045785194694e-05, + "loss": 0.5396, + "step": 10388 + }, + { + "epoch": 13.336328626444159, + "grad_norm": 2.512924909591675, + "learning_rate": 2.889002995293111e-05, + "loss": 0.5228, + "step": 10389 + }, + { + "epoch": 13.337612323491657, + "grad_norm": 1.3290300369262695, + "learning_rate": 2.8889602053915278e-05, + "loss": 0.5468, + "step": 10390 + }, + { + "epoch": 13.338896020539153, + "grad_norm": 2.4414291381835938, + "learning_rate": 2.8889174154899446e-05, + "loss": 0.531, + "step": 10391 + }, + { + "epoch": 13.340179717586649, + "grad_norm": 1.8882251977920532, + "learning_rate": 2.888874625588361e-05, + "loss": 0.5203, + "step": 10392 + }, + { + "epoch": 13.341463414634147, + "grad_norm": 1.6670150756835938, + "learning_rate": 2.888831835686778e-05, + "loss": 0.5215, + "step": 10393 + }, + { + "epoch": 13.342747111681643, + "grad_norm": 1.4328033924102783, + "learning_rate": 2.8887890457851948e-05, + "loss": 0.5238, + "step": 10394 + }, + { + "epoch": 13.34403080872914, + "grad_norm": 2.339266538619995, + "learning_rate": 2.8887462558836113e-05, + "loss": 0.5245, + "step": 10395 + }, + { + "epoch": 13.345314505776637, + "grad_norm": 1.8840960264205933, + "learning_rate": 2.8887034659820285e-05, + "loss": 0.564, + "step": 10396 + }, + { + "epoch": 13.346598202824133, + "grad_norm": 0.9057774543762207, + "learning_rate": 2.888660676080445e-05, + "loss": 0.5208, + "step": 10397 + }, + { + "epoch": 13.34788189987163, + "grad_norm": 1.7275452613830566, + "learning_rate": 
2.888617886178862e-05, + "loss": 0.5035, + "step": 10398 + }, + { + "epoch": 13.349165596919127, + "grad_norm": 1.3582779169082642, + "learning_rate": 2.8885750962772787e-05, + "loss": 0.562, + "step": 10399 + }, + { + "epoch": 13.350449293966625, + "grad_norm": 1.1542125940322876, + "learning_rate": 2.888532306375695e-05, + "loss": 0.4988, + "step": 10400 + }, + { + "epoch": 13.35173299101412, + "grad_norm": 2.305126667022705, + "learning_rate": 2.8884895164741123e-05, + "loss": 0.5442, + "step": 10401 + }, + { + "epoch": 13.353016688061617, + "grad_norm": 4.669949054718018, + "learning_rate": 2.888446726572529e-05, + "loss": 0.5394, + "step": 10402 + }, + { + "epoch": 13.354300385109115, + "grad_norm": 1.5534056425094604, + "learning_rate": 2.8884039366709457e-05, + "loss": 0.5065, + "step": 10403 + }, + { + "epoch": 13.35558408215661, + "grad_norm": 1.1904919147491455, + "learning_rate": 2.8883611467693625e-05, + "loss": 0.5099, + "step": 10404 + }, + { + "epoch": 13.356867779204109, + "grad_norm": 1.8422545194625854, + "learning_rate": 2.8883183568677794e-05, + "loss": 0.5531, + "step": 10405 + }, + { + "epoch": 13.358151476251605, + "grad_norm": 1.8197883367538452, + "learning_rate": 2.8882755669661962e-05, + "loss": 0.525, + "step": 10406 + }, + { + "epoch": 13.3594351732991, + "grad_norm": 2.543320894241333, + "learning_rate": 2.8882327770646127e-05, + "loss": 0.5488, + "step": 10407 + }, + { + "epoch": 13.360718870346599, + "grad_norm": 1.1946861743927002, + "learning_rate": 2.8881899871630295e-05, + "loss": 0.5578, + "step": 10408 + }, + { + "epoch": 13.362002567394095, + "grad_norm": 1.0896004438400269, + "learning_rate": 2.8881471972614464e-05, + "loss": 0.5333, + "step": 10409 + }, + { + "epoch": 13.363286264441593, + "grad_norm": 2.173537492752075, + "learning_rate": 2.8881044073598632e-05, + "loss": 0.518, + "step": 10410 + }, + { + "epoch": 13.364569961489089, + "grad_norm": 1.156834363937378, + "learning_rate": 2.8880616174582797e-05, + "loss": 
0.5791, + "step": 10411 + }, + { + "epoch": 13.365853658536585, + "grad_norm": 2.760622501373291, + "learning_rate": 2.888018827556697e-05, + "loss": 0.5778, + "step": 10412 + }, + { + "epoch": 13.367137355584083, + "grad_norm": 2.6054632663726807, + "learning_rate": 2.8879760376551134e-05, + "loss": 0.538, + "step": 10413 + }, + { + "epoch": 13.368421052631579, + "grad_norm": 1.8192342519760132, + "learning_rate": 2.8879332477535303e-05, + "loss": 0.5365, + "step": 10414 + }, + { + "epoch": 13.369704749679077, + "grad_norm": 2.0183119773864746, + "learning_rate": 2.887890457851947e-05, + "loss": 0.5216, + "step": 10415 + }, + { + "epoch": 13.370988446726573, + "grad_norm": 1.2094191312789917, + "learning_rate": 2.8878476679503636e-05, + "loss": 0.5747, + "step": 10416 + }, + { + "epoch": 13.372272143774069, + "grad_norm": 1.0878621339797974, + "learning_rate": 2.8878048780487808e-05, + "loss": 0.5558, + "step": 10417 + }, + { + "epoch": 13.373555840821567, + "grad_norm": 2.5615556240081787, + "learning_rate": 2.8877620881471973e-05, + "loss": 0.575, + "step": 10418 + }, + { + "epoch": 13.374839537869063, + "grad_norm": 4.027122497558594, + "learning_rate": 2.887719298245614e-05, + "loss": 0.6289, + "step": 10419 + }, + { + "epoch": 13.376123234916559, + "grad_norm": 1.6297926902770996, + "learning_rate": 2.887676508344031e-05, + "loss": 0.5689, + "step": 10420 + }, + { + "epoch": 13.377406931964057, + "grad_norm": 3.0252726078033447, + "learning_rate": 2.8876337184424475e-05, + "loss": 0.5419, + "step": 10421 + }, + { + "epoch": 13.378690629011553, + "grad_norm": 1.2504518032073975, + "learning_rate": 2.8875909285408646e-05, + "loss": 0.5831, + "step": 10422 + }, + { + "epoch": 13.37997432605905, + "grad_norm": 1.8153116703033447, + "learning_rate": 2.887548138639281e-05, + "loss": 0.6137, + "step": 10423 + }, + { + "epoch": 13.381258023106547, + "grad_norm": 1.616649866104126, + "learning_rate": 2.887505348737698e-05, + "loss": 0.6089, + "step": 10424 + }, + { + 
"epoch": 13.382541720154043, + "grad_norm": 3.76316237449646, + "learning_rate": 2.8874625588361148e-05, + "loss": 0.6658, + "step": 10425 + }, + { + "epoch": 13.38382541720154, + "grad_norm": 2.3745243549346924, + "learning_rate": 2.8874197689345317e-05, + "loss": 0.6322, + "step": 10426 + }, + { + "epoch": 13.385109114249037, + "grad_norm": 2.8216512203216553, + "learning_rate": 2.8873769790329482e-05, + "loss": 0.7771, + "step": 10427 + }, + { + "epoch": 13.386392811296535, + "grad_norm": 1.7886422872543335, + "learning_rate": 2.887334189131365e-05, + "loss": 0.5009, + "step": 10428 + }, + { + "epoch": 13.38767650834403, + "grad_norm": 1.1043751239776611, + "learning_rate": 2.887291399229782e-05, + "loss": 0.4986, + "step": 10429 + }, + { + "epoch": 13.388960205391527, + "grad_norm": 2.8700549602508545, + "learning_rate": 2.8872486093281984e-05, + "loss": 0.5293, + "step": 10430 + }, + { + "epoch": 13.390243902439025, + "grad_norm": 1.4289172887802124, + "learning_rate": 2.8872058194266155e-05, + "loss": 0.5279, + "step": 10431 + }, + { + "epoch": 13.39152759948652, + "grad_norm": 1.2407190799713135, + "learning_rate": 2.887163029525032e-05, + "loss": 0.5452, + "step": 10432 + }, + { + "epoch": 13.392811296534019, + "grad_norm": 1.1209690570831299, + "learning_rate": 2.8871202396234492e-05, + "loss": 0.5073, + "step": 10433 + }, + { + "epoch": 13.394094993581515, + "grad_norm": 1.1223032474517822, + "learning_rate": 2.8870774497218657e-05, + "loss": 0.5361, + "step": 10434 + }, + { + "epoch": 13.39537869062901, + "grad_norm": 1.1415047645568848, + "learning_rate": 2.8870346598202822e-05, + "loss": 0.5465, + "step": 10435 + }, + { + "epoch": 13.396662387676509, + "grad_norm": 0.9106853008270264, + "learning_rate": 2.8869918699186994e-05, + "loss": 0.5218, + "step": 10436 + }, + { + "epoch": 13.397946084724005, + "grad_norm": 1.5872329473495483, + "learning_rate": 2.886949080017116e-05, + "loss": 0.4635, + "step": 10437 + }, + { + "epoch": 13.399229781771503, + 
"grad_norm": 4.083974838256836, + "learning_rate": 2.8869062901155327e-05, + "loss": 0.5254, + "step": 10438 + }, + { + "epoch": 13.400513478818999, + "grad_norm": 5.802887916564941, + "learning_rate": 2.8868635002139496e-05, + "loss": 0.526, + "step": 10439 + }, + { + "epoch": 13.401797175866495, + "grad_norm": 2.5388643741607666, + "learning_rate": 2.8868207103123664e-05, + "loss": 0.4984, + "step": 10440 + }, + { + "epoch": 13.403080872913993, + "grad_norm": 2.5894405841827393, + "learning_rate": 2.8867779204107833e-05, + "loss": 0.5212, + "step": 10441 + }, + { + "epoch": 13.404364569961489, + "grad_norm": 1.195871114730835, + "learning_rate": 2.8867351305091998e-05, + "loss": 0.5437, + "step": 10442 + }, + { + "epoch": 13.405648267008987, + "grad_norm": 1.180857539176941, + "learning_rate": 2.8866923406076166e-05, + "loss": 0.5338, + "step": 10443 + }, + { + "epoch": 13.406931964056483, + "grad_norm": 1.046415090560913, + "learning_rate": 2.8866495507060335e-05, + "loss": 0.5221, + "step": 10444 + }, + { + "epoch": 13.408215661103979, + "grad_norm": 1.1271648406982422, + "learning_rate": 2.8866067608044503e-05, + "loss": 0.5119, + "step": 10445 + }, + { + "epoch": 13.409499358151477, + "grad_norm": 1.1600013971328735, + "learning_rate": 2.8865639709028668e-05, + "loss": 0.5229, + "step": 10446 + }, + { + "epoch": 13.410783055198973, + "grad_norm": 1.5761340856552124, + "learning_rate": 2.886521181001284e-05, + "loss": 0.5375, + "step": 10447 + }, + { + "epoch": 13.41206675224647, + "grad_norm": 1.6194405555725098, + "learning_rate": 2.8864783910997005e-05, + "loss": 0.5641, + "step": 10448 + }, + { + "epoch": 13.413350449293967, + "grad_norm": 19.487751007080078, + "learning_rate": 2.8864356011981173e-05, + "loss": 0.5448, + "step": 10449 + }, + { + "epoch": 13.414634146341463, + "grad_norm": 1.8572871685028076, + "learning_rate": 2.886392811296534e-05, + "loss": 0.523, + "step": 10450 + }, + { + "epoch": 13.41591784338896, + "grad_norm": 1.3675493001937866, + 
"learning_rate": 2.8863500213949507e-05, + "loss": 0.5304, + "step": 10451 + }, + { + "epoch": 13.417201540436457, + "grad_norm": 2.4803242683410645, + "learning_rate": 2.886307231493368e-05, + "loss": 0.5405, + "step": 10452 + }, + { + "epoch": 13.418485237483953, + "grad_norm": 3.562368154525757, + "learning_rate": 2.8862644415917844e-05, + "loss": 0.5289, + "step": 10453 + }, + { + "epoch": 13.41976893453145, + "grad_norm": 2.0817861557006836, + "learning_rate": 2.8862216516902012e-05, + "loss": 0.5279, + "step": 10454 + }, + { + "epoch": 13.421052631578947, + "grad_norm": 1.9344552755355835, + "learning_rate": 2.886178861788618e-05, + "loss": 0.4888, + "step": 10455 + }, + { + "epoch": 13.422336328626445, + "grad_norm": 1.1085292100906372, + "learning_rate": 2.8861360718870345e-05, + "loss": 0.5051, + "step": 10456 + }, + { + "epoch": 13.42362002567394, + "grad_norm": 0.9871942400932312, + "learning_rate": 2.8860932819854517e-05, + "loss": 0.5535, + "step": 10457 + }, + { + "epoch": 13.424903722721437, + "grad_norm": 2.1975762844085693, + "learning_rate": 2.8860504920838682e-05, + "loss": 0.532, + "step": 10458 + }, + { + "epoch": 13.426187419768935, + "grad_norm": 1.4180470705032349, + "learning_rate": 2.886007702182285e-05, + "loss": 0.5255, + "step": 10459 + }, + { + "epoch": 13.427471116816431, + "grad_norm": 2.1603665351867676, + "learning_rate": 2.885964912280702e-05, + "loss": 0.5291, + "step": 10460 + }, + { + "epoch": 13.428754813863929, + "grad_norm": 3.38840651512146, + "learning_rate": 2.8859221223791184e-05, + "loss": 0.5799, + "step": 10461 + }, + { + "epoch": 13.430038510911425, + "grad_norm": 4.518436431884766, + "learning_rate": 2.8858793324775352e-05, + "loss": 0.5832, + "step": 10462 + }, + { + "epoch": 13.431322207958921, + "grad_norm": 2.010082244873047, + "learning_rate": 2.885836542575952e-05, + "loss": 0.5395, + "step": 10463 + }, + { + "epoch": 13.432605905006419, + "grad_norm": 1.802197813987732, + "learning_rate": 
2.885793752674369e-05, + "loss": 0.5285, + "step": 10464 + }, + { + "epoch": 13.433889602053915, + "grad_norm": 1.670266032218933, + "learning_rate": 2.8857509627727858e-05, + "loss": 0.5624, + "step": 10465 + }, + { + "epoch": 13.435173299101413, + "grad_norm": 1.9407511949539185, + "learning_rate": 2.8857081728712026e-05, + "loss": 0.5498, + "step": 10466 + }, + { + "epoch": 13.436456996148909, + "grad_norm": 2.229546308517456, + "learning_rate": 2.885665382969619e-05, + "loss": 0.5681, + "step": 10467 + }, + { + "epoch": 13.437740693196405, + "grad_norm": 1.2791229486465454, + "learning_rate": 2.885622593068036e-05, + "loss": 0.5958, + "step": 10468 + }, + { + "epoch": 13.439024390243903, + "grad_norm": 1.592563271522522, + "learning_rate": 2.8855798031664528e-05, + "loss": 0.5682, + "step": 10469 + }, + { + "epoch": 13.440308087291399, + "grad_norm": 2.0725810527801514, + "learning_rate": 2.8855370132648693e-05, + "loss": 0.5659, + "step": 10470 + }, + { + "epoch": 13.441591784338897, + "grad_norm": 5.818667888641357, + "learning_rate": 2.8854942233632865e-05, + "loss": 0.5329, + "step": 10471 + }, + { + "epoch": 13.442875481386393, + "grad_norm": 1.2398134469985962, + "learning_rate": 2.885451433461703e-05, + "loss": 0.5987, + "step": 10472 + }, + { + "epoch": 13.444159178433889, + "grad_norm": 1.4018137454986572, + "learning_rate": 2.88540864356012e-05, + "loss": 0.5975, + "step": 10473 + }, + { + "epoch": 13.445442875481387, + "grad_norm": 2.281538248062134, + "learning_rate": 2.8853658536585367e-05, + "loss": 0.6236, + "step": 10474 + }, + { + "epoch": 13.446726572528883, + "grad_norm": 2.3602757453918457, + "learning_rate": 2.885323063756953e-05, + "loss": 0.6586, + "step": 10475 + }, + { + "epoch": 13.44801026957638, + "grad_norm": 2.109933853149414, + "learning_rate": 2.8852802738553703e-05, + "loss": 0.6886, + "step": 10476 + }, + { + "epoch": 13.449293966623877, + "grad_norm": 2.9115233421325684, + "learning_rate": 2.885237483953787e-05, + "loss": 
0.7668, + "step": 10477 + }, + { + "epoch": 13.450577663671373, + "grad_norm": 1.667128086090088, + "learning_rate": 2.8851946940522037e-05, + "loss": 0.5117, + "step": 10478 + }, + { + "epoch": 13.45186136071887, + "grad_norm": 2.1586318016052246, + "learning_rate": 2.8851519041506205e-05, + "loss": 0.5215, + "step": 10479 + }, + { + "epoch": 13.453145057766367, + "grad_norm": 1.1812299489974976, + "learning_rate": 2.8851091142490374e-05, + "loss": 0.5037, + "step": 10480 + }, + { + "epoch": 13.454428754813865, + "grad_norm": 4.180335998535156, + "learning_rate": 2.8850663243474542e-05, + "loss": 0.5312, + "step": 10481 + }, + { + "epoch": 13.455712451861361, + "grad_norm": 1.6947040557861328, + "learning_rate": 2.8850235344458707e-05, + "loss": 0.5057, + "step": 10482 + }, + { + "epoch": 13.456996148908857, + "grad_norm": 2.426462173461914, + "learning_rate": 2.8849807445442876e-05, + "loss": 0.5025, + "step": 10483 + }, + { + "epoch": 13.458279845956355, + "grad_norm": 1.4777765274047852, + "learning_rate": 2.8849379546427044e-05, + "loss": 0.503, + "step": 10484 + }, + { + "epoch": 13.459563543003851, + "grad_norm": 2.1691436767578125, + "learning_rate": 2.8848951647411212e-05, + "loss": 0.5571, + "step": 10485 + }, + { + "epoch": 13.460847240051347, + "grad_norm": 3.6219065189361572, + "learning_rate": 2.8848523748395377e-05, + "loss": 0.5145, + "step": 10486 + }, + { + "epoch": 13.462130937098845, + "grad_norm": 12.060078620910645, + "learning_rate": 2.884809584937955e-05, + "loss": 0.5035, + "step": 10487 + }, + { + "epoch": 13.463414634146341, + "grad_norm": 1.5702993869781494, + "learning_rate": 2.8847667950363714e-05, + "loss": 0.5069, + "step": 10488 + }, + { + "epoch": 13.464698331193839, + "grad_norm": 1.187907338142395, + "learning_rate": 2.8847240051347883e-05, + "loss": 0.5136, + "step": 10489 + }, + { + "epoch": 13.465982028241335, + "grad_norm": 1.4426600933074951, + "learning_rate": 2.884681215233205e-05, + "loss": 0.521, + "step": 10490 + }, + { 
+ "epoch": 13.467265725288831, + "grad_norm": 1.0726274251937866, + "learning_rate": 2.8846384253316216e-05, + "loss": 0.5311, + "step": 10491 + }, + { + "epoch": 13.468549422336329, + "grad_norm": 2.514535903930664, + "learning_rate": 2.8845956354300388e-05, + "loss": 0.4791, + "step": 10492 + }, + { + "epoch": 13.469833119383825, + "grad_norm": 1.3872827291488647, + "learning_rate": 2.8845528455284553e-05, + "loss": 0.5306, + "step": 10493 + }, + { + "epoch": 13.471116816431323, + "grad_norm": 1.2021492719650269, + "learning_rate": 2.884510055626872e-05, + "loss": 0.4913, + "step": 10494 + }, + { + "epoch": 13.472400513478819, + "grad_norm": 11.346747398376465, + "learning_rate": 2.884467265725289e-05, + "loss": 0.4975, + "step": 10495 + }, + { + "epoch": 13.473684210526315, + "grad_norm": 1.8066858053207397, + "learning_rate": 2.8844244758237055e-05, + "loss": 0.5298, + "step": 10496 + }, + { + "epoch": 13.474967907573813, + "grad_norm": 2.1862051486968994, + "learning_rate": 2.8843816859221227e-05, + "loss": 0.5509, + "step": 10497 + }, + { + "epoch": 13.476251604621309, + "grad_norm": 12.348793983459473, + "learning_rate": 2.884338896020539e-05, + "loss": 0.5074, + "step": 10498 + }, + { + "epoch": 13.477535301668807, + "grad_norm": 1.461503505706787, + "learning_rate": 2.884296106118956e-05, + "loss": 0.6066, + "step": 10499 + }, + { + "epoch": 13.478818998716303, + "grad_norm": 1.196056842803955, + "learning_rate": 2.884253316217373e-05, + "loss": 0.5246, + "step": 10500 + }, + { + "epoch": 13.480102695763799, + "grad_norm": 3.5969812870025635, + "learning_rate": 2.8842105263157897e-05, + "loss": 0.4963, + "step": 10501 + }, + { + "epoch": 13.481386392811297, + "grad_norm": 4.245793342590332, + "learning_rate": 2.8841677364142062e-05, + "loss": 0.54, + "step": 10502 + }, + { + "epoch": 13.482670089858793, + "grad_norm": 3.967583656311035, + "learning_rate": 2.884124946512623e-05, + "loss": 0.5362, + "step": 10503 + }, + { + "epoch": 13.48395378690629, + 
"grad_norm": 0.9698159694671631, + "learning_rate": 2.88408215661104e-05, + "loss": 0.5195, + "step": 10504 + }, + { + "epoch": 13.485237483953787, + "grad_norm": 3.4528965950012207, + "learning_rate": 2.8840393667094567e-05, + "loss": 0.532, + "step": 10505 + }, + { + "epoch": 13.486521181001283, + "grad_norm": 2.076766014099121, + "learning_rate": 2.8839965768078735e-05, + "loss": 0.5473, + "step": 10506 + }, + { + "epoch": 13.487804878048781, + "grad_norm": 1.6143754720687866, + "learning_rate": 2.88395378690629e-05, + "loss": 0.5178, + "step": 10507 + }, + { + "epoch": 13.489088575096277, + "grad_norm": 2.820021152496338, + "learning_rate": 2.8839109970047072e-05, + "loss": 0.5644, + "step": 10508 + }, + { + "epoch": 13.490372272143775, + "grad_norm": 4.925491809844971, + "learning_rate": 2.8838682071031237e-05, + "loss": 0.5713, + "step": 10509 + }, + { + "epoch": 13.491655969191271, + "grad_norm": 9.611665725708008, + "learning_rate": 2.8838254172015402e-05, + "loss": 0.5339, + "step": 10510 + }, + { + "epoch": 13.492939666238767, + "grad_norm": 1.198646903038025, + "learning_rate": 2.8837826272999574e-05, + "loss": 0.5854, + "step": 10511 + }, + { + "epoch": 13.494223363286265, + "grad_norm": 1.6938155889511108, + "learning_rate": 2.883739837398374e-05, + "loss": 0.5539, + "step": 10512 + }, + { + "epoch": 13.495507060333761, + "grad_norm": 5.7672271728515625, + "learning_rate": 2.883697047496791e-05, + "loss": 0.5087, + "step": 10513 + }, + { + "epoch": 13.496790757381259, + "grad_norm": 1.0660827159881592, + "learning_rate": 2.8836542575952076e-05, + "loss": 0.5601, + "step": 10514 + }, + { + "epoch": 13.498074454428755, + "grad_norm": 1.142974615097046, + "learning_rate": 2.8836114676936244e-05, + "loss": 0.5419, + "step": 10515 + }, + { + "epoch": 13.499358151476251, + "grad_norm": 6.201892375946045, + "learning_rate": 2.8835686777920413e-05, + "loss": 0.5782, + "step": 10516 + }, + { + "epoch": 13.500641848523749, + "grad_norm": 2.0076205730438232, + 
"learning_rate": 2.8835258878904578e-05, + "loss": 0.5137, + "step": 10517 + }, + { + "epoch": 13.501925545571245, + "grad_norm": 2.574549674987793, + "learning_rate": 2.8834830979888746e-05, + "loss": 0.5301, + "step": 10518 + }, + { + "epoch": 13.503209242618741, + "grad_norm": 2.2634146213531494, + "learning_rate": 2.8834403080872915e-05, + "loss": 0.5008, + "step": 10519 + }, + { + "epoch": 13.504492939666239, + "grad_norm": 1.2558857202529907, + "learning_rate": 2.8833975181857083e-05, + "loss": 0.5537, + "step": 10520 + }, + { + "epoch": 13.505776636713735, + "grad_norm": 5.392444610595703, + "learning_rate": 2.883354728284125e-05, + "loss": 0.5565, + "step": 10521 + }, + { + "epoch": 13.507060333761233, + "grad_norm": 2.3403480052948, + "learning_rate": 2.8833119383825416e-05, + "loss": 0.515, + "step": 10522 + }, + { + "epoch": 13.508344030808729, + "grad_norm": 1.7259845733642578, + "learning_rate": 2.8832691484809585e-05, + "loss": 0.5609, + "step": 10523 + }, + { + "epoch": 13.509627727856225, + "grad_norm": 1.8410053253173828, + "learning_rate": 2.8832263585793753e-05, + "loss": 0.5696, + "step": 10524 + }, + { + "epoch": 13.510911424903723, + "grad_norm": 2.7050180435180664, + "learning_rate": 2.8831835686777922e-05, + "loss": 0.6294, + "step": 10525 + }, + { + "epoch": 13.512195121951219, + "grad_norm": 1.657111406326294, + "learning_rate": 2.8831407787762087e-05, + "loss": 0.7076, + "step": 10526 + }, + { + "epoch": 13.513478818998717, + "grad_norm": 2.6443774700164795, + "learning_rate": 2.883097988874626e-05, + "loss": 0.7958, + "step": 10527 + }, + { + "epoch": 13.514762516046213, + "grad_norm": 1.1201199293136597, + "learning_rate": 2.8830551989730424e-05, + "loss": 0.4898, + "step": 10528 + }, + { + "epoch": 13.51604621309371, + "grad_norm": 0.9358518719673157, + "learning_rate": 2.8830124090714592e-05, + "loss": 0.524, + "step": 10529 + }, + { + "epoch": 13.517329910141207, + "grad_norm": 0.9430612325668335, + "learning_rate": 
2.882969619169876e-05, + "loss": 0.5079, + "step": 10530 + }, + { + "epoch": 13.518613607188703, + "grad_norm": 1.881745457649231, + "learning_rate": 2.8829268292682925e-05, + "loss": 0.5007, + "step": 10531 + }, + { + "epoch": 13.519897304236201, + "grad_norm": 1.9954899549484253, + "learning_rate": 2.8828840393667097e-05, + "loss": 0.556, + "step": 10532 + }, + { + "epoch": 13.521181001283697, + "grad_norm": 1.1942973136901855, + "learning_rate": 2.8828412494651262e-05, + "loss": 0.5372, + "step": 10533 + }, + { + "epoch": 13.522464698331193, + "grad_norm": 1.2580941915512085, + "learning_rate": 2.882798459563543e-05, + "loss": 0.5295, + "step": 10534 + }, + { + "epoch": 13.523748395378691, + "grad_norm": 2.086454153060913, + "learning_rate": 2.88275566966196e-05, + "loss": 0.5209, + "step": 10535 + }, + { + "epoch": 13.525032092426187, + "grad_norm": 0.8589649200439453, + "learning_rate": 2.8827128797603764e-05, + "loss": 0.5315, + "step": 10536 + }, + { + "epoch": 13.526315789473685, + "grad_norm": 1.0520132780075073, + "learning_rate": 2.8826700898587936e-05, + "loss": 0.5282, + "step": 10537 + }, + { + "epoch": 13.527599486521181, + "grad_norm": 1.1113698482513428, + "learning_rate": 2.88262729995721e-05, + "loss": 0.5115, + "step": 10538 + }, + { + "epoch": 13.528883183568677, + "grad_norm": 1.1475735902786255, + "learning_rate": 2.882584510055627e-05, + "loss": 0.5484, + "step": 10539 + }, + { + "epoch": 13.530166880616175, + "grad_norm": 1.4060444831848145, + "learning_rate": 2.8825417201540438e-05, + "loss": 0.5125, + "step": 10540 + }, + { + "epoch": 13.531450577663671, + "grad_norm": 4.501863479614258, + "learning_rate": 2.8824989302524606e-05, + "loss": 0.6017, + "step": 10541 + }, + { + "epoch": 13.532734274711169, + "grad_norm": 1.836018681526184, + "learning_rate": 2.882456140350877e-05, + "loss": 0.5243, + "step": 10542 + }, + { + "epoch": 13.534017971758665, + "grad_norm": 2.4271392822265625, + "learning_rate": 2.882413350449294e-05, + "loss": 
0.5354, + "step": 10543 + }, + { + "epoch": 13.535301668806161, + "grad_norm": 1.570863127708435, + "learning_rate": 2.8823705605477108e-05, + "loss": 0.5342, + "step": 10544 + }, + { + "epoch": 13.536585365853659, + "grad_norm": 4.822537899017334, + "learning_rate": 2.8823277706461276e-05, + "loss": 0.494, + "step": 10545 + }, + { + "epoch": 13.537869062901155, + "grad_norm": 1.7844231128692627, + "learning_rate": 2.8822849807445445e-05, + "loss": 0.5139, + "step": 10546 + }, + { + "epoch": 13.539152759948653, + "grad_norm": 2.0485332012176514, + "learning_rate": 2.882242190842961e-05, + "loss": 0.5001, + "step": 10547 + }, + { + "epoch": 13.540436456996149, + "grad_norm": 0.9102346897125244, + "learning_rate": 2.882199400941378e-05, + "loss": 0.4961, + "step": 10548 + }, + { + "epoch": 13.541720154043645, + "grad_norm": 1.9473000764846802, + "learning_rate": 2.8821566110397947e-05, + "loss": 0.5249, + "step": 10549 + }, + { + "epoch": 13.543003851091143, + "grad_norm": 1.0739738941192627, + "learning_rate": 2.882113821138211e-05, + "loss": 0.4916, + "step": 10550 + }, + { + "epoch": 13.544287548138639, + "grad_norm": 1.2436243295669556, + "learning_rate": 2.8820710312366283e-05, + "loss": 0.5366, + "step": 10551 + }, + { + "epoch": 13.545571245186135, + "grad_norm": 1.5661592483520508, + "learning_rate": 2.882028241335045e-05, + "loss": 0.5485, + "step": 10552 + }, + { + "epoch": 13.546854942233633, + "grad_norm": 4.000072479248047, + "learning_rate": 2.881985451433462e-05, + "loss": 0.5582, + "step": 10553 + }, + { + "epoch": 13.54813863928113, + "grad_norm": 0.7402800917625427, + "learning_rate": 2.8819426615318785e-05, + "loss": 0.5333, + "step": 10554 + }, + { + "epoch": 13.549422336328627, + "grad_norm": 2.9244909286499023, + "learning_rate": 2.8818998716302954e-05, + "loss": 0.5123, + "step": 10555 + }, + { + "epoch": 13.550706033376123, + "grad_norm": 2.5949177742004395, + "learning_rate": 2.8818570817287122e-05, + "loss": 0.5009, + "step": 10556 + }, + { 
+ "epoch": 13.55198973042362, + "grad_norm": 1.0556608438491821, + "learning_rate": 2.8818142918271287e-05, + "loss": 0.5475, + "step": 10557 + }, + { + "epoch": 13.553273427471117, + "grad_norm": 1.2454286813735962, + "learning_rate": 2.8817715019255456e-05, + "loss": 0.5268, + "step": 10558 + }, + { + "epoch": 13.554557124518613, + "grad_norm": 1.3731876611709595, + "learning_rate": 2.8817287120239624e-05, + "loss": 0.5091, + "step": 10559 + }, + { + "epoch": 13.555840821566111, + "grad_norm": 1.823089361190796, + "learning_rate": 2.8816859221223792e-05, + "loss": 0.5499, + "step": 10560 + }, + { + "epoch": 13.557124518613607, + "grad_norm": 4.85189151763916, + "learning_rate": 2.881643132220796e-05, + "loss": 0.5088, + "step": 10561 + }, + { + "epoch": 13.558408215661103, + "grad_norm": 2.636131763458252, + "learning_rate": 2.881600342319213e-05, + "loss": 0.5303, + "step": 10562 + }, + { + "epoch": 13.559691912708601, + "grad_norm": 4.11879825592041, + "learning_rate": 2.8815575524176294e-05, + "loss": 0.5434, + "step": 10563 + }, + { + "epoch": 13.560975609756097, + "grad_norm": 2.491255521774292, + "learning_rate": 2.8815147625160463e-05, + "loss": 0.5988, + "step": 10564 + }, + { + "epoch": 13.562259306803595, + "grad_norm": 1.8852158784866333, + "learning_rate": 2.881471972614463e-05, + "loss": 0.567, + "step": 10565 + }, + { + "epoch": 13.563543003851091, + "grad_norm": 1.903996467590332, + "learning_rate": 2.8814291827128796e-05, + "loss": 0.5614, + "step": 10566 + }, + { + "epoch": 13.564826700898587, + "grad_norm": 2.0117897987365723, + "learning_rate": 2.8813863928112968e-05, + "loss": 0.5648, + "step": 10567 + }, + { + "epoch": 13.566110397946085, + "grad_norm": 1.4825291633605957, + "learning_rate": 2.8813436029097133e-05, + "loss": 0.5688, + "step": 10568 + }, + { + "epoch": 13.567394094993581, + "grad_norm": 1.4706268310546875, + "learning_rate": 2.8813008130081305e-05, + "loss": 0.6321, + "step": 10569 + }, + { + "epoch": 13.568677792041079, + 
"grad_norm": 4.187020778656006, + "learning_rate": 2.881258023106547e-05, + "loss": 0.5334, + "step": 10570 + }, + { + "epoch": 13.569961489088575, + "grad_norm": 1.5288666486740112, + "learning_rate": 2.8812152332049635e-05, + "loss": 0.5563, + "step": 10571 + }, + { + "epoch": 13.571245186136071, + "grad_norm": 2.284437417984009, + "learning_rate": 2.8811724433033807e-05, + "loss": 0.5145, + "step": 10572 + }, + { + "epoch": 13.572528883183569, + "grad_norm": 3.732964515686035, + "learning_rate": 2.881129653401797e-05, + "loss": 0.6556, + "step": 10573 + }, + { + "epoch": 13.573812580231065, + "grad_norm": 4.839802265167236, + "learning_rate": 2.881086863500214e-05, + "loss": 0.585, + "step": 10574 + }, + { + "epoch": 13.575096277278563, + "grad_norm": 4.255147933959961, + "learning_rate": 2.881044073598631e-05, + "loss": 0.6526, + "step": 10575 + }, + { + "epoch": 13.57637997432606, + "grad_norm": 2.5671303272247314, + "learning_rate": 2.8810012836970477e-05, + "loss": 0.7169, + "step": 10576 + }, + { + "epoch": 13.577663671373555, + "grad_norm": 2.045974016189575, + "learning_rate": 2.8809584937954645e-05, + "loss": 0.7532, + "step": 10577 + }, + { + "epoch": 13.578947368421053, + "grad_norm": 1.3280749320983887, + "learning_rate": 2.880915703893881e-05, + "loss": 0.497, + "step": 10578 + }, + { + "epoch": 13.58023106546855, + "grad_norm": 0.9733142256736755, + "learning_rate": 2.880872913992298e-05, + "loss": 0.4841, + "step": 10579 + }, + { + "epoch": 13.581514762516047, + "grad_norm": 1.0799885988235474, + "learning_rate": 2.8808301240907147e-05, + "loss": 0.5048, + "step": 10580 + }, + { + "epoch": 13.582798459563543, + "grad_norm": 1.2622759342193604, + "learning_rate": 2.8807873341891316e-05, + "loss": 0.5258, + "step": 10581 + }, + { + "epoch": 13.58408215661104, + "grad_norm": 2.3485138416290283, + "learning_rate": 2.880744544287548e-05, + "loss": 0.5188, + "step": 10582 + }, + { + "epoch": 13.585365853658537, + "grad_norm": 1.8599798679351807, + 
"learning_rate": 2.880701754385965e-05, + "loss": 0.501, + "step": 10583 + }, + { + "epoch": 13.586649550706033, + "grad_norm": 0.8082655072212219, + "learning_rate": 2.8806589644843817e-05, + "loss": 0.5203, + "step": 10584 + }, + { + "epoch": 13.58793324775353, + "grad_norm": 1.9956773519515991, + "learning_rate": 2.8806161745827986e-05, + "loss": 0.5553, + "step": 10585 + }, + { + "epoch": 13.589216944801027, + "grad_norm": 4.41464376449585, + "learning_rate": 2.8805733846812154e-05, + "loss": 0.5221, + "step": 10586 + }, + { + "epoch": 13.590500641848523, + "grad_norm": 1.4518072605133057, + "learning_rate": 2.880530594779632e-05, + "loss": 0.5266, + "step": 10587 + }, + { + "epoch": 13.591784338896021, + "grad_norm": 18.877485275268555, + "learning_rate": 2.880487804878049e-05, + "loss": 0.559, + "step": 10588 + }, + { + "epoch": 13.593068035943517, + "grad_norm": 0.9480569958686829, + "learning_rate": 2.8804450149764656e-05, + "loss": 0.5772, + "step": 10589 + }, + { + "epoch": 13.594351732991013, + "grad_norm": 1.0729297399520874, + "learning_rate": 2.880402225074882e-05, + "loss": 0.5302, + "step": 10590 + }, + { + "epoch": 13.595635430038511, + "grad_norm": 2.242032527923584, + "learning_rate": 2.8803594351732993e-05, + "loss": 0.5212, + "step": 10591 + }, + { + "epoch": 13.596919127086007, + "grad_norm": 2.2799439430236816, + "learning_rate": 2.8803166452717158e-05, + "loss": 0.533, + "step": 10592 + }, + { + "epoch": 13.598202824133505, + "grad_norm": 1.5435386896133423, + "learning_rate": 2.880273855370133e-05, + "loss": 0.5159, + "step": 10593 + }, + { + "epoch": 13.599486521181001, + "grad_norm": 2.146298885345459, + "learning_rate": 2.8802310654685495e-05, + "loss": 0.5656, + "step": 10594 + }, + { + "epoch": 13.600770218228497, + "grad_norm": 0.8713536858558655, + "learning_rate": 2.8801882755669663e-05, + "loss": 0.5257, + "step": 10595 + }, + { + "epoch": 13.602053915275995, + "grad_norm": 3.352277994155884, + "learning_rate": 
2.880145485665383e-05, + "loss": 0.5231, + "step": 10596 + }, + { + "epoch": 13.603337612323491, + "grad_norm": 2.3111934661865234, + "learning_rate": 2.8801026957637997e-05, + "loss": 0.5174, + "step": 10597 + }, + { + "epoch": 13.60462130937099, + "grad_norm": 0.9030321836471558, + "learning_rate": 2.8800599058622165e-05, + "loss": 0.5449, + "step": 10598 + }, + { + "epoch": 13.605905006418485, + "grad_norm": 11.137877464294434, + "learning_rate": 2.8800171159606333e-05, + "loss": 0.5571, + "step": 10599 + }, + { + "epoch": 13.607188703465981, + "grad_norm": 5.252739906311035, + "learning_rate": 2.8799743260590502e-05, + "loss": 0.5295, + "step": 10600 + }, + { + "epoch": 13.60847240051348, + "grad_norm": 0.7681583762168884, + "learning_rate": 2.879931536157467e-05, + "loss": 0.5041, + "step": 10601 + }, + { + "epoch": 13.609756097560975, + "grad_norm": 1.402940273284912, + "learning_rate": 2.879888746255884e-05, + "loss": 0.5487, + "step": 10602 + }, + { + "epoch": 13.611039794608473, + "grad_norm": 1.1466686725616455, + "learning_rate": 2.8798459563543004e-05, + "loss": 0.5452, + "step": 10603 + }, + { + "epoch": 13.61232349165597, + "grad_norm": 1.3497035503387451, + "learning_rate": 2.8798031664527172e-05, + "loss": 0.5445, + "step": 10604 + }, + { + "epoch": 13.613607188703465, + "grad_norm": 1.8549636602401733, + "learning_rate": 2.879760376551134e-05, + "loss": 0.5411, + "step": 10605 + }, + { + "epoch": 13.614890885750963, + "grad_norm": 1.2944704294204712, + "learning_rate": 2.8797175866495505e-05, + "loss": 0.4833, + "step": 10606 + }, + { + "epoch": 13.61617458279846, + "grad_norm": 1.2126442193984985, + "learning_rate": 2.8796747967479677e-05, + "loss": 0.5403, + "step": 10607 + }, + { + "epoch": 13.617458279845957, + "grad_norm": 0.935337483882904, + "learning_rate": 2.8796320068463842e-05, + "loss": 0.5886, + "step": 10608 + }, + { + "epoch": 13.618741976893453, + "grad_norm": 1.6106445789337158, + "learning_rate": 2.8795892169448014e-05, + "loss": 
0.5031, + "step": 10609 + }, + { + "epoch": 13.62002567394095, + "grad_norm": 4.395267486572266, + "learning_rate": 2.879546427043218e-05, + "loss": 0.5757, + "step": 10610 + }, + { + "epoch": 13.621309370988447, + "grad_norm": 1.5180190801620483, + "learning_rate": 2.8795036371416344e-05, + "loss": 0.4545, + "step": 10611 + }, + { + "epoch": 13.622593068035943, + "grad_norm": 1.3518240451812744, + "learning_rate": 2.8794608472400516e-05, + "loss": 0.53, + "step": 10612 + }, + { + "epoch": 13.623876765083441, + "grad_norm": 2.423752546310425, + "learning_rate": 2.879418057338468e-05, + "loss": 0.5231, + "step": 10613 + }, + { + "epoch": 13.625160462130937, + "grad_norm": 2.0707244873046875, + "learning_rate": 2.879375267436885e-05, + "loss": 0.53, + "step": 10614 + }, + { + "epoch": 13.626444159178433, + "grad_norm": 1.9718432426452637, + "learning_rate": 2.8793324775353018e-05, + "loss": 0.544, + "step": 10615 + }, + { + "epoch": 13.627727856225931, + "grad_norm": 1.042850136756897, + "learning_rate": 2.8792896876337186e-05, + "loss": 0.5346, + "step": 10616 + }, + { + "epoch": 13.629011553273427, + "grad_norm": 1.7254106998443604, + "learning_rate": 2.8792468977321355e-05, + "loss": 0.5726, + "step": 10617 + }, + { + "epoch": 13.630295250320923, + "grad_norm": 1.968286156654358, + "learning_rate": 2.879204107830552e-05, + "loss": 0.5726, + "step": 10618 + }, + { + "epoch": 13.631578947368421, + "grad_norm": 1.500952124595642, + "learning_rate": 2.8791613179289688e-05, + "loss": 0.5928, + "step": 10619 + }, + { + "epoch": 13.632862644415917, + "grad_norm": 1.7318336963653564, + "learning_rate": 2.8791185280273856e-05, + "loss": 0.5605, + "step": 10620 + }, + { + "epoch": 13.634146341463415, + "grad_norm": 2.1085572242736816, + "learning_rate": 2.8790757381258025e-05, + "loss": 0.5555, + "step": 10621 + }, + { + "epoch": 13.635430038510911, + "grad_norm": 3.275322675704956, + "learning_rate": 2.879032948224219e-05, + "loss": 0.6044, + "step": 10622 + }, + { + 
"epoch": 13.63671373555841, + "grad_norm": 3.2123637199401855, + "learning_rate": 2.8789901583226362e-05, + "loss": 0.615, + "step": 10623 + }, + { + "epoch": 13.637997432605905, + "grad_norm": 2.079171657562256, + "learning_rate": 2.8789473684210527e-05, + "loss": 0.5289, + "step": 10624 + }, + { + "epoch": 13.639281129653401, + "grad_norm": 3.1727170944213867, + "learning_rate": 2.8789045785194695e-05, + "loss": 0.6698, + "step": 10625 + }, + { + "epoch": 13.6405648267009, + "grad_norm": 1.9401359558105469, + "learning_rate": 2.8788617886178864e-05, + "loss": 0.5914, + "step": 10626 + }, + { + "epoch": 13.641848523748395, + "grad_norm": 2.692328691482544, + "learning_rate": 2.878818998716303e-05, + "loss": 0.8282, + "step": 10627 + }, + { + "epoch": 13.643132220795891, + "grad_norm": 1.376044511795044, + "learning_rate": 2.87877620881472e-05, + "loss": 0.5024, + "step": 10628 + }, + { + "epoch": 13.64441591784339, + "grad_norm": 1.1824026107788086, + "learning_rate": 2.8787334189131365e-05, + "loss": 0.4786, + "step": 10629 + }, + { + "epoch": 13.645699614890885, + "grad_norm": 1.319878339767456, + "learning_rate": 2.8786906290115534e-05, + "loss": 0.52, + "step": 10630 + }, + { + "epoch": 13.646983311938383, + "grad_norm": 1.0572088956832886, + "learning_rate": 2.8786478391099702e-05, + "loss": 0.5352, + "step": 10631 + }, + { + "epoch": 13.64826700898588, + "grad_norm": 1.9440149068832397, + "learning_rate": 2.8786050492083867e-05, + "loss": 0.5416, + "step": 10632 + }, + { + "epoch": 13.649550706033375, + "grad_norm": 1.6076663732528687, + "learning_rate": 2.8785622593068036e-05, + "loss": 0.4948, + "step": 10633 + }, + { + "epoch": 13.650834403080873, + "grad_norm": 1.4261404275894165, + "learning_rate": 2.8785194694052204e-05, + "loss": 0.5228, + "step": 10634 + }, + { + "epoch": 13.65211810012837, + "grad_norm": 1.1186206340789795, + "learning_rate": 2.8784766795036372e-05, + "loss": 0.4995, + "step": 10635 + }, + { + "epoch": 13.653401797175867, + 
"grad_norm": 1.2856940031051636, + "learning_rate": 2.878433889602054e-05, + "loss": 0.4931, + "step": 10636 + }, + { + "epoch": 13.654685494223363, + "grad_norm": 1.2703051567077637, + "learning_rate": 2.878391099700471e-05, + "loss": 0.5285, + "step": 10637 + }, + { + "epoch": 13.65596919127086, + "grad_norm": 1.9451978206634521, + "learning_rate": 2.8783483097988874e-05, + "loss": 0.4832, + "step": 10638 + }, + { + "epoch": 13.657252888318357, + "grad_norm": 19.276554107666016, + "learning_rate": 2.8783055198973043e-05, + "loss": 0.571, + "step": 10639 + }, + { + "epoch": 13.658536585365853, + "grad_norm": 1.1837276220321655, + "learning_rate": 2.878262729995721e-05, + "loss": 0.5645, + "step": 10640 + }, + { + "epoch": 13.659820282413351, + "grad_norm": 3.581803560256958, + "learning_rate": 2.8782199400941376e-05, + "loss": 0.5375, + "step": 10641 + }, + { + "epoch": 13.661103979460847, + "grad_norm": 5.102277755737305, + "learning_rate": 2.8781771501925548e-05, + "loss": 0.5417, + "step": 10642 + }, + { + "epoch": 13.662387676508343, + "grad_norm": 3.5418851375579834, + "learning_rate": 2.8781343602909713e-05, + "loss": 0.557, + "step": 10643 + }, + { + "epoch": 13.663671373555841, + "grad_norm": 0.9024901390075684, + "learning_rate": 2.878091570389388e-05, + "loss": 0.5215, + "step": 10644 + }, + { + "epoch": 13.664955070603337, + "grad_norm": 1.1566731929779053, + "learning_rate": 2.878048780487805e-05, + "loss": 0.5011, + "step": 10645 + }, + { + "epoch": 13.666238767650835, + "grad_norm": 1.2098543643951416, + "learning_rate": 2.8780059905862215e-05, + "loss": 0.5235, + "step": 10646 + }, + { + "epoch": 13.667522464698331, + "grad_norm": 1.7659392356872559, + "learning_rate": 2.8779632006846387e-05, + "loss": 0.4781, + "step": 10647 + }, + { + "epoch": 13.668806161745827, + "grad_norm": 1.3273202180862427, + "learning_rate": 2.877920410783055e-05, + "loss": 0.5029, + "step": 10648 + }, + { + "epoch": 13.670089858793325, + "grad_norm": 1.6965765953063965, + 
"learning_rate": 2.877877620881472e-05, + "loss": 0.4807, + "step": 10649 + }, + { + "epoch": 13.671373555840821, + "grad_norm": 2.0725486278533936, + "learning_rate": 2.877834830979889e-05, + "loss": 0.5329, + "step": 10650 + }, + { + "epoch": 13.672657252888317, + "grad_norm": 3.439434766769409, + "learning_rate": 2.8777920410783054e-05, + "loss": 0.5576, + "step": 10651 + }, + { + "epoch": 13.673940949935815, + "grad_norm": 1.0845667123794556, + "learning_rate": 2.8777492511767225e-05, + "loss": 0.5365, + "step": 10652 + }, + { + "epoch": 13.675224646983311, + "grad_norm": 2.5816092491149902, + "learning_rate": 2.877706461275139e-05, + "loss": 0.5195, + "step": 10653 + }, + { + "epoch": 13.67650834403081, + "grad_norm": 1.0942715406417847, + "learning_rate": 2.877663671373556e-05, + "loss": 0.5367, + "step": 10654 + }, + { + "epoch": 13.677792041078305, + "grad_norm": 1.4724656343460083, + "learning_rate": 2.8776208814719727e-05, + "loss": 0.5064, + "step": 10655 + }, + { + "epoch": 13.679075738125803, + "grad_norm": 1.5784926414489746, + "learning_rate": 2.8775780915703896e-05, + "loss": 0.5245, + "step": 10656 + }, + { + "epoch": 13.6803594351733, + "grad_norm": 2.197209119796753, + "learning_rate": 2.877535301668806e-05, + "loss": 0.5297, + "step": 10657 + }, + { + "epoch": 13.681643132220795, + "grad_norm": 1.7515298128128052, + "learning_rate": 2.877492511767223e-05, + "loss": 0.5639, + "step": 10658 + }, + { + "epoch": 13.682926829268293, + "grad_norm": 4.3180317878723145, + "learning_rate": 2.8774497218656397e-05, + "loss": 0.5184, + "step": 10659 + }, + { + "epoch": 13.68421052631579, + "grad_norm": 3.391530752182007, + "learning_rate": 2.8774069319640566e-05, + "loss": 0.5088, + "step": 10660 + }, + { + "epoch": 13.685494223363285, + "grad_norm": 1.5190857648849487, + "learning_rate": 2.8773641420624734e-05, + "loss": 0.6168, + "step": 10661 + }, + { + "epoch": 13.686777920410783, + "grad_norm": 2.241471529006958, + "learning_rate": 
2.87732135216089e-05, + "loss": 0.5412, + "step": 10662 + }, + { + "epoch": 13.68806161745828, + "grad_norm": 1.3803330659866333, + "learning_rate": 2.877278562259307e-05, + "loss": 0.5769, + "step": 10663 + }, + { + "epoch": 13.689345314505777, + "grad_norm": 1.6163238286972046, + "learning_rate": 2.8772357723577236e-05, + "loss": 0.556, + "step": 10664 + }, + { + "epoch": 13.690629011553273, + "grad_norm": 1.3140676021575928, + "learning_rate": 2.87719298245614e-05, + "loss": 0.5689, + "step": 10665 + }, + { + "epoch": 13.69191270860077, + "grad_norm": 1.2869337797164917, + "learning_rate": 2.8771501925545573e-05, + "loss": 0.5482, + "step": 10666 + }, + { + "epoch": 13.693196405648267, + "grad_norm": 2.724048137664795, + "learning_rate": 2.8771074026529738e-05, + "loss": 0.5478, + "step": 10667 + }, + { + "epoch": 13.694480102695763, + "grad_norm": 5.338704586029053, + "learning_rate": 2.877064612751391e-05, + "loss": 0.5611, + "step": 10668 + }, + { + "epoch": 13.695763799743261, + "grad_norm": 2.192168951034546, + "learning_rate": 2.8770218228498075e-05, + "loss": 0.493, + "step": 10669 + }, + { + "epoch": 13.697047496790757, + "grad_norm": 2.5076777935028076, + "learning_rate": 2.8769790329482243e-05, + "loss": 0.5654, + "step": 10670 + }, + { + "epoch": 13.698331193838253, + "grad_norm": 0.8218163847923279, + "learning_rate": 2.876936243046641e-05, + "loss": 0.5607, + "step": 10671 + }, + { + "epoch": 13.699614890885751, + "grad_norm": 1.9944323301315308, + "learning_rate": 2.8768934531450577e-05, + "loss": 0.5701, + "step": 10672 + }, + { + "epoch": 13.700898587933247, + "grad_norm": 1.9152331352233887, + "learning_rate": 2.8768506632434745e-05, + "loss": 0.6268, + "step": 10673 + }, + { + "epoch": 13.702182284980745, + "grad_norm": 4.133584976196289, + "learning_rate": 2.8768078733418913e-05, + "loss": 0.6058, + "step": 10674 + }, + { + "epoch": 13.703465982028241, + "grad_norm": 2.2349693775177, + "learning_rate": 2.8767650834403082e-05, + "loss": 0.605, 
+ "step": 10675 + }, + { + "epoch": 13.704749679075737, + "grad_norm": 2.4154436588287354, + "learning_rate": 2.876722293538725e-05, + "loss": 0.6693, + "step": 10676 + }, + { + "epoch": 13.706033376123235, + "grad_norm": 1.9223864078521729, + "learning_rate": 2.876679503637142e-05, + "loss": 0.7132, + "step": 10677 + }, + { + "epoch": 13.707317073170731, + "grad_norm": 1.1808995008468628, + "learning_rate": 2.8766367137355584e-05, + "loss": 0.5203, + "step": 10678 + }, + { + "epoch": 13.70860077021823, + "grad_norm": 0.889016330242157, + "learning_rate": 2.8765939238339752e-05, + "loss": 0.5362, + "step": 10679 + }, + { + "epoch": 13.709884467265725, + "grad_norm": 1.304450273513794, + "learning_rate": 2.876551133932392e-05, + "loss": 0.5104, + "step": 10680 + }, + { + "epoch": 13.711168164313221, + "grad_norm": 1.352007508277893, + "learning_rate": 2.8765083440308086e-05, + "loss": 0.5557, + "step": 10681 + }, + { + "epoch": 13.71245186136072, + "grad_norm": 3.046182870864868, + "learning_rate": 2.8764655541292257e-05, + "loss": 0.5129, + "step": 10682 + }, + { + "epoch": 13.713735558408215, + "grad_norm": 2.3036723136901855, + "learning_rate": 2.8764227642276422e-05, + "loss": 0.5426, + "step": 10683 + }, + { + "epoch": 13.715019255455712, + "grad_norm": 2.579922676086426, + "learning_rate": 2.8763799743260594e-05, + "loss": 0.5096, + "step": 10684 + }, + { + "epoch": 13.71630295250321, + "grad_norm": 1.4016352891921997, + "learning_rate": 2.876337184424476e-05, + "loss": 0.5421, + "step": 10685 + }, + { + "epoch": 13.717586649550706, + "grad_norm": 1.1968270540237427, + "learning_rate": 2.8762943945228924e-05, + "loss": 0.4863, + "step": 10686 + }, + { + "epoch": 13.718870346598203, + "grad_norm": 1.4557491540908813, + "learning_rate": 2.8762516046213096e-05, + "loss": 0.5272, + "step": 10687 + }, + { + "epoch": 13.7201540436457, + "grad_norm": 1.8298144340515137, + "learning_rate": 2.876208814719726e-05, + "loss": 0.5058, + "step": 10688 + }, + { + "epoch": 
13.721437740693197, + "grad_norm": 2.435971736907959, + "learning_rate": 2.876166024818143e-05, + "loss": 0.4589, + "step": 10689 + }, + { + "epoch": 13.722721437740693, + "grad_norm": 1.1328949928283691, + "learning_rate": 2.8761232349165598e-05, + "loss": 0.4791, + "step": 10690 + }, + { + "epoch": 13.72400513478819, + "grad_norm": 0.9771497249603271, + "learning_rate": 2.8760804450149766e-05, + "loss": 0.4989, + "step": 10691 + }, + { + "epoch": 13.725288831835687, + "grad_norm": 2.194722890853882, + "learning_rate": 2.8760376551133935e-05, + "loss": 0.5371, + "step": 10692 + }, + { + "epoch": 13.726572528883183, + "grad_norm": 1.023658275604248, + "learning_rate": 2.87599486521181e-05, + "loss": 0.5376, + "step": 10693 + }, + { + "epoch": 13.72785622593068, + "grad_norm": 1.1920324563980103, + "learning_rate": 2.8759520753102268e-05, + "loss": 0.5176, + "step": 10694 + }, + { + "epoch": 13.729139922978177, + "grad_norm": 1.2794963121414185, + "learning_rate": 2.8759092854086437e-05, + "loss": 0.4746, + "step": 10695 + }, + { + "epoch": 13.730423620025674, + "grad_norm": 1.3392630815505981, + "learning_rate": 2.8758664955070605e-05, + "loss": 0.5257, + "step": 10696 + }, + { + "epoch": 13.731707317073171, + "grad_norm": 1.899820327758789, + "learning_rate": 2.875823705605477e-05, + "loss": 0.53, + "step": 10697 + }, + { + "epoch": 13.732991014120667, + "grad_norm": 1.5316716432571411, + "learning_rate": 2.8757809157038942e-05, + "loss": 0.4957, + "step": 10698 + }, + { + "epoch": 13.734274711168164, + "grad_norm": 1.2014566659927368, + "learning_rate": 2.8757381258023107e-05, + "loss": 0.4997, + "step": 10699 + }, + { + "epoch": 13.735558408215661, + "grad_norm": 2.2860729694366455, + "learning_rate": 2.8756953359007275e-05, + "loss": 0.4689, + "step": 10700 + }, + { + "epoch": 13.736842105263158, + "grad_norm": 1.6411021947860718, + "learning_rate": 2.8756525459991444e-05, + "loss": 0.5096, + "step": 10701 + }, + { + "epoch": 13.738125802310655, + "grad_norm": 
3.7893056869506836, + "learning_rate": 2.875609756097561e-05, + "loss": 0.5587, + "step": 10702 + }, + { + "epoch": 13.739409499358151, + "grad_norm": 2.3557279109954834, + "learning_rate": 2.875566966195978e-05, + "loss": 0.5589, + "step": 10703 + }, + { + "epoch": 13.740693196405648, + "grad_norm": 2.104902982711792, + "learning_rate": 2.8755241762943945e-05, + "loss": 0.5068, + "step": 10704 + }, + { + "epoch": 13.741976893453145, + "grad_norm": 2.720259666442871, + "learning_rate": 2.8754813863928114e-05, + "loss": 0.5494, + "step": 10705 + }, + { + "epoch": 13.743260590500642, + "grad_norm": 1.2171951532363892, + "learning_rate": 2.8754385964912282e-05, + "loss": 0.5437, + "step": 10706 + }, + { + "epoch": 13.74454428754814, + "grad_norm": 1.6342319250106812, + "learning_rate": 2.8753958065896447e-05, + "loss": 0.4809, + "step": 10707 + }, + { + "epoch": 13.745827984595635, + "grad_norm": 10.53164005279541, + "learning_rate": 2.875353016688062e-05, + "loss": 0.5463, + "step": 10708 + }, + { + "epoch": 13.747111681643132, + "grad_norm": 1.226313591003418, + "learning_rate": 2.8753102267864784e-05, + "loss": 0.5323, + "step": 10709 + }, + { + "epoch": 13.74839537869063, + "grad_norm": 1.4420229196548462, + "learning_rate": 2.8752674368848953e-05, + "loss": 0.5717, + "step": 10710 + }, + { + "epoch": 13.749679075738126, + "grad_norm": 1.7240478992462158, + "learning_rate": 2.875224646983312e-05, + "loss": 0.5485, + "step": 10711 + }, + { + "epoch": 13.750962772785623, + "grad_norm": 2.9718286991119385, + "learning_rate": 2.8751818570817286e-05, + "loss": 0.5491, + "step": 10712 + }, + { + "epoch": 13.75224646983312, + "grad_norm": 0.9787531495094299, + "learning_rate": 2.8751390671801454e-05, + "loss": 0.5114, + "step": 10713 + }, + { + "epoch": 13.753530166880616, + "grad_norm": 2.0146231651306152, + "learning_rate": 2.8750962772785623e-05, + "loss": 0.5157, + "step": 10714 + }, + { + "epoch": 13.754813863928113, + "grad_norm": 0.863540530204773, + 
"learning_rate": 2.875053487376979e-05, + "loss": 0.5619, + "step": 10715 + }, + { + "epoch": 13.75609756097561, + "grad_norm": 1.5061110258102417, + "learning_rate": 2.875010697475396e-05, + "loss": 0.5864, + "step": 10716 + }, + { + "epoch": 13.757381258023106, + "grad_norm": 1.5420711040496826, + "learning_rate": 2.8749679075738128e-05, + "loss": 0.5036, + "step": 10717 + }, + { + "epoch": 13.758664955070603, + "grad_norm": 1.9112141132354736, + "learning_rate": 2.8749251176722293e-05, + "loss": 0.5547, + "step": 10718 + }, + { + "epoch": 13.7599486521181, + "grad_norm": 1.4871467351913452, + "learning_rate": 2.874882327770646e-05, + "loss": 0.5637, + "step": 10719 + }, + { + "epoch": 13.761232349165597, + "grad_norm": 2.1456775665283203, + "learning_rate": 2.874839537869063e-05, + "loss": 0.6221, + "step": 10720 + }, + { + "epoch": 13.762516046213094, + "grad_norm": 1.441174864768982, + "learning_rate": 2.8747967479674795e-05, + "loss": 0.519, + "step": 10721 + }, + { + "epoch": 13.763799743260591, + "grad_norm": 1.3693028688430786, + "learning_rate": 2.8747539580658967e-05, + "loss": 0.6375, + "step": 10722 + }, + { + "epoch": 13.765083440308088, + "grad_norm": 2.611083507537842, + "learning_rate": 2.8747111681643132e-05, + "loss": 0.5553, + "step": 10723 + }, + { + "epoch": 13.766367137355584, + "grad_norm": 1.4184110164642334, + "learning_rate": 2.8746683782627304e-05, + "loss": 0.5547, + "step": 10724 + }, + { + "epoch": 13.767650834403081, + "grad_norm": 4.411312580108643, + "learning_rate": 2.874625588361147e-05, + "loss": 0.6129, + "step": 10725 + }, + { + "epoch": 13.768934531450578, + "grad_norm": 2.4106028079986572, + "learning_rate": 2.8745827984595634e-05, + "loss": 0.6781, + "step": 10726 + }, + { + "epoch": 13.770218228498074, + "grad_norm": 1.8111367225646973, + "learning_rate": 2.8745400085579805e-05, + "loss": 0.7536, + "step": 10727 + }, + { + "epoch": 13.771501925545572, + "grad_norm": 2.5339832305908203, + "learning_rate": 
2.874497218656397e-05, + "loss": 0.5144, + "step": 10728 + }, + { + "epoch": 13.772785622593068, + "grad_norm": 0.8970808982849121, + "learning_rate": 2.874454428754814e-05, + "loss": 0.4764, + "step": 10729 + }, + { + "epoch": 13.774069319640565, + "grad_norm": 0.6772329807281494, + "learning_rate": 2.8744116388532307e-05, + "loss": 0.5147, + "step": 10730 + }, + { + "epoch": 13.775353016688062, + "grad_norm": 1.946805477142334, + "learning_rate": 2.8743688489516476e-05, + "loss": 0.5902, + "step": 10731 + }, + { + "epoch": 13.776636713735558, + "grad_norm": 1.5419436693191528, + "learning_rate": 2.8743260590500644e-05, + "loss": 0.5034, + "step": 10732 + }, + { + "epoch": 13.777920410783056, + "grad_norm": 1.874666452407837, + "learning_rate": 2.874283269148481e-05, + "loss": 0.508, + "step": 10733 + }, + { + "epoch": 13.779204107830552, + "grad_norm": 4.1636643409729, + "learning_rate": 2.8742404792468977e-05, + "loss": 0.5033, + "step": 10734 + }, + { + "epoch": 13.78048780487805, + "grad_norm": 1.4229708909988403, + "learning_rate": 2.8741976893453146e-05, + "loss": 0.4919, + "step": 10735 + }, + { + "epoch": 13.781771501925546, + "grad_norm": 2.2667086124420166, + "learning_rate": 2.8741548994437314e-05, + "loss": 0.5816, + "step": 10736 + }, + { + "epoch": 13.783055198973042, + "grad_norm": 1.7241579294204712, + "learning_rate": 2.874112109542148e-05, + "loss": 0.5552, + "step": 10737 + }, + { + "epoch": 13.78433889602054, + "grad_norm": 2.2958898544311523, + "learning_rate": 2.874069319640565e-05, + "loss": 0.5176, + "step": 10738 + }, + { + "epoch": 13.785622593068036, + "grad_norm": 1.1395394802093506, + "learning_rate": 2.8740265297389816e-05, + "loss": 0.5164, + "step": 10739 + }, + { + "epoch": 13.786906290115533, + "grad_norm": 1.2842944860458374, + "learning_rate": 2.8739837398373985e-05, + "loss": 0.5455, + "step": 10740 + }, + { + "epoch": 13.78818998716303, + "grad_norm": 2.835390567779541, + "learning_rate": 2.8739409499358153e-05, + "loss": 
0.4859, + "step": 10741 + }, + { + "epoch": 13.789473684210526, + "grad_norm": 1.072845458984375, + "learning_rate": 2.8738981600342318e-05, + "loss": 0.5506, + "step": 10742 + }, + { + "epoch": 13.790757381258024, + "grad_norm": 0.9520222544670105, + "learning_rate": 2.873855370132649e-05, + "loss": 0.5238, + "step": 10743 + }, + { + "epoch": 13.79204107830552, + "grad_norm": 3.5437779426574707, + "learning_rate": 2.8738125802310655e-05, + "loss": 0.5232, + "step": 10744 + }, + { + "epoch": 13.793324775353017, + "grad_norm": 1.853722095489502, + "learning_rate": 2.8737697903294823e-05, + "loss": 0.5531, + "step": 10745 + }, + { + "epoch": 13.794608472400514, + "grad_norm": 0.8901689648628235, + "learning_rate": 2.873727000427899e-05, + "loss": 0.516, + "step": 10746 + }, + { + "epoch": 13.79589216944801, + "grad_norm": 1.5623533725738525, + "learning_rate": 2.8736842105263157e-05, + "loss": 0.5524, + "step": 10747 + }, + { + "epoch": 13.797175866495508, + "grad_norm": 1.7902789115905762, + "learning_rate": 2.873641420624733e-05, + "loss": 0.5295, + "step": 10748 + }, + { + "epoch": 13.798459563543004, + "grad_norm": 1.0693840980529785, + "learning_rate": 2.8735986307231493e-05, + "loss": 0.5151, + "step": 10749 + }, + { + "epoch": 13.7997432605905, + "grad_norm": 2.707484006881714, + "learning_rate": 2.8735558408215662e-05, + "loss": 0.5202, + "step": 10750 + }, + { + "epoch": 13.801026957637998, + "grad_norm": 1.0424991846084595, + "learning_rate": 2.873513050919983e-05, + "loss": 0.4817, + "step": 10751 + }, + { + "epoch": 13.802310654685494, + "grad_norm": 8.067072868347168, + "learning_rate": 2.8734702610184e-05, + "loss": 0.5082, + "step": 10752 + }, + { + "epoch": 13.803594351732992, + "grad_norm": 1.4244598150253296, + "learning_rate": 2.8734274711168164e-05, + "loss": 0.5328, + "step": 10753 + }, + { + "epoch": 13.804878048780488, + "grad_norm": 1.1404426097869873, + "learning_rate": 2.8733846812152332e-05, + "loss": 0.4928, + "step": 10754 + }, + { + 
"epoch": 13.806161745827985, + "grad_norm": 0.9048909544944763, + "learning_rate": 2.87334189131365e-05, + "loss": 0.4834, + "step": 10755 + }, + { + "epoch": 13.807445442875482, + "grad_norm": 1.1021792888641357, + "learning_rate": 2.873299101412067e-05, + "loss": 0.5266, + "step": 10756 + }, + { + "epoch": 13.808729139922978, + "grad_norm": 2.5149405002593994, + "learning_rate": 2.8732563115104837e-05, + "loss": 0.5314, + "step": 10757 + }, + { + "epoch": 13.810012836970476, + "grad_norm": 1.1361819505691528, + "learning_rate": 2.8732135216089002e-05, + "loss": 0.5361, + "step": 10758 + }, + { + "epoch": 13.811296534017972, + "grad_norm": 4.156093120574951, + "learning_rate": 2.8731707317073174e-05, + "loss": 0.5014, + "step": 10759 + }, + { + "epoch": 13.812580231065468, + "grad_norm": 1.7270938158035278, + "learning_rate": 2.873127941805734e-05, + "loss": 0.5519, + "step": 10760 + }, + { + "epoch": 13.813863928112966, + "grad_norm": 1.3017231225967407, + "learning_rate": 2.8730851519041504e-05, + "loss": 0.549, + "step": 10761 + }, + { + "epoch": 13.815147625160462, + "grad_norm": 1.3897309303283691, + "learning_rate": 2.8730423620025676e-05, + "loss": 0.5132, + "step": 10762 + }, + { + "epoch": 13.81643132220796, + "grad_norm": 1.6199381351470947, + "learning_rate": 2.872999572100984e-05, + "loss": 0.5398, + "step": 10763 + }, + { + "epoch": 13.817715019255456, + "grad_norm": 1.558971643447876, + "learning_rate": 2.8729567821994013e-05, + "loss": 0.5506, + "step": 10764 + }, + { + "epoch": 13.818998716302952, + "grad_norm": 1.0488100051879883, + "learning_rate": 2.8729139922978178e-05, + "loss": 0.5704, + "step": 10765 + }, + { + "epoch": 13.82028241335045, + "grad_norm": 2.0910871028900146, + "learning_rate": 2.8728712023962346e-05, + "loss": 0.5502, + "step": 10766 + }, + { + "epoch": 13.821566110397946, + "grad_norm": 0.987241268157959, + "learning_rate": 2.8728284124946515e-05, + "loss": 0.5554, + "step": 10767 + }, + { + "epoch": 13.822849807445444, + 
"grad_norm": 2.954702854156494, + "learning_rate": 2.872785622593068e-05, + "loss": 0.5824, + "step": 10768 + }, + { + "epoch": 13.82413350449294, + "grad_norm": 0.9040500521659851, + "learning_rate": 2.8727428326914848e-05, + "loss": 0.5681, + "step": 10769 + }, + { + "epoch": 13.825417201540436, + "grad_norm": 7.676657676696777, + "learning_rate": 2.8727000427899017e-05, + "loss": 0.6092, + "step": 10770 + }, + { + "epoch": 13.826700898587934, + "grad_norm": 1.5476473569869995, + "learning_rate": 2.8726572528883185e-05, + "loss": 0.5966, + "step": 10771 + }, + { + "epoch": 13.82798459563543, + "grad_norm": 1.4933228492736816, + "learning_rate": 2.8726144629867353e-05, + "loss": 0.5623, + "step": 10772 + }, + { + "epoch": 13.829268292682928, + "grad_norm": 2.4368252754211426, + "learning_rate": 2.872571673085152e-05, + "loss": 0.5563, + "step": 10773 + }, + { + "epoch": 13.830551989730424, + "grad_norm": 4.6212944984436035, + "learning_rate": 2.8725288831835687e-05, + "loss": 0.6245, + "step": 10774 + }, + { + "epoch": 13.83183568677792, + "grad_norm": 5.562192440032959, + "learning_rate": 2.8724860932819855e-05, + "loss": 0.6021, + "step": 10775 + }, + { + "epoch": 13.833119383825418, + "grad_norm": 2.3365068435668945, + "learning_rate": 2.8724433033804024e-05, + "loss": 0.6409, + "step": 10776 + }, + { + "epoch": 13.834403080872914, + "grad_norm": 2.4480905532836914, + "learning_rate": 2.872400513478819e-05, + "loss": 0.7572, + "step": 10777 + }, + { + "epoch": 13.835686777920412, + "grad_norm": 1.592182993888855, + "learning_rate": 2.872357723577236e-05, + "loss": 0.4864, + "step": 10778 + }, + { + "epoch": 13.836970474967908, + "grad_norm": 0.9260572791099548, + "learning_rate": 2.8723149336756526e-05, + "loss": 0.4821, + "step": 10779 + }, + { + "epoch": 13.838254172015404, + "grad_norm": 1.0320020914077759, + "learning_rate": 2.8722721437740694e-05, + "loss": 0.5183, + "step": 10780 + }, + { + "epoch": 13.839537869062902, + "grad_norm": 0.9161650538444519, + 
"learning_rate": 2.8722293538724862e-05, + "loss": 0.5241, + "step": 10781 + }, + { + "epoch": 13.840821566110398, + "grad_norm": 1.7650259733200073, + "learning_rate": 2.8721865639709027e-05, + "loss": 0.5765, + "step": 10782 + }, + { + "epoch": 13.842105263157894, + "grad_norm": 1.813214659690857, + "learning_rate": 2.87214377406932e-05, + "loss": 0.5679, + "step": 10783 + }, + { + "epoch": 13.843388960205392, + "grad_norm": 4.400651454925537, + "learning_rate": 2.8721009841677364e-05, + "loss": 0.5111, + "step": 10784 + }, + { + "epoch": 13.844672657252888, + "grad_norm": 2.1185567378997803, + "learning_rate": 2.8720581942661533e-05, + "loss": 0.5158, + "step": 10785 + }, + { + "epoch": 13.845956354300386, + "grad_norm": 1.9867016077041626, + "learning_rate": 2.87201540436457e-05, + "loss": 0.5177, + "step": 10786 + }, + { + "epoch": 13.847240051347882, + "grad_norm": 2.6969733238220215, + "learning_rate": 2.8719726144629866e-05, + "loss": 0.5587, + "step": 10787 + }, + { + "epoch": 13.84852374839538, + "grad_norm": 0.9243907928466797, + "learning_rate": 2.8719298245614038e-05, + "loss": 0.4999, + "step": 10788 + }, + { + "epoch": 13.849807445442876, + "grad_norm": 1.719625473022461, + "learning_rate": 2.8718870346598203e-05, + "loss": 0.529, + "step": 10789 + }, + { + "epoch": 13.851091142490372, + "grad_norm": 1.3746728897094727, + "learning_rate": 2.871844244758237e-05, + "loss": 0.4926, + "step": 10790 + }, + { + "epoch": 13.85237483953787, + "grad_norm": 1.037568211555481, + "learning_rate": 2.871801454856654e-05, + "loss": 0.5326, + "step": 10791 + }, + { + "epoch": 13.853658536585366, + "grad_norm": 2.3121895790100098, + "learning_rate": 2.8717586649550708e-05, + "loss": 0.5126, + "step": 10792 + }, + { + "epoch": 13.854942233632862, + "grad_norm": 3.3113276958465576, + "learning_rate": 2.8717158750534873e-05, + "loss": 0.5076, + "step": 10793 + }, + { + "epoch": 13.85622593068036, + "grad_norm": 1.5627363920211792, + "learning_rate": 
2.871673085151904e-05, + "loss": 0.5182, + "step": 10794 + }, + { + "epoch": 13.857509627727856, + "grad_norm": 1.4946930408477783, + "learning_rate": 2.871630295250321e-05, + "loss": 0.4812, + "step": 10795 + }, + { + "epoch": 13.858793324775354, + "grad_norm": 1.131572961807251, + "learning_rate": 2.871587505348738e-05, + "loss": 0.5297, + "step": 10796 + }, + { + "epoch": 13.86007702182285, + "grad_norm": 1.5370904207229614, + "learning_rate": 2.8715447154471547e-05, + "loss": 0.5344, + "step": 10797 + }, + { + "epoch": 13.861360718870346, + "grad_norm": 0.996360182762146, + "learning_rate": 2.8715019255455712e-05, + "loss": 0.5214, + "step": 10798 + }, + { + "epoch": 13.862644415917844, + "grad_norm": 1.4555037021636963, + "learning_rate": 2.8714591356439884e-05, + "loss": 0.5271, + "step": 10799 + }, + { + "epoch": 13.86392811296534, + "grad_norm": 0.9745811820030212, + "learning_rate": 2.871416345742405e-05, + "loss": 0.4815, + "step": 10800 + }, + { + "epoch": 13.865211810012838, + "grad_norm": 0.9414850473403931, + "learning_rate": 2.8713735558408214e-05, + "loss": 0.4792, + "step": 10801 + }, + { + "epoch": 13.866495507060334, + "grad_norm": 0.9231777787208557, + "learning_rate": 2.8713307659392385e-05, + "loss": 0.5375, + "step": 10802 + }, + { + "epoch": 13.86777920410783, + "grad_norm": 0.7336996793746948, + "learning_rate": 2.871287976037655e-05, + "loss": 0.5139, + "step": 10803 + }, + { + "epoch": 13.869062901155328, + "grad_norm": 3.965092658996582, + "learning_rate": 2.8712451861360722e-05, + "loss": 0.5499, + "step": 10804 + }, + { + "epoch": 13.870346598202824, + "grad_norm": 1.9582796096801758, + "learning_rate": 2.8712023962344887e-05, + "loss": 0.5027, + "step": 10805 + }, + { + "epoch": 13.871630295250322, + "grad_norm": 2.0072782039642334, + "learning_rate": 2.8711596063329056e-05, + "loss": 0.5424, + "step": 10806 + }, + { + "epoch": 13.872913992297818, + "grad_norm": 1.1994659900665283, + "learning_rate": 2.8711168164313224e-05, + "loss": 
0.5448, + "step": 10807 + }, + { + "epoch": 13.874197689345314, + "grad_norm": 1.099116325378418, + "learning_rate": 2.871074026529739e-05, + "loss": 0.5726, + "step": 10808 + }, + { + "epoch": 13.875481386392812, + "grad_norm": 3.5780563354492188, + "learning_rate": 2.8710312366281558e-05, + "loss": 0.4909, + "step": 10809 + }, + { + "epoch": 13.876765083440308, + "grad_norm": 1.2683818340301514, + "learning_rate": 2.8709884467265726e-05, + "loss": 0.5078, + "step": 10810 + }, + { + "epoch": 13.878048780487806, + "grad_norm": 1.6292848587036133, + "learning_rate": 2.8709456568249894e-05, + "loss": 0.5716, + "step": 10811 + }, + { + "epoch": 13.879332477535302, + "grad_norm": 1.0468695163726807, + "learning_rate": 2.8709028669234063e-05, + "loss": 0.5835, + "step": 10812 + }, + { + "epoch": 13.880616174582798, + "grad_norm": 0.9941972494125366, + "learning_rate": 2.870860077021823e-05, + "loss": 0.5613, + "step": 10813 + }, + { + "epoch": 13.881899871630296, + "grad_norm": 1.2608224153518677, + "learning_rate": 2.8708172871202396e-05, + "loss": 0.5432, + "step": 10814 + }, + { + "epoch": 13.883183568677792, + "grad_norm": 2.0019283294677734, + "learning_rate": 2.8707744972186565e-05, + "loss": 0.5707, + "step": 10815 + }, + { + "epoch": 13.88446726572529, + "grad_norm": 1.08290696144104, + "learning_rate": 2.8707317073170733e-05, + "loss": 0.561, + "step": 10816 + }, + { + "epoch": 13.885750962772786, + "grad_norm": 0.9838491082191467, + "learning_rate": 2.8706889174154898e-05, + "loss": 0.5539, + "step": 10817 + }, + { + "epoch": 13.887034659820282, + "grad_norm": 1.1124218702316284, + "learning_rate": 2.870646127513907e-05, + "loss": 0.5611, + "step": 10818 + }, + { + "epoch": 13.88831835686778, + "grad_norm": 1.4303715229034424, + "learning_rate": 2.8706033376123235e-05, + "loss": 0.6012, + "step": 10819 + }, + { + "epoch": 13.889602053915276, + "grad_norm": 2.1583657264709473, + "learning_rate": 2.8705605477107407e-05, + "loss": 0.6031, + "step": 10820 + }, + { 
+ "epoch": 13.890885750962774, + "grad_norm": 1.3700262308120728, + "learning_rate": 2.8705177578091572e-05, + "loss": 0.5515, + "step": 10821 + }, + { + "epoch": 13.89216944801027, + "grad_norm": 3.026820421218872, + "learning_rate": 2.8704749679075737e-05, + "loss": 0.5828, + "step": 10822 + }, + { + "epoch": 13.893453145057766, + "grad_norm": 1.064633846282959, + "learning_rate": 2.870432178005991e-05, + "loss": 0.6516, + "step": 10823 + }, + { + "epoch": 13.894736842105264, + "grad_norm": 2.6436450481414795, + "learning_rate": 2.8703893881044074e-05, + "loss": 0.6363, + "step": 10824 + }, + { + "epoch": 13.89602053915276, + "grad_norm": 1.9849860668182373, + "learning_rate": 2.8703465982028242e-05, + "loss": 0.5536, + "step": 10825 + }, + { + "epoch": 13.897304236200256, + "grad_norm": 1.5477855205535889, + "learning_rate": 2.870303808301241e-05, + "loss": 0.6581, + "step": 10826 + }, + { + "epoch": 13.898587933247754, + "grad_norm": 2.7070505619049072, + "learning_rate": 2.870261018399658e-05, + "loss": 0.7339, + "step": 10827 + }, + { + "epoch": 13.89987163029525, + "grad_norm": 1.0352169275283813, + "learning_rate": 2.8702182284980747e-05, + "loss": 0.5041, + "step": 10828 + }, + { + "epoch": 13.901155327342748, + "grad_norm": 1.3847874402999878, + "learning_rate": 2.8701754385964912e-05, + "loss": 0.4991, + "step": 10829 + }, + { + "epoch": 13.902439024390244, + "grad_norm": 1.1621019840240479, + "learning_rate": 2.870132648694908e-05, + "loss": 0.5332, + "step": 10830 + }, + { + "epoch": 13.90372272143774, + "grad_norm": 1.7660672664642334, + "learning_rate": 2.870089858793325e-05, + "loss": 0.5195, + "step": 10831 + }, + { + "epoch": 13.905006418485238, + "grad_norm": 0.9894919395446777, + "learning_rate": 2.8700470688917417e-05, + "loss": 0.5265, + "step": 10832 + }, + { + "epoch": 13.906290115532734, + "grad_norm": 0.7518975734710693, + "learning_rate": 2.8700042789901582e-05, + "loss": 0.4952, + "step": 10833 + }, + { + "epoch": 13.907573812580232, + 
"grad_norm": 1.2378156185150146, + "learning_rate": 2.869961489088575e-05, + "loss": 0.4681, + "step": 10834 + }, + { + "epoch": 13.908857509627728, + "grad_norm": 1.0080825090408325, + "learning_rate": 2.869918699186992e-05, + "loss": 0.4677, + "step": 10835 + }, + { + "epoch": 13.910141206675224, + "grad_norm": 1.221330165863037, + "learning_rate": 2.8698759092854084e-05, + "loss": 0.5126, + "step": 10836 + }, + { + "epoch": 13.911424903722722, + "grad_norm": 1.4339183568954468, + "learning_rate": 2.8698331193838256e-05, + "loss": 0.5086, + "step": 10837 + }, + { + "epoch": 13.912708600770218, + "grad_norm": 1.565328598022461, + "learning_rate": 2.869790329482242e-05, + "loss": 0.5513, + "step": 10838 + }, + { + "epoch": 13.913992297817716, + "grad_norm": 1.0358285903930664, + "learning_rate": 2.8697475395806593e-05, + "loss": 0.5239, + "step": 10839 + }, + { + "epoch": 13.915275994865212, + "grad_norm": 1.2693685293197632, + "learning_rate": 2.8697047496790758e-05, + "loss": 0.5344, + "step": 10840 + }, + { + "epoch": 13.916559691912708, + "grad_norm": 1.1585772037506104, + "learning_rate": 2.8696619597774923e-05, + "loss": 0.4732, + "step": 10841 + }, + { + "epoch": 13.917843388960206, + "grad_norm": 1.0407910346984863, + "learning_rate": 2.8696191698759095e-05, + "loss": 0.5008, + "step": 10842 + }, + { + "epoch": 13.919127086007702, + "grad_norm": 0.9924984574317932, + "learning_rate": 2.869576379974326e-05, + "loss": 0.5616, + "step": 10843 + }, + { + "epoch": 13.9204107830552, + "grad_norm": 1.3939828872680664, + "learning_rate": 2.8695335900727428e-05, + "loss": 0.533, + "step": 10844 + }, + { + "epoch": 13.921694480102696, + "grad_norm": 1.0087933540344238, + "learning_rate": 2.8694908001711597e-05, + "loss": 0.5129, + "step": 10845 + }, + { + "epoch": 13.922978177150192, + "grad_norm": 2.1208813190460205, + "learning_rate": 2.8694480102695765e-05, + "loss": 0.5359, + "step": 10846 + }, + { + "epoch": 13.92426187419769, + "grad_norm": 1.310182809829712, + 
"learning_rate": 2.8694052203679933e-05, + "loss": 0.5197, + "step": 10847 + }, + { + "epoch": 13.925545571245186, + "grad_norm": 2.477353572845459, + "learning_rate": 2.86936243046641e-05, + "loss": 0.5301, + "step": 10848 + }, + { + "epoch": 13.926829268292684, + "grad_norm": 1.4889912605285645, + "learning_rate": 2.8693196405648267e-05, + "loss": 0.5006, + "step": 10849 + }, + { + "epoch": 13.92811296534018, + "grad_norm": 1.5042251348495483, + "learning_rate": 2.8692768506632435e-05, + "loss": 0.4885, + "step": 10850 + }, + { + "epoch": 13.929396662387676, + "grad_norm": 1.5185127258300781, + "learning_rate": 2.8692340607616604e-05, + "loss": 0.5114, + "step": 10851 + }, + { + "epoch": 13.930680359435174, + "grad_norm": 1.0634669065475464, + "learning_rate": 2.869191270860077e-05, + "loss": 0.4807, + "step": 10852 + }, + { + "epoch": 13.93196405648267, + "grad_norm": 1.1137787103652954, + "learning_rate": 2.869148480958494e-05, + "loss": 0.5227, + "step": 10853 + }, + { + "epoch": 13.933247753530168, + "grad_norm": 1.4711179733276367, + "learning_rate": 2.8691056910569106e-05, + "loss": 0.5074, + "step": 10854 + }, + { + "epoch": 13.934531450577664, + "grad_norm": 1.2554997205734253, + "learning_rate": 2.8690629011553274e-05, + "loss": 0.5124, + "step": 10855 + }, + { + "epoch": 13.93581514762516, + "grad_norm": 3.6120412349700928, + "learning_rate": 2.8690201112537442e-05, + "loss": 0.5308, + "step": 10856 + }, + { + "epoch": 13.937098844672658, + "grad_norm": 1.2530698776245117, + "learning_rate": 2.8689773213521607e-05, + "loss": 0.5406, + "step": 10857 + }, + { + "epoch": 13.938382541720154, + "grad_norm": 3.001809597015381, + "learning_rate": 2.868934531450578e-05, + "loss": 0.5248, + "step": 10858 + }, + { + "epoch": 13.93966623876765, + "grad_norm": 2.465888023376465, + "learning_rate": 2.8688917415489944e-05, + "loss": 0.4815, + "step": 10859 + }, + { + "epoch": 13.940949935815148, + "grad_norm": 2.9137182235717773, + "learning_rate": 
2.8688489516474113e-05, + "loss": 0.5509, + "step": 10860 + }, + { + "epoch": 13.942233632862644, + "grad_norm": 1.3753236532211304, + "learning_rate": 2.868806161745828e-05, + "loss": 0.5691, + "step": 10861 + }, + { + "epoch": 13.943517329910142, + "grad_norm": 2.4740147590637207, + "learning_rate": 2.8687633718442446e-05, + "loss": 0.5469, + "step": 10862 + }, + { + "epoch": 13.944801026957638, + "grad_norm": 1.4652854204177856, + "learning_rate": 2.8687205819426618e-05, + "loss": 0.5769, + "step": 10863 + }, + { + "epoch": 13.946084724005134, + "grad_norm": 2.3604066371917725, + "learning_rate": 2.8686777920410783e-05, + "loss": 0.5378, + "step": 10864 + }, + { + "epoch": 13.947368421052632, + "grad_norm": 1.967726707458496, + "learning_rate": 2.868635002139495e-05, + "loss": 0.5482, + "step": 10865 + }, + { + "epoch": 13.948652118100128, + "grad_norm": 1.226601004600525, + "learning_rate": 2.868592212237912e-05, + "loss": 0.5318, + "step": 10866 + }, + { + "epoch": 13.949935815147626, + "grad_norm": 3.8558847904205322, + "learning_rate": 2.8685494223363288e-05, + "loss": 0.5752, + "step": 10867 + }, + { + "epoch": 13.951219512195122, + "grad_norm": 2.3010737895965576, + "learning_rate": 2.8685066324347453e-05, + "loss": 0.5776, + "step": 10868 + }, + { + "epoch": 13.952503209242618, + "grad_norm": 2.8981051445007324, + "learning_rate": 2.868463842533162e-05, + "loss": 0.6043, + "step": 10869 + }, + { + "epoch": 13.953786906290116, + "grad_norm": 3.039445638656616, + "learning_rate": 2.868421052631579e-05, + "loss": 0.5713, + "step": 10870 + }, + { + "epoch": 13.955070603337612, + "grad_norm": 1.5916918516159058, + "learning_rate": 2.868378262729996e-05, + "loss": 0.6365, + "step": 10871 + }, + { + "epoch": 13.95635430038511, + "grad_norm": 3.9379618167877197, + "learning_rate": 2.8683354728284127e-05, + "loss": 0.5699, + "step": 10872 + }, + { + "epoch": 13.957637997432606, + "grad_norm": 2.499406099319458, + "learning_rate": 2.8682926829268292e-05, + "loss": 
0.6071, + "step": 10873 + }, + { + "epoch": 13.958921694480102, + "grad_norm": 6.570038795471191, + "learning_rate": 2.8682498930252464e-05, + "loss": 0.5703, + "step": 10874 + }, + { + "epoch": 13.9602053915276, + "grad_norm": 3.1464662551879883, + "learning_rate": 2.868207103123663e-05, + "loss": 0.6391, + "step": 10875 + }, + { + "epoch": 13.961489088575096, + "grad_norm": 2.3534162044525146, + "learning_rate": 2.8681643132220794e-05, + "loss": 0.637, + "step": 10876 + }, + { + "epoch": 13.962772785622594, + "grad_norm": 2.3212392330169678, + "learning_rate": 2.8681215233204965e-05, + "loss": 0.793, + "step": 10877 + }, + { + "epoch": 13.96405648267009, + "grad_norm": 2.1599698066711426, + "learning_rate": 2.868078733418913e-05, + "loss": 0.5614, + "step": 10878 + }, + { + "epoch": 13.965340179717586, + "grad_norm": 0.9565873146057129, + "learning_rate": 2.8680359435173302e-05, + "loss": 0.5643, + "step": 10879 + }, + { + "epoch": 13.966623876765084, + "grad_norm": 1.12610924243927, + "learning_rate": 2.8679931536157467e-05, + "loss": 0.4832, + "step": 10880 + }, + { + "epoch": 13.96790757381258, + "grad_norm": 1.2377705574035645, + "learning_rate": 2.8679503637141636e-05, + "loss": 0.5325, + "step": 10881 + }, + { + "epoch": 13.969191270860078, + "grad_norm": 1.7483848333358765, + "learning_rate": 2.8679075738125804e-05, + "loss": 0.548, + "step": 10882 + }, + { + "epoch": 13.970474967907574, + "grad_norm": 1.2342102527618408, + "learning_rate": 2.867864783910997e-05, + "loss": 0.5062, + "step": 10883 + }, + { + "epoch": 13.97175866495507, + "grad_norm": 1.1977322101593018, + "learning_rate": 2.8678219940094138e-05, + "loss": 0.5622, + "step": 10884 + }, + { + "epoch": 13.973042362002568, + "grad_norm": 2.552680730819702, + "learning_rate": 2.8677792041078306e-05, + "loss": 0.4891, + "step": 10885 + }, + { + "epoch": 13.974326059050064, + "grad_norm": 1.6121599674224854, + "learning_rate": 2.8677364142062474e-05, + "loss": 0.515, + "step": 10886 + }, + { + 
"epoch": 13.975609756097562, + "grad_norm": 2.204066038131714, + "learning_rate": 2.8676936243046643e-05, + "loss": 0.4937, + "step": 10887 + }, + { + "epoch": 13.976893453145058, + "grad_norm": 1.3430067300796509, + "learning_rate": 2.867650834403081e-05, + "loss": 0.5039, + "step": 10888 + }, + { + "epoch": 13.978177150192554, + "grad_norm": 1.317275881767273, + "learning_rate": 2.8676080445014976e-05, + "loss": 0.5244, + "step": 10889 + }, + { + "epoch": 13.979460847240052, + "grad_norm": 1.1596819162368774, + "learning_rate": 2.8675652545999145e-05, + "loss": 0.5461, + "step": 10890 + }, + { + "epoch": 13.980744544287548, + "grad_norm": 1.3753321170806885, + "learning_rate": 2.8675224646983313e-05, + "loss": 0.5349, + "step": 10891 + }, + { + "epoch": 13.982028241335044, + "grad_norm": 1.0040783882141113, + "learning_rate": 2.8674796747967478e-05, + "loss": 0.5298, + "step": 10892 + }, + { + "epoch": 13.983311938382542, + "grad_norm": 1.7383973598480225, + "learning_rate": 2.867436884895165e-05, + "loss": 0.5241, + "step": 10893 + }, + { + "epoch": 13.984595635430038, + "grad_norm": 1.7305519580841064, + "learning_rate": 2.8673940949935815e-05, + "loss": 0.5704, + "step": 10894 + }, + { + "epoch": 13.985879332477536, + "grad_norm": 1.7567331790924072, + "learning_rate": 2.8673513050919983e-05, + "loss": 0.5845, + "step": 10895 + }, + { + "epoch": 13.987163029525032, + "grad_norm": 1.1871989965438843, + "learning_rate": 2.8673085151904152e-05, + "loss": 0.5218, + "step": 10896 + }, + { + "epoch": 13.988446726572528, + "grad_norm": 2.532487392425537, + "learning_rate": 2.8672657252888317e-05, + "loss": 0.5063, + "step": 10897 + }, + { + "epoch": 13.989730423620026, + "grad_norm": 5.046580791473389, + "learning_rate": 2.867222935387249e-05, + "loss": 0.5523, + "step": 10898 + }, + { + "epoch": 13.991014120667522, + "grad_norm": 1.5054526329040527, + "learning_rate": 2.8671801454856654e-05, + "loss": 0.5613, + "step": 10899 + }, + { + "epoch": 13.99229781771502, + 
"grad_norm": 1.4843839406967163, + "learning_rate": 2.8671373555840822e-05, + "loss": 0.5744, + "step": 10900 + }, + { + "epoch": 13.993581514762516, + "grad_norm": 1.0141547918319702, + "learning_rate": 2.867094565682499e-05, + "loss": 0.6009, + "step": 10901 + }, + { + "epoch": 13.994865211810012, + "grad_norm": 6.9947404861450195, + "learning_rate": 2.8670517757809155e-05, + "loss": 0.549, + "step": 10902 + }, + { + "epoch": 13.99614890885751, + "grad_norm": 3.0876448154449463, + "learning_rate": 2.8670089858793327e-05, + "loss": 0.5696, + "step": 10903 + }, + { + "epoch": 13.997432605905006, + "grad_norm": 3.106860637664795, + "learning_rate": 2.8669661959777492e-05, + "loss": 0.6148, + "step": 10904 + }, + { + "epoch": 13.998716302952504, + "grad_norm": 3.636317729949951, + "learning_rate": 2.866923406076166e-05, + "loss": 0.5913, + "step": 10905 + }, + { + "epoch": 14.0, + "grad_norm": 2.3663547039031982, + "learning_rate": 2.866880616174583e-05, + "loss": 0.7208, + "step": 10906 + }, + { + "epoch": 14.001283697047496, + "grad_norm": 1.1206539869308472, + "learning_rate": 2.8668378262729998e-05, + "loss": 0.4965, + "step": 10907 + }, + { + "epoch": 14.002567394094994, + "grad_norm": 1.0890228748321533, + "learning_rate": 2.8667950363714163e-05, + "loss": 0.5157, + "step": 10908 + }, + { + "epoch": 14.00385109114249, + "grad_norm": 2.4026126861572266, + "learning_rate": 2.866752246469833e-05, + "loss": 0.5155, + "step": 10909 + }, + { + "epoch": 14.005134788189988, + "grad_norm": 1.1998990774154663, + "learning_rate": 2.86670945656825e-05, + "loss": 0.5471, + "step": 10910 + }, + { + "epoch": 14.006418485237484, + "grad_norm": 2.276020050048828, + "learning_rate": 2.8666666666666668e-05, + "loss": 0.5165, + "step": 10911 + }, + { + "epoch": 14.00770218228498, + "grad_norm": 1.0234839916229248, + "learning_rate": 2.8666238767650836e-05, + "loss": 0.5158, + "step": 10912 + }, + { + "epoch": 14.008985879332478, + "grad_norm": 2.9079318046569824, + 
"learning_rate": 2.8665810868635e-05, + "loss": 0.5246, + "step": 10913 + }, + { + "epoch": 14.010269576379974, + "grad_norm": 1.1233729124069214, + "learning_rate": 2.8665382969619173e-05, + "loss": 0.4889, + "step": 10914 + }, + { + "epoch": 14.011553273427472, + "grad_norm": 1.56806218624115, + "learning_rate": 2.8664955070603338e-05, + "loss": 0.5039, + "step": 10915 + }, + { + "epoch": 14.012836970474968, + "grad_norm": 1.1446410417556763, + "learning_rate": 2.8664527171587503e-05, + "loss": 0.4934, + "step": 10916 + }, + { + "epoch": 14.014120667522464, + "grad_norm": 1.1757994890213013, + "learning_rate": 2.8664099272571675e-05, + "loss": 0.5159, + "step": 10917 + }, + { + "epoch": 14.015404364569962, + "grad_norm": 1.4228304624557495, + "learning_rate": 2.866367137355584e-05, + "loss": 0.5137, + "step": 10918 + }, + { + "epoch": 14.016688061617458, + "grad_norm": 0.9984022378921509, + "learning_rate": 2.866324347454001e-05, + "loss": 0.474, + "step": 10919 + }, + { + "epoch": 14.017971758664956, + "grad_norm": 1.358405590057373, + "learning_rate": 2.8662815575524177e-05, + "loss": 0.5234, + "step": 10920 + }, + { + "epoch": 14.019255455712452, + "grad_norm": 1.790175199508667, + "learning_rate": 2.8662387676508345e-05, + "loss": 0.5539, + "step": 10921 + }, + { + "epoch": 14.020539152759948, + "grad_norm": 1.010683536529541, + "learning_rate": 2.8661959777492514e-05, + "loss": 0.5093, + "step": 10922 + }, + { + "epoch": 14.021822849807446, + "grad_norm": 1.2310454845428467, + "learning_rate": 2.866153187847668e-05, + "loss": 0.5319, + "step": 10923 + }, + { + "epoch": 14.023106546854942, + "grad_norm": 1.2921295166015625, + "learning_rate": 2.8661103979460847e-05, + "loss": 0.4987, + "step": 10924 + }, + { + "epoch": 14.024390243902438, + "grad_norm": 1.7980191707611084, + "learning_rate": 2.8660676080445015e-05, + "loss": 0.4973, + "step": 10925 + }, + { + "epoch": 14.025673940949936, + "grad_norm": 1.2188231945037842, + "learning_rate": 
2.8660248181429184e-05, + "loss": 0.4965, + "step": 10926 + }, + { + "epoch": 14.026957637997432, + "grad_norm": 1.9232760667800903, + "learning_rate": 2.8659820282413352e-05, + "loss": 0.5325, + "step": 10927 + }, + { + "epoch": 14.02824133504493, + "grad_norm": 1.4358959197998047, + "learning_rate": 2.865939238339752e-05, + "loss": 0.4906, + "step": 10928 + }, + { + "epoch": 14.029525032092426, + "grad_norm": 1.7801737785339355, + "learning_rate": 2.8658964484381686e-05, + "loss": 0.5308, + "step": 10929 + }, + { + "epoch": 14.030808729139922, + "grad_norm": 0.8925427794456482, + "learning_rate": 2.8658536585365854e-05, + "loss": 0.5412, + "step": 10930 + }, + { + "epoch": 14.03209242618742, + "grad_norm": 1.6833072900772095, + "learning_rate": 2.8658108686350022e-05, + "loss": 0.5304, + "step": 10931 + }, + { + "epoch": 14.033376123234916, + "grad_norm": 1.7062866687774658, + "learning_rate": 2.8657680787334187e-05, + "loss": 0.5172, + "step": 10932 + }, + { + "epoch": 14.034659820282414, + "grad_norm": 1.8081800937652588, + "learning_rate": 2.865725288831836e-05, + "loss": 0.5462, + "step": 10933 + }, + { + "epoch": 14.03594351732991, + "grad_norm": 5.152360916137695, + "learning_rate": 2.8656824989302524e-05, + "loss": 0.5126, + "step": 10934 + }, + { + "epoch": 14.037227214377406, + "grad_norm": 2.0511209964752197, + "learning_rate": 2.8656397090286696e-05, + "loss": 0.46, + "step": 10935 + }, + { + "epoch": 14.038510911424904, + "grad_norm": 1.350645899772644, + "learning_rate": 2.865596919127086e-05, + "loss": 0.5293, + "step": 10936 + }, + { + "epoch": 14.0397946084724, + "grad_norm": 0.8926303386688232, + "learning_rate": 2.8655541292255026e-05, + "loss": 0.54, + "step": 10937 + }, + { + "epoch": 14.041078305519898, + "grad_norm": 1.5964736938476562, + "learning_rate": 2.8655113393239198e-05, + "loss": 0.4875, + "step": 10938 + }, + { + "epoch": 14.042362002567394, + "grad_norm": 1.6258479356765747, + "learning_rate": 2.8654685494223363e-05, + "loss": 
0.503, + "step": 10939 + }, + { + "epoch": 14.04364569961489, + "grad_norm": 1.101840853691101, + "learning_rate": 2.865425759520753e-05, + "loss": 0.5258, + "step": 10940 + }, + { + "epoch": 14.044929396662388, + "grad_norm": 1.6455219984054565, + "learning_rate": 2.86538296961917e-05, + "loss": 0.5082, + "step": 10941 + }, + { + "epoch": 14.046213093709884, + "grad_norm": 1.3700913190841675, + "learning_rate": 2.8653401797175868e-05, + "loss": 0.5282, + "step": 10942 + }, + { + "epoch": 14.047496790757382, + "grad_norm": 1.305930495262146, + "learning_rate": 2.8652973898160037e-05, + "loss": 0.5662, + "step": 10943 + }, + { + "epoch": 14.048780487804878, + "grad_norm": 1.6781214475631714, + "learning_rate": 2.86525459991442e-05, + "loss": 0.5393, + "step": 10944 + }, + { + "epoch": 14.050064184852374, + "grad_norm": 1.883116364479065, + "learning_rate": 2.865211810012837e-05, + "loss": 0.5522, + "step": 10945 + }, + { + "epoch": 14.051347881899872, + "grad_norm": 1.0218205451965332, + "learning_rate": 2.865169020111254e-05, + "loss": 0.5455, + "step": 10946 + }, + { + "epoch": 14.052631578947368, + "grad_norm": 2.403573513031006, + "learning_rate": 2.8651262302096707e-05, + "loss": 0.5375, + "step": 10947 + }, + { + "epoch": 14.053915275994866, + "grad_norm": 1.985940933227539, + "learning_rate": 2.8650834403080872e-05, + "loss": 0.569, + "step": 10948 + }, + { + "epoch": 14.055198973042362, + "grad_norm": 2.0878806114196777, + "learning_rate": 2.8650406504065044e-05, + "loss": 0.5652, + "step": 10949 + }, + { + "epoch": 14.056482670089858, + "grad_norm": 1.4133654832839966, + "learning_rate": 2.864997860504921e-05, + "loss": 0.5342, + "step": 10950 + }, + { + "epoch": 14.057766367137356, + "grad_norm": 2.881403923034668, + "learning_rate": 2.8649550706033377e-05, + "loss": 0.5159, + "step": 10951 + }, + { + "epoch": 14.059050064184852, + "grad_norm": 1.408240795135498, + "learning_rate": 2.8649122807017546e-05, + "loss": 0.5754, + "step": 10952 + }, + { + 
"epoch": 14.06033376123235, + "grad_norm": 3.259528875350952, + "learning_rate": 2.864869490800171e-05, + "loss": 0.6495, + "step": 10953 + }, + { + "epoch": 14.061617458279846, + "grad_norm": 1.3086657524108887, + "learning_rate": 2.8648267008985882e-05, + "loss": 0.553, + "step": 10954 + }, + { + "epoch": 14.062901155327342, + "grad_norm": 6.080264091491699, + "learning_rate": 2.8647839109970047e-05, + "loss": 0.6755, + "step": 10955 + }, + { + "epoch": 14.06418485237484, + "grad_norm": 3.0951926708221436, + "learning_rate": 2.8647411210954216e-05, + "loss": 0.7057, + "step": 10956 + }, + { + "epoch": 14.065468549422336, + "grad_norm": 1.0377023220062256, + "learning_rate": 2.8646983311938384e-05, + "loss": 0.4944, + "step": 10957 + }, + { + "epoch": 14.066752246469832, + "grad_norm": 1.0255457162857056, + "learning_rate": 2.864655541292255e-05, + "loss": 0.4924, + "step": 10958 + }, + { + "epoch": 14.06803594351733, + "grad_norm": 1.4935239553451538, + "learning_rate": 2.864612751390672e-05, + "loss": 0.4505, + "step": 10959 + }, + { + "epoch": 14.069319640564826, + "grad_norm": 3.5959577560424805, + "learning_rate": 2.8645699614890886e-05, + "loss": 0.5641, + "step": 10960 + }, + { + "epoch": 14.070603337612324, + "grad_norm": 1.1124778985977173, + "learning_rate": 2.8645271715875054e-05, + "loss": 0.4979, + "step": 10961 + }, + { + "epoch": 14.07188703465982, + "grad_norm": 0.9284899234771729, + "learning_rate": 2.8644843816859223e-05, + "loss": 0.4672, + "step": 10962 + }, + { + "epoch": 14.073170731707316, + "grad_norm": 6.12245512008667, + "learning_rate": 2.8644415917843388e-05, + "loss": 0.5028, + "step": 10963 + }, + { + "epoch": 14.074454428754814, + "grad_norm": 4.420348644256592, + "learning_rate": 2.8643988018827556e-05, + "loss": 0.5095, + "step": 10964 + }, + { + "epoch": 14.07573812580231, + "grad_norm": 1.36854887008667, + "learning_rate": 2.8643560119811725e-05, + "loss": 0.5009, + "step": 10965 + }, + { + "epoch": 14.077021822849808, + 
"grad_norm": 1.2145705223083496, + "learning_rate": 2.8643132220795893e-05, + "loss": 0.5646, + "step": 10966 + }, + { + "epoch": 14.078305519897304, + "grad_norm": 1.462760090827942, + "learning_rate": 2.864270432178006e-05, + "loss": 0.5127, + "step": 10967 + }, + { + "epoch": 14.0795892169448, + "grad_norm": 2.566864252090454, + "learning_rate": 2.864227642276423e-05, + "loss": 0.5037, + "step": 10968 + }, + { + "epoch": 14.080872913992298, + "grad_norm": 0.9903876185417175, + "learning_rate": 2.8641848523748395e-05, + "loss": 0.5415, + "step": 10969 + }, + { + "epoch": 14.082156611039794, + "grad_norm": 1.419381856918335, + "learning_rate": 2.8641420624732563e-05, + "loss": 0.5593, + "step": 10970 + }, + { + "epoch": 14.083440308087292, + "grad_norm": 0.9918239712715149, + "learning_rate": 2.8640992725716732e-05, + "loss": 0.507, + "step": 10971 + }, + { + "epoch": 14.084724005134788, + "grad_norm": 0.6800997853279114, + "learning_rate": 2.8640564826700897e-05, + "loss": 0.4751, + "step": 10972 + }, + { + "epoch": 14.086007702182284, + "grad_norm": 3.4730794429779053, + "learning_rate": 2.864013692768507e-05, + "loss": 0.5204, + "step": 10973 + }, + { + "epoch": 14.087291399229782, + "grad_norm": 3.7348787784576416, + "learning_rate": 2.8639709028669234e-05, + "loss": 0.5211, + "step": 10974 + }, + { + "epoch": 14.088575096277278, + "grad_norm": 1.7288751602172852, + "learning_rate": 2.8639281129653405e-05, + "loss": 0.4855, + "step": 10975 + }, + { + "epoch": 14.089858793324776, + "grad_norm": 1.586999773979187, + "learning_rate": 2.863885323063757e-05, + "loss": 0.4868, + "step": 10976 + }, + { + "epoch": 14.091142490372272, + "grad_norm": 1.295872688293457, + "learning_rate": 2.8638425331621736e-05, + "loss": 0.4751, + "step": 10977 + }, + { + "epoch": 14.092426187419768, + "grad_norm": 1.7695201635360718, + "learning_rate": 2.8637997432605907e-05, + "loss": 0.5463, + "step": 10978 + }, + { + "epoch": 14.093709884467266, + "grad_norm": 3.5698933601379395, + 
"learning_rate": 2.8637569533590072e-05, + "loss": 0.4928, + "step": 10979 + }, + { + "epoch": 14.094993581514762, + "grad_norm": 1.7063928842544556, + "learning_rate": 2.863714163457424e-05, + "loss": 0.5019, + "step": 10980 + }, + { + "epoch": 14.09627727856226, + "grad_norm": 1.3204843997955322, + "learning_rate": 2.863671373555841e-05, + "loss": 0.5039, + "step": 10981 + }, + { + "epoch": 14.097560975609756, + "grad_norm": 1.0347191095352173, + "learning_rate": 2.8636285836542578e-05, + "loss": 0.5192, + "step": 10982 + }, + { + "epoch": 14.098844672657252, + "grad_norm": 1.2296162843704224, + "learning_rate": 2.8635857937526746e-05, + "loss": 0.481, + "step": 10983 + }, + { + "epoch": 14.10012836970475, + "grad_norm": 1.4371757507324219, + "learning_rate": 2.863543003851091e-05, + "loss": 0.4907, + "step": 10984 + }, + { + "epoch": 14.101412066752246, + "grad_norm": 1.4584046602249146, + "learning_rate": 2.863500213949508e-05, + "loss": 0.4939, + "step": 10985 + }, + { + "epoch": 14.102695763799744, + "grad_norm": 6.130337715148926, + "learning_rate": 2.8634574240479248e-05, + "loss": 0.5201, + "step": 10986 + }, + { + "epoch": 14.10397946084724, + "grad_norm": 3.6623144149780273, + "learning_rate": 2.8634146341463416e-05, + "loss": 0.5082, + "step": 10987 + }, + { + "epoch": 14.105263157894736, + "grad_norm": 0.8951476812362671, + "learning_rate": 2.863371844244758e-05, + "loss": 0.4912, + "step": 10988 + }, + { + "epoch": 14.106546854942234, + "grad_norm": 1.2933313846588135, + "learning_rate": 2.8633290543431753e-05, + "loss": 0.4903, + "step": 10989 + }, + { + "epoch": 14.10783055198973, + "grad_norm": 3.989368200302124, + "learning_rate": 2.8632862644415918e-05, + "loss": 0.5452, + "step": 10990 + }, + { + "epoch": 14.109114249037226, + "grad_norm": 0.8463482856750488, + "learning_rate": 2.8632434745400086e-05, + "loss": 0.54, + "step": 10991 + }, + { + "epoch": 14.110397946084724, + "grad_norm": 3.121518135070801, + "learning_rate": 
2.8632006846384255e-05, + "loss": 0.5126, + "step": 10992 + }, + { + "epoch": 14.11168164313222, + "grad_norm": 0.9899258613586426, + "learning_rate": 2.863157894736842e-05, + "loss": 0.5571, + "step": 10993 + }, + { + "epoch": 14.112965340179718, + "grad_norm": 3.270777702331543, + "learning_rate": 2.8631151048352592e-05, + "loss": 0.526, + "step": 10994 + }, + { + "epoch": 14.114249037227214, + "grad_norm": 1.5850074291229248, + "learning_rate": 2.8630723149336757e-05, + "loss": 0.5133, + "step": 10995 + }, + { + "epoch": 14.11553273427471, + "grad_norm": 2.6081137657165527, + "learning_rate": 2.8630295250320925e-05, + "loss": 0.5532, + "step": 10996 + }, + { + "epoch": 14.116816431322208, + "grad_norm": 1.299148678779602, + "learning_rate": 2.8629867351305094e-05, + "loss": 0.5057, + "step": 10997 + }, + { + "epoch": 14.118100128369704, + "grad_norm": 1.06436288356781, + "learning_rate": 2.862943945228926e-05, + "loss": 0.5532, + "step": 10998 + }, + { + "epoch": 14.119383825417202, + "grad_norm": 5.2232890129089355, + "learning_rate": 2.862901155327343e-05, + "loss": 0.5098, + "step": 10999 + }, + { + "epoch": 14.120667522464698, + "grad_norm": 1.5998796224594116, + "learning_rate": 2.8628583654257595e-05, + "loss": 0.5812, + "step": 11000 + }, + { + "epoch": 14.120667522464698, + "eval_cer": 0.29816917955747013, + "eval_loss": 0.5365867018699646, + "eval_runtime": 13.9753, + "eval_samples_per_second": 70.338, + "eval_steps_per_second": 0.501, + "eval_wer": 0.5127264388901628, + "step": 11000 + }, + { + "epoch": 14.121951219512194, + "grad_norm": 1.4958001375198364, + "learning_rate": 2.8628155755241764e-05, + "loss": 0.5432, + "step": 11001 + }, + { + "epoch": 14.123234916559692, + "grad_norm": 1.2161492109298706, + "learning_rate": 2.8627727856225932e-05, + "loss": 0.518, + "step": 11002 + }, + { + "epoch": 14.124518613607188, + "grad_norm": 4.207080364227295, + "learning_rate": 2.86272999572101e-05, + "loss": 0.5908, + "step": 11003 + }, + { + "epoch": 
14.125802310654686, + "grad_norm": 3.428077220916748, + "learning_rate": 2.8626872058194266e-05, + "loss": 0.5927, + "step": 11004 + }, + { + "epoch": 14.127086007702182, + "grad_norm": 2.124964475631714, + "learning_rate": 2.8626444159178434e-05, + "loss": 0.6165, + "step": 11005 + }, + { + "epoch": 14.128369704749678, + "grad_norm": 1.9323493242263794, + "learning_rate": 2.8626016260162603e-05, + "loss": 0.741, + "step": 11006 + }, + { + "epoch": 14.129653401797176, + "grad_norm": 2.9515182971954346, + "learning_rate": 2.862558836114677e-05, + "loss": 0.5098, + "step": 11007 + }, + { + "epoch": 14.130937098844672, + "grad_norm": 1.2239253520965576, + "learning_rate": 2.862516046213094e-05, + "loss": 0.4965, + "step": 11008 + }, + { + "epoch": 14.13222079589217, + "grad_norm": 1.6784558296203613, + "learning_rate": 2.8624732563115104e-05, + "loss": 0.5305, + "step": 11009 + }, + { + "epoch": 14.133504492939666, + "grad_norm": 2.521573305130005, + "learning_rate": 2.8624304664099276e-05, + "loss": 0.5067, + "step": 11010 + }, + { + "epoch": 14.134788189987162, + "grad_norm": 2.2715799808502197, + "learning_rate": 2.862387676508344e-05, + "loss": 0.5286, + "step": 11011 + }, + { + "epoch": 14.13607188703466, + "grad_norm": 1.2338777780532837, + "learning_rate": 2.8623448866067606e-05, + "loss": 0.5049, + "step": 11012 + }, + { + "epoch": 14.137355584082156, + "grad_norm": 1.372410774230957, + "learning_rate": 2.8623020967051778e-05, + "loss": 0.4969, + "step": 11013 + }, + { + "epoch": 14.138639281129654, + "grad_norm": 1.5191346406936646, + "learning_rate": 2.8622593068035943e-05, + "loss": 0.5407, + "step": 11014 + }, + { + "epoch": 14.13992297817715, + "grad_norm": 1.3521027565002441, + "learning_rate": 2.8622165169020115e-05, + "loss": 0.5303, + "step": 11015 + }, + { + "epoch": 14.141206675224646, + "grad_norm": 0.9857374429702759, + "learning_rate": 2.862173727000428e-05, + "loss": 0.5199, + "step": 11016 + }, + { + "epoch": 14.142490372272144, + "grad_norm": 
1.8643931150436401, + "learning_rate": 2.8621309370988448e-05, + "loss": 0.5269, + "step": 11017 + }, + { + "epoch": 14.14377406931964, + "grad_norm": 2.6620113849639893, + "learning_rate": 2.8620881471972617e-05, + "loss": 0.5455, + "step": 11018 + }, + { + "epoch": 14.145057766367138, + "grad_norm": 1.5758705139160156, + "learning_rate": 2.8620453572956782e-05, + "loss": 0.4859, + "step": 11019 + }, + { + "epoch": 14.146341463414634, + "grad_norm": 1.7102009057998657, + "learning_rate": 2.862002567394095e-05, + "loss": 0.504, + "step": 11020 + }, + { + "epoch": 14.14762516046213, + "grad_norm": 1.1164841651916504, + "learning_rate": 2.861959777492512e-05, + "loss": 0.4755, + "step": 11021 + }, + { + "epoch": 14.148908857509628, + "grad_norm": 3.1397411823272705, + "learning_rate": 2.8619169875909287e-05, + "loss": 0.5025, + "step": 11022 + }, + { + "epoch": 14.150192554557124, + "grad_norm": 1.156020164489746, + "learning_rate": 2.8618741976893455e-05, + "loss": 0.4878, + "step": 11023 + }, + { + "epoch": 14.15147625160462, + "grad_norm": 1.5293688774108887, + "learning_rate": 2.861831407787762e-05, + "loss": 0.4969, + "step": 11024 + }, + { + "epoch": 14.152759948652118, + "grad_norm": 2.0623254776000977, + "learning_rate": 2.861788617886179e-05, + "loss": 0.4958, + "step": 11025 + }, + { + "epoch": 14.154043645699614, + "grad_norm": 1.391920804977417, + "learning_rate": 2.8617458279845957e-05, + "loss": 0.5373, + "step": 11026 + }, + { + "epoch": 14.155327342747112, + "grad_norm": 1.0130451917648315, + "learning_rate": 2.8617030380830126e-05, + "loss": 0.5033, + "step": 11027 + }, + { + "epoch": 14.156611039794608, + "grad_norm": 1.7293568849563599, + "learning_rate": 2.861660248181429e-05, + "loss": 0.5132, + "step": 11028 + }, + { + "epoch": 14.157894736842104, + "grad_norm": 2.251898765563965, + "learning_rate": 2.8616174582798462e-05, + "loss": 0.5148, + "step": 11029 + }, + { + "epoch": 14.159178433889602, + "grad_norm": 2.16817569732666, + 
"learning_rate": 2.8615746683782627e-05, + "loss": 0.6019, + "step": 11030 + }, + { + "epoch": 14.160462130937098, + "grad_norm": 1.2216929197311401, + "learning_rate": 2.8615318784766796e-05, + "loss": 0.525, + "step": 11031 + }, + { + "epoch": 14.161745827984596, + "grad_norm": 1.1339082717895508, + "learning_rate": 2.8614890885750964e-05, + "loss": 0.5167, + "step": 11032 + }, + { + "epoch": 14.163029525032092, + "grad_norm": 1.2039577960968018, + "learning_rate": 2.861446298673513e-05, + "loss": 0.5245, + "step": 11033 + }, + { + "epoch": 14.164313222079588, + "grad_norm": 1.4576624631881714, + "learning_rate": 2.86140350877193e-05, + "loss": 0.6203, + "step": 11034 + }, + { + "epoch": 14.165596919127086, + "grad_norm": 1.5000927448272705, + "learning_rate": 2.8613607188703466e-05, + "loss": 0.5386, + "step": 11035 + }, + { + "epoch": 14.166880616174582, + "grad_norm": 2.600621223449707, + "learning_rate": 2.8613179289687635e-05, + "loss": 0.5606, + "step": 11036 + }, + { + "epoch": 14.16816431322208, + "grad_norm": 3.81075119972229, + "learning_rate": 2.8612751390671803e-05, + "loss": 0.545, + "step": 11037 + }, + { + "epoch": 14.169448010269576, + "grad_norm": 1.1766384840011597, + "learning_rate": 2.8612323491655968e-05, + "loss": 0.5325, + "step": 11038 + }, + { + "epoch": 14.170731707317072, + "grad_norm": 1.1306864023208618, + "learning_rate": 2.8611895592640136e-05, + "loss": 0.552, + "step": 11039 + }, + { + "epoch": 14.17201540436457, + "grad_norm": 1.2004212141036987, + "learning_rate": 2.8611467693624305e-05, + "loss": 0.5455, + "step": 11040 + }, + { + "epoch": 14.173299101412066, + "grad_norm": 1.8699008226394653, + "learning_rate": 2.8611039794608473e-05, + "loss": 0.5405, + "step": 11041 + }, + { + "epoch": 14.174582798459564, + "grad_norm": 5.919888019561768, + "learning_rate": 2.861061189559264e-05, + "loss": 0.5253, + "step": 11042 + }, + { + "epoch": 14.17586649550706, + "grad_norm": 2.7691233158111572, + "learning_rate": 
2.861018399657681e-05, + "loss": 0.5856, + "step": 11043 + }, + { + "epoch": 14.177150192554556, + "grad_norm": 2.706352710723877, + "learning_rate": 2.8609756097560975e-05, + "loss": 0.5916, + "step": 11044 + }, + { + "epoch": 14.178433889602054, + "grad_norm": 4.290264129638672, + "learning_rate": 2.8609328198545143e-05, + "loss": 0.6355, + "step": 11045 + }, + { + "epoch": 14.17971758664955, + "grad_norm": 4.84696102142334, + "learning_rate": 2.8608900299529312e-05, + "loss": 0.5516, + "step": 11046 + }, + { + "epoch": 14.181001283697048, + "grad_norm": 2.325525999069214, + "learning_rate": 2.8608472400513477e-05, + "loss": 0.5589, + "step": 11047 + }, + { + "epoch": 14.182284980744544, + "grad_norm": 1.609886884689331, + "learning_rate": 2.860804450149765e-05, + "loss": 0.5477, + "step": 11048 + }, + { + "epoch": 14.18356867779204, + "grad_norm": 3.6743662357330322, + "learning_rate": 2.8607616602481814e-05, + "loss": 0.5457, + "step": 11049 + }, + { + "epoch": 14.184852374839538, + "grad_norm": 4.582575798034668, + "learning_rate": 2.8607188703465986e-05, + "loss": 0.5242, + "step": 11050 + }, + { + "epoch": 14.186136071887034, + "grad_norm": 2.016841411590576, + "learning_rate": 2.860676080445015e-05, + "loss": 0.616, + "step": 11051 + }, + { + "epoch": 14.187419768934532, + "grad_norm": 4.258647441864014, + "learning_rate": 2.8606332905434316e-05, + "loss": 0.6092, + "step": 11052 + }, + { + "epoch": 14.188703465982028, + "grad_norm": 1.8780392408370972, + "learning_rate": 2.8605905006418487e-05, + "loss": 0.5631, + "step": 11053 + }, + { + "epoch": 14.189987163029524, + "grad_norm": 1.5868545770645142, + "learning_rate": 2.8605477107402652e-05, + "loss": 0.7073, + "step": 11054 + }, + { + "epoch": 14.191270860077022, + "grad_norm": 12.18797779083252, + "learning_rate": 2.860504920838682e-05, + "loss": 0.5794, + "step": 11055 + }, + { + "epoch": 14.192554557124518, + "grad_norm": 3.533007860183716, + "learning_rate": 2.860462130937099e-05, + "loss": 0.7307, 
+ "step": 11056 + }, + { + "epoch": 14.193838254172016, + "grad_norm": 1.1385589838027954, + "learning_rate": 2.8604193410355158e-05, + "loss": 0.4916, + "step": 11057 + }, + { + "epoch": 14.195121951219512, + "grad_norm": 1.786099910736084, + "learning_rate": 2.8603765511339326e-05, + "loss": 0.4943, + "step": 11058 + }, + { + "epoch": 14.196405648267008, + "grad_norm": 0.8791409134864807, + "learning_rate": 2.860333761232349e-05, + "loss": 0.5133, + "step": 11059 + }, + { + "epoch": 14.197689345314506, + "grad_norm": 1.4379576444625854, + "learning_rate": 2.860290971330766e-05, + "loss": 0.5118, + "step": 11060 + }, + { + "epoch": 14.198973042362002, + "grad_norm": 1.2766249179840088, + "learning_rate": 2.8602481814291828e-05, + "loss": 0.4786, + "step": 11061 + }, + { + "epoch": 14.200256739409499, + "grad_norm": 1.6967664957046509, + "learning_rate": 2.8602053915275996e-05, + "loss": 0.531, + "step": 11062 + }, + { + "epoch": 14.201540436456996, + "grad_norm": 1.3338309526443481, + "learning_rate": 2.860162601626016e-05, + "loss": 0.4736, + "step": 11063 + }, + { + "epoch": 14.202824133504492, + "grad_norm": 1.0196620225906372, + "learning_rate": 2.8601198117244333e-05, + "loss": 0.5312, + "step": 11064 + }, + { + "epoch": 14.20410783055199, + "grad_norm": 1.6637221574783325, + "learning_rate": 2.8600770218228498e-05, + "loss": 0.5082, + "step": 11065 + }, + { + "epoch": 14.205391527599486, + "grad_norm": 3.136509656906128, + "learning_rate": 2.8600342319212667e-05, + "loss": 0.5075, + "step": 11066 + }, + { + "epoch": 14.206675224646983, + "grad_norm": 1.3518437147140503, + "learning_rate": 2.8599914420196835e-05, + "loss": 0.5122, + "step": 11067 + }, + { + "epoch": 14.20795892169448, + "grad_norm": 1.0499759912490845, + "learning_rate": 2.8599486521181e-05, + "loss": 0.5035, + "step": 11068 + }, + { + "epoch": 14.209242618741976, + "grad_norm": 0.9775916934013367, + "learning_rate": 2.8599058622165172e-05, + "loss": 0.4914, + "step": 11069 + }, + { + 
"epoch": 14.210526315789474, + "grad_norm": 2.0785648822784424, + "learning_rate": 2.8598630723149337e-05, + "loss": 0.5395, + "step": 11070 + }, + { + "epoch": 14.21181001283697, + "grad_norm": 2.363008499145508, + "learning_rate": 2.8598202824133505e-05, + "loss": 0.5088, + "step": 11071 + }, + { + "epoch": 14.213093709884467, + "grad_norm": 2.6579456329345703, + "learning_rate": 2.8597774925117674e-05, + "loss": 0.4957, + "step": 11072 + }, + { + "epoch": 14.214377406931964, + "grad_norm": 2.526309013366699, + "learning_rate": 2.859734702610184e-05, + "loss": 0.4954, + "step": 11073 + }, + { + "epoch": 14.21566110397946, + "grad_norm": 1.4687985181808472, + "learning_rate": 2.859691912708601e-05, + "loss": 0.4794, + "step": 11074 + }, + { + "epoch": 14.216944801026958, + "grad_norm": 1.5321050882339478, + "learning_rate": 2.8596491228070175e-05, + "loss": 0.5256, + "step": 11075 + }, + { + "epoch": 14.218228498074454, + "grad_norm": 1.4073420763015747, + "learning_rate": 2.8596063329054344e-05, + "loss": 0.4889, + "step": 11076 + }, + { + "epoch": 14.21951219512195, + "grad_norm": 2.657731771469116, + "learning_rate": 2.8595635430038512e-05, + "loss": 0.4686, + "step": 11077 + }, + { + "epoch": 14.220795892169448, + "grad_norm": 1.2024509906768799, + "learning_rate": 2.859520753102268e-05, + "loss": 0.5394, + "step": 11078 + }, + { + "epoch": 14.222079589216944, + "grad_norm": 1.359460473060608, + "learning_rate": 2.8594779632006846e-05, + "loss": 0.5369, + "step": 11079 + }, + { + "epoch": 14.223363286264442, + "grad_norm": 1.5602506399154663, + "learning_rate": 2.8594351732991014e-05, + "loss": 0.5355, + "step": 11080 + }, + { + "epoch": 14.224646983311938, + "grad_norm": 1.2011756896972656, + "learning_rate": 2.8593923833975183e-05, + "loss": 0.4977, + "step": 11081 + }, + { + "epoch": 14.225930680359435, + "grad_norm": 1.6093288660049438, + "learning_rate": 2.859349593495935e-05, + "loss": 0.5503, + "step": 11082 + }, + { + "epoch": 14.227214377406932, + 
"grad_norm": 3.473037004470825, + "learning_rate": 2.859306803594352e-05, + "loss": 0.514, + "step": 11083 + }, + { + "epoch": 14.228498074454428, + "grad_norm": 2.353391408920288, + "learning_rate": 2.8592640136927684e-05, + "loss": 0.5123, + "step": 11084 + }, + { + "epoch": 14.229781771501926, + "grad_norm": 2.314218759536743, + "learning_rate": 2.8592212237911853e-05, + "loss": 0.5058, + "step": 11085 + }, + { + "epoch": 14.231065468549422, + "grad_norm": 1.9617842435836792, + "learning_rate": 2.859178433889602e-05, + "loss": 0.5495, + "step": 11086 + }, + { + "epoch": 14.232349165596919, + "grad_norm": 2.6397242546081543, + "learning_rate": 2.8591356439880186e-05, + "loss": 0.5669, + "step": 11087 + }, + { + "epoch": 14.233632862644416, + "grad_norm": 2.262050151824951, + "learning_rate": 2.8590928540864358e-05, + "loss": 0.5514, + "step": 11088 + }, + { + "epoch": 14.234916559691912, + "grad_norm": 1.4037915468215942, + "learning_rate": 2.8590500641848523e-05, + "loss": 0.5395, + "step": 11089 + }, + { + "epoch": 14.23620025673941, + "grad_norm": 2.2028346061706543, + "learning_rate": 2.8590072742832695e-05, + "loss": 0.5444, + "step": 11090 + }, + { + "epoch": 14.237483953786906, + "grad_norm": 1.8234556913375854, + "learning_rate": 2.858964484381686e-05, + "loss": 0.5555, + "step": 11091 + }, + { + "epoch": 14.238767650834403, + "grad_norm": 3.5681166648864746, + "learning_rate": 2.8589216944801025e-05, + "loss": 0.5297, + "step": 11092 + }, + { + "epoch": 14.2400513478819, + "grad_norm": 2.1387550830841064, + "learning_rate": 2.8588789045785197e-05, + "loss": 0.5547, + "step": 11093 + }, + { + "epoch": 14.241335044929397, + "grad_norm": 1.4158058166503906, + "learning_rate": 2.8588361146769362e-05, + "loss": 0.5188, + "step": 11094 + }, + { + "epoch": 14.242618741976893, + "grad_norm": 2.892000675201416, + "learning_rate": 2.858793324775353e-05, + "loss": 0.5496, + "step": 11095 + }, + { + "epoch": 14.24390243902439, + "grad_norm": 4.1698174476623535, + 
"learning_rate": 2.85875053487377e-05, + "loss": 0.5488, + "step": 11096 + }, + { + "epoch": 14.245186136071887, + "grad_norm": 2.5678274631500244, + "learning_rate": 2.8587077449721867e-05, + "loss": 0.5863, + "step": 11097 + }, + { + "epoch": 14.246469833119384, + "grad_norm": 3.4556891918182373, + "learning_rate": 2.8586649550706035e-05, + "loss": 0.6163, + "step": 11098 + }, + { + "epoch": 14.24775353016688, + "grad_norm": 2.705451250076294, + "learning_rate": 2.85862216516902e-05, + "loss": 0.5766, + "step": 11099 + }, + { + "epoch": 14.249037227214377, + "grad_norm": 7.463961601257324, + "learning_rate": 2.858579375267437e-05, + "loss": 0.5326, + "step": 11100 + }, + { + "epoch": 14.250320924261874, + "grad_norm": 2.9022231101989746, + "learning_rate": 2.8585365853658537e-05, + "loss": 0.5557, + "step": 11101 + }, + { + "epoch": 14.25160462130937, + "grad_norm": 1.5539331436157227, + "learning_rate": 2.8584937954642706e-05, + "loss": 0.5924, + "step": 11102 + }, + { + "epoch": 14.252888318356868, + "grad_norm": 2.950305461883545, + "learning_rate": 2.858451005562687e-05, + "loss": 0.5851, + "step": 11103 + }, + { + "epoch": 14.254172015404365, + "grad_norm": 2.070605993270874, + "learning_rate": 2.8584082156611042e-05, + "loss": 0.6254, + "step": 11104 + }, + { + "epoch": 14.25545571245186, + "grad_norm": 1.3505834341049194, + "learning_rate": 2.8583654257595208e-05, + "loss": 0.6511, + "step": 11105 + }, + { + "epoch": 14.256739409499358, + "grad_norm": 2.3087050914764404, + "learning_rate": 2.8583226358579376e-05, + "loss": 0.8258, + "step": 11106 + }, + { + "epoch": 14.258023106546855, + "grad_norm": 3.831575632095337, + "learning_rate": 2.8582798459563544e-05, + "loss": 0.4895, + "step": 11107 + }, + { + "epoch": 14.259306803594352, + "grad_norm": 1.8103430271148682, + "learning_rate": 2.858237056054771e-05, + "loss": 0.469, + "step": 11108 + }, + { + "epoch": 14.260590500641849, + "grad_norm": 1.6490947008132935, + "learning_rate": 2.858194266153188e-05, 
+ "loss": 0.4939, + "step": 11109 + }, + { + "epoch": 14.261874197689345, + "grad_norm": 1.3567754030227661, + "learning_rate": 2.8581514762516046e-05, + "loss": 0.5049, + "step": 11110 + }, + { + "epoch": 14.263157894736842, + "grad_norm": 2.170093297958374, + "learning_rate": 2.8581086863500215e-05, + "loss": 0.5051, + "step": 11111 + }, + { + "epoch": 14.264441591784339, + "grad_norm": 2.5531198978424072, + "learning_rate": 2.8580658964484383e-05, + "loss": 0.5293, + "step": 11112 + }, + { + "epoch": 14.265725288831836, + "grad_norm": 1.145835041999817, + "learning_rate": 2.8580231065468548e-05, + "loss": 0.5243, + "step": 11113 + }, + { + "epoch": 14.267008985879333, + "grad_norm": 2.0135672092437744, + "learning_rate": 2.857980316645272e-05, + "loss": 0.5113, + "step": 11114 + }, + { + "epoch": 14.268292682926829, + "grad_norm": 2.2157886028289795, + "learning_rate": 2.8579375267436885e-05, + "loss": 0.5226, + "step": 11115 + }, + { + "epoch": 14.269576379974326, + "grad_norm": 2.160013437271118, + "learning_rate": 2.8578947368421053e-05, + "loss": 0.5186, + "step": 11116 + }, + { + "epoch": 14.270860077021823, + "grad_norm": 1.2005153894424438, + "learning_rate": 2.857851946940522e-05, + "loss": 0.52, + "step": 11117 + }, + { + "epoch": 14.27214377406932, + "grad_norm": 0.9676594138145447, + "learning_rate": 2.857809157038939e-05, + "loss": 0.5776, + "step": 11118 + }, + { + "epoch": 14.273427471116817, + "grad_norm": 2.8139421939849854, + "learning_rate": 2.8577663671373555e-05, + "loss": 0.5038, + "step": 11119 + }, + { + "epoch": 14.274711168164313, + "grad_norm": 1.0235377550125122, + "learning_rate": 2.8577235772357724e-05, + "loss": 0.4997, + "step": 11120 + }, + { + "epoch": 14.27599486521181, + "grad_norm": 11.784379959106445, + "learning_rate": 2.8576807873341892e-05, + "loss": 0.5231, + "step": 11121 + }, + { + "epoch": 14.277278562259307, + "grad_norm": 1.4175949096679688, + "learning_rate": 2.857637997432606e-05, + "loss": 0.5259, + "step": 11122 
+ }, + { + "epoch": 14.278562259306804, + "grad_norm": 3.711040496826172, + "learning_rate": 2.857595207531023e-05, + "loss": 0.5305, + "step": 11123 + }, + { + "epoch": 14.2798459563543, + "grad_norm": 1.6955604553222656, + "learning_rate": 2.8575524176294394e-05, + "loss": 0.5128, + "step": 11124 + }, + { + "epoch": 14.281129653401797, + "grad_norm": 2.1144638061523438, + "learning_rate": 2.8575096277278566e-05, + "loss": 0.5155, + "step": 11125 + }, + { + "epoch": 14.282413350449294, + "grad_norm": 1.1111088991165161, + "learning_rate": 2.857466837826273e-05, + "loss": 0.4728, + "step": 11126 + }, + { + "epoch": 14.28369704749679, + "grad_norm": 1.1883002519607544, + "learning_rate": 2.8574240479246896e-05, + "loss": 0.4814, + "step": 11127 + }, + { + "epoch": 14.284980744544288, + "grad_norm": 2.7258148193359375, + "learning_rate": 2.8573812580231067e-05, + "loss": 0.5029, + "step": 11128 + }, + { + "epoch": 14.286264441591785, + "grad_norm": 1.163527250289917, + "learning_rate": 2.8573384681215232e-05, + "loss": 0.524, + "step": 11129 + }, + { + "epoch": 14.28754813863928, + "grad_norm": 1.6802245378494263, + "learning_rate": 2.8572956782199404e-05, + "loss": 0.4505, + "step": 11130 + }, + { + "epoch": 14.288831835686779, + "grad_norm": 1.216096043586731, + "learning_rate": 2.857252888318357e-05, + "loss": 0.4716, + "step": 11131 + }, + { + "epoch": 14.290115532734275, + "grad_norm": 1.4977667331695557, + "learning_rate": 2.8572100984167738e-05, + "loss": 0.5538, + "step": 11132 + }, + { + "epoch": 14.29139922978177, + "grad_norm": 1.289055347442627, + "learning_rate": 2.8571673085151906e-05, + "loss": 0.5686, + "step": 11133 + }, + { + "epoch": 14.292682926829269, + "grad_norm": 1.9111182689666748, + "learning_rate": 2.857124518613607e-05, + "loss": 0.4848, + "step": 11134 + }, + { + "epoch": 14.293966623876765, + "grad_norm": 3.4675443172454834, + "learning_rate": 2.857081728712024e-05, + "loss": 0.5372, + "step": 11135 + }, + { + "epoch": 
14.295250320924263, + "grad_norm": 2.5406579971313477, + "learning_rate": 2.8570389388104408e-05, + "loss": 0.5174, + "step": 11136 + }, + { + "epoch": 14.296534017971759, + "grad_norm": 1.5844709873199463, + "learning_rate": 2.8569961489088576e-05, + "loss": 0.5111, + "step": 11137 + }, + { + "epoch": 14.297817715019255, + "grad_norm": 1.0410027503967285, + "learning_rate": 2.8569533590072745e-05, + "loss": 0.5377, + "step": 11138 + }, + { + "epoch": 14.299101412066753, + "grad_norm": 1.0859591960906982, + "learning_rate": 2.8569105691056913e-05, + "loss": 0.542, + "step": 11139 + }, + { + "epoch": 14.300385109114249, + "grad_norm": 0.86358642578125, + "learning_rate": 2.8568677792041078e-05, + "loss": 0.4858, + "step": 11140 + }, + { + "epoch": 14.301668806161747, + "grad_norm": 2.4748873710632324, + "learning_rate": 2.8568249893025247e-05, + "loss": 0.5163, + "step": 11141 + }, + { + "epoch": 14.302952503209243, + "grad_norm": 1.2180893421173096, + "learning_rate": 2.8567821994009415e-05, + "loss": 0.5269, + "step": 11142 + }, + { + "epoch": 14.304236200256739, + "grad_norm": 2.36974835395813, + "learning_rate": 2.856739409499358e-05, + "loss": 0.5654, + "step": 11143 + }, + { + "epoch": 14.305519897304237, + "grad_norm": 2.223614454269409, + "learning_rate": 2.8566966195977752e-05, + "loss": 0.5193, + "step": 11144 + }, + { + "epoch": 14.306803594351733, + "grad_norm": 2.3542304039001465, + "learning_rate": 2.8566538296961917e-05, + "loss": 0.5387, + "step": 11145 + }, + { + "epoch": 14.30808729139923, + "grad_norm": 2.121769428253174, + "learning_rate": 2.8566110397946085e-05, + "loss": 0.5211, + "step": 11146 + }, + { + "epoch": 14.309370988446727, + "grad_norm": 2.65657901763916, + "learning_rate": 2.8565682498930254e-05, + "loss": 0.4852, + "step": 11147 + }, + { + "epoch": 14.310654685494223, + "grad_norm": 1.1889394521713257, + "learning_rate": 2.856525459991442e-05, + "loss": 0.5533, + "step": 11148 + }, + { + "epoch": 14.31193838254172, + "grad_norm": 
1.892965316772461, + "learning_rate": 2.856482670089859e-05, + "loss": 0.6157, + "step": 11149 + }, + { + "epoch": 14.313222079589217, + "grad_norm": 1.5970442295074463, + "learning_rate": 2.8564398801882756e-05, + "loss": 0.5815, + "step": 11150 + }, + { + "epoch": 14.314505776636715, + "grad_norm": 2.075761318206787, + "learning_rate": 2.8563970902866924e-05, + "loss": 0.5894, + "step": 11151 + }, + { + "epoch": 14.31578947368421, + "grad_norm": 2.210366725921631, + "learning_rate": 2.8563543003851092e-05, + "loss": 0.5883, + "step": 11152 + }, + { + "epoch": 14.317073170731707, + "grad_norm": 3.763866424560547, + "learning_rate": 2.8563115104835257e-05, + "loss": 0.5445, + "step": 11153 + }, + { + "epoch": 14.318356867779205, + "grad_norm": 3.4394843578338623, + "learning_rate": 2.856268720581943e-05, + "loss": 0.6246, + "step": 11154 + }, + { + "epoch": 14.3196405648267, + "grad_norm": 2.359863758087158, + "learning_rate": 2.8562259306803594e-05, + "loss": 0.6395, + "step": 11155 + }, + { + "epoch": 14.320924261874199, + "grad_norm": 11.178110122680664, + "learning_rate": 2.8561831407787763e-05, + "loss": 0.7889, + "step": 11156 + }, + { + "epoch": 14.322207958921695, + "grad_norm": 1.1680934429168701, + "learning_rate": 2.856140350877193e-05, + "loss": 0.4874, + "step": 11157 + }, + { + "epoch": 14.32349165596919, + "grad_norm": 1.4001436233520508, + "learning_rate": 2.85609756097561e-05, + "loss": 0.5094, + "step": 11158 + }, + { + "epoch": 14.324775353016689, + "grad_norm": 2.404340982437134, + "learning_rate": 2.8560547710740264e-05, + "loss": 0.5261, + "step": 11159 + }, + { + "epoch": 14.326059050064185, + "grad_norm": 1.1516633033752441, + "learning_rate": 2.8560119811724433e-05, + "loss": 0.5154, + "step": 11160 + }, + { + "epoch": 14.327342747111683, + "grad_norm": 1.2522755861282349, + "learning_rate": 2.85596919127086e-05, + "loss": 0.508, + "step": 11161 + }, + { + "epoch": 14.328626444159179, + "grad_norm": 1.6317943334579468, + "learning_rate": 
2.855926401369277e-05, + "loss": 0.5027, + "step": 11162 + }, + { + "epoch": 14.329910141206675, + "grad_norm": 1.0241619348526, + "learning_rate": 2.8558836114676938e-05, + "loss": 0.5039, + "step": 11163 + }, + { + "epoch": 14.331193838254173, + "grad_norm": 1.30228853225708, + "learning_rate": 2.8558408215661103e-05, + "loss": 0.5346, + "step": 11164 + }, + { + "epoch": 14.332477535301669, + "grad_norm": 1.4489834308624268, + "learning_rate": 2.8557980316645275e-05, + "loss": 0.5256, + "step": 11165 + }, + { + "epoch": 14.333761232349165, + "grad_norm": 1.374328851699829, + "learning_rate": 2.855755241762944e-05, + "loss": 0.499, + "step": 11166 + }, + { + "epoch": 14.335044929396663, + "grad_norm": 4.785099506378174, + "learning_rate": 2.8557124518613605e-05, + "loss": 0.5122, + "step": 11167 + }, + { + "epoch": 14.336328626444159, + "grad_norm": 1.4309576749801636, + "learning_rate": 2.8556696619597777e-05, + "loss": 0.4826, + "step": 11168 + }, + { + "epoch": 14.337612323491657, + "grad_norm": 1.3892745971679688, + "learning_rate": 2.8556268720581942e-05, + "loss": 0.5261, + "step": 11169 + }, + { + "epoch": 14.338896020539153, + "grad_norm": 2.5070297718048096, + "learning_rate": 2.8555840821566114e-05, + "loss": 0.5287, + "step": 11170 + }, + { + "epoch": 14.340179717586649, + "grad_norm": 1.9171783924102783, + "learning_rate": 2.855541292255028e-05, + "loss": 0.4888, + "step": 11171 + }, + { + "epoch": 14.341463414634147, + "grad_norm": 2.469585418701172, + "learning_rate": 2.8554985023534447e-05, + "loss": 0.554, + "step": 11172 + }, + { + "epoch": 14.342747111681643, + "grad_norm": 1.5952476263046265, + "learning_rate": 2.8554557124518615e-05, + "loss": 0.5016, + "step": 11173 + }, + { + "epoch": 14.34403080872914, + "grad_norm": 1.529957890510559, + "learning_rate": 2.855412922550278e-05, + "loss": 0.5343, + "step": 11174 + }, + { + "epoch": 14.345314505776637, + "grad_norm": 1.3606934547424316, + "learning_rate": 2.855370132648695e-05, + "loss": 
0.5137, + "step": 11175 + }, + { + "epoch": 14.346598202824133, + "grad_norm": 0.9541172385215759, + "learning_rate": 2.8553273427471117e-05, + "loss": 0.5236, + "step": 11176 + }, + { + "epoch": 14.34788189987163, + "grad_norm": 0.914444088935852, + "learning_rate": 2.8552845528455286e-05, + "loss": 0.505, + "step": 11177 + }, + { + "epoch": 14.349165596919127, + "grad_norm": 1.9345617294311523, + "learning_rate": 2.8552417629439454e-05, + "loss": 0.5042, + "step": 11178 + }, + { + "epoch": 14.350449293966625, + "grad_norm": 2.4211084842681885, + "learning_rate": 2.8551989730423623e-05, + "loss": 0.5153, + "step": 11179 + }, + { + "epoch": 14.35173299101412, + "grad_norm": 1.9763245582580566, + "learning_rate": 2.8551561831407788e-05, + "loss": 0.5143, + "step": 11180 + }, + { + "epoch": 14.353016688061617, + "grad_norm": 1.2733100652694702, + "learning_rate": 2.8551133932391956e-05, + "loss": 0.4974, + "step": 11181 + }, + { + "epoch": 14.354300385109115, + "grad_norm": 2.7378201484680176, + "learning_rate": 2.8550706033376124e-05, + "loss": 0.506, + "step": 11182 + }, + { + "epoch": 14.35558408215661, + "grad_norm": 1.0306488275527954, + "learning_rate": 2.855027813436029e-05, + "loss": 0.5308, + "step": 11183 + }, + { + "epoch": 14.356867779204109, + "grad_norm": 1.6296918392181396, + "learning_rate": 2.854985023534446e-05, + "loss": 0.4806, + "step": 11184 + }, + { + "epoch": 14.358151476251605, + "grad_norm": 1.0957388877868652, + "learning_rate": 2.8549422336328626e-05, + "loss": 0.5207, + "step": 11185 + }, + { + "epoch": 14.3594351732991, + "grad_norm": 1.4333999156951904, + "learning_rate": 2.8548994437312798e-05, + "loss": 0.5668, + "step": 11186 + }, + { + "epoch": 14.360718870346599, + "grad_norm": 1.4018640518188477, + "learning_rate": 2.8548566538296963e-05, + "loss": 0.5399, + "step": 11187 + }, + { + "epoch": 14.362002567394095, + "grad_norm": 1.6394773721694946, + "learning_rate": 2.8548138639281128e-05, + "loss": 0.5127, + "step": 11188 + }, + { 
+ "epoch": 14.363286264441593, + "grad_norm": 1.8924692869186401, + "learning_rate": 2.85477107402653e-05, + "loss": 0.5402, + "step": 11189 + }, + { + "epoch": 14.364569961489089, + "grad_norm": 2.59647274017334, + "learning_rate": 2.8547282841249465e-05, + "loss": 0.5154, + "step": 11190 + }, + { + "epoch": 14.365853658536585, + "grad_norm": 1.183301329612732, + "learning_rate": 2.8546854942233633e-05, + "loss": 0.5403, + "step": 11191 + }, + { + "epoch": 14.367137355584083, + "grad_norm": 1.512192726135254, + "learning_rate": 2.8546427043217802e-05, + "loss": 0.5818, + "step": 11192 + }, + { + "epoch": 14.368421052631579, + "grad_norm": 1.3971248865127563, + "learning_rate": 2.854599914420197e-05, + "loss": 0.491, + "step": 11193 + }, + { + "epoch": 14.369704749679077, + "grad_norm": 1.5938369035720825, + "learning_rate": 2.854557124518614e-05, + "loss": 0.5249, + "step": 11194 + }, + { + "epoch": 14.370988446726573, + "grad_norm": 2.67376708984375, + "learning_rate": 2.8545143346170304e-05, + "loss": 0.5538, + "step": 11195 + }, + { + "epoch": 14.372272143774069, + "grad_norm": 1.4846502542495728, + "learning_rate": 2.8544715447154472e-05, + "loss": 0.6204, + "step": 11196 + }, + { + "epoch": 14.373555840821567, + "grad_norm": 1.0761984586715698, + "learning_rate": 2.854428754813864e-05, + "loss": 0.599, + "step": 11197 + }, + { + "epoch": 14.374839537869063, + "grad_norm": 1.4445313215255737, + "learning_rate": 2.854385964912281e-05, + "loss": 0.6207, + "step": 11198 + }, + { + "epoch": 14.376123234916559, + "grad_norm": 1.509890079498291, + "learning_rate": 2.8543431750106974e-05, + "loss": 0.5589, + "step": 11199 + }, + { + "epoch": 14.377406931964057, + "grad_norm": 1.8385246992111206, + "learning_rate": 2.8543003851091146e-05, + "loss": 0.5369, + "step": 11200 + }, + { + "epoch": 14.378690629011553, + "grad_norm": 1.19007408618927, + "learning_rate": 2.854257595207531e-05, + "loss": 0.5608, + "step": 11201 + }, + { + "epoch": 14.37997432605905, + 
"grad_norm": 1.707911729812622, + "learning_rate": 2.854214805305948e-05, + "loss": 0.5615, + "step": 11202 + }, + { + "epoch": 14.381258023106547, + "grad_norm": 1.5992614030838013, + "learning_rate": 2.8541720154043647e-05, + "loss": 0.6108, + "step": 11203 + }, + { + "epoch": 14.382541720154043, + "grad_norm": 2.440462112426758, + "learning_rate": 2.8541292255027813e-05, + "loss": 0.5739, + "step": 11204 + }, + { + "epoch": 14.38382541720154, + "grad_norm": 2.5433571338653564, + "learning_rate": 2.8540864356011984e-05, + "loss": 0.6747, + "step": 11205 + }, + { + "epoch": 14.385109114249037, + "grad_norm": 3.276076316833496, + "learning_rate": 2.854043645699615e-05, + "loss": 0.7081, + "step": 11206 + }, + { + "epoch": 14.386392811296535, + "grad_norm": 1.197383165359497, + "learning_rate": 2.8540008557980314e-05, + "loss": 0.5159, + "step": 11207 + }, + { + "epoch": 14.38767650834403, + "grad_norm": 1.32177734375, + "learning_rate": 2.8539580658964486e-05, + "loss": 0.5116, + "step": 11208 + }, + { + "epoch": 14.388960205391527, + "grad_norm": 1.0016858577728271, + "learning_rate": 2.853915275994865e-05, + "loss": 0.5065, + "step": 11209 + }, + { + "epoch": 14.390243902439025, + "grad_norm": 1.3223508596420288, + "learning_rate": 2.8538724860932823e-05, + "loss": 0.5189, + "step": 11210 + }, + { + "epoch": 14.39152759948652, + "grad_norm": 0.9457641839981079, + "learning_rate": 2.8538296961916988e-05, + "loss": 0.4939, + "step": 11211 + }, + { + "epoch": 14.392811296534019, + "grad_norm": 1.0990593433380127, + "learning_rate": 2.8537869062901156e-05, + "loss": 0.4947, + "step": 11212 + }, + { + "epoch": 14.394094993581515, + "grad_norm": 1.36319100856781, + "learning_rate": 2.8537441163885325e-05, + "loss": 0.5304, + "step": 11213 + }, + { + "epoch": 14.39537869062901, + "grad_norm": 2.758361339569092, + "learning_rate": 2.853701326486949e-05, + "loss": 0.5182, + "step": 11214 + }, + { + "epoch": 14.396662387676509, + "grad_norm": 0.795367419719696, + 
"learning_rate": 2.8536585365853658e-05, + "loss": 0.483, + "step": 11215 + }, + { + "epoch": 14.397946084724005, + "grad_norm": 2.5619139671325684, + "learning_rate": 2.8536157466837827e-05, + "loss": 0.5011, + "step": 11216 + }, + { + "epoch": 14.399229781771503, + "grad_norm": 2.5168187618255615, + "learning_rate": 2.8535729567821995e-05, + "loss": 0.488, + "step": 11217 + }, + { + "epoch": 14.400513478818999, + "grad_norm": 1.121518611907959, + "learning_rate": 2.8535301668806164e-05, + "loss": 0.5277, + "step": 11218 + }, + { + "epoch": 14.401797175866495, + "grad_norm": 2.1024723052978516, + "learning_rate": 2.8534873769790332e-05, + "loss": 0.4941, + "step": 11219 + }, + { + "epoch": 14.403080872913993, + "grad_norm": 2.5191195011138916, + "learning_rate": 2.8534445870774497e-05, + "loss": 0.5527, + "step": 11220 + }, + { + "epoch": 14.404364569961489, + "grad_norm": 1.0872247219085693, + "learning_rate": 2.8534017971758665e-05, + "loss": 0.5398, + "step": 11221 + }, + { + "epoch": 14.405648267008987, + "grad_norm": 1.4103518724441528, + "learning_rate": 2.8533590072742834e-05, + "loss": 0.4994, + "step": 11222 + }, + { + "epoch": 14.406931964056483, + "grad_norm": 1.3270602226257324, + "learning_rate": 2.8533162173727e-05, + "loss": 0.5293, + "step": 11223 + }, + { + "epoch": 14.408215661103979, + "grad_norm": 1.1611310243606567, + "learning_rate": 2.853273427471117e-05, + "loss": 0.5014, + "step": 11224 + }, + { + "epoch": 14.409499358151477, + "grad_norm": 1.1480084657669067, + "learning_rate": 2.8532306375695336e-05, + "loss": 0.4735, + "step": 11225 + }, + { + "epoch": 14.410783055198973, + "grad_norm": 1.088027000427246, + "learning_rate": 2.8531878476679507e-05, + "loss": 0.4857, + "step": 11226 + }, + { + "epoch": 14.41206675224647, + "grad_norm": 1.1034983396530151, + "learning_rate": 2.8531450577663672e-05, + "loss": 0.5432, + "step": 11227 + }, + { + "epoch": 14.413350449293967, + "grad_norm": 1.7154017686843872, + "learning_rate": 
2.8531022678647837e-05, + "loss": 0.5435, + "step": 11228 + }, + { + "epoch": 14.414634146341463, + "grad_norm": 2.021449089050293, + "learning_rate": 2.853059477963201e-05, + "loss": 0.4959, + "step": 11229 + }, + { + "epoch": 14.41591784338896, + "grad_norm": 1.2690330743789673, + "learning_rate": 2.8530166880616174e-05, + "loss": 0.4919, + "step": 11230 + }, + { + "epoch": 14.417201540436457, + "grad_norm": 1.880265712738037, + "learning_rate": 2.8529738981600343e-05, + "loss": 0.4914, + "step": 11231 + }, + { + "epoch": 14.418485237483953, + "grad_norm": 4.852440357208252, + "learning_rate": 2.852931108258451e-05, + "loss": 0.5312, + "step": 11232 + }, + { + "epoch": 14.41976893453145, + "grad_norm": 2.0337960720062256, + "learning_rate": 2.852888318356868e-05, + "loss": 0.5006, + "step": 11233 + }, + { + "epoch": 14.421052631578947, + "grad_norm": 1.3703649044036865, + "learning_rate": 2.8528455284552848e-05, + "loss": 0.5362, + "step": 11234 + }, + { + "epoch": 14.422336328626445, + "grad_norm": 1.149348497390747, + "learning_rate": 2.8528027385537013e-05, + "loss": 0.4729, + "step": 11235 + }, + { + "epoch": 14.42362002567394, + "grad_norm": 1.1651345491409302, + "learning_rate": 2.852759948652118e-05, + "loss": 0.495, + "step": 11236 + }, + { + "epoch": 14.424903722721437, + "grad_norm": 1.6039555072784424, + "learning_rate": 2.852717158750535e-05, + "loss": 0.507, + "step": 11237 + }, + { + "epoch": 14.426187419768935, + "grad_norm": 0.9304179549217224, + "learning_rate": 2.8526743688489518e-05, + "loss": 0.4878, + "step": 11238 + }, + { + "epoch": 14.427471116816431, + "grad_norm": 1.2998073101043701, + "learning_rate": 2.8526315789473683e-05, + "loss": 0.5325, + "step": 11239 + }, + { + "epoch": 14.428754813863929, + "grad_norm": 1.461126446723938, + "learning_rate": 2.8525887890457855e-05, + "loss": 0.5366, + "step": 11240 + }, + { + "epoch": 14.430038510911425, + "grad_norm": 0.9936828017234802, + "learning_rate": 2.852545999144202e-05, + "loss": 
0.5337, + "step": 11241 + }, + { + "epoch": 14.431322207958921, + "grad_norm": 1.4171146154403687, + "learning_rate": 2.8525032092426185e-05, + "loss": 0.5668, + "step": 11242 + }, + { + "epoch": 14.432605905006419, + "grad_norm": 2.1883206367492676, + "learning_rate": 2.8524604193410357e-05, + "loss": 0.5863, + "step": 11243 + }, + { + "epoch": 14.433889602053915, + "grad_norm": 1.8611241579055786, + "learning_rate": 2.8524176294394522e-05, + "loss": 0.522, + "step": 11244 + }, + { + "epoch": 14.435173299101413, + "grad_norm": 1.1320117712020874, + "learning_rate": 2.8523748395378694e-05, + "loss": 0.5135, + "step": 11245 + }, + { + "epoch": 14.436456996148909, + "grad_norm": 1.1270970106124878, + "learning_rate": 2.852332049636286e-05, + "loss": 0.5629, + "step": 11246 + }, + { + "epoch": 14.437740693196405, + "grad_norm": 2.874148368835449, + "learning_rate": 2.8522892597347027e-05, + "loss": 0.5618, + "step": 11247 + }, + { + "epoch": 14.439024390243903, + "grad_norm": 1.0960371494293213, + "learning_rate": 2.8522464698331196e-05, + "loss": 0.568, + "step": 11248 + }, + { + "epoch": 14.440308087291399, + "grad_norm": 2.022217273712158, + "learning_rate": 2.852203679931536e-05, + "loss": 0.4994, + "step": 11249 + }, + { + "epoch": 14.441591784338897, + "grad_norm": 1.0138144493103027, + "learning_rate": 2.852160890029953e-05, + "loss": 0.5221, + "step": 11250 + }, + { + "epoch": 14.442875481386393, + "grad_norm": 0.9940122365951538, + "learning_rate": 2.8521181001283697e-05, + "loss": 0.5174, + "step": 11251 + }, + { + "epoch": 14.444159178433889, + "grad_norm": 5.858369827270508, + "learning_rate": 2.8520753102267866e-05, + "loss": 0.5526, + "step": 11252 + }, + { + "epoch": 14.445442875481387, + "grad_norm": 1.2752941846847534, + "learning_rate": 2.8520325203252034e-05, + "loss": 0.5628, + "step": 11253 + }, + { + "epoch": 14.446726572528883, + "grad_norm": 2.254887342453003, + "learning_rate": 2.8519897304236203e-05, + "loss": 0.6526, + "step": 11254 + }, + { 
+ "epoch": 14.44801026957638, + "grad_norm": 2.2599422931671143, + "learning_rate": 2.8519469405220368e-05, + "loss": 0.7025, + "step": 11255 + }, + { + "epoch": 14.449293966623877, + "grad_norm": 2.5332717895507812, + "learning_rate": 2.8519041506204536e-05, + "loss": 0.7611, + "step": 11256 + }, + { + "epoch": 14.450577663671373, + "grad_norm": 0.7773675918579102, + "learning_rate": 2.8518613607188704e-05, + "loss": 0.4794, + "step": 11257 + }, + { + "epoch": 14.45186136071887, + "grad_norm": 1.2828224897384644, + "learning_rate": 2.851818570817287e-05, + "loss": 0.5009, + "step": 11258 + }, + { + "epoch": 14.453145057766367, + "grad_norm": 1.6715667247772217, + "learning_rate": 2.851775780915704e-05, + "loss": 0.481, + "step": 11259 + }, + { + "epoch": 14.454428754813865, + "grad_norm": 0.971228837966919, + "learning_rate": 2.8517329910141206e-05, + "loss": 0.5019, + "step": 11260 + }, + { + "epoch": 14.455712451861361, + "grad_norm": 0.8116723895072937, + "learning_rate": 2.8516902011125378e-05, + "loss": 0.5232, + "step": 11261 + }, + { + "epoch": 14.456996148908857, + "grad_norm": 0.815962553024292, + "learning_rate": 2.8516474112109543e-05, + "loss": 0.5267, + "step": 11262 + }, + { + "epoch": 14.458279845956355, + "grad_norm": 1.4175525903701782, + "learning_rate": 2.8516046213093708e-05, + "loss": 0.4894, + "step": 11263 + }, + { + "epoch": 14.459563543003851, + "grad_norm": 1.1719963550567627, + "learning_rate": 2.851561831407788e-05, + "loss": 0.5545, + "step": 11264 + }, + { + "epoch": 14.460847240051347, + "grad_norm": 1.350338339805603, + "learning_rate": 2.8515190415062045e-05, + "loss": 0.535, + "step": 11265 + }, + { + "epoch": 14.462130937098845, + "grad_norm": 0.9697319269180298, + "learning_rate": 2.8514762516046213e-05, + "loss": 0.5023, + "step": 11266 + }, + { + "epoch": 14.463414634146341, + "grad_norm": 1.4068067073822021, + "learning_rate": 2.8514334617030382e-05, + "loss": 0.5039, + "step": 11267 + }, + { + "epoch": 14.464698331193839, + 
"grad_norm": 1.2200344800949097, + "learning_rate": 2.8513906718014547e-05, + "loss": 0.4985, + "step": 11268 + }, + { + "epoch": 14.465982028241335, + "grad_norm": 1.0869207382202148, + "learning_rate": 2.851347881899872e-05, + "loss": 0.4935, + "step": 11269 + }, + { + "epoch": 14.467265725288831, + "grad_norm": 2.2795987129211426, + "learning_rate": 2.8513050919982884e-05, + "loss": 0.5414, + "step": 11270 + }, + { + "epoch": 14.468549422336329, + "grad_norm": 0.9852678179740906, + "learning_rate": 2.8512623020967052e-05, + "loss": 0.4701, + "step": 11271 + }, + { + "epoch": 14.469833119383825, + "grad_norm": 1.4138234853744507, + "learning_rate": 2.851219512195122e-05, + "loss": 0.4958, + "step": 11272 + }, + { + "epoch": 14.471116816431323, + "grad_norm": 1.0953277349472046, + "learning_rate": 2.851176722293539e-05, + "loss": 0.5001, + "step": 11273 + }, + { + "epoch": 14.472400513478819, + "grad_norm": 1.054349422454834, + "learning_rate": 2.8511339323919554e-05, + "loss": 0.5169, + "step": 11274 + }, + { + "epoch": 14.473684210526315, + "grad_norm": 1.2972122430801392, + "learning_rate": 2.8510911424903722e-05, + "loss": 0.5126, + "step": 11275 + }, + { + "epoch": 14.474967907573813, + "grad_norm": 1.5919971466064453, + "learning_rate": 2.851048352588789e-05, + "loss": 0.5254, + "step": 11276 + }, + { + "epoch": 14.476251604621309, + "grad_norm": 1.2713481187820435, + "learning_rate": 2.851005562687206e-05, + "loss": 0.4859, + "step": 11277 + }, + { + "epoch": 14.477535301668807, + "grad_norm": 1.794670581817627, + "learning_rate": 2.8509627727856228e-05, + "loss": 0.5261, + "step": 11278 + }, + { + "epoch": 14.478818998716303, + "grad_norm": 2.044734477996826, + "learning_rate": 2.8509199828840393e-05, + "loss": 0.5033, + "step": 11279 + }, + { + "epoch": 14.480102695763799, + "grad_norm": 1.3803094625473022, + "learning_rate": 2.8508771929824564e-05, + "loss": 0.5453, + "step": 11280 + }, + { + "epoch": 14.481386392811297, + "grad_norm": 2.693096876144409, 
+ "learning_rate": 2.850834403080873e-05, + "loss": 0.4924, + "step": 11281 + }, + { + "epoch": 14.482670089858793, + "grad_norm": 2.0000147819519043, + "learning_rate": 2.8507916131792894e-05, + "loss": 0.5061, + "step": 11282 + }, + { + "epoch": 14.48395378690629, + "grad_norm": 3.4235825538635254, + "learning_rate": 2.8507488232777066e-05, + "loss": 0.5051, + "step": 11283 + }, + { + "epoch": 14.485237483953787, + "grad_norm": 1.1560802459716797, + "learning_rate": 2.850706033376123e-05, + "loss": 0.5515, + "step": 11284 + }, + { + "epoch": 14.486521181001283, + "grad_norm": 1.4510399103164673, + "learning_rate": 2.8506632434745403e-05, + "loss": 0.5368, + "step": 11285 + }, + { + "epoch": 14.487804878048781, + "grad_norm": 1.1375434398651123, + "learning_rate": 2.8506204535729568e-05, + "loss": 0.5079, + "step": 11286 + }, + { + "epoch": 14.489088575096277, + "grad_norm": 1.4697167873382568, + "learning_rate": 2.8505776636713736e-05, + "loss": 0.5064, + "step": 11287 + }, + { + "epoch": 14.490372272143775, + "grad_norm": 2.085357427597046, + "learning_rate": 2.8505348737697905e-05, + "loss": 0.5592, + "step": 11288 + }, + { + "epoch": 14.491655969191271, + "grad_norm": 3.545893669128418, + "learning_rate": 2.850492083868207e-05, + "loss": 0.5458, + "step": 11289 + }, + { + "epoch": 14.492939666238767, + "grad_norm": 1.4712722301483154, + "learning_rate": 2.850449293966624e-05, + "loss": 0.5057, + "step": 11290 + }, + { + "epoch": 14.494223363286265, + "grad_norm": 0.9856816530227661, + "learning_rate": 2.8504065040650407e-05, + "loss": 0.5077, + "step": 11291 + }, + { + "epoch": 14.495507060333761, + "grad_norm": 1.3208919763565063, + "learning_rate": 2.8503637141634575e-05, + "loss": 0.5111, + "step": 11292 + }, + { + "epoch": 14.496790757381259, + "grad_norm": 1.3352220058441162, + "learning_rate": 2.8503209242618744e-05, + "loss": 0.5063, + "step": 11293 + }, + { + "epoch": 14.498074454428755, + "grad_norm": 3.3380494117736816, + "learning_rate": 
2.8502781343602912e-05, + "loss": 0.5809, + "step": 11294 + }, + { + "epoch": 14.499358151476251, + "grad_norm": 2.6683080196380615, + "learning_rate": 2.8502353444587077e-05, + "loss": 0.5328, + "step": 11295 + }, + { + "epoch": 14.500641848523749, + "grad_norm": 3.0551843643188477, + "learning_rate": 2.8501925545571245e-05, + "loss": 0.5316, + "step": 11296 + }, + { + "epoch": 14.501925545571245, + "grad_norm": 1.5715378522872925, + "learning_rate": 2.8501497646555414e-05, + "loss": 0.5693, + "step": 11297 + }, + { + "epoch": 14.503209242618741, + "grad_norm": 1.2992799282073975, + "learning_rate": 2.850106974753958e-05, + "loss": 0.5588, + "step": 11298 + }, + { + "epoch": 14.504492939666239, + "grad_norm": 1.1301500797271729, + "learning_rate": 2.850064184852375e-05, + "loss": 0.6121, + "step": 11299 + }, + { + "epoch": 14.505776636713735, + "grad_norm": 0.8593239784240723, + "learning_rate": 2.8500213949507916e-05, + "loss": 0.5203, + "step": 11300 + }, + { + "epoch": 14.507060333761233, + "grad_norm": 0.9998995661735535, + "learning_rate": 2.8499786050492087e-05, + "loss": 0.5606, + "step": 11301 + }, + { + "epoch": 14.508344030808729, + "grad_norm": 1.2225362062454224, + "learning_rate": 2.8499358151476252e-05, + "loss": 0.5995, + "step": 11302 + }, + { + "epoch": 14.509627727856225, + "grad_norm": 1.8330862522125244, + "learning_rate": 2.8498930252460418e-05, + "loss": 0.6003, + "step": 11303 + }, + { + "epoch": 14.510911424903723, + "grad_norm": 1.300957441329956, + "learning_rate": 2.849850235344459e-05, + "loss": 0.6362, + "step": 11304 + }, + { + "epoch": 14.512195121951219, + "grad_norm": 1.593718409538269, + "learning_rate": 2.8498074454428754e-05, + "loss": 0.6349, + "step": 11305 + }, + { + "epoch": 14.513478818998717, + "grad_norm": 2.756187677383423, + "learning_rate": 2.8497646555412923e-05, + "loss": 0.7798, + "step": 11306 + }, + { + "epoch": 14.514762516046213, + "grad_norm": 1.1991304159164429, + "learning_rate": 2.849721865639709e-05, + 
"loss": 0.4819, + "step": 11307 + }, + { + "epoch": 14.51604621309371, + "grad_norm": 0.9600969552993774, + "learning_rate": 2.849679075738126e-05, + "loss": 0.4689, + "step": 11308 + }, + { + "epoch": 14.517329910141207, + "grad_norm": 1.271815299987793, + "learning_rate": 2.8496362858365428e-05, + "loss": 0.5099, + "step": 11309 + }, + { + "epoch": 14.518613607188703, + "grad_norm": 1.0205414295196533, + "learning_rate": 2.8495934959349593e-05, + "loss": 0.5201, + "step": 11310 + }, + { + "epoch": 14.519897304236201, + "grad_norm": 1.3660025596618652, + "learning_rate": 2.849550706033376e-05, + "loss": 0.5216, + "step": 11311 + }, + { + "epoch": 14.521181001283697, + "grad_norm": 1.016108751296997, + "learning_rate": 2.849507916131793e-05, + "loss": 0.5065, + "step": 11312 + }, + { + "epoch": 14.522464698331193, + "grad_norm": 1.2889820337295532, + "learning_rate": 2.8494651262302098e-05, + "loss": 0.5061, + "step": 11313 + }, + { + "epoch": 14.523748395378691, + "grad_norm": 0.6739168763160706, + "learning_rate": 2.8494223363286263e-05, + "loss": 0.4722, + "step": 11314 + }, + { + "epoch": 14.525032092426187, + "grad_norm": 1.8601378202438354, + "learning_rate": 2.8493795464270435e-05, + "loss": 0.4918, + "step": 11315 + }, + { + "epoch": 14.526315789473685, + "grad_norm": 3.0072529315948486, + "learning_rate": 2.84933675652546e-05, + "loss": 0.5319, + "step": 11316 + }, + { + "epoch": 14.527599486521181, + "grad_norm": 1.132348656654358, + "learning_rate": 2.849293966623877e-05, + "loss": 0.5073, + "step": 11317 + }, + { + "epoch": 14.528883183568677, + "grad_norm": 2.1178462505340576, + "learning_rate": 2.8492511767222937e-05, + "loss": 0.5224, + "step": 11318 + }, + { + "epoch": 14.530166880616175, + "grad_norm": 0.8824914693832397, + "learning_rate": 2.8492083868207102e-05, + "loss": 0.4948, + "step": 11319 + }, + { + "epoch": 14.531450577663671, + "grad_norm": 0.8571637868881226, + "learning_rate": 2.8491655969191274e-05, + "loss": 0.5326, + "step": 11320 + 
}, + { + "epoch": 14.532734274711169, + "grad_norm": 1.3720389604568481, + "learning_rate": 2.849122807017544e-05, + "loss": 0.5398, + "step": 11321 + }, + { + "epoch": 14.534017971758665, + "grad_norm": 1.1076778173446655, + "learning_rate": 2.8490800171159607e-05, + "loss": 0.5311, + "step": 11322 + }, + { + "epoch": 14.535301668806161, + "grad_norm": 2.1650400161743164, + "learning_rate": 2.8490372272143776e-05, + "loss": 0.5142, + "step": 11323 + }, + { + "epoch": 14.536585365853659, + "grad_norm": 0.9178436994552612, + "learning_rate": 2.848994437312794e-05, + "loss": 0.5067, + "step": 11324 + }, + { + "epoch": 14.537869062901155, + "grad_norm": 1.9270633459091187, + "learning_rate": 2.8489516474112112e-05, + "loss": 0.5028, + "step": 11325 + }, + { + "epoch": 14.539152759948653, + "grad_norm": 1.6621370315551758, + "learning_rate": 2.8489088575096277e-05, + "loss": 0.5388, + "step": 11326 + }, + { + "epoch": 14.540436456996149, + "grad_norm": 0.9594261646270752, + "learning_rate": 2.8488660676080446e-05, + "loss": 0.5292, + "step": 11327 + }, + { + "epoch": 14.541720154043645, + "grad_norm": 0.9269458651542664, + "learning_rate": 2.8488232777064614e-05, + "loss": 0.5128, + "step": 11328 + }, + { + "epoch": 14.543003851091143, + "grad_norm": 1.2183895111083984, + "learning_rate": 2.8487804878048783e-05, + "loss": 0.496, + "step": 11329 + }, + { + "epoch": 14.544287548138639, + "grad_norm": 0.9812383055686951, + "learning_rate": 2.8487376979032948e-05, + "loss": 0.4803, + "step": 11330 + }, + { + "epoch": 14.545571245186135, + "grad_norm": 0.9768988490104675, + "learning_rate": 2.8486949080017116e-05, + "loss": 0.5547, + "step": 11331 + }, + { + "epoch": 14.546854942233633, + "grad_norm": 0.9380442500114441, + "learning_rate": 2.8486521181001285e-05, + "loss": 0.5708, + "step": 11332 + }, + { + "epoch": 14.54813863928113, + "grad_norm": 2.290647268295288, + "learning_rate": 2.8486093281985453e-05, + "loss": 0.5274, + "step": 11333 + }, + { + "epoch": 
14.549422336328627, + "grad_norm": 1.6602410078048706, + "learning_rate": 2.848566538296962e-05, + "loss": 0.5021, + "step": 11334 + }, + { + "epoch": 14.550706033376123, + "grad_norm": 1.293771743774414, + "learning_rate": 2.8485237483953786e-05, + "loss": 0.5172, + "step": 11335 + }, + { + "epoch": 14.55198973042362, + "grad_norm": 1.5680017471313477, + "learning_rate": 2.8484809584937955e-05, + "loss": 0.4887, + "step": 11336 + }, + { + "epoch": 14.553273427471117, + "grad_norm": 1.296775460243225, + "learning_rate": 2.8484381685922123e-05, + "loss": 0.5879, + "step": 11337 + }, + { + "epoch": 14.554557124518613, + "grad_norm": 1.8606581687927246, + "learning_rate": 2.8483953786906288e-05, + "loss": 0.5374, + "step": 11338 + }, + { + "epoch": 14.555840821566111, + "grad_norm": 0.9086663126945496, + "learning_rate": 2.848352588789046e-05, + "loss": 0.5381, + "step": 11339 + }, + { + "epoch": 14.557124518613607, + "grad_norm": 0.9999949932098389, + "learning_rate": 2.8483097988874625e-05, + "loss": 0.5371, + "step": 11340 + }, + { + "epoch": 14.558408215661103, + "grad_norm": 1.1854833364486694, + "learning_rate": 2.8482670089858797e-05, + "loss": 0.5801, + "step": 11341 + }, + { + "epoch": 14.559691912708601, + "grad_norm": 1.648566484451294, + "learning_rate": 2.8482242190842962e-05, + "loss": 0.5155, + "step": 11342 + }, + { + "epoch": 14.560975609756097, + "grad_norm": 1.632399320602417, + "learning_rate": 2.8481814291827127e-05, + "loss": 0.5347, + "step": 11343 + }, + { + "epoch": 14.562259306803595, + "grad_norm": 2.1484591960906982, + "learning_rate": 2.84813863928113e-05, + "loss": 0.496, + "step": 11344 + }, + { + "epoch": 14.563543003851091, + "grad_norm": 1.7536537647247314, + "learning_rate": 2.8480958493795464e-05, + "loss": 0.5508, + "step": 11345 + }, + { + "epoch": 14.564826700898587, + "grad_norm": 1.0114389657974243, + "learning_rate": 2.8480530594779632e-05, + "loss": 0.5188, + "step": 11346 + }, + { + "epoch": 14.566110397946085, + 
"grad_norm": 1.1050901412963867, + "learning_rate": 2.84801026957638e-05, + "loss": 0.517, + "step": 11347 + }, + { + "epoch": 14.567394094993581, + "grad_norm": 1.3890639543533325, + "learning_rate": 2.847967479674797e-05, + "loss": 0.5471, + "step": 11348 + }, + { + "epoch": 14.568677792041079, + "grad_norm": 2.278062582015991, + "learning_rate": 2.8479246897732137e-05, + "loss": 0.5419, + "step": 11349 + }, + { + "epoch": 14.569961489088575, + "grad_norm": 1.235758662223816, + "learning_rate": 2.8478818998716302e-05, + "loss": 0.5041, + "step": 11350 + }, + { + "epoch": 14.571245186136071, + "grad_norm": 2.4163782596588135, + "learning_rate": 2.847839109970047e-05, + "loss": 0.61, + "step": 11351 + }, + { + "epoch": 14.572528883183569, + "grad_norm": 1.2592869997024536, + "learning_rate": 2.847796320068464e-05, + "loss": 0.5498, + "step": 11352 + }, + { + "epoch": 14.573812580231065, + "grad_norm": 2.782506227493286, + "learning_rate": 2.8477535301668808e-05, + "loss": 0.6603, + "step": 11353 + }, + { + "epoch": 14.575096277278563, + "grad_norm": 33.80615234375, + "learning_rate": 2.8477107402652973e-05, + "loss": 0.6144, + "step": 11354 + }, + { + "epoch": 14.57637997432606, + "grad_norm": 2.667407512664795, + "learning_rate": 2.8476679503637144e-05, + "loss": 0.615, + "step": 11355 + }, + { + "epoch": 14.577663671373555, + "grad_norm": 3.93398118019104, + "learning_rate": 2.847625160462131e-05, + "loss": 0.7266, + "step": 11356 + }, + { + "epoch": 14.578947368421053, + "grad_norm": 1.3534530401229858, + "learning_rate": 2.8475823705605478e-05, + "loss": 0.4704, + "step": 11357 + }, + { + "epoch": 14.58023106546855, + "grad_norm": 0.9397999048233032, + "learning_rate": 2.8475395806589646e-05, + "loss": 0.5081, + "step": 11358 + }, + { + "epoch": 14.581514762516047, + "grad_norm": 1.6080818176269531, + "learning_rate": 2.847496790757381e-05, + "loss": 0.4745, + "step": 11359 + }, + { + "epoch": 14.582798459563543, + "grad_norm": 1.193949580192566, + 
"learning_rate": 2.8474540008557983e-05, + "loss": 0.541, + "step": 11360 + }, + { + "epoch": 14.58408215661104, + "grad_norm": 1.0843580961227417, + "learning_rate": 2.8474112109542148e-05, + "loss": 0.5267, + "step": 11361 + }, + { + "epoch": 14.585365853658537, + "grad_norm": 1.4916964769363403, + "learning_rate": 2.8473684210526317e-05, + "loss": 0.5263, + "step": 11362 + }, + { + "epoch": 14.586649550706033, + "grad_norm": 1.4110517501831055, + "learning_rate": 2.8473256311510485e-05, + "loss": 0.5161, + "step": 11363 + }, + { + "epoch": 14.58793324775353, + "grad_norm": 2.3057303428649902, + "learning_rate": 2.847282841249465e-05, + "loss": 0.4669, + "step": 11364 + }, + { + "epoch": 14.589216944801027, + "grad_norm": 3.604562997817993, + "learning_rate": 2.8472400513478822e-05, + "loss": 0.5083, + "step": 11365 + }, + { + "epoch": 14.590500641848523, + "grad_norm": 1.256223201751709, + "learning_rate": 2.8471972614462987e-05, + "loss": 0.5528, + "step": 11366 + }, + { + "epoch": 14.591784338896021, + "grad_norm": 1.20012629032135, + "learning_rate": 2.8471544715447155e-05, + "loss": 0.4973, + "step": 11367 + }, + { + "epoch": 14.593068035943517, + "grad_norm": 2.9210784435272217, + "learning_rate": 2.8471116816431324e-05, + "loss": 0.4807, + "step": 11368 + }, + { + "epoch": 14.594351732991013, + "grad_norm": 0.9026572108268738, + "learning_rate": 2.8470688917415492e-05, + "loss": 0.4866, + "step": 11369 + }, + { + "epoch": 14.595635430038511, + "grad_norm": 1.4312934875488281, + "learning_rate": 2.8470261018399657e-05, + "loss": 0.491, + "step": 11370 + }, + { + "epoch": 14.596919127086007, + "grad_norm": 1.8258545398712158, + "learning_rate": 2.8469833119383825e-05, + "loss": 0.5004, + "step": 11371 + }, + { + "epoch": 14.598202824133505, + "grad_norm": 1.9427812099456787, + "learning_rate": 2.8469405220367994e-05, + "loss": 0.5293, + "step": 11372 + }, + { + "epoch": 14.599486521181001, + "grad_norm": 1.9402525424957275, + "learning_rate": 
2.8468977321352162e-05, + "loss": 0.4949, + "step": 11373 + }, + { + "epoch": 14.600770218228497, + "grad_norm": 1.520020604133606, + "learning_rate": 2.846854942233633e-05, + "loss": 0.4958, + "step": 11374 + }, + { + "epoch": 14.602053915275995, + "grad_norm": 1.3384151458740234, + "learning_rate": 2.8468121523320496e-05, + "loss": 0.5177, + "step": 11375 + }, + { + "epoch": 14.603337612323491, + "grad_norm": 5.091301918029785, + "learning_rate": 2.8467693624304668e-05, + "loss": 0.505, + "step": 11376 + }, + { + "epoch": 14.60462130937099, + "grad_norm": 2.625053882598877, + "learning_rate": 2.8467265725288833e-05, + "loss": 0.4682, + "step": 11377 + }, + { + "epoch": 14.605905006418485, + "grad_norm": 1.1145588159561157, + "learning_rate": 2.8466837826272998e-05, + "loss": 0.4928, + "step": 11378 + }, + { + "epoch": 14.607188703465981, + "grad_norm": 0.8029578924179077, + "learning_rate": 2.846640992725717e-05, + "loss": 0.5492, + "step": 11379 + }, + { + "epoch": 14.60847240051348, + "grad_norm": 1.398854374885559, + "learning_rate": 2.8465982028241334e-05, + "loss": 0.4853, + "step": 11380 + }, + { + "epoch": 14.609756097560975, + "grad_norm": 0.9726335406303406, + "learning_rate": 2.8465554129225506e-05, + "loss": 0.4742, + "step": 11381 + }, + { + "epoch": 14.611039794608473, + "grad_norm": 1.6394902467727661, + "learning_rate": 2.846512623020967e-05, + "loss": 0.5038, + "step": 11382 + }, + { + "epoch": 14.61232349165597, + "grad_norm": 0.9889063835144043, + "learning_rate": 2.846469833119384e-05, + "loss": 0.5028, + "step": 11383 + }, + { + "epoch": 14.613607188703465, + "grad_norm": 3.3790385723114014, + "learning_rate": 2.8464270432178008e-05, + "loss": 0.5211, + "step": 11384 + }, + { + "epoch": 14.614890885750963, + "grad_norm": 1.3473950624465942, + "learning_rate": 2.8463842533162173e-05, + "loss": 0.5229, + "step": 11385 + }, + { + "epoch": 14.61617458279846, + "grad_norm": 1.7853946685791016, + "learning_rate": 2.846341463414634e-05, + "loss": 
0.5733, + "step": 11386 + }, + { + "epoch": 14.617458279845957, + "grad_norm": 1.6761726140975952, + "learning_rate": 2.846298673513051e-05, + "loss": 0.515, + "step": 11387 + }, + { + "epoch": 14.618741976893453, + "grad_norm": 3.274976968765259, + "learning_rate": 2.8462558836114678e-05, + "loss": 0.5644, + "step": 11388 + }, + { + "epoch": 14.62002567394095, + "grad_norm": 1.4661117792129517, + "learning_rate": 2.8462130937098847e-05, + "loss": 0.5252, + "step": 11389 + }, + { + "epoch": 14.621309370988447, + "grad_norm": 2.1341629028320312, + "learning_rate": 2.8461703038083015e-05, + "loss": 0.5598, + "step": 11390 + }, + { + "epoch": 14.622593068035943, + "grad_norm": 1.0035057067871094, + "learning_rate": 2.846127513906718e-05, + "loss": 0.5118, + "step": 11391 + }, + { + "epoch": 14.623876765083441, + "grad_norm": 0.8961331844329834, + "learning_rate": 2.846084724005135e-05, + "loss": 0.484, + "step": 11392 + }, + { + "epoch": 14.625160462130937, + "grad_norm": 1.5984269380569458, + "learning_rate": 2.8460419341035517e-05, + "loss": 0.5173, + "step": 11393 + }, + { + "epoch": 14.626444159178433, + "grad_norm": 1.2280280590057373, + "learning_rate": 2.8459991442019682e-05, + "loss": 0.5474, + "step": 11394 + }, + { + "epoch": 14.627727856225931, + "grad_norm": 2.883169174194336, + "learning_rate": 2.8459563543003854e-05, + "loss": 0.5177, + "step": 11395 + }, + { + "epoch": 14.629011553273427, + "grad_norm": 3.112173080444336, + "learning_rate": 2.845913564398802e-05, + "loss": 0.5439, + "step": 11396 + }, + { + "epoch": 14.630295250320923, + "grad_norm": 2.5128324031829834, + "learning_rate": 2.8458707744972187e-05, + "loss": 0.5733, + "step": 11397 + }, + { + "epoch": 14.631578947368421, + "grad_norm": 2.376735210418701, + "learning_rate": 2.8458279845956356e-05, + "loss": 0.5676, + "step": 11398 + }, + { + "epoch": 14.632862644415917, + "grad_norm": 1.7949552536010742, + "learning_rate": 2.845785194694052e-05, + "loss": 0.6084, + "step": 11399 + }, + { + 
"epoch": 14.634146341463415, + "grad_norm": 4.731709957122803, + "learning_rate": 2.8457424047924692e-05, + "loss": 0.5843, + "step": 11400 + }, + { + "epoch": 14.635430038510911, + "grad_norm": 1.9786940813064575, + "learning_rate": 2.8456996148908857e-05, + "loss": 0.5647, + "step": 11401 + }, + { + "epoch": 14.63671373555841, + "grad_norm": 2.114272356033325, + "learning_rate": 2.8456568249893026e-05, + "loss": 0.5961, + "step": 11402 + }, + { + "epoch": 14.637997432605905, + "grad_norm": 5.268378734588623, + "learning_rate": 2.8456140350877194e-05, + "loss": 0.5701, + "step": 11403 + }, + { + "epoch": 14.639281129653401, + "grad_norm": 2.6483652591705322, + "learning_rate": 2.845571245186136e-05, + "loss": 0.6252, + "step": 11404 + }, + { + "epoch": 14.6405648267009, + "grad_norm": 1.6148879528045654, + "learning_rate": 2.845528455284553e-05, + "loss": 0.6027, + "step": 11405 + }, + { + "epoch": 14.641848523748395, + "grad_norm": 1.7044930458068848, + "learning_rate": 2.8454856653829696e-05, + "loss": 0.7074, + "step": 11406 + }, + { + "epoch": 14.643132220795891, + "grad_norm": 1.8351622819900513, + "learning_rate": 2.8454428754813865e-05, + "loss": 0.4733, + "step": 11407 + }, + { + "epoch": 14.64441591784339, + "grad_norm": 0.8771982192993164, + "learning_rate": 2.8454000855798033e-05, + "loss": 0.5277, + "step": 11408 + }, + { + "epoch": 14.645699614890885, + "grad_norm": 0.8917669653892517, + "learning_rate": 2.84535729567822e-05, + "loss": 0.5111, + "step": 11409 + }, + { + "epoch": 14.646983311938383, + "grad_norm": 0.89900803565979, + "learning_rate": 2.8453145057766366e-05, + "loss": 0.5335, + "step": 11410 + }, + { + "epoch": 14.64826700898588, + "grad_norm": 2.0213022232055664, + "learning_rate": 2.8452717158750535e-05, + "loss": 0.5302, + "step": 11411 + }, + { + "epoch": 14.649550706033375, + "grad_norm": 3.2614452838897705, + "learning_rate": 2.8452289259734703e-05, + "loss": 0.4699, + "step": 11412 + }, + { + "epoch": 14.650834403080873, + 
"grad_norm": 0.8876367807388306, + "learning_rate": 2.845186136071887e-05, + "loss": 0.4991, + "step": 11413 + }, + { + "epoch": 14.65211810012837, + "grad_norm": 1.1928818225860596, + "learning_rate": 2.845143346170304e-05, + "loss": 0.5265, + "step": 11414 + }, + { + "epoch": 14.653401797175867, + "grad_norm": 1.2253899574279785, + "learning_rate": 2.8451005562687205e-05, + "loss": 0.5014, + "step": 11415 + }, + { + "epoch": 14.654685494223363, + "grad_norm": 2.3178110122680664, + "learning_rate": 2.8450577663671377e-05, + "loss": 0.5322, + "step": 11416 + }, + { + "epoch": 14.65596919127086, + "grad_norm": 1.0678740739822388, + "learning_rate": 2.8450149764655542e-05, + "loss": 0.5272, + "step": 11417 + }, + { + "epoch": 14.657252888318357, + "grad_norm": 1.0441454648971558, + "learning_rate": 2.8449721865639707e-05, + "loss": 0.5348, + "step": 11418 + }, + { + "epoch": 14.658536585365853, + "grad_norm": 1.8409117460250854, + "learning_rate": 2.844929396662388e-05, + "loss": 0.4865, + "step": 11419 + }, + { + "epoch": 14.659820282413351, + "grad_norm": 3.3210349082946777, + "learning_rate": 2.8448866067608044e-05, + "loss": 0.5131, + "step": 11420 + }, + { + "epoch": 14.661103979460847, + "grad_norm": 1.108107566833496, + "learning_rate": 2.8448438168592216e-05, + "loss": 0.5608, + "step": 11421 + }, + { + "epoch": 14.662387676508343, + "grad_norm": 0.8112375736236572, + "learning_rate": 2.844801026957638e-05, + "loss": 0.5385, + "step": 11422 + }, + { + "epoch": 14.663671373555841, + "grad_norm": 2.1672091484069824, + "learning_rate": 2.844758237056055e-05, + "loss": 0.5215, + "step": 11423 + }, + { + "epoch": 14.664955070603337, + "grad_norm": 1.400458812713623, + "learning_rate": 2.8447154471544717e-05, + "loss": 0.504, + "step": 11424 + }, + { + "epoch": 14.666238767650835, + "grad_norm": 6.072054862976074, + "learning_rate": 2.8446726572528882e-05, + "loss": 0.5119, + "step": 11425 + }, + { + "epoch": 14.667522464698331, + "grad_norm": 1.336520791053772, + 
"learning_rate": 2.844629867351305e-05, + "loss": 0.571, + "step": 11426 + }, + { + "epoch": 14.668806161745827, + "grad_norm": 1.542698860168457, + "learning_rate": 2.844587077449722e-05, + "loss": 0.4721, + "step": 11427 + }, + { + "epoch": 14.670089858793325, + "grad_norm": 4.46663236618042, + "learning_rate": 2.8445442875481388e-05, + "loss": 0.5146, + "step": 11428 + }, + { + "epoch": 14.671373555840821, + "grad_norm": 1.4596221446990967, + "learning_rate": 2.8445014976465556e-05, + "loss": 0.4798, + "step": 11429 + }, + { + "epoch": 14.672657252888317, + "grad_norm": 1.1692776679992676, + "learning_rate": 2.8444587077449724e-05, + "loss": 0.5277, + "step": 11430 + }, + { + "epoch": 14.673940949935815, + "grad_norm": 1.3142681121826172, + "learning_rate": 2.844415917843389e-05, + "loss": 0.5042, + "step": 11431 + }, + { + "epoch": 14.675224646983311, + "grad_norm": 0.9971376657485962, + "learning_rate": 2.8443731279418058e-05, + "loss": 0.4807, + "step": 11432 + }, + { + "epoch": 14.67650834403081, + "grad_norm": 1.7230854034423828, + "learning_rate": 2.8443303380402226e-05, + "loss": 0.5514, + "step": 11433 + }, + { + "epoch": 14.677792041078305, + "grad_norm": 2.8796372413635254, + "learning_rate": 2.844287548138639e-05, + "loss": 0.4926, + "step": 11434 + }, + { + "epoch": 14.679075738125803, + "grad_norm": 1.0039271116256714, + "learning_rate": 2.8442447582370563e-05, + "loss": 0.4937, + "step": 11435 + }, + { + "epoch": 14.6803594351733, + "grad_norm": 1.41249418258667, + "learning_rate": 2.8442019683354728e-05, + "loss": 0.4882, + "step": 11436 + }, + { + "epoch": 14.681643132220795, + "grad_norm": 1.0289711952209473, + "learning_rate": 2.84415917843389e-05, + "loss": 0.5123, + "step": 11437 + }, + { + "epoch": 14.682926829268293, + "grad_norm": 1.5157209634780884, + "learning_rate": 2.8441163885323065e-05, + "loss": 0.4545, + "step": 11438 + }, + { + "epoch": 14.68421052631579, + "grad_norm": 2.1311044692993164, + "learning_rate": 2.844073598630723e-05, 
+ "loss": 0.5522, + "step": 11439 + }, + { + "epoch": 14.685494223363285, + "grad_norm": 1.1606051921844482, + "learning_rate": 2.8440308087291402e-05, + "loss": 0.5549, + "step": 11440 + }, + { + "epoch": 14.686777920410783, + "grad_norm": 3.0906317234039307, + "learning_rate": 2.8439880188275567e-05, + "loss": 0.4849, + "step": 11441 + }, + { + "epoch": 14.68806161745828, + "grad_norm": 3.695733070373535, + "learning_rate": 2.8439452289259735e-05, + "loss": 0.566, + "step": 11442 + }, + { + "epoch": 14.689345314505777, + "grad_norm": 2.2568931579589844, + "learning_rate": 2.8439024390243904e-05, + "loss": 0.526, + "step": 11443 + }, + { + "epoch": 14.690629011553273, + "grad_norm": 1.9911690950393677, + "learning_rate": 2.8438596491228072e-05, + "loss": 0.5339, + "step": 11444 + }, + { + "epoch": 14.69191270860077, + "grad_norm": 1.3016700744628906, + "learning_rate": 2.8438168592212237e-05, + "loss": 0.5542, + "step": 11445 + }, + { + "epoch": 14.693196405648267, + "grad_norm": 3.4102206230163574, + "learning_rate": 2.8437740693196406e-05, + "loss": 0.5457, + "step": 11446 + }, + { + "epoch": 14.694480102695763, + "grad_norm": 1.9296759366989136, + "learning_rate": 2.8437312794180574e-05, + "loss": 0.5477, + "step": 11447 + }, + { + "epoch": 14.695763799743261, + "grad_norm": 1.463182806968689, + "learning_rate": 2.8436884895164742e-05, + "loss": 0.546, + "step": 11448 + }, + { + "epoch": 14.697047496790757, + "grad_norm": 3.440973997116089, + "learning_rate": 2.843645699614891e-05, + "loss": 0.5522, + "step": 11449 + }, + { + "epoch": 14.698331193838253, + "grad_norm": 1.0096484422683716, + "learning_rate": 2.8436029097133076e-05, + "loss": 0.5628, + "step": 11450 + }, + { + "epoch": 14.699614890885751, + "grad_norm": 2.3444817066192627, + "learning_rate": 2.8435601198117248e-05, + "loss": 0.5086, + "step": 11451 + }, + { + "epoch": 14.700898587933247, + "grad_norm": 2.6267359256744385, + "learning_rate": 2.8435173299101413e-05, + "loss": 0.6042, + "step": 
11452 + }, + { + "epoch": 14.702182284980745, + "grad_norm": 2.640498638153076, + "learning_rate": 2.8434745400085578e-05, + "loss": 0.57, + "step": 11453 + }, + { + "epoch": 14.703465982028241, + "grad_norm": 3.542567014694214, + "learning_rate": 2.843431750106975e-05, + "loss": 0.6567, + "step": 11454 + }, + { + "epoch": 14.704749679075737, + "grad_norm": 2.3789095878601074, + "learning_rate": 2.8433889602053914e-05, + "loss": 0.5993, + "step": 11455 + }, + { + "epoch": 14.706033376123235, + "grad_norm": 2.209477424621582, + "learning_rate": 2.8433461703038086e-05, + "loss": 0.8034, + "step": 11456 + }, + { + "epoch": 14.707317073170731, + "grad_norm": 1.4440593719482422, + "learning_rate": 2.843303380402225e-05, + "loss": 0.4667, + "step": 11457 + }, + { + "epoch": 14.70860077021823, + "grad_norm": 1.893489956855774, + "learning_rate": 2.8432605905006416e-05, + "loss": 0.4576, + "step": 11458 + }, + { + "epoch": 14.709884467265725, + "grad_norm": 3.2113149166107178, + "learning_rate": 2.8432178005990588e-05, + "loss": 0.5072, + "step": 11459 + }, + { + "epoch": 14.711168164313221, + "grad_norm": 1.5638912916183472, + "learning_rate": 2.8431750106974753e-05, + "loss": 0.4962, + "step": 11460 + }, + { + "epoch": 14.71245186136072, + "grad_norm": 1.8850536346435547, + "learning_rate": 2.843132220795892e-05, + "loss": 0.5437, + "step": 11461 + }, + { + "epoch": 14.713735558408215, + "grad_norm": 1.2495416402816772, + "learning_rate": 2.843089430894309e-05, + "loss": 0.5054, + "step": 11462 + }, + { + "epoch": 14.715019255455712, + "grad_norm": 1.6122950315475464, + "learning_rate": 2.843046640992726e-05, + "loss": 0.5133, + "step": 11463 + }, + { + "epoch": 14.71630295250321, + "grad_norm": 2.894451856613159, + "learning_rate": 2.8430038510911427e-05, + "loss": 0.5074, + "step": 11464 + }, + { + "epoch": 14.717586649550706, + "grad_norm": 2.218276023864746, + "learning_rate": 2.8429610611895592e-05, + "loss": 0.5281, + "step": 11465 + }, + { + "epoch": 
14.718870346598203, + "grad_norm": 1.885871171951294, + "learning_rate": 2.842918271287976e-05, + "loss": 0.5455, + "step": 11466 + }, + { + "epoch": 14.7201540436457, + "grad_norm": 2.197187900543213, + "learning_rate": 2.842875481386393e-05, + "loss": 0.5005, + "step": 11467 + }, + { + "epoch": 14.721437740693197, + "grad_norm": 1.3847336769104004, + "learning_rate": 2.8428326914848097e-05, + "loss": 0.5364, + "step": 11468 + }, + { + "epoch": 14.722721437740693, + "grad_norm": 3.351039409637451, + "learning_rate": 2.8427899015832262e-05, + "loss": 0.4838, + "step": 11469 + }, + { + "epoch": 14.72400513478819, + "grad_norm": 1.8551578521728516, + "learning_rate": 2.8427471116816434e-05, + "loss": 0.5072, + "step": 11470 + }, + { + "epoch": 14.725288831835687, + "grad_norm": 0.9231565594673157, + "learning_rate": 2.84270432178006e-05, + "loss": 0.5123, + "step": 11471 + }, + { + "epoch": 14.726572528883183, + "grad_norm": 1.963268518447876, + "learning_rate": 2.8426615318784767e-05, + "loss": 0.5304, + "step": 11472 + }, + { + "epoch": 14.72785622593068, + "grad_norm": 2.236788272857666, + "learning_rate": 2.8426187419768936e-05, + "loss": 0.4838, + "step": 11473 + }, + { + "epoch": 14.729139922978177, + "grad_norm": 2.1078922748565674, + "learning_rate": 2.84257595207531e-05, + "loss": 0.5055, + "step": 11474 + }, + { + "epoch": 14.730423620025674, + "grad_norm": 1.9288429021835327, + "learning_rate": 2.8425331621737273e-05, + "loss": 0.5234, + "step": 11475 + }, + { + "epoch": 14.731707317073171, + "grad_norm": 7.090954303741455, + "learning_rate": 2.8424903722721438e-05, + "loss": 0.5159, + "step": 11476 + }, + { + "epoch": 14.732991014120667, + "grad_norm": 1.132505178451538, + "learning_rate": 2.8424475823705606e-05, + "loss": 0.5287, + "step": 11477 + }, + { + "epoch": 14.734274711168164, + "grad_norm": 1.7041535377502441, + "learning_rate": 2.8424047924689774e-05, + "loss": 0.543, + "step": 11478 + }, + { + "epoch": 14.735558408215661, + "grad_norm": 
0.9841461777687073, + "learning_rate": 2.842362002567394e-05, + "loss": 0.5252, + "step": 11479 + }, + { + "epoch": 14.736842105263158, + "grad_norm": 4.181325435638428, + "learning_rate": 2.842319212665811e-05, + "loss": 0.529, + "step": 11480 + }, + { + "epoch": 14.738125802310655, + "grad_norm": 2.175910234451294, + "learning_rate": 2.8422764227642276e-05, + "loss": 0.5343, + "step": 11481 + }, + { + "epoch": 14.739409499358151, + "grad_norm": 1.680964469909668, + "learning_rate": 2.8422336328626445e-05, + "loss": 0.5031, + "step": 11482 + }, + { + "epoch": 14.740693196405648, + "grad_norm": 2.2196946144104004, + "learning_rate": 2.8421908429610613e-05, + "loss": 0.4796, + "step": 11483 + }, + { + "epoch": 14.741976893453145, + "grad_norm": 2.426121950149536, + "learning_rate": 2.842148053059478e-05, + "loss": 0.556, + "step": 11484 + }, + { + "epoch": 14.743260590500642, + "grad_norm": 1.1986756324768066, + "learning_rate": 2.8421052631578946e-05, + "loss": 0.5415, + "step": 11485 + }, + { + "epoch": 14.74454428754814, + "grad_norm": 2.2726142406463623, + "learning_rate": 2.8420624732563115e-05, + "loss": 0.5476, + "step": 11486 + }, + { + "epoch": 14.745827984595635, + "grad_norm": 1.1322672367095947, + "learning_rate": 2.8420196833547283e-05, + "loss": 0.5327, + "step": 11487 + }, + { + "epoch": 14.747111681643132, + "grad_norm": 1.7902594804763794, + "learning_rate": 2.8419768934531452e-05, + "loss": 0.5354, + "step": 11488 + }, + { + "epoch": 14.74839537869063, + "grad_norm": 1.6693708896636963, + "learning_rate": 2.841934103551562e-05, + "loss": 0.5607, + "step": 11489 + }, + { + "epoch": 14.749679075738126, + "grad_norm": 1.4874296188354492, + "learning_rate": 2.8418913136499785e-05, + "loss": 0.5183, + "step": 11490 + }, + { + "epoch": 14.750962772785623, + "grad_norm": 1.909853219985962, + "learning_rate": 2.8418485237483957e-05, + "loss": 0.5338, + "step": 11491 + }, + { + "epoch": 14.75224646983312, + "grad_norm": 2.451766014099121, + "learning_rate": 
2.8418057338468122e-05, + "loss": 0.5155, + "step": 11492 + }, + { + "epoch": 14.753530166880616, + "grad_norm": 1.603891372680664, + "learning_rate": 2.8417629439452287e-05, + "loss": 0.5983, + "step": 11493 + }, + { + "epoch": 14.754813863928113, + "grad_norm": 1.3715399503707886, + "learning_rate": 2.841720154043646e-05, + "loss": 0.5284, + "step": 11494 + }, + { + "epoch": 14.75609756097561, + "grad_norm": 2.1068968772888184, + "learning_rate": 2.8416773641420624e-05, + "loss": 0.5124, + "step": 11495 + }, + { + "epoch": 14.757381258023106, + "grad_norm": 1.263819932937622, + "learning_rate": 2.8416345742404796e-05, + "loss": 0.5108, + "step": 11496 + }, + { + "epoch": 14.758664955070603, + "grad_norm": 5.753138542175293, + "learning_rate": 2.841591784338896e-05, + "loss": 0.5916, + "step": 11497 + }, + { + "epoch": 14.7599486521181, + "grad_norm": 1.4255541563034058, + "learning_rate": 2.841548994437313e-05, + "loss": 0.5111, + "step": 11498 + }, + { + "epoch": 14.761232349165597, + "grad_norm": 3.4565534591674805, + "learning_rate": 2.8415062045357297e-05, + "loss": 0.5139, + "step": 11499 + }, + { + "epoch": 14.762516046213094, + "grad_norm": 2.115408182144165, + "learning_rate": 2.8414634146341462e-05, + "loss": 0.5487, + "step": 11500 + }, + { + "epoch": 14.763799743260591, + "grad_norm": 1.3061655759811401, + "learning_rate": 2.841420624732563e-05, + "loss": 0.5646, + "step": 11501 + }, + { + "epoch": 14.765083440308088, + "grad_norm": 6.180852890014648, + "learning_rate": 2.84137783483098e-05, + "loss": 0.534, + "step": 11502 + }, + { + "epoch": 14.766367137355584, + "grad_norm": 2.1105189323425293, + "learning_rate": 2.8413350449293968e-05, + "loss": 0.6569, + "step": 11503 + }, + { + "epoch": 14.767650834403081, + "grad_norm": 2.2217137813568115, + "learning_rate": 2.8412922550278136e-05, + "loss": 0.6021, + "step": 11504 + }, + { + "epoch": 14.768934531450578, + "grad_norm": 2.13976788520813, + "learning_rate": 2.8412494651262305e-05, + "loss": 
0.6641, + "step": 11505 + }, + { + "epoch": 14.770218228498074, + "grad_norm": 1.7363654375076294, + "learning_rate": 2.841206675224647e-05, + "loss": 0.7501, + "step": 11506 + }, + { + "epoch": 14.771501925545572, + "grad_norm": 1.7075927257537842, + "learning_rate": 2.8411638853230638e-05, + "loss": 0.4666, + "step": 11507 + }, + { + "epoch": 14.772785622593068, + "grad_norm": 1.406109094619751, + "learning_rate": 2.8411210954214806e-05, + "loss": 0.4834, + "step": 11508 + }, + { + "epoch": 14.774069319640565, + "grad_norm": 1.5061120986938477, + "learning_rate": 2.841078305519897e-05, + "loss": 0.5055, + "step": 11509 + }, + { + "epoch": 14.775353016688062, + "grad_norm": 1.3918685913085938, + "learning_rate": 2.8410355156183143e-05, + "loss": 0.4634, + "step": 11510 + }, + { + "epoch": 14.776636713735558, + "grad_norm": 1.354736566543579, + "learning_rate": 2.8409927257167308e-05, + "loss": 0.4768, + "step": 11511 + }, + { + "epoch": 14.777920410783056, + "grad_norm": 1.5697368383407593, + "learning_rate": 2.840949935815148e-05, + "loss": 0.5329, + "step": 11512 + }, + { + "epoch": 14.779204107830552, + "grad_norm": 2.422415256500244, + "learning_rate": 2.8409071459135645e-05, + "loss": 0.5359, + "step": 11513 + }, + { + "epoch": 14.78048780487805, + "grad_norm": 1.1645619869232178, + "learning_rate": 2.840864356011981e-05, + "loss": 0.495, + "step": 11514 + }, + { + "epoch": 14.781771501925546, + "grad_norm": 1.2145708799362183, + "learning_rate": 2.8408215661103982e-05, + "loss": 0.4792, + "step": 11515 + }, + { + "epoch": 14.783055198973042, + "grad_norm": 6.888045787811279, + "learning_rate": 2.8407787762088147e-05, + "loss": 0.5158, + "step": 11516 + }, + { + "epoch": 14.78433889602054, + "grad_norm": 1.2707678079605103, + "learning_rate": 2.8407359863072315e-05, + "loss": 0.4907, + "step": 11517 + }, + { + "epoch": 14.785622593068036, + "grad_norm": 1.736960768699646, + "learning_rate": 2.8406931964056484e-05, + "loss": 0.4786, + "step": 11518 + }, + { + 
"epoch": 14.786906290115533, + "grad_norm": 1.638351321220398, + "learning_rate": 2.840650406504065e-05, + "loss": 0.5491, + "step": 11519 + }, + { + "epoch": 14.78818998716303, + "grad_norm": 1.5960888862609863, + "learning_rate": 2.840607616602482e-05, + "loss": 0.48, + "step": 11520 + }, + { + "epoch": 14.789473684210526, + "grad_norm": 2.323803663253784, + "learning_rate": 2.8405648267008986e-05, + "loss": 0.5493, + "step": 11521 + }, + { + "epoch": 14.790757381258024, + "grad_norm": 1.532442569732666, + "learning_rate": 2.8405220367993154e-05, + "loss": 0.478, + "step": 11522 + }, + { + "epoch": 14.79204107830552, + "grad_norm": 1.061530351638794, + "learning_rate": 2.8404792468977322e-05, + "loss": 0.5121, + "step": 11523 + }, + { + "epoch": 14.793324775353017, + "grad_norm": 1.6733070611953735, + "learning_rate": 2.840436456996149e-05, + "loss": 0.4874, + "step": 11524 + }, + { + "epoch": 14.794608472400514, + "grad_norm": 0.8091328740119934, + "learning_rate": 2.8403936670945656e-05, + "loss": 0.554, + "step": 11525 + }, + { + "epoch": 14.79589216944801, + "grad_norm": 1.5171699523925781, + "learning_rate": 2.8403508771929824e-05, + "loss": 0.4664, + "step": 11526 + }, + { + "epoch": 14.797175866495508, + "grad_norm": 1.2614250183105469, + "learning_rate": 2.8403080872913993e-05, + "loss": 0.5389, + "step": 11527 + }, + { + "epoch": 14.798459563543004, + "grad_norm": 1.3443255424499512, + "learning_rate": 2.840265297389816e-05, + "loss": 0.5212, + "step": 11528 + }, + { + "epoch": 14.7997432605905, + "grad_norm": 1.205636739730835, + "learning_rate": 2.840222507488233e-05, + "loss": 0.531, + "step": 11529 + }, + { + "epoch": 14.801026957637998, + "grad_norm": 1.645326018333435, + "learning_rate": 2.8401797175866495e-05, + "loss": 0.4592, + "step": 11530 + }, + { + "epoch": 14.802310654685494, + "grad_norm": 3.425685167312622, + "learning_rate": 2.8401369276850666e-05, + "loss": 0.5289, + "step": 11531 + }, + { + "epoch": 14.803594351732992, + "grad_norm": 
2.0903234481811523, + "learning_rate": 2.840094137783483e-05, + "loss": 0.5562, + "step": 11532 + }, + { + "epoch": 14.804878048780488, + "grad_norm": 1.5667898654937744, + "learning_rate": 2.8400513478818996e-05, + "loss": 0.4808, + "step": 11533 + }, + { + "epoch": 14.806161745827985, + "grad_norm": 2.163626194000244, + "learning_rate": 2.8400085579803168e-05, + "loss": 0.5059, + "step": 11534 + }, + { + "epoch": 14.807445442875482, + "grad_norm": 1.8153259754180908, + "learning_rate": 2.8399657680787333e-05, + "loss": 0.4518, + "step": 11535 + }, + { + "epoch": 14.808729139922978, + "grad_norm": 1.4159175157546997, + "learning_rate": 2.8399229781771505e-05, + "loss": 0.5927, + "step": 11536 + }, + { + "epoch": 14.810012836970476, + "grad_norm": 1.8592203855514526, + "learning_rate": 2.839880188275567e-05, + "loss": 0.5276, + "step": 11537 + }, + { + "epoch": 14.811296534017972, + "grad_norm": 1.328101396560669, + "learning_rate": 2.839837398373984e-05, + "loss": 0.5142, + "step": 11538 + }, + { + "epoch": 14.812580231065468, + "grad_norm": 2.7701776027679443, + "learning_rate": 2.8397946084724007e-05, + "loss": 0.5282, + "step": 11539 + }, + { + "epoch": 14.813863928112966, + "grad_norm": 8.78802490234375, + "learning_rate": 2.8397518185708172e-05, + "loss": 0.5341, + "step": 11540 + }, + { + "epoch": 14.815147625160462, + "grad_norm": 1.8001738786697388, + "learning_rate": 2.839709028669234e-05, + "loss": 0.5218, + "step": 11541 + }, + { + "epoch": 14.81643132220796, + "grad_norm": 1.7521253824234009, + "learning_rate": 2.839666238767651e-05, + "loss": 0.5404, + "step": 11542 + }, + { + "epoch": 14.817715019255456, + "grad_norm": 0.9318298697471619, + "learning_rate": 2.8396234488660677e-05, + "loss": 0.5594, + "step": 11543 + }, + { + "epoch": 14.818998716302952, + "grad_norm": 3.7373526096343994, + "learning_rate": 2.8395806589644846e-05, + "loss": 0.5269, + "step": 11544 + }, + { + "epoch": 14.82028241335045, + "grad_norm": 2.3744935989379883, + 
"learning_rate": 2.8395378690629014e-05, + "loss": 0.5373, + "step": 11545 + }, + { + "epoch": 14.821566110397946, + "grad_norm": 1.9949758052825928, + "learning_rate": 2.839495079161318e-05, + "loss": 0.5354, + "step": 11546 + }, + { + "epoch": 14.822849807445444, + "grad_norm": 1.826987385749817, + "learning_rate": 2.8394522892597347e-05, + "loss": 0.5255, + "step": 11547 + }, + { + "epoch": 14.82413350449294, + "grad_norm": 2.0967421531677246, + "learning_rate": 2.8394094993581516e-05, + "loss": 0.5265, + "step": 11548 + }, + { + "epoch": 14.825417201540436, + "grad_norm": 3.5217092037200928, + "learning_rate": 2.839366709456568e-05, + "loss": 0.5165, + "step": 11549 + }, + { + "epoch": 14.826700898587934, + "grad_norm": 5.192197799682617, + "learning_rate": 2.8393239195549853e-05, + "loss": 0.5267, + "step": 11550 + }, + { + "epoch": 14.82798459563543, + "grad_norm": 2.0023956298828125, + "learning_rate": 2.8392811296534018e-05, + "loss": 0.6214, + "step": 11551 + }, + { + "epoch": 14.829268292682928, + "grad_norm": 3.2125513553619385, + "learning_rate": 2.839238339751819e-05, + "loss": 0.5794, + "step": 11552 + }, + { + "epoch": 14.830551989730424, + "grad_norm": 2.955575942993164, + "learning_rate": 2.8391955498502354e-05, + "loss": 0.5129, + "step": 11553 + }, + { + "epoch": 14.83183568677792, + "grad_norm": 6.192313194274902, + "learning_rate": 2.839152759948652e-05, + "loss": 0.5733, + "step": 11554 + }, + { + "epoch": 14.833119383825418, + "grad_norm": 3.0210046768188477, + "learning_rate": 2.839109970047069e-05, + "loss": 0.6484, + "step": 11555 + }, + { + "epoch": 14.834403080872914, + "grad_norm": 4.5659990310668945, + "learning_rate": 2.8390671801454856e-05, + "loss": 0.7602, + "step": 11556 + }, + { + "epoch": 14.835686777920412, + "grad_norm": 9.777071952819824, + "learning_rate": 2.8390243902439025e-05, + "loss": 0.4804, + "step": 11557 + }, + { + "epoch": 14.836970474967908, + "grad_norm": 2.0024635791778564, + "learning_rate": 
2.8389816003423193e-05, + "loss": 0.5563, + "step": 11558 + }, + { + "epoch": 14.838254172015404, + "grad_norm": 0.9658944010734558, + "learning_rate": 2.838938810440736e-05, + "loss": 0.5173, + "step": 11559 + }, + { + "epoch": 14.839537869062902, + "grad_norm": 1.690948247909546, + "learning_rate": 2.838896020539153e-05, + "loss": 0.5075, + "step": 11560 + }, + { + "epoch": 14.840821566110398, + "grad_norm": 2.269178867340088, + "learning_rate": 2.8388532306375695e-05, + "loss": 0.4896, + "step": 11561 + }, + { + "epoch": 14.842105263157894, + "grad_norm": 1.5666695833206177, + "learning_rate": 2.8388104407359863e-05, + "loss": 0.5114, + "step": 11562 + }, + { + "epoch": 14.843388960205392, + "grad_norm": 1.9593907594680786, + "learning_rate": 2.8387676508344032e-05, + "loss": 0.4983, + "step": 11563 + }, + { + "epoch": 14.844672657252888, + "grad_norm": 1.945756196975708, + "learning_rate": 2.83872486093282e-05, + "loss": 0.5173, + "step": 11564 + }, + { + "epoch": 14.845956354300386, + "grad_norm": 1.9257874488830566, + "learning_rate": 2.8386820710312365e-05, + "loss": 0.4946, + "step": 11565 + }, + { + "epoch": 14.847240051347882, + "grad_norm": 4.372401237487793, + "learning_rate": 2.8386392811296537e-05, + "loss": 0.5109, + "step": 11566 + }, + { + "epoch": 14.84852374839538, + "grad_norm": 2.6570942401885986, + "learning_rate": 2.8385964912280702e-05, + "loss": 0.5238, + "step": 11567 + }, + { + "epoch": 14.849807445442876, + "grad_norm": 1.6814500093460083, + "learning_rate": 2.838553701326487e-05, + "loss": 0.5545, + "step": 11568 + }, + { + "epoch": 14.851091142490372, + "grad_norm": 3.690518379211426, + "learning_rate": 2.838510911424904e-05, + "loss": 0.484, + "step": 11569 + }, + { + "epoch": 14.85237483953787, + "grad_norm": 2.5832881927490234, + "learning_rate": 2.8384681215233204e-05, + "loss": 0.4891, + "step": 11570 + }, + { + "epoch": 14.853658536585366, + "grad_norm": 1.5166211128234863, + "learning_rate": 2.8384253316217376e-05, + "loss": 
0.5101, + "step": 11571 + }, + { + "epoch": 14.854942233632862, + "grad_norm": 1.6741931438446045, + "learning_rate": 2.838382541720154e-05, + "loss": 0.5138, + "step": 11572 + }, + { + "epoch": 14.85622593068036, + "grad_norm": 1.3429025411605835, + "learning_rate": 2.838339751818571e-05, + "loss": 0.5063, + "step": 11573 + }, + { + "epoch": 14.857509627727856, + "grad_norm": 3.4085752964019775, + "learning_rate": 2.8382969619169878e-05, + "loss": 0.4692, + "step": 11574 + }, + { + "epoch": 14.858793324775354, + "grad_norm": 1.0491005182266235, + "learning_rate": 2.8382541720154043e-05, + "loss": 0.5034, + "step": 11575 + }, + { + "epoch": 14.86007702182285, + "grad_norm": 2.58144211769104, + "learning_rate": 2.8382113821138214e-05, + "loss": 0.5939, + "step": 11576 + }, + { + "epoch": 14.861360718870346, + "grad_norm": 0.9305863976478577, + "learning_rate": 2.838168592212238e-05, + "loss": 0.4962, + "step": 11577 + }, + { + "epoch": 14.862644415917844, + "grad_norm": 2.6247990131378174, + "learning_rate": 2.8381258023106548e-05, + "loss": 0.4948, + "step": 11578 + }, + { + "epoch": 14.86392811296534, + "grad_norm": 4.064218521118164, + "learning_rate": 2.8380830124090716e-05, + "loss": 0.5166, + "step": 11579 + }, + { + "epoch": 14.865211810012838, + "grad_norm": 2.764132261276245, + "learning_rate": 2.838040222507488e-05, + "loss": 0.4934, + "step": 11580 + }, + { + "epoch": 14.866495507060334, + "grad_norm": 1.6161394119262695, + "learning_rate": 2.837997432605905e-05, + "loss": 0.5469, + "step": 11581 + }, + { + "epoch": 14.86777920410783, + "grad_norm": 1.3208487033843994, + "learning_rate": 2.8379546427043218e-05, + "loss": 0.4431, + "step": 11582 + }, + { + "epoch": 14.869062901155328, + "grad_norm": 2.0161783695220947, + "learning_rate": 2.8379118528027386e-05, + "loss": 0.5055, + "step": 11583 + }, + { + "epoch": 14.870346598202824, + "grad_norm": 1.199933409690857, + "learning_rate": 2.8378690629011555e-05, + "loss": 0.5365, + "step": 11584 + }, + { + 
"epoch": 14.871630295250322, + "grad_norm": 1.5824097394943237, + "learning_rate": 2.8378262729995723e-05, + "loss": 0.5263, + "step": 11585 + }, + { + "epoch": 14.872913992297818, + "grad_norm": 1.2093026638031006, + "learning_rate": 2.837783483097989e-05, + "loss": 0.5194, + "step": 11586 + }, + { + "epoch": 14.874197689345314, + "grad_norm": 1.8968108892440796, + "learning_rate": 2.8377406931964057e-05, + "loss": 0.5147, + "step": 11587 + }, + { + "epoch": 14.875481386392812, + "grad_norm": 1.4597877264022827, + "learning_rate": 2.8376979032948225e-05, + "loss": 0.5008, + "step": 11588 + }, + { + "epoch": 14.876765083440308, + "grad_norm": 2.4356179237365723, + "learning_rate": 2.837655113393239e-05, + "loss": 0.5179, + "step": 11589 + }, + { + "epoch": 14.878048780487806, + "grad_norm": 1.8977293968200684, + "learning_rate": 2.8376123234916562e-05, + "loss": 0.5502, + "step": 11590 + }, + { + "epoch": 14.879332477535302, + "grad_norm": 1.6737858057022095, + "learning_rate": 2.8375695335900727e-05, + "loss": 0.5361, + "step": 11591 + }, + { + "epoch": 14.880616174582798, + "grad_norm": 4.475741863250732, + "learning_rate": 2.83752674368849e-05, + "loss": 0.4825, + "step": 11592 + }, + { + "epoch": 14.881899871630296, + "grad_norm": 1.5635509490966797, + "learning_rate": 2.8374839537869064e-05, + "loss": 0.5403, + "step": 11593 + }, + { + "epoch": 14.883183568677792, + "grad_norm": 3.1357927322387695, + "learning_rate": 2.837441163885323e-05, + "loss": 0.5365, + "step": 11594 + }, + { + "epoch": 14.88446726572529, + "grad_norm": 1.5686317682266235, + "learning_rate": 2.83739837398374e-05, + "loss": 0.5517, + "step": 11595 + }, + { + "epoch": 14.885750962772786, + "grad_norm": 5.757293224334717, + "learning_rate": 2.8373555840821566e-05, + "loss": 0.5172, + "step": 11596 + }, + { + "epoch": 14.887034659820282, + "grad_norm": 3.237492561340332, + "learning_rate": 2.8373127941805734e-05, + "loss": 0.5172, + "step": 11597 + }, + { + "epoch": 14.88831835686778, + 
"grad_norm": 3.1663057804107666, + "learning_rate": 2.8372700042789902e-05, + "loss": 0.5603, + "step": 11598 + }, + { + "epoch": 14.889602053915276, + "grad_norm": 4.276076316833496, + "learning_rate": 2.837227214377407e-05, + "loss": 0.559, + "step": 11599 + }, + { + "epoch": 14.890885750962774, + "grad_norm": 2.5269675254821777, + "learning_rate": 2.837184424475824e-05, + "loss": 0.5269, + "step": 11600 + }, + { + "epoch": 14.89216944801027, + "grad_norm": 1.7844159603118896, + "learning_rate": 2.8371416345742404e-05, + "loss": 0.5594, + "step": 11601 + }, + { + "epoch": 14.893453145057766, + "grad_norm": 3.629207134246826, + "learning_rate": 2.8370988446726573e-05, + "loss": 0.6391, + "step": 11602 + }, + { + "epoch": 14.894736842105264, + "grad_norm": 1.3861808776855469, + "learning_rate": 2.837056054771074e-05, + "loss": 0.5675, + "step": 11603 + }, + { + "epoch": 14.89602053915276, + "grad_norm": 20.144319534301758, + "learning_rate": 2.837013264869491e-05, + "loss": 0.6124, + "step": 11604 + }, + { + "epoch": 14.897304236200256, + "grad_norm": 5.403670787811279, + "learning_rate": 2.8369704749679075e-05, + "loss": 0.6903, + "step": 11605 + }, + { + "epoch": 14.898587933247754, + "grad_norm": 2.878129005432129, + "learning_rate": 2.8369276850663246e-05, + "loss": 0.7226, + "step": 11606 + }, + { + "epoch": 14.89987163029525, + "grad_norm": 1.916132926940918, + "learning_rate": 2.836884895164741e-05, + "loss": 0.4671, + "step": 11607 + }, + { + "epoch": 14.901155327342748, + "grad_norm": 1.4660686254501343, + "learning_rate": 2.836842105263158e-05, + "loss": 0.4775, + "step": 11608 + }, + { + "epoch": 14.902439024390244, + "grad_norm": 1.978175163269043, + "learning_rate": 2.8367993153615748e-05, + "loss": 0.4916, + "step": 11609 + }, + { + "epoch": 14.90372272143774, + "grad_norm": 2.518625497817993, + "learning_rate": 2.8367565254599913e-05, + "loss": 0.4806, + "step": 11610 + }, + { + "epoch": 14.905006418485238, + "grad_norm": 1.476144552230835, + 
"learning_rate": 2.8367137355584085e-05, + "loss": 0.4914, + "step": 11611 + }, + { + "epoch": 14.906290115532734, + "grad_norm": 1.065449595451355, + "learning_rate": 2.836670945656825e-05, + "loss": 0.5334, + "step": 11612 + }, + { + "epoch": 14.907573812580232, + "grad_norm": 3.1232690811157227, + "learning_rate": 2.836628155755242e-05, + "loss": 0.5143, + "step": 11613 + }, + { + "epoch": 14.908857509627728, + "grad_norm": 1.6469919681549072, + "learning_rate": 2.8365853658536587e-05, + "loss": 0.497, + "step": 11614 + }, + { + "epoch": 14.910141206675224, + "grad_norm": 1.590815544128418, + "learning_rate": 2.8365425759520752e-05, + "loss": 0.4837, + "step": 11615 + }, + { + "epoch": 14.911424903722722, + "grad_norm": 3.9460084438323975, + "learning_rate": 2.8364997860504924e-05, + "loss": 0.5142, + "step": 11616 + }, + { + "epoch": 14.912708600770218, + "grad_norm": 2.045544385910034, + "learning_rate": 2.836456996148909e-05, + "loss": 0.5504, + "step": 11617 + }, + { + "epoch": 14.913992297817716, + "grad_norm": 1.5601776838302612, + "learning_rate": 2.8364142062473257e-05, + "loss": 0.521, + "step": 11618 + }, + { + "epoch": 14.915275994865212, + "grad_norm": 2.0329694747924805, + "learning_rate": 2.8363714163457426e-05, + "loss": 0.515, + "step": 11619 + }, + { + "epoch": 14.916559691912708, + "grad_norm": 3.384974241256714, + "learning_rate": 2.8363286264441594e-05, + "loss": 0.5229, + "step": 11620 + }, + { + "epoch": 14.917843388960206, + "grad_norm": 6.777871608734131, + "learning_rate": 2.836285836542576e-05, + "loss": 0.5176, + "step": 11621 + }, + { + "epoch": 14.919127086007702, + "grad_norm": 2.4311769008636475, + "learning_rate": 2.8362430466409927e-05, + "loss": 0.4679, + "step": 11622 + }, + { + "epoch": 14.9204107830552, + "grad_norm": 2.569899559020996, + "learning_rate": 2.8362002567394096e-05, + "loss": 0.4807, + "step": 11623 + }, + { + "epoch": 14.921694480102696, + "grad_norm": 0.92988520860672, + "learning_rate": 2.8361574668378264e-05, 
+ "loss": 0.5099, + "step": 11624 + }, + { + "epoch": 14.922978177150192, + "grad_norm": 1.7582342624664307, + "learning_rate": 2.8361146769362433e-05, + "loss": 0.4976, + "step": 11625 + }, + { + "epoch": 14.92426187419769, + "grad_norm": 1.2257513999938965, + "learning_rate": 2.8360718870346598e-05, + "loss": 0.5349, + "step": 11626 + }, + { + "epoch": 14.925545571245186, + "grad_norm": 2.9745702743530273, + "learning_rate": 2.836029097133077e-05, + "loss": 0.4743, + "step": 11627 + }, + { + "epoch": 14.926829268292684, + "grad_norm": 0.9987706542015076, + "learning_rate": 2.8359863072314934e-05, + "loss": 0.4803, + "step": 11628 + }, + { + "epoch": 14.92811296534018, + "grad_norm": 1.2540465593338013, + "learning_rate": 2.83594351732991e-05, + "loss": 0.4881, + "step": 11629 + }, + { + "epoch": 14.929396662387676, + "grad_norm": 1.369838833808899, + "learning_rate": 2.835900727428327e-05, + "loss": 0.4932, + "step": 11630 + }, + { + "epoch": 14.930680359435174, + "grad_norm": 1.8472208976745605, + "learning_rate": 2.8358579375267436e-05, + "loss": 0.5078, + "step": 11631 + }, + { + "epoch": 14.93196405648267, + "grad_norm": 2.0427067279815674, + "learning_rate": 2.8358151476251608e-05, + "loss": 0.5217, + "step": 11632 + }, + { + "epoch": 14.933247753530168, + "grad_norm": 2.2830567359924316, + "learning_rate": 2.8357723577235773e-05, + "loss": 0.4936, + "step": 11633 + }, + { + "epoch": 14.934531450577664, + "grad_norm": 1.3633991479873657, + "learning_rate": 2.835729567821994e-05, + "loss": 0.5275, + "step": 11634 + }, + { + "epoch": 14.93581514762516, + "grad_norm": 2.149658441543579, + "learning_rate": 2.835686777920411e-05, + "loss": 0.5045, + "step": 11635 + }, + { + "epoch": 14.937098844672658, + "grad_norm": 5.3455352783203125, + "learning_rate": 2.8356439880188275e-05, + "loss": 0.4882, + "step": 11636 + }, + { + "epoch": 14.938382541720154, + "grad_norm": 1.7764102220535278, + "learning_rate": 2.8356011981172443e-05, + "loss": 0.5351, + "step": 11637 + 
}, + { + "epoch": 14.93966623876765, + "grad_norm": 5.342166900634766, + "learning_rate": 2.8355584082156612e-05, + "loss": 0.5269, + "step": 11638 + }, + { + "epoch": 14.940949935815148, + "grad_norm": 5.417939186096191, + "learning_rate": 2.835515618314078e-05, + "loss": 0.5441, + "step": 11639 + }, + { + "epoch": 14.942233632862644, + "grad_norm": 2.8203232288360596, + "learning_rate": 2.835472828412495e-05, + "loss": 0.5136, + "step": 11640 + }, + { + "epoch": 14.943517329910142, + "grad_norm": 1.3558419942855835, + "learning_rate": 2.8354300385109114e-05, + "loss": 0.4755, + "step": 11641 + }, + { + "epoch": 14.944801026957638, + "grad_norm": 4.779064178466797, + "learning_rate": 2.8353872486093282e-05, + "loss": 0.5563, + "step": 11642 + }, + { + "epoch": 14.946084724005134, + "grad_norm": 2.890284776687622, + "learning_rate": 2.835344458707745e-05, + "loss": 0.5745, + "step": 11643 + }, + { + "epoch": 14.947368421052632, + "grad_norm": 1.1430730819702148, + "learning_rate": 2.835301668806162e-05, + "loss": 0.5698, + "step": 11644 + }, + { + "epoch": 14.948652118100128, + "grad_norm": 1.5391921997070312, + "learning_rate": 2.8352588789045784e-05, + "loss": 0.5854, + "step": 11645 + }, + { + "epoch": 14.949935815147626, + "grad_norm": 1.5180821418762207, + "learning_rate": 2.8352160890029956e-05, + "loss": 0.5046, + "step": 11646 + }, + { + "epoch": 14.951219512195122, + "grad_norm": 1.2912808656692505, + "learning_rate": 2.835173299101412e-05, + "loss": 0.582, + "step": 11647 + }, + { + "epoch": 14.952503209242618, + "grad_norm": 1.516627550125122, + "learning_rate": 2.8351305091998286e-05, + "loss": 0.5978, + "step": 11648 + }, + { + "epoch": 14.953786906290116, + "grad_norm": 4.4735212326049805, + "learning_rate": 2.8350877192982458e-05, + "loss": 0.5672, + "step": 11649 + }, + { + "epoch": 14.955070603337612, + "grad_norm": 6.456432342529297, + "learning_rate": 2.8350449293966623e-05, + "loss": 0.5687, + "step": 11650 + }, + { + "epoch": 14.95635430038511, 
+ "grad_norm": 2.7434258460998535, + "learning_rate": 2.8350021394950794e-05, + "loss": 0.55, + "step": 11651 + }, + { + "epoch": 14.957637997432606, + "grad_norm": 4.208024501800537, + "learning_rate": 2.834959349593496e-05, + "loss": 0.6339, + "step": 11652 + }, + { + "epoch": 14.958921694480102, + "grad_norm": 1.2137980461120605, + "learning_rate": 2.8349165596919128e-05, + "loss": 0.5661, + "step": 11653 + }, + { + "epoch": 14.9602053915276, + "grad_norm": 1.9858521223068237, + "learning_rate": 2.8348737697903296e-05, + "loss": 0.6407, + "step": 11654 + }, + { + "epoch": 14.961489088575096, + "grad_norm": 2.0447115898132324, + "learning_rate": 2.834830979888746e-05, + "loss": 0.7014, + "step": 11655 + }, + { + "epoch": 14.962772785622594, + "grad_norm": 8.058395385742188, + "learning_rate": 2.834788189987163e-05, + "loss": 0.7128, + "step": 11656 + }, + { + "epoch": 14.96405648267009, + "grad_norm": 1.0327297449111938, + "learning_rate": 2.8347454000855798e-05, + "loss": 0.547, + "step": 11657 + }, + { + "epoch": 14.965340179717586, + "grad_norm": 1.474314570426941, + "learning_rate": 2.8347026101839967e-05, + "loss": 0.5193, + "step": 11658 + }, + { + "epoch": 14.966623876765084, + "grad_norm": 1.008700966835022, + "learning_rate": 2.8346598202824135e-05, + "loss": 0.4987, + "step": 11659 + }, + { + "epoch": 14.96790757381258, + "grad_norm": 2.0011637210845947, + "learning_rate": 2.8346170303808303e-05, + "loss": 0.5217, + "step": 11660 + }, + { + "epoch": 14.969191270860078, + "grad_norm": 1.940319538116455, + "learning_rate": 2.834574240479247e-05, + "loss": 0.5019, + "step": 11661 + }, + { + "epoch": 14.970474967907574, + "grad_norm": 1.3607310056686401, + "learning_rate": 2.8345314505776637e-05, + "loss": 0.4938, + "step": 11662 + }, + { + "epoch": 14.97175866495507, + "grad_norm": 1.492424488067627, + "learning_rate": 2.8344886606760805e-05, + "loss": 0.5095, + "step": 11663 + }, + { + "epoch": 14.973042362002568, + "grad_norm": 3.983011484146118, + 
"learning_rate": 2.834445870774497e-05, + "loss": 0.5074, + "step": 11664 + }, + { + "epoch": 14.974326059050064, + "grad_norm": 1.085740566253662, + "learning_rate": 2.8344030808729142e-05, + "loss": 0.4966, + "step": 11665 + }, + { + "epoch": 14.975609756097562, + "grad_norm": 1.1820576190948486, + "learning_rate": 2.8343602909713307e-05, + "loss": 0.5189, + "step": 11666 + }, + { + "epoch": 14.976893453145058, + "grad_norm": 1.4205857515335083, + "learning_rate": 2.834317501069748e-05, + "loss": 0.568, + "step": 11667 + }, + { + "epoch": 14.978177150192554, + "grad_norm": 2.5839803218841553, + "learning_rate": 2.8342747111681644e-05, + "loss": 0.4974, + "step": 11668 + }, + { + "epoch": 14.979460847240052, + "grad_norm": 1.0630265474319458, + "learning_rate": 2.834231921266581e-05, + "loss": 0.4761, + "step": 11669 + }, + { + "epoch": 14.980744544287548, + "grad_norm": 1.6196197271347046, + "learning_rate": 2.834189131364998e-05, + "loss": 0.529, + "step": 11670 + }, + { + "epoch": 14.982028241335044, + "grad_norm": 1.3892918825149536, + "learning_rate": 2.8341463414634146e-05, + "loss": 0.5637, + "step": 11671 + }, + { + "epoch": 14.983311938382542, + "grad_norm": 1.7549327611923218, + "learning_rate": 2.8341035515618314e-05, + "loss": 0.5159, + "step": 11672 + }, + { + "epoch": 14.984595635430038, + "grad_norm": 1.694618821144104, + "learning_rate": 2.8340607616602483e-05, + "loss": 0.5211, + "step": 11673 + }, + { + "epoch": 14.985879332477536, + "grad_norm": 1.265337586402893, + "learning_rate": 2.834017971758665e-05, + "loss": 0.5144, + "step": 11674 + }, + { + "epoch": 14.987163029525032, + "grad_norm": 1.6750164031982422, + "learning_rate": 2.833975181857082e-05, + "loss": 0.5319, + "step": 11675 + }, + { + "epoch": 14.988446726572528, + "grad_norm": 0.9638991951942444, + "learning_rate": 2.8339323919554984e-05, + "loss": 0.5404, + "step": 11676 + }, + { + "epoch": 14.989730423620026, + "grad_norm": 1.4496852159500122, + "learning_rate": 
2.8338896020539153e-05, + "loss": 0.5375, + "step": 11677 + }, + { + "epoch": 14.991014120667522, + "grad_norm": 3.600912570953369, + "learning_rate": 2.833846812152332e-05, + "loss": 0.5138, + "step": 11678 + }, + { + "epoch": 14.99229781771502, + "grad_norm": 1.8843297958374023, + "learning_rate": 2.833804022250749e-05, + "loss": 0.4857, + "step": 11679 + }, + { + "epoch": 14.993581514762516, + "grad_norm": 1.471561074256897, + "learning_rate": 2.8337612323491655e-05, + "loss": 0.5301, + "step": 11680 + }, + { + "epoch": 14.994865211810012, + "grad_norm": 1.2008154392242432, + "learning_rate": 2.8337184424475826e-05, + "loss": 0.5925, + "step": 11681 + }, + { + "epoch": 14.99614890885751, + "grad_norm": 2.38523530960083, + "learning_rate": 2.833675652545999e-05, + "loss": 0.545, + "step": 11682 + }, + { + "epoch": 14.997432605905006, + "grad_norm": 2.1558191776275635, + "learning_rate": 2.833632862644416e-05, + "loss": 0.5681, + "step": 11683 + }, + { + "epoch": 14.998716302952504, + "grad_norm": 1.9117037057876587, + "learning_rate": 2.8335900727428328e-05, + "loss": 0.5708, + "step": 11684 + }, + { + "epoch": 15.0, + "grad_norm": 2.9599432945251465, + "learning_rate": 2.8335472828412493e-05, + "loss": 0.6538, + "step": 11685 + }, + { + "epoch": 15.001283697047496, + "grad_norm": 1.9043502807617188, + "learning_rate": 2.8335044929396665e-05, + "loss": 0.4934, + "step": 11686 + }, + { + "epoch": 15.002567394094994, + "grad_norm": 1.1876695156097412, + "learning_rate": 2.833461703038083e-05, + "loss": 0.4581, + "step": 11687 + }, + { + "epoch": 15.00385109114249, + "grad_norm": 1.0512832403182983, + "learning_rate": 2.8334189131365e-05, + "loss": 0.5097, + "step": 11688 + }, + { + "epoch": 15.005134788189988, + "grad_norm": 2.624812602996826, + "learning_rate": 2.8333761232349167e-05, + "loss": 0.5055, + "step": 11689 + }, + { + "epoch": 15.006418485237484, + "grad_norm": 0.9675685167312622, + "learning_rate": 2.8333333333333332e-05, + "loss": 0.4893, + "step": 
11690 + }, + { + "epoch": 15.00770218228498, + "grad_norm": 1.0130152702331543, + "learning_rate": 2.8332905434317504e-05, + "loss": 0.5219, + "step": 11691 + }, + { + "epoch": 15.008985879332478, + "grad_norm": 0.9981334805488586, + "learning_rate": 2.833247753530167e-05, + "loss": 0.5105, + "step": 11692 + }, + { + "epoch": 15.010269576379974, + "grad_norm": 1.8364161252975464, + "learning_rate": 2.8332049636285837e-05, + "loss": 0.4651, + "step": 11693 + }, + { + "epoch": 15.011553273427472, + "grad_norm": 1.682838797569275, + "learning_rate": 2.8331621737270006e-05, + "loss": 0.4953, + "step": 11694 + }, + { + "epoch": 15.012836970474968, + "grad_norm": 1.1091216802597046, + "learning_rate": 2.8331193838254174e-05, + "loss": 0.4864, + "step": 11695 + }, + { + "epoch": 15.014120667522464, + "grad_norm": 1.1657980680465698, + "learning_rate": 2.833076593923834e-05, + "loss": 0.4974, + "step": 11696 + }, + { + "epoch": 15.015404364569962, + "grad_norm": 1.077573299407959, + "learning_rate": 2.8330338040222507e-05, + "loss": 0.4985, + "step": 11697 + }, + { + "epoch": 15.016688061617458, + "grad_norm": 1.9462437629699707, + "learning_rate": 2.8329910141206676e-05, + "loss": 0.5287, + "step": 11698 + }, + { + "epoch": 15.017971758664956, + "grad_norm": 1.558179259300232, + "learning_rate": 2.8329482242190844e-05, + "loss": 0.5568, + "step": 11699 + }, + { + "epoch": 15.019255455712452, + "grad_norm": 1.3374594449996948, + "learning_rate": 2.8329054343175013e-05, + "loss": 0.4902, + "step": 11700 + }, + { + "epoch": 15.020539152759948, + "grad_norm": 1.4682976007461548, + "learning_rate": 2.8328626444159178e-05, + "loss": 0.4957, + "step": 11701 + }, + { + "epoch": 15.021822849807446, + "grad_norm": 2.2502992153167725, + "learning_rate": 2.8328198545143346e-05, + "loss": 0.4624, + "step": 11702 + }, + { + "epoch": 15.023106546854942, + "grad_norm": 1.293166995048523, + "learning_rate": 2.8327770646127515e-05, + "loss": 0.4956, + "step": 11703 + }, + { + "epoch": 
15.024390243902438, + "grad_norm": 0.9785970449447632, + "learning_rate": 2.832734274711168e-05, + "loss": 0.486, + "step": 11704 + }, + { + "epoch": 15.025673940949936, + "grad_norm": 1.327881097793579, + "learning_rate": 2.832691484809585e-05, + "loss": 0.5223, + "step": 11705 + }, + { + "epoch": 15.026957637997432, + "grad_norm": 1.059083342552185, + "learning_rate": 2.8326486949080016e-05, + "loss": 0.52, + "step": 11706 + }, + { + "epoch": 15.02824133504493, + "grad_norm": 1.2440215349197388, + "learning_rate": 2.8326059050064188e-05, + "loss": 0.4845, + "step": 11707 + }, + { + "epoch": 15.029525032092426, + "grad_norm": 1.3671908378601074, + "learning_rate": 2.8325631151048353e-05, + "loss": 0.4149, + "step": 11708 + }, + { + "epoch": 15.030808729139922, + "grad_norm": 1.603448748588562, + "learning_rate": 2.8325203252032518e-05, + "loss": 0.4974, + "step": 11709 + }, + { + "epoch": 15.03209242618742, + "grad_norm": 2.335974931716919, + "learning_rate": 2.832477535301669e-05, + "loss": 0.4705, + "step": 11710 + }, + { + "epoch": 15.033376123234916, + "grad_norm": 1.334594488143921, + "learning_rate": 2.8324347454000855e-05, + "loss": 0.5141, + "step": 11711 + }, + { + "epoch": 15.034659820282414, + "grad_norm": 1.0645781755447388, + "learning_rate": 2.8323919554985023e-05, + "loss": 0.4867, + "step": 11712 + }, + { + "epoch": 15.03594351732991, + "grad_norm": 1.3383865356445312, + "learning_rate": 2.8323491655969192e-05, + "loss": 0.5037, + "step": 11713 + }, + { + "epoch": 15.037227214377406, + "grad_norm": 1.590397834777832, + "learning_rate": 2.832306375695336e-05, + "loss": 0.501, + "step": 11714 + }, + { + "epoch": 15.038510911424904, + "grad_norm": 1.1665356159210205, + "learning_rate": 2.832263585793753e-05, + "loss": 0.5602, + "step": 11715 + }, + { + "epoch": 15.0397946084724, + "grad_norm": 2.2506542205810547, + "learning_rate": 2.8322207958921694e-05, + "loss": 0.4803, + "step": 11716 + }, + { + "epoch": 15.041078305519898, + "grad_norm": 
1.1381040811538696, + "learning_rate": 2.8321780059905862e-05, + "loss": 0.5014, + "step": 11717 + }, + { + "epoch": 15.042362002567394, + "grad_norm": 2.2517735958099365, + "learning_rate": 2.832135216089003e-05, + "loss": 0.5146, + "step": 11718 + }, + { + "epoch": 15.04364569961489, + "grad_norm": 1.4204025268554688, + "learning_rate": 2.83209242618742e-05, + "loss": 0.5179, + "step": 11719 + }, + { + "epoch": 15.044929396662388, + "grad_norm": 2.2323250770568848, + "learning_rate": 2.8320496362858364e-05, + "loss": 0.5069, + "step": 11720 + }, + { + "epoch": 15.046213093709884, + "grad_norm": 2.383671522140503, + "learning_rate": 2.8320068463842536e-05, + "loss": 0.5213, + "step": 11721 + }, + { + "epoch": 15.047496790757382, + "grad_norm": 1.598514437675476, + "learning_rate": 2.83196405648267e-05, + "loss": 0.549, + "step": 11722 + }, + { + "epoch": 15.048780487804878, + "grad_norm": 2.1760823726654053, + "learning_rate": 2.831921266581087e-05, + "loss": 0.5364, + "step": 11723 + }, + { + "epoch": 15.050064184852374, + "grad_norm": 1.45937979221344, + "learning_rate": 2.8318784766795038e-05, + "loss": 0.4977, + "step": 11724 + }, + { + "epoch": 15.051347881899872, + "grad_norm": 1.9415459632873535, + "learning_rate": 2.8318356867779203e-05, + "loss": 0.5282, + "step": 11725 + }, + { + "epoch": 15.052631578947368, + "grad_norm": 1.762810230255127, + "learning_rate": 2.8317928968763374e-05, + "loss": 0.5429, + "step": 11726 + }, + { + "epoch": 15.053915275994866, + "grad_norm": 2.187981367111206, + "learning_rate": 2.831750106974754e-05, + "loss": 0.5647, + "step": 11727 + }, + { + "epoch": 15.055198973042362, + "grad_norm": 2.7712740898132324, + "learning_rate": 2.8317073170731708e-05, + "loss": 0.5133, + "step": 11728 + }, + { + "epoch": 15.056482670089858, + "grad_norm": 1.1563986539840698, + "learning_rate": 2.8316645271715876e-05, + "loss": 0.4936, + "step": 11729 + }, + { + "epoch": 15.057766367137356, + "grad_norm": 2.262718439102173, + "learning_rate": 
2.831621737270004e-05, + "loss": 0.5204, + "step": 11730 + }, + { + "epoch": 15.059050064184852, + "grad_norm": 2.5256824493408203, + "learning_rate": 2.8315789473684213e-05, + "loss": 0.5196, + "step": 11731 + }, + { + "epoch": 15.06033376123235, + "grad_norm": 2.9809012413024902, + "learning_rate": 2.8315361574668378e-05, + "loss": 0.5775, + "step": 11732 + }, + { + "epoch": 15.061617458279846, + "grad_norm": 4.868967056274414, + "learning_rate": 2.8314933675652547e-05, + "loss": 0.5923, + "step": 11733 + }, + { + "epoch": 15.062901155327342, + "grad_norm": 3.093473434448242, + "learning_rate": 2.8314505776636715e-05, + "loss": 0.6374, + "step": 11734 + }, + { + "epoch": 15.06418485237484, + "grad_norm": 3.5976293087005615, + "learning_rate": 2.8314077877620883e-05, + "loss": 0.6505, + "step": 11735 + }, + { + "epoch": 15.065468549422336, + "grad_norm": 2.6221487522125244, + "learning_rate": 2.831364997860505e-05, + "loss": 0.4526, + "step": 11736 + }, + { + "epoch": 15.066752246469832, + "grad_norm": 1.2923939228057861, + "learning_rate": 2.8313222079589217e-05, + "loss": 0.4799, + "step": 11737 + }, + { + "epoch": 15.06803594351733, + "grad_norm": 0.8832753896713257, + "learning_rate": 2.8312794180573385e-05, + "loss": 0.4725, + "step": 11738 + }, + { + "epoch": 15.069319640564826, + "grad_norm": 0.9252192974090576, + "learning_rate": 2.8312366281557554e-05, + "loss": 0.4946, + "step": 11739 + }, + { + "epoch": 15.070603337612324, + "grad_norm": 2.497434616088867, + "learning_rate": 2.8311938382541722e-05, + "loss": 0.5054, + "step": 11740 + }, + { + "epoch": 15.07188703465982, + "grad_norm": 1.2498672008514404, + "learning_rate": 2.8311510483525887e-05, + "loss": 0.5003, + "step": 11741 + }, + { + "epoch": 15.073170731707316, + "grad_norm": 1.0065174102783203, + "learning_rate": 2.831108258451006e-05, + "loss": 0.5328, + "step": 11742 + }, + { + "epoch": 15.074454428754814, + "grad_norm": 1.3090342283248901, + "learning_rate": 2.8310654685494224e-05, + "loss": 
0.4926, + "step": 11743 + }, + { + "epoch": 15.07573812580231, + "grad_norm": 1.2200639247894287, + "learning_rate": 2.831022678647839e-05, + "loss": 0.4796, + "step": 11744 + }, + { + "epoch": 15.077021822849808, + "grad_norm": 1.1092044115066528, + "learning_rate": 2.830979888746256e-05, + "loss": 0.4896, + "step": 11745 + }, + { + "epoch": 15.078305519897304, + "grad_norm": 1.656489610671997, + "learning_rate": 2.8309370988446726e-05, + "loss": 0.5098, + "step": 11746 + }, + { + "epoch": 15.0795892169448, + "grad_norm": 1.3796160221099854, + "learning_rate": 2.8308943089430898e-05, + "loss": 0.4874, + "step": 11747 + }, + { + "epoch": 15.080872913992298, + "grad_norm": 0.9106793403625488, + "learning_rate": 2.8308515190415063e-05, + "loss": 0.5292, + "step": 11748 + }, + { + "epoch": 15.082156611039794, + "grad_norm": 2.463740825653076, + "learning_rate": 2.830808729139923e-05, + "loss": 0.5044, + "step": 11749 + }, + { + "epoch": 15.083440308087292, + "grad_norm": 0.8914505839347839, + "learning_rate": 2.83076593923834e-05, + "loss": 0.4961, + "step": 11750 + }, + { + "epoch": 15.084724005134788, + "grad_norm": 1.135221242904663, + "learning_rate": 2.8307231493367564e-05, + "loss": 0.5085, + "step": 11751 + }, + { + "epoch": 15.086007702182284, + "grad_norm": 0.9979475140571594, + "learning_rate": 2.8306803594351733e-05, + "loss": 0.4704, + "step": 11752 + }, + { + "epoch": 15.087291399229782, + "grad_norm": 1.0147647857666016, + "learning_rate": 2.83063756953359e-05, + "loss": 0.4729, + "step": 11753 + }, + { + "epoch": 15.088575096277278, + "grad_norm": 1.5563902854919434, + "learning_rate": 2.830594779632007e-05, + "loss": 0.4604, + "step": 11754 + }, + { + "epoch": 15.089858793324776, + "grad_norm": 1.0333828926086426, + "learning_rate": 2.8305519897304238e-05, + "loss": 0.5134, + "step": 11755 + }, + { + "epoch": 15.091142490372272, + "grad_norm": 1.5118341445922852, + "learning_rate": 2.8305091998288406e-05, + "loss": 0.4849, + "step": 11756 + }, + { + 
"epoch": 15.092426187419768, + "grad_norm": 2.1275787353515625, + "learning_rate": 2.830466409927257e-05, + "loss": 0.4984, + "step": 11757 + }, + { + "epoch": 15.093709884467266, + "grad_norm": 1.260879635810852, + "learning_rate": 2.830423620025674e-05, + "loss": 0.4754, + "step": 11758 + }, + { + "epoch": 15.094993581514762, + "grad_norm": 1.0207263231277466, + "learning_rate": 2.830380830124091e-05, + "loss": 0.4874, + "step": 11759 + }, + { + "epoch": 15.09627727856226, + "grad_norm": 2.870887041091919, + "learning_rate": 2.8303380402225073e-05, + "loss": 0.5095, + "step": 11760 + }, + { + "epoch": 15.097560975609756, + "grad_norm": 1.343808650970459, + "learning_rate": 2.8302952503209245e-05, + "loss": 0.5232, + "step": 11761 + }, + { + "epoch": 15.098844672657252, + "grad_norm": 4.508312225341797, + "learning_rate": 2.830252460419341e-05, + "loss": 0.466, + "step": 11762 + }, + { + "epoch": 15.10012836970475, + "grad_norm": 1.2765065431594849, + "learning_rate": 2.8302096705177582e-05, + "loss": 0.509, + "step": 11763 + }, + { + "epoch": 15.101412066752246, + "grad_norm": 1.4766526222229004, + "learning_rate": 2.8301668806161747e-05, + "loss": 0.5167, + "step": 11764 + }, + { + "epoch": 15.102695763799744, + "grad_norm": 2.2681992053985596, + "learning_rate": 2.8301240907145912e-05, + "loss": 0.5125, + "step": 11765 + }, + { + "epoch": 15.10397946084724, + "grad_norm": 2.100008249282837, + "learning_rate": 2.8300813008130084e-05, + "loss": 0.4884, + "step": 11766 + }, + { + "epoch": 15.105263157894736, + "grad_norm": 4.572580814361572, + "learning_rate": 2.830038510911425e-05, + "loss": 0.5678, + "step": 11767 + }, + { + "epoch": 15.106546854942234, + "grad_norm": 1.2976078987121582, + "learning_rate": 2.8299957210098417e-05, + "loss": 0.5333, + "step": 11768 + }, + { + "epoch": 15.10783055198973, + "grad_norm": 1.4392987489700317, + "learning_rate": 2.8299529311082586e-05, + "loss": 0.5008, + "step": 11769 + }, + { + "epoch": 15.109114249037226, + 
"grad_norm": 1.0462820529937744, + "learning_rate": 2.829910141206675e-05, + "loss": 0.5265, + "step": 11770 + }, + { + "epoch": 15.110397946084724, + "grad_norm": 2.229851484298706, + "learning_rate": 2.8298673513050923e-05, + "loss": 0.4563, + "step": 11771 + }, + { + "epoch": 15.11168164313222, + "grad_norm": 1.297848105430603, + "learning_rate": 2.8298245614035088e-05, + "loss": 0.4828, + "step": 11772 + }, + { + "epoch": 15.112965340179718, + "grad_norm": 5.056548595428467, + "learning_rate": 2.8297817715019256e-05, + "loss": 0.4953, + "step": 11773 + }, + { + "epoch": 15.114249037227214, + "grad_norm": 1.2336606979370117, + "learning_rate": 2.8297389816003424e-05, + "loss": 0.5015, + "step": 11774 + }, + { + "epoch": 15.11553273427471, + "grad_norm": 6.285245895385742, + "learning_rate": 2.8296961916987593e-05, + "loss": 0.5209, + "step": 11775 + }, + { + "epoch": 15.116816431322208, + "grad_norm": 1.1500388383865356, + "learning_rate": 2.8296534017971758e-05, + "loss": 0.5203, + "step": 11776 + }, + { + "epoch": 15.118100128369704, + "grad_norm": 3.504411458969116, + "learning_rate": 2.8296106118955926e-05, + "loss": 0.5617, + "step": 11777 + }, + { + "epoch": 15.119383825417202, + "grad_norm": 1.433083415031433, + "learning_rate": 2.8295678219940095e-05, + "loss": 0.4968, + "step": 11778 + }, + { + "epoch": 15.120667522464698, + "grad_norm": 1.9537714719772339, + "learning_rate": 2.8295250320924263e-05, + "loss": 0.5347, + "step": 11779 + }, + { + "epoch": 15.121951219512194, + "grad_norm": 2.3441002368927, + "learning_rate": 2.829482242190843e-05, + "loss": 0.5199, + "step": 11780 + }, + { + "epoch": 15.123234916559692, + "grad_norm": 3.5920159816741943, + "learning_rate": 2.8294394522892596e-05, + "loss": 0.6497, + "step": 11781 + }, + { + "epoch": 15.124518613607188, + "grad_norm": 2.7210328578948975, + "learning_rate": 2.8293966623876768e-05, + "loss": 0.5764, + "step": 11782 + }, + { + "epoch": 15.125802310654686, + "grad_norm": 1.8658769130706787, + 
"learning_rate": 2.8293538724860933e-05, + "loss": 0.5956, + "step": 11783 + }, + { + "epoch": 15.127086007702182, + "grad_norm": 2.3695602416992188, + "learning_rate": 2.82931108258451e-05, + "loss": 0.6086, + "step": 11784 + }, + { + "epoch": 15.128369704749678, + "grad_norm": 6.110289573669434, + "learning_rate": 2.829268292682927e-05, + "loss": 0.71, + "step": 11785 + }, + { + "epoch": 15.129653401797176, + "grad_norm": 1.0792776346206665, + "learning_rate": 2.8292255027813435e-05, + "loss": 0.4316, + "step": 11786 + }, + { + "epoch": 15.130937098844672, + "grad_norm": 1.6556836366653442, + "learning_rate": 2.8291827128797607e-05, + "loss": 0.4943, + "step": 11787 + }, + { + "epoch": 15.13222079589217, + "grad_norm": 1.3661482334136963, + "learning_rate": 2.8291399229781772e-05, + "loss": 0.4781, + "step": 11788 + }, + { + "epoch": 15.133504492939666, + "grad_norm": 1.6095695495605469, + "learning_rate": 2.829097133076594e-05, + "loss": 0.5094, + "step": 11789 + }, + { + "epoch": 15.134788189987162, + "grad_norm": 0.8322271108627319, + "learning_rate": 2.829054343175011e-05, + "loss": 0.4866, + "step": 11790 + }, + { + "epoch": 15.13607188703466, + "grad_norm": 2.052447557449341, + "learning_rate": 2.8290115532734274e-05, + "loss": 0.4657, + "step": 11791 + }, + { + "epoch": 15.137355584082156, + "grad_norm": 1.6129790544509888, + "learning_rate": 2.8289687633718442e-05, + "loss": 0.5081, + "step": 11792 + }, + { + "epoch": 15.138639281129654, + "grad_norm": 0.887998640537262, + "learning_rate": 2.828925973470261e-05, + "loss": 0.5215, + "step": 11793 + }, + { + "epoch": 15.13992297817715, + "grad_norm": 1.2898778915405273, + "learning_rate": 2.828883183568678e-05, + "loss": 0.5081, + "step": 11794 + }, + { + "epoch": 15.141206675224646, + "grad_norm": 2.3656668663024902, + "learning_rate": 2.8288403936670947e-05, + "loss": 0.5338, + "step": 11795 + }, + { + "epoch": 15.142490372272144, + "grad_norm": 1.2761130332946777, + "learning_rate": 
2.8287976037655116e-05, + "loss": 0.5051, + "step": 11796 + }, + { + "epoch": 15.14377406931964, + "grad_norm": 1.980332612991333, + "learning_rate": 2.828754813863928e-05, + "loss": 0.48, + "step": 11797 + }, + { + "epoch": 15.145057766367138, + "grad_norm": 0.9587022662162781, + "learning_rate": 2.828712023962345e-05, + "loss": 0.5076, + "step": 11798 + }, + { + "epoch": 15.146341463414634, + "grad_norm": 1.3306149244308472, + "learning_rate": 2.8286692340607618e-05, + "loss": 0.4912, + "step": 11799 + }, + { + "epoch": 15.14762516046213, + "grad_norm": 0.6793012619018555, + "learning_rate": 2.8286264441591783e-05, + "loss": 0.5183, + "step": 11800 + }, + { + "epoch": 15.148908857509628, + "grad_norm": 2.0275845527648926, + "learning_rate": 2.8285836542575955e-05, + "loss": 0.5399, + "step": 11801 + }, + { + "epoch": 15.150192554557124, + "grad_norm": 2.107365369796753, + "learning_rate": 2.828540864356012e-05, + "loss": 0.4621, + "step": 11802 + }, + { + "epoch": 15.15147625160462, + "grad_norm": 1.3008151054382324, + "learning_rate": 2.828498074454429e-05, + "loss": 0.4841, + "step": 11803 + }, + { + "epoch": 15.152759948652118, + "grad_norm": 1.193806529045105, + "learning_rate": 2.8284552845528456e-05, + "loss": 0.5088, + "step": 11804 + }, + { + "epoch": 15.154043645699614, + "grad_norm": 3.8274028301239014, + "learning_rate": 2.828412494651262e-05, + "loss": 0.4919, + "step": 11805 + }, + { + "epoch": 15.155327342747112, + "grad_norm": 0.8405700325965881, + "learning_rate": 2.8283697047496793e-05, + "loss": 0.5364, + "step": 11806 + }, + { + "epoch": 15.156611039794608, + "grad_norm": 1.1469312906265259, + "learning_rate": 2.8283269148480958e-05, + "loss": 0.5262, + "step": 11807 + }, + { + "epoch": 15.157894736842104, + "grad_norm": 1.1684978008270264, + "learning_rate": 2.8282841249465127e-05, + "loss": 0.4707, + "step": 11808 + }, + { + "epoch": 15.159178433889602, + "grad_norm": 1.0434603691101074, + "learning_rate": 2.8282413350449295e-05, + "loss": 
0.4759, + "step": 11809 + }, + { + "epoch": 15.160462130937098, + "grad_norm": 1.6160783767700195, + "learning_rate": 2.8281985451433463e-05, + "loss": 0.4902, + "step": 11810 + }, + { + "epoch": 15.161745827984596, + "grad_norm": 1.6983708143234253, + "learning_rate": 2.8281557552417632e-05, + "loss": 0.4922, + "step": 11811 + }, + { + "epoch": 15.163029525032092, + "grad_norm": 1.5672988891601562, + "learning_rate": 2.8281129653401797e-05, + "loss": 0.4732, + "step": 11812 + }, + { + "epoch": 15.164313222079588, + "grad_norm": 2.7611138820648193, + "learning_rate": 2.8280701754385965e-05, + "loss": 0.4965, + "step": 11813 + }, + { + "epoch": 15.165596919127086, + "grad_norm": 0.9151108264923096, + "learning_rate": 2.8280273855370134e-05, + "loss": 0.4765, + "step": 11814 + }, + { + "epoch": 15.166880616174582, + "grad_norm": 1.1553140878677368, + "learning_rate": 2.8279845956354302e-05, + "loss": 0.5396, + "step": 11815 + }, + { + "epoch": 15.16816431322208, + "grad_norm": 1.1146236658096313, + "learning_rate": 2.8279418057338467e-05, + "loss": 0.5731, + "step": 11816 + }, + { + "epoch": 15.169448010269576, + "grad_norm": 2.63242244720459, + "learning_rate": 2.827899015832264e-05, + "loss": 0.4644, + "step": 11817 + }, + { + "epoch": 15.170731707317072, + "grad_norm": 1.0979554653167725, + "learning_rate": 2.8278562259306804e-05, + "loss": 0.5185, + "step": 11818 + }, + { + "epoch": 15.17201540436457, + "grad_norm": 2.4656925201416016, + "learning_rate": 2.8278134360290972e-05, + "loss": 0.5051, + "step": 11819 + }, + { + "epoch": 15.173299101412066, + "grad_norm": 1.6815567016601562, + "learning_rate": 2.827770646127514e-05, + "loss": 0.4774, + "step": 11820 + }, + { + "epoch": 15.174582798459564, + "grad_norm": 1.1373155117034912, + "learning_rate": 2.8277278562259306e-05, + "loss": 0.5152, + "step": 11821 + }, + { + "epoch": 15.17586649550706, + "grad_norm": 1.5855802297592163, + "learning_rate": 2.8276850663243478e-05, + "loss": 0.5313, + "step": 11822 + }, + 
{ + "epoch": 15.177150192554556, + "grad_norm": 1.6470266580581665, + "learning_rate": 2.8276422764227643e-05, + "loss": 0.516, + "step": 11823 + }, + { + "epoch": 15.178433889602054, + "grad_norm": 1.8454911708831787, + "learning_rate": 2.827599486521181e-05, + "loss": 0.5431, + "step": 11824 + }, + { + "epoch": 15.17971758664955, + "grad_norm": 2.1542303562164307, + "learning_rate": 2.827556696619598e-05, + "loss": 0.5346, + "step": 11825 + }, + { + "epoch": 15.181001283697048, + "grad_norm": 2.7962164878845215, + "learning_rate": 2.8275139067180145e-05, + "loss": 0.4939, + "step": 11826 + }, + { + "epoch": 15.182284980744544, + "grad_norm": 1.7260804176330566, + "learning_rate": 2.8274711168164316e-05, + "loss": 0.5111, + "step": 11827 + }, + { + "epoch": 15.18356867779204, + "grad_norm": 1.2950092554092407, + "learning_rate": 2.827428326914848e-05, + "loss": 0.5332, + "step": 11828 + }, + { + "epoch": 15.184852374839538, + "grad_norm": 3.5852890014648438, + "learning_rate": 2.827385537013265e-05, + "loss": 0.5415, + "step": 11829 + }, + { + "epoch": 15.186136071887034, + "grad_norm": 2.2950241565704346, + "learning_rate": 2.8273427471116818e-05, + "loss": 0.5239, + "step": 11830 + }, + { + "epoch": 15.187419768934532, + "grad_norm": 1.2628742456436157, + "learning_rate": 2.8272999572100983e-05, + "loss": 0.5201, + "step": 11831 + }, + { + "epoch": 15.188703465982028, + "grad_norm": 1.9749244451522827, + "learning_rate": 2.827257167308515e-05, + "loss": 0.6206, + "step": 11832 + }, + { + "epoch": 15.189987163029524, + "grad_norm": 1.1748709678649902, + "learning_rate": 2.827214377406932e-05, + "loss": 0.5827, + "step": 11833 + }, + { + "epoch": 15.191270860077022, + "grad_norm": 2.0198850631713867, + "learning_rate": 2.827171587505349e-05, + "loss": 0.592, + "step": 11834 + }, + { + "epoch": 15.192554557124518, + "grad_norm": 3.6269898414611816, + "learning_rate": 2.8271287976037657e-05, + "loss": 0.7055, + "step": 11835 + }, + { + "epoch": 15.193838254172016, + 
"grad_norm": 1.1539280414581299, + "learning_rate": 2.8270860077021825e-05, + "loss": 0.4778, + "step": 11836 + }, + { + "epoch": 15.195121951219512, + "grad_norm": 2.2580111026763916, + "learning_rate": 2.827043217800599e-05, + "loss": 0.5009, + "step": 11837 + }, + { + "epoch": 15.196405648267008, + "grad_norm": 0.9727275371551514, + "learning_rate": 2.827000427899016e-05, + "loss": 0.4677, + "step": 11838 + }, + { + "epoch": 15.197689345314506, + "grad_norm": 1.4982810020446777, + "learning_rate": 2.8269576379974327e-05, + "loss": 0.5055, + "step": 11839 + }, + { + "epoch": 15.198973042362002, + "grad_norm": 0.8215435147285461, + "learning_rate": 2.8269148480958492e-05, + "loss": 0.4797, + "step": 11840 + }, + { + "epoch": 15.200256739409499, + "grad_norm": 1.1661196947097778, + "learning_rate": 2.8268720581942664e-05, + "loss": 0.4683, + "step": 11841 + }, + { + "epoch": 15.201540436456996, + "grad_norm": 2.449380397796631, + "learning_rate": 2.826829268292683e-05, + "loss": 0.4994, + "step": 11842 + }, + { + "epoch": 15.202824133504492, + "grad_norm": 0.8448970913887024, + "learning_rate": 2.8267864783911e-05, + "loss": 0.4733, + "step": 11843 + }, + { + "epoch": 15.20410783055199, + "grad_norm": 2.206291913986206, + "learning_rate": 2.8267436884895166e-05, + "loss": 0.5125, + "step": 11844 + }, + { + "epoch": 15.205391527599486, + "grad_norm": 1.4046096801757812, + "learning_rate": 2.826700898587933e-05, + "loss": 0.4898, + "step": 11845 + }, + { + "epoch": 15.206675224646983, + "grad_norm": 1.4681568145751953, + "learning_rate": 2.8266581086863503e-05, + "loss": 0.5075, + "step": 11846 + }, + { + "epoch": 15.20795892169448, + "grad_norm": 1.8337163925170898, + "learning_rate": 2.8266153187847668e-05, + "loss": 0.5309, + "step": 11847 + }, + { + "epoch": 15.209242618741976, + "grad_norm": 1.6205943822860718, + "learning_rate": 2.8265725288831836e-05, + "loss": 0.4741, + "step": 11848 + }, + { + "epoch": 15.210526315789474, + "grad_norm": 0.960381031036377, + 
"learning_rate": 2.8265297389816004e-05, + "loss": 0.4754, + "step": 11849 + }, + { + "epoch": 15.21181001283697, + "grad_norm": 1.0689163208007812, + "learning_rate": 2.8264869490800173e-05, + "loss": 0.5436, + "step": 11850 + }, + { + "epoch": 15.213093709884467, + "grad_norm": 1.070486068725586, + "learning_rate": 2.8264441591784338e-05, + "loss": 0.4746, + "step": 11851 + }, + { + "epoch": 15.214377406931964, + "grad_norm": 1.3339669704437256, + "learning_rate": 2.8264013692768506e-05, + "loss": 0.4941, + "step": 11852 + }, + { + "epoch": 15.21566110397946, + "grad_norm": 1.4039703607559204, + "learning_rate": 2.8263585793752675e-05, + "loss": 0.484, + "step": 11853 + }, + { + "epoch": 15.216944801026958, + "grad_norm": 1.1832650899887085, + "learning_rate": 2.8263157894736843e-05, + "loss": 0.4841, + "step": 11854 + }, + { + "epoch": 15.218228498074454, + "grad_norm": 2.4449880123138428, + "learning_rate": 2.826272999572101e-05, + "loss": 0.4959, + "step": 11855 + }, + { + "epoch": 15.21951219512195, + "grad_norm": 0.9472103118896484, + "learning_rate": 2.8262302096705177e-05, + "loss": 0.5082, + "step": 11856 + }, + { + "epoch": 15.220795892169448, + "grad_norm": 2.4440789222717285, + "learning_rate": 2.826187419768935e-05, + "loss": 0.5001, + "step": 11857 + }, + { + "epoch": 15.222079589216944, + "grad_norm": 1.1084637641906738, + "learning_rate": 2.8261446298673513e-05, + "loss": 0.5035, + "step": 11858 + }, + { + "epoch": 15.223363286264442, + "grad_norm": 1.221288800239563, + "learning_rate": 2.826101839965768e-05, + "loss": 0.4997, + "step": 11859 + }, + { + "epoch": 15.224646983311938, + "grad_norm": 1.1824291944503784, + "learning_rate": 2.826059050064185e-05, + "loss": 0.4884, + "step": 11860 + }, + { + "epoch": 15.225930680359435, + "grad_norm": 3.718142509460449, + "learning_rate": 2.8260162601626015e-05, + "loss": 0.5017, + "step": 11861 + }, + { + "epoch": 15.227214377406932, + "grad_norm": 2.8184311389923096, + "learning_rate": 
2.8259734702610187e-05, + "loss": 0.481, + "step": 11862 + }, + { + "epoch": 15.228498074454428, + "grad_norm": 1.256132960319519, + "learning_rate": 2.8259306803594352e-05, + "loss": 0.5077, + "step": 11863 + }, + { + "epoch": 15.229781771501926, + "grad_norm": 2.339179039001465, + "learning_rate": 2.825887890457852e-05, + "loss": 0.4654, + "step": 11864 + }, + { + "epoch": 15.231065468549422, + "grad_norm": 1.5447704792022705, + "learning_rate": 2.825845100556269e-05, + "loss": 0.5247, + "step": 11865 + }, + { + "epoch": 15.232349165596919, + "grad_norm": 1.1820297241210938, + "learning_rate": 2.8258023106546854e-05, + "loss": 0.5063, + "step": 11866 + }, + { + "epoch": 15.233632862644416, + "grad_norm": 1.2060942649841309, + "learning_rate": 2.8257595207531022e-05, + "loss": 0.5103, + "step": 11867 + }, + { + "epoch": 15.234916559691912, + "grad_norm": 1.6624417304992676, + "learning_rate": 2.825716730851519e-05, + "loss": 0.4849, + "step": 11868 + }, + { + "epoch": 15.23620025673941, + "grad_norm": 1.711729645729065, + "learning_rate": 2.825673940949936e-05, + "loss": 0.5323, + "step": 11869 + }, + { + "epoch": 15.237483953786906, + "grad_norm": 1.2009567022323608, + "learning_rate": 2.8256311510483528e-05, + "loss": 0.5675, + "step": 11870 + }, + { + "epoch": 15.238767650834403, + "grad_norm": 1.0006688833236694, + "learning_rate": 2.8255883611467696e-05, + "loss": 0.5306, + "step": 11871 + }, + { + "epoch": 15.2400513478819, + "grad_norm": 3.2125864028930664, + "learning_rate": 2.825545571245186e-05, + "loss": 0.5119, + "step": 11872 + }, + { + "epoch": 15.241335044929397, + "grad_norm": 1.3439162969589233, + "learning_rate": 2.825502781343603e-05, + "loss": 0.5717, + "step": 11873 + }, + { + "epoch": 15.242618741976893, + "grad_norm": 3.387906789779663, + "learning_rate": 2.8254599914420198e-05, + "loss": 0.5304, + "step": 11874 + }, + { + "epoch": 15.24390243902439, + "grad_norm": 1.7096489667892456, + "learning_rate": 2.8254172015404363e-05, + "loss": 
0.4589, + "step": 11875 + }, + { + "epoch": 15.245186136071887, + "grad_norm": 1.7937251329421997, + "learning_rate": 2.8253744116388535e-05, + "loss": 0.5335, + "step": 11876 + }, + { + "epoch": 15.246469833119384, + "grad_norm": 5.542914867401123, + "learning_rate": 2.82533162173727e-05, + "loss": 0.5644, + "step": 11877 + }, + { + "epoch": 15.24775353016688, + "grad_norm": 1.6031067371368408, + "learning_rate": 2.825288831835687e-05, + "loss": 0.5223, + "step": 11878 + }, + { + "epoch": 15.249037227214377, + "grad_norm": 2.6344797611236572, + "learning_rate": 2.8252460419341036e-05, + "loss": 0.5526, + "step": 11879 + }, + { + "epoch": 15.250320924261874, + "grad_norm": 3.471519708633423, + "learning_rate": 2.82520325203252e-05, + "loss": 0.5858, + "step": 11880 + }, + { + "epoch": 15.25160462130937, + "grad_norm": 2.978895902633667, + "learning_rate": 2.8251604621309373e-05, + "loss": 0.5349, + "step": 11881 + }, + { + "epoch": 15.252888318356868, + "grad_norm": 1.4044910669326782, + "learning_rate": 2.8251176722293538e-05, + "loss": 0.5786, + "step": 11882 + }, + { + "epoch": 15.254172015404365, + "grad_norm": 2.045847177505493, + "learning_rate": 2.8250748823277707e-05, + "loss": 0.6013, + "step": 11883 + }, + { + "epoch": 15.25545571245186, + "grad_norm": 2.881338119506836, + "learning_rate": 2.8250320924261875e-05, + "loss": 0.6736, + "step": 11884 + }, + { + "epoch": 15.256739409499358, + "grad_norm": 35.9804573059082, + "learning_rate": 2.8249893025246044e-05, + "loss": 0.7695, + "step": 11885 + }, + { + "epoch": 15.258023106546855, + "grad_norm": 1.612460732460022, + "learning_rate": 2.8249465126230212e-05, + "loss": 0.5218, + "step": 11886 + }, + { + "epoch": 15.259306803594352, + "grad_norm": 3.4840471744537354, + "learning_rate": 2.8249037227214377e-05, + "loss": 0.4984, + "step": 11887 + }, + { + "epoch": 15.260590500641849, + "grad_norm": 1.6246845722198486, + "learning_rate": 2.8248609328198545e-05, + "loss": 0.5133, + "step": 11888 + }, + { + 
"epoch": 15.261874197689345, + "grad_norm": 0.9298492670059204, + "learning_rate": 2.8248181429182714e-05, + "loss": 0.4922, + "step": 11889 + }, + { + "epoch": 15.263157894736842, + "grad_norm": 1.2907263040542603, + "learning_rate": 2.8247753530166882e-05, + "loss": 0.4893, + "step": 11890 + }, + { + "epoch": 15.264441591784339, + "grad_norm": 1.665696144104004, + "learning_rate": 2.8247325631151047e-05, + "loss": 0.4979, + "step": 11891 + }, + { + "epoch": 15.265725288831836, + "grad_norm": 1.0569010972976685, + "learning_rate": 2.8246897732135216e-05, + "loss": 0.4909, + "step": 11892 + }, + { + "epoch": 15.267008985879333, + "grad_norm": 2.5541577339172363, + "learning_rate": 2.8246469833119384e-05, + "loss": 0.4815, + "step": 11893 + }, + { + "epoch": 15.268292682926829, + "grad_norm": 1.9587606191635132, + "learning_rate": 2.8246041934103552e-05, + "loss": 0.5049, + "step": 11894 + }, + { + "epoch": 15.269576379974326, + "grad_norm": 2.918327808380127, + "learning_rate": 2.824561403508772e-05, + "loss": 0.544, + "step": 11895 + }, + { + "epoch": 15.270860077021823, + "grad_norm": 5.426923751831055, + "learning_rate": 2.8245186136071886e-05, + "loss": 0.5032, + "step": 11896 + }, + { + "epoch": 15.27214377406932, + "grad_norm": 1.1204227209091187, + "learning_rate": 2.8244758237056058e-05, + "loss": 0.4675, + "step": 11897 + }, + { + "epoch": 15.273427471116817, + "grad_norm": 1.586049199104309, + "learning_rate": 2.8244330338040223e-05, + "loss": 0.484, + "step": 11898 + }, + { + "epoch": 15.274711168164313, + "grad_norm": 1.2467620372772217, + "learning_rate": 2.8243902439024388e-05, + "loss": 0.4826, + "step": 11899 + }, + { + "epoch": 15.27599486521181, + "grad_norm": 1.2986551523208618, + "learning_rate": 2.824347454000856e-05, + "loss": 0.468, + "step": 11900 + }, + { + "epoch": 15.277278562259307, + "grad_norm": 1.4666966199874878, + "learning_rate": 2.8243046640992725e-05, + "loss": 0.4722, + "step": 11901 + }, + { + "epoch": 15.278562259306804, + 
"grad_norm": 1.1915572881698608, + "learning_rate": 2.8242618741976896e-05, + "loss": 0.4835, + "step": 11902 + }, + { + "epoch": 15.2798459563543, + "grad_norm": 0.9266879558563232, + "learning_rate": 2.824219084296106e-05, + "loss": 0.4883, + "step": 11903 + }, + { + "epoch": 15.281129653401797, + "grad_norm": 7.515641212463379, + "learning_rate": 2.824176294394523e-05, + "loss": 0.4508, + "step": 11904 + }, + { + "epoch": 15.282413350449294, + "grad_norm": 1.8271124362945557, + "learning_rate": 2.8241335044929398e-05, + "loss": 0.5083, + "step": 11905 + }, + { + "epoch": 15.28369704749679, + "grad_norm": 1.1781808137893677, + "learning_rate": 2.8240907145913563e-05, + "loss": 0.5195, + "step": 11906 + }, + { + "epoch": 15.284980744544288, + "grad_norm": 1.0649042129516602, + "learning_rate": 2.824047924689773e-05, + "loss": 0.4795, + "step": 11907 + }, + { + "epoch": 15.286264441591785, + "grad_norm": 0.8661693930625916, + "learning_rate": 2.82400513478819e-05, + "loss": 0.4727, + "step": 11908 + }, + { + "epoch": 15.28754813863928, + "grad_norm": 1.1892542839050293, + "learning_rate": 2.823962344886607e-05, + "loss": 0.4769, + "step": 11909 + }, + { + "epoch": 15.288831835686779, + "grad_norm": 4.259785175323486, + "learning_rate": 2.8239195549850237e-05, + "loss": 0.4695, + "step": 11910 + }, + { + "epoch": 15.290115532734275, + "grad_norm": 1.6473090648651123, + "learning_rate": 2.8238767650834405e-05, + "loss": 0.5321, + "step": 11911 + }, + { + "epoch": 15.29139922978177, + "grad_norm": 0.8648337125778198, + "learning_rate": 2.823833975181857e-05, + "loss": 0.4763, + "step": 11912 + }, + { + "epoch": 15.292682926829269, + "grad_norm": 2.5375473499298096, + "learning_rate": 2.823791185280274e-05, + "loss": 0.5155, + "step": 11913 + }, + { + "epoch": 15.293966623876765, + "grad_norm": 1.8641047477722168, + "learning_rate": 2.8237483953786907e-05, + "loss": 0.4971, + "step": 11914 + }, + { + "epoch": 15.295250320924263, + "grad_norm": 1.628469705581665, + 
"learning_rate": 2.8237056054771072e-05, + "loss": 0.5237, + "step": 11915 + }, + { + "epoch": 15.296534017971759, + "grad_norm": 1.542761206626892, + "learning_rate": 2.8236628155755244e-05, + "loss": 0.5197, + "step": 11916 + }, + { + "epoch": 15.297817715019255, + "grad_norm": 2.2940850257873535, + "learning_rate": 2.823620025673941e-05, + "loss": 0.4854, + "step": 11917 + }, + { + "epoch": 15.299101412066753, + "grad_norm": 1.497396469116211, + "learning_rate": 2.823577235772358e-05, + "loss": 0.4955, + "step": 11918 + }, + { + "epoch": 15.300385109114249, + "grad_norm": 3.7963688373565674, + "learning_rate": 2.8235344458707746e-05, + "loss": 0.5183, + "step": 11919 + }, + { + "epoch": 15.301668806161747, + "grad_norm": 1.1949241161346436, + "learning_rate": 2.823491655969191e-05, + "loss": 0.5219, + "step": 11920 + }, + { + "epoch": 15.302952503209243, + "grad_norm": 1.5769025087356567, + "learning_rate": 2.8234488660676083e-05, + "loss": 0.4798, + "step": 11921 + }, + { + "epoch": 15.304236200256739, + "grad_norm": 1.2393697500228882, + "learning_rate": 2.8234060761660248e-05, + "loss": 0.5439, + "step": 11922 + }, + { + "epoch": 15.305519897304237, + "grad_norm": 3.6057353019714355, + "learning_rate": 2.8233632862644416e-05, + "loss": 0.5373, + "step": 11923 + }, + { + "epoch": 15.306803594351733, + "grad_norm": 1.3769787549972534, + "learning_rate": 2.8233204963628584e-05, + "loss": 0.5262, + "step": 11924 + }, + { + "epoch": 15.30808729139923, + "grad_norm": 0.991736650466919, + "learning_rate": 2.8232777064612753e-05, + "loss": 0.5737, + "step": 11925 + }, + { + "epoch": 15.309370988446727, + "grad_norm": 1.0686067342758179, + "learning_rate": 2.823234916559692e-05, + "loss": 0.5308, + "step": 11926 + }, + { + "epoch": 15.310654685494223, + "grad_norm": 2.464911699295044, + "learning_rate": 2.8231921266581086e-05, + "loss": 0.5984, + "step": 11927 + }, + { + "epoch": 15.31193838254172, + "grad_norm": 1.7902871370315552, + "learning_rate": 
2.8231493367565255e-05, + "loss": 0.5174, + "step": 11928 + }, + { + "epoch": 15.313222079589217, + "grad_norm": 3.274808883666992, + "learning_rate": 2.8231065468549423e-05, + "loss": 0.5571, + "step": 11929 + }, + { + "epoch": 15.314505776636715, + "grad_norm": 3.4994680881500244, + "learning_rate": 2.823063756953359e-05, + "loss": 0.5273, + "step": 11930 + }, + { + "epoch": 15.31578947368421, + "grad_norm": 1.477091670036316, + "learning_rate": 2.8230209670517757e-05, + "loss": 0.5534, + "step": 11931 + }, + { + "epoch": 15.317073170731707, + "grad_norm": 2.9876327514648438, + "learning_rate": 2.822978177150193e-05, + "loss": 0.5558, + "step": 11932 + }, + { + "epoch": 15.318356867779205, + "grad_norm": 4.480223178863525, + "learning_rate": 2.8229353872486093e-05, + "loss": 0.5823, + "step": 11933 + }, + { + "epoch": 15.3196405648267, + "grad_norm": 1.9918776750564575, + "learning_rate": 2.8228925973470262e-05, + "loss": 0.7036, + "step": 11934 + }, + { + "epoch": 15.320924261874199, + "grad_norm": 3.3755264282226562, + "learning_rate": 2.822849807445443e-05, + "loss": 0.6807, + "step": 11935 + }, + { + "epoch": 15.322207958921695, + "grad_norm": 1.3291168212890625, + "learning_rate": 2.8228070175438595e-05, + "loss": 0.4751, + "step": 11936 + }, + { + "epoch": 15.32349165596919, + "grad_norm": 0.9678550958633423, + "learning_rate": 2.8227642276422767e-05, + "loss": 0.4727, + "step": 11937 + }, + { + "epoch": 15.324775353016689, + "grad_norm": 1.6787712574005127, + "learning_rate": 2.8227214377406932e-05, + "loss": 0.502, + "step": 11938 + }, + { + "epoch": 15.326059050064185, + "grad_norm": 1.7836601734161377, + "learning_rate": 2.82267864783911e-05, + "loss": 0.4835, + "step": 11939 + }, + { + "epoch": 15.327342747111683, + "grad_norm": 2.0664472579956055, + "learning_rate": 2.822635857937527e-05, + "loss": 0.4719, + "step": 11940 + }, + { + "epoch": 15.328626444159179, + "grad_norm": 1.503199577331543, + "learning_rate": 2.8225930680359434e-05, + "loss": 
0.4774, + "step": 11941 + }, + { + "epoch": 15.329910141206675, + "grad_norm": 0.9560047388076782, + "learning_rate": 2.8225502781343606e-05, + "loss": 0.4993, + "step": 11942 + }, + { + "epoch": 15.331193838254173, + "grad_norm": 1.3450400829315186, + "learning_rate": 2.822507488232777e-05, + "loss": 0.4913, + "step": 11943 + }, + { + "epoch": 15.332477535301669, + "grad_norm": 2.136065721511841, + "learning_rate": 2.822464698331194e-05, + "loss": 0.4934, + "step": 11944 + }, + { + "epoch": 15.333761232349165, + "grad_norm": 0.8983252644538879, + "learning_rate": 2.8224219084296108e-05, + "loss": 0.4794, + "step": 11945 + }, + { + "epoch": 15.335044929396663, + "grad_norm": 5.641371250152588, + "learning_rate": 2.8223791185280276e-05, + "loss": 0.5172, + "step": 11946 + }, + { + "epoch": 15.336328626444159, + "grad_norm": 3.1416831016540527, + "learning_rate": 2.822336328626444e-05, + "loss": 0.4872, + "step": 11947 + }, + { + "epoch": 15.337612323491657, + "grad_norm": 1.7071157693862915, + "learning_rate": 2.822293538724861e-05, + "loss": 0.5286, + "step": 11948 + }, + { + "epoch": 15.338896020539153, + "grad_norm": 1.3764723539352417, + "learning_rate": 2.8222507488232778e-05, + "loss": 0.5222, + "step": 11949 + }, + { + "epoch": 15.340179717586649, + "grad_norm": 7.3603620529174805, + "learning_rate": 2.8222079589216946e-05, + "loss": 0.5124, + "step": 11950 + }, + { + "epoch": 15.341463414634147, + "grad_norm": 1.7790632247924805, + "learning_rate": 2.8221651690201115e-05, + "loss": 0.4934, + "step": 11951 + }, + { + "epoch": 15.342747111681643, + "grad_norm": 1.0749379396438599, + "learning_rate": 2.822122379118528e-05, + "loss": 0.4797, + "step": 11952 + }, + { + "epoch": 15.34403080872914, + "grad_norm": 1.3365678787231445, + "learning_rate": 2.8220795892169448e-05, + "loss": 0.5186, + "step": 11953 + }, + { + "epoch": 15.345314505776637, + "grad_norm": 1.283765196800232, + "learning_rate": 2.8220367993153617e-05, + "loss": 0.4673, + "step": 11954 + }, + { 
+ "epoch": 15.346598202824133, + "grad_norm": 4.253618240356445, + "learning_rate": 2.821994009413778e-05, + "loss": 0.5273, + "step": 11955 + }, + { + "epoch": 15.34788189987163, + "grad_norm": 0.7745940685272217, + "learning_rate": 2.8219512195121953e-05, + "loss": 0.4539, + "step": 11956 + }, + { + "epoch": 15.349165596919127, + "grad_norm": 1.3825092315673828, + "learning_rate": 2.821908429610612e-05, + "loss": 0.4936, + "step": 11957 + }, + { + "epoch": 15.350449293966625, + "grad_norm": 1.2510324716567993, + "learning_rate": 2.821865639709029e-05, + "loss": 0.5235, + "step": 11958 + }, + { + "epoch": 15.35173299101412, + "grad_norm": 1.7582563161849976, + "learning_rate": 2.8218228498074455e-05, + "loss": 0.5401, + "step": 11959 + }, + { + "epoch": 15.353016688061617, + "grad_norm": 3.811237096786499, + "learning_rate": 2.821780059905862e-05, + "loss": 0.4725, + "step": 11960 + }, + { + "epoch": 15.354300385109115, + "grad_norm": 3.1272504329681396, + "learning_rate": 2.8217372700042792e-05, + "loss": 0.4939, + "step": 11961 + }, + { + "epoch": 15.35558408215661, + "grad_norm": 1.0001459121704102, + "learning_rate": 2.8216944801026957e-05, + "loss": 0.4989, + "step": 11962 + }, + { + "epoch": 15.356867779204109, + "grad_norm": 2.1491377353668213, + "learning_rate": 2.8216516902011125e-05, + "loss": 0.4753, + "step": 11963 + }, + { + "epoch": 15.358151476251605, + "grad_norm": 1.5634244680404663, + "learning_rate": 2.8216089002995294e-05, + "loss": 0.4808, + "step": 11964 + }, + { + "epoch": 15.3594351732991, + "grad_norm": 4.106842517852783, + "learning_rate": 2.8215661103979462e-05, + "loss": 0.5193, + "step": 11965 + }, + { + "epoch": 15.360718870346599, + "grad_norm": 1.1541298627853394, + "learning_rate": 2.821523320496363e-05, + "loss": 0.5508, + "step": 11966 + }, + { + "epoch": 15.362002567394095, + "grad_norm": 8.067065238952637, + "learning_rate": 2.8214805305947796e-05, + "loss": 0.474, + "step": 11967 + }, + { + "epoch": 15.363286264441593, + 
"grad_norm": 1.3757524490356445, + "learning_rate": 2.8214377406931964e-05, + "loss": 0.5496, + "step": 11968 + }, + { + "epoch": 15.364569961489089, + "grad_norm": 1.1908389329910278, + "learning_rate": 2.8213949507916133e-05, + "loss": 0.5068, + "step": 11969 + }, + { + "epoch": 15.365853658536585, + "grad_norm": 1.9042694568634033, + "learning_rate": 2.82135216089003e-05, + "loss": 0.4941, + "step": 11970 + }, + { + "epoch": 15.367137355584083, + "grad_norm": 3.433540105819702, + "learning_rate": 2.8213093709884466e-05, + "loss": 0.4942, + "step": 11971 + }, + { + "epoch": 15.368421052631579, + "grad_norm": 1.477399468421936, + "learning_rate": 2.8212665810868638e-05, + "loss": 0.526, + "step": 11972 + }, + { + "epoch": 15.369704749679077, + "grad_norm": 1.1801668405532837, + "learning_rate": 2.8212237911852803e-05, + "loss": 0.5351, + "step": 11973 + }, + { + "epoch": 15.370988446726573, + "grad_norm": 1.5429102182388306, + "learning_rate": 2.821181001283697e-05, + "loss": 0.5531, + "step": 11974 + }, + { + "epoch": 15.372272143774069, + "grad_norm": 1.7130146026611328, + "learning_rate": 2.821138211382114e-05, + "loss": 0.606, + "step": 11975 + }, + { + "epoch": 15.373555840821567, + "grad_norm": 1.4160377979278564, + "learning_rate": 2.8210954214805305e-05, + "loss": 0.498, + "step": 11976 + }, + { + "epoch": 15.374839537869063, + "grad_norm": 1.948691487312317, + "learning_rate": 2.8210526315789476e-05, + "loss": 0.5288, + "step": 11977 + }, + { + "epoch": 15.376123234916559, + "grad_norm": 2.2061424255371094, + "learning_rate": 2.821009841677364e-05, + "loss": 0.5537, + "step": 11978 + }, + { + "epoch": 15.377406931964057, + "grad_norm": 2.1024718284606934, + "learning_rate": 2.820967051775781e-05, + "loss": 0.5496, + "step": 11979 + }, + { + "epoch": 15.378690629011553, + "grad_norm": 1.4977744817733765, + "learning_rate": 2.8209242618741978e-05, + "loss": 0.5565, + "step": 11980 + }, + { + "epoch": 15.37997432605905, + "grad_norm": 2.642709255218506, + 
"learning_rate": 2.8208814719726143e-05, + "loss": 0.5908, + "step": 11981 + }, + { + "epoch": 15.381258023106547, + "grad_norm": 1.6546893119812012, + "learning_rate": 2.8208386820710315e-05, + "loss": 0.5846, + "step": 11982 + }, + { + "epoch": 15.382541720154043, + "grad_norm": 2.5143980979919434, + "learning_rate": 2.820795892169448e-05, + "loss": 0.607, + "step": 11983 + }, + { + "epoch": 15.38382541720154, + "grad_norm": 2.940042018890381, + "learning_rate": 2.820753102267865e-05, + "loss": 0.5412, + "step": 11984 + }, + { + "epoch": 15.385109114249037, + "grad_norm": 1.699171781539917, + "learning_rate": 2.8207103123662817e-05, + "loss": 0.7126, + "step": 11985 + }, + { + "epoch": 15.386392811296535, + "grad_norm": 1.060741662979126, + "learning_rate": 2.8206675224646985e-05, + "loss": 0.4575, + "step": 11986 + }, + { + "epoch": 15.38767650834403, + "grad_norm": 1.9501750469207764, + "learning_rate": 2.820624732563115e-05, + "loss": 0.4741, + "step": 11987 + }, + { + "epoch": 15.388960205391527, + "grad_norm": 1.0215022563934326, + "learning_rate": 2.820581942661532e-05, + "loss": 0.4879, + "step": 11988 + }, + { + "epoch": 15.390243902439025, + "grad_norm": 1.1275819540023804, + "learning_rate": 2.8205391527599487e-05, + "loss": 0.4763, + "step": 11989 + }, + { + "epoch": 15.39152759948652, + "grad_norm": 1.8185900449752808, + "learning_rate": 2.8204963628583656e-05, + "loss": 0.5221, + "step": 11990 + }, + { + "epoch": 15.392811296534019, + "grad_norm": 0.9697922468185425, + "learning_rate": 2.8204535729567824e-05, + "loss": 0.4875, + "step": 11991 + }, + { + "epoch": 15.394094993581515, + "grad_norm": 4.814516544342041, + "learning_rate": 2.820410783055199e-05, + "loss": 0.4874, + "step": 11992 + }, + { + "epoch": 15.39537869062901, + "grad_norm": 1.3274328708648682, + "learning_rate": 2.820367993153616e-05, + "loss": 0.5279, + "step": 11993 + }, + { + "epoch": 15.396662387676509, + "grad_norm": 1.07354736328125, + "learning_rate": 2.8203252032520326e-05, 
+ "loss": 0.517, + "step": 11994 + }, + { + "epoch": 15.397946084724005, + "grad_norm": 1.1272056102752686, + "learning_rate": 2.820282413350449e-05, + "loss": 0.5072, + "step": 11995 + }, + { + "epoch": 15.399229781771503, + "grad_norm": 2.1261630058288574, + "learning_rate": 2.8202396234488663e-05, + "loss": 0.4865, + "step": 11996 + }, + { + "epoch": 15.400513478818999, + "grad_norm": 1.1066206693649292, + "learning_rate": 2.8201968335472828e-05, + "loss": 0.4943, + "step": 11997 + }, + { + "epoch": 15.401797175866495, + "grad_norm": 1.7627867460250854, + "learning_rate": 2.8201540436457e-05, + "loss": 0.5011, + "step": 11998 + }, + { + "epoch": 15.403080872913993, + "grad_norm": 1.6723603010177612, + "learning_rate": 2.8201112537441165e-05, + "loss": 0.5098, + "step": 11999 + }, + { + "epoch": 15.404364569961489, + "grad_norm": 1.1644343137741089, + "learning_rate": 2.8200684638425333e-05, + "loss": 0.5, + "step": 12000 + }, + { + "epoch": 15.404364569961489, + "eval_cer": 0.283728216173879, + "eval_loss": 0.5234231352806091, + "eval_runtime": 13.7523, + "eval_samples_per_second": 71.479, + "eval_steps_per_second": 0.509, + "eval_wer": 0.507223113964687, + "step": 12000 + }, + { + "epoch": 15.405648267008987, + "grad_norm": 3.153353452682495, + "learning_rate": 2.82002567394095e-05, + "loss": 0.5361, + "step": 12001 + }, + { + "epoch": 15.406931964056483, + "grad_norm": 0.9845522046089172, + "learning_rate": 2.8199828840393666e-05, + "loss": 0.5044, + "step": 12002 + }, + { + "epoch": 15.408215661103979, + "grad_norm": 2.104848623275757, + "learning_rate": 2.8199400941377835e-05, + "loss": 0.4924, + "step": 12003 + }, + { + "epoch": 15.409499358151477, + "grad_norm": 2.1080729961395264, + "learning_rate": 2.8198973042362003e-05, + "loss": 0.5661, + "step": 12004 + }, + { + "epoch": 15.410783055198973, + "grad_norm": 1.4094656705856323, + "learning_rate": 2.819854514334617e-05, + "loss": 0.5198, + "step": 12005 + }, + { + "epoch": 15.41206675224647, + 
"grad_norm": 2.838134527206421, + "learning_rate": 2.819811724433034e-05, + "loss": 0.5225, + "step": 12006 + }, + { + "epoch": 15.413350449293967, + "grad_norm": 1.226545810699463, + "learning_rate": 2.819768934531451e-05, + "loss": 0.4861, + "step": 12007 + }, + { + "epoch": 15.414634146341463, + "grad_norm": 1.1774624586105347, + "learning_rate": 2.8197261446298673e-05, + "loss": 0.506, + "step": 12008 + }, + { + "epoch": 15.41591784338896, + "grad_norm": 2.7705695629119873, + "learning_rate": 2.8196833547282842e-05, + "loss": 0.5207, + "step": 12009 + }, + { + "epoch": 15.417201540436457, + "grad_norm": 4.196287155151367, + "learning_rate": 2.819640564826701e-05, + "loss": 0.5604, + "step": 12010 + }, + { + "epoch": 15.418485237483953, + "grad_norm": 3.2954158782958984, + "learning_rate": 2.8195977749251175e-05, + "loss": 0.4456, + "step": 12011 + }, + { + "epoch": 15.41976893453145, + "grad_norm": 1.1086331605911255, + "learning_rate": 2.8195549850235347e-05, + "loss": 0.4721, + "step": 12012 + }, + { + "epoch": 15.421052631578947, + "grad_norm": 0.9227104187011719, + "learning_rate": 2.8195121951219512e-05, + "loss": 0.5183, + "step": 12013 + }, + { + "epoch": 15.422336328626445, + "grad_norm": 2.664574146270752, + "learning_rate": 2.819469405220368e-05, + "loss": 0.516, + "step": 12014 + }, + { + "epoch": 15.42362002567394, + "grad_norm": 2.168482780456543, + "learning_rate": 2.819426615318785e-05, + "loss": 0.5063, + "step": 12015 + }, + { + "epoch": 15.424903722721437, + "grad_norm": 1.2908302545547485, + "learning_rate": 2.8193838254172014e-05, + "loss": 0.5237, + "step": 12016 + }, + { + "epoch": 15.426187419768935, + "grad_norm": 1.6827203035354614, + "learning_rate": 2.8193410355156186e-05, + "loss": 0.5215, + "step": 12017 + }, + { + "epoch": 15.427471116816431, + "grad_norm": 0.9917140603065491, + "learning_rate": 2.819298245614035e-05, + "loss": 0.5623, + "step": 12018 + }, + { + "epoch": 15.428754813863929, + "grad_norm": 3.390699863433838, + 
"learning_rate": 2.819255455712452e-05, + "loss": 0.5262, + "step": 12019 + }, + { + "epoch": 15.430038510911425, + "grad_norm": 8.495696067810059, + "learning_rate": 2.8192126658108688e-05, + "loss": 0.5242, + "step": 12020 + }, + { + "epoch": 15.431322207958921, + "grad_norm": 1.497403860092163, + "learning_rate": 2.8191698759092853e-05, + "loss": 0.5521, + "step": 12021 + }, + { + "epoch": 15.432605905006419, + "grad_norm": 1.5923616886138916, + "learning_rate": 2.8191270860077024e-05, + "loss": 0.5353, + "step": 12022 + }, + { + "epoch": 15.433889602053915, + "grad_norm": 3.98649263381958, + "learning_rate": 2.819084296106119e-05, + "loss": 0.5501, + "step": 12023 + }, + { + "epoch": 15.435173299101413, + "grad_norm": 2.1904947757720947, + "learning_rate": 2.8190415062045358e-05, + "loss": 0.4929, + "step": 12024 + }, + { + "epoch": 15.436456996148909, + "grad_norm": 1.5288983583450317, + "learning_rate": 2.8189987163029526e-05, + "loss": 0.4641, + "step": 12025 + }, + { + "epoch": 15.437740693196405, + "grad_norm": 1.3625458478927612, + "learning_rate": 2.8189559264013695e-05, + "loss": 0.508, + "step": 12026 + }, + { + "epoch": 15.439024390243903, + "grad_norm": 2.6802866458892822, + "learning_rate": 2.818913136499786e-05, + "loss": 0.6053, + "step": 12027 + }, + { + "epoch": 15.440308087291399, + "grad_norm": 1.695065975189209, + "learning_rate": 2.8188703465982028e-05, + "loss": 0.5758, + "step": 12028 + }, + { + "epoch": 15.441591784338897, + "grad_norm": 2.0539710521698, + "learning_rate": 2.8188275566966197e-05, + "loss": 0.5492, + "step": 12029 + }, + { + "epoch": 15.442875481386393, + "grad_norm": 3.131920337677002, + "learning_rate": 2.8187847667950365e-05, + "loss": 0.5923, + "step": 12030 + }, + { + "epoch": 15.444159178433889, + "grad_norm": 1.9690256118774414, + "learning_rate": 2.8187419768934533e-05, + "loss": 0.5571, + "step": 12031 + }, + { + "epoch": 15.445442875481387, + "grad_norm": 1.854146122932434, + "learning_rate": 
2.81869918699187e-05, + "loss": 0.4957, + "step": 12032 + }, + { + "epoch": 15.446726572528883, + "grad_norm": 1.9379738569259644, + "learning_rate": 2.818656397090287e-05, + "loss": 0.5937, + "step": 12033 + }, + { + "epoch": 15.44801026957638, + "grad_norm": 1.7759655714035034, + "learning_rate": 2.8186136071887035e-05, + "loss": 0.6155, + "step": 12034 + }, + { + "epoch": 15.449293966623877, + "grad_norm": 2.244986057281494, + "learning_rate": 2.81857081728712e-05, + "loss": 0.7668, + "step": 12035 + }, + { + "epoch": 15.450577663671373, + "grad_norm": 1.2953178882598877, + "learning_rate": 2.8185280273855372e-05, + "loss": 0.4677, + "step": 12036 + }, + { + "epoch": 15.45186136071887, + "grad_norm": 1.0399430990219116, + "learning_rate": 2.8184852374839537e-05, + "loss": 0.5043, + "step": 12037 + }, + { + "epoch": 15.453145057766367, + "grad_norm": 1.2889647483825684, + "learning_rate": 2.818442447582371e-05, + "loss": 0.51, + "step": 12038 + }, + { + "epoch": 15.454428754813865, + "grad_norm": 0.903131902217865, + "learning_rate": 2.8183996576807874e-05, + "loss": 0.4989, + "step": 12039 + }, + { + "epoch": 15.455712451861361, + "grad_norm": 1.1397567987442017, + "learning_rate": 2.8183568677792042e-05, + "loss": 0.4833, + "step": 12040 + }, + { + "epoch": 15.456996148908857, + "grad_norm": 1.5786973237991333, + "learning_rate": 2.818314077877621e-05, + "loss": 0.5079, + "step": 12041 + }, + { + "epoch": 15.458279845956355, + "grad_norm": 1.2985827922821045, + "learning_rate": 2.8182712879760376e-05, + "loss": 0.4836, + "step": 12042 + }, + { + "epoch": 15.459563543003851, + "grad_norm": 1.695756196975708, + "learning_rate": 2.8182284980744544e-05, + "loss": 0.5, + "step": 12043 + }, + { + "epoch": 15.460847240051347, + "grad_norm": 0.997005820274353, + "learning_rate": 2.8181857081728713e-05, + "loss": 0.4986, + "step": 12044 + }, + { + "epoch": 15.462130937098845, + "grad_norm": 1.6225666999816895, + "learning_rate": 2.818142918271288e-05, + "loss": 0.503, + 
"step": 12045 + }, + { + "epoch": 15.463414634146341, + "grad_norm": 1.8518685102462769, + "learning_rate": 2.818100128369705e-05, + "loss": 0.5209, + "step": 12046 + }, + { + "epoch": 15.464698331193839, + "grad_norm": 4.814833164215088, + "learning_rate": 2.8180573384681218e-05, + "loss": 0.4776, + "step": 12047 + }, + { + "epoch": 15.465982028241335, + "grad_norm": 7.717833042144775, + "learning_rate": 2.8180145485665383e-05, + "loss": 0.5168, + "step": 12048 + }, + { + "epoch": 15.467265725288831, + "grad_norm": 1.3002103567123413, + "learning_rate": 2.817971758664955e-05, + "loss": 0.5033, + "step": 12049 + }, + { + "epoch": 15.468549422336329, + "grad_norm": 1.0791442394256592, + "learning_rate": 2.817928968763372e-05, + "loss": 0.5127, + "step": 12050 + }, + { + "epoch": 15.469833119383825, + "grad_norm": 1.8101726770401, + "learning_rate": 2.8178861788617885e-05, + "loss": 0.4974, + "step": 12051 + }, + { + "epoch": 15.471116816431323, + "grad_norm": 2.2053306102752686, + "learning_rate": 2.8178433889602056e-05, + "loss": 0.539, + "step": 12052 + }, + { + "epoch": 15.472400513478819, + "grad_norm": 2.0642600059509277, + "learning_rate": 2.817800599058622e-05, + "loss": 0.4725, + "step": 12053 + }, + { + "epoch": 15.473684210526315, + "grad_norm": 1.731697678565979, + "learning_rate": 2.817757809157039e-05, + "loss": 0.482, + "step": 12054 + }, + { + "epoch": 15.474967907573813, + "grad_norm": 1.3759390115737915, + "learning_rate": 2.817715019255456e-05, + "loss": 0.4975, + "step": 12055 + }, + { + "epoch": 15.476251604621309, + "grad_norm": 1.2047654390335083, + "learning_rate": 2.8176722293538723e-05, + "loss": 0.5138, + "step": 12056 + }, + { + "epoch": 15.477535301668807, + "grad_norm": 3.114410400390625, + "learning_rate": 2.8176294394522895e-05, + "loss": 0.5077, + "step": 12057 + }, + { + "epoch": 15.478818998716303, + "grad_norm": 1.2316042184829712, + "learning_rate": 2.817586649550706e-05, + "loss": 0.5575, + "step": 12058 + }, + { + "epoch": 
15.480102695763799, + "grad_norm": 1.5820790529251099, + "learning_rate": 2.817543859649123e-05, + "loss": 0.5173, + "step": 12059 + }, + { + "epoch": 15.481386392811297, + "grad_norm": 1.958678960800171, + "learning_rate": 2.8175010697475397e-05, + "loss": 0.4835, + "step": 12060 + }, + { + "epoch": 15.482670089858793, + "grad_norm": 1.5304077863693237, + "learning_rate": 2.8174582798459565e-05, + "loss": 0.4556, + "step": 12061 + }, + { + "epoch": 15.48395378690629, + "grad_norm": 2.2938599586486816, + "learning_rate": 2.817415489944373e-05, + "loss": 0.474, + "step": 12062 + }, + { + "epoch": 15.485237483953787, + "grad_norm": 1.6337734460830688, + "learning_rate": 2.81737270004279e-05, + "loss": 0.5483, + "step": 12063 + }, + { + "epoch": 15.486521181001283, + "grad_norm": 1.3612487316131592, + "learning_rate": 2.8173299101412067e-05, + "loss": 0.4641, + "step": 12064 + }, + { + "epoch": 15.487804878048781, + "grad_norm": 1.7055459022521973, + "learning_rate": 2.8172871202396236e-05, + "loss": 0.4956, + "step": 12065 + }, + { + "epoch": 15.489088575096277, + "grad_norm": 1.035056471824646, + "learning_rate": 2.8172443303380404e-05, + "loss": 0.4733, + "step": 12066 + }, + { + "epoch": 15.490372272143775, + "grad_norm": 2.2182438373565674, + "learning_rate": 2.817201540436457e-05, + "loss": 0.4988, + "step": 12067 + }, + { + "epoch": 15.491655969191271, + "grad_norm": 2.2361512184143066, + "learning_rate": 2.817158750534874e-05, + "loss": 0.5335, + "step": 12068 + }, + { + "epoch": 15.492939666238767, + "grad_norm": 1.4329413175582886, + "learning_rate": 2.8171159606332906e-05, + "loss": 0.5459, + "step": 12069 + }, + { + "epoch": 15.494223363286265, + "grad_norm": 2.527391195297241, + "learning_rate": 2.817073170731707e-05, + "loss": 0.5247, + "step": 12070 + }, + { + "epoch": 15.495507060333761, + "grad_norm": 1.746047019958496, + "learning_rate": 2.8170303808301243e-05, + "loss": 0.4922, + "step": 12071 + }, + { + "epoch": 15.496790757381259, + "grad_norm": 
2.1316561698913574, + "learning_rate": 2.8169875909285408e-05, + "loss": 0.5535, + "step": 12072 + }, + { + "epoch": 15.498074454428755, + "grad_norm": 1.5285626649856567, + "learning_rate": 2.816944801026958e-05, + "loss": 0.5083, + "step": 12073 + }, + { + "epoch": 15.499358151476251, + "grad_norm": 1.1988617181777954, + "learning_rate": 2.8169020111253745e-05, + "loss": 0.5033, + "step": 12074 + }, + { + "epoch": 15.500641848523749, + "grad_norm": 3.061223030090332, + "learning_rate": 2.8168592212237913e-05, + "loss": 0.556, + "step": 12075 + }, + { + "epoch": 15.501925545571245, + "grad_norm": 1.4093749523162842, + "learning_rate": 2.816816431322208e-05, + "loss": 0.5659, + "step": 12076 + }, + { + "epoch": 15.503209242618741, + "grad_norm": 2.340667486190796, + "learning_rate": 2.8167736414206246e-05, + "loss": 0.4807, + "step": 12077 + }, + { + "epoch": 15.504492939666239, + "grad_norm": 1.2752578258514404, + "learning_rate": 2.8167308515190415e-05, + "loss": 0.5412, + "step": 12078 + }, + { + "epoch": 15.505776636713735, + "grad_norm": 1.396818995475769, + "learning_rate": 2.8166880616174583e-05, + "loss": 0.5036, + "step": 12079 + }, + { + "epoch": 15.507060333761233, + "grad_norm": 16.361469268798828, + "learning_rate": 2.816645271715875e-05, + "loss": 0.541, + "step": 12080 + }, + { + "epoch": 15.508344030808729, + "grad_norm": 1.7332563400268555, + "learning_rate": 2.816602481814292e-05, + "loss": 0.6167, + "step": 12081 + }, + { + "epoch": 15.509627727856225, + "grad_norm": 6.038658142089844, + "learning_rate": 2.8165596919127085e-05, + "loss": 0.524, + "step": 12082 + }, + { + "epoch": 15.510911424903723, + "grad_norm": 6.419640064239502, + "learning_rate": 2.8165169020111254e-05, + "loss": 0.6051, + "step": 12083 + }, + { + "epoch": 15.512195121951219, + "grad_norm": 3.4525609016418457, + "learning_rate": 2.8164741121095422e-05, + "loss": 0.6423, + "step": 12084 + }, + { + "epoch": 15.513478818998717, + "grad_norm": 1.875169277191162, + 
"learning_rate": 2.816431322207959e-05, + "loss": 0.705, + "step": 12085 + }, + { + "epoch": 15.514762516046213, + "grad_norm": 0.9788851141929626, + "learning_rate": 2.8163885323063755e-05, + "loss": 0.479, + "step": 12086 + }, + { + "epoch": 15.51604621309371, + "grad_norm": 1.3155003786087036, + "learning_rate": 2.8163457424047927e-05, + "loss": 0.4514, + "step": 12087 + }, + { + "epoch": 15.517329910141207, + "grad_norm": 1.7340071201324463, + "learning_rate": 2.8163029525032092e-05, + "loss": 0.5247, + "step": 12088 + }, + { + "epoch": 15.518613607188703, + "grad_norm": 2.9835476875305176, + "learning_rate": 2.816260162601626e-05, + "loss": 0.4942, + "step": 12089 + }, + { + "epoch": 15.519897304236201, + "grad_norm": 1.1601531505584717, + "learning_rate": 2.816217372700043e-05, + "loss": 0.4842, + "step": 12090 + }, + { + "epoch": 15.521181001283697, + "grad_norm": 2.198068380355835, + "learning_rate": 2.8161745827984594e-05, + "loss": 0.5289, + "step": 12091 + }, + { + "epoch": 15.522464698331193, + "grad_norm": 1.30864679813385, + "learning_rate": 2.8161317928968766e-05, + "loss": 0.507, + "step": 12092 + }, + { + "epoch": 15.523748395378691, + "grad_norm": 1.5244693756103516, + "learning_rate": 2.816089002995293e-05, + "loss": 0.5244, + "step": 12093 + }, + { + "epoch": 15.525032092426187, + "grad_norm": 0.8869876265525818, + "learning_rate": 2.81604621309371e-05, + "loss": 0.5243, + "step": 12094 + }, + { + "epoch": 15.526315789473685, + "grad_norm": 4.191350936889648, + "learning_rate": 2.8160034231921268e-05, + "loss": 0.485, + "step": 12095 + }, + { + "epoch": 15.527599486521181, + "grad_norm": 1.285295009613037, + "learning_rate": 2.8159606332905433e-05, + "loss": 0.5095, + "step": 12096 + }, + { + "epoch": 15.528883183568677, + "grad_norm": 1.0642839670181274, + "learning_rate": 2.8159178433889605e-05, + "loss": 0.5062, + "step": 12097 + }, + { + "epoch": 15.530166880616175, + "grad_norm": 3.792065143585205, + "learning_rate": 2.815875053487377e-05, 
+ "loss": 0.4893, + "step": 12098 + }, + { + "epoch": 15.531450577663671, + "grad_norm": 1.5490567684173584, + "learning_rate": 2.8158322635857938e-05, + "loss": 0.4927, + "step": 12099 + }, + { + "epoch": 15.532734274711169, + "grad_norm": 1.6586532592773438, + "learning_rate": 2.8157894736842106e-05, + "loss": 0.4863, + "step": 12100 + }, + { + "epoch": 15.534017971758665, + "grad_norm": 1.1075645685195923, + "learning_rate": 2.8157466837826275e-05, + "loss": 0.5083, + "step": 12101 + }, + { + "epoch": 15.535301668806161, + "grad_norm": 2.062610626220703, + "learning_rate": 2.815703893881044e-05, + "loss": 0.5183, + "step": 12102 + }, + { + "epoch": 15.536585365853659, + "grad_norm": 11.832595825195312, + "learning_rate": 2.8156611039794608e-05, + "loss": 0.4835, + "step": 12103 + }, + { + "epoch": 15.537869062901155, + "grad_norm": 1.9292209148406982, + "learning_rate": 2.8156183140778777e-05, + "loss": 0.4753, + "step": 12104 + }, + { + "epoch": 15.539152759948653, + "grad_norm": 0.8866629004478455, + "learning_rate": 2.8155755241762945e-05, + "loss": 0.4806, + "step": 12105 + }, + { + "epoch": 15.540436456996149, + "grad_norm": 1.1092398166656494, + "learning_rate": 2.8155327342747113e-05, + "loss": 0.4996, + "step": 12106 + }, + { + "epoch": 15.541720154043645, + "grad_norm": 1.263209581375122, + "learning_rate": 2.815489944373128e-05, + "loss": 0.4869, + "step": 12107 + }, + { + "epoch": 15.543003851091143, + "grad_norm": 1.5344493389129639, + "learning_rate": 2.815447154471545e-05, + "loss": 0.4723, + "step": 12108 + }, + { + "epoch": 15.544287548138639, + "grad_norm": 1.9046069383621216, + "learning_rate": 2.8154043645699615e-05, + "loss": 0.5189, + "step": 12109 + }, + { + "epoch": 15.545571245186135, + "grad_norm": 1.1488643884658813, + "learning_rate": 2.815361574668378e-05, + "loss": 0.4376, + "step": 12110 + }, + { + "epoch": 15.546854942233633, + "grad_norm": 1.6366236209869385, + "learning_rate": 2.8153187847667952e-05, + "loss": 0.4697, + "step": 
12111 + }, + { + "epoch": 15.54813863928113, + "grad_norm": 1.756325602531433, + "learning_rate": 2.8152759948652117e-05, + "loss": 0.4833, + "step": 12112 + }, + { + "epoch": 15.549422336328627, + "grad_norm": 1.4460484981536865, + "learning_rate": 2.815233204963629e-05, + "loss": 0.5358, + "step": 12113 + }, + { + "epoch": 15.550706033376123, + "grad_norm": 1.562567949295044, + "learning_rate": 2.8151904150620454e-05, + "loss": 0.536, + "step": 12114 + }, + { + "epoch": 15.55198973042362, + "grad_norm": 0.9893152713775635, + "learning_rate": 2.8151476251604622e-05, + "loss": 0.5123, + "step": 12115 + }, + { + "epoch": 15.553273427471117, + "grad_norm": 1.3824961185455322, + "learning_rate": 2.815104835258879e-05, + "loss": 0.483, + "step": 12116 + }, + { + "epoch": 15.554557124518613, + "grad_norm": 1.92072331905365, + "learning_rate": 2.8150620453572956e-05, + "loss": 0.5016, + "step": 12117 + }, + { + "epoch": 15.555840821566111, + "grad_norm": 2.8720054626464844, + "learning_rate": 2.8150192554557124e-05, + "loss": 0.4717, + "step": 12118 + }, + { + "epoch": 15.557124518613607, + "grad_norm": 1.8361296653747559, + "learning_rate": 2.8149764655541293e-05, + "loss": 0.4867, + "step": 12119 + }, + { + "epoch": 15.558408215661103, + "grad_norm": 2.289026975631714, + "learning_rate": 2.814933675652546e-05, + "loss": 0.5179, + "step": 12120 + }, + { + "epoch": 15.559691912708601, + "grad_norm": 1.5158741474151611, + "learning_rate": 2.814890885750963e-05, + "loss": 0.5258, + "step": 12121 + }, + { + "epoch": 15.560975609756097, + "grad_norm": 2.8660027980804443, + "learning_rate": 2.8148480958493798e-05, + "loss": 0.5039, + "step": 12122 + }, + { + "epoch": 15.562259306803595, + "grad_norm": 2.2315402030944824, + "learning_rate": 2.8148053059477963e-05, + "loss": 0.5038, + "step": 12123 + }, + { + "epoch": 15.563543003851091, + "grad_norm": 8.662510871887207, + "learning_rate": 2.814762516046213e-05, + "loss": 0.5392, + "step": 12124 + }, + { + "epoch": 
15.564826700898587, + "grad_norm": 1.2320451736450195, + "learning_rate": 2.81471972614463e-05, + "loss": 0.5109, + "step": 12125 + }, + { + "epoch": 15.566110397946085, + "grad_norm": 2.1444365978240967, + "learning_rate": 2.8146769362430465e-05, + "loss": 0.5276, + "step": 12126 + }, + { + "epoch": 15.567394094993581, + "grad_norm": 2.231480121612549, + "learning_rate": 2.8146341463414637e-05, + "loss": 0.5836, + "step": 12127 + }, + { + "epoch": 15.568677792041079, + "grad_norm": 1.4329543113708496, + "learning_rate": 2.81459135643988e-05, + "loss": 0.584, + "step": 12128 + }, + { + "epoch": 15.569961489088575, + "grad_norm": 1.4577308893203735, + "learning_rate": 2.8145485665382973e-05, + "loss": 0.4696, + "step": 12129 + }, + { + "epoch": 15.571245186136071, + "grad_norm": 1.8035598993301392, + "learning_rate": 2.814505776636714e-05, + "loss": 0.5145, + "step": 12130 + }, + { + "epoch": 15.572528883183569, + "grad_norm": 5.860340118408203, + "learning_rate": 2.8144629867351303e-05, + "loss": 0.5546, + "step": 12131 + }, + { + "epoch": 15.573812580231065, + "grad_norm": 3.371000051498413, + "learning_rate": 2.8144201968335475e-05, + "loss": 0.5631, + "step": 12132 + }, + { + "epoch": 15.575096277278563, + "grad_norm": 2.1899356842041016, + "learning_rate": 2.814377406931964e-05, + "loss": 0.642, + "step": 12133 + }, + { + "epoch": 15.57637997432606, + "grad_norm": 1.9475961923599243, + "learning_rate": 2.814334617030381e-05, + "loss": 0.6077, + "step": 12134 + }, + { + "epoch": 15.577663671373555, + "grad_norm": 2.436836004257202, + "learning_rate": 2.8142918271287977e-05, + "loss": 0.7018, + "step": 12135 + }, + { + "epoch": 15.578947368421053, + "grad_norm": 0.8654265403747559, + "learning_rate": 2.8142490372272145e-05, + "loss": 0.4672, + "step": 12136 + }, + { + "epoch": 15.58023106546855, + "grad_norm": 1.1113406419754028, + "learning_rate": 2.8142062473256314e-05, + "loss": 0.4702, + "step": 12137 + }, + { + "epoch": 15.581514762516047, + "grad_norm": 
2.1897244453430176, + "learning_rate": 2.814163457424048e-05, + "loss": 0.5034, + "step": 12138 + }, + { + "epoch": 15.582798459563543, + "grad_norm": 1.2455284595489502, + "learning_rate": 2.8141206675224647e-05, + "loss": 0.5264, + "step": 12139 + }, + { + "epoch": 15.58408215661104, + "grad_norm": 0.9337416887283325, + "learning_rate": 2.8140778776208816e-05, + "loss": 0.468, + "step": 12140 + }, + { + "epoch": 15.585365853658537, + "grad_norm": 1.4337294101715088, + "learning_rate": 2.8140350877192984e-05, + "loss": 0.5261, + "step": 12141 + }, + { + "epoch": 15.586649550706033, + "grad_norm": 0.9952126741409302, + "learning_rate": 2.813992297817715e-05, + "loss": 0.4708, + "step": 12142 + }, + { + "epoch": 15.58793324775353, + "grad_norm": 1.0105079412460327, + "learning_rate": 2.8139495079161318e-05, + "loss": 0.4703, + "step": 12143 + }, + { + "epoch": 15.589216944801027, + "grad_norm": 2.769618272781372, + "learning_rate": 2.8139067180145486e-05, + "loss": 0.4803, + "step": 12144 + }, + { + "epoch": 15.590500641848523, + "grad_norm": 1.831730842590332, + "learning_rate": 2.8138639281129654e-05, + "loss": 0.4939, + "step": 12145 + }, + { + "epoch": 15.591784338896021, + "grad_norm": 1.965525507926941, + "learning_rate": 2.8138211382113823e-05, + "loss": 0.4857, + "step": 12146 + }, + { + "epoch": 15.593068035943517, + "grad_norm": 2.145247220993042, + "learning_rate": 2.8137783483097988e-05, + "loss": 0.5505, + "step": 12147 + }, + { + "epoch": 15.594351732991013, + "grad_norm": 1.2301855087280273, + "learning_rate": 2.813735558408216e-05, + "loss": 0.4823, + "step": 12148 + }, + { + "epoch": 15.595635430038511, + "grad_norm": 1.1658424139022827, + "learning_rate": 2.8136927685066325e-05, + "loss": 0.4962, + "step": 12149 + }, + { + "epoch": 15.596919127086007, + "grad_norm": 1.0624656677246094, + "learning_rate": 2.813649978605049e-05, + "loss": 0.4728, + "step": 12150 + }, + { + "epoch": 15.598202824133505, + "grad_norm": 1.6185587644577026, + 
"learning_rate": 2.813607188703466e-05, + "loss": 0.5248, + "step": 12151 + }, + { + "epoch": 15.599486521181001, + "grad_norm": 6.152905464172363, + "learning_rate": 2.8135643988018827e-05, + "loss": 0.5085, + "step": 12152 + }, + { + "epoch": 15.600770218228497, + "grad_norm": 4.6425018310546875, + "learning_rate": 2.8135216089002998e-05, + "loss": 0.4928, + "step": 12153 + }, + { + "epoch": 15.602053915275995, + "grad_norm": 2.444579839706421, + "learning_rate": 2.8134788189987163e-05, + "loss": 0.5332, + "step": 12154 + }, + { + "epoch": 15.603337612323491, + "grad_norm": 1.1513117551803589, + "learning_rate": 2.8134360290971332e-05, + "loss": 0.4936, + "step": 12155 + }, + { + "epoch": 15.60462130937099, + "grad_norm": 1.0557674169540405, + "learning_rate": 2.81339323919555e-05, + "loss": 0.4762, + "step": 12156 + }, + { + "epoch": 15.605905006418485, + "grad_norm": 1.7507745027542114, + "learning_rate": 2.8133504492939665e-05, + "loss": 0.4824, + "step": 12157 + }, + { + "epoch": 15.607188703465981, + "grad_norm": 1.217865228652954, + "learning_rate": 2.8133076593923834e-05, + "loss": 0.5271, + "step": 12158 + }, + { + "epoch": 15.60847240051348, + "grad_norm": 1.8688100576400757, + "learning_rate": 2.8132648694908002e-05, + "loss": 0.4941, + "step": 12159 + }, + { + "epoch": 15.609756097560975, + "grad_norm": 2.6156208515167236, + "learning_rate": 2.813222079589217e-05, + "loss": 0.4863, + "step": 12160 + }, + { + "epoch": 15.611039794608473, + "grad_norm": 1.8995386362075806, + "learning_rate": 2.813179289687634e-05, + "loss": 0.5261, + "step": 12161 + }, + { + "epoch": 15.61232349165597, + "grad_norm": 1.2305271625518799, + "learning_rate": 2.8131364997860507e-05, + "loss": 0.5805, + "step": 12162 + }, + { + "epoch": 15.613607188703465, + "grad_norm": 0.987743616104126, + "learning_rate": 2.8130937098844672e-05, + "loss": 0.4751, + "step": 12163 + }, + { + "epoch": 15.614890885750963, + "grad_norm": 1.4083319902420044, + "learning_rate": 
2.813050919982884e-05, + "loss": 0.5078, + "step": 12164 + }, + { + "epoch": 15.61617458279846, + "grad_norm": 1.5700865983963013, + "learning_rate": 2.813008130081301e-05, + "loss": 0.5189, + "step": 12165 + }, + { + "epoch": 15.617458279845957, + "grad_norm": 2.030709743499756, + "learning_rate": 2.8129653401797174e-05, + "loss": 0.4813, + "step": 12166 + }, + { + "epoch": 15.618741976893453, + "grad_norm": 1.964241862297058, + "learning_rate": 2.8129225502781346e-05, + "loss": 0.5525, + "step": 12167 + }, + { + "epoch": 15.62002567394095, + "grad_norm": 1.4321285486221313, + "learning_rate": 2.812879760376551e-05, + "loss": 0.487, + "step": 12168 + }, + { + "epoch": 15.621309370988447, + "grad_norm": 2.080906867980957, + "learning_rate": 2.8128369704749683e-05, + "loss": 0.5722, + "step": 12169 + }, + { + "epoch": 15.622593068035943, + "grad_norm": 1.5897401571273804, + "learning_rate": 2.8127941805733848e-05, + "loss": 0.4897, + "step": 12170 + }, + { + "epoch": 15.623876765083441, + "grad_norm": 3.7209856510162354, + "learning_rate": 2.8127513906718013e-05, + "loss": 0.5315, + "step": 12171 + }, + { + "epoch": 15.625160462130937, + "grad_norm": 1.2044377326965332, + "learning_rate": 2.8127086007702185e-05, + "loss": 0.5223, + "step": 12172 + }, + { + "epoch": 15.626444159178433, + "grad_norm": 1.527950644493103, + "learning_rate": 2.812665810868635e-05, + "loss": 0.5081, + "step": 12173 + }, + { + "epoch": 15.627727856225931, + "grad_norm": 2.1554884910583496, + "learning_rate": 2.8126230209670518e-05, + "loss": 0.5099, + "step": 12174 + }, + { + "epoch": 15.629011553273427, + "grad_norm": 3.0005245208740234, + "learning_rate": 2.8125802310654686e-05, + "loss": 0.4962, + "step": 12175 + }, + { + "epoch": 15.630295250320923, + "grad_norm": 2.0556581020355225, + "learning_rate": 2.8125374411638855e-05, + "loss": 0.4708, + "step": 12176 + }, + { + "epoch": 15.631578947368421, + "grad_norm": 12.82331371307373, + "learning_rate": 2.8124946512623023e-05, + "loss": 
0.537, + "step": 12177 + }, + { + "epoch": 15.632862644415917, + "grad_norm": 2.0832579135894775, + "learning_rate": 2.8124518613607188e-05, + "loss": 0.5248, + "step": 12178 + }, + { + "epoch": 15.634146341463415, + "grad_norm": 1.8251140117645264, + "learning_rate": 2.8124090714591357e-05, + "loss": 0.5827, + "step": 12179 + }, + { + "epoch": 15.635430038510911, + "grad_norm": 1.9957712888717651, + "learning_rate": 2.8123662815575525e-05, + "loss": 0.5659, + "step": 12180 + }, + { + "epoch": 15.63671373555841, + "grad_norm": 1.229722499847412, + "learning_rate": 2.8123234916559694e-05, + "loss": 0.588, + "step": 12181 + }, + { + "epoch": 15.637997432605905, + "grad_norm": 2.902531147003174, + "learning_rate": 2.812280701754386e-05, + "loss": 0.5908, + "step": 12182 + }, + { + "epoch": 15.639281129653401, + "grad_norm": 6.548379421234131, + "learning_rate": 2.812237911852803e-05, + "loss": 0.5775, + "step": 12183 + }, + { + "epoch": 15.6405648267009, + "grad_norm": 2.1419973373413086, + "learning_rate": 2.8121951219512195e-05, + "loss": 0.6053, + "step": 12184 + }, + { + "epoch": 15.641848523748395, + "grad_norm": 3.0608973503112793, + "learning_rate": 2.8121523320496364e-05, + "loss": 0.7162, + "step": 12185 + }, + { + "epoch": 15.643132220795891, + "grad_norm": 1.5756410360336304, + "learning_rate": 2.8121095421480532e-05, + "loss": 0.4752, + "step": 12186 + }, + { + "epoch": 15.64441591784339, + "grad_norm": 1.3909165859222412, + "learning_rate": 2.8120667522464697e-05, + "loss": 0.4785, + "step": 12187 + }, + { + "epoch": 15.645699614890885, + "grad_norm": 1.691736102104187, + "learning_rate": 2.812023962344887e-05, + "loss": 0.4999, + "step": 12188 + }, + { + "epoch": 15.646983311938383, + "grad_norm": 1.301695466041565, + "learning_rate": 2.8119811724433034e-05, + "loss": 0.5124, + "step": 12189 + }, + { + "epoch": 15.64826700898588, + "grad_norm": 2.372382640838623, + "learning_rate": 2.8119383825417202e-05, + "loss": 0.4945, + "step": 12190 + }, + { + 
"epoch": 15.649550706033375, + "grad_norm": 2.060445785522461, + "learning_rate": 2.811895592640137e-05, + "loss": 0.4725, + "step": 12191 + }, + { + "epoch": 15.650834403080873, + "grad_norm": 1.6594150066375732, + "learning_rate": 2.8118528027385536e-05, + "loss": 0.504, + "step": 12192 + }, + { + "epoch": 15.65211810012837, + "grad_norm": 3.5851728916168213, + "learning_rate": 2.8118100128369708e-05, + "loss": 0.4636, + "step": 12193 + }, + { + "epoch": 15.653401797175867, + "grad_norm": 6.231527805328369, + "learning_rate": 2.8117672229353873e-05, + "loss": 0.5089, + "step": 12194 + }, + { + "epoch": 15.654685494223363, + "grad_norm": 1.6732090711593628, + "learning_rate": 2.811724433033804e-05, + "loss": 0.5276, + "step": 12195 + }, + { + "epoch": 15.65596919127086, + "grad_norm": 1.2381113767623901, + "learning_rate": 2.811681643132221e-05, + "loss": 0.4392, + "step": 12196 + }, + { + "epoch": 15.657252888318357, + "grad_norm": 1.6970852613449097, + "learning_rate": 2.8116388532306378e-05, + "loss": 0.5135, + "step": 12197 + }, + { + "epoch": 15.658536585365853, + "grad_norm": 1.3893481492996216, + "learning_rate": 2.8115960633290543e-05, + "loss": 0.456, + "step": 12198 + }, + { + "epoch": 15.659820282413351, + "grad_norm": 1.8396137952804565, + "learning_rate": 2.811553273427471e-05, + "loss": 0.5116, + "step": 12199 + }, + { + "epoch": 15.661103979460847, + "grad_norm": 1.2064629793167114, + "learning_rate": 2.811510483525888e-05, + "loss": 0.475, + "step": 12200 + }, + { + "epoch": 15.662387676508343, + "grad_norm": 1.7040966749191284, + "learning_rate": 2.8114676936243048e-05, + "loss": 0.4844, + "step": 12201 + }, + { + "epoch": 15.663671373555841, + "grad_norm": 3.400346040725708, + "learning_rate": 2.8114249037227217e-05, + "loss": 0.4982, + "step": 12202 + }, + { + "epoch": 15.664955070603337, + "grad_norm": 1.8128135204315186, + "learning_rate": 2.811382113821138e-05, + "loss": 0.4589, + "step": 12203 + }, + { + "epoch": 15.666238767650835, + 
"grad_norm": 4.482548236846924, + "learning_rate": 2.811339323919555e-05, + "loss": 0.4849, + "step": 12204 + }, + { + "epoch": 15.667522464698331, + "grad_norm": 1.9206911325454712, + "learning_rate": 2.811296534017972e-05, + "loss": 0.4744, + "step": 12205 + }, + { + "epoch": 15.668806161745827, + "grad_norm": 1.1709412336349487, + "learning_rate": 2.8112537441163883e-05, + "loss": 0.4417, + "step": 12206 + }, + { + "epoch": 15.670089858793325, + "grad_norm": 1.6378353834152222, + "learning_rate": 2.8112109542148055e-05, + "loss": 0.4833, + "step": 12207 + }, + { + "epoch": 15.671373555840821, + "grad_norm": 1.78165602684021, + "learning_rate": 2.811168164313222e-05, + "loss": 0.4705, + "step": 12208 + }, + { + "epoch": 15.672657252888317, + "grad_norm": 1.8490931987762451, + "learning_rate": 2.8111253744116392e-05, + "loss": 0.4653, + "step": 12209 + }, + { + "epoch": 15.673940949935815, + "grad_norm": 3.801039218902588, + "learning_rate": 2.8110825845100557e-05, + "loss": 0.524, + "step": 12210 + }, + { + "epoch": 15.675224646983311, + "grad_norm": 3.983527421951294, + "learning_rate": 2.8110397946084722e-05, + "loss": 0.5266, + "step": 12211 + }, + { + "epoch": 15.67650834403081, + "grad_norm": 1.5652339458465576, + "learning_rate": 2.8109970047068894e-05, + "loss": 0.4749, + "step": 12212 + }, + { + "epoch": 15.677792041078305, + "grad_norm": 1.2292280197143555, + "learning_rate": 2.810954214805306e-05, + "loss": 0.4829, + "step": 12213 + }, + { + "epoch": 15.679075738125803, + "grad_norm": 2.621460199356079, + "learning_rate": 2.8109114249037227e-05, + "loss": 0.5377, + "step": 12214 + }, + { + "epoch": 15.6803594351733, + "grad_norm": 3.0253517627716064, + "learning_rate": 2.8108686350021396e-05, + "loss": 0.4863, + "step": 12215 + }, + { + "epoch": 15.681643132220795, + "grad_norm": 1.5688942670822144, + "learning_rate": 2.8108258451005564e-05, + "loss": 0.5108, + "step": 12216 + }, + { + "epoch": 15.682926829268293, + "grad_norm": 3.9883062839508057, + 
"learning_rate": 2.8107830551989733e-05, + "loss": 0.5081, + "step": 12217 + }, + { + "epoch": 15.68421052631579, + "grad_norm": 1.6215399503707886, + "learning_rate": 2.8107402652973898e-05, + "loss": 0.5249, + "step": 12218 + }, + { + "epoch": 15.685494223363285, + "grad_norm": 5.1793365478515625, + "learning_rate": 2.8106974753958066e-05, + "loss": 0.5022, + "step": 12219 + }, + { + "epoch": 15.686777920410783, + "grad_norm": 1.4996297359466553, + "learning_rate": 2.8106546854942234e-05, + "loss": 0.5678, + "step": 12220 + }, + { + "epoch": 15.68806161745828, + "grad_norm": 4.020941734313965, + "learning_rate": 2.8106118955926403e-05, + "loss": 0.5663, + "step": 12221 + }, + { + "epoch": 15.689345314505777, + "grad_norm": 3.9868264198303223, + "learning_rate": 2.8105691056910568e-05, + "loss": 0.5209, + "step": 12222 + }, + { + "epoch": 15.690629011553273, + "grad_norm": 5.05746603012085, + "learning_rate": 2.810526315789474e-05, + "loss": 0.5205, + "step": 12223 + }, + { + "epoch": 15.69191270860077, + "grad_norm": 1.943791389465332, + "learning_rate": 2.8104835258878905e-05, + "loss": 0.5149, + "step": 12224 + }, + { + "epoch": 15.693196405648267, + "grad_norm": 2.4514100551605225, + "learning_rate": 2.8104407359863073e-05, + "loss": 0.5504, + "step": 12225 + }, + { + "epoch": 15.694480102695763, + "grad_norm": 8.934173583984375, + "learning_rate": 2.810397946084724e-05, + "loss": 0.5043, + "step": 12226 + }, + { + "epoch": 15.695763799743261, + "grad_norm": 3.234398365020752, + "learning_rate": 2.8103551561831407e-05, + "loss": 0.5181, + "step": 12227 + }, + { + "epoch": 15.697047496790757, + "grad_norm": 1.1482112407684326, + "learning_rate": 2.810312366281558e-05, + "loss": 0.5371, + "step": 12228 + }, + { + "epoch": 15.698331193838253, + "grad_norm": 2.5546531677246094, + "learning_rate": 2.8102695763799743e-05, + "loss": 0.5434, + "step": 12229 + }, + { + "epoch": 15.699614890885751, + "grad_norm": 1.6591112613677979, + "learning_rate": 
2.8102267864783912e-05, + "loss": 0.5812, + "step": 12230 + }, + { + "epoch": 15.700898587933247, + "grad_norm": 3.9640369415283203, + "learning_rate": 2.810183996576808e-05, + "loss": 0.5283, + "step": 12231 + }, + { + "epoch": 15.702182284980745, + "grad_norm": 2.2445011138916016, + "learning_rate": 2.8101412066752245e-05, + "loss": 0.5508, + "step": 12232 + }, + { + "epoch": 15.703465982028241, + "grad_norm": 2.7682793140411377, + "learning_rate": 2.8100984167736417e-05, + "loss": 0.6032, + "step": 12233 + }, + { + "epoch": 15.704749679075737, + "grad_norm": 1.5916848182678223, + "learning_rate": 2.8100556268720582e-05, + "loss": 0.6376, + "step": 12234 + }, + { + "epoch": 15.706033376123235, + "grad_norm": 2.553229808807373, + "learning_rate": 2.810012836970475e-05, + "loss": 0.6482, + "step": 12235 + }, + { + "epoch": 15.707317073170731, + "grad_norm": 1.710379958152771, + "learning_rate": 2.809970047068892e-05, + "loss": 0.4741, + "step": 12236 + }, + { + "epoch": 15.70860077021823, + "grad_norm": 2.1954944133758545, + "learning_rate": 2.8099272571673087e-05, + "loss": 0.4606, + "step": 12237 + }, + { + "epoch": 15.709884467265725, + "grad_norm": 1.4748294353485107, + "learning_rate": 2.8098844672657252e-05, + "loss": 0.4548, + "step": 12238 + }, + { + "epoch": 15.711168164313221, + "grad_norm": 1.7596200704574585, + "learning_rate": 2.809841677364142e-05, + "loss": 0.5036, + "step": 12239 + }, + { + "epoch": 15.71245186136072, + "grad_norm": 1.5114623308181763, + "learning_rate": 2.809798887462559e-05, + "loss": 0.5083, + "step": 12240 + }, + { + "epoch": 15.713735558408215, + "grad_norm": 1.178555965423584, + "learning_rate": 2.8097560975609758e-05, + "loss": 0.4655, + "step": 12241 + }, + { + "epoch": 15.715019255455712, + "grad_norm": 2.677274703979492, + "learning_rate": 2.8097133076593926e-05, + "loss": 0.4903, + "step": 12242 + }, + { + "epoch": 15.71630295250321, + "grad_norm": 4.002655506134033, + "learning_rate": 2.809670517757809e-05, + "loss": 
0.5205, + "step": 12243 + }, + { + "epoch": 15.717586649550706, + "grad_norm": 1.3578464984893799, + "learning_rate": 2.8096277278562263e-05, + "loss": 0.4593, + "step": 12244 + }, + { + "epoch": 15.718870346598203, + "grad_norm": 1.7871589660644531, + "learning_rate": 2.8095849379546428e-05, + "loss": 0.4968, + "step": 12245 + }, + { + "epoch": 15.7201540436457, + "grad_norm": 1.1790410280227661, + "learning_rate": 2.8095421480530593e-05, + "loss": 0.4662, + "step": 12246 + }, + { + "epoch": 15.721437740693197, + "grad_norm": 1.9430601596832275, + "learning_rate": 2.8094993581514765e-05, + "loss": 0.4667, + "step": 12247 + }, + { + "epoch": 15.722721437740693, + "grad_norm": 1.16180419921875, + "learning_rate": 2.809456568249893e-05, + "loss": 0.4463, + "step": 12248 + }, + { + "epoch": 15.72400513478819, + "grad_norm": 1.2668042182922363, + "learning_rate": 2.80941377834831e-05, + "loss": 0.4978, + "step": 12249 + }, + { + "epoch": 15.725288831835687, + "grad_norm": 1.256252408027649, + "learning_rate": 2.8093709884467266e-05, + "loss": 0.5014, + "step": 12250 + }, + { + "epoch": 15.726572528883183, + "grad_norm": 1.0162678956985474, + "learning_rate": 2.8093281985451435e-05, + "loss": 0.495, + "step": 12251 + }, + { + "epoch": 15.72785622593068, + "grad_norm": 1.0912398099899292, + "learning_rate": 2.8092854086435603e-05, + "loss": 0.4985, + "step": 12252 + }, + { + "epoch": 15.729139922978177, + "grad_norm": 1.9973887205123901, + "learning_rate": 2.809242618741977e-05, + "loss": 0.4839, + "step": 12253 + }, + { + "epoch": 15.730423620025674, + "grad_norm": 2.1831607818603516, + "learning_rate": 2.8091998288403937e-05, + "loss": 0.4813, + "step": 12254 + }, + { + "epoch": 15.731707317073171, + "grad_norm": 3.59491229057312, + "learning_rate": 2.8091570389388105e-05, + "loss": 0.5134, + "step": 12255 + }, + { + "epoch": 15.732991014120667, + "grad_norm": 0.8248077034950256, + "learning_rate": 2.8091142490372274e-05, + "loss": 0.4812, + "step": 12256 + }, + { + 
"epoch": 15.734274711168164, + "grad_norm": 1.94291353225708, + "learning_rate": 2.809071459135644e-05, + "loss": 0.5548, + "step": 12257 + }, + { + "epoch": 15.735558408215661, + "grad_norm": 1.4434881210327148, + "learning_rate": 2.809028669234061e-05, + "loss": 0.4686, + "step": 12258 + }, + { + "epoch": 15.736842105263158, + "grad_norm": 3.08687424659729, + "learning_rate": 2.8089858793324775e-05, + "loss": 0.5324, + "step": 12259 + }, + { + "epoch": 15.738125802310655, + "grad_norm": 1.7515257596969604, + "learning_rate": 2.8089430894308944e-05, + "loss": 0.4912, + "step": 12260 + }, + { + "epoch": 15.739409499358151, + "grad_norm": 3.2055726051330566, + "learning_rate": 2.8089002995293112e-05, + "loss": 0.4922, + "step": 12261 + }, + { + "epoch": 15.740693196405648, + "grad_norm": 1.3159061670303345, + "learning_rate": 2.8088575096277277e-05, + "loss": 0.4898, + "step": 12262 + }, + { + "epoch": 15.741976893453145, + "grad_norm": 5.677592754364014, + "learning_rate": 2.808814719726145e-05, + "loss": 0.5007, + "step": 12263 + }, + { + "epoch": 15.743260590500642, + "grad_norm": 1.2178891897201538, + "learning_rate": 2.8087719298245614e-05, + "loss": 0.5527, + "step": 12264 + }, + { + "epoch": 15.74454428754814, + "grad_norm": 2.5421738624572754, + "learning_rate": 2.8087291399229782e-05, + "loss": 0.4984, + "step": 12265 + }, + { + "epoch": 15.745827984595635, + "grad_norm": 1.6906099319458008, + "learning_rate": 2.808686350021395e-05, + "loss": 0.5547, + "step": 12266 + }, + { + "epoch": 15.747111681643132, + "grad_norm": 5.622663497924805, + "learning_rate": 2.8086435601198116e-05, + "loss": 0.4635, + "step": 12267 + }, + { + "epoch": 15.74839537869063, + "grad_norm": 1.4536548852920532, + "learning_rate": 2.8086007702182288e-05, + "loss": 0.4805, + "step": 12268 + }, + { + "epoch": 15.749679075738126, + "grad_norm": 1.5838210582733154, + "learning_rate": 2.8085579803166453e-05, + "loss": 0.5282, + "step": 12269 + }, + { + "epoch": 15.750962772785623, + 
"grad_norm": 1.888056993484497, + "learning_rate": 2.808515190415062e-05, + "loss": 0.4868, + "step": 12270 + }, + { + "epoch": 15.75224646983312, + "grad_norm": 2.3904659748077393, + "learning_rate": 2.808472400513479e-05, + "loss": 0.5093, + "step": 12271 + }, + { + "epoch": 15.753530166880616, + "grad_norm": 2.0061533451080322, + "learning_rate": 2.8084296106118955e-05, + "loss": 0.5797, + "step": 12272 + }, + { + "epoch": 15.754813863928113, + "grad_norm": 2.9478821754455566, + "learning_rate": 2.8083868207103123e-05, + "loss": 0.535, + "step": 12273 + }, + { + "epoch": 15.75609756097561, + "grad_norm": 1.7833303213119507, + "learning_rate": 2.808344030808729e-05, + "loss": 0.5326, + "step": 12274 + }, + { + "epoch": 15.757381258023106, + "grad_norm": 4.1224470138549805, + "learning_rate": 2.808301240907146e-05, + "loss": 0.4904, + "step": 12275 + }, + { + "epoch": 15.758664955070603, + "grad_norm": 3.5987203121185303, + "learning_rate": 2.8082584510055628e-05, + "loss": 0.5214, + "step": 12276 + }, + { + "epoch": 15.7599486521181, + "grad_norm": 2.5007009506225586, + "learning_rate": 2.8082156611039797e-05, + "loss": 0.5204, + "step": 12277 + }, + { + "epoch": 15.761232349165597, + "grad_norm": 3.280233144760132, + "learning_rate": 2.808172871202396e-05, + "loss": 0.5222, + "step": 12278 + }, + { + "epoch": 15.762516046213094, + "grad_norm": 2.2246487140655518, + "learning_rate": 2.808130081300813e-05, + "loss": 0.5207, + "step": 12279 + }, + { + "epoch": 15.763799743260591, + "grad_norm": 1.4738901853561401, + "learning_rate": 2.80808729139923e-05, + "loss": 0.4922, + "step": 12280 + }, + { + "epoch": 15.765083440308088, + "grad_norm": 1.895211935043335, + "learning_rate": 2.8080445014976464e-05, + "loss": 0.5508, + "step": 12281 + }, + { + "epoch": 15.766367137355584, + "grad_norm": 1.724708080291748, + "learning_rate": 2.8080017115960635e-05, + "loss": 0.5344, + "step": 12282 + }, + { + "epoch": 15.767650834403081, + "grad_norm": 1.6525599956512451, + 
"learning_rate": 2.80795892169448e-05, + "loss": 0.591, + "step": 12283 + }, + { + "epoch": 15.768934531450578, + "grad_norm": 1.3799468278884888, + "learning_rate": 2.8079161317928972e-05, + "loss": 0.5729, + "step": 12284 + }, + { + "epoch": 15.770218228498074, + "grad_norm": 3.4594759941101074, + "learning_rate": 2.8078733418913137e-05, + "loss": 0.7367, + "step": 12285 + }, + { + "epoch": 15.771501925545572, + "grad_norm": 1.5347715616226196, + "learning_rate": 2.8078305519897302e-05, + "loss": 0.5232, + "step": 12286 + }, + { + "epoch": 15.772785622593068, + "grad_norm": 1.0311001539230347, + "learning_rate": 2.8077877620881474e-05, + "loss": 0.5081, + "step": 12287 + }, + { + "epoch": 15.774069319640565, + "grad_norm": 1.3605707883834839, + "learning_rate": 2.807744972186564e-05, + "loss": 0.5012, + "step": 12288 + }, + { + "epoch": 15.775353016688062, + "grad_norm": 2.17153263092041, + "learning_rate": 2.8077021822849807e-05, + "loss": 0.493, + "step": 12289 + }, + { + "epoch": 15.776636713735558, + "grad_norm": 1.480173110961914, + "learning_rate": 2.8076593923833976e-05, + "loss": 0.5235, + "step": 12290 + }, + { + "epoch": 15.777920410783056, + "grad_norm": 1.0019872188568115, + "learning_rate": 2.8076166024818144e-05, + "loss": 0.464, + "step": 12291 + }, + { + "epoch": 15.779204107830552, + "grad_norm": 1.1327682733535767, + "learning_rate": 2.8075738125802313e-05, + "loss": 0.5208, + "step": 12292 + }, + { + "epoch": 15.78048780487805, + "grad_norm": 1.7551989555358887, + "learning_rate": 2.8075310226786478e-05, + "loss": 0.517, + "step": 12293 + }, + { + "epoch": 15.781771501925546, + "grad_norm": 2.3897621631622314, + "learning_rate": 2.8074882327770646e-05, + "loss": 0.5025, + "step": 12294 + }, + { + "epoch": 15.783055198973042, + "grad_norm": 1.0386260747909546, + "learning_rate": 2.8074454428754815e-05, + "loss": 0.5124, + "step": 12295 + }, + { + "epoch": 15.78433889602054, + "grad_norm": 1.2613414525985718, + "learning_rate": 
2.8074026529738983e-05, + "loss": 0.5282, + "step": 12296 + }, + { + "epoch": 15.785622593068036, + "grad_norm": 1.3141590356826782, + "learning_rate": 2.8073598630723148e-05, + "loss": 0.5364, + "step": 12297 + }, + { + "epoch": 15.786906290115533, + "grad_norm": 1.5397250652313232, + "learning_rate": 2.807317073170732e-05, + "loss": 0.4843, + "step": 12298 + }, + { + "epoch": 15.78818998716303, + "grad_norm": 3.944605588912964, + "learning_rate": 2.8072742832691485e-05, + "loss": 0.4797, + "step": 12299 + }, + { + "epoch": 15.789473684210526, + "grad_norm": 1.8117390871047974, + "learning_rate": 2.8072314933675653e-05, + "loss": 0.486, + "step": 12300 + }, + { + "epoch": 15.790757381258024, + "grad_norm": 1.2243950366973877, + "learning_rate": 2.807188703465982e-05, + "loss": 0.4996, + "step": 12301 + }, + { + "epoch": 15.79204107830552, + "grad_norm": 1.106597661972046, + "learning_rate": 2.8071459135643987e-05, + "loss": 0.4924, + "step": 12302 + }, + { + "epoch": 15.793324775353017, + "grad_norm": 2.1713414192199707, + "learning_rate": 2.807103123662816e-05, + "loss": 0.4775, + "step": 12303 + }, + { + "epoch": 15.794608472400514, + "grad_norm": 2.092310667037964, + "learning_rate": 2.8070603337612323e-05, + "loss": 0.4971, + "step": 12304 + }, + { + "epoch": 15.79589216944801, + "grad_norm": 2.37644100189209, + "learning_rate": 2.8070175438596492e-05, + "loss": 0.4651, + "step": 12305 + }, + { + "epoch": 15.797175866495508, + "grad_norm": 1.1454089879989624, + "learning_rate": 2.806974753958066e-05, + "loss": 0.4775, + "step": 12306 + }, + { + "epoch": 15.798459563543004, + "grad_norm": 1.9916455745697021, + "learning_rate": 2.8069319640564825e-05, + "loss": 0.5058, + "step": 12307 + }, + { + "epoch": 15.7997432605905, + "grad_norm": 1.3739103078842163, + "learning_rate": 2.8068891741548997e-05, + "loss": 0.4814, + "step": 12308 + }, + { + "epoch": 15.801026957637998, + "grad_norm": 1.7791273593902588, + "learning_rate": 2.8068463842533162e-05, + "loss": 
0.4983, + "step": 12309 + }, + { + "epoch": 15.802310654685494, + "grad_norm": 1.3247753381729126, + "learning_rate": 2.806803594351733e-05, + "loss": 0.5349, + "step": 12310 + }, + { + "epoch": 15.803594351732992, + "grad_norm": 1.6306307315826416, + "learning_rate": 2.80676080445015e-05, + "loss": 0.5426, + "step": 12311 + }, + { + "epoch": 15.804878048780488, + "grad_norm": 1.1787610054016113, + "learning_rate": 2.8067180145485667e-05, + "loss": 0.4871, + "step": 12312 + }, + { + "epoch": 15.806161745827985, + "grad_norm": 1.5271693468093872, + "learning_rate": 2.8066752246469832e-05, + "loss": 0.4997, + "step": 12313 + }, + { + "epoch": 15.807445442875482, + "grad_norm": 2.2178421020507812, + "learning_rate": 2.8066324347454e-05, + "loss": 0.5112, + "step": 12314 + }, + { + "epoch": 15.808729139922978, + "grad_norm": 1.5027614831924438, + "learning_rate": 2.806589644843817e-05, + "loss": 0.4935, + "step": 12315 + }, + { + "epoch": 15.810012836970476, + "grad_norm": 1.1111721992492676, + "learning_rate": 2.8065468549422338e-05, + "loss": 0.4923, + "step": 12316 + }, + { + "epoch": 15.811296534017972, + "grad_norm": 1.1949419975280762, + "learning_rate": 2.8065040650406506e-05, + "loss": 0.4926, + "step": 12317 + }, + { + "epoch": 15.812580231065468, + "grad_norm": 2.043982744216919, + "learning_rate": 2.806461275139067e-05, + "loss": 0.5445, + "step": 12318 + }, + { + "epoch": 15.813863928112966, + "grad_norm": 1.6763625144958496, + "learning_rate": 2.8064184852374843e-05, + "loss": 0.5324, + "step": 12319 + }, + { + "epoch": 15.815147625160462, + "grad_norm": 2.139113187789917, + "learning_rate": 2.8063756953359008e-05, + "loss": 0.4691, + "step": 12320 + }, + { + "epoch": 15.81643132220796, + "grad_norm": 2.7009782791137695, + "learning_rate": 2.8063329054343173e-05, + "loss": 0.5285, + "step": 12321 + }, + { + "epoch": 15.817715019255456, + "grad_norm": 1.1350345611572266, + "learning_rate": 2.8062901155327345e-05, + "loss": 0.5285, + "step": 12322 + }, + { + 
"epoch": 15.818998716302952, + "grad_norm": 1.523109793663025, + "learning_rate": 2.806247325631151e-05, + "loss": 0.5547, + "step": 12323 + }, + { + "epoch": 15.82028241335045, + "grad_norm": 1.4006402492523193, + "learning_rate": 2.806204535729568e-05, + "loss": 0.534, + "step": 12324 + }, + { + "epoch": 15.821566110397946, + "grad_norm": 2.254551649093628, + "learning_rate": 2.8061617458279847e-05, + "loss": 0.5115, + "step": 12325 + }, + { + "epoch": 15.822849807445444, + "grad_norm": 2.078796625137329, + "learning_rate": 2.8061189559264015e-05, + "loss": 0.5723, + "step": 12326 + }, + { + "epoch": 15.82413350449294, + "grad_norm": 1.3491581678390503, + "learning_rate": 2.8060761660248183e-05, + "loss": 0.4925, + "step": 12327 + }, + { + "epoch": 15.825417201540436, + "grad_norm": 2.2042906284332275, + "learning_rate": 2.806033376123235e-05, + "loss": 0.5275, + "step": 12328 + }, + { + "epoch": 15.826700898587934, + "grad_norm": 2.2584407329559326, + "learning_rate": 2.8059905862216517e-05, + "loss": 0.5298, + "step": 12329 + }, + { + "epoch": 15.82798459563543, + "grad_norm": 1.0625427961349487, + "learning_rate": 2.8059477963200685e-05, + "loss": 0.5949, + "step": 12330 + }, + { + "epoch": 15.829268292682928, + "grad_norm": 2.1097774505615234, + "learning_rate": 2.8059050064184854e-05, + "loss": 0.6027, + "step": 12331 + }, + { + "epoch": 15.830551989730424, + "grad_norm": 1.4382421970367432, + "learning_rate": 2.8058622165169022e-05, + "loss": 0.5922, + "step": 12332 + }, + { + "epoch": 15.83183568677792, + "grad_norm": 4.271119117736816, + "learning_rate": 2.8058194266153187e-05, + "loss": 0.6012, + "step": 12333 + }, + { + "epoch": 15.833119383825418, + "grad_norm": 1.872497320175171, + "learning_rate": 2.8057766367137355e-05, + "loss": 0.6659, + "step": 12334 + }, + { + "epoch": 15.834403080872914, + "grad_norm": 4.172530651092529, + "learning_rate": 2.8057338468121524e-05, + "loss": 0.6786, + "step": 12335 + }, + { + "epoch": 15.835686777920412, + 
"grad_norm": 1.1373275518417358, + "learning_rate": 2.8056910569105692e-05, + "loss": 0.5038, + "step": 12336 + }, + { + "epoch": 15.836970474967908, + "grad_norm": 0.968475341796875, + "learning_rate": 2.8056482670089857e-05, + "loss": 0.4582, + "step": 12337 + }, + { + "epoch": 15.838254172015404, + "grad_norm": 1.780411720275879, + "learning_rate": 2.805605477107403e-05, + "loss": 0.4666, + "step": 12338 + }, + { + "epoch": 15.839537869062902, + "grad_norm": 1.2038236856460571, + "learning_rate": 2.8055626872058194e-05, + "loss": 0.5043, + "step": 12339 + }, + { + "epoch": 15.840821566110398, + "grad_norm": 1.6056619882583618, + "learning_rate": 2.8055198973042363e-05, + "loss": 0.4946, + "step": 12340 + }, + { + "epoch": 15.842105263157894, + "grad_norm": 1.3836700916290283, + "learning_rate": 2.805477107402653e-05, + "loss": 0.4968, + "step": 12341 + }, + { + "epoch": 15.843388960205392, + "grad_norm": 1.008852481842041, + "learning_rate": 2.8054343175010696e-05, + "loss": 0.4631, + "step": 12342 + }, + { + "epoch": 15.844672657252888, + "grad_norm": 1.0461993217468262, + "learning_rate": 2.8053915275994868e-05, + "loss": 0.5248, + "step": 12343 + }, + { + "epoch": 15.845956354300386, + "grad_norm": 1.7081971168518066, + "learning_rate": 2.8053487376979033e-05, + "loss": 0.4747, + "step": 12344 + }, + { + "epoch": 15.847240051347882, + "grad_norm": 2.658423662185669, + "learning_rate": 2.80530594779632e-05, + "loss": 0.5101, + "step": 12345 + }, + { + "epoch": 15.84852374839538, + "grad_norm": 1.3840372562408447, + "learning_rate": 2.805263157894737e-05, + "loss": 0.4768, + "step": 12346 + }, + { + "epoch": 15.849807445442876, + "grad_norm": 2.6123828887939453, + "learning_rate": 2.8052203679931535e-05, + "loss": 0.5159, + "step": 12347 + }, + { + "epoch": 15.851091142490372, + "grad_norm": 5.271624565124512, + "learning_rate": 2.8051775780915706e-05, + "loss": 0.5455, + "step": 12348 + }, + { + "epoch": 15.85237483953787, + "grad_norm": 1.606610655784607, + 
"learning_rate": 2.805134788189987e-05, + "loss": 0.4778, + "step": 12349 + }, + { + "epoch": 15.853658536585366, + "grad_norm": 1.3889148235321045, + "learning_rate": 2.805091998288404e-05, + "loss": 0.458, + "step": 12350 + }, + { + "epoch": 15.854942233632862, + "grad_norm": 3.282557249069214, + "learning_rate": 2.805049208386821e-05, + "loss": 0.5177, + "step": 12351 + }, + { + "epoch": 15.85622593068036, + "grad_norm": 1.4014302492141724, + "learning_rate": 2.8050064184852377e-05, + "loss": 0.4705, + "step": 12352 + }, + { + "epoch": 15.857509627727856, + "grad_norm": 1.6462887525558472, + "learning_rate": 2.8049636285836542e-05, + "loss": 0.4938, + "step": 12353 + }, + { + "epoch": 15.858793324775354, + "grad_norm": 0.9905973076820374, + "learning_rate": 2.804920838682071e-05, + "loss": 0.4871, + "step": 12354 + }, + { + "epoch": 15.86007702182285, + "grad_norm": 1.8400074243545532, + "learning_rate": 2.804878048780488e-05, + "loss": 0.5029, + "step": 12355 + }, + { + "epoch": 15.861360718870346, + "grad_norm": 1.1624176502227783, + "learning_rate": 2.8048352588789047e-05, + "loss": 0.4863, + "step": 12356 + }, + { + "epoch": 15.862644415917844, + "grad_norm": 1.8923730850219727, + "learning_rate": 2.8047924689773215e-05, + "loss": 0.4865, + "step": 12357 + }, + { + "epoch": 15.86392811296534, + "grad_norm": 1.4018701314926147, + "learning_rate": 2.804749679075738e-05, + "loss": 0.5125, + "step": 12358 + }, + { + "epoch": 15.865211810012838, + "grad_norm": 1.876664161682129, + "learning_rate": 2.8047068891741552e-05, + "loss": 0.4941, + "step": 12359 + }, + { + "epoch": 15.866495507060334, + "grad_norm": 2.056894063949585, + "learning_rate": 2.8046640992725717e-05, + "loss": 0.546, + "step": 12360 + }, + { + "epoch": 15.86777920410783, + "grad_norm": 1.6200249195098877, + "learning_rate": 2.8046213093709882e-05, + "loss": 0.5056, + "step": 12361 + }, + { + "epoch": 15.869062901155328, + "grad_norm": 1.2554737329483032, + "learning_rate": 
2.8045785194694054e-05, + "loss": 0.4939, + "step": 12362 + }, + { + "epoch": 15.870346598202824, + "grad_norm": 1.837312936782837, + "learning_rate": 2.804535729567822e-05, + "loss": 0.4718, + "step": 12363 + }, + { + "epoch": 15.871630295250322, + "grad_norm": 0.9875289797782898, + "learning_rate": 2.804492939666239e-05, + "loss": 0.4444, + "step": 12364 + }, + { + "epoch": 15.872913992297818, + "grad_norm": 1.6642255783081055, + "learning_rate": 2.8044501497646556e-05, + "loss": 0.4762, + "step": 12365 + }, + { + "epoch": 15.874197689345314, + "grad_norm": 0.9752752780914307, + "learning_rate": 2.8044073598630724e-05, + "loss": 0.5372, + "step": 12366 + }, + { + "epoch": 15.875481386392812, + "grad_norm": 4.779494285583496, + "learning_rate": 2.8043645699614893e-05, + "loss": 0.5698, + "step": 12367 + }, + { + "epoch": 15.876765083440308, + "grad_norm": 1.479629635810852, + "learning_rate": 2.8043217800599058e-05, + "loss": 0.4995, + "step": 12368 + }, + { + "epoch": 15.878048780487806, + "grad_norm": 2.063791036605835, + "learning_rate": 2.8042789901583226e-05, + "loss": 0.5033, + "step": 12369 + }, + { + "epoch": 15.879332477535302, + "grad_norm": 6.02177095413208, + "learning_rate": 2.8042362002567395e-05, + "loss": 0.5417, + "step": 12370 + }, + { + "epoch": 15.880616174582798, + "grad_norm": 3.6720383167266846, + "learning_rate": 2.8041934103551563e-05, + "loss": 0.4724, + "step": 12371 + }, + { + "epoch": 15.881899871630296, + "grad_norm": 2.757031202316284, + "learning_rate": 2.804150620453573e-05, + "loss": 0.4868, + "step": 12372 + }, + { + "epoch": 15.883183568677792, + "grad_norm": 1.9953075647354126, + "learning_rate": 2.80410783055199e-05, + "loss": 0.4867, + "step": 12373 + }, + { + "epoch": 15.88446726572529, + "grad_norm": 2.647451877593994, + "learning_rate": 2.8040650406504065e-05, + "loss": 0.4932, + "step": 12374 + }, + { + "epoch": 15.885750962772786, + "grad_norm": 1.8146848678588867, + "learning_rate": 2.8040222507488233e-05, + "loss": 
0.4895, + "step": 12375 + }, + { + "epoch": 15.887034659820282, + "grad_norm": 9.03377914428711, + "learning_rate": 2.80397946084724e-05, + "loss": 0.5416, + "step": 12376 + }, + { + "epoch": 15.88831835686778, + "grad_norm": 2.5032031536102295, + "learning_rate": 2.8039366709456567e-05, + "loss": 0.5367, + "step": 12377 + }, + { + "epoch": 15.889602053915276, + "grad_norm": 2.6609392166137695, + "learning_rate": 2.803893881044074e-05, + "loss": 0.5602, + "step": 12378 + }, + { + "epoch": 15.890885750962774, + "grad_norm": 2.802992105484009, + "learning_rate": 2.8038510911424904e-05, + "loss": 0.6023, + "step": 12379 + }, + { + "epoch": 15.89216944801027, + "grad_norm": 3.0164599418640137, + "learning_rate": 2.8038083012409075e-05, + "loss": 0.5244, + "step": 12380 + }, + { + "epoch": 15.893453145057766, + "grad_norm": 2.1909000873565674, + "learning_rate": 2.803765511339324e-05, + "loss": 0.6003, + "step": 12381 + }, + { + "epoch": 15.894736842105264, + "grad_norm": 1.9763951301574707, + "learning_rate": 2.8037227214377405e-05, + "loss": 0.5795, + "step": 12382 + }, + { + "epoch": 15.89602053915276, + "grad_norm": 1.2786004543304443, + "learning_rate": 2.8036799315361577e-05, + "loss": 0.5688, + "step": 12383 + }, + { + "epoch": 15.897304236200256, + "grad_norm": 3.4382851123809814, + "learning_rate": 2.8036371416345742e-05, + "loss": 0.6516, + "step": 12384 + }, + { + "epoch": 15.898587933247754, + "grad_norm": 4.125183582305908, + "learning_rate": 2.803594351732991e-05, + "loss": 0.7064, + "step": 12385 + }, + { + "epoch": 15.89987163029525, + "grad_norm": 1.6247024536132812, + "learning_rate": 2.803551561831408e-05, + "loss": 0.4666, + "step": 12386 + }, + { + "epoch": 15.901155327342748, + "grad_norm": 4.6875786781311035, + "learning_rate": 2.8035087719298247e-05, + "loss": 0.4898, + "step": 12387 + }, + { + "epoch": 15.902439024390244, + "grad_norm": 1.1889508962631226, + "learning_rate": 2.8034659820282416e-05, + "loss": 0.464, + "step": 12388 + }, + { + 
"epoch": 15.90372272143774, + "grad_norm": 2.242316722869873, + "learning_rate": 2.803423192126658e-05, + "loss": 0.482, + "step": 12389 + }, + { + "epoch": 15.905006418485238, + "grad_norm": 1.269988775253296, + "learning_rate": 2.803380402225075e-05, + "loss": 0.4873, + "step": 12390 + }, + { + "epoch": 15.906290115532734, + "grad_norm": 1.1111490726470947, + "learning_rate": 2.8033376123234918e-05, + "loss": 0.4641, + "step": 12391 + }, + { + "epoch": 15.907573812580232, + "grad_norm": 1.8257766962051392, + "learning_rate": 2.8032948224219086e-05, + "loss": 0.4704, + "step": 12392 + }, + { + "epoch": 15.908857509627728, + "grad_norm": 1.1170657873153687, + "learning_rate": 2.803252032520325e-05, + "loss": 0.4701, + "step": 12393 + }, + { + "epoch": 15.910141206675224, + "grad_norm": 1.2160751819610596, + "learning_rate": 2.803209242618742e-05, + "loss": 0.4492, + "step": 12394 + }, + { + "epoch": 15.911424903722722, + "grad_norm": 1.1820091009140015, + "learning_rate": 2.8031664527171588e-05, + "loss": 0.5082, + "step": 12395 + }, + { + "epoch": 15.912708600770218, + "grad_norm": 1.5083256959915161, + "learning_rate": 2.8031236628155756e-05, + "loss": 0.4802, + "step": 12396 + }, + { + "epoch": 15.913992297817716, + "grad_norm": 1.1245514154434204, + "learning_rate": 2.8030808729139925e-05, + "loss": 0.4383, + "step": 12397 + }, + { + "epoch": 15.915275994865212, + "grad_norm": 1.5537493228912354, + "learning_rate": 2.803038083012409e-05, + "loss": 0.4574, + "step": 12398 + }, + { + "epoch": 15.916559691912708, + "grad_norm": 1.2328325510025024, + "learning_rate": 2.802995293110826e-05, + "loss": 0.5024, + "step": 12399 + }, + { + "epoch": 15.917843388960206, + "grad_norm": 0.9754120707511902, + "learning_rate": 2.8029525032092427e-05, + "loss": 0.4869, + "step": 12400 + }, + { + "epoch": 15.919127086007702, + "grad_norm": 1.9218207597732544, + "learning_rate": 2.802909713307659e-05, + "loss": 0.4546, + "step": 12401 + }, + { + "epoch": 15.9204107830552, + 
"grad_norm": 1.1752967834472656, + "learning_rate": 2.8028669234060763e-05, + "loss": 0.4327, + "step": 12402 + }, + { + "epoch": 15.921694480102696, + "grad_norm": 1.52505624294281, + "learning_rate": 2.802824133504493e-05, + "loss": 0.4719, + "step": 12403 + }, + { + "epoch": 15.922978177150192, + "grad_norm": 1.6652264595031738, + "learning_rate": 2.80278134360291e-05, + "loss": 0.4912, + "step": 12404 + }, + { + "epoch": 15.92426187419769, + "grad_norm": 1.5283030271530151, + "learning_rate": 2.8027385537013265e-05, + "loss": 0.5044, + "step": 12405 + }, + { + "epoch": 15.925545571245186, + "grad_norm": 1.2327512502670288, + "learning_rate": 2.8026957637997434e-05, + "loss": 0.5096, + "step": 12406 + }, + { + "epoch": 15.926829268292684, + "grad_norm": 2.161925792694092, + "learning_rate": 2.8026529738981602e-05, + "loss": 0.4637, + "step": 12407 + }, + { + "epoch": 15.92811296534018, + "grad_norm": 1.7588990926742554, + "learning_rate": 2.8026101839965767e-05, + "loss": 0.4908, + "step": 12408 + }, + { + "epoch": 15.929396662387676, + "grad_norm": 5.764177322387695, + "learning_rate": 2.8025673940949936e-05, + "loss": 0.4661, + "step": 12409 + }, + { + "epoch": 15.930680359435174, + "grad_norm": 2.7508492469787598, + "learning_rate": 2.8025246041934104e-05, + "loss": 0.4952, + "step": 12410 + }, + { + "epoch": 15.93196405648267, + "grad_norm": 2.3988187313079834, + "learning_rate": 2.8024818142918272e-05, + "loss": 0.4658, + "step": 12411 + }, + { + "epoch": 15.933247753530168, + "grad_norm": 2.453078269958496, + "learning_rate": 2.802439024390244e-05, + "loss": 0.4869, + "step": 12412 + }, + { + "epoch": 15.934531450577664, + "grad_norm": 1.0353220701217651, + "learning_rate": 2.802396234488661e-05, + "loss": 0.5208, + "step": 12413 + }, + { + "epoch": 15.93581514762516, + "grad_norm": 3.298072099685669, + "learning_rate": 2.8023534445870774e-05, + "loss": 0.4825, + "step": 12414 + }, + { + "epoch": 15.937098844672658, + "grad_norm": 2.4021973609924316, + 
"learning_rate": 2.8023106546854943e-05, + "loss": 0.5054, + "step": 12415 + }, + { + "epoch": 15.938382541720154, + "grad_norm": 1.6187797784805298, + "learning_rate": 2.802267864783911e-05, + "loss": 0.5078, + "step": 12416 + }, + { + "epoch": 15.93966623876765, + "grad_norm": 3.2320356369018555, + "learning_rate": 2.8022250748823276e-05, + "loss": 0.5256, + "step": 12417 + }, + { + "epoch": 15.940949935815148, + "grad_norm": 1.3728511333465576, + "learning_rate": 2.8021822849807448e-05, + "loss": 0.4375, + "step": 12418 + }, + { + "epoch": 15.942233632862644, + "grad_norm": 3.4922215938568115, + "learning_rate": 2.8021394950791613e-05, + "loss": 0.5115, + "step": 12419 + }, + { + "epoch": 15.943517329910142, + "grad_norm": 1.5279743671417236, + "learning_rate": 2.8020967051775785e-05, + "loss": 0.4956, + "step": 12420 + }, + { + "epoch": 15.944801026957638, + "grad_norm": 2.510509729385376, + "learning_rate": 2.802053915275995e-05, + "loss": 0.4929, + "step": 12421 + }, + { + "epoch": 15.946084724005134, + "grad_norm": 1.476287603378296, + "learning_rate": 2.8020111253744115e-05, + "loss": 0.5269, + "step": 12422 + }, + { + "epoch": 15.947368421052632, + "grad_norm": 1.6230095624923706, + "learning_rate": 2.8019683354728287e-05, + "loss": 0.4775, + "step": 12423 + }, + { + "epoch": 15.948652118100128, + "grad_norm": 3.0811619758605957, + "learning_rate": 2.801925545571245e-05, + "loss": 0.512, + "step": 12424 + }, + { + "epoch": 15.949935815147626, + "grad_norm": 2.062628984451294, + "learning_rate": 2.801882755669662e-05, + "loss": 0.5479, + "step": 12425 + }, + { + "epoch": 15.951219512195122, + "grad_norm": 2.705711603164673, + "learning_rate": 2.801839965768079e-05, + "loss": 0.5979, + "step": 12426 + }, + { + "epoch": 15.952503209242618, + "grad_norm": 4.0203962326049805, + "learning_rate": 2.8017971758664957e-05, + "loss": 0.5852, + "step": 12427 + }, + { + "epoch": 15.953786906290116, + "grad_norm": 1.6185508966445923, + "learning_rate": 
2.8017543859649125e-05, + "loss": 0.5164, + "step": 12428 + }, + { + "epoch": 15.955070603337612, + "grad_norm": 2.3870365619659424, + "learning_rate": 2.801711596063329e-05, + "loss": 0.5066, + "step": 12429 + }, + { + "epoch": 15.95635430038511, + "grad_norm": 1.654257893562317, + "learning_rate": 2.801668806161746e-05, + "loss": 0.5493, + "step": 12430 + }, + { + "epoch": 15.957637997432606, + "grad_norm": 3.443075180053711, + "learning_rate": 2.8016260162601627e-05, + "loss": 0.5922, + "step": 12431 + }, + { + "epoch": 15.958921694480102, + "grad_norm": 1.6544419527053833, + "learning_rate": 2.8015832263585795e-05, + "loss": 0.5511, + "step": 12432 + }, + { + "epoch": 15.9602053915276, + "grad_norm": 1.6483826637268066, + "learning_rate": 2.801540436456996e-05, + "loss": 0.5872, + "step": 12433 + }, + { + "epoch": 15.961489088575096, + "grad_norm": 1.6622718572616577, + "learning_rate": 2.8014976465554132e-05, + "loss": 0.6689, + "step": 12434 + }, + { + "epoch": 15.962772785622594, + "grad_norm": 3.321852922439575, + "learning_rate": 2.8014548566538297e-05, + "loss": 0.7795, + "step": 12435 + }, + { + "epoch": 15.96405648267009, + "grad_norm": 2.3897552490234375, + "learning_rate": 2.8014120667522466e-05, + "loss": 0.4507, + "step": 12436 + }, + { + "epoch": 15.965340179717586, + "grad_norm": 0.8743383884429932, + "learning_rate": 2.8013692768506634e-05, + "loss": 0.4837, + "step": 12437 + }, + { + "epoch": 15.966623876765084, + "grad_norm": 1.0623525381088257, + "learning_rate": 2.80132648694908e-05, + "loss": 0.5118, + "step": 12438 + }, + { + "epoch": 15.96790757381258, + "grad_norm": 1.482308030128479, + "learning_rate": 2.801283697047497e-05, + "loss": 0.4866, + "step": 12439 + }, + { + "epoch": 15.969191270860078, + "grad_norm": 1.9797452688217163, + "learning_rate": 2.8012409071459136e-05, + "loss": 0.4949, + "step": 12440 + }, + { + "epoch": 15.970474967907574, + "grad_norm": 0.7688905000686646, + "learning_rate": 2.8011981172443304e-05, + "loss": 
0.4454, + "step": 12441 + }, + { + "epoch": 15.97175866495507, + "grad_norm": 0.9275468587875366, + "learning_rate": 2.8011553273427473e-05, + "loss": 0.4941, + "step": 12442 + }, + { + "epoch": 15.973042362002568, + "grad_norm": 1.6191978454589844, + "learning_rate": 2.8011125374411638e-05, + "loss": 0.4759, + "step": 12443 + }, + { + "epoch": 15.974326059050064, + "grad_norm": 1.5419416427612305, + "learning_rate": 2.801069747539581e-05, + "loss": 0.5013, + "step": 12444 + }, + { + "epoch": 15.975609756097562, + "grad_norm": Infinity, + "learning_rate": 2.801069747539581e-05, + "loss": 0.4925, + "step": 12445 + }, + { + "epoch": 15.976893453145058, + "grad_norm": 1.816900372505188, + "learning_rate": 2.8010269576379975e-05, + "loss": 0.5818, + "step": 12446 + }, + { + "epoch": 15.978177150192554, + "grad_norm": 2.417052984237671, + "learning_rate": 2.8009841677364143e-05, + "loss": 0.4786, + "step": 12447 + }, + { + "epoch": 15.979460847240052, + "grad_norm": 2.3546457290649414, + "learning_rate": 2.800941377834831e-05, + "loss": 0.4775, + "step": 12448 + }, + { + "epoch": 15.980744544287548, + "grad_norm": 1.4876374006271362, + "learning_rate": 2.800898587933248e-05, + "loss": 0.5053, + "step": 12449 + }, + { + "epoch": 15.982028241335044, + "grad_norm": 1.13480806350708, + "learning_rate": 2.8008557980316645e-05, + "loss": 0.5105, + "step": 12450 + }, + { + "epoch": 15.983311938382542, + "grad_norm": 1.189958930015564, + "learning_rate": 2.8008130081300813e-05, + "loss": 0.5531, + "step": 12451 + }, + { + "epoch": 15.984595635430038, + "grad_norm": 0.991096019744873, + "learning_rate": 2.8007702182284982e-05, + "loss": 0.4417, + "step": 12452 + }, + { + "epoch": 15.985879332477536, + "grad_norm": 2.615971565246582, + "learning_rate": 2.800727428326915e-05, + "loss": 0.4654, + "step": 12453 + }, + { + "epoch": 15.987163029525032, + "grad_norm": 1.1240187883377075, + "learning_rate": 2.800684638425332e-05, + "loss": 0.4788, + "step": 12454 + }, + { + "epoch": 
15.988446726572528, + "grad_norm": 1.0013136863708496, + "learning_rate": 2.8006418485237484e-05, + "loss": 0.5578, + "step": 12455 + }, + { + "epoch": 15.989730423620026, + "grad_norm": 1.3021962642669678, + "learning_rate": 2.8005990586221652e-05, + "loss": 0.5152, + "step": 12456 + }, + { + "epoch": 15.991014120667522, + "grad_norm": 1.3413583040237427, + "learning_rate": 2.800556268720582e-05, + "loss": 0.5425, + "step": 12457 + }, + { + "epoch": 15.99229781771502, + "grad_norm": 4.790539264678955, + "learning_rate": 2.8005134788189985e-05, + "loss": 0.5241, + "step": 12458 + }, + { + "epoch": 15.993581514762516, + "grad_norm": 4.8666181564331055, + "learning_rate": 2.8004706889174157e-05, + "loss": 0.5117, + "step": 12459 + }, + { + "epoch": 15.994865211810012, + "grad_norm": 1.3503918647766113, + "learning_rate": 2.8004278990158322e-05, + "loss": 0.5452, + "step": 12460 + }, + { + "epoch": 15.99614890885751, + "grad_norm": 2.0583138465881348, + "learning_rate": 2.800385109114249e-05, + "loss": 0.5509, + "step": 12461 + }, + { + "epoch": 15.997432605905006, + "grad_norm": 1.917181372642517, + "learning_rate": 2.800342319212666e-05, + "loss": 0.582, + "step": 12462 + }, + { + "epoch": 15.998716302952504, + "grad_norm": 2.0836429595947266, + "learning_rate": 2.8002995293110824e-05, + "loss": 0.6, + "step": 12463 + }, + { + "epoch": 16.0, + "grad_norm": 4.708427906036377, + "learning_rate": 2.8002567394094996e-05, + "loss": 0.8565, + "step": 12464 + }, + { + "epoch": 16.001283697047498, + "grad_norm": 1.4002811908721924, + "learning_rate": 2.800213949507916e-05, + "loss": 0.4724, + "step": 12465 + }, + { + "epoch": 16.002567394094992, + "grad_norm": 1.4183188676834106, + "learning_rate": 2.800171159606333e-05, + "loss": 0.4886, + "step": 12466 + }, + { + "epoch": 16.00385109114249, + "grad_norm": 1.3542871475219727, + "learning_rate": 2.8001283697047498e-05, + "loss": 0.5016, + "step": 12467 + }, + { + "epoch": 16.005134788189988, + "grad_norm": 
2.099330186843872, + "learning_rate": 2.8000855798031666e-05, + "loss": 0.4671, + "step": 12468 + }, + { + "epoch": 16.006418485237482, + "grad_norm": 1.7861390113830566, + "learning_rate": 2.800042789901583e-05, + "loss": 0.4865, + "step": 12469 + }, + { + "epoch": 16.00770218228498, + "grad_norm": 1.7714422941207886, + "learning_rate": 2.8e-05, + "loss": 0.4799, + "step": 12470 + }, + { + "epoch": 16.008985879332478, + "grad_norm": 1.3905820846557617, + "learning_rate": 2.7999572100984168e-05, + "loss": 0.4789, + "step": 12471 + }, + { + "epoch": 16.010269576379976, + "grad_norm": 1.4750632047653198, + "learning_rate": 2.7999144201968336e-05, + "loss": 0.4591, + "step": 12472 + }, + { + "epoch": 16.01155327342747, + "grad_norm": 1.2052134275436401, + "learning_rate": 2.7998716302952505e-05, + "loss": 0.4439, + "step": 12473 + }, + { + "epoch": 16.012836970474968, + "grad_norm": 1.3591365814208984, + "learning_rate": 2.799828840393667e-05, + "loss": 0.5167, + "step": 12474 + }, + { + "epoch": 16.014120667522466, + "grad_norm": 1.235405445098877, + "learning_rate": 2.799786050492084e-05, + "loss": 0.4794, + "step": 12475 + }, + { + "epoch": 16.01540436456996, + "grad_norm": 1.53355872631073, + "learning_rate": 2.7997432605905007e-05, + "loss": 0.5079, + "step": 12476 + }, + { + "epoch": 16.016688061617458, + "grad_norm": 1.844499111175537, + "learning_rate": 2.799700470688917e-05, + "loss": 0.4685, + "step": 12477 + }, + { + "epoch": 16.017971758664956, + "grad_norm": 1.1689379215240479, + "learning_rate": 2.7996576807873343e-05, + "loss": 0.4932, + "step": 12478 + }, + { + "epoch": 16.01925545571245, + "grad_norm": 0.9124563336372375, + "learning_rate": 2.799614890885751e-05, + "loss": 0.482, + "step": 12479 + }, + { + "epoch": 16.020539152759948, + "grad_norm": 1.1655099391937256, + "learning_rate": 2.799572100984168e-05, + "loss": 0.4701, + "step": 12480 + }, + { + "epoch": 16.021822849807446, + "grad_norm": 1.3452866077423096, + "learning_rate": 
2.7995293110825845e-05, + "loss": 0.483, + "step": 12481 + }, + { + "epoch": 16.023106546854944, + "grad_norm": 1.7943209409713745, + "learning_rate": 2.7994865211810014e-05, + "loss": 0.4618, + "step": 12482 + }, + { + "epoch": 16.024390243902438, + "grad_norm": 1.8003498315811157, + "learning_rate": 2.7994437312794182e-05, + "loss": 0.4548, + "step": 12483 + }, + { + "epoch": 16.025673940949936, + "grad_norm": 1.049665927886963, + "learning_rate": 2.7994009413778347e-05, + "loss": 0.4808, + "step": 12484 + }, + { + "epoch": 16.026957637997434, + "grad_norm": 1.1608787775039673, + "learning_rate": 2.7993581514762516e-05, + "loss": 0.4426, + "step": 12485 + }, + { + "epoch": 16.028241335044928, + "grad_norm": 2.1519837379455566, + "learning_rate": 2.7993153615746684e-05, + "loss": 0.4593, + "step": 12486 + }, + { + "epoch": 16.029525032092426, + "grad_norm": 1.3759721517562866, + "learning_rate": 2.7992725716730852e-05, + "loss": 0.4828, + "step": 12487 + }, + { + "epoch": 16.030808729139924, + "grad_norm": 3.098606586456299, + "learning_rate": 2.799229781771502e-05, + "loss": 0.4797, + "step": 12488 + }, + { + "epoch": 16.03209242618742, + "grad_norm": 3.570039749145508, + "learning_rate": 2.799186991869919e-05, + "loss": 0.475, + "step": 12489 + }, + { + "epoch": 16.033376123234916, + "grad_norm": 7.532938480377197, + "learning_rate": 2.7991442019683354e-05, + "loss": 0.5053, + "step": 12490 + }, + { + "epoch": 16.034659820282414, + "grad_norm": 1.8892556428909302, + "learning_rate": 2.7991014120667523e-05, + "loss": 0.4697, + "step": 12491 + }, + { + "epoch": 16.035943517329912, + "grad_norm": 1.7670975923538208, + "learning_rate": 2.799058622165169e-05, + "loss": 0.4968, + "step": 12492 + }, + { + "epoch": 16.037227214377406, + "grad_norm": 3.221027374267578, + "learning_rate": 2.7990158322635856e-05, + "loss": 0.4596, + "step": 12493 + }, + { + "epoch": 16.038510911424904, + "grad_norm": 1.242382526397705, + "learning_rate": 2.7989730423620028e-05, + "loss": 
0.4638, + "step": 12494 + }, + { + "epoch": 16.039794608472402, + "grad_norm": 2.966721534729004, + "learning_rate": 2.7989302524604193e-05, + "loss": 0.5345, + "step": 12495 + }, + { + "epoch": 16.041078305519896, + "grad_norm": 2.742828130722046, + "learning_rate": 2.7988874625588365e-05, + "loss": 0.4738, + "step": 12496 + }, + { + "epoch": 16.042362002567394, + "grad_norm": 1.3322523832321167, + "learning_rate": 2.798844672657253e-05, + "loss": 0.4899, + "step": 12497 + }, + { + "epoch": 16.043645699614892, + "grad_norm": 2.299978017807007, + "learning_rate": 2.7988018827556695e-05, + "loss": 0.493, + "step": 12498 + }, + { + "epoch": 16.044929396662386, + "grad_norm": 0.8955047130584717, + "learning_rate": 2.7987590928540867e-05, + "loss": 0.4453, + "step": 12499 + }, + { + "epoch": 16.046213093709884, + "grad_norm": 1.0540611743927002, + "learning_rate": 2.798716302952503e-05, + "loss": 0.454, + "step": 12500 + }, + { + "epoch": 16.047496790757382, + "grad_norm": 1.7904045581817627, + "learning_rate": 2.79867351305092e-05, + "loss": 0.4746, + "step": 12501 + }, + { + "epoch": 16.048780487804876, + "grad_norm": 2.043545961380005, + "learning_rate": 2.798630723149337e-05, + "loss": 0.473, + "step": 12502 + }, + { + "epoch": 16.050064184852374, + "grad_norm": 1.6742841005325317, + "learning_rate": 2.7985879332477537e-05, + "loss": 0.4983, + "step": 12503 + }, + { + "epoch": 16.051347881899872, + "grad_norm": 1.029072642326355, + "learning_rate": 2.7985451433461705e-05, + "loss": 0.4983, + "step": 12504 + }, + { + "epoch": 16.05263157894737, + "grad_norm": 1.0799058675765991, + "learning_rate": 2.798502353444587e-05, + "loss": 0.4817, + "step": 12505 + }, + { + "epoch": 16.053915275994864, + "grad_norm": 1.9043123722076416, + "learning_rate": 2.798459563543004e-05, + "loss": 0.51, + "step": 12506 + }, + { + "epoch": 16.055198973042362, + "grad_norm": 2.334716796875, + "learning_rate": 2.7984167736414207e-05, + "loss": 0.4878, + "step": 12507 + }, + { + "epoch": 
16.05648267008986, + "grad_norm": 1.4658598899841309, + "learning_rate": 2.7983739837398376e-05, + "loss": 0.4747, + "step": 12508 + }, + { + "epoch": 16.057766367137354, + "grad_norm": 2.5985827445983887, + "learning_rate": 2.798331193838254e-05, + "loss": 0.5169, + "step": 12509 + }, + { + "epoch": 16.059050064184852, + "grad_norm": 1.9397114515304565, + "learning_rate": 2.7982884039366712e-05, + "loss": 0.5591, + "step": 12510 + }, + { + "epoch": 16.06033376123235, + "grad_norm": 3.482689619064331, + "learning_rate": 2.7982456140350877e-05, + "loss": 0.497, + "step": 12511 + }, + { + "epoch": 16.061617458279844, + "grad_norm": 2.7553529739379883, + "learning_rate": 2.7982028241335046e-05, + "loss": 0.5669, + "step": 12512 + }, + { + "epoch": 16.062901155327342, + "grad_norm": 2.5431008338928223, + "learning_rate": 2.7981600342319214e-05, + "loss": 0.6858, + "step": 12513 + }, + { + "epoch": 16.06418485237484, + "grad_norm": 5.739465713500977, + "learning_rate": 2.798117244330338e-05, + "loss": 0.7536, + "step": 12514 + }, + { + "epoch": 16.065468549422338, + "grad_norm": 1.5112411975860596, + "learning_rate": 2.798074454428755e-05, + "loss": 0.4533, + "step": 12515 + }, + { + "epoch": 16.066752246469832, + "grad_norm": 1.3454452753067017, + "learning_rate": 2.7980316645271716e-05, + "loss": 0.4582, + "step": 12516 + }, + { + "epoch": 16.06803594351733, + "grad_norm": 2.4186148643493652, + "learning_rate": 2.7979888746255884e-05, + "loss": 0.4666, + "step": 12517 + }, + { + "epoch": 16.069319640564828, + "grad_norm": 1.2593737840652466, + "learning_rate": 2.7979460847240053e-05, + "loss": 0.485, + "step": 12518 + }, + { + "epoch": 16.070603337612322, + "grad_norm": 1.344958782196045, + "learning_rate": 2.7979032948224218e-05, + "loss": 0.4986, + "step": 12519 + }, + { + "epoch": 16.07188703465982, + "grad_norm": 1.6349555253982544, + "learning_rate": 2.797860504920839e-05, + "loss": 0.4888, + "step": 12520 + }, + { + "epoch": 16.073170731707318, + "grad_norm": 
1.3793880939483643, + "learning_rate": 2.7978177150192555e-05, + "loss": 0.4643, + "step": 12521 + }, + { + "epoch": 16.074454428754812, + "grad_norm": 2.159787178039551, + "learning_rate": 2.7977749251176723e-05, + "loss": 0.4917, + "step": 12522 + }, + { + "epoch": 16.07573812580231, + "grad_norm": 2.663341760635376, + "learning_rate": 2.797732135216089e-05, + "loss": 0.4872, + "step": 12523 + }, + { + "epoch": 16.077021822849808, + "grad_norm": 2.0619850158691406, + "learning_rate": 2.7976893453145057e-05, + "loss": 0.4934, + "step": 12524 + }, + { + "epoch": 16.078305519897306, + "grad_norm": 2.015317678451538, + "learning_rate": 2.7976465554129225e-05, + "loss": 0.4654, + "step": 12525 + }, + { + "epoch": 16.0795892169448, + "grad_norm": 1.788658618927002, + "learning_rate": 2.7976037655113393e-05, + "loss": 0.4684, + "step": 12526 + }, + { + "epoch": 16.080872913992298, + "grad_norm": 1.6560426950454712, + "learning_rate": 2.7975609756097562e-05, + "loss": 0.4788, + "step": 12527 + }, + { + "epoch": 16.082156611039796, + "grad_norm": 1.2280021905899048, + "learning_rate": 2.797518185708173e-05, + "loss": 0.4825, + "step": 12528 + }, + { + "epoch": 16.08344030808729, + "grad_norm": 2.3476309776306152, + "learning_rate": 2.79747539580659e-05, + "loss": 0.4659, + "step": 12529 + }, + { + "epoch": 16.084724005134788, + "grad_norm": 1.1445564031600952, + "learning_rate": 2.7974326059050064e-05, + "loss": 0.5015, + "step": 12530 + }, + { + "epoch": 16.086007702182286, + "grad_norm": 2.2231364250183105, + "learning_rate": 2.7973898160034232e-05, + "loss": 0.4785, + "step": 12531 + }, + { + "epoch": 16.08729139922978, + "grad_norm": 2.4699244499206543, + "learning_rate": 2.79734702610184e-05, + "loss": 0.4567, + "step": 12532 + }, + { + "epoch": 16.088575096277278, + "grad_norm": 12.74730110168457, + "learning_rate": 2.7973042362002565e-05, + "loss": 0.5004, + "step": 12533 + }, + { + "epoch": 16.089858793324776, + "grad_norm": 1.8104161024093628, + "learning_rate": 
2.7972614462986737e-05, + "loss": 0.4983, + "step": 12534 + }, + { + "epoch": 16.09114249037227, + "grad_norm": 1.4964733123779297, + "learning_rate": 2.7972186563970902e-05, + "loss": 0.4402, + "step": 12535 + }, + { + "epoch": 16.09242618741977, + "grad_norm": 2.9658806324005127, + "learning_rate": 2.7971758664955074e-05, + "loss": 0.4729, + "step": 12536 + }, + { + "epoch": 16.093709884467266, + "grad_norm": 3.0303115844726562, + "learning_rate": 2.797133076593924e-05, + "loss": 0.5036, + "step": 12537 + }, + { + "epoch": 16.094993581514764, + "grad_norm": 1.7925915718078613, + "learning_rate": 2.7970902866923404e-05, + "loss": 0.5014, + "step": 12538 + }, + { + "epoch": 16.09627727856226, + "grad_norm": 1.7607358694076538, + "learning_rate": 2.7970474967907576e-05, + "loss": 0.5333, + "step": 12539 + }, + { + "epoch": 16.097560975609756, + "grad_norm": 2.2488620281219482, + "learning_rate": 2.797004706889174e-05, + "loss": 0.4967, + "step": 12540 + }, + { + "epoch": 16.098844672657254, + "grad_norm": 1.4101600646972656, + "learning_rate": 2.796961916987591e-05, + "loss": 0.4861, + "step": 12541 + }, + { + "epoch": 16.10012836970475, + "grad_norm": 2.0823397636413574, + "learning_rate": 2.7969191270860078e-05, + "loss": 0.5321, + "step": 12542 + }, + { + "epoch": 16.101412066752246, + "grad_norm": 58.5665168762207, + "learning_rate": 2.7968763371844246e-05, + "loss": 0.4645, + "step": 12543 + }, + { + "epoch": 16.102695763799744, + "grad_norm": 1.236465573310852, + "learning_rate": 2.7968335472828415e-05, + "loss": 0.4688, + "step": 12544 + }, + { + "epoch": 16.10397946084724, + "grad_norm": 3.238419532775879, + "learning_rate": 2.796790757381258e-05, + "loss": 0.494, + "step": 12545 + }, + { + "epoch": 16.105263157894736, + "grad_norm": 2.222313404083252, + "learning_rate": 2.7967479674796748e-05, + "loss": 0.491, + "step": 12546 + }, + { + "epoch": 16.106546854942234, + "grad_norm": 2.9000113010406494, + "learning_rate": 2.7967051775780916e-05, + "loss": 
0.5445, + "step": 12547 + }, + { + "epoch": 16.107830551989732, + "grad_norm": 1.1243003606796265, + "learning_rate": 2.7966623876765085e-05, + "loss": 0.5274, + "step": 12548 + }, + { + "epoch": 16.109114249037226, + "grad_norm": 1.3483928442001343, + "learning_rate": 2.796619597774925e-05, + "loss": 0.5052, + "step": 12549 + }, + { + "epoch": 16.110397946084724, + "grad_norm": 1.8580658435821533, + "learning_rate": 2.7965768078733422e-05, + "loss": 0.5316, + "step": 12550 + }, + { + "epoch": 16.111681643132222, + "grad_norm": 1.96030592918396, + "learning_rate": 2.7965340179717587e-05, + "loss": 0.5354, + "step": 12551 + }, + { + "epoch": 16.112965340179716, + "grad_norm": 1.5155606269836426, + "learning_rate": 2.7964912280701755e-05, + "loss": 0.5169, + "step": 12552 + }, + { + "epoch": 16.114249037227214, + "grad_norm": 1.2004398107528687, + "learning_rate": 2.7964484381685924e-05, + "loss": 0.5033, + "step": 12553 + }, + { + "epoch": 16.115532734274712, + "grad_norm": 1.2473689317703247, + "learning_rate": 2.796405648267009e-05, + "loss": 0.4906, + "step": 12554 + }, + { + "epoch": 16.116816431322206, + "grad_norm": 1.4909391403198242, + "learning_rate": 2.796362858365426e-05, + "loss": 0.6528, + "step": 12555 + }, + { + "epoch": 16.118100128369704, + "grad_norm": 4.680151462554932, + "learning_rate": 2.7963200684638425e-05, + "loss": 0.5924, + "step": 12556 + }, + { + "epoch": 16.119383825417202, + "grad_norm": 3.825281858444214, + "learning_rate": 2.7962772785622594e-05, + "loss": 0.5734, + "step": 12557 + }, + { + "epoch": 16.1206675224647, + "grad_norm": 1.7335160970687866, + "learning_rate": 2.7962344886606762e-05, + "loss": 0.4914, + "step": 12558 + }, + { + "epoch": 16.121951219512194, + "grad_norm": 2.3388755321502686, + "learning_rate": 2.7961916987590927e-05, + "loss": 0.5034, + "step": 12559 + }, + { + "epoch": 16.123234916559692, + "grad_norm": 7.832240104675293, + "learning_rate": 2.79614890885751e-05, + "loss": 0.5846, + "step": 12560 + }, + { + 
"epoch": 16.12451861360719, + "grad_norm": 2.604947090148926, + "learning_rate": 2.7961061189559264e-05, + "loss": 0.5546, + "step": 12561 + }, + { + "epoch": 16.125802310654684, + "grad_norm": 1.7339863777160645, + "learning_rate": 2.7960633290543432e-05, + "loss": 0.6094, + "step": 12562 + }, + { + "epoch": 16.127086007702182, + "grad_norm": 1.7253819704055786, + "learning_rate": 2.79602053915276e-05, + "loss": 0.5827, + "step": 12563 + }, + { + "epoch": 16.12836970474968, + "grad_norm": 2.1685972213745117, + "learning_rate": 2.795977749251177e-05, + "loss": 0.6979, + "step": 12564 + }, + { + "epoch": 16.129653401797174, + "grad_norm": 2.936563730239868, + "learning_rate": 2.7959349593495934e-05, + "loss": 0.4599, + "step": 12565 + }, + { + "epoch": 16.130937098844672, + "grad_norm": 2.0463902950286865, + "learning_rate": 2.7958921694480103e-05, + "loss": 0.4338, + "step": 12566 + }, + { + "epoch": 16.13222079589217, + "grad_norm": 1.5286369323730469, + "learning_rate": 2.795849379546427e-05, + "loss": 0.4954, + "step": 12567 + }, + { + "epoch": 16.133504492939664, + "grad_norm": 1.0294349193572998, + "learning_rate": 2.795806589644844e-05, + "loss": 0.4452, + "step": 12568 + }, + { + "epoch": 16.134788189987162, + "grad_norm": 1.3157545328140259, + "learning_rate": 2.7957637997432608e-05, + "loss": 0.481, + "step": 12569 + }, + { + "epoch": 16.13607188703466, + "grad_norm": 1.354695200920105, + "learning_rate": 2.7957210098416773e-05, + "loss": 0.4924, + "step": 12570 + }, + { + "epoch": 16.137355584082158, + "grad_norm": 2.51055645942688, + "learning_rate": 2.7956782199400945e-05, + "loss": 0.4598, + "step": 12571 + }, + { + "epoch": 16.138639281129652, + "grad_norm": 1.0308516025543213, + "learning_rate": 2.795635430038511e-05, + "loss": 0.4815, + "step": 12572 + }, + { + "epoch": 16.13992297817715, + "grad_norm": 1.796375036239624, + "learning_rate": 2.7955926401369275e-05, + "loss": 0.5066, + "step": 12573 + }, + { + "epoch": 16.141206675224648, + 
"grad_norm": 2.623680353164673, + "learning_rate": 2.7955498502353447e-05, + "loss": 0.5077, + "step": 12574 + }, + { + "epoch": 16.142490372272142, + "grad_norm": 1.6784032583236694, + "learning_rate": 2.795507060333761e-05, + "loss": 0.4431, + "step": 12575 + }, + { + "epoch": 16.14377406931964, + "grad_norm": 1.9796230792999268, + "learning_rate": 2.7954642704321783e-05, + "loss": 0.5029, + "step": 12576 + }, + { + "epoch": 16.145057766367138, + "grad_norm": 2.667048215866089, + "learning_rate": 2.795421480530595e-05, + "loss": 0.4582, + "step": 12577 + }, + { + "epoch": 16.146341463414632, + "grad_norm": 0.9415404796600342, + "learning_rate": 2.7953786906290117e-05, + "loss": 0.5094, + "step": 12578 + }, + { + "epoch": 16.14762516046213, + "grad_norm": 2.292945384979248, + "learning_rate": 2.7953359007274285e-05, + "loss": 0.4771, + "step": 12579 + }, + { + "epoch": 16.14890885750963, + "grad_norm": 1.337754249572754, + "learning_rate": 2.795293110825845e-05, + "loss": 0.5045, + "step": 12580 + }, + { + "epoch": 16.150192554557126, + "grad_norm": 1.3348543643951416, + "learning_rate": 2.795250320924262e-05, + "loss": 0.4794, + "step": 12581 + }, + { + "epoch": 16.15147625160462, + "grad_norm": 1.4795317649841309, + "learning_rate": 2.7952075310226787e-05, + "loss": 0.4474, + "step": 12582 + }, + { + "epoch": 16.15275994865212, + "grad_norm": 2.2306172847747803, + "learning_rate": 2.7951647411210956e-05, + "loss": 0.4799, + "step": 12583 + }, + { + "epoch": 16.154043645699616, + "grad_norm": 0.9065873026847839, + "learning_rate": 2.7951219512195124e-05, + "loss": 0.4916, + "step": 12584 + }, + { + "epoch": 16.15532734274711, + "grad_norm": 1.648085594177246, + "learning_rate": 2.795079161317929e-05, + "loss": 0.4651, + "step": 12585 + }, + { + "epoch": 16.15661103979461, + "grad_norm": 8.21812915802002, + "learning_rate": 2.7950363714163457e-05, + "loss": 0.502, + "step": 12586 + }, + { + "epoch": 16.157894736842106, + "grad_norm": 1.2932859659194946, + 
"learning_rate": 2.7949935815147626e-05, + "loss": 0.477, + "step": 12587 + }, + { + "epoch": 16.1591784338896, + "grad_norm": 1.0292811393737793, + "learning_rate": 2.7949507916131794e-05, + "loss": 0.4446, + "step": 12588 + }, + { + "epoch": 16.1604621309371, + "grad_norm": 1.1327606439590454, + "learning_rate": 2.794908001711596e-05, + "loss": 0.4917, + "step": 12589 + }, + { + "epoch": 16.161745827984596, + "grad_norm": 1.907486915588379, + "learning_rate": 2.794865211810013e-05, + "loss": 0.5216, + "step": 12590 + }, + { + "epoch": 16.163029525032094, + "grad_norm": 1.758618950843811, + "learning_rate": 2.7948224219084296e-05, + "loss": 0.5266, + "step": 12591 + }, + { + "epoch": 16.16431322207959, + "grad_norm": 0.8598850965499878, + "learning_rate": 2.7947796320068464e-05, + "loss": 0.4697, + "step": 12592 + }, + { + "epoch": 16.165596919127086, + "grad_norm": 1.4325666427612305, + "learning_rate": 2.7947368421052633e-05, + "loss": 0.4861, + "step": 12593 + }, + { + "epoch": 16.166880616174584, + "grad_norm": 1.2343721389770508, + "learning_rate": 2.7946940522036798e-05, + "loss": 0.4948, + "step": 12594 + }, + { + "epoch": 16.16816431322208, + "grad_norm": 1.6544777154922485, + "learning_rate": 2.794651262302097e-05, + "loss": 0.5413, + "step": 12595 + }, + { + "epoch": 16.169448010269576, + "grad_norm": 1.4979263544082642, + "learning_rate": 2.7946084724005135e-05, + "loss": 0.5147, + "step": 12596 + }, + { + "epoch": 16.170731707317074, + "grad_norm": 1.6907144784927368, + "learning_rate": 2.7945656824989303e-05, + "loss": 0.5071, + "step": 12597 + }, + { + "epoch": 16.17201540436457, + "grad_norm": 3.3529105186462402, + "learning_rate": 2.794522892597347e-05, + "loss": 0.5186, + "step": 12598 + }, + { + "epoch": 16.173299101412066, + "grad_norm": 2.879730701446533, + "learning_rate": 2.7944801026957637e-05, + "loss": 0.4883, + "step": 12599 + }, + { + "epoch": 16.174582798459564, + "grad_norm": 1.9989722967147827, + "learning_rate": 
2.794437312794181e-05, + "loss": 0.5114, + "step": 12600 + }, + { + "epoch": 16.17586649550706, + "grad_norm": 2.4495370388031006, + "learning_rate": 2.7943945228925973e-05, + "loss": 0.4863, + "step": 12601 + }, + { + "epoch": 16.177150192554556, + "grad_norm": 8.586441040039062, + "learning_rate": 2.7943517329910142e-05, + "loss": 0.5331, + "step": 12602 + }, + { + "epoch": 16.178433889602054, + "grad_norm": 5.31204080581665, + "learning_rate": 2.794308943089431e-05, + "loss": 0.5325, + "step": 12603 + }, + { + "epoch": 16.179717586649552, + "grad_norm": 2.389378309249878, + "learning_rate": 2.794266153187848e-05, + "loss": 0.5208, + "step": 12604 + }, + { + "epoch": 16.181001283697046, + "grad_norm": 2.5080196857452393, + "learning_rate": 2.7942233632862644e-05, + "loss": 0.5155, + "step": 12605 + }, + { + "epoch": 16.182284980744544, + "grad_norm": 23.35330581665039, + "learning_rate": 2.7941805733846812e-05, + "loss": 0.5271, + "step": 12606 + }, + { + "epoch": 16.183568677792042, + "grad_norm": 2.9071664810180664, + "learning_rate": 2.794137783483098e-05, + "loss": 0.5048, + "step": 12607 + }, + { + "epoch": 16.184852374839537, + "grad_norm": 2.587994337081909, + "learning_rate": 2.794094993581515e-05, + "loss": 0.4838, + "step": 12608 + }, + { + "epoch": 16.186136071887034, + "grad_norm": 1.7850697040557861, + "learning_rate": 2.7940522036799317e-05, + "loss": 0.5592, + "step": 12609 + }, + { + "epoch": 16.187419768934532, + "grad_norm": 3.0193402767181396, + "learning_rate": 2.7940094137783482e-05, + "loss": 0.5833, + "step": 12610 + }, + { + "epoch": 16.188703465982027, + "grad_norm": 2.1864736080169678, + "learning_rate": 2.7939666238767654e-05, + "loss": 0.5809, + "step": 12611 + }, + { + "epoch": 16.189987163029524, + "grad_norm": 4.079489707946777, + "learning_rate": 2.793923833975182e-05, + "loss": 0.6291, + "step": 12612 + }, + { + "epoch": 16.191270860077022, + "grad_norm": 2.571199893951416, + "learning_rate": 2.7938810440735984e-05, + "loss": 
0.5645, + "step": 12613 + }, + { + "epoch": 16.19255455712452, + "grad_norm": 2.304190158843994, + "learning_rate": 2.7938382541720156e-05, + "loss": 0.7137, + "step": 12614 + }, + { + "epoch": 16.193838254172015, + "grad_norm": 0.8797549605369568, + "learning_rate": 2.793795464270432e-05, + "loss": 0.471, + "step": 12615 + }, + { + "epoch": 16.195121951219512, + "grad_norm": 1.200143814086914, + "learning_rate": 2.7937526743688493e-05, + "loss": 0.4791, + "step": 12616 + }, + { + "epoch": 16.19640564826701, + "grad_norm": 1.0920480489730835, + "learning_rate": 2.7937098844672658e-05, + "loss": 0.4801, + "step": 12617 + }, + { + "epoch": 16.197689345314505, + "grad_norm": 1.273412823677063, + "learning_rate": 2.7936670945656826e-05, + "loss": 0.4963, + "step": 12618 + }, + { + "epoch": 16.198973042362002, + "grad_norm": 1.4964587688446045, + "learning_rate": 2.7936243046640995e-05, + "loss": 0.5083, + "step": 12619 + }, + { + "epoch": 16.2002567394095, + "grad_norm": 1.557587742805481, + "learning_rate": 2.793581514762516e-05, + "loss": 0.4583, + "step": 12620 + }, + { + "epoch": 16.201540436456995, + "grad_norm": 5.28344202041626, + "learning_rate": 2.7935387248609328e-05, + "loss": 0.5374, + "step": 12621 + }, + { + "epoch": 16.202824133504492, + "grad_norm": 2.0991904735565186, + "learning_rate": 2.7934959349593497e-05, + "loss": 0.494, + "step": 12622 + }, + { + "epoch": 16.20410783055199, + "grad_norm": 1.6205403804779053, + "learning_rate": 2.7934531450577665e-05, + "loss": 0.4688, + "step": 12623 + }, + { + "epoch": 16.205391527599488, + "grad_norm": 1.2028846740722656, + "learning_rate": 2.7934103551561833e-05, + "loss": 0.4363, + "step": 12624 + }, + { + "epoch": 16.206675224646983, + "grad_norm": 1.2804001569747925, + "learning_rate": 2.7933675652546002e-05, + "loss": 0.5392, + "step": 12625 + }, + { + "epoch": 16.20795892169448, + "grad_norm": 1.7936174869537354, + "learning_rate": 2.7933247753530167e-05, + "loss": 0.4452, + "step": 12626 + }, + { + 
"epoch": 16.20924261874198, + "grad_norm": 2.2234280109405518, + "learning_rate": 2.7932819854514335e-05, + "loss": 0.4755, + "step": 12627 + }, + { + "epoch": 16.210526315789473, + "grad_norm": 2.7610442638397217, + "learning_rate": 2.7932391955498504e-05, + "loss": 0.4869, + "step": 12628 + }, + { + "epoch": 16.21181001283697, + "grad_norm": 2.4306113719940186, + "learning_rate": 2.793196405648267e-05, + "loss": 0.4912, + "step": 12629 + }, + { + "epoch": 16.21309370988447, + "grad_norm": 1.9723870754241943, + "learning_rate": 2.793153615746684e-05, + "loss": 0.5195, + "step": 12630 + }, + { + "epoch": 16.214377406931963, + "grad_norm": 1.256420373916626, + "learning_rate": 2.7931108258451005e-05, + "loss": 0.4669, + "step": 12631 + }, + { + "epoch": 16.21566110397946, + "grad_norm": 1.103868007659912, + "learning_rate": 2.7930680359435177e-05, + "loss": 0.4886, + "step": 12632 + }, + { + "epoch": 16.21694480102696, + "grad_norm": 0.9800569415092468, + "learning_rate": 2.7930252460419342e-05, + "loss": 0.4573, + "step": 12633 + }, + { + "epoch": 16.218228498074453, + "grad_norm": 1.104988932609558, + "learning_rate": 2.7929824561403507e-05, + "loss": 0.4954, + "step": 12634 + }, + { + "epoch": 16.21951219512195, + "grad_norm": 1.049368143081665, + "learning_rate": 2.792939666238768e-05, + "loss": 0.465, + "step": 12635 + }, + { + "epoch": 16.22079589216945, + "grad_norm": 1.4004161357879639, + "learning_rate": 2.7928968763371844e-05, + "loss": 0.4754, + "step": 12636 + }, + { + "epoch": 16.222079589216946, + "grad_norm": 2.6943399906158447, + "learning_rate": 2.7928540864356013e-05, + "loss": 0.5349, + "step": 12637 + }, + { + "epoch": 16.22336328626444, + "grad_norm": 0.8669583201408386, + "learning_rate": 2.792811296534018e-05, + "loss": 0.4847, + "step": 12638 + }, + { + "epoch": 16.22464698331194, + "grad_norm": 1.6071304082870483, + "learning_rate": 2.792768506632435e-05, + "loss": 0.4543, + "step": 12639 + }, + { + "epoch": 16.225930680359436, + 
"grad_norm": 2.808608293533325, + "learning_rate": 2.7927257167308518e-05, + "loss": 0.4606, + "step": 12640 + }, + { + "epoch": 16.22721437740693, + "grad_norm": 2.2067294120788574, + "learning_rate": 2.7926829268292683e-05, + "loss": 0.4938, + "step": 12641 + }, + { + "epoch": 16.22849807445443, + "grad_norm": 2.84501576423645, + "learning_rate": 2.792640136927685e-05, + "loss": 0.4956, + "step": 12642 + }, + { + "epoch": 16.229781771501926, + "grad_norm": 1.8730885982513428, + "learning_rate": 2.792597347026102e-05, + "loss": 0.4813, + "step": 12643 + }, + { + "epoch": 16.23106546854942, + "grad_norm": 2.704267740249634, + "learning_rate": 2.7925545571245188e-05, + "loss": 0.4649, + "step": 12644 + }, + { + "epoch": 16.23234916559692, + "grad_norm": 1.1292637586593628, + "learning_rate": 2.7925117672229353e-05, + "loss": 0.5264, + "step": 12645 + }, + { + "epoch": 16.233632862644416, + "grad_norm": 5.437878608703613, + "learning_rate": 2.792468977321352e-05, + "loss": 0.4769, + "step": 12646 + }, + { + "epoch": 16.234916559691914, + "grad_norm": 1.8881115913391113, + "learning_rate": 2.792426187419769e-05, + "loss": 0.4751, + "step": 12647 + }, + { + "epoch": 16.23620025673941, + "grad_norm": 1.937990665435791, + "learning_rate": 2.7923833975181858e-05, + "loss": 0.4695, + "step": 12648 + }, + { + "epoch": 16.237483953786906, + "grad_norm": 1.030722975730896, + "learning_rate": 2.7923406076166027e-05, + "loss": 0.4609, + "step": 12649 + }, + { + "epoch": 16.238767650834404, + "grad_norm": 1.872209072113037, + "learning_rate": 2.7922978177150192e-05, + "loss": 0.5344, + "step": 12650 + }, + { + "epoch": 16.2400513478819, + "grad_norm": 2.02862548828125, + "learning_rate": 2.7922550278134364e-05, + "loss": 0.5357, + "step": 12651 + }, + { + "epoch": 16.241335044929397, + "grad_norm": 1.261637806892395, + "learning_rate": 2.792212237911853e-05, + "loss": 0.5256, + "step": 12652 + }, + { + "epoch": 16.242618741976894, + "grad_norm": 1.1013716459274292, + 
"learning_rate": 2.7921694480102694e-05, + "loss": 0.5362, + "step": 12653 + }, + { + "epoch": 16.24390243902439, + "grad_norm": 2.3778676986694336, + "learning_rate": 2.7921266581086865e-05, + "loss": 0.5636, + "step": 12654 + }, + { + "epoch": 16.245186136071887, + "grad_norm": 1.463894009590149, + "learning_rate": 2.792083868207103e-05, + "loss": 0.5114, + "step": 12655 + }, + { + "epoch": 16.246469833119384, + "grad_norm": 5.2032952308654785, + "learning_rate": 2.7920410783055202e-05, + "loss": 0.5442, + "step": 12656 + }, + { + "epoch": 16.247753530166882, + "grad_norm": 2.418623924255371, + "learning_rate": 2.7919982884039367e-05, + "loss": 0.5319, + "step": 12657 + }, + { + "epoch": 16.249037227214377, + "grad_norm": 5.372801303863525, + "learning_rate": 2.7919554985023536e-05, + "loss": 0.5144, + "step": 12658 + }, + { + "epoch": 16.250320924261874, + "grad_norm": 1.8674317598342896, + "learning_rate": 2.7919127086007704e-05, + "loss": 0.5451, + "step": 12659 + }, + { + "epoch": 16.251604621309372, + "grad_norm": 2.1106514930725098, + "learning_rate": 2.791869918699187e-05, + "loss": 0.5418, + "step": 12660 + }, + { + "epoch": 16.252888318356867, + "grad_norm": 2.5594327449798584, + "learning_rate": 2.7918271287976037e-05, + "loss": 0.5403, + "step": 12661 + }, + { + "epoch": 16.254172015404365, + "grad_norm": 3.337587356567383, + "learning_rate": 2.7917843388960206e-05, + "loss": 0.6064, + "step": 12662 + }, + { + "epoch": 16.255455712451862, + "grad_norm": 3.8257625102996826, + "learning_rate": 2.7917415489944374e-05, + "loss": 0.5755, + "step": 12663 + }, + { + "epoch": 16.256739409499357, + "grad_norm": 2.7926039695739746, + "learning_rate": 2.791698759092854e-05, + "loss": 0.7037, + "step": 12664 + }, + { + "epoch": 16.258023106546855, + "grad_norm": 1.961984634399414, + "learning_rate": 2.791655969191271e-05, + "loss": 0.4747, + "step": 12665 + }, + { + "epoch": 16.259306803594352, + "grad_norm": 1.437001347541809, + "learning_rate": 
2.7916131792896876e-05, + "loss": 0.4939, + "step": 12666 + }, + { + "epoch": 16.260590500641847, + "grad_norm": 1.443361759185791, + "learning_rate": 2.7915703893881045e-05, + "loss": 0.511, + "step": 12667 + }, + { + "epoch": 16.261874197689345, + "grad_norm": 1.6098368167877197, + "learning_rate": 2.7915275994865213e-05, + "loss": 0.46, + "step": 12668 + }, + { + "epoch": 16.263157894736842, + "grad_norm": 2.3957176208496094, + "learning_rate": 2.7914848095849378e-05, + "loss": 0.496, + "step": 12669 + }, + { + "epoch": 16.26444159178434, + "grad_norm": 1.8555020093917847, + "learning_rate": 2.791442019683355e-05, + "loss": 0.4601, + "step": 12670 + }, + { + "epoch": 16.265725288831835, + "grad_norm": 1.896133303642273, + "learning_rate": 2.7913992297817715e-05, + "loss": 0.4981, + "step": 12671 + }, + { + "epoch": 16.267008985879333, + "grad_norm": 2.1034371852874756, + "learning_rate": 2.7913564398801883e-05, + "loss": 0.4969, + "step": 12672 + }, + { + "epoch": 16.26829268292683, + "grad_norm": 1.2536991834640503, + "learning_rate": 2.791313649978605e-05, + "loss": 0.4585, + "step": 12673 + }, + { + "epoch": 16.269576379974325, + "grad_norm": 1.4029946327209473, + "learning_rate": 2.7912708600770217e-05, + "loss": 0.5409, + "step": 12674 + }, + { + "epoch": 16.270860077021823, + "grad_norm": 1.8895281553268433, + "learning_rate": 2.791228070175439e-05, + "loss": 0.514, + "step": 12675 + }, + { + "epoch": 16.27214377406932, + "grad_norm": 2.1457512378692627, + "learning_rate": 2.7911852802738553e-05, + "loss": 0.4978, + "step": 12676 + }, + { + "epoch": 16.273427471116815, + "grad_norm": 1.3349727392196655, + "learning_rate": 2.7911424903722722e-05, + "loss": 0.4669, + "step": 12677 + }, + { + "epoch": 16.274711168164313, + "grad_norm": 1.2748618125915527, + "learning_rate": 2.791099700470689e-05, + "loss": 0.4597, + "step": 12678 + }, + { + "epoch": 16.27599486521181, + "grad_norm": 5.719204425811768, + "learning_rate": 2.791056910569106e-05, + "loss": 
0.4638, + "step": 12679 + }, + { + "epoch": 16.27727856225931, + "grad_norm": 3.9559950828552246, + "learning_rate": 2.7910141206675224e-05, + "loss": 0.4946, + "step": 12680 + }, + { + "epoch": 16.278562259306803, + "grad_norm": 2.676973581314087, + "learning_rate": 2.7909713307659392e-05, + "loss": 0.4375, + "step": 12681 + }, + { + "epoch": 16.2798459563543, + "grad_norm": 3.920555353164673, + "learning_rate": 2.790928540864356e-05, + "loss": 0.4718, + "step": 12682 + }, + { + "epoch": 16.2811296534018, + "grad_norm": 1.8928922414779663, + "learning_rate": 2.790885750962773e-05, + "loss": 0.5103, + "step": 12683 + }, + { + "epoch": 16.282413350449293, + "grad_norm": 6.051109790802002, + "learning_rate": 2.7908429610611897e-05, + "loss": 0.5046, + "step": 12684 + }, + { + "epoch": 16.28369704749679, + "grad_norm": 1.492841362953186, + "learning_rate": 2.7908001711596062e-05, + "loss": 0.5028, + "step": 12685 + }, + { + "epoch": 16.28498074454429, + "grad_norm": 7.560024261474609, + "learning_rate": 2.7907573812580234e-05, + "loss": 0.5046, + "step": 12686 + }, + { + "epoch": 16.286264441591783, + "grad_norm": 3.231322765350342, + "learning_rate": 2.79071459135644e-05, + "loss": 0.5215, + "step": 12687 + }, + { + "epoch": 16.28754813863928, + "grad_norm": 1.4698206186294556, + "learning_rate": 2.7906718014548564e-05, + "loss": 0.4826, + "step": 12688 + }, + { + "epoch": 16.28883183568678, + "grad_norm": 2.2454681396484375, + "learning_rate": 2.7906290115532736e-05, + "loss": 0.5273, + "step": 12689 + }, + { + "epoch": 16.290115532734276, + "grad_norm": 1.7906743288040161, + "learning_rate": 2.79058622165169e-05, + "loss": 0.4672, + "step": 12690 + }, + { + "epoch": 16.29139922978177, + "grad_norm": 1.569543480873108, + "learning_rate": 2.7905434317501073e-05, + "loss": 0.4986, + "step": 12691 + }, + { + "epoch": 16.29268292682927, + "grad_norm": 0.9358606338500977, + "learning_rate": 2.7905006418485238e-05, + "loss": 0.4827, + "step": 12692 + }, + { + "epoch": 
16.293966623876766, + "grad_norm": 3.8116774559020996, + "learning_rate": 2.7904578519469406e-05, + "loss": 0.4771, + "step": 12693 + }, + { + "epoch": 16.29525032092426, + "grad_norm": 1.4616655111312866, + "learning_rate": 2.7904150620453575e-05, + "loss": 0.5186, + "step": 12694 + }, + { + "epoch": 16.29653401797176, + "grad_norm": 2.689051866531372, + "learning_rate": 2.790372272143774e-05, + "loss": 0.4664, + "step": 12695 + }, + { + "epoch": 16.297817715019256, + "grad_norm": 4.7115654945373535, + "learning_rate": 2.7903294822421908e-05, + "loss": 0.4977, + "step": 12696 + }, + { + "epoch": 16.29910141206675, + "grad_norm": 1.917142391204834, + "learning_rate": 2.7902866923406077e-05, + "loss": 0.4778, + "step": 12697 + }, + { + "epoch": 16.30038510911425, + "grad_norm": 3.766082525253296, + "learning_rate": 2.7902439024390245e-05, + "loss": 0.4717, + "step": 12698 + }, + { + "epoch": 16.301668806161747, + "grad_norm": 2.191749095916748, + "learning_rate": 2.7902011125374413e-05, + "loss": 0.4833, + "step": 12699 + }, + { + "epoch": 16.30295250320924, + "grad_norm": 3.7971482276916504, + "learning_rate": 2.7901583226358582e-05, + "loss": 0.5051, + "step": 12700 + }, + { + "epoch": 16.30423620025674, + "grad_norm": 2.2221009731292725, + "learning_rate": 2.7901155327342747e-05, + "loss": 0.5234, + "step": 12701 + }, + { + "epoch": 16.305519897304237, + "grad_norm": 4.081989288330078, + "learning_rate": 2.7900727428326915e-05, + "loss": 0.5164, + "step": 12702 + }, + { + "epoch": 16.306803594351734, + "grad_norm": 2.754601001739502, + "learning_rate": 2.7900299529311084e-05, + "loss": 0.4917, + "step": 12703 + }, + { + "epoch": 16.30808729139923, + "grad_norm": 2.803262233734131, + "learning_rate": 2.789987163029525e-05, + "loss": 0.4958, + "step": 12704 + }, + { + "epoch": 16.309370988446727, + "grad_norm": 1.4980096817016602, + "learning_rate": 2.789944373127942e-05, + "loss": 0.5458, + "step": 12705 + }, + { + "epoch": 16.310654685494224, + "grad_norm": 
1.8472590446472168, + "learning_rate": 2.7899015832263586e-05, + "loss": 0.5164, + "step": 12706 + }, + { + "epoch": 16.31193838254172, + "grad_norm": 2.4373838901519775, + "learning_rate": 2.7898587933247754e-05, + "loss": 0.5269, + "step": 12707 + }, + { + "epoch": 16.313222079589217, + "grad_norm": 3.512012004852295, + "learning_rate": 2.7898160034231922e-05, + "loss": 0.5721, + "step": 12708 + }, + { + "epoch": 16.314505776636715, + "grad_norm": 1.322085976600647, + "learning_rate": 2.7897732135216087e-05, + "loss": 0.5728, + "step": 12709 + }, + { + "epoch": 16.31578947368421, + "grad_norm": 2.4300663471221924, + "learning_rate": 2.789730423620026e-05, + "loss": 0.5547, + "step": 12710 + }, + { + "epoch": 16.317073170731707, + "grad_norm": 1.5885754823684692, + "learning_rate": 2.7896876337184424e-05, + "loss": 0.5866, + "step": 12711 + }, + { + "epoch": 16.318356867779205, + "grad_norm": 2.1064000129699707, + "learning_rate": 2.7896448438168593e-05, + "loss": 0.5841, + "step": 12712 + }, + { + "epoch": 16.319640564826702, + "grad_norm": 37.25496292114258, + "learning_rate": 2.789602053915276e-05, + "loss": 0.6624, + "step": 12713 + }, + { + "epoch": 16.320924261874197, + "grad_norm": 6.9317307472229, + "learning_rate": 2.7895592640136926e-05, + "loss": 0.7007, + "step": 12714 + }, + { + "epoch": 16.322207958921695, + "grad_norm": 1.1125738620758057, + "learning_rate": 2.7895164741121098e-05, + "loss": 0.4845, + "step": 12715 + }, + { + "epoch": 16.323491655969192, + "grad_norm": 1.4935609102249146, + "learning_rate": 2.7894736842105263e-05, + "loss": 0.4739, + "step": 12716 + }, + { + "epoch": 16.324775353016687, + "grad_norm": 2.377847909927368, + "learning_rate": 2.789430894308943e-05, + "loss": 0.5023, + "step": 12717 + }, + { + "epoch": 16.326059050064185, + "grad_norm": 2.4395267963409424, + "learning_rate": 2.78938810440736e-05, + "loss": 0.5039, + "step": 12718 + }, + { + "epoch": 16.327342747111683, + "grad_norm": 2.1117610931396484, + 
"learning_rate": 2.7893453145057768e-05, + "loss": 0.496, + "step": 12719 + }, + { + "epoch": 16.328626444159177, + "grad_norm": 0.8650978803634644, + "learning_rate": 2.7893025246041933e-05, + "loss": 0.4846, + "step": 12720 + }, + { + "epoch": 16.329910141206675, + "grad_norm": 0.9717085361480713, + "learning_rate": 2.78925973470261e-05, + "loss": 0.4548, + "step": 12721 + }, + { + "epoch": 16.331193838254173, + "grad_norm": 1.5491973161697388, + "learning_rate": 2.789216944801027e-05, + "loss": 0.4628, + "step": 12722 + }, + { + "epoch": 16.33247753530167, + "grad_norm": 2.8229331970214844, + "learning_rate": 2.789174154899444e-05, + "loss": 0.4705, + "step": 12723 + }, + { + "epoch": 16.333761232349165, + "grad_norm": 1.773108959197998, + "learning_rate": 2.7891313649978607e-05, + "loss": 0.5386, + "step": 12724 + }, + { + "epoch": 16.335044929396663, + "grad_norm": 2.715078115463257, + "learning_rate": 2.7890885750962772e-05, + "loss": 0.4697, + "step": 12725 + }, + { + "epoch": 16.33632862644416, + "grad_norm": 2.974881649017334, + "learning_rate": 2.7890457851946944e-05, + "loss": 0.5359, + "step": 12726 + }, + { + "epoch": 16.337612323491655, + "grad_norm": 1.5663834810256958, + "learning_rate": 2.789002995293111e-05, + "loss": 0.5146, + "step": 12727 + }, + { + "epoch": 16.338896020539153, + "grad_norm": 1.7622098922729492, + "learning_rate": 2.7889602053915274e-05, + "loss": 0.4866, + "step": 12728 + }, + { + "epoch": 16.34017971758665, + "grad_norm": 1.5401169061660767, + "learning_rate": 2.7889174154899445e-05, + "loss": 0.4724, + "step": 12729 + }, + { + "epoch": 16.341463414634145, + "grad_norm": 1.5904971361160278, + "learning_rate": 2.788874625588361e-05, + "loss": 0.4911, + "step": 12730 + }, + { + "epoch": 16.342747111681643, + "grad_norm": 1.4939210414886475, + "learning_rate": 2.7888318356867782e-05, + "loss": 0.457, + "step": 12731 + }, + { + "epoch": 16.34403080872914, + "grad_norm": 1.3262128829956055, + "learning_rate": 
2.7887890457851947e-05, + "loss": 0.4925, + "step": 12732 + }, + { + "epoch": 16.345314505776635, + "grad_norm": 1.3430562019348145, + "learning_rate": 2.7887462558836116e-05, + "loss": 0.4678, + "step": 12733 + }, + { + "epoch": 16.346598202824133, + "grad_norm": 1.1381350755691528, + "learning_rate": 2.7887034659820284e-05, + "loss": 0.4789, + "step": 12734 + }, + { + "epoch": 16.34788189987163, + "grad_norm": 1.6910309791564941, + "learning_rate": 2.788660676080445e-05, + "loss": 0.4994, + "step": 12735 + }, + { + "epoch": 16.34916559691913, + "grad_norm": 1.051872968673706, + "learning_rate": 2.7886178861788618e-05, + "loss": 0.4566, + "step": 12736 + }, + { + "epoch": 16.350449293966623, + "grad_norm": 1.6179174184799194, + "learning_rate": 2.7885750962772786e-05, + "loss": 0.4603, + "step": 12737 + }, + { + "epoch": 16.35173299101412, + "grad_norm": 2.6201844215393066, + "learning_rate": 2.7885323063756954e-05, + "loss": 0.5022, + "step": 12738 + }, + { + "epoch": 16.35301668806162, + "grad_norm": 4.093585968017578, + "learning_rate": 2.7884895164741123e-05, + "loss": 0.515, + "step": 12739 + }, + { + "epoch": 16.354300385109113, + "grad_norm": 1.2664828300476074, + "learning_rate": 2.788446726572529e-05, + "loss": 0.5335, + "step": 12740 + }, + { + "epoch": 16.35558408215661, + "grad_norm": 0.9450545907020569, + "learning_rate": 2.7884039366709456e-05, + "loss": 0.5011, + "step": 12741 + }, + { + "epoch": 16.35686777920411, + "grad_norm": 2.2018256187438965, + "learning_rate": 2.7883611467693625e-05, + "loss": 0.4773, + "step": 12742 + }, + { + "epoch": 16.358151476251603, + "grad_norm": 1.1389861106872559, + "learning_rate": 2.7883183568677793e-05, + "loss": 0.5231, + "step": 12743 + }, + { + "epoch": 16.3594351732991, + "grad_norm": 2.322685956954956, + "learning_rate": 2.7882755669661958e-05, + "loss": 0.5201, + "step": 12744 + }, + { + "epoch": 16.3607188703466, + "grad_norm": 1.191017746925354, + "learning_rate": 2.788232777064613e-05, + "loss": 0.4598, 
+ "step": 12745 + }, + { + "epoch": 16.362002567394097, + "grad_norm": 1.6200658082962036, + "learning_rate": 2.7881899871630295e-05, + "loss": 0.5072, + "step": 12746 + }, + { + "epoch": 16.36328626444159, + "grad_norm": 2.3342833518981934, + "learning_rate": 2.7881471972614467e-05, + "loss": 0.4656, + "step": 12747 + }, + { + "epoch": 16.36456996148909, + "grad_norm": 1.923439860343933, + "learning_rate": 2.7881044073598632e-05, + "loss": 0.5199, + "step": 12748 + }, + { + "epoch": 16.365853658536587, + "grad_norm": 2.5613110065460205, + "learning_rate": 2.7880616174582797e-05, + "loss": 0.5082, + "step": 12749 + }, + { + "epoch": 16.36713735558408, + "grad_norm": 2.84377384185791, + "learning_rate": 2.788018827556697e-05, + "loss": 0.4849, + "step": 12750 + }, + { + "epoch": 16.36842105263158, + "grad_norm": 1.6752222776412964, + "learning_rate": 2.7879760376551134e-05, + "loss": 0.5351, + "step": 12751 + }, + { + "epoch": 16.369704749679077, + "grad_norm": 2.2544033527374268, + "learning_rate": 2.7879332477535302e-05, + "loss": 0.5345, + "step": 12752 + }, + { + "epoch": 16.37098844672657, + "grad_norm": 1.5823029279708862, + "learning_rate": 2.787890457851947e-05, + "loss": 0.4886, + "step": 12753 + }, + { + "epoch": 16.37227214377407, + "grad_norm": 3.460836172103882, + "learning_rate": 2.787847667950364e-05, + "loss": 0.4709, + "step": 12754 + }, + { + "epoch": 16.373555840821567, + "grad_norm": 1.8843413591384888, + "learning_rate": 2.7878048780487807e-05, + "loss": 0.5258, + "step": 12755 + }, + { + "epoch": 16.374839537869065, + "grad_norm": 1.2564927339553833, + "learning_rate": 2.7877620881471972e-05, + "loss": 0.5371, + "step": 12756 + }, + { + "epoch": 16.37612323491656, + "grad_norm": 1.4467406272888184, + "learning_rate": 2.787719298245614e-05, + "loss": 0.5463, + "step": 12757 + }, + { + "epoch": 16.377406931964057, + "grad_norm": 1.2570407390594482, + "learning_rate": 2.787676508344031e-05, + "loss": 0.5233, + "step": 12758 + }, + { + "epoch": 
16.378690629011555, + "grad_norm": 3.626375436782837, + "learning_rate": 2.7876337184424477e-05, + "loss": 0.5399, + "step": 12759 + }, + { + "epoch": 16.37997432605905, + "grad_norm": 1.5421370267868042, + "learning_rate": 2.7875909285408642e-05, + "loss": 0.5927, + "step": 12760 + }, + { + "epoch": 16.381258023106547, + "grad_norm": 1.6083731651306152, + "learning_rate": 2.7875481386392814e-05, + "loss": 0.4957, + "step": 12761 + }, + { + "epoch": 16.382541720154045, + "grad_norm": 2.1854443550109863, + "learning_rate": 2.787505348737698e-05, + "loss": 0.606, + "step": 12762 + }, + { + "epoch": 16.38382541720154, + "grad_norm": 1.326930284500122, + "learning_rate": 2.7874625588361148e-05, + "loss": 0.6415, + "step": 12763 + }, + { + "epoch": 16.385109114249037, + "grad_norm": 7.716550350189209, + "learning_rate": 2.7874197689345316e-05, + "loss": 0.6871, + "step": 12764 + }, + { + "epoch": 16.386392811296535, + "grad_norm": 0.7943944931030273, + "learning_rate": 2.787376979032948e-05, + "loss": 0.4516, + "step": 12765 + }, + { + "epoch": 16.387676508344033, + "grad_norm": 1.0977816581726074, + "learning_rate": 2.7873341891313653e-05, + "loss": 0.4314, + "step": 12766 + }, + { + "epoch": 16.388960205391527, + "grad_norm": 1.1701457500457764, + "learning_rate": 2.7872913992297818e-05, + "loss": 0.4363, + "step": 12767 + }, + { + "epoch": 16.390243902439025, + "grad_norm": 1.1517460346221924, + "learning_rate": 2.7872486093281986e-05, + "loss": 0.4829, + "step": 12768 + }, + { + "epoch": 16.391527599486523, + "grad_norm": 1.4802680015563965, + "learning_rate": 2.7872058194266155e-05, + "loss": 0.4438, + "step": 12769 + }, + { + "epoch": 16.392811296534017, + "grad_norm": 1.39479660987854, + "learning_rate": 2.787163029525032e-05, + "loss": 0.4795, + "step": 12770 + }, + { + "epoch": 16.394094993581515, + "grad_norm": 0.8665237426757812, + "learning_rate": 2.787120239623449e-05, + "loss": 0.4626, + "step": 12771 + }, + { + "epoch": 16.395378690629013, + "grad_norm": 
1.1225801706314087, + "learning_rate": 2.7870774497218657e-05, + "loss": 0.4589, + "step": 12772 + }, + { + "epoch": 16.396662387676507, + "grad_norm": 1.0761487483978271, + "learning_rate": 2.7870346598202825e-05, + "loss": 0.4866, + "step": 12773 + }, + { + "epoch": 16.397946084724005, + "grad_norm": 1.3986197710037231, + "learning_rate": 2.7869918699186993e-05, + "loss": 0.4606, + "step": 12774 + }, + { + "epoch": 16.399229781771503, + "grad_norm": 0.9685918688774109, + "learning_rate": 2.786949080017116e-05, + "loss": 0.4665, + "step": 12775 + }, + { + "epoch": 16.400513478818997, + "grad_norm": 0.9827427864074707, + "learning_rate": 2.7869062901155327e-05, + "loss": 0.4558, + "step": 12776 + }, + { + "epoch": 16.401797175866495, + "grad_norm": 1.2771835327148438, + "learning_rate": 2.7868635002139495e-05, + "loss": 0.4863, + "step": 12777 + }, + { + "epoch": 16.403080872913993, + "grad_norm": 1.0346506834030151, + "learning_rate": 2.7868207103123664e-05, + "loss": 0.4795, + "step": 12778 + }, + { + "epoch": 16.40436456996149, + "grad_norm": 1.0428047180175781, + "learning_rate": 2.7867779204107832e-05, + "loss": 0.4696, + "step": 12779 + }, + { + "epoch": 16.405648267008985, + "grad_norm": 7.382641792297363, + "learning_rate": 2.7867351305092e-05, + "loss": 0.4799, + "step": 12780 + }, + { + "epoch": 16.406931964056483, + "grad_norm": 2.225956678390503, + "learning_rate": 2.7866923406076166e-05, + "loss": 0.4884, + "step": 12781 + }, + { + "epoch": 16.40821566110398, + "grad_norm": 1.4200236797332764, + "learning_rate": 2.7866495507060334e-05, + "loss": 0.5193, + "step": 12782 + }, + { + "epoch": 16.409499358151475, + "grad_norm": 1.2924741506576538, + "learning_rate": 2.7866067608044502e-05, + "loss": 0.4566, + "step": 12783 + }, + { + "epoch": 16.410783055198973, + "grad_norm": 1.08016037940979, + "learning_rate": 2.7865639709028667e-05, + "loss": 0.4462, + "step": 12784 + }, + { + "epoch": 16.41206675224647, + "grad_norm": 1.024835228919983, + 
"learning_rate": 2.786521181001284e-05, + "loss": 0.4444, + "step": 12785 + }, + { + "epoch": 16.413350449293965, + "grad_norm": 1.0301628112792969, + "learning_rate": 2.7864783910997004e-05, + "loss": 0.4804, + "step": 12786 + }, + { + "epoch": 16.414634146341463, + "grad_norm": 1.490944504737854, + "learning_rate": 2.7864356011981176e-05, + "loss": 0.5545, + "step": 12787 + }, + { + "epoch": 16.41591784338896, + "grad_norm": 1.2813441753387451, + "learning_rate": 2.786392811296534e-05, + "loss": 0.4926, + "step": 12788 + }, + { + "epoch": 16.41720154043646, + "grad_norm": 2.1410906314849854, + "learning_rate": 2.7863500213949506e-05, + "loss": 0.524, + "step": 12789 + }, + { + "epoch": 16.418485237483953, + "grad_norm": 1.7814924716949463, + "learning_rate": 2.7863072314933678e-05, + "loss": 0.4558, + "step": 12790 + }, + { + "epoch": 16.41976893453145, + "grad_norm": 6.65915060043335, + "learning_rate": 2.7862644415917843e-05, + "loss": 0.5177, + "step": 12791 + }, + { + "epoch": 16.42105263157895, + "grad_norm": 1.2633495330810547, + "learning_rate": 2.786221651690201e-05, + "loss": 0.485, + "step": 12792 + }, + { + "epoch": 16.422336328626443, + "grad_norm": 1.1398859024047852, + "learning_rate": 2.786178861788618e-05, + "loss": 0.4507, + "step": 12793 + }, + { + "epoch": 16.42362002567394, + "grad_norm": 1.4211655855178833, + "learning_rate": 2.7861360718870348e-05, + "loss": 0.5344, + "step": 12794 + }, + { + "epoch": 16.42490372272144, + "grad_norm": 1.1379200220108032, + "learning_rate": 2.7860932819854517e-05, + "loss": 0.4751, + "step": 12795 + }, + { + "epoch": 16.426187419768933, + "grad_norm": 1.715160608291626, + "learning_rate": 2.786050492083868e-05, + "loss": 0.5137, + "step": 12796 + }, + { + "epoch": 16.42747111681643, + "grad_norm": 1.5112308263778687, + "learning_rate": 2.786007702182285e-05, + "loss": 0.5274, + "step": 12797 + }, + { + "epoch": 16.42875481386393, + "grad_norm": 1.2405675649642944, + "learning_rate": 2.785964912280702e-05, + 
"loss": 0.5793, + "step": 12798 + }, + { + "epoch": 16.430038510911427, + "grad_norm": 1.2193374633789062, + "learning_rate": 2.7859221223791187e-05, + "loss": 0.5071, + "step": 12799 + }, + { + "epoch": 16.43132220795892, + "grad_norm": 2.6592862606048584, + "learning_rate": 2.7858793324775352e-05, + "loss": 0.4951, + "step": 12800 + }, + { + "epoch": 16.43260590500642, + "grad_norm": 1.2485418319702148, + "learning_rate": 2.7858365425759524e-05, + "loss": 0.4969, + "step": 12801 + }, + { + "epoch": 16.433889602053917, + "grad_norm": 4.918032169342041, + "learning_rate": 2.785793752674369e-05, + "loss": 0.5261, + "step": 12802 + }, + { + "epoch": 16.43517329910141, + "grad_norm": 1.5187267065048218, + "learning_rate": 2.7857509627727857e-05, + "loss": 0.5369, + "step": 12803 + }, + { + "epoch": 16.43645699614891, + "grad_norm": 2.018282651901245, + "learning_rate": 2.7857081728712025e-05, + "loss": 0.5294, + "step": 12804 + }, + { + "epoch": 16.437740693196407, + "grad_norm": 1.294893503189087, + "learning_rate": 2.785665382969619e-05, + "loss": 0.5155, + "step": 12805 + }, + { + "epoch": 16.4390243902439, + "grad_norm": 1.873927116394043, + "learning_rate": 2.7856225930680362e-05, + "loss": 0.5065, + "step": 12806 + }, + { + "epoch": 16.4403080872914, + "grad_norm": 7.863803863525391, + "learning_rate": 2.7855798031664527e-05, + "loss": 0.5268, + "step": 12807 + }, + { + "epoch": 16.441591784338897, + "grad_norm": 2.160217046737671, + "learning_rate": 2.7855370132648696e-05, + "loss": 0.4837, + "step": 12808 + }, + { + "epoch": 16.44287548138639, + "grad_norm": 2.071925401687622, + "learning_rate": 2.7854942233632864e-05, + "loss": 0.5394, + "step": 12809 + }, + { + "epoch": 16.44415917843389, + "grad_norm": 1.55045747756958, + "learning_rate": 2.785451433461703e-05, + "loss": 0.5659, + "step": 12810 + }, + { + "epoch": 16.445442875481387, + "grad_norm": 2.402881145477295, + "learning_rate": 2.78540864356012e-05, + "loss": 0.5345, + "step": 12811 + }, + { + 
"epoch": 16.446726572528885, + "grad_norm": 2.778956174850464, + "learning_rate": 2.7853658536585366e-05, + "loss": 0.5735, + "step": 12812 + }, + { + "epoch": 16.44801026957638, + "grad_norm": 3.19507098197937, + "learning_rate": 2.7853230637569534e-05, + "loss": 0.666, + "step": 12813 + }, + { + "epoch": 16.449293966623877, + "grad_norm": 2.155186176300049, + "learning_rate": 2.7852802738553703e-05, + "loss": 0.6866, + "step": 12814 + }, + { + "epoch": 16.450577663671375, + "grad_norm": 0.7816353440284729, + "learning_rate": 2.785237483953787e-05, + "loss": 0.4477, + "step": 12815 + }, + { + "epoch": 16.45186136071887, + "grad_norm": 1.3002326488494873, + "learning_rate": 2.7851946940522036e-05, + "loss": 0.4728, + "step": 12816 + }, + { + "epoch": 16.453145057766367, + "grad_norm": 1.1836780309677124, + "learning_rate": 2.7851519041506205e-05, + "loss": 0.4998, + "step": 12817 + }, + { + "epoch": 16.454428754813865, + "grad_norm": 1.547086238861084, + "learning_rate": 2.7851091142490373e-05, + "loss": 0.5003, + "step": 12818 + }, + { + "epoch": 16.45571245186136, + "grad_norm": 1.251775860786438, + "learning_rate": 2.785066324347454e-05, + "loss": 0.4804, + "step": 12819 + }, + { + "epoch": 16.456996148908857, + "grad_norm": 1.396807074546814, + "learning_rate": 2.785023534445871e-05, + "loss": 0.4813, + "step": 12820 + }, + { + "epoch": 16.458279845956355, + "grad_norm": 2.7085609436035156, + "learning_rate": 2.7849807445442875e-05, + "loss": 0.4699, + "step": 12821 + }, + { + "epoch": 16.459563543003853, + "grad_norm": 1.0200921297073364, + "learning_rate": 2.7849379546427047e-05, + "loss": 0.5312, + "step": 12822 + }, + { + "epoch": 16.460847240051347, + "grad_norm": 1.3908036947250366, + "learning_rate": 2.7848951647411212e-05, + "loss": 0.4489, + "step": 12823 + }, + { + "epoch": 16.462130937098845, + "grad_norm": 1.1360561847686768, + "learning_rate": 2.7848523748395377e-05, + "loss": 0.4451, + "step": 12824 + }, + { + "epoch": 16.463414634146343, + 
"grad_norm": 1.1217803955078125, + "learning_rate": 2.784809584937955e-05, + "loss": 0.5097, + "step": 12825 + }, + { + "epoch": 16.464698331193837, + "grad_norm": 1.3153231143951416, + "learning_rate": 2.7847667950363714e-05, + "loss": 0.4891, + "step": 12826 + }, + { + "epoch": 16.465982028241335, + "grad_norm": 1.0789273977279663, + "learning_rate": 2.7847240051347885e-05, + "loss": 0.4751, + "step": 12827 + }, + { + "epoch": 16.467265725288833, + "grad_norm": 1.0969475507736206, + "learning_rate": 2.784681215233205e-05, + "loss": 0.4716, + "step": 12828 + }, + { + "epoch": 16.468549422336327, + "grad_norm": 1.8395854234695435, + "learning_rate": 2.784638425331622e-05, + "loss": 0.4557, + "step": 12829 + }, + { + "epoch": 16.469833119383825, + "grad_norm": 1.156766653060913, + "learning_rate": 2.7845956354300387e-05, + "loss": 0.4725, + "step": 12830 + }, + { + "epoch": 16.471116816431323, + "grad_norm": 1.028955101966858, + "learning_rate": 2.7845528455284552e-05, + "loss": 0.4527, + "step": 12831 + }, + { + "epoch": 16.47240051347882, + "grad_norm": 1.1276180744171143, + "learning_rate": 2.784510055626872e-05, + "loss": 0.4697, + "step": 12832 + }, + { + "epoch": 16.473684210526315, + "grad_norm": 0.9500848650932312, + "learning_rate": 2.784467265725289e-05, + "loss": 0.4452, + "step": 12833 + }, + { + "epoch": 16.474967907573813, + "grad_norm": 1.0948567390441895, + "learning_rate": 2.7844244758237058e-05, + "loss": 0.4509, + "step": 12834 + }, + { + "epoch": 16.47625160462131, + "grad_norm": 1.9159178733825684, + "learning_rate": 2.7843816859221226e-05, + "loss": 0.523, + "step": 12835 + }, + { + "epoch": 16.477535301668805, + "grad_norm": 1.0935394763946533, + "learning_rate": 2.784338896020539e-05, + "loss": 0.4971, + "step": 12836 + }, + { + "epoch": 16.478818998716303, + "grad_norm": 0.7553435564041138, + "learning_rate": 2.784296106118956e-05, + "loss": 0.4714, + "step": 12837 + }, + { + "epoch": 16.4801026957638, + "grad_norm": 0.9456979036331177, + 
"learning_rate": 2.7842533162173728e-05, + "loss": 0.5076, + "step": 12838 + }, + { + "epoch": 16.481386392811295, + "grad_norm": 1.4757388830184937, + "learning_rate": 2.7842105263157896e-05, + "loss": 0.4579, + "step": 12839 + }, + { + "epoch": 16.482670089858793, + "grad_norm": 1.6630207300186157, + "learning_rate": 2.784167736414206e-05, + "loss": 0.4946, + "step": 12840 + }, + { + "epoch": 16.48395378690629, + "grad_norm": 1.131354570388794, + "learning_rate": 2.7841249465126233e-05, + "loss": 0.4441, + "step": 12841 + }, + { + "epoch": 16.485237483953785, + "grad_norm": 1.6505154371261597, + "learning_rate": 2.7840821566110398e-05, + "loss": 0.4895, + "step": 12842 + }, + { + "epoch": 16.486521181001283, + "grad_norm": 1.6756689548492432, + "learning_rate": 2.7840393667094566e-05, + "loss": 0.4664, + "step": 12843 + }, + { + "epoch": 16.48780487804878, + "grad_norm": 1.7921768426895142, + "learning_rate": 2.7839965768078735e-05, + "loss": 0.4556, + "step": 12844 + }, + { + "epoch": 16.48908857509628, + "grad_norm": 1.0049737691879272, + "learning_rate": 2.78395378690629e-05, + "loss": 0.483, + "step": 12845 + }, + { + "epoch": 16.490372272143773, + "grad_norm": 1.3839232921600342, + "learning_rate": 2.783910997004707e-05, + "loss": 0.4516, + "step": 12846 + }, + { + "epoch": 16.49165596919127, + "grad_norm": 1.4355497360229492, + "learning_rate": 2.7838682071031237e-05, + "loss": 0.4872, + "step": 12847 + }, + { + "epoch": 16.49293966623877, + "grad_norm": 0.885701060295105, + "learning_rate": 2.7838254172015405e-05, + "loss": 0.4809, + "step": 12848 + }, + { + "epoch": 16.494223363286263, + "grad_norm": 1.4287700653076172, + "learning_rate": 2.7837826272999574e-05, + "loss": 0.5237, + "step": 12849 + }, + { + "epoch": 16.49550706033376, + "grad_norm": 2.1474075317382812, + "learning_rate": 2.783739837398374e-05, + "loss": 0.5073, + "step": 12850 + }, + { + "epoch": 16.49679075738126, + "grad_norm": 1.7592002153396606, + "learning_rate": 
2.783697047496791e-05, + "loss": 0.5111, + "step": 12851 + }, + { + "epoch": 16.498074454428753, + "grad_norm": 1.303357481956482, + "learning_rate": 2.7836542575952075e-05, + "loss": 0.4838, + "step": 12852 + }, + { + "epoch": 16.49935815147625, + "grad_norm": 2.45816969871521, + "learning_rate": 2.7836114676936244e-05, + "loss": 0.4741, + "step": 12853 + }, + { + "epoch": 16.50064184852375, + "grad_norm": 1.2392231225967407, + "learning_rate": 2.7835686777920412e-05, + "loss": 0.4678, + "step": 12854 + }, + { + "epoch": 16.501925545571247, + "grad_norm": 2.196871280670166, + "learning_rate": 2.783525887890458e-05, + "loss": 0.5218, + "step": 12855 + }, + { + "epoch": 16.50320924261874, + "grad_norm": 1.6209510564804077, + "learning_rate": 2.7834830979888746e-05, + "loss": 0.5857, + "step": 12856 + }, + { + "epoch": 16.50449293966624, + "grad_norm": 1.3694589138031006, + "learning_rate": 2.7834403080872914e-05, + "loss": 0.5055, + "step": 12857 + }, + { + "epoch": 16.505776636713737, + "grad_norm": 2.8750290870666504, + "learning_rate": 2.7833975181857082e-05, + "loss": 0.5057, + "step": 12858 + }, + { + "epoch": 16.50706033376123, + "grad_norm": 2.1255385875701904, + "learning_rate": 2.783354728284125e-05, + "loss": 0.4939, + "step": 12859 + }, + { + "epoch": 16.50834403080873, + "grad_norm": 1.9245877265930176, + "learning_rate": 2.783311938382542e-05, + "loss": 0.5012, + "step": 12860 + }, + { + "epoch": 16.509627727856227, + "grad_norm": 3.530158042907715, + "learning_rate": 2.7832691484809584e-05, + "loss": 0.5707, + "step": 12861 + }, + { + "epoch": 16.51091142490372, + "grad_norm": 1.9007329940795898, + "learning_rate": 2.7832263585793756e-05, + "loss": 0.5366, + "step": 12862 + }, + { + "epoch": 16.51219512195122, + "grad_norm": 2.386599063873291, + "learning_rate": 2.783183568677792e-05, + "loss": 0.6, + "step": 12863 + }, + { + "epoch": 16.513478818998717, + "grad_norm": 2.781944513320923, + "learning_rate": 2.7831407787762086e-05, + "loss": 0.7242, + 
"step": 12864 + }, + { + "epoch": 16.514762516046215, + "grad_norm": 1.078267216682434, + "learning_rate": 2.7830979888746258e-05, + "loss": 0.4798, + "step": 12865 + }, + { + "epoch": 16.51604621309371, + "grad_norm": 0.8301724195480347, + "learning_rate": 2.7830551989730423e-05, + "loss": 0.4533, + "step": 12866 + }, + { + "epoch": 16.517329910141207, + "grad_norm": 1.0611709356307983, + "learning_rate": 2.783012409071459e-05, + "loss": 0.4777, + "step": 12867 + }, + { + "epoch": 16.518613607188705, + "grad_norm": 1.1578989028930664, + "learning_rate": 2.782969619169876e-05, + "loss": 0.4997, + "step": 12868 + }, + { + "epoch": 16.5198973042362, + "grad_norm": 0.7784866094589233, + "learning_rate": 2.7829268292682928e-05, + "loss": 0.4519, + "step": 12869 + }, + { + "epoch": 16.521181001283697, + "grad_norm": 1.4767249822616577, + "learning_rate": 2.7828840393667097e-05, + "loss": 0.4762, + "step": 12870 + }, + { + "epoch": 16.522464698331195, + "grad_norm": 1.3026938438415527, + "learning_rate": 2.782841249465126e-05, + "loss": 0.4916, + "step": 12871 + }, + { + "epoch": 16.52374839537869, + "grad_norm": 0.908921480178833, + "learning_rate": 2.782798459563543e-05, + "loss": 0.4999, + "step": 12872 + }, + { + "epoch": 16.525032092426187, + "grad_norm": 1.598488688468933, + "learning_rate": 2.78275566966196e-05, + "loss": 0.4848, + "step": 12873 + }, + { + "epoch": 16.526315789473685, + "grad_norm": 1.237417221069336, + "learning_rate": 2.7827128797603767e-05, + "loss": 0.4932, + "step": 12874 + }, + { + "epoch": 16.527599486521183, + "grad_norm": 0.8270008563995361, + "learning_rate": 2.7826700898587932e-05, + "loss": 0.453, + "step": 12875 + }, + { + "epoch": 16.528883183568677, + "grad_norm": 1.622238039970398, + "learning_rate": 2.7826272999572104e-05, + "loss": 0.5377, + "step": 12876 + }, + { + "epoch": 16.530166880616175, + "grad_norm": 1.2209426164627075, + "learning_rate": 2.782584510055627e-05, + "loss": 0.4926, + "step": 12877 + }, + { + "epoch": 
16.531450577663673, + "grad_norm": 1.2336020469665527, + "learning_rate": 2.7825417201540437e-05, + "loss": 0.4848, + "step": 12878 + }, + { + "epoch": 16.532734274711167, + "grad_norm": 1.34903085231781, + "learning_rate": 2.7824989302524606e-05, + "loss": 0.5318, + "step": 12879 + }, + { + "epoch": 16.534017971758665, + "grad_norm": 1.2591028213500977, + "learning_rate": 2.782456140350877e-05, + "loss": 0.4529, + "step": 12880 + }, + { + "epoch": 16.535301668806163, + "grad_norm": 1.2549747228622437, + "learning_rate": 2.7824133504492942e-05, + "loss": 0.5127, + "step": 12881 + }, + { + "epoch": 16.536585365853657, + "grad_norm": 1.3046127557754517, + "learning_rate": 2.7823705605477107e-05, + "loss": 0.5025, + "step": 12882 + }, + { + "epoch": 16.537869062901155, + "grad_norm": 2.2967886924743652, + "learning_rate": 2.7823277706461276e-05, + "loss": 0.4756, + "step": 12883 + }, + { + "epoch": 16.539152759948653, + "grad_norm": 1.096856713294983, + "learning_rate": 2.7822849807445444e-05, + "loss": 0.4828, + "step": 12884 + }, + { + "epoch": 16.540436456996147, + "grad_norm": 1.442988634109497, + "learning_rate": 2.782242190842961e-05, + "loss": 0.485, + "step": 12885 + }, + { + "epoch": 16.541720154043645, + "grad_norm": 1.1296286582946777, + "learning_rate": 2.782199400941378e-05, + "loss": 0.5233, + "step": 12886 + }, + { + "epoch": 16.543003851091143, + "grad_norm": 1.7121357917785645, + "learning_rate": 2.7821566110397946e-05, + "loss": 0.5121, + "step": 12887 + }, + { + "epoch": 16.54428754813864, + "grad_norm": 1.1032218933105469, + "learning_rate": 2.7821138211382114e-05, + "loss": 0.4294, + "step": 12888 + }, + { + "epoch": 16.545571245186135, + "grad_norm": 1.1793392896652222, + "learning_rate": 2.7820710312366283e-05, + "loss": 0.4591, + "step": 12889 + }, + { + "epoch": 16.546854942233633, + "grad_norm": 1.2718615531921387, + "learning_rate": 2.782028241335045e-05, + "loss": 0.4875, + "step": 12890 + }, + { + "epoch": 16.54813863928113, + "grad_norm": 
4.1927103996276855, + "learning_rate": 2.7819854514334616e-05, + "loss": 0.4515, + "step": 12891 + }, + { + "epoch": 16.549422336328625, + "grad_norm": 1.9682682752609253, + "learning_rate": 2.7819426615318785e-05, + "loss": 0.4623, + "step": 12892 + }, + { + "epoch": 16.550706033376123, + "grad_norm": 3.0139265060424805, + "learning_rate": 2.7818998716302953e-05, + "loss": 0.4478, + "step": 12893 + }, + { + "epoch": 16.55198973042362, + "grad_norm": 1.1403435468673706, + "learning_rate": 2.781857081728712e-05, + "loss": 0.4767, + "step": 12894 + }, + { + "epoch": 16.553273427471115, + "grad_norm": 1.347765564918518, + "learning_rate": 2.781814291827129e-05, + "loss": 0.5442, + "step": 12895 + }, + { + "epoch": 16.554557124518613, + "grad_norm": 2.0575358867645264, + "learning_rate": 2.7817715019255455e-05, + "loss": 0.5438, + "step": 12896 + }, + { + "epoch": 16.55584082156611, + "grad_norm": 2.9419147968292236, + "learning_rate": 2.7817287120239623e-05, + "loss": 0.4644, + "step": 12897 + }, + { + "epoch": 16.55712451861361, + "grad_norm": 1.6652674674987793, + "learning_rate": 2.7816859221223792e-05, + "loss": 0.51, + "step": 12898 + }, + { + "epoch": 16.558408215661103, + "grad_norm": 2.416995048522949, + "learning_rate": 2.7816431322207957e-05, + "loss": 0.4994, + "step": 12899 + }, + { + "epoch": 16.5596919127086, + "grad_norm": 1.7656512260437012, + "learning_rate": 2.781600342319213e-05, + "loss": 0.4412, + "step": 12900 + }, + { + "epoch": 16.5609756097561, + "grad_norm": 0.8871029615402222, + "learning_rate": 2.7815575524176294e-05, + "loss": 0.5556, + "step": 12901 + }, + { + "epoch": 16.562259306803593, + "grad_norm": 2.4897327423095703, + "learning_rate": 2.7815147625160465e-05, + "loss": 0.4596, + "step": 12902 + }, + { + "epoch": 16.56354300385109, + "grad_norm": 1.9017153978347778, + "learning_rate": 2.781471972614463e-05, + "loss": 0.4994, + "step": 12903 + }, + { + "epoch": 16.56482670089859, + "grad_norm": 1.9442131519317627, + "learning_rate": 
2.7814291827128796e-05, + "loss": 0.5207, + "step": 12904 + }, + { + "epoch": 16.566110397946083, + "grad_norm": 1.1795634031295776, + "learning_rate": 2.7813863928112967e-05, + "loss": 0.4929, + "step": 12905 + }, + { + "epoch": 16.56739409499358, + "grad_norm": 4.661215782165527, + "learning_rate": 2.7813436029097132e-05, + "loss": 0.5165, + "step": 12906 + }, + { + "epoch": 16.56867779204108, + "grad_norm": 2.8698039054870605, + "learning_rate": 2.78130081300813e-05, + "loss": 0.5068, + "step": 12907 + }, + { + "epoch": 16.569961489088577, + "grad_norm": 1.8036210536956787, + "learning_rate": 2.781258023106547e-05, + "loss": 0.5164, + "step": 12908 + }, + { + "epoch": 16.57124518613607, + "grad_norm": 4.07652473449707, + "learning_rate": 2.7812152332049638e-05, + "loss": 0.52, + "step": 12909 + }, + { + "epoch": 16.57252888318357, + "grad_norm": 1.7250288724899292, + "learning_rate": 2.7811724433033806e-05, + "loss": 0.5226, + "step": 12910 + }, + { + "epoch": 16.573812580231067, + "grad_norm": 3.9778056144714355, + "learning_rate": 2.781129653401797e-05, + "loss": 0.5075, + "step": 12911 + }, + { + "epoch": 16.57509627727856, + "grad_norm": 2.0611743927001953, + "learning_rate": 2.781086863500214e-05, + "loss": 0.5754, + "step": 12912 + }, + { + "epoch": 16.57637997432606, + "grad_norm": 2.287020444869995, + "learning_rate": 2.7810440735986308e-05, + "loss": 0.6188, + "step": 12913 + }, + { + "epoch": 16.577663671373557, + "grad_norm": 2.211845874786377, + "learning_rate": 2.7810012836970476e-05, + "loss": 0.6463, + "step": 12914 + }, + { + "epoch": 16.57894736842105, + "grad_norm": 2.368528366088867, + "learning_rate": 2.780958493795464e-05, + "loss": 0.465, + "step": 12915 + }, + { + "epoch": 16.58023106546855, + "grad_norm": 1.944891095161438, + "learning_rate": 2.7809157038938813e-05, + "loss": 0.4684, + "step": 12916 + }, + { + "epoch": 16.581514762516047, + "grad_norm": 3.154341220855713, + "learning_rate": 2.7808729139922978e-05, + "loss": 0.4451, + 
"step": 12917 + }, + { + "epoch": 16.58279845956354, + "grad_norm": 1.7278034687042236, + "learning_rate": 2.7808301240907147e-05, + "loss": 0.4811, + "step": 12918 + }, + { + "epoch": 16.58408215661104, + "grad_norm": 1.500151515007019, + "learning_rate": 2.7807873341891315e-05, + "loss": 0.4631, + "step": 12919 + }, + { + "epoch": 16.585365853658537, + "grad_norm": 1.3882571458816528, + "learning_rate": 2.780744544287548e-05, + "loss": 0.4608, + "step": 12920 + }, + { + "epoch": 16.586649550706035, + "grad_norm": 1.1124590635299683, + "learning_rate": 2.7807017543859652e-05, + "loss": 0.4597, + "step": 12921 + }, + { + "epoch": 16.58793324775353, + "grad_norm": 1.2841098308563232, + "learning_rate": 2.7806589644843817e-05, + "loss": 0.5123, + "step": 12922 + }, + { + "epoch": 16.589216944801027, + "grad_norm": 1.9519284963607788, + "learning_rate": 2.7806161745827985e-05, + "loss": 0.5113, + "step": 12923 + }, + { + "epoch": 16.590500641848525, + "grad_norm": 1.1248692274093628, + "learning_rate": 2.7805733846812154e-05, + "loss": 0.4572, + "step": 12924 + }, + { + "epoch": 16.59178433889602, + "grad_norm": 0.713057816028595, + "learning_rate": 2.780530594779632e-05, + "loss": 0.4731, + "step": 12925 + }, + { + "epoch": 16.593068035943517, + "grad_norm": 1.497048020362854, + "learning_rate": 2.780487804878049e-05, + "loss": 0.458, + "step": 12926 + }, + { + "epoch": 16.594351732991015, + "grad_norm": 0.9498214721679688, + "learning_rate": 2.7804450149764655e-05, + "loss": 0.4821, + "step": 12927 + }, + { + "epoch": 16.59563543003851, + "grad_norm": 2.3706655502319336, + "learning_rate": 2.7804022250748824e-05, + "loss": 0.4699, + "step": 12928 + }, + { + "epoch": 16.596919127086007, + "grad_norm": 1.5046117305755615, + "learning_rate": 2.7803594351732992e-05, + "loss": 0.4373, + "step": 12929 + }, + { + "epoch": 16.598202824133505, + "grad_norm": 1.902351975440979, + "learning_rate": 2.780316645271716e-05, + "loss": 0.5471, + "step": 12930 + }, + { + "epoch": 
16.599486521181003, + "grad_norm": 1.3897347450256348, + "learning_rate": 2.7802738553701326e-05, + "loss": 0.4677, + "step": 12931 + }, + { + "epoch": 16.600770218228497, + "grad_norm": 1.998605728149414, + "learning_rate": 2.7802310654685494e-05, + "loss": 0.4515, + "step": 12932 + }, + { + "epoch": 16.602053915275995, + "grad_norm": 0.8708691000938416, + "learning_rate": 2.7801882755669663e-05, + "loss": 0.456, + "step": 12933 + }, + { + "epoch": 16.603337612323493, + "grad_norm": 3.506875514984131, + "learning_rate": 2.780145485665383e-05, + "loss": 0.5063, + "step": 12934 + }, + { + "epoch": 16.604621309370987, + "grad_norm": 2.105347156524658, + "learning_rate": 2.7801026957638e-05, + "loss": 0.4806, + "step": 12935 + }, + { + "epoch": 16.605905006418485, + "grad_norm": 13.214434623718262, + "learning_rate": 2.7800599058622164e-05, + "loss": 0.4609, + "step": 12936 + }, + { + "epoch": 16.607188703465983, + "grad_norm": 1.1436530351638794, + "learning_rate": 2.7800171159606336e-05, + "loss": 0.4554, + "step": 12937 + }, + { + "epoch": 16.608472400513477, + "grad_norm": 6.655704021453857, + "learning_rate": 2.77997432605905e-05, + "loss": 0.4673, + "step": 12938 + }, + { + "epoch": 16.609756097560975, + "grad_norm": 1.1013860702514648, + "learning_rate": 2.7799315361574666e-05, + "loss": 0.4867, + "step": 12939 + }, + { + "epoch": 16.611039794608473, + "grad_norm": 8.57442855834961, + "learning_rate": 2.7798887462558838e-05, + "loss": 0.4566, + "step": 12940 + }, + { + "epoch": 16.61232349165597, + "grad_norm": 1.3803397417068481, + "learning_rate": 2.7798459563543003e-05, + "loss": 0.4824, + "step": 12941 + }, + { + "epoch": 16.613607188703465, + "grad_norm": 1.3940905332565308, + "learning_rate": 2.7798031664527175e-05, + "loss": 0.449, + "step": 12942 + }, + { + "epoch": 16.614890885750963, + "grad_norm": 1.5629024505615234, + "learning_rate": 2.779760376551134e-05, + "loss": 0.4766, + "step": 12943 + }, + { + "epoch": 16.61617458279846, + "grad_norm": 
2.6843526363372803, + "learning_rate": 2.7797175866495508e-05, + "loss": 0.5379, + "step": 12944 + }, + { + "epoch": 16.617458279845955, + "grad_norm": 2.428826332092285, + "learning_rate": 2.7796747967479677e-05, + "loss": 0.4258, + "step": 12945 + }, + { + "epoch": 16.618741976893453, + "grad_norm": 0.8591152429580688, + "learning_rate": 2.7796320068463842e-05, + "loss": 0.5052, + "step": 12946 + }, + { + "epoch": 16.62002567394095, + "grad_norm": 2.9183709621429443, + "learning_rate": 2.779589216944801e-05, + "loss": 0.4814, + "step": 12947 + }, + { + "epoch": 16.621309370988445, + "grad_norm": 1.610757827758789, + "learning_rate": 2.779546427043218e-05, + "loss": 0.5194, + "step": 12948 + }, + { + "epoch": 16.622593068035943, + "grad_norm": 1.5546327829360962, + "learning_rate": 2.7795036371416347e-05, + "loss": 0.5058, + "step": 12949 + }, + { + "epoch": 16.62387676508344, + "grad_norm": 2.871882200241089, + "learning_rate": 2.7794608472400515e-05, + "loss": 0.46, + "step": 12950 + }, + { + "epoch": 16.625160462130935, + "grad_norm": 2.043928861618042, + "learning_rate": 2.7794180573384684e-05, + "loss": 0.4768, + "step": 12951 + }, + { + "epoch": 16.626444159178433, + "grad_norm": 1.504122257232666, + "learning_rate": 2.779375267436885e-05, + "loss": 0.4952, + "step": 12952 + }, + { + "epoch": 16.62772785622593, + "grad_norm": 1.4145227670669556, + "learning_rate": 2.7793324775353017e-05, + "loss": 0.5078, + "step": 12953 + }, + { + "epoch": 16.62901155327343, + "grad_norm": 1.8706390857696533, + "learning_rate": 2.7792896876337186e-05, + "loss": 0.5386, + "step": 12954 + }, + { + "epoch": 16.630295250320923, + "grad_norm": 3.889359712600708, + "learning_rate": 2.779246897732135e-05, + "loss": 0.4782, + "step": 12955 + }, + { + "epoch": 16.63157894736842, + "grad_norm": 2.1042585372924805, + "learning_rate": 2.7792041078305522e-05, + "loss": 0.5715, + "step": 12956 + }, + { + "epoch": 16.63286264441592, + "grad_norm": 1.902073621749878, + "learning_rate": 
2.7791613179289687e-05, + "loss": 0.5151, + "step": 12957 + }, + { + "epoch": 16.634146341463413, + "grad_norm": 3.045168399810791, + "learning_rate": 2.7791185280273856e-05, + "loss": 0.5278, + "step": 12958 + }, + { + "epoch": 16.63543003851091, + "grad_norm": 2.4020495414733887, + "learning_rate": 2.7790757381258024e-05, + "loss": 0.4924, + "step": 12959 + }, + { + "epoch": 16.63671373555841, + "grad_norm": 1.804465413093567, + "learning_rate": 2.779032948224219e-05, + "loss": 0.5901, + "step": 12960 + }, + { + "epoch": 16.637997432605903, + "grad_norm": 1.8063229322433472, + "learning_rate": 2.778990158322636e-05, + "loss": 0.5585, + "step": 12961 + }, + { + "epoch": 16.6392811296534, + "grad_norm": 2.380974769592285, + "learning_rate": 2.7789473684210526e-05, + "loss": 0.5466, + "step": 12962 + }, + { + "epoch": 16.6405648267009, + "grad_norm": 3.5602850914001465, + "learning_rate": 2.7789045785194695e-05, + "loss": 0.5606, + "step": 12963 + }, + { + "epoch": 16.641848523748397, + "grad_norm": 1.8760827779769897, + "learning_rate": 2.7788617886178863e-05, + "loss": 0.7874, + "step": 12964 + }, + { + "epoch": 16.64313222079589, + "grad_norm": 0.9247428178787231, + "learning_rate": 2.7788189987163028e-05, + "loss": 0.4277, + "step": 12965 + }, + { + "epoch": 16.64441591784339, + "grad_norm": 1.1133151054382324, + "learning_rate": 2.77877620881472e-05, + "loss": 0.4478, + "step": 12966 + }, + { + "epoch": 16.645699614890887, + "grad_norm": 1.343202829360962, + "learning_rate": 2.7787334189131365e-05, + "loss": 0.4307, + "step": 12967 + }, + { + "epoch": 16.64698331193838, + "grad_norm": 1.2703495025634766, + "learning_rate": 2.7786906290115533e-05, + "loss": 0.4739, + "step": 12968 + }, + { + "epoch": 16.64826700898588, + "grad_norm": 3.8208818435668945, + "learning_rate": 2.77864783910997e-05, + "loss": 0.5151, + "step": 12969 + }, + { + "epoch": 16.649550706033377, + "grad_norm": 1.4116593599319458, + "learning_rate": 2.778605049208387e-05, + "loss": 0.4618, + 
"step": 12970 + }, + { + "epoch": 16.65083440308087, + "grad_norm": 0.8760963082313538, + "learning_rate": 2.7785622593068035e-05, + "loss": 0.4619, + "step": 12971 + }, + { + "epoch": 16.65211810012837, + "grad_norm": 1.111344337463379, + "learning_rate": 2.7785194694052203e-05, + "loss": 0.4604, + "step": 12972 + }, + { + "epoch": 16.653401797175867, + "grad_norm": 1.5886424779891968, + "learning_rate": 2.7784766795036372e-05, + "loss": 0.4703, + "step": 12973 + }, + { + "epoch": 16.654685494223365, + "grad_norm": 1.1714810132980347, + "learning_rate": 2.778433889602054e-05, + "loss": 0.4554, + "step": 12974 + }, + { + "epoch": 16.65596919127086, + "grad_norm": 1.4416468143463135, + "learning_rate": 2.778391099700471e-05, + "loss": 0.4927, + "step": 12975 + }, + { + "epoch": 16.657252888318357, + "grad_norm": 0.8853751420974731, + "learning_rate": 2.7783483097988874e-05, + "loss": 0.515, + "step": 12976 + }, + { + "epoch": 16.658536585365855, + "grad_norm": 1.0140372514724731, + "learning_rate": 2.7783055198973046e-05, + "loss": 0.4734, + "step": 12977 + }, + { + "epoch": 16.65982028241335, + "grad_norm": 1.7372406721115112, + "learning_rate": 2.778262729995721e-05, + "loss": 0.4557, + "step": 12978 + }, + { + "epoch": 16.661103979460847, + "grad_norm": 1.5743706226348877, + "learning_rate": 2.7782199400941376e-05, + "loss": 0.4951, + "step": 12979 + }, + { + "epoch": 16.662387676508345, + "grad_norm": 2.184539318084717, + "learning_rate": 2.7781771501925547e-05, + "loss": 0.4622, + "step": 12980 + }, + { + "epoch": 16.66367137355584, + "grad_norm": 1.4987821578979492, + "learning_rate": 2.7781343602909712e-05, + "loss": 0.496, + "step": 12981 + }, + { + "epoch": 16.664955070603337, + "grad_norm": 1.2497062683105469, + "learning_rate": 2.7780915703893884e-05, + "loss": 0.4512, + "step": 12982 + }, + { + "epoch": 16.666238767650835, + "grad_norm": 1.0915201902389526, + "learning_rate": 2.778048780487805e-05, + "loss": 0.4932, + "step": 12983 + }, + { + "epoch": 
16.66752246469833, + "grad_norm": 3.625357151031494, + "learning_rate": 2.7780059905862218e-05, + "loss": 0.4756, + "step": 12984 + }, + { + "epoch": 16.668806161745827, + "grad_norm": 2.745821714401245, + "learning_rate": 2.7779632006846386e-05, + "loss": 0.4778, + "step": 12985 + }, + { + "epoch": 16.670089858793325, + "grad_norm": 1.12123441696167, + "learning_rate": 2.777920410783055e-05, + "loss": 0.4754, + "step": 12986 + }, + { + "epoch": 16.671373555840823, + "grad_norm": 1.7985259294509888, + "learning_rate": 2.777877620881472e-05, + "loss": 0.4631, + "step": 12987 + }, + { + "epoch": 16.672657252888317, + "grad_norm": 1.6523208618164062, + "learning_rate": 2.7778348309798888e-05, + "loss": 0.459, + "step": 12988 + }, + { + "epoch": 16.673940949935815, + "grad_norm": 0.7796059846878052, + "learning_rate": 2.7777920410783056e-05, + "loss": 0.4599, + "step": 12989 + }, + { + "epoch": 16.675224646983313, + "grad_norm": 1.1886968612670898, + "learning_rate": 2.7777492511767225e-05, + "loss": 0.4641, + "step": 12990 + }, + { + "epoch": 16.676508344030808, + "grad_norm": 5.284054279327393, + "learning_rate": 2.7777064612751393e-05, + "loss": 0.5223, + "step": 12991 + }, + { + "epoch": 16.677792041078305, + "grad_norm": 1.2004603147506714, + "learning_rate": 2.7776636713735558e-05, + "loss": 0.5209, + "step": 12992 + }, + { + "epoch": 16.679075738125803, + "grad_norm": 1.4388154745101929, + "learning_rate": 2.7776208814719727e-05, + "loss": 0.4891, + "step": 12993 + }, + { + "epoch": 16.680359435173298, + "grad_norm": 0.9960558414459229, + "learning_rate": 2.7775780915703895e-05, + "loss": 0.513, + "step": 12994 + }, + { + "epoch": 16.681643132220795, + "grad_norm": 1.0609709024429321, + "learning_rate": 2.777535301668806e-05, + "loss": 0.5185, + "step": 12995 + }, + { + "epoch": 16.682926829268293, + "grad_norm": 1.8218597173690796, + "learning_rate": 2.7774925117672232e-05, + "loss": 0.5003, + "step": 12996 + }, + { + "epoch": 16.68421052631579, + "grad_norm": 
1.2375849485397339, + "learning_rate": 2.7774497218656397e-05, + "loss": 0.547, + "step": 12997 + }, + { + "epoch": 16.685494223363285, + "grad_norm": 2.0782625675201416, + "learning_rate": 2.777406931964057e-05, + "loss": 0.5353, + "step": 12998 + }, + { + "epoch": 16.686777920410783, + "grad_norm": 1.5363506078720093, + "learning_rate": 2.7773641420624734e-05, + "loss": 0.4997, + "step": 12999 + }, + { + "epoch": 16.68806161745828, + "grad_norm": 3.4728991985321045, + "learning_rate": 2.77732135216089e-05, + "loss": 0.5229, + "step": 13000 + }, + { + "epoch": 16.68806161745828, + "eval_cer": 0.28512336009398864, + "eval_loss": 0.5179556608200073, + "eval_runtime": 13.7692, + "eval_samples_per_second": 71.391, + "eval_steps_per_second": 0.508, + "eval_wer": 0.5050447145150195, + "step": 13000 + }, + { + "epoch": 16.689345314505776, + "grad_norm": 1.1196695566177368, + "learning_rate": 2.777278562259307e-05, + "loss": 0.5262, + "step": 13001 + }, + { + "epoch": 16.690629011553273, + "grad_norm": 8.749300956726074, + "learning_rate": 2.7772357723577235e-05, + "loss": 0.5439, + "step": 13002 + }, + { + "epoch": 16.69191270860077, + "grad_norm": 1.1638200283050537, + "learning_rate": 2.7771929824561404e-05, + "loss": 0.515, + "step": 13003 + }, + { + "epoch": 16.693196405648266, + "grad_norm": 0.8434774875640869, + "learning_rate": 2.7771501925545572e-05, + "loss": 0.4951, + "step": 13004 + }, + { + "epoch": 16.694480102695763, + "grad_norm": 2.0572874546051025, + "learning_rate": 2.777107402652974e-05, + "loss": 0.553, + "step": 13005 + }, + { + "epoch": 16.69576379974326, + "grad_norm": 1.337157130241394, + "learning_rate": 2.777064612751391e-05, + "loss": 0.4746, + "step": 13006 + }, + { + "epoch": 16.69704749679076, + "grad_norm": 1.7957247495651245, + "learning_rate": 2.7770218228498074e-05, + "loss": 0.5314, + "step": 13007 + }, + { + "epoch": 16.698331193838253, + "grad_norm": 1.3684853315353394, + "learning_rate": 2.7769790329482243e-05, + "loss": 0.4389, + 
"step": 13008 + }, + { + "epoch": 16.69961489088575, + "grad_norm": 2.0450427532196045, + "learning_rate": 2.776936243046641e-05, + "loss": 0.5465, + "step": 13009 + }, + { + "epoch": 16.70089858793325, + "grad_norm": 3.6056463718414307, + "learning_rate": 2.776893453145058e-05, + "loss": 0.5911, + "step": 13010 + }, + { + "epoch": 16.702182284980744, + "grad_norm": 1.8370376825332642, + "learning_rate": 2.7768506632434744e-05, + "loss": 0.5629, + "step": 13011 + }, + { + "epoch": 16.70346598202824, + "grad_norm": 1.2742772102355957, + "learning_rate": 2.7768078733418916e-05, + "loss": 0.6072, + "step": 13012 + }, + { + "epoch": 16.70474967907574, + "grad_norm": 1.5445797443389893, + "learning_rate": 2.776765083440308e-05, + "loss": 0.578, + "step": 13013 + }, + { + "epoch": 16.706033376123234, + "grad_norm": 1.8364627361297607, + "learning_rate": 2.776722293538725e-05, + "loss": 0.7306, + "step": 13014 + }, + { + "epoch": 16.70731707317073, + "grad_norm": 2.6021523475646973, + "learning_rate": 2.7766795036371418e-05, + "loss": 0.4529, + "step": 13015 + }, + { + "epoch": 16.70860077021823, + "grad_norm": 3.7935571670532227, + "learning_rate": 2.7766367137355583e-05, + "loss": 0.5154, + "step": 13016 + }, + { + "epoch": 16.709884467265724, + "grad_norm": 0.9653099775314331, + "learning_rate": 2.7765939238339755e-05, + "loss": 0.4734, + "step": 13017 + }, + { + "epoch": 16.71116816431322, + "grad_norm": 1.2322527170181274, + "learning_rate": 2.776551133932392e-05, + "loss": 0.4904, + "step": 13018 + }, + { + "epoch": 16.71245186136072, + "grad_norm": 1.0328922271728516, + "learning_rate": 2.776508344030809e-05, + "loss": 0.4972, + "step": 13019 + }, + { + "epoch": 16.713735558408217, + "grad_norm": 1.3182549476623535, + "learning_rate": 2.7764655541292257e-05, + "loss": 0.4937, + "step": 13020 + }, + { + "epoch": 16.71501925545571, + "grad_norm": 1.6128450632095337, + "learning_rate": 2.7764227642276422e-05, + "loss": 0.5001, + "step": 13021 + }, + { + "epoch": 
16.71630295250321, + "grad_norm": 1.2879090309143066, + "learning_rate": 2.7763799743260594e-05, + "loss": 0.4774, + "step": 13022 + }, + { + "epoch": 16.717586649550707, + "grad_norm": 1.2165566682815552, + "learning_rate": 2.776337184424476e-05, + "loss": 0.4717, + "step": 13023 + }, + { + "epoch": 16.7188703465982, + "grad_norm": 2.121526002883911, + "learning_rate": 2.7762943945228927e-05, + "loss": 0.5042, + "step": 13024 + }, + { + "epoch": 16.7201540436457, + "grad_norm": 1.6153632402420044, + "learning_rate": 2.7762516046213095e-05, + "loss": 0.5132, + "step": 13025 + }, + { + "epoch": 16.721437740693197, + "grad_norm": 2.6947433948516846, + "learning_rate": 2.776208814719726e-05, + "loss": 0.5027, + "step": 13026 + }, + { + "epoch": 16.72272143774069, + "grad_norm": 1.3218224048614502, + "learning_rate": 2.776166024818143e-05, + "loss": 0.4875, + "step": 13027 + }, + { + "epoch": 16.72400513478819, + "grad_norm": 1.4522769451141357, + "learning_rate": 2.7761232349165597e-05, + "loss": 0.4993, + "step": 13028 + }, + { + "epoch": 16.725288831835687, + "grad_norm": 2.153243064880371, + "learning_rate": 2.7760804450149766e-05, + "loss": 0.4497, + "step": 13029 + }, + { + "epoch": 16.726572528883185, + "grad_norm": 2.075409412384033, + "learning_rate": 2.7760376551133934e-05, + "loss": 0.4769, + "step": 13030 + }, + { + "epoch": 16.72785622593068, + "grad_norm": 0.965876042842865, + "learning_rate": 2.7759948652118102e-05, + "loss": 0.5123, + "step": 13031 + }, + { + "epoch": 16.729139922978177, + "grad_norm": 1.1503688097000122, + "learning_rate": 2.7759520753102268e-05, + "loss": 0.4646, + "step": 13032 + }, + { + "epoch": 16.730423620025675, + "grad_norm": 0.990681529045105, + "learning_rate": 2.7759092854086436e-05, + "loss": 0.4753, + "step": 13033 + }, + { + "epoch": 16.73170731707317, + "grad_norm": 0.8492978811264038, + "learning_rate": 2.7758664955070604e-05, + "loss": 0.4415, + "step": 13034 + }, + { + "epoch": 16.732991014120667, + "grad_norm": 
1.352929949760437, + "learning_rate": 2.775823705605477e-05, + "loss": 0.5206, + "step": 13035 + }, + { + "epoch": 16.734274711168165, + "grad_norm": 1.2603462934494019, + "learning_rate": 2.775780915703894e-05, + "loss": 0.4417, + "step": 13036 + }, + { + "epoch": 16.73555840821566, + "grad_norm": 1.2623599767684937, + "learning_rate": 2.7757381258023106e-05, + "loss": 0.4548, + "step": 13037 + }, + { + "epoch": 16.736842105263158, + "grad_norm": 0.8244913816452026, + "learning_rate": 2.7756953359007278e-05, + "loss": 0.4655, + "step": 13038 + }, + { + "epoch": 16.738125802310655, + "grad_norm": 0.9811791777610779, + "learning_rate": 2.7756525459991443e-05, + "loss": 0.4912, + "step": 13039 + }, + { + "epoch": 16.739409499358153, + "grad_norm": 0.9387068748474121, + "learning_rate": 2.7756097560975608e-05, + "loss": 0.5, + "step": 13040 + }, + { + "epoch": 16.740693196405648, + "grad_norm": 1.3668872117996216, + "learning_rate": 2.775566966195978e-05, + "loss": 0.4469, + "step": 13041 + }, + { + "epoch": 16.741976893453145, + "grad_norm": 6.892902374267578, + "learning_rate": 2.7755241762943945e-05, + "loss": 0.4516, + "step": 13042 + }, + { + "epoch": 16.743260590500643, + "grad_norm": 1.0740110874176025, + "learning_rate": 2.7754813863928113e-05, + "loss": 0.501, + "step": 13043 + }, + { + "epoch": 16.744544287548138, + "grad_norm": 2.0733022689819336, + "learning_rate": 2.775438596491228e-05, + "loss": 0.5049, + "step": 13044 + }, + { + "epoch": 16.745827984595635, + "grad_norm": 1.7086799144744873, + "learning_rate": 2.775395806589645e-05, + "loss": 0.5658, + "step": 13045 + }, + { + "epoch": 16.747111681643133, + "grad_norm": 1.5720536708831787, + "learning_rate": 2.775353016688062e-05, + "loss": 0.5093, + "step": 13046 + }, + { + "epoch": 16.748395378690628, + "grad_norm": 1.0866124629974365, + "learning_rate": 2.7753102267864784e-05, + "loss": 0.4754, + "step": 13047 + }, + { + "epoch": 16.749679075738126, + "grad_norm": 1.2114943265914917, + 
"learning_rate": 2.7752674368848952e-05, + "loss": 0.4858, + "step": 13048 + }, + { + "epoch": 16.750962772785623, + "grad_norm": 1.4476184844970703, + "learning_rate": 2.775224646983312e-05, + "loss": 0.5149, + "step": 13049 + }, + { + "epoch": 16.752246469833118, + "grad_norm": 3.5427098274230957, + "learning_rate": 2.775181857081729e-05, + "loss": 0.4967, + "step": 13050 + }, + { + "epoch": 16.753530166880616, + "grad_norm": 2.507418155670166, + "learning_rate": 2.7751390671801454e-05, + "loss": 0.5288, + "step": 13051 + }, + { + "epoch": 16.754813863928113, + "grad_norm": 1.9525985717773438, + "learning_rate": 2.7750962772785626e-05, + "loss": 0.5369, + "step": 13052 + }, + { + "epoch": 16.75609756097561, + "grad_norm": 0.945637047290802, + "learning_rate": 2.775053487376979e-05, + "loss": 0.4831, + "step": 13053 + }, + { + "epoch": 16.757381258023106, + "grad_norm": 1.6367201805114746, + "learning_rate": 2.775010697475396e-05, + "loss": 0.4689, + "step": 13054 + }, + { + "epoch": 16.758664955070603, + "grad_norm": 2.6106534004211426, + "learning_rate": 2.7749679075738127e-05, + "loss": 0.5044, + "step": 13055 + }, + { + "epoch": 16.7599486521181, + "grad_norm": 3.229609251022339, + "learning_rate": 2.7749251176722292e-05, + "loss": 0.5137, + "step": 13056 + }, + { + "epoch": 16.761232349165596, + "grad_norm": 1.5270711183547974, + "learning_rate": 2.7748823277706464e-05, + "loss": 0.46, + "step": 13057 + }, + { + "epoch": 16.762516046213094, + "grad_norm": 1.2872883081436157, + "learning_rate": 2.774839537869063e-05, + "loss": 0.5065, + "step": 13058 + }, + { + "epoch": 16.76379974326059, + "grad_norm": 1.5176595449447632, + "learning_rate": 2.7747967479674798e-05, + "loss": 0.4702, + "step": 13059 + }, + { + "epoch": 16.765083440308086, + "grad_norm": 1.6955312490463257, + "learning_rate": 2.7747539580658966e-05, + "loss": 0.5374, + "step": 13060 + }, + { + "epoch": 16.766367137355584, + "grad_norm": 1.2566481828689575, + "learning_rate": 
2.774711168164313e-05, + "loss": 0.5043, + "step": 13061 + }, + { + "epoch": 16.76765083440308, + "grad_norm": 2.333542823791504, + "learning_rate": 2.7746683782627303e-05, + "loss": 0.6207, + "step": 13062 + }, + { + "epoch": 16.76893453145058, + "grad_norm": 3.9371497631073, + "learning_rate": 2.7746255883611468e-05, + "loss": 0.5515, + "step": 13063 + }, + { + "epoch": 16.770218228498074, + "grad_norm": 2.1091291904449463, + "learning_rate": 2.7745827984595636e-05, + "loss": 0.7179, + "step": 13064 + }, + { + "epoch": 16.77150192554557, + "grad_norm": 1.1744134426116943, + "learning_rate": 2.7745400085579805e-05, + "loss": 0.4325, + "step": 13065 + }, + { + "epoch": 16.77278562259307, + "grad_norm": 1.0623726844787598, + "learning_rate": 2.7744972186563973e-05, + "loss": 0.4514, + "step": 13066 + }, + { + "epoch": 16.774069319640564, + "grad_norm": 1.5481964349746704, + "learning_rate": 2.7744544287548138e-05, + "loss": 0.4772, + "step": 13067 + }, + { + "epoch": 16.77535301668806, + "grad_norm": 2.4488911628723145, + "learning_rate": 2.7744116388532307e-05, + "loss": 0.4728, + "step": 13068 + }, + { + "epoch": 16.77663671373556, + "grad_norm": 1.8514996767044067, + "learning_rate": 2.7743688489516475e-05, + "loss": 0.4454, + "step": 13069 + }, + { + "epoch": 16.777920410783054, + "grad_norm": 1.6634560823440552, + "learning_rate": 2.774326059050064e-05, + "loss": 0.4975, + "step": 13070 + }, + { + "epoch": 16.77920410783055, + "grad_norm": 1.5723443031311035, + "learning_rate": 2.7742832691484812e-05, + "loss": 0.4955, + "step": 13071 + }, + { + "epoch": 16.78048780487805, + "grad_norm": 1.1608998775482178, + "learning_rate": 2.7742404792468977e-05, + "loss": 0.5021, + "step": 13072 + }, + { + "epoch": 16.781771501925547, + "grad_norm": 0.8373628258705139, + "learning_rate": 2.774197689345315e-05, + "loss": 0.4927, + "step": 13073 + }, + { + "epoch": 16.78305519897304, + "grad_norm": 2.2620091438293457, + "learning_rate": 2.7741548994437314e-05, + "loss": 
0.5248, + "step": 13074 + }, + { + "epoch": 16.78433889602054, + "grad_norm": 1.0696724653244019, + "learning_rate": 2.774112109542148e-05, + "loss": 0.4731, + "step": 13075 + }, + { + "epoch": 16.785622593068037, + "grad_norm": 1.0615384578704834, + "learning_rate": 2.774069319640565e-05, + "loss": 0.4737, + "step": 13076 + }, + { + "epoch": 16.78690629011553, + "grad_norm": 1.2138257026672363, + "learning_rate": 2.7740265297389816e-05, + "loss": 0.4852, + "step": 13077 + }, + { + "epoch": 16.78818998716303, + "grad_norm": 82.26592254638672, + "learning_rate": 2.7739837398373984e-05, + "loss": 0.487, + "step": 13078 + }, + { + "epoch": 16.789473684210527, + "grad_norm": 1.6843000650405884, + "learning_rate": 2.7739409499358152e-05, + "loss": 0.5244, + "step": 13079 + }, + { + "epoch": 16.79075738125802, + "grad_norm": 1.2010225057601929, + "learning_rate": 2.773898160034232e-05, + "loss": 0.4549, + "step": 13080 + }, + { + "epoch": 16.79204107830552, + "grad_norm": 1.229820966720581, + "learning_rate": 2.773855370132649e-05, + "loss": 0.4938, + "step": 13081 + }, + { + "epoch": 16.793324775353017, + "grad_norm": 2.721421718597412, + "learning_rate": 2.7738125802310654e-05, + "loss": 0.4609, + "step": 13082 + }, + { + "epoch": 16.794608472400512, + "grad_norm": 4.885201930999756, + "learning_rate": 2.7737697903294823e-05, + "loss": 0.4741, + "step": 13083 + }, + { + "epoch": 16.79589216944801, + "grad_norm": 1.3567577600479126, + "learning_rate": 2.773727000427899e-05, + "loss": 0.4995, + "step": 13084 + }, + { + "epoch": 16.797175866495508, + "grad_norm": 1.5482803583145142, + "learning_rate": 2.773684210526316e-05, + "loss": 0.507, + "step": 13085 + }, + { + "epoch": 16.798459563543005, + "grad_norm": 0.9696534276008606, + "learning_rate": 2.7736414206247324e-05, + "loss": 0.4758, + "step": 13086 + }, + { + "epoch": 16.7997432605905, + "grad_norm": 2.0017359256744385, + "learning_rate": 2.7735986307231493e-05, + "loss": 0.4811, + "step": 13087 + }, + { + "epoch": 
16.801026957637998, + "grad_norm": 0.9374757409095764, + "learning_rate": 2.773555840821566e-05, + "loss": 0.4525, + "step": 13088 + }, + { + "epoch": 16.802310654685495, + "grad_norm": 1.0308808088302612, + "learning_rate": 2.773513050919983e-05, + "loss": 0.4697, + "step": 13089 + }, + { + "epoch": 16.80359435173299, + "grad_norm": 1.1274399757385254, + "learning_rate": 2.7734702610183998e-05, + "loss": 0.4529, + "step": 13090 + }, + { + "epoch": 16.804878048780488, + "grad_norm": 1.854258418083191, + "learning_rate": 2.7734274711168163e-05, + "loss": 0.452, + "step": 13091 + }, + { + "epoch": 16.806161745827985, + "grad_norm": 3.8531441688537598, + "learning_rate": 2.7733846812152335e-05, + "loss": 0.4845, + "step": 13092 + }, + { + "epoch": 16.80744544287548, + "grad_norm": 2.811028003692627, + "learning_rate": 2.77334189131365e-05, + "loss": 0.4801, + "step": 13093 + }, + { + "epoch": 16.808729139922978, + "grad_norm": 3.1500439643859863, + "learning_rate": 2.7732991014120665e-05, + "loss": 0.494, + "step": 13094 + }, + { + "epoch": 16.810012836970476, + "grad_norm": 2.2873048782348633, + "learning_rate": 2.7732563115104837e-05, + "loss": 0.4475, + "step": 13095 + }, + { + "epoch": 16.811296534017973, + "grad_norm": 3.94763445854187, + "learning_rate": 2.7732135216089002e-05, + "loss": 0.5398, + "step": 13096 + }, + { + "epoch": 16.812580231065468, + "grad_norm": 2.0054497718811035, + "learning_rate": 2.7731707317073174e-05, + "loss": 0.471, + "step": 13097 + }, + { + "epoch": 16.813863928112966, + "grad_norm": 3.23905873298645, + "learning_rate": 2.773127941805734e-05, + "loss": 0.466, + "step": 13098 + }, + { + "epoch": 16.815147625160463, + "grad_norm": 1.2533913850784302, + "learning_rate": 2.7730851519041507e-05, + "loss": 0.4877, + "step": 13099 + }, + { + "epoch": 16.816431322207958, + "grad_norm": 2.027631998062134, + "learning_rate": 2.7730423620025675e-05, + "loss": 0.5797, + "step": 13100 + }, + { + "epoch": 16.817715019255456, + "grad_norm": 
1.4409229755401611, + "learning_rate": 2.772999572100984e-05, + "loss": 0.4925, + "step": 13101 + }, + { + "epoch": 16.818998716302954, + "grad_norm": 2.5267579555511475, + "learning_rate": 2.772956782199401e-05, + "loss": 0.4709, + "step": 13102 + }, + { + "epoch": 16.820282413350448, + "grad_norm": 2.783721446990967, + "learning_rate": 2.7729139922978177e-05, + "loss": 0.4761, + "step": 13103 + }, + { + "epoch": 16.821566110397946, + "grad_norm": 1.4244309663772583, + "learning_rate": 2.7728712023962346e-05, + "loss": 0.5131, + "step": 13104 + }, + { + "epoch": 16.822849807445444, + "grad_norm": 4.887997150421143, + "learning_rate": 2.7728284124946514e-05, + "loss": 0.4962, + "step": 13105 + }, + { + "epoch": 16.82413350449294, + "grad_norm": 1.5087789297103882, + "learning_rate": 2.7727856225930683e-05, + "loss": 0.6035, + "step": 13106 + }, + { + "epoch": 16.825417201540436, + "grad_norm": 2.5815553665161133, + "learning_rate": 2.7727428326914848e-05, + "loss": 0.5282, + "step": 13107 + }, + { + "epoch": 16.826700898587934, + "grad_norm": 1.6346720457077026, + "learning_rate": 2.7727000427899016e-05, + "loss": 0.5341, + "step": 13108 + }, + { + "epoch": 16.82798459563543, + "grad_norm": 9.056670188903809, + "learning_rate": 2.7726572528883184e-05, + "loss": 0.5303, + "step": 13109 + }, + { + "epoch": 16.829268292682926, + "grad_norm": 1.9931640625, + "learning_rate": 2.772614462986735e-05, + "loss": 0.5459, + "step": 13110 + }, + { + "epoch": 16.830551989730424, + "grad_norm": 1.3971065282821655, + "learning_rate": 2.772571673085152e-05, + "loss": 0.5261, + "step": 13111 + }, + { + "epoch": 16.83183568677792, + "grad_norm": 6.138516426086426, + "learning_rate": 2.7725288831835686e-05, + "loss": 0.6048, + "step": 13112 + }, + { + "epoch": 16.833119383825416, + "grad_norm": 2.1847550868988037, + "learning_rate": 2.7724860932819858e-05, + "loss": 0.574, + "step": 13113 + }, + { + "epoch": 16.834403080872914, + "grad_norm": 1.9670687913894653, + "learning_rate": 
2.7724433033804023e-05, + "loss": 0.6049, + "step": 13114 + }, + { + "epoch": 16.83568677792041, + "grad_norm": 1.0692987442016602, + "learning_rate": 2.7724005134788188e-05, + "loss": 0.4402, + "step": 13115 + }, + { + "epoch": 16.836970474967906, + "grad_norm": 0.9219298362731934, + "learning_rate": 2.772357723577236e-05, + "loss": 0.4736, + "step": 13116 + }, + { + "epoch": 16.838254172015404, + "grad_norm": 1.0438975095748901, + "learning_rate": 2.7723149336756525e-05, + "loss": 0.5168, + "step": 13117 + }, + { + "epoch": 16.8395378690629, + "grad_norm": 1.840011715888977, + "learning_rate": 2.7722721437740693e-05, + "loss": 0.5049, + "step": 13118 + }, + { + "epoch": 16.8408215661104, + "grad_norm": 1.3159347772598267, + "learning_rate": 2.7722293538724862e-05, + "loss": 0.4961, + "step": 13119 + }, + { + "epoch": 16.842105263157894, + "grad_norm": 1.0867401361465454, + "learning_rate": 2.772186563970903e-05, + "loss": 0.4623, + "step": 13120 + }, + { + "epoch": 16.84338896020539, + "grad_norm": 0.8958145380020142, + "learning_rate": 2.77214377406932e-05, + "loss": 0.4748, + "step": 13121 + }, + { + "epoch": 16.84467265725289, + "grad_norm": 1.3585865497589111, + "learning_rate": 2.7721009841677364e-05, + "loss": 0.461, + "step": 13122 + }, + { + "epoch": 16.845956354300384, + "grad_norm": 1.6509661674499512, + "learning_rate": 2.7720581942661532e-05, + "loss": 0.4888, + "step": 13123 + }, + { + "epoch": 16.84724005134788, + "grad_norm": 1.006287693977356, + "learning_rate": 2.77201540436457e-05, + "loss": 0.4704, + "step": 13124 + }, + { + "epoch": 16.84852374839538, + "grad_norm": 1.2886520624160767, + "learning_rate": 2.771972614462987e-05, + "loss": 0.4718, + "step": 13125 + }, + { + "epoch": 16.849807445442874, + "grad_norm": 1.1685129404067993, + "learning_rate": 2.7719298245614034e-05, + "loss": 0.4907, + "step": 13126 + }, + { + "epoch": 16.85109114249037, + "grad_norm": 1.5580090284347534, + "learning_rate": 2.7718870346598206e-05, + "loss": 0.4953, + 
"step": 13127 + }, + { + "epoch": 16.85237483953787, + "grad_norm": 1.0591832399368286, + "learning_rate": 2.771844244758237e-05, + "loss": 0.5065, + "step": 13128 + }, + { + "epoch": 16.853658536585368, + "grad_norm": 1.6852515935897827, + "learning_rate": 2.771801454856654e-05, + "loss": 0.4757, + "step": 13129 + }, + { + "epoch": 16.854942233632862, + "grad_norm": 1.7744187116622925, + "learning_rate": 2.7717586649550707e-05, + "loss": 0.4777, + "step": 13130 + }, + { + "epoch": 16.85622593068036, + "grad_norm": 2.4983842372894287, + "learning_rate": 2.7717158750534873e-05, + "loss": 0.4756, + "step": 13131 + }, + { + "epoch": 16.857509627727858, + "grad_norm": 1.2116461992263794, + "learning_rate": 2.7716730851519044e-05, + "loss": 0.4831, + "step": 13132 + }, + { + "epoch": 16.858793324775352, + "grad_norm": 1.043243408203125, + "learning_rate": 2.771630295250321e-05, + "loss": 0.4968, + "step": 13133 + }, + { + "epoch": 16.86007702182285, + "grad_norm": 1.0169938802719116, + "learning_rate": 2.7715875053487378e-05, + "loss": 0.4905, + "step": 13134 + }, + { + "epoch": 16.861360718870348, + "grad_norm": 1.2106261253356934, + "learning_rate": 2.7715447154471546e-05, + "loss": 0.4663, + "step": 13135 + }, + { + "epoch": 16.862644415917842, + "grad_norm": 0.9052469730377197, + "learning_rate": 2.771501925545571e-05, + "loss": 0.4595, + "step": 13136 + }, + { + "epoch": 16.86392811296534, + "grad_norm": 1.250758409500122, + "learning_rate": 2.7714591356439883e-05, + "loss": 0.4406, + "step": 13137 + }, + { + "epoch": 16.865211810012838, + "grad_norm": 1.0160725116729736, + "learning_rate": 2.7714163457424048e-05, + "loss": 0.4389, + "step": 13138 + }, + { + "epoch": 16.866495507060336, + "grad_norm": 1.5591158866882324, + "learning_rate": 2.7713735558408216e-05, + "loss": 0.5062, + "step": 13139 + }, + { + "epoch": 16.86777920410783, + "grad_norm": 2.2236623764038086, + "learning_rate": 2.7713307659392385e-05, + "loss": 0.5745, + "step": 13140 + }, + { + "epoch": 
16.869062901155328, + "grad_norm": 1.2354973554611206, + "learning_rate": 2.7712879760376553e-05, + "loss": 0.5085, + "step": 13141 + }, + { + "epoch": 16.870346598202826, + "grad_norm": 1.151320457458496, + "learning_rate": 2.7712451861360718e-05, + "loss": 0.505, + "step": 13142 + }, + { + "epoch": 16.87163029525032, + "grad_norm": 1.7371397018432617, + "learning_rate": 2.7712023962344887e-05, + "loss": 0.4711, + "step": 13143 + }, + { + "epoch": 16.872913992297818, + "grad_norm": 6.254327297210693, + "learning_rate": 2.7711596063329055e-05, + "loss": 0.5074, + "step": 13144 + }, + { + "epoch": 16.874197689345316, + "grad_norm": 1.2779641151428223, + "learning_rate": 2.7711168164313224e-05, + "loss": 0.4865, + "step": 13145 + }, + { + "epoch": 16.87548138639281, + "grad_norm": 4.021182537078857, + "learning_rate": 2.7710740265297392e-05, + "loss": 0.484, + "step": 13146 + }, + { + "epoch": 16.876765083440308, + "grad_norm": 3.0269112586975098, + "learning_rate": 2.7710312366281557e-05, + "loss": 0.5059, + "step": 13147 + }, + { + "epoch": 16.878048780487806, + "grad_norm": 2.0250933170318604, + "learning_rate": 2.7709884467265725e-05, + "loss": 0.5397, + "step": 13148 + }, + { + "epoch": 16.8793324775353, + "grad_norm": 1.4909826517105103, + "learning_rate": 2.7709456568249894e-05, + "loss": 0.4539, + "step": 13149 + }, + { + "epoch": 16.880616174582798, + "grad_norm": 1.1354238986968994, + "learning_rate": 2.770902866923406e-05, + "loss": 0.5128, + "step": 13150 + }, + { + "epoch": 16.881899871630296, + "grad_norm": 1.5565388202667236, + "learning_rate": 2.770860077021823e-05, + "loss": 0.4873, + "step": 13151 + }, + { + "epoch": 16.883183568677794, + "grad_norm": 1.3891103267669678, + "learning_rate": 2.7708172871202396e-05, + "loss": 0.4942, + "step": 13152 + }, + { + "epoch": 16.884467265725288, + "grad_norm": 3.5656068325042725, + "learning_rate": 2.7707744972186567e-05, + "loss": 0.5407, + "step": 13153 + }, + { + "epoch": 16.885750962772786, + "grad_norm": 
1.4138599634170532, + "learning_rate": 2.7707317073170732e-05, + "loss": 0.4625, + "step": 13154 + }, + { + "epoch": 16.887034659820284, + "grad_norm": 6.243634223937988, + "learning_rate": 2.7706889174154897e-05, + "loss": 0.5409, + "step": 13155 + }, + { + "epoch": 16.888318356867778, + "grad_norm": 1.2437657117843628, + "learning_rate": 2.770646127513907e-05, + "loss": 0.5258, + "step": 13156 + }, + { + "epoch": 16.889602053915276, + "grad_norm": 2.7127878665924072, + "learning_rate": 2.7706033376123234e-05, + "loss": 0.5201, + "step": 13157 + }, + { + "epoch": 16.890885750962774, + "grad_norm": 3.273669481277466, + "learning_rate": 2.7705605477107403e-05, + "loss": 0.4966, + "step": 13158 + }, + { + "epoch": 16.892169448010268, + "grad_norm": 1.8933321237564087, + "learning_rate": 2.770517757809157e-05, + "loss": 0.5603, + "step": 13159 + }, + { + "epoch": 16.893453145057766, + "grad_norm": 2.544893741607666, + "learning_rate": 2.770474967907574e-05, + "loss": 0.5596, + "step": 13160 + }, + { + "epoch": 16.894736842105264, + "grad_norm": 4.360031604766846, + "learning_rate": 2.7704321780059908e-05, + "loss": 0.5477, + "step": 13161 + }, + { + "epoch": 16.89602053915276, + "grad_norm": 1.3792619705200195, + "learning_rate": 2.7703893881044073e-05, + "loss": 0.5582, + "step": 13162 + }, + { + "epoch": 16.897304236200256, + "grad_norm": 5.824681282043457, + "learning_rate": 2.770346598202824e-05, + "loss": 0.615, + "step": 13163 + }, + { + "epoch": 16.898587933247754, + "grad_norm": 4.542207717895508, + "learning_rate": 2.770303808301241e-05, + "loss": 0.7002, + "step": 13164 + }, + { + "epoch": 16.89987163029525, + "grad_norm": 2.160149335861206, + "learning_rate": 2.7702610183996578e-05, + "loss": 0.4644, + "step": 13165 + }, + { + "epoch": 16.901155327342746, + "grad_norm": 2.8966445922851562, + "learning_rate": 2.7702182284980743e-05, + "loss": 0.4422, + "step": 13166 + }, + { + "epoch": 16.902439024390244, + "grad_norm": 1.621989369392395, + "learning_rate": 
2.7701754385964915e-05, + "loss": 0.4393, + "step": 13167 + }, + { + "epoch": 16.90372272143774, + "grad_norm": 0.8524444103240967, + "learning_rate": 2.770132648694908e-05, + "loss": 0.4842, + "step": 13168 + }, + { + "epoch": 16.905006418485236, + "grad_norm": 1.0200752019882202, + "learning_rate": 2.770089858793325e-05, + "loss": 0.4655, + "step": 13169 + }, + { + "epoch": 16.906290115532734, + "grad_norm": 1.1862843036651611, + "learning_rate": 2.7700470688917417e-05, + "loss": 0.5004, + "step": 13170 + }, + { + "epoch": 16.90757381258023, + "grad_norm": 1.0407794713974, + "learning_rate": 2.7700042789901582e-05, + "loss": 0.5049, + "step": 13171 + }, + { + "epoch": 16.90885750962773, + "grad_norm": 1.7734204530715942, + "learning_rate": 2.7699614890885754e-05, + "loss": 0.468, + "step": 13172 + }, + { + "epoch": 16.910141206675224, + "grad_norm": 1.0858278274536133, + "learning_rate": 2.769918699186992e-05, + "loss": 0.4971, + "step": 13173 + }, + { + "epoch": 16.911424903722722, + "grad_norm": 0.832958996295929, + "learning_rate": 2.7698759092854087e-05, + "loss": 0.4446, + "step": 13174 + }, + { + "epoch": 16.91270860077022, + "grad_norm": 2.621037006378174, + "learning_rate": 2.7698331193838256e-05, + "loss": 0.4584, + "step": 13175 + }, + { + "epoch": 16.913992297817714, + "grad_norm": 2.477781057357788, + "learning_rate": 2.769790329482242e-05, + "loss": 0.4805, + "step": 13176 + }, + { + "epoch": 16.915275994865212, + "grad_norm": 1.5449644327163696, + "learning_rate": 2.7697475395806592e-05, + "loss": 0.4959, + "step": 13177 + }, + { + "epoch": 16.91655969191271, + "grad_norm": 1.9323155879974365, + "learning_rate": 2.7697047496790757e-05, + "loss": 0.4479, + "step": 13178 + }, + { + "epoch": 16.917843388960204, + "grad_norm": 1.295265555381775, + "learning_rate": 2.7696619597774926e-05, + "loss": 0.4976, + "step": 13179 + }, + { + "epoch": 16.919127086007702, + "grad_norm": 1.9938198328018188, + "learning_rate": 2.7696191698759094e-05, + "loss": 
0.4968, + "step": 13180 + }, + { + "epoch": 16.9204107830552, + "grad_norm": 1.3687750101089478, + "learning_rate": 2.7695763799743263e-05, + "loss": 0.4519, + "step": 13181 + }, + { + "epoch": 16.921694480102694, + "grad_norm": 1.2701395750045776, + "learning_rate": 2.7695335900727428e-05, + "loss": 0.4935, + "step": 13182 + }, + { + "epoch": 16.922978177150192, + "grad_norm": 2.4559223651885986, + "learning_rate": 2.7694908001711596e-05, + "loss": 0.4599, + "step": 13183 + }, + { + "epoch": 16.92426187419769, + "grad_norm": 1.2864353656768799, + "learning_rate": 2.7694480102695764e-05, + "loss": 0.4787, + "step": 13184 + }, + { + "epoch": 16.925545571245188, + "grad_norm": 3.5598466396331787, + "learning_rate": 2.7694052203679933e-05, + "loss": 0.4608, + "step": 13185 + }, + { + "epoch": 16.926829268292682, + "grad_norm": 1.0864931344985962, + "learning_rate": 2.76936243046641e-05, + "loss": 0.4769, + "step": 13186 + }, + { + "epoch": 16.92811296534018, + "grad_norm": 1.6978005170822144, + "learning_rate": 2.7693196405648266e-05, + "loss": 0.5039, + "step": 13187 + }, + { + "epoch": 16.929396662387678, + "grad_norm": 4.587523460388184, + "learning_rate": 2.7692768506632438e-05, + "loss": 0.4837, + "step": 13188 + }, + { + "epoch": 16.930680359435172, + "grad_norm": 1.6620718240737915, + "learning_rate": 2.7692340607616603e-05, + "loss": 0.4843, + "step": 13189 + }, + { + "epoch": 16.93196405648267, + "grad_norm": 2.6543643474578857, + "learning_rate": 2.7691912708600768e-05, + "loss": 0.5103, + "step": 13190 + }, + { + "epoch": 16.933247753530168, + "grad_norm": 1.1842906475067139, + "learning_rate": 2.769148480958494e-05, + "loss": 0.4804, + "step": 13191 + }, + { + "epoch": 16.934531450577662, + "grad_norm": 0.9943047761917114, + "learning_rate": 2.7691056910569105e-05, + "loss": 0.484, + "step": 13192 + }, + { + "epoch": 16.93581514762516, + "grad_norm": 1.4112952947616577, + "learning_rate": 2.7690629011553277e-05, + "loss": 0.538, + "step": 13193 + }, + { + 
"epoch": 16.937098844672658, + "grad_norm": 0.8565147519111633, + "learning_rate": 2.7690201112537442e-05, + "loss": 0.4679, + "step": 13194 + }, + { + "epoch": 16.938382541720156, + "grad_norm": 0.9854446053504944, + "learning_rate": 2.768977321352161e-05, + "loss": 0.4916, + "step": 13195 + }, + { + "epoch": 16.93966623876765, + "grad_norm": 0.9501777291297913, + "learning_rate": 2.768934531450578e-05, + "loss": 0.4791, + "step": 13196 + }, + { + "epoch": 16.940949935815148, + "grad_norm": 10.807352066040039, + "learning_rate": 2.7688917415489944e-05, + "loss": 0.5329, + "step": 13197 + }, + { + "epoch": 16.942233632862646, + "grad_norm": 3.7601029872894287, + "learning_rate": 2.7688489516474112e-05, + "loss": 0.4889, + "step": 13198 + }, + { + "epoch": 16.94351732991014, + "grad_norm": 1.3456995487213135, + "learning_rate": 2.768806161745828e-05, + "loss": 0.522, + "step": 13199 + }, + { + "epoch": 16.944801026957638, + "grad_norm": 2.072803258895874, + "learning_rate": 2.768763371844245e-05, + "loss": 0.5171, + "step": 13200 + }, + { + "epoch": 16.946084724005136, + "grad_norm": 1.9073418378829956, + "learning_rate": 2.7687205819426617e-05, + "loss": 0.488, + "step": 13201 + }, + { + "epoch": 16.94736842105263, + "grad_norm": 0.9038680195808411, + "learning_rate": 2.7686777920410786e-05, + "loss": 0.5219, + "step": 13202 + }, + { + "epoch": 16.948652118100128, + "grad_norm": 1.1377732753753662, + "learning_rate": 2.768635002139495e-05, + "loss": 0.5568, + "step": 13203 + }, + { + "epoch": 16.949935815147626, + "grad_norm": 4.7695512771606445, + "learning_rate": 2.768592212237912e-05, + "loss": 0.5151, + "step": 13204 + }, + { + "epoch": 16.951219512195124, + "grad_norm": 11.705842018127441, + "learning_rate": 2.7685494223363288e-05, + "loss": 0.4798, + "step": 13205 + }, + { + "epoch": 16.952503209242618, + "grad_norm": 1.1566261053085327, + "learning_rate": 2.7685066324347453e-05, + "loss": 0.532, + "step": 13206 + }, + { + "epoch": 16.953786906290116, + 
"grad_norm": 2.014395236968994, + "learning_rate": 2.7684638425331624e-05, + "loss": 0.5182, + "step": 13207 + }, + { + "epoch": 16.955070603337614, + "grad_norm": 1.7703568935394287, + "learning_rate": 2.768421052631579e-05, + "loss": 0.554, + "step": 13208 + }, + { + "epoch": 16.956354300385108, + "grad_norm": 1.2434989213943481, + "learning_rate": 2.7683782627299958e-05, + "loss": 0.5182, + "step": 13209 + }, + { + "epoch": 16.957637997432606, + "grad_norm": 2.0319535732269287, + "learning_rate": 2.7683354728284126e-05, + "loss": 0.5625, + "step": 13210 + }, + { + "epoch": 16.958921694480104, + "grad_norm": 1.4430580139160156, + "learning_rate": 2.768292682926829e-05, + "loss": 0.5783, + "step": 13211 + }, + { + "epoch": 16.960205391527598, + "grad_norm": 5.902266502380371, + "learning_rate": 2.7682498930252463e-05, + "loss": 0.6178, + "step": 13212 + }, + { + "epoch": 16.961489088575096, + "grad_norm": 1.4588379859924316, + "learning_rate": 2.7682071031236628e-05, + "loss": 0.5995, + "step": 13213 + }, + { + "epoch": 16.962772785622594, + "grad_norm": 1.6860722303390503, + "learning_rate": 2.7681643132220796e-05, + "loss": 0.7109, + "step": 13214 + }, + { + "epoch": 16.964056482670088, + "grad_norm": 0.8360679149627686, + "learning_rate": 2.7681215233204965e-05, + "loss": 0.4632, + "step": 13215 + }, + { + "epoch": 16.965340179717586, + "grad_norm": 0.8557924032211304, + "learning_rate": 2.768078733418913e-05, + "loss": 0.4613, + "step": 13216 + }, + { + "epoch": 16.966623876765084, + "grad_norm": 2.2063992023468018, + "learning_rate": 2.7680359435173302e-05, + "loss": 0.4583, + "step": 13217 + }, + { + "epoch": 16.96790757381258, + "grad_norm": 1.4251015186309814, + "learning_rate": 2.7679931536157467e-05, + "loss": 0.4311, + "step": 13218 + }, + { + "epoch": 16.969191270860076, + "grad_norm": 1.8567383289337158, + "learning_rate": 2.7679503637141635e-05, + "loss": 0.4765, + "step": 13219 + }, + { + "epoch": 16.970474967907574, + "grad_norm": 
1.134117603302002, + "learning_rate": 2.7679075738125804e-05, + "loss": 0.4907, + "step": 13220 + }, + { + "epoch": 16.971758664955072, + "grad_norm": 1.5540831089019775, + "learning_rate": 2.7678647839109972e-05, + "loss": 0.4396, + "step": 13221 + }, + { + "epoch": 16.973042362002566, + "grad_norm": 1.7054574489593506, + "learning_rate": 2.7678219940094137e-05, + "loss": 0.4938, + "step": 13222 + }, + { + "epoch": 16.974326059050064, + "grad_norm": 1.9488385915756226, + "learning_rate": 2.7677792041078305e-05, + "loss": 0.4837, + "step": 13223 + }, + { + "epoch": 16.975609756097562, + "grad_norm": 1.3831347227096558, + "learning_rate": 2.7677364142062474e-05, + "loss": 0.4955, + "step": 13224 + }, + { + "epoch": 16.976893453145056, + "grad_norm": 2.13156795501709, + "learning_rate": 2.7676936243046642e-05, + "loss": 0.4953, + "step": 13225 + }, + { + "epoch": 16.978177150192554, + "grad_norm": 2.46099853515625, + "learning_rate": 2.767650834403081e-05, + "loss": 0.5203, + "step": 13226 + }, + { + "epoch": 16.979460847240052, + "grad_norm": 1.6092311143875122, + "learning_rate": 2.7676080445014976e-05, + "loss": 0.4856, + "step": 13227 + }, + { + "epoch": 16.98074454428755, + "grad_norm": 1.592154860496521, + "learning_rate": 2.7675652545999147e-05, + "loss": 0.4522, + "step": 13228 + }, + { + "epoch": 16.982028241335044, + "grad_norm": 1.478472113609314, + "learning_rate": 2.7675224646983312e-05, + "loss": 0.4555, + "step": 13229 + }, + { + "epoch": 16.983311938382542, + "grad_norm": 1.6722095012664795, + "learning_rate": 2.7674796747967478e-05, + "loss": 0.4982, + "step": 13230 + }, + { + "epoch": 16.98459563543004, + "grad_norm": 1.4396369457244873, + "learning_rate": 2.767436884895165e-05, + "loss": 0.5224, + "step": 13231 + }, + { + "epoch": 16.985879332477534, + "grad_norm": 1.4228134155273438, + "learning_rate": 2.7673940949935814e-05, + "loss": 0.4922, + "step": 13232 + }, + { + "epoch": 16.987163029525032, + "grad_norm": 1.5423930883407593, + 
"learning_rate": 2.7673513050919986e-05, + "loss": 0.4763, + "step": 13233 + }, + { + "epoch": 16.98844672657253, + "grad_norm": 1.685694694519043, + "learning_rate": 2.767308515190415e-05, + "loss": 0.4667, + "step": 13234 + }, + { + "epoch": 16.989730423620024, + "grad_norm": 4.036201000213623, + "learning_rate": 2.767265725288832e-05, + "loss": 0.5102, + "step": 13235 + }, + { + "epoch": 16.991014120667522, + "grad_norm": 1.7033092975616455, + "learning_rate": 2.7672229353872488e-05, + "loss": 0.5222, + "step": 13236 + }, + { + "epoch": 16.99229781771502, + "grad_norm": 2.414139747619629, + "learning_rate": 2.7671801454856653e-05, + "loss": 0.5258, + "step": 13237 + }, + { + "epoch": 16.993581514762518, + "grad_norm": 1.5459691286087036, + "learning_rate": 2.767137355584082e-05, + "loss": 0.5336, + "step": 13238 + }, + { + "epoch": 16.994865211810012, + "grad_norm": 1.5597087144851685, + "learning_rate": 2.767094565682499e-05, + "loss": 0.5004, + "step": 13239 + }, + { + "epoch": 16.99614890885751, + "grad_norm": 1.5928924083709717, + "learning_rate": 2.7670517757809158e-05, + "loss": 0.5176, + "step": 13240 + }, + { + "epoch": 16.997432605905008, + "grad_norm": 2.491807222366333, + "learning_rate": 2.7670089858793327e-05, + "loss": 0.5785, + "step": 13241 + }, + { + "epoch": 16.998716302952502, + "grad_norm": 1.8402132987976074, + "learning_rate": 2.7669661959777495e-05, + "loss": 0.5838, + "step": 13242 + }, + { + "epoch": 17.0, + "grad_norm": 3.2525599002838135, + "learning_rate": 2.766923406076166e-05, + "loss": 0.6628, + "step": 13243 + }, + { + "epoch": 17.001283697047498, + "grad_norm": 1.1539745330810547, + "learning_rate": 2.766880616174583e-05, + "loss": 0.4536, + "step": 13244 + }, + { + "epoch": 17.002567394094992, + "grad_norm": 2.6074910163879395, + "learning_rate": 2.7668378262729997e-05, + "loss": 0.4164, + "step": 13245 + }, + { + "epoch": 17.00385109114249, + "grad_norm": 1.5216174125671387, + "learning_rate": 2.7667950363714162e-05, + "loss": 
0.5027, + "step": 13246 + }, + { + "epoch": 17.005134788189988, + "grad_norm": 1.159334421157837, + "learning_rate": 2.7667522464698334e-05, + "loss": 0.4479, + "step": 13247 + }, + { + "epoch": 17.006418485237482, + "grad_norm": 1.7382973432540894, + "learning_rate": 2.76670945656825e-05, + "loss": 0.4581, + "step": 13248 + }, + { + "epoch": 17.00770218228498, + "grad_norm": 1.5224028825759888, + "learning_rate": 2.766666666666667e-05, + "loss": 0.4404, + "step": 13249 + }, + { + "epoch": 17.008985879332478, + "grad_norm": 1.6560935974121094, + "learning_rate": 2.7666238767650836e-05, + "loss": 0.4568, + "step": 13250 + }, + { + "epoch": 17.010269576379976, + "grad_norm": 8.227422714233398, + "learning_rate": 2.7665810868635e-05, + "loss": 0.4945, + "step": 13251 + }, + { + "epoch": 17.01155327342747, + "grad_norm": 1.4641894102096558, + "learning_rate": 2.7665382969619172e-05, + "loss": 0.5341, + "step": 13252 + }, + { + "epoch": 17.012836970474968, + "grad_norm": 1.2348576784133911, + "learning_rate": 2.7664955070603337e-05, + "loss": 0.4743, + "step": 13253 + }, + { + "epoch": 17.014120667522466, + "grad_norm": 1.5963212251663208, + "learning_rate": 2.7664527171587506e-05, + "loss": 0.4871, + "step": 13254 + }, + { + "epoch": 17.01540436456996, + "grad_norm": 1.094545841217041, + "learning_rate": 2.7664099272571674e-05, + "loss": 0.4653, + "step": 13255 + }, + { + "epoch": 17.016688061617458, + "grad_norm": 1.0257611274719238, + "learning_rate": 2.7663671373555843e-05, + "loss": 0.4828, + "step": 13256 + }, + { + "epoch": 17.017971758664956, + "grad_norm": 0.9396768808364868, + "learning_rate": 2.766324347454001e-05, + "loss": 0.4713, + "step": 13257 + }, + { + "epoch": 17.01925545571245, + "grad_norm": 1.9873569011688232, + "learning_rate": 2.7662815575524176e-05, + "loss": 0.4476, + "step": 13258 + }, + { + "epoch": 17.020539152759948, + "grad_norm": 1.408443808555603, + "learning_rate": 2.7662387676508345e-05, + "loss": 0.4835, + "step": 13259 + }, + { + 
"epoch": 17.021822849807446, + "grad_norm": 3.4977922439575195, + "learning_rate": 2.7661959777492513e-05, + "loss": 0.4685, + "step": 13260 + }, + { + "epoch": 17.023106546854944, + "grad_norm": 1.6966149806976318, + "learning_rate": 2.766153187847668e-05, + "loss": 0.4782, + "step": 13261 + }, + { + "epoch": 17.024390243902438, + "grad_norm": 1.132264256477356, + "learning_rate": 2.7661103979460846e-05, + "loss": 0.4563, + "step": 13262 + }, + { + "epoch": 17.025673940949936, + "grad_norm": 1.7759302854537964, + "learning_rate": 2.7660676080445018e-05, + "loss": 0.4459, + "step": 13263 + }, + { + "epoch": 17.026957637997434, + "grad_norm": 2.033935785293579, + "learning_rate": 2.7660248181429183e-05, + "loss": 0.4232, + "step": 13264 + }, + { + "epoch": 17.028241335044928, + "grad_norm": 5.461703777313232, + "learning_rate": 2.765982028241335e-05, + "loss": 0.468, + "step": 13265 + }, + { + "epoch": 17.029525032092426, + "grad_norm": 1.1507114171981812, + "learning_rate": 2.765939238339752e-05, + "loss": 0.5115, + "step": 13266 + }, + { + "epoch": 17.030808729139924, + "grad_norm": 2.9225680828094482, + "learning_rate": 2.7658964484381685e-05, + "loss": 0.4503, + "step": 13267 + }, + { + "epoch": 17.03209242618742, + "grad_norm": 4.884652614593506, + "learning_rate": 2.7658536585365857e-05, + "loss": 0.4816, + "step": 13268 + }, + { + "epoch": 17.033376123234916, + "grad_norm": 1.0249083042144775, + "learning_rate": 2.7658108686350022e-05, + "loss": 0.47, + "step": 13269 + }, + { + "epoch": 17.034659820282414, + "grad_norm": 0.9096640944480896, + "learning_rate": 2.765768078733419e-05, + "loss": 0.4401, + "step": 13270 + }, + { + "epoch": 17.035943517329912, + "grad_norm": 3.733255624771118, + "learning_rate": 2.765725288831836e-05, + "loss": 0.433, + "step": 13271 + }, + { + "epoch": 17.037227214377406, + "grad_norm": 1.308557391166687, + "learning_rate": 2.7656824989302524e-05, + "loss": 0.4928, + "step": 13272 + }, + { + "epoch": 17.038510911424904, + 
"grad_norm": 2.469938278198242, + "learning_rate": 2.7656397090286692e-05, + "loss": 0.4859, + "step": 13273 + }, + { + "epoch": 17.039794608472402, + "grad_norm": 3.192089557647705, + "learning_rate": 2.765596919127086e-05, + "loss": 0.4744, + "step": 13274 + }, + { + "epoch": 17.041078305519896, + "grad_norm": 1.1481910943984985, + "learning_rate": 2.765554129225503e-05, + "loss": 0.462, + "step": 13275 + }, + { + "epoch": 17.042362002567394, + "grad_norm": 2.186044454574585, + "learning_rate": 2.7655113393239197e-05, + "loss": 0.4831, + "step": 13276 + }, + { + "epoch": 17.043645699614892, + "grad_norm": 1.4384194612503052, + "learning_rate": 2.7654685494223362e-05, + "loss": 0.4649, + "step": 13277 + }, + { + "epoch": 17.044929396662386, + "grad_norm": 2.7779133319854736, + "learning_rate": 2.765425759520753e-05, + "loss": 0.5007, + "step": 13278 + }, + { + "epoch": 17.046213093709884, + "grad_norm": 4.599409103393555, + "learning_rate": 2.76538296961917e-05, + "loss": 0.4933, + "step": 13279 + }, + { + "epoch": 17.047496790757382, + "grad_norm": 1.18020761013031, + "learning_rate": 2.7653401797175868e-05, + "loss": 0.4698, + "step": 13280 + }, + { + "epoch": 17.048780487804876, + "grad_norm": 2.8461382389068604, + "learning_rate": 2.7652973898160033e-05, + "loss": 0.5646, + "step": 13281 + }, + { + "epoch": 17.050064184852374, + "grad_norm": 10.871271133422852, + "learning_rate": 2.7652545999144204e-05, + "loss": 0.5088, + "step": 13282 + }, + { + "epoch": 17.051347881899872, + "grad_norm": 1.8819092512130737, + "learning_rate": 2.765211810012837e-05, + "loss": 0.4814, + "step": 13283 + }, + { + "epoch": 17.05263157894737, + "grad_norm": 1.5770620107650757, + "learning_rate": 2.7651690201112538e-05, + "loss": 0.4884, + "step": 13284 + }, + { + "epoch": 17.053915275994864, + "grad_norm": 2.1902806758880615, + "learning_rate": 2.7651262302096706e-05, + "loss": 0.5328, + "step": 13285 + }, + { + "epoch": 17.055198973042362, + "grad_norm": 2.1376147270202637, + 
"learning_rate": 2.765083440308087e-05, + "loss": 0.4745, + "step": 13286 + }, + { + "epoch": 17.05648267008986, + "grad_norm": 1.8499788045883179, + "learning_rate": 2.7650406504065043e-05, + "loss": 0.4819, + "step": 13287 + }, + { + "epoch": 17.057766367137354, + "grad_norm": 2.5354042053222656, + "learning_rate": 2.7649978605049208e-05, + "loss": 0.5575, + "step": 13288 + }, + { + "epoch": 17.059050064184852, + "grad_norm": 10.251222610473633, + "learning_rate": 2.7649550706033377e-05, + "loss": 0.4758, + "step": 13289 + }, + { + "epoch": 17.06033376123235, + "grad_norm": 10.460314750671387, + "learning_rate": 2.7649122807017545e-05, + "loss": 0.5152, + "step": 13290 + }, + { + "epoch": 17.061617458279844, + "grad_norm": 3.045814037322998, + "learning_rate": 2.764869490800171e-05, + "loss": 0.536, + "step": 13291 + }, + { + "epoch": 17.062901155327342, + "grad_norm": 1.3981190919876099, + "learning_rate": 2.7648267008985882e-05, + "loss": 0.6311, + "step": 13292 + }, + { + "epoch": 17.06418485237484, + "grad_norm": 1.8811280727386475, + "learning_rate": 2.7647839109970047e-05, + "loss": 0.6592, + "step": 13293 + }, + { + "epoch": 17.065468549422338, + "grad_norm": 1.9384713172912598, + "learning_rate": 2.7647411210954215e-05, + "loss": 0.4338, + "step": 13294 + }, + { + "epoch": 17.066752246469832, + "grad_norm": 1.0889885425567627, + "learning_rate": 2.7646983311938384e-05, + "loss": 0.4586, + "step": 13295 + }, + { + "epoch": 17.06803594351733, + "grad_norm": 3.8447113037109375, + "learning_rate": 2.7646555412922552e-05, + "loss": 0.4596, + "step": 13296 + }, + { + "epoch": 17.069319640564828, + "grad_norm": 1.3658312559127808, + "learning_rate": 2.7646127513906717e-05, + "loss": 0.4602, + "step": 13297 + }, + { + "epoch": 17.070603337612322, + "grad_norm": 1.000826120376587, + "learning_rate": 2.7645699614890885e-05, + "loss": 0.4837, + "step": 13298 + }, + { + "epoch": 17.07188703465982, + "grad_norm": 1.3551852703094482, + "learning_rate": 
2.7645271715875054e-05, + "loss": 0.4322, + "step": 13299 + }, + { + "epoch": 17.073170731707318, + "grad_norm": 1.4531608819961548, + "learning_rate": 2.7644843816859222e-05, + "loss": 0.4371, + "step": 13300 + }, + { + "epoch": 17.074454428754812, + "grad_norm": 1.6735187768936157, + "learning_rate": 2.764441591784339e-05, + "loss": 0.4743, + "step": 13301 + }, + { + "epoch": 17.07573812580231, + "grad_norm": 1.945711374282837, + "learning_rate": 2.7643988018827556e-05, + "loss": 0.479, + "step": 13302 + }, + { + "epoch": 17.077021822849808, + "grad_norm": 1.1548027992248535, + "learning_rate": 2.7643560119811728e-05, + "loss": 0.4898, + "step": 13303 + }, + { + "epoch": 17.078305519897306, + "grad_norm": 1.973357081413269, + "learning_rate": 2.7643132220795893e-05, + "loss": 0.4501, + "step": 13304 + }, + { + "epoch": 17.0795892169448, + "grad_norm": 1.428478717803955, + "learning_rate": 2.7642704321780058e-05, + "loss": 0.4715, + "step": 13305 + }, + { + "epoch": 17.080872913992298, + "grad_norm": 1.2051138877868652, + "learning_rate": 2.764227642276423e-05, + "loss": 0.4402, + "step": 13306 + }, + { + "epoch": 17.082156611039796, + "grad_norm": 2.024707555770874, + "learning_rate": 2.7641848523748394e-05, + "loss": 0.5148, + "step": 13307 + }, + { + "epoch": 17.08344030808729, + "grad_norm": 1.5707759857177734, + "learning_rate": 2.7641420624732566e-05, + "loss": 0.515, + "step": 13308 + }, + { + "epoch": 17.084724005134788, + "grad_norm": 1.246214747428894, + "learning_rate": 2.764099272571673e-05, + "loss": 0.4872, + "step": 13309 + }, + { + "epoch": 17.086007702182286, + "grad_norm": 4.19212007522583, + "learning_rate": 2.76405648267009e-05, + "loss": 0.4727, + "step": 13310 + }, + { + "epoch": 17.08729139922978, + "grad_norm": 2.537199020385742, + "learning_rate": 2.7640136927685068e-05, + "loss": 0.4695, + "step": 13311 + }, + { + "epoch": 17.088575096277278, + "grad_norm": 1.8395156860351562, + "learning_rate": 2.7639709028669233e-05, + "loss": 0.5017, + 
"step": 13312 + }, + { + "epoch": 17.089858793324776, + "grad_norm": 1.330886721611023, + "learning_rate": 2.76392811296534e-05, + "loss": 0.4902, + "step": 13313 + }, + { + "epoch": 17.09114249037227, + "grad_norm": 2.2410714626312256, + "learning_rate": 2.763885323063757e-05, + "loss": 0.4573, + "step": 13314 + }, + { + "epoch": 17.09242618741977, + "grad_norm": 2.844654083251953, + "learning_rate": 2.763842533162174e-05, + "loss": 0.4569, + "step": 13315 + }, + { + "epoch": 17.093709884467266, + "grad_norm": 1.9783406257629395, + "learning_rate": 2.7637997432605907e-05, + "loss": 0.4677, + "step": 13316 + }, + { + "epoch": 17.094993581514764, + "grad_norm": 2.4505834579467773, + "learning_rate": 2.7637569533590075e-05, + "loss": 0.4592, + "step": 13317 + }, + { + "epoch": 17.09627727856226, + "grad_norm": 2.2341432571411133, + "learning_rate": 2.763714163457424e-05, + "loss": 0.4483, + "step": 13318 + }, + { + "epoch": 17.097560975609756, + "grad_norm": 1.6422300338745117, + "learning_rate": 2.763671373555841e-05, + "loss": 0.4662, + "step": 13319 + }, + { + "epoch": 17.098844672657254, + "grad_norm": 2.5132453441619873, + "learning_rate": 2.7636285836542577e-05, + "loss": 0.4446, + "step": 13320 + }, + { + "epoch": 17.10012836970475, + "grad_norm": 4.19901704788208, + "learning_rate": 2.7635857937526742e-05, + "loss": 0.4632, + "step": 13321 + }, + { + "epoch": 17.101412066752246, + "grad_norm": 1.8334490060806274, + "learning_rate": 2.7635430038510914e-05, + "loss": 0.4808, + "step": 13322 + }, + { + "epoch": 17.102695763799744, + "grad_norm": 2.62612247467041, + "learning_rate": 2.763500213949508e-05, + "loss": 0.4573, + "step": 13323 + }, + { + "epoch": 17.10397946084724, + "grad_norm": 1.40956449508667, + "learning_rate": 2.763457424047925e-05, + "loss": 0.4825, + "step": 13324 + }, + { + "epoch": 17.105263157894736, + "grad_norm": 1.0734984874725342, + "learning_rate": 2.7634146341463416e-05, + "loss": 0.5197, + "step": 13325 + }, + { + "epoch": 
17.106546854942234, + "grad_norm": 2.657582998275757, + "learning_rate": 2.763371844244758e-05, + "loss": 0.5243, + "step": 13326 + }, + { + "epoch": 17.107830551989732, + "grad_norm": 3.642723321914673, + "learning_rate": 2.7633290543431752e-05, + "loss": 0.5254, + "step": 13327 + }, + { + "epoch": 17.109114249037226, + "grad_norm": 1.645534873008728, + "learning_rate": 2.7632862644415917e-05, + "loss": 0.464, + "step": 13328 + }, + { + "epoch": 17.110397946084724, + "grad_norm": 17.875173568725586, + "learning_rate": 2.7632434745400086e-05, + "loss": 0.5042, + "step": 13329 + }, + { + "epoch": 17.111681643132222, + "grad_norm": 0.9283944964408875, + "learning_rate": 2.7632006846384254e-05, + "loss": 0.5412, + "step": 13330 + }, + { + "epoch": 17.112965340179716, + "grad_norm": 1.1737219095230103, + "learning_rate": 2.7631578947368423e-05, + "loss": 0.5159, + "step": 13331 + }, + { + "epoch": 17.114249037227214, + "grad_norm": 3.3889758586883545, + "learning_rate": 2.763115104835259e-05, + "loss": 0.5035, + "step": 13332 + }, + { + "epoch": 17.115532734274712, + "grad_norm": 2.061471700668335, + "learning_rate": 2.7630723149336756e-05, + "loss": 0.4864, + "step": 13333 + }, + { + "epoch": 17.116816431322206, + "grad_norm": 2.7513113021850586, + "learning_rate": 2.7630295250320925e-05, + "loss": 0.4998, + "step": 13334 + }, + { + "epoch": 17.118100128369704, + "grad_norm": 1.2315624952316284, + "learning_rate": 2.7629867351305093e-05, + "loss": 0.527, + "step": 13335 + }, + { + "epoch": 17.119383825417202, + "grad_norm": 2.4732589721679688, + "learning_rate": 2.762943945228926e-05, + "loss": 0.4563, + "step": 13336 + }, + { + "epoch": 17.1206675224647, + "grad_norm": 2.8542401790618896, + "learning_rate": 2.7629011553273426e-05, + "loss": 0.512, + "step": 13337 + }, + { + "epoch": 17.121951219512194, + "grad_norm": 2.1242716312408447, + "learning_rate": 2.7628583654257595e-05, + "loss": 0.4967, + "step": 13338 + }, + { + "epoch": 17.123234916559692, + "grad_norm": 
1.7147133350372314, + "learning_rate": 2.7628155755241763e-05, + "loss": 0.4838, + "step": 13339 + }, + { + "epoch": 17.12451861360719, + "grad_norm": 1.5268899202346802, + "learning_rate": 2.762772785622593e-05, + "loss": 0.4879, + "step": 13340 + }, + { + "epoch": 17.125802310654684, + "grad_norm": 4.147542476654053, + "learning_rate": 2.76272999572101e-05, + "loss": 0.5634, + "step": 13341 + }, + { + "epoch": 17.127086007702182, + "grad_norm": 2.758915424346924, + "learning_rate": 2.7626872058194265e-05, + "loss": 0.5631, + "step": 13342 + }, + { + "epoch": 17.12836970474968, + "grad_norm": 1.8868663311004639, + "learning_rate": 2.7626444159178437e-05, + "loss": 0.6012, + "step": 13343 + }, + { + "epoch": 17.129653401797174, + "grad_norm": 1.4151132106781006, + "learning_rate": 2.7626016260162602e-05, + "loss": 0.4194, + "step": 13344 + }, + { + "epoch": 17.130937098844672, + "grad_norm": 2.2167727947235107, + "learning_rate": 2.7625588361146767e-05, + "loss": 0.4335, + "step": 13345 + }, + { + "epoch": 17.13222079589217, + "grad_norm": 3.7206969261169434, + "learning_rate": 2.762516046213094e-05, + "loss": 0.4601, + "step": 13346 + }, + { + "epoch": 17.133504492939664, + "grad_norm": 1.0882818698883057, + "learning_rate": 2.7624732563115104e-05, + "loss": 0.4649, + "step": 13347 + }, + { + "epoch": 17.134788189987162, + "grad_norm": 4.442968368530273, + "learning_rate": 2.7624304664099276e-05, + "loss": 0.4782, + "step": 13348 + }, + { + "epoch": 17.13607188703466, + "grad_norm": 1.8264356851577759, + "learning_rate": 2.762387676508344e-05, + "loss": 0.494, + "step": 13349 + }, + { + "epoch": 17.137355584082158, + "grad_norm": 1.5368242263793945, + "learning_rate": 2.762344886606761e-05, + "loss": 0.4508, + "step": 13350 + }, + { + "epoch": 17.138639281129652, + "grad_norm": 1.601840853691101, + "learning_rate": 2.7623020967051777e-05, + "loss": 0.4582, + "step": 13351 + }, + { + "epoch": 17.13992297817715, + "grad_norm": 1.269781231880188, + "learning_rate": 
2.7622593068035942e-05, + "loss": 0.4561, + "step": 13352 + }, + { + "epoch": 17.141206675224648, + "grad_norm": 2.5366899967193604, + "learning_rate": 2.762216516902011e-05, + "loss": 0.4481, + "step": 13353 + }, + { + "epoch": 17.142490372272142, + "grad_norm": 2.1851420402526855, + "learning_rate": 2.762173727000428e-05, + "loss": 0.448, + "step": 13354 + }, + { + "epoch": 17.14377406931964, + "grad_norm": 2.0670523643493652, + "learning_rate": 2.7621309370988448e-05, + "loss": 0.5028, + "step": 13355 + }, + { + "epoch": 17.145057766367138, + "grad_norm": 1.2189170122146606, + "learning_rate": 2.7620881471972616e-05, + "loss": 0.4763, + "step": 13356 + }, + { + "epoch": 17.146341463414632, + "grad_norm": 1.5540568828582764, + "learning_rate": 2.7620453572956784e-05, + "loss": 0.4669, + "step": 13357 + }, + { + "epoch": 17.14762516046213, + "grad_norm": 1.588354229927063, + "learning_rate": 2.762002567394095e-05, + "loss": 0.5078, + "step": 13358 + }, + { + "epoch": 17.14890885750963, + "grad_norm": 1.5668481588363647, + "learning_rate": 2.7619597774925118e-05, + "loss": 0.5099, + "step": 13359 + }, + { + "epoch": 17.150192554557126, + "grad_norm": 11.206189155578613, + "learning_rate": 2.7619169875909286e-05, + "loss": 0.4532, + "step": 13360 + }, + { + "epoch": 17.15147625160462, + "grad_norm": 1.6482583284378052, + "learning_rate": 2.761874197689345e-05, + "loss": 0.4488, + "step": 13361 + }, + { + "epoch": 17.15275994865212, + "grad_norm": 2.143672466278076, + "learning_rate": 2.7618314077877623e-05, + "loss": 0.4675, + "step": 13362 + }, + { + "epoch": 17.154043645699616, + "grad_norm": 1.3652242422103882, + "learning_rate": 2.7617886178861788e-05, + "loss": 0.4585, + "step": 13363 + }, + { + "epoch": 17.15532734274711, + "grad_norm": 1.9763730764389038, + "learning_rate": 2.761745827984596e-05, + "loss": 0.4512, + "step": 13364 + }, + { + "epoch": 17.15661103979461, + "grad_norm": 1.2444521188735962, + "learning_rate": 2.7617030380830125e-05, + "loss": 
0.4505, + "step": 13365 + }, + { + "epoch": 17.157894736842106, + "grad_norm": 6.095229148864746, + "learning_rate": 2.761660248181429e-05, + "loss": 0.4548, + "step": 13366 + }, + { + "epoch": 17.1591784338896, + "grad_norm": 0.9874228835105896, + "learning_rate": 2.7616174582798462e-05, + "loss": 0.4592, + "step": 13367 + }, + { + "epoch": 17.1604621309371, + "grad_norm": 1.6889164447784424, + "learning_rate": 2.7615746683782627e-05, + "loss": 0.4363, + "step": 13368 + }, + { + "epoch": 17.161745827984596, + "grad_norm": 1.3697906732559204, + "learning_rate": 2.7615318784766795e-05, + "loss": 0.5192, + "step": 13369 + }, + { + "epoch": 17.163029525032094, + "grad_norm": 1.5989665985107422, + "learning_rate": 2.7614890885750964e-05, + "loss": 0.5119, + "step": 13370 + }, + { + "epoch": 17.16431322207959, + "grad_norm": 1.2915724515914917, + "learning_rate": 2.7614462986735132e-05, + "loss": 0.4605, + "step": 13371 + }, + { + "epoch": 17.165596919127086, + "grad_norm": 1.223683476448059, + "learning_rate": 2.76140350877193e-05, + "loss": 0.4203, + "step": 13372 + }, + { + "epoch": 17.166880616174584, + "grad_norm": 2.177942991256714, + "learning_rate": 2.7613607188703466e-05, + "loss": 0.5119, + "step": 13373 + }, + { + "epoch": 17.16816431322208, + "grad_norm": 1.791856050491333, + "learning_rate": 2.7613179289687634e-05, + "loss": 0.4734, + "step": 13374 + }, + { + "epoch": 17.169448010269576, + "grad_norm": 1.2651426792144775, + "learning_rate": 2.7612751390671802e-05, + "loss": 0.5102, + "step": 13375 + }, + { + "epoch": 17.170731707317074, + "grad_norm": 1.2742767333984375, + "learning_rate": 2.761232349165597e-05, + "loss": 0.5052, + "step": 13376 + }, + { + "epoch": 17.17201540436457, + "grad_norm": 2.9378132820129395, + "learning_rate": 2.7611895592640136e-05, + "loss": 0.513, + "step": 13377 + }, + { + "epoch": 17.173299101412066, + "grad_norm": 1.145214557647705, + "learning_rate": 2.7611467693624308e-05, + "loss": 0.468, + "step": 13378 + }, + { + 
"epoch": 17.174582798459564, + "grad_norm": 1.2158037424087524, + "learning_rate": 2.7611039794608473e-05, + "loss": 0.5516, + "step": 13379 + }, + { + "epoch": 17.17586649550706, + "grad_norm": 1.6438544988632202, + "learning_rate": 2.761061189559264e-05, + "loss": 0.4894, + "step": 13380 + }, + { + "epoch": 17.177150192554556, + "grad_norm": 1.3986716270446777, + "learning_rate": 2.761018399657681e-05, + "loss": 0.4994, + "step": 13381 + }, + { + "epoch": 17.178433889602054, + "grad_norm": 1.0821946859359741, + "learning_rate": 2.7609756097560974e-05, + "loss": 0.4972, + "step": 13382 + }, + { + "epoch": 17.179717586649552, + "grad_norm": 2.1411452293395996, + "learning_rate": 2.7609328198545146e-05, + "loss": 0.488, + "step": 13383 + }, + { + "epoch": 17.181001283697046, + "grad_norm": 1.742050051689148, + "learning_rate": 2.760890029952931e-05, + "loss": 0.5178, + "step": 13384 + }, + { + "epoch": 17.182284980744544, + "grad_norm": 8.695640563964844, + "learning_rate": 2.760847240051348e-05, + "loss": 0.4918, + "step": 13385 + }, + { + "epoch": 17.183568677792042, + "grad_norm": 4.10731315612793, + "learning_rate": 2.7608044501497648e-05, + "loss": 0.5121, + "step": 13386 + }, + { + "epoch": 17.184852374839537, + "grad_norm": 2.2021968364715576, + "learning_rate": 2.7607616602481813e-05, + "loss": 0.5535, + "step": 13387 + }, + { + "epoch": 17.186136071887034, + "grad_norm": 2.3826069831848145, + "learning_rate": 2.7607188703465985e-05, + "loss": 0.5051, + "step": 13388 + }, + { + "epoch": 17.187419768934532, + "grad_norm": 1.7978770732879639, + "learning_rate": 2.760676080445015e-05, + "loss": 0.555, + "step": 13389 + }, + { + "epoch": 17.188703465982027, + "grad_norm": 1.3888394832611084, + "learning_rate": 2.760633290543432e-05, + "loss": 0.5535, + "step": 13390 + }, + { + "epoch": 17.189987163029524, + "grad_norm": 3.5742764472961426, + "learning_rate": 2.7605905006418487e-05, + "loss": 0.6528, + "step": 13391 + }, + { + "epoch": 17.191270860077022, + 
"grad_norm": 2.308213233947754, + "learning_rate": 2.7605477107402655e-05, + "loss": 0.6065, + "step": 13392 + }, + { + "epoch": 17.19255455712452, + "grad_norm": 3.188587188720703, + "learning_rate": 2.760504920838682e-05, + "loss": 0.6957, + "step": 13393 + }, + { + "epoch": 17.193838254172015, + "grad_norm": 3.2785098552703857, + "learning_rate": 2.760462130937099e-05, + "loss": 0.4512, + "step": 13394 + }, + { + "epoch": 17.195121951219512, + "grad_norm": 1.6930490732192993, + "learning_rate": 2.7604193410355157e-05, + "loss": 0.4514, + "step": 13395 + }, + { + "epoch": 17.19640564826701, + "grad_norm": 1.2071261405944824, + "learning_rate": 2.7603765511339325e-05, + "loss": 0.4785, + "step": 13396 + }, + { + "epoch": 17.197689345314505, + "grad_norm": 1.6281737089157104, + "learning_rate": 2.7603337612323494e-05, + "loss": 0.5032, + "step": 13397 + }, + { + "epoch": 17.198973042362002, + "grad_norm": 2.3983070850372314, + "learning_rate": 2.760290971330766e-05, + "loss": 0.4966, + "step": 13398 + }, + { + "epoch": 17.2002567394095, + "grad_norm": 1.891666293144226, + "learning_rate": 2.7602481814291827e-05, + "loss": 0.4327, + "step": 13399 + }, + { + "epoch": 17.201540436456995, + "grad_norm": 1.138059139251709, + "learning_rate": 2.7602053915275996e-05, + "loss": 0.4658, + "step": 13400 + }, + { + "epoch": 17.202824133504492, + "grad_norm": 1.030798077583313, + "learning_rate": 2.760162601626016e-05, + "loss": 0.4594, + "step": 13401 + }, + { + "epoch": 17.20410783055199, + "grad_norm": 0.8766888976097107, + "learning_rate": 2.7601198117244333e-05, + "loss": 0.4574, + "step": 13402 + }, + { + "epoch": 17.205391527599488, + "grad_norm": 1.846879005432129, + "learning_rate": 2.7600770218228498e-05, + "loss": 0.4936, + "step": 13403 + }, + { + "epoch": 17.206675224646983, + "grad_norm": 1.5615400075912476, + "learning_rate": 2.760034231921267e-05, + "loss": 0.4886, + "step": 13404 + }, + { + "epoch": 17.20795892169448, + "grad_norm": 1.2053415775299072, + 
"learning_rate": 2.7599914420196834e-05, + "loss": 0.4678, + "step": 13405 + }, + { + "epoch": 17.20924261874198, + "grad_norm": 3.1230087280273438, + "learning_rate": 2.7599486521181e-05, + "loss": 0.4689, + "step": 13406 + }, + { + "epoch": 17.210526315789473, + "grad_norm": 1.070571780204773, + "learning_rate": 2.759905862216517e-05, + "loss": 0.4524, + "step": 13407 + }, + { + "epoch": 17.21181001283697, + "grad_norm": 1.059752345085144, + "learning_rate": 2.7598630723149336e-05, + "loss": 0.4631, + "step": 13408 + }, + { + "epoch": 17.21309370988447, + "grad_norm": 1.6950479745864868, + "learning_rate": 2.7598202824133505e-05, + "loss": 0.4669, + "step": 13409 + }, + { + "epoch": 17.214377406931963, + "grad_norm": 2.4826841354370117, + "learning_rate": 2.7597774925117673e-05, + "loss": 0.5199, + "step": 13410 + }, + { + "epoch": 17.21566110397946, + "grad_norm": 1.3836454153060913, + "learning_rate": 2.759734702610184e-05, + "loss": 0.4278, + "step": 13411 + }, + { + "epoch": 17.21694480102696, + "grad_norm": 10.498031616210938, + "learning_rate": 2.759691912708601e-05, + "loss": 0.488, + "step": 13412 + }, + { + "epoch": 17.218228498074453, + "grad_norm": 1.8092323541641235, + "learning_rate": 2.7596491228070175e-05, + "loss": 0.4614, + "step": 13413 + }, + { + "epoch": 17.21951219512195, + "grad_norm": 1.6088889837265015, + "learning_rate": 2.7596063329054343e-05, + "loss": 0.5168, + "step": 13414 + }, + { + "epoch": 17.22079589216945, + "grad_norm": 4.461475849151611, + "learning_rate": 2.7595635430038512e-05, + "loss": 0.4862, + "step": 13415 + }, + { + "epoch": 17.222079589216946, + "grad_norm": 1.5357269048690796, + "learning_rate": 2.759520753102268e-05, + "loss": 0.4993, + "step": 13416 + }, + { + "epoch": 17.22336328626444, + "grad_norm": 1.1803832054138184, + "learning_rate": 2.7594779632006845e-05, + "loss": 0.428, + "step": 13417 + }, + { + "epoch": 17.22464698331194, + "grad_norm": 1.4052668809890747, + "learning_rate": 2.7594351732991017e-05, + 
"loss": 0.4329, + "step": 13418 + }, + { + "epoch": 17.225930680359436, + "grad_norm": 0.996974527835846, + "learning_rate": 2.7593923833975182e-05, + "loss": 0.4253, + "step": 13419 + }, + { + "epoch": 17.22721437740693, + "grad_norm": 0.9920785427093506, + "learning_rate": 2.759349593495935e-05, + "loss": 0.4693, + "step": 13420 + }, + { + "epoch": 17.22849807445443, + "grad_norm": 1.5197505950927734, + "learning_rate": 2.759306803594352e-05, + "loss": 0.4766, + "step": 13421 + }, + { + "epoch": 17.229781771501926, + "grad_norm": 2.627758026123047, + "learning_rate": 2.7592640136927684e-05, + "loss": 0.48, + "step": 13422 + }, + { + "epoch": 17.23106546854942, + "grad_norm": 0.8625651001930237, + "learning_rate": 2.7592212237911856e-05, + "loss": 0.5054, + "step": 13423 + }, + { + "epoch": 17.23234916559692, + "grad_norm": 0.754258930683136, + "learning_rate": 2.759178433889602e-05, + "loss": 0.4666, + "step": 13424 + }, + { + "epoch": 17.233632862644416, + "grad_norm": 1.7543121576309204, + "learning_rate": 2.759135643988019e-05, + "loss": 0.537, + "step": 13425 + }, + { + "epoch": 17.234916559691914, + "grad_norm": 1.69648277759552, + "learning_rate": 2.7590928540864357e-05, + "loss": 0.4715, + "step": 13426 + }, + { + "epoch": 17.23620025673941, + "grad_norm": 4.2695631980896, + "learning_rate": 2.7590500641848522e-05, + "loss": 0.5289, + "step": 13427 + }, + { + "epoch": 17.237483953786906, + "grad_norm": 1.293616771697998, + "learning_rate": 2.7590072742832694e-05, + "loss": 0.4393, + "step": 13428 + }, + { + "epoch": 17.238767650834404, + "grad_norm": 2.3164212703704834, + "learning_rate": 2.758964484381686e-05, + "loss": 0.4621, + "step": 13429 + }, + { + "epoch": 17.2400513478819, + "grad_norm": 0.9596962928771973, + "learning_rate": 2.7589216944801028e-05, + "loss": 0.4773, + "step": 13430 + }, + { + "epoch": 17.241335044929397, + "grad_norm": 0.9272326231002808, + "learning_rate": 2.7588789045785196e-05, + "loss": 0.5049, + "step": 13431 + }, + { + 
"epoch": 17.242618741976894, + "grad_norm": 1.3570853471755981, + "learning_rate": 2.7588361146769365e-05, + "loss": 0.4778, + "step": 13432 + }, + { + "epoch": 17.24390243902439, + "grad_norm": 2.263049840927124, + "learning_rate": 2.758793324775353e-05, + "loss": 0.5514, + "step": 13433 + }, + { + "epoch": 17.245186136071887, + "grad_norm": 1.0109221935272217, + "learning_rate": 2.7587505348737698e-05, + "loss": 0.4851, + "step": 13434 + }, + { + "epoch": 17.246469833119384, + "grad_norm": 1.2040818929672241, + "learning_rate": 2.7587077449721866e-05, + "loss": 0.5842, + "step": 13435 + }, + { + "epoch": 17.247753530166882, + "grad_norm": 1.7601860761642456, + "learning_rate": 2.7586649550706035e-05, + "loss": 0.4946, + "step": 13436 + }, + { + "epoch": 17.249037227214377, + "grad_norm": 1.1790348291397095, + "learning_rate": 2.7586221651690203e-05, + "loss": 0.4683, + "step": 13437 + }, + { + "epoch": 17.250320924261874, + "grad_norm": 1.2047159671783447, + "learning_rate": 2.7585793752674368e-05, + "loss": 0.5535, + "step": 13438 + }, + { + "epoch": 17.251604621309372, + "grad_norm": 1.7871679067611694, + "learning_rate": 2.758536585365854e-05, + "loss": 0.5839, + "step": 13439 + }, + { + "epoch": 17.252888318356867, + "grad_norm": 1.5863547325134277, + "learning_rate": 2.7584937954642705e-05, + "loss": 0.556, + "step": 13440 + }, + { + "epoch": 17.254172015404365, + "grad_norm": 1.0550906658172607, + "learning_rate": 2.758451005562687e-05, + "loss": 0.5331, + "step": 13441 + }, + { + "epoch": 17.255455712451862, + "grad_norm": 3.2547404766082764, + "learning_rate": 2.7584082156611042e-05, + "loss": 0.5744, + "step": 13442 + }, + { + "epoch": 17.256739409499357, + "grad_norm": 2.4280753135681152, + "learning_rate": 2.7583654257595207e-05, + "loss": 0.6876, + "step": 13443 + }, + { + "epoch": 17.258023106546855, + "grad_norm": 1.2051458358764648, + "learning_rate": 2.758322635857938e-05, + "loss": 0.4656, + "step": 13444 + }, + { + "epoch": 17.259306803594352, + 
"grad_norm": 1.2175889015197754, + "learning_rate": 2.7582798459563544e-05, + "loss": 0.4516, + "step": 13445 + }, + { + "epoch": 17.260590500641847, + "grad_norm": 1.2166776657104492, + "learning_rate": 2.7582370560547712e-05, + "loss": 0.4351, + "step": 13446 + }, + { + "epoch": 17.261874197689345, + "grad_norm": 1.5728133916854858, + "learning_rate": 2.758194266153188e-05, + "loss": 0.4643, + "step": 13447 + }, + { + "epoch": 17.263157894736842, + "grad_norm": 1.282684326171875, + "learning_rate": 2.7581514762516046e-05, + "loss": 0.4552, + "step": 13448 + }, + { + "epoch": 17.26444159178434, + "grad_norm": 1.3948850631713867, + "learning_rate": 2.7581086863500214e-05, + "loss": 0.4617, + "step": 13449 + }, + { + "epoch": 17.265725288831835, + "grad_norm": 1.1501758098602295, + "learning_rate": 2.7580658964484382e-05, + "loss": 0.4691, + "step": 13450 + }, + { + "epoch": 17.267008985879333, + "grad_norm": 2.4645137786865234, + "learning_rate": 2.758023106546855e-05, + "loss": 0.4617, + "step": 13451 + }, + { + "epoch": 17.26829268292683, + "grad_norm": 0.8707500696182251, + "learning_rate": 2.757980316645272e-05, + "loss": 0.4676, + "step": 13452 + }, + { + "epoch": 17.269576379974325, + "grad_norm": 0.7574926018714905, + "learning_rate": 2.7579375267436888e-05, + "loss": 0.4475, + "step": 13453 + }, + { + "epoch": 17.270860077021823, + "grad_norm": 2.0974552631378174, + "learning_rate": 2.7578947368421053e-05, + "loss": 0.4784, + "step": 13454 + }, + { + "epoch": 17.27214377406932, + "grad_norm": 1.4630428552627563, + "learning_rate": 2.757851946940522e-05, + "loss": 0.4321, + "step": 13455 + }, + { + "epoch": 17.273427471116815, + "grad_norm": 1.3770668506622314, + "learning_rate": 2.757809157038939e-05, + "loss": 0.4798, + "step": 13456 + }, + { + "epoch": 17.274711168164313, + "grad_norm": 0.9165605902671814, + "learning_rate": 2.7577663671373555e-05, + "loss": 0.4635, + "step": 13457 + }, + { + "epoch": 17.27599486521181, + "grad_norm": 1.373488187789917, + 
"learning_rate": 2.7577235772357726e-05, + "loss": 0.4434, + "step": 13458 + }, + { + "epoch": 17.27727856225931, + "grad_norm": 1.180280089378357, + "learning_rate": 2.757680787334189e-05, + "loss": 0.4669, + "step": 13459 + }, + { + "epoch": 17.278562259306803, + "grad_norm": 1.5545847415924072, + "learning_rate": 2.757637997432606e-05, + "loss": 0.454, + "step": 13460 + }, + { + "epoch": 17.2798459563543, + "grad_norm": 1.2592699527740479, + "learning_rate": 2.7575952075310228e-05, + "loss": 0.4369, + "step": 13461 + }, + { + "epoch": 17.2811296534018, + "grad_norm": 0.9847643971443176, + "learning_rate": 2.7575524176294393e-05, + "loss": 0.4629, + "step": 13462 + }, + { + "epoch": 17.282413350449293, + "grad_norm": 1.7139604091644287, + "learning_rate": 2.7575096277278565e-05, + "loss": 0.4605, + "step": 13463 + }, + { + "epoch": 17.28369704749679, + "grad_norm": 1.3613256216049194, + "learning_rate": 2.757466837826273e-05, + "loss": 0.4748, + "step": 13464 + }, + { + "epoch": 17.28498074454429, + "grad_norm": 1.6362406015396118, + "learning_rate": 2.75742404792469e-05, + "loss": 0.4637, + "step": 13465 + }, + { + "epoch": 17.286264441591783, + "grad_norm": 2.1623058319091797, + "learning_rate": 2.7573812580231067e-05, + "loss": 0.455, + "step": 13466 + }, + { + "epoch": 17.28754813863928, + "grad_norm": 1.7282947301864624, + "learning_rate": 2.7573384681215232e-05, + "loss": 0.4561, + "step": 13467 + }, + { + "epoch": 17.28883183568678, + "grad_norm": 1.302344560623169, + "learning_rate": 2.7572956782199404e-05, + "loss": 0.4597, + "step": 13468 + }, + { + "epoch": 17.290115532734276, + "grad_norm": 1.0212417840957642, + "learning_rate": 2.757252888318357e-05, + "loss": 0.517, + "step": 13469 + }, + { + "epoch": 17.29139922978177, + "grad_norm": 1.1687816381454468, + "learning_rate": 2.7572100984167737e-05, + "loss": 0.4637, + "step": 13470 + }, + { + "epoch": 17.29268292682927, + "grad_norm": 1.059834361076355, + "learning_rate": 2.7571673085151906e-05, + 
"loss": 0.5036, + "step": 13471 + }, + { + "epoch": 17.293966623876766, + "grad_norm": 0.9227279424667358, + "learning_rate": 2.7571245186136074e-05, + "loss": 0.4673, + "step": 13472 + }, + { + "epoch": 17.29525032092426, + "grad_norm": 0.8777983784675598, + "learning_rate": 2.757081728712024e-05, + "loss": 0.4277, + "step": 13473 + }, + { + "epoch": 17.29653401797176, + "grad_norm": 2.138093948364258, + "learning_rate": 2.7570389388104407e-05, + "loss": 0.5069, + "step": 13474 + }, + { + "epoch": 17.297817715019256, + "grad_norm": 2.956155776977539, + "learning_rate": 2.7569961489088576e-05, + "loss": 0.5194, + "step": 13475 + }, + { + "epoch": 17.29910141206675, + "grad_norm": 2.138122081756592, + "learning_rate": 2.756953359007274e-05, + "loss": 0.4803, + "step": 13476 + }, + { + "epoch": 17.30038510911425, + "grad_norm": 1.3667631149291992, + "learning_rate": 2.7569105691056913e-05, + "loss": 0.4895, + "step": 13477 + }, + { + "epoch": 17.301668806161747, + "grad_norm": 2.0701253414154053, + "learning_rate": 2.7568677792041078e-05, + "loss": 0.4768, + "step": 13478 + }, + { + "epoch": 17.30295250320924, + "grad_norm": 3.540318250656128, + "learning_rate": 2.756824989302525e-05, + "loss": 0.5199, + "step": 13479 + }, + { + "epoch": 17.30423620025674, + "grad_norm": 1.9035403728485107, + "learning_rate": 2.7567821994009414e-05, + "loss": 0.5391, + "step": 13480 + }, + { + "epoch": 17.305519897304237, + "grad_norm": 2.3237128257751465, + "learning_rate": 2.756739409499358e-05, + "loss": 0.4591, + "step": 13481 + }, + { + "epoch": 17.306803594351734, + "grad_norm": 1.33457612991333, + "learning_rate": 2.756696619597775e-05, + "loss": 0.5064, + "step": 13482 + }, + { + "epoch": 17.30808729139923, + "grad_norm": 3.90535044670105, + "learning_rate": 2.7566538296961916e-05, + "loss": 0.5242, + "step": 13483 + }, + { + "epoch": 17.309370988446727, + "grad_norm": 1.747165322303772, + "learning_rate": 2.7566110397946085e-05, + "loss": 0.5118, + "step": 13484 + }, + { + 
"epoch": 17.310654685494224, + "grad_norm": 1.2531664371490479, + "learning_rate": 2.7565682498930253e-05, + "loss": 0.5361, + "step": 13485 + }, + { + "epoch": 17.31193838254172, + "grad_norm": 2.7187247276306152, + "learning_rate": 2.756525459991442e-05, + "loss": 0.5227, + "step": 13486 + }, + { + "epoch": 17.313222079589217, + "grad_norm": 0.878974974155426, + "learning_rate": 2.756482670089859e-05, + "loss": 0.4774, + "step": 13487 + }, + { + "epoch": 17.314505776636715, + "grad_norm": 1.9884644746780396, + "learning_rate": 2.7564398801882755e-05, + "loss": 0.4865, + "step": 13488 + }, + { + "epoch": 17.31578947368421, + "grad_norm": 1.1518454551696777, + "learning_rate": 2.7563970902866923e-05, + "loss": 0.534, + "step": 13489 + }, + { + "epoch": 17.317073170731707, + "grad_norm": 1.4810224771499634, + "learning_rate": 2.7563543003851092e-05, + "loss": 0.5158, + "step": 13490 + }, + { + "epoch": 17.318356867779205, + "grad_norm": 1.5624656677246094, + "learning_rate": 2.756311510483526e-05, + "loss": 0.5724, + "step": 13491 + }, + { + "epoch": 17.319640564826702, + "grad_norm": 2.1851229667663574, + "learning_rate": 2.7562687205819425e-05, + "loss": 0.623, + "step": 13492 + }, + { + "epoch": 17.320924261874197, + "grad_norm": 2.359957695007324, + "learning_rate": 2.7562259306803597e-05, + "loss": 0.67, + "step": 13493 + }, + { + "epoch": 17.322207958921695, + "grad_norm": 1.0797775983810425, + "learning_rate": 2.7561831407787762e-05, + "loss": 0.4511, + "step": 13494 + }, + { + "epoch": 17.323491655969192, + "grad_norm": 0.9138684868812561, + "learning_rate": 2.756140350877193e-05, + "loss": 0.4346, + "step": 13495 + }, + { + "epoch": 17.324775353016687, + "grad_norm": 1.6968457698822021, + "learning_rate": 2.75609756097561e-05, + "loss": 0.4518, + "step": 13496 + }, + { + "epoch": 17.326059050064185, + "grad_norm": 1.0135753154754639, + "learning_rate": 2.7560547710740264e-05, + "loss": 0.4517, + "step": 13497 + }, + { + "epoch": 17.327342747111683, + 
"grad_norm": 1.6712062358856201, + "learning_rate": 2.7560119811724436e-05, + "loss": 0.4584, + "step": 13498 + }, + { + "epoch": 17.328626444159177, + "grad_norm": 1.0407201051712036, + "learning_rate": 2.75596919127086e-05, + "loss": 0.4567, + "step": 13499 + }, + { + "epoch": 17.329910141206675, + "grad_norm": 3.196026563644409, + "learning_rate": 2.755926401369277e-05, + "loss": 0.491, + "step": 13500 + }, + { + "epoch": 17.331193838254173, + "grad_norm": 2.1205217838287354, + "learning_rate": 2.7558836114676938e-05, + "loss": 0.4396, + "step": 13501 + }, + { + "epoch": 17.33247753530167, + "grad_norm": 1.960193395614624, + "learning_rate": 2.7558408215661103e-05, + "loss": 0.4516, + "step": 13502 + }, + { + "epoch": 17.333761232349165, + "grad_norm": 1.2584179639816284, + "learning_rate": 2.7557980316645274e-05, + "loss": 0.4958, + "step": 13503 + }, + { + "epoch": 17.335044929396663, + "grad_norm": 1.647688865661621, + "learning_rate": 2.755755241762944e-05, + "loss": 0.4512, + "step": 13504 + }, + { + "epoch": 17.33632862644416, + "grad_norm": 1.0450900793075562, + "learning_rate": 2.7557124518613608e-05, + "loss": 0.4546, + "step": 13505 + }, + { + "epoch": 17.337612323491655, + "grad_norm": 0.9915867447853088, + "learning_rate": 2.7556696619597776e-05, + "loss": 0.4674, + "step": 13506 + }, + { + "epoch": 17.338896020539153, + "grad_norm": 1.1876263618469238, + "learning_rate": 2.7556268720581945e-05, + "loss": 0.4724, + "step": 13507 + }, + { + "epoch": 17.34017971758665, + "grad_norm": 1.2177892923355103, + "learning_rate": 2.755584082156611e-05, + "loss": 0.4366, + "step": 13508 + }, + { + "epoch": 17.341463414634145, + "grad_norm": 1.3570711612701416, + "learning_rate": 2.7555412922550278e-05, + "loss": 0.4783, + "step": 13509 + }, + { + "epoch": 17.342747111681643, + "grad_norm": 0.85418701171875, + "learning_rate": 2.7554985023534446e-05, + "loss": 0.4743, + "step": 13510 + }, + { + "epoch": 17.34403080872914, + "grad_norm": 1.0568302869796753, + 
"learning_rate": 2.7554557124518615e-05, + "loss": 0.4432, + "step": 13511 + }, + { + "epoch": 17.345314505776635, + "grad_norm": 2.989105701446533, + "learning_rate": 2.7554129225502783e-05, + "loss": 0.4417, + "step": 13512 + }, + { + "epoch": 17.346598202824133, + "grad_norm": 1.208145022392273, + "learning_rate": 2.755370132648695e-05, + "loss": 0.4606, + "step": 13513 + }, + { + "epoch": 17.34788189987163, + "grad_norm": 0.9729220271110535, + "learning_rate": 2.755327342747112e-05, + "loss": 0.4801, + "step": 13514 + }, + { + "epoch": 17.34916559691913, + "grad_norm": 0.8804508447647095, + "learning_rate": 2.7552845528455285e-05, + "loss": 0.521, + "step": 13515 + }, + { + "epoch": 17.350449293966623, + "grad_norm": 0.7922667860984802, + "learning_rate": 2.755241762943945e-05, + "loss": 0.4872, + "step": 13516 + }, + { + "epoch": 17.35173299101412, + "grad_norm": 1.0368080139160156, + "learning_rate": 2.7551989730423622e-05, + "loss": 0.3984, + "step": 13517 + }, + { + "epoch": 17.35301668806162, + "grad_norm": 1.1198766231536865, + "learning_rate": 2.7551561831407787e-05, + "loss": 0.4916, + "step": 13518 + }, + { + "epoch": 17.354300385109113, + "grad_norm": 0.8527964353561401, + "learning_rate": 2.755113393239196e-05, + "loss": 0.4791, + "step": 13519 + }, + { + "epoch": 17.35558408215661, + "grad_norm": 2.8415443897247314, + "learning_rate": 2.7550706033376124e-05, + "loss": 0.4543, + "step": 13520 + }, + { + "epoch": 17.35686777920411, + "grad_norm": 1.0724722146987915, + "learning_rate": 2.7550278134360292e-05, + "loss": 0.4638, + "step": 13521 + }, + { + "epoch": 17.358151476251603, + "grad_norm": 3.2364747524261475, + "learning_rate": 2.754985023534446e-05, + "loss": 0.4754, + "step": 13522 + }, + { + "epoch": 17.3594351732991, + "grad_norm": 1.1388212442398071, + "learning_rate": 2.7549422336328626e-05, + "loss": 0.4901, + "step": 13523 + }, + { + "epoch": 17.3607188703466, + "grad_norm": 1.6827244758605957, + "learning_rate": 2.7548994437312794e-05, 
+ "loss": 0.4798, + "step": 13524 + }, + { + "epoch": 17.362002567394097, + "grad_norm": 2.0279898643493652, + "learning_rate": 2.7548566538296962e-05, + "loss": 0.4576, + "step": 13525 + }, + { + "epoch": 17.36328626444159, + "grad_norm": 1.4536192417144775, + "learning_rate": 2.754813863928113e-05, + "loss": 0.5347, + "step": 13526 + }, + { + "epoch": 17.36456996148909, + "grad_norm": 1.0703943967819214, + "learning_rate": 2.75477107402653e-05, + "loss": 0.4333, + "step": 13527 + }, + { + "epoch": 17.365853658536587, + "grad_norm": 2.3870866298675537, + "learning_rate": 2.7547282841249464e-05, + "loss": 0.4935, + "step": 13528 + }, + { + "epoch": 17.36713735558408, + "grad_norm": 1.6938989162445068, + "learning_rate": 2.7546854942233633e-05, + "loss": 0.5506, + "step": 13529 + }, + { + "epoch": 17.36842105263158, + "grad_norm": 1.3856960535049438, + "learning_rate": 2.75464270432178e-05, + "loss": 0.4965, + "step": 13530 + }, + { + "epoch": 17.369704749679077, + "grad_norm": 1.6241434812545776, + "learning_rate": 2.754599914420197e-05, + "loss": 0.4842, + "step": 13531 + }, + { + "epoch": 17.37098844672657, + "grad_norm": 1.6114338636398315, + "learning_rate": 2.7545571245186135e-05, + "loss": 0.5176, + "step": 13532 + }, + { + "epoch": 17.37227214377407, + "grad_norm": 1.0783095359802246, + "learning_rate": 2.7545143346170306e-05, + "loss": 0.5504, + "step": 13533 + }, + { + "epoch": 17.373555840821567, + "grad_norm": 1.6839890480041504, + "learning_rate": 2.754471544715447e-05, + "loss": 0.5541, + "step": 13534 + }, + { + "epoch": 17.374839537869065, + "grad_norm": 2.7539587020874023, + "learning_rate": 2.754428754813864e-05, + "loss": 0.5212, + "step": 13535 + }, + { + "epoch": 17.37612323491656, + "grad_norm": 2.620978355407715, + "learning_rate": 2.7543859649122808e-05, + "loss": 0.5124, + "step": 13536 + }, + { + "epoch": 17.377406931964057, + "grad_norm": 1.7070465087890625, + "learning_rate": 2.7543431750106973e-05, + "loss": 0.5459, + "step": 13537 + }, 
+ { + "epoch": 17.378690629011555, + "grad_norm": 2.570535659790039, + "learning_rate": 2.7543003851091145e-05, + "loss": 0.5718, + "step": 13538 + }, + { + "epoch": 17.37997432605905, + "grad_norm": 3.3046233654022217, + "learning_rate": 2.754257595207531e-05, + "loss": 0.5506, + "step": 13539 + }, + { + "epoch": 17.381258023106547, + "grad_norm": 1.4218559265136719, + "learning_rate": 2.754214805305948e-05, + "loss": 0.4851, + "step": 13540 + }, + { + "epoch": 17.382541720154045, + "grad_norm": 3.879267692565918, + "learning_rate": 2.7541720154043647e-05, + "loss": 0.6144, + "step": 13541 + }, + { + "epoch": 17.38382541720154, + "grad_norm": 1.7052009105682373, + "learning_rate": 2.7541292255027812e-05, + "loss": 0.5926, + "step": 13542 + }, + { + "epoch": 17.385109114249037, + "grad_norm": 3.6638617515563965, + "learning_rate": 2.7540864356011984e-05, + "loss": 0.7307, + "step": 13543 + }, + { + "epoch": 17.386392811296535, + "grad_norm": 1.1310977935791016, + "learning_rate": 2.754043645699615e-05, + "loss": 0.4456, + "step": 13544 + }, + { + "epoch": 17.387676508344033, + "grad_norm": 1.4104771614074707, + "learning_rate": 2.7540008557980317e-05, + "loss": 0.4434, + "step": 13545 + }, + { + "epoch": 17.388960205391527, + "grad_norm": 1.9382859468460083, + "learning_rate": 2.7539580658964486e-05, + "loss": 0.451, + "step": 13546 + }, + { + "epoch": 17.390243902439025, + "grad_norm": 1.6866226196289062, + "learning_rate": 2.7539152759948654e-05, + "loss": 0.4939, + "step": 13547 + }, + { + "epoch": 17.391527599486523, + "grad_norm": 0.851138174533844, + "learning_rate": 2.753872486093282e-05, + "loss": 0.4784, + "step": 13548 + }, + { + "epoch": 17.392811296534017, + "grad_norm": 1.6036655902862549, + "learning_rate": 2.7538296961916987e-05, + "loss": 0.4467, + "step": 13549 + }, + { + "epoch": 17.394094993581515, + "grad_norm": 0.9772493839263916, + "learning_rate": 2.7537869062901156e-05, + "loss": 0.4653, + "step": 13550 + }, + { + "epoch": 
17.395378690629013, + "grad_norm": 1.8256436586380005, + "learning_rate": 2.7537441163885324e-05, + "loss": 0.4564, + "step": 13551 + }, + { + "epoch": 17.396662387676507, + "grad_norm": 1.630332350730896, + "learning_rate": 2.7537013264869493e-05, + "loss": 0.424, + "step": 13552 + }, + { + "epoch": 17.397946084724005, + "grad_norm": 1.1356781721115112, + "learning_rate": 2.7536585365853658e-05, + "loss": 0.4731, + "step": 13553 + }, + { + "epoch": 17.399229781771503, + "grad_norm": 0.8361197710037231, + "learning_rate": 2.753615746683783e-05, + "loss": 0.4526, + "step": 13554 + }, + { + "epoch": 17.400513478818997, + "grad_norm": 2.5914132595062256, + "learning_rate": 2.7535729567821995e-05, + "loss": 0.5044, + "step": 13555 + }, + { + "epoch": 17.401797175866495, + "grad_norm": 1.8496699333190918, + "learning_rate": 2.753530166880616e-05, + "loss": 0.4517, + "step": 13556 + }, + { + "epoch": 17.403080872913993, + "grad_norm": 2.09220027923584, + "learning_rate": 2.753487376979033e-05, + "loss": 0.4996, + "step": 13557 + }, + { + "epoch": 17.40436456996149, + "grad_norm": 1.2636253833770752, + "learning_rate": 2.7534445870774496e-05, + "loss": 0.4852, + "step": 13558 + }, + { + "epoch": 17.405648267008985, + "grad_norm": 1.1465457677841187, + "learning_rate": 2.7534017971758668e-05, + "loss": 0.462, + "step": 13559 + }, + { + "epoch": 17.406931964056483, + "grad_norm": 1.544594645500183, + "learning_rate": 2.7533590072742833e-05, + "loss": 0.4388, + "step": 13560 + }, + { + "epoch": 17.40821566110398, + "grad_norm": 1.9715150594711304, + "learning_rate": 2.7533162173727e-05, + "loss": 0.4847, + "step": 13561 + }, + { + "epoch": 17.409499358151475, + "grad_norm": 1.5274707078933716, + "learning_rate": 2.753273427471117e-05, + "loss": 0.4278, + "step": 13562 + }, + { + "epoch": 17.410783055198973, + "grad_norm": 1.4380384683609009, + "learning_rate": 2.7532306375695335e-05, + "loss": 0.4711, + "step": 13563 + }, + { + "epoch": 17.41206675224647, + "grad_norm": 
1.0152382850646973, + "learning_rate": 2.7531878476679503e-05, + "loss": 0.4785, + "step": 13564 + }, + { + "epoch": 17.413350449293965, + "grad_norm": 1.148857593536377, + "learning_rate": 2.7531450577663672e-05, + "loss": 0.4407, + "step": 13565 + }, + { + "epoch": 17.414634146341463, + "grad_norm": 1.268319845199585, + "learning_rate": 2.753102267864784e-05, + "loss": 0.4492, + "step": 13566 + }, + { + "epoch": 17.41591784338896, + "grad_norm": 0.9442114233970642, + "learning_rate": 2.753059477963201e-05, + "loss": 0.4686, + "step": 13567 + }, + { + "epoch": 17.41720154043646, + "grad_norm": 1.4160559177398682, + "learning_rate": 2.7530166880616177e-05, + "loss": 0.4821, + "step": 13568 + }, + { + "epoch": 17.418485237483953, + "grad_norm": 1.786865234375, + "learning_rate": 2.7529738981600342e-05, + "loss": 0.5156, + "step": 13569 + }, + { + "epoch": 17.41976893453145, + "grad_norm": 2.45957612991333, + "learning_rate": 2.752931108258451e-05, + "loss": 0.451, + "step": 13570 + }, + { + "epoch": 17.42105263157895, + "grad_norm": 1.3164196014404297, + "learning_rate": 2.752888318356868e-05, + "loss": 0.4486, + "step": 13571 + }, + { + "epoch": 17.422336328626443, + "grad_norm": 1.0322108268737793, + "learning_rate": 2.7528455284552844e-05, + "loss": 0.4907, + "step": 13572 + }, + { + "epoch": 17.42362002567394, + "grad_norm": 1.0110431909561157, + "learning_rate": 2.7528027385537016e-05, + "loss": 0.5021, + "step": 13573 + }, + { + "epoch": 17.42490372272144, + "grad_norm": 2.0036580562591553, + "learning_rate": 2.752759948652118e-05, + "loss": 0.4891, + "step": 13574 + }, + { + "epoch": 17.426187419768933, + "grad_norm": 1.2214761972427368, + "learning_rate": 2.7527171587505353e-05, + "loss": 0.4952, + "step": 13575 + }, + { + "epoch": 17.42747111681643, + "grad_norm": 0.989580512046814, + "learning_rate": 2.7526743688489518e-05, + "loss": 0.4602, + "step": 13576 + }, + { + "epoch": 17.42875481386393, + "grad_norm": 0.9593207240104675, + "learning_rate": 
2.7526315789473683e-05, + "loss": 0.4671, + "step": 13577 + }, + { + "epoch": 17.430038510911427, + "grad_norm": 1.4142147302627563, + "learning_rate": 2.7525887890457854e-05, + "loss": 0.5372, + "step": 13578 + }, + { + "epoch": 17.43132220795892, + "grad_norm": 0.9189007878303528, + "learning_rate": 2.752545999144202e-05, + "loss": 0.472, + "step": 13579 + }, + { + "epoch": 17.43260590500642, + "grad_norm": 3.973788022994995, + "learning_rate": 2.7525032092426188e-05, + "loss": 0.4704, + "step": 13580 + }, + { + "epoch": 17.433889602053917, + "grad_norm": 1.0307908058166504, + "learning_rate": 2.7524604193410356e-05, + "loss": 0.5079, + "step": 13581 + }, + { + "epoch": 17.43517329910141, + "grad_norm": 1.304336667060852, + "learning_rate": 2.7524176294394525e-05, + "loss": 0.459, + "step": 13582 + }, + { + "epoch": 17.43645699614891, + "grad_norm": 2.0819315910339355, + "learning_rate": 2.7523748395378693e-05, + "loss": 0.4661, + "step": 13583 + }, + { + "epoch": 17.437740693196407, + "grad_norm": 6.270044326782227, + "learning_rate": 2.7523320496362858e-05, + "loss": 0.5114, + "step": 13584 + }, + { + "epoch": 17.4390243902439, + "grad_norm": 1.6664880514144897, + "learning_rate": 2.7522892597347027e-05, + "loss": 0.573, + "step": 13585 + }, + { + "epoch": 17.4403080872914, + "grad_norm": 1.4888073205947876, + "learning_rate": 2.7522464698331195e-05, + "loss": 0.5152, + "step": 13586 + }, + { + "epoch": 17.441591784338897, + "grad_norm": 1.3684046268463135, + "learning_rate": 2.7522036799315363e-05, + "loss": 0.5109, + "step": 13587 + }, + { + "epoch": 17.44287548138639, + "grad_norm": 2.504375696182251, + "learning_rate": 2.752160890029953e-05, + "loss": 0.5026, + "step": 13588 + }, + { + "epoch": 17.44415917843389, + "grad_norm": 1.405259370803833, + "learning_rate": 2.7521181001283697e-05, + "loss": 0.506, + "step": 13589 + }, + { + "epoch": 17.445442875481387, + "grad_norm": 2.9696390628814697, + "learning_rate": 2.7520753102267865e-05, + "loss": 0.5642, + 
"step": 13590 + }, + { + "epoch": 17.446726572528885, + "grad_norm": 2.71905779838562, + "learning_rate": 2.7520325203252034e-05, + "loss": 0.488, + "step": 13591 + }, + { + "epoch": 17.44801026957638, + "grad_norm": 1.7896808385849, + "learning_rate": 2.7519897304236202e-05, + "loss": 0.5513, + "step": 13592 + }, + { + "epoch": 17.449293966623877, + "grad_norm": 2.095590114593506, + "learning_rate": 2.7519469405220367e-05, + "loss": 0.7008, + "step": 13593 + }, + { + "epoch": 17.450577663671375, + "grad_norm": 2.3661985397338867, + "learning_rate": 2.751904150620454e-05, + "loss": 0.4141, + "step": 13594 + }, + { + "epoch": 17.45186136071887, + "grad_norm": 1.1186405420303345, + "learning_rate": 2.7518613607188704e-05, + "loss": 0.4641, + "step": 13595 + }, + { + "epoch": 17.453145057766367, + "grad_norm": 0.8939502835273743, + "learning_rate": 2.751818570817287e-05, + "loss": 0.4592, + "step": 13596 + }, + { + "epoch": 17.454428754813865, + "grad_norm": 1.201045036315918, + "learning_rate": 2.751775780915704e-05, + "loss": 0.4878, + "step": 13597 + }, + { + "epoch": 17.45571245186136, + "grad_norm": 0.8458627462387085, + "learning_rate": 2.7517329910141206e-05, + "loss": 0.4689, + "step": 13598 + }, + { + "epoch": 17.456996148908857, + "grad_norm": 2.915555953979492, + "learning_rate": 2.7516902011125378e-05, + "loss": 0.4488, + "step": 13599 + }, + { + "epoch": 17.458279845956355, + "grad_norm": 1.5912926197052002, + "learning_rate": 2.7516474112109543e-05, + "loss": 0.4402, + "step": 13600 + }, + { + "epoch": 17.459563543003853, + "grad_norm": 1.254011869430542, + "learning_rate": 2.751604621309371e-05, + "loss": 0.4968, + "step": 13601 + }, + { + "epoch": 17.460847240051347, + "grad_norm": 1.258847713470459, + "learning_rate": 2.751561831407788e-05, + "loss": 0.474, + "step": 13602 + }, + { + "epoch": 17.462130937098845, + "grad_norm": 2.4518885612487793, + "learning_rate": 2.7515190415062044e-05, + "loss": 0.49, + "step": 13603 + }, + { + "epoch": 
17.463414634146343, + "grad_norm": 0.8643206357955933, + "learning_rate": 2.7514762516046213e-05, + "loss": 0.497, + "step": 13604 + }, + { + "epoch": 17.464698331193837, + "grad_norm": 1.3601945638656616, + "learning_rate": 2.751433461703038e-05, + "loss": 0.4519, + "step": 13605 + }, + { + "epoch": 17.465982028241335, + "grad_norm": 1.0155000686645508, + "learning_rate": 2.751390671801455e-05, + "loss": 0.4714, + "step": 13606 + }, + { + "epoch": 17.467265725288833, + "grad_norm": 1.2623569965362549, + "learning_rate": 2.7513478818998718e-05, + "loss": 0.4583, + "step": 13607 + }, + { + "epoch": 17.468549422336327, + "grad_norm": 0.8011582493782043, + "learning_rate": 2.7513050919982886e-05, + "loss": 0.476, + "step": 13608 + }, + { + "epoch": 17.469833119383825, + "grad_norm": 1.327239751815796, + "learning_rate": 2.751262302096705e-05, + "loss": 0.4698, + "step": 13609 + }, + { + "epoch": 17.471116816431323, + "grad_norm": 1.2926456928253174, + "learning_rate": 2.751219512195122e-05, + "loss": 0.4619, + "step": 13610 + }, + { + "epoch": 17.47240051347882, + "grad_norm": 1.5942108631134033, + "learning_rate": 2.7511767222935388e-05, + "loss": 0.4855, + "step": 13611 + }, + { + "epoch": 17.473684210526315, + "grad_norm": 1.1445881128311157, + "learning_rate": 2.7511339323919553e-05, + "loss": 0.4405, + "step": 13612 + }, + { + "epoch": 17.474967907573813, + "grad_norm": 3.7884111404418945, + "learning_rate": 2.7510911424903725e-05, + "loss": 0.4729, + "step": 13613 + }, + { + "epoch": 17.47625160462131, + "grad_norm": 2.8387959003448486, + "learning_rate": 2.751048352588789e-05, + "loss": 0.4802, + "step": 13614 + }, + { + "epoch": 17.477535301668805, + "grad_norm": 1.980439305305481, + "learning_rate": 2.7510055626872062e-05, + "loss": 0.4449, + "step": 13615 + }, + { + "epoch": 17.478818998716303, + "grad_norm": 1.2214897871017456, + "learning_rate": 2.7509627727856227e-05, + "loss": 0.499, + "step": 13616 + }, + { + "epoch": 17.4801026957638, + "grad_norm": 
1.4525556564331055, + "learning_rate": 2.7509199828840392e-05, + "loss": 0.4473, + "step": 13617 + }, + { + "epoch": 17.481386392811295, + "grad_norm": 8.483610153198242, + "learning_rate": 2.7508771929824564e-05, + "loss": 0.4597, + "step": 13618 + }, + { + "epoch": 17.482670089858793, + "grad_norm": 1.0280908346176147, + "learning_rate": 2.750834403080873e-05, + "loss": 0.4559, + "step": 13619 + }, + { + "epoch": 17.48395378690629, + "grad_norm": 2.7385427951812744, + "learning_rate": 2.7507916131792897e-05, + "loss": 0.4729, + "step": 13620 + }, + { + "epoch": 17.485237483953785, + "grad_norm": 1.3820842504501343, + "learning_rate": 2.7507488232777066e-05, + "loss": 0.5035, + "step": 13621 + }, + { + "epoch": 17.486521181001283, + "grad_norm": 1.181979775428772, + "learning_rate": 2.7507060333761234e-05, + "loss": 0.5076, + "step": 13622 + }, + { + "epoch": 17.48780487804878, + "grad_norm": 0.958216667175293, + "learning_rate": 2.7506632434745402e-05, + "loss": 0.4938, + "step": 13623 + }, + { + "epoch": 17.48908857509628, + "grad_norm": 1.3753477334976196, + "learning_rate": 2.7506204535729567e-05, + "loss": 0.4579, + "step": 13624 + }, + { + "epoch": 17.490372272143773, + "grad_norm": 1.6615948677062988, + "learning_rate": 2.7505776636713736e-05, + "loss": 0.4577, + "step": 13625 + }, + { + "epoch": 17.49165596919127, + "grad_norm": 2.3168575763702393, + "learning_rate": 2.7505348737697904e-05, + "loss": 0.5112, + "step": 13626 + }, + { + "epoch": 17.49293966623877, + "grad_norm": 1.3026684522628784, + "learning_rate": 2.7504920838682073e-05, + "loss": 0.4904, + "step": 13627 + }, + { + "epoch": 17.494223363286263, + "grad_norm": 1.2206956148147583, + "learning_rate": 2.7504492939666238e-05, + "loss": 0.4687, + "step": 13628 + }, + { + "epoch": 17.49550706033376, + "grad_norm": 1.3242899179458618, + "learning_rate": 2.750406504065041e-05, + "loss": 0.4815, + "step": 13629 + }, + { + "epoch": 17.49679075738126, + "grad_norm": 0.7446723580360413, + 
"learning_rate": 2.7503637141634575e-05, + "loss": 0.4677, + "step": 13630 + }, + { + "epoch": 17.498074454428753, + "grad_norm": 1.3463947772979736, + "learning_rate": 2.7503209242618743e-05, + "loss": 0.4454, + "step": 13631 + }, + { + "epoch": 17.49935815147625, + "grad_norm": 2.5037059783935547, + "learning_rate": 2.750278134360291e-05, + "loss": 0.4873, + "step": 13632 + }, + { + "epoch": 17.50064184852375, + "grad_norm": 1.2618346214294434, + "learning_rate": 2.7502353444587076e-05, + "loss": 0.4527, + "step": 13633 + }, + { + "epoch": 17.501925545571247, + "grad_norm": 2.4681482315063477, + "learning_rate": 2.7501925545571248e-05, + "loss": 0.485, + "step": 13634 + }, + { + "epoch": 17.50320924261874, + "grad_norm": 4.194278240203857, + "learning_rate": 2.7501497646555413e-05, + "loss": 0.5026, + "step": 13635 + }, + { + "epoch": 17.50449293966624, + "grad_norm": 2.2517497539520264, + "learning_rate": 2.750106974753958e-05, + "loss": 0.5094, + "step": 13636 + }, + { + "epoch": 17.505776636713737, + "grad_norm": 2.493192672729492, + "learning_rate": 2.750064184852375e-05, + "loss": 0.4553, + "step": 13637 + }, + { + "epoch": 17.50706033376123, + "grad_norm": 1.1311379671096802, + "learning_rate": 2.7500213949507915e-05, + "loss": 0.5029, + "step": 13638 + }, + { + "epoch": 17.50834403080873, + "grad_norm": 3.918950319290161, + "learning_rate": 2.7499786050492087e-05, + "loss": 0.5882, + "step": 13639 + }, + { + "epoch": 17.509627727856227, + "grad_norm": 2.370950698852539, + "learning_rate": 2.7499358151476252e-05, + "loss": 0.5875, + "step": 13640 + }, + { + "epoch": 17.51091142490372, + "grad_norm": 2.3168601989746094, + "learning_rate": 2.749893025246042e-05, + "loss": 0.5693, + "step": 13641 + }, + { + "epoch": 17.51219512195122, + "grad_norm": 2.066541910171509, + "learning_rate": 2.749850235344459e-05, + "loss": 0.6076, + "step": 13642 + }, + { + "epoch": 17.513478818998717, + "grad_norm": 3.3511974811553955, + "learning_rate": 2.7498074454428757e-05, + 
"loss": 0.6568, + "step": 13643 + }, + { + "epoch": 17.514762516046215, + "grad_norm": 2.3807461261749268, + "learning_rate": 2.7497646555412922e-05, + "loss": 0.4997, + "step": 13644 + }, + { + "epoch": 17.51604621309371, + "grad_norm": 1.1952723264694214, + "learning_rate": 2.749721865639709e-05, + "loss": 0.4198, + "step": 13645 + }, + { + "epoch": 17.517329910141207, + "grad_norm": 1.476741075515747, + "learning_rate": 2.749679075738126e-05, + "loss": 0.5093, + "step": 13646 + }, + { + "epoch": 17.518613607188705, + "grad_norm": 1.8165956735610962, + "learning_rate": 2.7496362858365427e-05, + "loss": 0.4777, + "step": 13647 + }, + { + "epoch": 17.5198973042362, + "grad_norm": 1.1379761695861816, + "learning_rate": 2.7495934959349596e-05, + "loss": 0.4346, + "step": 13648 + }, + { + "epoch": 17.521181001283697, + "grad_norm": 0.8696683049201965, + "learning_rate": 2.749550706033376e-05, + "loss": 0.4182, + "step": 13649 + }, + { + "epoch": 17.522464698331195, + "grad_norm": 1.212331771850586, + "learning_rate": 2.749507916131793e-05, + "loss": 0.458, + "step": 13650 + }, + { + "epoch": 17.52374839537869, + "grad_norm": 2.168452262878418, + "learning_rate": 2.7494651262302098e-05, + "loss": 0.4787, + "step": 13651 + }, + { + "epoch": 17.525032092426187, + "grad_norm": 1.2816499471664429, + "learning_rate": 2.7494223363286263e-05, + "loss": 0.4351, + "step": 13652 + }, + { + "epoch": 17.526315789473685, + "grad_norm": 4.1006855964660645, + "learning_rate": 2.7493795464270434e-05, + "loss": 0.4654, + "step": 13653 + }, + { + "epoch": 17.527599486521183, + "grad_norm": 1.4501874446868896, + "learning_rate": 2.74933675652546e-05, + "loss": 0.4671, + "step": 13654 + }, + { + "epoch": 17.528883183568677, + "grad_norm": 1.0063540935516357, + "learning_rate": 2.749293966623877e-05, + "loss": 0.4754, + "step": 13655 + }, + { + "epoch": 17.530166880616175, + "grad_norm": 2.1179115772247314, + "learning_rate": 2.7492511767222936e-05, + "loss": 0.5018, + "step": 13656 + }, + 
{ + "epoch": 17.531450577663673, + "grad_norm": 1.4922643899917603, + "learning_rate": 2.74920838682071e-05, + "loss": 0.4417, + "step": 13657 + }, + { + "epoch": 17.532734274711167, + "grad_norm": 1.002713918685913, + "learning_rate": 2.7491655969191273e-05, + "loss": 0.4367, + "step": 13658 + }, + { + "epoch": 17.534017971758665, + "grad_norm": 3.507300853729248, + "learning_rate": 2.7491228070175438e-05, + "loss": 0.4755, + "step": 13659 + }, + { + "epoch": 17.535301668806163, + "grad_norm": 1.727128505706787, + "learning_rate": 2.7490800171159607e-05, + "loss": 0.4703, + "step": 13660 + }, + { + "epoch": 17.536585365853657, + "grad_norm": 1.1151026487350464, + "learning_rate": 2.7490372272143775e-05, + "loss": 0.4419, + "step": 13661 + }, + { + "epoch": 17.537869062901155, + "grad_norm": 0.9819729328155518, + "learning_rate": 2.7489944373127943e-05, + "loss": 0.5317, + "step": 13662 + }, + { + "epoch": 17.539152759948653, + "grad_norm": 1.4029101133346558, + "learning_rate": 2.7489516474112112e-05, + "loss": 0.4718, + "step": 13663 + }, + { + "epoch": 17.540436456996147, + "grad_norm": 1.027931809425354, + "learning_rate": 2.7489088575096277e-05, + "loss": 0.503, + "step": 13664 + }, + { + "epoch": 17.541720154043645, + "grad_norm": 3.328137159347534, + "learning_rate": 2.7488660676080445e-05, + "loss": 0.471, + "step": 13665 + }, + { + "epoch": 17.543003851091143, + "grad_norm": 4.899099349975586, + "learning_rate": 2.7488232777064614e-05, + "loss": 0.4857, + "step": 13666 + }, + { + "epoch": 17.54428754813864, + "grad_norm": 1.5394313335418701, + "learning_rate": 2.7487804878048782e-05, + "loss": 0.5106, + "step": 13667 + }, + { + "epoch": 17.545571245186135, + "grad_norm": 1.2160491943359375, + "learning_rate": 2.7487376979032947e-05, + "loss": 0.5552, + "step": 13668 + }, + { + "epoch": 17.546854942233633, + "grad_norm": 0.8185883164405823, + "learning_rate": 2.748694908001712e-05, + "loss": 0.4594, + "step": 13669 + }, + { + "epoch": 17.54813863928113, + 
"grad_norm": 0.8497800230979919, + "learning_rate": 2.7486521181001284e-05, + "loss": 0.4978, + "step": 13670 + }, + { + "epoch": 17.549422336328625, + "grad_norm": 1.6196469068527222, + "learning_rate": 2.7486093281985452e-05, + "loss": 0.4713, + "step": 13671 + }, + { + "epoch": 17.550706033376123, + "grad_norm": 1.6506593227386475, + "learning_rate": 2.748566538296962e-05, + "loss": 0.4539, + "step": 13672 + }, + { + "epoch": 17.55198973042362, + "grad_norm": 0.8799092173576355, + "learning_rate": 2.7485237483953786e-05, + "loss": 0.486, + "step": 13673 + }, + { + "epoch": 17.553273427471115, + "grad_norm": 2.043468952178955, + "learning_rate": 2.7484809584937958e-05, + "loss": 0.483, + "step": 13674 + }, + { + "epoch": 17.554557124518613, + "grad_norm": 2.2301137447357178, + "learning_rate": 2.7484381685922123e-05, + "loss": 0.4601, + "step": 13675 + }, + { + "epoch": 17.55584082156611, + "grad_norm": 0.9073394536972046, + "learning_rate": 2.748395378690629e-05, + "loss": 0.465, + "step": 13676 + }, + { + "epoch": 17.55712451861361, + "grad_norm": 1.3370306491851807, + "learning_rate": 2.748352588789046e-05, + "loss": 0.5356, + "step": 13677 + }, + { + "epoch": 17.558408215661103, + "grad_norm": 8.09884262084961, + "learning_rate": 2.7483097988874624e-05, + "loss": 0.4512, + "step": 13678 + }, + { + "epoch": 17.5596919127086, + "grad_norm": 1.1516398191452026, + "learning_rate": 2.7482670089858793e-05, + "loss": 0.5406, + "step": 13679 + }, + { + "epoch": 17.5609756097561, + "grad_norm": 0.9051611423492432, + "learning_rate": 2.748224219084296e-05, + "loss": 0.4328, + "step": 13680 + }, + { + "epoch": 17.562259306803593, + "grad_norm": 2.252504348754883, + "learning_rate": 2.748181429182713e-05, + "loss": 0.5133, + "step": 13681 + }, + { + "epoch": 17.56354300385109, + "grad_norm": 2.3134422302246094, + "learning_rate": 2.7481386392811298e-05, + "loss": 0.505, + "step": 13682 + }, + { + "epoch": 17.56482670089859, + "grad_norm": 1.1598366498947144, + 
"learning_rate": 2.7480958493795467e-05, + "loss": 0.4862, + "step": 13683 + }, + { + "epoch": 17.566110397946083, + "grad_norm": 1.3535982370376587, + "learning_rate": 2.748053059477963e-05, + "loss": 0.4691, + "step": 13684 + }, + { + "epoch": 17.56739409499358, + "grad_norm": 1.5321519374847412, + "learning_rate": 2.74801026957638e-05, + "loss": 0.5106, + "step": 13685 + }, + { + "epoch": 17.56867779204108, + "grad_norm": 1.3425161838531494, + "learning_rate": 2.747967479674797e-05, + "loss": 0.5351, + "step": 13686 + }, + { + "epoch": 17.569961489088577, + "grad_norm": 1.1072916984558105, + "learning_rate": 2.7479246897732133e-05, + "loss": 0.4982, + "step": 13687 + }, + { + "epoch": 17.57124518613607, + "grad_norm": 1.0768396854400635, + "learning_rate": 2.7478818998716305e-05, + "loss": 0.5049, + "step": 13688 + }, + { + "epoch": 17.57252888318357, + "grad_norm": 1.5109572410583496, + "learning_rate": 2.747839109970047e-05, + "loss": 0.5387, + "step": 13689 + }, + { + "epoch": 17.573812580231067, + "grad_norm": 2.1115822792053223, + "learning_rate": 2.7477963200684642e-05, + "loss": 0.5654, + "step": 13690 + }, + { + "epoch": 17.57509627727856, + "grad_norm": 2.190441846847534, + "learning_rate": 2.7477535301668807e-05, + "loss": 0.5304, + "step": 13691 + }, + { + "epoch": 17.57637997432606, + "grad_norm": 15.889466285705566, + "learning_rate": 2.7477107402652972e-05, + "loss": 0.5596, + "step": 13692 + }, + { + "epoch": 17.577663671373557, + "grad_norm": 2.20227313041687, + "learning_rate": 2.7476679503637144e-05, + "loss": 0.746, + "step": 13693 + }, + { + "epoch": 17.57894736842105, + "grad_norm": 1.7805191278457642, + "learning_rate": 2.747625160462131e-05, + "loss": 0.4662, + "step": 13694 + }, + { + "epoch": 17.58023106546855, + "grad_norm": 1.0661247968673706, + "learning_rate": 2.7475823705605477e-05, + "loss": 0.4234, + "step": 13695 + }, + { + "epoch": 17.581514762516047, + "grad_norm": 1.3792617321014404, + "learning_rate": 2.7475395806589646e-05, 
+ "loss": 0.4685, + "step": 13696 + }, + { + "epoch": 17.58279845956354, + "grad_norm": 0.9777414202690125, + "learning_rate": 2.7474967907573814e-05, + "loss": 0.4515, + "step": 13697 + }, + { + "epoch": 17.58408215661104, + "grad_norm": 1.274906873703003, + "learning_rate": 2.7474540008557983e-05, + "loss": 0.4614, + "step": 13698 + }, + { + "epoch": 17.585365853658537, + "grad_norm": 1.333730697631836, + "learning_rate": 2.7474112109542148e-05, + "loss": 0.4885, + "step": 13699 + }, + { + "epoch": 17.586649550706035, + "grad_norm": 0.9584210515022278, + "learning_rate": 2.7473684210526316e-05, + "loss": 0.4636, + "step": 13700 + }, + { + "epoch": 17.58793324775353, + "grad_norm": 1.042577862739563, + "learning_rate": 2.7473256311510484e-05, + "loss": 0.4502, + "step": 13701 + }, + { + "epoch": 17.589216944801027, + "grad_norm": 1.3062725067138672, + "learning_rate": 2.7472828412494653e-05, + "loss": 0.4729, + "step": 13702 + }, + { + "epoch": 17.590500641848525, + "grad_norm": 0.8406639099121094, + "learning_rate": 2.7472400513478818e-05, + "loss": 0.4889, + "step": 13703 + }, + { + "epoch": 17.59178433889602, + "grad_norm": 0.7703270316123962, + "learning_rate": 2.747197261446299e-05, + "loss": 0.459, + "step": 13704 + }, + { + "epoch": 17.593068035943517, + "grad_norm": 0.9136335849761963, + "learning_rate": 2.7471544715447155e-05, + "loss": 0.4798, + "step": 13705 + }, + { + "epoch": 17.594351732991015, + "grad_norm": 1.5535364151000977, + "learning_rate": 2.7471116816431323e-05, + "loss": 0.4566, + "step": 13706 + }, + { + "epoch": 17.59563543003851, + "grad_norm": 0.789312481880188, + "learning_rate": 2.747068891741549e-05, + "loss": 0.4907, + "step": 13707 + }, + { + "epoch": 17.596919127086007, + "grad_norm": 0.8381483554840088, + "learning_rate": 2.7470261018399656e-05, + "loss": 0.4681, + "step": 13708 + }, + { + "epoch": 17.598202824133505, + "grad_norm": 0.8052424788475037, + "learning_rate": 2.7469833119383828e-05, + "loss": 0.508, + "step": 13709 + 
}, + { + "epoch": 17.599486521181003, + "grad_norm": 1.2719383239746094, + "learning_rate": 2.7469405220367993e-05, + "loss": 0.4351, + "step": 13710 + }, + { + "epoch": 17.600770218228497, + "grad_norm": 2.1011087894439697, + "learning_rate": 2.746897732135216e-05, + "loss": 0.4492, + "step": 13711 + }, + { + "epoch": 17.602053915275995, + "grad_norm": 1.518284559249878, + "learning_rate": 2.746854942233633e-05, + "loss": 0.4747, + "step": 13712 + }, + { + "epoch": 17.603337612323493, + "grad_norm": 0.9073631167411804, + "learning_rate": 2.7468121523320495e-05, + "loss": 0.4733, + "step": 13713 + }, + { + "epoch": 17.604621309370987, + "grad_norm": 0.7809440493583679, + "learning_rate": 2.7467693624304667e-05, + "loss": 0.454, + "step": 13714 + }, + { + "epoch": 17.605905006418485, + "grad_norm": 1.2423338890075684, + "learning_rate": 2.7467265725288832e-05, + "loss": 0.4877, + "step": 13715 + }, + { + "epoch": 17.607188703465983, + "grad_norm": 1.654801607131958, + "learning_rate": 2.7466837826273e-05, + "loss": 0.4641, + "step": 13716 + }, + { + "epoch": 17.608472400513477, + "grad_norm": 1.960048794746399, + "learning_rate": 2.746640992725717e-05, + "loss": 0.504, + "step": 13717 + }, + { + "epoch": 17.609756097560975, + "grad_norm": 1.9676430225372314, + "learning_rate": 2.7465982028241334e-05, + "loss": 0.5173, + "step": 13718 + }, + { + "epoch": 17.611039794608473, + "grad_norm": 1.4367141723632812, + "learning_rate": 2.7465554129225502e-05, + "loss": 0.5172, + "step": 13719 + }, + { + "epoch": 17.61232349165597, + "grad_norm": 1.7686930894851685, + "learning_rate": 2.746512623020967e-05, + "loss": 0.4612, + "step": 13720 + }, + { + "epoch": 17.613607188703465, + "grad_norm": 1.3029875755310059, + "learning_rate": 2.746469833119384e-05, + "loss": 0.4582, + "step": 13721 + }, + { + "epoch": 17.614890885750963, + "grad_norm": 1.2701913118362427, + "learning_rate": 2.7464270432178007e-05, + "loss": 0.4783, + "step": 13722 + }, + { + "epoch": 17.61617458279846, 
+ "grad_norm": 5.6775360107421875, + "learning_rate": 2.7463842533162176e-05, + "loss": 0.5131, + "step": 13723 + }, + { + "epoch": 17.617458279845955, + "grad_norm": 1.1967206001281738, + "learning_rate": 2.746341463414634e-05, + "loss": 0.4839, + "step": 13724 + }, + { + "epoch": 17.618741976893453, + "grad_norm": 1.0808534622192383, + "learning_rate": 2.746298673513051e-05, + "loss": 0.4851, + "step": 13725 + }, + { + "epoch": 17.62002567394095, + "grad_norm": 3.6440563201904297, + "learning_rate": 2.7462558836114678e-05, + "loss": 0.4562, + "step": 13726 + }, + { + "epoch": 17.621309370988445, + "grad_norm": 2.562985420227051, + "learning_rate": 2.7462130937098843e-05, + "loss": 0.4939, + "step": 13727 + }, + { + "epoch": 17.622593068035943, + "grad_norm": 2.9353437423706055, + "learning_rate": 2.7461703038083015e-05, + "loss": 0.4987, + "step": 13728 + }, + { + "epoch": 17.62387676508344, + "grad_norm": 1.6569324731826782, + "learning_rate": 2.746127513906718e-05, + "loss": 0.4636, + "step": 13729 + }, + { + "epoch": 17.625160462130935, + "grad_norm": 3.1467533111572266, + "learning_rate": 2.746084724005135e-05, + "loss": 0.5208, + "step": 13730 + }, + { + "epoch": 17.626444159178433, + "grad_norm": 2.2861204147338867, + "learning_rate": 2.7460419341035516e-05, + "loss": 0.5189, + "step": 13731 + }, + { + "epoch": 17.62772785622593, + "grad_norm": 3.6413209438323975, + "learning_rate": 2.745999144201968e-05, + "loss": 0.4791, + "step": 13732 + }, + { + "epoch": 17.62901155327343, + "grad_norm": 0.9664206504821777, + "learning_rate": 2.7459563543003853e-05, + "loss": 0.4955, + "step": 13733 + }, + { + "epoch": 17.630295250320923, + "grad_norm": 1.4246147871017456, + "learning_rate": 2.7459135643988018e-05, + "loss": 0.5214, + "step": 13734 + }, + { + "epoch": 17.63157894736842, + "grad_norm": 1.6338677406311035, + "learning_rate": 2.7458707744972187e-05, + "loss": 0.5283, + "step": 13735 + }, + { + "epoch": 17.63286264441592, + "grad_norm": 2.179529905319214, + 
"learning_rate": 2.7458279845956355e-05, + "loss": 0.4569, + "step": 13736 + }, + { + "epoch": 17.634146341463413, + "grad_norm": 1.714442491531372, + "learning_rate": 2.7457851946940523e-05, + "loss": 0.5363, + "step": 13737 + }, + { + "epoch": 17.63543003851091, + "grad_norm": 2.6775004863739014, + "learning_rate": 2.7457424047924692e-05, + "loss": 0.4907, + "step": 13738 + }, + { + "epoch": 17.63671373555841, + "grad_norm": 2.8497328758239746, + "learning_rate": 2.7456996148908857e-05, + "loss": 0.578, + "step": 13739 + }, + { + "epoch": 17.637997432605903, + "grad_norm": 2.1654529571533203, + "learning_rate": 2.7456568249893025e-05, + "loss": 0.5249, + "step": 13740 + }, + { + "epoch": 17.6392811296534, + "grad_norm": 1.8213740587234497, + "learning_rate": 2.7456140350877194e-05, + "loss": 0.6092, + "step": 13741 + }, + { + "epoch": 17.6405648267009, + "grad_norm": 2.017489194869995, + "learning_rate": 2.7455712451861362e-05, + "loss": 0.5886, + "step": 13742 + }, + { + "epoch": 17.641848523748397, + "grad_norm": 1.726500153541565, + "learning_rate": 2.7455284552845527e-05, + "loss": 0.6251, + "step": 13743 + }, + { + "epoch": 17.64313222079589, + "grad_norm": 0.798902690410614, + "learning_rate": 2.74548566538297e-05, + "loss": 0.4646, + "step": 13744 + }, + { + "epoch": 17.64441591784339, + "grad_norm": 3.2264420986175537, + "learning_rate": 2.7454428754813864e-05, + "loss": 0.4974, + "step": 13745 + }, + { + "epoch": 17.645699614890887, + "grad_norm": 1.0334532260894775, + "learning_rate": 2.7454000855798032e-05, + "loss": 0.4295, + "step": 13746 + }, + { + "epoch": 17.64698331193838, + "grad_norm": 0.8714597821235657, + "learning_rate": 2.74535729567822e-05, + "loss": 0.4867, + "step": 13747 + }, + { + "epoch": 17.64826700898588, + "grad_norm": 0.9426793456077576, + "learning_rate": 2.7453145057766366e-05, + "loss": 0.4506, + "step": 13748 + }, + { + "epoch": 17.649550706033377, + "grad_norm": 1.206488013267517, + "learning_rate": 2.7452717158750538e-05, + 
"loss": 0.4405, + "step": 13749 + }, + { + "epoch": 17.65083440308087, + "grad_norm": 4.771864414215088, + "learning_rate": 2.7452289259734703e-05, + "loss": 0.481, + "step": 13750 + }, + { + "epoch": 17.65211810012837, + "grad_norm": 0.8864704966545105, + "learning_rate": 2.745186136071887e-05, + "loss": 0.4413, + "step": 13751 + }, + { + "epoch": 17.653401797175867, + "grad_norm": 0.7026444673538208, + "learning_rate": 2.745143346170304e-05, + "loss": 0.4816, + "step": 13752 + }, + { + "epoch": 17.654685494223365, + "grad_norm": 2.6293931007385254, + "learning_rate": 2.7451005562687205e-05, + "loss": 0.4641, + "step": 13753 + }, + { + "epoch": 17.65596919127086, + "grad_norm": 2.7429604530334473, + "learning_rate": 2.7450577663671376e-05, + "loss": 0.4795, + "step": 13754 + }, + { + "epoch": 17.657252888318357, + "grad_norm": 1.6431748867034912, + "learning_rate": 2.745014976465554e-05, + "loss": 0.4636, + "step": 13755 + }, + { + "epoch": 17.658536585365855, + "grad_norm": 1.3239655494689941, + "learning_rate": 2.744972186563971e-05, + "loss": 0.4645, + "step": 13756 + }, + { + "epoch": 17.65982028241335, + "grad_norm": 0.9273697733879089, + "learning_rate": 2.7449293966623878e-05, + "loss": 0.4753, + "step": 13757 + }, + { + "epoch": 17.661103979460847, + "grad_norm": 1.7708172798156738, + "learning_rate": 2.7448866067608047e-05, + "loss": 0.4721, + "step": 13758 + }, + { + "epoch": 17.662387676508345, + "grad_norm": 1.568949818611145, + "learning_rate": 2.744843816859221e-05, + "loss": 0.4288, + "step": 13759 + }, + { + "epoch": 17.66367137355584, + "grad_norm": 0.9688519239425659, + "learning_rate": 2.744801026957638e-05, + "loss": 0.4803, + "step": 13760 + }, + { + "epoch": 17.664955070603337, + "grad_norm": 1.6901781558990479, + "learning_rate": 2.744758237056055e-05, + "loss": 0.4305, + "step": 13761 + }, + { + "epoch": 17.666238767650835, + "grad_norm": 2.04766845703125, + "learning_rate": 2.7447154471544717e-05, + "loss": 0.4304, + "step": 13762 + }, + { 
+ "epoch": 17.66752246469833, + "grad_norm": 1.221590280532837, + "learning_rate": 2.7446726572528885e-05, + "loss": 0.5242, + "step": 13763 + }, + { + "epoch": 17.668806161745827, + "grad_norm": 1.0473244190216064, + "learning_rate": 2.744629867351305e-05, + "loss": 0.4413, + "step": 13764 + }, + { + "epoch": 17.670089858793325, + "grad_norm": 1.4886565208435059, + "learning_rate": 2.7445870774497222e-05, + "loss": 0.4309, + "step": 13765 + }, + { + "epoch": 17.671373555840823, + "grad_norm": 1.2448536157608032, + "learning_rate": 2.7445442875481387e-05, + "loss": 0.4757, + "step": 13766 + }, + { + "epoch": 17.672657252888317, + "grad_norm": 6.292624473571777, + "learning_rate": 2.7445014976465552e-05, + "loss": 0.4385, + "step": 13767 + }, + { + "epoch": 17.673940949935815, + "grad_norm": 1.2614871263504028, + "learning_rate": 2.7444587077449724e-05, + "loss": 0.4792, + "step": 13768 + }, + { + "epoch": 17.675224646983313, + "grad_norm": 1.8586785793304443, + "learning_rate": 2.744415917843389e-05, + "loss": 0.4707, + "step": 13769 + }, + { + "epoch": 17.676508344030808, + "grad_norm": 0.9423242807388306, + "learning_rate": 2.744373127941806e-05, + "loss": 0.4574, + "step": 13770 + }, + { + "epoch": 17.677792041078305, + "grad_norm": 1.6952967643737793, + "learning_rate": 2.7443303380402226e-05, + "loss": 0.5345, + "step": 13771 + }, + { + "epoch": 17.679075738125803, + "grad_norm": 0.955788254737854, + "learning_rate": 2.744287548138639e-05, + "loss": 0.4451, + "step": 13772 + }, + { + "epoch": 17.680359435173298, + "grad_norm": 1.43727445602417, + "learning_rate": 2.7442447582370563e-05, + "loss": 0.4886, + "step": 13773 + }, + { + "epoch": 17.681643132220795, + "grad_norm": 1.1907784938812256, + "learning_rate": 2.7442019683354728e-05, + "loss": 0.4909, + "step": 13774 + }, + { + "epoch": 17.682926829268293, + "grad_norm": 1.8236008882522583, + "learning_rate": 2.7441591784338896e-05, + "loss": 0.4901, + "step": 13775 + }, + { + "epoch": 17.68421052631579, + 
"grad_norm": 0.9849012494087219, + "learning_rate": 2.7441163885323064e-05, + "loss": 0.4922, + "step": 13776 + }, + { + "epoch": 17.685494223363285, + "grad_norm": 1.0299428701400757, + "learning_rate": 2.7440735986307233e-05, + "loss": 0.5385, + "step": 13777 + }, + { + "epoch": 17.686777920410783, + "grad_norm": 1.573042631149292, + "learning_rate": 2.74403080872914e-05, + "loss": 0.4439, + "step": 13778 + }, + { + "epoch": 17.68806161745828, + "grad_norm": 0.9786680340766907, + "learning_rate": 2.7439880188275566e-05, + "loss": 0.4255, + "step": 13779 + }, + { + "epoch": 17.689345314505776, + "grad_norm": 1.3351942300796509, + "learning_rate": 2.7439452289259735e-05, + "loss": 0.4806, + "step": 13780 + }, + { + "epoch": 17.690629011553273, + "grad_norm": 2.9601051807403564, + "learning_rate": 2.7439024390243903e-05, + "loss": 0.5073, + "step": 13781 + }, + { + "epoch": 17.69191270860077, + "grad_norm": 3.203157663345337, + "learning_rate": 2.743859649122807e-05, + "loss": 0.5344, + "step": 13782 + }, + { + "epoch": 17.693196405648266, + "grad_norm": 1.5893434286117554, + "learning_rate": 2.7438168592212237e-05, + "loss": 0.4635, + "step": 13783 + }, + { + "epoch": 17.694480102695763, + "grad_norm": 2.8645732402801514, + "learning_rate": 2.743774069319641e-05, + "loss": 0.5252, + "step": 13784 + }, + { + "epoch": 17.69576379974326, + "grad_norm": 1.6340659856796265, + "learning_rate": 2.7437312794180573e-05, + "loss": 0.5252, + "step": 13785 + }, + { + "epoch": 17.69704749679076, + "grad_norm": 1.9543336629867554, + "learning_rate": 2.7436884895164742e-05, + "loss": 0.5216, + "step": 13786 + }, + { + "epoch": 17.698331193838253, + "grad_norm": 1.418177604675293, + "learning_rate": 2.743645699614891e-05, + "loss": 0.5077, + "step": 13787 + }, + { + "epoch": 17.69961489088575, + "grad_norm": 1.2542842626571655, + "learning_rate": 2.7436029097133075e-05, + "loss": 0.5211, + "step": 13788 + }, + { + "epoch": 17.70089858793325, + "grad_norm": 3.6899561882019043, + 
"learning_rate": 2.7435601198117247e-05, + "loss": 0.561, + "step": 13789 + }, + { + "epoch": 17.702182284980744, + "grad_norm": 1.4417939186096191, + "learning_rate": 2.7435173299101412e-05, + "loss": 0.5275, + "step": 13790 + }, + { + "epoch": 17.70346598202824, + "grad_norm": 1.8367093801498413, + "learning_rate": 2.743474540008558e-05, + "loss": 0.5553, + "step": 13791 + }, + { + "epoch": 17.70474967907574, + "grad_norm": 3.134507417678833, + "learning_rate": 2.743431750106975e-05, + "loss": 0.5855, + "step": 13792 + }, + { + "epoch": 17.706033376123234, + "grad_norm": 2.874645471572876, + "learning_rate": 2.7433889602053914e-05, + "loss": 0.702, + "step": 13793 + }, + { + "epoch": 17.70731707317073, + "grad_norm": 2.291722536087036, + "learning_rate": 2.7433461703038086e-05, + "loss": 0.4906, + "step": 13794 + }, + { + "epoch": 17.70860077021823, + "grad_norm": 1.150107741355896, + "learning_rate": 2.743303380402225e-05, + "loss": 0.48, + "step": 13795 + }, + { + "epoch": 17.709884467265724, + "grad_norm": 1.7586092948913574, + "learning_rate": 2.743260590500642e-05, + "loss": 0.4609, + "step": 13796 + }, + { + "epoch": 17.71116816431322, + "grad_norm": 1.5468435287475586, + "learning_rate": 2.7432178005990588e-05, + "loss": 0.5187, + "step": 13797 + }, + { + "epoch": 17.71245186136072, + "grad_norm": 0.8313505053520203, + "learning_rate": 2.7431750106974756e-05, + "loss": 0.4818, + "step": 13798 + }, + { + "epoch": 17.713735558408217, + "grad_norm": 0.9774009585380554, + "learning_rate": 2.743132220795892e-05, + "loss": 0.4669, + "step": 13799 + }, + { + "epoch": 17.71501925545571, + "grad_norm": 3.2862513065338135, + "learning_rate": 2.743089430894309e-05, + "loss": 0.473, + "step": 13800 + }, + { + "epoch": 17.71630295250321, + "grad_norm": 2.07131290435791, + "learning_rate": 2.7430466409927258e-05, + "loss": 0.5169, + "step": 13801 + }, + { + "epoch": 17.717586649550707, + "grad_norm": 1.9341970682144165, + "learning_rate": 2.7430038510911426e-05, + 
"loss": 0.5337, + "step": 13802 + }, + { + "epoch": 17.7188703465982, + "grad_norm": 0.7957596778869629, + "learning_rate": 2.7429610611895595e-05, + "loss": 0.4425, + "step": 13803 + }, + { + "epoch": 17.7201540436457, + "grad_norm": 1.4084306955337524, + "learning_rate": 2.742918271287976e-05, + "loss": 0.4676, + "step": 13804 + }, + { + "epoch": 17.721437740693197, + "grad_norm": 2.3029627799987793, + "learning_rate": 2.742875481386393e-05, + "loss": 0.4564, + "step": 13805 + }, + { + "epoch": 17.72272143774069, + "grad_norm": 0.9647662043571472, + "learning_rate": 2.7428326914848096e-05, + "loss": 0.4605, + "step": 13806 + }, + { + "epoch": 17.72400513478819, + "grad_norm": 1.3500961065292358, + "learning_rate": 2.742789901583226e-05, + "loss": 0.4613, + "step": 13807 + }, + { + "epoch": 17.725288831835687, + "grad_norm": 1.2609286308288574, + "learning_rate": 2.7427471116816433e-05, + "loss": 0.4761, + "step": 13808 + }, + { + "epoch": 17.726572528883185, + "grad_norm": 1.0014692544937134, + "learning_rate": 2.7427043217800598e-05, + "loss": 0.468, + "step": 13809 + }, + { + "epoch": 17.72785622593068, + "grad_norm": 1.2090888023376465, + "learning_rate": 2.742661531878477e-05, + "loss": 0.4882, + "step": 13810 + }, + { + "epoch": 17.729139922978177, + "grad_norm": 0.7153835892677307, + "learning_rate": 2.7426187419768935e-05, + "loss": 0.4627, + "step": 13811 + }, + { + "epoch": 17.730423620025675, + "grad_norm": 1.0921757221221924, + "learning_rate": 2.7425759520753104e-05, + "loss": 0.4386, + "step": 13812 + }, + { + "epoch": 17.73170731707317, + "grad_norm": 1.8035593032836914, + "learning_rate": 2.7425331621737272e-05, + "loss": 0.463, + "step": 13813 + }, + { + "epoch": 17.732991014120667, + "grad_norm": 1.0800825357437134, + "learning_rate": 2.7424903722721437e-05, + "loss": 0.4673, + "step": 13814 + }, + { + "epoch": 17.734274711168165, + "grad_norm": 1.328534722328186, + "learning_rate": 2.7424475823705605e-05, + "loss": 0.4065, + "step": 13815 + }, + 
{ + "epoch": 17.73555840821566, + "grad_norm": 3.3848960399627686, + "learning_rate": 2.7424047924689774e-05, + "loss": 0.4554, + "step": 13816 + }, + { + "epoch": 17.736842105263158, + "grad_norm": 3.1356260776519775, + "learning_rate": 2.7423620025673942e-05, + "loss": 0.4784, + "step": 13817 + }, + { + "epoch": 17.738125802310655, + "grad_norm": 1.1559314727783203, + "learning_rate": 2.742319212665811e-05, + "loss": 0.4945, + "step": 13818 + }, + { + "epoch": 17.739409499358153, + "grad_norm": 8.321525573730469, + "learning_rate": 2.742276422764228e-05, + "loss": 0.483, + "step": 13819 + }, + { + "epoch": 17.740693196405648, + "grad_norm": 3.9494969844818115, + "learning_rate": 2.7422336328626444e-05, + "loss": 0.4814, + "step": 13820 + }, + { + "epoch": 17.741976893453145, + "grad_norm": 3.986621141433716, + "learning_rate": 2.7421908429610612e-05, + "loss": 0.4878, + "step": 13821 + }, + { + "epoch": 17.743260590500643, + "grad_norm": 3.480214834213257, + "learning_rate": 2.742148053059478e-05, + "loss": 0.534, + "step": 13822 + }, + { + "epoch": 17.744544287548138, + "grad_norm": 2.9673473834991455, + "learning_rate": 2.7421052631578946e-05, + "loss": 0.4758, + "step": 13823 + }, + { + "epoch": 17.745827984595635, + "grad_norm": 3.338322401046753, + "learning_rate": 2.7420624732563118e-05, + "loss": 0.4943, + "step": 13824 + }, + { + "epoch": 17.747111681643133, + "grad_norm": 3.771192789077759, + "learning_rate": 2.7420196833547283e-05, + "loss": 0.4966, + "step": 13825 + }, + { + "epoch": 17.748395378690628, + "grad_norm": 0.9898361563682556, + "learning_rate": 2.7419768934531455e-05, + "loss": 0.5088, + "step": 13826 + }, + { + "epoch": 17.749679075738126, + "grad_norm": 1.3959399461746216, + "learning_rate": 2.741934103551562e-05, + "loss": 0.5056, + "step": 13827 + }, + { + "epoch": 17.750962772785623, + "grad_norm": 2.304166555404663, + "learning_rate": 2.7418913136499785e-05, + "loss": 0.5552, + "step": 13828 + }, + { + "epoch": 17.752246469833118, + 
"grad_norm": 1.7665033340454102, + "learning_rate": 2.7418485237483956e-05, + "loss": 0.504, + "step": 13829 + }, + { + "epoch": 17.753530166880616, + "grad_norm": 1.3308905363082886, + "learning_rate": 2.741805733846812e-05, + "loss": 0.5007, + "step": 13830 + }, + { + "epoch": 17.754813863928113, + "grad_norm": 1.148665428161621, + "learning_rate": 2.741762943945229e-05, + "loss": 0.5426, + "step": 13831 + }, + { + "epoch": 17.75609756097561, + "grad_norm": 2.1829960346221924, + "learning_rate": 2.7417201540436458e-05, + "loss": 0.4721, + "step": 13832 + }, + { + "epoch": 17.757381258023106, + "grad_norm": 1.7107938528060913, + "learning_rate": 2.7416773641420623e-05, + "loss": 0.462, + "step": 13833 + }, + { + "epoch": 17.758664955070603, + "grad_norm": 1.5170071125030518, + "learning_rate": 2.7416345742404795e-05, + "loss": 0.4928, + "step": 13834 + }, + { + "epoch": 17.7599486521181, + "grad_norm": 1.8900012969970703, + "learning_rate": 2.741591784338896e-05, + "loss": 0.4823, + "step": 13835 + }, + { + "epoch": 17.761232349165596, + "grad_norm": 6.779740810394287, + "learning_rate": 2.741548994437313e-05, + "loss": 0.504, + "step": 13836 + }, + { + "epoch": 17.762516046213094, + "grad_norm": 1.299566388130188, + "learning_rate": 2.7415062045357297e-05, + "loss": 0.4565, + "step": 13837 + }, + { + "epoch": 17.76379974326059, + "grad_norm": 1.6530874967575073, + "learning_rate": 2.7414634146341465e-05, + "loss": 0.5476, + "step": 13838 + }, + { + "epoch": 17.765083440308086, + "grad_norm": 5.627525806427002, + "learning_rate": 2.741420624732563e-05, + "loss": 0.5032, + "step": 13839 + }, + { + "epoch": 17.766367137355584, + "grad_norm": 2.821307420730591, + "learning_rate": 2.74137783483098e-05, + "loss": 0.567, + "step": 13840 + }, + { + "epoch": 17.76765083440308, + "grad_norm": 1.6630101203918457, + "learning_rate": 2.7413350449293967e-05, + "loss": 0.5601, + "step": 13841 + }, + { + "epoch": 17.76893453145058, + "grad_norm": 1.1265504360198975, + 
"learning_rate": 2.7412922550278136e-05, + "loss": 0.6189, + "step": 13842 + }, + { + "epoch": 17.770218228498074, + "grad_norm": 1.8231624364852905, + "learning_rate": 2.7412494651262304e-05, + "loss": 0.6968, + "step": 13843 + }, + { + "epoch": 17.77150192554557, + "grad_norm": 0.9119982719421387, + "learning_rate": 2.741206675224647e-05, + "loss": 0.4313, + "step": 13844 + }, + { + "epoch": 17.77278562259307, + "grad_norm": 1.0473045110702515, + "learning_rate": 2.741163885323064e-05, + "loss": 0.4297, + "step": 13845 + }, + { + "epoch": 17.774069319640564, + "grad_norm": 1.1898987293243408, + "learning_rate": 2.7411210954214806e-05, + "loss": 0.4465, + "step": 13846 + }, + { + "epoch": 17.77535301668806, + "grad_norm": 0.830571711063385, + "learning_rate": 2.741078305519897e-05, + "loss": 0.4598, + "step": 13847 + }, + { + "epoch": 17.77663671373556, + "grad_norm": 1.6265535354614258, + "learning_rate": 2.7410355156183143e-05, + "loss": 0.4754, + "step": 13848 + }, + { + "epoch": 17.777920410783054, + "grad_norm": 1.4252914190292358, + "learning_rate": 2.7409927257167308e-05, + "loss": 0.4889, + "step": 13849 + }, + { + "epoch": 17.77920410783055, + "grad_norm": 0.7601225972175598, + "learning_rate": 2.740949935815148e-05, + "loss": 0.4751, + "step": 13850 + }, + { + "epoch": 17.78048780487805, + "grad_norm": 1.280806064605713, + "learning_rate": 2.7409071459135644e-05, + "loss": 0.4469, + "step": 13851 + }, + { + "epoch": 17.781771501925547, + "grad_norm": 1.1745755672454834, + "learning_rate": 2.7408643560119813e-05, + "loss": 0.4484, + "step": 13852 + }, + { + "epoch": 17.78305519897304, + "grad_norm": 2.9782886505126953, + "learning_rate": 2.740821566110398e-05, + "loss": 0.4942, + "step": 13853 + }, + { + "epoch": 17.78433889602054, + "grad_norm": 1.8134678602218628, + "learning_rate": 2.7407787762088146e-05, + "loss": 0.5067, + "step": 13854 + }, + { + "epoch": 17.785622593068037, + "grad_norm": 1.1324061155319214, + "learning_rate": 
2.7407359863072315e-05, + "loss": 0.4726, + "step": 13855 + }, + { + "epoch": 17.78690629011553, + "grad_norm": 1.1297003030776978, + "learning_rate": 2.7406931964056483e-05, + "loss": 0.457, + "step": 13856 + }, + { + "epoch": 17.78818998716303, + "grad_norm": 1.7608397006988525, + "learning_rate": 2.740650406504065e-05, + "loss": 0.4722, + "step": 13857 + }, + { + "epoch": 17.789473684210527, + "grad_norm": 6.00633430480957, + "learning_rate": 2.740607616602482e-05, + "loss": 0.4758, + "step": 13858 + }, + { + "epoch": 17.79075738125802, + "grad_norm": 1.9193856716156006, + "learning_rate": 2.740564826700899e-05, + "loss": 0.4638, + "step": 13859 + }, + { + "epoch": 17.79204107830552, + "grad_norm": 1.5857940912246704, + "learning_rate": 2.7405220367993153e-05, + "loss": 0.4911, + "step": 13860 + }, + { + "epoch": 17.793324775353017, + "grad_norm": 1.80289626121521, + "learning_rate": 2.7404792468977322e-05, + "loss": 0.4965, + "step": 13861 + }, + { + "epoch": 17.794608472400512, + "grad_norm": 1.3202414512634277, + "learning_rate": 2.740436456996149e-05, + "loss": 0.4836, + "step": 13862 + }, + { + "epoch": 17.79589216944801, + "grad_norm": 1.3419644832611084, + "learning_rate": 2.7403936670945655e-05, + "loss": 0.4606, + "step": 13863 + }, + { + "epoch": 17.797175866495508, + "grad_norm": 0.8144277930259705, + "learning_rate": 2.7403508771929827e-05, + "loss": 0.4838, + "step": 13864 + }, + { + "epoch": 17.798459563543005, + "grad_norm": 1.1468052864074707, + "learning_rate": 2.7403080872913992e-05, + "loss": 0.505, + "step": 13865 + }, + { + "epoch": 17.7997432605905, + "grad_norm": 2.504716396331787, + "learning_rate": 2.7402652973898164e-05, + "loss": 0.4588, + "step": 13866 + }, + { + "epoch": 17.801026957637998, + "grad_norm": 2.65523624420166, + "learning_rate": 2.740222507488233e-05, + "loss": 0.4829, + "step": 13867 + }, + { + "epoch": 17.802310654685495, + "grad_norm": 1.9257792234420776, + "learning_rate": 2.7401797175866494e-05, + "loss": 0.5288, + 
"step": 13868 + }, + { + "epoch": 17.80359435173299, + "grad_norm": 3.54781174659729, + "learning_rate": 2.7401369276850666e-05, + "loss": 0.4765, + "step": 13869 + }, + { + "epoch": 17.804878048780488, + "grad_norm": 1.3971558809280396, + "learning_rate": 2.740094137783483e-05, + "loss": 0.4735, + "step": 13870 + }, + { + "epoch": 17.806161745827985, + "grad_norm": 5.284867763519287, + "learning_rate": 2.7400513478819e-05, + "loss": 0.4753, + "step": 13871 + }, + { + "epoch": 17.80744544287548, + "grad_norm": 0.9510630369186401, + "learning_rate": 2.7400085579803168e-05, + "loss": 0.5396, + "step": 13872 + }, + { + "epoch": 17.808729139922978, + "grad_norm": 1.916016697883606, + "learning_rate": 2.7399657680787336e-05, + "loss": 0.4775, + "step": 13873 + }, + { + "epoch": 17.810012836970476, + "grad_norm": 0.9757972955703735, + "learning_rate": 2.7399229781771504e-05, + "loss": 0.4881, + "step": 13874 + }, + { + "epoch": 17.811296534017973, + "grad_norm": 3.560992479324341, + "learning_rate": 2.739880188275567e-05, + "loss": 0.5214, + "step": 13875 + }, + { + "epoch": 17.812580231065468, + "grad_norm": 1.0331708192825317, + "learning_rate": 2.7398373983739838e-05, + "loss": 0.5026, + "step": 13876 + }, + { + "epoch": 17.813863928112966, + "grad_norm": 2.8718578815460205, + "learning_rate": 2.7397946084724006e-05, + "loss": 0.5064, + "step": 13877 + }, + { + "epoch": 17.815147625160463, + "grad_norm": 2.141397476196289, + "learning_rate": 2.7397518185708175e-05, + "loss": 0.4823, + "step": 13878 + }, + { + "epoch": 17.816431322207958, + "grad_norm": 2.177726984024048, + "learning_rate": 2.739709028669234e-05, + "loss": 0.4931, + "step": 13879 + }, + { + "epoch": 17.817715019255456, + "grad_norm": 1.6261149644851685, + "learning_rate": 2.739666238767651e-05, + "loss": 0.4758, + "step": 13880 + }, + { + "epoch": 17.818998716302954, + "grad_norm": 3.3226187229156494, + "learning_rate": 2.7396234488660677e-05, + "loss": 0.5404, + "step": 13881 + }, + { + "epoch": 
17.820282413350448, + "grad_norm": 2.70467472076416, + "learning_rate": 2.739580658964484e-05, + "loss": 0.484, + "step": 13882 + }, + { + "epoch": 17.821566110397946, + "grad_norm": 1.1275321245193481, + "learning_rate": 2.7395378690629013e-05, + "loss": 0.4896, + "step": 13883 + }, + { + "epoch": 17.822849807445444, + "grad_norm": 1.770348072052002, + "learning_rate": 2.739495079161318e-05, + "loss": 0.5344, + "step": 13884 + }, + { + "epoch": 17.82413350449294, + "grad_norm": 1.3511407375335693, + "learning_rate": 2.739452289259735e-05, + "loss": 0.546, + "step": 13885 + }, + { + "epoch": 17.825417201540436, + "grad_norm": 3.6386075019836426, + "learning_rate": 2.7394094993581515e-05, + "loss": 0.5134, + "step": 13886 + }, + { + "epoch": 17.826700898587934, + "grad_norm": 1.1386241912841797, + "learning_rate": 2.7393667094565684e-05, + "loss": 0.5568, + "step": 13887 + }, + { + "epoch": 17.82798459563543, + "grad_norm": 3.8354952335357666, + "learning_rate": 2.7393239195549852e-05, + "loss": 0.4684, + "step": 13888 + }, + { + "epoch": 17.829268292682926, + "grad_norm": 0.9322922229766846, + "learning_rate": 2.7392811296534017e-05, + "loss": 0.5187, + "step": 13889 + }, + { + "epoch": 17.830551989730424, + "grad_norm": 2.8061206340789795, + "learning_rate": 2.7392383397518185e-05, + "loss": 0.5588, + "step": 13890 + }, + { + "epoch": 17.83183568677792, + "grad_norm": 13.2223539352417, + "learning_rate": 2.7391955498502354e-05, + "loss": 0.5718, + "step": 13891 + }, + { + "epoch": 17.833119383825416, + "grad_norm": 1.603037714958191, + "learning_rate": 2.7391527599486522e-05, + "loss": 0.6527, + "step": 13892 + }, + { + "epoch": 17.834403080872914, + "grad_norm": 2.6712629795074463, + "learning_rate": 2.739109970047069e-05, + "loss": 0.7554, + "step": 13893 + }, + { + "epoch": 17.83568677792041, + "grad_norm": 1.244397759437561, + "learning_rate": 2.7390671801454856e-05, + "loss": 0.421, + "step": 13894 + }, + { + "epoch": 17.836970474967906, + "grad_norm": 
1.0599130392074585, + "learning_rate": 2.7390243902439024e-05, + "loss": 0.4576, + "step": 13895 + }, + { + "epoch": 17.838254172015404, + "grad_norm": 1.0687376260757446, + "learning_rate": 2.7389816003423193e-05, + "loss": 0.4682, + "step": 13896 + }, + { + "epoch": 17.8395378690629, + "grad_norm": 1.4811557531356812, + "learning_rate": 2.738938810440736e-05, + "loss": 0.4451, + "step": 13897 + }, + { + "epoch": 17.8408215661104, + "grad_norm": 1.103435754776001, + "learning_rate": 2.7388960205391526e-05, + "loss": 0.4676, + "step": 13898 + }, + { + "epoch": 17.842105263157894, + "grad_norm": 1.0565669536590576, + "learning_rate": 2.7388532306375698e-05, + "loss": 0.4903, + "step": 13899 + }, + { + "epoch": 17.84338896020539, + "grad_norm": 1.7824517488479614, + "learning_rate": 2.7388104407359863e-05, + "loss": 0.5183, + "step": 13900 + }, + { + "epoch": 17.84467265725289, + "grad_norm": 1.7361027002334595, + "learning_rate": 2.738767650834403e-05, + "loss": 0.4659, + "step": 13901 + }, + { + "epoch": 17.845956354300384, + "grad_norm": 1.21517014503479, + "learning_rate": 2.73872486093282e-05, + "loss": 0.4703, + "step": 13902 + }, + { + "epoch": 17.84724005134788, + "grad_norm": 0.9192317724227905, + "learning_rate": 2.7386820710312365e-05, + "loss": 0.4519, + "step": 13903 + }, + { + "epoch": 17.84852374839538, + "grad_norm": 0.8926845192909241, + "learning_rate": 2.7386392811296536e-05, + "loss": 0.4638, + "step": 13904 + }, + { + "epoch": 17.849807445442874, + "grad_norm": 0.9276664853096008, + "learning_rate": 2.73859649122807e-05, + "loss": 0.4641, + "step": 13905 + }, + { + "epoch": 17.85109114249037, + "grad_norm": 2.0464725494384766, + "learning_rate": 2.738553701326487e-05, + "loss": 0.4277, + "step": 13906 + }, + { + "epoch": 17.85237483953787, + "grad_norm": 1.6789515018463135, + "learning_rate": 2.7385109114249038e-05, + "loss": 0.48, + "step": 13907 + }, + { + "epoch": 17.853658536585368, + "grad_norm": 0.691721498966217, + "learning_rate": 
2.7384681215233203e-05, + "loss": 0.4002, + "step": 13908 + }, + { + "epoch": 17.854942233632862, + "grad_norm": 1.1629528999328613, + "learning_rate": 2.7384253316217375e-05, + "loss": 0.4295, + "step": 13909 + }, + { + "epoch": 17.85622593068036, + "grad_norm": 0.7734412550926208, + "learning_rate": 2.738382541720154e-05, + "loss": 0.4328, + "step": 13910 + }, + { + "epoch": 17.857509627727858, + "grad_norm": 0.8444445133209229, + "learning_rate": 2.738339751818571e-05, + "loss": 0.4164, + "step": 13911 + }, + { + "epoch": 17.858793324775352, + "grad_norm": 1.3061144351959229, + "learning_rate": 2.7382969619169877e-05, + "loss": 0.488, + "step": 13912 + }, + { + "epoch": 17.86007702182285, + "grad_norm": 0.8960488438606262, + "learning_rate": 2.7382541720154045e-05, + "loss": 0.4666, + "step": 13913 + }, + { + "epoch": 17.861360718870348, + "grad_norm": 1.9433047771453857, + "learning_rate": 2.738211382113821e-05, + "loss": 0.4756, + "step": 13914 + }, + { + "epoch": 17.862644415917842, + "grad_norm": 3.037022113800049, + "learning_rate": 2.738168592212238e-05, + "loss": 0.4733, + "step": 13915 + }, + { + "epoch": 17.86392811296534, + "grad_norm": 4.947826385498047, + "learning_rate": 2.7381258023106547e-05, + "loss": 0.4705, + "step": 13916 + }, + { + "epoch": 17.865211810012838, + "grad_norm": 0.7380445599555969, + "learning_rate": 2.7380830124090716e-05, + "loss": 0.4602, + "step": 13917 + }, + { + "epoch": 17.866495507060336, + "grad_norm": 1.1823927164077759, + "learning_rate": 2.7380402225074884e-05, + "loss": 0.5019, + "step": 13918 + }, + { + "epoch": 17.86777920410783, + "grad_norm": 1.0633677244186401, + "learning_rate": 2.737997432605905e-05, + "loss": 0.4157, + "step": 13919 + }, + { + "epoch": 17.869062901155328, + "grad_norm": 1.9054335355758667, + "learning_rate": 2.737954642704322e-05, + "loss": 0.4331, + "step": 13920 + }, + { + "epoch": 17.870346598202826, + "grad_norm": 1.661382794380188, + "learning_rate": 2.7379118528027386e-05, + "loss": 
0.4544, + "step": 13921 + }, + { + "epoch": 17.87163029525032, + "grad_norm": 1.4940526485443115, + "learning_rate": 2.737869062901155e-05, + "loss": 0.4483, + "step": 13922 + }, + { + "epoch": 17.872913992297818, + "grad_norm": 1.0780391693115234, + "learning_rate": 2.7378262729995723e-05, + "loss": 0.473, + "step": 13923 + }, + { + "epoch": 17.874197689345316, + "grad_norm": 1.6318862438201904, + "learning_rate": 2.7377834830979888e-05, + "loss": 0.4495, + "step": 13924 + }, + { + "epoch": 17.87548138639281, + "grad_norm": 1.915743350982666, + "learning_rate": 2.737740693196406e-05, + "loss": 0.4393, + "step": 13925 + }, + { + "epoch": 17.876765083440308, + "grad_norm": 1.7679189443588257, + "learning_rate": 2.7376979032948225e-05, + "loss": 0.4465, + "step": 13926 + }, + { + "epoch": 17.878048780487806, + "grad_norm": 1.3450382947921753, + "learning_rate": 2.7376551133932393e-05, + "loss": 0.4806, + "step": 13927 + }, + { + "epoch": 17.8793324775353, + "grad_norm": 1.2216235399246216, + "learning_rate": 2.737612323491656e-05, + "loss": 0.4977, + "step": 13928 + }, + { + "epoch": 17.880616174582798, + "grad_norm": 1.0263466835021973, + "learning_rate": 2.7375695335900726e-05, + "loss": 0.4932, + "step": 13929 + }, + { + "epoch": 17.881899871630296, + "grad_norm": 2.5046563148498535, + "learning_rate": 2.7375267436884895e-05, + "loss": 0.489, + "step": 13930 + }, + { + "epoch": 17.883183568677794, + "grad_norm": 3.2924108505249023, + "learning_rate": 2.7374839537869063e-05, + "loss": 0.4516, + "step": 13931 + }, + { + "epoch": 17.884467265725288, + "grad_norm": 2.436704635620117, + "learning_rate": 2.737441163885323e-05, + "loss": 0.493, + "step": 13932 + }, + { + "epoch": 17.885750962772786, + "grad_norm": 5.62474250793457, + "learning_rate": 2.73739837398374e-05, + "loss": 0.4431, + "step": 13933 + }, + { + "epoch": 17.887034659820284, + "grad_norm": 1.2616398334503174, + "learning_rate": 2.737355584082157e-05, + "loss": 0.5271, + "step": 13934 + }, + { + 
"epoch": 17.888318356867778, + "grad_norm": 1.3229379653930664, + "learning_rate": 2.7373127941805733e-05, + "loss": 0.6028, + "step": 13935 + }, + { + "epoch": 17.889602053915276, + "grad_norm": 2.046135425567627, + "learning_rate": 2.7372700042789902e-05, + "loss": 0.4876, + "step": 13936 + }, + { + "epoch": 17.890885750962774, + "grad_norm": 2.6968626976013184, + "learning_rate": 2.737227214377407e-05, + "loss": 0.4712, + "step": 13937 + }, + { + "epoch": 17.892169448010268, + "grad_norm": 1.924825668334961, + "learning_rate": 2.7371844244758235e-05, + "loss": 0.4801, + "step": 13938 + }, + { + "epoch": 17.893453145057766, + "grad_norm": 5.102120399475098, + "learning_rate": 2.7371416345742407e-05, + "loss": 0.4991, + "step": 13939 + }, + { + "epoch": 17.894736842105264, + "grad_norm": 2.4487452507019043, + "learning_rate": 2.7370988446726572e-05, + "loss": 0.512, + "step": 13940 + }, + { + "epoch": 17.89602053915276, + "grad_norm": 3.153412103652954, + "learning_rate": 2.7370560547710744e-05, + "loss": 0.5653, + "step": 13941 + }, + { + "epoch": 17.897304236200256, + "grad_norm": 1.2382838726043701, + "learning_rate": 2.737013264869491e-05, + "loss": 0.606, + "step": 13942 + }, + { + "epoch": 17.898587933247754, + "grad_norm": 1.272250771522522, + "learning_rate": 2.7369704749679074e-05, + "loss": 0.619, + "step": 13943 + }, + { + "epoch": 17.89987163029525, + "grad_norm": 1.3820021152496338, + "learning_rate": 2.7369276850663246e-05, + "loss": 0.4486, + "step": 13944 + }, + { + "epoch": 17.901155327342746, + "grad_norm": 1.5955307483673096, + "learning_rate": 2.736884895164741e-05, + "loss": 0.4707, + "step": 13945 + }, + { + "epoch": 17.902439024390244, + "grad_norm": 2.686831474304199, + "learning_rate": 2.736842105263158e-05, + "loss": 0.4672, + "step": 13946 + }, + { + "epoch": 17.90372272143774, + "grad_norm": 1.442034363746643, + "learning_rate": 2.7367993153615748e-05, + "loss": 0.4461, + "step": 13947 + }, + { + "epoch": 17.905006418485236, + 
"grad_norm": 1.314865231513977, + "learning_rate": 2.7367565254599916e-05, + "loss": 0.4726, + "step": 13948 + }, + { + "epoch": 17.906290115532734, + "grad_norm": 0.9714824557304382, + "learning_rate": 2.7367137355584084e-05, + "loss": 0.5024, + "step": 13949 + }, + { + "epoch": 17.90757381258023, + "grad_norm": 2.1688926219940186, + "learning_rate": 2.736670945656825e-05, + "loss": 0.4722, + "step": 13950 + }, + { + "epoch": 17.90885750962773, + "grad_norm": 1.0247752666473389, + "learning_rate": 2.7366281557552418e-05, + "loss": 0.463, + "step": 13951 + }, + { + "epoch": 17.910141206675224, + "grad_norm": 1.1315404176712036, + "learning_rate": 2.7365853658536586e-05, + "loss": 0.4942, + "step": 13952 + }, + { + "epoch": 17.911424903722722, + "grad_norm": 0.9602296352386475, + "learning_rate": 2.7365425759520755e-05, + "loss": 0.4877, + "step": 13953 + }, + { + "epoch": 17.91270860077022, + "grad_norm": 1.4248429536819458, + "learning_rate": 2.736499786050492e-05, + "loss": 0.4891, + "step": 13954 + }, + { + "epoch": 17.913992297817714, + "grad_norm": 1.0533279180526733, + "learning_rate": 2.736456996148909e-05, + "loss": 0.4739, + "step": 13955 + }, + { + "epoch": 17.915275994865212, + "grad_norm": 0.9357280135154724, + "learning_rate": 2.7364142062473257e-05, + "loss": 0.4598, + "step": 13956 + }, + { + "epoch": 17.91655969191271, + "grad_norm": 1.2606157064437866, + "learning_rate": 2.7363714163457425e-05, + "loss": 0.45, + "step": 13957 + }, + { + "epoch": 17.917843388960204, + "grad_norm": 2.454850196838379, + "learning_rate": 2.7363286264441593e-05, + "loss": 0.4792, + "step": 13958 + }, + { + "epoch": 17.919127086007702, + "grad_norm": 1.5184427499771118, + "learning_rate": 2.736285836542576e-05, + "loss": 0.4526, + "step": 13959 + }, + { + "epoch": 17.9204107830552, + "grad_norm": 1.5270074605941772, + "learning_rate": 2.736243046640993e-05, + "loss": 0.5028, + "step": 13960 + }, + { + "epoch": 17.921694480102694, + "grad_norm": 4.279797077178955, + 
"learning_rate": 2.7362002567394095e-05, + "loss": 0.5244, + "step": 13961 + }, + { + "epoch": 17.922978177150192, + "grad_norm": 1.1567506790161133, + "learning_rate": 2.736157466837826e-05, + "loss": 0.4549, + "step": 13962 + }, + { + "epoch": 17.92426187419769, + "grad_norm": 1.1162296533584595, + "learning_rate": 2.7361146769362432e-05, + "loss": 0.4767, + "step": 13963 + }, + { + "epoch": 17.925545571245188, + "grad_norm": 18.484458923339844, + "learning_rate": 2.7360718870346597e-05, + "loss": 0.4676, + "step": 13964 + }, + { + "epoch": 17.926829268292682, + "grad_norm": 1.2783186435699463, + "learning_rate": 2.736029097133077e-05, + "loss": 0.4318, + "step": 13965 + }, + { + "epoch": 17.92811296534018, + "grad_norm": 1.47956120967865, + "learning_rate": 2.7359863072314934e-05, + "loss": 0.5002, + "step": 13966 + }, + { + "epoch": 17.929396662387678, + "grad_norm": 0.9984580278396606, + "learning_rate": 2.7359435173299102e-05, + "loss": 0.4643, + "step": 13967 + }, + { + "epoch": 17.930680359435172, + "grad_norm": 5.621718883514404, + "learning_rate": 2.735900727428327e-05, + "loss": 0.4303, + "step": 13968 + }, + { + "epoch": 17.93196405648267, + "grad_norm": 0.9865211248397827, + "learning_rate": 2.7358579375267436e-05, + "loss": 0.4918, + "step": 13969 + }, + { + "epoch": 17.933247753530168, + "grad_norm": 2.1292126178741455, + "learning_rate": 2.7358151476251604e-05, + "loss": 0.4662, + "step": 13970 + }, + { + "epoch": 17.934531450577662, + "grad_norm": 2.753145456314087, + "learning_rate": 2.7357723577235773e-05, + "loss": 0.472, + "step": 13971 + }, + { + "epoch": 17.93581514762516, + "grad_norm": 1.6415355205535889, + "learning_rate": 2.735729567821994e-05, + "loss": 0.4904, + "step": 13972 + }, + { + "epoch": 17.937098844672658, + "grad_norm": 1.1009331941604614, + "learning_rate": 2.735686777920411e-05, + "loss": 0.5156, + "step": 13973 + }, + { + "epoch": 17.938382541720156, + "grad_norm": 2.5671756267547607, + "learning_rate": 
2.7356439880188278e-05, + "loss": 0.4648, + "step": 13974 + }, + { + "epoch": 17.93966623876765, + "grad_norm": 1.1178022623062134, + "learning_rate": 2.7356011981172443e-05, + "loss": 0.4236, + "step": 13975 + }, + { + "epoch": 17.940949935815148, + "grad_norm": 1.1102511882781982, + "learning_rate": 2.735558408215661e-05, + "loss": 0.4741, + "step": 13976 + }, + { + "epoch": 17.942233632862646, + "grad_norm": 1.2220596075057983, + "learning_rate": 2.735515618314078e-05, + "loss": 0.4974, + "step": 13977 + }, + { + "epoch": 17.94351732991014, + "grad_norm": 4.238386631011963, + "learning_rate": 2.7354728284124945e-05, + "loss": 0.4571, + "step": 13978 + }, + { + "epoch": 17.944801026957638, + "grad_norm": 1.1799627542495728, + "learning_rate": 2.7354300385109116e-05, + "loss": 0.4834, + "step": 13979 + }, + { + "epoch": 17.946084724005136, + "grad_norm": 1.9414095878601074, + "learning_rate": 2.735387248609328e-05, + "loss": 0.4914, + "step": 13980 + }, + { + "epoch": 17.94736842105263, + "grad_norm": 1.8325697183609009, + "learning_rate": 2.7353444587077453e-05, + "loss": 0.4873, + "step": 13981 + }, + { + "epoch": 17.948652118100128, + "grad_norm": 2.289883613586426, + "learning_rate": 2.735301668806162e-05, + "loss": 0.4791, + "step": 13982 + }, + { + "epoch": 17.949935815147626, + "grad_norm": 2.1125316619873047, + "learning_rate": 2.7352588789045783e-05, + "loss": 0.4837, + "step": 13983 + }, + { + "epoch": 17.951219512195124, + "grad_norm": 1.901474952697754, + "learning_rate": 2.7352160890029955e-05, + "loss": 0.4764, + "step": 13984 + }, + { + "epoch": 17.952503209242618, + "grad_norm": 1.4647620916366577, + "learning_rate": 2.735173299101412e-05, + "loss": 0.5084, + "step": 13985 + }, + { + "epoch": 17.953786906290116, + "grad_norm": 2.291302442550659, + "learning_rate": 2.735130509199829e-05, + "loss": 0.5116, + "step": 13986 + }, + { + "epoch": 17.955070603337614, + "grad_norm": 3.103687047958374, + "learning_rate": 2.7350877192982457e-05, + "loss": 
0.5513, + "step": 13987 + }, + { + "epoch": 17.956354300385108, + "grad_norm": 27.25351905822754, + "learning_rate": 2.7350449293966625e-05, + "loss": 0.4998, + "step": 13988 + }, + { + "epoch": 17.957637997432606, + "grad_norm": 3.043912172317505, + "learning_rate": 2.7350021394950794e-05, + "loss": 0.5784, + "step": 13989 + }, + { + "epoch": 17.958921694480104, + "grad_norm": 1.1305105686187744, + "learning_rate": 2.734959349593496e-05, + "loss": 0.514, + "step": 13990 + }, + { + "epoch": 17.960205391527598, + "grad_norm": 2.264833927154541, + "learning_rate": 2.7349165596919127e-05, + "loss": 0.5559, + "step": 13991 + }, + { + "epoch": 17.961489088575096, + "grad_norm": 3.413724422454834, + "learning_rate": 2.7348737697903296e-05, + "loss": 0.5898, + "step": 13992 + }, + { + "epoch": 17.962772785622594, + "grad_norm": 5.52740478515625, + "learning_rate": 2.7348309798887464e-05, + "loss": 0.7196, + "step": 13993 + }, + { + "epoch": 17.964056482670088, + "grad_norm": 1.2440524101257324, + "learning_rate": 2.734788189987163e-05, + "loss": 0.4524, + "step": 13994 + }, + { + "epoch": 17.965340179717586, + "grad_norm": 1.131446361541748, + "learning_rate": 2.73474540008558e-05, + "loss": 0.5078, + "step": 13995 + }, + { + "epoch": 17.966623876765084, + "grad_norm": 0.9579315781593323, + "learning_rate": 2.7347026101839966e-05, + "loss": 0.4915, + "step": 13996 + }, + { + "epoch": 17.96790757381258, + "grad_norm": 1.3969849348068237, + "learning_rate": 2.7346598202824134e-05, + "loss": 0.4801, + "step": 13997 + }, + { + "epoch": 17.969191270860076, + "grad_norm": 1.2389020919799805, + "learning_rate": 2.7346170303808303e-05, + "loss": 0.4572, + "step": 13998 + }, + { + "epoch": 17.970474967907574, + "grad_norm": 3.1593775749206543, + "learning_rate": 2.7345742404792468e-05, + "loss": 0.4865, + "step": 13999 + }, + { + "epoch": 17.971758664955072, + "grad_norm": 2.126211643218994, + "learning_rate": 2.734531450577664e-05, + "loss": 0.5162, + "step": 14000 + }, + { + 
"epoch": 17.971758664955072, + "eval_cer": 0.2859310749951048, + "eval_loss": 0.5127441883087158, + "eval_runtime": 13.6288, + "eval_samples_per_second": 72.127, + "eval_steps_per_second": 0.514, + "eval_wer": 0.504356798899335, + "step": 14000 + }, + { + "epoch": 17.973042362002566, + "grad_norm": 1.2273029088974, + "learning_rate": 2.7344886606760805e-05, + "loss": 0.4853, + "step": 14001 + }, + { + "epoch": 17.974326059050064, + "grad_norm": 0.9605283141136169, + "learning_rate": 2.7344458707744973e-05, + "loss": 0.4332, + "step": 14002 + }, + { + "epoch": 17.975609756097562, + "grad_norm": 2.8902461528778076, + "learning_rate": 2.734403080872914e-05, + "loss": 0.4974, + "step": 14003 + }, + { + "epoch": 17.976893453145056, + "grad_norm": 1.0414786338806152, + "learning_rate": 2.7343602909713306e-05, + "loss": 0.5098, + "step": 14004 + }, + { + "epoch": 17.978177150192554, + "grad_norm": 1.0544028282165527, + "learning_rate": 2.7343175010697478e-05, + "loss": 0.4752, + "step": 14005 + }, + { + "epoch": 17.979460847240052, + "grad_norm": 2.497286558151245, + "learning_rate": 2.7342747111681643e-05, + "loss": 0.459, + "step": 14006 + }, + { + "epoch": 17.98074454428755, + "grad_norm": 1.210595726966858, + "learning_rate": 2.734231921266581e-05, + "loss": 0.4427, + "step": 14007 + }, + { + "epoch": 17.982028241335044, + "grad_norm": 3.733457088470459, + "learning_rate": 2.734189131364998e-05, + "loss": 0.4967, + "step": 14008 + }, + { + "epoch": 17.983311938382542, + "grad_norm": 5.4223856925964355, + "learning_rate": 2.734146341463415e-05, + "loss": 0.4335, + "step": 14009 + }, + { + "epoch": 17.98459563543004, + "grad_norm": 2.3834116458892822, + "learning_rate": 2.7341035515618314e-05, + "loss": 0.4935, + "step": 14010 + }, + { + "epoch": 17.985879332477534, + "grad_norm": 3.4380087852478027, + "learning_rate": 2.7340607616602482e-05, + "loss": 0.4572, + "step": 14011 + }, + { + "epoch": 17.987163029525032, + "grad_norm": 1.0183944702148438, + "learning_rate": 
2.734017971758665e-05, + "loss": 0.4623, + "step": 14012 + }, + { + "epoch": 17.98844672657253, + "grad_norm": 0.9819822907447815, + "learning_rate": 2.733975181857082e-05, + "loss": 0.5175, + "step": 14013 + }, + { + "epoch": 17.989730423620024, + "grad_norm": 1.5704635381698608, + "learning_rate": 2.7339323919554987e-05, + "loss": 0.4573, + "step": 14014 + }, + { + "epoch": 17.991014120667522, + "grad_norm": 1.346464991569519, + "learning_rate": 2.7338896020539152e-05, + "loss": 0.4992, + "step": 14015 + }, + { + "epoch": 17.99229781771502, + "grad_norm": 3.092175245285034, + "learning_rate": 2.7338468121523324e-05, + "loss": 0.5017, + "step": 14016 + }, + { + "epoch": 17.993581514762518, + "grad_norm": 2.469932794570923, + "learning_rate": 2.733804022250749e-05, + "loss": 0.491, + "step": 14017 + }, + { + "epoch": 17.994865211810012, + "grad_norm": 1.579477310180664, + "learning_rate": 2.7337612323491654e-05, + "loss": 0.5557, + "step": 14018 + }, + { + "epoch": 17.99614890885751, + "grad_norm": 4.198239326477051, + "learning_rate": 2.7337184424475826e-05, + "loss": 0.5248, + "step": 14019 + }, + { + "epoch": 17.997432605905008, + "grad_norm": 2.723629951477051, + "learning_rate": 2.733675652545999e-05, + "loss": 0.5247, + "step": 14020 + }, + { + "epoch": 17.998716302952502, + "grad_norm": 2.2568187713623047, + "learning_rate": 2.7336328626444163e-05, + "loss": 0.5144, + "step": 14021 + }, + { + "epoch": 18.0, + "grad_norm": 3.8323678970336914, + "learning_rate": 2.7335900727428328e-05, + "loss": 0.6862, + "step": 14022 + }, + { + "epoch": 18.001283697047498, + "grad_norm": 0.9423307776451111, + "learning_rate": 2.7335472828412493e-05, + "loss": 0.4318, + "step": 14023 + }, + { + "epoch": 18.002567394094992, + "grad_norm": 2.1944525241851807, + "learning_rate": 2.7335044929396665e-05, + "loss": 0.4358, + "step": 14024 + }, + { + "epoch": 18.00385109114249, + "grad_norm": 1.2253618240356445, + "learning_rate": 2.733461703038083e-05, + "loss": 0.4103, + "step": 
14025 + }, + { + "epoch": 18.005134788189988, + "grad_norm": 0.9533997178077698, + "learning_rate": 2.7334189131364998e-05, + "loss": 0.4216, + "step": 14026 + }, + { + "epoch": 18.006418485237482, + "grad_norm": 1.9666613340377808, + "learning_rate": 2.7333761232349166e-05, + "loss": 0.4206, + "step": 14027 + }, + { + "epoch": 18.00770218228498, + "grad_norm": 1.4652369022369385, + "learning_rate": 2.7333333333333335e-05, + "loss": 0.4538, + "step": 14028 + }, + { + "epoch": 18.008985879332478, + "grad_norm": 2.5878331661224365, + "learning_rate": 2.7332905434317503e-05, + "loss": 0.4473, + "step": 14029 + }, + { + "epoch": 18.010269576379976, + "grad_norm": 1.4551174640655518, + "learning_rate": 2.7332477535301668e-05, + "loss": 0.4502, + "step": 14030 + }, + { + "epoch": 18.01155327342747, + "grad_norm": 1.5859811305999756, + "learning_rate": 2.7332049636285837e-05, + "loss": 0.4134, + "step": 14031 + }, + { + "epoch": 18.012836970474968, + "grad_norm": 3.6076812744140625, + "learning_rate": 2.7331621737270005e-05, + "loss": 0.4802, + "step": 14032 + }, + { + "epoch": 18.014120667522466, + "grad_norm": 1.3877369165420532, + "learning_rate": 2.7331193838254173e-05, + "loss": 0.4412, + "step": 14033 + }, + { + "epoch": 18.01540436456996, + "grad_norm": 1.3273931741714478, + "learning_rate": 2.733076593923834e-05, + "loss": 0.512, + "step": 14034 + }, + { + "epoch": 18.016688061617458, + "grad_norm": 1.0804506540298462, + "learning_rate": 2.733033804022251e-05, + "loss": 0.4354, + "step": 14035 + }, + { + "epoch": 18.017971758664956, + "grad_norm": 1.8070884943008423, + "learning_rate": 2.7329910141206675e-05, + "loss": 0.4294, + "step": 14036 + }, + { + "epoch": 18.01925545571245, + "grad_norm": 3.5796737670898438, + "learning_rate": 2.7329482242190844e-05, + "loss": 0.4747, + "step": 14037 + }, + { + "epoch": 18.020539152759948, + "grad_norm": 1.201158881187439, + "learning_rate": 2.7329054343175012e-05, + "loss": 0.4772, + "step": 14038 + }, + { + "epoch": 
18.021822849807446, + "grad_norm": 1.6684951782226562, + "learning_rate": 2.7328626444159177e-05, + "loss": 0.4451, + "step": 14039 + }, + { + "epoch": 18.023106546854944, + "grad_norm": 2.949770927429199, + "learning_rate": 2.732819854514335e-05, + "loss": 0.4242, + "step": 14040 + }, + { + "epoch": 18.024390243902438, + "grad_norm": 1.28084135055542, + "learning_rate": 2.7327770646127514e-05, + "loss": 0.4401, + "step": 14041 + }, + { + "epoch": 18.025673940949936, + "grad_norm": 2.21423077583313, + "learning_rate": 2.7327342747111682e-05, + "loss": 0.4505, + "step": 14042 + }, + { + "epoch": 18.026957637997434, + "grad_norm": 3.110747814178467, + "learning_rate": 2.732691484809585e-05, + "loss": 0.4755, + "step": 14043 + }, + { + "epoch": 18.028241335044928, + "grad_norm": 3.584261178970337, + "learning_rate": 2.7326486949080016e-05, + "loss": 0.4805, + "step": 14044 + }, + { + "epoch": 18.029525032092426, + "grad_norm": 5.298620223999023, + "learning_rate": 2.7326059050064188e-05, + "loss": 0.4679, + "step": 14045 + }, + { + "epoch": 18.030808729139924, + "grad_norm": 1.3190058469772339, + "learning_rate": 2.7325631151048353e-05, + "loss": 0.4558, + "step": 14046 + }, + { + "epoch": 18.03209242618742, + "grad_norm": 2.8808741569519043, + "learning_rate": 2.732520325203252e-05, + "loss": 0.454, + "step": 14047 + }, + { + "epoch": 18.033376123234916, + "grad_norm": 2.9231112003326416, + "learning_rate": 2.732477535301669e-05, + "loss": 0.4656, + "step": 14048 + }, + { + "epoch": 18.034659820282414, + "grad_norm": 1.461242914199829, + "learning_rate": 2.7324347454000858e-05, + "loss": 0.4406, + "step": 14049 + }, + { + "epoch": 18.035943517329912, + "grad_norm": 2.314439535140991, + "learning_rate": 2.7323919554985023e-05, + "loss": 0.5067, + "step": 14050 + }, + { + "epoch": 18.037227214377406, + "grad_norm": 1.9433170557022095, + "learning_rate": 2.732349165596919e-05, + "loss": 0.5346, + "step": 14051 + }, + { + "epoch": 18.038510911424904, + "grad_norm": 
1.842162847518921, + "learning_rate": 2.732306375695336e-05, + "loss": 0.4623, + "step": 14052 + }, + { + "epoch": 18.039794608472402, + "grad_norm": 1.7587884664535522, + "learning_rate": 2.7322635857937528e-05, + "loss": 0.4553, + "step": 14053 + }, + { + "epoch": 18.041078305519896, + "grad_norm": 1.6828949451446533, + "learning_rate": 2.7322207958921697e-05, + "loss": 0.4582, + "step": 14054 + }, + { + "epoch": 18.042362002567394, + "grad_norm": 1.2019747495651245, + "learning_rate": 2.732178005990586e-05, + "loss": 0.4846, + "step": 14055 + }, + { + "epoch": 18.043645699614892, + "grad_norm": 1.446263074874878, + "learning_rate": 2.7321352160890033e-05, + "loss": 0.5218, + "step": 14056 + }, + { + "epoch": 18.044929396662386, + "grad_norm": 1.2102768421173096, + "learning_rate": 2.73209242618742e-05, + "loss": 0.4702, + "step": 14057 + }, + { + "epoch": 18.046213093709884, + "grad_norm": 1.2402245998382568, + "learning_rate": 2.7320496362858363e-05, + "loss": 0.4382, + "step": 14058 + }, + { + "epoch": 18.047496790757382, + "grad_norm": 5.257505416870117, + "learning_rate": 2.7320068463842535e-05, + "loss": 0.4579, + "step": 14059 + }, + { + "epoch": 18.048780487804876, + "grad_norm": 2.130463123321533, + "learning_rate": 2.73196405648267e-05, + "loss": 0.4921, + "step": 14060 + }, + { + "epoch": 18.050064184852374, + "grad_norm": 3.210855007171631, + "learning_rate": 2.7319212665810872e-05, + "loss": 0.4846, + "step": 14061 + }, + { + "epoch": 18.051347881899872, + "grad_norm": 5.8563995361328125, + "learning_rate": 2.7318784766795037e-05, + "loss": 0.5494, + "step": 14062 + }, + { + "epoch": 18.05263157894737, + "grad_norm": 1.2192238569259644, + "learning_rate": 2.7318356867779205e-05, + "loss": 0.4754, + "step": 14063 + }, + { + "epoch": 18.053915275994864, + "grad_norm": 6.568171977996826, + "learning_rate": 2.7317928968763374e-05, + "loss": 0.5523, + "step": 14064 + }, + { + "epoch": 18.055198973042362, + "grad_norm": 6.9883856773376465, + 
"learning_rate": 2.731750106974754e-05, + "loss": 0.5095, + "step": 14065 + }, + { + "epoch": 18.05648267008986, + "grad_norm": 1.4621858596801758, + "learning_rate": 2.7317073170731707e-05, + "loss": 0.4829, + "step": 14066 + }, + { + "epoch": 18.057766367137354, + "grad_norm": 1.686644196510315, + "learning_rate": 2.7316645271715876e-05, + "loss": 0.5207, + "step": 14067 + }, + { + "epoch": 18.059050064184852, + "grad_norm": 1.735351324081421, + "learning_rate": 2.7316217372700044e-05, + "loss": 0.5494, + "step": 14068 + }, + { + "epoch": 18.06033376123235, + "grad_norm": 1.9093918800354004, + "learning_rate": 2.7315789473684213e-05, + "loss": 0.5369, + "step": 14069 + }, + { + "epoch": 18.061617458279844, + "grad_norm": 3.6101865768432617, + "learning_rate": 2.731536157466838e-05, + "loss": 0.4952, + "step": 14070 + }, + { + "epoch": 18.062901155327342, + "grad_norm": 4.339571475982666, + "learning_rate": 2.7314933675652546e-05, + "loss": 0.5351, + "step": 14071 + }, + { + "epoch": 18.06418485237484, + "grad_norm": 5.183016777038574, + "learning_rate": 2.7314505776636714e-05, + "loss": 0.6785, + "step": 14072 + }, + { + "epoch": 18.065468549422338, + "grad_norm": 1.6825629472732544, + "learning_rate": 2.7314077877620883e-05, + "loss": 0.4655, + "step": 14073 + }, + { + "epoch": 18.066752246469832, + "grad_norm": 2.119798183441162, + "learning_rate": 2.7313649978605048e-05, + "loss": 0.4421, + "step": 14074 + }, + { + "epoch": 18.06803594351733, + "grad_norm": 1.0386319160461426, + "learning_rate": 2.731322207958922e-05, + "loss": 0.4449, + "step": 14075 + }, + { + "epoch": 18.069319640564828, + "grad_norm": 2.021864175796509, + "learning_rate": 2.7312794180573385e-05, + "loss": 0.4621, + "step": 14076 + }, + { + "epoch": 18.070603337612322, + "grad_norm": 1.8136141300201416, + "learning_rate": 2.7312366281557556e-05, + "loss": 0.4322, + "step": 14077 + }, + { + "epoch": 18.07188703465982, + "grad_norm": 1.5631331205368042, + "learning_rate": 
2.731193838254172e-05, + "loss": 0.4576, + "step": 14078 + }, + { + "epoch": 18.073170731707318, + "grad_norm": 3.8407576084136963, + "learning_rate": 2.7311510483525887e-05, + "loss": 0.4588, + "step": 14079 + }, + { + "epoch": 18.074454428754812, + "grad_norm": 1.5573019981384277, + "learning_rate": 2.731108258451006e-05, + "loss": 0.4602, + "step": 14080 + }, + { + "epoch": 18.07573812580231, + "grad_norm": 2.0730273723602295, + "learning_rate": 2.7310654685494223e-05, + "loss": 0.4405, + "step": 14081 + }, + { + "epoch": 18.077021822849808, + "grad_norm": 1.267505407333374, + "learning_rate": 2.7310226786478392e-05, + "loss": 0.473, + "step": 14082 + }, + { + "epoch": 18.078305519897306, + "grad_norm": 1.0225801467895508, + "learning_rate": 2.730979888746256e-05, + "loss": 0.4323, + "step": 14083 + }, + { + "epoch": 18.0795892169448, + "grad_norm": 1.200728416442871, + "learning_rate": 2.7309370988446725e-05, + "loss": 0.4462, + "step": 14084 + }, + { + "epoch": 18.080872913992298, + "grad_norm": 1.4003726243972778, + "learning_rate": 2.7308943089430894e-05, + "loss": 0.4786, + "step": 14085 + }, + { + "epoch": 18.082156611039796, + "grad_norm": 1.3838895559310913, + "learning_rate": 2.7308515190415062e-05, + "loss": 0.4441, + "step": 14086 + }, + { + "epoch": 18.08344030808729, + "grad_norm": 1.5741534233093262, + "learning_rate": 2.730808729139923e-05, + "loss": 0.4585, + "step": 14087 + }, + { + "epoch": 18.084724005134788, + "grad_norm": 1.3893027305603027, + "learning_rate": 2.73076593923834e-05, + "loss": 0.4834, + "step": 14088 + }, + { + "epoch": 18.086007702182286, + "grad_norm": 1.475265383720398, + "learning_rate": 2.7307231493367567e-05, + "loss": 0.4763, + "step": 14089 + }, + { + "epoch": 18.08729139922978, + "grad_norm": 0.963260293006897, + "learning_rate": 2.7306803594351732e-05, + "loss": 0.4608, + "step": 14090 + }, + { + "epoch": 18.088575096277278, + "grad_norm": 1.1352486610412598, + "learning_rate": 2.73063756953359e-05, + "loss": 0.429, 
+ "step": 14091 + }, + { + "epoch": 18.089858793324776, + "grad_norm": 2.6197078227996826, + "learning_rate": 2.730594779632007e-05, + "loss": 0.4592, + "step": 14092 + }, + { + "epoch": 18.09114249037227, + "grad_norm": 1.106570839881897, + "learning_rate": 2.7305519897304234e-05, + "loss": 0.4335, + "step": 14093 + }, + { + "epoch": 18.09242618741977, + "grad_norm": 2.34970760345459, + "learning_rate": 2.7305091998288406e-05, + "loss": 0.4881, + "step": 14094 + }, + { + "epoch": 18.093709884467266, + "grad_norm": 2.3486077785491943, + "learning_rate": 2.730466409927257e-05, + "loss": 0.4486, + "step": 14095 + }, + { + "epoch": 18.094993581514764, + "grad_norm": 1.8842120170593262, + "learning_rate": 2.7304236200256743e-05, + "loss": 0.4674, + "step": 14096 + }, + { + "epoch": 18.09627727856226, + "grad_norm": 1.0510737895965576, + "learning_rate": 2.7303808301240908e-05, + "loss": 0.5098, + "step": 14097 + }, + { + "epoch": 18.097560975609756, + "grad_norm": 1.0431796312332153, + "learning_rate": 2.7303380402225073e-05, + "loss": 0.4681, + "step": 14098 + }, + { + "epoch": 18.098844672657254, + "grad_norm": 2.065739870071411, + "learning_rate": 2.7302952503209245e-05, + "loss": 0.4613, + "step": 14099 + }, + { + "epoch": 18.10012836970475, + "grad_norm": 1.6736278533935547, + "learning_rate": 2.730252460419341e-05, + "loss": 0.4936, + "step": 14100 + }, + { + "epoch": 18.101412066752246, + "grad_norm": 3.4038164615631104, + "learning_rate": 2.7302096705177578e-05, + "loss": 0.444, + "step": 14101 + }, + { + "epoch": 18.102695763799744, + "grad_norm": 2.345123529434204, + "learning_rate": 2.7301668806161746e-05, + "loss": 0.4495, + "step": 14102 + }, + { + "epoch": 18.10397946084724, + "grad_norm": 1.895352840423584, + "learning_rate": 2.7301240907145915e-05, + "loss": 0.4365, + "step": 14103 + }, + { + "epoch": 18.105263157894736, + "grad_norm": 2.4811129570007324, + "learning_rate": 2.7300813008130083e-05, + "loss": 0.4734, + "step": 14104 + }, + { + "epoch": 
18.106546854942234, + "grad_norm": 0.9550304412841797, + "learning_rate": 2.7300385109114248e-05, + "loss": 0.4774, + "step": 14105 + }, + { + "epoch": 18.107830551989732, + "grad_norm": 2.116262435913086, + "learning_rate": 2.7299957210098417e-05, + "loss": 0.4549, + "step": 14106 + }, + { + "epoch": 18.109114249037226, + "grad_norm": 0.921739399433136, + "learning_rate": 2.7299529311082585e-05, + "loss": 0.4695, + "step": 14107 + }, + { + "epoch": 18.110397946084724, + "grad_norm": 4.757866382598877, + "learning_rate": 2.7299101412066754e-05, + "loss": 0.4611, + "step": 14108 + }, + { + "epoch": 18.111681643132222, + "grad_norm": 1.5524802207946777, + "learning_rate": 2.729867351305092e-05, + "loss": 0.5001, + "step": 14109 + }, + { + "epoch": 18.112965340179716, + "grad_norm": 2.7861130237579346, + "learning_rate": 2.729824561403509e-05, + "loss": 0.5203, + "step": 14110 + }, + { + "epoch": 18.114249037227214, + "grad_norm": 1.2170555591583252, + "learning_rate": 2.7297817715019255e-05, + "loss": 0.4432, + "step": 14111 + }, + { + "epoch": 18.115532734274712, + "grad_norm": 1.8526830673217773, + "learning_rate": 2.7297389816003424e-05, + "loss": 0.4268, + "step": 14112 + }, + { + "epoch": 18.116816431322206, + "grad_norm": 1.5116075277328491, + "learning_rate": 2.7296961916987592e-05, + "loss": 0.5232, + "step": 14113 + }, + { + "epoch": 18.118100128369704, + "grad_norm": 4.240783214569092, + "learning_rate": 2.7296534017971757e-05, + "loss": 0.5046, + "step": 14114 + }, + { + "epoch": 18.119383825417202, + "grad_norm": 1.7204045057296753, + "learning_rate": 2.729610611895593e-05, + "loss": 0.482, + "step": 14115 + }, + { + "epoch": 18.1206675224647, + "grad_norm": 2.04650616645813, + "learning_rate": 2.7295678219940094e-05, + "loss": 0.5164, + "step": 14116 + }, + { + "epoch": 18.121951219512194, + "grad_norm": 3.7890844345092773, + "learning_rate": 2.7295250320924262e-05, + "loss": 0.4792, + "step": 14117 + }, + { + "epoch": 18.123234916559692, + "grad_norm": 
1.7339067459106445, + "learning_rate": 2.729482242190843e-05, + "loss": 0.5353, + "step": 14118 + }, + { + "epoch": 18.12451861360719, + "grad_norm": 3.057810068130493, + "learning_rate": 2.7294394522892596e-05, + "loss": 0.5272, + "step": 14119 + }, + { + "epoch": 18.125802310654684, + "grad_norm": 3.00612211227417, + "learning_rate": 2.7293966623876768e-05, + "loss": 0.5361, + "step": 14120 + }, + { + "epoch": 18.127086007702182, + "grad_norm": 4.9903411865234375, + "learning_rate": 2.7293538724860933e-05, + "loss": 0.5991, + "step": 14121 + }, + { + "epoch": 18.12836970474968, + "grad_norm": 2.7158758640289307, + "learning_rate": 2.72931108258451e-05, + "loss": 0.7561, + "step": 14122 + }, + { + "epoch": 18.129653401797174, + "grad_norm": 6.674044609069824, + "learning_rate": 2.729268292682927e-05, + "loss": 0.4314, + "step": 14123 + }, + { + "epoch": 18.130937098844672, + "grad_norm": 1.7331101894378662, + "learning_rate": 2.7292255027813438e-05, + "loss": 0.4348, + "step": 14124 + }, + { + "epoch": 18.13222079589217, + "grad_norm": 1.1232256889343262, + "learning_rate": 2.7291827128797603e-05, + "loss": 0.4877, + "step": 14125 + }, + { + "epoch": 18.133504492939664, + "grad_norm": 1.8434653282165527, + "learning_rate": 2.729139922978177e-05, + "loss": 0.4769, + "step": 14126 + }, + { + "epoch": 18.134788189987162, + "grad_norm": 1.3004422187805176, + "learning_rate": 2.729097133076594e-05, + "loss": 0.4648, + "step": 14127 + }, + { + "epoch": 18.13607188703466, + "grad_norm": 2.0836501121520996, + "learning_rate": 2.7290543431750108e-05, + "loss": 0.45, + "step": 14128 + }, + { + "epoch": 18.137355584082158, + "grad_norm": 1.5078105926513672, + "learning_rate": 2.7290115532734277e-05, + "loss": 0.4406, + "step": 14129 + }, + { + "epoch": 18.138639281129652, + "grad_norm": 1.6759917736053467, + "learning_rate": 2.728968763371844e-05, + "loss": 0.4306, + "step": 14130 + }, + { + "epoch": 18.13992297817715, + "grad_norm": 6.367386341094971, + "learning_rate": 
2.7289259734702613e-05, + "loss": 0.4962, + "step": 14131 + }, + { + "epoch": 18.141206675224648, + "grad_norm": 3.035287618637085, + "learning_rate": 2.728883183568678e-05, + "loss": 0.4495, + "step": 14132 + }, + { + "epoch": 18.142490372272142, + "grad_norm": 1.4015740156173706, + "learning_rate": 2.7288403936670943e-05, + "loss": 0.4713, + "step": 14133 + }, + { + "epoch": 18.14377406931964, + "grad_norm": 4.072743892669678, + "learning_rate": 2.7287976037655115e-05, + "loss": 0.4653, + "step": 14134 + }, + { + "epoch": 18.145057766367138, + "grad_norm": 1.2118419408798218, + "learning_rate": 2.728754813863928e-05, + "loss": 0.4231, + "step": 14135 + }, + { + "epoch": 18.146341463414632, + "grad_norm": 2.4106273651123047, + "learning_rate": 2.7287120239623452e-05, + "loss": 0.4116, + "step": 14136 + }, + { + "epoch": 18.14762516046213, + "grad_norm": 4.210010528564453, + "learning_rate": 2.7286692340607617e-05, + "loss": 0.4514, + "step": 14137 + }, + { + "epoch": 18.14890885750963, + "grad_norm": 1.0390112400054932, + "learning_rate": 2.7286264441591786e-05, + "loss": 0.4717, + "step": 14138 + }, + { + "epoch": 18.150192554557126, + "grad_norm": 1.1262580156326294, + "learning_rate": 2.7285836542575954e-05, + "loss": 0.4382, + "step": 14139 + }, + { + "epoch": 18.15147625160462, + "grad_norm": 1.1220901012420654, + "learning_rate": 2.728540864356012e-05, + "loss": 0.4408, + "step": 14140 + }, + { + "epoch": 18.15275994865212, + "grad_norm": 1.5795695781707764, + "learning_rate": 2.7284980744544287e-05, + "loss": 0.4777, + "step": 14141 + }, + { + "epoch": 18.154043645699616, + "grad_norm": 2.7136693000793457, + "learning_rate": 2.7284552845528456e-05, + "loss": 0.4709, + "step": 14142 + }, + { + "epoch": 18.15532734274711, + "grad_norm": 1.1763813495635986, + "learning_rate": 2.7284124946512624e-05, + "loss": 0.5058, + "step": 14143 + }, + { + "epoch": 18.15661103979461, + "grad_norm": 1.5568313598632812, + "learning_rate": 2.7283697047496793e-05, + "loss": 
0.4538, + "step": 14144 + }, + { + "epoch": 18.157894736842106, + "grad_norm": 1.723570704460144, + "learning_rate": 2.7283269148480958e-05, + "loss": 0.4476, + "step": 14145 + }, + { + "epoch": 18.1591784338896, + "grad_norm": 1.6599526405334473, + "learning_rate": 2.7282841249465126e-05, + "loss": 0.4212, + "step": 14146 + }, + { + "epoch": 18.1604621309371, + "grad_norm": 1.3912463188171387, + "learning_rate": 2.7282413350449294e-05, + "loss": 0.4801, + "step": 14147 + }, + { + "epoch": 18.161745827984596, + "grad_norm": 1.0184791088104248, + "learning_rate": 2.7281985451433463e-05, + "loss": 0.4734, + "step": 14148 + }, + { + "epoch": 18.163029525032094, + "grad_norm": 1.1582096815109253, + "learning_rate": 2.7281557552417628e-05, + "loss": 0.4862, + "step": 14149 + }, + { + "epoch": 18.16431322207959, + "grad_norm": 2.468989610671997, + "learning_rate": 2.72811296534018e-05, + "loss": 0.4789, + "step": 14150 + }, + { + "epoch": 18.165596919127086, + "grad_norm": 0.8609746694564819, + "learning_rate": 2.7280701754385965e-05, + "loss": 0.4696, + "step": 14151 + }, + { + "epoch": 18.166880616174584, + "grad_norm": 0.9587302803993225, + "learning_rate": 2.7280273855370133e-05, + "loss": 0.4658, + "step": 14152 + }, + { + "epoch": 18.16816431322208, + "grad_norm": 0.8652838468551636, + "learning_rate": 2.72798459563543e-05, + "loss": 0.5063, + "step": 14153 + }, + { + "epoch": 18.169448010269576, + "grad_norm": 1.7751853466033936, + "learning_rate": 2.7279418057338467e-05, + "loss": 0.4462, + "step": 14154 + }, + { + "epoch": 18.170731707317074, + "grad_norm": 3.4952001571655273, + "learning_rate": 2.727899015832264e-05, + "loss": 0.4574, + "step": 14155 + }, + { + "epoch": 18.17201540436457, + "grad_norm": 1.4758905172348022, + "learning_rate": 2.7278562259306803e-05, + "loss": 0.4707, + "step": 14156 + }, + { + "epoch": 18.173299101412066, + "grad_norm": 3.3388919830322266, + "learning_rate": 2.7278134360290972e-05, + "loss": 0.4709, + "step": 14157 + }, + { + 
"epoch": 18.174582798459564, + "grad_norm": 1.1217784881591797, + "learning_rate": 2.727770646127514e-05, + "loss": 0.4756, + "step": 14158 + }, + { + "epoch": 18.17586649550706, + "grad_norm": 1.5875409841537476, + "learning_rate": 2.7277278562259305e-05, + "loss": 0.4853, + "step": 14159 + }, + { + "epoch": 18.177150192554556, + "grad_norm": 1.5047433376312256, + "learning_rate": 2.7276850663243477e-05, + "loss": 0.5138, + "step": 14160 + }, + { + "epoch": 18.178433889602054, + "grad_norm": 1.3136240243911743, + "learning_rate": 2.7276422764227642e-05, + "loss": 0.5043, + "step": 14161 + }, + { + "epoch": 18.179717586649552, + "grad_norm": 1.4619479179382324, + "learning_rate": 2.727599486521181e-05, + "loss": 0.4889, + "step": 14162 + }, + { + "epoch": 18.181001283697046, + "grad_norm": 1.146456003189087, + "learning_rate": 2.727556696619598e-05, + "loss": 0.495, + "step": 14163 + }, + { + "epoch": 18.182284980744544, + "grad_norm": 2.179999351501465, + "learning_rate": 2.7275139067180147e-05, + "loss": 0.5152, + "step": 14164 + }, + { + "epoch": 18.183568677792042, + "grad_norm": 2.1608216762542725, + "learning_rate": 2.7274711168164312e-05, + "loss": 0.5055, + "step": 14165 + }, + { + "epoch": 18.184852374839537, + "grad_norm": 1.1916069984436035, + "learning_rate": 2.727428326914848e-05, + "loss": 0.452, + "step": 14166 + }, + { + "epoch": 18.186136071887034, + "grad_norm": 1.5822627544403076, + "learning_rate": 2.727385537013265e-05, + "loss": 0.4695, + "step": 14167 + }, + { + "epoch": 18.187419768934532, + "grad_norm": 1.219987392425537, + "learning_rate": 2.7273427471116818e-05, + "loss": 0.4979, + "step": 14168 + }, + { + "epoch": 18.188703465982027, + "grad_norm": 3.2780303955078125, + "learning_rate": 2.7272999572100986e-05, + "loss": 0.5795, + "step": 14169 + }, + { + "epoch": 18.189987163029524, + "grad_norm": 3.467515707015991, + "learning_rate": 2.727257167308515e-05, + "loss": 0.5406, + "step": 14170 + }, + { + "epoch": 18.191270860077022, + 
"grad_norm": 1.4481021165847778, + "learning_rate": 2.7272143774069323e-05, + "loss": 0.6434, + "step": 14171 + }, + { + "epoch": 18.19255455712452, + "grad_norm": 1.7684581279754639, + "learning_rate": 2.7271715875053488e-05, + "loss": 0.6945, + "step": 14172 + }, + { + "epoch": 18.193838254172015, + "grad_norm": 1.5405794382095337, + "learning_rate": 2.7271287976037653e-05, + "loss": 0.4353, + "step": 14173 + }, + { + "epoch": 18.195121951219512, + "grad_norm": 1.1929072141647339, + "learning_rate": 2.7270860077021825e-05, + "loss": 0.4156, + "step": 14174 + }, + { + "epoch": 18.19640564826701, + "grad_norm": 2.1216254234313965, + "learning_rate": 2.727043217800599e-05, + "loss": 0.4744, + "step": 14175 + }, + { + "epoch": 18.197689345314505, + "grad_norm": 1.0300079584121704, + "learning_rate": 2.727000427899016e-05, + "loss": 0.4443, + "step": 14176 + }, + { + "epoch": 18.198973042362002, + "grad_norm": 1.222261905670166, + "learning_rate": 2.7269576379974326e-05, + "loss": 0.4458, + "step": 14177 + }, + { + "epoch": 18.2002567394095, + "grad_norm": 1.2667537927627563, + "learning_rate": 2.7269148480958495e-05, + "loss": 0.496, + "step": 14178 + }, + { + "epoch": 18.201540436456995, + "grad_norm": 1.439029335975647, + "learning_rate": 2.7268720581942663e-05, + "loss": 0.4437, + "step": 14179 + }, + { + "epoch": 18.202824133504492, + "grad_norm": 1.2193924188613892, + "learning_rate": 2.726829268292683e-05, + "loss": 0.4411, + "step": 14180 + }, + { + "epoch": 18.20410783055199, + "grad_norm": 0.8712036609649658, + "learning_rate": 2.7267864783910997e-05, + "loss": 0.452, + "step": 14181 + }, + { + "epoch": 18.205391527599488, + "grad_norm": 1.0266139507293701, + "learning_rate": 2.7267436884895165e-05, + "loss": 0.4585, + "step": 14182 + }, + { + "epoch": 18.206675224646983, + "grad_norm": 2.222729444503784, + "learning_rate": 2.7267008985879334e-05, + "loss": 0.4491, + "step": 14183 + }, + { + "epoch": 18.20795892169448, + "grad_norm": 1.5673325061798096, + 
"learning_rate": 2.7266581086863502e-05, + "loss": 0.4563, + "step": 14184 + }, + { + "epoch": 18.20924261874198, + "grad_norm": 2.678799867630005, + "learning_rate": 2.726615318784767e-05, + "loss": 0.464, + "step": 14185 + }, + { + "epoch": 18.210526315789473, + "grad_norm": 3.0924198627471924, + "learning_rate": 2.7265725288831835e-05, + "loss": 0.4781, + "step": 14186 + }, + { + "epoch": 18.21181001283697, + "grad_norm": 0.9209860563278198, + "learning_rate": 2.7265297389816004e-05, + "loss": 0.4977, + "step": 14187 + }, + { + "epoch": 18.21309370988447, + "grad_norm": 1.5823289155960083, + "learning_rate": 2.7264869490800172e-05, + "loss": 0.4485, + "step": 14188 + }, + { + "epoch": 18.214377406931963, + "grad_norm": 1.5665875673294067, + "learning_rate": 2.7264441591784337e-05, + "loss": 0.4664, + "step": 14189 + }, + { + "epoch": 18.21566110397946, + "grad_norm": 1.19875967502594, + "learning_rate": 2.726401369276851e-05, + "loss": 0.4491, + "step": 14190 + }, + { + "epoch": 18.21694480102696, + "grad_norm": 2.8455257415771484, + "learning_rate": 2.7263585793752674e-05, + "loss": 0.4478, + "step": 14191 + }, + { + "epoch": 18.218228498074453, + "grad_norm": 1.6117726564407349, + "learning_rate": 2.7263157894736846e-05, + "loss": 0.4705, + "step": 14192 + }, + { + "epoch": 18.21951219512195, + "grad_norm": 0.9161317944526672, + "learning_rate": 2.726272999572101e-05, + "loss": 0.4122, + "step": 14193 + }, + { + "epoch": 18.22079589216945, + "grad_norm": 0.9353960156440735, + "learning_rate": 2.7262302096705176e-05, + "loss": 0.5207, + "step": 14194 + }, + { + "epoch": 18.222079589216946, + "grad_norm": 1.2153886556625366, + "learning_rate": 2.7261874197689348e-05, + "loss": 0.5277, + "step": 14195 + }, + { + "epoch": 18.22336328626444, + "grad_norm": 1.039232850074768, + "learning_rate": 2.7261446298673513e-05, + "loss": 0.4365, + "step": 14196 + }, + { + "epoch": 18.22464698331194, + "grad_norm": 1.8717058897018433, + "learning_rate": 2.726101839965768e-05, 
+ "loss": 0.464, + "step": 14197 + }, + { + "epoch": 18.225930680359436, + "grad_norm": 0.8501673340797424, + "learning_rate": 2.726059050064185e-05, + "loss": 0.468, + "step": 14198 + }, + { + "epoch": 18.22721437740693, + "grad_norm": 1.033918857574463, + "learning_rate": 2.7260162601626018e-05, + "loss": 0.4444, + "step": 14199 + }, + { + "epoch": 18.22849807445443, + "grad_norm": 1.26010262966156, + "learning_rate": 2.7259734702610186e-05, + "loss": 0.4413, + "step": 14200 + }, + { + "epoch": 18.229781771501926, + "grad_norm": 1.2890615463256836, + "learning_rate": 2.725930680359435e-05, + "loss": 0.4365, + "step": 14201 + }, + { + "epoch": 18.23106546854942, + "grad_norm": 0.9594338536262512, + "learning_rate": 2.725887890457852e-05, + "loss": 0.4335, + "step": 14202 + }, + { + "epoch": 18.23234916559692, + "grad_norm": 3.413005828857422, + "learning_rate": 2.7258451005562688e-05, + "loss": 0.4663, + "step": 14203 + }, + { + "epoch": 18.233632862644416, + "grad_norm": 1.5332673788070679, + "learning_rate": 2.7258023106546857e-05, + "loss": 0.4777, + "step": 14204 + }, + { + "epoch": 18.234916559691914, + "grad_norm": 1.0344107151031494, + "learning_rate": 2.725759520753102e-05, + "loss": 0.4782, + "step": 14205 + }, + { + "epoch": 18.23620025673941, + "grad_norm": 1.3838249444961548, + "learning_rate": 2.725716730851519e-05, + "loss": 0.4989, + "step": 14206 + }, + { + "epoch": 18.237483953786906, + "grad_norm": 0.8925936818122864, + "learning_rate": 2.725673940949936e-05, + "loss": 0.4552, + "step": 14207 + }, + { + "epoch": 18.238767650834404, + "grad_norm": 1.2563204765319824, + "learning_rate": 2.7256311510483527e-05, + "loss": 0.4723, + "step": 14208 + }, + { + "epoch": 18.2400513478819, + "grad_norm": 1.5910981893539429, + "learning_rate": 2.7255883611467695e-05, + "loss": 0.4569, + "step": 14209 + }, + { + "epoch": 18.241335044929397, + "grad_norm": 1.19707453250885, + "learning_rate": 2.725545571245186e-05, + "loss": 0.4317, + "step": 14210 + }, + { + 
"epoch": 18.242618741976894, + "grad_norm": 0.9606711268424988, + "learning_rate": 2.7255027813436032e-05, + "loss": 0.4876, + "step": 14211 + }, + { + "epoch": 18.24390243902439, + "grad_norm": 1.283495306968689, + "learning_rate": 2.7254599914420197e-05, + "loss": 0.485, + "step": 14212 + }, + { + "epoch": 18.245186136071887, + "grad_norm": 1.3208556175231934, + "learning_rate": 2.7254172015404362e-05, + "loss": 0.4635, + "step": 14213 + }, + { + "epoch": 18.246469833119384, + "grad_norm": 1.4875141382217407, + "learning_rate": 2.7253744116388534e-05, + "loss": 0.426, + "step": 14214 + }, + { + "epoch": 18.247753530166882, + "grad_norm": 3.0759873390197754, + "learning_rate": 2.72533162173727e-05, + "loss": 0.495, + "step": 14215 + }, + { + "epoch": 18.249037227214377, + "grad_norm": 1.5430001020431519, + "learning_rate": 2.725288831835687e-05, + "loss": 0.5034, + "step": 14216 + }, + { + "epoch": 18.250320924261874, + "grad_norm": 1.316166639328003, + "learning_rate": 2.7252460419341036e-05, + "loss": 0.473, + "step": 14217 + }, + { + "epoch": 18.251604621309372, + "grad_norm": 1.0278081893920898, + "learning_rate": 2.7252032520325204e-05, + "loss": 0.5022, + "step": 14218 + }, + { + "epoch": 18.252888318356867, + "grad_norm": 17.630699157714844, + "learning_rate": 2.7251604621309373e-05, + "loss": 0.495, + "step": 14219 + }, + { + "epoch": 18.254172015404365, + "grad_norm": 1.7411372661590576, + "learning_rate": 2.7251176722293538e-05, + "loss": 0.6035, + "step": 14220 + }, + { + "epoch": 18.255455712451862, + "grad_norm": 1.6151738166809082, + "learning_rate": 2.7250748823277706e-05, + "loss": 0.5314, + "step": 14221 + }, + { + "epoch": 18.256739409499357, + "grad_norm": 2.569263458251953, + "learning_rate": 2.7250320924261875e-05, + "loss": 0.7046, + "step": 14222 + }, + { + "epoch": 18.258023106546855, + "grad_norm": 1.6981632709503174, + "learning_rate": 2.7249893025246043e-05, + "loss": 0.4291, + "step": 14223 + }, + { + "epoch": 18.259306803594352, + 
"grad_norm": 0.8434416651725769, + "learning_rate": 2.724946512623021e-05, + "loss": 0.4661, + "step": 14224 + }, + { + "epoch": 18.260590500641847, + "grad_norm": 1.2524415254592896, + "learning_rate": 2.724903722721438e-05, + "loss": 0.434, + "step": 14225 + }, + { + "epoch": 18.261874197689345, + "grad_norm": 1.0140442848205566, + "learning_rate": 2.7248609328198545e-05, + "loss": 0.4301, + "step": 14226 + }, + { + "epoch": 18.263157894736842, + "grad_norm": 1.0937976837158203, + "learning_rate": 2.7248181429182713e-05, + "loss": 0.4664, + "step": 14227 + }, + { + "epoch": 18.26444159178434, + "grad_norm": 0.8580169081687927, + "learning_rate": 2.724775353016688e-05, + "loss": 0.3932, + "step": 14228 + }, + { + "epoch": 18.265725288831835, + "grad_norm": 1.2015023231506348, + "learning_rate": 2.7247325631151047e-05, + "loss": 0.4489, + "step": 14229 + }, + { + "epoch": 18.267008985879333, + "grad_norm": 2.187357187271118, + "learning_rate": 2.724689773213522e-05, + "loss": 0.4332, + "step": 14230 + }, + { + "epoch": 18.26829268292683, + "grad_norm": 14.280938148498535, + "learning_rate": 2.7246469833119383e-05, + "loss": 0.4782, + "step": 14231 + }, + { + "epoch": 18.269576379974325, + "grad_norm": 1.0585665702819824, + "learning_rate": 2.7246041934103555e-05, + "loss": 0.4714, + "step": 14232 + }, + { + "epoch": 18.270860077021823, + "grad_norm": 1.4974135160446167, + "learning_rate": 2.724561403508772e-05, + "loss": 0.4401, + "step": 14233 + }, + { + "epoch": 18.27214377406932, + "grad_norm": 1.7614511251449585, + "learning_rate": 2.7245186136071885e-05, + "loss": 0.4748, + "step": 14234 + }, + { + "epoch": 18.273427471116815, + "grad_norm": 2.3397750854492188, + "learning_rate": 2.7244758237056057e-05, + "loss": 0.4435, + "step": 14235 + }, + { + "epoch": 18.274711168164313, + "grad_norm": 1.5718824863433838, + "learning_rate": 2.7244330338040222e-05, + "loss": 0.4639, + "step": 14236 + }, + { + "epoch": 18.27599486521181, + "grad_norm": 1.9499825239181519, + 
"learning_rate": 2.724390243902439e-05, + "loss": 0.4695, + "step": 14237 + }, + { + "epoch": 18.27727856225931, + "grad_norm": 1.1152589321136475, + "learning_rate": 2.724347454000856e-05, + "loss": 0.4613, + "step": 14238 + }, + { + "epoch": 18.278562259306803, + "grad_norm": 0.9084228277206421, + "learning_rate": 2.7243046640992727e-05, + "loss": 0.4459, + "step": 14239 + }, + { + "epoch": 18.2798459563543, + "grad_norm": 2.8250184059143066, + "learning_rate": 2.7242618741976896e-05, + "loss": 0.4379, + "step": 14240 + }, + { + "epoch": 18.2811296534018, + "grad_norm": 1.3366148471832275, + "learning_rate": 2.724219084296106e-05, + "loss": 0.4674, + "step": 14241 + }, + { + "epoch": 18.282413350449293, + "grad_norm": 1.130552887916565, + "learning_rate": 2.724176294394523e-05, + "loss": 0.4536, + "step": 14242 + }, + { + "epoch": 18.28369704749679, + "grad_norm": 1.4152454137802124, + "learning_rate": 2.7241335044929398e-05, + "loss": 0.4793, + "step": 14243 + }, + { + "epoch": 18.28498074454429, + "grad_norm": 2.325019359588623, + "learning_rate": 2.7240907145913566e-05, + "loss": 0.462, + "step": 14244 + }, + { + "epoch": 18.286264441591783, + "grad_norm": 1.7145544290542603, + "learning_rate": 2.724047924689773e-05, + "loss": 0.4817, + "step": 14245 + }, + { + "epoch": 18.28754813863928, + "grad_norm": 2.107933759689331, + "learning_rate": 2.7240051347881903e-05, + "loss": 0.4638, + "step": 14246 + }, + { + "epoch": 18.28883183568678, + "grad_norm": 2.033933401107788, + "learning_rate": 2.7239623448866068e-05, + "loss": 0.4641, + "step": 14247 + }, + { + "epoch": 18.290115532734276, + "grad_norm": 3.084332227706909, + "learning_rate": 2.7239195549850236e-05, + "loss": 0.472, + "step": 14248 + }, + { + "epoch": 18.29139922978177, + "grad_norm": 2.1301777362823486, + "learning_rate": 2.7238767650834405e-05, + "loss": 0.4583, + "step": 14249 + }, + { + "epoch": 18.29268292682927, + "grad_norm": 1.5811444520950317, + "learning_rate": 2.723833975181857e-05, + 
"loss": 0.4201, + "step": 14250 + }, + { + "epoch": 18.293966623876766, + "grad_norm": 2.1427459716796875, + "learning_rate": 2.723791185280274e-05, + "loss": 0.4737, + "step": 14251 + }, + { + "epoch": 18.29525032092426, + "grad_norm": 1.2041749954223633, + "learning_rate": 2.7237483953786907e-05, + "loss": 0.4805, + "step": 14252 + }, + { + "epoch": 18.29653401797176, + "grad_norm": 1.619282841682434, + "learning_rate": 2.7237056054771075e-05, + "loss": 0.4842, + "step": 14253 + }, + { + "epoch": 18.297817715019256, + "grad_norm": 0.8695156574249268, + "learning_rate": 2.7236628155755243e-05, + "loss": 0.446, + "step": 14254 + }, + { + "epoch": 18.29910141206675, + "grad_norm": 1.0429450273513794, + "learning_rate": 2.723620025673941e-05, + "loss": 0.4959, + "step": 14255 + }, + { + "epoch": 18.30038510911425, + "grad_norm": 3.9179329872131348, + "learning_rate": 2.723577235772358e-05, + "loss": 0.4387, + "step": 14256 + }, + { + "epoch": 18.301668806161747, + "grad_norm": 4.512699127197266, + "learning_rate": 2.7235344458707745e-05, + "loss": 0.5069, + "step": 14257 + }, + { + "epoch": 18.30295250320924, + "grad_norm": 1.2757006883621216, + "learning_rate": 2.7234916559691914e-05, + "loss": 0.4763, + "step": 14258 + }, + { + "epoch": 18.30423620025674, + "grad_norm": 1.2715579271316528, + "learning_rate": 2.7234488660676082e-05, + "loss": 0.4698, + "step": 14259 + }, + { + "epoch": 18.305519897304237, + "grad_norm": 4.067446708679199, + "learning_rate": 2.723406076166025e-05, + "loss": 0.5126, + "step": 14260 + }, + { + "epoch": 18.306803594351734, + "grad_norm": 1.6785352230072021, + "learning_rate": 2.7233632862644415e-05, + "loss": 0.4643, + "step": 14261 + }, + { + "epoch": 18.30808729139923, + "grad_norm": 0.9885120391845703, + "learning_rate": 2.7233204963628584e-05, + "loss": 0.4917, + "step": 14262 + }, + { + "epoch": 18.309370988446727, + "grad_norm": 1.1657917499542236, + "learning_rate": 2.7232777064612752e-05, + "loss": 0.5675, + "step": 14263 + }, + 
{ + "epoch": 18.310654685494224, + "grad_norm": 2.7802257537841797, + "learning_rate": 2.723234916559692e-05, + "loss": 0.4824, + "step": 14264 + }, + { + "epoch": 18.31193838254172, + "grad_norm": 3.1520884037017822, + "learning_rate": 2.723192126658109e-05, + "loss": 0.4614, + "step": 14265 + }, + { + "epoch": 18.313222079589217, + "grad_norm": 1.2042880058288574, + "learning_rate": 2.7231493367565254e-05, + "loss": 0.4612, + "step": 14266 + }, + { + "epoch": 18.314505776636715, + "grad_norm": 1.54746675491333, + "learning_rate": 2.7231065468549423e-05, + "loss": 0.5234, + "step": 14267 + }, + { + "epoch": 18.31578947368421, + "grad_norm": 1.9131869077682495, + "learning_rate": 2.723063756953359e-05, + "loss": 0.5369, + "step": 14268 + }, + { + "epoch": 18.317073170731707, + "grad_norm": 2.7140023708343506, + "learning_rate": 2.7230209670517756e-05, + "loss": 0.5409, + "step": 14269 + }, + { + "epoch": 18.318356867779205, + "grad_norm": 1.2280527353286743, + "learning_rate": 2.7229781771501928e-05, + "loss": 0.558, + "step": 14270 + }, + { + "epoch": 18.319640564826702, + "grad_norm": 1.6089612245559692, + "learning_rate": 2.7229353872486093e-05, + "loss": 0.5751, + "step": 14271 + }, + { + "epoch": 18.320924261874197, + "grad_norm": 3.7125067710876465, + "learning_rate": 2.7228925973470265e-05, + "loss": 0.6449, + "step": 14272 + }, + { + "epoch": 18.322207958921695, + "grad_norm": 1.8111283779144287, + "learning_rate": 2.722849807445443e-05, + "loss": 0.4123, + "step": 14273 + }, + { + "epoch": 18.323491655969192, + "grad_norm": 2.094712495803833, + "learning_rate": 2.7228070175438595e-05, + "loss": 0.4407, + "step": 14274 + }, + { + "epoch": 18.324775353016687, + "grad_norm": 2.0076093673706055, + "learning_rate": 2.7227642276422766e-05, + "loss": 0.4613, + "step": 14275 + }, + { + "epoch": 18.326059050064185, + "grad_norm": 1.796096682548523, + "learning_rate": 2.722721437740693e-05, + "loss": 0.4776, + "step": 14276 + }, + { + "epoch": 18.327342747111683, + 
"grad_norm": 2.4061877727508545, + "learning_rate": 2.72267864783911e-05, + "loss": 0.4661, + "step": 14277 + }, + { + "epoch": 18.328626444159177, + "grad_norm": 1.5894122123718262, + "learning_rate": 2.722635857937527e-05, + "loss": 0.4551, + "step": 14278 + }, + { + "epoch": 18.329910141206675, + "grad_norm": 1.4553773403167725, + "learning_rate": 2.7225930680359437e-05, + "loss": 0.4955, + "step": 14279 + }, + { + "epoch": 18.331193838254173, + "grad_norm": 5.5045928955078125, + "learning_rate": 2.7225502781343605e-05, + "loss": 0.4856, + "step": 14280 + }, + { + "epoch": 18.33247753530167, + "grad_norm": 1.5602785348892212, + "learning_rate": 2.722507488232777e-05, + "loss": 0.4395, + "step": 14281 + }, + { + "epoch": 18.333761232349165, + "grad_norm": 1.5358810424804688, + "learning_rate": 2.722464698331194e-05, + "loss": 0.4778, + "step": 14282 + }, + { + "epoch": 18.335044929396663, + "grad_norm": 2.94325852394104, + "learning_rate": 2.7224219084296107e-05, + "loss": 0.5109, + "step": 14283 + }, + { + "epoch": 18.33632862644416, + "grad_norm": 1.298902153968811, + "learning_rate": 2.7223791185280275e-05, + "loss": 0.4923, + "step": 14284 + }, + { + "epoch": 18.337612323491655, + "grad_norm": 0.918603241443634, + "learning_rate": 2.722336328626444e-05, + "loss": 0.4118, + "step": 14285 + }, + { + "epoch": 18.338896020539153, + "grad_norm": 0.993306040763855, + "learning_rate": 2.7222935387248612e-05, + "loss": 0.4634, + "step": 14286 + }, + { + "epoch": 18.34017971758665, + "grad_norm": 1.1824828386306763, + "learning_rate": 2.7222507488232777e-05, + "loss": 0.442, + "step": 14287 + }, + { + "epoch": 18.341463414634145, + "grad_norm": 1.3913394212722778, + "learning_rate": 2.7222079589216942e-05, + "loss": 0.4441, + "step": 14288 + }, + { + "epoch": 18.342747111681643, + "grad_norm": 1.3539866209030151, + "learning_rate": 2.7221651690201114e-05, + "loss": 0.4509, + "step": 14289 + }, + { + "epoch": 18.34403080872914, + "grad_norm": 1.5806307792663574, + 
"learning_rate": 2.722122379118528e-05, + "loss": 0.478, + "step": 14290 + }, + { + "epoch": 18.345314505776635, + "grad_norm": 3.6989169120788574, + "learning_rate": 2.722079589216945e-05, + "loss": 0.4134, + "step": 14291 + }, + { + "epoch": 18.346598202824133, + "grad_norm": 1.5716050863265991, + "learning_rate": 2.7220367993153616e-05, + "loss": 0.4168, + "step": 14292 + }, + { + "epoch": 18.34788189987163, + "grad_norm": 1.984475016593933, + "learning_rate": 2.7219940094137784e-05, + "loss": 0.4156, + "step": 14293 + }, + { + "epoch": 18.34916559691913, + "grad_norm": 2.0752828121185303, + "learning_rate": 2.7219512195121953e-05, + "loss": 0.4471, + "step": 14294 + }, + { + "epoch": 18.350449293966623, + "grad_norm": 3.348296642303467, + "learning_rate": 2.7219084296106118e-05, + "loss": 0.4644, + "step": 14295 + }, + { + "epoch": 18.35173299101412, + "grad_norm": 1.6429173946380615, + "learning_rate": 2.7218656397090286e-05, + "loss": 0.4247, + "step": 14296 + }, + { + "epoch": 18.35301668806162, + "grad_norm": 2.4556446075439453, + "learning_rate": 2.7218228498074455e-05, + "loss": 0.501, + "step": 14297 + }, + { + "epoch": 18.354300385109113, + "grad_norm": 1.4766185283660889, + "learning_rate": 2.7217800599058623e-05, + "loss": 0.4673, + "step": 14298 + }, + { + "epoch": 18.35558408215661, + "grad_norm": 2.039224147796631, + "learning_rate": 2.721737270004279e-05, + "loss": 0.4205, + "step": 14299 + }, + { + "epoch": 18.35686777920411, + "grad_norm": 4.649758338928223, + "learning_rate": 2.721694480102696e-05, + "loss": 0.4632, + "step": 14300 + }, + { + "epoch": 18.358151476251603, + "grad_norm": 1.324184536933899, + "learning_rate": 2.7216516902011125e-05, + "loss": 0.4664, + "step": 14301 + }, + { + "epoch": 18.3594351732991, + "grad_norm": 4.075526237487793, + "learning_rate": 2.7216089002995293e-05, + "loss": 0.5025, + "step": 14302 + }, + { + "epoch": 18.3607188703466, + "grad_norm": 1.5091943740844727, + "learning_rate": 2.721566110397946e-05, + 
"loss": 0.4953, + "step": 14303 + }, + { + "epoch": 18.362002567394097, + "grad_norm": 1.2795536518096924, + "learning_rate": 2.7215233204963627e-05, + "loss": 0.4959, + "step": 14304 + }, + { + "epoch": 18.36328626444159, + "grad_norm": 4.201032638549805, + "learning_rate": 2.72148053059478e-05, + "loss": 0.4627, + "step": 14305 + }, + { + "epoch": 18.36456996148909, + "grad_norm": 2.0412228107452393, + "learning_rate": 2.7214377406931964e-05, + "loss": 0.4516, + "step": 14306 + }, + { + "epoch": 18.365853658536587, + "grad_norm": 1.311631679534912, + "learning_rate": 2.7213949507916135e-05, + "loss": 0.5041, + "step": 14307 + }, + { + "epoch": 18.36713735558408, + "grad_norm": 2.0023279190063477, + "learning_rate": 2.72135216089003e-05, + "loss": 0.4615, + "step": 14308 + }, + { + "epoch": 18.36842105263158, + "grad_norm": 2.080775260925293, + "learning_rate": 2.7213093709884465e-05, + "loss": 0.4765, + "step": 14309 + }, + { + "epoch": 18.369704749679077, + "grad_norm": 1.3034532070159912, + "learning_rate": 2.7212665810868637e-05, + "loss": 0.4558, + "step": 14310 + }, + { + "epoch": 18.37098844672657, + "grad_norm": 1.4001145362854004, + "learning_rate": 2.7212237911852802e-05, + "loss": 0.4584, + "step": 14311 + }, + { + "epoch": 18.37227214377407, + "grad_norm": 4.105370044708252, + "learning_rate": 2.721181001283697e-05, + "loss": 0.4803, + "step": 14312 + }, + { + "epoch": 18.373555840821567, + "grad_norm": 1.9198009967803955, + "learning_rate": 2.721138211382114e-05, + "loss": 0.5011, + "step": 14313 + }, + { + "epoch": 18.374839537869065, + "grad_norm": 2.674912691116333, + "learning_rate": 2.7210954214805307e-05, + "loss": 0.5286, + "step": 14314 + }, + { + "epoch": 18.37612323491656, + "grad_norm": 4.564897537231445, + "learning_rate": 2.7210526315789476e-05, + "loss": 0.4742, + "step": 14315 + }, + { + "epoch": 18.377406931964057, + "grad_norm": 4.2377543449401855, + "learning_rate": 2.721009841677364e-05, + "loss": 0.4599, + "step": 14316 + }, + { + 
"epoch": 18.378690629011555, + "grad_norm": 2.788019895553589, + "learning_rate": 2.720967051775781e-05, + "loss": 0.5432, + "step": 14317 + }, + { + "epoch": 18.37997432605905, + "grad_norm": 2.5820770263671875, + "learning_rate": 2.7209242618741978e-05, + "loss": 0.6275, + "step": 14318 + }, + { + "epoch": 18.381258023106547, + "grad_norm": 1.7898918390274048, + "learning_rate": 2.7208814719726146e-05, + "loss": 0.5318, + "step": 14319 + }, + { + "epoch": 18.382541720154045, + "grad_norm": 2.892651081085205, + "learning_rate": 2.720838682071031e-05, + "loss": 0.5902, + "step": 14320 + }, + { + "epoch": 18.38382541720154, + "grad_norm": 6.78886079788208, + "learning_rate": 2.7207958921694483e-05, + "loss": 0.5829, + "step": 14321 + }, + { + "epoch": 18.385109114249037, + "grad_norm": 2.394763708114624, + "learning_rate": 2.7207531022678648e-05, + "loss": 0.6893, + "step": 14322 + }, + { + "epoch": 18.386392811296535, + "grad_norm": 1.6416538953781128, + "learning_rate": 2.7207103123662816e-05, + "loss": 0.4435, + "step": 14323 + }, + { + "epoch": 18.387676508344033, + "grad_norm": 1.5578231811523438, + "learning_rate": 2.7206675224646985e-05, + "loss": 0.4374, + "step": 14324 + }, + { + "epoch": 18.388960205391527, + "grad_norm": 2.1818301677703857, + "learning_rate": 2.720624732563115e-05, + "loss": 0.4405, + "step": 14325 + }, + { + "epoch": 18.390243902439025, + "grad_norm": 1.8648860454559326, + "learning_rate": 2.720581942661532e-05, + "loss": 0.4838, + "step": 14326 + }, + { + "epoch": 18.391527599486523, + "grad_norm": 1.5870556831359863, + "learning_rate": 2.7205391527599487e-05, + "loss": 0.4647, + "step": 14327 + }, + { + "epoch": 18.392811296534017, + "grad_norm": 1.564229965209961, + "learning_rate": 2.7204963628583655e-05, + "loss": 0.4653, + "step": 14328 + }, + { + "epoch": 18.394094993581515, + "grad_norm": 1.8285596370697021, + "learning_rate": 2.7204535729567823e-05, + "loss": 0.4762, + "step": 14329 + }, + { + "epoch": 18.395378690629013, + 
"grad_norm": 1.6215194463729858, + "learning_rate": 2.720410783055199e-05, + "loss": 0.468, + "step": 14330 + }, + { + "epoch": 18.396662387676507, + "grad_norm": 1.8158643245697021, + "learning_rate": 2.720367993153616e-05, + "loss": 0.4874, + "step": 14331 + }, + { + "epoch": 18.397946084724005, + "grad_norm": 1.8584799766540527, + "learning_rate": 2.7203252032520325e-05, + "loss": 0.4873, + "step": 14332 + }, + { + "epoch": 18.399229781771503, + "grad_norm": 1.5062977075576782, + "learning_rate": 2.7202824133504494e-05, + "loss": 0.4615, + "step": 14333 + }, + { + "epoch": 18.400513478818997, + "grad_norm": 4.5770158767700195, + "learning_rate": 2.7202396234488662e-05, + "loss": 0.4482, + "step": 14334 + }, + { + "epoch": 18.401797175866495, + "grad_norm": 1.6415456533432007, + "learning_rate": 2.7201968335472827e-05, + "loss": 0.4407, + "step": 14335 + }, + { + "epoch": 18.403080872913993, + "grad_norm": 1.3460309505462646, + "learning_rate": 2.7201540436456996e-05, + "loss": 0.5006, + "step": 14336 + }, + { + "epoch": 18.40436456996149, + "grad_norm": 3.4575510025024414, + "learning_rate": 2.7201112537441164e-05, + "loss": 0.4509, + "step": 14337 + }, + { + "epoch": 18.405648267008985, + "grad_norm": 0.9046427011489868, + "learning_rate": 2.7200684638425332e-05, + "loss": 0.4617, + "step": 14338 + }, + { + "epoch": 18.406931964056483, + "grad_norm": 2.4036691188812256, + "learning_rate": 2.72002567394095e-05, + "loss": 0.4641, + "step": 14339 + }, + { + "epoch": 18.40821566110398, + "grad_norm": 5.322656154632568, + "learning_rate": 2.719982884039367e-05, + "loss": 0.5019, + "step": 14340 + }, + { + "epoch": 18.409499358151475, + "grad_norm": 1.0421240329742432, + "learning_rate": 2.7199400941377834e-05, + "loss": 0.5097, + "step": 14341 + }, + { + "epoch": 18.410783055198973, + "grad_norm": 3.5532803535461426, + "learning_rate": 2.7198973042362003e-05, + "loss": 0.4458, + "step": 14342 + }, + { + "epoch": 18.41206675224647, + "grad_norm": 1.2017767429351807, 
+ "learning_rate": 2.719854514334617e-05, + "loss": 0.4469, + "step": 14343 + }, + { + "epoch": 18.413350449293965, + "grad_norm": 1.2735846042633057, + "learning_rate": 2.7198117244330336e-05, + "loss": 0.4474, + "step": 14344 + }, + { + "epoch": 18.414634146341463, + "grad_norm": 6.121431350708008, + "learning_rate": 2.7197689345314508e-05, + "loss": 0.4884, + "step": 14345 + }, + { + "epoch": 18.41591784338896, + "grad_norm": 1.365052580833435, + "learning_rate": 2.7197261446298673e-05, + "loss": 0.4295, + "step": 14346 + }, + { + "epoch": 18.41720154043646, + "grad_norm": 1.5450782775878906, + "learning_rate": 2.7196833547282845e-05, + "loss": 0.4503, + "step": 14347 + }, + { + "epoch": 18.418485237483953, + "grad_norm": 2.6666746139526367, + "learning_rate": 2.719640564826701e-05, + "loss": 0.4591, + "step": 14348 + }, + { + "epoch": 18.41976893453145, + "grad_norm": 2.045609712600708, + "learning_rate": 2.7195977749251175e-05, + "loss": 0.4385, + "step": 14349 + }, + { + "epoch": 18.42105263157895, + "grad_norm": 3.7175424098968506, + "learning_rate": 2.7195549850235347e-05, + "loss": 0.4466, + "step": 14350 + }, + { + "epoch": 18.422336328626443, + "grad_norm": 1.9150588512420654, + "learning_rate": 2.719512195121951e-05, + "loss": 0.4517, + "step": 14351 + }, + { + "epoch": 18.42362002567394, + "grad_norm": 1.659748911857605, + "learning_rate": 2.719469405220368e-05, + "loss": 0.4569, + "step": 14352 + }, + { + "epoch": 18.42490372272144, + "grad_norm": 3.5091822147369385, + "learning_rate": 2.719426615318785e-05, + "loss": 0.4336, + "step": 14353 + }, + { + "epoch": 18.426187419768933, + "grad_norm": 1.6357526779174805, + "learning_rate": 2.7193838254172017e-05, + "loss": 0.4655, + "step": 14354 + }, + { + "epoch": 18.42747111681643, + "grad_norm": 2.0718398094177246, + "learning_rate": 2.7193410355156185e-05, + "loss": 0.4555, + "step": 14355 + }, + { + "epoch": 18.42875481386393, + "grad_norm": 1.6521514654159546, + "learning_rate": 
2.719298245614035e-05, + "loss": 0.461, + "step": 14356 + }, + { + "epoch": 18.430038510911427, + "grad_norm": 1.4223660230636597, + "learning_rate": 2.719255455712452e-05, + "loss": 0.4382, + "step": 14357 + }, + { + "epoch": 18.43132220795892, + "grad_norm": 8.378984451293945, + "learning_rate": 2.7192126658108687e-05, + "loss": 0.4778, + "step": 14358 + }, + { + "epoch": 18.43260590500642, + "grad_norm": 1.6532307863235474, + "learning_rate": 2.7191698759092855e-05, + "loss": 0.4804, + "step": 14359 + }, + { + "epoch": 18.433889602053917, + "grad_norm": 2.055291175842285, + "learning_rate": 2.719127086007702e-05, + "loss": 0.5129, + "step": 14360 + }, + { + "epoch": 18.43517329910141, + "grad_norm": 1.2613681554794312, + "learning_rate": 2.7190842961061192e-05, + "loss": 0.4552, + "step": 14361 + }, + { + "epoch": 18.43645699614891, + "grad_norm": 1.902395248413086, + "learning_rate": 2.7190415062045357e-05, + "loss": 0.4673, + "step": 14362 + }, + { + "epoch": 18.437740693196407, + "grad_norm": 1.4180023670196533, + "learning_rate": 2.7189987163029526e-05, + "loss": 0.5291, + "step": 14363 + }, + { + "epoch": 18.4390243902439, + "grad_norm": 1.6488420963287354, + "learning_rate": 2.7189559264013694e-05, + "loss": 0.5086, + "step": 14364 + }, + { + "epoch": 18.4403080872914, + "grad_norm": 1.2959429025650024, + "learning_rate": 2.718913136499786e-05, + "loss": 0.4652, + "step": 14365 + }, + { + "epoch": 18.441591784338897, + "grad_norm": 1.3555574417114258, + "learning_rate": 2.718870346598203e-05, + "loss": 0.581, + "step": 14366 + }, + { + "epoch": 18.44287548138639, + "grad_norm": 2.200937032699585, + "learning_rate": 2.7188275566966196e-05, + "loss": 0.5027, + "step": 14367 + }, + { + "epoch": 18.44415917843389, + "grad_norm": 2.036929130554199, + "learning_rate": 2.7187847667950364e-05, + "loss": 0.4841, + "step": 14368 + }, + { + "epoch": 18.445442875481387, + "grad_norm": 3.468675136566162, + "learning_rate": 2.7187419768934533e-05, + "loss": 0.5422, + 
"step": 14369 + }, + { + "epoch": 18.446726572528885, + "grad_norm": 4.244406223297119, + "learning_rate": 2.7186991869918698e-05, + "loss": 0.5461, + "step": 14370 + }, + { + "epoch": 18.44801026957638, + "grad_norm": 1.1355717182159424, + "learning_rate": 2.718656397090287e-05, + "loss": 0.5767, + "step": 14371 + }, + { + "epoch": 18.449293966623877, + "grad_norm": 2.485356569290161, + "learning_rate": 2.7186136071887035e-05, + "loss": 0.6591, + "step": 14372 + }, + { + "epoch": 18.450577663671375, + "grad_norm": 1.3114367723464966, + "learning_rate": 2.7185708172871203e-05, + "loss": 0.4512, + "step": 14373 + }, + { + "epoch": 18.45186136071887, + "grad_norm": 1.175136685371399, + "learning_rate": 2.718528027385537e-05, + "loss": 0.4285, + "step": 14374 + }, + { + "epoch": 18.453145057766367, + "grad_norm": 2.0970513820648193, + "learning_rate": 2.718485237483954e-05, + "loss": 0.4513, + "step": 14375 + }, + { + "epoch": 18.454428754813865, + "grad_norm": 3.7275824546813965, + "learning_rate": 2.7184424475823705e-05, + "loss": 0.4853, + "step": 14376 + }, + { + "epoch": 18.45571245186136, + "grad_norm": 2.6685895919799805, + "learning_rate": 2.7183996576807873e-05, + "loss": 0.4873, + "step": 14377 + }, + { + "epoch": 18.456996148908857, + "grad_norm": 2.589540719985962, + "learning_rate": 2.7183568677792042e-05, + "loss": 0.4592, + "step": 14378 + }, + { + "epoch": 18.458279845956355, + "grad_norm": 1.2683513164520264, + "learning_rate": 2.718314077877621e-05, + "loss": 0.4912, + "step": 14379 + }, + { + "epoch": 18.459563543003853, + "grad_norm": 1.1113626956939697, + "learning_rate": 2.718271287976038e-05, + "loss": 0.4191, + "step": 14380 + }, + { + "epoch": 18.460847240051347, + "grad_norm": 1.2892404794692993, + "learning_rate": 2.7182284980744544e-05, + "loss": 0.4582, + "step": 14381 + }, + { + "epoch": 18.462130937098845, + "grad_norm": 2.0531976222991943, + "learning_rate": 2.7181857081728715e-05, + "loss": 0.4779, + "step": 14382 + }, + { + "epoch": 
18.463414634146343, + "grad_norm": 2.004587173461914, + "learning_rate": 2.718142918271288e-05, + "loss": 0.439, + "step": 14383 + }, + { + "epoch": 18.464698331193837, + "grad_norm": 1.4259406328201294, + "learning_rate": 2.7181001283697045e-05, + "loss": 0.5215, + "step": 14384 + }, + { + "epoch": 18.465982028241335, + "grad_norm": 1.8227639198303223, + "learning_rate": 2.7180573384681217e-05, + "loss": 0.4591, + "step": 14385 + }, + { + "epoch": 18.467265725288833, + "grad_norm": 1.4029326438903809, + "learning_rate": 2.7180145485665382e-05, + "loss": 0.4346, + "step": 14386 + }, + { + "epoch": 18.468549422336327, + "grad_norm": 2.2733333110809326, + "learning_rate": 2.7179717586649554e-05, + "loss": 0.4667, + "step": 14387 + }, + { + "epoch": 18.469833119383825, + "grad_norm": 1.7335543632507324, + "learning_rate": 2.717928968763372e-05, + "loss": 0.4623, + "step": 14388 + }, + { + "epoch": 18.471116816431323, + "grad_norm": 0.9149330258369446, + "learning_rate": 2.7178861788617887e-05, + "loss": 0.4668, + "step": 14389 + }, + { + "epoch": 18.47240051347882, + "grad_norm": 3.2037947177886963, + "learning_rate": 2.7178433889602056e-05, + "loss": 0.432, + "step": 14390 + }, + { + "epoch": 18.473684210526315, + "grad_norm": 0.9575721025466919, + "learning_rate": 2.717800599058622e-05, + "loss": 0.4699, + "step": 14391 + }, + { + "epoch": 18.474967907573813, + "grad_norm": 3.2010064125061035, + "learning_rate": 2.717757809157039e-05, + "loss": 0.4867, + "step": 14392 + }, + { + "epoch": 18.47625160462131, + "grad_norm": 0.8827381134033203, + "learning_rate": 2.7177150192554558e-05, + "loss": 0.4353, + "step": 14393 + }, + { + "epoch": 18.477535301668805, + "grad_norm": 0.9303033351898193, + "learning_rate": 2.7176722293538726e-05, + "loss": 0.4167, + "step": 14394 + }, + { + "epoch": 18.478818998716303, + "grad_norm": 1.2555183172225952, + "learning_rate": 2.7176294394522895e-05, + "loss": 0.44, + "step": 14395 + }, + { + "epoch": 18.4801026957638, + "grad_norm": 
1.497230052947998, + "learning_rate": 2.717586649550706e-05, + "loss": 0.5421, + "step": 14396 + }, + { + "epoch": 18.481386392811295, + "grad_norm": 2.158600330352783, + "learning_rate": 2.7175438596491228e-05, + "loss": 0.4707, + "step": 14397 + }, + { + "epoch": 18.482670089858793, + "grad_norm": 1.6859686374664307, + "learning_rate": 2.7175010697475396e-05, + "loss": 0.4667, + "step": 14398 + }, + { + "epoch": 18.48395378690629, + "grad_norm": 1.7780177593231201, + "learning_rate": 2.7174582798459565e-05, + "loss": 0.4815, + "step": 14399 + }, + { + "epoch": 18.485237483953785, + "grad_norm": 1.2966188192367554, + "learning_rate": 2.717415489944373e-05, + "loss": 0.4362, + "step": 14400 + }, + { + "epoch": 18.486521181001283, + "grad_norm": 2.4713823795318604, + "learning_rate": 2.71737270004279e-05, + "loss": 0.4609, + "step": 14401 + }, + { + "epoch": 18.48780487804878, + "grad_norm": 2.0441439151763916, + "learning_rate": 2.7173299101412067e-05, + "loss": 0.4931, + "step": 14402 + }, + { + "epoch": 18.48908857509628, + "grad_norm": 1.3287744522094727, + "learning_rate": 2.7172871202396235e-05, + "loss": 0.4703, + "step": 14403 + }, + { + "epoch": 18.490372272143773, + "grad_norm": 3.3358445167541504, + "learning_rate": 2.7172443303380403e-05, + "loss": 0.5198, + "step": 14404 + }, + { + "epoch": 18.49165596919127, + "grad_norm": 1.4540660381317139, + "learning_rate": 2.717201540436457e-05, + "loss": 0.4592, + "step": 14405 + }, + { + "epoch": 18.49293966623877, + "grad_norm": 3.1868581771850586, + "learning_rate": 2.717158750534874e-05, + "loss": 0.499, + "step": 14406 + }, + { + "epoch": 18.494223363286263, + "grad_norm": 1.6927801370620728, + "learning_rate": 2.7171159606332905e-05, + "loss": 0.4997, + "step": 14407 + }, + { + "epoch": 18.49550706033376, + "grad_norm": 1.7775335311889648, + "learning_rate": 2.7170731707317074e-05, + "loss": 0.5443, + "step": 14408 + }, + { + "epoch": 18.49679075738126, + "grad_norm": 1.1663912534713745, + "learning_rate": 
2.7170303808301242e-05, + "loss": 0.4677, + "step": 14409 + }, + { + "epoch": 18.498074454428753, + "grad_norm": 1.5396915674209595, + "learning_rate": 2.7169875909285407e-05, + "loss": 0.4487, + "step": 14410 + }, + { + "epoch": 18.49935815147625, + "grad_norm": 1.7587486505508423, + "learning_rate": 2.716944801026958e-05, + "loss": 0.4696, + "step": 14411 + }, + { + "epoch": 18.50064184852375, + "grad_norm": 9.132577896118164, + "learning_rate": 2.7169020111253744e-05, + "loss": 0.4778, + "step": 14412 + }, + { + "epoch": 18.501925545571247, + "grad_norm": 2.527627944946289, + "learning_rate": 2.7168592212237912e-05, + "loss": 0.4453, + "step": 14413 + }, + { + "epoch": 18.50320924261874, + "grad_norm": 1.1935913562774658, + "learning_rate": 2.716816431322208e-05, + "loss": 0.5363, + "step": 14414 + }, + { + "epoch": 18.50449293966624, + "grad_norm": 1.2284256219863892, + "learning_rate": 2.716773641420625e-05, + "loss": 0.5274, + "step": 14415 + }, + { + "epoch": 18.505776636713737, + "grad_norm": 2.3069865703582764, + "learning_rate": 2.7167308515190414e-05, + "loss": 0.5187, + "step": 14416 + }, + { + "epoch": 18.50706033376123, + "grad_norm": 1.4701989889144897, + "learning_rate": 2.7166880616174583e-05, + "loss": 0.5834, + "step": 14417 + }, + { + "epoch": 18.50834403080873, + "grad_norm": 1.6315367221832275, + "learning_rate": 2.716645271715875e-05, + "loss": 0.481, + "step": 14418 + }, + { + "epoch": 18.509627727856227, + "grad_norm": 1.214907169342041, + "learning_rate": 2.716602481814292e-05, + "loss": 0.5093, + "step": 14419 + }, + { + "epoch": 18.51091142490372, + "grad_norm": 1.9021774530410767, + "learning_rate": 2.7165596919127088e-05, + "loss": 0.5283, + "step": 14420 + }, + { + "epoch": 18.51219512195122, + "grad_norm": 1.3396943807601929, + "learning_rate": 2.7165169020111253e-05, + "loss": 0.5731, + "step": 14421 + }, + { + "epoch": 18.513478818998717, + "grad_norm": 2.4767041206359863, + "learning_rate": 2.7164741121095425e-05, + "loss": 
0.7414, + "step": 14422 + }, + { + "epoch": 18.514762516046215, + "grad_norm": 1.565177083015442, + "learning_rate": 2.716431322207959e-05, + "loss": 0.4361, + "step": 14423 + }, + { + "epoch": 18.51604621309371, + "grad_norm": 1.138330340385437, + "learning_rate": 2.7163885323063755e-05, + "loss": 0.4574, + "step": 14424 + }, + { + "epoch": 18.517329910141207, + "grad_norm": 5.100091457366943, + "learning_rate": 2.7163457424047927e-05, + "loss": 0.4516, + "step": 14425 + }, + { + "epoch": 18.518613607188705, + "grad_norm": 1.0485107898712158, + "learning_rate": 2.716302952503209e-05, + "loss": 0.4339, + "step": 14426 + }, + { + "epoch": 18.5198973042362, + "grad_norm": 0.977959394454956, + "learning_rate": 2.7162601626016263e-05, + "loss": 0.4633, + "step": 14427 + }, + { + "epoch": 18.521181001283697, + "grad_norm": 3.8167405128479004, + "learning_rate": 2.716217372700043e-05, + "loss": 0.4528, + "step": 14428 + }, + { + "epoch": 18.522464698331195, + "grad_norm": 0.9213509559631348, + "learning_rate": 2.7161745827984597e-05, + "loss": 0.4617, + "step": 14429 + }, + { + "epoch": 18.52374839537869, + "grad_norm": 2.547679901123047, + "learning_rate": 2.7161317928968765e-05, + "loss": 0.4376, + "step": 14430 + }, + { + "epoch": 18.525032092426187, + "grad_norm": 0.9293085932731628, + "learning_rate": 2.716089002995293e-05, + "loss": 0.4601, + "step": 14431 + }, + { + "epoch": 18.526315789473685, + "grad_norm": 1.810917615890503, + "learning_rate": 2.71604621309371e-05, + "loss": 0.4437, + "step": 14432 + }, + { + "epoch": 18.527599486521183, + "grad_norm": 1.7927021980285645, + "learning_rate": 2.7160034231921267e-05, + "loss": 0.4297, + "step": 14433 + }, + { + "epoch": 18.528883183568677, + "grad_norm": 5.067801475524902, + "learning_rate": 2.7159606332905436e-05, + "loss": 0.4822, + "step": 14434 + }, + { + "epoch": 18.530166880616175, + "grad_norm": 1.4542763233184814, + "learning_rate": 2.7159178433889604e-05, + "loss": 0.4532, + "step": 14435 + }, + { + 
"epoch": 18.531450577663673, + "grad_norm": 1.0393657684326172, + "learning_rate": 2.7158750534873772e-05, + "loss": 0.4559, + "step": 14436 + }, + { + "epoch": 18.532734274711167, + "grad_norm": 2.6050283908843994, + "learning_rate": 2.7158322635857937e-05, + "loss": 0.436, + "step": 14437 + }, + { + "epoch": 18.534017971758665, + "grad_norm": 1.356638789176941, + "learning_rate": 2.7157894736842106e-05, + "loss": 0.4591, + "step": 14438 + }, + { + "epoch": 18.535301668806163, + "grad_norm": 1.8155786991119385, + "learning_rate": 2.7157466837826274e-05, + "loss": 0.4316, + "step": 14439 + }, + { + "epoch": 18.536585365853657, + "grad_norm": 1.1624372005462646, + "learning_rate": 2.715703893881044e-05, + "loss": 0.4623, + "step": 14440 + }, + { + "epoch": 18.537869062901155, + "grad_norm": 1.5110644102096558, + "learning_rate": 2.715661103979461e-05, + "loss": 0.443, + "step": 14441 + }, + { + "epoch": 18.539152759948653, + "grad_norm": 1.1290174722671509, + "learning_rate": 2.7156183140778776e-05, + "loss": 0.4664, + "step": 14442 + }, + { + "epoch": 18.540436456996147, + "grad_norm": 1.0708556175231934, + "learning_rate": 2.7155755241762948e-05, + "loss": 0.4532, + "step": 14443 + }, + { + "epoch": 18.541720154043645, + "grad_norm": 1.3804683685302734, + "learning_rate": 2.7155327342747113e-05, + "loss": 0.4793, + "step": 14444 + }, + { + "epoch": 18.543003851091143, + "grad_norm": 1.53593111038208, + "learning_rate": 2.7154899443731278e-05, + "loss": 0.4527, + "step": 14445 + }, + { + "epoch": 18.54428754813864, + "grad_norm": 1.4262148141860962, + "learning_rate": 2.715447154471545e-05, + "loss": 0.4198, + "step": 14446 + }, + { + "epoch": 18.545571245186135, + "grad_norm": 2.3351006507873535, + "learning_rate": 2.7154043645699615e-05, + "loss": 0.5074, + "step": 14447 + }, + { + "epoch": 18.546854942233633, + "grad_norm": 1.1710669994354248, + "learning_rate": 2.7153615746683783e-05, + "loss": 0.4726, + "step": 14448 + }, + { + "epoch": 18.54813863928113, + 
"grad_norm": 1.9777978658676147, + "learning_rate": 2.715318784766795e-05, + "loss": 0.4823, + "step": 14449 + }, + { + "epoch": 18.549422336328625, + "grad_norm": 2.8999006748199463, + "learning_rate": 2.715275994865212e-05, + "loss": 0.4281, + "step": 14450 + }, + { + "epoch": 18.550706033376123, + "grad_norm": 2.1188955307006836, + "learning_rate": 2.715233204963629e-05, + "loss": 0.4619, + "step": 14451 + }, + { + "epoch": 18.55198973042362, + "grad_norm": 0.9281800985336304, + "learning_rate": 2.7151904150620453e-05, + "loss": 0.4508, + "step": 14452 + }, + { + "epoch": 18.553273427471115, + "grad_norm": 1.246497631072998, + "learning_rate": 2.7151476251604622e-05, + "loss": 0.4691, + "step": 14453 + }, + { + "epoch": 18.554557124518613, + "grad_norm": 1.0639704465866089, + "learning_rate": 2.715104835258879e-05, + "loss": 0.4713, + "step": 14454 + }, + { + "epoch": 18.55584082156611, + "grad_norm": 2.38264799118042, + "learning_rate": 2.715062045357296e-05, + "loss": 0.4857, + "step": 14455 + }, + { + "epoch": 18.55712451861361, + "grad_norm": 1.1879433393478394, + "learning_rate": 2.7150192554557124e-05, + "loss": 0.4692, + "step": 14456 + }, + { + "epoch": 18.558408215661103, + "grad_norm": 2.9019153118133545, + "learning_rate": 2.7149764655541292e-05, + "loss": 0.5117, + "step": 14457 + }, + { + "epoch": 18.5596919127086, + "grad_norm": 1.006655216217041, + "learning_rate": 2.714933675652546e-05, + "loss": 0.4562, + "step": 14458 + }, + { + "epoch": 18.5609756097561, + "grad_norm": 1.668432354927063, + "learning_rate": 2.714890885750963e-05, + "loss": 0.4845, + "step": 14459 + }, + { + "epoch": 18.562259306803593, + "grad_norm": 2.125206708908081, + "learning_rate": 2.7148480958493797e-05, + "loss": 0.4428, + "step": 14460 + }, + { + "epoch": 18.56354300385109, + "grad_norm": 1.735709309577942, + "learning_rate": 2.7148053059477962e-05, + "loss": 0.525, + "step": 14461 + }, + { + "epoch": 18.56482670089859, + "grad_norm": 1.8944854736328125, + 
"learning_rate": 2.7147625160462134e-05, + "loss": 0.4843, + "step": 14462 + }, + { + "epoch": 18.566110397946083, + "grad_norm": 1.335976004600525, + "learning_rate": 2.71471972614463e-05, + "loss": 0.4437, + "step": 14463 + }, + { + "epoch": 18.56739409499358, + "grad_norm": 1.3852131366729736, + "learning_rate": 2.7146769362430464e-05, + "loss": 0.4949, + "step": 14464 + }, + { + "epoch": 18.56867779204108, + "grad_norm": 5.831880569458008, + "learning_rate": 2.7146341463414636e-05, + "loss": 0.4621, + "step": 14465 + }, + { + "epoch": 18.569961489088577, + "grad_norm": 5.967462539672852, + "learning_rate": 2.71459135643988e-05, + "loss": 0.4916, + "step": 14466 + }, + { + "epoch": 18.57124518613607, + "grad_norm": 1.05511474609375, + "learning_rate": 2.7145485665382973e-05, + "loss": 0.4582, + "step": 14467 + }, + { + "epoch": 18.57252888318357, + "grad_norm": 1.9246779680252075, + "learning_rate": 2.7145057766367138e-05, + "loss": 0.499, + "step": 14468 + }, + { + "epoch": 18.573812580231067, + "grad_norm": 2.7371506690979004, + "learning_rate": 2.7144629867351306e-05, + "loss": 0.4948, + "step": 14469 + }, + { + "epoch": 18.57509627727856, + "grad_norm": 3.7162466049194336, + "learning_rate": 2.7144201968335475e-05, + "loss": 0.5416, + "step": 14470 + }, + { + "epoch": 18.57637997432606, + "grad_norm": 1.9037357568740845, + "learning_rate": 2.714377406931964e-05, + "loss": 0.5821, + "step": 14471 + }, + { + "epoch": 18.577663671373557, + "grad_norm": 3.6327321529388428, + "learning_rate": 2.7143346170303808e-05, + "loss": 0.6228, + "step": 14472 + }, + { + "epoch": 18.57894736842105, + "grad_norm": 1.5301028490066528, + "learning_rate": 2.7142918271287976e-05, + "loss": 0.4234, + "step": 14473 + }, + { + "epoch": 18.58023106546855, + "grad_norm": 1.299494981765747, + "learning_rate": 2.7142490372272145e-05, + "loss": 0.419, + "step": 14474 + }, + { + "epoch": 18.581514762516047, + "grad_norm": 1.2950375080108643, + "learning_rate": 2.7142062473256313e-05, + 
"loss": 0.446, + "step": 14475 + }, + { + "epoch": 18.58279845956354, + "grad_norm": 2.8507373332977295, + "learning_rate": 2.7141634574240482e-05, + "loss": 0.4751, + "step": 14476 + }, + { + "epoch": 18.58408215661104, + "grad_norm": 4.878745079040527, + "learning_rate": 2.7141206675224647e-05, + "loss": 0.4804, + "step": 14477 + }, + { + "epoch": 18.585365853658537, + "grad_norm": 1.4255385398864746, + "learning_rate": 2.7140778776208815e-05, + "loss": 0.4503, + "step": 14478 + }, + { + "epoch": 18.586649550706035, + "grad_norm": 2.2723357677459717, + "learning_rate": 2.7140350877192984e-05, + "loss": 0.4764, + "step": 14479 + }, + { + "epoch": 18.58793324775353, + "grad_norm": 2.772434949874878, + "learning_rate": 2.713992297817715e-05, + "loss": 0.4752, + "step": 14480 + }, + { + "epoch": 18.589216944801027, + "grad_norm": 4.281891345977783, + "learning_rate": 2.713949507916132e-05, + "loss": 0.4319, + "step": 14481 + }, + { + "epoch": 18.590500641848525, + "grad_norm": 2.785891532897949, + "learning_rate": 2.7139067180145485e-05, + "loss": 0.4607, + "step": 14482 + }, + { + "epoch": 18.59178433889602, + "grad_norm": 2.032440662384033, + "learning_rate": 2.7138639281129654e-05, + "loss": 0.4583, + "step": 14483 + }, + { + "epoch": 18.593068035943517, + "grad_norm": 2.4426774978637695, + "learning_rate": 2.7138211382113822e-05, + "loss": 0.4216, + "step": 14484 + }, + { + "epoch": 18.594351732991015, + "grad_norm": 1.393280267715454, + "learning_rate": 2.7137783483097987e-05, + "loss": 0.4351, + "step": 14485 + }, + { + "epoch": 18.59563543003851, + "grad_norm": 5.330832481384277, + "learning_rate": 2.713735558408216e-05, + "loss": 0.4509, + "step": 14486 + }, + { + "epoch": 18.596919127086007, + "grad_norm": 2.0707991123199463, + "learning_rate": 2.7136927685066324e-05, + "loss": 0.4626, + "step": 14487 + }, + { + "epoch": 18.598202824133505, + "grad_norm": 1.6779567003250122, + "learning_rate": 2.7136499786050492e-05, + "loss": 0.5023, + "step": 14488 + }, + 
{ + "epoch": 18.599486521181003, + "grad_norm": 1.3051308393478394, + "learning_rate": 2.713607188703466e-05, + "loss": 0.4995, + "step": 14489 + }, + { + "epoch": 18.600770218228497, + "grad_norm": 1.152329921722412, + "learning_rate": 2.713564398801883e-05, + "loss": 0.4291, + "step": 14490 + }, + { + "epoch": 18.602053915275995, + "grad_norm": 2.422219753265381, + "learning_rate": 2.7135216089002994e-05, + "loss": 0.4313, + "step": 14491 + }, + { + "epoch": 18.603337612323493, + "grad_norm": 1.8930721282958984, + "learning_rate": 2.7134788189987163e-05, + "loss": 0.4628, + "step": 14492 + }, + { + "epoch": 18.604621309370987, + "grad_norm": 2.0184760093688965, + "learning_rate": 2.713436029097133e-05, + "loss": 0.4706, + "step": 14493 + }, + { + "epoch": 18.605905006418485, + "grad_norm": 1.9490190744400024, + "learning_rate": 2.71339323919555e-05, + "loss": 0.4751, + "step": 14494 + }, + { + "epoch": 18.607188703465983, + "grad_norm": 2.3088061809539795, + "learning_rate": 2.7133504492939668e-05, + "loss": 0.4624, + "step": 14495 + }, + { + "epoch": 18.608472400513477, + "grad_norm": 2.5013418197631836, + "learning_rate": 2.7133076593923833e-05, + "loss": 0.4484, + "step": 14496 + }, + { + "epoch": 18.609756097560975, + "grad_norm": 1.8489794731140137, + "learning_rate": 2.7132648694908005e-05, + "loss": 0.4554, + "step": 14497 + }, + { + "epoch": 18.611039794608473, + "grad_norm": 1.4611042737960815, + "learning_rate": 2.713222079589217e-05, + "loss": 0.4744, + "step": 14498 + }, + { + "epoch": 18.61232349165597, + "grad_norm": 1.2457784414291382, + "learning_rate": 2.7131792896876335e-05, + "loss": 0.4835, + "step": 14499 + }, + { + "epoch": 18.613607188703465, + "grad_norm": 2.2803120613098145, + "learning_rate": 2.7131364997860507e-05, + "loss": 0.4794, + "step": 14500 + }, + { + "epoch": 18.614890885750963, + "grad_norm": 1.6981980800628662, + "learning_rate": 2.713093709884467e-05, + "loss": 0.4724, + "step": 14501 + }, + { + "epoch": 18.61617458279846, + 
"grad_norm": 1.6544585227966309, + "learning_rate": 2.7130509199828843e-05, + "loss": 0.4344, + "step": 14502 + }, + { + "epoch": 18.617458279845955, + "grad_norm": 1.0787664651870728, + "learning_rate": 2.713008130081301e-05, + "loss": 0.4873, + "step": 14503 + }, + { + "epoch": 18.618741976893453, + "grad_norm": 1.1419200897216797, + "learning_rate": 2.7129653401797177e-05, + "loss": 0.4493, + "step": 14504 + }, + { + "epoch": 18.62002567394095, + "grad_norm": 2.120262384414673, + "learning_rate": 2.7129225502781345e-05, + "loss": 0.4694, + "step": 14505 + }, + { + "epoch": 18.621309370988445, + "grad_norm": 3.5537381172180176, + "learning_rate": 2.712879760376551e-05, + "loss": 0.4243, + "step": 14506 + }, + { + "epoch": 18.622593068035943, + "grad_norm": 1.5051007270812988, + "learning_rate": 2.712836970474968e-05, + "loss": 0.4907, + "step": 14507 + }, + { + "epoch": 18.62387676508344, + "grad_norm": 2.6926815509796143, + "learning_rate": 2.7127941805733847e-05, + "loss": 0.5069, + "step": 14508 + }, + { + "epoch": 18.625160462130935, + "grad_norm": 1.3871958255767822, + "learning_rate": 2.7127513906718016e-05, + "loss": 0.4596, + "step": 14509 + }, + { + "epoch": 18.626444159178433, + "grad_norm": 2.2593822479248047, + "learning_rate": 2.7127086007702184e-05, + "loss": 0.484, + "step": 14510 + }, + { + "epoch": 18.62772785622593, + "grad_norm": 1.4460238218307495, + "learning_rate": 2.7126658108686352e-05, + "loss": 0.511, + "step": 14511 + }, + { + "epoch": 18.62901155327343, + "grad_norm": 2.2431414127349854, + "learning_rate": 2.7126230209670517e-05, + "loss": 0.458, + "step": 14512 + }, + { + "epoch": 18.630295250320923, + "grad_norm": 1.9891281127929688, + "learning_rate": 2.7125802310654686e-05, + "loss": 0.5031, + "step": 14513 + }, + { + "epoch": 18.63157894736842, + "grad_norm": 1.9955226182937622, + "learning_rate": 2.7125374411638854e-05, + "loss": 0.4933, + "step": 14514 + }, + { + "epoch": 18.63286264441592, + "grad_norm": 1.801072120666504, + 
"learning_rate": 2.712494651262302e-05, + "loss": 0.4863, + "step": 14515 + }, + { + "epoch": 18.634146341463413, + "grad_norm": 4.885872840881348, + "learning_rate": 2.712451861360719e-05, + "loss": 0.4773, + "step": 14516 + }, + { + "epoch": 18.63543003851091, + "grad_norm": 1.7011111974716187, + "learning_rate": 2.7124090714591356e-05, + "loss": 0.5693, + "step": 14517 + }, + { + "epoch": 18.63671373555841, + "grad_norm": 2.5184166431427, + "learning_rate": 2.7123662815575525e-05, + "loss": 0.573, + "step": 14518 + }, + { + "epoch": 18.637997432605903, + "grad_norm": 2.534400463104248, + "learning_rate": 2.7123234916559693e-05, + "loss": 0.4717, + "step": 14519 + }, + { + "epoch": 18.6392811296534, + "grad_norm": 1.6783665418624878, + "learning_rate": 2.7122807017543858e-05, + "loss": 0.6008, + "step": 14520 + }, + { + "epoch": 18.6405648267009, + "grad_norm": 3.430649757385254, + "learning_rate": 2.712237911852803e-05, + "loss": 0.5344, + "step": 14521 + }, + { + "epoch": 18.641848523748397, + "grad_norm": 3.364490270614624, + "learning_rate": 2.7121951219512195e-05, + "loss": 0.6558, + "step": 14522 + }, + { + "epoch": 18.64313222079589, + "grad_norm": 1.1624057292938232, + "learning_rate": 2.7121523320496363e-05, + "loss": 0.4452, + "step": 14523 + }, + { + "epoch": 18.64441591784339, + "grad_norm": 1.4111601114273071, + "learning_rate": 2.712109542148053e-05, + "loss": 0.4479, + "step": 14524 + }, + { + "epoch": 18.645699614890887, + "grad_norm": 2.841395616531372, + "learning_rate": 2.7120667522464697e-05, + "loss": 0.4668, + "step": 14525 + }, + { + "epoch": 18.64698331193838, + "grad_norm": 1.5214028358459473, + "learning_rate": 2.712023962344887e-05, + "loss": 0.4548, + "step": 14526 + }, + { + "epoch": 18.64826700898588, + "grad_norm": 3.1693904399871826, + "learning_rate": 2.7119811724433033e-05, + "loss": 0.4713, + "step": 14527 + }, + { + "epoch": 18.649550706033377, + "grad_norm": 2.0659947395324707, + "learning_rate": 2.7119383825417202e-05, + 
"loss": 0.4512, + "step": 14528 + }, + { + "epoch": 18.65083440308087, + "grad_norm": 7.308879375457764, + "learning_rate": 2.711895592640137e-05, + "loss": 0.4466, + "step": 14529 + }, + { + "epoch": 18.65211810012837, + "grad_norm": 1.1483837366104126, + "learning_rate": 2.711852802738554e-05, + "loss": 0.4766, + "step": 14530 + }, + { + "epoch": 18.653401797175867, + "grad_norm": 1.0849354267120361, + "learning_rate": 2.7118100128369704e-05, + "loss": 0.4584, + "step": 14531 + }, + { + "epoch": 18.654685494223365, + "grad_norm": 1.1337659358978271, + "learning_rate": 2.7117672229353872e-05, + "loss": 0.4499, + "step": 14532 + }, + { + "epoch": 18.65596919127086, + "grad_norm": 0.9650226831436157, + "learning_rate": 2.711724433033804e-05, + "loss": 0.4896, + "step": 14533 + }, + { + "epoch": 18.657252888318357, + "grad_norm": 2.4749324321746826, + "learning_rate": 2.711681643132221e-05, + "loss": 0.4453, + "step": 14534 + }, + { + "epoch": 18.658536585365855, + "grad_norm": 1.3110653162002563, + "learning_rate": 2.7116388532306377e-05, + "loss": 0.4619, + "step": 14535 + }, + { + "epoch": 18.65982028241335, + "grad_norm": 1.8417733907699585, + "learning_rate": 2.7115960633290542e-05, + "loss": 0.4921, + "step": 14536 + }, + { + "epoch": 18.661103979460847, + "grad_norm": 3.977609395980835, + "learning_rate": 2.7115532734274714e-05, + "loss": 0.4587, + "step": 14537 + }, + { + "epoch": 18.662387676508345, + "grad_norm": 2.0754668712615967, + "learning_rate": 2.711510483525888e-05, + "loss": 0.4295, + "step": 14538 + }, + { + "epoch": 18.66367137355584, + "grad_norm": 3.5651767253875732, + "learning_rate": 2.7114676936243044e-05, + "loss": 0.435, + "step": 14539 + }, + { + "epoch": 18.664955070603337, + "grad_norm": 2.0579986572265625, + "learning_rate": 2.7114249037227216e-05, + "loss": 0.4515, + "step": 14540 + }, + { + "epoch": 18.666238767650835, + "grad_norm": 2.0120129585266113, + "learning_rate": 2.711382113821138e-05, + "loss": 0.4605, + "step": 14541 + }, 
+ { + "epoch": 18.66752246469833, + "grad_norm": 2.103111982345581, + "learning_rate": 2.7113393239195553e-05, + "loss": 0.4822, + "step": 14542 + }, + { + "epoch": 18.668806161745827, + "grad_norm": 1.7828247547149658, + "learning_rate": 2.7112965340179718e-05, + "loss": 0.4788, + "step": 14543 + }, + { + "epoch": 18.670089858793325, + "grad_norm": 2.0144379138946533, + "learning_rate": 2.7112537441163886e-05, + "loss": 0.4666, + "step": 14544 + }, + { + "epoch": 18.671373555840823, + "grad_norm": 3.678107261657715, + "learning_rate": 2.7112109542148055e-05, + "loss": 0.4558, + "step": 14545 + }, + { + "epoch": 18.672657252888317, + "grad_norm": 2.0959954261779785, + "learning_rate": 2.711168164313222e-05, + "loss": 0.4707, + "step": 14546 + }, + { + "epoch": 18.673940949935815, + "grad_norm": 2.115002155303955, + "learning_rate": 2.7111253744116388e-05, + "loss": 0.4415, + "step": 14547 + }, + { + "epoch": 18.675224646983313, + "grad_norm": 2.837822675704956, + "learning_rate": 2.7110825845100557e-05, + "loss": 0.5051, + "step": 14548 + }, + { + "epoch": 18.676508344030808, + "grad_norm": 2.8302416801452637, + "learning_rate": 2.7110397946084725e-05, + "loss": 0.4451, + "step": 14549 + }, + { + "epoch": 18.677792041078305, + "grad_norm": 2.8841447830200195, + "learning_rate": 2.7109970047068893e-05, + "loss": 0.4379, + "step": 14550 + }, + { + "epoch": 18.679075738125803, + "grad_norm": 6.775182247161865, + "learning_rate": 2.7109542148053062e-05, + "loss": 0.4857, + "step": 14551 + }, + { + "epoch": 18.680359435173298, + "grad_norm": 2.176537036895752, + "learning_rate": 2.7109114249037227e-05, + "loss": 0.4418, + "step": 14552 + }, + { + "epoch": 18.681643132220795, + "grad_norm": 5.828571796417236, + "learning_rate": 2.7108686350021395e-05, + "loss": 0.5141, + "step": 14553 + }, + { + "epoch": 18.682926829268293, + "grad_norm": 2.844280242919922, + "learning_rate": 2.7108258451005564e-05, + "loss": 0.4777, + "step": 14554 + }, + { + "epoch": 18.68421052631579, 
+ "grad_norm": 3.248612880706787, + "learning_rate": 2.710783055198973e-05, + "loss": 0.4572, + "step": 14555 + }, + { + "epoch": 18.685494223363285, + "grad_norm": 2.3275110721588135, + "learning_rate": 2.71074026529739e-05, + "loss": 0.4736, + "step": 14556 + }, + { + "epoch": 18.686777920410783, + "grad_norm": 3.046457529067993, + "learning_rate": 2.7106974753958065e-05, + "loss": 0.5358, + "step": 14557 + }, + { + "epoch": 18.68806161745828, + "grad_norm": 3.180088758468628, + "learning_rate": 2.7106546854942237e-05, + "loss": 0.471, + "step": 14558 + }, + { + "epoch": 18.689345314505776, + "grad_norm": 4.923488616943359, + "learning_rate": 2.7106118955926402e-05, + "loss": 0.5154, + "step": 14559 + }, + { + "epoch": 18.690629011553273, + "grad_norm": 3.5477712154388428, + "learning_rate": 2.7105691056910567e-05, + "loss": 0.5465, + "step": 14560 + }, + { + "epoch": 18.69191270860077, + "grad_norm": 4.75790548324585, + "learning_rate": 2.710526315789474e-05, + "loss": 0.5211, + "step": 14561 + }, + { + "epoch": 18.693196405648266, + "grad_norm": 2.30598521232605, + "learning_rate": 2.7104835258878904e-05, + "loss": 0.4907, + "step": 14562 + }, + { + "epoch": 18.694480102695763, + "grad_norm": 3.5087623596191406, + "learning_rate": 2.7104407359863073e-05, + "loss": 0.5068, + "step": 14563 + }, + { + "epoch": 18.69576379974326, + "grad_norm": 2.2414181232452393, + "learning_rate": 2.710397946084724e-05, + "loss": 0.5103, + "step": 14564 + }, + { + "epoch": 18.69704749679076, + "grad_norm": 10.711689949035645, + "learning_rate": 2.710355156183141e-05, + "loss": 0.5241, + "step": 14565 + }, + { + "epoch": 18.698331193838253, + "grad_norm": 3.691817045211792, + "learning_rate": 2.7103123662815578e-05, + "loss": 0.4918, + "step": 14566 + }, + { + "epoch": 18.69961489088575, + "grad_norm": 2.174182415008545, + "learning_rate": 2.7102695763799743e-05, + "loss": 0.5242, + "step": 14567 + }, + { + "epoch": 18.70089858793325, + "grad_norm": 2.462907552719116, + 
"learning_rate": 2.710226786478391e-05, + "loss": 0.49, + "step": 14568 + }, + { + "epoch": 18.702182284980744, + "grad_norm": 4.031064987182617, + "learning_rate": 2.710183996576808e-05, + "loss": 0.5571, + "step": 14569 + }, + { + "epoch": 18.70346598202824, + "grad_norm": 3.472670078277588, + "learning_rate": 2.7101412066752248e-05, + "loss": 0.5409, + "step": 14570 + }, + { + "epoch": 18.70474967907574, + "grad_norm": 2.0249557495117188, + "learning_rate": 2.7100984167736413e-05, + "loss": 0.6575, + "step": 14571 + }, + { + "epoch": 18.706033376123234, + "grad_norm": 3.7141358852386475, + "learning_rate": 2.7100556268720585e-05, + "loss": 0.7235, + "step": 14572 + }, + { + "epoch": 18.70731707317073, + "grad_norm": 1.4258038997650146, + "learning_rate": 2.710012836970475e-05, + "loss": 0.4225, + "step": 14573 + }, + { + "epoch": 18.70860077021823, + "grad_norm": 1.191613793373108, + "learning_rate": 2.7099700470688918e-05, + "loss": 0.4308, + "step": 14574 + }, + { + "epoch": 18.709884467265724, + "grad_norm": 2.297144889831543, + "learning_rate": 2.7099272571673087e-05, + "loss": 0.4434, + "step": 14575 + }, + { + "epoch": 18.71116816431322, + "grad_norm": 3.5212833881378174, + "learning_rate": 2.7098844672657252e-05, + "loss": 0.4299, + "step": 14576 + }, + { + "epoch": 18.71245186136072, + "grad_norm": 1.1717617511749268, + "learning_rate": 2.7098416773641424e-05, + "loss": 0.4626, + "step": 14577 + }, + { + "epoch": 18.713735558408217, + "grad_norm": 1.4628361463546753, + "learning_rate": 2.709798887462559e-05, + "loss": 0.4386, + "step": 14578 + }, + { + "epoch": 18.71501925545571, + "grad_norm": 2.464114189147949, + "learning_rate": 2.7097560975609757e-05, + "loss": 0.4688, + "step": 14579 + }, + { + "epoch": 18.71630295250321, + "grad_norm": 2.1849617958068848, + "learning_rate": 2.7097133076593925e-05, + "loss": 0.4194, + "step": 14580 + }, + { + "epoch": 18.717586649550707, + "grad_norm": 1.226850152015686, + "learning_rate": 2.709670517757809e-05, + 
"loss": 0.4485, + "step": 14581 + }, + { + "epoch": 18.7188703465982, + "grad_norm": 3.6253228187561035, + "learning_rate": 2.7096277278562262e-05, + "loss": 0.4709, + "step": 14582 + }, + { + "epoch": 18.7201540436457, + "grad_norm": 1.8245034217834473, + "learning_rate": 2.7095849379546427e-05, + "loss": 0.4649, + "step": 14583 + }, + { + "epoch": 18.721437740693197, + "grad_norm": 2.239809989929199, + "learning_rate": 2.7095421480530596e-05, + "loss": 0.4532, + "step": 14584 + }, + { + "epoch": 18.72272143774069, + "grad_norm": 1.1220468282699585, + "learning_rate": 2.7094993581514764e-05, + "loss": 0.47, + "step": 14585 + }, + { + "epoch": 18.72400513478819, + "grad_norm": 1.776875615119934, + "learning_rate": 2.709456568249893e-05, + "loss": 0.482, + "step": 14586 + }, + { + "epoch": 18.725288831835687, + "grad_norm": 2.0801172256469727, + "learning_rate": 2.7094137783483097e-05, + "loss": 0.4131, + "step": 14587 + }, + { + "epoch": 18.726572528883185, + "grad_norm": 1.8854883909225464, + "learning_rate": 2.7093709884467266e-05, + "loss": 0.4778, + "step": 14588 + }, + { + "epoch": 18.72785622593068, + "grad_norm": 1.5155106782913208, + "learning_rate": 2.7093281985451434e-05, + "loss": 0.4436, + "step": 14589 + }, + { + "epoch": 18.729139922978177, + "grad_norm": 1.063246250152588, + "learning_rate": 2.7092854086435603e-05, + "loss": 0.461, + "step": 14590 + }, + { + "epoch": 18.730423620025675, + "grad_norm": 23.130104064941406, + "learning_rate": 2.709242618741977e-05, + "loss": 0.4893, + "step": 14591 + }, + { + "epoch": 18.73170731707317, + "grad_norm": 2.01023530960083, + "learning_rate": 2.7091998288403936e-05, + "loss": 0.4485, + "step": 14592 + }, + { + "epoch": 18.732991014120667, + "grad_norm": 2.613097667694092, + "learning_rate": 2.7091570389388105e-05, + "loss": 0.442, + "step": 14593 + }, + { + "epoch": 18.734274711168165, + "grad_norm": 3.250093936920166, + "learning_rate": 2.7091142490372273e-05, + "loss": 0.4861, + "step": 14594 + }, + { + 
"epoch": 18.73555840821566, + "grad_norm": 1.4111005067825317, + "learning_rate": 2.7090714591356438e-05, + "loss": 0.4055, + "step": 14595 + }, + { + "epoch": 18.736842105263158, + "grad_norm": 1.6126456260681152, + "learning_rate": 2.709028669234061e-05, + "loss": 0.4364, + "step": 14596 + }, + { + "epoch": 18.738125802310655, + "grad_norm": 1.3987411260604858, + "learning_rate": 2.7089858793324775e-05, + "loss": 0.465, + "step": 14597 + }, + { + "epoch": 18.739409499358153, + "grad_norm": 1.2759578227996826, + "learning_rate": 2.7089430894308947e-05, + "loss": 0.4995, + "step": 14598 + }, + { + "epoch": 18.740693196405648, + "grad_norm": 1.7316105365753174, + "learning_rate": 2.708900299529311e-05, + "loss": 0.4409, + "step": 14599 + }, + { + "epoch": 18.741976893453145, + "grad_norm": 3.882479429244995, + "learning_rate": 2.7088575096277277e-05, + "loss": 0.442, + "step": 14600 + }, + { + "epoch": 18.743260590500643, + "grad_norm": 2.098991870880127, + "learning_rate": 2.708814719726145e-05, + "loss": 0.4816, + "step": 14601 + }, + { + "epoch": 18.744544287548138, + "grad_norm": 2.4740898609161377, + "learning_rate": 2.7087719298245613e-05, + "loss": 0.5159, + "step": 14602 + }, + { + "epoch": 18.745827984595635, + "grad_norm": 1.2862554788589478, + "learning_rate": 2.7087291399229782e-05, + "loss": 0.4694, + "step": 14603 + }, + { + "epoch": 18.747111681643133, + "grad_norm": 2.905014991760254, + "learning_rate": 2.708686350021395e-05, + "loss": 0.4902, + "step": 14604 + }, + { + "epoch": 18.748395378690628, + "grad_norm": 2.2969677448272705, + "learning_rate": 2.708643560119812e-05, + "loss": 0.5196, + "step": 14605 + }, + { + "epoch": 18.749679075738126, + "grad_norm": 0.9384205341339111, + "learning_rate": 2.7086007702182287e-05, + "loss": 0.4511, + "step": 14606 + }, + { + "epoch": 18.750962772785623, + "grad_norm": 1.6300801038742065, + "learning_rate": 2.7085579803166452e-05, + "loss": 0.457, + "step": 14607 + }, + { + "epoch": 18.752246469833118, + 
"grad_norm": 1.4793238639831543, + "learning_rate": 2.708515190415062e-05, + "loss": 0.467, + "step": 14608 + }, + { + "epoch": 18.753530166880616, + "grad_norm": 7.145192623138428, + "learning_rate": 2.708472400513479e-05, + "loss": 0.4912, + "step": 14609 + }, + { + "epoch": 18.754813863928113, + "grad_norm": 2.2691657543182373, + "learning_rate": 2.7084296106118957e-05, + "loss": 0.4803, + "step": 14610 + }, + { + "epoch": 18.75609756097561, + "grad_norm": 6.635376453399658, + "learning_rate": 2.7083868207103122e-05, + "loss": 0.4918, + "step": 14611 + }, + { + "epoch": 18.757381258023106, + "grad_norm": 2.750492572784424, + "learning_rate": 2.7083440308087294e-05, + "loss": 0.4689, + "step": 14612 + }, + { + "epoch": 18.758664955070603, + "grad_norm": 3.326645851135254, + "learning_rate": 2.708301240907146e-05, + "loss": 0.5203, + "step": 14613 + }, + { + "epoch": 18.7599486521181, + "grad_norm": 1.16032874584198, + "learning_rate": 2.7082584510055628e-05, + "loss": 0.5523, + "step": 14614 + }, + { + "epoch": 18.761232349165596, + "grad_norm": 3.8443613052368164, + "learning_rate": 2.7082156611039796e-05, + "loss": 0.5076, + "step": 14615 + }, + { + "epoch": 18.762516046213094, + "grad_norm": 1.076058030128479, + "learning_rate": 2.708172871202396e-05, + "loss": 0.4923, + "step": 14616 + }, + { + "epoch": 18.76379974326059, + "grad_norm": 1.0463390350341797, + "learning_rate": 2.7081300813008133e-05, + "loss": 0.488, + "step": 14617 + }, + { + "epoch": 18.765083440308086, + "grad_norm": 1.6109837293624878, + "learning_rate": 2.7080872913992298e-05, + "loss": 0.5717, + "step": 14618 + }, + { + "epoch": 18.766367137355584, + "grad_norm": 3.625053882598877, + "learning_rate": 2.7080445014976466e-05, + "loss": 0.5078, + "step": 14619 + }, + { + "epoch": 18.76765083440308, + "grad_norm": 1.7290029525756836, + "learning_rate": 2.7080017115960635e-05, + "loss": 0.5686, + "step": 14620 + }, + { + "epoch": 18.76893453145058, + "grad_norm": 3.062519073486328, + 
"learning_rate": 2.70795892169448e-05, + "loss": 0.6723, + "step": 14621 + }, + { + "epoch": 18.770218228498074, + "grad_norm": 3.0774245262145996, + "learning_rate": 2.707916131792897e-05, + "loss": 0.6184, + "step": 14622 + }, + { + "epoch": 18.77150192554557, + "grad_norm": 2.147108793258667, + "learning_rate": 2.7078733418913137e-05, + "loss": 0.4365, + "step": 14623 + }, + { + "epoch": 18.77278562259307, + "grad_norm": 2.3222198486328125, + "learning_rate": 2.7078305519897305e-05, + "loss": 0.4409, + "step": 14624 + }, + { + "epoch": 18.774069319640564, + "grad_norm": 1.2184784412384033, + "learning_rate": 2.7077877620881473e-05, + "loss": 0.4714, + "step": 14625 + }, + { + "epoch": 18.77535301668806, + "grad_norm": 2.069150447845459, + "learning_rate": 2.7077449721865642e-05, + "loss": 0.4553, + "step": 14626 + }, + { + "epoch": 18.77663671373556, + "grad_norm": 0.8850622177124023, + "learning_rate": 2.7077021822849807e-05, + "loss": 0.4334, + "step": 14627 + }, + { + "epoch": 18.777920410783054, + "grad_norm": 0.9129486680030823, + "learning_rate": 2.7076593923833975e-05, + "loss": 0.4338, + "step": 14628 + }, + { + "epoch": 18.77920410783055, + "grad_norm": 2.1362035274505615, + "learning_rate": 2.7076166024818144e-05, + "loss": 0.4561, + "step": 14629 + }, + { + "epoch": 18.78048780487805, + "grad_norm": 1.1057881116867065, + "learning_rate": 2.7075738125802312e-05, + "loss": 0.435, + "step": 14630 + }, + { + "epoch": 18.781771501925547, + "grad_norm": 0.7859196662902832, + "learning_rate": 2.707531022678648e-05, + "loss": 0.4769, + "step": 14631 + }, + { + "epoch": 18.78305519897304, + "grad_norm": 1.7977370023727417, + "learning_rate": 2.7074882327770646e-05, + "loss": 0.4749, + "step": 14632 + }, + { + "epoch": 18.78433889602054, + "grad_norm": 1.046726942062378, + "learning_rate": 2.7074454428754817e-05, + "loss": 0.4896, + "step": 14633 + }, + { + "epoch": 18.785622593068037, + "grad_norm": 1.3481783866882324, + "learning_rate": 
2.7074026529738982e-05, + "loss": 0.4424, + "step": 14634 + }, + { + "epoch": 18.78690629011553, + "grad_norm": 0.9152480363845825, + "learning_rate": 2.7073598630723147e-05, + "loss": 0.4395, + "step": 14635 + }, + { + "epoch": 18.78818998716303, + "grad_norm": 1.223196029663086, + "learning_rate": 2.707317073170732e-05, + "loss": 0.4952, + "step": 14636 + }, + { + "epoch": 18.789473684210527, + "grad_norm": 1.9667218923568726, + "learning_rate": 2.7072742832691484e-05, + "loss": 0.4897, + "step": 14637 + }, + { + "epoch": 18.79075738125802, + "grad_norm": 2.159512996673584, + "learning_rate": 2.7072314933675656e-05, + "loss": 0.4725, + "step": 14638 + }, + { + "epoch": 18.79204107830552, + "grad_norm": 1.1789546012878418, + "learning_rate": 2.707188703465982e-05, + "loss": 0.45, + "step": 14639 + }, + { + "epoch": 18.793324775353017, + "grad_norm": 1.6015384197235107, + "learning_rate": 2.707145913564399e-05, + "loss": 0.4689, + "step": 14640 + }, + { + "epoch": 18.794608472400512, + "grad_norm": 9.13711166381836, + "learning_rate": 2.7071031236628158e-05, + "loss": 0.4408, + "step": 14641 + }, + { + "epoch": 18.79589216944801, + "grad_norm": 2.832760810852051, + "learning_rate": 2.7070603337612323e-05, + "loss": 0.4415, + "step": 14642 + }, + { + "epoch": 18.797175866495508, + "grad_norm": 3.4232702255249023, + "learning_rate": 2.707017543859649e-05, + "loss": 0.4339, + "step": 14643 + }, + { + "epoch": 18.798459563543005, + "grad_norm": 0.9834019541740417, + "learning_rate": 2.706974753958066e-05, + "loss": 0.4726, + "step": 14644 + }, + { + "epoch": 18.7997432605905, + "grad_norm": 1.6389172077178955, + "learning_rate": 2.7069319640564828e-05, + "loss": 0.4696, + "step": 14645 + }, + { + "epoch": 18.801026957637998, + "grad_norm": 1.2229446172714233, + "learning_rate": 2.7068891741548997e-05, + "loss": 0.4501, + "step": 14646 + }, + { + "epoch": 18.802310654685495, + "grad_norm": 1.7621861696243286, + "learning_rate": 2.706846384253316e-05, + "loss": 0.4537, + 
"step": 14647 + }, + { + "epoch": 18.80359435173299, + "grad_norm": 1.515090823173523, + "learning_rate": 2.706803594351733e-05, + "loss": 0.4469, + "step": 14648 + }, + { + "epoch": 18.804878048780488, + "grad_norm": 1.823392391204834, + "learning_rate": 2.70676080445015e-05, + "loss": 0.4959, + "step": 14649 + }, + { + "epoch": 18.806161745827985, + "grad_norm": 1.0963257551193237, + "learning_rate": 2.7067180145485667e-05, + "loss": 0.501, + "step": 14650 + }, + { + "epoch": 18.80744544287548, + "grad_norm": 1.3806349039077759, + "learning_rate": 2.7066752246469832e-05, + "loss": 0.5244, + "step": 14651 + }, + { + "epoch": 18.808729139922978, + "grad_norm": 1.1675093173980713, + "learning_rate": 2.7066324347454004e-05, + "loss": 0.4719, + "step": 14652 + }, + { + "epoch": 18.810012836970476, + "grad_norm": 5.4338202476501465, + "learning_rate": 2.706589644843817e-05, + "loss": 0.4982, + "step": 14653 + }, + { + "epoch": 18.811296534017973, + "grad_norm": 0.9905257821083069, + "learning_rate": 2.7065468549422337e-05, + "loss": 0.4468, + "step": 14654 + }, + { + "epoch": 18.812580231065468, + "grad_norm": 1.952842116355896, + "learning_rate": 2.7065040650406505e-05, + "loss": 0.4818, + "step": 14655 + }, + { + "epoch": 18.813863928112966, + "grad_norm": 1.7157881259918213, + "learning_rate": 2.706461275139067e-05, + "loss": 0.5286, + "step": 14656 + }, + { + "epoch": 18.815147625160463, + "grad_norm": 1.0501619577407837, + "learning_rate": 2.7064184852374842e-05, + "loss": 0.4995, + "step": 14657 + }, + { + "epoch": 18.816431322207958, + "grad_norm": 0.9282203912734985, + "learning_rate": 2.7063756953359007e-05, + "loss": 0.459, + "step": 14658 + }, + { + "epoch": 18.817715019255456, + "grad_norm": 2.4894752502441406, + "learning_rate": 2.7063329054343176e-05, + "loss": 0.497, + "step": 14659 + }, + { + "epoch": 18.818998716302954, + "grad_norm": 9.35683536529541, + "learning_rate": 2.7062901155327344e-05, + "loss": 0.5291, + "step": 14660 + }, + { + "epoch": 
18.820282413350448, + "grad_norm": 1.0832774639129639, + "learning_rate": 2.706247325631151e-05, + "loss": 0.5093, + "step": 14661 + }, + { + "epoch": 18.821566110397946, + "grad_norm": 1.8827943801879883, + "learning_rate": 2.706204535729568e-05, + "loss": 0.503, + "step": 14662 + }, + { + "epoch": 18.822849807445444, + "grad_norm": 4.344974517822266, + "learning_rate": 2.7061617458279846e-05, + "loss": 0.4808, + "step": 14663 + }, + { + "epoch": 18.82413350449294, + "grad_norm": 2.1558258533477783, + "learning_rate": 2.7061189559264014e-05, + "loss": 0.5694, + "step": 14664 + }, + { + "epoch": 18.825417201540436, + "grad_norm": 2.1280953884124756, + "learning_rate": 2.7060761660248183e-05, + "loss": 0.5247, + "step": 14665 + }, + { + "epoch": 18.826700898587934, + "grad_norm": 1.1743459701538086, + "learning_rate": 2.706033376123235e-05, + "loss": 0.4923, + "step": 14666 + }, + { + "epoch": 18.82798459563543, + "grad_norm": 3.186527729034424, + "learning_rate": 2.7059905862216516e-05, + "loss": 0.5026, + "step": 14667 + }, + { + "epoch": 18.829268292682926, + "grad_norm": 1.3204964399337769, + "learning_rate": 2.7059477963200685e-05, + "loss": 0.5178, + "step": 14668 + }, + { + "epoch": 18.830551989730424, + "grad_norm": 2.4990758895874023, + "learning_rate": 2.7059050064184853e-05, + "loss": 0.5922, + "step": 14669 + }, + { + "epoch": 18.83183568677792, + "grad_norm": 1.6903373003005981, + "learning_rate": 2.705862216516902e-05, + "loss": 0.5567, + "step": 14670 + }, + { + "epoch": 18.833119383825416, + "grad_norm": 2.570328950881958, + "learning_rate": 2.705819426615319e-05, + "loss": 0.5706, + "step": 14671 + }, + { + "epoch": 18.834403080872914, + "grad_norm": 3.8969945907592773, + "learning_rate": 2.7057766367137355e-05, + "loss": 0.6041, + "step": 14672 + }, + { + "epoch": 18.83568677792041, + "grad_norm": 1.292866826057434, + "learning_rate": 2.7057338468121527e-05, + "loss": 0.4304, + "step": 14673 + }, + { + "epoch": 18.836970474967906, + "grad_norm": 
1.1541999578475952, + "learning_rate": 2.7056910569105692e-05, + "loss": 0.4183, + "step": 14674 + }, + { + "epoch": 18.838254172015404, + "grad_norm": 0.9285330176353455, + "learning_rate": 2.7056482670089857e-05, + "loss": 0.4523, + "step": 14675 + }, + { + "epoch": 18.8395378690629, + "grad_norm": 3.0665290355682373, + "learning_rate": 2.705605477107403e-05, + "loss": 0.5069, + "step": 14676 + }, + { + "epoch": 18.8408215661104, + "grad_norm": 1.565011978149414, + "learning_rate": 2.7055626872058194e-05, + "loss": 0.4667, + "step": 14677 + }, + { + "epoch": 18.842105263157894, + "grad_norm": 0.9963200092315674, + "learning_rate": 2.7055198973042365e-05, + "loss": 0.4027, + "step": 14678 + }, + { + "epoch": 18.84338896020539, + "grad_norm": 1.1094037294387817, + "learning_rate": 2.705477107402653e-05, + "loss": 0.4883, + "step": 14679 + }, + { + "epoch": 18.84467265725289, + "grad_norm": 2.628549098968506, + "learning_rate": 2.70543431750107e-05, + "loss": 0.4482, + "step": 14680 + }, + { + "epoch": 18.845956354300384, + "grad_norm": 3.7084438800811768, + "learning_rate": 2.7053915275994867e-05, + "loss": 0.4488, + "step": 14681 + }, + { + "epoch": 18.84724005134788, + "grad_norm": 1.9569623470306396, + "learning_rate": 2.7053487376979032e-05, + "loss": 0.4671, + "step": 14682 + }, + { + "epoch": 18.84852374839538, + "grad_norm": 1.278043508529663, + "learning_rate": 2.70530594779632e-05, + "loss": 0.4554, + "step": 14683 + }, + { + "epoch": 18.849807445442874, + "grad_norm": 1.3173375129699707, + "learning_rate": 2.705263157894737e-05, + "loss": 0.462, + "step": 14684 + }, + { + "epoch": 18.85109114249037, + "grad_norm": 2.1301491260528564, + "learning_rate": 2.7052203679931537e-05, + "loss": 0.4525, + "step": 14685 + }, + { + "epoch": 18.85237483953787, + "grad_norm": 1.0507245063781738, + "learning_rate": 2.7051775780915702e-05, + "loss": 0.4474, + "step": 14686 + }, + { + "epoch": 18.853658536585368, + "grad_norm": 2.336656093597412, + "learning_rate": 
2.7051347881899874e-05, + "loss": 0.4428, + "step": 14687 + }, + { + "epoch": 18.854942233632862, + "grad_norm": 3.025179147720337, + "learning_rate": 2.705091998288404e-05, + "loss": 0.4395, + "step": 14688 + }, + { + "epoch": 18.85622593068036, + "grad_norm": 1.4338833093643188, + "learning_rate": 2.7050492083868208e-05, + "loss": 0.4668, + "step": 14689 + }, + { + "epoch": 18.857509627727858, + "grad_norm": 2.112870931625366, + "learning_rate": 2.7050064184852376e-05, + "loss": 0.4491, + "step": 14690 + }, + { + "epoch": 18.858793324775352, + "grad_norm": 3.791800022125244, + "learning_rate": 2.704963628583654e-05, + "loss": 0.4772, + "step": 14691 + }, + { + "epoch": 18.86007702182285, + "grad_norm": 2.0440139770507812, + "learning_rate": 2.7049208386820713e-05, + "loss": 0.4786, + "step": 14692 + }, + { + "epoch": 18.861360718870348, + "grad_norm": 2.687537908554077, + "learning_rate": 2.7048780487804878e-05, + "loss": 0.4448, + "step": 14693 + }, + { + "epoch": 18.862644415917842, + "grad_norm": 0.9991484880447388, + "learning_rate": 2.7048352588789046e-05, + "loss": 0.457, + "step": 14694 + }, + { + "epoch": 18.86392811296534, + "grad_norm": 2.033895969390869, + "learning_rate": 2.7047924689773215e-05, + "loss": 0.4466, + "step": 14695 + }, + { + "epoch": 18.865211810012838, + "grad_norm": 1.4061028957366943, + "learning_rate": 2.704749679075738e-05, + "loss": 0.4869, + "step": 14696 + }, + { + "epoch": 18.866495507060336, + "grad_norm": 1.6516858339309692, + "learning_rate": 2.704706889174155e-05, + "loss": 0.4912, + "step": 14697 + }, + { + "epoch": 18.86777920410783, + "grad_norm": 1.2678930759429932, + "learning_rate": 2.7046640992725717e-05, + "loss": 0.453, + "step": 14698 + }, + { + "epoch": 18.869062901155328, + "grad_norm": 1.2959272861480713, + "learning_rate": 2.7046213093709885e-05, + "loss": 0.4588, + "step": 14699 + }, + { + "epoch": 18.870346598202826, + "grad_norm": 6.5867438316345215, + "learning_rate": 2.7045785194694053e-05, + "loss": 
0.442, + "step": 14700 + }, + { + "epoch": 18.87163029525032, + "grad_norm": 2.494753122329712, + "learning_rate": 2.7045357295678222e-05, + "loss": 0.4548, + "step": 14701 + }, + { + "epoch": 18.872913992297818, + "grad_norm": 2.0589187145233154, + "learning_rate": 2.7044929396662387e-05, + "loss": 0.4441, + "step": 14702 + }, + { + "epoch": 18.874197689345316, + "grad_norm": 1.4065932035446167, + "learning_rate": 2.7044501497646555e-05, + "loss": 0.4988, + "step": 14703 + }, + { + "epoch": 18.87548138639281, + "grad_norm": 1.4435216188430786, + "learning_rate": 2.7044073598630724e-05, + "loss": 0.4737, + "step": 14704 + }, + { + "epoch": 18.876765083440308, + "grad_norm": 1.200711965560913, + "learning_rate": 2.7043645699614892e-05, + "loss": 0.5137, + "step": 14705 + }, + { + "epoch": 18.878048780487806, + "grad_norm": 1.5960183143615723, + "learning_rate": 2.704321780059906e-05, + "loss": 0.4417, + "step": 14706 + }, + { + "epoch": 18.8793324775353, + "grad_norm": 1.8588893413543701, + "learning_rate": 2.7042789901583226e-05, + "loss": 0.4836, + "step": 14707 + }, + { + "epoch": 18.880616174582798, + "grad_norm": 1.387946605682373, + "learning_rate": 2.7042362002567394e-05, + "loss": 0.4372, + "step": 14708 + }, + { + "epoch": 18.881899871630296, + "grad_norm": 3.3904199600219727, + "learning_rate": 2.7041934103551562e-05, + "loss": 0.5323, + "step": 14709 + }, + { + "epoch": 18.883183568677794, + "grad_norm": 1.161565899848938, + "learning_rate": 2.7041506204535727e-05, + "loss": 0.484, + "step": 14710 + }, + { + "epoch": 18.884467265725288, + "grad_norm": 1.7438769340515137, + "learning_rate": 2.70410783055199e-05, + "loss": 0.5031, + "step": 14711 + }, + { + "epoch": 18.885750962772786, + "grad_norm": 1.3231183290481567, + "learning_rate": 2.7040650406504064e-05, + "loss": 0.494, + "step": 14712 + }, + { + "epoch": 18.887034659820284, + "grad_norm": 1.9209421873092651, + "learning_rate": 2.7040222507488236e-05, + "loss": 0.4453, + "step": 14713 + }, + { + 
"epoch": 18.888318356867778, + "grad_norm": 1.8877768516540527, + "learning_rate": 2.70397946084724e-05, + "loss": 0.507, + "step": 14714 + }, + { + "epoch": 18.889602053915276, + "grad_norm": 0.9526986479759216, + "learning_rate": 2.7039366709456566e-05, + "loss": 0.5235, + "step": 14715 + }, + { + "epoch": 18.890885750962774, + "grad_norm": 1.1206928491592407, + "learning_rate": 2.7038938810440738e-05, + "loss": 0.5021, + "step": 14716 + }, + { + "epoch": 18.892169448010268, + "grad_norm": 1.6425906419754028, + "learning_rate": 2.7038510911424903e-05, + "loss": 0.4696, + "step": 14717 + }, + { + "epoch": 18.893453145057766, + "grad_norm": 6.424445152282715, + "learning_rate": 2.703808301240907e-05, + "loss": 0.5127, + "step": 14718 + }, + { + "epoch": 18.894736842105264, + "grad_norm": 2.535008668899536, + "learning_rate": 2.703765511339324e-05, + "loss": 0.5205, + "step": 14719 + }, + { + "epoch": 18.89602053915276, + "grad_norm": 1.5026171207427979, + "learning_rate": 2.7037227214377408e-05, + "loss": 0.5978, + "step": 14720 + }, + { + "epoch": 18.897304236200256, + "grad_norm": 2.781430244445801, + "learning_rate": 2.7036799315361577e-05, + "loss": 0.5314, + "step": 14721 + }, + { + "epoch": 18.898587933247754, + "grad_norm": 2.880795478820801, + "learning_rate": 2.703637141634574e-05, + "loss": 0.6629, + "step": 14722 + }, + { + "epoch": 18.89987163029525, + "grad_norm": 1.6902409791946411, + "learning_rate": 2.703594351732991e-05, + "loss": 0.4241, + "step": 14723 + }, + { + "epoch": 18.901155327342746, + "grad_norm": 1.1091282367706299, + "learning_rate": 2.703551561831408e-05, + "loss": 0.454, + "step": 14724 + }, + { + "epoch": 18.902439024390244, + "grad_norm": 0.9140360355377197, + "learning_rate": 2.7035087719298247e-05, + "loss": 0.422, + "step": 14725 + }, + { + "epoch": 18.90372272143774, + "grad_norm": 1.902040719985962, + "learning_rate": 2.7034659820282412e-05, + "loss": 0.4804, + "step": 14726 + }, + { + "epoch": 18.905006418485236, + 
"grad_norm": 1.1696234941482544, + "learning_rate": 2.7034231921266584e-05, + "loss": 0.486, + "step": 14727 + }, + { + "epoch": 18.906290115532734, + "grad_norm": 2.209468364715576, + "learning_rate": 2.703380402225075e-05, + "loss": 0.4262, + "step": 14728 + }, + { + "epoch": 18.90757381258023, + "grad_norm": 1.4257235527038574, + "learning_rate": 2.7033376123234917e-05, + "loss": 0.4722, + "step": 14729 + }, + { + "epoch": 18.90885750962773, + "grad_norm": 1.3210164308547974, + "learning_rate": 2.7032948224219085e-05, + "loss": 0.4709, + "step": 14730 + }, + { + "epoch": 18.910141206675224, + "grad_norm": 2.5113556385040283, + "learning_rate": 2.703252032520325e-05, + "loss": 0.4519, + "step": 14731 + }, + { + "epoch": 18.911424903722722, + "grad_norm": 1.3584363460540771, + "learning_rate": 2.7032092426187422e-05, + "loss": 0.443, + "step": 14732 + }, + { + "epoch": 18.91270860077022, + "grad_norm": 10.204439163208008, + "learning_rate": 2.7031664527171587e-05, + "loss": 0.4559, + "step": 14733 + }, + { + "epoch": 18.913992297817714, + "grad_norm": 17.49310302734375, + "learning_rate": 2.7031236628155756e-05, + "loss": 0.4353, + "step": 14734 + }, + { + "epoch": 18.915275994865212, + "grad_norm": 1.3088935613632202, + "learning_rate": 2.7030808729139924e-05, + "loss": 0.4927, + "step": 14735 + }, + { + "epoch": 18.91655969191271, + "grad_norm": 1.4513479471206665, + "learning_rate": 2.703038083012409e-05, + "loss": 0.4507, + "step": 14736 + }, + { + "epoch": 18.917843388960204, + "grad_norm": 1.8152116537094116, + "learning_rate": 2.702995293110826e-05, + "loss": 0.4334, + "step": 14737 + }, + { + "epoch": 18.919127086007702, + "grad_norm": 4.285555839538574, + "learning_rate": 2.7029525032092426e-05, + "loss": 0.4896, + "step": 14738 + }, + { + "epoch": 18.9204107830552, + "grad_norm": 1.2938191890716553, + "learning_rate": 2.7029097133076594e-05, + "loss": 0.4543, + "step": 14739 + }, + { + "epoch": 18.921694480102694, + "grad_norm": 1.138644814491272, + 
"learning_rate": 2.7028669234060763e-05, + "loss": 0.4243, + "step": 14740 + }, + { + "epoch": 18.922978177150192, + "grad_norm": 1.5843502283096313, + "learning_rate": 2.702824133504493e-05, + "loss": 0.4295, + "step": 14741 + }, + { + "epoch": 18.92426187419769, + "grad_norm": 3.5871448516845703, + "learning_rate": 2.7027813436029096e-05, + "loss": 0.4567, + "step": 14742 + }, + { + "epoch": 18.925545571245188, + "grad_norm": 1.274005651473999, + "learning_rate": 2.7027385537013265e-05, + "loss": 0.4558, + "step": 14743 + }, + { + "epoch": 18.926829268292682, + "grad_norm": 1.8104822635650635, + "learning_rate": 2.7026957637997433e-05, + "loss": 0.4463, + "step": 14744 + }, + { + "epoch": 18.92811296534018, + "grad_norm": 1.6593092679977417, + "learning_rate": 2.70265297389816e-05, + "loss": 0.4667, + "step": 14745 + }, + { + "epoch": 18.929396662387678, + "grad_norm": 2.492493152618408, + "learning_rate": 2.702610183996577e-05, + "loss": 0.43, + "step": 14746 + }, + { + "epoch": 18.930680359435172, + "grad_norm": 0.9852235913276672, + "learning_rate": 2.7025673940949935e-05, + "loss": 0.4777, + "step": 14747 + }, + { + "epoch": 18.93196405648267, + "grad_norm": 1.7614448070526123, + "learning_rate": 2.7025246041934107e-05, + "loss": 0.4459, + "step": 14748 + }, + { + "epoch": 18.933247753530168, + "grad_norm": 1.1331992149353027, + "learning_rate": 2.7024818142918272e-05, + "loss": 0.4439, + "step": 14749 + }, + { + "epoch": 18.934531450577662, + "grad_norm": 1.310747742652893, + "learning_rate": 2.7024390243902437e-05, + "loss": 0.4687, + "step": 14750 + }, + { + "epoch": 18.93581514762516, + "grad_norm": 6.375281810760498, + "learning_rate": 2.702396234488661e-05, + "loss": 0.4866, + "step": 14751 + }, + { + "epoch": 18.937098844672658, + "grad_norm": 1.8751732110977173, + "learning_rate": 2.7023534445870774e-05, + "loss": 0.4753, + "step": 14752 + }, + { + "epoch": 18.938382541720156, + "grad_norm": 0.9779677391052246, + "learning_rate": 
2.7023106546854945e-05, + "loss": 0.4701, + "step": 14753 + }, + { + "epoch": 18.93966623876765, + "grad_norm": 1.2203651666641235, + "learning_rate": 2.702267864783911e-05, + "loss": 0.4375, + "step": 14754 + }, + { + "epoch": 18.940949935815148, + "grad_norm": 3.1084749698638916, + "learning_rate": 2.702225074882328e-05, + "loss": 0.485, + "step": 14755 + }, + { + "epoch": 18.942233632862646, + "grad_norm": 1.3699411153793335, + "learning_rate": 2.7021822849807447e-05, + "loss": 0.443, + "step": 14756 + }, + { + "epoch": 18.94351732991014, + "grad_norm": 0.8970527648925781, + "learning_rate": 2.7021394950791612e-05, + "loss": 0.4672, + "step": 14757 + }, + { + "epoch": 18.944801026957638, + "grad_norm": 3.2750766277313232, + "learning_rate": 2.702096705177578e-05, + "loss": 0.4811, + "step": 14758 + }, + { + "epoch": 18.946084724005136, + "grad_norm": 0.9669984579086304, + "learning_rate": 2.702053915275995e-05, + "loss": 0.4863, + "step": 14759 + }, + { + "epoch": 18.94736842105263, + "grad_norm": 1.82438063621521, + "learning_rate": 2.7020111253744118e-05, + "loss": 0.5149, + "step": 14760 + }, + { + "epoch": 18.948652118100128, + "grad_norm": 2.569269895553589, + "learning_rate": 2.7019683354728286e-05, + "loss": 0.4854, + "step": 14761 + }, + { + "epoch": 18.949935815147626, + "grad_norm": 1.6067193746566772, + "learning_rate": 2.7019255455712454e-05, + "loss": 0.5312, + "step": 14762 + }, + { + "epoch": 18.951219512195124, + "grad_norm": 1.8000415563583374, + "learning_rate": 2.701882755669662e-05, + "loss": 0.4747, + "step": 14763 + }, + { + "epoch": 18.952503209242618, + "grad_norm": 1.2816287279129028, + "learning_rate": 2.7018399657680788e-05, + "loss": 0.4933, + "step": 14764 + }, + { + "epoch": 18.953786906290116, + "grad_norm": 0.9945545792579651, + "learning_rate": 2.7017971758664956e-05, + "loss": 0.4767, + "step": 14765 + }, + { + "epoch": 18.955070603337614, + "grad_norm": 2.260003089904785, + "learning_rate": 2.701754385964912e-05, + "loss": 
0.4532, + "step": 14766 + }, + { + "epoch": 18.956354300385108, + "grad_norm": 1.7030054330825806, + "learning_rate": 2.7017115960633293e-05, + "loss": 0.5057, + "step": 14767 + }, + { + "epoch": 18.957637997432606, + "grad_norm": 1.5558990240097046, + "learning_rate": 2.7016688061617458e-05, + "loss": 0.5495, + "step": 14768 + }, + { + "epoch": 18.958921694480104, + "grad_norm": 1.300426959991455, + "learning_rate": 2.7016260162601626e-05, + "loss": 0.48, + "step": 14769 + }, + { + "epoch": 18.960205391527598, + "grad_norm": 1.3054510354995728, + "learning_rate": 2.7015832263585795e-05, + "loss": 0.5332, + "step": 14770 + }, + { + "epoch": 18.961489088575096, + "grad_norm": 2.330885648727417, + "learning_rate": 2.701540436456996e-05, + "loss": 0.5986, + "step": 14771 + }, + { + "epoch": 18.962772785622594, + "grad_norm": 3.302727699279785, + "learning_rate": 2.701497646555413e-05, + "loss": 0.6986, + "step": 14772 + }, + { + "epoch": 18.964056482670088, + "grad_norm": 1.8452140092849731, + "learning_rate": 2.7014548566538297e-05, + "loss": 0.4293, + "step": 14773 + }, + { + "epoch": 18.965340179717586, + "grad_norm": 1.0836002826690674, + "learning_rate": 2.7014120667522465e-05, + "loss": 0.4384, + "step": 14774 + }, + { + "epoch": 18.966623876765084, + "grad_norm": 1.4609113931655884, + "learning_rate": 2.7013692768506634e-05, + "loss": 0.4499, + "step": 14775 + }, + { + "epoch": 18.96790757381258, + "grad_norm": 1.1078394651412964, + "learning_rate": 2.70132648694908e-05, + "loss": 0.425, + "step": 14776 + }, + { + "epoch": 18.969191270860076, + "grad_norm": 1.3258795738220215, + "learning_rate": 2.701283697047497e-05, + "loss": 0.4907, + "step": 14777 + }, + { + "epoch": 18.970474967907574, + "grad_norm": 4.818121433258057, + "learning_rate": 2.7012409071459135e-05, + "loss": 0.4846, + "step": 14778 + }, + { + "epoch": 18.971758664955072, + "grad_norm": 0.7094292044639587, + "learning_rate": 2.7011981172443304e-05, + "loss": 0.4557, + "step": 14779 + }, + { + 
"epoch": 18.973042362002566, + "grad_norm": 1.2287737131118774, + "learning_rate": 2.7011553273427472e-05, + "loss": 0.425, + "step": 14780 + }, + { + "epoch": 18.974326059050064, + "grad_norm": 1.1857343912124634, + "learning_rate": 2.701112537441164e-05, + "loss": 0.4833, + "step": 14781 + }, + { + "epoch": 18.975609756097562, + "grad_norm": 1.177949070930481, + "learning_rate": 2.7010697475395806e-05, + "loss": 0.4716, + "step": 14782 + }, + { + "epoch": 18.976893453145056, + "grad_norm": 2.0400190353393555, + "learning_rate": 2.7010269576379974e-05, + "loss": 0.4398, + "step": 14783 + }, + { + "epoch": 18.978177150192554, + "grad_norm": 0.9346357583999634, + "learning_rate": 2.7009841677364142e-05, + "loss": 0.4559, + "step": 14784 + }, + { + "epoch": 18.979460847240052, + "grad_norm": 1.6860301494598389, + "learning_rate": 2.700941377834831e-05, + "loss": 0.4319, + "step": 14785 + }, + { + "epoch": 18.98074454428755, + "grad_norm": 1.0367993116378784, + "learning_rate": 2.700898587933248e-05, + "loss": 0.4683, + "step": 14786 + }, + { + "epoch": 18.982028241335044, + "grad_norm": 4.675259113311768, + "learning_rate": 2.7008557980316644e-05, + "loss": 0.4639, + "step": 14787 + }, + { + "epoch": 18.983311938382542, + "grad_norm": 2.458491086959839, + "learning_rate": 2.7008130081300816e-05, + "loss": 0.4343, + "step": 14788 + }, + { + "epoch": 18.98459563543004, + "grad_norm": 5.4706525802612305, + "learning_rate": 2.700770218228498e-05, + "loss": 0.448, + "step": 14789 + }, + { + "epoch": 18.985879332477534, + "grad_norm": 3.0262465476989746, + "learning_rate": 2.7007274283269146e-05, + "loss": 0.4565, + "step": 14790 + }, + { + "epoch": 18.987163029525032, + "grad_norm": 1.1102880239486694, + "learning_rate": 2.7006846384253318e-05, + "loss": 0.4657, + "step": 14791 + }, + { + "epoch": 18.98844672657253, + "grad_norm": 4.744394302368164, + "learning_rate": 2.7006418485237483e-05, + "loss": 0.4825, + "step": 14792 + }, + { + "epoch": 18.989730423620024, + 
"grad_norm": 1.9118893146514893, + "learning_rate": 2.7005990586221655e-05, + "loss": 0.495, + "step": 14793 + }, + { + "epoch": 18.991014120667522, + "grad_norm": 1.4043521881103516, + "learning_rate": 2.700556268720582e-05, + "loss": 0.4506, + "step": 14794 + }, + { + "epoch": 18.99229781771502, + "grad_norm": 1.351712942123413, + "learning_rate": 2.7005134788189988e-05, + "loss": 0.5183, + "step": 14795 + }, + { + "epoch": 18.993581514762518, + "grad_norm": 1.3272533416748047, + "learning_rate": 2.7004706889174157e-05, + "loss": 0.4998, + "step": 14796 + }, + { + "epoch": 18.994865211810012, + "grad_norm": 1.5686482191085815, + "learning_rate": 2.700427899015832e-05, + "loss": 0.4815, + "step": 14797 + }, + { + "epoch": 18.99614890885751, + "grad_norm": 1.9490779638290405, + "learning_rate": 2.700385109114249e-05, + "loss": 0.5253, + "step": 14798 + }, + { + "epoch": 18.997432605905008, + "grad_norm": 1.333466649055481, + "learning_rate": 2.700342319212666e-05, + "loss": 0.5173, + "step": 14799 + }, + { + "epoch": 18.998716302952502, + "grad_norm": 3.5314347743988037, + "learning_rate": 2.7002995293110827e-05, + "loss": 0.5924, + "step": 14800 + }, + { + "epoch": 19.0, + "grad_norm": 2.608944892883301, + "learning_rate": 2.7002567394094995e-05, + "loss": 0.6651, + "step": 14801 + }, + { + "epoch": 19.001283697047498, + "grad_norm": 1.3039050102233887, + "learning_rate": 2.7002139495079164e-05, + "loss": 0.4144, + "step": 14802 + }, + { + "epoch": 19.002567394094992, + "grad_norm": 1.277435064315796, + "learning_rate": 2.700171159606333e-05, + "loss": 0.4454, + "step": 14803 + }, + { + "epoch": 19.00385109114249, + "grad_norm": 3.543565273284912, + "learning_rate": 2.7001283697047497e-05, + "loss": 0.4081, + "step": 14804 + }, + { + "epoch": 19.005134788189988, + "grad_norm": 1.155888557434082, + "learning_rate": 2.7000855798031666e-05, + "loss": 0.4566, + "step": 14805 + }, + { + "epoch": 19.006418485237482, + "grad_norm": 0.8618676066398621, + "learning_rate": 
2.700042789901583e-05, + "loss": 0.4269, + "step": 14806 + }, + { + "epoch": 19.00770218228498, + "grad_norm": 0.9812639951705933, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.4591, + "step": 14807 + }, + { + "epoch": 19.008985879332478, + "grad_norm": 1.3107261657714844, + "learning_rate": 2.6999572100984167e-05, + "loss": 0.4388, + "step": 14808 + }, + { + "epoch": 19.010269576379976, + "grad_norm": 1.4691047668457031, + "learning_rate": 2.699914420196834e-05, + "loss": 0.4601, + "step": 14809 + }, + { + "epoch": 19.01155327342747, + "grad_norm": 2.4753522872924805, + "learning_rate": 2.6998716302952504e-05, + "loss": 0.4375, + "step": 14810 + }, + { + "epoch": 19.012836970474968, + "grad_norm": 1.6324880123138428, + "learning_rate": 2.699828840393667e-05, + "loss": 0.4741, + "step": 14811 + }, + { + "epoch": 19.014120667522466, + "grad_norm": 2.836857318878174, + "learning_rate": 2.699786050492084e-05, + "loss": 0.4441, + "step": 14812 + }, + { + "epoch": 19.01540436456996, + "grad_norm": 1.193660855293274, + "learning_rate": 2.6997432605905006e-05, + "loss": 0.4442, + "step": 14813 + }, + { + "epoch": 19.016688061617458, + "grad_norm": 1.3041282892227173, + "learning_rate": 2.6997004706889174e-05, + "loss": 0.4158, + "step": 14814 + }, + { + "epoch": 19.017971758664956, + "grad_norm": 3.1618032455444336, + "learning_rate": 2.6996576807873343e-05, + "loss": 0.4261, + "step": 14815 + }, + { + "epoch": 19.01925545571245, + "grad_norm": 3.4228501319885254, + "learning_rate": 2.699614890885751e-05, + "loss": 0.4306, + "step": 14816 + }, + { + "epoch": 19.020539152759948, + "grad_norm": 1.2007747888565063, + "learning_rate": 2.699572100984168e-05, + "loss": 0.4322, + "step": 14817 + }, + { + "epoch": 19.021822849807446, + "grad_norm": 1.445026159286499, + "learning_rate": 2.6995293110825845e-05, + "loss": 0.4361, + "step": 14818 + }, + { + "epoch": 19.023106546854944, + "grad_norm": 5.957614421844482, + "learning_rate": 2.6994865211810013e-05, + "loss": 
0.4474, + "step": 14819 + }, + { + "epoch": 19.024390243902438, + "grad_norm": 1.9808940887451172, + "learning_rate": 2.699443731279418e-05, + "loss": 0.4267, + "step": 14820 + }, + { + "epoch": 19.025673940949936, + "grad_norm": 1.0557304620742798, + "learning_rate": 2.699400941377835e-05, + "loss": 0.4274, + "step": 14821 + }, + { + "epoch": 19.026957637997434, + "grad_norm": 1.1368738412857056, + "learning_rate": 2.6993581514762515e-05, + "loss": 0.4669, + "step": 14822 + }, + { + "epoch": 19.028241335044928, + "grad_norm": 3.1196107864379883, + "learning_rate": 2.6993153615746687e-05, + "loss": 0.4532, + "step": 14823 + }, + { + "epoch": 19.029525032092426, + "grad_norm": 1.4401696920394897, + "learning_rate": 2.6992725716730852e-05, + "loss": 0.4229, + "step": 14824 + }, + { + "epoch": 19.030808729139924, + "grad_norm": 2.4290966987609863, + "learning_rate": 2.699229781771502e-05, + "loss": 0.4124, + "step": 14825 + }, + { + "epoch": 19.03209242618742, + "grad_norm": 1.7286317348480225, + "learning_rate": 2.699186991869919e-05, + "loss": 0.4603, + "step": 14826 + }, + { + "epoch": 19.033376123234916, + "grad_norm": 1.386616826057434, + "learning_rate": 2.6991442019683354e-05, + "loss": 0.4327, + "step": 14827 + }, + { + "epoch": 19.034659820282414, + "grad_norm": 1.283193588256836, + "learning_rate": 2.6991014120667525e-05, + "loss": 0.3891, + "step": 14828 + }, + { + "epoch": 19.035943517329912, + "grad_norm": 0.9859154224395752, + "learning_rate": 2.699058622165169e-05, + "loss": 0.433, + "step": 14829 + }, + { + "epoch": 19.037227214377406, + "grad_norm": 2.3060429096221924, + "learning_rate": 2.699015832263586e-05, + "loss": 0.4063, + "step": 14830 + }, + { + "epoch": 19.038510911424904, + "grad_norm": 1.1715747117996216, + "learning_rate": 2.6989730423620027e-05, + "loss": 0.4377, + "step": 14831 + }, + { + "epoch": 19.039794608472402, + "grad_norm": 1.4067057371139526, + "learning_rate": 2.6989302524604192e-05, + "loss": 0.4641, + "step": 14832 + }, + { 
+ "epoch": 19.041078305519896, + "grad_norm": 2.0680460929870605, + "learning_rate": 2.6988874625588364e-05, + "loss": 0.4452, + "step": 14833 + }, + { + "epoch": 19.042362002567394, + "grad_norm": 3.250302791595459, + "learning_rate": 2.698844672657253e-05, + "loss": 0.4689, + "step": 14834 + }, + { + "epoch": 19.043645699614892, + "grad_norm": 25.657825469970703, + "learning_rate": 2.6988018827556698e-05, + "loss": 0.4418, + "step": 14835 + }, + { + "epoch": 19.044929396662386, + "grad_norm": 1.5909854173660278, + "learning_rate": 2.6987590928540866e-05, + "loss": 0.4473, + "step": 14836 + }, + { + "epoch": 19.046213093709884, + "grad_norm": 1.4350398778915405, + "learning_rate": 2.698716302952503e-05, + "loss": 0.449, + "step": 14837 + }, + { + "epoch": 19.047496790757382, + "grad_norm": 3.9036073684692383, + "learning_rate": 2.69867351305092e-05, + "loss": 0.4437, + "step": 14838 + }, + { + "epoch": 19.048780487804876, + "grad_norm": 4.678514003753662, + "learning_rate": 2.6986307231493368e-05, + "loss": 0.5159, + "step": 14839 + }, + { + "epoch": 19.050064184852374, + "grad_norm": 1.541546106338501, + "learning_rate": 2.6985879332477536e-05, + "loss": 0.4549, + "step": 14840 + }, + { + "epoch": 19.051347881899872, + "grad_norm": 2.6431705951690674, + "learning_rate": 2.6985451433461705e-05, + "loss": 0.4298, + "step": 14841 + }, + { + "epoch": 19.05263157894737, + "grad_norm": 2.163710832595825, + "learning_rate": 2.6985023534445873e-05, + "loss": 0.513, + "step": 14842 + }, + { + "epoch": 19.053915275994864, + "grad_norm": 1.2209889888763428, + "learning_rate": 2.6984595635430038e-05, + "loss": 0.4651, + "step": 14843 + }, + { + "epoch": 19.055198973042362, + "grad_norm": 7.524646759033203, + "learning_rate": 2.6984167736414207e-05, + "loss": 0.4819, + "step": 14844 + }, + { + "epoch": 19.05648267008986, + "grad_norm": 1.6009858846664429, + "learning_rate": 2.6983739837398375e-05, + "loss": 0.509, + "step": 14845 + }, + { + "epoch": 19.057766367137354, + 
"grad_norm": 1.7441039085388184, + "learning_rate": 2.698331193838254e-05, + "loss": 0.5028, + "step": 14846 + }, + { + "epoch": 19.059050064184852, + "grad_norm": 2.022263526916504, + "learning_rate": 2.6982884039366712e-05, + "loss": 0.4907, + "step": 14847 + }, + { + "epoch": 19.06033376123235, + "grad_norm": 1.8783721923828125, + "learning_rate": 2.6982456140350877e-05, + "loss": 0.4671, + "step": 14848 + }, + { + "epoch": 19.061617458279844, + "grad_norm": 2.136432647705078, + "learning_rate": 2.698202824133505e-05, + "loss": 0.576, + "step": 14849 + }, + { + "epoch": 19.062901155327342, + "grad_norm": 3.505390167236328, + "learning_rate": 2.6981600342319214e-05, + "loss": 0.5739, + "step": 14850 + }, + { + "epoch": 19.06418485237484, + "grad_norm": 2.323103427886963, + "learning_rate": 2.698117244330338e-05, + "loss": 0.6609, + "step": 14851 + }, + { + "epoch": 19.065468549422338, + "grad_norm": 2.0331296920776367, + "learning_rate": 2.698074454428755e-05, + "loss": 0.3957, + "step": 14852 + }, + { + "epoch": 19.066752246469832, + "grad_norm": 1.7576078176498413, + "learning_rate": 2.6980316645271715e-05, + "loss": 0.4154, + "step": 14853 + }, + { + "epoch": 19.06803594351733, + "grad_norm": 1.055058479309082, + "learning_rate": 2.6979888746255884e-05, + "loss": 0.4261, + "step": 14854 + }, + { + "epoch": 19.069319640564828, + "grad_norm": 1.0047391653060913, + "learning_rate": 2.6979460847240052e-05, + "loss": 0.441, + "step": 14855 + }, + { + "epoch": 19.070603337612322, + "grad_norm": 0.9067020416259766, + "learning_rate": 2.697903294822422e-05, + "loss": 0.4262, + "step": 14856 + }, + { + "epoch": 19.07188703465982, + "grad_norm": 2.1231017112731934, + "learning_rate": 2.697860504920839e-05, + "loss": 0.4029, + "step": 14857 + }, + { + "epoch": 19.073170731707318, + "grad_norm": 1.1987286806106567, + "learning_rate": 2.6978177150192554e-05, + "loss": 0.4619, + "step": 14858 + }, + { + "epoch": 19.074454428754812, + "grad_norm": 1.3288599252700806, + 
"learning_rate": 2.6977749251176723e-05, + "loss": 0.4196, + "step": 14859 + }, + { + "epoch": 19.07573812580231, + "grad_norm": 2.854015588760376, + "learning_rate": 2.697732135216089e-05, + "loss": 0.4266, + "step": 14860 + }, + { + "epoch": 19.077021822849808, + "grad_norm": 1.2281908988952637, + "learning_rate": 2.697689345314506e-05, + "loss": 0.468, + "step": 14861 + }, + { + "epoch": 19.078305519897306, + "grad_norm": 0.9839210510253906, + "learning_rate": 2.6976465554129224e-05, + "loss": 0.4139, + "step": 14862 + }, + { + "epoch": 19.0795892169448, + "grad_norm": 1.089715838432312, + "learning_rate": 2.6976037655113396e-05, + "loss": 0.4017, + "step": 14863 + }, + { + "epoch": 19.080872913992298, + "grad_norm": 6.537239074707031, + "learning_rate": 2.697560975609756e-05, + "loss": 0.424, + "step": 14864 + }, + { + "epoch": 19.082156611039796, + "grad_norm": 1.4071401357650757, + "learning_rate": 2.697518185708173e-05, + "loss": 0.4535, + "step": 14865 + }, + { + "epoch": 19.08344030808729, + "grad_norm": 2.775541067123413, + "learning_rate": 2.6974753958065898e-05, + "loss": 0.4415, + "step": 14866 + }, + { + "epoch": 19.084724005134788, + "grad_norm": 1.6648204326629639, + "learning_rate": 2.6974326059050063e-05, + "loss": 0.4348, + "step": 14867 + }, + { + "epoch": 19.086007702182286, + "grad_norm": 1.4112695455551147, + "learning_rate": 2.6973898160034235e-05, + "loss": 0.4281, + "step": 14868 + }, + { + "epoch": 19.08729139922978, + "grad_norm": 1.483958125114441, + "learning_rate": 2.69734702610184e-05, + "loss": 0.4362, + "step": 14869 + }, + { + "epoch": 19.088575096277278, + "grad_norm": 6.546392917633057, + "learning_rate": 2.6973042362002568e-05, + "loss": 0.4702, + "step": 14870 + }, + { + "epoch": 19.089858793324776, + "grad_norm": 1.4211269617080688, + "learning_rate": 2.6972614462986737e-05, + "loss": 0.439, + "step": 14871 + }, + { + "epoch": 19.09114249037227, + "grad_norm": 1.1633695363998413, + "learning_rate": 2.6972186563970902e-05, + 
"loss": 0.4251, + "step": 14872 + }, + { + "epoch": 19.09242618741977, + "grad_norm": 3.1723220348358154, + "learning_rate": 2.6971758664955074e-05, + "loss": 0.4389, + "step": 14873 + }, + { + "epoch": 19.093709884467266, + "grad_norm": 3.1015024185180664, + "learning_rate": 2.697133076593924e-05, + "loss": 0.4062, + "step": 14874 + }, + { + "epoch": 19.094993581514764, + "grad_norm": 2.263700485229492, + "learning_rate": 2.6970902866923407e-05, + "loss": 0.4751, + "step": 14875 + }, + { + "epoch": 19.09627727856226, + "grad_norm": 1.8406802415847778, + "learning_rate": 2.6970474967907575e-05, + "loss": 0.4568, + "step": 14876 + }, + { + "epoch": 19.097560975609756, + "grad_norm": 2.8202714920043945, + "learning_rate": 2.6970047068891744e-05, + "loss": 0.4231, + "step": 14877 + }, + { + "epoch": 19.098844672657254, + "grad_norm": 1.4446721076965332, + "learning_rate": 2.696961916987591e-05, + "loss": 0.4041, + "step": 14878 + }, + { + "epoch": 19.10012836970475, + "grad_norm": 0.9963717460632324, + "learning_rate": 2.6969191270860077e-05, + "loss": 0.4503, + "step": 14879 + }, + { + "epoch": 19.101412066752246, + "grad_norm": 2.728095531463623, + "learning_rate": 2.6968763371844246e-05, + "loss": 0.4466, + "step": 14880 + }, + { + "epoch": 19.102695763799744, + "grad_norm": 2.322026491165161, + "learning_rate": 2.6968335472828414e-05, + "loss": 0.4736, + "step": 14881 + }, + { + "epoch": 19.10397946084724, + "grad_norm": 1.6414145231246948, + "learning_rate": 2.6967907573812582e-05, + "loss": 0.4626, + "step": 14882 + }, + { + "epoch": 19.105263157894736, + "grad_norm": 3.4593546390533447, + "learning_rate": 2.6967479674796747e-05, + "loss": 0.5086, + "step": 14883 + }, + { + "epoch": 19.106546854942234, + "grad_norm": 2.754228115081787, + "learning_rate": 2.696705177578092e-05, + "loss": 0.4718, + "step": 14884 + }, + { + "epoch": 19.107830551989732, + "grad_norm": 4.139229774475098, + "learning_rate": 2.6966623876765084e-05, + "loss": 0.4238, + "step": 14885 + 
}, + { + "epoch": 19.109114249037226, + "grad_norm": 1.5151461362838745, + "learning_rate": 2.696619597774925e-05, + "loss": 0.4999, + "step": 14886 + }, + { + "epoch": 19.110397946084724, + "grad_norm": 1.3942146301269531, + "learning_rate": 2.696576807873342e-05, + "loss": 0.4491, + "step": 14887 + }, + { + "epoch": 19.111681643132222, + "grad_norm": 2.2611167430877686, + "learning_rate": 2.6965340179717586e-05, + "loss": 0.4524, + "step": 14888 + }, + { + "epoch": 19.112965340179716, + "grad_norm": 2.2731542587280273, + "learning_rate": 2.6964912280701755e-05, + "loss": 0.4301, + "step": 14889 + }, + { + "epoch": 19.114249037227214, + "grad_norm": 3.379033327102661, + "learning_rate": 2.6964484381685923e-05, + "loss": 0.4189, + "step": 14890 + }, + { + "epoch": 19.115532734274712, + "grad_norm": 3.095942497253418, + "learning_rate": 2.696405648267009e-05, + "loss": 0.495, + "step": 14891 + }, + { + "epoch": 19.116816431322206, + "grad_norm": 2.0879244804382324, + "learning_rate": 2.696362858365426e-05, + "loss": 0.4765, + "step": 14892 + }, + { + "epoch": 19.118100128369704, + "grad_norm": 1.7340461015701294, + "learning_rate": 2.6963200684638425e-05, + "loss": 0.4612, + "step": 14893 + }, + { + "epoch": 19.119383825417202, + "grad_norm": 1.0922826528549194, + "learning_rate": 2.6962772785622593e-05, + "loss": 0.4272, + "step": 14894 + }, + { + "epoch": 19.1206675224647, + "grad_norm": 1.5437384843826294, + "learning_rate": 2.696234488660676e-05, + "loss": 0.4731, + "step": 14895 + }, + { + "epoch": 19.121951219512194, + "grad_norm": 1.2346675395965576, + "learning_rate": 2.696191698759093e-05, + "loss": 0.4728, + "step": 14896 + }, + { + "epoch": 19.123234916559692, + "grad_norm": 7.188694000244141, + "learning_rate": 2.6961489088575095e-05, + "loss": 0.4562, + "step": 14897 + }, + { + "epoch": 19.12451861360719, + "grad_norm": 1.8098459243774414, + "learning_rate": 2.6961061189559263e-05, + "loss": 0.4746, + "step": 14898 + }, + { + "epoch": 
19.125802310654684, + "grad_norm": 2.3842809200286865, + "learning_rate": 2.6960633290543432e-05, + "loss": 0.5341, + "step": 14899 + }, + { + "epoch": 19.127086007702182, + "grad_norm": 3.9867193698883057, + "learning_rate": 2.69602053915276e-05, + "loss": 0.4814, + "step": 14900 + }, + { + "epoch": 19.12836970474968, + "grad_norm": 14.78775405883789, + "learning_rate": 2.695977749251177e-05, + "loss": 0.6136, + "step": 14901 + }, + { + "epoch": 19.129653401797174, + "grad_norm": 5.124026298522949, + "learning_rate": 2.6959349593495934e-05, + "loss": 0.4143, + "step": 14902 + }, + { + "epoch": 19.130937098844672, + "grad_norm": 1.2260957956314087, + "learning_rate": 2.6958921694480106e-05, + "loss": 0.4054, + "step": 14903 + }, + { + "epoch": 19.13222079589217, + "grad_norm": 1.6874122619628906, + "learning_rate": 2.695849379546427e-05, + "loss": 0.4434, + "step": 14904 + }, + { + "epoch": 19.133504492939664, + "grad_norm": 1.6631277799606323, + "learning_rate": 2.6958065896448436e-05, + "loss": 0.4131, + "step": 14905 + }, + { + "epoch": 19.134788189987162, + "grad_norm": 1.1904524564743042, + "learning_rate": 2.6957637997432607e-05, + "loss": 0.4515, + "step": 14906 + }, + { + "epoch": 19.13607188703466, + "grad_norm": 2.145611047744751, + "learning_rate": 2.6957210098416772e-05, + "loss": 0.4457, + "step": 14907 + }, + { + "epoch": 19.137355584082158, + "grad_norm": 1.164116382598877, + "learning_rate": 2.6956782199400944e-05, + "loss": 0.4384, + "step": 14908 + }, + { + "epoch": 19.138639281129652, + "grad_norm": 1.1222962141036987, + "learning_rate": 2.695635430038511e-05, + "loss": 0.446, + "step": 14909 + }, + { + "epoch": 19.13992297817715, + "grad_norm": 1.6081151962280273, + "learning_rate": 2.6955926401369278e-05, + "loss": 0.4233, + "step": 14910 + }, + { + "epoch": 19.141206675224648, + "grad_norm": 1.370553970336914, + "learning_rate": 2.6955498502353446e-05, + "loss": 0.4546, + "step": 14911 + }, + { + "epoch": 19.142490372272142, + "grad_norm": 
5.252354621887207, + "learning_rate": 2.695507060333761e-05, + "loss": 0.4237, + "step": 14912 + }, + { + "epoch": 19.14377406931964, + "grad_norm": 1.5070710182189941, + "learning_rate": 2.695464270432178e-05, + "loss": 0.4631, + "step": 14913 + }, + { + "epoch": 19.145057766367138, + "grad_norm": 3.087400436401367, + "learning_rate": 2.6954214805305948e-05, + "loss": 0.4172, + "step": 14914 + }, + { + "epoch": 19.146341463414632, + "grad_norm": 2.132852792739868, + "learning_rate": 2.6953786906290116e-05, + "loss": 0.4205, + "step": 14915 + }, + { + "epoch": 19.14762516046213, + "grad_norm": 1.8649641275405884, + "learning_rate": 2.6953359007274285e-05, + "loss": 0.4013, + "step": 14916 + }, + { + "epoch": 19.14890885750963, + "grad_norm": 1.1354793310165405, + "learning_rate": 2.6952931108258453e-05, + "loss": 0.442, + "step": 14917 + }, + { + "epoch": 19.150192554557126, + "grad_norm": 1.4518442153930664, + "learning_rate": 2.6952503209242618e-05, + "loss": 0.4102, + "step": 14918 + }, + { + "epoch": 19.15147625160462, + "grad_norm": 1.2954280376434326, + "learning_rate": 2.6952075310226787e-05, + "loss": 0.4377, + "step": 14919 + }, + { + "epoch": 19.15275994865212, + "grad_norm": 1.1304333209991455, + "learning_rate": 2.6951647411210955e-05, + "loss": 0.4211, + "step": 14920 + }, + { + "epoch": 19.154043645699616, + "grad_norm": 4.570643424987793, + "learning_rate": 2.695121951219512e-05, + "loss": 0.4551, + "step": 14921 + }, + { + "epoch": 19.15532734274711, + "grad_norm": 1.4469525814056396, + "learning_rate": 2.6950791613179292e-05, + "loss": 0.4368, + "step": 14922 + }, + { + "epoch": 19.15661103979461, + "grad_norm": 4.6810622215271, + "learning_rate": 2.6950363714163457e-05, + "loss": 0.4435, + "step": 14923 + }, + { + "epoch": 19.157894736842106, + "grad_norm": 3.4807281494140625, + "learning_rate": 2.694993581514763e-05, + "loss": 0.4569, + "step": 14924 + }, + { + "epoch": 19.1591784338896, + "grad_norm": 1.0767353773117065, + "learning_rate": 
2.6949507916131794e-05, + "loss": 0.4198, + "step": 14925 + }, + { + "epoch": 19.1604621309371, + "grad_norm": 2.5958895683288574, + "learning_rate": 2.694908001711596e-05, + "loss": 0.4354, + "step": 14926 + }, + { + "epoch": 19.161745827984596, + "grad_norm": 1.8587251901626587, + "learning_rate": 2.694865211810013e-05, + "loss": 0.4247, + "step": 14927 + }, + { + "epoch": 19.163029525032094, + "grad_norm": 7.604872226715088, + "learning_rate": 2.6948224219084295e-05, + "loss": 0.4585, + "step": 14928 + }, + { + "epoch": 19.16431322207959, + "grad_norm": 1.2569167613983154, + "learning_rate": 2.6947796320068464e-05, + "loss": 0.4615, + "step": 14929 + }, + { + "epoch": 19.165596919127086, + "grad_norm": 1.3668136596679688, + "learning_rate": 2.6947368421052632e-05, + "loss": 0.4146, + "step": 14930 + }, + { + "epoch": 19.166880616174584, + "grad_norm": 1.722599744796753, + "learning_rate": 2.69469405220368e-05, + "loss": 0.4815, + "step": 14931 + }, + { + "epoch": 19.16816431322208, + "grad_norm": 3.353116273880005, + "learning_rate": 2.694651262302097e-05, + "loss": 0.4476, + "step": 14932 + }, + { + "epoch": 19.169448010269576, + "grad_norm": 5.463720798492432, + "learning_rate": 2.6946084724005134e-05, + "loss": 0.4043, + "step": 14933 + }, + { + "epoch": 19.170731707317074, + "grad_norm": 3.435800313949585, + "learning_rate": 2.6945656824989303e-05, + "loss": 0.4693, + "step": 14934 + }, + { + "epoch": 19.17201540436457, + "grad_norm": 1.8747169971466064, + "learning_rate": 2.694522892597347e-05, + "loss": 0.4475, + "step": 14935 + }, + { + "epoch": 19.173299101412066, + "grad_norm": 1.5324875116348267, + "learning_rate": 2.694480102695764e-05, + "loss": 0.4578, + "step": 14936 + }, + { + "epoch": 19.174582798459564, + "grad_norm": 2.1566226482391357, + "learning_rate": 2.6944373127941804e-05, + "loss": 0.4249, + "step": 14937 + }, + { + "epoch": 19.17586649550706, + "grad_norm": 1.3009999990463257, + "learning_rate": 2.6943945228925976e-05, + "loss": 0.5011, 
+ "step": 14938 + }, + { + "epoch": 19.177150192554556, + "grad_norm": 1.4783570766448975, + "learning_rate": 2.694351732991014e-05, + "loss": 0.4705, + "step": 14939 + }, + { + "epoch": 19.178433889602054, + "grad_norm": 2.563509702682495, + "learning_rate": 2.694308943089431e-05, + "loss": 0.4817, + "step": 14940 + }, + { + "epoch": 19.179717586649552, + "grad_norm": 2.9167208671569824, + "learning_rate": 2.6942661531878478e-05, + "loss": 0.447, + "step": 14941 + }, + { + "epoch": 19.181001283697046, + "grad_norm": 1.3428329229354858, + "learning_rate": 2.6942233632862643e-05, + "loss": 0.4269, + "step": 14942 + }, + { + "epoch": 19.182284980744544, + "grad_norm": 2.8069863319396973, + "learning_rate": 2.6941805733846815e-05, + "loss": 0.5071, + "step": 14943 + }, + { + "epoch": 19.183568677792042, + "grad_norm": 1.4167726039886475, + "learning_rate": 2.694137783483098e-05, + "loss": 0.4669, + "step": 14944 + }, + { + "epoch": 19.184852374839537, + "grad_norm": 3.4191930294036865, + "learning_rate": 2.694094993581515e-05, + "loss": 0.5115, + "step": 14945 + }, + { + "epoch": 19.186136071887034, + "grad_norm": 1.5126967430114746, + "learning_rate": 2.6940522036799317e-05, + "loss": 0.4751, + "step": 14946 + }, + { + "epoch": 19.187419768934532, + "grad_norm": 1.641258716583252, + "learning_rate": 2.6940094137783482e-05, + "loss": 0.4716, + "step": 14947 + }, + { + "epoch": 19.188703465982027, + "grad_norm": 1.8350684642791748, + "learning_rate": 2.6939666238767654e-05, + "loss": 0.5086, + "step": 14948 + }, + { + "epoch": 19.189987163029524, + "grad_norm": 4.778802394866943, + "learning_rate": 2.693923833975182e-05, + "loss": 0.5176, + "step": 14949 + }, + { + "epoch": 19.191270860077022, + "grad_norm": 2.216984748840332, + "learning_rate": 2.6938810440735987e-05, + "loss": 0.5406, + "step": 14950 + }, + { + "epoch": 19.19255455712452, + "grad_norm": 1.922377347946167, + "learning_rate": 2.6938382541720155e-05, + "loss": 0.6714, + "step": 14951 + }, + { + "epoch": 
19.193838254172015, + "grad_norm": 1.0584619045257568, + "learning_rate": 2.6937954642704324e-05, + "loss": 0.4365, + "step": 14952 + }, + { + "epoch": 19.195121951219512, + "grad_norm": 1.0122994184494019, + "learning_rate": 2.693752674368849e-05, + "loss": 0.3986, + "step": 14953 + }, + { + "epoch": 19.19640564826701, + "grad_norm": 2.0213332176208496, + "learning_rate": 2.6937098844672657e-05, + "loss": 0.4172, + "step": 14954 + }, + { + "epoch": 19.197689345314505, + "grad_norm": 2.768584728240967, + "learning_rate": 2.6936670945656826e-05, + "loss": 0.4325, + "step": 14955 + }, + { + "epoch": 19.198973042362002, + "grad_norm": 2.280836343765259, + "learning_rate": 2.6936243046640994e-05, + "loss": 0.4473, + "step": 14956 + }, + { + "epoch": 19.2002567394095, + "grad_norm": 2.832242965698242, + "learning_rate": 2.6935815147625162e-05, + "loss": 0.4543, + "step": 14957 + }, + { + "epoch": 19.201540436456995, + "grad_norm": 2.319483757019043, + "learning_rate": 2.6935387248609328e-05, + "loss": 0.4567, + "step": 14958 + }, + { + "epoch": 19.202824133504492, + "grad_norm": 1.7990138530731201, + "learning_rate": 2.6934959349593496e-05, + "loss": 0.4319, + "step": 14959 + }, + { + "epoch": 19.20410783055199, + "grad_norm": 1.7816380262374878, + "learning_rate": 2.6934531450577664e-05, + "loss": 0.4371, + "step": 14960 + }, + { + "epoch": 19.205391527599488, + "grad_norm": 1.0190850496292114, + "learning_rate": 2.693410355156183e-05, + "loss": 0.44, + "step": 14961 + }, + { + "epoch": 19.206675224646983, + "grad_norm": 1.2400964498519897, + "learning_rate": 2.6933675652546e-05, + "loss": 0.4666, + "step": 14962 + }, + { + "epoch": 19.20795892169448, + "grad_norm": 2.5354249477386475, + "learning_rate": 2.6933247753530166e-05, + "loss": 0.4202, + "step": 14963 + }, + { + "epoch": 19.20924261874198, + "grad_norm": 1.241003394126892, + "learning_rate": 2.6932819854514338e-05, + "loss": 0.4344, + "step": 14964 + }, + { + "epoch": 19.210526315789473, + "grad_norm": 
2.1754705905914307, + "learning_rate": 2.6932391955498503e-05, + "loss": 0.4374, + "step": 14965 + }, + { + "epoch": 19.21181001283697, + "grad_norm": 3.5589637756347656, + "learning_rate": 2.6931964056482668e-05, + "loss": 0.428, + "step": 14966 + }, + { + "epoch": 19.21309370988447, + "grad_norm": 1.2429484128952026, + "learning_rate": 2.693153615746684e-05, + "loss": 0.4528, + "step": 14967 + }, + { + "epoch": 19.214377406931963, + "grad_norm": 2.0353007316589355, + "learning_rate": 2.6931108258451005e-05, + "loss": 0.4283, + "step": 14968 + }, + { + "epoch": 19.21566110397946, + "grad_norm": 4.044621467590332, + "learning_rate": 2.6930680359435173e-05, + "loss": 0.4289, + "step": 14969 + }, + { + "epoch": 19.21694480102696, + "grad_norm": 1.4136979579925537, + "learning_rate": 2.693025246041934e-05, + "loss": 0.47, + "step": 14970 + }, + { + "epoch": 19.218228498074453, + "grad_norm": 1.2746553421020508, + "learning_rate": 2.692982456140351e-05, + "loss": 0.4198, + "step": 14971 + }, + { + "epoch": 19.21951219512195, + "grad_norm": 1.2948375940322876, + "learning_rate": 2.692939666238768e-05, + "loss": 0.4286, + "step": 14972 + }, + { + "epoch": 19.22079589216945, + "grad_norm": 2.042372465133667, + "learning_rate": 2.6928968763371844e-05, + "loss": 0.453, + "step": 14973 + }, + { + "epoch": 19.222079589216946, + "grad_norm": 1.0735845565795898, + "learning_rate": 2.6928540864356012e-05, + "loss": 0.4334, + "step": 14974 + }, + { + "epoch": 19.22336328626444, + "grad_norm": 3.9222629070281982, + "learning_rate": 2.692811296534018e-05, + "loss": 0.4172, + "step": 14975 + }, + { + "epoch": 19.22464698331194, + "grad_norm": 3.270136833190918, + "learning_rate": 2.692768506632435e-05, + "loss": 0.4388, + "step": 14976 + }, + { + "epoch": 19.225930680359436, + "grad_norm": 5.344621181488037, + "learning_rate": 2.6927257167308514e-05, + "loss": 0.4208, + "step": 14977 + }, + { + "epoch": 19.22721437740693, + "grad_norm": 2.908125638961792, + "learning_rate": 
2.6926829268292686e-05, + "loss": 0.4551, + "step": 14978 + }, + { + "epoch": 19.22849807445443, + "grad_norm": 1.952010989189148, + "learning_rate": 2.692640136927685e-05, + "loss": 0.4044, + "step": 14979 + }, + { + "epoch": 19.229781771501926, + "grad_norm": 1.3180009126663208, + "learning_rate": 2.692597347026102e-05, + "loss": 0.4951, + "step": 14980 + }, + { + "epoch": 19.23106546854942, + "grad_norm": 1.9429893493652344, + "learning_rate": 2.6925545571245187e-05, + "loss": 0.484, + "step": 14981 + }, + { + "epoch": 19.23234916559692, + "grad_norm": 5.633926868438721, + "learning_rate": 2.6925117672229352e-05, + "loss": 0.4525, + "step": 14982 + }, + { + "epoch": 19.233632862644416, + "grad_norm": 2.829206705093384, + "learning_rate": 2.6924689773213524e-05, + "loss": 0.4886, + "step": 14983 + }, + { + "epoch": 19.234916559691914, + "grad_norm": 3.0121185779571533, + "learning_rate": 2.692426187419769e-05, + "loss": 0.4726, + "step": 14984 + }, + { + "epoch": 19.23620025673941, + "grad_norm": 1.3331716060638428, + "learning_rate": 2.6923833975181858e-05, + "loss": 0.4419, + "step": 14985 + }, + { + "epoch": 19.237483953786906, + "grad_norm": 2.3848841190338135, + "learning_rate": 2.6923406076166026e-05, + "loss": 0.4462, + "step": 14986 + }, + { + "epoch": 19.238767650834404, + "grad_norm": 4.538712024688721, + "learning_rate": 2.692297817715019e-05, + "loss": 0.5029, + "step": 14987 + }, + { + "epoch": 19.2400513478819, + "grad_norm": 2.094343900680542, + "learning_rate": 2.6922550278134363e-05, + "loss": 0.4271, + "step": 14988 + }, + { + "epoch": 19.241335044929397, + "grad_norm": 1.7776384353637695, + "learning_rate": 2.6922122379118528e-05, + "loss": 0.4692, + "step": 14989 + }, + { + "epoch": 19.242618741976894, + "grad_norm": 2.063903331756592, + "learning_rate": 2.6921694480102696e-05, + "loss": 0.4902, + "step": 14990 + }, + { + "epoch": 19.24390243902439, + "grad_norm": 1.306745171546936, + "learning_rate": 2.6921266581086865e-05, + "loss": 0.4552, 
+ "step": 14991 + }, + { + "epoch": 19.245186136071887, + "grad_norm": 2.5539662837982178, + "learning_rate": 2.6920838682071033e-05, + "loss": 0.4969, + "step": 14992 + }, + { + "epoch": 19.246469833119384, + "grad_norm": 2.892042398452759, + "learning_rate": 2.6920410783055198e-05, + "loss": 0.5151, + "step": 14993 + }, + { + "epoch": 19.247753530166882, + "grad_norm": 3.86124849319458, + "learning_rate": 2.6919982884039367e-05, + "loss": 0.4823, + "step": 14994 + }, + { + "epoch": 19.249037227214377, + "grad_norm": 1.5307189226150513, + "learning_rate": 2.6919554985023535e-05, + "loss": 0.4981, + "step": 14995 + }, + { + "epoch": 19.250320924261874, + "grad_norm": 1.6955444812774658, + "learning_rate": 2.6919127086007703e-05, + "loss": 0.5163, + "step": 14996 + }, + { + "epoch": 19.251604621309372, + "grad_norm": 2.695693254470825, + "learning_rate": 2.6918699186991872e-05, + "loss": 0.5378, + "step": 14997 + }, + { + "epoch": 19.252888318356867, + "grad_norm": 1.4026424884796143, + "learning_rate": 2.6918271287976037e-05, + "loss": 0.5333, + "step": 14998 + }, + { + "epoch": 19.254172015404365, + "grad_norm": 2.348435163497925, + "learning_rate": 2.691784338896021e-05, + "loss": 0.4838, + "step": 14999 + }, + { + "epoch": 19.255455712451862, + "grad_norm": 1.9305814504623413, + "learning_rate": 2.6917415489944374e-05, + "loss": 0.5709, + "step": 15000 + }, + { + "epoch": 19.255455712451862, + "eval_cer": 0.28314078715488544, + "eval_loss": 0.5087926983833313, + "eval_runtime": 14.4174, + "eval_samples_per_second": 68.182, + "eval_steps_per_second": 0.486, + "eval_wer": 0.5009172208209126, + "step": 15000 + }, + { + "epoch": 19.256739409499357, + "grad_norm": 5.788949966430664, + "learning_rate": 2.691698759092854e-05, + "loss": 0.608, + "step": 15001 + }, + { + "epoch": 19.258023106546855, + "grad_norm": 2.340296745300293, + "learning_rate": 2.691655969191271e-05, + "loss": 0.4286, + "step": 15002 + }, + { + "epoch": 19.259306803594352, + "grad_norm": 
1.1176354885101318, + "learning_rate": 2.6916131792896876e-05, + "loss": 0.4282, + "step": 15003 + }, + { + "epoch": 19.260590500641847, + "grad_norm": 2.9983596801757812, + "learning_rate": 2.6915703893881047e-05, + "loss": 0.439, + "step": 15004 + }, + { + "epoch": 19.261874197689345, + "grad_norm": 2.8020026683807373, + "learning_rate": 2.6915275994865212e-05, + "loss": 0.4975, + "step": 15005 + }, + { + "epoch": 19.263157894736842, + "grad_norm": 3.166025161743164, + "learning_rate": 2.691484809584938e-05, + "loss": 0.4181, + "step": 15006 + }, + { + "epoch": 19.26444159178434, + "grad_norm": 1.4952585697174072, + "learning_rate": 2.691442019683355e-05, + "loss": 0.4536, + "step": 15007 + }, + { + "epoch": 19.265725288831835, + "grad_norm": 3.3616347312927246, + "learning_rate": 2.6913992297817714e-05, + "loss": 0.3943, + "step": 15008 + }, + { + "epoch": 19.267008985879333, + "grad_norm": 1.5874992609024048, + "learning_rate": 2.6913564398801883e-05, + "loss": 0.4546, + "step": 15009 + }, + { + "epoch": 19.26829268292683, + "grad_norm": 4.090325355529785, + "learning_rate": 2.691313649978605e-05, + "loss": 0.446, + "step": 15010 + }, + { + "epoch": 19.269576379974325, + "grad_norm": 2.508754253387451, + "learning_rate": 2.691270860077022e-05, + "loss": 0.4249, + "step": 15011 + }, + { + "epoch": 19.270860077021823, + "grad_norm": 1.415967583656311, + "learning_rate": 2.6912280701754388e-05, + "loss": 0.4719, + "step": 15012 + }, + { + "epoch": 19.27214377406932, + "grad_norm": 1.7254940271377563, + "learning_rate": 2.6911852802738556e-05, + "loss": 0.4606, + "step": 15013 + }, + { + "epoch": 19.273427471116815, + "grad_norm": 1.2427914142608643, + "learning_rate": 2.691142490372272e-05, + "loss": 0.4591, + "step": 15014 + }, + { + "epoch": 19.274711168164313, + "grad_norm": 5.381077289581299, + "learning_rate": 2.691099700470689e-05, + "loss": 0.4445, + "step": 15015 + }, + { + "epoch": 19.27599486521181, + "grad_norm": 3.505308151245117, + "learning_rate": 
2.6910569105691058e-05, + "loss": 0.5007, + "step": 15016 + }, + { + "epoch": 19.27727856225931, + "grad_norm": 1.5056582689285278, + "learning_rate": 2.6910141206675223e-05, + "loss": 0.4651, + "step": 15017 + }, + { + "epoch": 19.278562259306803, + "grad_norm": 1.683336615562439, + "learning_rate": 2.6909713307659395e-05, + "loss": 0.4294, + "step": 15018 + }, + { + "epoch": 19.2798459563543, + "grad_norm": 2.9821057319641113, + "learning_rate": 2.690928540864356e-05, + "loss": 0.4471, + "step": 15019 + }, + { + "epoch": 19.2811296534018, + "grad_norm": 1.5981537103652954, + "learning_rate": 2.690885750962773e-05, + "loss": 0.4051, + "step": 15020 + }, + { + "epoch": 19.282413350449293, + "grad_norm": 13.247089385986328, + "learning_rate": 2.6908429610611897e-05, + "loss": 0.4143, + "step": 15021 + }, + { + "epoch": 19.28369704749679, + "grad_norm": 2.1307079792022705, + "learning_rate": 2.6908001711596062e-05, + "loss": 0.436, + "step": 15022 + }, + { + "epoch": 19.28498074454429, + "grad_norm": 1.4770793914794922, + "learning_rate": 2.6907573812580234e-05, + "loss": 0.4395, + "step": 15023 + }, + { + "epoch": 19.286264441591783, + "grad_norm": 1.521286129951477, + "learning_rate": 2.69071459135644e-05, + "loss": 0.449, + "step": 15024 + }, + { + "epoch": 19.28754813863928, + "grad_norm": 1.1708102226257324, + "learning_rate": 2.6906718014548567e-05, + "loss": 0.4532, + "step": 15025 + }, + { + "epoch": 19.28883183568678, + "grad_norm": 2.5395805835723877, + "learning_rate": 2.6906290115532735e-05, + "loss": 0.4355, + "step": 15026 + }, + { + "epoch": 19.290115532734276, + "grad_norm": 2.220684766769409, + "learning_rate": 2.69058622165169e-05, + "loss": 0.4519, + "step": 15027 + }, + { + "epoch": 19.29139922978177, + "grad_norm": 1.1046066284179688, + "learning_rate": 2.6905434317501072e-05, + "loss": 0.4721, + "step": 15028 + }, + { + "epoch": 19.29268292682927, + "grad_norm": 1.423161268234253, + "learning_rate": 2.6905006418485237e-05, + "loss": 0.4234, + 
"step": 15029 + }, + { + "epoch": 19.293966623876766, + "grad_norm": 6.964531421661377, + "learning_rate": 2.6904578519469406e-05, + "loss": 0.5105, + "step": 15030 + }, + { + "epoch": 19.29525032092426, + "grad_norm": 1.7940688133239746, + "learning_rate": 2.6904150620453574e-05, + "loss": 0.4589, + "step": 15031 + }, + { + "epoch": 19.29653401797176, + "grad_norm": 6.658924579620361, + "learning_rate": 2.6903722721437743e-05, + "loss": 0.4279, + "step": 15032 + }, + { + "epoch": 19.297817715019256, + "grad_norm": 2.314878225326538, + "learning_rate": 2.6903294822421908e-05, + "loss": 0.4675, + "step": 15033 + }, + { + "epoch": 19.29910141206675, + "grad_norm": 1.1279985904693604, + "learning_rate": 2.6902866923406076e-05, + "loss": 0.4954, + "step": 15034 + }, + { + "epoch": 19.30038510911425, + "grad_norm": 2.6979472637176514, + "learning_rate": 2.6902439024390244e-05, + "loss": 0.49, + "step": 15035 + }, + { + "epoch": 19.301668806161747, + "grad_norm": 1.9646551609039307, + "learning_rate": 2.6902011125374413e-05, + "loss": 0.4915, + "step": 15036 + }, + { + "epoch": 19.30295250320924, + "grad_norm": 1.4861531257629395, + "learning_rate": 2.690158322635858e-05, + "loss": 0.4609, + "step": 15037 + }, + { + "epoch": 19.30423620025674, + "grad_norm": 0.9947761297225952, + "learning_rate": 2.6901155327342746e-05, + "loss": 0.4676, + "step": 15038 + }, + { + "epoch": 19.305519897304237, + "grad_norm": 1.3630201816558838, + "learning_rate": 2.6900727428326918e-05, + "loss": 0.4411, + "step": 15039 + }, + { + "epoch": 19.306803594351734, + "grad_norm": 6.450471878051758, + "learning_rate": 2.6900299529311083e-05, + "loss": 0.4652, + "step": 15040 + }, + { + "epoch": 19.30808729139923, + "grad_norm": 1.6906702518463135, + "learning_rate": 2.6899871630295248e-05, + "loss": 0.4917, + "step": 15041 + }, + { + "epoch": 19.309370988446727, + "grad_norm": 1.2606663703918457, + "learning_rate": 2.689944373127942e-05, + "loss": 0.4689, + "step": 15042 + }, + { + "epoch": 
19.310654685494224, + "grad_norm": 4.706424236297607, + "learning_rate": 2.6899015832263585e-05, + "loss": 0.5827, + "step": 15043 + }, + { + "epoch": 19.31193838254172, + "grad_norm": 19.97403335571289, + "learning_rate": 2.6898587933247757e-05, + "loss": 0.4502, + "step": 15044 + }, + { + "epoch": 19.313222079589217, + "grad_norm": 4.518772602081299, + "learning_rate": 2.6898160034231922e-05, + "loss": 0.4477, + "step": 15045 + }, + { + "epoch": 19.314505776636715, + "grad_norm": 1.597501516342163, + "learning_rate": 2.689773213521609e-05, + "loss": 0.481, + "step": 15046 + }, + { + "epoch": 19.31578947368421, + "grad_norm": 14.091059684753418, + "learning_rate": 2.689730423620026e-05, + "loss": 0.5454, + "step": 15047 + }, + { + "epoch": 19.317073170731707, + "grad_norm": 4.8363847732543945, + "learning_rate": 2.6896876337184424e-05, + "loss": 0.5514, + "step": 15048 + }, + { + "epoch": 19.318356867779205, + "grad_norm": 1.3141635656356812, + "learning_rate": 2.6896448438168592e-05, + "loss": 0.4919, + "step": 15049 + }, + { + "epoch": 19.319640564826702, + "grad_norm": 1.5626875162124634, + "learning_rate": 2.689602053915276e-05, + "loss": 0.5719, + "step": 15050 + }, + { + "epoch": 19.320924261874197, + "grad_norm": 2.5778326988220215, + "learning_rate": 2.689559264013693e-05, + "loss": 0.6352, + "step": 15051 + }, + { + "epoch": 19.322207958921695, + "grad_norm": 1.0470943450927734, + "learning_rate": 2.6895164741121097e-05, + "loss": 0.4414, + "step": 15052 + }, + { + "epoch": 19.323491655969192, + "grad_norm": 1.9581425189971924, + "learning_rate": 2.6894736842105266e-05, + "loss": 0.4301, + "step": 15053 + }, + { + "epoch": 19.324775353016687, + "grad_norm": 0.8987404108047485, + "learning_rate": 2.689430894308943e-05, + "loss": 0.4366, + "step": 15054 + }, + { + "epoch": 19.326059050064185, + "grad_norm": 1.4668420553207397, + "learning_rate": 2.68938810440736e-05, + "loss": 0.45, + "step": 15055 + }, + { + "epoch": 19.327342747111683, + "grad_norm": 
1.2662240266799927, + "learning_rate": 2.6893453145057767e-05, + "loss": 0.4182, + "step": 15056 + }, + { + "epoch": 19.328626444159177, + "grad_norm": 1.5500186681747437, + "learning_rate": 2.6893025246041933e-05, + "loss": 0.4331, + "step": 15057 + }, + { + "epoch": 19.329910141206675, + "grad_norm": 2.0920259952545166, + "learning_rate": 2.6892597347026104e-05, + "loss": 0.4407, + "step": 15058 + }, + { + "epoch": 19.331193838254173, + "grad_norm": 1.4608526229858398, + "learning_rate": 2.689216944801027e-05, + "loss": 0.4403, + "step": 15059 + }, + { + "epoch": 19.33247753530167, + "grad_norm": 1.9635002613067627, + "learning_rate": 2.689174154899444e-05, + "loss": 0.4597, + "step": 15060 + }, + { + "epoch": 19.333761232349165, + "grad_norm": 6.077988624572754, + "learning_rate": 2.6891313649978606e-05, + "loss": 0.4501, + "step": 15061 + }, + { + "epoch": 19.335044929396663, + "grad_norm": 1.0911537408828735, + "learning_rate": 2.689088575096277e-05, + "loss": 0.4189, + "step": 15062 + }, + { + "epoch": 19.33632862644416, + "grad_norm": 2.38450288772583, + "learning_rate": 2.6890457851946943e-05, + "loss": 0.4288, + "step": 15063 + }, + { + "epoch": 19.337612323491655, + "grad_norm": 1.2896790504455566, + "learning_rate": 2.6890029952931108e-05, + "loss": 0.4654, + "step": 15064 + }, + { + "epoch": 19.338896020539153, + "grad_norm": 1.8671934604644775, + "learning_rate": 2.6889602053915276e-05, + "loss": 0.4621, + "step": 15065 + }, + { + "epoch": 19.34017971758665, + "grad_norm": 3.464871406555176, + "learning_rate": 2.6889174154899445e-05, + "loss": 0.4236, + "step": 15066 + }, + { + "epoch": 19.341463414634145, + "grad_norm": 2.409280300140381, + "learning_rate": 2.6888746255883613e-05, + "loss": 0.4642, + "step": 15067 + }, + { + "epoch": 19.342747111681643, + "grad_norm": 5.960669994354248, + "learning_rate": 2.688831835686778e-05, + "loss": 0.4822, + "step": 15068 + }, + { + "epoch": 19.34403080872914, + "grad_norm": 1.6674234867095947, + 
"learning_rate": 2.6887890457851947e-05, + "loss": 0.4539, + "step": 15069 + }, + { + "epoch": 19.345314505776635, + "grad_norm": 1.2481341361999512, + "learning_rate": 2.6887462558836115e-05, + "loss": 0.4011, + "step": 15070 + }, + { + "epoch": 19.346598202824133, + "grad_norm": 1.3101866245269775, + "learning_rate": 2.6887034659820284e-05, + "loss": 0.4314, + "step": 15071 + }, + { + "epoch": 19.34788189987163, + "grad_norm": 3.791421890258789, + "learning_rate": 2.6886606760804452e-05, + "loss": 0.4487, + "step": 15072 + }, + { + "epoch": 19.34916559691913, + "grad_norm": 2.9003405570983887, + "learning_rate": 2.6886178861788617e-05, + "loss": 0.4413, + "step": 15073 + }, + { + "epoch": 19.350449293966623, + "grad_norm": 1.7379579544067383, + "learning_rate": 2.688575096277279e-05, + "loss": 0.4375, + "step": 15074 + }, + { + "epoch": 19.35173299101412, + "grad_norm": 1.4539023637771606, + "learning_rate": 2.6885323063756954e-05, + "loss": 0.4273, + "step": 15075 + }, + { + "epoch": 19.35301668806162, + "grad_norm": 6.3120622634887695, + "learning_rate": 2.6884895164741122e-05, + "loss": 0.4934, + "step": 15076 + }, + { + "epoch": 19.354300385109113, + "grad_norm": 1.8579838275909424, + "learning_rate": 2.688446726572529e-05, + "loss": 0.4878, + "step": 15077 + }, + { + "epoch": 19.35558408215661, + "grad_norm": 2.2475929260253906, + "learning_rate": 2.6884039366709456e-05, + "loss": 0.4074, + "step": 15078 + }, + { + "epoch": 19.35686777920411, + "grad_norm": 2.802212953567505, + "learning_rate": 2.6883611467693627e-05, + "loss": 0.4652, + "step": 15079 + }, + { + "epoch": 19.358151476251603, + "grad_norm": 1.5193480253219604, + "learning_rate": 2.6883183568677792e-05, + "loss": 0.4897, + "step": 15080 + }, + { + "epoch": 19.3594351732991, + "grad_norm": 1.3686078786849976, + "learning_rate": 2.688275566966196e-05, + "loss": 0.4608, + "step": 15081 + }, + { + "epoch": 19.3607188703466, + "grad_norm": 1.2645010948181152, + "learning_rate": 
2.688232777064613e-05, + "loss": 0.4578, + "step": 15082 + }, + { + "epoch": 19.362002567394097, + "grad_norm": 1.887528419494629, + "learning_rate": 2.6881899871630294e-05, + "loss": 0.4439, + "step": 15083 + }, + { + "epoch": 19.36328626444159, + "grad_norm": 2.871734142303467, + "learning_rate": 2.6881471972614466e-05, + "loss": 0.4642, + "step": 15084 + }, + { + "epoch": 19.36456996148909, + "grad_norm": 6.604577541351318, + "learning_rate": 2.688104407359863e-05, + "loss": 0.4348, + "step": 15085 + }, + { + "epoch": 19.365853658536587, + "grad_norm": 1.1507301330566406, + "learning_rate": 2.68806161745828e-05, + "loss": 0.4566, + "step": 15086 + }, + { + "epoch": 19.36713735558408, + "grad_norm": 1.9572103023529053, + "learning_rate": 2.6880188275566968e-05, + "loss": 0.4535, + "step": 15087 + }, + { + "epoch": 19.36842105263158, + "grad_norm": 1.966869592666626, + "learning_rate": 2.6879760376551133e-05, + "loss": 0.4333, + "step": 15088 + }, + { + "epoch": 19.369704749679077, + "grad_norm": 1.3608510494232178, + "learning_rate": 2.68793324775353e-05, + "loss": 0.4641, + "step": 15089 + }, + { + "epoch": 19.37098844672657, + "grad_norm": 1.2303378582000732, + "learning_rate": 2.687890457851947e-05, + "loss": 0.5094, + "step": 15090 + }, + { + "epoch": 19.37227214377407, + "grad_norm": 2.6917600631713867, + "learning_rate": 2.6878476679503638e-05, + "loss": 0.4666, + "step": 15091 + }, + { + "epoch": 19.373555840821567, + "grad_norm": 1.5528759956359863, + "learning_rate": 2.6878048780487803e-05, + "loss": 0.4539, + "step": 15092 + }, + { + "epoch": 19.374839537869065, + "grad_norm": 5.805881977081299, + "learning_rate": 2.6877620881471975e-05, + "loss": 0.4953, + "step": 15093 + }, + { + "epoch": 19.37612323491656, + "grad_norm": 1.2149336338043213, + "learning_rate": 2.687719298245614e-05, + "loss": 0.4427, + "step": 15094 + }, + { + "epoch": 19.377406931964057, + "grad_norm": 1.4508256912231445, + "learning_rate": 2.687676508344031e-05, + "loss": 0.5074, + 
"step": 15095 + }, + { + "epoch": 19.378690629011555, + "grad_norm": 1.5224794149398804, + "learning_rate": 2.6876337184424477e-05, + "loss": 0.5007, + "step": 15096 + }, + { + "epoch": 19.37997432605905, + "grad_norm": 2.1506388187408447, + "learning_rate": 2.6875909285408642e-05, + "loss": 0.5251, + "step": 15097 + }, + { + "epoch": 19.381258023106547, + "grad_norm": 1.3222206830978394, + "learning_rate": 2.6875481386392814e-05, + "loss": 0.475, + "step": 15098 + }, + { + "epoch": 19.382541720154045, + "grad_norm": 3.0557470321655273, + "learning_rate": 2.687505348737698e-05, + "loss": 0.5796, + "step": 15099 + }, + { + "epoch": 19.38382541720154, + "grad_norm": 2.840961217880249, + "learning_rate": 2.6874625588361147e-05, + "loss": 0.5589, + "step": 15100 + }, + { + "epoch": 19.385109114249037, + "grad_norm": 2.970890522003174, + "learning_rate": 2.6874197689345316e-05, + "loss": 0.6997, + "step": 15101 + }, + { + "epoch": 19.386392811296535, + "grad_norm": 1.9368897676467896, + "learning_rate": 2.687376979032948e-05, + "loss": 0.455, + "step": 15102 + }, + { + "epoch": 19.387676508344033, + "grad_norm": 1.8057191371917725, + "learning_rate": 2.6873341891313652e-05, + "loss": 0.4239, + "step": 15103 + }, + { + "epoch": 19.388960205391527, + "grad_norm": 0.9097927808761597, + "learning_rate": 2.6872913992297817e-05, + "loss": 0.4425, + "step": 15104 + }, + { + "epoch": 19.390243902439025, + "grad_norm": 1.2439789772033691, + "learning_rate": 2.6872486093281986e-05, + "loss": 0.471, + "step": 15105 + }, + { + "epoch": 19.391527599486523, + "grad_norm": 1.5486533641815186, + "learning_rate": 2.6872058194266154e-05, + "loss": 0.4439, + "step": 15106 + }, + { + "epoch": 19.392811296534017, + "grad_norm": 1.9440563917160034, + "learning_rate": 2.6871630295250323e-05, + "loss": 0.4395, + "step": 15107 + }, + { + "epoch": 19.394094993581515, + "grad_norm": 1.053676962852478, + "learning_rate": 2.6871202396234488e-05, + "loss": 0.42, + "step": 15108 + }, + { + "epoch": 
19.395378690629013, + "grad_norm": 0.9725732207298279, + "learning_rate": 2.6870774497218656e-05, + "loss": 0.4242, + "step": 15109 + }, + { + "epoch": 19.396662387676507, + "grad_norm": 1.3236366510391235, + "learning_rate": 2.6870346598202824e-05, + "loss": 0.432, + "step": 15110 + }, + { + "epoch": 19.397946084724005, + "grad_norm": 1.3768306970596313, + "learning_rate": 2.6869918699186993e-05, + "loss": 0.4736, + "step": 15111 + }, + { + "epoch": 19.399229781771503, + "grad_norm": 0.8319738507270813, + "learning_rate": 2.686949080017116e-05, + "loss": 0.3958, + "step": 15112 + }, + { + "epoch": 19.400513478818997, + "grad_norm": 2.3421709537506104, + "learning_rate": 2.6869062901155326e-05, + "loss": 0.4778, + "step": 15113 + }, + { + "epoch": 19.401797175866495, + "grad_norm": 1.0021145343780518, + "learning_rate": 2.6868635002139498e-05, + "loss": 0.4413, + "step": 15114 + }, + { + "epoch": 19.403080872913993, + "grad_norm": 1.9400067329406738, + "learning_rate": 2.6868207103123663e-05, + "loss": 0.4806, + "step": 15115 + }, + { + "epoch": 19.40436456996149, + "grad_norm": 1.9344372749328613, + "learning_rate": 2.6867779204107828e-05, + "loss": 0.4554, + "step": 15116 + }, + { + "epoch": 19.405648267008985, + "grad_norm": 1.3721638917922974, + "learning_rate": 2.6867351305092e-05, + "loss": 0.4338, + "step": 15117 + }, + { + "epoch": 19.406931964056483, + "grad_norm": 1.8122438192367554, + "learning_rate": 2.6866923406076165e-05, + "loss": 0.4393, + "step": 15118 + }, + { + "epoch": 19.40821566110398, + "grad_norm": 1.3978266716003418, + "learning_rate": 2.6866495507060337e-05, + "loss": 0.4115, + "step": 15119 + }, + { + "epoch": 19.409499358151475, + "grad_norm": 1.5978400707244873, + "learning_rate": 2.6866067608044502e-05, + "loss": 0.4627, + "step": 15120 + }, + { + "epoch": 19.410783055198973, + "grad_norm": 1.1013860702514648, + "learning_rate": 2.686563970902867e-05, + "loss": 0.4501, + "step": 15121 + }, + { + "epoch": 19.41206675224647, + 
"grad_norm": 1.1735748052597046, + "learning_rate": 2.686521181001284e-05, + "loss": 0.4422, + "step": 15122 + }, + { + "epoch": 19.413350449293965, + "grad_norm": 1.235690712928772, + "learning_rate": 2.6864783910997004e-05, + "loss": 0.3942, + "step": 15123 + }, + { + "epoch": 19.414634146341463, + "grad_norm": 1.5951613187789917, + "learning_rate": 2.6864356011981172e-05, + "loss": 0.4658, + "step": 15124 + }, + { + "epoch": 19.41591784338896, + "grad_norm": 1.0457983016967773, + "learning_rate": 2.686392811296534e-05, + "loss": 0.4151, + "step": 15125 + }, + { + "epoch": 19.41720154043646, + "grad_norm": 1.32906174659729, + "learning_rate": 2.686350021394951e-05, + "loss": 0.4518, + "step": 15126 + }, + { + "epoch": 19.418485237483953, + "grad_norm": 2.165592908859253, + "learning_rate": 2.6863072314933677e-05, + "loss": 0.4643, + "step": 15127 + }, + { + "epoch": 19.41976893453145, + "grad_norm": 1.3569018840789795, + "learning_rate": 2.6862644415917846e-05, + "loss": 0.4559, + "step": 15128 + }, + { + "epoch": 19.42105263157895, + "grad_norm": 3.534006118774414, + "learning_rate": 2.686221651690201e-05, + "loss": 0.4353, + "step": 15129 + }, + { + "epoch": 19.422336328626443, + "grad_norm": 1.1410911083221436, + "learning_rate": 2.686178861788618e-05, + "loss": 0.458, + "step": 15130 + }, + { + "epoch": 19.42362002567394, + "grad_norm": 1.8531121015548706, + "learning_rate": 2.6861360718870348e-05, + "loss": 0.4801, + "step": 15131 + }, + { + "epoch": 19.42490372272144, + "grad_norm": 1.6405061483383179, + "learning_rate": 2.6860932819854513e-05, + "loss": 0.457, + "step": 15132 + }, + { + "epoch": 19.426187419768933, + "grad_norm": 1.9574909210205078, + "learning_rate": 2.6860504920838684e-05, + "loss": 0.4928, + "step": 15133 + }, + { + "epoch": 19.42747111681643, + "grad_norm": 1.5019398927688599, + "learning_rate": 2.686007702182285e-05, + "loss": 0.4582, + "step": 15134 + }, + { + "epoch": 19.42875481386393, + "grad_norm": 1.3769993782043457, + 
"learning_rate": 2.685964912280702e-05, + "loss": 0.4697, + "step": 15135 + }, + { + "epoch": 19.430038510911427, + "grad_norm": 1.8157777786254883, + "learning_rate": 2.6859221223791186e-05, + "loss": 0.4934, + "step": 15136 + }, + { + "epoch": 19.43132220795892, + "grad_norm": 1.2612024545669556, + "learning_rate": 2.685879332477535e-05, + "loss": 0.4358, + "step": 15137 + }, + { + "epoch": 19.43260590500642, + "grad_norm": 2.197603464126587, + "learning_rate": 2.6858365425759523e-05, + "loss": 0.4664, + "step": 15138 + }, + { + "epoch": 19.433889602053917, + "grad_norm": 1.166526198387146, + "learning_rate": 2.6857937526743688e-05, + "loss": 0.5342, + "step": 15139 + }, + { + "epoch": 19.43517329910141, + "grad_norm": 1.7759279012680054, + "learning_rate": 2.6857509627727856e-05, + "loss": 0.4681, + "step": 15140 + }, + { + "epoch": 19.43645699614891, + "grad_norm": 1.9687124490737915, + "learning_rate": 2.6857081728712025e-05, + "loss": 0.4982, + "step": 15141 + }, + { + "epoch": 19.437740693196407, + "grad_norm": 1.9597338438034058, + "learning_rate": 2.6856653829696193e-05, + "loss": 0.4203, + "step": 15142 + }, + { + "epoch": 19.4390243902439, + "grad_norm": 3.4748077392578125, + "learning_rate": 2.6856225930680362e-05, + "loss": 0.4697, + "step": 15143 + }, + { + "epoch": 19.4403080872914, + "grad_norm": 3.5253801345825195, + "learning_rate": 2.6855798031664527e-05, + "loss": 0.4668, + "step": 15144 + }, + { + "epoch": 19.441591784338897, + "grad_norm": 2.723426580429077, + "learning_rate": 2.6855370132648695e-05, + "loss": 0.4758, + "step": 15145 + }, + { + "epoch": 19.44287548138639, + "grad_norm": 1.9549015760421753, + "learning_rate": 2.6854942233632864e-05, + "loss": 0.5222, + "step": 15146 + }, + { + "epoch": 19.44415917843389, + "grad_norm": 1.324675440788269, + "learning_rate": 2.6854514334617032e-05, + "loss": 0.5286, + "step": 15147 + }, + { + "epoch": 19.445442875481387, + "grad_norm": 1.4978222846984863, + "learning_rate": 
2.6854086435601197e-05, + "loss": 0.474, + "step": 15148 + }, + { + "epoch": 19.446726572528885, + "grad_norm": 1.483777642250061, + "learning_rate": 2.6853658536585365e-05, + "loss": 0.6182, + "step": 15149 + }, + { + "epoch": 19.44801026957638, + "grad_norm": 2.165870428085327, + "learning_rate": 2.6853230637569534e-05, + "loss": 0.6221, + "step": 15150 + }, + { + "epoch": 19.449293966623877, + "grad_norm": 2.3646605014801025, + "learning_rate": 2.6852802738553702e-05, + "loss": 0.5982, + "step": 15151 + }, + { + "epoch": 19.450577663671375, + "grad_norm": 2.6759467124938965, + "learning_rate": 2.685237483953787e-05, + "loss": 0.4282, + "step": 15152 + }, + { + "epoch": 19.45186136071887, + "grad_norm": 3.1927716732025146, + "learning_rate": 2.6851946940522036e-05, + "loss": 0.4266, + "step": 15153 + }, + { + "epoch": 19.453145057766367, + "grad_norm": 1.4197369813919067, + "learning_rate": 2.6851519041506207e-05, + "loss": 0.4382, + "step": 15154 + }, + { + "epoch": 19.454428754813865, + "grad_norm": 1.4062819480895996, + "learning_rate": 2.6851091142490373e-05, + "loss": 0.4442, + "step": 15155 + }, + { + "epoch": 19.45571245186136, + "grad_norm": 1.3301548957824707, + "learning_rate": 2.6850663243474538e-05, + "loss": 0.4457, + "step": 15156 + }, + { + "epoch": 19.456996148908857, + "grad_norm": 1.2428057193756104, + "learning_rate": 2.685023534445871e-05, + "loss": 0.4277, + "step": 15157 + }, + { + "epoch": 19.458279845956355, + "grad_norm": 1.2085813283920288, + "learning_rate": 2.6849807445442874e-05, + "loss": 0.4627, + "step": 15158 + }, + { + "epoch": 19.459563543003853, + "grad_norm": 2.34606671333313, + "learning_rate": 2.6849379546427046e-05, + "loss": 0.4252, + "step": 15159 + }, + { + "epoch": 19.460847240051347, + "grad_norm": 1.2398368120193481, + "learning_rate": 2.684895164741121e-05, + "loss": 0.4216, + "step": 15160 + }, + { + "epoch": 19.462130937098845, + "grad_norm": 1.0232735872268677, + "learning_rate": 2.684852374839538e-05, + "loss": 
0.4309, + "step": 15161 + }, + { + "epoch": 19.463414634146343, + "grad_norm": 2.6890172958374023, + "learning_rate": 2.6848095849379548e-05, + "loss": 0.4507, + "step": 15162 + }, + { + "epoch": 19.464698331193837, + "grad_norm": 1.669122338294983, + "learning_rate": 2.6847667950363713e-05, + "loss": 0.4477, + "step": 15163 + }, + { + "epoch": 19.465982028241335, + "grad_norm": 5.144916534423828, + "learning_rate": 2.684724005134788e-05, + "loss": 0.4074, + "step": 15164 + }, + { + "epoch": 19.467265725288833, + "grad_norm": 1.3489960432052612, + "learning_rate": 2.684681215233205e-05, + "loss": 0.4443, + "step": 15165 + }, + { + "epoch": 19.468549422336327, + "grad_norm": 1.68476140499115, + "learning_rate": 2.6846384253316218e-05, + "loss": 0.4233, + "step": 15166 + }, + { + "epoch": 19.469833119383825, + "grad_norm": 1.2331475019454956, + "learning_rate": 2.6845956354300387e-05, + "loss": 0.4715, + "step": 15167 + }, + { + "epoch": 19.471116816431323, + "grad_norm": 1.2320225238800049, + "learning_rate": 2.6845528455284555e-05, + "loss": 0.4289, + "step": 15168 + }, + { + "epoch": 19.47240051347882, + "grad_norm": 3.5568387508392334, + "learning_rate": 2.684510055626872e-05, + "loss": 0.4304, + "step": 15169 + }, + { + "epoch": 19.473684210526315, + "grad_norm": 0.9770057797431946, + "learning_rate": 2.684467265725289e-05, + "loss": 0.4468, + "step": 15170 + }, + { + "epoch": 19.474967907573813, + "grad_norm": 1.9432190656661987, + "learning_rate": 2.6844244758237057e-05, + "loss": 0.4573, + "step": 15171 + }, + { + "epoch": 19.47625160462131, + "grad_norm": 1.4636884927749634, + "learning_rate": 2.6843816859221222e-05, + "loss": 0.4539, + "step": 15172 + }, + { + "epoch": 19.477535301668805, + "grad_norm": 1.386520504951477, + "learning_rate": 2.6843388960205394e-05, + "loss": 0.4598, + "step": 15173 + }, + { + "epoch": 19.478818998716303, + "grad_norm": 3.0845229625701904, + "learning_rate": 2.684296106118956e-05, + "loss": 0.4479, + "step": 15174 + }, + { + 
"epoch": 19.4801026957638, + "grad_norm": 1.327967882156372, + "learning_rate": 2.684253316217373e-05, + "loss": 0.4371, + "step": 15175 + }, + { + "epoch": 19.481386392811295, + "grad_norm": 1.3569387197494507, + "learning_rate": 2.6842105263157896e-05, + "loss": 0.4475, + "step": 15176 + }, + { + "epoch": 19.482670089858793, + "grad_norm": 1.4819196462631226, + "learning_rate": 2.684167736414206e-05, + "loss": 0.4512, + "step": 15177 + }, + { + "epoch": 19.48395378690629, + "grad_norm": 1.6081486940383911, + "learning_rate": 2.6841249465126232e-05, + "loss": 0.4355, + "step": 15178 + }, + { + "epoch": 19.485237483953785, + "grad_norm": 2.3610293865203857, + "learning_rate": 2.6840821566110397e-05, + "loss": 0.3916, + "step": 15179 + }, + { + "epoch": 19.486521181001283, + "grad_norm": 1.4014441967010498, + "learning_rate": 2.6840393667094566e-05, + "loss": 0.428, + "step": 15180 + }, + { + "epoch": 19.48780487804878, + "grad_norm": 1.4893839359283447, + "learning_rate": 2.6839965768078734e-05, + "loss": 0.4927, + "step": 15181 + }, + { + "epoch": 19.48908857509628, + "grad_norm": 1.3170148134231567, + "learning_rate": 2.6839537869062903e-05, + "loss": 0.485, + "step": 15182 + }, + { + "epoch": 19.490372272143773, + "grad_norm": 1.359824299812317, + "learning_rate": 2.683910997004707e-05, + "loss": 0.4292, + "step": 15183 + }, + { + "epoch": 19.49165596919127, + "grad_norm": 0.9492613673210144, + "learning_rate": 2.6838682071031236e-05, + "loss": 0.4439, + "step": 15184 + }, + { + "epoch": 19.49293966623877, + "grad_norm": 1.8850687742233276, + "learning_rate": 2.6838254172015405e-05, + "loss": 0.4486, + "step": 15185 + }, + { + "epoch": 19.494223363286263, + "grad_norm": 1.4641566276550293, + "learning_rate": 2.6837826272999573e-05, + "loss": 0.4954, + "step": 15186 + }, + { + "epoch": 19.49550706033376, + "grad_norm": 2.170475721359253, + "learning_rate": 2.683739837398374e-05, + "loss": 0.4301, + "step": 15187 + }, + { + "epoch": 19.49679075738126, + 
"grad_norm": 1.754933476448059, + "learning_rate": 2.6836970474967906e-05, + "loss": 0.4619, + "step": 15188 + }, + { + "epoch": 19.498074454428753, + "grad_norm": 1.8733468055725098, + "learning_rate": 2.6836542575952078e-05, + "loss": 0.4513, + "step": 15189 + }, + { + "epoch": 19.49935815147625, + "grad_norm": 3.3854358196258545, + "learning_rate": 2.6836114676936243e-05, + "loss": 0.4278, + "step": 15190 + }, + { + "epoch": 19.50064184852375, + "grad_norm": 1.3980270624160767, + "learning_rate": 2.683568677792041e-05, + "loss": 0.4269, + "step": 15191 + }, + { + "epoch": 19.501925545571247, + "grad_norm": 1.6513699293136597, + "learning_rate": 2.683525887890458e-05, + "loss": 0.5152, + "step": 15192 + }, + { + "epoch": 19.50320924261874, + "grad_norm": 1.8645678758621216, + "learning_rate": 2.6834830979888745e-05, + "loss": 0.4673, + "step": 15193 + }, + { + "epoch": 19.50449293966624, + "grad_norm": 1.7452044486999512, + "learning_rate": 2.6834403080872917e-05, + "loss": 0.4732, + "step": 15194 + }, + { + "epoch": 19.505776636713737, + "grad_norm": 1.7427501678466797, + "learning_rate": 2.6833975181857082e-05, + "loss": 0.4565, + "step": 15195 + }, + { + "epoch": 19.50706033376123, + "grad_norm": 3.9897637367248535, + "learning_rate": 2.683354728284125e-05, + "loss": 0.454, + "step": 15196 + }, + { + "epoch": 19.50834403080873, + "grad_norm": 1.381662130355835, + "learning_rate": 2.683311938382542e-05, + "loss": 0.5034, + "step": 15197 + }, + { + "epoch": 19.509627727856227, + "grad_norm": 2.4500153064727783, + "learning_rate": 2.6832691484809584e-05, + "loss": 0.5352, + "step": 15198 + }, + { + "epoch": 19.51091142490372, + "grad_norm": 1.1431809663772583, + "learning_rate": 2.6832263585793756e-05, + "loss": 0.5342, + "step": 15199 + }, + { + "epoch": 19.51219512195122, + "grad_norm": 1.5327749252319336, + "learning_rate": 2.683183568677792e-05, + "loss": 0.5812, + "step": 15200 + }, + { + "epoch": 19.513478818998717, + "grad_norm": 1.4376044273376465, + 
"learning_rate": 2.683140778776209e-05, + "loss": 0.6669, + "step": 15201 + }, + { + "epoch": 19.514762516046215, + "grad_norm": 3.962354898452759, + "learning_rate": 2.6830979888746257e-05, + "loss": 0.3928, + "step": 15202 + }, + { + "epoch": 19.51604621309371, + "grad_norm": 2.747518539428711, + "learning_rate": 2.6830551989730426e-05, + "loss": 0.403, + "step": 15203 + }, + { + "epoch": 19.517329910141207, + "grad_norm": 1.6029436588287354, + "learning_rate": 2.683012409071459e-05, + "loss": 0.4413, + "step": 15204 + }, + { + "epoch": 19.518613607188705, + "grad_norm": 1.253242015838623, + "learning_rate": 2.682969619169876e-05, + "loss": 0.4765, + "step": 15205 + }, + { + "epoch": 19.5198973042362, + "grad_norm": 1.2318150997161865, + "learning_rate": 2.6829268292682928e-05, + "loss": 0.4405, + "step": 15206 + }, + { + "epoch": 19.521181001283697, + "grad_norm": 1.5437172651290894, + "learning_rate": 2.6828840393667096e-05, + "loss": 0.452, + "step": 15207 + }, + { + "epoch": 19.522464698331195, + "grad_norm": 1.3637803792953491, + "learning_rate": 2.6828412494651264e-05, + "loss": 0.4478, + "step": 15208 + }, + { + "epoch": 19.52374839537869, + "grad_norm": 2.9912545680999756, + "learning_rate": 2.682798459563543e-05, + "loss": 0.4433, + "step": 15209 + }, + { + "epoch": 19.525032092426187, + "grad_norm": 1.9453219175338745, + "learning_rate": 2.6827556696619598e-05, + "loss": 0.4276, + "step": 15210 + }, + { + "epoch": 19.526315789473685, + "grad_norm": 1.4578887224197388, + "learning_rate": 2.6827128797603766e-05, + "loss": 0.4871, + "step": 15211 + }, + { + "epoch": 19.527599486521183, + "grad_norm": 1.0350064039230347, + "learning_rate": 2.682670089858793e-05, + "loss": 0.4188, + "step": 15212 + }, + { + "epoch": 19.528883183568677, + "grad_norm": 1.4003081321716309, + "learning_rate": 2.6826272999572103e-05, + "loss": 0.4472, + "step": 15213 + }, + { + "epoch": 19.530166880616175, + "grad_norm": 1.0820205211639404, + "learning_rate": 
2.6825845100556268e-05, + "loss": 0.4138, + "step": 15214 + }, + { + "epoch": 19.531450577663673, + "grad_norm": 2.173389434814453, + "learning_rate": 2.682541720154044e-05, + "loss": 0.3894, + "step": 15215 + }, + { + "epoch": 19.532734274711167, + "grad_norm": 2.4633753299713135, + "learning_rate": 2.6824989302524605e-05, + "loss": 0.4638, + "step": 15216 + }, + { + "epoch": 19.534017971758665, + "grad_norm": 0.9602518081665039, + "learning_rate": 2.682456140350877e-05, + "loss": 0.4794, + "step": 15217 + }, + { + "epoch": 19.535301668806163, + "grad_norm": 2.1243934631347656, + "learning_rate": 2.6824133504492942e-05, + "loss": 0.4309, + "step": 15218 + }, + { + "epoch": 19.536585365853657, + "grad_norm": 1.3392943143844604, + "learning_rate": 2.6823705605477107e-05, + "loss": 0.4261, + "step": 15219 + }, + { + "epoch": 19.537869062901155, + "grad_norm": 1.7954736948013306, + "learning_rate": 2.6823277706461275e-05, + "loss": 0.452, + "step": 15220 + }, + { + "epoch": 19.539152759948653, + "grad_norm": 1.0703626871109009, + "learning_rate": 2.6822849807445444e-05, + "loss": 0.4084, + "step": 15221 + }, + { + "epoch": 19.540436456996147, + "grad_norm": 2.795560121536255, + "learning_rate": 2.6822421908429612e-05, + "loss": 0.4396, + "step": 15222 + }, + { + "epoch": 19.541720154043645, + "grad_norm": 3.1941349506378174, + "learning_rate": 2.682199400941378e-05, + "loss": 0.4549, + "step": 15223 + }, + { + "epoch": 19.543003851091143, + "grad_norm": 3.629225730895996, + "learning_rate": 2.6821566110397945e-05, + "loss": 0.4295, + "step": 15224 + }, + { + "epoch": 19.54428754813864, + "grad_norm": 3.8297107219696045, + "learning_rate": 2.6821138211382114e-05, + "loss": 0.4304, + "step": 15225 + }, + { + "epoch": 19.545571245186135, + "grad_norm": 2.579782724380493, + "learning_rate": 2.6820710312366282e-05, + "loss": 0.4942, + "step": 15226 + }, + { + "epoch": 19.546854942233633, + "grad_norm": 6.227581024169922, + "learning_rate": 2.682028241335045e-05, + "loss": 
0.4712, + "step": 15227 + }, + { + "epoch": 19.54813863928113, + "grad_norm": 3.454664707183838, + "learning_rate": 2.6819854514334616e-05, + "loss": 0.4111, + "step": 15228 + }, + { + "epoch": 19.549422336328625, + "grad_norm": 1.2462654113769531, + "learning_rate": 2.6819426615318788e-05, + "loss": 0.4815, + "step": 15229 + }, + { + "epoch": 19.550706033376123, + "grad_norm": 2.4757239818573, + "learning_rate": 2.6818998716302953e-05, + "loss": 0.4519, + "step": 15230 + }, + { + "epoch": 19.55198973042362, + "grad_norm": 1.1942853927612305, + "learning_rate": 2.681857081728712e-05, + "loss": 0.5344, + "step": 15231 + }, + { + "epoch": 19.553273427471115, + "grad_norm": 2.7276341915130615, + "learning_rate": 2.681814291827129e-05, + "loss": 0.4539, + "step": 15232 + }, + { + "epoch": 19.554557124518613, + "grad_norm": 1.8385460376739502, + "learning_rate": 2.6817715019255454e-05, + "loss": 0.4203, + "step": 15233 + }, + { + "epoch": 19.55584082156611, + "grad_norm": 2.150521993637085, + "learning_rate": 2.6817287120239626e-05, + "loss": 0.4278, + "step": 15234 + }, + { + "epoch": 19.55712451861361, + "grad_norm": 0.8166936635971069, + "learning_rate": 2.681685922122379e-05, + "loss": 0.4724, + "step": 15235 + }, + { + "epoch": 19.558408215661103, + "grad_norm": 0.8740129470825195, + "learning_rate": 2.681643132220796e-05, + "loss": 0.5159, + "step": 15236 + }, + { + "epoch": 19.5596919127086, + "grad_norm": 1.8889552354812622, + "learning_rate": 2.6816003423192128e-05, + "loss": 0.4315, + "step": 15237 + }, + { + "epoch": 19.5609756097561, + "grad_norm": 1.335296869277954, + "learning_rate": 2.6815575524176293e-05, + "loss": 0.5378, + "step": 15238 + }, + { + "epoch": 19.562259306803593, + "grad_norm": 4.798678874969482, + "learning_rate": 2.6815147625160465e-05, + "loss": 0.469, + "step": 15239 + }, + { + "epoch": 19.56354300385109, + "grad_norm": 2.026495933532715, + "learning_rate": 2.681471972614463e-05, + "loss": 0.4836, + "step": 15240 + }, + { + "epoch": 
19.56482670089859, + "grad_norm": 1.0749399662017822, + "learning_rate": 2.68142918271288e-05, + "loss": 0.4452, + "step": 15241 + }, + { + "epoch": 19.566110397946083, + "grad_norm": 2.610982656478882, + "learning_rate": 2.6813863928112967e-05, + "loss": 0.4882, + "step": 15242 + }, + { + "epoch": 19.56739409499358, + "grad_norm": 2.247114658355713, + "learning_rate": 2.6813436029097135e-05, + "loss": 0.5427, + "step": 15243 + }, + { + "epoch": 19.56867779204108, + "grad_norm": 2.3720550537109375, + "learning_rate": 2.68130081300813e-05, + "loss": 0.5511, + "step": 15244 + }, + { + "epoch": 19.569961489088577, + "grad_norm": 1.2952238321304321, + "learning_rate": 2.681258023106547e-05, + "loss": 0.488, + "step": 15245 + }, + { + "epoch": 19.57124518613607, + "grad_norm": 2.573951482772827, + "learning_rate": 2.6812152332049637e-05, + "loss": 0.5131, + "step": 15246 + }, + { + "epoch": 19.57252888318357, + "grad_norm": 1.9602136611938477, + "learning_rate": 2.6811724433033805e-05, + "loss": 0.4563, + "step": 15247 + }, + { + "epoch": 19.573812580231067, + "grad_norm": 1.831581473350525, + "learning_rate": 2.6811296534017974e-05, + "loss": 0.5565, + "step": 15248 + }, + { + "epoch": 19.57509627727856, + "grad_norm": 1.8129061460494995, + "learning_rate": 2.681086863500214e-05, + "loss": 0.5532, + "step": 15249 + }, + { + "epoch": 19.57637997432606, + "grad_norm": 2.6830861568450928, + "learning_rate": 2.681044073598631e-05, + "loss": 0.5881, + "step": 15250 + }, + { + "epoch": 19.577663671373557, + "grad_norm": 2.6920623779296875, + "learning_rate": 2.6810012836970476e-05, + "loss": 0.6573, + "step": 15251 + }, + { + "epoch": 19.57894736842105, + "grad_norm": 2.0403058528900146, + "learning_rate": 2.680958493795464e-05, + "loss": 0.4222, + "step": 15252 + }, + { + "epoch": 19.58023106546855, + "grad_norm": 2.0288288593292236, + "learning_rate": 2.6809157038938812e-05, + "loss": 0.4143, + "step": 15253 + }, + { + "epoch": 19.581514762516047, + "grad_norm": 
2.450453281402588, + "learning_rate": 2.6808729139922978e-05, + "loss": 0.4414, + "step": 15254 + }, + { + "epoch": 19.58279845956354, + "grad_norm": 2.1916255950927734, + "learning_rate": 2.680830124090715e-05, + "loss": 0.4412, + "step": 15255 + }, + { + "epoch": 19.58408215661104, + "grad_norm": 1.0289740562438965, + "learning_rate": 2.6807873341891314e-05, + "loss": 0.4482, + "step": 15256 + }, + { + "epoch": 19.585365853658537, + "grad_norm": 1.0529276132583618, + "learning_rate": 2.6807445442875483e-05, + "loss": 0.4226, + "step": 15257 + }, + { + "epoch": 19.586649550706035, + "grad_norm": 2.5052602291107178, + "learning_rate": 2.680701754385965e-05, + "loss": 0.4852, + "step": 15258 + }, + { + "epoch": 19.58793324775353, + "grad_norm": 1.6432777643203735, + "learning_rate": 2.6806589644843816e-05, + "loss": 0.4356, + "step": 15259 + }, + { + "epoch": 19.589216944801027, + "grad_norm": 0.8777653574943542, + "learning_rate": 2.6806161745827985e-05, + "loss": 0.4028, + "step": 15260 + }, + { + "epoch": 19.590500641848525, + "grad_norm": 1.9101362228393555, + "learning_rate": 2.6805733846812153e-05, + "loss": 0.4551, + "step": 15261 + }, + { + "epoch": 19.59178433889602, + "grad_norm": 1.441261887550354, + "learning_rate": 2.680530594779632e-05, + "loss": 0.4502, + "step": 15262 + }, + { + "epoch": 19.593068035943517, + "grad_norm": 1.07398521900177, + "learning_rate": 2.680487804878049e-05, + "loss": 0.4588, + "step": 15263 + }, + { + "epoch": 19.594351732991015, + "grad_norm": 1.902982473373413, + "learning_rate": 2.6804450149764658e-05, + "loss": 0.4343, + "step": 15264 + }, + { + "epoch": 19.59563543003851, + "grad_norm": 0.8298689126968384, + "learning_rate": 2.6804022250748823e-05, + "loss": 0.4243, + "step": 15265 + }, + { + "epoch": 19.596919127086007, + "grad_norm": 1.0990833044052124, + "learning_rate": 2.680359435173299e-05, + "loss": 0.443, + "step": 15266 + }, + { + "epoch": 19.598202824133505, + "grad_norm": 1.0243947505950928, + "learning_rate": 
2.680316645271716e-05, + "loss": 0.404, + "step": 15267 + }, + { + "epoch": 19.599486521181003, + "grad_norm": 1.1712795495986938, + "learning_rate": 2.6802738553701325e-05, + "loss": 0.4281, + "step": 15268 + }, + { + "epoch": 19.600770218228497, + "grad_norm": 1.1019065380096436, + "learning_rate": 2.6802310654685497e-05, + "loss": 0.4755, + "step": 15269 + }, + { + "epoch": 19.602053915275995, + "grad_norm": 0.8053672909736633, + "learning_rate": 2.6801882755669662e-05, + "loss": 0.4477, + "step": 15270 + }, + { + "epoch": 19.603337612323493, + "grad_norm": 1.3511735200881958, + "learning_rate": 2.680145485665383e-05, + "loss": 0.4464, + "step": 15271 + }, + { + "epoch": 19.604621309370987, + "grad_norm": 1.9929364919662476, + "learning_rate": 2.6801026957638e-05, + "loss": 0.4247, + "step": 15272 + }, + { + "epoch": 19.605905006418485, + "grad_norm": 1.525810956954956, + "learning_rate": 2.6800599058622164e-05, + "loss": 0.4547, + "step": 15273 + }, + { + "epoch": 19.607188703465983, + "grad_norm": 0.9243838787078857, + "learning_rate": 2.6800171159606336e-05, + "loss": 0.4439, + "step": 15274 + }, + { + "epoch": 19.608472400513477, + "grad_norm": 1.3229384422302246, + "learning_rate": 2.67997432605905e-05, + "loss": 0.4674, + "step": 15275 + }, + { + "epoch": 19.609756097560975, + "grad_norm": 1.5341756343841553, + "learning_rate": 2.679931536157467e-05, + "loss": 0.4629, + "step": 15276 + }, + { + "epoch": 19.611039794608473, + "grad_norm": 1.2446908950805664, + "learning_rate": 2.6798887462558837e-05, + "loss": 0.4761, + "step": 15277 + }, + { + "epoch": 19.61232349165597, + "grad_norm": 1.6481988430023193, + "learning_rate": 2.6798459563543002e-05, + "loss": 0.4524, + "step": 15278 + }, + { + "epoch": 19.613607188703465, + "grad_norm": 2.14021897315979, + "learning_rate": 2.6798031664527174e-05, + "loss": 0.4456, + "step": 15279 + }, + { + "epoch": 19.614890885750963, + "grad_norm": 2.0562243461608887, + "learning_rate": 2.679760376551134e-05, + "loss": 
0.4325, + "step": 15280 + }, + { + "epoch": 19.61617458279846, + "grad_norm": 1.2417237758636475, + "learning_rate": 2.6797175866495508e-05, + "loss": 0.3819, + "step": 15281 + }, + { + "epoch": 19.617458279845955, + "grad_norm": 1.0665310621261597, + "learning_rate": 2.6796747967479676e-05, + "loss": 0.428, + "step": 15282 + }, + { + "epoch": 19.618741976893453, + "grad_norm": 7.847725868225098, + "learning_rate": 2.6796320068463845e-05, + "loss": 0.4384, + "step": 15283 + }, + { + "epoch": 19.62002567394095, + "grad_norm": 1.324533462524414, + "learning_rate": 2.679589216944801e-05, + "loss": 0.4522, + "step": 15284 + }, + { + "epoch": 19.621309370988445, + "grad_norm": 1.6081265211105347, + "learning_rate": 2.6795464270432178e-05, + "loss": 0.4603, + "step": 15285 + }, + { + "epoch": 19.622593068035943, + "grad_norm": 2.46309757232666, + "learning_rate": 2.6795036371416346e-05, + "loss": 0.4572, + "step": 15286 + }, + { + "epoch": 19.62387676508344, + "grad_norm": 1.3178907632827759, + "learning_rate": 2.6794608472400515e-05, + "loss": 0.4551, + "step": 15287 + }, + { + "epoch": 19.625160462130935, + "grad_norm": 1.032797932624817, + "learning_rate": 2.6794180573384683e-05, + "loss": 0.4568, + "step": 15288 + }, + { + "epoch": 19.626444159178433, + "grad_norm": 1.3720872402191162, + "learning_rate": 2.6793752674368848e-05, + "loss": 0.4891, + "step": 15289 + }, + { + "epoch": 19.62772785622593, + "grad_norm": 2.9547555446624756, + "learning_rate": 2.679332477535302e-05, + "loss": 0.5039, + "step": 15290 + }, + { + "epoch": 19.62901155327343, + "grad_norm": 3.334502696990967, + "learning_rate": 2.6792896876337185e-05, + "loss": 0.4504, + "step": 15291 + }, + { + "epoch": 19.630295250320923, + "grad_norm": 1.526614785194397, + "learning_rate": 2.679246897732135e-05, + "loss": 0.4444, + "step": 15292 + }, + { + "epoch": 19.63157894736842, + "grad_norm": 2.6220293045043945, + "learning_rate": 2.6792041078305522e-05, + "loss": 0.5147, + "step": 15293 + }, + { + 
"epoch": 19.63286264441592, + "grad_norm": 1.3122835159301758, + "learning_rate": 2.6791613179289687e-05, + "loss": 0.4752, + "step": 15294 + }, + { + "epoch": 19.634146341463413, + "grad_norm": 2.0724446773529053, + "learning_rate": 2.6791185280273855e-05, + "loss": 0.436, + "step": 15295 + }, + { + "epoch": 19.63543003851091, + "grad_norm": 1.5782279968261719, + "learning_rate": 2.6790757381258024e-05, + "loss": 0.4706, + "step": 15296 + }, + { + "epoch": 19.63671373555841, + "grad_norm": 2.4754750728607178, + "learning_rate": 2.6790329482242192e-05, + "loss": 0.4903, + "step": 15297 + }, + { + "epoch": 19.637997432605903, + "grad_norm": 1.357574701309204, + "learning_rate": 2.678990158322636e-05, + "loss": 0.453, + "step": 15298 + }, + { + "epoch": 19.6392811296534, + "grad_norm": 1.8737237453460693, + "learning_rate": 2.6789473684210526e-05, + "loss": 0.472, + "step": 15299 + }, + { + "epoch": 19.6405648267009, + "grad_norm": 5.4344658851623535, + "learning_rate": 2.6789045785194694e-05, + "loss": 0.5697, + "step": 15300 + }, + { + "epoch": 19.641848523748397, + "grad_norm": 5.3933329582214355, + "learning_rate": 2.6788617886178862e-05, + "loss": 0.6191, + "step": 15301 + }, + { + "epoch": 19.64313222079589, + "grad_norm": 1.308957815170288, + "learning_rate": 2.678818998716303e-05, + "loss": 0.4186, + "step": 15302 + }, + { + "epoch": 19.64441591784339, + "grad_norm": 1.363533616065979, + "learning_rate": 2.6787762088147196e-05, + "loss": 0.4443, + "step": 15303 + }, + { + "epoch": 19.645699614890887, + "grad_norm": 1.699954628944397, + "learning_rate": 2.6787334189131368e-05, + "loss": 0.4181, + "step": 15304 + }, + { + "epoch": 19.64698331193838, + "grad_norm": 1.8552170991897583, + "learning_rate": 2.6786906290115533e-05, + "loss": 0.4161, + "step": 15305 + }, + { + "epoch": 19.64826700898588, + "grad_norm": 1.6492624282836914, + "learning_rate": 2.67864783910997e-05, + "loss": 0.4365, + "step": 15306 + }, + { + "epoch": 19.649550706033377, + "grad_norm": 
1.4267241954803467, + "learning_rate": 2.678605049208387e-05, + "loss": 0.4221, + "step": 15307 + }, + { + "epoch": 19.65083440308087, + "grad_norm": 2.0540711879730225, + "learning_rate": 2.6785622593068034e-05, + "loss": 0.441, + "step": 15308 + }, + { + "epoch": 19.65211810012837, + "grad_norm": 2.3527331352233887, + "learning_rate": 2.6785194694052206e-05, + "loss": 0.467, + "step": 15309 + }, + { + "epoch": 19.653401797175867, + "grad_norm": 1.557627558708191, + "learning_rate": 2.678476679503637e-05, + "loss": 0.4668, + "step": 15310 + }, + { + "epoch": 19.654685494223365, + "grad_norm": 0.9778966307640076, + "learning_rate": 2.678433889602054e-05, + "loss": 0.4245, + "step": 15311 + }, + { + "epoch": 19.65596919127086, + "grad_norm": 1.7739957571029663, + "learning_rate": 2.6783910997004708e-05, + "loss": 0.4622, + "step": 15312 + }, + { + "epoch": 19.657252888318357, + "grad_norm": 0.9212932586669922, + "learning_rate": 2.6783483097988873e-05, + "loss": 0.4368, + "step": 15313 + }, + { + "epoch": 19.658536585365855, + "grad_norm": 0.9380995631217957, + "learning_rate": 2.6783055198973045e-05, + "loss": 0.4151, + "step": 15314 + }, + { + "epoch": 19.65982028241335, + "grad_norm": 1.4364985227584839, + "learning_rate": 2.678262729995721e-05, + "loss": 0.4913, + "step": 15315 + }, + { + "epoch": 19.661103979460847, + "grad_norm": 2.131556510925293, + "learning_rate": 2.678219940094138e-05, + "loss": 0.4244, + "step": 15316 + }, + { + "epoch": 19.662387676508345, + "grad_norm": 0.929992139339447, + "learning_rate": 2.6781771501925547e-05, + "loss": 0.4592, + "step": 15317 + }, + { + "epoch": 19.66367137355584, + "grad_norm": 0.8087905645370483, + "learning_rate": 2.6781343602909715e-05, + "loss": 0.439, + "step": 15318 + }, + { + "epoch": 19.664955070603337, + "grad_norm": 1.9311319589614868, + "learning_rate": 2.678091570389388e-05, + "loss": 0.4238, + "step": 15319 + }, + { + "epoch": 19.666238767650835, + "grad_norm": 1.6665552854537964, + "learning_rate": 
2.678048780487805e-05, + "loss": 0.4401, + "step": 15320 + }, + { + "epoch": 19.66752246469833, + "grad_norm": 1.3511521816253662, + "learning_rate": 2.6780059905862217e-05, + "loss": 0.4755, + "step": 15321 + }, + { + "epoch": 19.668806161745827, + "grad_norm": 1.8097949028015137, + "learning_rate": 2.6779632006846385e-05, + "loss": 0.449, + "step": 15322 + }, + { + "epoch": 19.670089858793325, + "grad_norm": 1.0921939611434937, + "learning_rate": 2.6779204107830554e-05, + "loss": 0.4989, + "step": 15323 + }, + { + "epoch": 19.671373555840823, + "grad_norm": 0.9668959379196167, + "learning_rate": 2.677877620881472e-05, + "loss": 0.4436, + "step": 15324 + }, + { + "epoch": 19.672657252888317, + "grad_norm": 4.813899040222168, + "learning_rate": 2.677834830979889e-05, + "loss": 0.4245, + "step": 15325 + }, + { + "epoch": 19.673940949935815, + "grad_norm": 1.3155299425125122, + "learning_rate": 2.6777920410783056e-05, + "loss": 0.4335, + "step": 15326 + }, + { + "epoch": 19.675224646983313, + "grad_norm": 1.3218791484832764, + "learning_rate": 2.677749251176722e-05, + "loss": 0.443, + "step": 15327 + }, + { + "epoch": 19.676508344030808, + "grad_norm": 1.3903614282608032, + "learning_rate": 2.6777064612751393e-05, + "loss": 0.4386, + "step": 15328 + }, + { + "epoch": 19.677792041078305, + "grad_norm": 1.4776580333709717, + "learning_rate": 2.6776636713735558e-05, + "loss": 0.4463, + "step": 15329 + }, + { + "epoch": 19.679075738125803, + "grad_norm": 1.1227535009384155, + "learning_rate": 2.677620881471973e-05, + "loss": 0.4279, + "step": 15330 + }, + { + "epoch": 19.680359435173298, + "grad_norm": 2.33514142036438, + "learning_rate": 2.6775780915703894e-05, + "loss": 0.4364, + "step": 15331 + }, + { + "epoch": 19.681643132220795, + "grad_norm": 0.9650523066520691, + "learning_rate": 2.6775353016688063e-05, + "loss": 0.4397, + "step": 15332 + }, + { + "epoch": 19.682926829268293, + "grad_norm": 6.458824157714844, + "learning_rate": 2.677492511767223e-05, + "loss": 
0.4145, + "step": 15333 + }, + { + "epoch": 19.68421052631579, + "grad_norm": 0.8971146941184998, + "learning_rate": 2.6774497218656396e-05, + "loss": 0.4213, + "step": 15334 + }, + { + "epoch": 19.685494223363285, + "grad_norm": 0.9895238280296326, + "learning_rate": 2.6774069319640565e-05, + "loss": 0.4743, + "step": 15335 + }, + { + "epoch": 19.686777920410783, + "grad_norm": 2.131234884262085, + "learning_rate": 2.6773641420624733e-05, + "loss": 0.4139, + "step": 15336 + }, + { + "epoch": 19.68806161745828, + "grad_norm": 1.2322962284088135, + "learning_rate": 2.67732135216089e-05, + "loss": 0.4316, + "step": 15337 + }, + { + "epoch": 19.689345314505776, + "grad_norm": 2.8481030464172363, + "learning_rate": 2.677278562259307e-05, + "loss": 0.4398, + "step": 15338 + }, + { + "epoch": 19.690629011553273, + "grad_norm": 1.930225133895874, + "learning_rate": 2.6772357723577235e-05, + "loss": 0.4289, + "step": 15339 + }, + { + "epoch": 19.69191270860077, + "grad_norm": 3.761840581893921, + "learning_rate": 2.6771929824561403e-05, + "loss": 0.4239, + "step": 15340 + }, + { + "epoch": 19.693196405648266, + "grad_norm": 1.221565842628479, + "learning_rate": 2.6771501925545572e-05, + "loss": 0.4899, + "step": 15341 + }, + { + "epoch": 19.694480102695763, + "grad_norm": 2.3190503120422363, + "learning_rate": 2.677107402652974e-05, + "loss": 0.4832, + "step": 15342 + }, + { + "epoch": 19.69576379974326, + "grad_norm": 4.370282173156738, + "learning_rate": 2.6770646127513905e-05, + "loss": 0.4817, + "step": 15343 + }, + { + "epoch": 19.69704749679076, + "grad_norm": 1.2939932346343994, + "learning_rate": 2.6770218228498077e-05, + "loss": 0.4682, + "step": 15344 + }, + { + "epoch": 19.698331193838253, + "grad_norm": 2.4138078689575195, + "learning_rate": 2.6769790329482242e-05, + "loss": 0.4498, + "step": 15345 + }, + { + "epoch": 19.69961489088575, + "grad_norm": 4.526739597320557, + "learning_rate": 2.676936243046641e-05, + "loss": 0.5243, + "step": 15346 + }, + { + 
"epoch": 19.70089858793325, + "grad_norm": 1.5688430070877075, + "learning_rate": 2.676893453145058e-05, + "loss": 0.5888, + "step": 15347 + }, + { + "epoch": 19.702182284980744, + "grad_norm": 6.2743611335754395, + "learning_rate": 2.6768506632434744e-05, + "loss": 0.5022, + "step": 15348 + }, + { + "epoch": 19.70346598202824, + "grad_norm": 1.79301917552948, + "learning_rate": 2.6768078733418916e-05, + "loss": 0.4992, + "step": 15349 + }, + { + "epoch": 19.70474967907574, + "grad_norm": 2.0405561923980713, + "learning_rate": 2.676765083440308e-05, + "loss": 0.5928, + "step": 15350 + }, + { + "epoch": 19.706033376123234, + "grad_norm": 1.4982653856277466, + "learning_rate": 2.676722293538725e-05, + "loss": 0.6444, + "step": 15351 + }, + { + "epoch": 19.70731707317073, + "grad_norm": 0.8873366713523865, + "learning_rate": 2.6766795036371417e-05, + "loss": 0.3862, + "step": 15352 + }, + { + "epoch": 19.70860077021823, + "grad_norm": 2.5070650577545166, + "learning_rate": 2.6766367137355583e-05, + "loss": 0.3809, + "step": 15353 + }, + { + "epoch": 19.709884467265724, + "grad_norm": 3.0242760181427, + "learning_rate": 2.6765939238339754e-05, + "loss": 0.4803, + "step": 15354 + }, + { + "epoch": 19.71116816431322, + "grad_norm": 1.1006617546081543, + "learning_rate": 2.676551133932392e-05, + "loss": 0.4195, + "step": 15355 + }, + { + "epoch": 19.71245186136072, + "grad_norm": 5.443369388580322, + "learning_rate": 2.6765083440308088e-05, + "loss": 0.3953, + "step": 15356 + }, + { + "epoch": 19.713735558408217, + "grad_norm": 1.7864034175872803, + "learning_rate": 2.6764655541292256e-05, + "loss": 0.4326, + "step": 15357 + }, + { + "epoch": 19.71501925545571, + "grad_norm": 1.4939405918121338, + "learning_rate": 2.6764227642276425e-05, + "loss": 0.4269, + "step": 15358 + }, + { + "epoch": 19.71630295250321, + "grad_norm": 4.001301288604736, + "learning_rate": 2.676379974326059e-05, + "loss": 0.4376, + "step": 15359 + }, + { + "epoch": 19.717586649550707, + "grad_norm": 
1.492911458015442, + "learning_rate": 2.6763371844244758e-05, + "loss": 0.4659, + "step": 15360 + }, + { + "epoch": 19.7188703465982, + "grad_norm": 1.8756524324417114, + "learning_rate": 2.6762943945228926e-05, + "loss": 0.4249, + "step": 15361 + }, + { + "epoch": 19.7201540436457, + "grad_norm": 0.9796969890594482, + "learning_rate": 2.6762516046213095e-05, + "loss": 0.4661, + "step": 15362 + }, + { + "epoch": 19.721437740693197, + "grad_norm": 1.2629797458648682, + "learning_rate": 2.6762088147197263e-05, + "loss": 0.4486, + "step": 15363 + }, + { + "epoch": 19.72272143774069, + "grad_norm": 1.0974693298339844, + "learning_rate": 2.6761660248181428e-05, + "loss": 0.4336, + "step": 15364 + }, + { + "epoch": 19.72400513478819, + "grad_norm": 2.044361114501953, + "learning_rate": 2.67612323491656e-05, + "loss": 0.5009, + "step": 15365 + }, + { + "epoch": 19.725288831835687, + "grad_norm": 1.0306376218795776, + "learning_rate": 2.6760804450149765e-05, + "loss": 0.4418, + "step": 15366 + }, + { + "epoch": 19.726572528883185, + "grad_norm": 1.685643196105957, + "learning_rate": 2.676037655113393e-05, + "loss": 0.4785, + "step": 15367 + }, + { + "epoch": 19.72785622593068, + "grad_norm": 1.5271445512771606, + "learning_rate": 2.6759948652118102e-05, + "loss": 0.4046, + "step": 15368 + }, + { + "epoch": 19.729139922978177, + "grad_norm": 10.090630531311035, + "learning_rate": 2.6759520753102267e-05, + "loss": 0.4398, + "step": 15369 + }, + { + "epoch": 19.730423620025675, + "grad_norm": 1.8754805326461792, + "learning_rate": 2.675909285408644e-05, + "loss": 0.4251, + "step": 15370 + }, + { + "epoch": 19.73170731707317, + "grad_norm": 2.1397464275360107, + "learning_rate": 2.6758664955070604e-05, + "loss": 0.4505, + "step": 15371 + }, + { + "epoch": 19.732991014120667, + "grad_norm": 0.9554893970489502, + "learning_rate": 2.6758237056054772e-05, + "loss": 0.4388, + "step": 15372 + }, + { + "epoch": 19.734274711168165, + "grad_norm": 1.0684245824813843, + "learning_rate": 
2.675780915703894e-05, + "loss": 0.434, + "step": 15373 + }, + { + "epoch": 19.73555840821566, + "grad_norm": 1.000993013381958, + "learning_rate": 2.6757381258023106e-05, + "loss": 0.429, + "step": 15374 + }, + { + "epoch": 19.736842105263158, + "grad_norm": 1.0899293422698975, + "learning_rate": 2.6756953359007274e-05, + "loss": 0.4009, + "step": 15375 + }, + { + "epoch": 19.738125802310655, + "grad_norm": 1.5179733037948608, + "learning_rate": 2.6756525459991442e-05, + "loss": 0.4334, + "step": 15376 + }, + { + "epoch": 19.739409499358153, + "grad_norm": 1.4104373455047607, + "learning_rate": 2.675609756097561e-05, + "loss": 0.4127, + "step": 15377 + }, + { + "epoch": 19.740693196405648, + "grad_norm": 2.1430704593658447, + "learning_rate": 2.675566966195978e-05, + "loss": 0.4817, + "step": 15378 + }, + { + "epoch": 19.741976893453145, + "grad_norm": 1.656249761581421, + "learning_rate": 2.6755241762943948e-05, + "loss": 0.4246, + "step": 15379 + }, + { + "epoch": 19.743260590500643, + "grad_norm": 1.2295960187911987, + "learning_rate": 2.6754813863928113e-05, + "loss": 0.447, + "step": 15380 + }, + { + "epoch": 19.744544287548138, + "grad_norm": 1.021532416343689, + "learning_rate": 2.675438596491228e-05, + "loss": 0.4886, + "step": 15381 + }, + { + "epoch": 19.745827984595635, + "grad_norm": 2.2645833492279053, + "learning_rate": 2.675395806589645e-05, + "loss": 0.4205, + "step": 15382 + }, + { + "epoch": 19.747111681643133, + "grad_norm": 4.469203948974609, + "learning_rate": 2.6753530166880615e-05, + "loss": 0.4512, + "step": 15383 + }, + { + "epoch": 19.748395378690628, + "grad_norm": 1.8802967071533203, + "learning_rate": 2.6753102267864786e-05, + "loss": 0.4614, + "step": 15384 + }, + { + "epoch": 19.749679075738126, + "grad_norm": 0.9360284209251404, + "learning_rate": 2.675267436884895e-05, + "loss": 0.4413, + "step": 15385 + }, + { + "epoch": 19.750962772785623, + "grad_norm": 4.223013877868652, + "learning_rate": 2.6752246469833123e-05, + "loss": 
0.4397, + "step": 15386 + }, + { + "epoch": 19.752246469833118, + "grad_norm": 2.2624051570892334, + "learning_rate": 2.6751818570817288e-05, + "loss": 0.4466, + "step": 15387 + }, + { + "epoch": 19.753530166880616, + "grad_norm": 1.966302752494812, + "learning_rate": 2.6751390671801453e-05, + "loss": 0.4554, + "step": 15388 + }, + { + "epoch": 19.754813863928113, + "grad_norm": 1.2479721307754517, + "learning_rate": 2.6750962772785625e-05, + "loss": 0.4743, + "step": 15389 + }, + { + "epoch": 19.75609756097561, + "grad_norm": 1.2155873775482178, + "learning_rate": 2.675053487376979e-05, + "loss": 0.4619, + "step": 15390 + }, + { + "epoch": 19.757381258023106, + "grad_norm": 2.5563013553619385, + "learning_rate": 2.675010697475396e-05, + "loss": 0.4714, + "step": 15391 + }, + { + "epoch": 19.758664955070603, + "grad_norm": 2.042163848876953, + "learning_rate": 2.6749679075738127e-05, + "loss": 0.4968, + "step": 15392 + }, + { + "epoch": 19.7599486521181, + "grad_norm": 1.3014919757843018, + "learning_rate": 2.6749251176722295e-05, + "loss": 0.4725, + "step": 15393 + }, + { + "epoch": 19.761232349165596, + "grad_norm": 1.4995228052139282, + "learning_rate": 2.6748823277706464e-05, + "loss": 0.505, + "step": 15394 + }, + { + "epoch": 19.762516046213094, + "grad_norm": 1.7364938259124756, + "learning_rate": 2.674839537869063e-05, + "loss": 0.4678, + "step": 15395 + }, + { + "epoch": 19.76379974326059, + "grad_norm": 13.418405532836914, + "learning_rate": 2.6747967479674797e-05, + "loss": 0.5058, + "step": 15396 + }, + { + "epoch": 19.765083440308086, + "grad_norm": 3.8972108364105225, + "learning_rate": 2.6747539580658966e-05, + "loss": 0.4776, + "step": 15397 + }, + { + "epoch": 19.766367137355584, + "grad_norm": 1.4633644819259644, + "learning_rate": 2.6747111681643134e-05, + "loss": 0.4784, + "step": 15398 + }, + { + "epoch": 19.76765083440308, + "grad_norm": 1.377723217010498, + "learning_rate": 2.67466837826273e-05, + "loss": 0.5518, + "step": 15399 + }, + { + 
"epoch": 19.76893453145058, + "grad_norm": 2.144800901412964, + "learning_rate": 2.6746255883611467e-05, + "loss": 0.5708, + "step": 15400 + }, + { + "epoch": 19.770218228498074, + "grad_norm": 1.6769756078720093, + "learning_rate": 2.6745827984595636e-05, + "loss": 0.6142, + "step": 15401 + }, + { + "epoch": 19.77150192554557, + "grad_norm": 1.4936727285385132, + "learning_rate": 2.6745400085579804e-05, + "loss": 0.4176, + "step": 15402 + }, + { + "epoch": 19.77278562259307, + "grad_norm": 1.4349762201309204, + "learning_rate": 2.6744972186563973e-05, + "loss": 0.4439, + "step": 15403 + }, + { + "epoch": 19.774069319640564, + "grad_norm": 1.211130976676941, + "learning_rate": 2.6744544287548138e-05, + "loss": 0.4328, + "step": 15404 + }, + { + "epoch": 19.77535301668806, + "grad_norm": 1.7135419845581055, + "learning_rate": 2.674411638853231e-05, + "loss": 0.4472, + "step": 15405 + }, + { + "epoch": 19.77663671373556, + "grad_norm": 2.6238813400268555, + "learning_rate": 2.6743688489516474e-05, + "loss": 0.471, + "step": 15406 + }, + { + "epoch": 19.777920410783054, + "grad_norm": 1.0285662412643433, + "learning_rate": 2.674326059050064e-05, + "loss": 0.3977, + "step": 15407 + }, + { + "epoch": 19.77920410783055, + "grad_norm": 1.0113379955291748, + "learning_rate": 2.674283269148481e-05, + "loss": 0.4121, + "step": 15408 + }, + { + "epoch": 19.78048780487805, + "grad_norm": 0.9820452332496643, + "learning_rate": 2.6742404792468976e-05, + "loss": 0.4384, + "step": 15409 + }, + { + "epoch": 19.781771501925547, + "grad_norm": 1.658669114112854, + "learning_rate": 2.6741976893453148e-05, + "loss": 0.4052, + "step": 15410 + }, + { + "epoch": 19.78305519897304, + "grad_norm": 1.200613021850586, + "learning_rate": 2.6741548994437313e-05, + "loss": 0.4699, + "step": 15411 + }, + { + "epoch": 19.78433889602054, + "grad_norm": 5.854940414428711, + "learning_rate": 2.674112109542148e-05, + "loss": 0.4132, + "step": 15412 + }, + { + "epoch": 19.785622593068037, + 
"grad_norm": 0.9729947447776794, + "learning_rate": 2.674069319640565e-05, + "loss": 0.4671, + "step": 15413 + }, + { + "epoch": 19.78690629011553, + "grad_norm": 0.8722844123840332, + "learning_rate": 2.6740265297389815e-05, + "loss": 0.4177, + "step": 15414 + }, + { + "epoch": 19.78818998716303, + "grad_norm": 1.2171355485916138, + "learning_rate": 2.6739837398373983e-05, + "loss": 0.4128, + "step": 15415 + }, + { + "epoch": 19.789473684210527, + "grad_norm": 2.5183563232421875, + "learning_rate": 2.6739409499358152e-05, + "loss": 0.4608, + "step": 15416 + }, + { + "epoch": 19.79075738125802, + "grad_norm": 2.94238018989563, + "learning_rate": 2.673898160034232e-05, + "loss": 0.4337, + "step": 15417 + }, + { + "epoch": 19.79204107830552, + "grad_norm": 1.133504033088684, + "learning_rate": 2.673855370132649e-05, + "loss": 0.4339, + "step": 15418 + }, + { + "epoch": 19.793324775353017, + "grad_norm": 0.9063124060630798, + "learning_rate": 2.6738125802310657e-05, + "loss": 0.4272, + "step": 15419 + }, + { + "epoch": 19.794608472400512, + "grad_norm": 1.7728992700576782, + "learning_rate": 2.6737697903294822e-05, + "loss": 0.4068, + "step": 15420 + }, + { + "epoch": 19.79589216944801, + "grad_norm": 1.9617719650268555, + "learning_rate": 2.673727000427899e-05, + "loss": 0.4416, + "step": 15421 + }, + { + "epoch": 19.797175866495508, + "grad_norm": 1.197841763496399, + "learning_rate": 2.673684210526316e-05, + "loss": 0.4359, + "step": 15422 + }, + { + "epoch": 19.798459563543005, + "grad_norm": 1.1605051755905151, + "learning_rate": 2.6736414206247324e-05, + "loss": 0.4393, + "step": 15423 + }, + { + "epoch": 19.7997432605905, + "grad_norm": 4.030681133270264, + "learning_rate": 2.6735986307231496e-05, + "loss": 0.4364, + "step": 15424 + }, + { + "epoch": 19.801026957637998, + "grad_norm": 1.5633314847946167, + "learning_rate": 2.673555840821566e-05, + "loss": 0.4109, + "step": 15425 + }, + { + "epoch": 19.802310654685495, + "grad_norm": 0.9941782355308533, + 
"learning_rate": 2.6735130509199833e-05, + "loss": 0.4436, + "step": 15426 + }, + { + "epoch": 19.80359435173299, + "grad_norm": 2.1887402534484863, + "learning_rate": 2.6734702610183998e-05, + "loss": 0.5188, + "step": 15427 + }, + { + "epoch": 19.804878048780488, + "grad_norm": 2.3905062675476074, + "learning_rate": 2.6734274711168163e-05, + "loss": 0.437, + "step": 15428 + }, + { + "epoch": 19.806161745827985, + "grad_norm": 0.9406556487083435, + "learning_rate": 2.6733846812152334e-05, + "loss": 0.4533, + "step": 15429 + }, + { + "epoch": 19.80744544287548, + "grad_norm": 1.6425809860229492, + "learning_rate": 2.67334189131365e-05, + "loss": 0.4452, + "step": 15430 + }, + { + "epoch": 19.808729139922978, + "grad_norm": 1.3112705945968628, + "learning_rate": 2.6732991014120668e-05, + "loss": 0.4432, + "step": 15431 + }, + { + "epoch": 19.810012836970476, + "grad_norm": 0.9874541759490967, + "learning_rate": 2.6732563115104836e-05, + "loss": 0.478, + "step": 15432 + }, + { + "epoch": 19.811296534017973, + "grad_norm": 1.0571074485778809, + "learning_rate": 2.6732135216089005e-05, + "loss": 0.4367, + "step": 15433 + }, + { + "epoch": 19.812580231065468, + "grad_norm": 2.1919846534729004, + "learning_rate": 2.6731707317073173e-05, + "loss": 0.4489, + "step": 15434 + }, + { + "epoch": 19.813863928112966, + "grad_norm": 1.1330541372299194, + "learning_rate": 2.6731279418057338e-05, + "loss": 0.4828, + "step": 15435 + }, + { + "epoch": 19.815147625160463, + "grad_norm": 1.0749950408935547, + "learning_rate": 2.6730851519041506e-05, + "loss": 0.4519, + "step": 15436 + }, + { + "epoch": 19.816431322207958, + "grad_norm": 1.5536136627197266, + "learning_rate": 2.6730423620025675e-05, + "loss": 0.498, + "step": 15437 + }, + { + "epoch": 19.817715019255456, + "grad_norm": 1.6122888326644897, + "learning_rate": 2.6729995721009843e-05, + "loss": 0.4742, + "step": 15438 + }, + { + "epoch": 19.818998716302954, + "grad_norm": 1.1873189210891724, + "learning_rate": 
2.672956782199401e-05, + "loss": 0.4834, + "step": 15439 + }, + { + "epoch": 19.820282413350448, + "grad_norm": 1.328989028930664, + "learning_rate": 2.672913992297818e-05, + "loss": 0.4596, + "step": 15440 + }, + { + "epoch": 19.821566110397946, + "grad_norm": 1.4329105615615845, + "learning_rate": 2.6728712023962345e-05, + "loss": 0.4773, + "step": 15441 + }, + { + "epoch": 19.822849807445444, + "grad_norm": 2.174877166748047, + "learning_rate": 2.6728284124946514e-05, + "loss": 0.5035, + "step": 15442 + }, + { + "epoch": 19.82413350449294, + "grad_norm": 2.0880041122436523, + "learning_rate": 2.6727856225930682e-05, + "loss": 0.4883, + "step": 15443 + }, + { + "epoch": 19.825417201540436, + "grad_norm": 3.4667177200317383, + "learning_rate": 2.6727428326914847e-05, + "loss": 0.5086, + "step": 15444 + }, + { + "epoch": 19.826700898587934, + "grad_norm": 1.7613615989685059, + "learning_rate": 2.672700042789902e-05, + "loss": 0.4201, + "step": 15445 + }, + { + "epoch": 19.82798459563543, + "grad_norm": 3.4787585735321045, + "learning_rate": 2.6726572528883184e-05, + "loss": 0.4465, + "step": 15446 + }, + { + "epoch": 19.829268292682926, + "grad_norm": 1.8317712545394897, + "learning_rate": 2.6726144629867352e-05, + "loss": 0.54, + "step": 15447 + }, + { + "epoch": 19.830551989730424, + "grad_norm": 3.9341275691986084, + "learning_rate": 2.672571673085152e-05, + "loss": 0.5164, + "step": 15448 + }, + { + "epoch": 19.83183568677792, + "grad_norm": 1.4895350933074951, + "learning_rate": 2.6725288831835686e-05, + "loss": 0.513, + "step": 15449 + }, + { + "epoch": 19.833119383825416, + "grad_norm": 1.6124247312545776, + "learning_rate": 2.6724860932819857e-05, + "loss": 0.5895, + "step": 15450 + }, + { + "epoch": 19.834403080872914, + "grad_norm": 2.3204033374786377, + "learning_rate": 2.6724433033804022e-05, + "loss": 0.6726, + "step": 15451 + }, + { + "epoch": 19.83568677792041, + "grad_norm": 1.0183557271957397, + "learning_rate": 2.672400513478819e-05, + "loss": 
0.4182, + "step": 15452 + }, + { + "epoch": 19.836970474967906, + "grad_norm": 1.3211416006088257, + "learning_rate": 2.672357723577236e-05, + "loss": 0.4252, + "step": 15453 + }, + { + "epoch": 19.838254172015404, + "grad_norm": 1.460368275642395, + "learning_rate": 2.6723149336756528e-05, + "loss": 0.4525, + "step": 15454 + }, + { + "epoch": 19.8395378690629, + "grad_norm": 1.0289498567581177, + "learning_rate": 2.6722721437740693e-05, + "loss": 0.4697, + "step": 15455 + }, + { + "epoch": 19.8408215661104, + "grad_norm": 1.2590693235397339, + "learning_rate": 2.672229353872486e-05, + "loss": 0.4659, + "step": 15456 + }, + { + "epoch": 19.842105263157894, + "grad_norm": 2.665977716445923, + "learning_rate": 2.672186563970903e-05, + "loss": 0.5102, + "step": 15457 + }, + { + "epoch": 19.84338896020539, + "grad_norm": 2.0490667819976807, + "learning_rate": 2.6721437740693198e-05, + "loss": 0.4828, + "step": 15458 + }, + { + "epoch": 19.84467265725289, + "grad_norm": 1.2672243118286133, + "learning_rate": 2.6721009841677366e-05, + "loss": 0.4507, + "step": 15459 + }, + { + "epoch": 19.845956354300384, + "grad_norm": 6.098684787750244, + "learning_rate": 2.672058194266153e-05, + "loss": 0.4196, + "step": 15460 + }, + { + "epoch": 19.84724005134788, + "grad_norm": 1.494028925895691, + "learning_rate": 2.67201540436457e-05, + "loss": 0.4635, + "step": 15461 + }, + { + "epoch": 19.84852374839538, + "grad_norm": 0.8866334557533264, + "learning_rate": 2.6719726144629868e-05, + "loss": 0.4081, + "step": 15462 + }, + { + "epoch": 19.849807445442874, + "grad_norm": 1.0594958066940308, + "learning_rate": 2.6719298245614033e-05, + "loss": 0.4791, + "step": 15463 + }, + { + "epoch": 19.85109114249037, + "grad_norm": 1.128925085067749, + "learning_rate": 2.6718870346598205e-05, + "loss": 0.4199, + "step": 15464 + }, + { + "epoch": 19.85237483953787, + "grad_norm": 1.5888596773147583, + "learning_rate": 2.671844244758237e-05, + "loss": 0.4475, + "step": 15465 + }, + { + "epoch": 
19.853658536585368, + "grad_norm": 0.996843159198761, + "learning_rate": 2.6718014548566542e-05, + "loss": 0.4317, + "step": 15466 + }, + { + "epoch": 19.854942233632862, + "grad_norm": 1.4717475175857544, + "learning_rate": 2.6717586649550707e-05, + "loss": 0.4691, + "step": 15467 + }, + { + "epoch": 19.85622593068036, + "grad_norm": 1.5258013010025024, + "learning_rate": 2.6717158750534872e-05, + "loss": 0.4354, + "step": 15468 + }, + { + "epoch": 19.857509627727858, + "grad_norm": 1.3599967956542969, + "learning_rate": 2.6716730851519044e-05, + "loss": 0.4285, + "step": 15469 + }, + { + "epoch": 19.858793324775352, + "grad_norm": 1.2137036323547363, + "learning_rate": 2.671630295250321e-05, + "loss": 0.4656, + "step": 15470 + }, + { + "epoch": 19.86007702182285, + "grad_norm": 1.8295872211456299, + "learning_rate": 2.6715875053487377e-05, + "loss": 0.4561, + "step": 15471 + }, + { + "epoch": 19.861360718870348, + "grad_norm": 7.442967891693115, + "learning_rate": 2.6715447154471546e-05, + "loss": 0.4878, + "step": 15472 + }, + { + "epoch": 19.862644415917842, + "grad_norm": 1.9630826711654663, + "learning_rate": 2.6715019255455714e-05, + "loss": 0.4779, + "step": 15473 + }, + { + "epoch": 19.86392811296534, + "grad_norm": 0.718241810798645, + "learning_rate": 2.6714591356439882e-05, + "loss": 0.4372, + "step": 15474 + }, + { + "epoch": 19.865211810012838, + "grad_norm": 1.3112242221832275, + "learning_rate": 2.6714163457424047e-05, + "loss": 0.4364, + "step": 15475 + }, + { + "epoch": 19.866495507060336, + "grad_norm": 1.5333595275878906, + "learning_rate": 2.6713735558408216e-05, + "loss": 0.4416, + "step": 15476 + }, + { + "epoch": 19.86777920410783, + "grad_norm": 1.0142673254013062, + "learning_rate": 2.6713307659392384e-05, + "loss": 0.4586, + "step": 15477 + }, + { + "epoch": 19.869062901155328, + "grad_norm": 3.941211462020874, + "learning_rate": 2.6712879760376553e-05, + "loss": 0.4845, + "step": 15478 + }, + { + "epoch": 19.870346598202826, + 
"grad_norm": 1.61285400390625, + "learning_rate": 2.6712451861360718e-05, + "loss": 0.4343, + "step": 15479 + }, + { + "epoch": 19.87163029525032, + "grad_norm": 1.2257388830184937, + "learning_rate": 2.671202396234489e-05, + "loss": 0.5055, + "step": 15480 + }, + { + "epoch": 19.872913992297818, + "grad_norm": 4.689775466918945, + "learning_rate": 2.6711596063329055e-05, + "loss": 0.4658, + "step": 15481 + }, + { + "epoch": 19.874197689345316, + "grad_norm": 1.5088200569152832, + "learning_rate": 2.6711168164313223e-05, + "loss": 0.4368, + "step": 15482 + }, + { + "epoch": 19.87548138639281, + "grad_norm": 1.0417084693908691, + "learning_rate": 2.671074026529739e-05, + "loss": 0.444, + "step": 15483 + }, + { + "epoch": 19.876765083440308, + "grad_norm": 1.7374961376190186, + "learning_rate": 2.6710312366281556e-05, + "loss": 0.4464, + "step": 15484 + }, + { + "epoch": 19.878048780487806, + "grad_norm": 1.8493119478225708, + "learning_rate": 2.6709884467265728e-05, + "loss": 0.4578, + "step": 15485 + }, + { + "epoch": 19.8793324775353, + "grad_norm": 1.5102180242538452, + "learning_rate": 2.6709456568249893e-05, + "loss": 0.4405, + "step": 15486 + }, + { + "epoch": 19.880616174582798, + "grad_norm": 1.8002169132232666, + "learning_rate": 2.670902866923406e-05, + "loss": 0.4911, + "step": 15487 + }, + { + "epoch": 19.881899871630296, + "grad_norm": 1.3526790142059326, + "learning_rate": 2.670860077021823e-05, + "loss": 0.4448, + "step": 15488 + }, + { + "epoch": 19.883183568677794, + "grad_norm": 4.200967788696289, + "learning_rate": 2.6708172871202395e-05, + "loss": 0.4692, + "step": 15489 + }, + { + "epoch": 19.884467265725288, + "grad_norm": 1.1190199851989746, + "learning_rate": 2.6707744972186567e-05, + "loss": 0.4624, + "step": 15490 + }, + { + "epoch": 19.885750962772786, + "grad_norm": 1.6851332187652588, + "learning_rate": 2.6707317073170732e-05, + "loss": 0.4636, + "step": 15491 + }, + { + "epoch": 19.887034659820284, + "grad_norm": 1.2871270179748535, + 
"learning_rate": 2.67068891741549e-05, + "loss": 0.4645, + "step": 15492 + }, + { + "epoch": 19.888318356867778, + "grad_norm": 2.829669237136841, + "learning_rate": 2.670646127513907e-05, + "loss": 0.4768, + "step": 15493 + }, + { + "epoch": 19.889602053915276, + "grad_norm": 3.4591708183288574, + "learning_rate": 2.6706033376123237e-05, + "loss": 0.4753, + "step": 15494 + }, + { + "epoch": 19.890885750962774, + "grad_norm": 1.2429828643798828, + "learning_rate": 2.6705605477107402e-05, + "loss": 0.4619, + "step": 15495 + }, + { + "epoch": 19.892169448010268, + "grad_norm": 3.5505528450012207, + "learning_rate": 2.670517757809157e-05, + "loss": 0.5395, + "step": 15496 + }, + { + "epoch": 19.893453145057766, + "grad_norm": 2.3468852043151855, + "learning_rate": 2.670474967907574e-05, + "loss": 0.5236, + "step": 15497 + }, + { + "epoch": 19.894736842105264, + "grad_norm": 1.7035764455795288, + "learning_rate": 2.6704321780059904e-05, + "loss": 0.528, + "step": 15498 + }, + { + "epoch": 19.89602053915276, + "grad_norm": 1.080427885055542, + "learning_rate": 2.6703893881044076e-05, + "loss": 0.5039, + "step": 15499 + }, + { + "epoch": 19.897304236200256, + "grad_norm": 2.6181511878967285, + "learning_rate": 2.670346598202824e-05, + "loss": 0.568, + "step": 15500 + }, + { + "epoch": 19.898587933247754, + "grad_norm": 9.378171920776367, + "learning_rate": 2.6703038083012413e-05, + "loss": 0.6996, + "step": 15501 + }, + { + "epoch": 19.89987163029525, + "grad_norm": 1.74847412109375, + "learning_rate": 2.6702610183996578e-05, + "loss": 0.426, + "step": 15502 + }, + { + "epoch": 19.901155327342746, + "grad_norm": 1.5418418645858765, + "learning_rate": 2.6702182284980743e-05, + "loss": 0.4276, + "step": 15503 + }, + { + "epoch": 19.902439024390244, + "grad_norm": 1.3688684701919556, + "learning_rate": 2.6701754385964914e-05, + "loss": 0.4291, + "step": 15504 + }, + { + "epoch": 19.90372272143774, + "grad_norm": 1.6538892984390259, + "learning_rate": 2.670132648694908e-05, 
+ "loss": 0.4373, + "step": 15505 + }, + { + "epoch": 19.905006418485236, + "grad_norm": 1.4626481533050537, + "learning_rate": 2.6700898587933248e-05, + "loss": 0.3968, + "step": 15506 + }, + { + "epoch": 19.906290115532734, + "grad_norm": 1.1133368015289307, + "learning_rate": 2.6700470688917416e-05, + "loss": 0.403, + "step": 15507 + }, + { + "epoch": 19.90757381258023, + "grad_norm": 0.941641628742218, + "learning_rate": 2.6700042789901585e-05, + "loss": 0.4341, + "step": 15508 + }, + { + "epoch": 19.90885750962773, + "grad_norm": 1.0071276426315308, + "learning_rate": 2.6699614890885753e-05, + "loss": 0.4272, + "step": 15509 + }, + { + "epoch": 19.910141206675224, + "grad_norm": 2.8699419498443604, + "learning_rate": 2.6699186991869918e-05, + "loss": 0.4632, + "step": 15510 + }, + { + "epoch": 19.911424903722722, + "grad_norm": 2.895303249359131, + "learning_rate": 2.6698759092854087e-05, + "loss": 0.448, + "step": 15511 + }, + { + "epoch": 19.91270860077022, + "grad_norm": 1.7342368364334106, + "learning_rate": 2.6698331193838255e-05, + "loss": 0.4317, + "step": 15512 + }, + { + "epoch": 19.913992297817714, + "grad_norm": 2.2334258556365967, + "learning_rate": 2.6697903294822423e-05, + "loss": 0.478, + "step": 15513 + }, + { + "epoch": 19.915275994865212, + "grad_norm": 4.167854309082031, + "learning_rate": 2.669747539580659e-05, + "loss": 0.4408, + "step": 15514 + }, + { + "epoch": 19.91655969191271, + "grad_norm": 4.708273410797119, + "learning_rate": 2.669704749679076e-05, + "loss": 0.4217, + "step": 15515 + }, + { + "epoch": 19.917843388960204, + "grad_norm": 1.0136451721191406, + "learning_rate": 2.6696619597774925e-05, + "loss": 0.4342, + "step": 15516 + }, + { + "epoch": 19.919127086007702, + "grad_norm": 0.8325221538543701, + "learning_rate": 2.6696191698759094e-05, + "loss": 0.4491, + "step": 15517 + }, + { + "epoch": 19.9204107830552, + "grad_norm": 1.9804697036743164, + "learning_rate": 2.6695763799743262e-05, + "loss": 0.4697, + "step": 15518 + }, 
+ { + "epoch": 19.921694480102694, + "grad_norm": 0.8399656414985657, + "learning_rate": 2.6695335900727427e-05, + "loss": 0.4003, + "step": 15519 + }, + { + "epoch": 19.922978177150192, + "grad_norm": 1.7043581008911133, + "learning_rate": 2.66949080017116e-05, + "loss": 0.45, + "step": 15520 + }, + { + "epoch": 19.92426187419769, + "grad_norm": 1.9166079759597778, + "learning_rate": 2.6694480102695764e-05, + "loss": 0.4341, + "step": 15521 + }, + { + "epoch": 19.925545571245188, + "grad_norm": 1.6090812683105469, + "learning_rate": 2.6694052203679932e-05, + "loss": 0.4617, + "step": 15522 + }, + { + "epoch": 19.926829268292682, + "grad_norm": 1.210909128189087, + "learning_rate": 2.66936243046641e-05, + "loss": 0.4228, + "step": 15523 + }, + { + "epoch": 19.92811296534018, + "grad_norm": 2.0035271644592285, + "learning_rate": 2.6693196405648266e-05, + "loss": 0.4491, + "step": 15524 + }, + { + "epoch": 19.929396662387678, + "grad_norm": 1.0364419221878052, + "learning_rate": 2.6692768506632438e-05, + "loss": 0.4679, + "step": 15525 + }, + { + "epoch": 19.930680359435172, + "grad_norm": 1.4919947385787964, + "learning_rate": 2.6692340607616603e-05, + "loss": 0.4756, + "step": 15526 + }, + { + "epoch": 19.93196405648267, + "grad_norm": 1.6284682750701904, + "learning_rate": 2.669191270860077e-05, + "loss": 0.402, + "step": 15527 + }, + { + "epoch": 19.933247753530168, + "grad_norm": 2.0746965408325195, + "learning_rate": 2.669148480958494e-05, + "loss": 0.4459, + "step": 15528 + }, + { + "epoch": 19.934531450577662, + "grad_norm": 7.528881072998047, + "learning_rate": 2.6691056910569104e-05, + "loss": 0.4612, + "step": 15529 + }, + { + "epoch": 19.93581514762516, + "grad_norm": 1.3463367223739624, + "learning_rate": 2.6690629011553273e-05, + "loss": 0.4556, + "step": 15530 + }, + { + "epoch": 19.937098844672658, + "grad_norm": 1.0339478254318237, + "learning_rate": 2.669020111253744e-05, + "loss": 0.4174, + "step": 15531 + }, + { + "epoch": 19.938382541720156, + 
"grad_norm": 0.9965205788612366, + "learning_rate": 2.668977321352161e-05, + "loss": 0.4373, + "step": 15532 + }, + { + "epoch": 19.93966623876765, + "grad_norm": 1.645919680595398, + "learning_rate": 2.6689345314505778e-05, + "loss": 0.4463, + "step": 15533 + }, + { + "epoch": 19.940949935815148, + "grad_norm": 2.2125301361083984, + "learning_rate": 2.6688917415489946e-05, + "loss": 0.4674, + "step": 15534 + }, + { + "epoch": 19.942233632862646, + "grad_norm": 3.6043362617492676, + "learning_rate": 2.668848951647411e-05, + "loss": 0.482, + "step": 15535 + }, + { + "epoch": 19.94351732991014, + "grad_norm": 2.8749732971191406, + "learning_rate": 2.668806161745828e-05, + "loss": 0.4132, + "step": 15536 + }, + { + "epoch": 19.944801026957638, + "grad_norm": 2.1270079612731934, + "learning_rate": 2.6687633718442448e-05, + "loss": 0.4873, + "step": 15537 + }, + { + "epoch": 19.946084724005136, + "grad_norm": 5.475948810577393, + "learning_rate": 2.6687205819426613e-05, + "loss": 0.5128, + "step": 15538 + }, + { + "epoch": 19.94736842105263, + "grad_norm": 2.0678648948669434, + "learning_rate": 2.6686777920410785e-05, + "loss": 0.501, + "step": 15539 + }, + { + "epoch": 19.948652118100128, + "grad_norm": 1.4347584247589111, + "learning_rate": 2.668635002139495e-05, + "loss": 0.4693, + "step": 15540 + }, + { + "epoch": 19.949935815147626, + "grad_norm": 1.1877014636993408, + "learning_rate": 2.6685922122379122e-05, + "loss": 0.4463, + "step": 15541 + }, + { + "epoch": 19.951219512195124, + "grad_norm": 1.6823116540908813, + "learning_rate": 2.6685494223363287e-05, + "loss": 0.4788, + "step": 15542 + }, + { + "epoch": 19.952503209242618, + "grad_norm": 3.352102756500244, + "learning_rate": 2.6685066324347452e-05, + "loss": 0.4883, + "step": 15543 + }, + { + "epoch": 19.953786906290116, + "grad_norm": 1.6322373151779175, + "learning_rate": 2.6684638425331624e-05, + "loss": 0.4373, + "step": 15544 + }, + { + "epoch": 19.955070603337614, + "grad_norm": 1.3575308322906494, + 
"learning_rate": 2.668421052631579e-05, + "loss": 0.4288, + "step": 15545 + }, + { + "epoch": 19.956354300385108, + "grad_norm": 3.1919031143188477, + "learning_rate": 2.6683782627299957e-05, + "loss": 0.4733, + "step": 15546 + }, + { + "epoch": 19.957637997432606, + "grad_norm": 4.5289154052734375, + "learning_rate": 2.6683354728284126e-05, + "loss": 0.4646, + "step": 15547 + }, + { + "epoch": 19.958921694480104, + "grad_norm": 1.8846224546432495, + "learning_rate": 2.6682926829268294e-05, + "loss": 0.5128, + "step": 15548 + }, + { + "epoch": 19.960205391527598, + "grad_norm": 2.2298543453216553, + "learning_rate": 2.6682498930252462e-05, + "loss": 0.539, + "step": 15549 + }, + { + "epoch": 19.961489088575096, + "grad_norm": 3.920189380645752, + "learning_rate": 2.6682071031236627e-05, + "loss": 0.5676, + "step": 15550 + }, + { + "epoch": 19.962772785622594, + "grad_norm": 3.9759058952331543, + "learning_rate": 2.6681643132220796e-05, + "loss": 0.6865, + "step": 15551 + }, + { + "epoch": 19.964056482670088, + "grad_norm": 2.2200279235839844, + "learning_rate": 2.6681215233204964e-05, + "loss": 0.421, + "step": 15552 + }, + { + "epoch": 19.965340179717586, + "grad_norm": 2.673266887664795, + "learning_rate": 2.6680787334189133e-05, + "loss": 0.4275, + "step": 15553 + }, + { + "epoch": 19.966623876765084, + "grad_norm": 1.0160462856292725, + "learning_rate": 2.6680359435173298e-05, + "loss": 0.4438, + "step": 15554 + }, + { + "epoch": 19.96790757381258, + "grad_norm": 1.158102035522461, + "learning_rate": 2.667993153615747e-05, + "loss": 0.4293, + "step": 15555 + }, + { + "epoch": 19.969191270860076, + "grad_norm": 1.3070735931396484, + "learning_rate": 2.6679503637141635e-05, + "loss": 0.4724, + "step": 15556 + }, + { + "epoch": 19.970474967907574, + "grad_norm": 1.2811498641967773, + "learning_rate": 2.6679075738125803e-05, + "loss": 0.4203, + "step": 15557 + }, + { + "epoch": 19.971758664955072, + "grad_norm": 1.0501339435577393, + "learning_rate": 
2.667864783910997e-05, + "loss": 0.4377, + "step": 15558 + }, + { + "epoch": 19.973042362002566, + "grad_norm": 0.933129608631134, + "learning_rate": 2.6678219940094136e-05, + "loss": 0.4181, + "step": 15559 + }, + { + "epoch": 19.974326059050064, + "grad_norm": 1.0424600839614868, + "learning_rate": 2.6677792041078308e-05, + "loss": 0.4113, + "step": 15560 + }, + { + "epoch": 19.975609756097562, + "grad_norm": 1.4563435316085815, + "learning_rate": 2.6677364142062473e-05, + "loss": 0.4622, + "step": 15561 + }, + { + "epoch": 19.976893453145056, + "grad_norm": 2.227245330810547, + "learning_rate": 2.667693624304664e-05, + "loss": 0.4613, + "step": 15562 + }, + { + "epoch": 19.978177150192554, + "grad_norm": 1.805374264717102, + "learning_rate": 2.667650834403081e-05, + "loss": 0.387, + "step": 15563 + }, + { + "epoch": 19.979460847240052, + "grad_norm": 5.399188041687012, + "learning_rate": 2.6676080445014975e-05, + "loss": 0.4391, + "step": 15564 + }, + { + "epoch": 19.98074454428755, + "grad_norm": 1.868782639503479, + "learning_rate": 2.6675652545999147e-05, + "loss": 0.4852, + "step": 15565 + }, + { + "epoch": 19.982028241335044, + "grad_norm": 1.7770413160324097, + "learning_rate": 2.6675224646983312e-05, + "loss": 0.4483, + "step": 15566 + }, + { + "epoch": 19.983311938382542, + "grad_norm": 1.3924862146377563, + "learning_rate": 2.667479674796748e-05, + "loss": 0.4246, + "step": 15567 + }, + { + "epoch": 19.98459563543004, + "grad_norm": 0.9323235750198364, + "learning_rate": 2.667436884895165e-05, + "loss": 0.4423, + "step": 15568 + }, + { + "epoch": 19.985879332477534, + "grad_norm": 3.2953133583068848, + "learning_rate": 2.6673940949935817e-05, + "loss": 0.4307, + "step": 15569 + }, + { + "epoch": 19.987163029525032, + "grad_norm": 1.0148425102233887, + "learning_rate": 2.6673513050919982e-05, + "loss": 0.4613, + "step": 15570 + }, + { + "epoch": 19.98844672657253, + "grad_norm": 2.2372586727142334, + "learning_rate": 2.667308515190415e-05, + "loss": 
0.4612, + "step": 15571 + }, + { + "epoch": 19.989730423620024, + "grad_norm": 1.0960177183151245, + "learning_rate": 2.667265725288832e-05, + "loss": 0.4284, + "step": 15572 + }, + { + "epoch": 19.991014120667522, + "grad_norm": 1.087197184562683, + "learning_rate": 2.6672229353872487e-05, + "loss": 0.4292, + "step": 15573 + }, + { + "epoch": 19.99229781771502, + "grad_norm": 1.7699270248413086, + "learning_rate": 2.6671801454856656e-05, + "loss": 0.4809, + "step": 15574 + }, + { + "epoch": 19.993581514762518, + "grad_norm": 1.2834656238555908, + "learning_rate": 2.667137355584082e-05, + "loss": 0.4655, + "step": 15575 + }, + { + "epoch": 19.994865211810012, + "grad_norm": 1.7596094608306885, + "learning_rate": 2.6670945656824993e-05, + "loss": 0.4839, + "step": 15576 + }, + { + "epoch": 19.99614890885751, + "grad_norm": 1.448447585105896, + "learning_rate": 2.6670517757809158e-05, + "loss": 0.5513, + "step": 15577 + }, + { + "epoch": 19.997432605905008, + "grad_norm": 3.1025478839874268, + "learning_rate": 2.6670089858793323e-05, + "loss": 0.5337, + "step": 15578 + }, + { + "epoch": 19.998716302952502, + "grad_norm": 2.5320003032684326, + "learning_rate": 2.6669661959777494e-05, + "loss": 0.5352, + "step": 15579 + }, + { + "epoch": 20.0, + "grad_norm": 2.597231149673462, + "learning_rate": 2.666923406076166e-05, + "loss": 0.587, + "step": 15580 + }, + { + "epoch": 20.001283697047498, + "grad_norm": 9.535999298095703, + "learning_rate": 2.666880616174583e-05, + "loss": 0.3901, + "step": 15581 + }, + { + "epoch": 20.002567394094992, + "grad_norm": 1.2430427074432373, + "learning_rate": 2.6668378262729996e-05, + "loss": 0.441, + "step": 15582 + }, + { + "epoch": 20.00385109114249, + "grad_norm": 1.3038766384124756, + "learning_rate": 2.6667950363714165e-05, + "loss": 0.4547, + "step": 15583 + }, + { + "epoch": 20.005134788189988, + "grad_norm": 1.6476281881332397, + "learning_rate": 2.6667522464698333e-05, + "loss": 0.4408, + "step": 15584 + }, + { + "epoch": 
20.006418485237482, + "grad_norm": 1.322585105895996, + "learning_rate": 2.6667094565682498e-05, + "loss": 0.4261, + "step": 15585 + }, + { + "epoch": 20.00770218228498, + "grad_norm": 1.4507989883422852, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.4114, + "step": 15586 + }, + { + "epoch": 20.008985879332478, + "grad_norm": 3.543966054916382, + "learning_rate": 2.6666238767650835e-05, + "loss": 0.4743, + "step": 15587 + }, + { + "epoch": 20.010269576379976, + "grad_norm": 1.5266238451004028, + "learning_rate": 2.6665810868635003e-05, + "loss": 0.4494, + "step": 15588 + }, + { + "epoch": 20.01155327342747, + "grad_norm": 2.671635866165161, + "learning_rate": 2.6665382969619172e-05, + "loss": 0.4306, + "step": 15589 + }, + { + "epoch": 20.012836970474968, + "grad_norm": 1.5174976587295532, + "learning_rate": 2.6664955070603337e-05, + "loss": 0.4358, + "step": 15590 + }, + { + "epoch": 20.014120667522466, + "grad_norm": 1.350419282913208, + "learning_rate": 2.6664527171587505e-05, + "loss": 0.4772, + "step": 15591 + }, + { + "epoch": 20.01540436456996, + "grad_norm": 1.707484245300293, + "learning_rate": 2.6664099272571674e-05, + "loss": 0.4296, + "step": 15592 + }, + { + "epoch": 20.016688061617458, + "grad_norm": 1.5657202005386353, + "learning_rate": 2.6663671373555842e-05, + "loss": 0.4484, + "step": 15593 + }, + { + "epoch": 20.017971758664956, + "grad_norm": 2.402522087097168, + "learning_rate": 2.6663243474540007e-05, + "loss": 0.481, + "step": 15594 + }, + { + "epoch": 20.01925545571245, + "grad_norm": 1.255325198173523, + "learning_rate": 2.666281557552418e-05, + "loss": 0.4394, + "step": 15595 + }, + { + "epoch": 20.020539152759948, + "grad_norm": 2.1129205226898193, + "learning_rate": 2.6662387676508344e-05, + "loss": 0.4515, + "step": 15596 + }, + { + "epoch": 20.021822849807446, + "grad_norm": 1.484135627746582, + "learning_rate": 2.6661959777492512e-05, + "loss": 0.4283, + "step": 15597 + }, + { + "epoch": 20.023106546854944, + "grad_norm": 
1.0961637496948242, + "learning_rate": 2.666153187847668e-05, + "loss": 0.4239, + "step": 15598 + }, + { + "epoch": 20.024390243902438, + "grad_norm": 1.4723620414733887, + "learning_rate": 2.6661103979460846e-05, + "loss": 0.4501, + "step": 15599 + }, + { + "epoch": 20.025673940949936, + "grad_norm": 2.2776038646698, + "learning_rate": 2.6660676080445018e-05, + "loss": 0.4453, + "step": 15600 + }, + { + "epoch": 20.026957637997434, + "grad_norm": 1.1581635475158691, + "learning_rate": 2.6660248181429183e-05, + "loss": 0.37, + "step": 15601 + }, + { + "epoch": 20.028241335044928, + "grad_norm": 0.9137828350067139, + "learning_rate": 2.665982028241335e-05, + "loss": 0.3977, + "step": 15602 + }, + { + "epoch": 20.029525032092426, + "grad_norm": 1.61632239818573, + "learning_rate": 2.665939238339752e-05, + "loss": 0.3755, + "step": 15603 + }, + { + "epoch": 20.030808729139924, + "grad_norm": 1.0081849098205566, + "learning_rate": 2.6658964484381684e-05, + "loss": 0.389, + "step": 15604 + }, + { + "epoch": 20.03209242618742, + "grad_norm": 2.4389421939849854, + "learning_rate": 2.6658536585365856e-05, + "loss": 0.4112, + "step": 15605 + }, + { + "epoch": 20.033376123234916, + "grad_norm": 1.1908222436904907, + "learning_rate": 2.665810868635002e-05, + "loss": 0.409, + "step": 15606 + }, + { + "epoch": 20.034659820282414, + "grad_norm": 1.2959504127502441, + "learning_rate": 2.665768078733419e-05, + "loss": 0.4274, + "step": 15607 + }, + { + "epoch": 20.035943517329912, + "grad_norm": 1.1508485078811646, + "learning_rate": 2.6657252888318358e-05, + "loss": 0.4156, + "step": 15608 + }, + { + "epoch": 20.037227214377406, + "grad_norm": 0.9461773037910461, + "learning_rate": 2.6656824989302527e-05, + "loss": 0.4375, + "step": 15609 + }, + { + "epoch": 20.038510911424904, + "grad_norm": 2.882908821105957, + "learning_rate": 2.665639709028669e-05, + "loss": 0.3965, + "step": 15610 + }, + { + "epoch": 20.039794608472402, + "grad_norm": 1.088381290435791, + "learning_rate": 
2.665596919127086e-05, + "loss": 0.4264, + "step": 15611 + }, + { + "epoch": 20.041078305519896, + "grad_norm": 1.3184545040130615, + "learning_rate": 2.665554129225503e-05, + "loss": 0.431, + "step": 15612 + }, + { + "epoch": 20.042362002567394, + "grad_norm": 1.1668596267700195, + "learning_rate": 2.6655113393239197e-05, + "loss": 0.4687, + "step": 15613 + }, + { + "epoch": 20.043645699614892, + "grad_norm": 1.2433254718780518, + "learning_rate": 2.6654685494223365e-05, + "loss": 0.4385, + "step": 15614 + }, + { + "epoch": 20.044929396662386, + "grad_norm": 2.216977119445801, + "learning_rate": 2.665425759520753e-05, + "loss": 0.4386, + "step": 15615 + }, + { + "epoch": 20.046213093709884, + "grad_norm": 1.3164881467819214, + "learning_rate": 2.6653829696191702e-05, + "loss": 0.4032, + "step": 15616 + }, + { + "epoch": 20.047496790757382, + "grad_norm": 2.369838237762451, + "learning_rate": 2.6653401797175867e-05, + "loss": 0.436, + "step": 15617 + }, + { + "epoch": 20.048780487804876, + "grad_norm": 1.5288846492767334, + "learning_rate": 2.6652973898160032e-05, + "loss": 0.4164, + "step": 15618 + }, + { + "epoch": 20.050064184852374, + "grad_norm": 1.3169610500335693, + "learning_rate": 2.6652545999144204e-05, + "loss": 0.4547, + "step": 15619 + }, + { + "epoch": 20.051347881899872, + "grad_norm": 1.5057543516159058, + "learning_rate": 2.665211810012837e-05, + "loss": 0.4703, + "step": 15620 + }, + { + "epoch": 20.05263157894737, + "grad_norm": 1.8136385679244995, + "learning_rate": 2.665169020111254e-05, + "loss": 0.4817, + "step": 15621 + }, + { + "epoch": 20.053915275994864, + "grad_norm": 2.6364376544952393, + "learning_rate": 2.6651262302096706e-05, + "loss": 0.5178, + "step": 15622 + }, + { + "epoch": 20.055198973042362, + "grad_norm": 1.6611406803131104, + "learning_rate": 2.6650834403080874e-05, + "loss": 0.4062, + "step": 15623 + }, + { + "epoch": 20.05648267008986, + "grad_norm": 1.6259480714797974, + "learning_rate": 2.6650406504065043e-05, + "loss": 
0.4575, + "step": 15624 + }, + { + "epoch": 20.057766367137354, + "grad_norm": 1.5448805093765259, + "learning_rate": 2.6649978605049208e-05, + "loss": 0.4581, + "step": 15625 + }, + { + "epoch": 20.059050064184852, + "grad_norm": 2.305903196334839, + "learning_rate": 2.6649550706033376e-05, + "loss": 0.4977, + "step": 15626 + }, + { + "epoch": 20.06033376123235, + "grad_norm": 1.6033025979995728, + "learning_rate": 2.6649122807017544e-05, + "loss": 0.5399, + "step": 15627 + }, + { + "epoch": 20.061617458279844, + "grad_norm": 3.8410730361938477, + "learning_rate": 2.6648694908001713e-05, + "loss": 0.474, + "step": 15628 + }, + { + "epoch": 20.062901155327342, + "grad_norm": 2.3099215030670166, + "learning_rate": 2.664826700898588e-05, + "loss": 0.5525, + "step": 15629 + }, + { + "epoch": 20.06418485237484, + "grad_norm": 3.778687000274658, + "learning_rate": 2.664783910997005e-05, + "loss": 0.5994, + "step": 15630 + }, + { + "epoch": 20.065468549422338, + "grad_norm": 1.0081672668457031, + "learning_rate": 2.6647411210954215e-05, + "loss": 0.4017, + "step": 15631 + }, + { + "epoch": 20.066752246469832, + "grad_norm": 8.097123146057129, + "learning_rate": 2.6646983311938383e-05, + "loss": 0.4078, + "step": 15632 + }, + { + "epoch": 20.06803594351733, + "grad_norm": 2.1654672622680664, + "learning_rate": 2.664655541292255e-05, + "loss": 0.4403, + "step": 15633 + }, + { + "epoch": 20.069319640564828, + "grad_norm": 3.158895969390869, + "learning_rate": 2.6646127513906716e-05, + "loss": 0.4301, + "step": 15634 + }, + { + "epoch": 20.070603337612322, + "grad_norm": 1.1716517210006714, + "learning_rate": 2.6645699614890888e-05, + "loss": 0.398, + "step": 15635 + }, + { + "epoch": 20.07188703465982, + "grad_norm": 1.1766705513000488, + "learning_rate": 2.6645271715875053e-05, + "loss": 0.3958, + "step": 15636 + }, + { + "epoch": 20.073170731707318, + "grad_norm": 1.1790071725845337, + "learning_rate": 2.6644843816859225e-05, + "loss": 0.4221, + "step": 15637 + }, + { + 
"epoch": 20.074454428754812, + "grad_norm": 1.9673117399215698, + "learning_rate": 2.664441591784339e-05, + "loss": 0.4419, + "step": 15638 + }, + { + "epoch": 20.07573812580231, + "grad_norm": 2.7084569931030273, + "learning_rate": 2.6643988018827555e-05, + "loss": 0.4403, + "step": 15639 + }, + { + "epoch": 20.077021822849808, + "grad_norm": 1.16310715675354, + "learning_rate": 2.6643560119811727e-05, + "loss": 0.4044, + "step": 15640 + }, + { + "epoch": 20.078305519897306, + "grad_norm": 1.5654128789901733, + "learning_rate": 2.6643132220795892e-05, + "loss": 0.4137, + "step": 15641 + }, + { + "epoch": 20.0795892169448, + "grad_norm": NaN, + "learning_rate": 2.6643132220795892e-05, + "loss": 0.4563, + "step": 15642 + }, + { + "epoch": 20.080872913992298, + "grad_norm": 1.658786416053772, + "learning_rate": 2.664270432178006e-05, + "loss": 0.3607, + "step": 15643 + }, + { + "epoch": 20.082156611039796, + "grad_norm": 5.559004783630371, + "learning_rate": 2.664227642276423e-05, + "loss": 0.3903, + "step": 15644 + }, + { + "epoch": 20.08344030808729, + "grad_norm": 1.252278208732605, + "learning_rate": 2.6641848523748397e-05, + "loss": 0.4188, + "step": 15645 + }, + { + "epoch": 20.084724005134788, + "grad_norm": 2.572221517562866, + "learning_rate": 2.6641420624732566e-05, + "loss": 0.4409, + "step": 15646 + }, + { + "epoch": 20.086007702182286, + "grad_norm": 1.149282455444336, + "learning_rate": 2.664099272571673e-05, + "loss": 0.4094, + "step": 15647 + }, + { + "epoch": 20.08729139922978, + "grad_norm": 3.1306352615356445, + "learning_rate": 2.66405648267009e-05, + "loss": 0.4317, + "step": 15648 + }, + { + "epoch": 20.088575096277278, + "grad_norm": 1.492405891418457, + "learning_rate": 2.6640136927685067e-05, + "loss": 0.4248, + "step": 15649 + }, + { + "epoch": 20.089858793324776, + "grad_norm": 2.0460500717163086, + "learning_rate": 2.6639709028669236e-05, + "loss": 0.4213, + "step": 15650 + }, + { + "epoch": 20.09114249037227, + "grad_norm": 
1.2615395784378052, + "learning_rate": 2.66392811296534e-05, + "loss": 0.4134, + "step": 15651 + }, + { + "epoch": 20.09242618741977, + "grad_norm": 1.3868675231933594, + "learning_rate": 2.663885323063757e-05, + "loss": 0.4384, + "step": 15652 + }, + { + "epoch": 20.093709884467266, + "grad_norm": 1.620784878730774, + "learning_rate": 2.6638425331621738e-05, + "loss": 0.441, + "step": 15653 + }, + { + "epoch": 20.094993581514764, + "grad_norm": 1.8080378770828247, + "learning_rate": 2.6637997432605906e-05, + "loss": 0.4166, + "step": 15654 + }, + { + "epoch": 20.09627727856226, + "grad_norm": 1.4381343126296997, + "learning_rate": 2.6637569533590075e-05, + "loss": 0.4194, + "step": 15655 + }, + { + "epoch": 20.097560975609756, + "grad_norm": 1.207200288772583, + "learning_rate": 2.663714163457424e-05, + "loss": 0.4496, + "step": 15656 + }, + { + "epoch": 20.098844672657254, + "grad_norm": 1.7669074535369873, + "learning_rate": 2.663671373555841e-05, + "loss": 0.4127, + "step": 15657 + }, + { + "epoch": 20.10012836970475, + "grad_norm": 1.8951219320297241, + "learning_rate": 2.6636285836542576e-05, + "loss": 0.3875, + "step": 15658 + }, + { + "epoch": 20.101412066752246, + "grad_norm": 1.5327250957489014, + "learning_rate": 2.663585793752674e-05, + "loss": 0.4617, + "step": 15659 + }, + { + "epoch": 20.102695763799744, + "grad_norm": 2.1126821041107178, + "learning_rate": 2.6635430038510913e-05, + "loss": 0.4279, + "step": 15660 + }, + { + "epoch": 20.10397946084724, + "grad_norm": 3.462782144546509, + "learning_rate": 2.6635002139495078e-05, + "loss": 0.4405, + "step": 15661 + }, + { + "epoch": 20.105263157894736, + "grad_norm": 1.073107123374939, + "learning_rate": 2.663457424047925e-05, + "loss": 0.463, + "step": 15662 + }, + { + "epoch": 20.106546854942234, + "grad_norm": 3.2827484607696533, + "learning_rate": 2.6634146341463415e-05, + "loss": 0.4203, + "step": 15663 + }, + { + "epoch": 20.107830551989732, + "grad_norm": 1.9618955850601196, + "learning_rate": 
2.6633718442447583e-05, + "loss": 0.4556, + "step": 15664 + }, + { + "epoch": 20.109114249037226, + "grad_norm": 1.4736313819885254, + "learning_rate": 2.6633290543431752e-05, + "loss": 0.4517, + "step": 15665 + }, + { + "epoch": 20.110397946084724, + "grad_norm": 2.6137475967407227, + "learning_rate": 2.6632862644415917e-05, + "loss": 0.4195, + "step": 15666 + }, + { + "epoch": 20.111681643132222, + "grad_norm": 1.3946236371994019, + "learning_rate": 2.6632434745400085e-05, + "loss": 0.4696, + "step": 15667 + }, + { + "epoch": 20.112965340179716, + "grad_norm": 1.1479772329330444, + "learning_rate": 2.6632006846384254e-05, + "loss": 0.4598, + "step": 15668 + }, + { + "epoch": 20.114249037227214, + "grad_norm": 0.9959003329277039, + "learning_rate": 2.6631578947368422e-05, + "loss": 0.4467, + "step": 15669 + }, + { + "epoch": 20.115532734274712, + "grad_norm": 5.191657543182373, + "learning_rate": 2.663115104835259e-05, + "loss": 0.4252, + "step": 15670 + }, + { + "epoch": 20.116816431322206, + "grad_norm": 1.6046162843704224, + "learning_rate": 2.663072314933676e-05, + "loss": 0.4546, + "step": 15671 + }, + { + "epoch": 20.118100128369704, + "grad_norm": 1.365282654762268, + "learning_rate": 2.6630295250320924e-05, + "loss": 0.4845, + "step": 15672 + }, + { + "epoch": 20.119383825417202, + "grad_norm": 1.4194389581680298, + "learning_rate": 2.6629867351305092e-05, + "loss": 0.4445, + "step": 15673 + }, + { + "epoch": 20.1206675224647, + "grad_norm": 1.176281213760376, + "learning_rate": 2.662943945228926e-05, + "loss": 0.4964, + "step": 15674 + }, + { + "epoch": 20.121951219512194, + "grad_norm": 1.506332278251648, + "learning_rate": 2.6629011553273426e-05, + "loss": 0.4377, + "step": 15675 + }, + { + "epoch": 20.123234916559692, + "grad_norm": 2.6343860626220703, + "learning_rate": 2.6628583654257598e-05, + "loss": 0.5272, + "step": 15676 + }, + { + "epoch": 20.12451861360719, + "grad_norm": 5.08827018737793, + "learning_rate": 2.6628155755241763e-05, + "loss": 
0.4794, + "step": 15677 + }, + { + "epoch": 20.125802310654684, + "grad_norm": 1.8948357105255127, + "learning_rate": 2.6627727856225934e-05, + "loss": 0.5405, + "step": 15678 + }, + { + "epoch": 20.127086007702182, + "grad_norm": 3.9578936100006104, + "learning_rate": 2.66272999572101e-05, + "loss": 0.6054, + "step": 15679 + }, + { + "epoch": 20.12836970474968, + "grad_norm": 2.216722011566162, + "learning_rate": 2.6626872058194265e-05, + "loss": 0.6184, + "step": 15680 + }, + { + "epoch": 20.129653401797174, + "grad_norm": 0.8971359133720398, + "learning_rate": 2.6626444159178436e-05, + "loss": 0.3752, + "step": 15681 + }, + { + "epoch": 20.130937098844672, + "grad_norm": 1.2479093074798584, + "learning_rate": 2.66260162601626e-05, + "loss": 0.4102, + "step": 15682 + }, + { + "epoch": 20.13222079589217, + "grad_norm": 2.3234498500823975, + "learning_rate": 2.662558836114677e-05, + "loss": 0.4622, + "step": 15683 + }, + { + "epoch": 20.133504492939664, + "grad_norm": 1.6319665908813477, + "learning_rate": 2.6625160462130938e-05, + "loss": 0.4272, + "step": 15684 + }, + { + "epoch": 20.134788189987162, + "grad_norm": 2.558448553085327, + "learning_rate": 2.6624732563115107e-05, + "loss": 0.418, + "step": 15685 + }, + { + "epoch": 20.13607188703466, + "grad_norm": 1.4273390769958496, + "learning_rate": 2.6624304664099275e-05, + "loss": 0.3933, + "step": 15686 + }, + { + "epoch": 20.137355584082158, + "grad_norm": 1.1108330488204956, + "learning_rate": 2.662387676508344e-05, + "loss": 0.4186, + "step": 15687 + }, + { + "epoch": 20.138639281129652, + "grad_norm": 1.9628366231918335, + "learning_rate": 2.662344886606761e-05, + "loss": 0.4041, + "step": 15688 + }, + { + "epoch": 20.13992297817715, + "grad_norm": 1.7637200355529785, + "learning_rate": 2.6623020967051777e-05, + "loss": 0.4484, + "step": 15689 + }, + { + "epoch": 20.141206675224648, + "grad_norm": 1.4046173095703125, + "learning_rate": 2.6622593068035945e-05, + "loss": 0.4423, + "step": 15690 + }, + { + 
"epoch": 20.142490372272142, + "grad_norm": 1.5903326272964478, + "learning_rate": 2.662216516902011e-05, + "loss": 0.4102, + "step": 15691 + }, + { + "epoch": 20.14377406931964, + "grad_norm": 1.6817368268966675, + "learning_rate": 2.6621737270004282e-05, + "loss": 0.4014, + "step": 15692 + }, + { + "epoch": 20.145057766367138, + "grad_norm": 1.5545953512191772, + "learning_rate": 2.6621309370988447e-05, + "loss": 0.4364, + "step": 15693 + }, + { + "epoch": 20.146341463414632, + "grad_norm": 1.0979784727096558, + "learning_rate": 2.6620881471972615e-05, + "loss": 0.4194, + "step": 15694 + }, + { + "epoch": 20.14762516046213, + "grad_norm": 1.1129803657531738, + "learning_rate": 2.6620453572956784e-05, + "loss": 0.4419, + "step": 15695 + }, + { + "epoch": 20.14890885750963, + "grad_norm": 1.2297781705856323, + "learning_rate": 2.662002567394095e-05, + "loss": 0.438, + "step": 15696 + }, + { + "epoch": 20.150192554557126, + "grad_norm": 1.886196255683899, + "learning_rate": 2.661959777492512e-05, + "loss": 0.4448, + "step": 15697 + }, + { + "epoch": 20.15147625160462, + "grad_norm": 0.9593652486801147, + "learning_rate": 2.6619169875909286e-05, + "loss": 0.39, + "step": 15698 + }, + { + "epoch": 20.15275994865212, + "grad_norm": 1.6743059158325195, + "learning_rate": 2.6618741976893454e-05, + "loss": 0.4339, + "step": 15699 + }, + { + "epoch": 20.154043645699616, + "grad_norm": 3.4349818229675293, + "learning_rate": 2.6618314077877623e-05, + "loss": 0.4386, + "step": 15700 + }, + { + "epoch": 20.15532734274711, + "grad_norm": 1.016878366470337, + "learning_rate": 2.6617886178861788e-05, + "loss": 0.4108, + "step": 15701 + }, + { + "epoch": 20.15661103979461, + "grad_norm": 1.168575644493103, + "learning_rate": 2.6617458279845956e-05, + "loss": 0.4288, + "step": 15702 + }, + { + "epoch": 20.157894736842106, + "grad_norm": 0.9940716028213501, + "learning_rate": 2.6617030380830124e-05, + "loss": 0.4483, + "step": 15703 + }, + { + "epoch": 20.1591784338896, + 
"grad_norm": 1.056566834449768, + "learning_rate": 2.6616602481814293e-05, + "loss": 0.4629, + "step": 15704 + }, + { + "epoch": 20.1604621309371, + "grad_norm": 3.9120686054229736, + "learning_rate": 2.661617458279846e-05, + "loss": 0.4404, + "step": 15705 + }, + { + "epoch": 20.161745827984596, + "grad_norm": 1.003556728363037, + "learning_rate": 2.661574668378263e-05, + "loss": 0.404, + "step": 15706 + }, + { + "epoch": 20.163029525032094, + "grad_norm": 6.403543472290039, + "learning_rate": 2.6615318784766795e-05, + "loss": 0.4202, + "step": 15707 + }, + { + "epoch": 20.16431322207959, + "grad_norm": 1.7791417837142944, + "learning_rate": 2.6614890885750963e-05, + "loss": 0.4455, + "step": 15708 + }, + { + "epoch": 20.165596919127086, + "grad_norm": 0.9013654589653015, + "learning_rate": 2.661446298673513e-05, + "loss": 0.4613, + "step": 15709 + }, + { + "epoch": 20.166880616174584, + "grad_norm": 0.8744439482688904, + "learning_rate": 2.6614035087719297e-05, + "loss": 0.4312, + "step": 15710 + }, + { + "epoch": 20.16816431322208, + "grad_norm": 1.1492316722869873, + "learning_rate": 2.661360718870347e-05, + "loss": 0.43, + "step": 15711 + }, + { + "epoch": 20.169448010269576, + "grad_norm": 0.9904969334602356, + "learning_rate": 2.6613179289687633e-05, + "loss": 0.4431, + "step": 15712 + }, + { + "epoch": 20.170731707317074, + "grad_norm": 1.7329901456832886, + "learning_rate": 2.6612751390671802e-05, + "loss": 0.4347, + "step": 15713 + }, + { + "epoch": 20.17201540436457, + "grad_norm": 1.2418863773345947, + "learning_rate": 2.661232349165597e-05, + "loss": 0.4345, + "step": 15714 + }, + { + "epoch": 20.173299101412066, + "grad_norm": 2.889058828353882, + "learning_rate": 2.6611895592640135e-05, + "loss": 0.4309, + "step": 15715 + }, + { + "epoch": 20.174582798459564, + "grad_norm": 1.3772673606872559, + "learning_rate": 2.6611467693624307e-05, + "loss": 0.4222, + "step": 15716 + }, + { + "epoch": 20.17586649550706, + "grad_norm": 1.5714212656021118, + 
"learning_rate": 2.6611039794608472e-05, + "loss": 0.4399, + "step": 15717 + }, + { + "epoch": 20.177150192554556, + "grad_norm": 1.6944063901901245, + "learning_rate": 2.661061189559264e-05, + "loss": 0.4432, + "step": 15718 + }, + { + "epoch": 20.178433889602054, + "grad_norm": 1.5443816184997559, + "learning_rate": 2.661018399657681e-05, + "loss": 0.3771, + "step": 15719 + }, + { + "epoch": 20.179717586649552, + "grad_norm": 1.29288649559021, + "learning_rate": 2.6609756097560974e-05, + "loss": 0.439, + "step": 15720 + }, + { + "epoch": 20.181001283697046, + "grad_norm": 1.5470987558364868, + "learning_rate": 2.6609328198545146e-05, + "loss": 0.4225, + "step": 15721 + }, + { + "epoch": 20.182284980744544, + "grad_norm": 1.6222745180130005, + "learning_rate": 2.660890029952931e-05, + "loss": 0.4986, + "step": 15722 + }, + { + "epoch": 20.183568677792042, + "grad_norm": 1.8528783321380615, + "learning_rate": 2.660847240051348e-05, + "loss": 0.4643, + "step": 15723 + }, + { + "epoch": 20.184852374839537, + "grad_norm": 2.125894069671631, + "learning_rate": 2.6608044501497648e-05, + "loss": 0.4801, + "step": 15724 + }, + { + "epoch": 20.186136071887034, + "grad_norm": 3.584852457046509, + "learning_rate": 2.6607616602481816e-05, + "loss": 0.4688, + "step": 15725 + }, + { + "epoch": 20.187419768934532, + "grad_norm": 2.586782932281494, + "learning_rate": 2.660718870346598e-05, + "loss": 0.4841, + "step": 15726 + }, + { + "epoch": 20.188703465982027, + "grad_norm": 4.303097248077393, + "learning_rate": 2.660676080445015e-05, + "loss": 0.4834, + "step": 15727 + }, + { + "epoch": 20.189987163029524, + "grad_norm": 5.587726593017578, + "learning_rate": 2.6606332905434318e-05, + "loss": 0.5167, + "step": 15728 + }, + { + "epoch": 20.191270860077022, + "grad_norm": 2.075878381729126, + "learning_rate": 2.6605905006418486e-05, + "loss": 0.5658, + "step": 15729 + }, + { + "epoch": 20.19255455712452, + "grad_norm": 7.355410099029541, + "learning_rate": 2.6605477107402655e-05, 
+ "loss": 0.6221, + "step": 15730 + }, + { + "epoch": 20.193838254172015, + "grad_norm": 1.1353785991668701, + "learning_rate": 2.660504920838682e-05, + "loss": 0.4138, + "step": 15731 + }, + { + "epoch": 20.195121951219512, + "grad_norm": 1.4543029069900513, + "learning_rate": 2.660462130937099e-05, + "loss": 0.3933, + "step": 15732 + }, + { + "epoch": 20.19640564826701, + "grad_norm": 1.4073846340179443, + "learning_rate": 2.6604193410355156e-05, + "loss": 0.4219, + "step": 15733 + }, + { + "epoch": 20.197689345314505, + "grad_norm": 1.8327407836914062, + "learning_rate": 2.660376551133932e-05, + "loss": 0.437, + "step": 15734 + }, + { + "epoch": 20.198973042362002, + "grad_norm": 1.8635598421096802, + "learning_rate": 2.6603337612323493e-05, + "loss": 0.4243, + "step": 15735 + }, + { + "epoch": 20.2002567394095, + "grad_norm": 0.884979248046875, + "learning_rate": 2.6602909713307658e-05, + "loss": 0.4165, + "step": 15736 + }, + { + "epoch": 20.201540436456995, + "grad_norm": 1.919843316078186, + "learning_rate": 2.660248181429183e-05, + "loss": 0.4169, + "step": 15737 + }, + { + "epoch": 20.202824133504492, + "grad_norm": 1.297316074371338, + "learning_rate": 2.6602053915275995e-05, + "loss": 0.4041, + "step": 15738 + }, + { + "epoch": 20.20410783055199, + "grad_norm": 0.9651117324829102, + "learning_rate": 2.6601626016260164e-05, + "loss": 0.4395, + "step": 15739 + }, + { + "epoch": 20.205391527599488, + "grad_norm": 1.2387925386428833, + "learning_rate": 2.6601198117244332e-05, + "loss": 0.4215, + "step": 15740 + }, + { + "epoch": 20.206675224646983, + "grad_norm": 1.0430046319961548, + "learning_rate": 2.6600770218228497e-05, + "loss": 0.4003, + "step": 15741 + }, + { + "epoch": 20.20795892169448, + "grad_norm": 1.2141512632369995, + "learning_rate": 2.6600342319212665e-05, + "loss": 0.4417, + "step": 15742 + }, + { + "epoch": 20.20924261874198, + "grad_norm": 0.904974102973938, + "learning_rate": 2.6599914420196834e-05, + "loss": 0.4243, + "step": 15743 + }, 
+ { + "epoch": 20.210526315789473, + "grad_norm": 0.9434323310852051, + "learning_rate": 2.6599486521181002e-05, + "loss": 0.4301, + "step": 15744 + }, + { + "epoch": 20.21181001283697, + "grad_norm": 1.2780019044876099, + "learning_rate": 2.659905862216517e-05, + "loss": 0.4715, + "step": 15745 + }, + { + "epoch": 20.21309370988447, + "grad_norm": 1.0590360164642334, + "learning_rate": 2.659863072314934e-05, + "loss": 0.4594, + "step": 15746 + }, + { + "epoch": 20.214377406931963, + "grad_norm": 1.4147757291793823, + "learning_rate": 2.6598202824133504e-05, + "loss": 0.4269, + "step": 15747 + }, + { + "epoch": 20.21566110397946, + "grad_norm": 1.0555781126022339, + "learning_rate": 2.6597774925117672e-05, + "loss": 0.4334, + "step": 15748 + }, + { + "epoch": 20.21694480102696, + "grad_norm": 0.9276991486549377, + "learning_rate": 2.659734702610184e-05, + "loss": 0.4273, + "step": 15749 + }, + { + "epoch": 20.218228498074453, + "grad_norm": 1.6200000047683716, + "learning_rate": 2.6596919127086006e-05, + "loss": 0.4113, + "step": 15750 + }, + { + "epoch": 20.21951219512195, + "grad_norm": 2.232593059539795, + "learning_rate": 2.6596491228070178e-05, + "loss": 0.4258, + "step": 15751 + }, + { + "epoch": 20.22079589216945, + "grad_norm": 0.9096815586090088, + "learning_rate": 2.6596063329054343e-05, + "loss": 0.4102, + "step": 15752 + }, + { + "epoch": 20.222079589216946, + "grad_norm": 1.5124895572662354, + "learning_rate": 2.6595635430038515e-05, + "loss": 0.4532, + "step": 15753 + }, + { + "epoch": 20.22336328626444, + "grad_norm": 1.5472887754440308, + "learning_rate": 2.659520753102268e-05, + "loss": 0.4491, + "step": 15754 + }, + { + "epoch": 20.22464698331194, + "grad_norm": 0.9508328437805176, + "learning_rate": 2.6594779632006845e-05, + "loss": 0.4448, + "step": 15755 + }, + { + "epoch": 20.225930680359436, + "grad_norm": 1.6758918762207031, + "learning_rate": 2.6594351732991016e-05, + "loss": 0.4336, + "step": 15756 + }, + { + "epoch": 20.22721437740693, + 
"grad_norm": 1.1602675914764404, + "learning_rate": 2.659392383397518e-05, + "loss": 0.4161, + "step": 15757 + }, + { + "epoch": 20.22849807445443, + "grad_norm": 2.3458211421966553, + "learning_rate": 2.659349593495935e-05, + "loss": 0.4445, + "step": 15758 + }, + { + "epoch": 20.229781771501926, + "grad_norm": 1.6133493185043335, + "learning_rate": 2.6593068035943518e-05, + "loss": 0.4196, + "step": 15759 + }, + { + "epoch": 20.23106546854942, + "grad_norm": 2.3968100547790527, + "learning_rate": 2.6592640136927687e-05, + "loss": 0.4335, + "step": 15760 + }, + { + "epoch": 20.23234916559692, + "grad_norm": 1.5420591831207275, + "learning_rate": 2.6592212237911855e-05, + "loss": 0.4259, + "step": 15761 + }, + { + "epoch": 20.233632862644416, + "grad_norm": 3.341153860092163, + "learning_rate": 2.659178433889602e-05, + "loss": 0.4592, + "step": 15762 + }, + { + "epoch": 20.234916559691914, + "grad_norm": 4.725498676300049, + "learning_rate": 2.659135643988019e-05, + "loss": 0.4048, + "step": 15763 + }, + { + "epoch": 20.23620025673941, + "grad_norm": 2.639033794403076, + "learning_rate": 2.6590928540864357e-05, + "loss": 0.4805, + "step": 15764 + }, + { + "epoch": 20.237483953786906, + "grad_norm": 1.7456309795379639, + "learning_rate": 2.6590500641848525e-05, + "loss": 0.4144, + "step": 15765 + }, + { + "epoch": 20.238767650834404, + "grad_norm": 1.9718223810195923, + "learning_rate": 2.659007274283269e-05, + "loss": 0.4332, + "step": 15766 + }, + { + "epoch": 20.2400513478819, + "grad_norm": 7.266548156738281, + "learning_rate": 2.6589644843816862e-05, + "loss": 0.4225, + "step": 15767 + }, + { + "epoch": 20.241335044929397, + "grad_norm": 1.2353612184524536, + "learning_rate": 2.6589216944801027e-05, + "loss": 0.4731, + "step": 15768 + }, + { + "epoch": 20.242618741976894, + "grad_norm": 1.950889229774475, + "learning_rate": 2.6588789045785196e-05, + "loss": 0.4279, + "step": 15769 + }, + { + "epoch": 20.24390243902439, + "grad_norm": 3.5670599937438965, + 
"learning_rate": 2.6588361146769364e-05, + "loss": 0.4072, + "step": 15770 + }, + { + "epoch": 20.245186136071887, + "grad_norm": 1.2666321992874146, + "learning_rate": 2.658793324775353e-05, + "loss": 0.4184, + "step": 15771 + }, + { + "epoch": 20.246469833119384, + "grad_norm": 1.6693834066390991, + "learning_rate": 2.65875053487377e-05, + "loss": 0.4795, + "step": 15772 + }, + { + "epoch": 20.247753530166882, + "grad_norm": 0.985573410987854, + "learning_rate": 2.6587077449721866e-05, + "loss": 0.4384, + "step": 15773 + }, + { + "epoch": 20.249037227214377, + "grad_norm": 15.832405090332031, + "learning_rate": 2.6586649550706034e-05, + "loss": 0.4289, + "step": 15774 + }, + { + "epoch": 20.250320924261874, + "grad_norm": 4.556268692016602, + "learning_rate": 2.6586221651690203e-05, + "loss": 0.529, + "step": 15775 + }, + { + "epoch": 20.251604621309372, + "grad_norm": 1.2500685453414917, + "learning_rate": 2.6585793752674368e-05, + "loss": 0.4678, + "step": 15776 + }, + { + "epoch": 20.252888318356867, + "grad_norm": 1.961308240890503, + "learning_rate": 2.658536585365854e-05, + "loss": 0.4686, + "step": 15777 + }, + { + "epoch": 20.254172015404365, + "grad_norm": 3.573904037475586, + "learning_rate": 2.6584937954642704e-05, + "loss": 0.4759, + "step": 15778 + }, + { + "epoch": 20.255455712451862, + "grad_norm": 4.093413352966309, + "learning_rate": 2.6584510055626873e-05, + "loss": 0.5788, + "step": 15779 + }, + { + "epoch": 20.256739409499357, + "grad_norm": 1.820011019706726, + "learning_rate": 2.658408215661104e-05, + "loss": 0.6797, + "step": 15780 + }, + { + "epoch": 20.258023106546855, + "grad_norm": 1.338868498802185, + "learning_rate": 2.6583654257595206e-05, + "loss": 0.3958, + "step": 15781 + }, + { + "epoch": 20.259306803594352, + "grad_norm": 3.7717220783233643, + "learning_rate": 2.6583226358579375e-05, + "loss": 0.4351, + "step": 15782 + }, + { + "epoch": 20.260590500641847, + "grad_norm": 1.5740817785263062, + "learning_rate": 
2.6582798459563543e-05, + "loss": 0.3943, + "step": 15783 + }, + { + "epoch": 20.261874197689345, + "grad_norm": 1.3268190622329712, + "learning_rate": 2.658237056054771e-05, + "loss": 0.4244, + "step": 15784 + }, + { + "epoch": 20.263157894736842, + "grad_norm": 1.3510514497756958, + "learning_rate": 2.658194266153188e-05, + "loss": 0.3857, + "step": 15785 + }, + { + "epoch": 20.26444159178434, + "grad_norm": 3.6058313846588135, + "learning_rate": 2.658151476251605e-05, + "loss": 0.4138, + "step": 15786 + }, + { + "epoch": 20.265725288831835, + "grad_norm": 2.3745827674865723, + "learning_rate": 2.6581086863500213e-05, + "loss": 0.3884, + "step": 15787 + }, + { + "epoch": 20.267008985879333, + "grad_norm": 1.2232972383499146, + "learning_rate": 2.6580658964484382e-05, + "loss": 0.4184, + "step": 15788 + }, + { + "epoch": 20.26829268292683, + "grad_norm": 1.4101309776306152, + "learning_rate": 2.658023106546855e-05, + "loss": 0.4122, + "step": 15789 + }, + { + "epoch": 20.269576379974325, + "grad_norm": 1.1657997369766235, + "learning_rate": 2.6579803166452715e-05, + "loss": 0.4192, + "step": 15790 + }, + { + "epoch": 20.270860077021823, + "grad_norm": 1.2954622507095337, + "learning_rate": 2.6579375267436887e-05, + "loss": 0.4128, + "step": 15791 + }, + { + "epoch": 20.27214377406932, + "grad_norm": 6.228112697601318, + "learning_rate": 2.6578947368421052e-05, + "loss": 0.4211, + "step": 15792 + }, + { + "epoch": 20.273427471116815, + "grad_norm": 1.0337481498718262, + "learning_rate": 2.6578519469405224e-05, + "loss": 0.4229, + "step": 15793 + }, + { + "epoch": 20.274711168164313, + "grad_norm": 1.7636666297912598, + "learning_rate": 2.657809157038939e-05, + "loss": 0.4161, + "step": 15794 + }, + { + "epoch": 20.27599486521181, + "grad_norm": 0.929568350315094, + "learning_rate": 2.6577663671373554e-05, + "loss": 0.4188, + "step": 15795 + }, + { + "epoch": 20.27727856225931, + "grad_norm": 2.6265616416931152, + "learning_rate": 2.6577235772357726e-05, + "loss": 
0.4023, + "step": 15796 + }, + { + "epoch": 20.278562259306803, + "grad_norm": 1.1073170900344849, + "learning_rate": 2.657680787334189e-05, + "loss": 0.4159, + "step": 15797 + }, + { + "epoch": 20.2798459563543, + "grad_norm": 0.9818645715713501, + "learning_rate": 2.657637997432606e-05, + "loss": 0.4229, + "step": 15798 + }, + { + "epoch": 20.2811296534018, + "grad_norm": 1.0254745483398438, + "learning_rate": 2.6575952075310228e-05, + "loss": 0.4171, + "step": 15799 + }, + { + "epoch": 20.282413350449293, + "grad_norm": 1.6072402000427246, + "learning_rate": 2.6575524176294396e-05, + "loss": 0.4181, + "step": 15800 + }, + { + "epoch": 20.28369704749679, + "grad_norm": 1.2123888731002808, + "learning_rate": 2.6575096277278564e-05, + "loss": 0.4151, + "step": 15801 + }, + { + "epoch": 20.28498074454429, + "grad_norm": 1.2988687753677368, + "learning_rate": 2.657466837826273e-05, + "loss": 0.4484, + "step": 15802 + }, + { + "epoch": 20.286264441591783, + "grad_norm": 1.7305078506469727, + "learning_rate": 2.6574240479246898e-05, + "loss": 0.4317, + "step": 15803 + }, + { + "epoch": 20.28754813863928, + "grad_norm": 1.3157926797866821, + "learning_rate": 2.6573812580231066e-05, + "loss": 0.4833, + "step": 15804 + }, + { + "epoch": 20.28883183568678, + "grad_norm": 1.3580248355865479, + "learning_rate": 2.6573384681215235e-05, + "loss": 0.4568, + "step": 15805 + }, + { + "epoch": 20.290115532734276, + "grad_norm": 2.507448434829712, + "learning_rate": 2.65729567821994e-05, + "loss": 0.4559, + "step": 15806 + }, + { + "epoch": 20.29139922978177, + "grad_norm": 0.900110125541687, + "learning_rate": 2.657252888318357e-05, + "loss": 0.4422, + "step": 15807 + }, + { + "epoch": 20.29268292682927, + "grad_norm": 1.186072587966919, + "learning_rate": 2.6572100984167737e-05, + "loss": 0.4292, + "step": 15808 + }, + { + "epoch": 20.293966623876766, + "grad_norm": 1.3265578746795654, + "learning_rate": 2.6571673085151905e-05, + "loss": 0.4338, + "step": 15809 + }, + { + 
"epoch": 20.29525032092426, + "grad_norm": 2.640604019165039, + "learning_rate": 2.6571245186136073e-05, + "loss": 0.435, + "step": 15810 + }, + { + "epoch": 20.29653401797176, + "grad_norm": 1.7014812231063843, + "learning_rate": 2.657081728712024e-05, + "loss": 0.4082, + "step": 15811 + }, + { + "epoch": 20.297817715019256, + "grad_norm": 1.9894044399261475, + "learning_rate": 2.657038938810441e-05, + "loss": 0.4523, + "step": 15812 + }, + { + "epoch": 20.29910141206675, + "grad_norm": 1.3984355926513672, + "learning_rate": 2.6569961489088575e-05, + "loss": 0.4335, + "step": 15813 + }, + { + "epoch": 20.30038510911425, + "grad_norm": 0.9139813184738159, + "learning_rate": 2.6569533590072744e-05, + "loss": 0.4123, + "step": 15814 + }, + { + "epoch": 20.301668806161747, + "grad_norm": 1.2861003875732422, + "learning_rate": 2.6569105691056912e-05, + "loss": 0.4482, + "step": 15815 + }, + { + "epoch": 20.30295250320924, + "grad_norm": 3.089038372039795, + "learning_rate": 2.6568677792041077e-05, + "loss": 0.4224, + "step": 15816 + }, + { + "epoch": 20.30423620025674, + "grad_norm": 1.351772427558899, + "learning_rate": 2.656824989302525e-05, + "loss": 0.4437, + "step": 15817 + }, + { + "epoch": 20.305519897304237, + "grad_norm": 1.8912397623062134, + "learning_rate": 2.6567821994009414e-05, + "loss": 0.4847, + "step": 15818 + }, + { + "epoch": 20.306803594351734, + "grad_norm": 2.425626754760742, + "learning_rate": 2.6567394094993582e-05, + "loss": 0.4441, + "step": 15819 + }, + { + "epoch": 20.30808729139923, + "grad_norm": 1.7258353233337402, + "learning_rate": 2.656696619597775e-05, + "loss": 0.4245, + "step": 15820 + }, + { + "epoch": 20.309370988446727, + "grad_norm": 1.9412157535552979, + "learning_rate": 2.656653829696192e-05, + "loss": 0.4494, + "step": 15821 + }, + { + "epoch": 20.310654685494224, + "grad_norm": 1.75211763381958, + "learning_rate": 2.6566110397946084e-05, + "loss": 0.4845, + "step": 15822 + }, + { + "epoch": 20.31193838254172, + "grad_norm": 
2.1465935707092285, + "learning_rate": 2.6565682498930253e-05, + "loss": 0.4251, + "step": 15823 + }, + { + "epoch": 20.313222079589217, + "grad_norm": 3.0924248695373535, + "learning_rate": 2.656525459991442e-05, + "loss": 0.4703, + "step": 15824 + }, + { + "epoch": 20.314505776636715, + "grad_norm": 2.6971278190612793, + "learning_rate": 2.656482670089859e-05, + "loss": 0.463, + "step": 15825 + }, + { + "epoch": 20.31578947368421, + "grad_norm": 1.269313931465149, + "learning_rate": 2.6564398801882758e-05, + "loss": 0.5063, + "step": 15826 + }, + { + "epoch": 20.317073170731707, + "grad_norm": 2.844259738922119, + "learning_rate": 2.6563970902866923e-05, + "loss": 0.5303, + "step": 15827 + }, + { + "epoch": 20.318356867779205, + "grad_norm": 1.5509310960769653, + "learning_rate": 2.6563543003851095e-05, + "loss": 0.5058, + "step": 15828 + }, + { + "epoch": 20.319640564826702, + "grad_norm": 1.4012186527252197, + "learning_rate": 2.656311510483526e-05, + "loss": 0.5536, + "step": 15829 + }, + { + "epoch": 20.320924261874197, + "grad_norm": 2.2631208896636963, + "learning_rate": 2.6562687205819425e-05, + "loss": 0.6802, + "step": 15830 + }, + { + "epoch": 20.322207958921695, + "grad_norm": 1.4225431680679321, + "learning_rate": 2.6562259306803596e-05, + "loss": 0.3884, + "step": 15831 + }, + { + "epoch": 20.323491655969192, + "grad_norm": 1.1827107667922974, + "learning_rate": 2.656183140778776e-05, + "loss": 0.4092, + "step": 15832 + }, + { + "epoch": 20.324775353016687, + "grad_norm": 1.2016018629074097, + "learning_rate": 2.6561403508771933e-05, + "loss": 0.4164, + "step": 15833 + }, + { + "epoch": 20.326059050064185, + "grad_norm": 2.7681899070739746, + "learning_rate": 2.6560975609756098e-05, + "loss": 0.4575, + "step": 15834 + }, + { + "epoch": 20.327342747111683, + "grad_norm": 1.5663468837738037, + "learning_rate": 2.6560547710740267e-05, + "loss": 0.4275, + "step": 15835 + }, + { + "epoch": 20.328626444159177, + "grad_norm": 0.9192193150520325, + 
"learning_rate": 2.6560119811724435e-05, + "loss": 0.4029, + "step": 15836 + }, + { + "epoch": 20.329910141206675, + "grad_norm": 3.385270118713379, + "learning_rate": 2.65596919127086e-05, + "loss": 0.4213, + "step": 15837 + }, + { + "epoch": 20.331193838254173, + "grad_norm": 2.713472366333008, + "learning_rate": 2.655926401369277e-05, + "loss": 0.4865, + "step": 15838 + }, + { + "epoch": 20.33247753530167, + "grad_norm": 2.248976469039917, + "learning_rate": 2.6558836114676937e-05, + "loss": 0.4555, + "step": 15839 + }, + { + "epoch": 20.333761232349165, + "grad_norm": 1.8573185205459595, + "learning_rate": 2.6558408215661105e-05, + "loss": 0.4408, + "step": 15840 + }, + { + "epoch": 20.335044929396663, + "grad_norm": 1.1652501821517944, + "learning_rate": 2.6557980316645274e-05, + "loss": 0.4093, + "step": 15841 + }, + { + "epoch": 20.33632862644416, + "grad_norm": 1.7533036470413208, + "learning_rate": 2.655755241762944e-05, + "loss": 0.4147, + "step": 15842 + }, + { + "epoch": 20.337612323491655, + "grad_norm": 1.0872838497161865, + "learning_rate": 2.6557124518613607e-05, + "loss": 0.4202, + "step": 15843 + }, + { + "epoch": 20.338896020539153, + "grad_norm": 1.6225792169570923, + "learning_rate": 2.6556696619597776e-05, + "loss": 0.4318, + "step": 15844 + }, + { + "epoch": 20.34017971758665, + "grad_norm": 2.2371482849121094, + "learning_rate": 2.6556268720581944e-05, + "loss": 0.3948, + "step": 15845 + }, + { + "epoch": 20.341463414634145, + "grad_norm": 1.1402876377105713, + "learning_rate": 2.655584082156611e-05, + "loss": 0.4254, + "step": 15846 + }, + { + "epoch": 20.342747111681643, + "grad_norm": 1.0877734422683716, + "learning_rate": 2.655541292255028e-05, + "loss": 0.4073, + "step": 15847 + }, + { + "epoch": 20.34403080872914, + "grad_norm": 1.2197929620742798, + "learning_rate": 2.6554985023534446e-05, + "loss": 0.4009, + "step": 15848 + }, + { + "epoch": 20.345314505776635, + "grad_norm": 1.3406364917755127, + "learning_rate": 
2.6554557124518614e-05, + "loss": 0.4348, + "step": 15849 + }, + { + "epoch": 20.346598202824133, + "grad_norm": 1.3836570978164673, + "learning_rate": 2.6554129225502783e-05, + "loss": 0.4162, + "step": 15850 + }, + { + "epoch": 20.34788189987163, + "grad_norm": 1.3945053815841675, + "learning_rate": 2.6553701326486948e-05, + "loss": 0.4225, + "step": 15851 + }, + { + "epoch": 20.34916559691913, + "grad_norm": 1.4495347738265991, + "learning_rate": 2.655327342747112e-05, + "loss": 0.4384, + "step": 15852 + }, + { + "epoch": 20.350449293966623, + "grad_norm": 2.451490879058838, + "learning_rate": 2.6552845528455285e-05, + "loss": 0.4204, + "step": 15853 + }, + { + "epoch": 20.35173299101412, + "grad_norm": 3.526200771331787, + "learning_rate": 2.6552417629439453e-05, + "loss": 0.4254, + "step": 15854 + }, + { + "epoch": 20.35301668806162, + "grad_norm": 16.652935028076172, + "learning_rate": 2.655198973042362e-05, + "loss": 0.445, + "step": 15855 + }, + { + "epoch": 20.354300385109113, + "grad_norm": 2.3856360912323, + "learning_rate": 2.6551561831407786e-05, + "loss": 0.482, + "step": 15856 + }, + { + "epoch": 20.35558408215661, + "grad_norm": 1.6901510953903198, + "learning_rate": 2.6551133932391958e-05, + "loss": 0.4547, + "step": 15857 + }, + { + "epoch": 20.35686777920411, + "grad_norm": 1.4558696746826172, + "learning_rate": 2.6550706033376123e-05, + "loss": 0.4061, + "step": 15858 + }, + { + "epoch": 20.358151476251603, + "grad_norm": 1.4573789834976196, + "learning_rate": 2.655027813436029e-05, + "loss": 0.3811, + "step": 15859 + }, + { + "epoch": 20.3594351732991, + "grad_norm": 1.6396760940551758, + "learning_rate": 2.654985023534446e-05, + "loss": 0.4177, + "step": 15860 + }, + { + "epoch": 20.3607188703466, + "grad_norm": 3.7651615142822266, + "learning_rate": 2.654942233632863e-05, + "loss": 0.4339, + "step": 15861 + }, + { + "epoch": 20.362002567394097, + "grad_norm": 1.621389627456665, + "learning_rate": 2.6548994437312793e-05, + "loss": 0.4392, + 
"step": 15862 + }, + { + "epoch": 20.36328626444159, + "grad_norm": 2.0118629932403564, + "learning_rate": 2.6548566538296962e-05, + "loss": 0.3972, + "step": 15863 + }, + { + "epoch": 20.36456996148909, + "grad_norm": 12.671053886413574, + "learning_rate": 2.654813863928113e-05, + "loss": 0.4104, + "step": 15864 + }, + { + "epoch": 20.365853658536587, + "grad_norm": 1.579862356185913, + "learning_rate": 2.65477107402653e-05, + "loss": 0.45, + "step": 15865 + }, + { + "epoch": 20.36713735558408, + "grad_norm": 1.1418912410736084, + "learning_rate": 2.6547282841249467e-05, + "loss": 0.4122, + "step": 15866 + }, + { + "epoch": 20.36842105263158, + "grad_norm": 1.2176841497421265, + "learning_rate": 2.6546854942233632e-05, + "loss": 0.4172, + "step": 15867 + }, + { + "epoch": 20.369704749679077, + "grad_norm": 1.2622463703155518, + "learning_rate": 2.6546427043217804e-05, + "loss": 0.4563, + "step": 15868 + }, + { + "epoch": 20.37098844672657, + "grad_norm": 2.501432418823242, + "learning_rate": 2.654599914420197e-05, + "loss": 0.4605, + "step": 15869 + }, + { + "epoch": 20.37227214377407, + "grad_norm": 2.715254783630371, + "learning_rate": 2.6545571245186134e-05, + "loss": 0.4547, + "step": 15870 + }, + { + "epoch": 20.373555840821567, + "grad_norm": 1.7419416904449463, + "learning_rate": 2.6545143346170306e-05, + "loss": 0.3766, + "step": 15871 + }, + { + "epoch": 20.374839537869065, + "grad_norm": 2.355165481567383, + "learning_rate": 2.654471544715447e-05, + "loss": 0.5003, + "step": 15872 + }, + { + "epoch": 20.37612323491656, + "grad_norm": 3.633551836013794, + "learning_rate": 2.6544287548138643e-05, + "loss": 0.4737, + "step": 15873 + }, + { + "epoch": 20.377406931964057, + "grad_norm": 2.301910877227783, + "learning_rate": 2.6543859649122808e-05, + "loss": 0.4616, + "step": 15874 + }, + { + "epoch": 20.378690629011555, + "grad_norm": 2.574343681335449, + "learning_rate": 2.6543431750106976e-05, + "loss": 0.4863, + "step": 15875 + }, + { + "epoch": 
20.37997432605905, + "grad_norm": 5.201275825500488, + "learning_rate": 2.6543003851091144e-05, + "loss": 0.4665, + "step": 15876 + }, + { + "epoch": 20.381258023106547, + "grad_norm": 3.532719373703003, + "learning_rate": 2.654257595207531e-05, + "loss": 0.5057, + "step": 15877 + }, + { + "epoch": 20.382541720154045, + "grad_norm": 4.708061695098877, + "learning_rate": 2.6542148053059478e-05, + "loss": 0.5531, + "step": 15878 + }, + { + "epoch": 20.38382541720154, + "grad_norm": 2.428126335144043, + "learning_rate": 2.6541720154043646e-05, + "loss": 0.5313, + "step": 15879 + }, + { + "epoch": 20.385109114249037, + "grad_norm": 2.213236093521118, + "learning_rate": 2.6541292255027815e-05, + "loss": 0.64, + "step": 15880 + }, + { + "epoch": 20.386392811296535, + "grad_norm": 4.196563720703125, + "learning_rate": 2.6540864356011983e-05, + "loss": 0.4182, + "step": 15881 + }, + { + "epoch": 20.387676508344033, + "grad_norm": 1.225692629814148, + "learning_rate": 2.654043645699615e-05, + "loss": 0.3697, + "step": 15882 + }, + { + "epoch": 20.388960205391527, + "grad_norm": 3.415186643600464, + "learning_rate": 2.6540008557980317e-05, + "loss": 0.3839, + "step": 15883 + }, + { + "epoch": 20.390243902439025, + "grad_norm": 1.4042954444885254, + "learning_rate": 2.6539580658964485e-05, + "loss": 0.4427, + "step": 15884 + }, + { + "epoch": 20.391527599486523, + "grad_norm": 1.4749852418899536, + "learning_rate": 2.6539152759948653e-05, + "loss": 0.4247, + "step": 15885 + }, + { + "epoch": 20.392811296534017, + "grad_norm": 1.3614312410354614, + "learning_rate": 2.653872486093282e-05, + "loss": 0.4195, + "step": 15886 + }, + { + "epoch": 20.394094993581515, + "grad_norm": 1.2899153232574463, + "learning_rate": 2.653829696191699e-05, + "loss": 0.4786, + "step": 15887 + }, + { + "epoch": 20.395378690629013, + "grad_norm": 1.2898441553115845, + "learning_rate": 2.6537869062901155e-05, + "loss": 0.4663, + "step": 15888 + }, + { + "epoch": 20.396662387676507, + "grad_norm": 
1.617272973060608, + "learning_rate": 2.6537441163885327e-05, + "loss": 0.4886, + "step": 15889 + }, + { + "epoch": 20.397946084724005, + "grad_norm": 1.3992273807525635, + "learning_rate": 2.6537013264869492e-05, + "loss": 0.4472, + "step": 15890 + }, + { + "epoch": 20.399229781771503, + "grad_norm": 1.538506269454956, + "learning_rate": 2.6536585365853657e-05, + "loss": 0.4506, + "step": 15891 + }, + { + "epoch": 20.400513478818997, + "grad_norm": 2.2593753337860107, + "learning_rate": 2.653615746683783e-05, + "loss": 0.4412, + "step": 15892 + }, + { + "epoch": 20.401797175866495, + "grad_norm": 2.000617265701294, + "learning_rate": 2.6535729567821994e-05, + "loss": 0.3988, + "step": 15893 + }, + { + "epoch": 20.403080872913993, + "grad_norm": 2.0269131660461426, + "learning_rate": 2.6535301668806162e-05, + "loss": 0.424, + "step": 15894 + }, + { + "epoch": 20.40436456996149, + "grad_norm": 2.954256534576416, + "learning_rate": 2.653487376979033e-05, + "loss": 0.4119, + "step": 15895 + }, + { + "epoch": 20.405648267008985, + "grad_norm": 1.3087464570999146, + "learning_rate": 2.65344458707745e-05, + "loss": 0.4047, + "step": 15896 + }, + { + "epoch": 20.406931964056483, + "grad_norm": 12.1776762008667, + "learning_rate": 2.6534017971758668e-05, + "loss": 0.4222, + "step": 15897 + }, + { + "epoch": 20.40821566110398, + "grad_norm": 2.8121237754821777, + "learning_rate": 2.6533590072742833e-05, + "loss": 0.415, + "step": 15898 + }, + { + "epoch": 20.409499358151475, + "grad_norm": 1.4171303510665894, + "learning_rate": 2.6533162173727e-05, + "loss": 0.4237, + "step": 15899 + }, + { + "epoch": 20.410783055198973, + "grad_norm": 1.6007517576217651, + "learning_rate": 2.653273427471117e-05, + "loss": 0.4241, + "step": 15900 + }, + { + "epoch": 20.41206675224647, + "grad_norm": 2.42925763130188, + "learning_rate": 2.6532306375695338e-05, + "loss": 0.4441, + "step": 15901 + }, + { + "epoch": 20.413350449293965, + "grad_norm": 1.4338464736938477, + "learning_rate": 
2.6531878476679503e-05, + "loss": 0.4761, + "step": 15902 + }, + { + "epoch": 20.414634146341463, + "grad_norm": 1.760560154914856, + "learning_rate": 2.653145057766367e-05, + "loss": 0.4472, + "step": 15903 + }, + { + "epoch": 20.41591784338896, + "grad_norm": 2.321181535720825, + "learning_rate": 2.653102267864784e-05, + "loss": 0.4153, + "step": 15904 + }, + { + "epoch": 20.41720154043646, + "grad_norm": 1.779314398765564, + "learning_rate": 2.6530594779632005e-05, + "loss": 0.41, + "step": 15905 + }, + { + "epoch": 20.418485237483953, + "grad_norm": 3.857321262359619, + "learning_rate": 2.6530166880616176e-05, + "loss": 0.4551, + "step": 15906 + }, + { + "epoch": 20.41976893453145, + "grad_norm": 1.1520861387252808, + "learning_rate": 2.652973898160034e-05, + "loss": 0.4326, + "step": 15907 + }, + { + "epoch": 20.42105263157895, + "grad_norm": 1.289734959602356, + "learning_rate": 2.6529311082584513e-05, + "loss": 0.4361, + "step": 15908 + }, + { + "epoch": 20.422336328626443, + "grad_norm": 2.0473859310150146, + "learning_rate": 2.652888318356868e-05, + "loss": 0.4128, + "step": 15909 + }, + { + "epoch": 20.42362002567394, + "grad_norm": 3.179561138153076, + "learning_rate": 2.6528455284552843e-05, + "loss": 0.4425, + "step": 15910 + }, + { + "epoch": 20.42490372272144, + "grad_norm": 1.5056713819503784, + "learning_rate": 2.6528027385537015e-05, + "loss": 0.3976, + "step": 15911 + }, + { + "epoch": 20.426187419768933, + "grad_norm": 2.8567733764648438, + "learning_rate": 2.652759948652118e-05, + "loss": 0.4279, + "step": 15912 + }, + { + "epoch": 20.42747111681643, + "grad_norm": 1.1464016437530518, + "learning_rate": 2.652717158750535e-05, + "loss": 0.4344, + "step": 15913 + }, + { + "epoch": 20.42875481386393, + "grad_norm": 1.3978513479232788, + "learning_rate": 2.6526743688489517e-05, + "loss": 0.4048, + "step": 15914 + }, + { + "epoch": 20.430038510911427, + "grad_norm": 1.4567372798919678, + "learning_rate": 2.6526315789473685e-05, + "loss": 0.4088, + 
"step": 15915 + }, + { + "epoch": 20.43132220795892, + "grad_norm": 2.651660919189453, + "learning_rate": 2.6525887890457854e-05, + "loss": 0.503, + "step": 15916 + }, + { + "epoch": 20.43260590500642, + "grad_norm": 4.830177307128906, + "learning_rate": 2.652545999144202e-05, + "loss": 0.4877, + "step": 15917 + }, + { + "epoch": 20.433889602053917, + "grad_norm": 4.801399230957031, + "learning_rate": 2.6525032092426187e-05, + "loss": 0.4821, + "step": 15918 + }, + { + "epoch": 20.43517329910141, + "grad_norm": 1.270874261856079, + "learning_rate": 2.6524604193410356e-05, + "loss": 0.4614, + "step": 15919 + }, + { + "epoch": 20.43645699614891, + "grad_norm": 1.869701862335205, + "learning_rate": 2.6524176294394524e-05, + "loss": 0.4642, + "step": 15920 + }, + { + "epoch": 20.437740693196407, + "grad_norm": 1.239809513092041, + "learning_rate": 2.652374839537869e-05, + "loss": 0.452, + "step": 15921 + }, + { + "epoch": 20.4390243902439, + "grad_norm": 13.735870361328125, + "learning_rate": 2.652332049636286e-05, + "loss": 0.4815, + "step": 15922 + }, + { + "epoch": 20.4403080872914, + "grad_norm": 6.597353458404541, + "learning_rate": 2.6522892597347026e-05, + "loss": 0.5096, + "step": 15923 + }, + { + "epoch": 20.441591784338897, + "grad_norm": 6.509334564208984, + "learning_rate": 2.6522464698331194e-05, + "loss": 0.3976, + "step": 15924 + }, + { + "epoch": 20.44287548138639, + "grad_norm": 2.3647656440734863, + "learning_rate": 2.6522036799315363e-05, + "loss": 0.4557, + "step": 15925 + }, + { + "epoch": 20.44415917843389, + "grad_norm": 1.650089144706726, + "learning_rate": 2.6521608900299528e-05, + "loss": 0.4734, + "step": 15926 + }, + { + "epoch": 20.445442875481387, + "grad_norm": 1.3090777397155762, + "learning_rate": 2.65211810012837e-05, + "loss": 0.4857, + "step": 15927 + }, + { + "epoch": 20.446726572528885, + "grad_norm": 4.172025203704834, + "learning_rate": 2.6520753102267865e-05, + "loss": 0.5167, + "step": 15928 + }, + { + "epoch": 
20.44801026957638, + "grad_norm": 1.389424204826355, + "learning_rate": 2.6520325203252033e-05, + "loss": 0.5137, + "step": 15929 + }, + { + "epoch": 20.449293966623877, + "grad_norm": 2.160041570663452, + "learning_rate": 2.65198973042362e-05, + "loss": 0.6866, + "step": 15930 + }, + { + "epoch": 20.450577663671375, + "grad_norm": 1.5691585540771484, + "learning_rate": 2.6519469405220366e-05, + "loss": 0.4243, + "step": 15931 + }, + { + "epoch": 20.45186136071887, + "grad_norm": 2.8272860050201416, + "learning_rate": 2.6519041506204538e-05, + "loss": 0.4239, + "step": 15932 + }, + { + "epoch": 20.453145057766367, + "grad_norm": 1.1821341514587402, + "learning_rate": 2.6518613607188703e-05, + "loss": 0.4264, + "step": 15933 + }, + { + "epoch": 20.454428754813865, + "grad_norm": 1.53467857837677, + "learning_rate": 2.651818570817287e-05, + "loss": 0.4104, + "step": 15934 + }, + { + "epoch": 20.45571245186136, + "grad_norm": 1.01466703414917, + "learning_rate": 2.651775780915704e-05, + "loss": 0.4283, + "step": 15935 + }, + { + "epoch": 20.456996148908857, + "grad_norm": 1.8418006896972656, + "learning_rate": 2.651732991014121e-05, + "loss": 0.4003, + "step": 15936 + }, + { + "epoch": 20.458279845956355, + "grad_norm": 1.7198361158370972, + "learning_rate": 2.6516902011125374e-05, + "loss": 0.4115, + "step": 15937 + }, + { + "epoch": 20.459563543003853, + "grad_norm": 1.2904731035232544, + "learning_rate": 2.6516474112109542e-05, + "loss": 0.4141, + "step": 15938 + }, + { + "epoch": 20.460847240051347, + "grad_norm": 0.9597405791282654, + "learning_rate": 2.651604621309371e-05, + "loss": 0.4196, + "step": 15939 + }, + { + "epoch": 20.462130937098845, + "grad_norm": 1.0205857753753662, + "learning_rate": 2.651561831407788e-05, + "loss": 0.4029, + "step": 15940 + }, + { + "epoch": 20.463414634146343, + "grad_norm": 0.7371537685394287, + "learning_rate": 2.6515190415062047e-05, + "loss": 0.3975, + "step": 15941 + }, + { + "epoch": 20.464698331193837, + "grad_norm": 
1.136122703552246, + "learning_rate": 2.6514762516046212e-05, + "loss": 0.4382, + "step": 15942 + }, + { + "epoch": 20.465982028241335, + "grad_norm": 1.5238640308380127, + "learning_rate": 2.6514334617030384e-05, + "loss": 0.4332, + "step": 15943 + }, + { + "epoch": 20.467265725288833, + "grad_norm": 1.7111457586288452, + "learning_rate": 2.651390671801455e-05, + "loss": 0.4204, + "step": 15944 + }, + { + "epoch": 20.468549422336327, + "grad_norm": 1.7393686771392822, + "learning_rate": 2.6513478818998714e-05, + "loss": 0.4633, + "step": 15945 + }, + { + "epoch": 20.469833119383825, + "grad_norm": 1.1190754175186157, + "learning_rate": 2.6513050919982886e-05, + "loss": 0.4113, + "step": 15946 + }, + { + "epoch": 20.471116816431323, + "grad_norm": 1.2635934352874756, + "learning_rate": 2.651262302096705e-05, + "loss": 0.4444, + "step": 15947 + }, + { + "epoch": 20.47240051347882, + "grad_norm": 0.92521071434021, + "learning_rate": 2.6512195121951223e-05, + "loss": 0.4232, + "step": 15948 + }, + { + "epoch": 20.473684210526315, + "grad_norm": 1.3588179349899292, + "learning_rate": 2.6511767222935388e-05, + "loss": 0.4273, + "step": 15949 + }, + { + "epoch": 20.474967907573813, + "grad_norm": 1.9096592664718628, + "learning_rate": 2.6511339323919556e-05, + "loss": 0.4182, + "step": 15950 + }, + { + "epoch": 20.47625160462131, + "grad_norm": 1.1365039348602295, + "learning_rate": 2.6510911424903725e-05, + "loss": 0.4387, + "step": 15951 + }, + { + "epoch": 20.477535301668805, + "grad_norm": 1.1621744632720947, + "learning_rate": 2.651048352588789e-05, + "loss": 0.4525, + "step": 15952 + }, + { + "epoch": 20.478818998716303, + "grad_norm": 1.9303137063980103, + "learning_rate": 2.6510055626872058e-05, + "loss": 0.4426, + "step": 15953 + }, + { + "epoch": 20.4801026957638, + "grad_norm": 2.2410519123077393, + "learning_rate": 2.6509627727856226e-05, + "loss": 0.3987, + "step": 15954 + }, + { + "epoch": 20.481386392811295, + "grad_norm": 2.613614559173584, + 
"learning_rate": 2.6509199828840395e-05, + "loss": 0.4026, + "step": 15955 + }, + { + "epoch": 20.482670089858793, + "grad_norm": 1.5220866203308105, + "learning_rate": 2.6508771929824563e-05, + "loss": 0.409, + "step": 15956 + }, + { + "epoch": 20.48395378690629, + "grad_norm": 0.9485884308815002, + "learning_rate": 2.650834403080873e-05, + "loss": 0.4491, + "step": 15957 + }, + { + "epoch": 20.485237483953785, + "grad_norm": 1.0361477136611938, + "learning_rate": 2.6507916131792897e-05, + "loss": 0.4388, + "step": 15958 + }, + { + "epoch": 20.486521181001283, + "grad_norm": 2.0401360988616943, + "learning_rate": 2.6507488232777065e-05, + "loss": 0.4498, + "step": 15959 + }, + { + "epoch": 20.48780487804878, + "grad_norm": 1.2857586145401, + "learning_rate": 2.6507060333761233e-05, + "loss": 0.4499, + "step": 15960 + }, + { + "epoch": 20.48908857509628, + "grad_norm": 0.969992458820343, + "learning_rate": 2.65066324347454e-05, + "loss": 0.4136, + "step": 15961 + }, + { + "epoch": 20.490372272143773, + "grad_norm": 2.5975115299224854, + "learning_rate": 2.650620453572957e-05, + "loss": 0.3931, + "step": 15962 + }, + { + "epoch": 20.49165596919127, + "grad_norm": 1.1313514709472656, + "learning_rate": 2.6505776636713735e-05, + "loss": 0.4279, + "step": 15963 + }, + { + "epoch": 20.49293966623877, + "grad_norm": 1.998176097869873, + "learning_rate": 2.6505348737697904e-05, + "loss": 0.4687, + "step": 15964 + }, + { + "epoch": 20.494223363286263, + "grad_norm": 2.849008798599243, + "learning_rate": 2.6504920838682072e-05, + "loss": 0.4382, + "step": 15965 + }, + { + "epoch": 20.49550706033376, + "grad_norm": 1.2104171514511108, + "learning_rate": 2.6504492939666237e-05, + "loss": 0.4541, + "step": 15966 + }, + { + "epoch": 20.49679075738126, + "grad_norm": 2.35929012298584, + "learning_rate": 2.650406504065041e-05, + "loss": 0.4312, + "step": 15967 + }, + { + "epoch": 20.498074454428753, + "grad_norm": 1.9576352834701538, + "learning_rate": 2.6503637141634574e-05, + 
"loss": 0.4749, + "step": 15968 + }, + { + "epoch": 20.49935815147625, + "grad_norm": 2.1217751502990723, + "learning_rate": 2.6503209242618742e-05, + "loss": 0.4548, + "step": 15969 + }, + { + "epoch": 20.50064184852375, + "grad_norm": 3.303635358810425, + "learning_rate": 2.650278134360291e-05, + "loss": 0.462, + "step": 15970 + }, + { + "epoch": 20.501925545571247, + "grad_norm": 2.2733311653137207, + "learning_rate": 2.6502353444587076e-05, + "loss": 0.4495, + "step": 15971 + }, + { + "epoch": 20.50320924261874, + "grad_norm": 1.224708914756775, + "learning_rate": 2.6501925545571248e-05, + "loss": 0.5187, + "step": 15972 + }, + { + "epoch": 20.50449293966624, + "grad_norm": 25.379241943359375, + "learning_rate": 2.6501497646555413e-05, + "loss": 0.4806, + "step": 15973 + }, + { + "epoch": 20.505776636713737, + "grad_norm": 1.7892799377441406, + "learning_rate": 2.650106974753958e-05, + "loss": 0.4442, + "step": 15974 + }, + { + "epoch": 20.50706033376123, + "grad_norm": 1.5116149187088013, + "learning_rate": 2.650064184852375e-05, + "loss": 0.4422, + "step": 15975 + }, + { + "epoch": 20.50834403080873, + "grad_norm": 9.383234024047852, + "learning_rate": 2.6500213949507918e-05, + "loss": 0.5331, + "step": 15976 + }, + { + "epoch": 20.509627727856227, + "grad_norm": 2.997267484664917, + "learning_rate": 2.6499786050492083e-05, + "loss": 0.4733, + "step": 15977 + }, + { + "epoch": 20.51091142490372, + "grad_norm": 3.9177746772766113, + "learning_rate": 2.649935815147625e-05, + "loss": 0.5308, + "step": 15978 + }, + { + "epoch": 20.51219512195122, + "grad_norm": 2.429333448410034, + "learning_rate": 2.649893025246042e-05, + "loss": 0.5967, + "step": 15979 + }, + { + "epoch": 20.513478818998717, + "grad_norm": 7.957846641540527, + "learning_rate": 2.6498502353444588e-05, + "loss": 0.6525, + "step": 15980 + }, + { + "epoch": 20.514762516046215, + "grad_norm": 2.1880991458892822, + "learning_rate": 2.6498074454428757e-05, + "loss": 0.42, + "step": 15981 + }, + { + 
"epoch": 20.51604621309371, + "grad_norm": 1.1746819019317627, + "learning_rate": 2.649764655541292e-05, + "loss": 0.4048, + "step": 15982 + }, + { + "epoch": 20.517329910141207, + "grad_norm": 0.9195306897163391, + "learning_rate": 2.6497218656397093e-05, + "loss": 0.433, + "step": 15983 + }, + { + "epoch": 20.518613607188705, + "grad_norm": 1.415032982826233, + "learning_rate": 2.649679075738126e-05, + "loss": 0.4339, + "step": 15984 + }, + { + "epoch": 20.5198973042362, + "grad_norm": 1.5880903005599976, + "learning_rate": 2.6496362858365423e-05, + "loss": 0.4409, + "step": 15985 + }, + { + "epoch": 20.521181001283697, + "grad_norm": 2.038329601287842, + "learning_rate": 2.6495934959349595e-05, + "loss": 0.3995, + "step": 15986 + }, + { + "epoch": 20.522464698331195, + "grad_norm": 2.4713611602783203, + "learning_rate": 2.649550706033376e-05, + "loss": 0.3947, + "step": 15987 + }, + { + "epoch": 20.52374839537869, + "grad_norm": 2.7242069244384766, + "learning_rate": 2.6495079161317932e-05, + "loss": 0.4505, + "step": 15988 + }, + { + "epoch": 20.525032092426187, + "grad_norm": 1.7453304529190063, + "learning_rate": 2.6494651262302097e-05, + "loss": 0.4705, + "step": 15989 + }, + { + "epoch": 20.526315789473685, + "grad_norm": 0.9642501473426819, + "learning_rate": 2.6494223363286265e-05, + "loss": 0.4341, + "step": 15990 + }, + { + "epoch": 20.527599486521183, + "grad_norm": 0.9687753319740295, + "learning_rate": 2.6493795464270434e-05, + "loss": 0.432, + "step": 15991 + }, + { + "epoch": 20.528883183568677, + "grad_norm": 1.18849778175354, + "learning_rate": 2.64933675652546e-05, + "loss": 0.4524, + "step": 15992 + }, + { + "epoch": 20.530166880616175, + "grad_norm": 1.9422383308410645, + "learning_rate": 2.6492939666238767e-05, + "loss": 0.4181, + "step": 15993 + }, + { + "epoch": 20.531450577663673, + "grad_norm": 0.7843466997146606, + "learning_rate": 2.6492511767222936e-05, + "loss": 0.369, + "step": 15994 + }, + { + "epoch": 20.532734274711167, + 
"grad_norm": 1.2876778841018677, + "learning_rate": 2.6492083868207104e-05, + "loss": 0.4262, + "step": 15995 + }, + { + "epoch": 20.534017971758665, + "grad_norm": 1.4919766187667847, + "learning_rate": 2.6491655969191273e-05, + "loss": 0.4154, + "step": 15996 + }, + { + "epoch": 20.535301668806163, + "grad_norm": 1.073559045791626, + "learning_rate": 2.649122807017544e-05, + "loss": 0.4264, + "step": 15997 + }, + { + "epoch": 20.536585365853657, + "grad_norm": 1.3786524534225464, + "learning_rate": 2.6490800171159606e-05, + "loss": 0.4334, + "step": 15998 + }, + { + "epoch": 20.537869062901155, + "grad_norm": 0.9879610538482666, + "learning_rate": 2.6490372272143774e-05, + "loss": 0.4035, + "step": 15999 + }, + { + "epoch": 20.539152759948653, + "grad_norm": 1.9320077896118164, + "learning_rate": 2.6489944373127943e-05, + "loss": 0.4316, + "step": 16000 + }, + { + "epoch": 20.539152759948653, + "eval_cer": 0.28279812022713924, + "eval_loss": 0.5025850534439087, + "eval_runtime": 14.4105, + "eval_samples_per_second": 68.214, + "eval_steps_per_second": 0.486, + "eval_wer": 0.4837193304288007, + "step": 16000 + }, + { + "epoch": 20.540436456996147, + "grad_norm": 3.020934820175171, + "learning_rate": 2.6489516474112108e-05, + "loss": 0.4486, + "step": 16001 + }, + { + "epoch": 20.541720154043645, + "grad_norm": 1.2644951343536377, + "learning_rate": 2.648908857509628e-05, + "loss": 0.442, + "step": 16002 + }, + { + "epoch": 20.543003851091143, + "grad_norm": 1.85248863697052, + "learning_rate": 2.6488660676080445e-05, + "loss": 0.4261, + "step": 16003 + }, + { + "epoch": 20.54428754813864, + "grad_norm": 0.9254617691040039, + "learning_rate": 2.6488232777064616e-05, + "loss": 0.4189, + "step": 16004 + }, + { + "epoch": 20.545571245186135, + "grad_norm": 1.5876598358154297, + "learning_rate": 2.648780487804878e-05, + "loss": 0.4327, + "step": 16005 + }, + { + "epoch": 20.546854942233633, + "grad_norm": 1.7143242359161377, + "learning_rate": 2.6487376979032947e-05, + 
"loss": 0.4658, + "step": 16006 + }, + { + "epoch": 20.54813863928113, + "grad_norm": 1.263726830482483, + "learning_rate": 2.648694908001712e-05, + "loss": 0.4087, + "step": 16007 + }, + { + "epoch": 20.549422336328625, + "grad_norm": 3.1242663860321045, + "learning_rate": 2.6486521181001283e-05, + "loss": 0.4216, + "step": 16008 + }, + { + "epoch": 20.550706033376123, + "grad_norm": 1.5974985361099243, + "learning_rate": 2.6486093281985452e-05, + "loss": 0.4227, + "step": 16009 + }, + { + "epoch": 20.55198973042362, + "grad_norm": 0.9888717532157898, + "learning_rate": 2.648566538296962e-05, + "loss": 0.412, + "step": 16010 + }, + { + "epoch": 20.553273427471115, + "grad_norm": 1.6140695810317993, + "learning_rate": 2.648523748395379e-05, + "loss": 0.4349, + "step": 16011 + }, + { + "epoch": 20.554557124518613, + "grad_norm": 1.4067288637161255, + "learning_rate": 2.6484809584937957e-05, + "loss": 0.4251, + "step": 16012 + }, + { + "epoch": 20.55584082156611, + "grad_norm": 1.215930700302124, + "learning_rate": 2.6484381685922122e-05, + "loss": 0.4418, + "step": 16013 + }, + { + "epoch": 20.55712451861361, + "grad_norm": 5.515439987182617, + "learning_rate": 2.648395378690629e-05, + "loss": 0.4431, + "step": 16014 + }, + { + "epoch": 20.558408215661103, + "grad_norm": 2.5531041622161865, + "learning_rate": 2.648352588789046e-05, + "loss": 0.4487, + "step": 16015 + }, + { + "epoch": 20.5596919127086, + "grad_norm": 1.9521746635437012, + "learning_rate": 2.6483097988874627e-05, + "loss": 0.4326, + "step": 16016 + }, + { + "epoch": 20.5609756097561, + "grad_norm": 1.0058109760284424, + "learning_rate": 2.6482670089858792e-05, + "loss": 0.4801, + "step": 16017 + }, + { + "epoch": 20.562259306803593, + "grad_norm": 2.4485442638397217, + "learning_rate": 2.6482242190842964e-05, + "loss": 0.4224, + "step": 16018 + }, + { + "epoch": 20.56354300385109, + "grad_norm": 1.0601656436920166, + "learning_rate": 2.648181429182713e-05, + "loss": 0.4129, + "step": 16019 + }, + { + 
"epoch": 20.56482670089859, + "grad_norm": 1.2905311584472656, + "learning_rate": 2.6481386392811298e-05, + "loss": 0.455, + "step": 16020 + }, + { + "epoch": 20.566110397946083, + "grad_norm": 1.712104320526123, + "learning_rate": 2.6480958493795466e-05, + "loss": 0.4694, + "step": 16021 + }, + { + "epoch": 20.56739409499358, + "grad_norm": 3.9729838371276855, + "learning_rate": 2.648053059477963e-05, + "loss": 0.4765, + "step": 16022 + }, + { + "epoch": 20.56867779204108, + "grad_norm": 2.7768356800079346, + "learning_rate": 2.6480102695763803e-05, + "loss": 0.5284, + "step": 16023 + }, + { + "epoch": 20.569961489088577, + "grad_norm": 1.283767580986023, + "learning_rate": 2.6479674796747968e-05, + "loss": 0.4309, + "step": 16024 + }, + { + "epoch": 20.57124518613607, + "grad_norm": 1.663847804069519, + "learning_rate": 2.6479246897732133e-05, + "loss": 0.482, + "step": 16025 + }, + { + "epoch": 20.57252888318357, + "grad_norm": 1.7214851379394531, + "learning_rate": 2.6478818998716305e-05, + "loss": 0.4291, + "step": 16026 + }, + { + "epoch": 20.573812580231067, + "grad_norm": 1.7883878946304321, + "learning_rate": 2.647839109970047e-05, + "loss": 0.4874, + "step": 16027 + }, + { + "epoch": 20.57509627727856, + "grad_norm": 3.61832332611084, + "learning_rate": 2.647796320068464e-05, + "loss": 0.4629, + "step": 16028 + }, + { + "epoch": 20.57637997432606, + "grad_norm": 2.0690717697143555, + "learning_rate": 2.6477535301668806e-05, + "loss": 0.5552, + "step": 16029 + }, + { + "epoch": 20.577663671373557, + "grad_norm": 1.6334280967712402, + "learning_rate": 2.6477107402652975e-05, + "loss": 0.6244, + "step": 16030 + }, + { + "epoch": 20.57894736842105, + "grad_norm": 1.524617075920105, + "learning_rate": 2.6476679503637143e-05, + "loss": 0.4111, + "step": 16031 + }, + { + "epoch": 20.58023106546855, + "grad_norm": 1.5144926309585571, + "learning_rate": 2.6476251604621308e-05, + "loss": 0.4164, + "step": 16032 + }, + { + "epoch": 20.581514762516047, + "grad_norm": 
1.2540639638900757, + "learning_rate": 2.6475823705605477e-05, + "loss": 0.4094, + "step": 16033 + }, + { + "epoch": 20.58279845956354, + "grad_norm": 1.0488414764404297, + "learning_rate": 2.6475395806589645e-05, + "loss": 0.4466, + "step": 16034 + }, + { + "epoch": 20.58408215661104, + "grad_norm": 0.9400504231452942, + "learning_rate": 2.6474967907573814e-05, + "loss": 0.4153, + "step": 16035 + }, + { + "epoch": 20.585365853658537, + "grad_norm": 1.1628427505493164, + "learning_rate": 2.6474540008557982e-05, + "loss": 0.4312, + "step": 16036 + }, + { + "epoch": 20.586649550706035, + "grad_norm": 1.4031976461410522, + "learning_rate": 2.647411210954215e-05, + "loss": 0.4482, + "step": 16037 + }, + { + "epoch": 20.58793324775353, + "grad_norm": 1.2793124914169312, + "learning_rate": 2.6473684210526315e-05, + "loss": 0.392, + "step": 16038 + }, + { + "epoch": 20.589216944801027, + "grad_norm": 1.783652424812317, + "learning_rate": 2.6473256311510484e-05, + "loss": 0.3896, + "step": 16039 + }, + { + "epoch": 20.590500641848525, + "grad_norm": 1.4305098056793213, + "learning_rate": 2.6472828412494652e-05, + "loss": 0.4306, + "step": 16040 + }, + { + "epoch": 20.59178433889602, + "grad_norm": 1.7496633529663086, + "learning_rate": 2.6472400513478817e-05, + "loss": 0.3898, + "step": 16041 + }, + { + "epoch": 20.593068035943517, + "grad_norm": 4.192530155181885, + "learning_rate": 2.647197261446299e-05, + "loss": 0.4348, + "step": 16042 + }, + { + "epoch": 20.594351732991015, + "grad_norm": 1.1602753400802612, + "learning_rate": 2.6471544715447154e-05, + "loss": 0.4065, + "step": 16043 + }, + { + "epoch": 20.59563543003851, + "grad_norm": 1.6183714866638184, + "learning_rate": 2.6471116816431326e-05, + "loss": 0.4144, + "step": 16044 + }, + { + "epoch": 20.596919127086007, + "grad_norm": 0.856163740158081, + "learning_rate": 2.647068891741549e-05, + "loss": 0.4213, + "step": 16045 + }, + { + "epoch": 20.598202824133505, + "grad_norm": 4.102535247802734, + 
"learning_rate": 2.6470261018399656e-05, + "loss": 0.4555, + "step": 16046 + }, + { + "epoch": 20.599486521181003, + "grad_norm": 2.93068528175354, + "learning_rate": 2.6469833119383828e-05, + "loss": 0.4709, + "step": 16047 + }, + { + "epoch": 20.600770218228497, + "grad_norm": 3.8734633922576904, + "learning_rate": 2.6469405220367993e-05, + "loss": 0.474, + "step": 16048 + }, + { + "epoch": 20.602053915275995, + "grad_norm": 1.33416748046875, + "learning_rate": 2.646897732135216e-05, + "loss": 0.4612, + "step": 16049 + }, + { + "epoch": 20.603337612323493, + "grad_norm": 1.0129096508026123, + "learning_rate": 2.646854942233633e-05, + "loss": 0.4674, + "step": 16050 + }, + { + "epoch": 20.604621309370987, + "grad_norm": 1.9472781419754028, + "learning_rate": 2.6468121523320498e-05, + "loss": 0.4213, + "step": 16051 + }, + { + "epoch": 20.605905006418485, + "grad_norm": 2.3357748985290527, + "learning_rate": 2.6467693624304666e-05, + "loss": 0.462, + "step": 16052 + }, + { + "epoch": 20.607188703465983, + "grad_norm": 2.8151004314422607, + "learning_rate": 2.646726572528883e-05, + "loss": 0.4487, + "step": 16053 + }, + { + "epoch": 20.608472400513477, + "grad_norm": 1.6315362453460693, + "learning_rate": 2.6466837826273e-05, + "loss": 0.4248, + "step": 16054 + }, + { + "epoch": 20.609756097560975, + "grad_norm": 3.417184591293335, + "learning_rate": 2.6466409927257168e-05, + "loss": 0.4363, + "step": 16055 + }, + { + "epoch": 20.611039794608473, + "grad_norm": 1.2375918626785278, + "learning_rate": 2.6465982028241337e-05, + "loss": 0.4305, + "step": 16056 + }, + { + "epoch": 20.61232349165597, + "grad_norm": 11.020870208740234, + "learning_rate": 2.64655541292255e-05, + "loss": 0.4111, + "step": 16057 + }, + { + "epoch": 20.613607188703465, + "grad_norm": 1.7865158319473267, + "learning_rate": 2.6465126230209673e-05, + "loss": 0.4748, + "step": 16058 + }, + { + "epoch": 20.614890885750963, + "grad_norm": 6.348862171173096, + "learning_rate": 2.646469833119384e-05, 
+ "loss": 0.454, + "step": 16059 + }, + { + "epoch": 20.61617458279846, + "grad_norm": 1.6988085508346558, + "learning_rate": 2.6464270432178007e-05, + "loss": 0.4804, + "step": 16060 + }, + { + "epoch": 20.617458279845955, + "grad_norm": 1.4375865459442139, + "learning_rate": 2.6463842533162175e-05, + "loss": 0.4772, + "step": 16061 + }, + { + "epoch": 20.618741976893453, + "grad_norm": 2.4833269119262695, + "learning_rate": 2.646341463414634e-05, + "loss": 0.4709, + "step": 16062 + }, + { + "epoch": 20.62002567394095, + "grad_norm": 1.312058925628662, + "learning_rate": 2.6462986735130512e-05, + "loss": 0.4869, + "step": 16063 + }, + { + "epoch": 20.621309370988445, + "grad_norm": 1.806128740310669, + "learning_rate": 2.6462558836114677e-05, + "loss": 0.4633, + "step": 16064 + }, + { + "epoch": 20.622593068035943, + "grad_norm": 1.7822988033294678, + "learning_rate": 2.6462130937098846e-05, + "loss": 0.4312, + "step": 16065 + }, + { + "epoch": 20.62387676508344, + "grad_norm": 1.358552098274231, + "learning_rate": 2.6461703038083014e-05, + "loss": 0.4747, + "step": 16066 + }, + { + "epoch": 20.625160462130935, + "grad_norm": 1.2276349067687988, + "learning_rate": 2.646127513906718e-05, + "loss": 0.4093, + "step": 16067 + }, + { + "epoch": 20.626444159178433, + "grad_norm": 1.6891553401947021, + "learning_rate": 2.646084724005135e-05, + "loss": 0.5228, + "step": 16068 + }, + { + "epoch": 20.62772785622593, + "grad_norm": 2.318650960922241, + "learning_rate": 2.6460419341035516e-05, + "loss": 0.44, + "step": 16069 + }, + { + "epoch": 20.62901155327343, + "grad_norm": 2.9909865856170654, + "learning_rate": 2.6459991442019684e-05, + "loss": 0.4872, + "step": 16070 + }, + { + "epoch": 20.630295250320923, + "grad_norm": 4.078999042510986, + "learning_rate": 2.6459563543003853e-05, + "loss": 0.4959, + "step": 16071 + }, + { + "epoch": 20.63157894736842, + "grad_norm": 1.298828363418579, + "learning_rate": 2.645913564398802e-05, + "loss": 0.5147, + "step": 16072 + }, + { 
+ "epoch": 20.63286264441592, + "grad_norm": 1.6856157779693604, + "learning_rate": 2.6458707744972186e-05, + "loss": 0.4544, + "step": 16073 + }, + { + "epoch": 20.634146341463413, + "grad_norm": 1.6165202856063843, + "learning_rate": 2.6458279845956354e-05, + "loss": 0.4807, + "step": 16074 + }, + { + "epoch": 20.63543003851091, + "grad_norm": 1.6303883790969849, + "learning_rate": 2.6457851946940523e-05, + "loss": 0.4499, + "step": 16075 + }, + { + "epoch": 20.63671373555841, + "grad_norm": 1.278383731842041, + "learning_rate": 2.645742404792469e-05, + "loss": 0.4708, + "step": 16076 + }, + { + "epoch": 20.637997432605903, + "grad_norm": 2.1851816177368164, + "learning_rate": 2.645699614890886e-05, + "loss": 0.5404, + "step": 16077 + }, + { + "epoch": 20.6392811296534, + "grad_norm": 1.7774018049240112, + "learning_rate": 2.6456568249893025e-05, + "loss": 0.5324, + "step": 16078 + }, + { + "epoch": 20.6405648267009, + "grad_norm": 1.6862331628799438, + "learning_rate": 2.6456140350877197e-05, + "loss": 0.5444, + "step": 16079 + }, + { + "epoch": 20.641848523748397, + "grad_norm": 2.566640615463257, + "learning_rate": 2.645571245186136e-05, + "loss": 0.7068, + "step": 16080 + }, + { + "epoch": 20.64313222079589, + "grad_norm": 1.3420612812042236, + "learning_rate": 2.6455284552845527e-05, + "loss": 0.4125, + "step": 16081 + }, + { + "epoch": 20.64441591784339, + "grad_norm": 1.3403910398483276, + "learning_rate": 2.64548566538297e-05, + "loss": 0.4342, + "step": 16082 + }, + { + "epoch": 20.645699614890887, + "grad_norm": 1.1487693786621094, + "learning_rate": 2.6454428754813863e-05, + "loss": 0.3925, + "step": 16083 + }, + { + "epoch": 20.64698331193838, + "grad_norm": 1.636155605316162, + "learning_rate": 2.6454000855798035e-05, + "loss": 0.3937, + "step": 16084 + }, + { + "epoch": 20.64826700898588, + "grad_norm": 1.4428455829620361, + "learning_rate": 2.64535729567822e-05, + "loss": 0.4043, + "step": 16085 + }, + { + "epoch": 20.649550706033377, + 
"grad_norm": 1.1738451719284058, + "learning_rate": 2.6453145057766365e-05, + "loss": 0.4485, + "step": 16086 + }, + { + "epoch": 20.65083440308087, + "grad_norm": 1.4002571105957031, + "learning_rate": 2.6452717158750537e-05, + "loss": 0.3915, + "step": 16087 + }, + { + "epoch": 20.65211810012837, + "grad_norm": 0.9761350154876709, + "learning_rate": 2.6452289259734702e-05, + "loss": 0.438, + "step": 16088 + }, + { + "epoch": 20.653401797175867, + "grad_norm": 1.578930377960205, + "learning_rate": 2.645186136071887e-05, + "loss": 0.4343, + "step": 16089 + }, + { + "epoch": 20.654685494223365, + "grad_norm": 0.9507348537445068, + "learning_rate": 2.645143346170304e-05, + "loss": 0.4012, + "step": 16090 + }, + { + "epoch": 20.65596919127086, + "grad_norm": 1.3413262367248535, + "learning_rate": 2.6451005562687207e-05, + "loss": 0.4143, + "step": 16091 + }, + { + "epoch": 20.657252888318357, + "grad_norm": 1.3443665504455566, + "learning_rate": 2.6450577663671376e-05, + "loss": 0.4556, + "step": 16092 + }, + { + "epoch": 20.658536585365855, + "grad_norm": 1.6892287731170654, + "learning_rate": 2.645014976465554e-05, + "loss": 0.4247, + "step": 16093 + }, + { + "epoch": 20.65982028241335, + "grad_norm": 1.0745368003845215, + "learning_rate": 2.644972186563971e-05, + "loss": 0.4819, + "step": 16094 + }, + { + "epoch": 20.661103979460847, + "grad_norm": 1.6216297149658203, + "learning_rate": 2.6449293966623878e-05, + "loss": 0.4193, + "step": 16095 + }, + { + "epoch": 20.662387676508345, + "grad_norm": 1.955107569694519, + "learning_rate": 2.6448866067608046e-05, + "loss": 0.4461, + "step": 16096 + }, + { + "epoch": 20.66367137355584, + "grad_norm": 1.135788083076477, + "learning_rate": 2.644843816859221e-05, + "loss": 0.408, + "step": 16097 + }, + { + "epoch": 20.664955070603337, + "grad_norm": 1.3246568441390991, + "learning_rate": 2.6448010269576383e-05, + "loss": 0.3933, + "step": 16098 + }, + { + "epoch": 20.666238767650835, + "grad_norm": 1.010420799255371, + 
"learning_rate": 2.6447582370560548e-05, + "loss": 0.4542, + "step": 16099 + }, + { + "epoch": 20.66752246469833, + "grad_norm": 1.322869896888733, + "learning_rate": 2.6447154471544716e-05, + "loss": 0.4059, + "step": 16100 + }, + { + "epoch": 20.668806161745827, + "grad_norm": 2.3176567554473877, + "learning_rate": 2.6446726572528885e-05, + "loss": 0.4313, + "step": 16101 + }, + { + "epoch": 20.670089858793325, + "grad_norm": 1.0108246803283691, + "learning_rate": 2.644629867351305e-05, + "loss": 0.427, + "step": 16102 + }, + { + "epoch": 20.671373555840823, + "grad_norm": 1.5996527671813965, + "learning_rate": 2.644587077449722e-05, + "loss": 0.4143, + "step": 16103 + }, + { + "epoch": 20.672657252888317, + "grad_norm": 3.076366662979126, + "learning_rate": 2.6445442875481386e-05, + "loss": 0.3913, + "step": 16104 + }, + { + "epoch": 20.673940949935815, + "grad_norm": 1.6459780931472778, + "learning_rate": 2.6445014976465555e-05, + "loss": 0.4385, + "step": 16105 + }, + { + "epoch": 20.675224646983313, + "grad_norm": 1.1734291315078735, + "learning_rate": 2.6444587077449723e-05, + "loss": 0.4166, + "step": 16106 + }, + { + "epoch": 20.676508344030808, + "grad_norm": 5.563997268676758, + "learning_rate": 2.644415917843389e-05, + "loss": 0.4473, + "step": 16107 + }, + { + "epoch": 20.677792041078305, + "grad_norm": 0.926511287689209, + "learning_rate": 2.6443731279418057e-05, + "loss": 0.434, + "step": 16108 + }, + { + "epoch": 20.679075738125803, + "grad_norm": 1.8654789924621582, + "learning_rate": 2.6443303380402225e-05, + "loss": 0.4417, + "step": 16109 + }, + { + "epoch": 20.680359435173298, + "grad_norm": 1.8027657270431519, + "learning_rate": 2.6442875481386394e-05, + "loss": 0.5144, + "step": 16110 + }, + { + "epoch": 20.681643132220795, + "grad_norm": 1.4182941913604736, + "learning_rate": 2.6442447582370562e-05, + "loss": 0.3964, + "step": 16111 + }, + { + "epoch": 20.682926829268293, + "grad_norm": 0.8462675213813782, + "learning_rate": 
2.644201968335473e-05, + "loss": 0.3887, + "step": 16112 + }, + { + "epoch": 20.68421052631579, + "grad_norm": 8.383219718933105, + "learning_rate": 2.6441591784338895e-05, + "loss": 0.4244, + "step": 16113 + }, + { + "epoch": 20.685494223363285, + "grad_norm": 1.0895054340362549, + "learning_rate": 2.6441163885323064e-05, + "loss": 0.4799, + "step": 16114 + }, + { + "epoch": 20.686777920410783, + "grad_norm": 2.271259307861328, + "learning_rate": 2.6440735986307232e-05, + "loss": 0.4779, + "step": 16115 + }, + { + "epoch": 20.68806161745828, + "grad_norm": 1.1625193357467651, + "learning_rate": 2.6440308087291397e-05, + "loss": 0.4248, + "step": 16116 + }, + { + "epoch": 20.689345314505776, + "grad_norm": 1.4991943836212158, + "learning_rate": 2.643988018827557e-05, + "loss": 0.4666, + "step": 16117 + }, + { + "epoch": 20.690629011553273, + "grad_norm": 2.346684455871582, + "learning_rate": 2.6439452289259734e-05, + "loss": 0.4935, + "step": 16118 + }, + { + "epoch": 20.69191270860077, + "grad_norm": 8.04234504699707, + "learning_rate": 2.6439024390243906e-05, + "loss": 0.433, + "step": 16119 + }, + { + "epoch": 20.693196405648266, + "grad_norm": 4.132792949676514, + "learning_rate": 2.643859649122807e-05, + "loss": 0.4448, + "step": 16120 + }, + { + "epoch": 20.694480102695763, + "grad_norm": 1.3472929000854492, + "learning_rate": 2.6438168592212236e-05, + "loss": 0.4534, + "step": 16121 + }, + { + "epoch": 20.69576379974326, + "grad_norm": 2.558655023574829, + "learning_rate": 2.6437740693196408e-05, + "loss": 0.4619, + "step": 16122 + }, + { + "epoch": 20.69704749679076, + "grad_norm": 1.5980268716812134, + "learning_rate": 2.6437312794180573e-05, + "loss": 0.5042, + "step": 16123 + }, + { + "epoch": 20.698331193838253, + "grad_norm": 3.1287596225738525, + "learning_rate": 2.643688489516474e-05, + "loss": 0.468, + "step": 16124 + }, + { + "epoch": 20.69961489088575, + "grad_norm": 1.8680702447891235, + "learning_rate": 2.643645699614891e-05, + "loss": 0.4939, + 
"step": 16125 + }, + { + "epoch": 20.70089858793325, + "grad_norm": 1.2357288599014282, + "learning_rate": 2.6436029097133078e-05, + "loss": 0.5085, + "step": 16126 + }, + { + "epoch": 20.702182284980744, + "grad_norm": 1.2225040197372437, + "learning_rate": 2.6435601198117246e-05, + "loss": 0.4962, + "step": 16127 + }, + { + "epoch": 20.70346598202824, + "grad_norm": 2.0582778453826904, + "learning_rate": 2.643517329910141e-05, + "loss": 0.4813, + "step": 16128 + }, + { + "epoch": 20.70474967907574, + "grad_norm": 2.5450968742370605, + "learning_rate": 2.643474540008558e-05, + "loss": 0.5013, + "step": 16129 + }, + { + "epoch": 20.706033376123234, + "grad_norm": 2.967164993286133, + "learning_rate": 2.6434317501069748e-05, + "loss": 0.6661, + "step": 16130 + }, + { + "epoch": 20.70731707317073, + "grad_norm": 1.258730173110962, + "learning_rate": 2.6433889602053917e-05, + "loss": 0.3807, + "step": 16131 + }, + { + "epoch": 20.70860077021823, + "grad_norm": 1.5760717391967773, + "learning_rate": 2.643346170303808e-05, + "loss": 0.3618, + "step": 16132 + }, + { + "epoch": 20.709884467265724, + "grad_norm": 0.880986213684082, + "learning_rate": 2.6433033804022253e-05, + "loss": 0.4117, + "step": 16133 + }, + { + "epoch": 20.71116816431322, + "grad_norm": 1.2111824750900269, + "learning_rate": 2.643260590500642e-05, + "loss": 0.4088, + "step": 16134 + }, + { + "epoch": 20.71245186136072, + "grad_norm": 1.0057157278060913, + "learning_rate": 2.6432178005990587e-05, + "loss": 0.4383, + "step": 16135 + }, + { + "epoch": 20.713735558408217, + "grad_norm": 1.1996495723724365, + "learning_rate": 2.6431750106974755e-05, + "loss": 0.4527, + "step": 16136 + }, + { + "epoch": 20.71501925545571, + "grad_norm": 1.333648681640625, + "learning_rate": 2.643132220795892e-05, + "loss": 0.4129, + "step": 16137 + }, + { + "epoch": 20.71630295250321, + "grad_norm": 1.970504641532898, + "learning_rate": 2.6430894308943092e-05, + "loss": 0.4166, + "step": 16138 + }, + { + "epoch": 
20.717586649550707, + "grad_norm": 1.2538750171661377, + "learning_rate": 2.6430466409927257e-05, + "loss": 0.3805, + "step": 16139 + }, + { + "epoch": 20.7188703465982, + "grad_norm": 1.1415235996246338, + "learning_rate": 2.6430038510911426e-05, + "loss": 0.4381, + "step": 16140 + }, + { + "epoch": 20.7201540436457, + "grad_norm": 1.1359012126922607, + "learning_rate": 2.6429610611895594e-05, + "loss": 0.4275, + "step": 16141 + }, + { + "epoch": 20.721437740693197, + "grad_norm": 1.1759675741195679, + "learning_rate": 2.642918271287976e-05, + "loss": 0.478, + "step": 16142 + }, + { + "epoch": 20.72272143774069, + "grad_norm": 1.2479920387268066, + "learning_rate": 2.642875481386393e-05, + "loss": 0.4588, + "step": 16143 + }, + { + "epoch": 20.72400513478819, + "grad_norm": 1.1072216033935547, + "learning_rate": 2.6428326914848096e-05, + "loss": 0.4254, + "step": 16144 + }, + { + "epoch": 20.725288831835687, + "grad_norm": 1.3592629432678223, + "learning_rate": 2.6427899015832264e-05, + "loss": 0.4089, + "step": 16145 + }, + { + "epoch": 20.726572528883185, + "grad_norm": 1.4894838333129883, + "learning_rate": 2.6427471116816433e-05, + "loss": 0.433, + "step": 16146 + }, + { + "epoch": 20.72785622593068, + "grad_norm": 1.5930112600326538, + "learning_rate": 2.64270432178006e-05, + "loss": 0.3966, + "step": 16147 + }, + { + "epoch": 20.729139922978177, + "grad_norm": 1.2180836200714111, + "learning_rate": 2.6426615318784766e-05, + "loss": 0.4406, + "step": 16148 + }, + { + "epoch": 20.730423620025675, + "grad_norm": 2.4225006103515625, + "learning_rate": 2.6426187419768935e-05, + "loss": 0.3814, + "step": 16149 + }, + { + "epoch": 20.73170731707317, + "grad_norm": 5.5134077072143555, + "learning_rate": 2.6425759520753103e-05, + "loss": 0.4232, + "step": 16150 + }, + { + "epoch": 20.732991014120667, + "grad_norm": 1.1846705675125122, + "learning_rate": 2.642533162173727e-05, + "loss": 0.4491, + "step": 16151 + }, + { + "epoch": 20.734274711168165, + "grad_norm": 
1.4108316898345947, + "learning_rate": 2.642490372272144e-05, + "loss": 0.4231, + "step": 16152 + }, + { + "epoch": 20.73555840821566, + "grad_norm": 1.0576740503311157, + "learning_rate": 2.6424475823705605e-05, + "loss": 0.3904, + "step": 16153 + }, + { + "epoch": 20.736842105263158, + "grad_norm": 1.3364078998565674, + "learning_rate": 2.6424047924689773e-05, + "loss": 0.397, + "step": 16154 + }, + { + "epoch": 20.738125802310655, + "grad_norm": 1.2735304832458496, + "learning_rate": 2.642362002567394e-05, + "loss": 0.4354, + "step": 16155 + }, + { + "epoch": 20.739409499358153, + "grad_norm": 6.288150787353516, + "learning_rate": 2.6423192126658107e-05, + "loss": 0.4426, + "step": 16156 + }, + { + "epoch": 20.740693196405648, + "grad_norm": 1.7806998491287231, + "learning_rate": 2.642276422764228e-05, + "loss": 0.4447, + "step": 16157 + }, + { + "epoch": 20.741976893453145, + "grad_norm": 1.100443720817566, + "learning_rate": 2.6422336328626443e-05, + "loss": 0.4446, + "step": 16158 + }, + { + "epoch": 20.743260590500643, + "grad_norm": 1.124314785003662, + "learning_rate": 2.6421908429610615e-05, + "loss": 0.3907, + "step": 16159 + }, + { + "epoch": 20.744544287548138, + "grad_norm": 2.1002869606018066, + "learning_rate": 2.642148053059478e-05, + "loss": 0.4514, + "step": 16160 + }, + { + "epoch": 20.745827984595635, + "grad_norm": 2.011693239212036, + "learning_rate": 2.6421052631578945e-05, + "loss": 0.4695, + "step": 16161 + }, + { + "epoch": 20.747111681643133, + "grad_norm": 3.889713764190674, + "learning_rate": 2.6420624732563117e-05, + "loss": 0.4889, + "step": 16162 + }, + { + "epoch": 20.748395378690628, + "grad_norm": 1.1507607698440552, + "learning_rate": 2.6420196833547282e-05, + "loss": 0.4316, + "step": 16163 + }, + { + "epoch": 20.749679075738126, + "grad_norm": 1.8678078651428223, + "learning_rate": 2.641976893453145e-05, + "loss": 0.4095, + "step": 16164 + }, + { + "epoch": 20.750962772785623, + "grad_norm": 1.227095365524292, + 
"learning_rate": 2.641934103551562e-05, + "loss": 0.464, + "step": 16165 + }, + { + "epoch": 20.752246469833118, + "grad_norm": 2.6704471111297607, + "learning_rate": 2.6418913136499787e-05, + "loss": 0.474, + "step": 16166 + }, + { + "epoch": 20.753530166880616, + "grad_norm": 1.3031113147735596, + "learning_rate": 2.6418485237483956e-05, + "loss": 0.4443, + "step": 16167 + }, + { + "epoch": 20.754813863928113, + "grad_norm": 1.156366229057312, + "learning_rate": 2.641805733846812e-05, + "loss": 0.4403, + "step": 16168 + }, + { + "epoch": 20.75609756097561, + "grad_norm": 1.5428318977355957, + "learning_rate": 2.641762943945229e-05, + "loss": 0.5215, + "step": 16169 + }, + { + "epoch": 20.757381258023106, + "grad_norm": 1.1402852535247803, + "learning_rate": 2.6417201540436458e-05, + "loss": 0.4401, + "step": 16170 + }, + { + "epoch": 20.758664955070603, + "grad_norm": 2.5300614833831787, + "learning_rate": 2.6416773641420626e-05, + "loss": 0.4611, + "step": 16171 + }, + { + "epoch": 20.7599486521181, + "grad_norm": 1.5173611640930176, + "learning_rate": 2.641634574240479e-05, + "loss": 0.4765, + "step": 16172 + }, + { + "epoch": 20.761232349165596, + "grad_norm": 1.633770227432251, + "learning_rate": 2.6415917843388963e-05, + "loss": 0.4529, + "step": 16173 + }, + { + "epoch": 20.762516046213094, + "grad_norm": 2.462782621383667, + "learning_rate": 2.6415489944373128e-05, + "loss": 0.4442, + "step": 16174 + }, + { + "epoch": 20.76379974326059, + "grad_norm": 1.7384562492370605, + "learning_rate": 2.6415062045357296e-05, + "loss": 0.4623, + "step": 16175 + }, + { + "epoch": 20.765083440308086, + "grad_norm": 1.1185171604156494, + "learning_rate": 2.6414634146341465e-05, + "loss": 0.463, + "step": 16176 + }, + { + "epoch": 20.766367137355584, + "grad_norm": 1.2393978834152222, + "learning_rate": 2.641420624732563e-05, + "loss": 0.463, + "step": 16177 + }, + { + "epoch": 20.76765083440308, + "grad_norm": 1.9670239686965942, + "learning_rate": 2.64137783483098e-05, + 
"loss": 0.4767, + "step": 16178 + }, + { + "epoch": 20.76893453145058, + "grad_norm": 2.074326753616333, + "learning_rate": 2.6413350449293967e-05, + "loss": 0.5444, + "step": 16179 + }, + { + "epoch": 20.770218228498074, + "grad_norm": 5.243113040924072, + "learning_rate": 2.6412922550278135e-05, + "loss": 0.6232, + "step": 16180 + }, + { + "epoch": 20.77150192554557, + "grad_norm": 1.1376217603683472, + "learning_rate": 2.6412494651262303e-05, + "loss": 0.409, + "step": 16181 + }, + { + "epoch": 20.77278562259307, + "grad_norm": 1.5489012002944946, + "learning_rate": 2.641206675224647e-05, + "loss": 0.3999, + "step": 16182 + }, + { + "epoch": 20.774069319640564, + "grad_norm": 0.9959220886230469, + "learning_rate": 2.641163885323064e-05, + "loss": 0.4225, + "step": 16183 + }, + { + "epoch": 20.77535301668806, + "grad_norm": 1.0030707120895386, + "learning_rate": 2.6411210954214805e-05, + "loss": 0.4426, + "step": 16184 + }, + { + "epoch": 20.77663671373556, + "grad_norm": 1.1103229522705078, + "learning_rate": 2.6410783055198974e-05, + "loss": 0.4381, + "step": 16185 + }, + { + "epoch": 20.777920410783054, + "grad_norm": 0.9310020804405212, + "learning_rate": 2.6410355156183142e-05, + "loss": 0.4018, + "step": 16186 + }, + { + "epoch": 20.77920410783055, + "grad_norm": 1.4665535688400269, + "learning_rate": 2.640992725716731e-05, + "loss": 0.3958, + "step": 16187 + }, + { + "epoch": 20.78048780487805, + "grad_norm": 3.1892035007476807, + "learning_rate": 2.6409499358151475e-05, + "loss": 0.4169, + "step": 16188 + }, + { + "epoch": 20.781771501925547, + "grad_norm": 1.3493647575378418, + "learning_rate": 2.6409071459135644e-05, + "loss": 0.4056, + "step": 16189 + }, + { + "epoch": 20.78305519897304, + "grad_norm": 1.3233474493026733, + "learning_rate": 2.6408643560119812e-05, + "loss": 0.4516, + "step": 16190 + }, + { + "epoch": 20.78433889602054, + "grad_norm": 4.492897987365723, + "learning_rate": 2.640821566110398e-05, + "loss": 0.4584, + "step": 16191 + }, + { 
+ "epoch": 20.785622593068037, + "grad_norm": 1.5137200355529785, + "learning_rate": 2.640778776208815e-05, + "loss": 0.3804, + "step": 16192 + }, + { + "epoch": 20.78690629011553, + "grad_norm": 0.8987361788749695, + "learning_rate": 2.6407359863072314e-05, + "loss": 0.4359, + "step": 16193 + }, + { + "epoch": 20.78818998716303, + "grad_norm": 2.6268420219421387, + "learning_rate": 2.6406931964056486e-05, + "loss": 0.4738, + "step": 16194 + }, + { + "epoch": 20.789473684210527, + "grad_norm": 3.4943487644195557, + "learning_rate": 2.640650406504065e-05, + "loss": 0.4756, + "step": 16195 + }, + { + "epoch": 20.79075738125802, + "grad_norm": 1.0734378099441528, + "learning_rate": 2.6406076166024816e-05, + "loss": 0.3899, + "step": 16196 + }, + { + "epoch": 20.79204107830552, + "grad_norm": 1.9964897632598877, + "learning_rate": 2.6405648267008988e-05, + "loss": 0.469, + "step": 16197 + }, + { + "epoch": 20.793324775353017, + "grad_norm": 1.3237351179122925, + "learning_rate": 2.6405220367993153e-05, + "loss": 0.4154, + "step": 16198 + }, + { + "epoch": 20.794608472400512, + "grad_norm": 1.5936812162399292, + "learning_rate": 2.6404792468977325e-05, + "loss": 0.409, + "step": 16199 + }, + { + "epoch": 20.79589216944801, + "grad_norm": 2.5670199394226074, + "learning_rate": 2.640436456996149e-05, + "loss": 0.4404, + "step": 16200 + }, + { + "epoch": 20.797175866495508, + "grad_norm": 1.1422877311706543, + "learning_rate": 2.6403936670945658e-05, + "loss": 0.4335, + "step": 16201 + }, + { + "epoch": 20.798459563543005, + "grad_norm": 2.4013020992279053, + "learning_rate": 2.6403508771929826e-05, + "loss": 0.434, + "step": 16202 + }, + { + "epoch": 20.7997432605905, + "grad_norm": 1.8915541172027588, + "learning_rate": 2.640308087291399e-05, + "loss": 0.4309, + "step": 16203 + }, + { + "epoch": 20.801026957637998, + "grad_norm": 1.169520378112793, + "learning_rate": 2.640265297389816e-05, + "loss": 0.4354, + "step": 16204 + }, + { + "epoch": 20.802310654685495, + 
"grad_norm": 1.1211267709732056, + "learning_rate": 2.640222507488233e-05, + "loss": 0.4196, + "step": 16205 + }, + { + "epoch": 20.80359435173299, + "grad_norm": 1.121490478515625, + "learning_rate": 2.6401797175866497e-05, + "loss": 0.4217, + "step": 16206 + }, + { + "epoch": 20.804878048780488, + "grad_norm": 1.2224957942962646, + "learning_rate": 2.6401369276850665e-05, + "loss": 0.4396, + "step": 16207 + }, + { + "epoch": 20.806161745827985, + "grad_norm": 1.4164917469024658, + "learning_rate": 2.6400941377834834e-05, + "loss": 0.4115, + "step": 16208 + }, + { + "epoch": 20.80744544287548, + "grad_norm": 1.3746259212493896, + "learning_rate": 2.6400513478819e-05, + "loss": 0.4376, + "step": 16209 + }, + { + "epoch": 20.808729139922978, + "grad_norm": 3.087881565093994, + "learning_rate": 2.6400085579803167e-05, + "loss": 0.4203, + "step": 16210 + }, + { + "epoch": 20.810012836970476, + "grad_norm": 1.9825398921966553, + "learning_rate": 2.6399657680787335e-05, + "loss": 0.4482, + "step": 16211 + }, + { + "epoch": 20.811296534017973, + "grad_norm": 1.5816950798034668, + "learning_rate": 2.63992297817715e-05, + "loss": 0.4626, + "step": 16212 + }, + { + "epoch": 20.812580231065468, + "grad_norm": 1.4618638753890991, + "learning_rate": 2.6398801882755672e-05, + "loss": 0.4527, + "step": 16213 + }, + { + "epoch": 20.813863928112966, + "grad_norm": 1.8318357467651367, + "learning_rate": 2.6398373983739837e-05, + "loss": 0.4466, + "step": 16214 + }, + { + "epoch": 20.815147625160463, + "grad_norm": 1.4842708110809326, + "learning_rate": 2.6397946084724006e-05, + "loss": 0.456, + "step": 16215 + }, + { + "epoch": 20.816431322207958, + "grad_norm": 1.0285135507583618, + "learning_rate": 2.6397518185708174e-05, + "loss": 0.4332, + "step": 16216 + }, + { + "epoch": 20.817715019255456, + "grad_norm": 2.373420000076294, + "learning_rate": 2.639709028669234e-05, + "loss": 0.3966, + "step": 16217 + }, + { + "epoch": 20.818998716302954, + "grad_norm": 3.1741366386413574, + 
"learning_rate": 2.639666238767651e-05, + "loss": 0.4514, + "step": 16218 + }, + { + "epoch": 20.820282413350448, + "grad_norm": 1.4151256084442139, + "learning_rate": 2.6396234488660676e-05, + "loss": 0.4327, + "step": 16219 + }, + { + "epoch": 20.821566110397946, + "grad_norm": 2.834655284881592, + "learning_rate": 2.6395806589644844e-05, + "loss": 0.4321, + "step": 16220 + }, + { + "epoch": 20.822849807445444, + "grad_norm": 1.7362817525863647, + "learning_rate": 2.6395378690629013e-05, + "loss": 0.4442, + "step": 16221 + }, + { + "epoch": 20.82413350449294, + "grad_norm": 1.6429016590118408, + "learning_rate": 2.6394950791613178e-05, + "loss": 0.4608, + "step": 16222 + }, + { + "epoch": 20.825417201540436, + "grad_norm": 1.4360942840576172, + "learning_rate": 2.639452289259735e-05, + "loss": 0.4076, + "step": 16223 + }, + { + "epoch": 20.826700898587934, + "grad_norm": 1.5395207405090332, + "learning_rate": 2.6394094993581515e-05, + "loss": 0.5397, + "step": 16224 + }, + { + "epoch": 20.82798459563543, + "grad_norm": 1.709143877029419, + "learning_rate": 2.6393667094565683e-05, + "loss": 0.5155, + "step": 16225 + }, + { + "epoch": 20.829268292682926, + "grad_norm": 3.080573558807373, + "learning_rate": 2.639323919554985e-05, + "loss": 0.483, + "step": 16226 + }, + { + "epoch": 20.830551989730424, + "grad_norm": 1.4847599267959595, + "learning_rate": 2.639281129653402e-05, + "loss": 0.4762, + "step": 16227 + }, + { + "epoch": 20.83183568677792, + "grad_norm": 3.596182107925415, + "learning_rate": 2.6392383397518185e-05, + "loss": 0.5073, + "step": 16228 + }, + { + "epoch": 20.833119383825416, + "grad_norm": 2.0442962646484375, + "learning_rate": 2.6391955498502353e-05, + "loss": 0.5283, + "step": 16229 + }, + { + "epoch": 20.834403080872914, + "grad_norm": 1.8539460897445679, + "learning_rate": 2.639152759948652e-05, + "loss": 0.6829, + "step": 16230 + }, + { + "epoch": 20.83568677792041, + "grad_norm": 1.5554351806640625, + "learning_rate": 
2.639109970047069e-05, + "loss": 0.4153, + "step": 16231 + }, + { + "epoch": 20.836970474967906, + "grad_norm": 1.0277867317199707, + "learning_rate": 2.639067180145486e-05, + "loss": 0.4079, + "step": 16232 + }, + { + "epoch": 20.838254172015404, + "grad_norm": 2.4941208362579346, + "learning_rate": 2.6390243902439024e-05, + "loss": 0.4147, + "step": 16233 + }, + { + "epoch": 20.8395378690629, + "grad_norm": 1.418351411819458, + "learning_rate": 2.6389816003423195e-05, + "loss": 0.4286, + "step": 16234 + }, + { + "epoch": 20.8408215661104, + "grad_norm": 0.8415317535400391, + "learning_rate": 2.638938810440736e-05, + "loss": 0.4275, + "step": 16235 + }, + { + "epoch": 20.842105263157894, + "grad_norm": 1.1578469276428223, + "learning_rate": 2.6388960205391525e-05, + "loss": 0.4211, + "step": 16236 + }, + { + "epoch": 20.84338896020539, + "grad_norm": 1.2835627794265747, + "learning_rate": 2.6388532306375697e-05, + "loss": 0.3931, + "step": 16237 + }, + { + "epoch": 20.84467265725289, + "grad_norm": 2.4598755836486816, + "learning_rate": 2.6388104407359862e-05, + "loss": 0.4161, + "step": 16238 + }, + { + "epoch": 20.845956354300384, + "grad_norm": 1.4183859825134277, + "learning_rate": 2.6387676508344034e-05, + "loss": 0.4075, + "step": 16239 + }, + { + "epoch": 20.84724005134788, + "grad_norm": 10.001462936401367, + "learning_rate": 2.63872486093282e-05, + "loss": 0.4498, + "step": 16240 + }, + { + "epoch": 20.84852374839538, + "grad_norm": 1.1490302085876465, + "learning_rate": 2.6386820710312367e-05, + "loss": 0.4564, + "step": 16241 + }, + { + "epoch": 20.849807445442874, + "grad_norm": 1.3751519918441772, + "learning_rate": 2.6386392811296536e-05, + "loss": 0.406, + "step": 16242 + }, + { + "epoch": 20.85109114249037, + "grad_norm": 1.039837121963501, + "learning_rate": 2.63859649122807e-05, + "loss": 0.4259, + "step": 16243 + }, + { + "epoch": 20.85237483953787, + "grad_norm": 1.0939973592758179, + "learning_rate": 2.638553701326487e-05, + "loss": 0.4124, + 
"step": 16244 + }, + { + "epoch": 20.853658536585368, + "grad_norm": 1.6546626091003418, + "learning_rate": 2.6385109114249038e-05, + "loss": 0.408, + "step": 16245 + }, + { + "epoch": 20.854942233632862, + "grad_norm": 2.360861301422119, + "learning_rate": 2.6384681215233206e-05, + "loss": 0.4329, + "step": 16246 + }, + { + "epoch": 20.85622593068036, + "grad_norm": 1.2338013648986816, + "learning_rate": 2.6384253316217375e-05, + "loss": 0.4212, + "step": 16247 + }, + { + "epoch": 20.857509627727858, + "grad_norm": 1.2294609546661377, + "learning_rate": 2.6383825417201543e-05, + "loss": 0.3928, + "step": 16248 + }, + { + "epoch": 20.858793324775352, + "grad_norm": 1.7584943771362305, + "learning_rate": 2.6383397518185708e-05, + "loss": 0.4028, + "step": 16249 + }, + { + "epoch": 20.86007702182285, + "grad_norm": 4.432960510253906, + "learning_rate": 2.6382969619169876e-05, + "loss": 0.4158, + "step": 16250 + }, + { + "epoch": 20.861360718870348, + "grad_norm": 1.2059135437011719, + "learning_rate": 2.6382541720154045e-05, + "loss": 0.4157, + "step": 16251 + }, + { + "epoch": 20.862644415917842, + "grad_norm": 1.0918720960617065, + "learning_rate": 2.638211382113821e-05, + "loss": 0.3827, + "step": 16252 + }, + { + "epoch": 20.86392811296534, + "grad_norm": 1.7238154411315918, + "learning_rate": 2.638168592212238e-05, + "loss": 0.4077, + "step": 16253 + }, + { + "epoch": 20.865211810012838, + "grad_norm": 1.012194037437439, + "learning_rate": 2.6381258023106547e-05, + "loss": 0.478, + "step": 16254 + }, + { + "epoch": 20.866495507060336, + "grad_norm": 1.4007923603057861, + "learning_rate": 2.638083012409072e-05, + "loss": 0.4515, + "step": 16255 + }, + { + "epoch": 20.86777920410783, + "grad_norm": 2.02276611328125, + "learning_rate": 2.6380402225074883e-05, + "loss": 0.4577, + "step": 16256 + }, + { + "epoch": 20.869062901155328, + "grad_norm": 1.8052880764007568, + "learning_rate": 2.637997432605905e-05, + "loss": 0.4334, + "step": 16257 + }, + { + "epoch": 
20.870346598202826, + "grad_norm": 0.6959625482559204, + "learning_rate": 2.637954642704322e-05, + "loss": 0.4004, + "step": 16258 + }, + { + "epoch": 20.87163029525032, + "grad_norm": 2.2514524459838867, + "learning_rate": 2.6379118528027385e-05, + "loss": 0.3986, + "step": 16259 + }, + { + "epoch": 20.872913992297818, + "grad_norm": 1.2424256801605225, + "learning_rate": 2.6378690629011554e-05, + "loss": 0.4283, + "step": 16260 + }, + { + "epoch": 20.874197689345316, + "grad_norm": 0.9637300372123718, + "learning_rate": 2.6378262729995722e-05, + "loss": 0.426, + "step": 16261 + }, + { + "epoch": 20.87548138639281, + "grad_norm": 1.8659627437591553, + "learning_rate": 2.637783483097989e-05, + "loss": 0.4413, + "step": 16262 + }, + { + "epoch": 20.876765083440308, + "grad_norm": 1.5887635946273804, + "learning_rate": 2.637740693196406e-05, + "loss": 0.4397, + "step": 16263 + }, + { + "epoch": 20.878048780487806, + "grad_norm": 1.6806942224502563, + "learning_rate": 2.6376979032948224e-05, + "loss": 0.4413, + "step": 16264 + }, + { + "epoch": 20.8793324775353, + "grad_norm": 1.0627892017364502, + "learning_rate": 2.6376551133932392e-05, + "loss": 0.4658, + "step": 16265 + }, + { + "epoch": 20.880616174582798, + "grad_norm": 1.0883867740631104, + "learning_rate": 2.637612323491656e-05, + "loss": 0.4831, + "step": 16266 + }, + { + "epoch": 20.881899871630296, + "grad_norm": 1.830641746520996, + "learning_rate": 2.637569533590073e-05, + "loss": 0.4445, + "step": 16267 + }, + { + "epoch": 20.883183568677794, + "grad_norm": 1.467261552810669, + "learning_rate": 2.6375267436884894e-05, + "loss": 0.4496, + "step": 16268 + }, + { + "epoch": 20.884467265725288, + "grad_norm": 1.1387808322906494, + "learning_rate": 2.6374839537869066e-05, + "loss": 0.4934, + "step": 16269 + }, + { + "epoch": 20.885750962772786, + "grad_norm": 1.6000930070877075, + "learning_rate": 2.637441163885323e-05, + "loss": 0.4233, + "step": 16270 + }, + { + "epoch": 20.887034659820284, + "grad_norm": 
2.613891124725342, + "learning_rate": 2.63739837398374e-05, + "loss": 0.4202, + "step": 16271 + }, + { + "epoch": 20.888318356867778, + "grad_norm": 1.4979363679885864, + "learning_rate": 2.6373555840821568e-05, + "loss": 0.4342, + "step": 16272 + }, + { + "epoch": 20.889602053915276, + "grad_norm": 2.8824806213378906, + "learning_rate": 2.6373127941805733e-05, + "loss": 0.4176, + "step": 16273 + }, + { + "epoch": 20.890885750962774, + "grad_norm": 1.677268147468567, + "learning_rate": 2.6372700042789905e-05, + "loss": 0.4455, + "step": 16274 + }, + { + "epoch": 20.892169448010268, + "grad_norm": 3.8029801845550537, + "learning_rate": 2.637227214377407e-05, + "loss": 0.4804, + "step": 16275 + }, + { + "epoch": 20.893453145057766, + "grad_norm": 3.5350019931793213, + "learning_rate": 2.6371844244758235e-05, + "loss": 0.4466, + "step": 16276 + }, + { + "epoch": 20.894736842105264, + "grad_norm": 2.230360269546509, + "learning_rate": 2.6371416345742407e-05, + "loss": 0.5321, + "step": 16277 + }, + { + "epoch": 20.89602053915276, + "grad_norm": 1.5862618684768677, + "learning_rate": 2.637098844672657e-05, + "loss": 0.545, + "step": 16278 + }, + { + "epoch": 20.897304236200256, + "grad_norm": 2.6305325031280518, + "learning_rate": 2.6370560547710743e-05, + "loss": 0.5405, + "step": 16279 + }, + { + "epoch": 20.898587933247754, + "grad_norm": 1.8133082389831543, + "learning_rate": 2.637013264869491e-05, + "loss": 0.5748, + "step": 16280 + }, + { + "epoch": 20.89987163029525, + "grad_norm": 1.2008566856384277, + "learning_rate": 2.6369704749679077e-05, + "loss": 0.392, + "step": 16281 + }, + { + "epoch": 20.901155327342746, + "grad_norm": 1.878752589225769, + "learning_rate": 2.6369276850663245e-05, + "loss": 0.4051, + "step": 16282 + }, + { + "epoch": 20.902439024390244, + "grad_norm": 1.0930951833724976, + "learning_rate": 2.636884895164741e-05, + "loss": 0.4272, + "step": 16283 + }, + { + "epoch": 20.90372272143774, + "grad_norm": 0.9687420725822449, + "learning_rate": 
2.636842105263158e-05, + "loss": 0.4263, + "step": 16284 + }, + { + "epoch": 20.905006418485236, + "grad_norm": 6.8355536460876465, + "learning_rate": 2.6367993153615747e-05, + "loss": 0.4192, + "step": 16285 + }, + { + "epoch": 20.906290115532734, + "grad_norm": 1.6771999597549438, + "learning_rate": 2.6367565254599915e-05, + "loss": 0.4225, + "step": 16286 + }, + { + "epoch": 20.90757381258023, + "grad_norm": 1.0289751291275024, + "learning_rate": 2.6367137355584084e-05, + "loss": 0.429, + "step": 16287 + }, + { + "epoch": 20.90885750962773, + "grad_norm": 1.1494640111923218, + "learning_rate": 2.6366709456568252e-05, + "loss": 0.4386, + "step": 16288 + }, + { + "epoch": 20.910141206675224, + "grad_norm": 1.1657766103744507, + "learning_rate": 2.6366281557552417e-05, + "loss": 0.3977, + "step": 16289 + }, + { + "epoch": 20.911424903722722, + "grad_norm": 1.4376105070114136, + "learning_rate": 2.6365853658536586e-05, + "loss": 0.4314, + "step": 16290 + }, + { + "epoch": 20.91270860077022, + "grad_norm": 1.0921192169189453, + "learning_rate": 2.6365425759520754e-05, + "loss": 0.4289, + "step": 16291 + }, + { + "epoch": 20.913992297817714, + "grad_norm": 1.3716241121292114, + "learning_rate": 2.636499786050492e-05, + "loss": 0.4364, + "step": 16292 + }, + { + "epoch": 20.915275994865212, + "grad_norm": 8.148805618286133, + "learning_rate": 2.636456996148909e-05, + "loss": 0.4052, + "step": 16293 + }, + { + "epoch": 20.91655969191271, + "grad_norm": 2.194087028503418, + "learning_rate": 2.6364142062473256e-05, + "loss": 0.454, + "step": 16294 + }, + { + "epoch": 20.917843388960204, + "grad_norm": 1.1741245985031128, + "learning_rate": 2.6363714163457428e-05, + "loss": 0.4264, + "step": 16295 + }, + { + "epoch": 20.919127086007702, + "grad_norm": 1.2767572402954102, + "learning_rate": 2.6363286264441593e-05, + "loss": 0.4296, + "step": 16296 + }, + { + "epoch": 20.9204107830552, + "grad_norm": 1.6068428754806519, + "learning_rate": 2.6362858365425758e-05, + "loss": 
0.4042, + "step": 16297 + }, + { + "epoch": 20.921694480102694, + "grad_norm": 1.9058908224105835, + "learning_rate": 2.636243046640993e-05, + "loss": 0.4101, + "step": 16298 + }, + { + "epoch": 20.922978177150192, + "grad_norm": 1.6805171966552734, + "learning_rate": 2.6362002567394095e-05, + "loss": 0.4249, + "step": 16299 + }, + { + "epoch": 20.92426187419769, + "grad_norm": 2.102907180786133, + "learning_rate": 2.6361574668378263e-05, + "loss": 0.4717, + "step": 16300 + }, + { + "epoch": 20.925545571245188, + "grad_norm": 1.8700332641601562, + "learning_rate": 2.636114676936243e-05, + "loss": 0.4631, + "step": 16301 + }, + { + "epoch": 20.926829268292682, + "grad_norm": 1.4847289323806763, + "learning_rate": 2.63607188703466e-05, + "loss": 0.404, + "step": 16302 + }, + { + "epoch": 20.92811296534018, + "grad_norm": 0.6790487766265869, + "learning_rate": 2.6360290971330768e-05, + "loss": 0.4347, + "step": 16303 + }, + { + "epoch": 20.929396662387678, + "grad_norm": 1.2302534580230713, + "learning_rate": 2.6359863072314933e-05, + "loss": 0.4175, + "step": 16304 + }, + { + "epoch": 20.930680359435172, + "grad_norm": 1.5204044580459595, + "learning_rate": 2.6359435173299102e-05, + "loss": 0.4259, + "step": 16305 + }, + { + "epoch": 20.93196405648267, + "grad_norm": 0.864797830581665, + "learning_rate": 2.635900727428327e-05, + "loss": 0.3994, + "step": 16306 + }, + { + "epoch": 20.933247753530168, + "grad_norm": 4.2456135749816895, + "learning_rate": 2.635857937526744e-05, + "loss": 0.4028, + "step": 16307 + }, + { + "epoch": 20.934531450577662, + "grad_norm": 1.0925230979919434, + "learning_rate": 2.6358151476251604e-05, + "loss": 0.4529, + "step": 16308 + }, + { + "epoch": 20.93581514762516, + "grad_norm": 3.5559191703796387, + "learning_rate": 2.6357723577235775e-05, + "loss": 0.4245, + "step": 16309 + }, + { + "epoch": 20.937098844672658, + "grad_norm": 2.2219045162200928, + "learning_rate": 2.635729567821994e-05, + "loss": 0.4516, + "step": 16310 + }, + { + 
"epoch": 20.938382541720156, + "grad_norm": 1.0274187326431274, + "learning_rate": 2.6356867779204105e-05, + "loss": 0.4497, + "step": 16311 + }, + { + "epoch": 20.93966623876765, + "grad_norm": 0.979144811630249, + "learning_rate": 2.6356439880188277e-05, + "loss": 0.3882, + "step": 16312 + }, + { + "epoch": 20.940949935815148, + "grad_norm": 1.5577229261398315, + "learning_rate": 2.6356011981172442e-05, + "loss": 0.4245, + "step": 16313 + }, + { + "epoch": 20.942233632862646, + "grad_norm": 2.882582426071167, + "learning_rate": 2.6355584082156614e-05, + "loss": 0.448, + "step": 16314 + }, + { + "epoch": 20.94351732991014, + "grad_norm": 1.8181384801864624, + "learning_rate": 2.635515618314078e-05, + "loss": 0.4583, + "step": 16315 + }, + { + "epoch": 20.944801026957638, + "grad_norm": 1.3235070705413818, + "learning_rate": 2.6354728284124947e-05, + "loss": 0.3979, + "step": 16316 + }, + { + "epoch": 20.946084724005136, + "grad_norm": 1.6193032264709473, + "learning_rate": 2.6354300385109116e-05, + "loss": 0.4496, + "step": 16317 + }, + { + "epoch": 20.94736842105263, + "grad_norm": 1.2799652814865112, + "learning_rate": 2.635387248609328e-05, + "loss": 0.45, + "step": 16318 + }, + { + "epoch": 20.948652118100128, + "grad_norm": 2.0904908180236816, + "learning_rate": 2.635344458707745e-05, + "loss": 0.4028, + "step": 16319 + }, + { + "epoch": 20.949935815147626, + "grad_norm": 2.16996693611145, + "learning_rate": 2.6353016688061618e-05, + "loss": 0.4424, + "step": 16320 + }, + { + "epoch": 20.951219512195124, + "grad_norm": 1.6023255586624146, + "learning_rate": 2.6352588789045786e-05, + "loss": 0.4318, + "step": 16321 + }, + { + "epoch": 20.952503209242618, + "grad_norm": 1.0911542177200317, + "learning_rate": 2.6352160890029955e-05, + "loss": 0.4833, + "step": 16322 + }, + { + "epoch": 20.953786906290116, + "grad_norm": 2.7674806118011475, + "learning_rate": 2.6351732991014123e-05, + "loss": 0.5496, + "step": 16323 + }, + { + "epoch": 20.955070603337614, + 
"grad_norm": 2.061601161956787, + "learning_rate": 2.6351305091998288e-05, + "loss": 0.4558, + "step": 16324 + }, + { + "epoch": 20.956354300385108, + "grad_norm": 1.1399635076522827, + "learning_rate": 2.6350877192982456e-05, + "loss": 0.4548, + "step": 16325 + }, + { + "epoch": 20.957637997432606, + "grad_norm": 1.7700430154800415, + "learning_rate": 2.6350449293966625e-05, + "loss": 0.5303, + "step": 16326 + }, + { + "epoch": 20.958921694480104, + "grad_norm": 2.4413514137268066, + "learning_rate": 2.635002139495079e-05, + "loss": 0.4541, + "step": 16327 + }, + { + "epoch": 20.960205391527598, + "grad_norm": 2.0645368099212646, + "learning_rate": 2.634959349593496e-05, + "loss": 0.5245, + "step": 16328 + }, + { + "epoch": 20.961489088575096, + "grad_norm": 9.875027656555176, + "learning_rate": 2.6349165596919127e-05, + "loss": 0.5339, + "step": 16329 + }, + { + "epoch": 20.962772785622594, + "grad_norm": 1.7535293102264404, + "learning_rate": 2.63487376979033e-05, + "loss": 0.5843, + "step": 16330 + }, + { + "epoch": 20.964056482670088, + "grad_norm": 1.2850420475006104, + "learning_rate": 2.6348309798887463e-05, + "loss": 0.4092, + "step": 16331 + }, + { + "epoch": 20.965340179717586, + "grad_norm": 2.1496422290802, + "learning_rate": 2.634788189987163e-05, + "loss": 0.3856, + "step": 16332 + }, + { + "epoch": 20.966623876765084, + "grad_norm": 1.0136370658874512, + "learning_rate": 2.63474540008558e-05, + "loss": 0.4049, + "step": 16333 + }, + { + "epoch": 20.96790757381258, + "grad_norm": 1.1061245203018188, + "learning_rate": 2.6347026101839965e-05, + "loss": 0.4221, + "step": 16334 + }, + { + "epoch": 20.969191270860076, + "grad_norm": 1.2800573110580444, + "learning_rate": 2.6346598202824134e-05, + "loss": 0.4407, + "step": 16335 + }, + { + "epoch": 20.970474967907574, + "grad_norm": 2.3947606086730957, + "learning_rate": 2.6346170303808302e-05, + "loss": 0.4252, + "step": 16336 + }, + { + "epoch": 20.971758664955072, + "grad_norm": 1.665278434753418, + 
"learning_rate": 2.6345742404792467e-05, + "loss": 0.4294, + "step": 16337 + }, + { + "epoch": 20.973042362002566, + "grad_norm": 1.370980143547058, + "learning_rate": 2.634531450577664e-05, + "loss": 0.4247, + "step": 16338 + }, + { + "epoch": 20.974326059050064, + "grad_norm": 1.1425102949142456, + "learning_rate": 2.6344886606760804e-05, + "loss": 0.4371, + "step": 16339 + }, + { + "epoch": 20.975609756097562, + "grad_norm": 1.4383047819137573, + "learning_rate": 2.6344458707744972e-05, + "loss": 0.4173, + "step": 16340 + }, + { + "epoch": 20.976893453145056, + "grad_norm": 1.2125754356384277, + "learning_rate": 2.634403080872914e-05, + "loss": 0.4112, + "step": 16341 + }, + { + "epoch": 20.978177150192554, + "grad_norm": 1.0132535696029663, + "learning_rate": 2.634360290971331e-05, + "loss": 0.3836, + "step": 16342 + }, + { + "epoch": 20.979460847240052, + "grad_norm": 0.9514357447624207, + "learning_rate": 2.6343175010697474e-05, + "loss": 0.4185, + "step": 16343 + }, + { + "epoch": 20.98074454428755, + "grad_norm": 1.5223766565322876, + "learning_rate": 2.6342747111681643e-05, + "loss": 0.4162, + "step": 16344 + }, + { + "epoch": 20.982028241335044, + "grad_norm": 1.6706030368804932, + "learning_rate": 2.634231921266581e-05, + "loss": 0.3833, + "step": 16345 + }, + { + "epoch": 20.983311938382542, + "grad_norm": 1.2922848463058472, + "learning_rate": 2.634189131364998e-05, + "loss": 0.436, + "step": 16346 + }, + { + "epoch": 20.98459563543004, + "grad_norm": 1.5927027463912964, + "learning_rate": 2.6341463414634148e-05, + "loss": 0.4139, + "step": 16347 + }, + { + "epoch": 20.985879332477534, + "grad_norm": 2.1464085578918457, + "learning_rate": 2.6341035515618313e-05, + "loss": 0.4591, + "step": 16348 + }, + { + "epoch": 20.987163029525032, + "grad_norm": 2.0587306022644043, + "learning_rate": 2.6340607616602485e-05, + "loss": 0.4321, + "step": 16349 + }, + { + "epoch": 20.98844672657253, + "grad_norm": 1.7287724018096924, + "learning_rate": 
2.634017971758665e-05, + "loss": 0.4581, + "step": 16350 + }, + { + "epoch": 20.989730423620024, + "grad_norm": 2.417912721633911, + "learning_rate": 2.6339751818570815e-05, + "loss": 0.4248, + "step": 16351 + }, + { + "epoch": 20.991014120667522, + "grad_norm": 1.8042947053909302, + "learning_rate": 2.6339323919554987e-05, + "loss": 0.4359, + "step": 16352 + }, + { + "epoch": 20.99229781771502, + "grad_norm": 2.028975248336792, + "learning_rate": 2.633889602053915e-05, + "loss": 0.4703, + "step": 16353 + }, + { + "epoch": 20.993581514762518, + "grad_norm": 2.328723907470703, + "learning_rate": 2.6338468121523323e-05, + "loss": 0.5013, + "step": 16354 + }, + { + "epoch": 20.994865211810012, + "grad_norm": 5.135355472564697, + "learning_rate": 2.633804022250749e-05, + "loss": 0.4781, + "step": 16355 + }, + { + "epoch": 20.99614890885751, + "grad_norm": 1.6725894212722778, + "learning_rate": 2.6337612323491657e-05, + "loss": 0.4553, + "step": 16356 + }, + { + "epoch": 20.997432605905008, + "grad_norm": 1.5571098327636719, + "learning_rate": 2.6337184424475825e-05, + "loss": 0.5081, + "step": 16357 + }, + { + "epoch": 20.998716302952502, + "grad_norm": 2.163545846939087, + "learning_rate": 2.633675652545999e-05, + "loss": 0.6121, + "step": 16358 + }, + { + "epoch": 21.0, + "grad_norm": 8.505151748657227, + "learning_rate": 2.633632862644416e-05, + "loss": 0.5633, + "step": 16359 + }, + { + "epoch": 21.001283697047498, + "grad_norm": 0.8936794996261597, + "learning_rate": 2.6335900727428327e-05, + "loss": 0.3761, + "step": 16360 + }, + { + "epoch": 21.002567394094992, + "grad_norm": 1.601245641708374, + "learning_rate": 2.6335472828412496e-05, + "loss": 0.362, + "step": 16361 + }, + { + "epoch": 21.00385109114249, + "grad_norm": 1.51505708694458, + "learning_rate": 2.6335044929396664e-05, + "loss": 0.4068, + "step": 16362 + }, + { + "epoch": 21.005134788189988, + "grad_norm": 0.7752988934516907, + "learning_rate": 2.6334617030380832e-05, + "loss": 0.3597, + "step": 
16363 + }, + { + "epoch": 21.006418485237482, + "grad_norm": 1.573090672492981, + "learning_rate": 2.6334189131364997e-05, + "loss": 0.4165, + "step": 16364 + }, + { + "epoch": 21.00770218228498, + "grad_norm": 1.139737606048584, + "learning_rate": 2.6333761232349166e-05, + "loss": 0.3878, + "step": 16365 + }, + { + "epoch": 21.008985879332478, + "grad_norm": 1.1130690574645996, + "learning_rate": 2.6333333333333334e-05, + "loss": 0.419, + "step": 16366 + }, + { + "epoch": 21.010269576379976, + "grad_norm": 1.0365365743637085, + "learning_rate": 2.63329054343175e-05, + "loss": 0.4001, + "step": 16367 + }, + { + "epoch": 21.01155327342747, + "grad_norm": 0.8815232515335083, + "learning_rate": 2.633247753530167e-05, + "loss": 0.3665, + "step": 16368 + }, + { + "epoch": 21.012836970474968, + "grad_norm": 6.136595249176025, + "learning_rate": 2.6332049636285836e-05, + "loss": 0.4263, + "step": 16369 + }, + { + "epoch": 21.014120667522466, + "grad_norm": 0.9066773056983948, + "learning_rate": 2.6331621737270008e-05, + "loss": 0.376, + "step": 16370 + }, + { + "epoch": 21.01540436456996, + "grad_norm": 2.1949946880340576, + "learning_rate": 2.6331193838254173e-05, + "loss": 0.4035, + "step": 16371 + }, + { + "epoch": 21.016688061617458, + "grad_norm": 0.7972808480262756, + "learning_rate": 2.6330765939238338e-05, + "loss": 0.4044, + "step": 16372 + }, + { + "epoch": 21.017971758664956, + "grad_norm": 1.7108125686645508, + "learning_rate": 2.633033804022251e-05, + "loss": 0.4355, + "step": 16373 + }, + { + "epoch": 21.01925545571245, + "grad_norm": 1.7229245901107788, + "learning_rate": 2.6329910141206675e-05, + "loss": 0.3759, + "step": 16374 + }, + { + "epoch": 21.020539152759948, + "grad_norm": 1.1666778326034546, + "learning_rate": 2.6329482242190843e-05, + "loss": 0.3862, + "step": 16375 + }, + { + "epoch": 21.021822849807446, + "grad_norm": 1.2564775943756104, + "learning_rate": 2.632905434317501e-05, + "loss": 0.3916, + "step": 16376 + }, + { + "epoch": 
21.023106546854944, + "grad_norm": 6.04573917388916, + "learning_rate": 2.632862644415918e-05, + "loss": 0.3659, + "step": 16377 + }, + { + "epoch": 21.024390243902438, + "grad_norm": 1.0785974264144897, + "learning_rate": 2.632819854514335e-05, + "loss": 0.4181, + "step": 16378 + }, + { + "epoch": 21.025673940949936, + "grad_norm": 2.213468074798584, + "learning_rate": 2.6327770646127513e-05, + "loss": 0.4234, + "step": 16379 + }, + { + "epoch": 21.026957637997434, + "grad_norm": 1.6748915910720825, + "learning_rate": 2.6327342747111682e-05, + "loss": 0.406, + "step": 16380 + }, + { + "epoch": 21.028241335044928, + "grad_norm": 1.0942186117172241, + "learning_rate": 2.632691484809585e-05, + "loss": 0.4213, + "step": 16381 + }, + { + "epoch": 21.029525032092426, + "grad_norm": 2.024235725402832, + "learning_rate": 2.632648694908002e-05, + "loss": 0.4306, + "step": 16382 + }, + { + "epoch": 21.030808729139924, + "grad_norm": 1.4232069253921509, + "learning_rate": 2.6326059050064184e-05, + "loss": 0.4185, + "step": 16383 + }, + { + "epoch": 21.03209242618742, + "grad_norm": 1.118491768836975, + "learning_rate": 2.6325631151048355e-05, + "loss": 0.4108, + "step": 16384 + }, + { + "epoch": 21.033376123234916, + "grad_norm": 7.776151180267334, + "learning_rate": 2.632520325203252e-05, + "loss": 0.3936, + "step": 16385 + }, + { + "epoch": 21.034659820282414, + "grad_norm": 1.7793983221054077, + "learning_rate": 2.632477535301669e-05, + "loss": 0.4044, + "step": 16386 + }, + { + "epoch": 21.035943517329912, + "grad_norm": 2.019505262374878, + "learning_rate": 2.6324347454000857e-05, + "loss": 0.4396, + "step": 16387 + }, + { + "epoch": 21.037227214377406, + "grad_norm": 1.38068425655365, + "learning_rate": 2.6323919554985022e-05, + "loss": 0.4571, + "step": 16388 + }, + { + "epoch": 21.038510911424904, + "grad_norm": 3.82023286819458, + "learning_rate": 2.6323491655969194e-05, + "loss": 0.4289, + "step": 16389 + }, + { + "epoch": 21.039794608472402, + "grad_norm": 
3.469987630844116, + "learning_rate": 2.632306375695336e-05, + "loss": 0.4402, + "step": 16390 + }, + { + "epoch": 21.041078305519896, + "grad_norm": 1.9326114654541016, + "learning_rate": 2.6322635857937528e-05, + "loss": 0.4092, + "step": 16391 + }, + { + "epoch": 21.042362002567394, + "grad_norm": 1.0838184356689453, + "learning_rate": 2.6322207958921696e-05, + "loss": 0.4292, + "step": 16392 + }, + { + "epoch": 21.043645699614892, + "grad_norm": 1.732790470123291, + "learning_rate": 2.632178005990586e-05, + "loss": 0.4638, + "step": 16393 + }, + { + "epoch": 21.044929396662386, + "grad_norm": 0.9640170335769653, + "learning_rate": 2.6321352160890033e-05, + "loss": 0.4526, + "step": 16394 + }, + { + "epoch": 21.046213093709884, + "grad_norm": 0.8460679650306702, + "learning_rate": 2.6320924261874198e-05, + "loss": 0.4125, + "step": 16395 + }, + { + "epoch": 21.047496790757382, + "grad_norm": 11.615012168884277, + "learning_rate": 2.6320496362858366e-05, + "loss": 0.478, + "step": 16396 + }, + { + "epoch": 21.048780487804876, + "grad_norm": 2.5170600414276123, + "learning_rate": 2.6320068463842535e-05, + "loss": 0.4894, + "step": 16397 + }, + { + "epoch": 21.050064184852374, + "grad_norm": 1.6662800312042236, + "learning_rate": 2.63196405648267e-05, + "loss": 0.376, + "step": 16398 + }, + { + "epoch": 21.051347881899872, + "grad_norm": 1.706052541732788, + "learning_rate": 2.6319212665810868e-05, + "loss": 0.4598, + "step": 16399 + }, + { + "epoch": 21.05263157894737, + "grad_norm": 3.0473105907440186, + "learning_rate": 2.6318784766795036e-05, + "loss": 0.4012, + "step": 16400 + }, + { + "epoch": 21.053915275994864, + "grad_norm": 3.262378454208374, + "learning_rate": 2.6318356867779205e-05, + "loss": 0.4389, + "step": 16401 + }, + { + "epoch": 21.055198973042362, + "grad_norm": 2.2661385536193848, + "learning_rate": 2.6317928968763373e-05, + "loss": 0.513, + "step": 16402 + }, + { + "epoch": 21.05648267008986, + "grad_norm": 1.128143548965454, + 
"learning_rate": 2.6317501069747542e-05, + "loss": 0.4257, + "step": 16403 + }, + { + "epoch": 21.057766367137354, + "grad_norm": 1.7671360969543457, + "learning_rate": 2.6317073170731707e-05, + "loss": 0.4147, + "step": 16404 + }, + { + "epoch": 21.059050064184852, + "grad_norm": 2.004791498184204, + "learning_rate": 2.6316645271715875e-05, + "loss": 0.4609, + "step": 16405 + }, + { + "epoch": 21.06033376123235, + "grad_norm": 1.2611407041549683, + "learning_rate": 2.6316217372700044e-05, + "loss": 0.4816, + "step": 16406 + }, + { + "epoch": 21.061617458279844, + "grad_norm": 1.298413872718811, + "learning_rate": 2.631578947368421e-05, + "loss": 0.5081, + "step": 16407 + }, + { + "epoch": 21.062901155327342, + "grad_norm": 2.045797109603882, + "learning_rate": 2.631536157466838e-05, + "loss": 0.5585, + "step": 16408 + }, + { + "epoch": 21.06418485237484, + "grad_norm": 1.4952718019485474, + "learning_rate": 2.6314933675652545e-05, + "loss": 0.6101, + "step": 16409 + }, + { + "epoch": 21.065468549422338, + "grad_norm": 1.823035717010498, + "learning_rate": 2.6314505776636717e-05, + "loss": 0.4024, + "step": 16410 + }, + { + "epoch": 21.066752246469832, + "grad_norm": 1.2495698928833008, + "learning_rate": 2.6314077877620882e-05, + "loss": 0.3873, + "step": 16411 + }, + { + "epoch": 21.06803594351733, + "grad_norm": 1.1486361026763916, + "learning_rate": 2.6313649978605047e-05, + "loss": 0.3672, + "step": 16412 + }, + { + "epoch": 21.069319640564828, + "grad_norm": 1.4564926624298096, + "learning_rate": 2.631322207958922e-05, + "loss": 0.405, + "step": 16413 + }, + { + "epoch": 21.070603337612322, + "grad_norm": 1.32049560546875, + "learning_rate": 2.6312794180573384e-05, + "loss": 0.3884, + "step": 16414 + }, + { + "epoch": 21.07188703465982, + "grad_norm": 1.6208016872406006, + "learning_rate": 2.6312366281557552e-05, + "loss": 0.396, + "step": 16415 + }, + { + "epoch": 21.073170731707318, + "grad_norm": 0.9764462113380432, + "learning_rate": 
2.631193838254172e-05, + "loss": 0.4123, + "step": 16416 + }, + { + "epoch": 21.074454428754812, + "grad_norm": 1.4418666362762451, + "learning_rate": 2.631151048352589e-05, + "loss": 0.3794, + "step": 16417 + }, + { + "epoch": 21.07573812580231, + "grad_norm": 1.1032477617263794, + "learning_rate": 2.6311082584510058e-05, + "loss": 0.4145, + "step": 16418 + }, + { + "epoch": 21.077021822849808, + "grad_norm": 1.956810474395752, + "learning_rate": 2.6310654685494223e-05, + "loss": 0.4055, + "step": 16419 + }, + { + "epoch": 21.078305519897306, + "grad_norm": 1.7392297983169556, + "learning_rate": 2.631022678647839e-05, + "loss": 0.3812, + "step": 16420 + }, + { + "epoch": 21.0795892169448, + "grad_norm": 2.827179431915283, + "learning_rate": 2.630979888746256e-05, + "loss": 0.4683, + "step": 16421 + }, + { + "epoch": 21.080872913992298, + "grad_norm": 1.1356850862503052, + "learning_rate": 2.6309370988446728e-05, + "loss": 0.3751, + "step": 16422 + }, + { + "epoch": 21.082156611039796, + "grad_norm": 1.6799862384796143, + "learning_rate": 2.6308943089430893e-05, + "loss": 0.4186, + "step": 16423 + }, + { + "epoch": 21.08344030808729, + "grad_norm": 2.5517239570617676, + "learning_rate": 2.6308515190415065e-05, + "loss": 0.3692, + "step": 16424 + }, + { + "epoch": 21.084724005134788, + "grad_norm": 3.69635009765625, + "learning_rate": 2.630808729139923e-05, + "loss": 0.4117, + "step": 16425 + }, + { + "epoch": 21.086007702182286, + "grad_norm": 1.2952111959457397, + "learning_rate": 2.6307659392383398e-05, + "loss": 0.3924, + "step": 16426 + }, + { + "epoch": 21.08729139922978, + "grad_norm": 1.465052843093872, + "learning_rate": 2.6307231493367567e-05, + "loss": 0.4309, + "step": 16427 + }, + { + "epoch": 21.088575096277278, + "grad_norm": 1.4211019277572632, + "learning_rate": 2.630680359435173e-05, + "loss": 0.3804, + "step": 16428 + }, + { + "epoch": 21.089858793324776, + "grad_norm": 2.4649147987365723, + "learning_rate": 2.6306375695335903e-05, + "loss": 
0.4141, + "step": 16429 + }, + { + "epoch": 21.09114249037227, + "grad_norm": 1.0525906085968018, + "learning_rate": 2.630594779632007e-05, + "loss": 0.411, + "step": 16430 + }, + { + "epoch": 21.09242618741977, + "grad_norm": 2.5041379928588867, + "learning_rate": 2.6305519897304237e-05, + "loss": 0.3933, + "step": 16431 + }, + { + "epoch": 21.093709884467266, + "grad_norm": 1.7598421573638916, + "learning_rate": 2.6305091998288405e-05, + "loss": 0.367, + "step": 16432 + }, + { + "epoch": 21.094993581514764, + "grad_norm": 1.5465824604034424, + "learning_rate": 2.630466409927257e-05, + "loss": 0.4273, + "step": 16433 + }, + { + "epoch": 21.09627727856226, + "grad_norm": 1.5656007528305054, + "learning_rate": 2.6304236200256742e-05, + "loss": 0.3975, + "step": 16434 + }, + { + "epoch": 21.097560975609756, + "grad_norm": 1.0571749210357666, + "learning_rate": 2.6303808301240907e-05, + "loss": 0.4118, + "step": 16435 + }, + { + "epoch": 21.098844672657254, + "grad_norm": 1.3487238883972168, + "learning_rate": 2.6303380402225076e-05, + "loss": 0.4122, + "step": 16436 + }, + { + "epoch": 21.10012836970475, + "grad_norm": 1.7371100187301636, + "learning_rate": 2.6302952503209244e-05, + "loss": 0.4036, + "step": 16437 + }, + { + "epoch": 21.101412066752246, + "grad_norm": 1.6201311349868774, + "learning_rate": 2.6302524604193412e-05, + "loss": 0.4425, + "step": 16438 + }, + { + "epoch": 21.102695763799744, + "grad_norm": 1.2235608100891113, + "learning_rate": 2.6302096705177577e-05, + "loss": 0.3893, + "step": 16439 + }, + { + "epoch": 21.10397946084724, + "grad_norm": 1.5746294260025024, + "learning_rate": 2.6301668806161746e-05, + "loss": 0.4243, + "step": 16440 + }, + { + "epoch": 21.105263157894736, + "grad_norm": 2.4384686946868896, + "learning_rate": 2.6301240907145914e-05, + "loss": 0.4009, + "step": 16441 + }, + { + "epoch": 21.106546854942234, + "grad_norm": 2.4845709800720215, + "learning_rate": 2.6300813008130083e-05, + "loss": 0.4594, + "step": 16442 + }, + { 
+ "epoch": 21.107830551989732, + "grad_norm": 1.3472044467926025, + "learning_rate": 2.630038510911425e-05, + "loss": 0.4416, + "step": 16443 + }, + { + "epoch": 21.109114249037226, + "grad_norm": 1.1201163530349731, + "learning_rate": 2.6299957210098416e-05, + "loss": 0.4345, + "step": 16444 + }, + { + "epoch": 21.110397946084724, + "grad_norm": 1.527994990348816, + "learning_rate": 2.6299529311082588e-05, + "loss": 0.4618, + "step": 16445 + }, + { + "epoch": 21.111681643132222, + "grad_norm": 1.7094478607177734, + "learning_rate": 2.6299101412066753e-05, + "loss": 0.426, + "step": 16446 + }, + { + "epoch": 21.112965340179716, + "grad_norm": 1.8375539779663086, + "learning_rate": 2.6298673513050918e-05, + "loss": 0.4307, + "step": 16447 + }, + { + "epoch": 21.114249037227214, + "grad_norm": 5.484838485717773, + "learning_rate": 2.629824561403509e-05, + "loss": 0.4061, + "step": 16448 + }, + { + "epoch": 21.115532734274712, + "grad_norm": 3.2267510890960693, + "learning_rate": 2.6297817715019255e-05, + "loss": 0.4344, + "step": 16449 + }, + { + "epoch": 21.116816431322206, + "grad_norm": 1.5960156917572021, + "learning_rate": 2.6297389816003427e-05, + "loss": 0.537, + "step": 16450 + }, + { + "epoch": 21.118100128369704, + "grad_norm": 1.7225263118743896, + "learning_rate": 2.629696191698759e-05, + "loss": 0.4545, + "step": 16451 + }, + { + "epoch": 21.119383825417202, + "grad_norm": 2.039480447769165, + "learning_rate": 2.629653401797176e-05, + "loss": 0.4464, + "step": 16452 + }, + { + "epoch": 21.1206675224647, + "grad_norm": 1.2467206716537476, + "learning_rate": 2.629610611895593e-05, + "loss": 0.4421, + "step": 16453 + }, + { + "epoch": 21.121951219512194, + "grad_norm": 1.9829109907150269, + "learning_rate": 2.6295678219940093e-05, + "loss": 0.4739, + "step": 16454 + }, + { + "epoch": 21.123234916559692, + "grad_norm": 1.4469610452651978, + "learning_rate": 2.6295250320924262e-05, + "loss": 0.4671, + "step": 16455 + }, + { + "epoch": 21.12451861360719, + 
"grad_norm": 2.7006447315216064, + "learning_rate": 2.629482242190843e-05, + "loss": 0.4596, + "step": 16456 + }, + { + "epoch": 21.125802310654684, + "grad_norm": 1.837628722190857, + "learning_rate": 2.62943945228926e-05, + "loss": 0.5439, + "step": 16457 + }, + { + "epoch": 21.127086007702182, + "grad_norm": 2.1803507804870605, + "learning_rate": 2.6293966623876767e-05, + "loss": 0.5311, + "step": 16458 + }, + { + "epoch": 21.12836970474968, + "grad_norm": 2.384817123413086, + "learning_rate": 2.6293538724860932e-05, + "loss": 0.6427, + "step": 16459 + }, + { + "epoch": 21.129653401797174, + "grad_norm": 2.0185186862945557, + "learning_rate": 2.62931108258451e-05, + "loss": 0.3888, + "step": 16460 + }, + { + "epoch": 21.130937098844672, + "grad_norm": 3.624497413635254, + "learning_rate": 2.629268292682927e-05, + "loss": 0.4047, + "step": 16461 + }, + { + "epoch": 21.13222079589217, + "grad_norm": 2.530021905899048, + "learning_rate": 2.6292255027813437e-05, + "loss": 0.388, + "step": 16462 + }, + { + "epoch": 21.133504492939664, + "grad_norm": 2.325568914413452, + "learning_rate": 2.6291827128797602e-05, + "loss": 0.3747, + "step": 16463 + }, + { + "epoch": 21.134788189987162, + "grad_norm": 1.7530444860458374, + "learning_rate": 2.6291399229781774e-05, + "loss": 0.4116, + "step": 16464 + }, + { + "epoch": 21.13607188703466, + "grad_norm": 1.4841628074645996, + "learning_rate": 2.629097133076594e-05, + "loss": 0.3887, + "step": 16465 + }, + { + "epoch": 21.137355584082158, + "grad_norm": 0.9952470660209656, + "learning_rate": 2.6290543431750108e-05, + "loss": 0.4207, + "step": 16466 + }, + { + "epoch": 21.138639281129652, + "grad_norm": 1.4158152341842651, + "learning_rate": 2.6290115532734276e-05, + "loss": 0.411, + "step": 16467 + }, + { + "epoch": 21.13992297817715, + "grad_norm": 1.0972225666046143, + "learning_rate": 2.628968763371844e-05, + "loss": 0.3952, + "step": 16468 + }, + { + "epoch": 21.141206675224648, + "grad_norm": 1.2575743198394775, + 
"learning_rate": 2.6289259734702613e-05, + "loss": 0.3839, + "step": 16469 + }, + { + "epoch": 21.142490372272142, + "grad_norm": 1.387779712677002, + "learning_rate": 2.6288831835686778e-05, + "loss": 0.3745, + "step": 16470 + }, + { + "epoch": 21.14377406931964, + "grad_norm": 2.168388843536377, + "learning_rate": 2.6288403936670946e-05, + "loss": 0.4194, + "step": 16471 + }, + { + "epoch": 21.145057766367138, + "grad_norm": 1.4692806005477905, + "learning_rate": 2.6287976037655115e-05, + "loss": 0.4086, + "step": 16472 + }, + { + "epoch": 21.146341463414632, + "grad_norm": 3.6077821254730225, + "learning_rate": 2.628754813863928e-05, + "loss": 0.3978, + "step": 16473 + }, + { + "epoch": 21.14762516046213, + "grad_norm": 1.435011863708496, + "learning_rate": 2.628712023962345e-05, + "loss": 0.4092, + "step": 16474 + }, + { + "epoch": 21.14890885750963, + "grad_norm": 1.450985312461853, + "learning_rate": 2.6286692340607617e-05, + "loss": 0.3806, + "step": 16475 + }, + { + "epoch": 21.150192554557126, + "grad_norm": 1.8686522245407104, + "learning_rate": 2.6286264441591785e-05, + "loss": 0.4289, + "step": 16476 + }, + { + "epoch": 21.15147625160462, + "grad_norm": 1.207474946975708, + "learning_rate": 2.6285836542575953e-05, + "loss": 0.3976, + "step": 16477 + }, + { + "epoch": 21.15275994865212, + "grad_norm": 1.8733898401260376, + "learning_rate": 2.6285408643560122e-05, + "loss": 0.4291, + "step": 16478 + }, + { + "epoch": 21.154043645699616, + "grad_norm": 1.266768455505371, + "learning_rate": 2.6284980744544287e-05, + "loss": 0.399, + "step": 16479 + }, + { + "epoch": 21.15532734274711, + "grad_norm": 1.4221349954605103, + "learning_rate": 2.6284552845528455e-05, + "loss": 0.4296, + "step": 16480 + }, + { + "epoch": 21.15661103979461, + "grad_norm": 1.3090415000915527, + "learning_rate": 2.6284124946512624e-05, + "loss": 0.3844, + "step": 16481 + }, + { + "epoch": 21.157894736842106, + "grad_norm": 1.2611329555511475, + "learning_rate": 
2.6283697047496792e-05, + "loss": 0.4112, + "step": 16482 + }, + { + "epoch": 21.1591784338896, + "grad_norm": 2.382617950439453, + "learning_rate": 2.628326914848096e-05, + "loss": 0.4116, + "step": 16483 + }, + { + "epoch": 21.1604621309371, + "grad_norm": 1.486006498336792, + "learning_rate": 2.6282841249465125e-05, + "loss": 0.4192, + "step": 16484 + }, + { + "epoch": 21.161745827984596, + "grad_norm": 1.77679443359375, + "learning_rate": 2.6282413350449297e-05, + "loss": 0.4323, + "step": 16485 + }, + { + "epoch": 21.163029525032094, + "grad_norm": 1.042112946510315, + "learning_rate": 2.6281985451433462e-05, + "loss": 0.4056, + "step": 16486 + }, + { + "epoch": 21.16431322207959, + "grad_norm": 1.7030179500579834, + "learning_rate": 2.6281557552417627e-05, + "loss": 0.3912, + "step": 16487 + }, + { + "epoch": 21.165596919127086, + "grad_norm": 0.974872350692749, + "learning_rate": 2.62811296534018e-05, + "loss": 0.3824, + "step": 16488 + }, + { + "epoch": 21.166880616174584, + "grad_norm": 1.883025884628296, + "learning_rate": 2.6280701754385964e-05, + "loss": 0.4236, + "step": 16489 + }, + { + "epoch": 21.16816431322208, + "grad_norm": 2.1477694511413574, + "learning_rate": 2.6280273855370136e-05, + "loss": 0.4095, + "step": 16490 + }, + { + "epoch": 21.169448010269576, + "grad_norm": 1.871588110923767, + "learning_rate": 2.62798459563543e-05, + "loss": 0.4051, + "step": 16491 + }, + { + "epoch": 21.170731707317074, + "grad_norm": 1.6993367671966553, + "learning_rate": 2.627941805733847e-05, + "loss": 0.4532, + "step": 16492 + }, + { + "epoch": 21.17201540436457, + "grad_norm": 3.9185643196105957, + "learning_rate": 2.6278990158322638e-05, + "loss": 0.4438, + "step": 16493 + }, + { + "epoch": 21.173299101412066, + "grad_norm": 1.5180224180221558, + "learning_rate": 2.6278562259306803e-05, + "loss": 0.4418, + "step": 16494 + }, + { + "epoch": 21.174582798459564, + "grad_norm": 1.4048799276351929, + "learning_rate": 2.627813436029097e-05, + "loss": 0.4553, + 
"step": 16495 + }, + { + "epoch": 21.17586649550706, + "grad_norm": 4.5275068283081055, + "learning_rate": 2.627770646127514e-05, + "loss": 0.4069, + "step": 16496 + }, + { + "epoch": 21.177150192554556, + "grad_norm": 1.959061622619629, + "learning_rate": 2.6277278562259308e-05, + "loss": 0.3961, + "step": 16497 + }, + { + "epoch": 21.178433889602054, + "grad_norm": 2.339585542678833, + "learning_rate": 2.6276850663243476e-05, + "loss": 0.4392, + "step": 16498 + }, + { + "epoch": 21.179717586649552, + "grad_norm": 2.3387045860290527, + "learning_rate": 2.6276422764227645e-05, + "loss": 0.4709, + "step": 16499 + }, + { + "epoch": 21.181001283697046, + "grad_norm": 2.199331521987915, + "learning_rate": 2.627599486521181e-05, + "loss": 0.4523, + "step": 16500 + }, + { + "epoch": 21.182284980744544, + "grad_norm": 4.254043102264404, + "learning_rate": 2.6275566966195978e-05, + "loss": 0.4392, + "step": 16501 + }, + { + "epoch": 21.183568677792042, + "grad_norm": 6.9012956619262695, + "learning_rate": 2.6275139067180147e-05, + "loss": 0.4498, + "step": 16502 + }, + { + "epoch": 21.184852374839537, + "grad_norm": 3.354750156402588, + "learning_rate": 2.6274711168164312e-05, + "loss": 0.4141, + "step": 16503 + }, + { + "epoch": 21.186136071887034, + "grad_norm": 3.285055160522461, + "learning_rate": 2.6274283269148484e-05, + "loss": 0.4901, + "step": 16504 + }, + { + "epoch": 21.187419768934532, + "grad_norm": 2.4641098976135254, + "learning_rate": 2.627385537013265e-05, + "loss": 0.4621, + "step": 16505 + }, + { + "epoch": 21.188703465982027, + "grad_norm": 2.2649927139282227, + "learning_rate": 2.627342747111682e-05, + "loss": 0.4401, + "step": 16506 + }, + { + "epoch": 21.189987163029524, + "grad_norm": 1.4294265508651733, + "learning_rate": 2.6272999572100985e-05, + "loss": 0.4467, + "step": 16507 + }, + { + "epoch": 21.191270860077022, + "grad_norm": 1.7348204851150513, + "learning_rate": 2.627257167308515e-05, + "loss": 0.5188, + "step": 16508 + }, + { + "epoch": 
21.19255455712452, + "grad_norm": 4.796988010406494, + "learning_rate": 2.6272143774069322e-05, + "loss": 0.6811, + "step": 16509 + }, + { + "epoch": 21.193838254172015, + "grad_norm": 1.7715988159179688, + "learning_rate": 2.6271715875053487e-05, + "loss": 0.3771, + "step": 16510 + }, + { + "epoch": 21.195121951219512, + "grad_norm": 3.5479071140289307, + "learning_rate": 2.6271287976037656e-05, + "loss": 0.3801, + "step": 16511 + }, + { + "epoch": 21.19640564826701, + "grad_norm": 1.4519169330596924, + "learning_rate": 2.6270860077021824e-05, + "loss": 0.4047, + "step": 16512 + }, + { + "epoch": 21.197689345314505, + "grad_norm": 1.8511266708374023, + "learning_rate": 2.6270432178005992e-05, + "loss": 0.3792, + "step": 16513 + }, + { + "epoch": 21.198973042362002, + "grad_norm": 1.3118706941604614, + "learning_rate": 2.6270004278990157e-05, + "loss": 0.4069, + "step": 16514 + }, + { + "epoch": 21.2002567394095, + "grad_norm": 1.2135330438613892, + "learning_rate": 2.6269576379974326e-05, + "loss": 0.418, + "step": 16515 + }, + { + "epoch": 21.201540436456995, + "grad_norm": 1.381008505821228, + "learning_rate": 2.6269148480958494e-05, + "loss": 0.3823, + "step": 16516 + }, + { + "epoch": 21.202824133504492, + "grad_norm": 2.056180953979492, + "learning_rate": 2.6268720581942663e-05, + "loss": 0.4102, + "step": 16517 + }, + { + "epoch": 21.20410783055199, + "grad_norm": 1.2780424356460571, + "learning_rate": 2.626829268292683e-05, + "loss": 0.3928, + "step": 16518 + }, + { + "epoch": 21.205391527599488, + "grad_norm": 2.6656391620635986, + "learning_rate": 2.6267864783910996e-05, + "loss": 0.4135, + "step": 16519 + }, + { + "epoch": 21.206675224646983, + "grad_norm": 1.0527911186218262, + "learning_rate": 2.6267436884895165e-05, + "loss": 0.4172, + "step": 16520 + }, + { + "epoch": 21.20795892169448, + "grad_norm": 1.1100893020629883, + "learning_rate": 2.6267008985879333e-05, + "loss": 0.4505, + "step": 16521 + }, + { + "epoch": 21.20924261874198, + "grad_norm": 
2.040567636489868, + "learning_rate": 2.6266581086863498e-05, + "loss": 0.3974, + "step": 16522 + }, + { + "epoch": 21.210526315789473, + "grad_norm": 1.4994665384292603, + "learning_rate": 2.626615318784767e-05, + "loss": 0.3942, + "step": 16523 + }, + { + "epoch": 21.21181001283697, + "grad_norm": 1.2604953050613403, + "learning_rate": 2.6265725288831835e-05, + "loss": 0.4243, + "step": 16524 + }, + { + "epoch": 21.21309370988447, + "grad_norm": 4.197391510009766, + "learning_rate": 2.6265297389816007e-05, + "loss": 0.4132, + "step": 16525 + }, + { + "epoch": 21.214377406931963, + "grad_norm": 1.518970012664795, + "learning_rate": 2.626486949080017e-05, + "loss": 0.4073, + "step": 16526 + }, + { + "epoch": 21.21566110397946, + "grad_norm": 1.0807191133499146, + "learning_rate": 2.6264441591784337e-05, + "loss": 0.3801, + "step": 16527 + }, + { + "epoch": 21.21694480102696, + "grad_norm": 0.9976283311843872, + "learning_rate": 2.626401369276851e-05, + "loss": 0.397, + "step": 16528 + }, + { + "epoch": 21.218228498074453, + "grad_norm": 3.558159351348877, + "learning_rate": 2.6263585793752673e-05, + "loss": 0.3904, + "step": 16529 + }, + { + "epoch": 21.21951219512195, + "grad_norm": 1.632846713066101, + "learning_rate": 2.6263157894736842e-05, + "loss": 0.4051, + "step": 16530 + }, + { + "epoch": 21.22079589216945, + "grad_norm": 3.222503900527954, + "learning_rate": 2.626272999572101e-05, + "loss": 0.4113, + "step": 16531 + }, + { + "epoch": 21.222079589216946, + "grad_norm": 4.223008155822754, + "learning_rate": 2.626230209670518e-05, + "loss": 0.4386, + "step": 16532 + }, + { + "epoch": 21.22336328626444, + "grad_norm": 1.954957365989685, + "learning_rate": 2.6261874197689347e-05, + "loss": 0.4022, + "step": 16533 + }, + { + "epoch": 21.22464698331194, + "grad_norm": 1.2396585941314697, + "learning_rate": 2.6261446298673512e-05, + "loss": 0.4138, + "step": 16534 + }, + { + "epoch": 21.225930680359436, + "grad_norm": 1.588931918144226, + "learning_rate": 
2.626101839965768e-05, + "loss": 0.43, + "step": 16535 + }, + { + "epoch": 21.22721437740693, + "grad_norm": 13.768827438354492, + "learning_rate": 2.626059050064185e-05, + "loss": 0.4222, + "step": 16536 + }, + { + "epoch": 21.22849807445443, + "grad_norm": 3.043006420135498, + "learning_rate": 2.6260162601626017e-05, + "loss": 0.401, + "step": 16537 + }, + { + "epoch": 21.229781771501926, + "grad_norm": 1.3705121278762817, + "learning_rate": 2.6259734702610182e-05, + "loss": 0.3969, + "step": 16538 + }, + { + "epoch": 21.23106546854942, + "grad_norm": 1.4839142560958862, + "learning_rate": 2.6259306803594354e-05, + "loss": 0.4288, + "step": 16539 + }, + { + "epoch": 21.23234916559692, + "grad_norm": 2.069955587387085, + "learning_rate": 2.625887890457852e-05, + "loss": 0.4002, + "step": 16540 + }, + { + "epoch": 21.233632862644416, + "grad_norm": 2.423088788986206, + "learning_rate": 2.6258451005562688e-05, + "loss": 0.423, + "step": 16541 + }, + { + "epoch": 21.234916559691914, + "grad_norm": 3.291917085647583, + "learning_rate": 2.6258023106546856e-05, + "loss": 0.4092, + "step": 16542 + }, + { + "epoch": 21.23620025673941, + "grad_norm": 3.8094680309295654, + "learning_rate": 2.625759520753102e-05, + "loss": 0.4358, + "step": 16543 + }, + { + "epoch": 21.237483953786906, + "grad_norm": 1.4725288152694702, + "learning_rate": 2.6257167308515193e-05, + "loss": 0.4393, + "step": 16544 + }, + { + "epoch": 21.238767650834404, + "grad_norm": 2.790796995162964, + "learning_rate": 2.6256739409499358e-05, + "loss": 0.4051, + "step": 16545 + }, + { + "epoch": 21.2400513478819, + "grad_norm": 1.5937215089797974, + "learning_rate": 2.6256311510483526e-05, + "loss": 0.4693, + "step": 16546 + }, + { + "epoch": 21.241335044929397, + "grad_norm": 3.665637731552124, + "learning_rate": 2.6255883611467695e-05, + "loss": 0.4544, + "step": 16547 + }, + { + "epoch": 21.242618741976894, + "grad_norm": 2.3552629947662354, + "learning_rate": 2.625545571245186e-05, + "loss": 0.4228, + 
"step": 16548 + }, + { + "epoch": 21.24390243902439, + "grad_norm": 1.5517228841781616, + "learning_rate": 2.625502781343603e-05, + "loss": 0.4012, + "step": 16549 + }, + { + "epoch": 21.245186136071887, + "grad_norm": 1.7122199535369873, + "learning_rate": 2.6254599914420197e-05, + "loss": 0.4767, + "step": 16550 + }, + { + "epoch": 21.246469833119384, + "grad_norm": 2.9372756481170654, + "learning_rate": 2.6254172015404365e-05, + "loss": 0.4377, + "step": 16551 + }, + { + "epoch": 21.247753530166882, + "grad_norm": 1.833579421043396, + "learning_rate": 2.6253744116388533e-05, + "loss": 0.4484, + "step": 16552 + }, + { + "epoch": 21.249037227214377, + "grad_norm": 13.555416107177734, + "learning_rate": 2.6253316217372702e-05, + "loss": 0.3932, + "step": 16553 + }, + { + "epoch": 21.250320924261874, + "grad_norm": 7.754823684692383, + "learning_rate": 2.6252888318356867e-05, + "loss": 0.4943, + "step": 16554 + }, + { + "epoch": 21.251604621309372, + "grad_norm": 1.2649719715118408, + "learning_rate": 2.6252460419341035e-05, + "loss": 0.454, + "step": 16555 + }, + { + "epoch": 21.252888318356867, + "grad_norm": 2.2875640392303467, + "learning_rate": 2.6252032520325204e-05, + "loss": 0.4541, + "step": 16556 + }, + { + "epoch": 21.254172015404365, + "grad_norm": 1.9931541681289673, + "learning_rate": 2.6251604621309372e-05, + "loss": 0.536, + "step": 16557 + }, + { + "epoch": 21.255455712451862, + "grad_norm": 12.453817367553711, + "learning_rate": 2.625117672229354e-05, + "loss": 0.5502, + "step": 16558 + }, + { + "epoch": 21.256739409499357, + "grad_norm": 5.6181840896606445, + "learning_rate": 2.6250748823277706e-05, + "loss": 0.579, + "step": 16559 + }, + { + "epoch": 21.258023106546855, + "grad_norm": 1.0850623846054077, + "learning_rate": 2.6250320924261877e-05, + "loss": 0.3721, + "step": 16560 + }, + { + "epoch": 21.259306803594352, + "grad_norm": 1.0418810844421387, + "learning_rate": 2.6249893025246042e-05, + "loss": 0.4338, + "step": 16561 + }, + { + 
"epoch": 21.260590500641847, + "grad_norm": 1.4157310724258423, + "learning_rate": 2.6249465126230207e-05, + "loss": 0.4122, + "step": 16562 + }, + { + "epoch": 21.261874197689345, + "grad_norm": 2.565786361694336, + "learning_rate": 2.624903722721438e-05, + "loss": 0.4183, + "step": 16563 + }, + { + "epoch": 21.263157894736842, + "grad_norm": 3.4764456748962402, + "learning_rate": 2.6248609328198544e-05, + "loss": 0.4125, + "step": 16564 + }, + { + "epoch": 21.26444159178434, + "grad_norm": 0.9647695422172546, + "learning_rate": 2.6248181429182716e-05, + "loss": 0.4093, + "step": 16565 + }, + { + "epoch": 21.265725288831835, + "grad_norm": 1.3111803531646729, + "learning_rate": 2.624775353016688e-05, + "loss": 0.3818, + "step": 16566 + }, + { + "epoch": 21.267008985879333, + "grad_norm": 1.3695276975631714, + "learning_rate": 2.624732563115105e-05, + "loss": 0.3924, + "step": 16567 + }, + { + "epoch": 21.26829268292683, + "grad_norm": 1.717341661453247, + "learning_rate": 2.6246897732135218e-05, + "loss": 0.4167, + "step": 16568 + }, + { + "epoch": 21.269576379974325, + "grad_norm": 1.5696558952331543, + "learning_rate": 2.6246469833119383e-05, + "loss": 0.4257, + "step": 16569 + }, + { + "epoch": 21.270860077021823, + "grad_norm": 1.769134759902954, + "learning_rate": 2.624604193410355e-05, + "loss": 0.4271, + "step": 16570 + }, + { + "epoch": 21.27214377406932, + "grad_norm": 2.9806528091430664, + "learning_rate": 2.624561403508772e-05, + "loss": 0.429, + "step": 16571 + }, + { + "epoch": 21.273427471116815, + "grad_norm": 2.726956605911255, + "learning_rate": 2.6245186136071888e-05, + "loss": 0.4084, + "step": 16572 + }, + { + "epoch": 21.274711168164313, + "grad_norm": 1.636573076248169, + "learning_rate": 2.6244758237056057e-05, + "loss": 0.4128, + "step": 16573 + }, + { + "epoch": 21.27599486521181, + "grad_norm": 1.9228698015213013, + "learning_rate": 2.6244330338040225e-05, + "loss": 0.3924, + "step": 16574 + }, + { + "epoch": 21.27727856225931, + 
"grad_norm": 1.772202491760254, + "learning_rate": 2.624390243902439e-05, + "loss": 0.408, + "step": 16575 + }, + { + "epoch": 21.278562259306803, + "grad_norm": 2.0119874477386475, + "learning_rate": 2.624347454000856e-05, + "loss": 0.4222, + "step": 16576 + }, + { + "epoch": 21.2798459563543, + "grad_norm": 2.3809475898742676, + "learning_rate": 2.6243046640992727e-05, + "loss": 0.4132, + "step": 16577 + }, + { + "epoch": 21.2811296534018, + "grad_norm": 1.1492966413497925, + "learning_rate": 2.6242618741976892e-05, + "loss": 0.4053, + "step": 16578 + }, + { + "epoch": 21.282413350449293, + "grad_norm": 2.9053027629852295, + "learning_rate": 2.6242190842961064e-05, + "loss": 0.4013, + "step": 16579 + }, + { + "epoch": 21.28369704749679, + "grad_norm": 1.6956590414047241, + "learning_rate": 2.624176294394523e-05, + "loss": 0.4173, + "step": 16580 + }, + { + "epoch": 21.28498074454429, + "grad_norm": 1.975951910018921, + "learning_rate": 2.62413350449294e-05, + "loss": 0.4686, + "step": 16581 + }, + { + "epoch": 21.286264441591783, + "grad_norm": 0.9623923301696777, + "learning_rate": 2.6240907145913565e-05, + "loss": 0.4017, + "step": 16582 + }, + { + "epoch": 21.28754813863928, + "grad_norm": 4.679821491241455, + "learning_rate": 2.624047924689773e-05, + "loss": 0.3622, + "step": 16583 + }, + { + "epoch": 21.28883183568678, + "grad_norm": 5.339022636413574, + "learning_rate": 2.6240051347881902e-05, + "loss": 0.397, + "step": 16584 + }, + { + "epoch": 21.290115532734276, + "grad_norm": 3.8638157844543457, + "learning_rate": 2.6239623448866067e-05, + "loss": 0.4045, + "step": 16585 + }, + { + "epoch": 21.29139922978177, + "grad_norm": 2.0864686965942383, + "learning_rate": 2.6239195549850236e-05, + "loss": 0.4271, + "step": 16586 + }, + { + "epoch": 21.29268292682927, + "grad_norm": 2.109930992126465, + "learning_rate": 2.6238767650834404e-05, + "loss": 0.4219, + "step": 16587 + }, + { + "epoch": 21.293966623876766, + "grad_norm": 1.3042621612548828, + 
"learning_rate": 2.623833975181857e-05, + "loss": 0.4431, + "step": 16588 + }, + { + "epoch": 21.29525032092426, + "grad_norm": 5.228926181793213, + "learning_rate": 2.623791185280274e-05, + "loss": 0.4259, + "step": 16589 + }, + { + "epoch": 21.29653401797176, + "grad_norm": 2.3648438453674316, + "learning_rate": 2.6237483953786906e-05, + "loss": 0.448, + "step": 16590 + }, + { + "epoch": 21.297817715019256, + "grad_norm": 2.2232112884521484, + "learning_rate": 2.6237056054771074e-05, + "loss": 0.4541, + "step": 16591 + }, + { + "epoch": 21.29910141206675, + "grad_norm": 4.93815803527832, + "learning_rate": 2.6236628155755243e-05, + "loss": 0.4117, + "step": 16592 + }, + { + "epoch": 21.30038510911425, + "grad_norm": 1.9704517126083374, + "learning_rate": 2.623620025673941e-05, + "loss": 0.4235, + "step": 16593 + }, + { + "epoch": 21.301668806161747, + "grad_norm": 2.4519736766815186, + "learning_rate": 2.6235772357723576e-05, + "loss": 0.4028, + "step": 16594 + }, + { + "epoch": 21.30295250320924, + "grad_norm": 1.7003822326660156, + "learning_rate": 2.6235344458707745e-05, + "loss": 0.413, + "step": 16595 + }, + { + "epoch": 21.30423620025674, + "grad_norm": 1.3328624963760376, + "learning_rate": 2.6234916559691913e-05, + "loss": 0.4683, + "step": 16596 + }, + { + "epoch": 21.305519897304237, + "grad_norm": 1.4380775690078735, + "learning_rate": 2.623448866067608e-05, + "loss": 0.4217, + "step": 16597 + }, + { + "epoch": 21.306803594351734, + "grad_norm": 1.6239900588989258, + "learning_rate": 2.623406076166025e-05, + "loss": 0.4137, + "step": 16598 + }, + { + "epoch": 21.30808729139923, + "grad_norm": 1.196203351020813, + "learning_rate": 2.6233632862644415e-05, + "loss": 0.4462, + "step": 16599 + }, + { + "epoch": 21.309370988446727, + "grad_norm": 1.5790623426437378, + "learning_rate": 2.6233204963628587e-05, + "loss": 0.3929, + "step": 16600 + }, + { + "epoch": 21.310654685494224, + "grad_norm": 1.5695583820343018, + "learning_rate": 2.6232777064612752e-05, 
+ "loss": 0.4523, + "step": 16601 + }, + { + "epoch": 21.31193838254172, + "grad_norm": 1.4646743535995483, + "learning_rate": 2.6232349165596917e-05, + "loss": 0.469, + "step": 16602 + }, + { + "epoch": 21.313222079589217, + "grad_norm": 1.1782739162445068, + "learning_rate": 2.623192126658109e-05, + "loss": 0.4303, + "step": 16603 + }, + { + "epoch": 21.314505776636715, + "grad_norm": 2.511206865310669, + "learning_rate": 2.6231493367565254e-05, + "loss": 0.4212, + "step": 16604 + }, + { + "epoch": 21.31578947368421, + "grad_norm": 3.032994270324707, + "learning_rate": 2.6231065468549425e-05, + "loss": 0.4761, + "step": 16605 + }, + { + "epoch": 21.317073170731707, + "grad_norm": 2.2561817169189453, + "learning_rate": 2.623063756953359e-05, + "loss": 0.511, + "step": 16606 + }, + { + "epoch": 21.318356867779205, + "grad_norm": 3.0628182888031006, + "learning_rate": 2.623020967051776e-05, + "loss": 0.5262, + "step": 16607 + }, + { + "epoch": 21.319640564826702, + "grad_norm": 1.8357975482940674, + "learning_rate": 2.6229781771501927e-05, + "loss": 0.5002, + "step": 16608 + }, + { + "epoch": 21.320924261874197, + "grad_norm": 2.0564448833465576, + "learning_rate": 2.6229353872486092e-05, + "loss": 0.6118, + "step": 16609 + }, + { + "epoch": 21.322207958921695, + "grad_norm": 1.7710427045822144, + "learning_rate": 2.622892597347026e-05, + "loss": 0.3539, + "step": 16610 + }, + { + "epoch": 21.323491655969192, + "grad_norm": 1.9626573324203491, + "learning_rate": 2.622849807445443e-05, + "loss": 0.3965, + "step": 16611 + }, + { + "epoch": 21.324775353016687, + "grad_norm": 1.5273679494857788, + "learning_rate": 2.6228070175438597e-05, + "loss": 0.4002, + "step": 16612 + }, + { + "epoch": 21.326059050064185, + "grad_norm": 0.8605750203132629, + "learning_rate": 2.6227642276422766e-05, + "loss": 0.4117, + "step": 16613 + }, + { + "epoch": 21.327342747111683, + "grad_norm": 2.6950812339782715, + "learning_rate": 2.6227214377406934e-05, + "loss": 0.4092, + "step": 16614 
+ }, + { + "epoch": 21.328626444159177, + "grad_norm": 2.545767068862915, + "learning_rate": 2.62267864783911e-05, + "loss": 0.3859, + "step": 16615 + }, + { + "epoch": 21.329910141206675, + "grad_norm": 1.3883726596832275, + "learning_rate": 2.6226358579375268e-05, + "loss": 0.4228, + "step": 16616 + }, + { + "epoch": 21.331193838254173, + "grad_norm": 2.0698375701904297, + "learning_rate": 2.6225930680359436e-05, + "loss": 0.3674, + "step": 16617 + }, + { + "epoch": 21.33247753530167, + "grad_norm": 1.315962314605713, + "learning_rate": 2.62255027813436e-05, + "loss": 0.3997, + "step": 16618 + }, + { + "epoch": 21.333761232349165, + "grad_norm": 1.119879126548767, + "learning_rate": 2.6225074882327773e-05, + "loss": 0.4142, + "step": 16619 + }, + { + "epoch": 21.335044929396663, + "grad_norm": 5.374594688415527, + "learning_rate": 2.6224646983311938e-05, + "loss": 0.4081, + "step": 16620 + }, + { + "epoch": 21.33632862644416, + "grad_norm": 1.347673773765564, + "learning_rate": 2.622421908429611e-05, + "loss": 0.4363, + "step": 16621 + }, + { + "epoch": 21.337612323491655, + "grad_norm": 0.9981092214584351, + "learning_rate": 2.6223791185280275e-05, + "loss": 0.3827, + "step": 16622 + }, + { + "epoch": 21.338896020539153, + "grad_norm": 2.8681671619415283, + "learning_rate": 2.622336328626444e-05, + "loss": 0.4091, + "step": 16623 + }, + { + "epoch": 21.34017971758665, + "grad_norm": 2.9499828815460205, + "learning_rate": 2.622293538724861e-05, + "loss": 0.4032, + "step": 16624 + }, + { + "epoch": 21.341463414634145, + "grad_norm": 1.4408015012741089, + "learning_rate": 2.6222507488232777e-05, + "loss": 0.4281, + "step": 16625 + }, + { + "epoch": 21.342747111681643, + "grad_norm": 1.1290419101715088, + "learning_rate": 2.6222079589216945e-05, + "loss": 0.4349, + "step": 16626 + }, + { + "epoch": 21.34403080872914, + "grad_norm": 1.3134957551956177, + "learning_rate": 2.6221651690201113e-05, + "loss": 0.4051, + "step": 16627 + }, + { + "epoch": 21.345314505776635, 
+ "grad_norm": 2.6467251777648926, + "learning_rate": 2.6221223791185282e-05, + "loss": 0.4106, + "step": 16628 + }, + { + "epoch": 21.346598202824133, + "grad_norm": 1.8039405345916748, + "learning_rate": 2.622079589216945e-05, + "loss": 0.422, + "step": 16629 + }, + { + "epoch": 21.34788189987163, + "grad_norm": 1.8177698850631714, + "learning_rate": 2.6220367993153615e-05, + "loss": 0.4279, + "step": 16630 + }, + { + "epoch": 21.34916559691913, + "grad_norm": 4.001677989959717, + "learning_rate": 2.6219940094137784e-05, + "loss": 0.4266, + "step": 16631 + }, + { + "epoch": 21.350449293966623, + "grad_norm": 1.8317500352859497, + "learning_rate": 2.6219512195121952e-05, + "loss": 0.4411, + "step": 16632 + }, + { + "epoch": 21.35173299101412, + "grad_norm": 2.576709032058716, + "learning_rate": 2.621908429610612e-05, + "loss": 0.4332, + "step": 16633 + }, + { + "epoch": 21.35301668806162, + "grad_norm": 1.6995397806167603, + "learning_rate": 2.6218656397090286e-05, + "loss": 0.4633, + "step": 16634 + }, + { + "epoch": 21.354300385109113, + "grad_norm": 1.1114715337753296, + "learning_rate": 2.6218228498074457e-05, + "loss": 0.4373, + "step": 16635 + }, + { + "epoch": 21.35558408215661, + "grad_norm": 2.4711999893188477, + "learning_rate": 2.6217800599058622e-05, + "loss": 0.4145, + "step": 16636 + }, + { + "epoch": 21.35686777920411, + "grad_norm": 1.1636865139007568, + "learning_rate": 2.621737270004279e-05, + "loss": 0.4159, + "step": 16637 + }, + { + "epoch": 21.358151476251603, + "grad_norm": 1.5694092512130737, + "learning_rate": 2.621694480102696e-05, + "loss": 0.4099, + "step": 16638 + }, + { + "epoch": 21.3594351732991, + "grad_norm": 1.3173266649246216, + "learning_rate": 2.6216516902011124e-05, + "loss": 0.3989, + "step": 16639 + }, + { + "epoch": 21.3607188703466, + "grad_norm": 2.1489367485046387, + "learning_rate": 2.6216089002995296e-05, + "loss": 0.476, + "step": 16640 + }, + { + "epoch": 21.362002567394097, + "grad_norm": 1.9977093935012817, + 
"learning_rate": 2.621566110397946e-05, + "loss": 0.4171, + "step": 16641 + }, + { + "epoch": 21.36328626444159, + "grad_norm": 3.3235089778900146, + "learning_rate": 2.621523320496363e-05, + "loss": 0.4358, + "step": 16642 + }, + { + "epoch": 21.36456996148909, + "grad_norm": 2.440587282180786, + "learning_rate": 2.6214805305947798e-05, + "loss": 0.4264, + "step": 16643 + }, + { + "epoch": 21.365853658536587, + "grad_norm": 2.277418613433838, + "learning_rate": 2.6214377406931963e-05, + "loss": 0.4115, + "step": 16644 + }, + { + "epoch": 21.36713735558408, + "grad_norm": 1.1689884662628174, + "learning_rate": 2.6213949507916135e-05, + "loss": 0.428, + "step": 16645 + }, + { + "epoch": 21.36842105263158, + "grad_norm": 1.3994622230529785, + "learning_rate": 2.62135216089003e-05, + "loss": 0.3949, + "step": 16646 + }, + { + "epoch": 21.369704749679077, + "grad_norm": 1.7638723850250244, + "learning_rate": 2.6213093709884468e-05, + "loss": 0.4286, + "step": 16647 + }, + { + "epoch": 21.37098844672657, + "grad_norm": 2.4455044269561768, + "learning_rate": 2.6212665810868637e-05, + "loss": 0.4547, + "step": 16648 + }, + { + "epoch": 21.37227214377407, + "grad_norm": 1.3913674354553223, + "learning_rate": 2.62122379118528e-05, + "loss": 0.4216, + "step": 16649 + }, + { + "epoch": 21.373555840821567, + "grad_norm": 1.606177806854248, + "learning_rate": 2.621181001283697e-05, + "loss": 0.4273, + "step": 16650 + }, + { + "epoch": 21.374839537869065, + "grad_norm": 1.3598248958587646, + "learning_rate": 2.621138211382114e-05, + "loss": 0.4546, + "step": 16651 + }, + { + "epoch": 21.37612323491656, + "grad_norm": 1.6547044515609741, + "learning_rate": 2.6210954214805307e-05, + "loss": 0.4594, + "step": 16652 + }, + { + "epoch": 21.377406931964057, + "grad_norm": 1.9034923315048218, + "learning_rate": 2.6210526315789475e-05, + "loss": 0.4347, + "step": 16653 + }, + { + "epoch": 21.378690629011555, + "grad_norm": 1.8059660196304321, + "learning_rate": 2.6210098416773644e-05, + 
"loss": 0.5063, + "step": 16654 + }, + { + "epoch": 21.37997432605905, + "grad_norm": 3.6713149547576904, + "learning_rate": 2.620967051775781e-05, + "loss": 0.4446, + "step": 16655 + }, + { + "epoch": 21.381258023106547, + "grad_norm": 4.722664833068848, + "learning_rate": 2.6209242618741977e-05, + "loss": 0.466, + "step": 16656 + }, + { + "epoch": 21.382541720154045, + "grad_norm": 1.5295169353485107, + "learning_rate": 2.6208814719726145e-05, + "loss": 0.4508, + "step": 16657 + }, + { + "epoch": 21.38382541720154, + "grad_norm": 2.5095558166503906, + "learning_rate": 2.620838682071031e-05, + "loss": 0.5114, + "step": 16658 + }, + { + "epoch": 21.385109114249037, + "grad_norm": 2.4927241802215576, + "learning_rate": 2.6207958921694482e-05, + "loss": 0.6573, + "step": 16659 + }, + { + "epoch": 21.386392811296535, + "grad_norm": 0.9515627026557922, + "learning_rate": 2.6207531022678647e-05, + "loss": 0.3742, + "step": 16660 + }, + { + "epoch": 21.387676508344033, + "grad_norm": 1.151018738746643, + "learning_rate": 2.620710312366282e-05, + "loss": 0.3882, + "step": 16661 + }, + { + "epoch": 21.388960205391527, + "grad_norm": 1.7245901823043823, + "learning_rate": 2.6206675224646984e-05, + "loss": 0.4031, + "step": 16662 + }, + { + "epoch": 21.390243902439025, + "grad_norm": 1.2808005809783936, + "learning_rate": 2.620624732563115e-05, + "loss": 0.3798, + "step": 16663 + }, + { + "epoch": 21.391527599486523, + "grad_norm": 1.434226632118225, + "learning_rate": 2.620581942661532e-05, + "loss": 0.3656, + "step": 16664 + }, + { + "epoch": 21.392811296534017, + "grad_norm": 1.670250415802002, + "learning_rate": 2.6205391527599486e-05, + "loss": 0.3913, + "step": 16665 + }, + { + "epoch": 21.394094993581515, + "grad_norm": 1.6345033645629883, + "learning_rate": 2.6204963628583654e-05, + "loss": 0.43, + "step": 16666 + }, + { + "epoch": 21.395378690629013, + "grad_norm": 3.9087421894073486, + "learning_rate": 2.6204535729567823e-05, + "loss": 0.3863, + "step": 16667 + }, 
+ { + "epoch": 21.396662387676507, + "grad_norm": 1.690811038017273, + "learning_rate": 2.620410783055199e-05, + "loss": 0.4249, + "step": 16668 + }, + { + "epoch": 21.397946084724005, + "grad_norm": 1.7991105318069458, + "learning_rate": 2.620367993153616e-05, + "loss": 0.4177, + "step": 16669 + }, + { + "epoch": 21.399229781771503, + "grad_norm": 0.9349187016487122, + "learning_rate": 2.6203252032520325e-05, + "loss": 0.402, + "step": 16670 + }, + { + "epoch": 21.400513478818997, + "grad_norm": 1.166306495666504, + "learning_rate": 2.6202824133504493e-05, + "loss": 0.4455, + "step": 16671 + }, + { + "epoch": 21.401797175866495, + "grad_norm": 1.397466778755188, + "learning_rate": 2.620239623448866e-05, + "loss": 0.3639, + "step": 16672 + }, + { + "epoch": 21.403080872913993, + "grad_norm": 1.949738621711731, + "learning_rate": 2.620196833547283e-05, + "loss": 0.3792, + "step": 16673 + }, + { + "epoch": 21.40436456996149, + "grad_norm": 0.9076437950134277, + "learning_rate": 2.6201540436456995e-05, + "loss": 0.3813, + "step": 16674 + }, + { + "epoch": 21.405648267008985, + "grad_norm": 2.7157013416290283, + "learning_rate": 2.6201112537441167e-05, + "loss": 0.3879, + "step": 16675 + }, + { + "epoch": 21.406931964056483, + "grad_norm": 1.7336245775222778, + "learning_rate": 2.6200684638425332e-05, + "loss": 0.4282, + "step": 16676 + }, + { + "epoch": 21.40821566110398, + "grad_norm": 1.4500336647033691, + "learning_rate": 2.62002567394095e-05, + "loss": 0.436, + "step": 16677 + }, + { + "epoch": 21.409499358151475, + "grad_norm": 1.560233235359192, + "learning_rate": 2.619982884039367e-05, + "loss": 0.4138, + "step": 16678 + }, + { + "epoch": 21.410783055198973, + "grad_norm": 2.3097915649414062, + "learning_rate": 2.6199400941377834e-05, + "loss": 0.3987, + "step": 16679 + }, + { + "epoch": 21.41206675224647, + "grad_norm": 2.1625232696533203, + "learning_rate": 2.6198973042362005e-05, + "loss": 0.3953, + "step": 16680 + }, + { + "epoch": 21.413350449293965, + 
"grad_norm": 0.9404199719429016, + "learning_rate": 2.619854514334617e-05, + "loss": 0.4193, + "step": 16681 + }, + { + "epoch": 21.414634146341463, + "grad_norm": 1.128434181213379, + "learning_rate": 2.619811724433034e-05, + "loss": 0.3998, + "step": 16682 + }, + { + "epoch": 21.41591784338896, + "grad_norm": 1.362428069114685, + "learning_rate": 2.6197689345314507e-05, + "loss": 0.4571, + "step": 16683 + }, + { + "epoch": 21.41720154043646, + "grad_norm": 3.850172996520996, + "learning_rate": 2.6197261446298672e-05, + "loss": 0.3974, + "step": 16684 + }, + { + "epoch": 21.418485237483953, + "grad_norm": 1.8077867031097412, + "learning_rate": 2.6196833547282844e-05, + "loss": 0.437, + "step": 16685 + }, + { + "epoch": 21.41976893453145, + "grad_norm": 2.6159439086914062, + "learning_rate": 2.619640564826701e-05, + "loss": 0.4149, + "step": 16686 + }, + { + "epoch": 21.42105263157895, + "grad_norm": 1.7912518978118896, + "learning_rate": 2.6195977749251178e-05, + "loss": 0.4194, + "step": 16687 + }, + { + "epoch": 21.422336328626443, + "grad_norm": 1.320701003074646, + "learning_rate": 2.6195549850235346e-05, + "loss": 0.3974, + "step": 16688 + }, + { + "epoch": 21.42362002567394, + "grad_norm": 3.192084550857544, + "learning_rate": 2.6195121951219514e-05, + "loss": 0.4459, + "step": 16689 + }, + { + "epoch": 21.42490372272144, + "grad_norm": 1.5102272033691406, + "learning_rate": 2.619469405220368e-05, + "loss": 0.4019, + "step": 16690 + }, + { + "epoch": 21.426187419768933, + "grad_norm": 1.2546061277389526, + "learning_rate": 2.6194266153187848e-05, + "loss": 0.4135, + "step": 16691 + }, + { + "epoch": 21.42747111681643, + "grad_norm": 2.3416390419006348, + "learning_rate": 2.6193838254172016e-05, + "loss": 0.4348, + "step": 16692 + }, + { + "epoch": 21.42875481386393, + "grad_norm": 1.262667179107666, + "learning_rate": 2.6193410355156185e-05, + "loss": 0.447, + "step": 16693 + }, + { + "epoch": 21.430038510911427, + "grad_norm": 1.1058260202407837, + 
"learning_rate": 2.6192982456140353e-05, + "loss": 0.429, + "step": 16694 + }, + { + "epoch": 21.43132220795892, + "grad_norm": 3.5481340885162354, + "learning_rate": 2.6192554557124518e-05, + "loss": 0.4842, + "step": 16695 + }, + { + "epoch": 21.43260590500642, + "grad_norm": 2.4241504669189453, + "learning_rate": 2.619212665810869e-05, + "loss": 0.4052, + "step": 16696 + }, + { + "epoch": 21.433889602053917, + "grad_norm": 2.26932692527771, + "learning_rate": 2.6191698759092855e-05, + "loss": 0.4754, + "step": 16697 + }, + { + "epoch": 21.43517329910141, + "grad_norm": 1.8013112545013428, + "learning_rate": 2.619127086007702e-05, + "loss": 0.3952, + "step": 16698 + }, + { + "epoch": 21.43645699614891, + "grad_norm": 1.426152229309082, + "learning_rate": 2.619084296106119e-05, + "loss": 0.4022, + "step": 16699 + }, + { + "epoch": 21.437740693196407, + "grad_norm": 1.1400012969970703, + "learning_rate": 2.6190415062045357e-05, + "loss": 0.4892, + "step": 16700 + }, + { + "epoch": 21.4390243902439, + "grad_norm": 2.561628818511963, + "learning_rate": 2.618998716302953e-05, + "loss": 0.4584, + "step": 16701 + }, + { + "epoch": 21.4403080872914, + "grad_norm": 1.7475484609603882, + "learning_rate": 2.6189559264013694e-05, + "loss": 0.4399, + "step": 16702 + }, + { + "epoch": 21.441591784338897, + "grad_norm": 3.0917043685913086, + "learning_rate": 2.6189131364997862e-05, + "loss": 0.4962, + "step": 16703 + }, + { + "epoch": 21.44287548138639, + "grad_norm": 1.85256826877594, + "learning_rate": 2.618870346598203e-05, + "loss": 0.4252, + "step": 16704 + }, + { + "epoch": 21.44415917843389, + "grad_norm": 2.272953987121582, + "learning_rate": 2.6188275566966195e-05, + "loss": 0.4713, + "step": 16705 + }, + { + "epoch": 21.445442875481387, + "grad_norm": 1.5357557535171509, + "learning_rate": 2.6187847667950364e-05, + "loss": 0.4684, + "step": 16706 + }, + { + "epoch": 21.446726572528885, + "grad_norm": 2.231487274169922, + "learning_rate": 2.6187419768934532e-05, + 
"loss": 0.4741, + "step": 16707 + }, + { + "epoch": 21.44801026957638, + "grad_norm": 3.8033201694488525, + "learning_rate": 2.61869918699187e-05, + "loss": 0.5116, + "step": 16708 + }, + { + "epoch": 21.449293966623877, + "grad_norm": 4.084671497344971, + "learning_rate": 2.618656397090287e-05, + "loss": 0.6156, + "step": 16709 + }, + { + "epoch": 21.450577663671375, + "grad_norm": 1.424430012702942, + "learning_rate": 2.6186136071887034e-05, + "loss": 0.399, + "step": 16710 + }, + { + "epoch": 21.45186136071887, + "grad_norm": 2.0720295906066895, + "learning_rate": 2.6185708172871202e-05, + "loss": 0.3655, + "step": 16711 + }, + { + "epoch": 21.453145057766367, + "grad_norm": 1.1726230382919312, + "learning_rate": 2.618528027385537e-05, + "loss": 0.4143, + "step": 16712 + }, + { + "epoch": 21.454428754813865, + "grad_norm": 4.433260440826416, + "learning_rate": 2.618485237483954e-05, + "loss": 0.4042, + "step": 16713 + }, + { + "epoch": 21.45571245186136, + "grad_norm": 3.325547456741333, + "learning_rate": 2.6184424475823704e-05, + "loss": 0.404, + "step": 16714 + }, + { + "epoch": 21.456996148908857, + "grad_norm": 1.0676767826080322, + "learning_rate": 2.6183996576807876e-05, + "loss": 0.387, + "step": 16715 + }, + { + "epoch": 21.458279845956355, + "grad_norm": 11.441583633422852, + "learning_rate": 2.618356867779204e-05, + "loss": 0.3953, + "step": 16716 + }, + { + "epoch": 21.459563543003853, + "grad_norm": 1.257209062576294, + "learning_rate": 2.6183140778776206e-05, + "loss": 0.4027, + "step": 16717 + }, + { + "epoch": 21.460847240051347, + "grad_norm": 4.0420379638671875, + "learning_rate": 2.6182712879760378e-05, + "loss": 0.4372, + "step": 16718 + }, + { + "epoch": 21.462130937098845, + "grad_norm": 1.803910255432129, + "learning_rate": 2.6182284980744543e-05, + "loss": 0.4543, + "step": 16719 + }, + { + "epoch": 21.463414634146343, + "grad_norm": 1.4798749685287476, + "learning_rate": 2.6181857081728715e-05, + "loss": 0.4141, + "step": 16720 + }, + { 
+ "epoch": 21.464698331193837, + "grad_norm": 18.141462326049805, + "learning_rate": 2.618142918271288e-05, + "loss": 0.4, + "step": 16721 + }, + { + "epoch": 21.465982028241335, + "grad_norm": 1.2260420322418213, + "learning_rate": 2.6181001283697048e-05, + "loss": 0.3954, + "step": 16722 + }, + { + "epoch": 21.467265725288833, + "grad_norm": 2.46777081489563, + "learning_rate": 2.6180573384681217e-05, + "loss": 0.3991, + "step": 16723 + }, + { + "epoch": 21.468549422336327, + "grad_norm": 1.8529174327850342, + "learning_rate": 2.618014548566538e-05, + "loss": 0.4351, + "step": 16724 + }, + { + "epoch": 21.469833119383825, + "grad_norm": 2.6106343269348145, + "learning_rate": 2.617971758664955e-05, + "loss": 0.4659, + "step": 16725 + }, + { + "epoch": 21.471116816431323, + "grad_norm": 4.463810443878174, + "learning_rate": 2.617928968763372e-05, + "loss": 0.4438, + "step": 16726 + }, + { + "epoch": 21.47240051347882, + "grad_norm": 1.2270978689193726, + "learning_rate": 2.6178861788617887e-05, + "loss": 0.393, + "step": 16727 + }, + { + "epoch": 21.473684210526315, + "grad_norm": 1.355297565460205, + "learning_rate": 2.6178433889602055e-05, + "loss": 0.3656, + "step": 16728 + }, + { + "epoch": 21.474967907573813, + "grad_norm": 2.5063626766204834, + "learning_rate": 2.6178005990586224e-05, + "loss": 0.3908, + "step": 16729 + }, + { + "epoch": 21.47625160462131, + "grad_norm": 3.171290159225464, + "learning_rate": 2.617757809157039e-05, + "loss": 0.4055, + "step": 16730 + }, + { + "epoch": 21.477535301668805, + "grad_norm": 1.1388355493545532, + "learning_rate": 2.6177150192554557e-05, + "loss": 0.3741, + "step": 16731 + }, + { + "epoch": 21.478818998716303, + "grad_norm": 1.8953361511230469, + "learning_rate": 2.6176722293538726e-05, + "loss": 0.4179, + "step": 16732 + }, + { + "epoch": 21.4801026957638, + "grad_norm": 2.6443097591400146, + "learning_rate": 2.617629439452289e-05, + "loss": 0.3807, + "step": 16733 + }, + { + "epoch": 21.481386392811295, + 
"grad_norm": 2.0398263931274414, + "learning_rate": 2.6175866495507062e-05, + "loss": 0.4015, + "step": 16734 + }, + { + "epoch": 21.482670089858793, + "grad_norm": 1.080644130706787, + "learning_rate": 2.6175438596491227e-05, + "loss": 0.4183, + "step": 16735 + }, + { + "epoch": 21.48395378690629, + "grad_norm": 3.827014923095703, + "learning_rate": 2.61750106974754e-05, + "loss": 0.3906, + "step": 16736 + }, + { + "epoch": 21.485237483953785, + "grad_norm": 2.276567220687866, + "learning_rate": 2.6174582798459564e-05, + "loss": 0.4126, + "step": 16737 + }, + { + "epoch": 21.486521181001283, + "grad_norm": 0.8868993520736694, + "learning_rate": 2.617415489944373e-05, + "loss": 0.4204, + "step": 16738 + }, + { + "epoch": 21.48780487804878, + "grad_norm": 2.1172614097595215, + "learning_rate": 2.61737270004279e-05, + "loss": 0.3907, + "step": 16739 + }, + { + "epoch": 21.48908857509628, + "grad_norm": 2.895296812057495, + "learning_rate": 2.6173299101412066e-05, + "loss": 0.4379, + "step": 16740 + }, + { + "epoch": 21.490372272143773, + "grad_norm": 2.4399585723876953, + "learning_rate": 2.6172871202396234e-05, + "loss": 0.4291, + "step": 16741 + }, + { + "epoch": 21.49165596919127, + "grad_norm": 6.928569316864014, + "learning_rate": 2.6172443303380403e-05, + "loss": 0.4278, + "step": 16742 + }, + { + "epoch": 21.49293966623877, + "grad_norm": 1.4286880493164062, + "learning_rate": 2.617201540436457e-05, + "loss": 0.4256, + "step": 16743 + }, + { + "epoch": 21.494223363286263, + "grad_norm": 2.47210955619812, + "learning_rate": 2.617158750534874e-05, + "loss": 0.386, + "step": 16744 + }, + { + "epoch": 21.49550706033376, + "grad_norm": 1.492841362953186, + "learning_rate": 2.6171159606332905e-05, + "loss": 0.4357, + "step": 16745 + }, + { + "epoch": 21.49679075738126, + "grad_norm": 2.360447406768799, + "learning_rate": 2.6170731707317073e-05, + "loss": 0.4586, + "step": 16746 + }, + { + "epoch": 21.498074454428753, + "grad_norm": 3.6104252338409424, + 
"learning_rate": 2.617030380830124e-05, + "loss": 0.4207, + "step": 16747 + }, + { + "epoch": 21.49935815147625, + "grad_norm": 3.79365611076355, + "learning_rate": 2.616987590928541e-05, + "loss": 0.4332, + "step": 16748 + }, + { + "epoch": 21.50064184852375, + "grad_norm": 8.16480827331543, + "learning_rate": 2.6169448010269575e-05, + "loss": 0.4326, + "step": 16749 + }, + { + "epoch": 21.501925545571247, + "grad_norm": 2.012484073638916, + "learning_rate": 2.6169020111253747e-05, + "loss": 0.4416, + "step": 16750 + }, + { + "epoch": 21.50320924261874, + "grad_norm": 1.9495720863342285, + "learning_rate": 2.6168592212237912e-05, + "loss": 0.4751, + "step": 16751 + }, + { + "epoch": 21.50449293966624, + "grad_norm": 1.4569602012634277, + "learning_rate": 2.616816431322208e-05, + "loss": 0.4725, + "step": 16752 + }, + { + "epoch": 21.505776636713737, + "grad_norm": 1.1882045269012451, + "learning_rate": 2.616773641420625e-05, + "loss": 0.4344, + "step": 16753 + }, + { + "epoch": 21.50706033376123, + "grad_norm": 20.208913803100586, + "learning_rate": 2.6167308515190414e-05, + "loss": 0.449, + "step": 16754 + }, + { + "epoch": 21.50834403080873, + "grad_norm": 3.91709041595459, + "learning_rate": 2.6166880616174585e-05, + "loss": 0.4772, + "step": 16755 + }, + { + "epoch": 21.509627727856227, + "grad_norm": 1.630894422531128, + "learning_rate": 2.616645271715875e-05, + "loss": 0.5232, + "step": 16756 + }, + { + "epoch": 21.51091142490372, + "grad_norm": 2.2605597972869873, + "learning_rate": 2.616602481814292e-05, + "loss": 0.4775, + "step": 16757 + }, + { + "epoch": 21.51219512195122, + "grad_norm": 3.050665855407715, + "learning_rate": 2.6165596919127087e-05, + "loss": 0.4845, + "step": 16758 + }, + { + "epoch": 21.513478818998717, + "grad_norm": 2.5919439792633057, + "learning_rate": 2.6165169020111252e-05, + "loss": 0.5538, + "step": 16759 + }, + { + "epoch": 21.514762516046215, + "grad_norm": 0.9622332453727722, + "learning_rate": 2.6164741121095424e-05, + 
"loss": 0.3785, + "step": 16760 + }, + { + "epoch": 21.51604621309371, + "grad_norm": 1.034510612487793, + "learning_rate": 2.616431322207959e-05, + "loss": 0.3641, + "step": 16761 + }, + { + "epoch": 21.517329910141207, + "grad_norm": 2.707658290863037, + "learning_rate": 2.6163885323063758e-05, + "loss": 0.4072, + "step": 16762 + }, + { + "epoch": 21.518613607188705, + "grad_norm": 1.1644773483276367, + "learning_rate": 2.6163457424047926e-05, + "loss": 0.4135, + "step": 16763 + }, + { + "epoch": 21.5198973042362, + "grad_norm": 1.6919429302215576, + "learning_rate": 2.6163029525032094e-05, + "loss": 0.3854, + "step": 16764 + }, + { + "epoch": 21.521181001283697, + "grad_norm": 0.9137884974479675, + "learning_rate": 2.616260162601626e-05, + "loss": 0.4208, + "step": 16765 + }, + { + "epoch": 21.522464698331195, + "grad_norm": 1.368511438369751, + "learning_rate": 2.6162173727000428e-05, + "loss": 0.4061, + "step": 16766 + }, + { + "epoch": 21.52374839537869, + "grad_norm": 1.110798954963684, + "learning_rate": 2.6161745827984596e-05, + "loss": 0.3933, + "step": 16767 + }, + { + "epoch": 21.525032092426187, + "grad_norm": 0.9950128197669983, + "learning_rate": 2.6161317928968765e-05, + "loss": 0.4332, + "step": 16768 + }, + { + "epoch": 21.526315789473685, + "grad_norm": 1.2286810874938965, + "learning_rate": 2.6160890029952933e-05, + "loss": 0.3891, + "step": 16769 + }, + { + "epoch": 21.527599486521183, + "grad_norm": 1.1167042255401611, + "learning_rate": 2.6160462130937098e-05, + "loss": 0.3946, + "step": 16770 + }, + { + "epoch": 21.528883183568677, + "grad_norm": 3.5911455154418945, + "learning_rate": 2.6160034231921267e-05, + "loss": 0.4148, + "step": 16771 + }, + { + "epoch": 21.530166880616175, + "grad_norm": 2.0815107822418213, + "learning_rate": 2.6159606332905435e-05, + "loss": 0.3915, + "step": 16772 + }, + { + "epoch": 21.531450577663673, + "grad_norm": 1.6934369802474976, + "learning_rate": 2.61591784338896e-05, + "loss": 0.4526, + "step": 16773 + 
}, + { + "epoch": 21.532734274711167, + "grad_norm": 1.4190820455551147, + "learning_rate": 2.6158750534873772e-05, + "loss": 0.4109, + "step": 16774 + }, + { + "epoch": 21.534017971758665, + "grad_norm": 1.689878225326538, + "learning_rate": 2.6158322635857937e-05, + "loss": 0.3915, + "step": 16775 + }, + { + "epoch": 21.535301668806163, + "grad_norm": 1.6086211204528809, + "learning_rate": 2.615789473684211e-05, + "loss": 0.4056, + "step": 16776 + }, + { + "epoch": 21.536585365853657, + "grad_norm": 0.9506701827049255, + "learning_rate": 2.6157466837826274e-05, + "loss": 0.4009, + "step": 16777 + }, + { + "epoch": 21.537869062901155, + "grad_norm": 5.369347095489502, + "learning_rate": 2.615703893881044e-05, + "loss": 0.4139, + "step": 16778 + }, + { + "epoch": 21.539152759948653, + "grad_norm": 1.4061439037322998, + "learning_rate": 2.615661103979461e-05, + "loss": 0.3917, + "step": 16779 + }, + { + "epoch": 21.540436456996147, + "grad_norm": 1.8274226188659668, + "learning_rate": 2.6156183140778775e-05, + "loss": 0.3613, + "step": 16780 + }, + { + "epoch": 21.541720154043645, + "grad_norm": 1.270202875137329, + "learning_rate": 2.6155755241762944e-05, + "loss": 0.4389, + "step": 16781 + }, + { + "epoch": 21.543003851091143, + "grad_norm": 4.16269063949585, + "learning_rate": 2.6155327342747112e-05, + "loss": 0.4261, + "step": 16782 + }, + { + "epoch": 21.54428754813864, + "grad_norm": 1.9154877662658691, + "learning_rate": 2.615489944373128e-05, + "loss": 0.394, + "step": 16783 + }, + { + "epoch": 21.545571245186135, + "grad_norm": 1.2279609441757202, + "learning_rate": 2.615447154471545e-05, + "loss": 0.3745, + "step": 16784 + }, + { + "epoch": 21.546854942233633, + "grad_norm": 3.4280264377593994, + "learning_rate": 2.6154043645699614e-05, + "loss": 0.4142, + "step": 16785 + }, + { + "epoch": 21.54813863928113, + "grad_norm": 1.843704342842102, + "learning_rate": 2.6153615746683783e-05, + "loss": 0.428, + "step": 16786 + }, + { + "epoch": 21.549422336328625, 
+ "grad_norm": 2.4772863388061523, + "learning_rate": 2.615318784766795e-05, + "loss": 0.4392, + "step": 16787 + }, + { + "epoch": 21.550706033376123, + "grad_norm": 4.8144707679748535, + "learning_rate": 2.615275994865212e-05, + "loss": 0.4625, + "step": 16788 + }, + { + "epoch": 21.55198973042362, + "grad_norm": 2.388732433319092, + "learning_rate": 2.6152332049636284e-05, + "loss": 0.406, + "step": 16789 + }, + { + "epoch": 21.553273427471115, + "grad_norm": 1.2738466262817383, + "learning_rate": 2.6151904150620456e-05, + "loss": 0.4351, + "step": 16790 + }, + { + "epoch": 21.554557124518613, + "grad_norm": 1.6868896484375, + "learning_rate": 2.615147625160462e-05, + "loss": 0.4327, + "step": 16791 + }, + { + "epoch": 21.55584082156611, + "grad_norm": 0.9594456553459167, + "learning_rate": 2.615104835258879e-05, + "loss": 0.3969, + "step": 16792 + }, + { + "epoch": 21.55712451861361, + "grad_norm": 2.251225709915161, + "learning_rate": 2.6150620453572958e-05, + "loss": 0.4098, + "step": 16793 + }, + { + "epoch": 21.558408215661103, + "grad_norm": 2.505732297897339, + "learning_rate": 2.6150192554557123e-05, + "loss": 0.4371, + "step": 16794 + }, + { + "epoch": 21.5596919127086, + "grad_norm": 1.3054903745651245, + "learning_rate": 2.6149764655541295e-05, + "loss": 0.412, + "step": 16795 + }, + { + "epoch": 21.5609756097561, + "grad_norm": 1.4329622983932495, + "learning_rate": 2.614933675652546e-05, + "loss": 0.4197, + "step": 16796 + }, + { + "epoch": 21.562259306803593, + "grad_norm": 1.3720647096633911, + "learning_rate": 2.6148908857509628e-05, + "loss": 0.4302, + "step": 16797 + }, + { + "epoch": 21.56354300385109, + "grad_norm": 1.4127334356307983, + "learning_rate": 2.6148480958493797e-05, + "loss": 0.3789, + "step": 16798 + }, + { + "epoch": 21.56482670089859, + "grad_norm": 1.2598778009414673, + "learning_rate": 2.6148053059477962e-05, + "loss": 0.4363, + "step": 16799 + }, + { + "epoch": 21.566110397946083, + "grad_norm": 1.188372254371643, + 
"learning_rate": 2.6147625160462134e-05, + "loss": 0.4796, + "step": 16800 + }, + { + "epoch": 21.56739409499358, + "grad_norm": 1.7758980989456177, + "learning_rate": 2.61471972614463e-05, + "loss": 0.4544, + "step": 16801 + }, + { + "epoch": 21.56867779204108, + "grad_norm": 1.0771785974502563, + "learning_rate": 2.6146769362430467e-05, + "loss": 0.4046, + "step": 16802 + }, + { + "epoch": 21.569961489088577, + "grad_norm": 1.5001333951950073, + "learning_rate": 2.6146341463414635e-05, + "loss": 0.4469, + "step": 16803 + }, + { + "epoch": 21.57124518613607, + "grad_norm": 1.7462493181228638, + "learning_rate": 2.6145913564398804e-05, + "loss": 0.493, + "step": 16804 + }, + { + "epoch": 21.57252888318357, + "grad_norm": 4.177256107330322, + "learning_rate": 2.614548566538297e-05, + "loss": 0.4548, + "step": 16805 + }, + { + "epoch": 21.573812580231067, + "grad_norm": 2.256908416748047, + "learning_rate": 2.6145057766367137e-05, + "loss": 0.4948, + "step": 16806 + }, + { + "epoch": 21.57509627727856, + "grad_norm": 1.8318018913269043, + "learning_rate": 2.6144629867351306e-05, + "loss": 0.5199, + "step": 16807 + }, + { + "epoch": 21.57637997432606, + "grad_norm": 3.875843048095703, + "learning_rate": 2.6144201968335474e-05, + "loss": 0.5596, + "step": 16808 + }, + { + "epoch": 21.577663671373557, + "grad_norm": 3.8285796642303467, + "learning_rate": 2.6143774069319642e-05, + "loss": 0.614, + "step": 16809 + }, + { + "epoch": 21.57894736842105, + "grad_norm": 0.6995242834091187, + "learning_rate": 2.6143346170303807e-05, + "loss": 0.3715, + "step": 16810 + }, + { + "epoch": 21.58023106546855, + "grad_norm": 1.4482399225234985, + "learning_rate": 2.614291827128798e-05, + "loss": 0.3724, + "step": 16811 + }, + { + "epoch": 21.581514762516047, + "grad_norm": 2.0260777473449707, + "learning_rate": 2.6142490372272144e-05, + "loss": 0.3719, + "step": 16812 + }, + { + "epoch": 21.58279845956354, + "grad_norm": 1.0839444398880005, + "learning_rate": 2.614206247325631e-05, + 
"loss": 0.4154, + "step": 16813 + }, + { + "epoch": 21.58408215661104, + "grad_norm": 1.0288779735565186, + "learning_rate": 2.614163457424048e-05, + "loss": 0.406, + "step": 16814 + }, + { + "epoch": 21.585365853658537, + "grad_norm": 0.8713738918304443, + "learning_rate": 2.6141206675224646e-05, + "loss": 0.4096, + "step": 16815 + }, + { + "epoch": 21.586649550706035, + "grad_norm": 1.3060709238052368, + "learning_rate": 2.6140778776208818e-05, + "loss": 0.3956, + "step": 16816 + }, + { + "epoch": 21.58793324775353, + "grad_norm": 1.0233510732650757, + "learning_rate": 2.6140350877192983e-05, + "loss": 0.4297, + "step": 16817 + }, + { + "epoch": 21.589216944801027, + "grad_norm": 0.9011499881744385, + "learning_rate": 2.613992297817715e-05, + "loss": 0.3785, + "step": 16818 + }, + { + "epoch": 21.590500641848525, + "grad_norm": 1.5266231298446655, + "learning_rate": 2.613949507916132e-05, + "loss": 0.3915, + "step": 16819 + }, + { + "epoch": 21.59178433889602, + "grad_norm": 1.0586391687393188, + "learning_rate": 2.6139067180145485e-05, + "loss": 0.4018, + "step": 16820 + }, + { + "epoch": 21.593068035943517, + "grad_norm": 1.588551640510559, + "learning_rate": 2.6138639281129653e-05, + "loss": 0.4208, + "step": 16821 + }, + { + "epoch": 21.594351732991015, + "grad_norm": 1.1889876127243042, + "learning_rate": 2.613821138211382e-05, + "loss": 0.383, + "step": 16822 + }, + { + "epoch": 21.59563543003851, + "grad_norm": 0.8514032363891602, + "learning_rate": 2.613778348309799e-05, + "loss": 0.4074, + "step": 16823 + }, + { + "epoch": 21.596919127086007, + "grad_norm": 2.4693329334259033, + "learning_rate": 2.613735558408216e-05, + "loss": 0.4512, + "step": 16824 + }, + { + "epoch": 21.598202824133505, + "grad_norm": 1.960841417312622, + "learning_rate": 2.6136927685066327e-05, + "loss": 0.4424, + "step": 16825 + }, + { + "epoch": 21.599486521181003, + "grad_norm": 1.5309902429580688, + "learning_rate": 2.6136499786050492e-05, + "loss": 0.3939, + "step": 16826 + }, 
+ { + "epoch": 21.600770218228497, + "grad_norm": 1.6507782936096191, + "learning_rate": 2.613607188703466e-05, + "loss": 0.3754, + "step": 16827 + }, + { + "epoch": 21.602053915275995, + "grad_norm": 2.4592113494873047, + "learning_rate": 2.613564398801883e-05, + "loss": 0.4253, + "step": 16828 + }, + { + "epoch": 21.603337612323493, + "grad_norm": 1.8078798055648804, + "learning_rate": 2.6135216089002994e-05, + "loss": 0.4017, + "step": 16829 + }, + { + "epoch": 21.604621309370987, + "grad_norm": 1.0564883947372437, + "learning_rate": 2.6134788189987166e-05, + "loss": 0.3822, + "step": 16830 + }, + { + "epoch": 21.605905006418485, + "grad_norm": 0.7867007851600647, + "learning_rate": 2.613436029097133e-05, + "loss": 0.4045, + "step": 16831 + }, + { + "epoch": 21.607188703465983, + "grad_norm": 0.9602672457695007, + "learning_rate": 2.61339323919555e-05, + "loss": 0.4292, + "step": 16832 + }, + { + "epoch": 21.608472400513477, + "grad_norm": 1.1973084211349487, + "learning_rate": 2.6133504492939667e-05, + "loss": 0.4374, + "step": 16833 + }, + { + "epoch": 21.609756097560975, + "grad_norm": 1.1802916526794434, + "learning_rate": 2.6133076593923832e-05, + "loss": 0.4274, + "step": 16834 + }, + { + "epoch": 21.611039794608473, + "grad_norm": 0.8514737486839294, + "learning_rate": 2.6132648694908004e-05, + "loss": 0.4061, + "step": 16835 + }, + { + "epoch": 21.61232349165597, + "grad_norm": 1.918518304824829, + "learning_rate": 2.613222079589217e-05, + "loss": 0.4585, + "step": 16836 + }, + { + "epoch": 21.613607188703465, + "grad_norm": 1.070410132408142, + "learning_rate": 2.6131792896876338e-05, + "loss": 0.3972, + "step": 16837 + }, + { + "epoch": 21.614890885750963, + "grad_norm": 3.630664110183716, + "learning_rate": 2.6131364997860506e-05, + "loss": 0.4073, + "step": 16838 + }, + { + "epoch": 21.61617458279846, + "grad_norm": 1.609391689300537, + "learning_rate": 2.613093709884467e-05, + "loss": 0.4179, + "step": 16839 + }, + { + "epoch": 21.617458279845955, + 
"grad_norm": 0.924778401851654, + "learning_rate": 2.6130509199828843e-05, + "loss": 0.3853, + "step": 16840 + }, + { + "epoch": 21.618741976893453, + "grad_norm": 1.6500786542892456, + "learning_rate": 2.6130081300813008e-05, + "loss": 0.3936, + "step": 16841 + }, + { + "epoch": 21.62002567394095, + "grad_norm": 1.092057228088379, + "learning_rate": 2.6129653401797176e-05, + "loss": 0.4508, + "step": 16842 + }, + { + "epoch": 21.621309370988445, + "grad_norm": 0.7839867472648621, + "learning_rate": 2.6129225502781345e-05, + "loss": 0.4077, + "step": 16843 + }, + { + "epoch": 21.622593068035943, + "grad_norm": 3.012819290161133, + "learning_rate": 2.6128797603765513e-05, + "loss": 0.4146, + "step": 16844 + }, + { + "epoch": 21.62387676508344, + "grad_norm": 1.3482270240783691, + "learning_rate": 2.6128369704749678e-05, + "loss": 0.4002, + "step": 16845 + }, + { + "epoch": 21.625160462130935, + "grad_norm": 0.8489207625389099, + "learning_rate": 2.6127941805733847e-05, + "loss": 0.4128, + "step": 16846 + }, + { + "epoch": 21.626444159178433, + "grad_norm": 1.0741554498672485, + "learning_rate": 2.6127513906718015e-05, + "loss": 0.4316, + "step": 16847 + }, + { + "epoch": 21.62772785622593, + "grad_norm": 1.8498965501785278, + "learning_rate": 2.6127086007702183e-05, + "loss": 0.4451, + "step": 16848 + }, + { + "epoch": 21.62901155327343, + "grad_norm": 1.7915624380111694, + "learning_rate": 2.6126658108686352e-05, + "loss": 0.3907, + "step": 16849 + }, + { + "epoch": 21.630295250320923, + "grad_norm": 7.6567254066467285, + "learning_rate": 2.6126230209670517e-05, + "loss": 0.4573, + "step": 16850 + }, + { + "epoch": 21.63157894736842, + "grad_norm": 1.2398333549499512, + "learning_rate": 2.612580231065469e-05, + "loss": 0.4305, + "step": 16851 + }, + { + "epoch": 21.63286264441592, + "grad_norm": 1.4219683408737183, + "learning_rate": 2.6125374411638854e-05, + "loss": 0.4199, + "step": 16852 + }, + { + "epoch": 21.634146341463413, + "grad_norm": 3.8936996459960938, 
+ "learning_rate": 2.612494651262302e-05, + "loss": 0.4143, + "step": 16853 + }, + { + "epoch": 21.63543003851091, + "grad_norm": 6.520181179046631, + "learning_rate": 2.612451861360719e-05, + "loss": 0.4591, + "step": 16854 + }, + { + "epoch": 21.63671373555841, + "grad_norm": 1.319366455078125, + "learning_rate": 2.6124090714591356e-05, + "loss": 0.437, + "step": 16855 + }, + { + "epoch": 21.637997432605903, + "grad_norm": 2.2293906211853027, + "learning_rate": 2.6123662815575527e-05, + "loss": 0.5096, + "step": 16856 + }, + { + "epoch": 21.6392811296534, + "grad_norm": 2.3584909439086914, + "learning_rate": 2.6123234916559692e-05, + "loss": 0.445, + "step": 16857 + }, + { + "epoch": 21.6405648267009, + "grad_norm": 1.816208004951477, + "learning_rate": 2.612280701754386e-05, + "loss": 0.5506, + "step": 16858 + }, + { + "epoch": 21.641848523748397, + "grad_norm": 2.331664562225342, + "learning_rate": 2.612237911852803e-05, + "loss": 0.6626, + "step": 16859 + }, + { + "epoch": 21.64313222079589, + "grad_norm": 1.044921875, + "learning_rate": 2.6121951219512194e-05, + "loss": 0.3821, + "step": 16860 + }, + { + "epoch": 21.64441591784339, + "grad_norm": 1.1555074453353882, + "learning_rate": 2.6121523320496363e-05, + "loss": 0.3798, + "step": 16861 + }, + { + "epoch": 21.645699614890887, + "grad_norm": 0.8814330697059631, + "learning_rate": 2.612109542148053e-05, + "loss": 0.3855, + "step": 16862 + }, + { + "epoch": 21.64698331193838, + "grad_norm": 0.9188055396080017, + "learning_rate": 2.61206675224647e-05, + "loss": 0.4268, + "step": 16863 + }, + { + "epoch": 21.64826700898588, + "grad_norm": 1.3997974395751953, + "learning_rate": 2.6120239623448868e-05, + "loss": 0.3801, + "step": 16864 + }, + { + "epoch": 21.649550706033377, + "grad_norm": 1.6072536706924438, + "learning_rate": 2.6119811724433036e-05, + "loss": 0.3972, + "step": 16865 + }, + { + "epoch": 21.65083440308087, + "grad_norm": 1.4306321144104004, + "learning_rate": 2.61193838254172e-05, + "loss": 
0.4293, + "step": 16866 + }, + { + "epoch": 21.65211810012837, + "grad_norm": 2.440216541290283, + "learning_rate": 2.611895592640137e-05, + "loss": 0.4451, + "step": 16867 + }, + { + "epoch": 21.653401797175867, + "grad_norm": 1.0639058351516724, + "learning_rate": 2.6118528027385538e-05, + "loss": 0.4089, + "step": 16868 + }, + { + "epoch": 21.654685494223365, + "grad_norm": 1.459477424621582, + "learning_rate": 2.6118100128369703e-05, + "loss": 0.4189, + "step": 16869 + }, + { + "epoch": 21.65596919127086, + "grad_norm": 1.5287108421325684, + "learning_rate": 2.6117672229353875e-05, + "loss": 0.4129, + "step": 16870 + }, + { + "epoch": 21.657252888318357, + "grad_norm": 2.01397705078125, + "learning_rate": 2.611724433033804e-05, + "loss": 0.4064, + "step": 16871 + }, + { + "epoch": 21.658536585365855, + "grad_norm": 1.1932361125946045, + "learning_rate": 2.6116816431322212e-05, + "loss": 0.3751, + "step": 16872 + }, + { + "epoch": 21.65982028241335, + "grad_norm": 1.6199408769607544, + "learning_rate": 2.6116388532306377e-05, + "loss": 0.429, + "step": 16873 + }, + { + "epoch": 21.661103979460847, + "grad_norm": 1.6450841426849365, + "learning_rate": 2.6115960633290542e-05, + "loss": 0.415, + "step": 16874 + }, + { + "epoch": 21.662387676508345, + "grad_norm": 1.699126124382019, + "learning_rate": 2.6115532734274714e-05, + "loss": 0.4345, + "step": 16875 + }, + { + "epoch": 21.66367137355584, + "grad_norm": 0.9609577059745789, + "learning_rate": 2.611510483525888e-05, + "loss": 0.3692, + "step": 16876 + }, + { + "epoch": 21.664955070603337, + "grad_norm": 1.6597963571548462, + "learning_rate": 2.6114676936243047e-05, + "loss": 0.4015, + "step": 16877 + }, + { + "epoch": 21.666238767650835, + "grad_norm": 1.0743205547332764, + "learning_rate": 2.6114249037227215e-05, + "loss": 0.4063, + "step": 16878 + }, + { + "epoch": 21.66752246469833, + "grad_norm": 1.2402561902999878, + "learning_rate": 2.6113821138211384e-05, + "loss": 0.3955, + "step": 16879 + }, + { + 
"epoch": 21.668806161745827, + "grad_norm": 1.4617365598678589, + "learning_rate": 2.6113393239195552e-05, + "loss": 0.4358, + "step": 16880 + }, + { + "epoch": 21.670089858793325, + "grad_norm": 1.6708253622055054, + "learning_rate": 2.6112965340179717e-05, + "loss": 0.4202, + "step": 16881 + }, + { + "epoch": 21.671373555840823, + "grad_norm": 1.4678642749786377, + "learning_rate": 2.6112537441163886e-05, + "loss": 0.4342, + "step": 16882 + }, + { + "epoch": 21.672657252888317, + "grad_norm": 2.1190712451934814, + "learning_rate": 2.6112109542148054e-05, + "loss": 0.4057, + "step": 16883 + }, + { + "epoch": 21.673940949935815, + "grad_norm": 1.709208369255066, + "learning_rate": 2.6111681643132223e-05, + "loss": 0.4204, + "step": 16884 + }, + { + "epoch": 21.675224646983313, + "grad_norm": 1.3834363222122192, + "learning_rate": 2.6111253744116388e-05, + "loss": 0.4952, + "step": 16885 + }, + { + "epoch": 21.676508344030808, + "grad_norm": 1.4099211692810059, + "learning_rate": 2.611082584510056e-05, + "loss": 0.4006, + "step": 16886 + }, + { + "epoch": 21.677792041078305, + "grad_norm": 1.1038445234298706, + "learning_rate": 2.6110397946084724e-05, + "loss": 0.4414, + "step": 16887 + }, + { + "epoch": 21.679075738125803, + "grad_norm": 1.4340426921844482, + "learning_rate": 2.6109970047068893e-05, + "loss": 0.4314, + "step": 16888 + }, + { + "epoch": 21.680359435173298, + "grad_norm": 1.1680850982666016, + "learning_rate": 2.610954214805306e-05, + "loss": 0.4007, + "step": 16889 + }, + { + "epoch": 21.681643132220795, + "grad_norm": 1.1507816314697266, + "learning_rate": 2.6109114249037226e-05, + "loss": 0.4354, + "step": 16890 + }, + { + "epoch": 21.682926829268293, + "grad_norm": 2.4787447452545166, + "learning_rate": 2.6108686350021398e-05, + "loss": 0.4369, + "step": 16891 + }, + { + "epoch": 21.68421052631579, + "grad_norm": 1.9037896394729614, + "learning_rate": 2.6108258451005563e-05, + "loss": 0.4648, + "step": 16892 + }, + { + "epoch": 
21.685494223363285, + "grad_norm": 1.4335675239562988, + "learning_rate": 2.610783055198973e-05, + "loss": 0.453, + "step": 16893 + }, + { + "epoch": 21.686777920410783, + "grad_norm": 1.102052927017212, + "learning_rate": 2.61074026529739e-05, + "loss": 0.4142, + "step": 16894 + }, + { + "epoch": 21.68806161745828, + "grad_norm": 1.2377270460128784, + "learning_rate": 2.6106974753958065e-05, + "loss": 0.3954, + "step": 16895 + }, + { + "epoch": 21.689345314505776, + "grad_norm": 2.478520631790161, + "learning_rate": 2.6106546854942237e-05, + "loss": 0.4364, + "step": 16896 + }, + { + "epoch": 21.690629011553273, + "grad_norm": 1.5599288940429688, + "learning_rate": 2.61061189559264e-05, + "loss": 0.4327, + "step": 16897 + }, + { + "epoch": 21.69191270860077, + "grad_norm": 1.4070008993148804, + "learning_rate": 2.610569105691057e-05, + "loss": 0.4177, + "step": 16898 + }, + { + "epoch": 21.693196405648266, + "grad_norm": 1.8064043521881104, + "learning_rate": 2.610526315789474e-05, + "loss": 0.4632, + "step": 16899 + }, + { + "epoch": 21.694480102695763, + "grad_norm": 28.31871223449707, + "learning_rate": 2.6104835258878904e-05, + "loss": 0.4452, + "step": 16900 + }, + { + "epoch": 21.69576379974326, + "grad_norm": 4.48402214050293, + "learning_rate": 2.6104407359863072e-05, + "loss": 0.4586, + "step": 16901 + }, + { + "epoch": 21.69704749679076, + "grad_norm": 1.7269147634506226, + "learning_rate": 2.610397946084724e-05, + "loss": 0.4877, + "step": 16902 + }, + { + "epoch": 21.698331193838253, + "grad_norm": 2.0618014335632324, + "learning_rate": 2.610355156183141e-05, + "loss": 0.4861, + "step": 16903 + }, + { + "epoch": 21.69961489088575, + "grad_norm": 6.388575077056885, + "learning_rate": 2.6103123662815577e-05, + "loss": 0.4131, + "step": 16904 + }, + { + "epoch": 21.70089858793325, + "grad_norm": 2.839081048965454, + "learning_rate": 2.6102695763799746e-05, + "loss": 0.4856, + "step": 16905 + }, + { + "epoch": 21.702182284980744, + "grad_norm": 
4.332163333892822, + "learning_rate": 2.610226786478391e-05, + "loss": 0.4703, + "step": 16906 + }, + { + "epoch": 21.70346598202824, + "grad_norm": 29.094144821166992, + "learning_rate": 2.610183996576808e-05, + "loss": 0.5317, + "step": 16907 + }, + { + "epoch": 21.70474967907574, + "grad_norm": 2.759610176086426, + "learning_rate": 2.6101412066752247e-05, + "loss": 0.5083, + "step": 16908 + }, + { + "epoch": 21.706033376123234, + "grad_norm": 2.5501227378845215, + "learning_rate": 2.6100984167736412e-05, + "loss": 0.6015, + "step": 16909 + }, + { + "epoch": 21.70731707317073, + "grad_norm": 2.4173531532287598, + "learning_rate": 2.6100556268720584e-05, + "loss": 0.4251, + "step": 16910 + }, + { + "epoch": 21.70860077021823, + "grad_norm": 1.1111000776290894, + "learning_rate": 2.610012836970475e-05, + "loss": 0.3906, + "step": 16911 + }, + { + "epoch": 21.709884467265724, + "grad_norm": 1.799511432647705, + "learning_rate": 2.609970047068892e-05, + "loss": 0.3941, + "step": 16912 + }, + { + "epoch": 21.71116816431322, + "grad_norm": 2.4686903953552246, + "learning_rate": 2.6099272571673086e-05, + "loss": 0.4079, + "step": 16913 + }, + { + "epoch": 21.71245186136072, + "grad_norm": 1.5116528272628784, + "learning_rate": 2.609884467265725e-05, + "loss": 0.4505, + "step": 16914 + }, + { + "epoch": 21.713735558408217, + "grad_norm": 1.605604887008667, + "learning_rate": 2.6098416773641423e-05, + "loss": 0.4303, + "step": 16915 + }, + { + "epoch": 21.71501925545571, + "grad_norm": 2.6364083290100098, + "learning_rate": 2.6097988874625588e-05, + "loss": 0.3864, + "step": 16916 + }, + { + "epoch": 21.71630295250321, + "grad_norm": 2.2955734729766846, + "learning_rate": 2.6097560975609756e-05, + "loss": 0.4117, + "step": 16917 + }, + { + "epoch": 21.717586649550707, + "grad_norm": 1.2876842021942139, + "learning_rate": 2.6097133076593925e-05, + "loss": 0.3911, + "step": 16918 + }, + { + "epoch": 21.7188703465982, + "grad_norm": 1.348376989364624, + "learning_rate": 
2.6096705177578093e-05, + "loss": 0.4355, + "step": 16919 + }, + { + "epoch": 21.7201540436457, + "grad_norm": 1.3478035926818848, + "learning_rate": 2.6096277278562258e-05, + "loss": 0.3919, + "step": 16920 + }, + { + "epoch": 21.721437740693197, + "grad_norm": 1.401196837425232, + "learning_rate": 2.6095849379546427e-05, + "loss": 0.4134, + "step": 16921 + }, + { + "epoch": 21.72272143774069, + "grad_norm": 1.2337279319763184, + "learning_rate": 2.6095421480530595e-05, + "loss": 0.4266, + "step": 16922 + }, + { + "epoch": 21.72400513478819, + "grad_norm": 1.1246181726455688, + "learning_rate": 2.6094993581514763e-05, + "loss": 0.4118, + "step": 16923 + }, + { + "epoch": 21.725288831835687, + "grad_norm": 1.8288418054580688, + "learning_rate": 2.6094565682498932e-05, + "loss": 0.4255, + "step": 16924 + }, + { + "epoch": 21.726572528883185, + "grad_norm": 1.2622579336166382, + "learning_rate": 2.6094137783483097e-05, + "loss": 0.402, + "step": 16925 + }, + { + "epoch": 21.72785622593068, + "grad_norm": 2.3676884174346924, + "learning_rate": 2.609370988446727e-05, + "loss": 0.4073, + "step": 16926 + }, + { + "epoch": 21.729139922978177, + "grad_norm": 1.2062163352966309, + "learning_rate": 2.6093281985451434e-05, + "loss": 0.398, + "step": 16927 + }, + { + "epoch": 21.730423620025675, + "grad_norm": 2.985698699951172, + "learning_rate": 2.60928540864356e-05, + "loss": 0.457, + "step": 16928 + }, + { + "epoch": 21.73170731707317, + "grad_norm": 1.526949405670166, + "learning_rate": 2.609242618741977e-05, + "loss": 0.4652, + "step": 16929 + }, + { + "epoch": 21.732991014120667, + "grad_norm": 6.4509477615356445, + "learning_rate": 2.6091998288403936e-05, + "loss": 0.4459, + "step": 16930 + }, + { + "epoch": 21.734274711168165, + "grad_norm": 1.1960443258285522, + "learning_rate": 2.6091570389388107e-05, + "loss": 0.4242, + "step": 16931 + }, + { + "epoch": 21.73555840821566, + "grad_norm": 1.6812325716018677, + "learning_rate": 2.6091142490372272e-05, + "loss": 
0.4478, + "step": 16932 + }, + { + "epoch": 21.736842105263158, + "grad_norm": 0.8799163699150085, + "learning_rate": 2.609071459135644e-05, + "loss": 0.4199, + "step": 16933 + }, + { + "epoch": 21.738125802310655, + "grad_norm": 2.36378812789917, + "learning_rate": 2.609028669234061e-05, + "loss": 0.4371, + "step": 16934 + }, + { + "epoch": 21.739409499358153, + "grad_norm": 1.58971107006073, + "learning_rate": 2.6089858793324774e-05, + "loss": 0.4178, + "step": 16935 + }, + { + "epoch": 21.740693196405648, + "grad_norm": 1.2281131744384766, + "learning_rate": 2.6089430894308943e-05, + "loss": 0.3771, + "step": 16936 + }, + { + "epoch": 21.741976893453145, + "grad_norm": 1.2079837322235107, + "learning_rate": 2.608900299529311e-05, + "loss": 0.4577, + "step": 16937 + }, + { + "epoch": 21.743260590500643, + "grad_norm": 0.9169016480445862, + "learning_rate": 2.608857509627728e-05, + "loss": 0.4396, + "step": 16938 + }, + { + "epoch": 21.744544287548138, + "grad_norm": 2.392646551132202, + "learning_rate": 2.6088147197261448e-05, + "loss": 0.4745, + "step": 16939 + }, + { + "epoch": 21.745827984595635, + "grad_norm": 1.8582645654678345, + "learning_rate": 2.6087719298245616e-05, + "loss": 0.4109, + "step": 16940 + }, + { + "epoch": 21.747111681643133, + "grad_norm": 1.2999838590621948, + "learning_rate": 2.608729139922978e-05, + "loss": 0.4477, + "step": 16941 + }, + { + "epoch": 21.748395378690628, + "grad_norm": 2.5228049755096436, + "learning_rate": 2.608686350021395e-05, + "loss": 0.388, + "step": 16942 + }, + { + "epoch": 21.749679075738126, + "grad_norm": 2.3330883979797363, + "learning_rate": 2.6086435601198118e-05, + "loss": 0.4647, + "step": 16943 + }, + { + "epoch": 21.750962772785623, + "grad_norm": 1.58022940158844, + "learning_rate": 2.6086007702182283e-05, + "loss": 0.4117, + "step": 16944 + }, + { + "epoch": 21.752246469833118, + "grad_norm": 1.8690792322158813, + "learning_rate": 2.6085579803166455e-05, + "loss": 0.4468, + "step": 16945 + }, + { + 
"epoch": 21.753530166880616, + "grad_norm": 2.251607894897461, + "learning_rate": 2.608515190415062e-05, + "loss": 0.4141, + "step": 16946 + }, + { + "epoch": 21.754813863928113, + "grad_norm": 1.0485610961914062, + "learning_rate": 2.6084724005134792e-05, + "loss": 0.4212, + "step": 16947 + }, + { + "epoch": 21.75609756097561, + "grad_norm": 1.3759311437606812, + "learning_rate": 2.6084296106118957e-05, + "loss": 0.4611, + "step": 16948 + }, + { + "epoch": 21.757381258023106, + "grad_norm": 2.4156885147094727, + "learning_rate": 2.6083868207103122e-05, + "loss": 0.4198, + "step": 16949 + }, + { + "epoch": 21.758664955070603, + "grad_norm": 3.739670991897583, + "learning_rate": 2.6083440308087294e-05, + "loss": 0.4836, + "step": 16950 + }, + { + "epoch": 21.7599486521181, + "grad_norm": 4.506738185882568, + "learning_rate": 2.608301240907146e-05, + "loss": 0.4646, + "step": 16951 + }, + { + "epoch": 21.761232349165596, + "grad_norm": 1.1434903144836426, + "learning_rate": 2.6082584510055627e-05, + "loss": 0.4599, + "step": 16952 + }, + { + "epoch": 21.762516046213094, + "grad_norm": 2.3327386379241943, + "learning_rate": 2.6082156611039795e-05, + "loss": 0.4109, + "step": 16953 + }, + { + "epoch": 21.76379974326059, + "grad_norm": 1.665572166442871, + "learning_rate": 2.6081728712023964e-05, + "loss": 0.3844, + "step": 16954 + }, + { + "epoch": 21.765083440308086, + "grad_norm": 2.7890331745147705, + "learning_rate": 2.6081300813008132e-05, + "loss": 0.4733, + "step": 16955 + }, + { + "epoch": 21.766367137355584, + "grad_norm": 1.8418536186218262, + "learning_rate": 2.6080872913992297e-05, + "loss": 0.4668, + "step": 16956 + }, + { + "epoch": 21.76765083440308, + "grad_norm": 1.3934897184371948, + "learning_rate": 2.6080445014976466e-05, + "loss": 0.4616, + "step": 16957 + }, + { + "epoch": 21.76893453145058, + "grad_norm": 5.180170059204102, + "learning_rate": 2.6080017115960634e-05, + "loss": 0.5661, + "step": 16958 + }, + { + "epoch": 21.770218228498074, + 
"grad_norm": 6.202630996704102, + "learning_rate": 2.6079589216944803e-05, + "loss": 0.6303, + "step": 16959 + }, + { + "epoch": 21.77150192554557, + "grad_norm": 1.3674997091293335, + "learning_rate": 2.6079161317928968e-05, + "loss": 0.3746, + "step": 16960 + }, + { + "epoch": 21.77278562259307, + "grad_norm": 0.8295171856880188, + "learning_rate": 2.6078733418913136e-05, + "loss": 0.3837, + "step": 16961 + }, + { + "epoch": 21.774069319640564, + "grad_norm": 2.1771507263183594, + "learning_rate": 2.6078305519897304e-05, + "loss": 0.3899, + "step": 16962 + }, + { + "epoch": 21.77535301668806, + "grad_norm": 1.7198796272277832, + "learning_rate": 2.6077877620881473e-05, + "loss": 0.4757, + "step": 16963 + }, + { + "epoch": 21.77663671373556, + "grad_norm": 1.3882824182510376, + "learning_rate": 2.607744972186564e-05, + "loss": 0.3973, + "step": 16964 + }, + { + "epoch": 21.777920410783054, + "grad_norm": 1.0802664756774902, + "learning_rate": 2.6077021822849806e-05, + "loss": 0.4023, + "step": 16965 + }, + { + "epoch": 21.77920410783055, + "grad_norm": 1.103947639465332, + "learning_rate": 2.6076593923833978e-05, + "loss": 0.3983, + "step": 16966 + }, + { + "epoch": 21.78048780487805, + "grad_norm": 1.3839603662490845, + "learning_rate": 2.6076166024818143e-05, + "loss": 0.4094, + "step": 16967 + }, + { + "epoch": 21.781771501925547, + "grad_norm": 2.9598915576934814, + "learning_rate": 2.6075738125802308e-05, + "loss": 0.3967, + "step": 16968 + }, + { + "epoch": 21.78305519897304, + "grad_norm": 1.9312697649002075, + "learning_rate": 2.607531022678648e-05, + "loss": 0.3972, + "step": 16969 + }, + { + "epoch": 21.78433889602054, + "grad_norm": 1.3327314853668213, + "learning_rate": 2.6074882327770645e-05, + "loss": 0.391, + "step": 16970 + }, + { + "epoch": 21.785622593068037, + "grad_norm": 1.0574326515197754, + "learning_rate": 2.6074454428754817e-05, + "loss": 0.3987, + "step": 16971 + }, + { + "epoch": 21.78690629011553, + "grad_norm": 1.3439847230911255, + 
"learning_rate": 2.6074026529738982e-05, + "loss": 0.4003, + "step": 16972 + }, + { + "epoch": 21.78818998716303, + "grad_norm": 1.5993140935897827, + "learning_rate": 2.607359863072315e-05, + "loss": 0.4241, + "step": 16973 + }, + { + "epoch": 21.789473684210527, + "grad_norm": 1.4674488306045532, + "learning_rate": 2.607317073170732e-05, + "loss": 0.4228, + "step": 16974 + }, + { + "epoch": 21.79075738125802, + "grad_norm": 1.428065299987793, + "learning_rate": 2.6072742832691484e-05, + "loss": 0.4114, + "step": 16975 + }, + { + "epoch": 21.79204107830552, + "grad_norm": 1.1170531511306763, + "learning_rate": 2.6072314933675652e-05, + "loss": 0.4024, + "step": 16976 + }, + { + "epoch": 21.793324775353017, + "grad_norm": 0.843090832233429, + "learning_rate": 2.607188703465982e-05, + "loss": 0.359, + "step": 16977 + }, + { + "epoch": 21.794608472400512, + "grad_norm": 1.0214190483093262, + "learning_rate": 2.607145913564399e-05, + "loss": 0.4107, + "step": 16978 + }, + { + "epoch": 21.79589216944801, + "grad_norm": 4.436569690704346, + "learning_rate": 2.6071031236628157e-05, + "loss": 0.3894, + "step": 16979 + }, + { + "epoch": 21.797175866495508, + "grad_norm": 3.9841818809509277, + "learning_rate": 2.6070603337612326e-05, + "loss": 0.4232, + "step": 16980 + }, + { + "epoch": 21.798459563543005, + "grad_norm": 1.996091365814209, + "learning_rate": 2.607017543859649e-05, + "loss": 0.4112, + "step": 16981 + }, + { + "epoch": 21.7997432605905, + "grad_norm": 1.495867133140564, + "learning_rate": 2.606974753958066e-05, + "loss": 0.3907, + "step": 16982 + }, + { + "epoch": 21.801026957637998, + "grad_norm": 1.2655078172683716, + "learning_rate": 2.6069319640564828e-05, + "loss": 0.411, + "step": 16983 + }, + { + "epoch": 21.802310654685495, + "grad_norm": 4.588610649108887, + "learning_rate": 2.6068891741548993e-05, + "loss": 0.3498, + "step": 16984 + }, + { + "epoch": 21.80359435173299, + "grad_norm": 1.3592582941055298, + "learning_rate": 2.6068463842533164e-05, + 
"loss": 0.4048, + "step": 16985 + }, + { + "epoch": 21.804878048780488, + "grad_norm": 1.2309045791625977, + "learning_rate": 2.606803594351733e-05, + "loss": 0.407, + "step": 16986 + }, + { + "epoch": 21.806161745827985, + "grad_norm": 1.346373438835144, + "learning_rate": 2.60676080445015e-05, + "loss": 0.3848, + "step": 16987 + }, + { + "epoch": 21.80744544287548, + "grad_norm": 2.1865570545196533, + "learning_rate": 2.6067180145485666e-05, + "loss": 0.4006, + "step": 16988 + }, + { + "epoch": 21.808729139922978, + "grad_norm": 1.696042537689209, + "learning_rate": 2.606675224646983e-05, + "loss": 0.4058, + "step": 16989 + }, + { + "epoch": 21.810012836970476, + "grad_norm": 2.644397735595703, + "learning_rate": 2.6066324347454003e-05, + "loss": 0.4225, + "step": 16990 + }, + { + "epoch": 21.811296534017973, + "grad_norm": 1.511117696762085, + "learning_rate": 2.6065896448438168e-05, + "loss": 0.4325, + "step": 16991 + }, + { + "epoch": 21.812580231065468, + "grad_norm": 1.3797812461853027, + "learning_rate": 2.6065468549422336e-05, + "loss": 0.4217, + "step": 16992 + }, + { + "epoch": 21.813863928112966, + "grad_norm": 3.5929486751556396, + "learning_rate": 2.6065040650406505e-05, + "loss": 0.4154, + "step": 16993 + }, + { + "epoch": 21.815147625160463, + "grad_norm": 2.0232226848602295, + "learning_rate": 2.6064612751390673e-05, + "loss": 0.4022, + "step": 16994 + }, + { + "epoch": 21.816431322207958, + "grad_norm": 6.310202121734619, + "learning_rate": 2.606418485237484e-05, + "loss": 0.4095, + "step": 16995 + }, + { + "epoch": 21.817715019255456, + "grad_norm": 2.029449462890625, + "learning_rate": 2.6063756953359007e-05, + "loss": 0.4075, + "step": 16996 + }, + { + "epoch": 21.818998716302954, + "grad_norm": 4.456124305725098, + "learning_rate": 2.6063329054343175e-05, + "loss": 0.4628, + "step": 16997 + }, + { + "epoch": 21.820282413350448, + "grad_norm": 5.130229473114014, + "learning_rate": 2.6062901155327344e-05, + "loss": 0.4379, + "step": 16998 + }, + 
{ + "epoch": 21.821566110397946, + "grad_norm": 1.576045274734497, + "learning_rate": 2.6062473256311512e-05, + "loss": 0.4355, + "step": 16999 + }, + { + "epoch": 21.822849807445444, + "grad_norm": 1.7908333539962769, + "learning_rate": 2.6062045357295677e-05, + "loss": 0.4892, + "step": 17000 + }, + { + "epoch": 21.822849807445444, + "eval_cer": 0.28299392990013705, + "eval_loss": 0.498418390750885, + "eval_runtime": 13.6694, + "eval_samples_per_second": 71.912, + "eval_steps_per_second": 0.512, + "eval_wer": 0.4861270350836964, + "step": 17000 + }, + { + "epoch": 21.82413350449294, + "grad_norm": 2.3146469593048096, + "learning_rate": 2.606161745827985e-05, + "loss": 0.4646, + "step": 17001 + }, + { + "epoch": 21.825417201540436, + "grad_norm": 1.760339379310608, + "learning_rate": 2.6061189559264014e-05, + "loss": 0.4132, + "step": 17002 + }, + { + "epoch": 21.826700898587934, + "grad_norm": 1.8875362873077393, + "learning_rate": 2.6060761660248182e-05, + "loss": 0.4415, + "step": 17003 + }, + { + "epoch": 21.82798459563543, + "grad_norm": 1.7938652038574219, + "learning_rate": 2.606033376123235e-05, + "loss": 0.4664, + "step": 17004 + }, + { + "epoch": 21.829268292682926, + "grad_norm": 3.241664409637451, + "learning_rate": 2.6059905862216516e-05, + "loss": 0.4287, + "step": 17005 + }, + { + "epoch": 21.830551989730424, + "grad_norm": 1.653214693069458, + "learning_rate": 2.6059477963200687e-05, + "loss": 0.4885, + "step": 17006 + }, + { + "epoch": 21.83183568677792, + "grad_norm": 3.081007957458496, + "learning_rate": 2.6059050064184852e-05, + "loss": 0.4893, + "step": 17007 + }, + { + "epoch": 21.833119383825416, + "grad_norm": 1.6377629041671753, + "learning_rate": 2.605862216516902e-05, + "loss": 0.5605, + "step": 17008 + }, + { + "epoch": 21.834403080872914, + "grad_norm": 1.8878904581069946, + "learning_rate": 2.605819426615319e-05, + "loss": 0.5962, + "step": 17009 + }, + { + "epoch": 21.83568677792041, + "grad_norm": 1.0382575988769531, + 
"learning_rate": 2.6057766367137354e-05, + "loss": 0.4243, + "step": 17010 + }, + { + "epoch": 21.836970474967906, + "grad_norm": 1.0096133947372437, + "learning_rate": 2.6057338468121526e-05, + "loss": 0.384, + "step": 17011 + }, + { + "epoch": 21.838254172015404, + "grad_norm": 1.514161467552185, + "learning_rate": 2.605691056910569e-05, + "loss": 0.4016, + "step": 17012 + }, + { + "epoch": 21.8395378690629, + "grad_norm": 1.232506275177002, + "learning_rate": 2.605648267008986e-05, + "loss": 0.4212, + "step": 17013 + }, + { + "epoch": 21.8408215661104, + "grad_norm": 1.8966186046600342, + "learning_rate": 2.6056054771074028e-05, + "loss": 0.3946, + "step": 17014 + }, + { + "epoch": 21.842105263157894, + "grad_norm": 1.4810914993286133, + "learning_rate": 2.6055626872058196e-05, + "loss": 0.4061, + "step": 17015 + }, + { + "epoch": 21.84338896020539, + "grad_norm": 1.199371576309204, + "learning_rate": 2.605519897304236e-05, + "loss": 0.3928, + "step": 17016 + }, + { + "epoch": 21.84467265725289, + "grad_norm": 1.096167802810669, + "learning_rate": 2.605477107402653e-05, + "loss": 0.4048, + "step": 17017 + }, + { + "epoch": 21.845956354300384, + "grad_norm": 0.9680678248405457, + "learning_rate": 2.6054343175010698e-05, + "loss": 0.4281, + "step": 17018 + }, + { + "epoch": 21.84724005134788, + "grad_norm": 2.4443206787109375, + "learning_rate": 2.6053915275994867e-05, + "loss": 0.3969, + "step": 17019 + }, + { + "epoch": 21.84852374839538, + "grad_norm": 2.646045207977295, + "learning_rate": 2.6053487376979035e-05, + "loss": 0.3977, + "step": 17020 + }, + { + "epoch": 21.849807445442874, + "grad_norm": 1.6413463354110718, + "learning_rate": 2.60530594779632e-05, + "loss": 0.4112, + "step": 17021 + }, + { + "epoch": 21.85109114249037, + "grad_norm": 2.2140467166900635, + "learning_rate": 2.605263157894737e-05, + "loss": 0.4182, + "step": 17022 + }, + { + "epoch": 21.85237483953787, + "grad_norm": 1.5262835025787354, + "learning_rate": 2.6052203679931537e-05, + 
"loss": 0.3903, + "step": 17023 + }, + { + "epoch": 21.853658536585368, + "grad_norm": 2.2312819957733154, + "learning_rate": 2.6051775780915702e-05, + "loss": 0.3767, + "step": 17024 + }, + { + "epoch": 21.854942233632862, + "grad_norm": 3.389533758163452, + "learning_rate": 2.6051347881899874e-05, + "loss": 0.4131, + "step": 17025 + }, + { + "epoch": 21.85622593068036, + "grad_norm": 1.491836428642273, + "learning_rate": 2.605091998288404e-05, + "loss": 0.3974, + "step": 17026 + }, + { + "epoch": 21.857509627727858, + "grad_norm": 1.5413808822631836, + "learning_rate": 2.605049208386821e-05, + "loss": 0.393, + "step": 17027 + }, + { + "epoch": 21.858793324775352, + "grad_norm": 1.261025071144104, + "learning_rate": 2.6050064184852376e-05, + "loss": 0.4184, + "step": 17028 + }, + { + "epoch": 21.86007702182285, + "grad_norm": 1.1865358352661133, + "learning_rate": 2.604963628583654e-05, + "loss": 0.3852, + "step": 17029 + }, + { + "epoch": 21.861360718870348, + "grad_norm": 1.2945281267166138, + "learning_rate": 2.6049208386820712e-05, + "loss": 0.4211, + "step": 17030 + }, + { + "epoch": 21.862644415917842, + "grad_norm": 0.9309272170066833, + "learning_rate": 2.6048780487804877e-05, + "loss": 0.4193, + "step": 17031 + }, + { + "epoch": 21.86392811296534, + "grad_norm": 1.4245431423187256, + "learning_rate": 2.6048352588789046e-05, + "loss": 0.3758, + "step": 17032 + }, + { + "epoch": 21.865211810012838, + "grad_norm": 0.9608796834945679, + "learning_rate": 2.6047924689773214e-05, + "loss": 0.3934, + "step": 17033 + }, + { + "epoch": 21.866495507060336, + "grad_norm": 2.525362968444824, + "learning_rate": 2.6047496790757383e-05, + "loss": 0.4081, + "step": 17034 + }, + { + "epoch": 21.86777920410783, + "grad_norm": 1.671194314956665, + "learning_rate": 2.604706889174155e-05, + "loss": 0.3775, + "step": 17035 + }, + { + "epoch": 21.869062901155328, + "grad_norm": 1.4613323211669922, + "learning_rate": 2.6046640992725716e-05, + "loss": 0.3728, + "step": 17036 + }, 
+ { + "epoch": 21.870346598202826, + "grad_norm": 9.069083213806152, + "learning_rate": 2.6046213093709884e-05, + "loss": 0.375, + "step": 17037 + }, + { + "epoch": 21.87163029525032, + "grad_norm": 2.6728968620300293, + "learning_rate": 2.6045785194694053e-05, + "loss": 0.4156, + "step": 17038 + }, + { + "epoch": 21.872913992297818, + "grad_norm": 1.2713078260421753, + "learning_rate": 2.604535729567822e-05, + "loss": 0.3917, + "step": 17039 + }, + { + "epoch": 21.874197689345316, + "grad_norm": 1.3084092140197754, + "learning_rate": 2.6044929396662386e-05, + "loss": 0.3857, + "step": 17040 + }, + { + "epoch": 21.87548138639281, + "grad_norm": 0.883705735206604, + "learning_rate": 2.6044501497646558e-05, + "loss": 0.4251, + "step": 17041 + }, + { + "epoch": 21.876765083440308, + "grad_norm": 1.2501296997070312, + "learning_rate": 2.6044073598630723e-05, + "loss": 0.3933, + "step": 17042 + }, + { + "epoch": 21.878048780487806, + "grad_norm": 1.8239519596099854, + "learning_rate": 2.604364569961489e-05, + "loss": 0.3726, + "step": 17043 + }, + { + "epoch": 21.8793324775353, + "grad_norm": 1.494103193283081, + "learning_rate": 2.604321780059906e-05, + "loss": 0.4357, + "step": 17044 + }, + { + "epoch": 21.880616174582798, + "grad_norm": 2.316873550415039, + "learning_rate": 2.6042789901583225e-05, + "loss": 0.4011, + "step": 17045 + }, + { + "epoch": 21.881899871630296, + "grad_norm": 2.160048484802246, + "learning_rate": 2.6042362002567397e-05, + "loss": 0.4878, + "step": 17046 + }, + { + "epoch": 21.883183568677794, + "grad_norm": 1.9488022327423096, + "learning_rate": 2.6041934103551562e-05, + "loss": 0.415, + "step": 17047 + }, + { + "epoch": 21.884467265725288, + "grad_norm": 2.397113561630249, + "learning_rate": 2.604150620453573e-05, + "loss": 0.4137, + "step": 17048 + }, + { + "epoch": 21.885750962772786, + "grad_norm": 1.2291865348815918, + "learning_rate": 2.60410783055199e-05, + "loss": 0.4186, + "step": 17049 + }, + { + "epoch": 21.887034659820284, + 
"grad_norm": 2.2543387413024902, + "learning_rate": 2.6040650406504064e-05, + "loss": 0.463, + "step": 17050 + }, + { + "epoch": 21.888318356867778, + "grad_norm": 1.4846317768096924, + "learning_rate": 2.6040222507488235e-05, + "loss": 0.4597, + "step": 17051 + }, + { + "epoch": 21.889602053915276, + "grad_norm": 1.8380204439163208, + "learning_rate": 2.60397946084724e-05, + "loss": 0.4417, + "step": 17052 + }, + { + "epoch": 21.890885750962774, + "grad_norm": 2.228327751159668, + "learning_rate": 2.603936670945657e-05, + "loss": 0.4219, + "step": 17053 + }, + { + "epoch": 21.892169448010268, + "grad_norm": 1.821368932723999, + "learning_rate": 2.6038938810440737e-05, + "loss": 0.4456, + "step": 17054 + }, + { + "epoch": 21.893453145057766, + "grad_norm": 1.079679250717163, + "learning_rate": 2.6038510911424906e-05, + "loss": 0.4638, + "step": 17055 + }, + { + "epoch": 21.894736842105264, + "grad_norm": 2.109740734100342, + "learning_rate": 2.603808301240907e-05, + "loss": 0.4708, + "step": 17056 + }, + { + "epoch": 21.89602053915276, + "grad_norm": 2.3290674686431885, + "learning_rate": 2.603765511339324e-05, + "loss": 0.5457, + "step": 17057 + }, + { + "epoch": 21.897304236200256, + "grad_norm": 2.765899181365967, + "learning_rate": 2.6037227214377408e-05, + "loss": 0.4884, + "step": 17058 + }, + { + "epoch": 21.898587933247754, + "grad_norm": 2.396434783935547, + "learning_rate": 2.6036799315361576e-05, + "loss": 0.6489, + "step": 17059 + }, + { + "epoch": 21.89987163029525, + "grad_norm": 1.6913094520568848, + "learning_rate": 2.6036371416345744e-05, + "loss": 0.3838, + "step": 17060 + }, + { + "epoch": 21.901155327342746, + "grad_norm": 2.0033304691314697, + "learning_rate": 2.603594351732991e-05, + "loss": 0.391, + "step": 17061 + }, + { + "epoch": 21.902439024390244, + "grad_norm": 1.3701289892196655, + "learning_rate": 2.603551561831408e-05, + "loss": 0.4151, + "step": 17062 + }, + { + "epoch": 21.90372272143774, + "grad_norm": 1.5053839683532715, + 
"learning_rate": 2.6035087719298246e-05, + "loss": 0.477, + "step": 17063 + }, + { + "epoch": 21.905006418485236, + "grad_norm": 0.9169126152992249, + "learning_rate": 2.603465982028241e-05, + "loss": 0.3881, + "step": 17064 + }, + { + "epoch": 21.906290115532734, + "grad_norm": 0.9494152069091797, + "learning_rate": 2.6034231921266583e-05, + "loss": 0.3709, + "step": 17065 + }, + { + "epoch": 21.90757381258023, + "grad_norm": 1.1610654592514038, + "learning_rate": 2.6033804022250748e-05, + "loss": 0.4583, + "step": 17066 + }, + { + "epoch": 21.90885750962773, + "grad_norm": 1.3617768287658691, + "learning_rate": 2.603337612323492e-05, + "loss": 0.4032, + "step": 17067 + }, + { + "epoch": 21.910141206675224, + "grad_norm": 1.0906164646148682, + "learning_rate": 2.6032948224219085e-05, + "loss": 0.3921, + "step": 17068 + }, + { + "epoch": 21.911424903722722, + "grad_norm": 1.5168155431747437, + "learning_rate": 2.6032520325203253e-05, + "loss": 0.4269, + "step": 17069 + }, + { + "epoch": 21.91270860077022, + "grad_norm": 1.1206318140029907, + "learning_rate": 2.6032092426187422e-05, + "loss": 0.4178, + "step": 17070 + }, + { + "epoch": 21.913992297817714, + "grad_norm": 1.2654222249984741, + "learning_rate": 2.6031664527171587e-05, + "loss": 0.4024, + "step": 17071 + }, + { + "epoch": 21.915275994865212, + "grad_norm": 1.5307397842407227, + "learning_rate": 2.6031236628155755e-05, + "loss": 0.4192, + "step": 17072 + }, + { + "epoch": 21.91655969191271, + "grad_norm": 1.7419852018356323, + "learning_rate": 2.6030808729139924e-05, + "loss": 0.3731, + "step": 17073 + }, + { + "epoch": 21.917843388960204, + "grad_norm": 2.574201822280884, + "learning_rate": 2.6030380830124092e-05, + "loss": 0.3765, + "step": 17074 + }, + { + "epoch": 21.919127086007702, + "grad_norm": 1.3742669820785522, + "learning_rate": 2.602995293110826e-05, + "loss": 0.3858, + "step": 17075 + }, + { + "epoch": 21.9204107830552, + "grad_norm": 1.0706902742385864, + "learning_rate": 
2.602952503209243e-05, + "loss": 0.3855, + "step": 17076 + }, + { + "epoch": 21.921694480102694, + "grad_norm": 1.6991840600967407, + "learning_rate": 2.6029097133076594e-05, + "loss": 0.3868, + "step": 17077 + }, + { + "epoch": 21.922978177150192, + "grad_norm": 1.9342297315597534, + "learning_rate": 2.6028669234060762e-05, + "loss": 0.4306, + "step": 17078 + }, + { + "epoch": 21.92426187419769, + "grad_norm": 1.255058765411377, + "learning_rate": 2.602824133504493e-05, + "loss": 0.4147, + "step": 17079 + }, + { + "epoch": 21.925545571245188, + "grad_norm": 3.817361831665039, + "learning_rate": 2.6027813436029096e-05, + "loss": 0.3906, + "step": 17080 + }, + { + "epoch": 21.926829268292682, + "grad_norm": 1.6094074249267578, + "learning_rate": 2.6027385537013267e-05, + "loss": 0.3734, + "step": 17081 + }, + { + "epoch": 21.92811296534018, + "grad_norm": 1.2770261764526367, + "learning_rate": 2.6026957637997433e-05, + "loss": 0.3811, + "step": 17082 + }, + { + "epoch": 21.929396662387678, + "grad_norm": 0.992374837398529, + "learning_rate": 2.60265297389816e-05, + "loss": 0.4036, + "step": 17083 + }, + { + "epoch": 21.930680359435172, + "grad_norm": 2.476968765258789, + "learning_rate": 2.602610183996577e-05, + "loss": 0.3888, + "step": 17084 + }, + { + "epoch": 21.93196405648267, + "grad_norm": 1.9811517000198364, + "learning_rate": 2.6025673940949934e-05, + "loss": 0.4183, + "step": 17085 + }, + { + "epoch": 21.933247753530168, + "grad_norm": 1.1485867500305176, + "learning_rate": 2.6025246041934106e-05, + "loss": 0.3989, + "step": 17086 + }, + { + "epoch": 21.934531450577662, + "grad_norm": 1.3516225814819336, + "learning_rate": 2.602481814291827e-05, + "loss": 0.4094, + "step": 17087 + }, + { + "epoch": 21.93581514762516, + "grad_norm": 0.9967828989028931, + "learning_rate": 2.602439024390244e-05, + "loss": 0.4141, + "step": 17088 + }, + { + "epoch": 21.937098844672658, + "grad_norm": 3.1097278594970703, + "learning_rate": 2.6023962344886608e-05, + "loss": 
0.4152, + "step": 17089 + }, + { + "epoch": 21.938382541720156, + "grad_norm": 1.5592622756958008, + "learning_rate": 2.6023534445870773e-05, + "loss": 0.4104, + "step": 17090 + }, + { + "epoch": 21.93966623876765, + "grad_norm": 2.258669376373291, + "learning_rate": 2.6023106546854945e-05, + "loss": 0.4302, + "step": 17091 + }, + { + "epoch": 21.940949935815148, + "grad_norm": 1.5052542686462402, + "learning_rate": 2.602267864783911e-05, + "loss": 0.4004, + "step": 17092 + }, + { + "epoch": 21.942233632862646, + "grad_norm": 1.0356560945510864, + "learning_rate": 2.6022250748823278e-05, + "loss": 0.4541, + "step": 17093 + }, + { + "epoch": 21.94351732991014, + "grad_norm": 2.5462429523468018, + "learning_rate": 2.6021822849807447e-05, + "loss": 0.4184, + "step": 17094 + }, + { + "epoch": 21.944801026957638, + "grad_norm": 1.3212759494781494, + "learning_rate": 2.6021394950791615e-05, + "loss": 0.4446, + "step": 17095 + }, + { + "epoch": 21.946084724005136, + "grad_norm": 0.996844470500946, + "learning_rate": 2.602096705177578e-05, + "loss": 0.4396, + "step": 17096 + }, + { + "epoch": 21.94736842105263, + "grad_norm": 1.7946900129318237, + "learning_rate": 2.602053915275995e-05, + "loss": 0.4832, + "step": 17097 + }, + { + "epoch": 21.948652118100128, + "grad_norm": 0.9405632019042969, + "learning_rate": 2.6020111253744117e-05, + "loss": 0.402, + "step": 17098 + }, + { + "epoch": 21.949935815147626, + "grad_norm": 1.5193219184875488, + "learning_rate": 2.6019683354728285e-05, + "loss": 0.4581, + "step": 17099 + }, + { + "epoch": 21.951219512195124, + "grad_norm": 1.63100266456604, + "learning_rate": 2.6019255455712454e-05, + "loss": 0.4062, + "step": 17100 + }, + { + "epoch": 21.952503209242618, + "grad_norm": 1.4040879011154175, + "learning_rate": 2.601882755669662e-05, + "loss": 0.451, + "step": 17101 + }, + { + "epoch": 21.953786906290116, + "grad_norm": 2.4881019592285156, + "learning_rate": 2.601839965768079e-05, + "loss": 0.4992, + "step": 17102 + }, + { + 
"epoch": 21.955070603337614, + "grad_norm": 1.8086450099945068, + "learning_rate": 2.6017971758664956e-05, + "loss": 0.4939, + "step": 17103 + }, + { + "epoch": 21.956354300385108, + "grad_norm": 3.7948899269104004, + "learning_rate": 2.601754385964912e-05, + "loss": 0.4555, + "step": 17104 + }, + { + "epoch": 21.957637997432606, + "grad_norm": 1.5284955501556396, + "learning_rate": 2.6017115960633292e-05, + "loss": 0.4971, + "step": 17105 + }, + { + "epoch": 21.958921694480104, + "grad_norm": 2.098766326904297, + "learning_rate": 2.6016688061617457e-05, + "loss": 0.4538, + "step": 17106 + }, + { + "epoch": 21.960205391527598, + "grad_norm": Infinity, + "learning_rate": 2.6016688061617457e-05, + "loss": 0.5152, + "step": 17107 + }, + { + "epoch": 21.961489088575096, + "grad_norm": 1.7503795623779297, + "learning_rate": 2.601626016260163e-05, + "loss": 0.5206, + "step": 17108 + }, + { + "epoch": 21.962772785622594, + "grad_norm": 2.0888540744781494, + "learning_rate": 2.6015832263585794e-05, + "loss": 0.5687, + "step": 17109 + }, + { + "epoch": 21.964056482670088, + "grad_norm": 1.5558453798294067, + "learning_rate": 2.6015404364569963e-05, + "loss": 0.4004, + "step": 17110 + }, + { + "epoch": 21.965340179717586, + "grad_norm": 0.8299717307090759, + "learning_rate": 2.601497646555413e-05, + "loss": 0.4181, + "step": 17111 + }, + { + "epoch": 21.966623876765084, + "grad_norm": 1.4305195808410645, + "learning_rate": 2.6014548566538296e-05, + "loss": 0.3922, + "step": 17112 + }, + { + "epoch": 21.96790757381258, + "grad_norm": 1.2075364589691162, + "learning_rate": 2.6014120667522465e-05, + "loss": 0.3787, + "step": 17113 + }, + { + "epoch": 21.969191270860076, + "grad_norm": 1.8916311264038086, + "learning_rate": 2.6013692768506633e-05, + "loss": 0.4225, + "step": 17114 + }, + { + "epoch": 21.970474967907574, + "grad_norm": 1.3328980207443237, + "learning_rate": 2.60132648694908e-05, + "loss": 0.4135, + "step": 17115 + }, + { + "epoch": 21.971758664955072, + 
"grad_norm": 0.9867634773254395, + "learning_rate": 2.601283697047497e-05, + "loss": 0.3809, + "step": 17116 + }, + { + "epoch": 21.973042362002566, + "grad_norm": 3.146237373352051, + "learning_rate": 2.6012409071459138e-05, + "loss": 0.4037, + "step": 17117 + }, + { + "epoch": 21.974326059050064, + "grad_norm": 1.9484964609146118, + "learning_rate": 2.6011981172443303e-05, + "loss": 0.4162, + "step": 17118 + }, + { + "epoch": 21.975609756097562, + "grad_norm": 1.2161715030670166, + "learning_rate": 2.601155327342747e-05, + "loss": 0.3963, + "step": 17119 + }, + { + "epoch": 21.976893453145056, + "grad_norm": 1.050351619720459, + "learning_rate": 2.601112537441164e-05, + "loss": 0.3903, + "step": 17120 + }, + { + "epoch": 21.978177150192554, + "grad_norm": 1.5940279960632324, + "learning_rate": 2.6010697475395805e-05, + "loss": 0.3971, + "step": 17121 + }, + { + "epoch": 21.979460847240052, + "grad_norm": 3.3350815773010254, + "learning_rate": 2.6010269576379977e-05, + "loss": 0.4075, + "step": 17122 + }, + { + "epoch": 21.98074454428755, + "grad_norm": 1.1364821195602417, + "learning_rate": 2.6009841677364142e-05, + "loss": 0.4202, + "step": 17123 + }, + { + "epoch": 21.982028241335044, + "grad_norm": 1.4905650615692139, + "learning_rate": 2.600941377834831e-05, + "loss": 0.4234, + "step": 17124 + }, + { + "epoch": 21.983311938382542, + "grad_norm": 1.3216700553894043, + "learning_rate": 2.600898587933248e-05, + "loss": 0.4343, + "step": 17125 + }, + { + "epoch": 21.98459563543004, + "grad_norm": 1.4860492944717407, + "learning_rate": 2.6008557980316644e-05, + "loss": 0.4136, + "step": 17126 + }, + { + "epoch": 21.985879332477534, + "grad_norm": 1.0490485429763794, + "learning_rate": 2.6008130081300816e-05, + "loss": 0.4223, + "step": 17127 + }, + { + "epoch": 21.987163029525032, + "grad_norm": 3.1670725345611572, + "learning_rate": 2.600770218228498e-05, + "loss": 0.4291, + "step": 17128 + }, + { + "epoch": 21.98844672657253, + "grad_norm": 3.5833261013031006, + 
"learning_rate": 2.600727428326915e-05, + "loss": 0.4352, + "step": 17129 + }, + { + "epoch": 21.989730423620024, + "grad_norm": 2.335954427719116, + "learning_rate": 2.6006846384253317e-05, + "loss": 0.4451, + "step": 17130 + }, + { + "epoch": 21.991014120667522, + "grad_norm": 2.9069674015045166, + "learning_rate": 2.6006418485237486e-05, + "loss": 0.4306, + "step": 17131 + }, + { + "epoch": 21.99229781771502, + "grad_norm": 2.230268716812134, + "learning_rate": 2.600599058622165e-05, + "loss": 0.4374, + "step": 17132 + }, + { + "epoch": 21.993581514762518, + "grad_norm": 1.2194136381149292, + "learning_rate": 2.600556268720582e-05, + "loss": 0.4136, + "step": 17133 + }, + { + "epoch": 21.994865211810012, + "grad_norm": 3.2681760787963867, + "learning_rate": 2.6005134788189988e-05, + "loss": 0.4945, + "step": 17134 + }, + { + "epoch": 21.99614890885751, + "grad_norm": 2.8902997970581055, + "learning_rate": 2.6004706889174156e-05, + "loss": 0.4502, + "step": 17135 + }, + { + "epoch": 21.997432605905008, + "grad_norm": 1.4808170795440674, + "learning_rate": 2.6004278990158324e-05, + "loss": 0.5121, + "step": 17136 + }, + { + "epoch": 21.998716302952502, + "grad_norm": 2.1684858798980713, + "learning_rate": 2.600385109114249e-05, + "loss": 0.4861, + "step": 17137 + }, + { + "epoch": 22.0, + "grad_norm": 4.001806259155273, + "learning_rate": 2.600342319212666e-05, + "loss": 0.5465, + "step": 17138 + }, + { + "epoch": 22.001283697047498, + "grad_norm": 1.0447925329208374, + "learning_rate": 2.6002995293110826e-05, + "loss": 0.3918, + "step": 17139 + }, + { + "epoch": 22.002567394094992, + "grad_norm": 1.6086524724960327, + "learning_rate": 2.600256739409499e-05, + "loss": 0.387, + "step": 17140 + }, + { + "epoch": 22.00385109114249, + "grad_norm": 1.1889584064483643, + "learning_rate": 2.6002139495079163e-05, + "loss": 0.3591, + "step": 17141 + }, + { + "epoch": 22.005134788189988, + "grad_norm": 4.094753265380859, + "learning_rate": 2.6001711596063328e-05, + "loss": 
0.3718, + "step": 17142 + }, + { + "epoch": 22.006418485237482, + "grad_norm": 2.1475164890289307, + "learning_rate": 2.60012836970475e-05, + "loss": 0.3903, + "step": 17143 + }, + { + "epoch": 22.00770218228498, + "grad_norm": 0.9422386884689331, + "learning_rate": 2.6000855798031665e-05, + "loss": 0.378, + "step": 17144 + }, + { + "epoch": 22.008985879332478, + "grad_norm": 3.3875625133514404, + "learning_rate": 2.6000427899015833e-05, + "loss": 0.3985, + "step": 17145 + }, + { + "epoch": 22.010269576379976, + "grad_norm": 1.0151715278625488, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.3898, + "step": 17146 + }, + { + "epoch": 22.01155327342747, + "grad_norm": 1.0163389444351196, + "learning_rate": 2.5999572100984167e-05, + "loss": 0.3774, + "step": 17147 + }, + { + "epoch": 22.012836970474968, + "grad_norm": 2.0205326080322266, + "learning_rate": 2.5999144201968335e-05, + "loss": 0.4077, + "step": 17148 + }, + { + "epoch": 22.014120667522466, + "grad_norm": 1.93195378780365, + "learning_rate": 2.5998716302952504e-05, + "loss": 0.3626, + "step": 17149 + }, + { + "epoch": 22.01540436456996, + "grad_norm": 1.1000556945800781, + "learning_rate": 2.5998288403936672e-05, + "loss": 0.4066, + "step": 17150 + }, + { + "epoch": 22.016688061617458, + "grad_norm": 1.5245026350021362, + "learning_rate": 2.599786050492084e-05, + "loss": 0.3868, + "step": 17151 + }, + { + "epoch": 22.017971758664956, + "grad_norm": 3.118187189102173, + "learning_rate": 2.5997432605905005e-05, + "loss": 0.3504, + "step": 17152 + }, + { + "epoch": 22.01925545571245, + "grad_norm": 1.4131978750228882, + "learning_rate": 2.5997004706889174e-05, + "loss": 0.3835, + "step": 17153 + }, + { + "epoch": 22.020539152759948, + "grad_norm": 1.3335421085357666, + "learning_rate": 2.5996576807873342e-05, + "loss": 0.3675, + "step": 17154 + }, + { + "epoch": 22.021822849807446, + "grad_norm": 1.0988765954971313, + "learning_rate": 2.599614890885751e-05, + "loss": 0.3999, + "step": 17155 + }, + { + 
"epoch": 22.023106546854944, + "grad_norm": 2.019932746887207, + "learning_rate": 2.5995721009841676e-05, + "loss": 0.4022, + "step": 17156 + }, + { + "epoch": 22.024390243902438, + "grad_norm": 3.067139148712158, + "learning_rate": 2.5995293110825848e-05, + "loss": 0.4097, + "step": 17157 + }, + { + "epoch": 22.025673940949936, + "grad_norm": 3.277409553527832, + "learning_rate": 2.5994865211810013e-05, + "loss": 0.3945, + "step": 17158 + }, + { + "epoch": 22.026957637997434, + "grad_norm": 1.7855271100997925, + "learning_rate": 2.599443731279418e-05, + "loss": 0.439, + "step": 17159 + }, + { + "epoch": 22.028241335044928, + "grad_norm": 1.6070308685302734, + "learning_rate": 2.599400941377835e-05, + "loss": 0.3986, + "step": 17160 + }, + { + "epoch": 22.029525032092426, + "grad_norm": 1.0811798572540283, + "learning_rate": 2.5993581514762514e-05, + "loss": 0.3677, + "step": 17161 + }, + { + "epoch": 22.030808729139924, + "grad_norm": 1.6747018098831177, + "learning_rate": 2.5993153615746686e-05, + "loss": 0.3902, + "step": 17162 + }, + { + "epoch": 22.03209242618742, + "grad_norm": 1.1151684522628784, + "learning_rate": 2.599272571673085e-05, + "loss": 0.3945, + "step": 17163 + }, + { + "epoch": 22.033376123234916, + "grad_norm": 1.7460805177688599, + "learning_rate": 2.599229781771502e-05, + "loss": 0.4515, + "step": 17164 + }, + { + "epoch": 22.034659820282414, + "grad_norm": 3.064469575881958, + "learning_rate": 2.5991869918699188e-05, + "loss": 0.3918, + "step": 17165 + }, + { + "epoch": 22.035943517329912, + "grad_norm": 3.347836971282959, + "learning_rate": 2.5991442019683353e-05, + "loss": 0.3506, + "step": 17166 + }, + { + "epoch": 22.037227214377406, + "grad_norm": 1.4452968835830688, + "learning_rate": 2.5991014120667525e-05, + "loss": 0.372, + "step": 17167 + }, + { + "epoch": 22.038510911424904, + "grad_norm": 1.2890738248825073, + "learning_rate": 2.599058622165169e-05, + "loss": 0.3925, + "step": 17168 + }, + { + "epoch": 22.039794608472402, + 
"grad_norm": 0.9757136702537537, + "learning_rate": 2.599015832263586e-05, + "loss": 0.4221, + "step": 17169 + }, + { + "epoch": 22.041078305519896, + "grad_norm": 0.9618952870368958, + "learning_rate": 2.5989730423620027e-05, + "loss": 0.3943, + "step": 17170 + }, + { + "epoch": 22.042362002567394, + "grad_norm": 1.0560437440872192, + "learning_rate": 2.5989302524604195e-05, + "loss": 0.4147, + "step": 17171 + }, + { + "epoch": 22.043645699614892, + "grad_norm": 1.3448264598846436, + "learning_rate": 2.598887462558836e-05, + "loss": 0.4285, + "step": 17172 + }, + { + "epoch": 22.044929396662386, + "grad_norm": 1.1557350158691406, + "learning_rate": 2.598844672657253e-05, + "loss": 0.3698, + "step": 17173 + }, + { + "epoch": 22.046213093709884, + "grad_norm": 1.3256224393844604, + "learning_rate": 2.5988018827556697e-05, + "loss": 0.3637, + "step": 17174 + }, + { + "epoch": 22.047496790757382, + "grad_norm": 2.1953201293945312, + "learning_rate": 2.5987590928540865e-05, + "loss": 0.435, + "step": 17175 + }, + { + "epoch": 22.048780487804876, + "grad_norm": 2.2569992542266846, + "learning_rate": 2.5987163029525034e-05, + "loss": 0.4218, + "step": 17176 + }, + { + "epoch": 22.050064184852374, + "grad_norm": 2.0691616535186768, + "learning_rate": 2.59867351305092e-05, + "loss": 0.372, + "step": 17177 + }, + { + "epoch": 22.051347881899872, + "grad_norm": 1.8316617012023926, + "learning_rate": 2.598630723149337e-05, + "loss": 0.4223, + "step": 17178 + }, + { + "epoch": 22.05263157894737, + "grad_norm": 3.836568832397461, + "learning_rate": 2.5985879332477536e-05, + "loss": 0.4365, + "step": 17179 + }, + { + "epoch": 22.053915275994864, + "grad_norm": 6.44552755355835, + "learning_rate": 2.59854514334617e-05, + "loss": 0.451, + "step": 17180 + }, + { + "epoch": 22.055198973042362, + "grad_norm": 7.118551254272461, + "learning_rate": 2.5985023534445872e-05, + "loss": 0.4217, + "step": 17181 + }, + { + "epoch": 22.05648267008986, + "grad_norm": 1.670104742050171, + 
"learning_rate": 2.5984595635430038e-05, + "loss": 0.3884, + "step": 17182 + }, + { + "epoch": 22.057766367137354, + "grad_norm": 2.491422176361084, + "learning_rate": 2.598416773641421e-05, + "loss": 0.4213, + "step": 17183 + }, + { + "epoch": 22.059050064184852, + "grad_norm": 1.538008689880371, + "learning_rate": 2.5983739837398374e-05, + "loss": 0.4411, + "step": 17184 + }, + { + "epoch": 22.06033376123235, + "grad_norm": 1.2371820211410522, + "learning_rate": 2.5983311938382543e-05, + "loss": 0.4896, + "step": 17185 + }, + { + "epoch": 22.061617458279844, + "grad_norm": 3.2006051540374756, + "learning_rate": 2.598288403936671e-05, + "loss": 0.5136, + "step": 17186 + }, + { + "epoch": 22.062901155327342, + "grad_norm": 1.5626847743988037, + "learning_rate": 2.5982456140350876e-05, + "loss": 0.4596, + "step": 17187 + }, + { + "epoch": 22.06418485237484, + "grad_norm": 4.631435871124268, + "learning_rate": 2.5982028241335045e-05, + "loss": 0.657, + "step": 17188 + }, + { + "epoch": 22.065468549422338, + "grad_norm": 2.5228309631347656, + "learning_rate": 2.5981600342319213e-05, + "loss": 0.3533, + "step": 17189 + }, + { + "epoch": 22.066752246469832, + "grad_norm": 2.3897624015808105, + "learning_rate": 2.598117244330338e-05, + "loss": 0.3624, + "step": 17190 + }, + { + "epoch": 22.06803594351733, + "grad_norm": 1.5057647228240967, + "learning_rate": 2.598074454428755e-05, + "loss": 0.4041, + "step": 17191 + }, + { + "epoch": 22.069319640564828, + "grad_norm": 1.8986440896987915, + "learning_rate": 2.5980316645271718e-05, + "loss": 0.4059, + "step": 17192 + }, + { + "epoch": 22.070603337612322, + "grad_norm": 1.3755288124084473, + "learning_rate": 2.5979888746255883e-05, + "loss": 0.3979, + "step": 17193 + }, + { + "epoch": 22.07188703465982, + "grad_norm": 1.3408904075622559, + "learning_rate": 2.597946084724005e-05, + "loss": 0.3791, + "step": 17194 + }, + { + "epoch": 22.073170731707318, + "grad_norm": 1.0201014280319214, + "learning_rate": 
2.597903294822422e-05, + "loss": 0.3645, + "step": 17195 + }, + { + "epoch": 22.074454428754812, + "grad_norm": 1.4578121900558472, + "learning_rate": 2.5978605049208385e-05, + "loss": 0.3702, + "step": 17196 + }, + { + "epoch": 22.07573812580231, + "grad_norm": 2.1467485427856445, + "learning_rate": 2.5978177150192557e-05, + "loss": 0.3762, + "step": 17197 + }, + { + "epoch": 22.077021822849808, + "grad_norm": 0.8735862374305725, + "learning_rate": 2.5977749251176722e-05, + "loss": 0.4263, + "step": 17198 + }, + { + "epoch": 22.078305519897306, + "grad_norm": 1.2490421533584595, + "learning_rate": 2.5977321352160894e-05, + "loss": 0.3615, + "step": 17199 + }, + { + "epoch": 22.0795892169448, + "grad_norm": 1.7789621353149414, + "learning_rate": 2.597689345314506e-05, + "loss": 0.3957, + "step": 17200 + }, + { + "epoch": 22.080872913992298, + "grad_norm": 1.375856637954712, + "learning_rate": 2.5976465554129224e-05, + "loss": 0.3703, + "step": 17201 + }, + { + "epoch": 22.082156611039796, + "grad_norm": 1.890606164932251, + "learning_rate": 2.5976037655113396e-05, + "loss": 0.4108, + "step": 17202 + }, + { + "epoch": 22.08344030808729, + "grad_norm": 1.1646026372909546, + "learning_rate": 2.597560975609756e-05, + "loss": 0.3685, + "step": 17203 + }, + { + "epoch": 22.084724005134788, + "grad_norm": 8.863274574279785, + "learning_rate": 2.597518185708173e-05, + "loss": 0.4134, + "step": 17204 + }, + { + "epoch": 22.086007702182286, + "grad_norm": 2.5363738536834717, + "learning_rate": 2.5974753958065897e-05, + "loss": 0.41, + "step": 17205 + }, + { + "epoch": 22.08729139922978, + "grad_norm": 5.643470287322998, + "learning_rate": 2.5974326059050066e-05, + "loss": 0.3713, + "step": 17206 + }, + { + "epoch": 22.088575096277278, + "grad_norm": 1.1961805820465088, + "learning_rate": 2.5973898160034234e-05, + "loss": 0.4335, + "step": 17207 + }, + { + "epoch": 22.089858793324776, + "grad_norm": 1.498844861984253, + "learning_rate": 2.59734702610184e-05, + "loss": 0.3834, 
+ "step": 17208 + }, + { + "epoch": 22.09114249037227, + "grad_norm": 1.0739293098449707, + "learning_rate": 2.5973042362002568e-05, + "loss": 0.3953, + "step": 17209 + }, + { + "epoch": 22.09242618741977, + "grad_norm": 1.3805145025253296, + "learning_rate": 2.5972614462986736e-05, + "loss": 0.3825, + "step": 17210 + }, + { + "epoch": 22.093709884467266, + "grad_norm": 1.2461998462677002, + "learning_rate": 2.5972186563970905e-05, + "loss": 0.3775, + "step": 17211 + }, + { + "epoch": 22.094993581514764, + "grad_norm": 1.3003555536270142, + "learning_rate": 2.597175866495507e-05, + "loss": 0.3802, + "step": 17212 + }, + { + "epoch": 22.09627727856226, + "grad_norm": 1.4974392652511597, + "learning_rate": 2.5971330765939238e-05, + "loss": 0.3879, + "step": 17213 + }, + { + "epoch": 22.097560975609756, + "grad_norm": 1.9862046241760254, + "learning_rate": 2.5970902866923406e-05, + "loss": 0.3683, + "step": 17214 + }, + { + "epoch": 22.098844672657254, + "grad_norm": 2.084165096282959, + "learning_rate": 2.5970474967907575e-05, + "loss": 0.389, + "step": 17215 + }, + { + "epoch": 22.10012836970475, + "grad_norm": 1.44005286693573, + "learning_rate": 2.5970047068891743e-05, + "loss": 0.3924, + "step": 17216 + }, + { + "epoch": 22.101412066752246, + "grad_norm": 1.3096566200256348, + "learning_rate": 2.5969619169875908e-05, + "loss": 0.3873, + "step": 17217 + }, + { + "epoch": 22.102695763799744, + "grad_norm": 4.243007659912109, + "learning_rate": 2.596919127086008e-05, + "loss": 0.4056, + "step": 17218 + }, + { + "epoch": 22.10397946084724, + "grad_norm": 2.009592294692993, + "learning_rate": 2.5968763371844245e-05, + "loss": 0.3944, + "step": 17219 + }, + { + "epoch": 22.105263157894736, + "grad_norm": 1.5403422117233276, + "learning_rate": 2.596833547282841e-05, + "loss": 0.3988, + "step": 17220 + }, + { + "epoch": 22.106546854942234, + "grad_norm": 1.128577470779419, + "learning_rate": 2.5967907573812582e-05, + "loss": 0.41, + "step": 17221 + }, + { + "epoch": 
22.107830551989732, + "grad_norm": 1.3555039167404175, + "learning_rate": 2.5967479674796747e-05, + "loss": 0.3602, + "step": 17222 + }, + { + "epoch": 22.109114249037226, + "grad_norm": 1.3343156576156616, + "learning_rate": 2.596705177578092e-05, + "loss": 0.4613, + "step": 17223 + }, + { + "epoch": 22.110397946084724, + "grad_norm": 2.0537266731262207, + "learning_rate": 2.5966623876765084e-05, + "loss": 0.4489, + "step": 17224 + }, + { + "epoch": 22.111681643132222, + "grad_norm": 1.5637049674987793, + "learning_rate": 2.5966195977749252e-05, + "loss": 0.4188, + "step": 17225 + }, + { + "epoch": 22.112965340179716, + "grad_norm": 3.968686103820801, + "learning_rate": 2.596576807873342e-05, + "loss": 0.4483, + "step": 17226 + }, + { + "epoch": 22.114249037227214, + "grad_norm": 1.5087507963180542, + "learning_rate": 2.5965340179717586e-05, + "loss": 0.3421, + "step": 17227 + }, + { + "epoch": 22.115532734274712, + "grad_norm": 2.8468918800354004, + "learning_rate": 2.5964912280701754e-05, + "loss": 0.4371, + "step": 17228 + }, + { + "epoch": 22.116816431322206, + "grad_norm": 2.0820882320404053, + "learning_rate": 2.5964484381685922e-05, + "loss": 0.3857, + "step": 17229 + }, + { + "epoch": 22.118100128369704, + "grad_norm": 1.5566794872283936, + "learning_rate": 2.596405648267009e-05, + "loss": 0.4582, + "step": 17230 + }, + { + "epoch": 22.119383825417202, + "grad_norm": 1.7189918756484985, + "learning_rate": 2.596362858365426e-05, + "loss": 0.4099, + "step": 17231 + }, + { + "epoch": 22.1206675224647, + "grad_norm": 2.476013660430908, + "learning_rate": 2.5963200684638428e-05, + "loss": 0.4509, + "step": 17232 + }, + { + "epoch": 22.121951219512194, + "grad_norm": 2.058659076690674, + "learning_rate": 2.5962772785622593e-05, + "loss": 0.4507, + "step": 17233 + }, + { + "epoch": 22.123234916559692, + "grad_norm": 1.8602136373519897, + "learning_rate": 2.596234488660676e-05, + "loss": 0.4328, + "step": 17234 + }, + { + "epoch": 22.12451861360719, + "grad_norm": 
1.3349581956863403, + "learning_rate": 2.596191698759093e-05, + "loss": 0.4716, + "step": 17235 + }, + { + "epoch": 22.125802310654684, + "grad_norm": 1.6392443180084229, + "learning_rate": 2.5961489088575094e-05, + "loss": 0.4762, + "step": 17236 + }, + { + "epoch": 22.127086007702182, + "grad_norm": 2.6570851802825928, + "learning_rate": 2.5961061189559266e-05, + "loss": 0.5193, + "step": 17237 + }, + { + "epoch": 22.12836970474968, + "grad_norm": 3.8759677410125732, + "learning_rate": 2.596063329054343e-05, + "loss": 0.6002, + "step": 17238 + }, + { + "epoch": 22.129653401797174, + "grad_norm": 1.400129795074463, + "learning_rate": 2.5960205391527603e-05, + "loss": 0.3886, + "step": 17239 + }, + { + "epoch": 22.130937098844672, + "grad_norm": 1.1189814805984497, + "learning_rate": 2.5959777492511768e-05, + "loss": 0.3682, + "step": 17240 + }, + { + "epoch": 22.13222079589217, + "grad_norm": 1.161270022392273, + "learning_rate": 2.5959349593495933e-05, + "loss": 0.4105, + "step": 17241 + }, + { + "epoch": 22.133504492939664, + "grad_norm": 1.61992609500885, + "learning_rate": 2.5958921694480105e-05, + "loss": 0.3723, + "step": 17242 + }, + { + "epoch": 22.134788189987162, + "grad_norm": 1.9887579679489136, + "learning_rate": 2.595849379546427e-05, + "loss": 0.4036, + "step": 17243 + }, + { + "epoch": 22.13607188703466, + "grad_norm": 0.8786041140556335, + "learning_rate": 2.595806589644844e-05, + "loss": 0.335, + "step": 17244 + }, + { + "epoch": 22.137355584082158, + "grad_norm": 1.384360671043396, + "learning_rate": 2.5957637997432607e-05, + "loss": 0.347, + "step": 17245 + }, + { + "epoch": 22.138639281129652, + "grad_norm": 1.8883165121078491, + "learning_rate": 2.5957210098416775e-05, + "loss": 0.4405, + "step": 17246 + }, + { + "epoch": 22.13992297817715, + "grad_norm": 5.9847283363342285, + "learning_rate": 2.5956782199400944e-05, + "loss": 0.3933, + "step": 17247 + }, + { + "epoch": 22.141206675224648, + "grad_norm": 1.4344499111175537, + "learning_rate": 
2.595635430038511e-05, + "loss": 0.3865, + "step": 17248 + }, + { + "epoch": 22.142490372272142, + "grad_norm": 1.4185887575149536, + "learning_rate": 2.5955926401369277e-05, + "loss": 0.3724, + "step": 17249 + }, + { + "epoch": 22.14377406931964, + "grad_norm": 1.2669217586517334, + "learning_rate": 2.5955498502353445e-05, + "loss": 0.4467, + "step": 17250 + }, + { + "epoch": 22.145057766367138, + "grad_norm": 2.4796810150146484, + "learning_rate": 2.5955070603337614e-05, + "loss": 0.3773, + "step": 17251 + }, + { + "epoch": 22.146341463414632, + "grad_norm": 1.3530703783035278, + "learning_rate": 2.595464270432178e-05, + "loss": 0.4064, + "step": 17252 + }, + { + "epoch": 22.14762516046213, + "grad_norm": 1.6392782926559448, + "learning_rate": 2.595421480530595e-05, + "loss": 0.403, + "step": 17253 + }, + { + "epoch": 22.14890885750963, + "grad_norm": 1.6296813488006592, + "learning_rate": 2.5953786906290116e-05, + "loss": 0.4311, + "step": 17254 + }, + { + "epoch": 22.150192554557126, + "grad_norm": 1.1970795392990112, + "learning_rate": 2.5953359007274284e-05, + "loss": 0.346, + "step": 17255 + }, + { + "epoch": 22.15147625160462, + "grad_norm": 1.9029879570007324, + "learning_rate": 2.5952931108258453e-05, + "loss": 0.361, + "step": 17256 + }, + { + "epoch": 22.15275994865212, + "grad_norm": 1.086911678314209, + "learning_rate": 2.5952503209242618e-05, + "loss": 0.4096, + "step": 17257 + }, + { + "epoch": 22.154043645699616, + "grad_norm": 3.2762792110443115, + "learning_rate": 2.595207531022679e-05, + "loss": 0.3951, + "step": 17258 + }, + { + "epoch": 22.15532734274711, + "grad_norm": 1.0117714405059814, + "learning_rate": 2.5951647411210954e-05, + "loss": 0.3823, + "step": 17259 + }, + { + "epoch": 22.15661103979461, + "grad_norm": 0.9059821963310242, + "learning_rate": 2.5951219512195123e-05, + "loss": 0.3895, + "step": 17260 + }, + { + "epoch": 22.157894736842106, + "grad_norm": 1.710434079170227, + "learning_rate": 2.595079161317929e-05, + "loss": 0.398, 
+ "step": 17261 + }, + { + "epoch": 22.1591784338896, + "grad_norm": 4.5708909034729, + "learning_rate": 2.5950363714163456e-05, + "loss": 0.3786, + "step": 17262 + }, + { + "epoch": 22.1604621309371, + "grad_norm": 1.323978066444397, + "learning_rate": 2.5949935815147628e-05, + "loss": 0.3804, + "step": 17263 + }, + { + "epoch": 22.161745827984596, + "grad_norm": 1.9135780334472656, + "learning_rate": 2.5949507916131793e-05, + "loss": 0.409, + "step": 17264 + }, + { + "epoch": 22.163029525032094, + "grad_norm": 1.093850016593933, + "learning_rate": 2.594908001711596e-05, + "loss": 0.3921, + "step": 17265 + }, + { + "epoch": 22.16431322207959, + "grad_norm": 1.1966038942337036, + "learning_rate": 2.594865211810013e-05, + "loss": 0.3791, + "step": 17266 + }, + { + "epoch": 22.165596919127086, + "grad_norm": 1.1652175188064575, + "learning_rate": 2.5948224219084298e-05, + "loss": 0.4044, + "step": 17267 + }, + { + "epoch": 22.166880616174584, + "grad_norm": 2.4808504581451416, + "learning_rate": 2.5947796320068463e-05, + "loss": 0.4558, + "step": 17268 + }, + { + "epoch": 22.16816431322208, + "grad_norm": 3.6123836040496826, + "learning_rate": 2.5947368421052632e-05, + "loss": 0.4019, + "step": 17269 + }, + { + "epoch": 22.169448010269576, + "grad_norm": 1.4943552017211914, + "learning_rate": 2.59469405220368e-05, + "loss": 0.3944, + "step": 17270 + }, + { + "epoch": 22.170731707317074, + "grad_norm": 1.3183749914169312, + "learning_rate": 2.594651262302097e-05, + "loss": 0.3748, + "step": 17271 + }, + { + "epoch": 22.17201540436457, + "grad_norm": 2.193511486053467, + "learning_rate": 2.5946084724005137e-05, + "loss": 0.3841, + "step": 17272 + }, + { + "epoch": 22.173299101412066, + "grad_norm": 1.5299408435821533, + "learning_rate": 2.5945656824989302e-05, + "loss": 0.3975, + "step": 17273 + }, + { + "epoch": 22.174582798459564, + "grad_norm": 2.130383014678955, + "learning_rate": 2.594522892597347e-05, + "loss": 0.3852, + "step": 17274 + }, + { + "epoch": 
22.17586649550706, + "grad_norm": 2.4511241912841797, + "learning_rate": 2.594480102695764e-05, + "loss": 0.4079, + "step": 17275 + }, + { + "epoch": 22.177150192554556, + "grad_norm": 1.4946670532226562, + "learning_rate": 2.5944373127941804e-05, + "loss": 0.4135, + "step": 17276 + }, + { + "epoch": 22.178433889602054, + "grad_norm": 3.9373514652252197, + "learning_rate": 2.5943945228925976e-05, + "loss": 0.4426, + "step": 17277 + }, + { + "epoch": 22.179717586649552, + "grad_norm": 1.2936756610870361, + "learning_rate": 2.594351732991014e-05, + "loss": 0.4073, + "step": 17278 + }, + { + "epoch": 22.181001283697046, + "grad_norm": 2.1811280250549316, + "learning_rate": 2.5943089430894312e-05, + "loss": 0.4282, + "step": 17279 + }, + { + "epoch": 22.182284980744544, + "grad_norm": 1.3465460538864136, + "learning_rate": 2.5942661531878477e-05, + "loss": 0.4612, + "step": 17280 + }, + { + "epoch": 22.183568677792042, + "grad_norm": 1.3105021715164185, + "learning_rate": 2.5942233632862643e-05, + "loss": 0.4152, + "step": 17281 + }, + { + "epoch": 22.184852374839537, + "grad_norm": 2.360633373260498, + "learning_rate": 2.5941805733846814e-05, + "loss": 0.4311, + "step": 17282 + }, + { + "epoch": 22.186136071887034, + "grad_norm": 8.399184226989746, + "learning_rate": 2.594137783483098e-05, + "loss": 0.3965, + "step": 17283 + }, + { + "epoch": 22.187419768934532, + "grad_norm": 1.6833857297897339, + "learning_rate": 2.5940949935815148e-05, + "loss": 0.4362, + "step": 17284 + }, + { + "epoch": 22.188703465982027, + "grad_norm": 2.347625494003296, + "learning_rate": 2.5940522036799316e-05, + "loss": 0.4286, + "step": 17285 + }, + { + "epoch": 22.189987163029524, + "grad_norm": 1.9771655797958374, + "learning_rate": 2.5940094137783485e-05, + "loss": 0.4437, + "step": 17286 + }, + { + "epoch": 22.191270860077022, + "grad_norm": 6.117166519165039, + "learning_rate": 2.5939666238767653e-05, + "loss": 0.5087, + "step": 17287 + }, + { + "epoch": 22.19255455712452, + 
"grad_norm": 1.6472212076187134, + "learning_rate": 2.5939238339751818e-05, + "loss": 0.5726, + "step": 17288 + }, + { + "epoch": 22.193838254172015, + "grad_norm": 1.34187912940979, + "learning_rate": 2.5938810440735986e-05, + "loss": 0.3627, + "step": 17289 + }, + { + "epoch": 22.195121951219512, + "grad_norm": 2.7723731994628906, + "learning_rate": 2.5938382541720155e-05, + "loss": 0.388, + "step": 17290 + }, + { + "epoch": 22.19640564826701, + "grad_norm": 3.101787805557251, + "learning_rate": 2.5937954642704323e-05, + "loss": 0.3898, + "step": 17291 + }, + { + "epoch": 22.197689345314505, + "grad_norm": 1.729691982269287, + "learning_rate": 2.5937526743688488e-05, + "loss": 0.3868, + "step": 17292 + }, + { + "epoch": 22.198973042362002, + "grad_norm": 1.139878511428833, + "learning_rate": 2.593709884467266e-05, + "loss": 0.3575, + "step": 17293 + }, + { + "epoch": 22.2002567394095, + "grad_norm": 1.04538893699646, + "learning_rate": 2.5936670945656825e-05, + "loss": 0.3644, + "step": 17294 + }, + { + "epoch": 22.201540436456995, + "grad_norm": 1.2911133766174316, + "learning_rate": 2.5936243046640993e-05, + "loss": 0.4396, + "step": 17295 + }, + { + "epoch": 22.202824133504492, + "grad_norm": 0.8375039100646973, + "learning_rate": 2.5935815147625162e-05, + "loss": 0.3727, + "step": 17296 + }, + { + "epoch": 22.20410783055199, + "grad_norm": 3.1068406105041504, + "learning_rate": 2.5935387248609327e-05, + "loss": 0.4027, + "step": 17297 + }, + { + "epoch": 22.205391527599488, + "grad_norm": 4.965360641479492, + "learning_rate": 2.59349593495935e-05, + "loss": 0.4278, + "step": 17298 + }, + { + "epoch": 22.206675224646983, + "grad_norm": 5.136696815490723, + "learning_rate": 2.5934531450577664e-05, + "loss": 0.4186, + "step": 17299 + }, + { + "epoch": 22.20795892169448, + "grad_norm": 2.3868141174316406, + "learning_rate": 2.5934103551561832e-05, + "loss": 0.4257, + "step": 17300 + }, + { + "epoch": 22.20924261874198, + "grad_norm": 1.5614835023880005, + 
"learning_rate": 2.5933675652546e-05, + "loss": 0.3925, + "step": 17301 + }, + { + "epoch": 22.210526315789473, + "grad_norm": 1.3017804622650146, + "learning_rate": 2.5933247753530166e-05, + "loss": 0.4043, + "step": 17302 + }, + { + "epoch": 22.21181001283697, + "grad_norm": 2.2127208709716797, + "learning_rate": 2.5932819854514337e-05, + "loss": 0.392, + "step": 17303 + }, + { + "epoch": 22.21309370988447, + "grad_norm": 1.4224116802215576, + "learning_rate": 2.5932391955498502e-05, + "loss": 0.396, + "step": 17304 + }, + { + "epoch": 22.214377406931963, + "grad_norm": 1.3383023738861084, + "learning_rate": 2.593196405648267e-05, + "loss": 0.4162, + "step": 17305 + }, + { + "epoch": 22.21566110397946, + "grad_norm": 1.0903533697128296, + "learning_rate": 2.593153615746684e-05, + "loss": 0.3669, + "step": 17306 + }, + { + "epoch": 22.21694480102696, + "grad_norm": 2.2164762020111084, + "learning_rate": 2.5931108258451008e-05, + "loss": 0.3702, + "step": 17307 + }, + { + "epoch": 22.218228498074453, + "grad_norm": 1.8714916706085205, + "learning_rate": 2.5930680359435173e-05, + "loss": 0.4256, + "step": 17308 + }, + { + "epoch": 22.21951219512195, + "grad_norm": 1.1453551054000854, + "learning_rate": 2.593025246041934e-05, + "loss": 0.3744, + "step": 17309 + }, + { + "epoch": 22.22079589216945, + "grad_norm": 1.0552574396133423, + "learning_rate": 2.592982456140351e-05, + "loss": 0.3567, + "step": 17310 + }, + { + "epoch": 22.222079589216946, + "grad_norm": 1.6396797895431519, + "learning_rate": 2.5929396662387678e-05, + "loss": 0.4142, + "step": 17311 + }, + { + "epoch": 22.22336328626444, + "grad_norm": 1.6014227867126465, + "learning_rate": 2.5928968763371846e-05, + "loss": 0.3591, + "step": 17312 + }, + { + "epoch": 22.22464698331194, + "grad_norm": 1.1028730869293213, + "learning_rate": 2.592854086435601e-05, + "loss": 0.3982, + "step": 17313 + }, + { + "epoch": 22.225930680359436, + "grad_norm": 1.5815461874008179, + "learning_rate": 2.5928112965340183e-05, 
+ "loss": 0.3798, + "step": 17314 + }, + { + "epoch": 22.22721437740693, + "grad_norm": 1.8376034498214722, + "learning_rate": 2.5927685066324348e-05, + "loss": 0.4134, + "step": 17315 + }, + { + "epoch": 22.22849807445443, + "grad_norm": 0.9893237948417664, + "learning_rate": 2.5927257167308513e-05, + "loss": 0.4038, + "step": 17316 + }, + { + "epoch": 22.229781771501926, + "grad_norm": 6.537512302398682, + "learning_rate": 2.5926829268292685e-05, + "loss": 0.3976, + "step": 17317 + }, + { + "epoch": 22.23106546854942, + "grad_norm": 1.2893872261047363, + "learning_rate": 2.592640136927685e-05, + "loss": 0.4056, + "step": 17318 + }, + { + "epoch": 22.23234916559692, + "grad_norm": 4.081088066101074, + "learning_rate": 2.5925973470261022e-05, + "loss": 0.3995, + "step": 17319 + }, + { + "epoch": 22.233632862644416, + "grad_norm": 1.3989912271499634, + "learning_rate": 2.5925545571245187e-05, + "loss": 0.3847, + "step": 17320 + }, + { + "epoch": 22.234916559691914, + "grad_norm": 2.417815685272217, + "learning_rate": 2.5925117672229355e-05, + "loss": 0.4108, + "step": 17321 + }, + { + "epoch": 22.23620025673941, + "grad_norm": 1.1415740251541138, + "learning_rate": 2.5924689773213524e-05, + "loss": 0.4159, + "step": 17322 + }, + { + "epoch": 22.237483953786906, + "grad_norm": 1.0456335544586182, + "learning_rate": 2.592426187419769e-05, + "loss": 0.3976, + "step": 17323 + }, + { + "epoch": 22.238767650834404, + "grad_norm": 5.111278057098389, + "learning_rate": 2.5923833975181857e-05, + "loss": 0.4191, + "step": 17324 + }, + { + "epoch": 22.2400513478819, + "grad_norm": 1.4081268310546875, + "learning_rate": 2.5923406076166026e-05, + "loss": 0.4221, + "step": 17325 + }, + { + "epoch": 22.241335044929397, + "grad_norm": 4.433136463165283, + "learning_rate": 2.5922978177150194e-05, + "loss": 0.4041, + "step": 17326 + }, + { + "epoch": 22.242618741976894, + "grad_norm": 1.7362940311431885, + "learning_rate": 2.592255027813436e-05, + "loss": 0.4067, + "step": 17327 + }, 
+ { + "epoch": 22.24390243902439, + "grad_norm": 1.4690018892288208, + "learning_rate": 2.592212237911853e-05, + "loss": 0.4113, + "step": 17328 + }, + { + "epoch": 22.245186136071887, + "grad_norm": 2.1571462154388428, + "learning_rate": 2.5921694480102696e-05, + "loss": 0.4076, + "step": 17329 + }, + { + "epoch": 22.246469833119384, + "grad_norm": 2.3338987827301025, + "learning_rate": 2.5921266581086864e-05, + "loss": 0.4621, + "step": 17330 + }, + { + "epoch": 22.247753530166882, + "grad_norm": 1.3101963996887207, + "learning_rate": 2.5920838682071033e-05, + "loss": 0.4064, + "step": 17331 + }, + { + "epoch": 22.249037227214377, + "grad_norm": 1.3980299234390259, + "learning_rate": 2.5920410783055198e-05, + "loss": 0.4282, + "step": 17332 + }, + { + "epoch": 22.250320924261874, + "grad_norm": 1.1437253952026367, + "learning_rate": 2.591998288403937e-05, + "loss": 0.4247, + "step": 17333 + }, + { + "epoch": 22.251604621309372, + "grad_norm": 1.3928672075271606, + "learning_rate": 2.5919554985023534e-05, + "loss": 0.4224, + "step": 17334 + }, + { + "epoch": 22.252888318356867, + "grad_norm": 1.6541740894317627, + "learning_rate": 2.5919127086007703e-05, + "loss": 0.472, + "step": 17335 + }, + { + "epoch": 22.254172015404365, + "grad_norm": 3.096615791320801, + "learning_rate": 2.591869918699187e-05, + "loss": 0.4433, + "step": 17336 + }, + { + "epoch": 22.255455712451862, + "grad_norm": 2.2894551753997803, + "learning_rate": 2.5918271287976036e-05, + "loss": 0.5177, + "step": 17337 + }, + { + "epoch": 22.256739409499357, + "grad_norm": 1.9144538640975952, + "learning_rate": 2.5917843388960208e-05, + "loss": 0.6415, + "step": 17338 + }, + { + "epoch": 22.258023106546855, + "grad_norm": 1.5312610864639282, + "learning_rate": 2.5917415489944373e-05, + "loss": 0.3615, + "step": 17339 + }, + { + "epoch": 22.259306803594352, + "grad_norm": 1.1862900257110596, + "learning_rate": 2.591698759092854e-05, + "loss": 0.3826, + "step": 17340 + }, + { + "epoch": 
22.260590500641847, + "grad_norm": 1.4854915142059326, + "learning_rate": 2.591655969191271e-05, + "loss": 0.3755, + "step": 17341 + }, + { + "epoch": 22.261874197689345, + "grad_norm": 0.9168660044670105, + "learning_rate": 2.5916131792896875e-05, + "loss": 0.3817, + "step": 17342 + }, + { + "epoch": 22.263157894736842, + "grad_norm": 1.0454051494598389, + "learning_rate": 2.5915703893881043e-05, + "loss": 0.3754, + "step": 17343 + }, + { + "epoch": 22.26444159178434, + "grad_norm": 1.221364140510559, + "learning_rate": 2.5915275994865212e-05, + "loss": 0.3785, + "step": 17344 + }, + { + "epoch": 22.265725288831835, + "grad_norm": 1.1753265857696533, + "learning_rate": 2.591484809584938e-05, + "loss": 0.3774, + "step": 17345 + }, + { + "epoch": 22.267008985879333, + "grad_norm": 1.8121825456619263, + "learning_rate": 2.591442019683355e-05, + "loss": 0.3921, + "step": 17346 + }, + { + "epoch": 22.26829268292683, + "grad_norm": 2.2143867015838623, + "learning_rate": 2.5913992297817717e-05, + "loss": 0.3503, + "step": 17347 + }, + { + "epoch": 22.269576379974325, + "grad_norm": 1.8437594175338745, + "learning_rate": 2.5913564398801882e-05, + "loss": 0.3943, + "step": 17348 + }, + { + "epoch": 22.270860077021823, + "grad_norm": 1.010363221168518, + "learning_rate": 2.591313649978605e-05, + "loss": 0.3726, + "step": 17349 + }, + { + "epoch": 22.27214377406932, + "grad_norm": 1.3062303066253662, + "learning_rate": 2.591270860077022e-05, + "loss": 0.3968, + "step": 17350 + }, + { + "epoch": 22.273427471116815, + "grad_norm": 1.2949268817901611, + "learning_rate": 2.5912280701754384e-05, + "loss": 0.3765, + "step": 17351 + }, + { + "epoch": 22.274711168164313, + "grad_norm": 2.822662830352783, + "learning_rate": 2.5911852802738556e-05, + "loss": 0.3688, + "step": 17352 + }, + { + "epoch": 22.27599486521181, + "grad_norm": 2.808178663253784, + "learning_rate": 2.591142490372272e-05, + "loss": 0.3599, + "step": 17353 + }, + { + "epoch": 22.27727856225931, + "grad_norm": 
0.9632667899131775, + "learning_rate": 2.5910997004706893e-05, + "loss": 0.3927, + "step": 17354 + }, + { + "epoch": 22.278562259306803, + "grad_norm": 2.6270148754119873, + "learning_rate": 2.5910569105691058e-05, + "loss": 0.4026, + "step": 17355 + }, + { + "epoch": 22.2798459563543, + "grad_norm": 1.989658236503601, + "learning_rate": 2.5910141206675223e-05, + "loss": 0.3907, + "step": 17356 + }, + { + "epoch": 22.2811296534018, + "grad_norm": 2.3472461700439453, + "learning_rate": 2.5909713307659394e-05, + "loss": 0.3725, + "step": 17357 + }, + { + "epoch": 22.282413350449293, + "grad_norm": 1.480592966079712, + "learning_rate": 2.590928540864356e-05, + "loss": 0.4109, + "step": 17358 + }, + { + "epoch": 22.28369704749679, + "grad_norm": 1.6183513402938843, + "learning_rate": 2.5908857509627728e-05, + "loss": 0.3822, + "step": 17359 + }, + { + "epoch": 22.28498074454429, + "grad_norm": 1.6049778461456299, + "learning_rate": 2.5908429610611896e-05, + "loss": 0.3963, + "step": 17360 + }, + { + "epoch": 22.286264441591783, + "grad_norm": 1.5413798093795776, + "learning_rate": 2.5908001711596065e-05, + "loss": 0.3757, + "step": 17361 + }, + { + "epoch": 22.28754813863928, + "grad_norm": 0.9393236041069031, + "learning_rate": 2.5907573812580233e-05, + "loss": 0.383, + "step": 17362 + }, + { + "epoch": 22.28883183568678, + "grad_norm": 14.798690795898438, + "learning_rate": 2.5907145913564398e-05, + "loss": 0.3682, + "step": 17363 + }, + { + "epoch": 22.290115532734276, + "grad_norm": 1.0160508155822754, + "learning_rate": 2.5906718014548566e-05, + "loss": 0.3529, + "step": 17364 + }, + { + "epoch": 22.29139922978177, + "grad_norm": 1.365073800086975, + "learning_rate": 2.5906290115532735e-05, + "loss": 0.4224, + "step": 17365 + }, + { + "epoch": 22.29268292682927, + "grad_norm": 1.3079241514205933, + "learning_rate": 2.5905862216516903e-05, + "loss": 0.3746, + "step": 17366 + }, + { + "epoch": 22.293966623876766, + "grad_norm": 1.3629634380340576, + "learning_rate": 
2.590543431750107e-05, + "loss": 0.3623, + "step": 17367 + }, + { + "epoch": 22.29525032092426, + "grad_norm": 3.117002010345459, + "learning_rate": 2.590500641848524e-05, + "loss": 0.4045, + "step": 17368 + }, + { + "epoch": 22.29653401797176, + "grad_norm": 1.24339759349823, + "learning_rate": 2.5904578519469405e-05, + "loss": 0.4398, + "step": 17369 + }, + { + "epoch": 22.297817715019256, + "grad_norm": 2.083543062210083, + "learning_rate": 2.5904150620453574e-05, + "loss": 0.4176, + "step": 17370 + }, + { + "epoch": 22.29910141206675, + "grad_norm": 1.2506718635559082, + "learning_rate": 2.5903722721437742e-05, + "loss": 0.4385, + "step": 17371 + }, + { + "epoch": 22.30038510911425, + "grad_norm": 1.148695468902588, + "learning_rate": 2.5903294822421907e-05, + "loss": 0.3883, + "step": 17372 + }, + { + "epoch": 22.301668806161747, + "grad_norm": 1.667117953300476, + "learning_rate": 2.590286692340608e-05, + "loss": 0.4287, + "step": 17373 + }, + { + "epoch": 22.30295250320924, + "grad_norm": 1.7762781381607056, + "learning_rate": 2.5902439024390244e-05, + "loss": 0.414, + "step": 17374 + }, + { + "epoch": 22.30423620025674, + "grad_norm": 3.3643574714660645, + "learning_rate": 2.5902011125374412e-05, + "loss": 0.4262, + "step": 17375 + }, + { + "epoch": 22.305519897304237, + "grad_norm": 2.426323890686035, + "learning_rate": 2.590158322635858e-05, + "loss": 0.4086, + "step": 17376 + }, + { + "epoch": 22.306803594351734, + "grad_norm": 1.7390269041061401, + "learning_rate": 2.5901155327342746e-05, + "loss": 0.402, + "step": 17377 + }, + { + "epoch": 22.30808729139923, + "grad_norm": 1.1473174095153809, + "learning_rate": 2.5900727428326917e-05, + "loss": 0.452, + "step": 17378 + }, + { + "epoch": 22.309370988446727, + "grad_norm": 2.137406587600708, + "learning_rate": 2.5900299529311082e-05, + "loss": 0.4477, + "step": 17379 + }, + { + "epoch": 22.310654685494224, + "grad_norm": 1.3610734939575195, + "learning_rate": 2.589987163029525e-05, + "loss": 0.4502, + 
"step": 17380 + }, + { + "epoch": 22.31193838254172, + "grad_norm": 4.325992584228516, + "learning_rate": 2.589944373127942e-05, + "loss": 0.4474, + "step": 17381 + }, + { + "epoch": 22.313222079589217, + "grad_norm": 1.548757791519165, + "learning_rate": 2.5899015832263588e-05, + "loss": 0.4297, + "step": 17382 + }, + { + "epoch": 22.314505776636715, + "grad_norm": 2.4248223304748535, + "learning_rate": 2.5898587933247753e-05, + "loss": 0.4833, + "step": 17383 + }, + { + "epoch": 22.31578947368421, + "grad_norm": 2.076573133468628, + "learning_rate": 2.589816003423192e-05, + "loss": 0.4815, + "step": 17384 + }, + { + "epoch": 22.317073170731707, + "grad_norm": 2.1602816581726074, + "learning_rate": 2.589773213521609e-05, + "loss": 0.5105, + "step": 17385 + }, + { + "epoch": 22.318356867779205, + "grad_norm": 2.1644041538238525, + "learning_rate": 2.5897304236200258e-05, + "loss": 0.5643, + "step": 17386 + }, + { + "epoch": 22.319640564826702, + "grad_norm": 2.8626770973205566, + "learning_rate": 2.5896876337184426e-05, + "loss": 0.5664, + "step": 17387 + }, + { + "epoch": 22.320924261874197, + "grad_norm": 2.6581058502197266, + "learning_rate": 2.589644843816859e-05, + "loss": 0.5961, + "step": 17388 + }, + { + "epoch": 22.322207958921695, + "grad_norm": 1.3786765336990356, + "learning_rate": 2.5896020539152763e-05, + "loss": 0.3618, + "step": 17389 + }, + { + "epoch": 22.323491655969192, + "grad_norm": 2.5147271156311035, + "learning_rate": 2.5895592640136928e-05, + "loss": 0.3531, + "step": 17390 + }, + { + "epoch": 22.324775353016687, + "grad_norm": 22.2672061920166, + "learning_rate": 2.5895164741121093e-05, + "loss": 0.3823, + "step": 17391 + }, + { + "epoch": 22.326059050064185, + "grad_norm": 1.7666646242141724, + "learning_rate": 2.5894736842105265e-05, + "loss": 0.4104, + "step": 17392 + }, + { + "epoch": 22.327342747111683, + "grad_norm": 2.2961976528167725, + "learning_rate": 2.589430894308943e-05, + "loss": 0.3828, + "step": 17393 + }, + { + "epoch": 
22.328626444159177, + "grad_norm": 1.0025873184204102, + "learning_rate": 2.5893881044073602e-05, + "loss": 0.3787, + "step": 17394 + }, + { + "epoch": 22.329910141206675, + "grad_norm": 1.7944419384002686, + "learning_rate": 2.5893453145057767e-05, + "loss": 0.4188, + "step": 17395 + }, + { + "epoch": 22.331193838254173, + "grad_norm": 2.5673341751098633, + "learning_rate": 2.5893025246041935e-05, + "loss": 0.3893, + "step": 17396 + }, + { + "epoch": 22.33247753530167, + "grad_norm": 2.2591888904571533, + "learning_rate": 2.5892597347026104e-05, + "loss": 0.4278, + "step": 17397 + }, + { + "epoch": 22.333761232349165, + "grad_norm": 2.4205684661865234, + "learning_rate": 2.589216944801027e-05, + "loss": 0.4234, + "step": 17398 + }, + { + "epoch": 22.335044929396663, + "grad_norm": 1.068161129951477, + "learning_rate": 2.5891741548994437e-05, + "loss": 0.4172, + "step": 17399 + }, + { + "epoch": 22.33632862644416, + "grad_norm": 2.285472869873047, + "learning_rate": 2.5891313649978606e-05, + "loss": 0.42, + "step": 17400 + }, + { + "epoch": 22.337612323491655, + "grad_norm": 1.5208731889724731, + "learning_rate": 2.5890885750962774e-05, + "loss": 0.4357, + "step": 17401 + }, + { + "epoch": 22.338896020539153, + "grad_norm": 2.420619249343872, + "learning_rate": 2.5890457851946942e-05, + "loss": 0.4072, + "step": 17402 + }, + { + "epoch": 22.34017971758665, + "grad_norm": 1.9217147827148438, + "learning_rate": 2.5890029952931107e-05, + "loss": 0.4033, + "step": 17403 + }, + { + "epoch": 22.341463414634145, + "grad_norm": 1.8211442232131958, + "learning_rate": 2.5889602053915276e-05, + "loss": 0.4328, + "step": 17404 + }, + { + "epoch": 22.342747111681643, + "grad_norm": 1.2048026323318481, + "learning_rate": 2.5889174154899444e-05, + "loss": 0.4242, + "step": 17405 + }, + { + "epoch": 22.34403080872914, + "grad_norm": 1.2055695056915283, + "learning_rate": 2.5888746255883613e-05, + "loss": 0.3642, + "step": 17406 + }, + { + "epoch": 22.345314505776635, + 
"grad_norm": 1.5370107889175415, + "learning_rate": 2.5888318356867778e-05, + "loss": 0.392, + "step": 17407 + }, + { + "epoch": 22.346598202824133, + "grad_norm": 1.3962743282318115, + "learning_rate": 2.588789045785195e-05, + "loss": 0.4387, + "step": 17408 + }, + { + "epoch": 22.34788189987163, + "grad_norm": 3.62998104095459, + "learning_rate": 2.5887462558836115e-05, + "loss": 0.4067, + "step": 17409 + }, + { + "epoch": 22.34916559691913, + "grad_norm": 1.3373576402664185, + "learning_rate": 2.5887034659820283e-05, + "loss": 0.3924, + "step": 17410 + }, + { + "epoch": 22.350449293966623, + "grad_norm": 3.7276763916015625, + "learning_rate": 2.588660676080445e-05, + "loss": 0.3937, + "step": 17411 + }, + { + "epoch": 22.35173299101412, + "grad_norm": 4.120609760284424, + "learning_rate": 2.5886178861788616e-05, + "loss": 0.3996, + "step": 17412 + }, + { + "epoch": 22.35301668806162, + "grad_norm": 1.7086724042892456, + "learning_rate": 2.5885750962772788e-05, + "loss": 0.4247, + "step": 17413 + }, + { + "epoch": 22.354300385109113, + "grad_norm": 1.1702837944030762, + "learning_rate": 2.5885323063756953e-05, + "loss": 0.3653, + "step": 17414 + }, + { + "epoch": 22.35558408215661, + "grad_norm": 1.342542052268982, + "learning_rate": 2.588489516474112e-05, + "loss": 0.3698, + "step": 17415 + }, + { + "epoch": 22.35686777920411, + "grad_norm": 1.495123028755188, + "learning_rate": 2.588446726572529e-05, + "loss": 0.4151, + "step": 17416 + }, + { + "epoch": 22.358151476251603, + "grad_norm": 1.9293946027755737, + "learning_rate": 2.5884039366709455e-05, + "loss": 0.4004, + "step": 17417 + }, + { + "epoch": 22.3594351732991, + "grad_norm": 2.227633237838745, + "learning_rate": 2.5883611467693627e-05, + "loss": 0.3866, + "step": 17418 + }, + { + "epoch": 22.3607188703466, + "grad_norm": 1.4240373373031616, + "learning_rate": 2.5883183568677792e-05, + "loss": 0.3801, + "step": 17419 + }, + { + "epoch": 22.362002567394097, + "grad_norm": 1.0732489824295044, + 
"learning_rate": 2.588275566966196e-05, + "loss": 0.41, + "step": 17420 + }, + { + "epoch": 22.36328626444159, + "grad_norm": 6.974308967590332, + "learning_rate": 2.588232777064613e-05, + "loss": 0.4615, + "step": 17421 + }, + { + "epoch": 22.36456996148909, + "grad_norm": 1.458258867263794, + "learning_rate": 2.5881899871630297e-05, + "loss": 0.3965, + "step": 17422 + }, + { + "epoch": 22.365853658536587, + "grad_norm": 1.9613734483718872, + "learning_rate": 2.5881471972614462e-05, + "loss": 0.4403, + "step": 17423 + }, + { + "epoch": 22.36713735558408, + "grad_norm": 1.4058727025985718, + "learning_rate": 2.588104407359863e-05, + "loss": 0.3983, + "step": 17424 + }, + { + "epoch": 22.36842105263158, + "grad_norm": 2.5539777278900146, + "learning_rate": 2.58806161745828e-05, + "loss": 0.4118, + "step": 17425 + }, + { + "epoch": 22.369704749679077, + "grad_norm": 1.2996339797973633, + "learning_rate": 2.5880188275566967e-05, + "loss": 0.4118, + "step": 17426 + }, + { + "epoch": 22.37098844672657, + "grad_norm": 2.0384645462036133, + "learning_rate": 2.5879760376551136e-05, + "loss": 0.4301, + "step": 17427 + }, + { + "epoch": 22.37227214377407, + "grad_norm": 3.3935000896453857, + "learning_rate": 2.58793324775353e-05, + "loss": 0.445, + "step": 17428 + }, + { + "epoch": 22.373555840821567, + "grad_norm": 2.159874200820923, + "learning_rate": 2.5878904578519473e-05, + "loss": 0.4819, + "step": 17429 + }, + { + "epoch": 22.374839537869065, + "grad_norm": 1.440396785736084, + "learning_rate": 2.5878476679503638e-05, + "loss": 0.4278, + "step": 17430 + }, + { + "epoch": 22.37612323491656, + "grad_norm": 2.1376430988311768, + "learning_rate": 2.5878048780487803e-05, + "loss": 0.4808, + "step": 17431 + }, + { + "epoch": 22.377406931964057, + "grad_norm": 2.206753730773926, + "learning_rate": 2.5877620881471974e-05, + "loss": 0.4064, + "step": 17432 + }, + { + "epoch": 22.378690629011555, + "grad_norm": 2.1384458541870117, + "learning_rate": 2.587719298245614e-05, + 
"loss": 0.4404, + "step": 17433 + }, + { + "epoch": 22.37997432605905, + "grad_norm": 2.0545527935028076, + "learning_rate": 2.587676508344031e-05, + "loss": 0.4841, + "step": 17434 + }, + { + "epoch": 22.381258023106547, + "grad_norm": 3.217482328414917, + "learning_rate": 2.5876337184424476e-05, + "loss": 0.4832, + "step": 17435 + }, + { + "epoch": 22.382541720154045, + "grad_norm": 2.7754838466644287, + "learning_rate": 2.5875909285408645e-05, + "loss": 0.5266, + "step": 17436 + }, + { + "epoch": 22.38382541720154, + "grad_norm": 1.6569548845291138, + "learning_rate": 2.5875481386392813e-05, + "loss": 0.5904, + "step": 17437 + }, + { + "epoch": 22.385109114249037, + "grad_norm": 4.484818935394287, + "learning_rate": 2.5875053487376978e-05, + "loss": 0.6046, + "step": 17438 + }, + { + "epoch": 22.386392811296535, + "grad_norm": 2.3652946949005127, + "learning_rate": 2.5874625588361147e-05, + "loss": 0.3311, + "step": 17439 + }, + { + "epoch": 22.387676508344033, + "grad_norm": 1.16342294216156, + "learning_rate": 2.5874197689345315e-05, + "loss": 0.355, + "step": 17440 + }, + { + "epoch": 22.388960205391527, + "grad_norm": 3.2117879390716553, + "learning_rate": 2.5873769790329483e-05, + "loss": 0.3765, + "step": 17441 + }, + { + "epoch": 22.390243902439025, + "grad_norm": 1.1678111553192139, + "learning_rate": 2.5873341891313652e-05, + "loss": 0.3789, + "step": 17442 + }, + { + "epoch": 22.391527599486523, + "grad_norm": 1.4019094705581665, + "learning_rate": 2.587291399229782e-05, + "loss": 0.3975, + "step": 17443 + }, + { + "epoch": 22.392811296534017, + "grad_norm": 1.5730689764022827, + "learning_rate": 2.5872486093281985e-05, + "loss": 0.3899, + "step": 17444 + }, + { + "epoch": 22.394094993581515, + "grad_norm": 1.5714590549468994, + "learning_rate": 2.5872058194266154e-05, + "loss": 0.3851, + "step": 17445 + }, + { + "epoch": 22.395378690629013, + "grad_norm": 1.2219629287719727, + "learning_rate": 2.5871630295250322e-05, + "loss": 0.37, + "step": 17446 + 
}, + { + "epoch": 22.396662387676507, + "grad_norm": 1.5365475416183472, + "learning_rate": 2.5871202396234487e-05, + "loss": 0.3861, + "step": 17447 + }, + { + "epoch": 22.397946084724005, + "grad_norm": 1.0782264471054077, + "learning_rate": 2.587077449721866e-05, + "loss": 0.3943, + "step": 17448 + }, + { + "epoch": 22.399229781771503, + "grad_norm": 1.3238048553466797, + "learning_rate": 2.5870346598202824e-05, + "loss": 0.3929, + "step": 17449 + }, + { + "epoch": 22.400513478818997, + "grad_norm": 2.0973715782165527, + "learning_rate": 2.5869918699186996e-05, + "loss": 0.3955, + "step": 17450 + }, + { + "epoch": 22.401797175866495, + "grad_norm": 1.175572395324707, + "learning_rate": 2.586949080017116e-05, + "loss": 0.3805, + "step": 17451 + }, + { + "epoch": 22.403080872913993, + "grad_norm": 1.4410995244979858, + "learning_rate": 2.5869062901155326e-05, + "loss": 0.406, + "step": 17452 + }, + { + "epoch": 22.40436456996149, + "grad_norm": 4.054279327392578, + "learning_rate": 2.5868635002139498e-05, + "loss": 0.422, + "step": 17453 + }, + { + "epoch": 22.405648267008985, + "grad_norm": 2.1364693641662598, + "learning_rate": 2.5868207103123663e-05, + "loss": 0.3925, + "step": 17454 + }, + { + "epoch": 22.406931964056483, + "grad_norm": 2.3424901962280273, + "learning_rate": 2.586777920410783e-05, + "loss": 0.3921, + "step": 17455 + }, + { + "epoch": 22.40821566110398, + "grad_norm": 1.1434991359710693, + "learning_rate": 2.5867351305092e-05, + "loss": 0.4236, + "step": 17456 + }, + { + "epoch": 22.409499358151475, + "grad_norm": 1.9875528812408447, + "learning_rate": 2.5866923406076168e-05, + "loss": 0.3812, + "step": 17457 + }, + { + "epoch": 22.410783055198973, + "grad_norm": 1.8664987087249756, + "learning_rate": 2.5866495507060336e-05, + "loss": 0.4054, + "step": 17458 + }, + { + "epoch": 22.41206675224647, + "grad_norm": 2.497203826904297, + "learning_rate": 2.58660676080445e-05, + "loss": 0.3549, + "step": 17459 + }, + { + "epoch": 22.413350449293965, + 
"grad_norm": 2.760207414627075, + "learning_rate": 2.586563970902867e-05, + "loss": 0.4413, + "step": 17460 + }, + { + "epoch": 22.414634146341463, + "grad_norm": 0.9814227819442749, + "learning_rate": 2.5865211810012838e-05, + "loss": 0.4629, + "step": 17461 + }, + { + "epoch": 22.41591784338896, + "grad_norm": 1.8175413608551025, + "learning_rate": 2.5864783910997006e-05, + "loss": 0.4077, + "step": 17462 + }, + { + "epoch": 22.41720154043646, + "grad_norm": 1.5833938121795654, + "learning_rate": 2.586435601198117e-05, + "loss": 0.4146, + "step": 17463 + }, + { + "epoch": 22.418485237483953, + "grad_norm": 1.149760365486145, + "learning_rate": 2.586392811296534e-05, + "loss": 0.3922, + "step": 17464 + }, + { + "epoch": 22.41976893453145, + "grad_norm": 1.5030752420425415, + "learning_rate": 2.5863500213949508e-05, + "loss": 0.3839, + "step": 17465 + }, + { + "epoch": 22.42105263157895, + "grad_norm": 1.262481689453125, + "learning_rate": 2.5863072314933677e-05, + "loss": 0.3948, + "step": 17466 + }, + { + "epoch": 22.422336328626443, + "grad_norm": 3.2878546714782715, + "learning_rate": 2.5862644415917845e-05, + "loss": 0.3815, + "step": 17467 + }, + { + "epoch": 22.42362002567394, + "grad_norm": 1.4370036125183105, + "learning_rate": 2.586221651690201e-05, + "loss": 0.3746, + "step": 17468 + }, + { + "epoch": 22.42490372272144, + "grad_norm": 1.7094749212265015, + "learning_rate": 2.5861788617886182e-05, + "loss": 0.419, + "step": 17469 + }, + { + "epoch": 22.426187419768933, + "grad_norm": 1.1857107877731323, + "learning_rate": 2.5861360718870347e-05, + "loss": 0.4047, + "step": 17470 + }, + { + "epoch": 22.42747111681643, + "grad_norm": 2.3555827140808105, + "learning_rate": 2.5860932819854512e-05, + "loss": 0.4203, + "step": 17471 + }, + { + "epoch": 22.42875481386393, + "grad_norm": 1.5550129413604736, + "learning_rate": 2.5860504920838684e-05, + "loss": 0.4239, + "step": 17472 + }, + { + "epoch": 22.430038510911427, + "grad_norm": 2.7072513103485107, + 
"learning_rate": 2.586007702182285e-05, + "loss": 0.4067, + "step": 17473 + }, + { + "epoch": 22.43132220795892, + "grad_norm": 1.2192776203155518, + "learning_rate": 2.585964912280702e-05, + "loss": 0.4196, + "step": 17474 + }, + { + "epoch": 22.43260590500642, + "grad_norm": 1.2705055475234985, + "learning_rate": 2.5859221223791186e-05, + "loss": 0.3751, + "step": 17475 + }, + { + "epoch": 22.433889602053917, + "grad_norm": 8.960994720458984, + "learning_rate": 2.5858793324775354e-05, + "loss": 0.4088, + "step": 17476 + }, + { + "epoch": 22.43517329910141, + "grad_norm": 1.8671315908432007, + "learning_rate": 2.5858365425759522e-05, + "loss": 0.3839, + "step": 17477 + }, + { + "epoch": 22.43645699614891, + "grad_norm": 1.610291838645935, + "learning_rate": 2.5857937526743687e-05, + "loss": 0.4154, + "step": 17478 + }, + { + "epoch": 22.437740693196407, + "grad_norm": 1.9666470289230347, + "learning_rate": 2.5857509627727856e-05, + "loss": 0.4156, + "step": 17479 + }, + { + "epoch": 22.4390243902439, + "grad_norm": 1.9217596054077148, + "learning_rate": 2.5857081728712024e-05, + "loss": 0.4425, + "step": 17480 + }, + { + "epoch": 22.4403080872914, + "grad_norm": 1.5590660572052002, + "learning_rate": 2.5856653829696193e-05, + "loss": 0.4763, + "step": 17481 + }, + { + "epoch": 22.441591784338897, + "grad_norm": 1.2708889245986938, + "learning_rate": 2.585622593068036e-05, + "loss": 0.4382, + "step": 17482 + }, + { + "epoch": 22.44287548138639, + "grad_norm": 1.9706472158432007, + "learning_rate": 2.585579803166453e-05, + "loss": 0.4352, + "step": 17483 + }, + { + "epoch": 22.44415917843389, + "grad_norm": 1.179447054862976, + "learning_rate": 2.5855370132648695e-05, + "loss": 0.4627, + "step": 17484 + }, + { + "epoch": 22.445442875481387, + "grad_norm": 4.528955936431885, + "learning_rate": 2.5854942233632863e-05, + "loss": 0.458, + "step": 17485 + }, + { + "epoch": 22.446726572528885, + "grad_norm": 1.863881230354309, + "learning_rate": 2.585451433461703e-05, + 
"loss": 0.5271, + "step": 17486 + }, + { + "epoch": 22.44801026957638, + "grad_norm": 4.091843128204346, + "learning_rate": 2.5854086435601196e-05, + "loss": 0.4889, + "step": 17487 + }, + { + "epoch": 22.449293966623877, + "grad_norm": 5.6210832595825195, + "learning_rate": 2.5853658536585368e-05, + "loss": 0.6113, + "step": 17488 + }, + { + "epoch": 22.450577663671375, + "grad_norm": 1.0986578464508057, + "learning_rate": 2.5853230637569533e-05, + "loss": 0.3658, + "step": 17489 + }, + { + "epoch": 22.45186136071887, + "grad_norm": 3.7620856761932373, + "learning_rate": 2.5852802738553705e-05, + "loss": 0.3735, + "step": 17490 + }, + { + "epoch": 22.453145057766367, + "grad_norm": 2.477863073348999, + "learning_rate": 2.585237483953787e-05, + "loss": 0.3946, + "step": 17491 + }, + { + "epoch": 22.454428754813865, + "grad_norm": 2.123487949371338, + "learning_rate": 2.5851946940522035e-05, + "loss": 0.3787, + "step": 17492 + }, + { + "epoch": 22.45571245186136, + "grad_norm": 1.6574667692184448, + "learning_rate": 2.5851519041506207e-05, + "loss": 0.3824, + "step": 17493 + }, + { + "epoch": 22.456996148908857, + "grad_norm": 1.1058202981948853, + "learning_rate": 2.5851091142490372e-05, + "loss": 0.3752, + "step": 17494 + }, + { + "epoch": 22.458279845956355, + "grad_norm": 3.677517890930176, + "learning_rate": 2.585066324347454e-05, + "loss": 0.3749, + "step": 17495 + }, + { + "epoch": 22.459563543003853, + "grad_norm": 1.222664475440979, + "learning_rate": 2.585023534445871e-05, + "loss": 0.3756, + "step": 17496 + }, + { + "epoch": 22.460847240051347, + "grad_norm": 0.9862375259399414, + "learning_rate": 2.5849807445442877e-05, + "loss": 0.3808, + "step": 17497 + }, + { + "epoch": 22.462130937098845, + "grad_norm": 2.1698033809661865, + "learning_rate": 2.5849379546427046e-05, + "loss": 0.3957, + "step": 17498 + }, + { + "epoch": 22.463414634146343, + "grad_norm": 1.961195945739746, + "learning_rate": 2.584895164741121e-05, + "loss": 0.3632, + "step": 17499 + }, 
+ { + "epoch": 22.464698331193837, + "grad_norm": 1.4324394464492798, + "learning_rate": 2.584852374839538e-05, + "loss": 0.3916, + "step": 17500 + }, + { + "epoch": 22.465982028241335, + "grad_norm": 1.2288249731063843, + "learning_rate": 2.5848095849379547e-05, + "loss": 0.3927, + "step": 17501 + }, + { + "epoch": 22.467265725288833, + "grad_norm": 1.1311966180801392, + "learning_rate": 2.5847667950363716e-05, + "loss": 0.3743, + "step": 17502 + }, + { + "epoch": 22.468549422336327, + "grad_norm": 1.3691136837005615, + "learning_rate": 2.584724005134788e-05, + "loss": 0.4029, + "step": 17503 + }, + { + "epoch": 22.469833119383825, + "grad_norm": 2.6343250274658203, + "learning_rate": 2.5846812152332053e-05, + "loss": 0.4184, + "step": 17504 + }, + { + "epoch": 22.471116816431323, + "grad_norm": 4.25090217590332, + "learning_rate": 2.5846384253316218e-05, + "loss": 0.3515, + "step": 17505 + }, + { + "epoch": 22.47240051347882, + "grad_norm": 1.4686866998672485, + "learning_rate": 2.5845956354300386e-05, + "loss": 0.406, + "step": 17506 + }, + { + "epoch": 22.473684210526315, + "grad_norm": 1.5638246536254883, + "learning_rate": 2.5845528455284554e-05, + "loss": 0.4055, + "step": 17507 + }, + { + "epoch": 22.474967907573813, + "grad_norm": 1.3190302848815918, + "learning_rate": 2.584510055626872e-05, + "loss": 0.4155, + "step": 17508 + }, + { + "epoch": 22.47625160462131, + "grad_norm": 1.2840547561645508, + "learning_rate": 2.584467265725289e-05, + "loss": 0.3829, + "step": 17509 + }, + { + "epoch": 22.477535301668805, + "grad_norm": 1.0735381841659546, + "learning_rate": 2.5844244758237056e-05, + "loss": 0.4063, + "step": 17510 + }, + { + "epoch": 22.478818998716303, + "grad_norm": 1.6684495210647583, + "learning_rate": 2.5843816859221225e-05, + "loss": 0.3975, + "step": 17511 + }, + { + "epoch": 22.4801026957638, + "grad_norm": 4.429065227508545, + "learning_rate": 2.5843388960205393e-05, + "loss": 0.3857, + "step": 17512 + }, + { + "epoch": 22.481386392811295, 
+ "grad_norm": 0.9525928497314453, + "learning_rate": 2.5842961061189558e-05, + "loss": 0.3882, + "step": 17513 + }, + { + "epoch": 22.482670089858793, + "grad_norm": 1.2711952924728394, + "learning_rate": 2.584253316217373e-05, + "loss": 0.392, + "step": 17514 + }, + { + "epoch": 22.48395378690629, + "grad_norm": 1.7151774168014526, + "learning_rate": 2.5842105263157895e-05, + "loss": 0.4218, + "step": 17515 + }, + { + "epoch": 22.485237483953785, + "grad_norm": 1.6121026277542114, + "learning_rate": 2.5841677364142063e-05, + "loss": 0.3743, + "step": 17516 + }, + { + "epoch": 22.486521181001283, + "grad_norm": 1.6504623889923096, + "learning_rate": 2.5841249465126232e-05, + "loss": 0.3897, + "step": 17517 + }, + { + "epoch": 22.48780487804878, + "grad_norm": 1.087606430053711, + "learning_rate": 2.58408215661104e-05, + "loss": 0.4185, + "step": 17518 + }, + { + "epoch": 22.48908857509628, + "grad_norm": 3.4030139446258545, + "learning_rate": 2.5840393667094565e-05, + "loss": 0.3912, + "step": 17519 + }, + { + "epoch": 22.490372272143773, + "grad_norm": 1.839903473854065, + "learning_rate": 2.5839965768078734e-05, + "loss": 0.397, + "step": 17520 + }, + { + "epoch": 22.49165596919127, + "grad_norm": 1.8617442846298218, + "learning_rate": 2.5839537869062902e-05, + "loss": 0.3898, + "step": 17521 + }, + { + "epoch": 22.49293966623877, + "grad_norm": 1.3959640264511108, + "learning_rate": 2.583910997004707e-05, + "loss": 0.4237, + "step": 17522 + }, + { + "epoch": 22.494223363286263, + "grad_norm": 6.748298645019531, + "learning_rate": 2.583868207103124e-05, + "loss": 0.3963, + "step": 17523 + }, + { + "epoch": 22.49550706033376, + "grad_norm": 2.2724080085754395, + "learning_rate": 2.5838254172015404e-05, + "loss": 0.3874, + "step": 17524 + }, + { + "epoch": 22.49679075738126, + "grad_norm": 1.7721984386444092, + "learning_rate": 2.5837826272999572e-05, + "loss": 0.4075, + "step": 17525 + }, + { + "epoch": 22.498074454428753, + "grad_norm": 1.3415976762771606, + 
"learning_rate": 2.583739837398374e-05, + "loss": 0.4187, + "step": 17526 + }, + { + "epoch": 22.49935815147625, + "grad_norm": 18.08038330078125, + "learning_rate": 2.5836970474967906e-05, + "loss": 0.419, + "step": 17527 + }, + { + "epoch": 22.50064184852375, + "grad_norm": 1.2515075206756592, + "learning_rate": 2.5836542575952078e-05, + "loss": 0.4226, + "step": 17528 + }, + { + "epoch": 22.501925545571247, + "grad_norm": 1.241223931312561, + "learning_rate": 2.5836114676936243e-05, + "loss": 0.4206, + "step": 17529 + }, + { + "epoch": 22.50320924261874, + "grad_norm": 12.672677993774414, + "learning_rate": 2.583568677792041e-05, + "loss": 0.4072, + "step": 17530 + }, + { + "epoch": 22.50449293966624, + "grad_norm": 3.0037269592285156, + "learning_rate": 2.583525887890458e-05, + "loss": 0.4166, + "step": 17531 + }, + { + "epoch": 22.505776636713737, + "grad_norm": 3.702585220336914, + "learning_rate": 2.5834830979888744e-05, + "loss": 0.4075, + "step": 17532 + }, + { + "epoch": 22.50706033376123, + "grad_norm": 2.228893518447876, + "learning_rate": 2.5834403080872916e-05, + "loss": 0.5345, + "step": 17533 + }, + { + "epoch": 22.50834403080873, + "grad_norm": 2.4362576007843018, + "learning_rate": 2.583397518185708e-05, + "loss": 0.4211, + "step": 17534 + }, + { + "epoch": 22.509627727856227, + "grad_norm": 1.2652032375335693, + "learning_rate": 2.583354728284125e-05, + "loss": 0.4311, + "step": 17535 + }, + { + "epoch": 22.51091142490372, + "grad_norm": 3.5569629669189453, + "learning_rate": 2.5833119383825418e-05, + "loss": 0.5312, + "step": 17536 + }, + { + "epoch": 22.51219512195122, + "grad_norm": 1.3836065530776978, + "learning_rate": 2.5832691484809587e-05, + "loss": 0.5482, + "step": 17537 + }, + { + "epoch": 22.513478818998717, + "grad_norm": 4.584446430206299, + "learning_rate": 2.583226358579375e-05, + "loss": 0.6444, + "step": 17538 + }, + { + "epoch": 22.514762516046215, + "grad_norm": 1.2232046127319336, + "learning_rate": 2.583183568677792e-05, + 
"loss": 0.3686, + "step": 17539 + }, + { + "epoch": 22.51604621309371, + "grad_norm": 1.4820177555084229, + "learning_rate": 2.583140778776209e-05, + "loss": 0.3741, + "step": 17540 + }, + { + "epoch": 22.517329910141207, + "grad_norm": 0.964404284954071, + "learning_rate": 2.5830979888746257e-05, + "loss": 0.3455, + "step": 17541 + }, + { + "epoch": 22.518613607188705, + "grad_norm": 1.180165410041809, + "learning_rate": 2.5830551989730425e-05, + "loss": 0.4168, + "step": 17542 + }, + { + "epoch": 22.5198973042362, + "grad_norm": 0.93123859167099, + "learning_rate": 2.583012409071459e-05, + "loss": 0.3783, + "step": 17543 + }, + { + "epoch": 22.521181001283697, + "grad_norm": 1.7329105138778687, + "learning_rate": 2.5829696191698762e-05, + "loss": 0.4024, + "step": 17544 + }, + { + "epoch": 22.522464698331195, + "grad_norm": 2.0060484409332275, + "learning_rate": 2.5829268292682927e-05, + "loss": 0.3713, + "step": 17545 + }, + { + "epoch": 22.52374839537869, + "grad_norm": 1.3422787189483643, + "learning_rate": 2.5828840393667092e-05, + "loss": 0.3565, + "step": 17546 + }, + { + "epoch": 22.525032092426187, + "grad_norm": 1.8000935316085815, + "learning_rate": 2.5828412494651264e-05, + "loss": 0.3804, + "step": 17547 + }, + { + "epoch": 22.526315789473685, + "grad_norm": 1.8970857858657837, + "learning_rate": 2.582798459563543e-05, + "loss": 0.3968, + "step": 17548 + }, + { + "epoch": 22.527599486521183, + "grad_norm": 1.3722844123840332, + "learning_rate": 2.58275566966196e-05, + "loss": 0.4131, + "step": 17549 + }, + { + "epoch": 22.528883183568677, + "grad_norm": 1.1758513450622559, + "learning_rate": 2.5827128797603766e-05, + "loss": 0.4039, + "step": 17550 + }, + { + "epoch": 22.530166880616175, + "grad_norm": 1.5100109577178955, + "learning_rate": 2.5826700898587934e-05, + "loss": 0.3964, + "step": 17551 + }, + { + "epoch": 22.531450577663673, + "grad_norm": 1.1142927408218384, + "learning_rate": 2.5826272999572103e-05, + "loss": 0.3728, + "step": 17552 + }, 
+ { + "epoch": 22.532734274711167, + "grad_norm": 1.2775750160217285, + "learning_rate": 2.5825845100556268e-05, + "loss": 0.408, + "step": 17553 + }, + { + "epoch": 22.534017971758665, + "grad_norm": 1.6111304759979248, + "learning_rate": 2.5825417201540436e-05, + "loss": 0.3801, + "step": 17554 + }, + { + "epoch": 22.535301668806163, + "grad_norm": 3.929716110229492, + "learning_rate": 2.5824989302524604e-05, + "loss": 0.3901, + "step": 17555 + }, + { + "epoch": 22.536585365853657, + "grad_norm": 1.7738616466522217, + "learning_rate": 2.5824561403508773e-05, + "loss": 0.3905, + "step": 17556 + }, + { + "epoch": 22.537869062901155, + "grad_norm": 2.0783231258392334, + "learning_rate": 2.582413350449294e-05, + "loss": 0.3798, + "step": 17557 + }, + { + "epoch": 22.539152759948653, + "grad_norm": 1.3704348802566528, + "learning_rate": 2.582370560547711e-05, + "loss": 0.3621, + "step": 17558 + }, + { + "epoch": 22.540436456996147, + "grad_norm": 17.029796600341797, + "learning_rate": 2.5823277706461275e-05, + "loss": 0.412, + "step": 17559 + }, + { + "epoch": 22.541720154043645, + "grad_norm": 2.0709359645843506, + "learning_rate": 2.5822849807445443e-05, + "loss": 0.3897, + "step": 17560 + }, + { + "epoch": 22.543003851091143, + "grad_norm": 0.9292053580284119, + "learning_rate": 2.582242190842961e-05, + "loss": 0.3818, + "step": 17561 + }, + { + "epoch": 22.54428754813864, + "grad_norm": 1.2649950981140137, + "learning_rate": 2.5821994009413776e-05, + "loss": 0.4119, + "step": 17562 + }, + { + "epoch": 22.545571245186135, + "grad_norm": 1.7982009649276733, + "learning_rate": 2.5821566110397948e-05, + "loss": 0.3741, + "step": 17563 + }, + { + "epoch": 22.546854942233633, + "grad_norm": 2.579489231109619, + "learning_rate": 2.5821138211382113e-05, + "loss": 0.4076, + "step": 17564 + }, + { + "epoch": 22.54813863928113, + "grad_norm": 1.6574294567108154, + "learning_rate": 2.5820710312366285e-05, + "loss": 0.4115, + "step": 17565 + }, + { + "epoch": 
22.549422336328625, + "grad_norm": 1.1010332107543945, + "learning_rate": 2.582028241335045e-05, + "loss": 0.3978, + "step": 17566 + }, + { + "epoch": 22.550706033376123, + "grad_norm": 1.7534778118133545, + "learning_rate": 2.5819854514334615e-05, + "loss": 0.3691, + "step": 17567 + }, + { + "epoch": 22.55198973042362, + "grad_norm": 3.69453763961792, + "learning_rate": 2.5819426615318787e-05, + "loss": 0.362, + "step": 17568 + }, + { + "epoch": 22.553273427471115, + "grad_norm": 1.546724796295166, + "learning_rate": 2.5818998716302952e-05, + "loss": 0.4234, + "step": 17569 + }, + { + "epoch": 22.554557124518613, + "grad_norm": 2.782803535461426, + "learning_rate": 2.581857081728712e-05, + "loss": 0.3924, + "step": 17570 + }, + { + "epoch": 22.55584082156611, + "grad_norm": 1.8834229707717896, + "learning_rate": 2.581814291827129e-05, + "loss": 0.4325, + "step": 17571 + }, + { + "epoch": 22.55712451861361, + "grad_norm": 3.2721760272979736, + "learning_rate": 2.5817715019255457e-05, + "loss": 0.4117, + "step": 17572 + }, + { + "epoch": 22.558408215661103, + "grad_norm": 1.7129806280136108, + "learning_rate": 2.5817287120239626e-05, + "loss": 0.3727, + "step": 17573 + }, + { + "epoch": 22.5596919127086, + "grad_norm": 2.014747381210327, + "learning_rate": 2.581685922122379e-05, + "loss": 0.4119, + "step": 17574 + }, + { + "epoch": 22.5609756097561, + "grad_norm": 1.2421550750732422, + "learning_rate": 2.581643132220796e-05, + "loss": 0.4242, + "step": 17575 + }, + { + "epoch": 22.562259306803593, + "grad_norm": 1.949945092201233, + "learning_rate": 2.5816003423192127e-05, + "loss": 0.4899, + "step": 17576 + }, + { + "epoch": 22.56354300385109, + "grad_norm": 1.166551113128662, + "learning_rate": 2.5815575524176296e-05, + "loss": 0.4516, + "step": 17577 + }, + { + "epoch": 22.56482670089859, + "grad_norm": 1.2515727281570435, + "learning_rate": 2.581514762516046e-05, + "loss": 0.4056, + "step": 17578 + }, + { + "epoch": 22.566110397946083, + "grad_norm": 
1.3154782056808472, + "learning_rate": 2.5814719726144633e-05, + "loss": 0.449, + "step": 17579 + }, + { + "epoch": 22.56739409499358, + "grad_norm": 2.6718451976776123, + "learning_rate": 2.5814291827128798e-05, + "loss": 0.466, + "step": 17580 + }, + { + "epoch": 22.56867779204108, + "grad_norm": 2.4155449867248535, + "learning_rate": 2.5813863928112966e-05, + "loss": 0.4791, + "step": 17581 + }, + { + "epoch": 22.569961489088577, + "grad_norm": 2.538872241973877, + "learning_rate": 2.5813436029097135e-05, + "loss": 0.4452, + "step": 17582 + }, + { + "epoch": 22.57124518613607, + "grad_norm": 2.4673140048980713, + "learning_rate": 2.58130081300813e-05, + "loss": 0.4604, + "step": 17583 + }, + { + "epoch": 22.57252888318357, + "grad_norm": 1.605441689491272, + "learning_rate": 2.581258023106547e-05, + "loss": 0.488, + "step": 17584 + }, + { + "epoch": 22.573812580231067, + "grad_norm": 6.501445770263672, + "learning_rate": 2.5812152332049636e-05, + "loss": 0.4404, + "step": 17585 + }, + { + "epoch": 22.57509627727856, + "grad_norm": 3.1201424598693848, + "learning_rate": 2.5811724433033805e-05, + "loss": 0.4787, + "step": 17586 + }, + { + "epoch": 22.57637997432606, + "grad_norm": 1.700750470161438, + "learning_rate": 2.5811296534017973e-05, + "loss": 0.531, + "step": 17587 + }, + { + "epoch": 22.577663671373557, + "grad_norm": 1.6514381170272827, + "learning_rate": 2.5810868635002138e-05, + "loss": 0.617, + "step": 17588 + }, + { + "epoch": 22.57894736842105, + "grad_norm": 1.2116402387619019, + "learning_rate": 2.581044073598631e-05, + "loss": 0.3904, + "step": 17589 + }, + { + "epoch": 22.58023106546855, + "grad_norm": 1.3420735597610474, + "learning_rate": 2.5810012836970475e-05, + "loss": 0.3847, + "step": 17590 + }, + { + "epoch": 22.581514762516047, + "grad_norm": 2.272696018218994, + "learning_rate": 2.5809584937954643e-05, + "loss": 0.3682, + "step": 17591 + }, + { + "epoch": 22.58279845956354, + "grad_norm": 1.5136958360671997, + "learning_rate": 
2.5809157038938812e-05, + "loss": 0.3956, + "step": 17592 + }, + { + "epoch": 22.58408215661104, + "grad_norm": 1.5548697710037231, + "learning_rate": 2.5808729139922977e-05, + "loss": 0.3749, + "step": 17593 + }, + { + "epoch": 22.585365853658537, + "grad_norm": 1.5193864107131958, + "learning_rate": 2.5808301240907145e-05, + "loss": 0.3689, + "step": 17594 + }, + { + "epoch": 22.586649550706035, + "grad_norm": 1.6155767440795898, + "learning_rate": 2.5807873341891314e-05, + "loss": 0.39, + "step": 17595 + }, + { + "epoch": 22.58793324775353, + "grad_norm": 0.9939159750938416, + "learning_rate": 2.5807445442875482e-05, + "loss": 0.3979, + "step": 17596 + }, + { + "epoch": 22.589216944801027, + "grad_norm": 2.095644235610962, + "learning_rate": 2.580701754385965e-05, + "loss": 0.4283, + "step": 17597 + }, + { + "epoch": 22.590500641848525, + "grad_norm": 3.156449317932129, + "learning_rate": 2.580658964484382e-05, + "loss": 0.3881, + "step": 17598 + }, + { + "epoch": 22.59178433889602, + "grad_norm": 6.576415538787842, + "learning_rate": 2.5806161745827984e-05, + "loss": 0.4252, + "step": 17599 + }, + { + "epoch": 22.593068035943517, + "grad_norm": 1.461185097694397, + "learning_rate": 2.5805733846812152e-05, + "loss": 0.4057, + "step": 17600 + }, + { + "epoch": 22.594351732991015, + "grad_norm": 1.6824183464050293, + "learning_rate": 2.580530594779632e-05, + "loss": 0.4072, + "step": 17601 + }, + { + "epoch": 22.59563543003851, + "grad_norm": 2.2878258228302, + "learning_rate": 2.5804878048780486e-05, + "loss": 0.3831, + "step": 17602 + }, + { + "epoch": 22.596919127086007, + "grad_norm": 1.9506551027297974, + "learning_rate": 2.5804450149764658e-05, + "loss": 0.3938, + "step": 17603 + }, + { + "epoch": 22.598202824133505, + "grad_norm": 1.240295171737671, + "learning_rate": 2.5804022250748823e-05, + "loss": 0.3863, + "step": 17604 + }, + { + "epoch": 22.599486521181003, + "grad_norm": 2.8865432739257812, + "learning_rate": 2.5803594351732994e-05, + "loss": 
0.4076, + "step": 17605 + }, + { + "epoch": 22.600770218228497, + "grad_norm": 1.7625828981399536, + "learning_rate": 2.580316645271716e-05, + "loss": 0.3564, + "step": 17606 + }, + { + "epoch": 22.602053915275995, + "grad_norm": 1.210995078086853, + "learning_rate": 2.5802738553701325e-05, + "loss": 0.3744, + "step": 17607 + }, + { + "epoch": 22.603337612323493, + "grad_norm": 2.657170295715332, + "learning_rate": 2.5802310654685496e-05, + "loss": 0.4024, + "step": 17608 + }, + { + "epoch": 22.604621309370987, + "grad_norm": 2.1674277782440186, + "learning_rate": 2.580188275566966e-05, + "loss": 0.4003, + "step": 17609 + }, + { + "epoch": 22.605905006418485, + "grad_norm": 2.2519471645355225, + "learning_rate": 2.580145485665383e-05, + "loss": 0.3944, + "step": 17610 + }, + { + "epoch": 22.607188703465983, + "grad_norm": 3.764101982116699, + "learning_rate": 2.5801026957637998e-05, + "loss": 0.4401, + "step": 17611 + }, + { + "epoch": 22.608472400513477, + "grad_norm": 2.0346200466156006, + "learning_rate": 2.5800599058622167e-05, + "loss": 0.4478, + "step": 17612 + }, + { + "epoch": 22.609756097560975, + "grad_norm": 1.7573235034942627, + "learning_rate": 2.5800171159606335e-05, + "loss": 0.4145, + "step": 17613 + }, + { + "epoch": 22.611039794608473, + "grad_norm": 1.7325888872146606, + "learning_rate": 2.57997432605905e-05, + "loss": 0.3983, + "step": 17614 + }, + { + "epoch": 22.61232349165597, + "grad_norm": 1.6138927936553955, + "learning_rate": 2.579931536157467e-05, + "loss": 0.4044, + "step": 17615 + }, + { + "epoch": 22.613607188703465, + "grad_norm": 2.166128635406494, + "learning_rate": 2.5798887462558837e-05, + "loss": 0.429, + "step": 17616 + }, + { + "epoch": 22.614890885750963, + "grad_norm": 2.9274580478668213, + "learning_rate": 2.5798459563543005e-05, + "loss": 0.418, + "step": 17617 + }, + { + "epoch": 22.61617458279846, + "grad_norm": 1.4864720106124878, + "learning_rate": 2.579803166452717e-05, + "loss": 0.3964, + "step": 17618 + }, + { + 
"epoch": 22.617458279845955, + "grad_norm": 1.484316349029541, + "learning_rate": 2.5797603765511342e-05, + "loss": 0.4405, + "step": 17619 + }, + { + "epoch": 22.618741976893453, + "grad_norm": 1.0273518562316895, + "learning_rate": 2.5797175866495507e-05, + "loss": 0.3863, + "step": 17620 + }, + { + "epoch": 22.62002567394095, + "grad_norm": 2.3221917152404785, + "learning_rate": 2.5796747967479676e-05, + "loss": 0.4155, + "step": 17621 + }, + { + "epoch": 22.621309370988445, + "grad_norm": 1.7484079599380493, + "learning_rate": 2.5796320068463844e-05, + "loss": 0.445, + "step": 17622 + }, + { + "epoch": 22.622593068035943, + "grad_norm": 1.5920697450637817, + "learning_rate": 2.579589216944801e-05, + "loss": 0.4383, + "step": 17623 + }, + { + "epoch": 22.62387676508344, + "grad_norm": 2.079455852508545, + "learning_rate": 2.579546427043218e-05, + "loss": 0.3909, + "step": 17624 + }, + { + "epoch": 22.625160462130935, + "grad_norm": 3.7521111965179443, + "learning_rate": 2.5795036371416346e-05, + "loss": 0.4182, + "step": 17625 + }, + { + "epoch": 22.626444159178433, + "grad_norm": 2.7506072521209717, + "learning_rate": 2.5794608472400514e-05, + "loss": 0.4346, + "step": 17626 + }, + { + "epoch": 22.62772785622593, + "grad_norm": 5.092485427856445, + "learning_rate": 2.5794180573384683e-05, + "loss": 0.4461, + "step": 17627 + }, + { + "epoch": 22.62901155327343, + "grad_norm": 1.7531694173812866, + "learning_rate": 2.5793752674368848e-05, + "loss": 0.4361, + "step": 17628 + }, + { + "epoch": 22.630295250320923, + "grad_norm": 1.6652299165725708, + "learning_rate": 2.579332477535302e-05, + "loss": 0.403, + "step": 17629 + }, + { + "epoch": 22.63157894736842, + "grad_norm": 5.05418586730957, + "learning_rate": 2.5792896876337184e-05, + "loss": 0.4264, + "step": 17630 + }, + { + "epoch": 22.63286264441592, + "grad_norm": 3.0007941722869873, + "learning_rate": 2.5792468977321353e-05, + "loss": 0.4013, + "step": 17631 + }, + { + "epoch": 22.634146341463413, + 
"grad_norm": 9.284939765930176, + "learning_rate": 2.579204107830552e-05, + "loss": 0.437, + "step": 17632 + }, + { + "epoch": 22.63543003851091, + "grad_norm": 2.760422706604004, + "learning_rate": 2.579161317928969e-05, + "loss": 0.4099, + "step": 17633 + }, + { + "epoch": 22.63671373555841, + "grad_norm": 1.6712816953659058, + "learning_rate": 2.5791185280273855e-05, + "loss": 0.4563, + "step": 17634 + }, + { + "epoch": 22.637997432605903, + "grad_norm": 3.078468084335327, + "learning_rate": 2.5790757381258023e-05, + "loss": 0.4353, + "step": 17635 + }, + { + "epoch": 22.6392811296534, + "grad_norm": 2.4344253540039062, + "learning_rate": 2.579032948224219e-05, + "loss": 0.4821, + "step": 17636 + }, + { + "epoch": 22.6405648267009, + "grad_norm": 2.9833619594573975, + "learning_rate": 2.578990158322636e-05, + "loss": 0.4977, + "step": 17637 + }, + { + "epoch": 22.641848523748397, + "grad_norm": 2.3399300575256348, + "learning_rate": 2.578947368421053e-05, + "loss": 0.598, + "step": 17638 + }, + { + "epoch": 22.64313222079589, + "grad_norm": 0.8046575784683228, + "learning_rate": 2.5789045785194693e-05, + "loss": 0.3453, + "step": 17639 + }, + { + "epoch": 22.64441591784339, + "grad_norm": 1.0066771507263184, + "learning_rate": 2.5788617886178865e-05, + "loss": 0.3542, + "step": 17640 + }, + { + "epoch": 22.645699614890887, + "grad_norm": 1.3316519260406494, + "learning_rate": 2.578818998716303e-05, + "loss": 0.4077, + "step": 17641 + }, + { + "epoch": 22.64698331193838, + "grad_norm": 1.8885785341262817, + "learning_rate": 2.5787762088147195e-05, + "loss": 0.4185, + "step": 17642 + }, + { + "epoch": 22.64826700898588, + "grad_norm": 1.8008971214294434, + "learning_rate": 2.5787334189131367e-05, + "loss": 0.3955, + "step": 17643 + }, + { + "epoch": 22.649550706033377, + "grad_norm": 1.1179035902023315, + "learning_rate": 2.5786906290115532e-05, + "loss": 0.3945, + "step": 17644 + }, + { + "epoch": 22.65083440308087, + "grad_norm": 1.3731329441070557, + 
"learning_rate": 2.5786478391099704e-05, + "loss": 0.427, + "step": 17645 + }, + { + "epoch": 22.65211810012837, + "grad_norm": 1.1953901052474976, + "learning_rate": 2.578605049208387e-05, + "loss": 0.3711, + "step": 17646 + }, + { + "epoch": 22.653401797175867, + "grad_norm": 1.6058763265609741, + "learning_rate": 2.5785622593068037e-05, + "loss": 0.3633, + "step": 17647 + }, + { + "epoch": 22.654685494223365, + "grad_norm": 4.404244899749756, + "learning_rate": 2.5785194694052206e-05, + "loss": 0.3747, + "step": 17648 + }, + { + "epoch": 22.65596919127086, + "grad_norm": 1.8097431659698486, + "learning_rate": 2.578476679503637e-05, + "loss": 0.4087, + "step": 17649 + }, + { + "epoch": 22.657252888318357, + "grad_norm": 1.2579373121261597, + "learning_rate": 2.578433889602054e-05, + "loss": 0.4302, + "step": 17650 + }, + { + "epoch": 22.658536585365855, + "grad_norm": 1.289617896080017, + "learning_rate": 2.5783910997004708e-05, + "loss": 0.3807, + "step": 17651 + }, + { + "epoch": 22.65982028241335, + "grad_norm": 1.8914116621017456, + "learning_rate": 2.5783483097988876e-05, + "loss": 0.3818, + "step": 17652 + }, + { + "epoch": 22.661103979460847, + "grad_norm": 1.2437102794647217, + "learning_rate": 2.5783055198973044e-05, + "loss": 0.4084, + "step": 17653 + }, + { + "epoch": 22.662387676508345, + "grad_norm": 1.5775959491729736, + "learning_rate": 2.578262729995721e-05, + "loss": 0.4019, + "step": 17654 + }, + { + "epoch": 22.66367137355584, + "grad_norm": 1.205419898033142, + "learning_rate": 2.5782199400941378e-05, + "loss": 0.3959, + "step": 17655 + }, + { + "epoch": 22.664955070603337, + "grad_norm": 1.2764424085617065, + "learning_rate": 2.5781771501925546e-05, + "loss": 0.3756, + "step": 17656 + }, + { + "epoch": 22.666238767650835, + "grad_norm": 1.331508994102478, + "learning_rate": 2.5781343602909715e-05, + "loss": 0.3868, + "step": 17657 + }, + { + "epoch": 22.66752246469833, + "grad_norm": 1.6746304035186768, + "learning_rate": 
2.578091570389388e-05, + "loss": 0.4255, + "step": 17658 + }, + { + "epoch": 22.668806161745827, + "grad_norm": 1.250412106513977, + "learning_rate": 2.578048780487805e-05, + "loss": 0.3757, + "step": 17659 + }, + { + "epoch": 22.670089858793325, + "grad_norm": 1.1255543231964111, + "learning_rate": 2.5780059905862216e-05, + "loss": 0.4037, + "step": 17660 + }, + { + "epoch": 22.671373555840823, + "grad_norm": 1.08083975315094, + "learning_rate": 2.5779632006846385e-05, + "loss": 0.4044, + "step": 17661 + }, + { + "epoch": 22.672657252888317, + "grad_norm": 2.002077102661133, + "learning_rate": 2.5779204107830553e-05, + "loss": 0.4281, + "step": 17662 + }, + { + "epoch": 22.673940949935815, + "grad_norm": 1.0004023313522339, + "learning_rate": 2.5778776208814718e-05, + "loss": 0.4028, + "step": 17663 + }, + { + "epoch": 22.675224646983313, + "grad_norm": 1.5805206298828125, + "learning_rate": 2.577834830979889e-05, + "loss": 0.4102, + "step": 17664 + }, + { + "epoch": 22.676508344030808, + "grad_norm": 2.2986700534820557, + "learning_rate": 2.5777920410783055e-05, + "loss": 0.4058, + "step": 17665 + }, + { + "epoch": 22.677792041078305, + "grad_norm": 1.1538923978805542, + "learning_rate": 2.5777492511767224e-05, + "loss": 0.3977, + "step": 17666 + }, + { + "epoch": 22.679075738125803, + "grad_norm": 1.2106690406799316, + "learning_rate": 2.5777064612751392e-05, + "loss": 0.4022, + "step": 17667 + }, + { + "epoch": 22.680359435173298, + "grad_norm": 1.620338797569275, + "learning_rate": 2.5776636713735557e-05, + "loss": 0.4216, + "step": 17668 + }, + { + "epoch": 22.681643132220795, + "grad_norm": 2.4126689434051514, + "learning_rate": 2.577620881471973e-05, + "loss": 0.3964, + "step": 17669 + }, + { + "epoch": 22.682926829268293, + "grad_norm": 1.476279377937317, + "learning_rate": 2.5775780915703894e-05, + "loss": 0.3555, + "step": 17670 + }, + { + "epoch": 22.68421052631579, + "grad_norm": 1.2072677612304688, + "learning_rate": 2.5775353016688062e-05, + "loss": 
0.4236, + "step": 17671 + }, + { + "epoch": 22.685494223363285, + "grad_norm": 1.750498652458191, + "learning_rate": 2.577492511767223e-05, + "loss": 0.4311, + "step": 17672 + }, + { + "epoch": 22.686777920410783, + "grad_norm": 3.3838541507720947, + "learning_rate": 2.57744972186564e-05, + "loss": 0.3916, + "step": 17673 + }, + { + "epoch": 22.68806161745828, + "grad_norm": 2.763418436050415, + "learning_rate": 2.5774069319640564e-05, + "loss": 0.4135, + "step": 17674 + }, + { + "epoch": 22.689345314505776, + "grad_norm": 2.045832633972168, + "learning_rate": 2.5773641420624732e-05, + "loss": 0.4421, + "step": 17675 + }, + { + "epoch": 22.690629011553273, + "grad_norm": 1.9859265089035034, + "learning_rate": 2.57732135216089e-05, + "loss": 0.4598, + "step": 17676 + }, + { + "epoch": 22.69191270860077, + "grad_norm": 3.433507204055786, + "learning_rate": 2.577278562259307e-05, + "loss": 0.4226, + "step": 17677 + }, + { + "epoch": 22.693196405648266, + "grad_norm": 3.6615376472473145, + "learning_rate": 2.5772357723577238e-05, + "loss": 0.3711, + "step": 17678 + }, + { + "epoch": 22.694480102695763, + "grad_norm": 1.495026707649231, + "learning_rate": 2.5771929824561403e-05, + "loss": 0.4314, + "step": 17679 + }, + { + "epoch": 22.69576379974326, + "grad_norm": 2.4990437030792236, + "learning_rate": 2.5771501925545575e-05, + "loss": 0.4185, + "step": 17680 + }, + { + "epoch": 22.69704749679076, + "grad_norm": 1.4157968759536743, + "learning_rate": 2.577107402652974e-05, + "loss": 0.4257, + "step": 17681 + }, + { + "epoch": 22.698331193838253, + "grad_norm": 3.3762567043304443, + "learning_rate": 2.5770646127513905e-05, + "loss": 0.4777, + "step": 17682 + }, + { + "epoch": 22.69961489088575, + "grad_norm": 1.9200688600540161, + "learning_rate": 2.5770218228498076e-05, + "loss": 0.4153, + "step": 17683 + }, + { + "epoch": 22.70089858793325, + "grad_norm": 1.8554664850234985, + "learning_rate": 2.576979032948224e-05, + "loss": 0.4316, + "step": 17684 + }, + { + 
"epoch": 22.702182284980744, + "grad_norm": 1.9488379955291748, + "learning_rate": 2.5769362430466413e-05, + "loss": 0.4644, + "step": 17685 + }, + { + "epoch": 22.70346598202824, + "grad_norm": 2.59053111076355, + "learning_rate": 2.5768934531450578e-05, + "loss": 0.4339, + "step": 17686 + }, + { + "epoch": 22.70474967907574, + "grad_norm": 6.65651798248291, + "learning_rate": 2.5768506632434747e-05, + "loss": 0.4663, + "step": 17687 + }, + { + "epoch": 22.706033376123234, + "grad_norm": 2.5470187664031982, + "learning_rate": 2.5768078733418915e-05, + "loss": 0.5766, + "step": 17688 + }, + { + "epoch": 22.70731707317073, + "grad_norm": 1.5302245616912842, + "learning_rate": 2.576765083440308e-05, + "loss": 0.4047, + "step": 17689 + }, + { + "epoch": 22.70860077021823, + "grad_norm": 1.8261350393295288, + "learning_rate": 2.576722293538725e-05, + "loss": 0.376, + "step": 17690 + }, + { + "epoch": 22.709884467265724, + "grad_norm": 1.1025285720825195, + "learning_rate": 2.5766795036371417e-05, + "loss": 0.3862, + "step": 17691 + }, + { + "epoch": 22.71116816431322, + "grad_norm": 1.54862380027771, + "learning_rate": 2.5766367137355585e-05, + "loss": 0.4034, + "step": 17692 + }, + { + "epoch": 22.71245186136072, + "grad_norm": 1.5781461000442505, + "learning_rate": 2.5765939238339754e-05, + "loss": 0.3908, + "step": 17693 + }, + { + "epoch": 22.713735558408217, + "grad_norm": 1.9611276388168335, + "learning_rate": 2.5765511339323922e-05, + "loss": 0.3755, + "step": 17694 + }, + { + "epoch": 22.71501925545571, + "grad_norm": 1.16141676902771, + "learning_rate": 2.5765083440308087e-05, + "loss": 0.387, + "step": 17695 + }, + { + "epoch": 22.71630295250321, + "grad_norm": 1.6158177852630615, + "learning_rate": 2.5764655541292256e-05, + "loss": 0.421, + "step": 17696 + }, + { + "epoch": 22.717586649550707, + "grad_norm": 1.2103334665298462, + "learning_rate": 2.5764227642276424e-05, + "loss": 0.3555, + "step": 17697 + }, + { + "epoch": 22.7188703465982, + "grad_norm": 
2.9721689224243164, + "learning_rate": 2.576379974326059e-05, + "loss": 0.3988, + "step": 17698 + }, + { + "epoch": 22.7201540436457, + "grad_norm": 2.0299346446990967, + "learning_rate": 2.576337184424476e-05, + "loss": 0.3707, + "step": 17699 + }, + { + "epoch": 22.721437740693197, + "grad_norm": 1.1219141483306885, + "learning_rate": 2.5762943945228926e-05, + "loss": 0.4094, + "step": 17700 + }, + { + "epoch": 22.72272143774069, + "grad_norm": 2.327418565750122, + "learning_rate": 2.5762516046213098e-05, + "loss": 0.3799, + "step": 17701 + }, + { + "epoch": 22.72400513478819, + "grad_norm": 2.393002510070801, + "learning_rate": 2.5762088147197263e-05, + "loss": 0.3818, + "step": 17702 + }, + { + "epoch": 22.725288831835687, + "grad_norm": 3.368156909942627, + "learning_rate": 2.5761660248181428e-05, + "loss": 0.3942, + "step": 17703 + }, + { + "epoch": 22.726572528883185, + "grad_norm": 1.4065724611282349, + "learning_rate": 2.57612323491656e-05, + "loss": 0.4139, + "step": 17704 + }, + { + "epoch": 22.72785622593068, + "grad_norm": 1.2656160593032837, + "learning_rate": 2.5760804450149764e-05, + "loss": 0.4016, + "step": 17705 + }, + { + "epoch": 22.729139922978177, + "grad_norm": 1.3067729473114014, + "learning_rate": 2.5760376551133933e-05, + "loss": 0.4038, + "step": 17706 + }, + { + "epoch": 22.730423620025675, + "grad_norm": 4.460433006286621, + "learning_rate": 2.57599486521181e-05, + "loss": 0.3445, + "step": 17707 + }, + { + "epoch": 22.73170731707317, + "grad_norm": 2.2751264572143555, + "learning_rate": 2.575952075310227e-05, + "loss": 0.3741, + "step": 17708 + }, + { + "epoch": 22.732991014120667, + "grad_norm": 1.42554771900177, + "learning_rate": 2.5759092854086438e-05, + "loss": 0.3811, + "step": 17709 + }, + { + "epoch": 22.734274711168165, + "grad_norm": 1.2428799867630005, + "learning_rate": 2.5758664955070603e-05, + "loss": 0.3825, + "step": 17710 + }, + { + "epoch": 22.73555840821566, + "grad_norm": 1.988344669342041, + "learning_rate": 
2.575823705605477e-05, + "loss": 0.3757, + "step": 17711 + }, + { + "epoch": 22.736842105263158, + "grad_norm": 2.4769084453582764, + "learning_rate": 2.575780915703894e-05, + "loss": 0.3856, + "step": 17712 + }, + { + "epoch": 22.738125802310655, + "grad_norm": 0.9639340043067932, + "learning_rate": 2.575738125802311e-05, + "loss": 0.3991, + "step": 17713 + }, + { + "epoch": 22.739409499358153, + "grad_norm": 1.7238037586212158, + "learning_rate": 2.5756953359007273e-05, + "loss": 0.3985, + "step": 17714 + }, + { + "epoch": 22.740693196405648, + "grad_norm": 1.8444976806640625, + "learning_rate": 2.5756525459991442e-05, + "loss": 0.3748, + "step": 17715 + }, + { + "epoch": 22.741976893453145, + "grad_norm": 2.4346437454223633, + "learning_rate": 2.575609756097561e-05, + "loss": 0.411, + "step": 17716 + }, + { + "epoch": 22.743260590500643, + "grad_norm": 1.57764732837677, + "learning_rate": 2.575566966195978e-05, + "loss": 0.435, + "step": 17717 + }, + { + "epoch": 22.744544287548138, + "grad_norm": 1.1862431764602661, + "learning_rate": 2.5755241762943947e-05, + "loss": 0.3841, + "step": 17718 + }, + { + "epoch": 22.745827984595635, + "grad_norm": 1.0209147930145264, + "learning_rate": 2.5754813863928112e-05, + "loss": 0.3896, + "step": 17719 + }, + { + "epoch": 22.747111681643133, + "grad_norm": 1.9565526247024536, + "learning_rate": 2.5754385964912284e-05, + "loss": 0.4208, + "step": 17720 + }, + { + "epoch": 22.748395378690628, + "grad_norm": 1.2199206352233887, + "learning_rate": 2.575395806589645e-05, + "loss": 0.416, + "step": 17721 + }, + { + "epoch": 22.749679075738126, + "grad_norm": 1.2709976434707642, + "learning_rate": 2.5753530166880614e-05, + "loss": 0.3595, + "step": 17722 + }, + { + "epoch": 22.750962772785623, + "grad_norm": 1.2811522483825684, + "learning_rate": 2.5753102267864786e-05, + "loss": 0.4645, + "step": 17723 + }, + { + "epoch": 22.752246469833118, + "grad_norm": 6.379744052886963, + "learning_rate": 2.575267436884895e-05, + "loss": 
0.3984, + "step": 17724 + }, + { + "epoch": 22.753530166880616, + "grad_norm": 1.6307498216629028, + "learning_rate": 2.5752246469833123e-05, + "loss": 0.4171, + "step": 17725 + }, + { + "epoch": 22.754813863928113, + "grad_norm": 1.6542725563049316, + "learning_rate": 2.5751818570817288e-05, + "loss": 0.4334, + "step": 17726 + }, + { + "epoch": 22.75609756097561, + "grad_norm": 26.292356491088867, + "learning_rate": 2.5751390671801456e-05, + "loss": 0.447, + "step": 17727 + }, + { + "epoch": 22.757381258023106, + "grad_norm": 2.2995975017547607, + "learning_rate": 2.5750962772785624e-05, + "loss": 0.3977, + "step": 17728 + }, + { + "epoch": 22.758664955070603, + "grad_norm": 1.1014147996902466, + "learning_rate": 2.575053487376979e-05, + "loss": 0.4401, + "step": 17729 + }, + { + "epoch": 22.7599486521181, + "grad_norm": 1.9982341527938843, + "learning_rate": 2.5750106974753958e-05, + "loss": 0.4915, + "step": 17730 + }, + { + "epoch": 22.761232349165596, + "grad_norm": 1.7797868251800537, + "learning_rate": 2.5749679075738126e-05, + "loss": 0.43, + "step": 17731 + }, + { + "epoch": 22.762516046213094, + "grad_norm": 6.658722400665283, + "learning_rate": 2.5749251176722295e-05, + "loss": 0.4151, + "step": 17732 + }, + { + "epoch": 22.76379974326059, + "grad_norm": 2.3134868144989014, + "learning_rate": 2.574882327770646e-05, + "loss": 0.4706, + "step": 17733 + }, + { + "epoch": 22.765083440308086, + "grad_norm": 1.5343838930130005, + "learning_rate": 2.574839537869063e-05, + "loss": 0.4464, + "step": 17734 + }, + { + "epoch": 22.766367137355584, + "grad_norm": 1.6358267068862915, + "learning_rate": 2.5747967479674797e-05, + "loss": 0.424, + "step": 17735 + }, + { + "epoch": 22.76765083440308, + "grad_norm": 2.1559436321258545, + "learning_rate": 2.5747539580658965e-05, + "loss": 0.4658, + "step": 17736 + }, + { + "epoch": 22.76893453145058, + "grad_norm": 2.981893301010132, + "learning_rate": 2.5747111681643133e-05, + "loss": 0.4704, + "step": 17737 + }, + { + 
"epoch": 22.770218228498074, + "grad_norm": 3.7330234050750732, + "learning_rate": 2.57466837826273e-05, + "loss": 0.5523, + "step": 17738 + }, + { + "epoch": 22.77150192554557, + "grad_norm": 1.2645518779754639, + "learning_rate": 2.574625588361147e-05, + "loss": 0.3525, + "step": 17739 + }, + { + "epoch": 22.77278562259307, + "grad_norm": 1.1656948328018188, + "learning_rate": 2.5745827984595635e-05, + "loss": 0.4016, + "step": 17740 + }, + { + "epoch": 22.774069319640564, + "grad_norm": 2.197838544845581, + "learning_rate": 2.5745400085579804e-05, + "loss": 0.3938, + "step": 17741 + }, + { + "epoch": 22.77535301668806, + "grad_norm": 1.1295738220214844, + "learning_rate": 2.5744972186563972e-05, + "loss": 0.3872, + "step": 17742 + }, + { + "epoch": 22.77663671373556, + "grad_norm": 1.1867494583129883, + "learning_rate": 2.5744544287548137e-05, + "loss": 0.3634, + "step": 17743 + }, + { + "epoch": 22.777920410783054, + "grad_norm": 1.0832093954086304, + "learning_rate": 2.574411638853231e-05, + "loss": 0.3572, + "step": 17744 + }, + { + "epoch": 22.77920410783055, + "grad_norm": 1.0295345783233643, + "learning_rate": 2.5743688489516474e-05, + "loss": 0.3705, + "step": 17745 + }, + { + "epoch": 22.78048780487805, + "grad_norm": 3.36979341506958, + "learning_rate": 2.5743260590500642e-05, + "loss": 0.3777, + "step": 17746 + }, + { + "epoch": 22.781771501925547, + "grad_norm": 1.2856639623641968, + "learning_rate": 2.574283269148481e-05, + "loss": 0.3806, + "step": 17747 + }, + { + "epoch": 22.78305519897304, + "grad_norm": 1.0875427722930908, + "learning_rate": 2.574240479246898e-05, + "loss": 0.4002, + "step": 17748 + }, + { + "epoch": 22.78433889602054, + "grad_norm": 2.813331365585327, + "learning_rate": 2.5741976893453144e-05, + "loss": 0.4194, + "step": 17749 + }, + { + "epoch": 22.785622593068037, + "grad_norm": 1.2901452779769897, + "learning_rate": 2.5741548994437313e-05, + "loss": 0.3932, + "step": 17750 + }, + { + "epoch": 22.78690629011553, + 
"grad_norm": 4.372256755828857, + "learning_rate": 2.574112109542148e-05, + "loss": 0.3591, + "step": 17751 + }, + { + "epoch": 22.78818998716303, + "grad_norm": 1.2298877239227295, + "learning_rate": 2.574069319640565e-05, + "loss": 0.3902, + "step": 17752 + }, + { + "epoch": 22.789473684210527, + "grad_norm": 1.4955092668533325, + "learning_rate": 2.5740265297389818e-05, + "loss": 0.3808, + "step": 17753 + }, + { + "epoch": 22.79075738125802, + "grad_norm": 1.0500437021255493, + "learning_rate": 2.5739837398373983e-05, + "loss": 0.39, + "step": 17754 + }, + { + "epoch": 22.79204107830552, + "grad_norm": 0.8361338376998901, + "learning_rate": 2.5739409499358155e-05, + "loss": 0.3912, + "step": 17755 + }, + { + "epoch": 22.793324775353017, + "grad_norm": 1.967044711112976, + "learning_rate": 2.573898160034232e-05, + "loss": 0.3891, + "step": 17756 + }, + { + "epoch": 22.794608472400512, + "grad_norm": 1.0761353969573975, + "learning_rate": 2.5738553701326485e-05, + "loss": 0.3592, + "step": 17757 + }, + { + "epoch": 22.79589216944801, + "grad_norm": 1.737713098526001, + "learning_rate": 2.5738125802310656e-05, + "loss": 0.3735, + "step": 17758 + }, + { + "epoch": 22.797175866495508, + "grad_norm": 3.0829434394836426, + "learning_rate": 2.573769790329482e-05, + "loss": 0.4111, + "step": 17759 + }, + { + "epoch": 22.798459563543005, + "grad_norm": 1.0778067111968994, + "learning_rate": 2.5737270004278993e-05, + "loss": 0.4083, + "step": 17760 + }, + { + "epoch": 22.7997432605905, + "grad_norm": 1.8651630878448486, + "learning_rate": 2.5736842105263158e-05, + "loss": 0.4051, + "step": 17761 + }, + { + "epoch": 22.801026957637998, + "grad_norm": 4.17685079574585, + "learning_rate": 2.5736414206247327e-05, + "loss": 0.3985, + "step": 17762 + }, + { + "epoch": 22.802310654685495, + "grad_norm": 1.2175986766815186, + "learning_rate": 2.5735986307231495e-05, + "loss": 0.3992, + "step": 17763 + }, + { + "epoch": 22.80359435173299, + "grad_norm": 1.9405479431152344, + 
"learning_rate": 2.573555840821566e-05, + "loss": 0.4092, + "step": 17764 + }, + { + "epoch": 22.804878048780488, + "grad_norm": 0.8852283954620361, + "learning_rate": 2.573513050919983e-05, + "loss": 0.3961, + "step": 17765 + }, + { + "epoch": 22.806161745827985, + "grad_norm": 3.3574955463409424, + "learning_rate": 2.5734702610183997e-05, + "loss": 0.4143, + "step": 17766 + }, + { + "epoch": 22.80744544287548, + "grad_norm": 1.2429542541503906, + "learning_rate": 2.5734274711168165e-05, + "loss": 0.3749, + "step": 17767 + }, + { + "epoch": 22.808729139922978, + "grad_norm": 1.6395617723464966, + "learning_rate": 2.5733846812152334e-05, + "loss": 0.4037, + "step": 17768 + }, + { + "epoch": 22.810012836970476, + "grad_norm": 2.0969302654266357, + "learning_rate": 2.5733418913136502e-05, + "loss": 0.411, + "step": 17769 + }, + { + "epoch": 22.811296534017973, + "grad_norm": 3.188802719116211, + "learning_rate": 2.5732991014120667e-05, + "loss": 0.3778, + "step": 17770 + }, + { + "epoch": 22.812580231065468, + "grad_norm": 1.8510463237762451, + "learning_rate": 2.5732563115104836e-05, + "loss": 0.4735, + "step": 17771 + }, + { + "epoch": 22.813863928112966, + "grad_norm": 0.9458936452865601, + "learning_rate": 2.5732135216089004e-05, + "loss": 0.3948, + "step": 17772 + }, + { + "epoch": 22.815147625160463, + "grad_norm": 1.4114291667938232, + "learning_rate": 2.573170731707317e-05, + "loss": 0.4146, + "step": 17773 + }, + { + "epoch": 22.816431322207958, + "grad_norm": 6.472373962402344, + "learning_rate": 2.573127941805734e-05, + "loss": 0.421, + "step": 17774 + }, + { + "epoch": 22.817715019255456, + "grad_norm": 2.800733804702759, + "learning_rate": 2.5730851519041506e-05, + "loss": 0.3887, + "step": 17775 + }, + { + "epoch": 22.818998716302954, + "grad_norm": 1.6128228902816772, + "learning_rate": 2.5730423620025674e-05, + "loss": 0.4252, + "step": 17776 + }, + { + "epoch": 22.820282413350448, + "grad_norm": 1.97440767288208, + "learning_rate": 
2.5729995721009843e-05, + "loss": 0.3863, + "step": 17777 + }, + { + "epoch": 22.821566110397946, + "grad_norm": 1.6698472499847412, + "learning_rate": 2.5729567821994008e-05, + "loss": 0.4265, + "step": 17778 + }, + { + "epoch": 22.822849807445444, + "grad_norm": 5.114582538604736, + "learning_rate": 2.572913992297818e-05, + "loss": 0.4221, + "step": 17779 + }, + { + "epoch": 22.82413350449294, + "grad_norm": 3.260531425476074, + "learning_rate": 2.5728712023962345e-05, + "loss": 0.3867, + "step": 17780 + }, + { + "epoch": 22.825417201540436, + "grad_norm": 2.744737386703491, + "learning_rate": 2.5728284124946513e-05, + "loss": 0.4373, + "step": 17781 + }, + { + "epoch": 22.826700898587934, + "grad_norm": 5.236939430236816, + "learning_rate": 2.572785622593068e-05, + "loss": 0.4287, + "step": 17782 + }, + { + "epoch": 22.82798459563543, + "grad_norm": 5.518549919128418, + "learning_rate": 2.5727428326914846e-05, + "loss": 0.4824, + "step": 17783 + }, + { + "epoch": 22.829268292682926, + "grad_norm": 2.701547622680664, + "learning_rate": 2.5727000427899018e-05, + "loss": 0.4557, + "step": 17784 + }, + { + "epoch": 22.830551989730424, + "grad_norm": 3.881345748901367, + "learning_rate": 2.5726572528883183e-05, + "loss": 0.4612, + "step": 17785 + }, + { + "epoch": 22.83183568677792, + "grad_norm": 3.6780447959899902, + "learning_rate": 2.572614462986735e-05, + "loss": 0.5226, + "step": 17786 + }, + { + "epoch": 22.833119383825416, + "grad_norm": 1.518623948097229, + "learning_rate": 2.572571673085152e-05, + "loss": 0.5333, + "step": 17787 + }, + { + "epoch": 22.834403080872914, + "grad_norm": 1.7617411613464355, + "learning_rate": 2.572528883183569e-05, + "loss": 0.5149, + "step": 17788 + }, + { + "epoch": 22.83568677792041, + "grad_norm": 1.1828324794769287, + "learning_rate": 2.5724860932819853e-05, + "loss": 0.3494, + "step": 17789 + }, + { + "epoch": 22.836970474967906, + "grad_norm": 0.9627506732940674, + "learning_rate": 2.5724433033804022e-05, + "loss": 
0.3742, + "step": 17790 + }, + { + "epoch": 22.838254172015404, + "grad_norm": 1.0141819715499878, + "learning_rate": 2.572400513478819e-05, + "loss": 0.4137, + "step": 17791 + }, + { + "epoch": 22.8395378690629, + "grad_norm": 1.7156251668930054, + "learning_rate": 2.572357723577236e-05, + "loss": 0.4046, + "step": 17792 + }, + { + "epoch": 22.8408215661104, + "grad_norm": 1.4940236806869507, + "learning_rate": 2.5723149336756527e-05, + "loss": 0.4193, + "step": 17793 + }, + { + "epoch": 22.842105263157894, + "grad_norm": 2.7820568084716797, + "learning_rate": 2.5722721437740692e-05, + "loss": 0.3673, + "step": 17794 + }, + { + "epoch": 22.84338896020539, + "grad_norm": 1.115744948387146, + "learning_rate": 2.5722293538724864e-05, + "loss": 0.3958, + "step": 17795 + }, + { + "epoch": 22.84467265725289, + "grad_norm": 1.2654671669006348, + "learning_rate": 2.572186563970903e-05, + "loss": 0.3821, + "step": 17796 + }, + { + "epoch": 22.845956354300384, + "grad_norm": 1.5240674018859863, + "learning_rate": 2.5721437740693194e-05, + "loss": 0.4292, + "step": 17797 + }, + { + "epoch": 22.84724005134788, + "grad_norm": 2.8875858783721924, + "learning_rate": 2.5721009841677366e-05, + "loss": 0.4165, + "step": 17798 + }, + { + "epoch": 22.84852374839538, + "grad_norm": 1.4538487195968628, + "learning_rate": 2.572058194266153e-05, + "loss": 0.4063, + "step": 17799 + }, + { + "epoch": 22.849807445442874, + "grad_norm": 1.4619734287261963, + "learning_rate": 2.5720154043645703e-05, + "loss": 0.3877, + "step": 17800 + }, + { + "epoch": 22.85109114249037, + "grad_norm": 1.3323568105697632, + "learning_rate": 2.5719726144629868e-05, + "loss": 0.4087, + "step": 17801 + }, + { + "epoch": 22.85237483953787, + "grad_norm": 1.2168892621994019, + "learning_rate": 2.5719298245614036e-05, + "loss": 0.4038, + "step": 17802 + }, + { + "epoch": 22.853658536585368, + "grad_norm": 1.1717889308929443, + "learning_rate": 2.5718870346598204e-05, + "loss": 0.3801, + "step": 17803 + }, + { + 
"epoch": 22.854942233632862, + "grad_norm": 1.2755109071731567, + "learning_rate": 2.571844244758237e-05, + "loss": 0.3847, + "step": 17804 + }, + { + "epoch": 22.85622593068036, + "grad_norm": 1.4087531566619873, + "learning_rate": 2.5718014548566538e-05, + "loss": 0.3884, + "step": 17805 + }, + { + "epoch": 22.857509627727858, + "grad_norm": 2.360454559326172, + "learning_rate": 2.5717586649550706e-05, + "loss": 0.396, + "step": 17806 + }, + { + "epoch": 22.858793324775352, + "grad_norm": 3.8106131553649902, + "learning_rate": 2.5717158750534875e-05, + "loss": 0.3751, + "step": 17807 + }, + { + "epoch": 22.86007702182285, + "grad_norm": 1.7922029495239258, + "learning_rate": 2.5716730851519043e-05, + "loss": 0.3791, + "step": 17808 + }, + { + "epoch": 22.861360718870348, + "grad_norm": 1.0726866722106934, + "learning_rate": 2.571630295250321e-05, + "loss": 0.3762, + "step": 17809 + }, + { + "epoch": 22.862644415917842, + "grad_norm": 3.874974489212036, + "learning_rate": 2.5715875053487377e-05, + "loss": 0.4034, + "step": 17810 + }, + { + "epoch": 22.86392811296534, + "grad_norm": 1.0993214845657349, + "learning_rate": 2.5715447154471545e-05, + "loss": 0.3612, + "step": 17811 + }, + { + "epoch": 22.865211810012838, + "grad_norm": 1.5043773651123047, + "learning_rate": 2.5715019255455713e-05, + "loss": 0.4054, + "step": 17812 + }, + { + "epoch": 22.866495507060336, + "grad_norm": 1.2800953388214111, + "learning_rate": 2.571459135643988e-05, + "loss": 0.4139, + "step": 17813 + }, + { + "epoch": 22.86777920410783, + "grad_norm": 0.9418022036552429, + "learning_rate": 2.571416345742405e-05, + "loss": 0.3804, + "step": 17814 + }, + { + "epoch": 22.869062901155328, + "grad_norm": 1.1799596548080444, + "learning_rate": 2.5713735558408215e-05, + "loss": 0.4055, + "step": 17815 + }, + { + "epoch": 22.870346598202826, + "grad_norm": 1.1592375040054321, + "learning_rate": 2.5713307659392387e-05, + "loss": 0.3545, + "step": 17816 + }, + { + "epoch": 22.87163029525032, + 
"grad_norm": 1.1831785440444946, + "learning_rate": 2.5712879760376552e-05, + "loss": 0.4276, + "step": 17817 + }, + { + "epoch": 22.872913992297818, + "grad_norm": 1.0096451044082642, + "learning_rate": 2.5712451861360717e-05, + "loss": 0.4161, + "step": 17818 + }, + { + "epoch": 22.874197689345316, + "grad_norm": 2.5684239864349365, + "learning_rate": 2.571202396234489e-05, + "loss": 0.4043, + "step": 17819 + }, + { + "epoch": 22.87548138639281, + "grad_norm": 4.429774761199951, + "learning_rate": 2.5711596063329054e-05, + "loss": 0.4175, + "step": 17820 + }, + { + "epoch": 22.876765083440308, + "grad_norm": 1.739213466644287, + "learning_rate": 2.5711168164313222e-05, + "loss": 0.4075, + "step": 17821 + }, + { + "epoch": 22.878048780487806, + "grad_norm": 1.2251263856887817, + "learning_rate": 2.571074026529739e-05, + "loss": 0.392, + "step": 17822 + }, + { + "epoch": 22.8793324775353, + "grad_norm": 1.633711338043213, + "learning_rate": 2.571031236628156e-05, + "loss": 0.4121, + "step": 17823 + }, + { + "epoch": 22.880616174582798, + "grad_norm": 1.1463452577590942, + "learning_rate": 2.5709884467265728e-05, + "loss": 0.4018, + "step": 17824 + }, + { + "epoch": 22.881899871630296, + "grad_norm": 1.802895426750183, + "learning_rate": 2.5709456568249893e-05, + "loss": 0.4586, + "step": 17825 + }, + { + "epoch": 22.883183568677794, + "grad_norm": 2.4390454292297363, + "learning_rate": 2.570902866923406e-05, + "loss": 0.4023, + "step": 17826 + }, + { + "epoch": 22.884467265725288, + "grad_norm": 1.6482632160186768, + "learning_rate": 2.570860077021823e-05, + "loss": 0.4232, + "step": 17827 + }, + { + "epoch": 22.885750962772786, + "grad_norm": 1.4447474479675293, + "learning_rate": 2.5708172871202398e-05, + "loss": 0.438, + "step": 17828 + }, + { + "epoch": 22.887034659820284, + "grad_norm": 1.7778515815734863, + "learning_rate": 2.5707744972186563e-05, + "loss": 0.4753, + "step": 17829 + }, + { + "epoch": 22.888318356867778, + "grad_norm": 1.819324016571045, + 
"learning_rate": 2.5707317073170735e-05, + "loss": 0.4023, + "step": 17830 + }, + { + "epoch": 22.889602053915276, + "grad_norm": 2.1966969966888428, + "learning_rate": 2.57068891741549e-05, + "loss": 0.4141, + "step": 17831 + }, + { + "epoch": 22.890885750962774, + "grad_norm": 1.5120844841003418, + "learning_rate": 2.5706461275139068e-05, + "loss": 0.398, + "step": 17832 + }, + { + "epoch": 22.892169448010268, + "grad_norm": 4.114664554595947, + "learning_rate": 2.5706033376123236e-05, + "loss": 0.4015, + "step": 17833 + }, + { + "epoch": 22.893453145057766, + "grad_norm": 2.6591014862060547, + "learning_rate": 2.57056054771074e-05, + "loss": 0.4818, + "step": 17834 + }, + { + "epoch": 22.894736842105264, + "grad_norm": 2.0138251781463623, + "learning_rate": 2.5705177578091573e-05, + "loss": 0.429, + "step": 17835 + }, + { + "epoch": 22.89602053915276, + "grad_norm": 1.5957366228103638, + "learning_rate": 2.570474967907574e-05, + "loss": 0.4485, + "step": 17836 + }, + { + "epoch": 22.897304236200256, + "grad_norm": 1.4308010339736938, + "learning_rate": 2.5704321780059907e-05, + "loss": 0.5284, + "step": 17837 + }, + { + "epoch": 22.898587933247754, + "grad_norm": 4.827656269073486, + "learning_rate": 2.5703893881044075e-05, + "loss": 0.5937, + "step": 17838 + }, + { + "epoch": 22.89987163029525, + "grad_norm": 0.986089289188385, + "learning_rate": 2.570346598202824e-05, + "loss": 0.3886, + "step": 17839 + }, + { + "epoch": 22.901155327342746, + "grad_norm": 1.2050050497055054, + "learning_rate": 2.5703038083012412e-05, + "loss": 0.3665, + "step": 17840 + }, + { + "epoch": 22.902439024390244, + "grad_norm": 1.2607686519622803, + "learning_rate": 2.5702610183996577e-05, + "loss": 0.398, + "step": 17841 + }, + { + "epoch": 22.90372272143774, + "grad_norm": 1.0738325119018555, + "learning_rate": 2.5702182284980745e-05, + "loss": 0.4216, + "step": 17842 + }, + { + "epoch": 22.905006418485236, + "grad_norm": 1.881941318511963, + "learning_rate": 
2.5701754385964914e-05, + "loss": 0.3983, + "step": 17843 + }, + { + "epoch": 22.906290115532734, + "grad_norm": 1.0287595987319946, + "learning_rate": 2.570132648694908e-05, + "loss": 0.4371, + "step": 17844 + }, + { + "epoch": 22.90757381258023, + "grad_norm": 1.0812759399414062, + "learning_rate": 2.5700898587933247e-05, + "loss": 0.3831, + "step": 17845 + }, + { + "epoch": 22.90885750962773, + "grad_norm": 1.4196292161941528, + "learning_rate": 2.5700470688917416e-05, + "loss": 0.4236, + "step": 17846 + }, + { + "epoch": 22.910141206675224, + "grad_norm": 1.4424394369125366, + "learning_rate": 2.5700042789901584e-05, + "loss": 0.3918, + "step": 17847 + }, + { + "epoch": 22.911424903722722, + "grad_norm": 1.266913890838623, + "learning_rate": 2.5699614890885753e-05, + "loss": 0.3873, + "step": 17848 + }, + { + "epoch": 22.91270860077022, + "grad_norm": 1.0959045886993408, + "learning_rate": 2.569918699186992e-05, + "loss": 0.4156, + "step": 17849 + }, + { + "epoch": 22.913992297817714, + "grad_norm": 1.3177223205566406, + "learning_rate": 2.5698759092854086e-05, + "loss": 0.3729, + "step": 17850 + }, + { + "epoch": 22.915275994865212, + "grad_norm": 1.3701119422912598, + "learning_rate": 2.5698331193838254e-05, + "loss": 0.3727, + "step": 17851 + }, + { + "epoch": 22.91655969191271, + "grad_norm": 1.3515371084213257, + "learning_rate": 2.5697903294822423e-05, + "loss": 0.3902, + "step": 17852 + }, + { + "epoch": 22.917843388960204, + "grad_norm": 1.132433295249939, + "learning_rate": 2.5697475395806588e-05, + "loss": 0.404, + "step": 17853 + }, + { + "epoch": 22.919127086007702, + "grad_norm": 1.2304315567016602, + "learning_rate": 2.569704749679076e-05, + "loss": 0.4107, + "step": 17854 + }, + { + "epoch": 22.9204107830552, + "grad_norm": 1.0056376457214355, + "learning_rate": 2.5696619597774925e-05, + "loss": 0.4032, + "step": 17855 + }, + { + "epoch": 22.921694480102694, + "grad_norm": 1.2469595670700073, + "learning_rate": 2.5696191698759096e-05, + "loss": 
0.3539, + "step": 17856 + }, + { + "epoch": 22.922978177150192, + "grad_norm": 1.3293567895889282, + "learning_rate": 2.569576379974326e-05, + "loss": 0.3841, + "step": 17857 + }, + { + "epoch": 22.92426187419769, + "grad_norm": 1.2208880186080933, + "learning_rate": 2.5695335900727426e-05, + "loss": 0.4117, + "step": 17858 + }, + { + "epoch": 22.925545571245188, + "grad_norm": 2.1285293102264404, + "learning_rate": 2.5694908001711598e-05, + "loss": 0.3915, + "step": 17859 + }, + { + "epoch": 22.926829268292682, + "grad_norm": 1.2535144090652466, + "learning_rate": 2.5694480102695763e-05, + "loss": 0.4269, + "step": 17860 + }, + { + "epoch": 22.92811296534018, + "grad_norm": 1.248348593711853, + "learning_rate": 2.569405220367993e-05, + "loss": 0.3989, + "step": 17861 + }, + { + "epoch": 22.929396662387678, + "grad_norm": 2.0333924293518066, + "learning_rate": 2.56936243046641e-05, + "loss": 0.3747, + "step": 17862 + }, + { + "epoch": 22.930680359435172, + "grad_norm": 1.9372227191925049, + "learning_rate": 2.569319640564827e-05, + "loss": 0.3848, + "step": 17863 + }, + { + "epoch": 22.93196405648267, + "grad_norm": 3.1629602909088135, + "learning_rate": 2.5692768506632437e-05, + "loss": 0.4184, + "step": 17864 + }, + { + "epoch": 22.933247753530168, + "grad_norm": 0.9537546038627625, + "learning_rate": 2.5692340607616602e-05, + "loss": 0.3955, + "step": 17865 + }, + { + "epoch": 22.934531450577662, + "grad_norm": 1.0054725408554077, + "learning_rate": 2.569191270860077e-05, + "loss": 0.4143, + "step": 17866 + }, + { + "epoch": 22.93581514762516, + "grad_norm": 1.9898109436035156, + "learning_rate": 2.569148480958494e-05, + "loss": 0.405, + "step": 17867 + }, + { + "epoch": 22.937098844672658, + "grad_norm": 1.0846434831619263, + "learning_rate": 2.5691056910569107e-05, + "loss": 0.375, + "step": 17868 + }, + { + "epoch": 22.938382541720156, + "grad_norm": 4.539308071136475, + "learning_rate": 2.5690629011553272e-05, + "loss": 0.4286, + "step": 17869 + }, + { + 
"epoch": 22.93966623876765, + "grad_norm": 1.5072911977767944, + "learning_rate": 2.5690201112537444e-05, + "loss": 0.429, + "step": 17870 + }, + { + "epoch": 22.940949935815148, + "grad_norm": 2.300077199935913, + "learning_rate": 2.568977321352161e-05, + "loss": 0.3888, + "step": 17871 + }, + { + "epoch": 22.942233632862646, + "grad_norm": 5.041231155395508, + "learning_rate": 2.5689345314505777e-05, + "loss": 0.4388, + "step": 17872 + }, + { + "epoch": 22.94351732991014, + "grad_norm": 3.6224093437194824, + "learning_rate": 2.5688917415489946e-05, + "loss": 0.3742, + "step": 17873 + }, + { + "epoch": 22.944801026957638, + "grad_norm": 1.4958516359329224, + "learning_rate": 2.568848951647411e-05, + "loss": 0.4337, + "step": 17874 + }, + { + "epoch": 22.946084724005136, + "grad_norm": 1.4634166955947876, + "learning_rate": 2.5688061617458283e-05, + "loss": 0.3944, + "step": 17875 + }, + { + "epoch": 22.94736842105263, + "grad_norm": 1.1549458503723145, + "learning_rate": 2.5687633718442448e-05, + "loss": 0.4439, + "step": 17876 + }, + { + "epoch": 22.948652118100128, + "grad_norm": 1.6499786376953125, + "learning_rate": 2.5687205819426616e-05, + "loss": 0.4508, + "step": 17877 + }, + { + "epoch": 22.949935815147626, + "grad_norm": 1.2192469835281372, + "learning_rate": 2.5686777920410785e-05, + "loss": 0.39, + "step": 17878 + }, + { + "epoch": 22.951219512195124, + "grad_norm": 3.0103726387023926, + "learning_rate": 2.568635002139495e-05, + "loss": 0.4236, + "step": 17879 + }, + { + "epoch": 22.952503209242618, + "grad_norm": 1.5990537405014038, + "learning_rate": 2.568592212237912e-05, + "loss": 0.449, + "step": 17880 + }, + { + "epoch": 22.953786906290116, + "grad_norm": 2.000910520553589, + "learning_rate": 2.5685494223363286e-05, + "loss": 0.4253, + "step": 17881 + }, + { + "epoch": 22.955070603337614, + "grad_norm": 2.770242691040039, + "learning_rate": 2.5685066324347455e-05, + "loss": 0.5194, + "step": 17882 + }, + { + "epoch": 22.956354300385108, + 
"grad_norm": 2.2466695308685303, + "learning_rate": 2.5684638425331623e-05, + "loss": 0.4466, + "step": 17883 + }, + { + "epoch": 22.957637997432606, + "grad_norm": 1.111432433128357, + "learning_rate": 2.568421052631579e-05, + "loss": 0.4409, + "step": 17884 + }, + { + "epoch": 22.958921694480104, + "grad_norm": 1.4138174057006836, + "learning_rate": 2.5683782627299957e-05, + "loss": 0.4756, + "step": 17885 + }, + { + "epoch": 22.960205391527598, + "grad_norm": 1.486928939819336, + "learning_rate": 2.5683354728284125e-05, + "loss": 0.4502, + "step": 17886 + }, + { + "epoch": 22.961489088575096, + "grad_norm": 3.1246068477630615, + "learning_rate": 2.5682926829268293e-05, + "loss": 0.5287, + "step": 17887 + }, + { + "epoch": 22.962772785622594, + "grad_norm": 2.2144691944122314, + "learning_rate": 2.5682498930252462e-05, + "loss": 0.6222, + "step": 17888 + }, + { + "epoch": 22.964056482670088, + "grad_norm": 0.9827741980552673, + "learning_rate": 2.568207103123663e-05, + "loss": 0.3461, + "step": 17889 + }, + { + "epoch": 22.965340179717586, + "grad_norm": 1.515783667564392, + "learning_rate": 2.5681643132220795e-05, + "loss": 0.3488, + "step": 17890 + }, + { + "epoch": 22.966623876765084, + "grad_norm": 1.3478399515151978, + "learning_rate": 2.5681215233204967e-05, + "loss": 0.3935, + "step": 17891 + }, + { + "epoch": 22.96790757381258, + "grad_norm": 0.9524022340774536, + "learning_rate": 2.5680787334189132e-05, + "loss": 0.3768, + "step": 17892 + }, + { + "epoch": 22.969191270860076, + "grad_norm": 1.2622344493865967, + "learning_rate": 2.5680359435173297e-05, + "loss": 0.3914, + "step": 17893 + }, + { + "epoch": 22.970474967907574, + "grad_norm": 1.3339554071426392, + "learning_rate": 2.567993153615747e-05, + "loss": 0.3478, + "step": 17894 + }, + { + "epoch": 22.971758664955072, + "grad_norm": 1.1612598896026611, + "learning_rate": 2.5679503637141634e-05, + "loss": 0.3812, + "step": 17895 + }, + { + "epoch": 22.973042362002566, + "grad_norm": 
1.0486149787902832, + "learning_rate": 2.5679075738125806e-05, + "loss": 0.4101, + "step": 17896 + }, + { + "epoch": 22.974326059050064, + "grad_norm": 1.1971335411071777, + "learning_rate": 2.567864783910997e-05, + "loss": 0.3869, + "step": 17897 + }, + { + "epoch": 22.975609756097562, + "grad_norm": 3.0567688941955566, + "learning_rate": 2.567821994009414e-05, + "loss": 0.372, + "step": 17898 + }, + { + "epoch": 22.976893453145056, + "grad_norm": 1.211380958557129, + "learning_rate": 2.5677792041078308e-05, + "loss": 0.3841, + "step": 17899 + }, + { + "epoch": 22.978177150192554, + "grad_norm": 1.0401347875595093, + "learning_rate": 2.5677364142062473e-05, + "loss": 0.3873, + "step": 17900 + }, + { + "epoch": 22.979460847240052, + "grad_norm": 2.072298765182495, + "learning_rate": 2.567693624304664e-05, + "loss": 0.3549, + "step": 17901 + }, + { + "epoch": 22.98074454428755, + "grad_norm": 1.176543116569519, + "learning_rate": 2.567650834403081e-05, + "loss": 0.3665, + "step": 17902 + }, + { + "epoch": 22.982028241335044, + "grad_norm": 1.5721279382705688, + "learning_rate": 2.5676080445014978e-05, + "loss": 0.3905, + "step": 17903 + }, + { + "epoch": 22.983311938382542, + "grad_norm": 1.476824164390564, + "learning_rate": 2.5675652545999146e-05, + "loss": 0.4274, + "step": 17904 + }, + { + "epoch": 22.98459563543004, + "grad_norm": 1.1390331983566284, + "learning_rate": 2.567522464698331e-05, + "loss": 0.4071, + "step": 17905 + }, + { + "epoch": 22.985879332477534, + "grad_norm": 2.0770342350006104, + "learning_rate": 2.567479674796748e-05, + "loss": 0.4009, + "step": 17906 + }, + { + "epoch": 22.987163029525032, + "grad_norm": 5.864295959472656, + "learning_rate": 2.5674368848951648e-05, + "loss": 0.4353, + "step": 17907 + }, + { + "epoch": 22.98844672657253, + "grad_norm": 1.3659712076187134, + "learning_rate": 2.5673940949935817e-05, + "loss": 0.3973, + "step": 17908 + }, + { + "epoch": 22.989730423620024, + "grad_norm": 1.1435257196426392, + "learning_rate": 
2.567351305091998e-05, + "loss": 0.4181, + "step": 17909 + }, + { + "epoch": 22.991014120667522, + "grad_norm": 1.6795154809951782, + "learning_rate": 2.5673085151904153e-05, + "loss": 0.3947, + "step": 17910 + }, + { + "epoch": 22.99229781771502, + "grad_norm": 1.412467122077942, + "learning_rate": 2.567265725288832e-05, + "loss": 0.3871, + "step": 17911 + }, + { + "epoch": 22.993581514762518, + "grad_norm": 1.6734130382537842, + "learning_rate": 2.5672229353872487e-05, + "loss": 0.386, + "step": 17912 + }, + { + "epoch": 22.994865211810012, + "grad_norm": 4.614466667175293, + "learning_rate": 2.5671801454856655e-05, + "loss": 0.4399, + "step": 17913 + }, + { + "epoch": 22.99614890885751, + "grad_norm": 1.0940073728561401, + "learning_rate": 2.567137355584082e-05, + "loss": 0.416, + "step": 17914 + }, + { + "epoch": 22.997432605905008, + "grad_norm": 1.6988409757614136, + "learning_rate": 2.5670945656824992e-05, + "loss": 0.4666, + "step": 17915 + }, + { + "epoch": 22.998716302952502, + "grad_norm": 1.2747491598129272, + "learning_rate": 2.5670517757809157e-05, + "loss": 0.5404, + "step": 17916 + }, + { + "epoch": 23.0, + "grad_norm": 6.942187786102295, + "learning_rate": 2.5670089858793325e-05, + "loss": 0.5262, + "step": 17917 + }, + { + "epoch": 23.001283697047498, + "grad_norm": 1.6906988620758057, + "learning_rate": 2.5669661959777494e-05, + "loss": 0.3591, + "step": 17918 + }, + { + "epoch": 23.002567394094992, + "grad_norm": 2.6450002193450928, + "learning_rate": 2.566923406076166e-05, + "loss": 0.3726, + "step": 17919 + }, + { + "epoch": 23.00385109114249, + "grad_norm": 0.9247425198554993, + "learning_rate": 2.566880616174583e-05, + "loss": 0.3619, + "step": 17920 + }, + { + "epoch": 23.005134788189988, + "grad_norm": 1.022874116897583, + "learning_rate": 2.5668378262729996e-05, + "loss": 0.3768, + "step": 17921 + }, + { + "epoch": 23.006418485237482, + "grad_norm": 1.1152801513671875, + "learning_rate": 2.5667950363714164e-05, + "loss": 0.3622, + "step": 
17922 + }, + { + "epoch": 23.00770218228498, + "grad_norm": 2.496755838394165, + "learning_rate": 2.5667522464698333e-05, + "loss": 0.3282, + "step": 17923 + }, + { + "epoch": 23.008985879332478, + "grad_norm": 1.224174976348877, + "learning_rate": 2.56670945656825e-05, + "loss": 0.3599, + "step": 17924 + }, + { + "epoch": 23.010269576379976, + "grad_norm": 1.16372549533844, + "learning_rate": 2.5666666666666666e-05, + "loss": 0.3717, + "step": 17925 + }, + { + "epoch": 23.01155327342747, + "grad_norm": 1.7740577459335327, + "learning_rate": 2.5666238767650834e-05, + "loss": 0.3659, + "step": 17926 + }, + { + "epoch": 23.012836970474968, + "grad_norm": 0.9526218175888062, + "learning_rate": 2.5665810868635003e-05, + "loss": 0.3802, + "step": 17927 + }, + { + "epoch": 23.014120667522466, + "grad_norm": 1.780260682106018, + "learning_rate": 2.566538296961917e-05, + "loss": 0.3681, + "step": 17928 + }, + { + "epoch": 23.01540436456996, + "grad_norm": 0.855936586856842, + "learning_rate": 2.566495507060334e-05, + "loss": 0.3543, + "step": 17929 + }, + { + "epoch": 23.016688061617458, + "grad_norm": 1.3694921731948853, + "learning_rate": 2.5664527171587505e-05, + "loss": 0.3566, + "step": 17930 + }, + { + "epoch": 23.017971758664956, + "grad_norm": 1.4126642942428589, + "learning_rate": 2.5664099272571676e-05, + "loss": 0.3418, + "step": 17931 + }, + { + "epoch": 23.01925545571245, + "grad_norm": 1.2060014009475708, + "learning_rate": 2.566367137355584e-05, + "loss": 0.365, + "step": 17932 + }, + { + "epoch": 23.020539152759948, + "grad_norm": 1.3233623504638672, + "learning_rate": 2.5663243474540007e-05, + "loss": 0.4038, + "step": 17933 + }, + { + "epoch": 23.021822849807446, + "grad_norm": 2.1591107845306396, + "learning_rate": 2.566281557552418e-05, + "loss": 0.3568, + "step": 17934 + }, + { + "epoch": 23.023106546854944, + "grad_norm": 1.3021546602249146, + "learning_rate": 2.5662387676508343e-05, + "loss": 0.3259, + "step": 17935 + }, + { + "epoch": 
23.024390243902438, + "grad_norm": 0.8419588208198547, + "learning_rate": 2.5661959777492512e-05, + "loss": 0.3753, + "step": 17936 + }, + { + "epoch": 23.025673940949936, + "grad_norm": 2.46396803855896, + "learning_rate": 2.566153187847668e-05, + "loss": 0.3378, + "step": 17937 + }, + { + "epoch": 23.026957637997434, + "grad_norm": 1.4117028713226318, + "learning_rate": 2.566110397946085e-05, + "loss": 0.3844, + "step": 17938 + }, + { + "epoch": 23.028241335044928, + "grad_norm": 2.32118821144104, + "learning_rate": 2.5660676080445017e-05, + "loss": 0.3684, + "step": 17939 + }, + { + "epoch": 23.029525032092426, + "grad_norm": 2.386457920074463, + "learning_rate": 2.5660248181429182e-05, + "loss": 0.3388, + "step": 17940 + }, + { + "epoch": 23.030808729139924, + "grad_norm": 0.9913740754127502, + "learning_rate": 2.565982028241335e-05, + "loss": 0.3535, + "step": 17941 + }, + { + "epoch": 23.03209242618742, + "grad_norm": 1.4056683778762817, + "learning_rate": 2.565939238339752e-05, + "loss": 0.3953, + "step": 17942 + }, + { + "epoch": 23.033376123234916, + "grad_norm": 1.9904742240905762, + "learning_rate": 2.5658964484381687e-05, + "loss": 0.3976, + "step": 17943 + }, + { + "epoch": 23.034659820282414, + "grad_norm": 6.135872840881348, + "learning_rate": 2.5658536585365852e-05, + "loss": 0.3953, + "step": 17944 + }, + { + "epoch": 23.035943517329912, + "grad_norm": 1.2600476741790771, + "learning_rate": 2.5658108686350024e-05, + "loss": 0.3631, + "step": 17945 + }, + { + "epoch": 23.037227214377406, + "grad_norm": 1.986806035041809, + "learning_rate": 2.565768078733419e-05, + "loss": 0.417, + "step": 17946 + }, + { + "epoch": 23.038510911424904, + "grad_norm": 1.6051844358444214, + "learning_rate": 2.5657252888318358e-05, + "loss": 0.3899, + "step": 17947 + }, + { + "epoch": 23.039794608472402, + "grad_norm": 1.1528908014297485, + "learning_rate": 2.5656824989302526e-05, + "loss": 0.4061, + "step": 17948 + }, + { + "epoch": 23.041078305519896, + "grad_norm": 
1.8852766752243042, + "learning_rate": 2.565639709028669e-05, + "loss": 0.4244, + "step": 17949 + }, + { + "epoch": 23.042362002567394, + "grad_norm": 2.9728844165802, + "learning_rate": 2.5655969191270863e-05, + "loss": 0.3603, + "step": 17950 + }, + { + "epoch": 23.043645699614892, + "grad_norm": 1.0635533332824707, + "learning_rate": 2.5655541292255028e-05, + "loss": 0.4084, + "step": 17951 + }, + { + "epoch": 23.044929396662386, + "grad_norm": 1.1047258377075195, + "learning_rate": 2.5655113393239196e-05, + "loss": 0.3864, + "step": 17952 + }, + { + "epoch": 23.046213093709884, + "grad_norm": 2.827718734741211, + "learning_rate": 2.5654685494223365e-05, + "loss": 0.3897, + "step": 17953 + }, + { + "epoch": 23.047496790757382, + "grad_norm": 1.5859211683273315, + "learning_rate": 2.565425759520753e-05, + "loss": 0.4088, + "step": 17954 + }, + { + "epoch": 23.048780487804876, + "grad_norm": 3.272641181945801, + "learning_rate": 2.56538296961917e-05, + "loss": 0.3998, + "step": 17955 + }, + { + "epoch": 23.050064184852374, + "grad_norm": 1.7680095434188843, + "learning_rate": 2.5653401797175866e-05, + "loss": 0.4357, + "step": 17956 + }, + { + "epoch": 23.051347881899872, + "grad_norm": 1.5524667501449585, + "learning_rate": 2.5652973898160035e-05, + "loss": 0.3694, + "step": 17957 + }, + { + "epoch": 23.05263157894737, + "grad_norm": 4.217823028564453, + "learning_rate": 2.5652545999144203e-05, + "loss": 0.4273, + "step": 17958 + }, + { + "epoch": 23.053915275994864, + "grad_norm": 2.0961151123046875, + "learning_rate": 2.565211810012837e-05, + "loss": 0.4076, + "step": 17959 + }, + { + "epoch": 23.055198973042362, + "grad_norm": 1.5079487562179565, + "learning_rate": 2.5651690201112537e-05, + "loss": 0.4276, + "step": 17960 + }, + { + "epoch": 23.05648267008986, + "grad_norm": 3.7176177501678467, + "learning_rate": 2.5651262302096705e-05, + "loss": 0.4321, + "step": 17961 + }, + { + "epoch": 23.057766367137354, + "grad_norm": 2.2713139057159424, + 
"learning_rate": 2.5650834403080874e-05, + "loss": 0.409, + "step": 17962 + }, + { + "epoch": 23.059050064184852, + "grad_norm": 1.3918979167938232, + "learning_rate": 2.5650406504065042e-05, + "loss": 0.3975, + "step": 17963 + }, + { + "epoch": 23.06033376123235, + "grad_norm": 3.254943370819092, + "learning_rate": 2.564997860504921e-05, + "loss": 0.4356, + "step": 17964 + }, + { + "epoch": 23.061617458279844, + "grad_norm": 2.124025821685791, + "learning_rate": 2.5649550706033375e-05, + "loss": 0.4864, + "step": 17965 + }, + { + "epoch": 23.062901155327342, + "grad_norm": 3.6403093338012695, + "learning_rate": 2.5649122807017544e-05, + "loss": 0.5271, + "step": 17966 + }, + { + "epoch": 23.06418485237484, + "grad_norm": 1.8057548999786377, + "learning_rate": 2.5648694908001712e-05, + "loss": 0.6304, + "step": 17967 + }, + { + "epoch": 23.065468549422338, + "grad_norm": 1.2412097454071045, + "learning_rate": 2.5648267008985877e-05, + "loss": 0.3789, + "step": 17968 + }, + { + "epoch": 23.066752246469832, + "grad_norm": 0.96799635887146, + "learning_rate": 2.564783910997005e-05, + "loss": 0.3367, + "step": 17969 + }, + { + "epoch": 23.06803594351733, + "grad_norm": 1.5322988033294678, + "learning_rate": 2.5647411210954214e-05, + "loss": 0.3823, + "step": 17970 + }, + { + "epoch": 23.069319640564828, + "grad_norm": 1.3093897104263306, + "learning_rate": 2.5646983311938386e-05, + "loss": 0.3508, + "step": 17971 + }, + { + "epoch": 23.070603337612322, + "grad_norm": 0.9564175009727478, + "learning_rate": 2.564655541292255e-05, + "loss": 0.3781, + "step": 17972 + }, + { + "epoch": 23.07188703465982, + "grad_norm": 0.931047797203064, + "learning_rate": 2.5646127513906716e-05, + "loss": 0.3643, + "step": 17973 + }, + { + "epoch": 23.073170731707318, + "grad_norm": 1.4415054321289062, + "learning_rate": 2.5645699614890888e-05, + "loss": 0.3775, + "step": 17974 + }, + { + "epoch": 23.074454428754812, + "grad_norm": 1.5699812173843384, + "learning_rate": 
2.5645271715875053e-05, + "loss": 0.344, + "step": 17975 + }, + { + "epoch": 23.07573812580231, + "grad_norm": 0.9345686435699463, + "learning_rate": 2.564484381685922e-05, + "loss": 0.3787, + "step": 17976 + }, + { + "epoch": 23.077021822849808, + "grad_norm": 0.970041036605835, + "learning_rate": 2.564441591784339e-05, + "loss": 0.3746, + "step": 17977 + }, + { + "epoch": 23.078305519897306, + "grad_norm": 1.0347176790237427, + "learning_rate": 2.5643988018827558e-05, + "loss": 0.398, + "step": 17978 + }, + { + "epoch": 23.0795892169448, + "grad_norm": 1.1527111530303955, + "learning_rate": 2.5643560119811726e-05, + "loss": 0.4116, + "step": 17979 + }, + { + "epoch": 23.080872913992298, + "grad_norm": 1.7108596563339233, + "learning_rate": 2.564313222079589e-05, + "loss": 0.3606, + "step": 17980 + }, + { + "epoch": 23.082156611039796, + "grad_norm": 1.3105964660644531, + "learning_rate": 2.564270432178006e-05, + "loss": 0.3369, + "step": 17981 + }, + { + "epoch": 23.08344030808729, + "grad_norm": 1.0119348764419556, + "learning_rate": 2.5642276422764228e-05, + "loss": 0.3645, + "step": 17982 + }, + { + "epoch": 23.084724005134788, + "grad_norm": 0.9333193302154541, + "learning_rate": 2.5641848523748397e-05, + "loss": 0.3337, + "step": 17983 + }, + { + "epoch": 23.086007702182286, + "grad_norm": 2.31668758392334, + "learning_rate": 2.564142062473256e-05, + "loss": 0.3755, + "step": 17984 + }, + { + "epoch": 23.08729139922978, + "grad_norm": 1.100932002067566, + "learning_rate": 2.5640992725716733e-05, + "loss": 0.364, + "step": 17985 + }, + { + "epoch": 23.088575096277278, + "grad_norm": 2.685817241668701, + "learning_rate": 2.56405648267009e-05, + "loss": 0.4135, + "step": 17986 + }, + { + "epoch": 23.089858793324776, + "grad_norm": 1.509685754776001, + "learning_rate": 2.5640136927685067e-05, + "loss": 0.373, + "step": 17987 + }, + { + "epoch": 23.09114249037227, + "grad_norm": 1.754544973373413, + "learning_rate": 2.5639709028669235e-05, + "loss": 0.389, + 
"step": 17988 + }, + { + "epoch": 23.09242618741977, + "grad_norm": 2.4242424964904785, + "learning_rate": 2.56392811296534e-05, + "loss": 0.3748, + "step": 17989 + }, + { + "epoch": 23.093709884467266, + "grad_norm": 1.0090645551681519, + "learning_rate": 2.5638853230637572e-05, + "loss": 0.3607, + "step": 17990 + }, + { + "epoch": 23.094993581514764, + "grad_norm": 1.9925107955932617, + "learning_rate": 2.5638425331621737e-05, + "loss": 0.3683, + "step": 17991 + }, + { + "epoch": 23.09627727856226, + "grad_norm": 3.4660093784332275, + "learning_rate": 2.5637997432605906e-05, + "loss": 0.3922, + "step": 17992 + }, + { + "epoch": 23.097560975609756, + "grad_norm": 2.105694532394409, + "learning_rate": 2.5637569533590074e-05, + "loss": 0.4191, + "step": 17993 + }, + { + "epoch": 23.098844672657254, + "grad_norm": 1.300823450088501, + "learning_rate": 2.563714163457424e-05, + "loss": 0.3984, + "step": 17994 + }, + { + "epoch": 23.10012836970475, + "grad_norm": 3.4931087493896484, + "learning_rate": 2.563671373555841e-05, + "loss": 0.3842, + "step": 17995 + }, + { + "epoch": 23.101412066752246, + "grad_norm": 1.504381775856018, + "learning_rate": 2.5636285836542576e-05, + "loss": 0.4149, + "step": 17996 + }, + { + "epoch": 23.102695763799744, + "grad_norm": 1.940946340560913, + "learning_rate": 2.5635857937526744e-05, + "loss": 0.4014, + "step": 17997 + }, + { + "epoch": 23.10397946084724, + "grad_norm": 1.5719581842422485, + "learning_rate": 2.5635430038510913e-05, + "loss": 0.3608, + "step": 17998 + }, + { + "epoch": 23.105263157894736, + "grad_norm": 1.645835041999817, + "learning_rate": 2.563500213949508e-05, + "loss": 0.3809, + "step": 17999 + }, + { + "epoch": 23.106546854942234, + "grad_norm": 0.9126864671707153, + "learning_rate": 2.5634574240479246e-05, + "loss": 0.3795, + "step": 18000 + }, + { + "epoch": 23.106546854942234, + "eval_cer": 0.27278735069512433, + "eval_loss": 0.5082477927207947, + "eval_runtime": 13.6787, + "eval_samples_per_second": 71.864, + 
"eval_steps_per_second": 0.512, + "eval_wer": 0.4825728044026599, + "step": 18000 + }, + { + "epoch": 23.107830551989732, + "grad_norm": 1.7909173965454102, + "learning_rate": 2.5634146341463414e-05, + "loss": 0.3709, + "step": 18001 + }, + { + "epoch": 23.109114249037226, + "grad_norm": 24.70939064025879, + "learning_rate": 2.5633718442447583e-05, + "loss": 0.4192, + "step": 18002 + }, + { + "epoch": 23.110397946084724, + "grad_norm": 1.0158342123031616, + "learning_rate": 2.563329054343175e-05, + "loss": 0.3578, + "step": 18003 + }, + { + "epoch": 23.111681643132222, + "grad_norm": 1.615288496017456, + "learning_rate": 2.563286264441592e-05, + "loss": 0.3909, + "step": 18004 + }, + { + "epoch": 23.112965340179716, + "grad_norm": 1.4129297733306885, + "learning_rate": 2.5632434745400085e-05, + "loss": 0.3889, + "step": 18005 + }, + { + "epoch": 23.114249037227214, + "grad_norm": 1.4729788303375244, + "learning_rate": 2.5632006846384257e-05, + "loss": 0.4222, + "step": 18006 + }, + { + "epoch": 23.115532734274712, + "grad_norm": 2.3520894050598145, + "learning_rate": 2.563157894736842e-05, + "loss": 0.3843, + "step": 18007 + }, + { + "epoch": 23.116816431322206, + "grad_norm": 1.422764778137207, + "learning_rate": 2.5631151048352587e-05, + "loss": 0.4703, + "step": 18008 + }, + { + "epoch": 23.118100128369704, + "grad_norm": 1.9177716970443726, + "learning_rate": 2.563072314933676e-05, + "loss": 0.4263, + "step": 18009 + }, + { + "epoch": 23.119383825417202, + "grad_norm": 11.198360443115234, + "learning_rate": 2.5630295250320923e-05, + "loss": 0.4229, + "step": 18010 + }, + { + "epoch": 23.1206675224647, + "grad_norm": 1.2161468267440796, + "learning_rate": 2.5629867351305095e-05, + "loss": 0.4053, + "step": 18011 + }, + { + "epoch": 23.121951219512194, + "grad_norm": 1.444862961769104, + "learning_rate": 2.562943945228926e-05, + "loss": 0.464, + "step": 18012 + }, + { + "epoch": 23.123234916559692, + "grad_norm": 3.4994430541992188, + "learning_rate": 
2.562901155327343e-05, + "loss": 0.4611, + "step": 18013 + }, + { + "epoch": 23.12451861360719, + "grad_norm": 1.4592013359069824, + "learning_rate": 2.5628583654257597e-05, + "loss": 0.4745, + "step": 18014 + }, + { + "epoch": 23.125802310654684, + "grad_norm": 2.5301599502563477, + "learning_rate": 2.5628155755241762e-05, + "loss": 0.4525, + "step": 18015 + }, + { + "epoch": 23.127086007702182, + "grad_norm": 2.0681586265563965, + "learning_rate": 2.562772785622593e-05, + "loss": 0.4777, + "step": 18016 + }, + { + "epoch": 23.12836970474968, + "grad_norm": 1.7060272693634033, + "learning_rate": 2.56272999572101e-05, + "loss": 0.6557, + "step": 18017 + }, + { + "epoch": 23.129653401797174, + "grad_norm": 1.2135810852050781, + "learning_rate": 2.5626872058194267e-05, + "loss": 0.3695, + "step": 18018 + }, + { + "epoch": 23.130937098844672, + "grad_norm": 1.151623010635376, + "learning_rate": 2.5626444159178436e-05, + "loss": 0.3933, + "step": 18019 + }, + { + "epoch": 23.13222079589217, + "grad_norm": 1.1305516958236694, + "learning_rate": 2.5626016260162604e-05, + "loss": 0.3889, + "step": 18020 + }, + { + "epoch": 23.133504492939664, + "grad_norm": 1.2249776124954224, + "learning_rate": 2.562558836114677e-05, + "loss": 0.3982, + "step": 18021 + }, + { + "epoch": 23.134788189987162, + "grad_norm": 1.3336299657821655, + "learning_rate": 2.5625160462130938e-05, + "loss": 0.3996, + "step": 18022 + }, + { + "epoch": 23.13607188703466, + "grad_norm": 3.2840330600738525, + "learning_rate": 2.5624732563115106e-05, + "loss": 0.3794, + "step": 18023 + }, + { + "epoch": 23.137355584082158, + "grad_norm": 1.5999693870544434, + "learning_rate": 2.562430466409927e-05, + "loss": 0.354, + "step": 18024 + }, + { + "epoch": 23.138639281129652, + "grad_norm": 1.9024251699447632, + "learning_rate": 2.5623876765083443e-05, + "loss": 0.3853, + "step": 18025 + }, + { + "epoch": 23.13992297817715, + "grad_norm": 1.883734941482544, + "learning_rate": 2.5623448866067608e-05, + "loss": 
0.3543, + "step": 18026 + }, + { + "epoch": 23.141206675224648, + "grad_norm": 1.2161840200424194, + "learning_rate": 2.5623020967051776e-05, + "loss": 0.4054, + "step": 18027 + }, + { + "epoch": 23.142490372272142, + "grad_norm": 1.4954771995544434, + "learning_rate": 2.5622593068035945e-05, + "loss": 0.3738, + "step": 18028 + }, + { + "epoch": 23.14377406931964, + "grad_norm": 3.0064618587493896, + "learning_rate": 2.562216516902011e-05, + "loss": 0.3986, + "step": 18029 + }, + { + "epoch": 23.145057766367138, + "grad_norm": 3.6162731647491455, + "learning_rate": 2.562173727000428e-05, + "loss": 0.4023, + "step": 18030 + }, + { + "epoch": 23.146341463414632, + "grad_norm": 3.0199098587036133, + "learning_rate": 2.5621309370988446e-05, + "loss": 0.4261, + "step": 18031 + }, + { + "epoch": 23.14762516046213, + "grad_norm": 1.319322109222412, + "learning_rate": 2.5620881471972615e-05, + "loss": 0.3938, + "step": 18032 + }, + { + "epoch": 23.14890885750963, + "grad_norm": 1.0775179862976074, + "learning_rate": 2.5620453572956783e-05, + "loss": 0.3985, + "step": 18033 + }, + { + "epoch": 23.150192554557126, + "grad_norm": 1.4135888814926147, + "learning_rate": 2.562002567394095e-05, + "loss": 0.3586, + "step": 18034 + }, + { + "epoch": 23.15147625160462, + "grad_norm": 1.1025269031524658, + "learning_rate": 2.561959777492512e-05, + "loss": 0.3764, + "step": 18035 + }, + { + "epoch": 23.15275994865212, + "grad_norm": 1.1742538213729858, + "learning_rate": 2.5619169875909285e-05, + "loss": 0.3741, + "step": 18036 + }, + { + "epoch": 23.154043645699616, + "grad_norm": 1.3127436637878418, + "learning_rate": 2.5618741976893454e-05, + "loss": 0.3979, + "step": 18037 + }, + { + "epoch": 23.15532734274711, + "grad_norm": 1.6305261850357056, + "learning_rate": 2.5618314077877622e-05, + "loss": 0.3678, + "step": 18038 + }, + { + "epoch": 23.15661103979461, + "grad_norm": 1.3291670083999634, + "learning_rate": 2.561788617886179e-05, + "loss": 0.398, + "step": 18039 + }, + { + 
"epoch": 23.157894736842106, + "grad_norm": 1.2552951574325562, + "learning_rate": 2.5617458279845955e-05, + "loss": 0.3721, + "step": 18040 + }, + { + "epoch": 23.1591784338896, + "grad_norm": 1.175512433052063, + "learning_rate": 2.5617030380830124e-05, + "loss": 0.3593, + "step": 18041 + }, + { + "epoch": 23.1604621309371, + "grad_norm": 1.44503653049469, + "learning_rate": 2.5616602481814292e-05, + "loss": 0.3406, + "step": 18042 + }, + { + "epoch": 23.161745827984596, + "grad_norm": 2.7745485305786133, + "learning_rate": 2.561617458279846e-05, + "loss": 0.397, + "step": 18043 + }, + { + "epoch": 23.163029525032094, + "grad_norm": 1.575334072113037, + "learning_rate": 2.561574668378263e-05, + "loss": 0.3691, + "step": 18044 + }, + { + "epoch": 23.16431322207959, + "grad_norm": 1.4292750358581543, + "learning_rate": 2.5615318784766794e-05, + "loss": 0.3801, + "step": 18045 + }, + { + "epoch": 23.165596919127086, + "grad_norm": 1.3626071214675903, + "learning_rate": 2.5614890885750966e-05, + "loss": 0.4039, + "step": 18046 + }, + { + "epoch": 23.166880616174584, + "grad_norm": 1.6378123760223389, + "learning_rate": 2.561446298673513e-05, + "loss": 0.4137, + "step": 18047 + }, + { + "epoch": 23.16816431322208, + "grad_norm": 0.9715896248817444, + "learning_rate": 2.5614035087719296e-05, + "loss": 0.4174, + "step": 18048 + }, + { + "epoch": 23.169448010269576, + "grad_norm": 1.9191471338272095, + "learning_rate": 2.5613607188703468e-05, + "loss": 0.3725, + "step": 18049 + }, + { + "epoch": 23.170731707317074, + "grad_norm": 1.7985618114471436, + "learning_rate": 2.5613179289687633e-05, + "loss": 0.3933, + "step": 18050 + }, + { + "epoch": 23.17201540436457, + "grad_norm": 1.5383895635604858, + "learning_rate": 2.5612751390671805e-05, + "loss": 0.4286, + "step": 18051 + }, + { + "epoch": 23.173299101412066, + "grad_norm": 1.0297636985778809, + "learning_rate": 2.561232349165597e-05, + "loss": 0.3743, + "step": 18052 + }, + { + "epoch": 23.174582798459564, + 
"grad_norm": 0.8849809169769287, + "learning_rate": 2.5611895592640138e-05, + "loss": 0.3504, + "step": 18053 + }, + { + "epoch": 23.17586649550706, + "grad_norm": 1.4778523445129395, + "learning_rate": 2.5611467693624306e-05, + "loss": 0.3851, + "step": 18054 + }, + { + "epoch": 23.177150192554556, + "grad_norm": 1.7315770387649536, + "learning_rate": 2.561103979460847e-05, + "loss": 0.4068, + "step": 18055 + }, + { + "epoch": 23.178433889602054, + "grad_norm": 2.0152294635772705, + "learning_rate": 2.561061189559264e-05, + "loss": 0.4298, + "step": 18056 + }, + { + "epoch": 23.179717586649552, + "grad_norm": 26.768783569335938, + "learning_rate": 2.5610183996576808e-05, + "loss": 0.3882, + "step": 18057 + }, + { + "epoch": 23.181001283697046, + "grad_norm": 1.2624233961105347, + "learning_rate": 2.5609756097560977e-05, + "loss": 0.4331, + "step": 18058 + }, + { + "epoch": 23.182284980744544, + "grad_norm": 1.7817071676254272, + "learning_rate": 2.5609328198545145e-05, + "loss": 0.3888, + "step": 18059 + }, + { + "epoch": 23.183568677792042, + "grad_norm": 2.218322515487671, + "learning_rate": 2.5608900299529313e-05, + "loss": 0.3817, + "step": 18060 + }, + { + "epoch": 23.184852374839537, + "grad_norm": 1.230729579925537, + "learning_rate": 2.560847240051348e-05, + "loss": 0.3545, + "step": 18061 + }, + { + "epoch": 23.186136071887034, + "grad_norm": 1.639028549194336, + "learning_rate": 2.5608044501497647e-05, + "loss": 0.4523, + "step": 18062 + }, + { + "epoch": 23.187419768934532, + "grad_norm": 5.166121006011963, + "learning_rate": 2.5607616602481815e-05, + "loss": 0.4559, + "step": 18063 + }, + { + "epoch": 23.188703465982027, + "grad_norm": 4.20889139175415, + "learning_rate": 2.560718870346598e-05, + "loss": 0.4622, + "step": 18064 + }, + { + "epoch": 23.189987163029524, + "grad_norm": 1.3987866640090942, + "learning_rate": 2.5606760804450152e-05, + "loss": 0.5171, + "step": 18065 + }, + { + "epoch": 23.191270860077022, + "grad_norm": 1.7204346656799316, + 
"learning_rate": 2.5606332905434317e-05, + "loss": 0.4149, + "step": 18066 + }, + { + "epoch": 23.19255455712452, + "grad_norm": 3.387849807739258, + "learning_rate": 2.560590500641849e-05, + "loss": 0.5994, + "step": 18067 + }, + { + "epoch": 23.193838254172015, + "grad_norm": 1.7794971466064453, + "learning_rate": 2.5605477107402654e-05, + "loss": 0.3587, + "step": 18068 + }, + { + "epoch": 23.195121951219512, + "grad_norm": 1.0701755285263062, + "learning_rate": 2.560504920838682e-05, + "loss": 0.3609, + "step": 18069 + }, + { + "epoch": 23.19640564826701, + "grad_norm": 1.1399285793304443, + "learning_rate": 2.560462130937099e-05, + "loss": 0.4147, + "step": 18070 + }, + { + "epoch": 23.197689345314505, + "grad_norm": 1.3023866415023804, + "learning_rate": 2.5604193410355156e-05, + "loss": 0.3779, + "step": 18071 + }, + { + "epoch": 23.198973042362002, + "grad_norm": 1.7855453491210938, + "learning_rate": 2.5603765511339324e-05, + "loss": 0.3907, + "step": 18072 + }, + { + "epoch": 23.2002567394095, + "grad_norm": 1.149484634399414, + "learning_rate": 2.5603337612323493e-05, + "loss": 0.3533, + "step": 18073 + }, + { + "epoch": 23.201540436456995, + "grad_norm": 0.7514308094978333, + "learning_rate": 2.560290971330766e-05, + "loss": 0.3454, + "step": 18074 + }, + { + "epoch": 23.202824133504492, + "grad_norm": 2.288473129272461, + "learning_rate": 2.560248181429183e-05, + "loss": 0.3722, + "step": 18075 + }, + { + "epoch": 23.20410783055199, + "grad_norm": 1.4897037744522095, + "learning_rate": 2.5602053915275995e-05, + "loss": 0.3711, + "step": 18076 + }, + { + "epoch": 23.205391527599488, + "grad_norm": 0.9991649985313416, + "learning_rate": 2.5601626016260163e-05, + "loss": 0.4207, + "step": 18077 + }, + { + "epoch": 23.206675224646983, + "grad_norm": 1.3094348907470703, + "learning_rate": 2.560119811724433e-05, + "loss": 0.3653, + "step": 18078 + }, + { + "epoch": 23.20795892169448, + "grad_norm": 1.0462727546691895, + "learning_rate": 2.56007702182285e-05, 
+ "loss": 0.3888, + "step": 18079 + }, + { + "epoch": 23.20924261874198, + "grad_norm": 1.9777582883834839, + "learning_rate": 2.5600342319212665e-05, + "loss": 0.3315, + "step": 18080 + }, + { + "epoch": 23.210526315789473, + "grad_norm": 1.566048264503479, + "learning_rate": 2.5599914420196837e-05, + "loss": 0.3771, + "step": 18081 + }, + { + "epoch": 23.21181001283697, + "grad_norm": 0.9940524101257324, + "learning_rate": 2.5599486521181e-05, + "loss": 0.364, + "step": 18082 + }, + { + "epoch": 23.21309370988447, + "grad_norm": 1.6575584411621094, + "learning_rate": 2.559905862216517e-05, + "loss": 0.3774, + "step": 18083 + }, + { + "epoch": 23.214377406931963, + "grad_norm": 0.9065134525299072, + "learning_rate": 2.559863072314934e-05, + "loss": 0.3774, + "step": 18084 + }, + { + "epoch": 23.21566110397946, + "grad_norm": 1.0327200889587402, + "learning_rate": 2.5598202824133503e-05, + "loss": 0.3558, + "step": 18085 + }, + { + "epoch": 23.21694480102696, + "grad_norm": 1.3437598943710327, + "learning_rate": 2.5597774925117675e-05, + "loss": 0.331, + "step": 18086 + }, + { + "epoch": 23.218228498074453, + "grad_norm": 11.37165355682373, + "learning_rate": 2.559734702610184e-05, + "loss": 0.3784, + "step": 18087 + }, + { + "epoch": 23.21951219512195, + "grad_norm": 1.4849971532821655, + "learning_rate": 2.559691912708601e-05, + "loss": 0.3794, + "step": 18088 + }, + { + "epoch": 23.22079589216945, + "grad_norm": 2.0380775928497314, + "learning_rate": 2.5596491228070177e-05, + "loss": 0.3926, + "step": 18089 + }, + { + "epoch": 23.222079589216946, + "grad_norm": 2.530501365661621, + "learning_rate": 2.5596063329054342e-05, + "loss": 0.4212, + "step": 18090 + }, + { + "epoch": 23.22336328626444, + "grad_norm": 1.5887606143951416, + "learning_rate": 2.5595635430038514e-05, + "loss": 0.442, + "step": 18091 + }, + { + "epoch": 23.22464698331194, + "grad_norm": 2.5921363830566406, + "learning_rate": 2.559520753102268e-05, + "loss": 0.3745, + "step": 18092 + }, + { + 
"epoch": 23.225930680359436, + "grad_norm": 1.3844422101974487, + "learning_rate": 2.5594779632006847e-05, + "loss": 0.3856, + "step": 18093 + }, + { + "epoch": 23.22721437740693, + "grad_norm": 1.074841022491455, + "learning_rate": 2.5594351732991016e-05, + "loss": 0.3636, + "step": 18094 + }, + { + "epoch": 23.22849807445443, + "grad_norm": 1.6368751525878906, + "learning_rate": 2.559392383397518e-05, + "loss": 0.3509, + "step": 18095 + }, + { + "epoch": 23.229781771501926, + "grad_norm": 1.1047531366348267, + "learning_rate": 2.559349593495935e-05, + "loss": 0.3711, + "step": 18096 + }, + { + "epoch": 23.23106546854942, + "grad_norm": 1.6257668733596802, + "learning_rate": 2.5593068035943518e-05, + "loss": 0.3668, + "step": 18097 + }, + { + "epoch": 23.23234916559692, + "grad_norm": 1.1084264516830444, + "learning_rate": 2.5592640136927686e-05, + "loss": 0.34, + "step": 18098 + }, + { + "epoch": 23.233632862644416, + "grad_norm": 2.124422073364258, + "learning_rate": 2.5592212237911854e-05, + "loss": 0.376, + "step": 18099 + }, + { + "epoch": 23.234916559691914, + "grad_norm": 1.6147841215133667, + "learning_rate": 2.5591784338896023e-05, + "loss": 0.4005, + "step": 18100 + }, + { + "epoch": 23.23620025673941, + "grad_norm": 0.8458216786384583, + "learning_rate": 2.5591356439880188e-05, + "loss": 0.3384, + "step": 18101 + }, + { + "epoch": 23.237483953786906, + "grad_norm": 1.2401819229125977, + "learning_rate": 2.5590928540864356e-05, + "loss": 0.3612, + "step": 18102 + }, + { + "epoch": 23.238767650834404, + "grad_norm": 2.390876293182373, + "learning_rate": 2.5590500641848525e-05, + "loss": 0.3931, + "step": 18103 + }, + { + "epoch": 23.2400513478819, + "grad_norm": 1.3843348026275635, + "learning_rate": 2.559007274283269e-05, + "loss": 0.4261, + "step": 18104 + }, + { + "epoch": 23.241335044929397, + "grad_norm": 1.4593677520751953, + "learning_rate": 2.558964484381686e-05, + "loss": 0.4553, + "step": 18105 + }, + { + "epoch": 23.242618741976894, + 
"grad_norm": 2.166332721710205, + "learning_rate": 2.5589216944801027e-05, + "loss": 0.3681, + "step": 18106 + }, + { + "epoch": 23.24390243902439, + "grad_norm": 1.2429782152175903, + "learning_rate": 2.55887890457852e-05, + "loss": 0.4147, + "step": 18107 + }, + { + "epoch": 23.245186136071887, + "grad_norm": 1.328018307685852, + "learning_rate": 2.5588361146769363e-05, + "loss": 0.3827, + "step": 18108 + }, + { + "epoch": 23.246469833119384, + "grad_norm": 4.440088272094727, + "learning_rate": 2.558793324775353e-05, + "loss": 0.4173, + "step": 18109 + }, + { + "epoch": 23.247753530166882, + "grad_norm": 1.1569331884384155, + "learning_rate": 2.55875053487377e-05, + "loss": 0.4694, + "step": 18110 + }, + { + "epoch": 23.249037227214377, + "grad_norm": 1.7418301105499268, + "learning_rate": 2.5587077449721865e-05, + "loss": 0.4618, + "step": 18111 + }, + { + "epoch": 23.250320924261874, + "grad_norm": 2.52831768989563, + "learning_rate": 2.5586649550706034e-05, + "loss": 0.4443, + "step": 18112 + }, + { + "epoch": 23.251604621309372, + "grad_norm": 1.235944390296936, + "learning_rate": 2.5586221651690202e-05, + "loss": 0.4256, + "step": 18113 + }, + { + "epoch": 23.252888318356867, + "grad_norm": 11.906118392944336, + "learning_rate": 2.558579375267437e-05, + "loss": 0.4171, + "step": 18114 + }, + { + "epoch": 23.254172015404365, + "grad_norm": 1.5897949934005737, + "learning_rate": 2.558536585365854e-05, + "loss": 0.4171, + "step": 18115 + }, + { + "epoch": 23.255455712451862, + "grad_norm": 2.3262317180633545, + "learning_rate": 2.5584937954642704e-05, + "loss": 0.4814, + "step": 18116 + }, + { + "epoch": 23.256739409499357, + "grad_norm": 2.0458333492279053, + "learning_rate": 2.5584510055626872e-05, + "loss": 0.531, + "step": 18117 + }, + { + "epoch": 23.258023106546855, + "grad_norm": 1.4342005252838135, + "learning_rate": 2.558408215661104e-05, + "loss": 0.3524, + "step": 18118 + }, + { + "epoch": 23.259306803594352, + "grad_norm": 1.4010167121887207, + 
"learning_rate": 2.558365425759521e-05, + "loss": 0.3661, + "step": 18119 + }, + { + "epoch": 23.260590500641847, + "grad_norm": 1.4247419834136963, + "learning_rate": 2.5583226358579374e-05, + "loss": 0.3567, + "step": 18120 + }, + { + "epoch": 23.261874197689345, + "grad_norm": 1.1578288078308105, + "learning_rate": 2.5582798459563546e-05, + "loss": 0.3612, + "step": 18121 + }, + { + "epoch": 23.263157894736842, + "grad_norm": 1.253669023513794, + "learning_rate": 2.558237056054771e-05, + "loss": 0.3769, + "step": 18122 + }, + { + "epoch": 23.26444159178434, + "grad_norm": 3.5928313732147217, + "learning_rate": 2.558194266153188e-05, + "loss": 0.3415, + "step": 18123 + }, + { + "epoch": 23.265725288831835, + "grad_norm": 1.7258611917495728, + "learning_rate": 2.5581514762516048e-05, + "loss": 0.4189, + "step": 18124 + }, + { + "epoch": 23.267008985879333, + "grad_norm": 2.9461734294891357, + "learning_rate": 2.5581086863500213e-05, + "loss": 0.3555, + "step": 18125 + }, + { + "epoch": 23.26829268292683, + "grad_norm": 1.4121123552322388, + "learning_rate": 2.5580658964484385e-05, + "loss": 0.3661, + "step": 18126 + }, + { + "epoch": 23.269576379974325, + "grad_norm": 2.1473472118377686, + "learning_rate": 2.558023106546855e-05, + "loss": 0.3925, + "step": 18127 + }, + { + "epoch": 23.270860077021823, + "grad_norm": 1.9997180700302124, + "learning_rate": 2.5579803166452718e-05, + "loss": 0.3904, + "step": 18128 + }, + { + "epoch": 23.27214377406932, + "grad_norm": 1.0406765937805176, + "learning_rate": 2.5579375267436886e-05, + "loss": 0.408, + "step": 18129 + }, + { + "epoch": 23.273427471116815, + "grad_norm": 1.4178292751312256, + "learning_rate": 2.557894736842105e-05, + "loss": 0.3748, + "step": 18130 + }, + { + "epoch": 23.274711168164313, + "grad_norm": 0.8917591571807861, + "learning_rate": 2.5578519469405223e-05, + "loss": 0.3431, + "step": 18131 + }, + { + "epoch": 23.27599486521181, + "grad_norm": 2.3190035820007324, + "learning_rate": 
2.557809157038939e-05, + "loss": 0.4087, + "step": 18132 + }, + { + "epoch": 23.27727856225931, + "grad_norm": 2.145378351211548, + "learning_rate": 2.5577663671373557e-05, + "loss": 0.4364, + "step": 18133 + }, + { + "epoch": 23.278562259306803, + "grad_norm": 1.106027603149414, + "learning_rate": 2.5577235772357725e-05, + "loss": 0.4426, + "step": 18134 + }, + { + "epoch": 23.2798459563543, + "grad_norm": 1.0609265565872192, + "learning_rate": 2.5576807873341894e-05, + "loss": 0.3823, + "step": 18135 + }, + { + "epoch": 23.2811296534018, + "grad_norm": 1.2494304180145264, + "learning_rate": 2.557637997432606e-05, + "loss": 0.4089, + "step": 18136 + }, + { + "epoch": 23.282413350449293, + "grad_norm": 1.0883885622024536, + "learning_rate": 2.5575952075310227e-05, + "loss": 0.3652, + "step": 18137 + }, + { + "epoch": 23.28369704749679, + "grad_norm": 1.727108120918274, + "learning_rate": 2.5575524176294395e-05, + "loss": 0.3762, + "step": 18138 + }, + { + "epoch": 23.28498074454429, + "grad_norm": 1.4871103763580322, + "learning_rate": 2.557509627727856e-05, + "loss": 0.3869, + "step": 18139 + }, + { + "epoch": 23.286264441591783, + "grad_norm": 3.219752550125122, + "learning_rate": 2.5574668378262732e-05, + "loss": 0.3801, + "step": 18140 + }, + { + "epoch": 23.28754813863928, + "grad_norm": 2.1696395874023438, + "learning_rate": 2.5574240479246897e-05, + "loss": 0.3959, + "step": 18141 + }, + { + "epoch": 23.28883183568678, + "grad_norm": 1.6088975667953491, + "learning_rate": 2.557381258023107e-05, + "loss": 0.3916, + "step": 18142 + }, + { + "epoch": 23.290115532734276, + "grad_norm": 1.315605878829956, + "learning_rate": 2.5573384681215234e-05, + "loss": 0.3865, + "step": 18143 + }, + { + "epoch": 23.29139922978177, + "grad_norm": 2.446748971939087, + "learning_rate": 2.55729567821994e-05, + "loss": 0.392, + "step": 18144 + }, + { + "epoch": 23.29268292682927, + "grad_norm": 1.2087087631225586, + "learning_rate": 2.557252888318357e-05, + "loss": 0.3691, + 
"step": 18145 + }, + { + "epoch": 23.293966623876766, + "grad_norm": 1.460680603981018, + "learning_rate": 2.5572100984167736e-05, + "loss": 0.3649, + "step": 18146 + }, + { + "epoch": 23.29525032092426, + "grad_norm": 1.1225526332855225, + "learning_rate": 2.5571673085151904e-05, + "loss": 0.4067, + "step": 18147 + }, + { + "epoch": 23.29653401797176, + "grad_norm": 0.9925175905227661, + "learning_rate": 2.5571245186136073e-05, + "loss": 0.3913, + "step": 18148 + }, + { + "epoch": 23.297817715019256, + "grad_norm": 1.1524406671524048, + "learning_rate": 2.557081728712024e-05, + "loss": 0.3835, + "step": 18149 + }, + { + "epoch": 23.29910141206675, + "grad_norm": 1.1514453887939453, + "learning_rate": 2.557038938810441e-05, + "loss": 0.4057, + "step": 18150 + }, + { + "epoch": 23.30038510911425, + "grad_norm": 1.9409680366516113, + "learning_rate": 2.5569961489088575e-05, + "loss": 0.3604, + "step": 18151 + }, + { + "epoch": 23.301668806161747, + "grad_norm": 1.0117146968841553, + "learning_rate": 2.5569533590072743e-05, + "loss": 0.4022, + "step": 18152 + }, + { + "epoch": 23.30295250320924, + "grad_norm": 1.229163646697998, + "learning_rate": 2.556910569105691e-05, + "loss": 0.3853, + "step": 18153 + }, + { + "epoch": 23.30423620025674, + "grad_norm": 1.1874074935913086, + "learning_rate": 2.556867779204108e-05, + "loss": 0.4351, + "step": 18154 + }, + { + "epoch": 23.305519897304237, + "grad_norm": 0.9623952507972717, + "learning_rate": 2.5568249893025245e-05, + "loss": 0.4081, + "step": 18155 + }, + { + "epoch": 23.306803594351734, + "grad_norm": 3.2642691135406494, + "learning_rate": 2.5567821994009413e-05, + "loss": 0.3632, + "step": 18156 + }, + { + "epoch": 23.30808729139923, + "grad_norm": 0.967070996761322, + "learning_rate": 2.556739409499358e-05, + "loss": 0.3666, + "step": 18157 + }, + { + "epoch": 23.309370988446727, + "grad_norm": 0.9742642045021057, + "learning_rate": 2.556696619597775e-05, + "loss": 0.4233, + "step": 18158 + }, + { + "epoch": 
23.310654685494224, + "grad_norm": 1.1454335451126099, + "learning_rate": 2.556653829696192e-05, + "loss": 0.4651, + "step": 18159 + }, + { + "epoch": 23.31193838254172, + "grad_norm": 1.6880347728729248, + "learning_rate": 2.5566110397946084e-05, + "loss": 0.3891, + "step": 18160 + }, + { + "epoch": 23.313222079589217, + "grad_norm": 1.2310254573822021, + "learning_rate": 2.5565682498930255e-05, + "loss": 0.3801, + "step": 18161 + }, + { + "epoch": 23.314505776636715, + "grad_norm": 3.5193402767181396, + "learning_rate": 2.556525459991442e-05, + "loss": 0.3823, + "step": 18162 + }, + { + "epoch": 23.31578947368421, + "grad_norm": 2.947816848754883, + "learning_rate": 2.5564826700898585e-05, + "loss": 0.4092, + "step": 18163 + }, + { + "epoch": 23.317073170731707, + "grad_norm": 5.116605281829834, + "learning_rate": 2.5564398801882757e-05, + "loss": 0.4359, + "step": 18164 + }, + { + "epoch": 23.318356867779205, + "grad_norm": 1.1524507999420166, + "learning_rate": 2.5563970902866922e-05, + "loss": 0.4531, + "step": 18165 + }, + { + "epoch": 23.319640564826702, + "grad_norm": 2.05309796333313, + "learning_rate": 2.5563543003851094e-05, + "loss": 0.5106, + "step": 18166 + }, + { + "epoch": 23.320924261874197, + "grad_norm": 4.450675964355469, + "learning_rate": 2.556311510483526e-05, + "loss": 0.5873, + "step": 18167 + }, + { + "epoch": 23.322207958921695, + "grad_norm": 1.426076054573059, + "learning_rate": 2.5562687205819427e-05, + "loss": 0.3441, + "step": 18168 + }, + { + "epoch": 23.323491655969192, + "grad_norm": 2.6797473430633545, + "learning_rate": 2.5562259306803596e-05, + "loss": 0.3499, + "step": 18169 + }, + { + "epoch": 23.324775353016687, + "grad_norm": 1.2172771692276, + "learning_rate": 2.556183140778776e-05, + "loss": 0.3912, + "step": 18170 + }, + { + "epoch": 23.326059050064185, + "grad_norm": 1.9626293182373047, + "learning_rate": 2.556140350877193e-05, + "loss": 0.3712, + "step": 18171 + }, + { + "epoch": 23.327342747111683, + "grad_norm": 
1.183100700378418, + "learning_rate": 2.5560975609756098e-05, + "loss": 0.3752, + "step": 18172 + }, + { + "epoch": 23.328626444159177, + "grad_norm": 1.55950927734375, + "learning_rate": 2.5560547710740266e-05, + "loss": 0.3779, + "step": 18173 + }, + { + "epoch": 23.329910141206675, + "grad_norm": 0.9912763833999634, + "learning_rate": 2.5560119811724435e-05, + "loss": 0.367, + "step": 18174 + }, + { + "epoch": 23.331193838254173, + "grad_norm": 1.6519221067428589, + "learning_rate": 2.5559691912708603e-05, + "loss": 0.3923, + "step": 18175 + }, + { + "epoch": 23.33247753530167, + "grad_norm": 0.9220240116119385, + "learning_rate": 2.5559264013692768e-05, + "loss": 0.3601, + "step": 18176 + }, + { + "epoch": 23.333761232349165, + "grad_norm": 1.7220829725265503, + "learning_rate": 2.5558836114676936e-05, + "loss": 0.4072, + "step": 18177 + }, + { + "epoch": 23.335044929396663, + "grad_norm": 2.7793021202087402, + "learning_rate": 2.5558408215661105e-05, + "loss": 0.3663, + "step": 18178 + }, + { + "epoch": 23.33632862644416, + "grad_norm": 1.4954184293746948, + "learning_rate": 2.555798031664527e-05, + "loss": 0.3708, + "step": 18179 + }, + { + "epoch": 23.337612323491655, + "grad_norm": 1.2347919940948486, + "learning_rate": 2.555755241762944e-05, + "loss": 0.3885, + "step": 18180 + }, + { + "epoch": 23.338896020539153, + "grad_norm": 1.081935167312622, + "learning_rate": 2.5557124518613607e-05, + "loss": 0.3578, + "step": 18181 + }, + { + "epoch": 23.34017971758665, + "grad_norm": 1.1536897420883179, + "learning_rate": 2.555669661959778e-05, + "loss": 0.3478, + "step": 18182 + }, + { + "epoch": 23.341463414634145, + "grad_norm": 1.8331862688064575, + "learning_rate": 2.5556268720581943e-05, + "loss": 0.3274, + "step": 18183 + }, + { + "epoch": 23.342747111681643, + "grad_norm": 1.6848855018615723, + "learning_rate": 2.555584082156611e-05, + "loss": 0.3682, + "step": 18184 + }, + { + "epoch": 23.34403080872914, + "grad_norm": 1.2626979351043701, + 
"learning_rate": 2.555541292255028e-05, + "loss": 0.3761, + "step": 18185 + }, + { + "epoch": 23.345314505776635, + "grad_norm": 2.3193359375, + "learning_rate": 2.5554985023534445e-05, + "loss": 0.3891, + "step": 18186 + }, + { + "epoch": 23.346598202824133, + "grad_norm": 1.2243012189865112, + "learning_rate": 2.5554557124518614e-05, + "loss": 0.369, + "step": 18187 + }, + { + "epoch": 23.34788189987163, + "grad_norm": 1.2613110542297363, + "learning_rate": 2.5554129225502782e-05, + "loss": 0.4025, + "step": 18188 + }, + { + "epoch": 23.34916559691913, + "grad_norm": 0.8869873285293579, + "learning_rate": 2.555370132648695e-05, + "loss": 0.3584, + "step": 18189 + }, + { + "epoch": 23.350449293966623, + "grad_norm": 2.2000985145568848, + "learning_rate": 2.555327342747112e-05, + "loss": 0.3615, + "step": 18190 + }, + { + "epoch": 23.35173299101412, + "grad_norm": 0.9943927526473999, + "learning_rate": 2.5552845528455284e-05, + "loss": 0.4083, + "step": 18191 + }, + { + "epoch": 23.35301668806162, + "grad_norm": 1.2278389930725098, + "learning_rate": 2.5552417629439452e-05, + "loss": 0.3576, + "step": 18192 + }, + { + "epoch": 23.354300385109113, + "grad_norm": 2.28360652923584, + "learning_rate": 2.555198973042362e-05, + "loss": 0.3949, + "step": 18193 + }, + { + "epoch": 23.35558408215661, + "grad_norm": 1.5566799640655518, + "learning_rate": 2.555156183140779e-05, + "loss": 0.3653, + "step": 18194 + }, + { + "epoch": 23.35686777920411, + "grad_norm": 1.5747610330581665, + "learning_rate": 2.5551133932391954e-05, + "loss": 0.3765, + "step": 18195 + }, + { + "epoch": 23.358151476251603, + "grad_norm": 4.2916998863220215, + "learning_rate": 2.5550706033376126e-05, + "loss": 0.4007, + "step": 18196 + }, + { + "epoch": 23.3594351732991, + "grad_norm": 1.8202310800552368, + "learning_rate": 2.555027813436029e-05, + "loss": 0.3812, + "step": 18197 + }, + { + "epoch": 23.3607188703466, + "grad_norm": 1.2535018920898438, + "learning_rate": 2.554985023534446e-05, + 
"loss": 0.3843, + "step": 18198 + }, + { + "epoch": 23.362002567394097, + "grad_norm": 0.770124077796936, + "learning_rate": 2.5549422336328628e-05, + "loss": 0.3768, + "step": 18199 + }, + { + "epoch": 23.36328626444159, + "grad_norm": 1.0012377500534058, + "learning_rate": 2.5548994437312793e-05, + "loss": 0.4099, + "step": 18200 + }, + { + "epoch": 23.36456996148909, + "grad_norm": 1.2622250318527222, + "learning_rate": 2.5548566538296965e-05, + "loss": 0.4237, + "step": 18201 + }, + { + "epoch": 23.365853658536587, + "grad_norm": 1.1757420301437378, + "learning_rate": 2.554813863928113e-05, + "loss": 0.3899, + "step": 18202 + }, + { + "epoch": 23.36713735558408, + "grad_norm": 1.2521450519561768, + "learning_rate": 2.5547710740265298e-05, + "loss": 0.3664, + "step": 18203 + }, + { + "epoch": 23.36842105263158, + "grad_norm": 1.8022801876068115, + "learning_rate": 2.5547282841249467e-05, + "loss": 0.4247, + "step": 18204 + }, + { + "epoch": 23.369704749679077, + "grad_norm": 2.1719281673431396, + "learning_rate": 2.554685494223363e-05, + "loss": 0.3505, + "step": 18205 + }, + { + "epoch": 23.37098844672657, + "grad_norm": 2.7632250785827637, + "learning_rate": 2.5546427043217803e-05, + "loss": 0.4114, + "step": 18206 + }, + { + "epoch": 23.37227214377407, + "grad_norm": 2.375551700592041, + "learning_rate": 2.554599914420197e-05, + "loss": 0.4163, + "step": 18207 + }, + { + "epoch": 23.373555840821567, + "grad_norm": 1.8935422897338867, + "learning_rate": 2.5545571245186137e-05, + "loss": 0.4123, + "step": 18208 + }, + { + "epoch": 23.374839537869065, + "grad_norm": 2.561626672744751, + "learning_rate": 2.5545143346170305e-05, + "loss": 0.3837, + "step": 18209 + }, + { + "epoch": 23.37612323491656, + "grad_norm": 1.9169650077819824, + "learning_rate": 2.5544715447154474e-05, + "loss": 0.4013, + "step": 18210 + }, + { + "epoch": 23.377406931964057, + "grad_norm": 1.4398386478424072, + "learning_rate": 2.554428754813864e-05, + "loss": 0.4204, + "step": 18211 + }, 
+ { + "epoch": 23.378690629011555, + "grad_norm": 1.6874620914459229, + "learning_rate": 2.5543859649122807e-05, + "loss": 0.4274, + "step": 18212 + }, + { + "epoch": 23.37997432605905, + "grad_norm": 9.127222061157227, + "learning_rate": 2.5543431750106975e-05, + "loss": 0.4349, + "step": 18213 + }, + { + "epoch": 23.381258023106547, + "grad_norm": 2.932525396347046, + "learning_rate": 2.5543003851091144e-05, + "loss": 0.4285, + "step": 18214 + }, + { + "epoch": 23.382541720154045, + "grad_norm": 1.3833777904510498, + "learning_rate": 2.5542575952075312e-05, + "loss": 0.4547, + "step": 18215 + }, + { + "epoch": 23.38382541720154, + "grad_norm": 2.2949421405792236, + "learning_rate": 2.5542148053059477e-05, + "loss": 0.5024, + "step": 18216 + }, + { + "epoch": 23.385109114249037, + "grad_norm": 17.468015670776367, + "learning_rate": 2.5541720154043646e-05, + "loss": 0.5916, + "step": 18217 + }, + { + "epoch": 23.386392811296535, + "grad_norm": 2.188413381576538, + "learning_rate": 2.5541292255027814e-05, + "loss": 0.3708, + "step": 18218 + }, + { + "epoch": 23.387676508344033, + "grad_norm": 1.1034406423568726, + "learning_rate": 2.554086435601198e-05, + "loss": 0.363, + "step": 18219 + }, + { + "epoch": 23.388960205391527, + "grad_norm": 1.0901312828063965, + "learning_rate": 2.554043645699615e-05, + "loss": 0.3538, + "step": 18220 + }, + { + "epoch": 23.390243902439025, + "grad_norm": 1.184893012046814, + "learning_rate": 2.5540008557980316e-05, + "loss": 0.3831, + "step": 18221 + }, + { + "epoch": 23.391527599486523, + "grad_norm": 1.2522181272506714, + "learning_rate": 2.5539580658964488e-05, + "loss": 0.3483, + "step": 18222 + }, + { + "epoch": 23.392811296534017, + "grad_norm": 1.0054221153259277, + "learning_rate": 2.5539152759948653e-05, + "loss": 0.3585, + "step": 18223 + }, + { + "epoch": 23.394094993581515, + "grad_norm": 1.0289678573608398, + "learning_rate": 2.5538724860932818e-05, + "loss": 0.3731, + "step": 18224 + }, + { + "epoch": 
23.395378690629013, + "grad_norm": 1.4778059720993042, + "learning_rate": 2.553829696191699e-05, + "loss": 0.427, + "step": 18225 + }, + { + "epoch": 23.396662387676507, + "grad_norm": 1.4274498224258423, + "learning_rate": 2.5537869062901155e-05, + "loss": 0.376, + "step": 18226 + }, + { + "epoch": 23.397946084724005, + "grad_norm": 0.9327660799026489, + "learning_rate": 2.5537441163885323e-05, + "loss": 0.3715, + "step": 18227 + }, + { + "epoch": 23.399229781771503, + "grad_norm": 1.340412974357605, + "learning_rate": 2.553701326486949e-05, + "loss": 0.4186, + "step": 18228 + }, + { + "epoch": 23.400513478818997, + "grad_norm": 1.1297540664672852, + "learning_rate": 2.553658536585366e-05, + "loss": 0.4028, + "step": 18229 + }, + { + "epoch": 23.401797175866495, + "grad_norm": 0.8403692245483398, + "learning_rate": 2.5536157466837828e-05, + "loss": 0.3513, + "step": 18230 + }, + { + "epoch": 23.403080872913993, + "grad_norm": 1.995082139968872, + "learning_rate": 2.5535729567821993e-05, + "loss": 0.373, + "step": 18231 + }, + { + "epoch": 23.40436456996149, + "grad_norm": 1.3288731575012207, + "learning_rate": 2.5535301668806162e-05, + "loss": 0.3502, + "step": 18232 + }, + { + "epoch": 23.405648267008985, + "grad_norm": 2.6902952194213867, + "learning_rate": 2.553487376979033e-05, + "loss": 0.3777, + "step": 18233 + }, + { + "epoch": 23.406931964056483, + "grad_norm": 1.2273308038711548, + "learning_rate": 2.55344458707745e-05, + "loss": 0.345, + "step": 18234 + }, + { + "epoch": 23.40821566110398, + "grad_norm": 1.4092600345611572, + "learning_rate": 2.5534017971758664e-05, + "loss": 0.3606, + "step": 18235 + }, + { + "epoch": 23.409499358151475, + "grad_norm": 1.5530346632003784, + "learning_rate": 2.5533590072742835e-05, + "loss": 0.3839, + "step": 18236 + }, + { + "epoch": 23.410783055198973, + "grad_norm": 1.3551806211471558, + "learning_rate": 2.5533162173727e-05, + "loss": 0.3864, + "step": 18237 + }, + { + "epoch": 23.41206675224647, + "grad_norm": 
0.9141874313354492, + "learning_rate": 2.553273427471117e-05, + "loss": 0.39, + "step": 18238 + }, + { + "epoch": 23.413350449293965, + "grad_norm": 1.3791979551315308, + "learning_rate": 2.5532306375695337e-05, + "loss": 0.3928, + "step": 18239 + }, + { + "epoch": 23.414634146341463, + "grad_norm": 1.496214747428894, + "learning_rate": 2.5531878476679502e-05, + "loss": 0.4263, + "step": 18240 + }, + { + "epoch": 23.41591784338896, + "grad_norm": 1.2130610942840576, + "learning_rate": 2.5531450577663674e-05, + "loss": 0.381, + "step": 18241 + }, + { + "epoch": 23.41720154043646, + "grad_norm": 0.8732167482376099, + "learning_rate": 2.553102267864784e-05, + "loss": 0.3616, + "step": 18242 + }, + { + "epoch": 23.418485237483953, + "grad_norm": 1.8399399518966675, + "learning_rate": 2.5530594779632007e-05, + "loss": 0.3838, + "step": 18243 + }, + { + "epoch": 23.41976893453145, + "grad_norm": 1.7616621255874634, + "learning_rate": 2.5530166880616176e-05, + "loss": 0.3342, + "step": 18244 + }, + { + "epoch": 23.42105263157895, + "grad_norm": 1.0797815322875977, + "learning_rate": 2.552973898160034e-05, + "loss": 0.3561, + "step": 18245 + }, + { + "epoch": 23.422336328626443, + "grad_norm": 1.0908056497573853, + "learning_rate": 2.5529311082584513e-05, + "loss": 0.4, + "step": 18246 + }, + { + "epoch": 23.42362002567394, + "grad_norm": 1.0555497407913208, + "learning_rate": 2.5528883183568678e-05, + "loss": 0.3741, + "step": 18247 + }, + { + "epoch": 23.42490372272144, + "grad_norm": 1.1343915462493896, + "learning_rate": 2.5528455284552846e-05, + "loss": 0.3732, + "step": 18248 + }, + { + "epoch": 23.426187419768933, + "grad_norm": 1.3904342651367188, + "learning_rate": 2.5528027385537015e-05, + "loss": 0.3998, + "step": 18249 + }, + { + "epoch": 23.42747111681643, + "grad_norm": 1.2860291004180908, + "learning_rate": 2.5527599486521183e-05, + "loss": 0.3928, + "step": 18250 + }, + { + "epoch": 23.42875481386393, + "grad_norm": 1.299680233001709, + "learning_rate": 
2.5527171587505348e-05, + "loss": 0.4137, + "step": 18251 + }, + { + "epoch": 23.430038510911427, + "grad_norm": 7.621994972229004, + "learning_rate": 2.5526743688489516e-05, + "loss": 0.3887, + "step": 18252 + }, + { + "epoch": 23.43132220795892, + "grad_norm": 1.5479800701141357, + "learning_rate": 2.5526315789473685e-05, + "loss": 0.3879, + "step": 18253 + }, + { + "epoch": 23.43260590500642, + "grad_norm": 2.210064649581909, + "learning_rate": 2.5525887890457853e-05, + "loss": 0.3478, + "step": 18254 + }, + { + "epoch": 23.433889602053917, + "grad_norm": 1.0533020496368408, + "learning_rate": 2.552545999144202e-05, + "loss": 0.413, + "step": 18255 + }, + { + "epoch": 23.43517329910141, + "grad_norm": 1.1854075193405151, + "learning_rate": 2.5525032092426187e-05, + "loss": 0.4173, + "step": 18256 + }, + { + "epoch": 23.43645699614891, + "grad_norm": 1.6001638174057007, + "learning_rate": 2.552460419341036e-05, + "loss": 0.3825, + "step": 18257 + }, + { + "epoch": 23.437740693196407, + "grad_norm": 1.2147377729415894, + "learning_rate": 2.5524176294394523e-05, + "loss": 0.3815, + "step": 18258 + }, + { + "epoch": 23.4390243902439, + "grad_norm": 1.3310441970825195, + "learning_rate": 2.552374839537869e-05, + "loss": 0.4511, + "step": 18259 + }, + { + "epoch": 23.4403080872914, + "grad_norm": 1.4674832820892334, + "learning_rate": 2.552332049636286e-05, + "loss": 0.4072, + "step": 18260 + }, + { + "epoch": 23.441591784338897, + "grad_norm": 1.2030906677246094, + "learning_rate": 2.5522892597347025e-05, + "loss": 0.4281, + "step": 18261 + }, + { + "epoch": 23.44287548138639, + "grad_norm": 1.713975191116333, + "learning_rate": 2.5522464698331197e-05, + "loss": 0.4375, + "step": 18262 + }, + { + "epoch": 23.44415917843389, + "grad_norm": 2.068495035171509, + "learning_rate": 2.5522036799315362e-05, + "loss": 0.4738, + "step": 18263 + }, + { + "epoch": 23.445442875481387, + "grad_norm": 1.98995041847229, + "learning_rate": 2.552160890029953e-05, + "loss": 0.4347, + 
"step": 18264 + }, + { + "epoch": 23.446726572528885, + "grad_norm": 2.765674591064453, + "learning_rate": 2.55211810012837e-05, + "loss": 0.4613, + "step": 18265 + }, + { + "epoch": 23.44801026957638, + "grad_norm": 2.5325615406036377, + "learning_rate": 2.5520753102267864e-05, + "loss": 0.5357, + "step": 18266 + }, + { + "epoch": 23.449293966623877, + "grad_norm": 2.2136950492858887, + "learning_rate": 2.5520325203252032e-05, + "loss": 0.5814, + "step": 18267 + }, + { + "epoch": 23.450577663671375, + "grad_norm": 1.779914140701294, + "learning_rate": 2.55198973042362e-05, + "loss": 0.3352, + "step": 18268 + }, + { + "epoch": 23.45186136071887, + "grad_norm": 1.3771377801895142, + "learning_rate": 2.551946940522037e-05, + "loss": 0.3519, + "step": 18269 + }, + { + "epoch": 23.453145057766367, + "grad_norm": 1.4583022594451904, + "learning_rate": 2.5519041506204538e-05, + "loss": 0.3577, + "step": 18270 + }, + { + "epoch": 23.454428754813865, + "grad_norm": 2.0725643634796143, + "learning_rate": 2.5518613607188706e-05, + "loss": 0.3982, + "step": 18271 + }, + { + "epoch": 23.45571245186136, + "grad_norm": 3.035865545272827, + "learning_rate": 2.551818570817287e-05, + "loss": 0.3681, + "step": 18272 + }, + { + "epoch": 23.456996148908857, + "grad_norm": 1.5946366786956787, + "learning_rate": 2.551775780915704e-05, + "loss": 0.3773, + "step": 18273 + }, + { + "epoch": 23.458279845956355, + "grad_norm": 1.2354310750961304, + "learning_rate": 2.5517329910141208e-05, + "loss": 0.3703, + "step": 18274 + }, + { + "epoch": 23.459563543003853, + "grad_norm": 1.3151401281356812, + "learning_rate": 2.5516902011125373e-05, + "loss": 0.3892, + "step": 18275 + }, + { + "epoch": 23.460847240051347, + "grad_norm": 0.9751563668251038, + "learning_rate": 2.5516474112109545e-05, + "loss": 0.3609, + "step": 18276 + }, + { + "epoch": 23.462130937098845, + "grad_norm": 1.1862766742706299, + "learning_rate": 2.551604621309371e-05, + "loss": 0.3996, + "step": 18277 + }, + { + "epoch": 
23.463414634146343, + "grad_norm": 1.5395606756210327, + "learning_rate": 2.5515618314077878e-05, + "loss": 0.3555, + "step": 18278 + }, + { + "epoch": 23.464698331193837, + "grad_norm": 4.572415828704834, + "learning_rate": 2.5515190415062047e-05, + "loss": 0.3648, + "step": 18279 + }, + { + "epoch": 23.465982028241335, + "grad_norm": 1.2427467107772827, + "learning_rate": 2.551476251604621e-05, + "loss": 0.4031, + "step": 18280 + }, + { + "epoch": 23.467265725288833, + "grad_norm": 1.389382004737854, + "learning_rate": 2.5514334617030383e-05, + "loss": 0.3988, + "step": 18281 + }, + { + "epoch": 23.468549422336327, + "grad_norm": 1.3566608428955078, + "learning_rate": 2.551390671801455e-05, + "loss": 0.3873, + "step": 18282 + }, + { + "epoch": 23.469833119383825, + "grad_norm": 1.3001052141189575, + "learning_rate": 2.5513478818998717e-05, + "loss": 0.3848, + "step": 18283 + }, + { + "epoch": 23.471116816431323, + "grad_norm": 2.3907525539398193, + "learning_rate": 2.5513050919982885e-05, + "loss": 0.3748, + "step": 18284 + }, + { + "epoch": 23.47240051347882, + "grad_norm": 0.9059511423110962, + "learning_rate": 2.551262302096705e-05, + "loss": 0.3663, + "step": 18285 + }, + { + "epoch": 23.473684210526315, + "grad_norm": 5.925637245178223, + "learning_rate": 2.5512195121951222e-05, + "loss": 0.344, + "step": 18286 + }, + { + "epoch": 23.474967907573813, + "grad_norm": 2.166177272796631, + "learning_rate": 2.5511767222935387e-05, + "loss": 0.3923, + "step": 18287 + }, + { + "epoch": 23.47625160462131, + "grad_norm": 0.940884530544281, + "learning_rate": 2.5511339323919556e-05, + "loss": 0.3954, + "step": 18288 + }, + { + "epoch": 23.477535301668805, + "grad_norm": 1.7697603702545166, + "learning_rate": 2.5510911424903724e-05, + "loss": 0.3535, + "step": 18289 + }, + { + "epoch": 23.478818998716303, + "grad_norm": 1.4463821649551392, + "learning_rate": 2.5510483525887892e-05, + "loss": 0.3809, + "step": 18290 + }, + { + "epoch": 23.4801026957638, + "grad_norm": 
1.2552754878997803, + "learning_rate": 2.5510055626872057e-05, + "loss": 0.3489, + "step": 18291 + }, + { + "epoch": 23.481386392811295, + "grad_norm": 1.152681827545166, + "learning_rate": 2.5509627727856226e-05, + "loss": 0.3728, + "step": 18292 + }, + { + "epoch": 23.482670089858793, + "grad_norm": 1.3434025049209595, + "learning_rate": 2.5509199828840394e-05, + "loss": 0.4082, + "step": 18293 + }, + { + "epoch": 23.48395378690629, + "grad_norm": 1.0861718654632568, + "learning_rate": 2.5508771929824563e-05, + "loss": 0.3679, + "step": 18294 + }, + { + "epoch": 23.485237483953785, + "grad_norm": 1.4680237770080566, + "learning_rate": 2.550834403080873e-05, + "loss": 0.3411, + "step": 18295 + }, + { + "epoch": 23.486521181001283, + "grad_norm": 3.649859666824341, + "learning_rate": 2.5507916131792896e-05, + "loss": 0.3874, + "step": 18296 + }, + { + "epoch": 23.48780487804878, + "grad_norm": 1.5909830331802368, + "learning_rate": 2.5507488232777068e-05, + "loss": 0.3906, + "step": 18297 + }, + { + "epoch": 23.48908857509628, + "grad_norm": 1.3824788331985474, + "learning_rate": 2.5507060333761233e-05, + "loss": 0.3778, + "step": 18298 + }, + { + "epoch": 23.490372272143773, + "grad_norm": 0.9119117856025696, + "learning_rate": 2.5506632434745398e-05, + "loss": 0.381, + "step": 18299 + }, + { + "epoch": 23.49165596919127, + "grad_norm": 1.4391995668411255, + "learning_rate": 2.550620453572957e-05, + "loss": 0.3613, + "step": 18300 + }, + { + "epoch": 23.49293966623877, + "grad_norm": 1.0765671730041504, + "learning_rate": 2.5505776636713735e-05, + "loss": 0.3711, + "step": 18301 + }, + { + "epoch": 23.494223363286263, + "grad_norm": 1.423694372177124, + "learning_rate": 2.5505348737697907e-05, + "loss": 0.417, + "step": 18302 + }, + { + "epoch": 23.49550706033376, + "grad_norm": 3.9986836910247803, + "learning_rate": 2.550492083868207e-05, + "loss": 0.4394, + "step": 18303 + }, + { + "epoch": 23.49679075738126, + "grad_norm": 1.3737717866897583, + "learning_rate": 
2.550449293966624e-05, + "loss": 0.4263, + "step": 18304 + }, + { + "epoch": 23.498074454428753, + "grad_norm": 3.073056697845459, + "learning_rate": 2.550406504065041e-05, + "loss": 0.4347, + "step": 18305 + }, + { + "epoch": 23.49935815147625, + "grad_norm": 1.567456603050232, + "learning_rate": 2.5503637141634573e-05, + "loss": 0.4184, + "step": 18306 + }, + { + "epoch": 23.50064184852375, + "grad_norm": 1.1524863243103027, + "learning_rate": 2.5503209242618742e-05, + "loss": 0.3812, + "step": 18307 + }, + { + "epoch": 23.501925545571247, + "grad_norm": 1.7472355365753174, + "learning_rate": 2.550278134360291e-05, + "loss": 0.4228, + "step": 18308 + }, + { + "epoch": 23.50320924261874, + "grad_norm": 2.102623224258423, + "learning_rate": 2.550235344458708e-05, + "loss": 0.4018, + "step": 18309 + }, + { + "epoch": 23.50449293966624, + "grad_norm": 2.487945556640625, + "learning_rate": 2.5501925545571247e-05, + "loss": 0.4496, + "step": 18310 + }, + { + "epoch": 23.505776636713737, + "grad_norm": 1.9011369943618774, + "learning_rate": 2.5501497646555415e-05, + "loss": 0.4001, + "step": 18311 + }, + { + "epoch": 23.50706033376123, + "grad_norm": 1.3714033365249634, + "learning_rate": 2.550106974753958e-05, + "loss": 0.4696, + "step": 18312 + }, + { + "epoch": 23.50834403080873, + "grad_norm": 2.071016550064087, + "learning_rate": 2.550064184852375e-05, + "loss": 0.4643, + "step": 18313 + }, + { + "epoch": 23.509627727856227, + "grad_norm": 1.7675989866256714, + "learning_rate": 2.5500213949507917e-05, + "loss": 0.3954, + "step": 18314 + }, + { + "epoch": 23.51091142490372, + "grad_norm": 1.7974189519882202, + "learning_rate": 2.5499786050492082e-05, + "loss": 0.5014, + "step": 18315 + }, + { + "epoch": 23.51219512195122, + "grad_norm": 1.6678011417388916, + "learning_rate": 2.5499358151476254e-05, + "loss": 0.5126, + "step": 18316 + }, + { + "epoch": 23.513478818998717, + "grad_norm": 3.0743467807769775, + "learning_rate": 2.549893025246042e-05, + "loss": 0.5963, + 
"step": 18317 + }, + { + "epoch": 23.514762516046215, + "grad_norm": 2.2043254375457764, + "learning_rate": 2.549850235344459e-05, + "loss": 0.3242, + "step": 18318 + }, + { + "epoch": 23.51604621309371, + "grad_norm": 1.4546160697937012, + "learning_rate": 2.5498074454428756e-05, + "loss": 0.3807, + "step": 18319 + }, + { + "epoch": 23.517329910141207, + "grad_norm": 0.9254487156867981, + "learning_rate": 2.549764655541292e-05, + "loss": 0.4034, + "step": 18320 + }, + { + "epoch": 23.518613607188705, + "grad_norm": 1.2389934062957764, + "learning_rate": 2.5497218656397093e-05, + "loss": 0.3603, + "step": 18321 + }, + { + "epoch": 23.5198973042362, + "grad_norm": 1.6888505220413208, + "learning_rate": 2.5496790757381258e-05, + "loss": 0.3714, + "step": 18322 + }, + { + "epoch": 23.521181001283697, + "grad_norm": 1.2871043682098389, + "learning_rate": 2.5496362858365426e-05, + "loss": 0.3327, + "step": 18323 + }, + { + "epoch": 23.522464698331195, + "grad_norm": 0.9584406018257141, + "learning_rate": 2.5495934959349595e-05, + "loss": 0.3859, + "step": 18324 + }, + { + "epoch": 23.52374839537869, + "grad_norm": 1.1052560806274414, + "learning_rate": 2.5495507060333763e-05, + "loss": 0.3551, + "step": 18325 + }, + { + "epoch": 23.525032092426187, + "grad_norm": 2.2205939292907715, + "learning_rate": 2.549507916131793e-05, + "loss": 0.392, + "step": 18326 + }, + { + "epoch": 23.526315789473685, + "grad_norm": 0.9903217554092407, + "learning_rate": 2.5494651262302096e-05, + "loss": 0.3901, + "step": 18327 + }, + { + "epoch": 23.527599486521183, + "grad_norm": 1.0493650436401367, + "learning_rate": 2.5494223363286265e-05, + "loss": 0.3784, + "step": 18328 + }, + { + "epoch": 23.528883183568677, + "grad_norm": 1.9929184913635254, + "learning_rate": 2.5493795464270433e-05, + "loss": 0.3608, + "step": 18329 + }, + { + "epoch": 23.530166880616175, + "grad_norm": 1.9169199466705322, + "learning_rate": 2.5493367565254602e-05, + "loss": 0.3682, + "step": 18330 + }, + { + 
"epoch": 23.531450577663673, + "grad_norm": 4.151062488555908, + "learning_rate": 2.5492939666238767e-05, + "loss": 0.4161, + "step": 18331 + }, + { + "epoch": 23.532734274711167, + "grad_norm": 0.8469922542572021, + "learning_rate": 2.549251176722294e-05, + "loss": 0.3694, + "step": 18332 + }, + { + "epoch": 23.534017971758665, + "grad_norm": 4.921602725982666, + "learning_rate": 2.5492083868207104e-05, + "loss": 0.3639, + "step": 18333 + }, + { + "epoch": 23.535301668806163, + "grad_norm": 1.081498384475708, + "learning_rate": 2.5491655969191272e-05, + "loss": 0.3937, + "step": 18334 + }, + { + "epoch": 23.536585365853657, + "grad_norm": 1.3021000623703003, + "learning_rate": 2.549122807017544e-05, + "loss": 0.3587, + "step": 18335 + }, + { + "epoch": 23.537869062901155, + "grad_norm": 1.7782490253448486, + "learning_rate": 2.5490800171159605e-05, + "loss": 0.3631, + "step": 18336 + }, + { + "epoch": 23.539152759948653, + "grad_norm": 1.291882872581482, + "learning_rate": 2.5490372272143777e-05, + "loss": 0.3797, + "step": 18337 + }, + { + "epoch": 23.540436456996147, + "grad_norm": 1.4365935325622559, + "learning_rate": 2.5489944373127942e-05, + "loss": 0.3893, + "step": 18338 + }, + { + "epoch": 23.541720154043645, + "grad_norm": 1.511301040649414, + "learning_rate": 2.548951647411211e-05, + "loss": 0.3666, + "step": 18339 + }, + { + "epoch": 23.543003851091143, + "grad_norm": 0.9607625007629395, + "learning_rate": 2.548908857509628e-05, + "loss": 0.3547, + "step": 18340 + }, + { + "epoch": 23.54428754813864, + "grad_norm": 0.8301942348480225, + "learning_rate": 2.5488660676080444e-05, + "loss": 0.3386, + "step": 18341 + }, + { + "epoch": 23.545571245186135, + "grad_norm": 1.1798527240753174, + "learning_rate": 2.5488232777064612e-05, + "loss": 0.3493, + "step": 18342 + }, + { + "epoch": 23.546854942233633, + "grad_norm": 0.9141796231269836, + "learning_rate": 2.548780487804878e-05, + "loss": 0.3471, + "step": 18343 + }, + { + "epoch": 23.54813863928113, + 
"grad_norm": 0.8484180569648743, + "learning_rate": 2.548737697903295e-05, + "loss": 0.3434, + "step": 18344 + }, + { + "epoch": 23.549422336328625, + "grad_norm": 1.0888713598251343, + "learning_rate": 2.5486949080017118e-05, + "loss": 0.3455, + "step": 18345 + }, + { + "epoch": 23.550706033376123, + "grad_norm": 1.661746859550476, + "learning_rate": 2.5486521181001283e-05, + "loss": 0.3785, + "step": 18346 + }, + { + "epoch": 23.55198973042362, + "grad_norm": 0.9634975790977478, + "learning_rate": 2.548609328198545e-05, + "loss": 0.3715, + "step": 18347 + }, + { + "epoch": 23.553273427471115, + "grad_norm": 1.2982277870178223, + "learning_rate": 2.548566538296962e-05, + "loss": 0.3838, + "step": 18348 + }, + { + "epoch": 23.554557124518613, + "grad_norm": 1.3193999528884888, + "learning_rate": 2.5485237483953788e-05, + "loss": 0.3817, + "step": 18349 + }, + { + "epoch": 23.55584082156611, + "grad_norm": 1.315595030784607, + "learning_rate": 2.5484809584937953e-05, + "loss": 0.3722, + "step": 18350 + }, + { + "epoch": 23.55712451861361, + "grad_norm": 1.5507882833480835, + "learning_rate": 2.5484381685922125e-05, + "loss": 0.3727, + "step": 18351 + }, + { + "epoch": 23.558408215661103, + "grad_norm": 1.758336067199707, + "learning_rate": 2.548395378690629e-05, + "loss": 0.4215, + "step": 18352 + }, + { + "epoch": 23.5596919127086, + "grad_norm": 1.3161922693252563, + "learning_rate": 2.5483525887890458e-05, + "loss": 0.3963, + "step": 18353 + }, + { + "epoch": 23.5609756097561, + "grad_norm": 2.2741472721099854, + "learning_rate": 2.5483097988874627e-05, + "loss": 0.441, + "step": 18354 + }, + { + "epoch": 23.562259306803593, + "grad_norm": 3.6454551219940186, + "learning_rate": 2.548267008985879e-05, + "loss": 0.4486, + "step": 18355 + }, + { + "epoch": 23.56354300385109, + "grad_norm": 1.7481402158737183, + "learning_rate": 2.5482242190842963e-05, + "loss": 0.3564, + "step": 18356 + }, + { + "epoch": 23.56482670089859, + "grad_norm": 3.1558287143707275, + 
"learning_rate": 2.548181429182713e-05, + "loss": 0.4429, + "step": 18357 + }, + { + "epoch": 23.566110397946083, + "grad_norm": 1.6052472591400146, + "learning_rate": 2.5481386392811297e-05, + "loss": 0.3809, + "step": 18358 + }, + { + "epoch": 23.56739409499358, + "grad_norm": 2.008800506591797, + "learning_rate": 2.5480958493795465e-05, + "loss": 0.4526, + "step": 18359 + }, + { + "epoch": 23.56867779204108, + "grad_norm": 1.0800458192825317, + "learning_rate": 2.548053059477963e-05, + "loss": 0.413, + "step": 18360 + }, + { + "epoch": 23.569961489088577, + "grad_norm": 1.6631940603256226, + "learning_rate": 2.5480102695763802e-05, + "loss": 0.4337, + "step": 18361 + }, + { + "epoch": 23.57124518613607, + "grad_norm": 2.6906630992889404, + "learning_rate": 2.5479674796747967e-05, + "loss": 0.4071, + "step": 18362 + }, + { + "epoch": 23.57252888318357, + "grad_norm": 1.2784093618392944, + "learning_rate": 2.5479246897732136e-05, + "loss": 0.3797, + "step": 18363 + }, + { + "epoch": 23.573812580231067, + "grad_norm": 2.3760647773742676, + "learning_rate": 2.5478818998716304e-05, + "loss": 0.4595, + "step": 18364 + }, + { + "epoch": 23.57509627727856, + "grad_norm": 1.4648443460464478, + "learning_rate": 2.5478391099700472e-05, + "loss": 0.4187, + "step": 18365 + }, + { + "epoch": 23.57637997432606, + "grad_norm": 2.757107734680176, + "learning_rate": 2.5477963200684637e-05, + "loss": 0.551, + "step": 18366 + }, + { + "epoch": 23.577663671373557, + "grad_norm": 4.82423734664917, + "learning_rate": 2.5477535301668806e-05, + "loss": 0.5496, + "step": 18367 + }, + { + "epoch": 23.57894736842105, + "grad_norm": 1.1055890321731567, + "learning_rate": 2.5477107402652974e-05, + "loss": 0.3513, + "step": 18368 + }, + { + "epoch": 23.58023106546855, + "grad_norm": 1.0845611095428467, + "learning_rate": 2.5476679503637143e-05, + "loss": 0.3381, + "step": 18369 + }, + { + "epoch": 23.581514762516047, + "grad_norm": 0.970960259437561, + "learning_rate": 2.547625160462131e-05, 
+ "loss": 0.3561, + "step": 18370 + }, + { + "epoch": 23.58279845956354, + "grad_norm": 1.2401856184005737, + "learning_rate": 2.5475823705605476e-05, + "loss": 0.3846, + "step": 18371 + }, + { + "epoch": 23.58408215661104, + "grad_norm": 1.0284061431884766, + "learning_rate": 2.5475395806589648e-05, + "loss": 0.369, + "step": 18372 + }, + { + "epoch": 23.585365853658537, + "grad_norm": 2.332188367843628, + "learning_rate": 2.5474967907573813e-05, + "loss": 0.4007, + "step": 18373 + }, + { + "epoch": 23.586649550706035, + "grad_norm": 2.1273412704467773, + "learning_rate": 2.5474540008557978e-05, + "loss": 0.4075, + "step": 18374 + }, + { + "epoch": 23.58793324775353, + "grad_norm": 1.4512853622436523, + "learning_rate": 2.547411210954215e-05, + "loss": 0.3863, + "step": 18375 + }, + { + "epoch": 23.589216944801027, + "grad_norm": 1.331107258796692, + "learning_rate": 2.5473684210526315e-05, + "loss": 0.3653, + "step": 18376 + }, + { + "epoch": 23.590500641848525, + "grad_norm": 1.7931302785873413, + "learning_rate": 2.5473256311510487e-05, + "loss": 0.3932, + "step": 18377 + }, + { + "epoch": 23.59178433889602, + "grad_norm": 1.288786768913269, + "learning_rate": 2.547282841249465e-05, + "loss": 0.3281, + "step": 18378 + }, + { + "epoch": 23.593068035943517, + "grad_norm": 1.1029622554779053, + "learning_rate": 2.547240051347882e-05, + "loss": 0.4286, + "step": 18379 + }, + { + "epoch": 23.594351732991015, + "grad_norm": 1.2077988386154175, + "learning_rate": 2.547197261446299e-05, + "loss": 0.3803, + "step": 18380 + }, + { + "epoch": 23.59563543003851, + "grad_norm": 1.9619807004928589, + "learning_rate": 2.5471544715447153e-05, + "loss": 0.3765, + "step": 18381 + }, + { + "epoch": 23.596919127086007, + "grad_norm": 1.0140395164489746, + "learning_rate": 2.5471116816431322e-05, + "loss": 0.3607, + "step": 18382 + }, + { + "epoch": 23.598202824133505, + "grad_norm": 1.0829344987869263, + "learning_rate": 2.547068891741549e-05, + "loss": 0.3962, + "step": 18383 + 
}, + { + "epoch": 23.599486521181003, + "grad_norm": 0.8895203471183777, + "learning_rate": 2.547026101839966e-05, + "loss": 0.351, + "step": 18384 + }, + { + "epoch": 23.600770218228497, + "grad_norm": 1.0069761276245117, + "learning_rate": 2.5469833119383827e-05, + "loss": 0.3513, + "step": 18385 + }, + { + "epoch": 23.602053915275995, + "grad_norm": 1.8556840419769287, + "learning_rate": 2.5469405220367995e-05, + "loss": 0.3762, + "step": 18386 + }, + { + "epoch": 23.603337612323493, + "grad_norm": 1.5993036031723022, + "learning_rate": 2.546897732135216e-05, + "loss": 0.3679, + "step": 18387 + }, + { + "epoch": 23.604621309370987, + "grad_norm": 1.3743330240249634, + "learning_rate": 2.546854942233633e-05, + "loss": 0.3473, + "step": 18388 + }, + { + "epoch": 23.605905006418485, + "grad_norm": 2.127445697784424, + "learning_rate": 2.5468121523320497e-05, + "loss": 0.3701, + "step": 18389 + }, + { + "epoch": 23.607188703465983, + "grad_norm": 4.230437755584717, + "learning_rate": 2.5467693624304662e-05, + "loss": 0.3986, + "step": 18390 + }, + { + "epoch": 23.608472400513477, + "grad_norm": 2.2138638496398926, + "learning_rate": 2.5467265725288834e-05, + "loss": 0.3583, + "step": 18391 + }, + { + "epoch": 23.609756097560975, + "grad_norm": 0.969383716583252, + "learning_rate": 2.5466837826273e-05, + "loss": 0.4038, + "step": 18392 + }, + { + "epoch": 23.611039794608473, + "grad_norm": 1.4829514026641846, + "learning_rate": 2.546640992725717e-05, + "loss": 0.3972, + "step": 18393 + }, + { + "epoch": 23.61232349165597, + "grad_norm": 1.5019690990447998, + "learning_rate": 2.5465982028241336e-05, + "loss": 0.3934, + "step": 18394 + }, + { + "epoch": 23.613607188703465, + "grad_norm": 1.5999181270599365, + "learning_rate": 2.54655541292255e-05, + "loss": 0.3913, + "step": 18395 + }, + { + "epoch": 23.614890885750963, + "grad_norm": 1.0917890071868896, + "learning_rate": 2.5465126230209673e-05, + "loss": 0.428, + "step": 18396 + }, + { + "epoch": 23.61617458279846, + 
"grad_norm": 1.3549327850341797, + "learning_rate": 2.5464698331193838e-05, + "loss": 0.3846, + "step": 18397 + }, + { + "epoch": 23.617458279845955, + "grad_norm": 1.6832772493362427, + "learning_rate": 2.5464270432178006e-05, + "loss": 0.4167, + "step": 18398 + }, + { + "epoch": 23.618741976893453, + "grad_norm": 1.5821887254714966, + "learning_rate": 2.5463842533162175e-05, + "loss": 0.3601, + "step": 18399 + }, + { + "epoch": 23.62002567394095, + "grad_norm": 1.9140212535858154, + "learning_rate": 2.5463414634146343e-05, + "loss": 0.405, + "step": 18400 + }, + { + "epoch": 23.621309370988445, + "grad_norm": 1.6550942659378052, + "learning_rate": 2.546298673513051e-05, + "loss": 0.3953, + "step": 18401 + }, + { + "epoch": 23.622593068035943, + "grad_norm": 1.625435471534729, + "learning_rate": 2.5462558836114677e-05, + "loss": 0.3988, + "step": 18402 + }, + { + "epoch": 23.62387676508344, + "grad_norm": 1.6104735136032104, + "learning_rate": 2.5462130937098845e-05, + "loss": 0.38, + "step": 18403 + }, + { + "epoch": 23.625160462130935, + "grad_norm": 1.6182198524475098, + "learning_rate": 2.5461703038083013e-05, + "loss": 0.3642, + "step": 18404 + }, + { + "epoch": 23.626444159178433, + "grad_norm": 2.3760874271392822, + "learning_rate": 2.5461275139067182e-05, + "loss": 0.4271, + "step": 18405 + }, + { + "epoch": 23.62772785622593, + "grad_norm": 1.6067359447479248, + "learning_rate": 2.5460847240051347e-05, + "loss": 0.3823, + "step": 18406 + }, + { + "epoch": 23.62901155327343, + "grad_norm": 2.260715961456299, + "learning_rate": 2.5460419341035515e-05, + "loss": 0.4558, + "step": 18407 + }, + { + "epoch": 23.630295250320923, + "grad_norm": 1.2596428394317627, + "learning_rate": 2.5459991442019684e-05, + "loss": 0.4132, + "step": 18408 + }, + { + "epoch": 23.63157894736842, + "grad_norm": 2.143681764602661, + "learning_rate": 2.5459563543003852e-05, + "loss": 0.434, + "step": 18409 + }, + { + "epoch": 23.63286264441592, + "grad_norm": 1.4759631156921387, + 
"learning_rate": 2.545913564398802e-05, + "loss": 0.4316, + "step": 18410 + }, + { + "epoch": 23.634146341463413, + "grad_norm": 1.8390917778015137, + "learning_rate": 2.5458707744972185e-05, + "loss": 0.4132, + "step": 18411 + }, + { + "epoch": 23.63543003851091, + "grad_norm": 3.059202194213867, + "learning_rate": 2.5458279845956357e-05, + "loss": 0.3996, + "step": 18412 + }, + { + "epoch": 23.63671373555841, + "grad_norm": 2.102499485015869, + "learning_rate": 2.5457851946940522e-05, + "loss": 0.4325, + "step": 18413 + }, + { + "epoch": 23.637997432605903, + "grad_norm": 2.1664774417877197, + "learning_rate": 2.5457424047924687e-05, + "loss": 0.4731, + "step": 18414 + }, + { + "epoch": 23.6392811296534, + "grad_norm": 1.4970952272415161, + "learning_rate": 2.545699614890886e-05, + "loss": 0.4809, + "step": 18415 + }, + { + "epoch": 23.6405648267009, + "grad_norm": 1.8087235689163208, + "learning_rate": 2.5456568249893024e-05, + "loss": 0.5288, + "step": 18416 + }, + { + "epoch": 23.641848523748397, + "grad_norm": 2.4153831005096436, + "learning_rate": 2.5456140350877196e-05, + "loss": 0.5598, + "step": 18417 + }, + { + "epoch": 23.64313222079589, + "grad_norm": 2.2050631046295166, + "learning_rate": 2.545571245186136e-05, + "loss": 0.3845, + "step": 18418 + }, + { + "epoch": 23.64441591784339, + "grad_norm": 1.0382916927337646, + "learning_rate": 2.545528455284553e-05, + "loss": 0.3718, + "step": 18419 + }, + { + "epoch": 23.645699614890887, + "grad_norm": 1.5422886610031128, + "learning_rate": 2.5454856653829698e-05, + "loss": 0.3723, + "step": 18420 + }, + { + "epoch": 23.64698331193838, + "grad_norm": 1.238390326499939, + "learning_rate": 2.5454428754813863e-05, + "loss": 0.3729, + "step": 18421 + }, + { + "epoch": 23.64826700898588, + "grad_norm": 1.4369810819625854, + "learning_rate": 2.545400085579803e-05, + "loss": 0.3899, + "step": 18422 + }, + { + "epoch": 23.649550706033377, + "grad_norm": 1.3014335632324219, + "learning_rate": 2.54535729567822e-05, + 
"loss": 0.3766, + "step": 18423 + }, + { + "epoch": 23.65083440308087, + "grad_norm": 3.486585855484009, + "learning_rate": 2.5453145057766368e-05, + "loss": 0.3744, + "step": 18424 + }, + { + "epoch": 23.65211810012837, + "grad_norm": 1.221432089805603, + "learning_rate": 2.5452717158750536e-05, + "loss": 0.3802, + "step": 18425 + }, + { + "epoch": 23.653401797175867, + "grad_norm": 1.2153607606887817, + "learning_rate": 2.5452289259734705e-05, + "loss": 0.3896, + "step": 18426 + }, + { + "epoch": 23.654685494223365, + "grad_norm": 2.332951307296753, + "learning_rate": 2.545186136071887e-05, + "loss": 0.3791, + "step": 18427 + }, + { + "epoch": 23.65596919127086, + "grad_norm": 1.3962504863739014, + "learning_rate": 2.5451433461703038e-05, + "loss": 0.3745, + "step": 18428 + }, + { + "epoch": 23.657252888318357, + "grad_norm": 1.132682204246521, + "learning_rate": 2.5451005562687207e-05, + "loss": 0.3707, + "step": 18429 + }, + { + "epoch": 23.658536585365855, + "grad_norm": 1.6229407787322998, + "learning_rate": 2.5450577663671372e-05, + "loss": 0.3775, + "step": 18430 + }, + { + "epoch": 23.65982028241335, + "grad_norm": 1.154305338859558, + "learning_rate": 2.5450149764655544e-05, + "loss": 0.3785, + "step": 18431 + }, + { + "epoch": 23.661103979460847, + "grad_norm": 1.626173973083496, + "learning_rate": 2.544972186563971e-05, + "loss": 0.3479, + "step": 18432 + }, + { + "epoch": 23.662387676508345, + "grad_norm": 1.8123149871826172, + "learning_rate": 2.544929396662388e-05, + "loss": 0.361, + "step": 18433 + }, + { + "epoch": 23.66367137355584, + "grad_norm": 3.4445559978485107, + "learning_rate": 2.5448866067608045e-05, + "loss": 0.3686, + "step": 18434 + }, + { + "epoch": 23.664955070603337, + "grad_norm": 1.6147632598876953, + "learning_rate": 2.544843816859221e-05, + "loss": 0.3607, + "step": 18435 + }, + { + "epoch": 23.666238767650835, + "grad_norm": 1.716482162475586, + "learning_rate": 2.5448010269576382e-05, + "loss": 0.3724, + "step": 18436 + }, + { 
+ "epoch": 23.66752246469833, + "grad_norm": 1.4639453887939453, + "learning_rate": 2.5447582370560547e-05, + "loss": 0.3908, + "step": 18437 + }, + { + "epoch": 23.668806161745827, + "grad_norm": 2.9882075786590576, + "learning_rate": 2.5447154471544716e-05, + "loss": 0.4396, + "step": 18438 + }, + { + "epoch": 23.670089858793325, + "grad_norm": 1.068698763847351, + "learning_rate": 2.5446726572528884e-05, + "loss": 0.3529, + "step": 18439 + }, + { + "epoch": 23.671373555840823, + "grad_norm": 0.8850210309028625, + "learning_rate": 2.5446298673513052e-05, + "loss": 0.3613, + "step": 18440 + }, + { + "epoch": 23.672657252888317, + "grad_norm": 1.2827370166778564, + "learning_rate": 2.544587077449722e-05, + "loss": 0.3708, + "step": 18441 + }, + { + "epoch": 23.673940949935815, + "grad_norm": 1.5919691324234009, + "learning_rate": 2.5445442875481386e-05, + "loss": 0.3589, + "step": 18442 + }, + { + "epoch": 23.675224646983313, + "grad_norm": 1.0935680866241455, + "learning_rate": 2.5445014976465554e-05, + "loss": 0.3891, + "step": 18443 + }, + { + "epoch": 23.676508344030808, + "grad_norm": 1.825258493423462, + "learning_rate": 2.5444587077449723e-05, + "loss": 0.3458, + "step": 18444 + }, + { + "epoch": 23.677792041078305, + "grad_norm": 1.1225916147232056, + "learning_rate": 2.544415917843389e-05, + "loss": 0.399, + "step": 18445 + }, + { + "epoch": 23.679075738125803, + "grad_norm": 1.4885008335113525, + "learning_rate": 2.5443731279418056e-05, + "loss": 0.3671, + "step": 18446 + }, + { + "epoch": 23.680359435173298, + "grad_norm": 1.5652426481246948, + "learning_rate": 2.5443303380402228e-05, + "loss": 0.3782, + "step": 18447 + }, + { + "epoch": 23.681643132220795, + "grad_norm": 1.5851415395736694, + "learning_rate": 2.5442875481386393e-05, + "loss": 0.3844, + "step": 18448 + }, + { + "epoch": 23.682926829268293, + "grad_norm": 3.2520833015441895, + "learning_rate": 2.544244758237056e-05, + "loss": 0.349, + "step": 18449 + }, + { + "epoch": 23.68421052631579, + 
"grad_norm": 2.0786054134368896, + "learning_rate": 2.544201968335473e-05, + "loss": 0.4223, + "step": 18450 + }, + { + "epoch": 23.685494223363285, + "grad_norm": 1.867618203163147, + "learning_rate": 2.5441591784338895e-05, + "loss": 0.4476, + "step": 18451 + }, + { + "epoch": 23.686777920410783, + "grad_norm": 1.4416251182556152, + "learning_rate": 2.5441163885323067e-05, + "loss": 0.3796, + "step": 18452 + }, + { + "epoch": 23.68806161745828, + "grad_norm": 1.4437148571014404, + "learning_rate": 2.544073598630723e-05, + "loss": 0.4025, + "step": 18453 + }, + { + "epoch": 23.689345314505776, + "grad_norm": 2.1699423789978027, + "learning_rate": 2.54403080872914e-05, + "loss": 0.3762, + "step": 18454 + }, + { + "epoch": 23.690629011553273, + "grad_norm": 2.3407537937164307, + "learning_rate": 2.543988018827557e-05, + "loss": 0.4237, + "step": 18455 + }, + { + "epoch": 23.69191270860077, + "grad_norm": 1.2341036796569824, + "learning_rate": 2.5439452289259734e-05, + "loss": 0.4104, + "step": 18456 + }, + { + "epoch": 23.693196405648266, + "grad_norm": 1.7765538692474365, + "learning_rate": 2.5439024390243905e-05, + "loss": 0.398, + "step": 18457 + }, + { + "epoch": 23.694480102695763, + "grad_norm": 1.50789475440979, + "learning_rate": 2.543859649122807e-05, + "loss": 0.3995, + "step": 18458 + }, + { + "epoch": 23.69576379974326, + "grad_norm": 2.4315013885498047, + "learning_rate": 2.543816859221224e-05, + "loss": 0.4509, + "step": 18459 + }, + { + "epoch": 23.69704749679076, + "grad_norm": 2.60837459564209, + "learning_rate": 2.5437740693196407e-05, + "loss": 0.4105, + "step": 18460 + }, + { + "epoch": 23.698331193838253, + "grad_norm": 1.219713568687439, + "learning_rate": 2.5437312794180576e-05, + "loss": 0.4343, + "step": 18461 + }, + { + "epoch": 23.69961489088575, + "grad_norm": 1.7127783298492432, + "learning_rate": 2.543688489516474e-05, + "loss": 0.4179, + "step": 18462 + }, + { + "epoch": 23.70089858793325, + "grad_norm": 3.0415940284729004, + 
"learning_rate": 2.543645699614891e-05, + "loss": 0.4315, + "step": 18463 + }, + { + "epoch": 23.702182284980744, + "grad_norm": 2.519655704498291, + "learning_rate": 2.5436029097133077e-05, + "loss": 0.4871, + "step": 18464 + }, + { + "epoch": 23.70346598202824, + "grad_norm": 1.6686604022979736, + "learning_rate": 2.5435601198117246e-05, + "loss": 0.4174, + "step": 18465 + }, + { + "epoch": 23.70474967907574, + "grad_norm": 2.301013946533203, + "learning_rate": 2.5435173299101414e-05, + "loss": 0.5244, + "step": 18466 + }, + { + "epoch": 23.706033376123234, + "grad_norm": 2.4921936988830566, + "learning_rate": 2.543474540008558e-05, + "loss": 0.568, + "step": 18467 + }, + { + "epoch": 23.70731707317073, + "grad_norm": 1.8298412561416626, + "learning_rate": 2.5434317501069748e-05, + "loss": 0.3627, + "step": 18468 + }, + { + "epoch": 23.70860077021823, + "grad_norm": 1.1559995412826538, + "learning_rate": 2.5433889602053916e-05, + "loss": 0.3354, + "step": 18469 + }, + { + "epoch": 23.709884467265724, + "grad_norm": 1.7519625425338745, + "learning_rate": 2.543346170303808e-05, + "loss": 0.3697, + "step": 18470 + }, + { + "epoch": 23.71116816431322, + "grad_norm": 3.920724630355835, + "learning_rate": 2.5433033804022253e-05, + "loss": 0.4242, + "step": 18471 + }, + { + "epoch": 23.71245186136072, + "grad_norm": 1.205984354019165, + "learning_rate": 2.5432605905006418e-05, + "loss": 0.347, + "step": 18472 + }, + { + "epoch": 23.713735558408217, + "grad_norm": 3.1785988807678223, + "learning_rate": 2.543217800599059e-05, + "loss": 0.3583, + "step": 18473 + }, + { + "epoch": 23.71501925545571, + "grad_norm": 1.1451071500778198, + "learning_rate": 2.5431750106974755e-05, + "loss": 0.4156, + "step": 18474 + }, + { + "epoch": 23.71630295250321, + "grad_norm": 1.6923178434371948, + "learning_rate": 2.543132220795892e-05, + "loss": 0.3474, + "step": 18475 + }, + { + "epoch": 23.717586649550707, + "grad_norm": 1.4587894678115845, + "learning_rate": 2.543089430894309e-05, + 
"loss": 0.3726, + "step": 18476 + }, + { + "epoch": 23.7188703465982, + "grad_norm": 1.3341926336288452, + "learning_rate": 2.5430466409927257e-05, + "loss": 0.4188, + "step": 18477 + }, + { + "epoch": 23.7201540436457, + "grad_norm": 2.1033668518066406, + "learning_rate": 2.5430038510911425e-05, + "loss": 0.3881, + "step": 18478 + }, + { + "epoch": 23.721437740693197, + "grad_norm": 1.610729455947876, + "learning_rate": 2.5429610611895593e-05, + "loss": 0.3346, + "step": 18479 + }, + { + "epoch": 23.72272143774069, + "grad_norm": 2.720423936843872, + "learning_rate": 2.5429182712879762e-05, + "loss": 0.3529, + "step": 18480 + }, + { + "epoch": 23.72400513478819, + "grad_norm": 1.3967759609222412, + "learning_rate": 2.542875481386393e-05, + "loss": 0.4123, + "step": 18481 + }, + { + "epoch": 23.725288831835687, + "grad_norm": 1.0659362077713013, + "learning_rate": 2.5428326914848095e-05, + "loss": 0.3766, + "step": 18482 + }, + { + "epoch": 23.726572528883185, + "grad_norm": 3.2995808124542236, + "learning_rate": 2.5427899015832264e-05, + "loss": 0.3828, + "step": 18483 + }, + { + "epoch": 23.72785622593068, + "grad_norm": 1.3088493347167969, + "learning_rate": 2.5427471116816432e-05, + "loss": 0.384, + "step": 18484 + }, + { + "epoch": 23.729139922978177, + "grad_norm": 1.0852473974227905, + "learning_rate": 2.54270432178006e-05, + "loss": 0.3833, + "step": 18485 + }, + { + "epoch": 23.730423620025675, + "grad_norm": 1.5082571506500244, + "learning_rate": 2.5426615318784766e-05, + "loss": 0.37, + "step": 18486 + }, + { + "epoch": 23.73170731707317, + "grad_norm": 1.613025188446045, + "learning_rate": 2.5426187419768937e-05, + "loss": 0.3808, + "step": 18487 + }, + { + "epoch": 23.732991014120667, + "grad_norm": 1.1722203493118286, + "learning_rate": 2.5425759520753102e-05, + "loss": 0.3814, + "step": 18488 + }, + { + "epoch": 23.734274711168165, + "grad_norm": 2.6167213916778564, + "learning_rate": 2.542533162173727e-05, + "loss": 0.3861, + "step": 18489 + }, + { 
+ "epoch": 23.73555840821566, + "grad_norm": 1.2829043865203857, + "learning_rate": 2.542490372272144e-05, + "loss": 0.3805, + "step": 18490 + }, + { + "epoch": 23.736842105263158, + "grad_norm": 1.0901044607162476, + "learning_rate": 2.5424475823705604e-05, + "loss": 0.3672, + "step": 18491 + }, + { + "epoch": 23.738125802310655, + "grad_norm": 1.5421808958053589, + "learning_rate": 2.5424047924689776e-05, + "loss": 0.376, + "step": 18492 + }, + { + "epoch": 23.739409499358153, + "grad_norm": 0.9159801006317139, + "learning_rate": 2.542362002567394e-05, + "loss": 0.3598, + "step": 18493 + }, + { + "epoch": 23.740693196405648, + "grad_norm": 1.2000658512115479, + "learning_rate": 2.542319212665811e-05, + "loss": 0.398, + "step": 18494 + }, + { + "epoch": 23.741976893453145, + "grad_norm": 1.7652667760849, + "learning_rate": 2.5422764227642278e-05, + "loss": 0.3884, + "step": 18495 + }, + { + "epoch": 23.743260590500643, + "grad_norm": 4.910795211791992, + "learning_rate": 2.5422336328626443e-05, + "loss": 0.3557, + "step": 18496 + }, + { + "epoch": 23.744544287548138, + "grad_norm": 1.6383473873138428, + "learning_rate": 2.5421908429610615e-05, + "loss": 0.3571, + "step": 18497 + }, + { + "epoch": 23.745827984595635, + "grad_norm": 1.5212024450302124, + "learning_rate": 2.542148053059478e-05, + "loss": 0.3792, + "step": 18498 + }, + { + "epoch": 23.747111681643133, + "grad_norm": 1.4532936811447144, + "learning_rate": 2.5421052631578948e-05, + "loss": 0.3995, + "step": 18499 + }, + { + "epoch": 23.748395378690628, + "grad_norm": 1.632054090499878, + "learning_rate": 2.5420624732563117e-05, + "loss": 0.4508, + "step": 18500 + }, + { + "epoch": 23.749679075738126, + "grad_norm": 5.18654727935791, + "learning_rate": 2.5420196833547285e-05, + "loss": 0.3248, + "step": 18501 + }, + { + "epoch": 23.750962772785623, + "grad_norm": 1.6313555240631104, + "learning_rate": 2.541976893453145e-05, + "loss": 0.4061, + "step": 18502 + }, + { + "epoch": 23.752246469833118, + 
"grad_norm": 3.0461740493774414, + "learning_rate": 2.541934103551562e-05, + "loss": 0.3769, + "step": 18503 + }, + { + "epoch": 23.753530166880616, + "grad_norm": 1.3290677070617676, + "learning_rate": 2.5418913136499787e-05, + "loss": 0.3935, + "step": 18504 + }, + { + "epoch": 23.754813863928113, + "grad_norm": 1.4868801832199097, + "learning_rate": 2.5418485237483955e-05, + "loss": 0.4229, + "step": 18505 + }, + { + "epoch": 23.75609756097561, + "grad_norm": 3.6422781944274902, + "learning_rate": 2.5418057338468124e-05, + "loss": 0.4324, + "step": 18506 + }, + { + "epoch": 23.757381258023106, + "grad_norm": 1.4450432062149048, + "learning_rate": 2.541762943945229e-05, + "loss": 0.3891, + "step": 18507 + }, + { + "epoch": 23.758664955070603, + "grad_norm": 1.7348097562789917, + "learning_rate": 2.541720154043646e-05, + "loss": 0.4234, + "step": 18508 + }, + { + "epoch": 23.7599486521181, + "grad_norm": 1.673869252204895, + "learning_rate": 2.5416773641420625e-05, + "loss": 0.4323, + "step": 18509 + }, + { + "epoch": 23.761232349165596, + "grad_norm": 2.3452696800231934, + "learning_rate": 2.541634574240479e-05, + "loss": 0.4103, + "step": 18510 + }, + { + "epoch": 23.762516046213094, + "grad_norm": 5.481480598449707, + "learning_rate": 2.5415917843388962e-05, + "loss": 0.4355, + "step": 18511 + }, + { + "epoch": 23.76379974326059, + "grad_norm": 1.774790644645691, + "learning_rate": 2.5415489944373127e-05, + "loss": 0.4277, + "step": 18512 + }, + { + "epoch": 23.765083440308086, + "grad_norm": 8.504080772399902, + "learning_rate": 2.54150620453573e-05, + "loss": 0.418, + "step": 18513 + }, + { + "epoch": 23.766367137355584, + "grad_norm": 1.4473233222961426, + "learning_rate": 2.5414634146341464e-05, + "loss": 0.4861, + "step": 18514 + }, + { + "epoch": 23.76765083440308, + "grad_norm": 2.6392693519592285, + "learning_rate": 2.5414206247325633e-05, + "loss": 0.459, + "step": 18515 + }, + { + "epoch": 23.76893453145058, + "grad_norm": 1.690735101699829, + 
"learning_rate": 2.54137783483098e-05, + "loss": 0.4803, + "step": 18516 + }, + { + "epoch": 23.770218228498074, + "grad_norm": 2.368752956390381, + "learning_rate": 2.5413350449293966e-05, + "loss": 0.5727, + "step": 18517 + }, + { + "epoch": 23.77150192554557, + "grad_norm": 0.96003258228302, + "learning_rate": 2.5412922550278134e-05, + "loss": 0.3539, + "step": 18518 + }, + { + "epoch": 23.77278562259307, + "grad_norm": 0.8323605060577393, + "learning_rate": 2.5412494651262303e-05, + "loss": 0.3664, + "step": 18519 + }, + { + "epoch": 23.774069319640564, + "grad_norm": 1.3653384447097778, + "learning_rate": 2.541206675224647e-05, + "loss": 0.35, + "step": 18520 + }, + { + "epoch": 23.77535301668806, + "grad_norm": 1.4450441598892212, + "learning_rate": 2.541163885323064e-05, + "loss": 0.3406, + "step": 18521 + }, + { + "epoch": 23.77663671373556, + "grad_norm": 3.9332544803619385, + "learning_rate": 2.5411210954214808e-05, + "loss": 0.3413, + "step": 18522 + }, + { + "epoch": 23.777920410783054, + "grad_norm": 1.0535011291503906, + "learning_rate": 2.5410783055198973e-05, + "loss": 0.3518, + "step": 18523 + }, + { + "epoch": 23.77920410783055, + "grad_norm": 2.278749704360962, + "learning_rate": 2.541035515618314e-05, + "loss": 0.3387, + "step": 18524 + }, + { + "epoch": 23.78048780487805, + "grad_norm": 3.176501989364624, + "learning_rate": 2.540992725716731e-05, + "loss": 0.3847, + "step": 18525 + }, + { + "epoch": 23.781771501925547, + "grad_norm": 2.016921043395996, + "learning_rate": 2.5409499358151475e-05, + "loss": 0.3862, + "step": 18526 + }, + { + "epoch": 23.78305519897304, + "grad_norm": 0.8776808381080627, + "learning_rate": 2.5409071459135647e-05, + "loss": 0.3858, + "step": 18527 + }, + { + "epoch": 23.78433889602054, + "grad_norm": 1.4090754985809326, + "learning_rate": 2.5408643560119812e-05, + "loss": 0.3682, + "step": 18528 + }, + { + "epoch": 23.785622593068037, + "grad_norm": 1.1705325841903687, + "learning_rate": 2.540821566110398e-05, + 
"loss": 0.367, + "step": 18529 + }, + { + "epoch": 23.78690629011553, + "grad_norm": 0.9339678883552551, + "learning_rate": 2.540778776208815e-05, + "loss": 0.3832, + "step": 18530 + }, + { + "epoch": 23.78818998716303, + "grad_norm": 1.3804240226745605, + "learning_rate": 2.5407359863072314e-05, + "loss": 0.3648, + "step": 18531 + }, + { + "epoch": 23.789473684210527, + "grad_norm": 1.5048091411590576, + "learning_rate": 2.5406931964056485e-05, + "loss": 0.3897, + "step": 18532 + }, + { + "epoch": 23.79075738125802, + "grad_norm": 1.5247581005096436, + "learning_rate": 2.540650406504065e-05, + "loss": 0.4305, + "step": 18533 + }, + { + "epoch": 23.79204107830552, + "grad_norm": 1.7403802871704102, + "learning_rate": 2.540607616602482e-05, + "loss": 0.3632, + "step": 18534 + }, + { + "epoch": 23.793324775353017, + "grad_norm": 3.596583604812622, + "learning_rate": 2.5405648267008987e-05, + "loss": 0.3596, + "step": 18535 + }, + { + "epoch": 23.794608472400512, + "grad_norm": 2.2508883476257324, + "learning_rate": 2.5405220367993152e-05, + "loss": 0.348, + "step": 18536 + }, + { + "epoch": 23.79589216944801, + "grad_norm": 1.9727225303649902, + "learning_rate": 2.5404792468977324e-05, + "loss": 0.3943, + "step": 18537 + }, + { + "epoch": 23.797175866495508, + "grad_norm": 1.2500958442687988, + "learning_rate": 2.540436456996149e-05, + "loss": 0.372, + "step": 18538 + }, + { + "epoch": 23.798459563543005, + "grad_norm": 0.8927134275436401, + "learning_rate": 2.5403936670945657e-05, + "loss": 0.3378, + "step": 18539 + }, + { + "epoch": 23.7997432605905, + "grad_norm": 1.4809861183166504, + "learning_rate": 2.5403508771929826e-05, + "loss": 0.3644, + "step": 18540 + }, + { + "epoch": 23.801026957637998, + "grad_norm": 1.105387568473816, + "learning_rate": 2.5403080872913994e-05, + "loss": 0.3947, + "step": 18541 + }, + { + "epoch": 23.802310654685495, + "grad_norm": 1.6579160690307617, + "learning_rate": 2.540265297389816e-05, + "loss": 0.379, + "step": 18542 + }, + { 
+ "epoch": 23.80359435173299, + "grad_norm": 0.9967074394226074, + "learning_rate": 2.5402225074882328e-05, + "loss": 0.3797, + "step": 18543 + }, + { + "epoch": 23.804878048780488, + "grad_norm": 1.0891865491867065, + "learning_rate": 2.5401797175866496e-05, + "loss": 0.3745, + "step": 18544 + }, + { + "epoch": 23.806161745827985, + "grad_norm": 1.2926312685012817, + "learning_rate": 2.540136927685066e-05, + "loss": 0.3933, + "step": 18545 + }, + { + "epoch": 23.80744544287548, + "grad_norm": 1.3198235034942627, + "learning_rate": 2.5400941377834833e-05, + "loss": 0.3831, + "step": 18546 + }, + { + "epoch": 23.808729139922978, + "grad_norm": 1.284926414489746, + "learning_rate": 2.5400513478818998e-05, + "loss": 0.3966, + "step": 18547 + }, + { + "epoch": 23.810012836970476, + "grad_norm": 1.175230622291565, + "learning_rate": 2.540008557980317e-05, + "loss": 0.3776, + "step": 18548 + }, + { + "epoch": 23.811296534017973, + "grad_norm": 4.551626205444336, + "learning_rate": 2.5399657680787335e-05, + "loss": 0.4444, + "step": 18549 + }, + { + "epoch": 23.812580231065468, + "grad_norm": 1.53264319896698, + "learning_rate": 2.53992297817715e-05, + "loss": 0.3955, + "step": 18550 + }, + { + "epoch": 23.813863928112966, + "grad_norm": 1.5186171531677246, + "learning_rate": 2.539880188275567e-05, + "loss": 0.3807, + "step": 18551 + }, + { + "epoch": 23.815147625160463, + "grad_norm": 1.8378732204437256, + "learning_rate": 2.5398373983739837e-05, + "loss": 0.3832, + "step": 18552 + }, + { + "epoch": 23.816431322207958, + "grad_norm": 2.8261938095092773, + "learning_rate": 2.5397946084724005e-05, + "loss": 0.3661, + "step": 18553 + }, + { + "epoch": 23.817715019255456, + "grad_norm": 2.636152982711792, + "learning_rate": 2.5397518185708173e-05, + "loss": 0.3751, + "step": 18554 + }, + { + "epoch": 23.818998716302954, + "grad_norm": 4.956929683685303, + "learning_rate": 2.5397090286692342e-05, + "loss": 0.4302, + "step": 18555 + }, + { + "epoch": 23.820282413350448, + 
"grad_norm": 1.852773904800415, + "learning_rate": 2.539666238767651e-05, + "loss": 0.3889, + "step": 18556 + }, + { + "epoch": 23.821566110397946, + "grad_norm": 1.5137197971343994, + "learning_rate": 2.5396234488660675e-05, + "loss": 0.3744, + "step": 18557 + }, + { + "epoch": 23.822849807445444, + "grad_norm": 1.230025291442871, + "learning_rate": 2.5395806589644844e-05, + "loss": 0.3763, + "step": 18558 + }, + { + "epoch": 23.82413350449294, + "grad_norm": 1.560242772102356, + "learning_rate": 2.5395378690629012e-05, + "loss": 0.4628, + "step": 18559 + }, + { + "epoch": 23.825417201540436, + "grad_norm": 1.3569504022598267, + "learning_rate": 2.539495079161318e-05, + "loss": 0.4288, + "step": 18560 + }, + { + "epoch": 23.826700898587934, + "grad_norm": 1.4057594537734985, + "learning_rate": 2.5394522892597346e-05, + "loss": 0.4194, + "step": 18561 + }, + { + "epoch": 23.82798459563543, + "grad_norm": 2.1551694869995117, + "learning_rate": 2.5394094993581517e-05, + "loss": 0.4186, + "step": 18562 + }, + { + "epoch": 23.829268292682926, + "grad_norm": 1.4324290752410889, + "learning_rate": 2.5393667094565682e-05, + "loss": 0.4404, + "step": 18563 + }, + { + "epoch": 23.830551989730424, + "grad_norm": 2.0697519779205322, + "learning_rate": 2.539323919554985e-05, + "loss": 0.4306, + "step": 18564 + }, + { + "epoch": 23.83183568677792, + "grad_norm": 2.078536033630371, + "learning_rate": 2.539281129653402e-05, + "loss": 0.4729, + "step": 18565 + }, + { + "epoch": 23.833119383825416, + "grad_norm": 2.7756803035736084, + "learning_rate": 2.5392383397518184e-05, + "loss": 0.4624, + "step": 18566 + }, + { + "epoch": 23.834403080872914, + "grad_norm": 1.6496742963790894, + "learning_rate": 2.5391955498502356e-05, + "loss": 0.5722, + "step": 18567 + }, + { + "epoch": 23.83568677792041, + "grad_norm": 1.054200530052185, + "learning_rate": 2.539152759948652e-05, + "loss": 0.3574, + "step": 18568 + }, + { + "epoch": 23.836970474967906, + "grad_norm": 3.2460973262786865, + 
"learning_rate": 2.539109970047069e-05, + "loss": 0.3712, + "step": 18569 + }, + { + "epoch": 23.838254172015404, + "grad_norm": 1.1667814254760742, + "learning_rate": 2.5390671801454858e-05, + "loss": 0.3688, + "step": 18570 + }, + { + "epoch": 23.8395378690629, + "grad_norm": 0.9588155746459961, + "learning_rate": 2.5390243902439023e-05, + "loss": 0.3845, + "step": 18571 + }, + { + "epoch": 23.8408215661104, + "grad_norm": 2.3453049659729004, + "learning_rate": 2.5389816003423195e-05, + "loss": 0.3548, + "step": 18572 + }, + { + "epoch": 23.842105263157894, + "grad_norm": 1.5707781314849854, + "learning_rate": 2.538938810440736e-05, + "loss": 0.3516, + "step": 18573 + }, + { + "epoch": 23.84338896020539, + "grad_norm": 2.394421100616455, + "learning_rate": 2.5388960205391528e-05, + "loss": 0.3815, + "step": 18574 + }, + { + "epoch": 23.84467265725289, + "grad_norm": 0.9287833571434021, + "learning_rate": 2.5388532306375697e-05, + "loss": 0.3475, + "step": 18575 + }, + { + "epoch": 23.845956354300384, + "grad_norm": 5.898213863372803, + "learning_rate": 2.5388104407359865e-05, + "loss": 0.3719, + "step": 18576 + }, + { + "epoch": 23.84724005134788, + "grad_norm": 1.672106385231018, + "learning_rate": 2.538767650834403e-05, + "loss": 0.4393, + "step": 18577 + }, + { + "epoch": 23.84852374839538, + "grad_norm": 1.5041428804397583, + "learning_rate": 2.53872486093282e-05, + "loss": 0.3758, + "step": 18578 + }, + { + "epoch": 23.849807445442874, + "grad_norm": 2.7557506561279297, + "learning_rate": 2.5386820710312367e-05, + "loss": 0.3934, + "step": 18579 + }, + { + "epoch": 23.85109114249037, + "grad_norm": 1.5894149541854858, + "learning_rate": 2.5386392811296535e-05, + "loss": 0.3823, + "step": 18580 + }, + { + "epoch": 23.85237483953787, + "grad_norm": 1.410500168800354, + "learning_rate": 2.5385964912280704e-05, + "loss": 0.416, + "step": 18581 + }, + { + "epoch": 23.853658536585368, + "grad_norm": 2.6846017837524414, + "learning_rate": 2.538553701326487e-05, + 
"loss": 0.363, + "step": 18582 + }, + { + "epoch": 23.854942233632862, + "grad_norm": 1.1827702522277832, + "learning_rate": 2.538510911424904e-05, + "loss": 0.3664, + "step": 18583 + }, + { + "epoch": 23.85622593068036, + "grad_norm": 1.124232530593872, + "learning_rate": 2.5384681215233206e-05, + "loss": 0.4346, + "step": 18584 + }, + { + "epoch": 23.857509627727858, + "grad_norm": 1.7627220153808594, + "learning_rate": 2.538425331621737e-05, + "loss": 0.4221, + "step": 18585 + }, + { + "epoch": 23.858793324775352, + "grad_norm": 1.2884095907211304, + "learning_rate": 2.5383825417201542e-05, + "loss": 0.3767, + "step": 18586 + }, + { + "epoch": 23.86007702182285, + "grad_norm": 1.6324915885925293, + "learning_rate": 2.5383397518185707e-05, + "loss": 0.3693, + "step": 18587 + }, + { + "epoch": 23.861360718870348, + "grad_norm": 1.116228461265564, + "learning_rate": 2.538296961916988e-05, + "loss": 0.3548, + "step": 18588 + }, + { + "epoch": 23.862644415917842, + "grad_norm": 3.5385801792144775, + "learning_rate": 2.5382541720154044e-05, + "loss": 0.3777, + "step": 18589 + }, + { + "epoch": 23.86392811296534, + "grad_norm": 1.8497296571731567, + "learning_rate": 2.538211382113821e-05, + "loss": 0.3931, + "step": 18590 + }, + { + "epoch": 23.865211810012838, + "grad_norm": 1.244827151298523, + "learning_rate": 2.538168592212238e-05, + "loss": 0.3735, + "step": 18591 + }, + { + "epoch": 23.866495507060336, + "grad_norm": 1.2301180362701416, + "learning_rate": 2.5381258023106546e-05, + "loss": 0.3923, + "step": 18592 + }, + { + "epoch": 23.86777920410783, + "grad_norm": 1.841412901878357, + "learning_rate": 2.5380830124090714e-05, + "loss": 0.4066, + "step": 18593 + }, + { + "epoch": 23.869062901155328, + "grad_norm": 1.8824682235717773, + "learning_rate": 2.5380402225074883e-05, + "loss": 0.3726, + "step": 18594 + }, + { + "epoch": 23.870346598202826, + "grad_norm": 0.9477377533912659, + "learning_rate": 2.537997432605905e-05, + "loss": 0.3465, + "step": 18595 + }, + 
{ + "epoch": 23.87163029525032, + "grad_norm": 1.4979865550994873, + "learning_rate": 2.537954642704322e-05, + "loss": 0.3662, + "step": 18596 + }, + { + "epoch": 23.872913992297818, + "grad_norm": 4.890566825866699, + "learning_rate": 2.5379118528027385e-05, + "loss": 0.3826, + "step": 18597 + }, + { + "epoch": 23.874197689345316, + "grad_norm": 1.5360726118087769, + "learning_rate": 2.5378690629011553e-05, + "loss": 0.3848, + "step": 18598 + }, + { + "epoch": 23.87548138639281, + "grad_norm": 2.0136208534240723, + "learning_rate": 2.537826272999572e-05, + "loss": 0.362, + "step": 18599 + }, + { + "epoch": 23.876765083440308, + "grad_norm": 1.275683045387268, + "learning_rate": 2.537783483097989e-05, + "loss": 0.3903, + "step": 18600 + }, + { + "epoch": 23.878048780487806, + "grad_norm": 0.9393839240074158, + "learning_rate": 2.5377406931964055e-05, + "loss": 0.3789, + "step": 18601 + }, + { + "epoch": 23.8793324775353, + "grad_norm": 1.3997626304626465, + "learning_rate": 2.5376979032948227e-05, + "loss": 0.4129, + "step": 18602 + }, + { + "epoch": 23.880616174582798, + "grad_norm": 1.0040912628173828, + "learning_rate": 2.5376551133932392e-05, + "loss": 0.3829, + "step": 18603 + }, + { + "epoch": 23.881899871630296, + "grad_norm": 2.237783432006836, + "learning_rate": 2.537612323491656e-05, + "loss": 0.4045, + "step": 18604 + }, + { + "epoch": 23.883183568677794, + "grad_norm": 1.173824429512024, + "learning_rate": 2.537569533590073e-05, + "loss": 0.3825, + "step": 18605 + }, + { + "epoch": 23.884467265725288, + "grad_norm": 1.9750311374664307, + "learning_rate": 2.5375267436884894e-05, + "loss": 0.3582, + "step": 18606 + }, + { + "epoch": 23.885750962772786, + "grad_norm": 1.7786880731582642, + "learning_rate": 2.5374839537869065e-05, + "loss": 0.4071, + "step": 18607 + }, + { + "epoch": 23.887034659820284, + "grad_norm": 2.1466944217681885, + "learning_rate": 2.537441163885323e-05, + "loss": 0.3887, + "step": 18608 + }, + { + "epoch": 23.888318356867778, + 
"grad_norm": 1.5982398986816406, + "learning_rate": 2.53739837398374e-05, + "loss": 0.3931, + "step": 18609 + }, + { + "epoch": 23.889602053915276, + "grad_norm": 1.159561276435852, + "learning_rate": 2.5373555840821567e-05, + "loss": 0.4198, + "step": 18610 + }, + { + "epoch": 23.890885750962774, + "grad_norm": 1.977866291999817, + "learning_rate": 2.5373127941805732e-05, + "loss": 0.4649, + "step": 18611 + }, + { + "epoch": 23.892169448010268, + "grad_norm": 1.2130985260009766, + "learning_rate": 2.5372700042789904e-05, + "loss": 0.4549, + "step": 18612 + }, + { + "epoch": 23.893453145057766, + "grad_norm": 1.5887655019760132, + "learning_rate": 2.537227214377407e-05, + "loss": 0.4528, + "step": 18613 + }, + { + "epoch": 23.894736842105264, + "grad_norm": 1.7710692882537842, + "learning_rate": 2.5371844244758238e-05, + "loss": 0.4122, + "step": 18614 + }, + { + "epoch": 23.89602053915276, + "grad_norm": 1.6558246612548828, + "learning_rate": 2.5371416345742406e-05, + "loss": 0.4506, + "step": 18615 + }, + { + "epoch": 23.897304236200256, + "grad_norm": 2.4402153491973877, + "learning_rate": 2.5370988446726574e-05, + "loss": 0.5182, + "step": 18616 + }, + { + "epoch": 23.898587933247754, + "grad_norm": 1.834001898765564, + "learning_rate": 2.537056054771074e-05, + "loss": 0.5797, + "step": 18617 + }, + { + "epoch": 23.89987163029525, + "grad_norm": 1.0430881977081299, + "learning_rate": 2.5370132648694908e-05, + "loss": 0.3497, + "step": 18618 + }, + { + "epoch": 23.901155327342746, + "grad_norm": 0.8616861701011658, + "learning_rate": 2.5369704749679076e-05, + "loss": 0.3418, + "step": 18619 + }, + { + "epoch": 23.902439024390244, + "grad_norm": 2.2721002101898193, + "learning_rate": 2.5369276850663245e-05, + "loss": 0.3885, + "step": 18620 + }, + { + "epoch": 23.90372272143774, + "grad_norm": 6.219088077545166, + "learning_rate": 2.5368848951647413e-05, + "loss": 0.3685, + "step": 18621 + }, + { + "epoch": 23.905006418485236, + "grad_norm": 3.381986141204834, + 
"learning_rate": 2.5368421052631578e-05, + "loss": 0.3778, + "step": 18622 + }, + { + "epoch": 23.906290115532734, + "grad_norm": 1.3499497175216675, + "learning_rate": 2.536799315361575e-05, + "loss": 0.3666, + "step": 18623 + }, + { + "epoch": 23.90757381258023, + "grad_norm": 1.768902063369751, + "learning_rate": 2.5367565254599915e-05, + "loss": 0.3781, + "step": 18624 + }, + { + "epoch": 23.90885750962773, + "grad_norm": 0.7957895398139954, + "learning_rate": 2.536713735558408e-05, + "loss": 0.3654, + "step": 18625 + }, + { + "epoch": 23.910141206675224, + "grad_norm": 1.8024563789367676, + "learning_rate": 2.536670945656825e-05, + "loss": 0.3705, + "step": 18626 + }, + { + "epoch": 23.911424903722722, + "grad_norm": 1.8351078033447266, + "learning_rate": 2.5366281557552417e-05, + "loss": 0.3595, + "step": 18627 + }, + { + "epoch": 23.91270860077022, + "grad_norm": 0.9268519878387451, + "learning_rate": 2.536585365853659e-05, + "loss": 0.38, + "step": 18628 + }, + { + "epoch": 23.913992297817714, + "grad_norm": 2.527316093444824, + "learning_rate": 2.5365425759520754e-05, + "loss": 0.3946, + "step": 18629 + }, + { + "epoch": 23.915275994865212, + "grad_norm": 0.9644090533256531, + "learning_rate": 2.5364997860504922e-05, + "loss": 0.3771, + "step": 18630 + }, + { + "epoch": 23.91655969191271, + "grad_norm": 2.746925115585327, + "learning_rate": 2.536456996148909e-05, + "loss": 0.3923, + "step": 18631 + }, + { + "epoch": 23.917843388960204, + "grad_norm": 1.2404550313949585, + "learning_rate": 2.5364142062473255e-05, + "loss": 0.3632, + "step": 18632 + }, + { + "epoch": 23.919127086007702, + "grad_norm": 1.3347525596618652, + "learning_rate": 2.5363714163457424e-05, + "loss": 0.3861, + "step": 18633 + }, + { + "epoch": 23.9204107830552, + "grad_norm": 1.3654112815856934, + "learning_rate": 2.5363286264441592e-05, + "loss": 0.4246, + "step": 18634 + }, + { + "epoch": 23.921694480102694, + "grad_norm": 1.660218596458435, + "learning_rate": 2.536285836542576e-05, 
+ "loss": 0.3619, + "step": 18635 + }, + { + "epoch": 23.922978177150192, + "grad_norm": 1.36130690574646, + "learning_rate": 2.536243046640993e-05, + "loss": 0.38, + "step": 18636 + }, + { + "epoch": 23.92426187419769, + "grad_norm": 1.5356571674346924, + "learning_rate": 2.5362002567394097e-05, + "loss": 0.3879, + "step": 18637 + }, + { + "epoch": 23.925545571245188, + "grad_norm": 1.9184811115264893, + "learning_rate": 2.5361574668378262e-05, + "loss": 0.3536, + "step": 18638 + }, + { + "epoch": 23.926829268292682, + "grad_norm": 1.5249390602111816, + "learning_rate": 2.536114676936243e-05, + "loss": 0.3513, + "step": 18639 + }, + { + "epoch": 23.92811296534018, + "grad_norm": 4.285731792449951, + "learning_rate": 2.53607188703466e-05, + "loss": 0.4072, + "step": 18640 + }, + { + "epoch": 23.929396662387678, + "grad_norm": 2.150643825531006, + "learning_rate": 2.5360290971330764e-05, + "loss": 0.4147, + "step": 18641 + }, + { + "epoch": 23.930680359435172, + "grad_norm": 1.1002949476242065, + "learning_rate": 2.5359863072314936e-05, + "loss": 0.3898, + "step": 18642 + }, + { + "epoch": 23.93196405648267, + "grad_norm": 2.0494675636291504, + "learning_rate": 2.53594351732991e-05, + "loss": 0.4081, + "step": 18643 + }, + { + "epoch": 23.933247753530168, + "grad_norm": 5.12990140914917, + "learning_rate": 2.5359007274283273e-05, + "loss": 0.3571, + "step": 18644 + }, + { + "epoch": 23.934531450577662, + "grad_norm": 1.8674957752227783, + "learning_rate": 2.5358579375267438e-05, + "loss": 0.365, + "step": 18645 + }, + { + "epoch": 23.93581514762516, + "grad_norm": 1.2735763788223267, + "learning_rate": 2.5358151476251603e-05, + "loss": 0.4044, + "step": 18646 + }, + { + "epoch": 23.937098844672658, + "grad_norm": 1.1820311546325684, + "learning_rate": 2.5357723577235775e-05, + "loss": 0.3646, + "step": 18647 + }, + { + "epoch": 23.938382541720156, + "grad_norm": 2.035681962966919, + "learning_rate": 2.535729567821994e-05, + "loss": 0.3994, + "step": 18648 + }, + { + 
"epoch": 23.93966623876765, + "grad_norm": 1.4006034135818481, + "learning_rate": 2.5356867779204108e-05, + "loss": 0.4254, + "step": 18649 + }, + { + "epoch": 23.940949935815148, + "grad_norm": 2.118426561355591, + "learning_rate": 2.5356439880188277e-05, + "loss": 0.4039, + "step": 18650 + }, + { + "epoch": 23.942233632862646, + "grad_norm": 2.7154743671417236, + "learning_rate": 2.535601198117244e-05, + "loss": 0.3962, + "step": 18651 + }, + { + "epoch": 23.94351732991014, + "grad_norm": 2.7507388591766357, + "learning_rate": 2.5355584082156613e-05, + "loss": 0.4043, + "step": 18652 + }, + { + "epoch": 23.944801026957638, + "grad_norm": 2.435096025466919, + "learning_rate": 2.535515618314078e-05, + "loss": 0.4203, + "step": 18653 + }, + { + "epoch": 23.946084724005136, + "grad_norm": 2.0605010986328125, + "learning_rate": 2.5354728284124947e-05, + "loss": 0.3845, + "step": 18654 + }, + { + "epoch": 23.94736842105263, + "grad_norm": 3.3287880420684814, + "learning_rate": 2.5354300385109115e-05, + "loss": 0.4045, + "step": 18655 + }, + { + "epoch": 23.948652118100128, + "grad_norm": 1.6840428113937378, + "learning_rate": 2.5353872486093284e-05, + "loss": 0.4171, + "step": 18656 + }, + { + "epoch": 23.949935815147626, + "grad_norm": 1.1370656490325928, + "learning_rate": 2.535344458707745e-05, + "loss": 0.3858, + "step": 18657 + }, + { + "epoch": 23.951219512195124, + "grad_norm": 1.8186601400375366, + "learning_rate": 2.5353016688061617e-05, + "loss": 0.4146, + "step": 18658 + }, + { + "epoch": 23.952503209242618, + "grad_norm": 1.442722201347351, + "learning_rate": 2.5352588789045786e-05, + "loss": 0.4123, + "step": 18659 + }, + { + "epoch": 23.953786906290116, + "grad_norm": 1.2846200466156006, + "learning_rate": 2.5352160890029954e-05, + "loss": 0.3976, + "step": 18660 + }, + { + "epoch": 23.955070603337614, + "grad_norm": 1.4393267631530762, + "learning_rate": 2.5351732991014122e-05, + "loss": 0.381, + "step": 18661 + }, + { + "epoch": 23.956354300385108, + 
"grad_norm": 1.1386642456054688, + "learning_rate": 2.5351305091998287e-05, + "loss": 0.3822, + "step": 18662 + }, + { + "epoch": 23.957637997432606, + "grad_norm": 1.459107756614685, + "learning_rate": 2.535087719298246e-05, + "loss": 0.4822, + "step": 18663 + }, + { + "epoch": 23.958921694480104, + "grad_norm": 3.9931435585021973, + "learning_rate": 2.5350449293966624e-05, + "loss": 0.4582, + "step": 18664 + }, + { + "epoch": 23.960205391527598, + "grad_norm": 2.5589773654937744, + "learning_rate": 2.535002139495079e-05, + "loss": 0.4522, + "step": 18665 + }, + { + "epoch": 23.961489088575096, + "grad_norm": 1.611991047859192, + "learning_rate": 2.534959349593496e-05, + "loss": 0.5177, + "step": 18666 + }, + { + "epoch": 23.962772785622594, + "grad_norm": 3.2782840728759766, + "learning_rate": 2.5349165596919126e-05, + "loss": 0.6066, + "step": 18667 + }, + { + "epoch": 23.964056482670088, + "grad_norm": 1.3824862241744995, + "learning_rate": 2.5348737697903298e-05, + "loss": 0.3748, + "step": 18668 + }, + { + "epoch": 23.965340179717586, + "grad_norm": 0.9496781229972839, + "learning_rate": 2.5348309798887463e-05, + "loss": 0.3558, + "step": 18669 + }, + { + "epoch": 23.966623876765084, + "grad_norm": 1.3946330547332764, + "learning_rate": 2.534788189987163e-05, + "loss": 0.3708, + "step": 18670 + }, + { + "epoch": 23.96790757381258, + "grad_norm": 1.9573194980621338, + "learning_rate": 2.53474540008558e-05, + "loss": 0.3767, + "step": 18671 + }, + { + "epoch": 23.969191270860076, + "grad_norm": 0.8146377205848694, + "learning_rate": 2.5347026101839965e-05, + "loss": 0.3773, + "step": 18672 + }, + { + "epoch": 23.970474967907574, + "grad_norm": 3.164844036102295, + "learning_rate": 2.5346598202824133e-05, + "loss": 0.3773, + "step": 18673 + }, + { + "epoch": 23.971758664955072, + "grad_norm": 1.408555507659912, + "learning_rate": 2.53461703038083e-05, + "loss": 0.3755, + "step": 18674 + }, + { + "epoch": 23.973042362002566, + "grad_norm": 1.455188512802124, + 
"learning_rate": 2.534574240479247e-05, + "loss": 0.3448, + "step": 18675 + }, + { + "epoch": 23.974326059050064, + "grad_norm": 1.5316240787506104, + "learning_rate": 2.534531450577664e-05, + "loss": 0.3677, + "step": 18676 + }, + { + "epoch": 23.975609756097562, + "grad_norm": 1.2820820808410645, + "learning_rate": 2.5344886606760807e-05, + "loss": 0.391, + "step": 18677 + }, + { + "epoch": 23.976893453145056, + "grad_norm": 0.8959318995475769, + "learning_rate": 2.5344458707744972e-05, + "loss": 0.3885, + "step": 18678 + }, + { + "epoch": 23.978177150192554, + "grad_norm": 3.0586371421813965, + "learning_rate": 2.534403080872914e-05, + "loss": 0.3624, + "step": 18679 + }, + { + "epoch": 23.979460847240052, + "grad_norm": 6.6066460609436035, + "learning_rate": 2.534360290971331e-05, + "loss": 0.3917, + "step": 18680 + }, + { + "epoch": 23.98074454428755, + "grad_norm": 1.7975924015045166, + "learning_rate": 2.5343175010697474e-05, + "loss": 0.417, + "step": 18681 + }, + { + "epoch": 23.982028241335044, + "grad_norm": 1.3837844133377075, + "learning_rate": 2.5342747111681645e-05, + "loss": 0.3797, + "step": 18682 + }, + { + "epoch": 23.983311938382542, + "grad_norm": 1.90226411819458, + "learning_rate": 2.534231921266581e-05, + "loss": 0.4213, + "step": 18683 + }, + { + "epoch": 23.98459563543004, + "grad_norm": 1.658351182937622, + "learning_rate": 2.5341891313649982e-05, + "loss": 0.3903, + "step": 18684 + }, + { + "epoch": 23.985879332477534, + "grad_norm": 3.2517030239105225, + "learning_rate": 2.5341463414634147e-05, + "loss": 0.4026, + "step": 18685 + }, + { + "epoch": 23.987163029525032, + "grad_norm": 1.6241601705551147, + "learning_rate": 2.5341035515618312e-05, + "loss": 0.3632, + "step": 18686 + }, + { + "epoch": 23.98844672657253, + "grad_norm": 1.6850389242172241, + "learning_rate": 2.5340607616602484e-05, + "loss": 0.4079, + "step": 18687 + }, + { + "epoch": 23.989730423620024, + "grad_norm": 1.1197196245193481, + "learning_rate": 
2.534017971758665e-05, + "loss": 0.4227, + "step": 18688 + }, + { + "epoch": 23.991014120667522, + "grad_norm": 1.5422217845916748, + "learning_rate": 2.5339751818570818e-05, + "loss": 0.4094, + "step": 18689 + }, + { + "epoch": 23.99229781771502, + "grad_norm": 2.254950523376465, + "learning_rate": 2.5339323919554986e-05, + "loss": 0.399, + "step": 18690 + }, + { + "epoch": 23.993581514762518, + "grad_norm": 1.6888835430145264, + "learning_rate": 2.5338896020539154e-05, + "loss": 0.3834, + "step": 18691 + }, + { + "epoch": 23.994865211810012, + "grad_norm": 3.9453577995300293, + "learning_rate": 2.5338468121523323e-05, + "loss": 0.409, + "step": 18692 + }, + { + "epoch": 23.99614890885751, + "grad_norm": 1.8444983959197998, + "learning_rate": 2.5338040222507488e-05, + "loss": 0.4164, + "step": 18693 + }, + { + "epoch": 23.997432605905008, + "grad_norm": 2.772399663925171, + "learning_rate": 2.5337612323491656e-05, + "loss": 0.4221, + "step": 18694 + }, + { + "epoch": 23.998716302952502, + "grad_norm": 1.7146694660186768, + "learning_rate": 2.5337184424475825e-05, + "loss": 0.4664, + "step": 18695 + }, + { + "epoch": 24.0, + "grad_norm": 4.721467018127441, + "learning_rate": 2.5336756525459993e-05, + "loss": 0.5727, + "step": 18696 + }, + { + "epoch": 24.001283697047498, + "grad_norm": 1.0975478887557983, + "learning_rate": 2.5336328626444158e-05, + "loss": 0.3264, + "step": 18697 + }, + { + "epoch": 24.002567394094992, + "grad_norm": 1.4985271692276, + "learning_rate": 2.533590072742833e-05, + "loss": 0.3376, + "step": 18698 + }, + { + "epoch": 24.00385109114249, + "grad_norm": 1.333166480064392, + "learning_rate": 2.5335472828412495e-05, + "loss": 0.3483, + "step": 18699 + }, + { + "epoch": 24.005134788189988, + "grad_norm": 4.072522163391113, + "learning_rate": 2.5335044929396663e-05, + "loss": 0.3807, + "step": 18700 + }, + { + "epoch": 24.006418485237482, + "grad_norm": 1.112274408340454, + "learning_rate": 2.5334617030380832e-05, + "loss": 0.3598, + "step": 
18701 + }, + { + "epoch": 24.00770218228498, + "grad_norm": 1.2969214916229248, + "learning_rate": 2.5334189131364997e-05, + "loss": 0.3436, + "step": 18702 + }, + { + "epoch": 24.008985879332478, + "grad_norm": 1.5930509567260742, + "learning_rate": 2.533376123234917e-05, + "loss": 0.3519, + "step": 18703 + }, + { + "epoch": 24.010269576379976, + "grad_norm": 5.033453941345215, + "learning_rate": 2.5333333333333334e-05, + "loss": 0.3303, + "step": 18704 + }, + { + "epoch": 24.01155327342747, + "grad_norm": 1.493969440460205, + "learning_rate": 2.5332905434317502e-05, + "loss": 0.3454, + "step": 18705 + }, + { + "epoch": 24.012836970474968, + "grad_norm": 1.436909794807434, + "learning_rate": 2.533247753530167e-05, + "loss": 0.3437, + "step": 18706 + }, + { + "epoch": 24.014120667522466, + "grad_norm": 1.1341222524642944, + "learning_rate": 2.5332049636285835e-05, + "loss": 0.3344, + "step": 18707 + }, + { + "epoch": 24.01540436456996, + "grad_norm": 1.1479436159133911, + "learning_rate": 2.5331621737270007e-05, + "loss": 0.3574, + "step": 18708 + }, + { + "epoch": 24.016688061617458, + "grad_norm": 1.200290322303772, + "learning_rate": 2.5331193838254172e-05, + "loss": 0.354, + "step": 18709 + }, + { + "epoch": 24.017971758664956, + "grad_norm": 1.1256237030029297, + "learning_rate": 2.533076593923834e-05, + "loss": 0.4032, + "step": 18710 + }, + { + "epoch": 24.01925545571245, + "grad_norm": 1.3223423957824707, + "learning_rate": 2.533033804022251e-05, + "loss": 0.3612, + "step": 18711 + }, + { + "epoch": 24.020539152759948, + "grad_norm": 1.0778889656066895, + "learning_rate": 2.5329910141206674e-05, + "loss": 0.3474, + "step": 18712 + }, + { + "epoch": 24.021822849807446, + "grad_norm": 1.2171614170074463, + "learning_rate": 2.5329482242190843e-05, + "loss": 0.3398, + "step": 18713 + }, + { + "epoch": 24.023106546854944, + "grad_norm": 2.2203927040100098, + "learning_rate": 2.532905434317501e-05, + "loss": 0.3202, + "step": 18714 + }, + { + "epoch": 
24.024390243902438, + "grad_norm": 1.3058054447174072, + "learning_rate": 2.532862644415918e-05, + "loss": 0.3179, + "step": 18715 + }, + { + "epoch": 24.025673940949936, + "grad_norm": 0.9237610101699829, + "learning_rate": 2.5328198545143348e-05, + "loss": 0.3308, + "step": 18716 + }, + { + "epoch": 24.026957637997434, + "grad_norm": 2.0614397525787354, + "learning_rate": 2.5327770646127516e-05, + "loss": 0.364, + "step": 18717 + }, + { + "epoch": 24.028241335044928, + "grad_norm": 2.9675562381744385, + "learning_rate": 2.532734274711168e-05, + "loss": 0.3764, + "step": 18718 + }, + { + "epoch": 24.029525032092426, + "grad_norm": 3.4919447898864746, + "learning_rate": 2.532691484809585e-05, + "loss": 0.3541, + "step": 18719 + }, + { + "epoch": 24.030808729139924, + "grad_norm": 1.2633477449417114, + "learning_rate": 2.5326486949080018e-05, + "loss": 0.3204, + "step": 18720 + }, + { + "epoch": 24.03209242618742, + "grad_norm": 5.6682844161987305, + "learning_rate": 2.5326059050064183e-05, + "loss": 0.3376, + "step": 18721 + }, + { + "epoch": 24.033376123234916, + "grad_norm": 1.4673713445663452, + "learning_rate": 2.5325631151048355e-05, + "loss": 0.3336, + "step": 18722 + }, + { + "epoch": 24.034659820282414, + "grad_norm": 1.3415168523788452, + "learning_rate": 2.532520325203252e-05, + "loss": 0.388, + "step": 18723 + }, + { + "epoch": 24.035943517329912, + "grad_norm": 1.8137311935424805, + "learning_rate": 2.532477535301669e-05, + "loss": 0.3842, + "step": 18724 + }, + { + "epoch": 24.037227214377406, + "grad_norm": 0.9701972603797913, + "learning_rate": 2.5324347454000857e-05, + "loss": 0.3238, + "step": 18725 + }, + { + "epoch": 24.038510911424904, + "grad_norm": 1.1709773540496826, + "learning_rate": 2.5323919554985022e-05, + "loss": 0.3413, + "step": 18726 + }, + { + "epoch": 24.039794608472402, + "grad_norm": 3.2927680015563965, + "learning_rate": 2.5323491655969194e-05, + "loss": 0.3494, + "step": 18727 + }, + { + "epoch": 24.041078305519896, + 
"grad_norm": 1.4549579620361328, + "learning_rate": 2.532306375695336e-05, + "loss": 0.3057, + "step": 18728 + }, + { + "epoch": 24.042362002567394, + "grad_norm": 2.792867422103882, + "learning_rate": 2.5322635857937527e-05, + "loss": 0.3944, + "step": 18729 + }, + { + "epoch": 24.043645699614892, + "grad_norm": 2.0017054080963135, + "learning_rate": 2.5322207958921695e-05, + "loss": 0.3978, + "step": 18730 + }, + { + "epoch": 24.044929396662386, + "grad_norm": 1.3966039419174194, + "learning_rate": 2.5321780059905864e-05, + "loss": 0.3815, + "step": 18731 + }, + { + "epoch": 24.046213093709884, + "grad_norm": 2.5898611545562744, + "learning_rate": 2.5321352160890032e-05, + "loss": 0.3466, + "step": 18732 + }, + { + "epoch": 24.047496790757382, + "grad_norm": 1.6304739713668823, + "learning_rate": 2.5320924261874197e-05, + "loss": 0.3761, + "step": 18733 + }, + { + "epoch": 24.048780487804876, + "grad_norm": 2.1714563369750977, + "learning_rate": 2.5320496362858366e-05, + "loss": 0.3983, + "step": 18734 + }, + { + "epoch": 24.050064184852374, + "grad_norm": 2.3491761684417725, + "learning_rate": 2.5320068463842534e-05, + "loss": 0.3803, + "step": 18735 + }, + { + "epoch": 24.051347881899872, + "grad_norm": 1.3174245357513428, + "learning_rate": 2.5319640564826702e-05, + "loss": 0.3297, + "step": 18736 + }, + { + "epoch": 24.05263157894737, + "grad_norm": 1.1678065061569214, + "learning_rate": 2.5319212665810867e-05, + "loss": 0.4155, + "step": 18737 + }, + { + "epoch": 24.053915275994864, + "grad_norm": 2.229779005050659, + "learning_rate": 2.531878476679504e-05, + "loss": 0.3745, + "step": 18738 + }, + { + "epoch": 24.055198973042362, + "grad_norm": 1.4771997928619385, + "learning_rate": 2.5318356867779204e-05, + "loss": 0.4187, + "step": 18739 + }, + { + "epoch": 24.05648267008986, + "grad_norm": 6.140133380889893, + "learning_rate": 2.5317928968763373e-05, + "loss": 0.4025, + "step": 18740 + }, + { + "epoch": 24.057766367137354, + "grad_norm": 
2.1968278884887695, + "learning_rate": 2.531750106974754e-05, + "loss": 0.37, + "step": 18741 + }, + { + "epoch": 24.059050064184852, + "grad_norm": 8.402290344238281, + "learning_rate": 2.5317073170731706e-05, + "loss": 0.4066, + "step": 18742 + }, + { + "epoch": 24.06033376123235, + "grad_norm": 2.593372106552124, + "learning_rate": 2.5316645271715878e-05, + "loss": 0.4308, + "step": 18743 + }, + { + "epoch": 24.061617458279844, + "grad_norm": 2.444481372833252, + "learning_rate": 2.5316217372700043e-05, + "loss": 0.473, + "step": 18744 + }, + { + "epoch": 24.062901155327342, + "grad_norm": 4.781376361846924, + "learning_rate": 2.531578947368421e-05, + "loss": 0.5225, + "step": 18745 + }, + { + "epoch": 24.06418485237484, + "grad_norm": 5.126748085021973, + "learning_rate": 2.531536157466838e-05, + "loss": 0.6058, + "step": 18746 + }, + { + "epoch": 24.065468549422338, + "grad_norm": 1.3078821897506714, + "learning_rate": 2.5314933675652545e-05, + "loss": 0.3563, + "step": 18747 + }, + { + "epoch": 24.066752246469832, + "grad_norm": 0.8238951563835144, + "learning_rate": 2.5314505776636713e-05, + "loss": 0.3493, + "step": 18748 + }, + { + "epoch": 24.06803594351733, + "grad_norm": 1.664388656616211, + "learning_rate": 2.531407787762088e-05, + "loss": 0.4006, + "step": 18749 + }, + { + "epoch": 24.069319640564828, + "grad_norm": 1.2235419750213623, + "learning_rate": 2.531364997860505e-05, + "loss": 0.344, + "step": 18750 + }, + { + "epoch": 24.070603337612322, + "grad_norm": 0.9258381724357605, + "learning_rate": 2.531322207958922e-05, + "loss": 0.357, + "step": 18751 + }, + { + "epoch": 24.07188703465982, + "grad_norm": 1.5347797870635986, + "learning_rate": 2.5312794180573387e-05, + "loss": 0.3751, + "step": 18752 + }, + { + "epoch": 24.073170731707318, + "grad_norm": 0.9172223806381226, + "learning_rate": 2.5312366281557552e-05, + "loss": 0.3422, + "step": 18753 + }, + { + "epoch": 24.074454428754812, + "grad_norm": 1.4072099924087524, + "learning_rate": 
2.531193838254172e-05, + "loss": 0.3333, + "step": 18754 + }, + { + "epoch": 24.07573812580231, + "grad_norm": 1.330525279045105, + "learning_rate": 2.531151048352589e-05, + "loss": 0.3915, + "step": 18755 + }, + { + "epoch": 24.077021822849808, + "grad_norm": 5.3306708335876465, + "learning_rate": 2.5311082584510054e-05, + "loss": 0.3585, + "step": 18756 + }, + { + "epoch": 24.078305519897306, + "grad_norm": 1.591941237449646, + "learning_rate": 2.5310654685494226e-05, + "loss": 0.3918, + "step": 18757 + }, + { + "epoch": 24.0795892169448, + "grad_norm": 1.8753225803375244, + "learning_rate": 2.531022678647839e-05, + "loss": 0.3548, + "step": 18758 + }, + { + "epoch": 24.080872913992298, + "grad_norm": 1.170695185661316, + "learning_rate": 2.5309798887462562e-05, + "loss": 0.3656, + "step": 18759 + }, + { + "epoch": 24.082156611039796, + "grad_norm": 2.9518775939941406, + "learning_rate": 2.5309370988446727e-05, + "loss": 0.3644, + "step": 18760 + }, + { + "epoch": 24.08344030808729, + "grad_norm": 1.201367735862732, + "learning_rate": 2.5308943089430892e-05, + "loss": 0.3862, + "step": 18761 + }, + { + "epoch": 24.084724005134788, + "grad_norm": 1.5281708240509033, + "learning_rate": 2.5308515190415064e-05, + "loss": 0.3694, + "step": 18762 + }, + { + "epoch": 24.086007702182286, + "grad_norm": 4.088070869445801, + "learning_rate": 2.530808729139923e-05, + "loss": 0.365, + "step": 18763 + }, + { + "epoch": 24.08729139922978, + "grad_norm": 1.1676677465438843, + "learning_rate": 2.5307659392383398e-05, + "loss": 0.3444, + "step": 18764 + }, + { + "epoch": 24.088575096277278, + "grad_norm": 1.2797164916992188, + "learning_rate": 2.5307231493367566e-05, + "loss": 0.3208, + "step": 18765 + }, + { + "epoch": 24.089858793324776, + "grad_norm": 1.3876488208770752, + "learning_rate": 2.5306803594351734e-05, + "loss": 0.3628, + "step": 18766 + }, + { + "epoch": 24.09114249037227, + "grad_norm": 1.3701109886169434, + "learning_rate": 2.5306375695335903e-05, + "loss": 
0.3425, + "step": 18767 + }, + { + "epoch": 24.09242618741977, + "grad_norm": 1.9621413946151733, + "learning_rate": 2.5305947796320068e-05, + "loss": 0.3464, + "step": 18768 + }, + { + "epoch": 24.093709884467266, + "grad_norm": 1.4656648635864258, + "learning_rate": 2.5305519897304236e-05, + "loss": 0.34, + "step": 18769 + }, + { + "epoch": 24.094993581514764, + "grad_norm": 1.136480450630188, + "learning_rate": 2.5305091998288405e-05, + "loss": 0.362, + "step": 18770 + }, + { + "epoch": 24.09627727856226, + "grad_norm": 2.1378304958343506, + "learning_rate": 2.5304664099272573e-05, + "loss": 0.3454, + "step": 18771 + }, + { + "epoch": 24.097560975609756, + "grad_norm": 1.9790406227111816, + "learning_rate": 2.5304236200256738e-05, + "loss": 0.3895, + "step": 18772 + }, + { + "epoch": 24.098844672657254, + "grad_norm": 1.278159499168396, + "learning_rate": 2.530380830124091e-05, + "loss": 0.3445, + "step": 18773 + }, + { + "epoch": 24.10012836970475, + "grad_norm": 1.953881025314331, + "learning_rate": 2.5303380402225075e-05, + "loss": 0.3277, + "step": 18774 + }, + { + "epoch": 24.101412066752246, + "grad_norm": 1.4370189905166626, + "learning_rate": 2.5302952503209243e-05, + "loss": 0.3629, + "step": 18775 + }, + { + "epoch": 24.102695763799744, + "grad_norm": 1.162028431892395, + "learning_rate": 2.5302524604193412e-05, + "loss": 0.3674, + "step": 18776 + }, + { + "epoch": 24.10397946084724, + "grad_norm": 1.4182204008102417, + "learning_rate": 2.5302096705177577e-05, + "loss": 0.3779, + "step": 18777 + }, + { + "epoch": 24.105263157894736, + "grad_norm": 2.81437349319458, + "learning_rate": 2.530166880616175e-05, + "loss": 0.3452, + "step": 18778 + }, + { + "epoch": 24.106546854942234, + "grad_norm": 1.2699570655822754, + "learning_rate": 2.5301240907145914e-05, + "loss": 0.407, + "step": 18779 + }, + { + "epoch": 24.107830551989732, + "grad_norm": 3.312671184539795, + "learning_rate": 2.530081300813008e-05, + "loss": 0.4241, + "step": 18780 + }, + { + 
"epoch": 24.109114249037226, + "grad_norm": 1.4981683492660522, + "learning_rate": 2.530038510911425e-05, + "loss": 0.3748, + "step": 18781 + }, + { + "epoch": 24.110397946084724, + "grad_norm": 4.086285591125488, + "learning_rate": 2.5299957210098416e-05, + "loss": 0.3392, + "step": 18782 + }, + { + "epoch": 24.111681643132222, + "grad_norm": 1.6122633218765259, + "learning_rate": 2.5299529311082587e-05, + "loss": 0.3496, + "step": 18783 + }, + { + "epoch": 24.112965340179716, + "grad_norm": 1.1680856943130493, + "learning_rate": 2.5299101412066752e-05, + "loss": 0.3816, + "step": 18784 + }, + { + "epoch": 24.114249037227214, + "grad_norm": 1.4369347095489502, + "learning_rate": 2.529867351305092e-05, + "loss": 0.3861, + "step": 18785 + }, + { + "epoch": 24.115532734274712, + "grad_norm": 1.8760361671447754, + "learning_rate": 2.529824561403509e-05, + "loss": 0.3937, + "step": 18786 + }, + { + "epoch": 24.116816431322206, + "grad_norm": 0.9758501648902893, + "learning_rate": 2.5297817715019254e-05, + "loss": 0.3563, + "step": 18787 + }, + { + "epoch": 24.118100128369704, + "grad_norm": 2.2691965103149414, + "learning_rate": 2.5297389816003423e-05, + "loss": 0.447, + "step": 18788 + }, + { + "epoch": 24.119383825417202, + "grad_norm": 1.6299952268600464, + "learning_rate": 2.529696191698759e-05, + "loss": 0.3797, + "step": 18789 + }, + { + "epoch": 24.1206675224647, + "grad_norm": 1.2273441553115845, + "learning_rate": 2.529653401797176e-05, + "loss": 0.3684, + "step": 18790 + }, + { + "epoch": 24.121951219512194, + "grad_norm": 2.5388684272766113, + "learning_rate": 2.5296106118955928e-05, + "loss": 0.4142, + "step": 18791 + }, + { + "epoch": 24.123234916559692, + "grad_norm": 2.1562047004699707, + "learning_rate": 2.5295678219940096e-05, + "loss": 0.4216, + "step": 18792 + }, + { + "epoch": 24.12451861360719, + "grad_norm": 3.6935667991638184, + "learning_rate": 2.529525032092426e-05, + "loss": 0.3837, + "step": 18793 + }, + { + "epoch": 24.125802310654684, + 
"grad_norm": 4.042551040649414, + "learning_rate": 2.529482242190843e-05, + "loss": 0.4283, + "step": 18794 + }, + { + "epoch": 24.127086007702182, + "grad_norm": 1.7482330799102783, + "learning_rate": 2.5294394522892598e-05, + "loss": 0.4121, + "step": 18795 + }, + { + "epoch": 24.12836970474968, + "grad_norm": 2.1451175212860107, + "learning_rate": 2.5293966623876763e-05, + "loss": 0.5283, + "step": 18796 + }, + { + "epoch": 24.129653401797174, + "grad_norm": 1.2267768383026123, + "learning_rate": 2.5293538724860935e-05, + "loss": 0.3694, + "step": 18797 + }, + { + "epoch": 24.130937098844672, + "grad_norm": 1.7087191343307495, + "learning_rate": 2.52931108258451e-05, + "loss": 0.3449, + "step": 18798 + }, + { + "epoch": 24.13222079589217, + "grad_norm": 1.0084295272827148, + "learning_rate": 2.5292682926829272e-05, + "loss": 0.3459, + "step": 18799 + }, + { + "epoch": 24.133504492939664, + "grad_norm": 1.0845000743865967, + "learning_rate": 2.5292255027813437e-05, + "loss": 0.356, + "step": 18800 + }, + { + "epoch": 24.134788189987162, + "grad_norm": 1.1403342485427856, + "learning_rate": 2.5291827128797602e-05, + "loss": 0.3298, + "step": 18801 + }, + { + "epoch": 24.13607188703466, + "grad_norm": 1.1558791399002075, + "learning_rate": 2.5291399229781774e-05, + "loss": 0.3455, + "step": 18802 + }, + { + "epoch": 24.137355584082158, + "grad_norm": 1.52286958694458, + "learning_rate": 2.529097133076594e-05, + "loss": 0.3578, + "step": 18803 + }, + { + "epoch": 24.138639281129652, + "grad_norm": 0.840972900390625, + "learning_rate": 2.5290543431750107e-05, + "loss": 0.3487, + "step": 18804 + }, + { + "epoch": 24.13992297817715, + "grad_norm": 1.3168983459472656, + "learning_rate": 2.5290115532734275e-05, + "loss": 0.3657, + "step": 18805 + }, + { + "epoch": 24.141206675224648, + "grad_norm": 1.3531804084777832, + "learning_rate": 2.5289687633718444e-05, + "loss": 0.3702, + "step": 18806 + }, + { + "epoch": 24.142490372272142, + "grad_norm": 2.1276233196258545, + 
"learning_rate": 2.5289259734702612e-05, + "loss": 0.3458, + "step": 18807 + }, + { + "epoch": 24.14377406931964, + "grad_norm": 0.9057385325431824, + "learning_rate": 2.5288831835686777e-05, + "loss": 0.3641, + "step": 18808 + }, + { + "epoch": 24.145057766367138, + "grad_norm": 2.0878067016601562, + "learning_rate": 2.5288403936670946e-05, + "loss": 0.3515, + "step": 18809 + }, + { + "epoch": 24.146341463414632, + "grad_norm": 0.9991466403007507, + "learning_rate": 2.5287976037655114e-05, + "loss": 0.3651, + "step": 18810 + }, + { + "epoch": 24.14762516046213, + "grad_norm": 1.3257083892822266, + "learning_rate": 2.5287548138639283e-05, + "loss": 0.344, + "step": 18811 + }, + { + "epoch": 24.14890885750963, + "grad_norm": 1.1091835498809814, + "learning_rate": 2.5287120239623448e-05, + "loss": 0.3559, + "step": 18812 + }, + { + "epoch": 24.150192554557126, + "grad_norm": 0.7738353610038757, + "learning_rate": 2.528669234060762e-05, + "loss": 0.3575, + "step": 18813 + }, + { + "epoch": 24.15147625160462, + "grad_norm": 0.9052010178565979, + "learning_rate": 2.5286264441591784e-05, + "loss": 0.3276, + "step": 18814 + }, + { + "epoch": 24.15275994865212, + "grad_norm": 1.199781894683838, + "learning_rate": 2.5285836542575953e-05, + "loss": 0.3372, + "step": 18815 + }, + { + "epoch": 24.154043645699616, + "grad_norm": 1.163091778755188, + "learning_rate": 2.528540864356012e-05, + "loss": 0.3852, + "step": 18816 + }, + { + "epoch": 24.15532734274711, + "grad_norm": 0.9000206589698792, + "learning_rate": 2.5284980744544286e-05, + "loss": 0.3387, + "step": 18817 + }, + { + "epoch": 24.15661103979461, + "grad_norm": 1.6433000564575195, + "learning_rate": 2.5284552845528458e-05, + "loss": 0.3595, + "step": 18818 + }, + { + "epoch": 24.157894736842106, + "grad_norm": 1.457039475440979, + "learning_rate": 2.5284124946512623e-05, + "loss": 0.3532, + "step": 18819 + }, + { + "epoch": 24.1591784338896, + "grad_norm": 3.6374549865722656, + "learning_rate": 
2.528369704749679e-05, + "loss": 0.3586, + "step": 18820 + }, + { + "epoch": 24.1604621309371, + "grad_norm": 1.9385203123092651, + "learning_rate": 2.528326914848096e-05, + "loss": 0.3545, + "step": 18821 + }, + { + "epoch": 24.161745827984596, + "grad_norm": 1.3105770349502563, + "learning_rate": 2.5282841249465125e-05, + "loss": 0.3574, + "step": 18822 + }, + { + "epoch": 24.163029525032094, + "grad_norm": 1.0948776006698608, + "learning_rate": 2.5282413350449297e-05, + "loss": 0.3356, + "step": 18823 + }, + { + "epoch": 24.16431322207959, + "grad_norm": 4.657884120941162, + "learning_rate": 2.5281985451433462e-05, + "loss": 0.3724, + "step": 18824 + }, + { + "epoch": 24.165596919127086, + "grad_norm": 1.6449213027954102, + "learning_rate": 2.528155755241763e-05, + "loss": 0.3678, + "step": 18825 + }, + { + "epoch": 24.166880616174584, + "grad_norm": 1.4416801929473877, + "learning_rate": 2.52811296534018e-05, + "loss": 0.3846, + "step": 18826 + }, + { + "epoch": 24.16816431322208, + "grad_norm": 1.4171761274337769, + "learning_rate": 2.5280701754385967e-05, + "loss": 0.3609, + "step": 18827 + }, + { + "epoch": 24.169448010269576, + "grad_norm": 1.0165358781814575, + "learning_rate": 2.5280273855370132e-05, + "loss": 0.3508, + "step": 18828 + }, + { + "epoch": 24.170731707317074, + "grad_norm": 0.9855926036834717, + "learning_rate": 2.52798459563543e-05, + "loss": 0.3604, + "step": 18829 + }, + { + "epoch": 24.17201540436457, + "grad_norm": 1.7640188932418823, + "learning_rate": 2.527941805733847e-05, + "loss": 0.3631, + "step": 18830 + }, + { + "epoch": 24.173299101412066, + "grad_norm": 3.269355535507202, + "learning_rate": 2.5278990158322637e-05, + "loss": 0.3908, + "step": 18831 + }, + { + "epoch": 24.174582798459564, + "grad_norm": 4.403311252593994, + "learning_rate": 2.5278562259306806e-05, + "loss": 0.3565, + "step": 18832 + }, + { + "epoch": 24.17586649550706, + "grad_norm": 1.0820506811141968, + "learning_rate": 2.527813436029097e-05, + "loss": 0.3477, 
+ "step": 18833 + }, + { + "epoch": 24.177150192554556, + "grad_norm": 1.544084072113037, + "learning_rate": 2.5277706461275142e-05, + "loss": 0.386, + "step": 18834 + }, + { + "epoch": 24.178433889602054, + "grad_norm": 2.3783373832702637, + "learning_rate": 2.5277278562259307e-05, + "loss": 0.376, + "step": 18835 + }, + { + "epoch": 24.179717586649552, + "grad_norm": 1.3762860298156738, + "learning_rate": 2.5276850663243472e-05, + "loss": 0.3705, + "step": 18836 + }, + { + "epoch": 24.181001283697046, + "grad_norm": 1.0662765502929688, + "learning_rate": 2.5276422764227644e-05, + "loss": 0.3921, + "step": 18837 + }, + { + "epoch": 24.182284980744544, + "grad_norm": 1.2943100929260254, + "learning_rate": 2.527599486521181e-05, + "loss": 0.4202, + "step": 18838 + }, + { + "epoch": 24.183568677792042, + "grad_norm": 1.7480480670928955, + "learning_rate": 2.527556696619598e-05, + "loss": 0.4006, + "step": 18839 + }, + { + "epoch": 24.184852374839537, + "grad_norm": 2.6016643047332764, + "learning_rate": 2.5275139067180146e-05, + "loss": 0.4192, + "step": 18840 + }, + { + "epoch": 24.186136071887034, + "grad_norm": 2.466275930404663, + "learning_rate": 2.527471116816431e-05, + "loss": 0.4176, + "step": 18841 + }, + { + "epoch": 24.187419768934532, + "grad_norm": 2.069120407104492, + "learning_rate": 2.5274283269148483e-05, + "loss": 0.367, + "step": 18842 + }, + { + "epoch": 24.188703465982027, + "grad_norm": 3.1771976947784424, + "learning_rate": 2.5273855370132648e-05, + "loss": 0.3725, + "step": 18843 + }, + { + "epoch": 24.189987163029524, + "grad_norm": 1.091370940208435, + "learning_rate": 2.5273427471116816e-05, + "loss": 0.4378, + "step": 18844 + }, + { + "epoch": 24.191270860077022, + "grad_norm": 1.9873427152633667, + "learning_rate": 2.5272999572100985e-05, + "loss": 0.4639, + "step": 18845 + }, + { + "epoch": 24.19255455712452, + "grad_norm": 1.9765324592590332, + "learning_rate": 2.5272571673085153e-05, + "loss": 0.558, + "step": 18846 + }, + { + "epoch": 
24.193838254172015, + "grad_norm": 1.0266574621200562, + "learning_rate": 2.527214377406932e-05, + "loss": 0.3244, + "step": 18847 + }, + { + "epoch": 24.195121951219512, + "grad_norm": 0.8745529055595398, + "learning_rate": 2.5271715875053487e-05, + "loss": 0.3402, + "step": 18848 + }, + { + "epoch": 24.19640564826701, + "grad_norm": 1.8016737699508667, + "learning_rate": 2.5271287976037655e-05, + "loss": 0.321, + "step": 18849 + }, + { + "epoch": 24.197689345314505, + "grad_norm": 1.4469175338745117, + "learning_rate": 2.5270860077021823e-05, + "loss": 0.3697, + "step": 18850 + }, + { + "epoch": 24.198973042362002, + "grad_norm": 0.9043884873390198, + "learning_rate": 2.5270432178005992e-05, + "loss": 0.3462, + "step": 18851 + }, + { + "epoch": 24.2002567394095, + "grad_norm": 1.4945696592330933, + "learning_rate": 2.5270004278990157e-05, + "loss": 0.3352, + "step": 18852 + }, + { + "epoch": 24.201540436456995, + "grad_norm": 0.8557878732681274, + "learning_rate": 2.526957637997433e-05, + "loss": 0.3453, + "step": 18853 + }, + { + "epoch": 24.202824133504492, + "grad_norm": 1.17729651927948, + "learning_rate": 2.5269148480958494e-05, + "loss": 0.3254, + "step": 18854 + }, + { + "epoch": 24.20410783055199, + "grad_norm": 1.4560329914093018, + "learning_rate": 2.5268720581942662e-05, + "loss": 0.3289, + "step": 18855 + }, + { + "epoch": 24.205391527599488, + "grad_norm": 1.4451788663864136, + "learning_rate": 2.526829268292683e-05, + "loss": 0.3603, + "step": 18856 + }, + { + "epoch": 24.206675224646983, + "grad_norm": 3.134779453277588, + "learning_rate": 2.5267864783910996e-05, + "loss": 0.3696, + "step": 18857 + }, + { + "epoch": 24.20795892169448, + "grad_norm": 1.3461005687713623, + "learning_rate": 2.5267436884895167e-05, + "loss": 0.3323, + "step": 18858 + }, + { + "epoch": 24.20924261874198, + "grad_norm": 2.209947347640991, + "learning_rate": 2.5267008985879332e-05, + "loss": 0.3498, + "step": 18859 + }, + { + "epoch": 24.210526315789473, + "grad_norm": 
2.2377216815948486, + "learning_rate": 2.52665810868635e-05, + "loss": 0.325, + "step": 18860 + }, + { + "epoch": 24.21181001283697, + "grad_norm": 1.4859070777893066, + "learning_rate": 2.526615318784767e-05, + "loss": 0.3414, + "step": 18861 + }, + { + "epoch": 24.21309370988447, + "grad_norm": 3.053246259689331, + "learning_rate": 2.5265725288831834e-05, + "loss": 0.3438, + "step": 18862 + }, + { + "epoch": 24.214377406931963, + "grad_norm": 1.1572191715240479, + "learning_rate": 2.5265297389816006e-05, + "loss": 0.364, + "step": 18863 + }, + { + "epoch": 24.21566110397946, + "grad_norm": 1.1738249063491821, + "learning_rate": 2.526486949080017e-05, + "loss": 0.3645, + "step": 18864 + }, + { + "epoch": 24.21694480102696, + "grad_norm": 1.5341076850891113, + "learning_rate": 2.526444159178434e-05, + "loss": 0.3329, + "step": 18865 + }, + { + "epoch": 24.218228498074453, + "grad_norm": 1.3070467710494995, + "learning_rate": 2.5264013692768508e-05, + "loss": 0.4071, + "step": 18866 + }, + { + "epoch": 24.21951219512195, + "grad_norm": 1.482451319694519, + "learning_rate": 2.5263585793752676e-05, + "loss": 0.3533, + "step": 18867 + }, + { + "epoch": 24.22079589216945, + "grad_norm": 1.4070385694503784, + "learning_rate": 2.526315789473684e-05, + "loss": 0.3581, + "step": 18868 + }, + { + "epoch": 24.222079589216946, + "grad_norm": 2.3536133766174316, + "learning_rate": 2.526272999572101e-05, + "loss": 0.3343, + "step": 18869 + }, + { + "epoch": 24.22336328626444, + "grad_norm": 1.2836520671844482, + "learning_rate": 2.5262302096705178e-05, + "loss": 0.372, + "step": 18870 + }, + { + "epoch": 24.22464698331194, + "grad_norm": 1.1918556690216064, + "learning_rate": 2.5261874197689347e-05, + "loss": 0.391, + "step": 18871 + }, + { + "epoch": 24.225930680359436, + "grad_norm": 1.7379323244094849, + "learning_rate": 2.5261446298673515e-05, + "loss": 0.3888, + "step": 18872 + }, + { + "epoch": 24.22721437740693, + "grad_norm": 1.6794158220291138, + "learning_rate": 
2.526101839965768e-05, + "loss": 0.3581, + "step": 18873 + }, + { + "epoch": 24.22849807445443, + "grad_norm": 1.6718478202819824, + "learning_rate": 2.5260590500641852e-05, + "loss": 0.3407, + "step": 18874 + }, + { + "epoch": 24.229781771501926, + "grad_norm": 2.265207052230835, + "learning_rate": 2.5260162601626017e-05, + "loss": 0.4092, + "step": 18875 + }, + { + "epoch": 24.23106546854942, + "grad_norm": 0.9100618958473206, + "learning_rate": 2.5259734702610182e-05, + "loss": 0.3481, + "step": 18876 + }, + { + "epoch": 24.23234916559692, + "grad_norm": 1.5039525032043457, + "learning_rate": 2.5259306803594354e-05, + "loss": 0.4143, + "step": 18877 + }, + { + "epoch": 24.233632862644416, + "grad_norm": 2.2769689559936523, + "learning_rate": 2.525887890457852e-05, + "loss": 0.3641, + "step": 18878 + }, + { + "epoch": 24.234916559691914, + "grad_norm": 2.3412468433380127, + "learning_rate": 2.525845100556269e-05, + "loss": 0.3897, + "step": 18879 + }, + { + "epoch": 24.23620025673941, + "grad_norm": 2.5297813415527344, + "learning_rate": 2.5258023106546855e-05, + "loss": 0.4181, + "step": 18880 + }, + { + "epoch": 24.237483953786906, + "grad_norm": 1.9273189306259155, + "learning_rate": 2.5257595207531024e-05, + "loss": 0.3718, + "step": 18881 + }, + { + "epoch": 24.238767650834404, + "grad_norm": 1.6359986066818237, + "learning_rate": 2.5257167308515192e-05, + "loss": 0.3936, + "step": 18882 + }, + { + "epoch": 24.2400513478819, + "grad_norm": 4.747506618499756, + "learning_rate": 2.5256739409499357e-05, + "loss": 0.3579, + "step": 18883 + }, + { + "epoch": 24.241335044929397, + "grad_norm": 1.8359860181808472, + "learning_rate": 2.5256311510483526e-05, + "loss": 0.4131, + "step": 18884 + }, + { + "epoch": 24.242618741976894, + "grad_norm": 3.291213035583496, + "learning_rate": 2.5255883611467694e-05, + "loss": 0.3696, + "step": 18885 + }, + { + "epoch": 24.24390243902439, + "grad_norm": 0.9887590408325195, + "learning_rate": 2.5255455712451863e-05, + "loss": 
0.3486, + "step": 18886 + }, + { + "epoch": 24.245186136071887, + "grad_norm": 2.458244562149048, + "learning_rate": 2.525502781343603e-05, + "loss": 0.4365, + "step": 18887 + }, + { + "epoch": 24.246469833119384, + "grad_norm": 1.8738183975219727, + "learning_rate": 2.52545999144202e-05, + "loss": 0.4131, + "step": 18888 + }, + { + "epoch": 24.247753530166882, + "grad_norm": 2.7422590255737305, + "learning_rate": 2.5254172015404364e-05, + "loss": 0.3616, + "step": 18889 + }, + { + "epoch": 24.249037227214377, + "grad_norm": 1.974379062652588, + "learning_rate": 2.5253744116388533e-05, + "loss": 0.385, + "step": 18890 + }, + { + "epoch": 24.250320924261874, + "grad_norm": 4.0672078132629395, + "learning_rate": 2.52533162173727e-05, + "loss": 0.4165, + "step": 18891 + }, + { + "epoch": 24.251604621309372, + "grad_norm": 2.898411750793457, + "learning_rate": 2.5252888318356866e-05, + "loss": 0.3727, + "step": 18892 + }, + { + "epoch": 24.252888318356867, + "grad_norm": 1.9809426069259644, + "learning_rate": 2.5252460419341038e-05, + "loss": 0.4391, + "step": 18893 + }, + { + "epoch": 24.254172015404365, + "grad_norm": 1.623180866241455, + "learning_rate": 2.5252032520325203e-05, + "loss": 0.4014, + "step": 18894 + }, + { + "epoch": 24.255455712451862, + "grad_norm": 1.8449512720108032, + "learning_rate": 2.5251604621309375e-05, + "loss": 0.4609, + "step": 18895 + }, + { + "epoch": 24.256739409499357, + "grad_norm": 2.0014595985412598, + "learning_rate": 2.525117672229354e-05, + "loss": 0.6008, + "step": 18896 + }, + { + "epoch": 24.258023106546855, + "grad_norm": 1.3678934574127197, + "learning_rate": 2.5250748823277705e-05, + "loss": 0.3445, + "step": 18897 + }, + { + "epoch": 24.259306803594352, + "grad_norm": 0.7454944849014282, + "learning_rate": 2.5250320924261877e-05, + "loss": 0.3284, + "step": 18898 + }, + { + "epoch": 24.260590500641847, + "grad_norm": 1.3454571962356567, + "learning_rate": 2.5249893025246042e-05, + "loss": 0.3815, + "step": 18899 + }, + { + 
"epoch": 24.261874197689345, + "grad_norm": 1.0358251333236694, + "learning_rate": 2.524946512623021e-05, + "loss": 0.3569, + "step": 18900 + }, + { + "epoch": 24.263157894736842, + "grad_norm": 1.0146514177322388, + "learning_rate": 2.524903722721438e-05, + "loss": 0.352, + "step": 18901 + }, + { + "epoch": 24.26444159178434, + "grad_norm": 0.9210399389266968, + "learning_rate": 2.5248609328198544e-05, + "loss": 0.3457, + "step": 18902 + }, + { + "epoch": 24.265725288831835, + "grad_norm": 1.3031362295150757, + "learning_rate": 2.5248181429182715e-05, + "loss": 0.3248, + "step": 18903 + }, + { + "epoch": 24.267008985879333, + "grad_norm": 1.1368550062179565, + "learning_rate": 2.524775353016688e-05, + "loss": 0.391, + "step": 18904 + }, + { + "epoch": 24.26829268292683, + "grad_norm": 0.7938165068626404, + "learning_rate": 2.524732563115105e-05, + "loss": 0.3005, + "step": 18905 + }, + { + "epoch": 24.269576379974325, + "grad_norm": 1.7425132989883423, + "learning_rate": 2.5246897732135217e-05, + "loss": 0.382, + "step": 18906 + }, + { + "epoch": 24.270860077021823, + "grad_norm": 1.1371959447860718, + "learning_rate": 2.5246469833119386e-05, + "loss": 0.3464, + "step": 18907 + }, + { + "epoch": 24.27214377406932, + "grad_norm": 1.8636815547943115, + "learning_rate": 2.524604193410355e-05, + "loss": 0.3607, + "step": 18908 + }, + { + "epoch": 24.273427471116815, + "grad_norm": 1.006378173828125, + "learning_rate": 2.524561403508772e-05, + "loss": 0.3345, + "step": 18909 + }, + { + "epoch": 24.274711168164313, + "grad_norm": 1.0500786304473877, + "learning_rate": 2.5245186136071888e-05, + "loss": 0.3219, + "step": 18910 + }, + { + "epoch": 24.27599486521181, + "grad_norm": 2.667975425720215, + "learning_rate": 2.5244758237056056e-05, + "loss": 0.3718, + "step": 18911 + }, + { + "epoch": 24.27727856225931, + "grad_norm": 1.6263353824615479, + "learning_rate": 2.5244330338040224e-05, + "loss": 0.3387, + "step": 18912 + }, + { + "epoch": 24.278562259306803, + 
"grad_norm": 1.3203859329223633, + "learning_rate": 2.524390243902439e-05, + "loss": 0.3649, + "step": 18913 + }, + { + "epoch": 24.2798459563543, + "grad_norm": 1.0908288955688477, + "learning_rate": 2.524347454000856e-05, + "loss": 0.3333, + "step": 18914 + }, + { + "epoch": 24.2811296534018, + "grad_norm": 4.338905334472656, + "learning_rate": 2.5243046640992726e-05, + "loss": 0.3768, + "step": 18915 + }, + { + "epoch": 24.282413350449293, + "grad_norm": 0.9629230499267578, + "learning_rate": 2.524261874197689e-05, + "loss": 0.3583, + "step": 18916 + }, + { + "epoch": 24.28369704749679, + "grad_norm": 0.8413133025169373, + "learning_rate": 2.5242190842961063e-05, + "loss": 0.3585, + "step": 18917 + }, + { + "epoch": 24.28498074454429, + "grad_norm": 1.6264764070510864, + "learning_rate": 2.5241762943945228e-05, + "loss": 0.3184, + "step": 18918 + }, + { + "epoch": 24.286264441591783, + "grad_norm": 1.0080152750015259, + "learning_rate": 2.52413350449294e-05, + "loss": 0.3741, + "step": 18919 + }, + { + "epoch": 24.28754813863928, + "grad_norm": 1.4367296695709229, + "learning_rate": 2.5240907145913565e-05, + "loss": 0.3518, + "step": 18920 + }, + { + "epoch": 24.28883183568678, + "grad_norm": 1.3324406147003174, + "learning_rate": 2.5240479246897733e-05, + "loss": 0.3772, + "step": 18921 + }, + { + "epoch": 24.290115532734276, + "grad_norm": 1.4083830118179321, + "learning_rate": 2.52400513478819e-05, + "loss": 0.3162, + "step": 18922 + }, + { + "epoch": 24.29139922978177, + "grad_norm": 1.2334851026535034, + "learning_rate": 2.5239623448866067e-05, + "loss": 0.3888, + "step": 18923 + }, + { + "epoch": 24.29268292682927, + "grad_norm": 1.2224937677383423, + "learning_rate": 2.5239195549850235e-05, + "loss": 0.3176, + "step": 18924 + }, + { + "epoch": 24.293966623876766, + "grad_norm": 1.8708351850509644, + "learning_rate": 2.5238767650834404e-05, + "loss": 0.3754, + "step": 18925 + }, + { + "epoch": 24.29525032092426, + "grad_norm": 1.686571717262268, + 
"learning_rate": 2.5238339751818572e-05, + "loss": 0.3486, + "step": 18926 + }, + { + "epoch": 24.29653401797176, + "grad_norm": 1.0962294340133667, + "learning_rate": 2.523791185280274e-05, + "loss": 0.3733, + "step": 18927 + }, + { + "epoch": 24.297817715019256, + "grad_norm": 1.2787840366363525, + "learning_rate": 2.523748395378691e-05, + "loss": 0.3571, + "step": 18928 + }, + { + "epoch": 24.29910141206675, + "grad_norm": 1.2230931520462036, + "learning_rate": 2.5237056054771074e-05, + "loss": 0.3767, + "step": 18929 + }, + { + "epoch": 24.30038510911425, + "grad_norm": 2.4080724716186523, + "learning_rate": 2.5236628155755242e-05, + "loss": 0.3906, + "step": 18930 + }, + { + "epoch": 24.301668806161747, + "grad_norm": 2.0794761180877686, + "learning_rate": 2.523620025673941e-05, + "loss": 0.3707, + "step": 18931 + }, + { + "epoch": 24.30295250320924, + "grad_norm": 1.1272211074829102, + "learning_rate": 2.5235772357723576e-05, + "loss": 0.3765, + "step": 18932 + }, + { + "epoch": 24.30423620025674, + "grad_norm": 1.5112439393997192, + "learning_rate": 2.5235344458707747e-05, + "loss": 0.4364, + "step": 18933 + }, + { + "epoch": 24.305519897304237, + "grad_norm": 1.0214135646820068, + "learning_rate": 2.5234916559691912e-05, + "loss": 0.3702, + "step": 18934 + }, + { + "epoch": 24.306803594351734, + "grad_norm": 2.290788173675537, + "learning_rate": 2.5234488660676084e-05, + "loss": 0.3559, + "step": 18935 + }, + { + "epoch": 24.30808729139923, + "grad_norm": 1.4782602787017822, + "learning_rate": 2.523406076166025e-05, + "loss": 0.359, + "step": 18936 + }, + { + "epoch": 24.309370988446727, + "grad_norm": 1.2671635150909424, + "learning_rate": 2.5233632862644414e-05, + "loss": 0.3888, + "step": 18937 + }, + { + "epoch": 24.310654685494224, + "grad_norm": 1.6077892780303955, + "learning_rate": 2.5233204963628586e-05, + "loss": 0.4242, + "step": 18938 + }, + { + "epoch": 24.31193838254172, + "grad_norm": 2.256511926651001, + "learning_rate": 
2.523277706461275e-05, + "loss": 0.4127, + "step": 18939 + }, + { + "epoch": 24.313222079589217, + "grad_norm": 3.188166618347168, + "learning_rate": 2.523234916559692e-05, + "loss": 0.4091, + "step": 18940 + }, + { + "epoch": 24.314505776636715, + "grad_norm": 1.4019911289215088, + "learning_rate": 2.5231921266581088e-05, + "loss": 0.3768, + "step": 18941 + }, + { + "epoch": 24.31578947368421, + "grad_norm": 2.4196248054504395, + "learning_rate": 2.5231493367565256e-05, + "loss": 0.4303, + "step": 18942 + }, + { + "epoch": 24.317073170731707, + "grad_norm": 1.6362708806991577, + "learning_rate": 2.523106546854942e-05, + "loss": 0.5075, + "step": 18943 + }, + { + "epoch": 24.318356867779205, + "grad_norm": 3.2579433917999268, + "learning_rate": 2.523063756953359e-05, + "loss": 0.4175, + "step": 18944 + }, + { + "epoch": 24.319640564826702, + "grad_norm": 2.83074688911438, + "learning_rate": 2.5230209670517758e-05, + "loss": 0.4846, + "step": 18945 + }, + { + "epoch": 24.320924261874197, + "grad_norm": 2.1309800148010254, + "learning_rate": 2.5229781771501927e-05, + "loss": 0.5496, + "step": 18946 + }, + { + "epoch": 24.322207958921695, + "grad_norm": 1.0781958103179932, + "learning_rate": 2.5229353872486095e-05, + "loss": 0.3246, + "step": 18947 + }, + { + "epoch": 24.323491655969192, + "grad_norm": 1.486536979675293, + "learning_rate": 2.522892597347026e-05, + "loss": 0.3402, + "step": 18948 + }, + { + "epoch": 24.324775353016687, + "grad_norm": 1.7922632694244385, + "learning_rate": 2.5228498074454432e-05, + "loss": 0.3687, + "step": 18949 + }, + { + "epoch": 24.326059050064185, + "grad_norm": 1.5007354021072388, + "learning_rate": 2.5228070175438597e-05, + "loss": 0.3434, + "step": 18950 + }, + { + "epoch": 24.327342747111683, + "grad_norm": 1.2659608125686646, + "learning_rate": 2.5227642276422762e-05, + "loss": 0.3848, + "step": 18951 + }, + { + "epoch": 24.328626444159177, + "grad_norm": 1.3292229175567627, + "learning_rate": 2.5227214377406934e-05, + "loss": 
0.3479, + "step": 18952 + }, + { + "epoch": 24.329910141206675, + "grad_norm": 0.8930450081825256, + "learning_rate": 2.52267864783911e-05, + "loss": 0.3443, + "step": 18953 + }, + { + "epoch": 24.331193838254173, + "grad_norm": 0.8594908118247986, + "learning_rate": 2.522635857937527e-05, + "loss": 0.3485, + "step": 18954 + }, + { + "epoch": 24.33247753530167, + "grad_norm": 0.8889841437339783, + "learning_rate": 2.5225930680359436e-05, + "loss": 0.3592, + "step": 18955 + }, + { + "epoch": 24.333761232349165, + "grad_norm": 0.9025724530220032, + "learning_rate": 2.5225502781343604e-05, + "loss": 0.3393, + "step": 18956 + }, + { + "epoch": 24.335044929396663, + "grad_norm": 1.3698945045471191, + "learning_rate": 2.5225074882327772e-05, + "loss": 0.3633, + "step": 18957 + }, + { + "epoch": 24.33632862644416, + "grad_norm": 3.4146173000335693, + "learning_rate": 2.5224646983311937e-05, + "loss": 0.3707, + "step": 18958 + }, + { + "epoch": 24.337612323491655, + "grad_norm": 1.2073804140090942, + "learning_rate": 2.5224219084296106e-05, + "loss": 0.3614, + "step": 18959 + }, + { + "epoch": 24.338896020539153, + "grad_norm": 1.9940826892852783, + "learning_rate": 2.5223791185280274e-05, + "loss": 0.3616, + "step": 18960 + }, + { + "epoch": 24.34017971758665, + "grad_norm": 0.9279116988182068, + "learning_rate": 2.5223363286264443e-05, + "loss": 0.36, + "step": 18961 + }, + { + "epoch": 24.341463414634145, + "grad_norm": 1.7048221826553345, + "learning_rate": 2.522293538724861e-05, + "loss": 0.3804, + "step": 18962 + }, + { + "epoch": 24.342747111681643, + "grad_norm": 1.0920213460922241, + "learning_rate": 2.5222507488232776e-05, + "loss": 0.3526, + "step": 18963 + }, + { + "epoch": 24.34403080872914, + "grad_norm": 1.0645729303359985, + "learning_rate": 2.5222079589216944e-05, + "loss": 0.3572, + "step": 18964 + }, + { + "epoch": 24.345314505776635, + "grad_norm": 1.1763359308242798, + "learning_rate": 2.5221651690201113e-05, + "loss": 0.3908, + "step": 18965 + }, + { 
+ "epoch": 24.346598202824133, + "grad_norm": 1.2919334173202515, + "learning_rate": 2.522122379118528e-05, + "loss": 0.3897, + "step": 18966 + }, + { + "epoch": 24.34788189987163, + "grad_norm": 1.0321571826934814, + "learning_rate": 2.5220795892169446e-05, + "loss": 0.3588, + "step": 18967 + }, + { + "epoch": 24.34916559691913, + "grad_norm": 1.1815851926803589, + "learning_rate": 2.5220367993153618e-05, + "loss": 0.3329, + "step": 18968 + }, + { + "epoch": 24.350449293966623, + "grad_norm": 1.3546749353408813, + "learning_rate": 2.5219940094137783e-05, + "loss": 0.3512, + "step": 18969 + }, + { + "epoch": 24.35173299101412, + "grad_norm": 1.5663429498672485, + "learning_rate": 2.521951219512195e-05, + "loss": 0.3488, + "step": 18970 + }, + { + "epoch": 24.35301668806162, + "grad_norm": 1.18987238407135, + "learning_rate": 2.521908429610612e-05, + "loss": 0.4036, + "step": 18971 + }, + { + "epoch": 24.354300385109113, + "grad_norm": 1.3395551443099976, + "learning_rate": 2.5218656397090285e-05, + "loss": 0.3431, + "step": 18972 + }, + { + "epoch": 24.35558408215661, + "grad_norm": 1.7440578937530518, + "learning_rate": 2.5218228498074457e-05, + "loss": 0.3721, + "step": 18973 + }, + { + "epoch": 24.35686777920411, + "grad_norm": 1.4089404344558716, + "learning_rate": 2.5217800599058622e-05, + "loss": 0.3262, + "step": 18974 + }, + { + "epoch": 24.358151476251603, + "grad_norm": 2.156215190887451, + "learning_rate": 2.521737270004279e-05, + "loss": 0.3742, + "step": 18975 + }, + { + "epoch": 24.3594351732991, + "grad_norm": 0.9500334858894348, + "learning_rate": 2.521694480102696e-05, + "loss": 0.3435, + "step": 18976 + }, + { + "epoch": 24.3607188703466, + "grad_norm": 0.9282327890396118, + "learning_rate": 2.5216516902011124e-05, + "loss": 0.3437, + "step": 18977 + }, + { + "epoch": 24.362002567394097, + "grad_norm": 0.8644064664840698, + "learning_rate": 2.5216089002995295e-05, + "loss": 0.3438, + "step": 18978 + }, + { + "epoch": 24.36328626444159, + 
"grad_norm": 1.067457914352417, + "learning_rate": 2.521566110397946e-05, + "loss": 0.3407, + "step": 18979 + }, + { + "epoch": 24.36456996148909, + "grad_norm": 1.497515320777893, + "learning_rate": 2.521523320496363e-05, + "loss": 0.3551, + "step": 18980 + }, + { + "epoch": 24.365853658536587, + "grad_norm": 1.358665943145752, + "learning_rate": 2.5214805305947797e-05, + "loss": 0.343, + "step": 18981 + }, + { + "epoch": 24.36713735558408, + "grad_norm": 0.9865267276763916, + "learning_rate": 2.5214377406931966e-05, + "loss": 0.3377, + "step": 18982 + }, + { + "epoch": 24.36842105263158, + "grad_norm": 1.6309486627578735, + "learning_rate": 2.521394950791613e-05, + "loss": 0.407, + "step": 18983 + }, + { + "epoch": 24.369704749679077, + "grad_norm": 3.4661648273468018, + "learning_rate": 2.52135216089003e-05, + "loss": 0.3843, + "step": 18984 + }, + { + "epoch": 24.37098844672657, + "grad_norm": 1.118024230003357, + "learning_rate": 2.5213093709884468e-05, + "loss": 0.3744, + "step": 18985 + }, + { + "epoch": 24.37227214377407, + "grad_norm": 1.5287442207336426, + "learning_rate": 2.5212665810868636e-05, + "loss": 0.3646, + "step": 18986 + }, + { + "epoch": 24.373555840821567, + "grad_norm": 1.6325103044509888, + "learning_rate": 2.5212237911852804e-05, + "loss": 0.3651, + "step": 18987 + }, + { + "epoch": 24.374839537869065, + "grad_norm": 1.1884684562683105, + "learning_rate": 2.521181001283697e-05, + "loss": 0.4072, + "step": 18988 + }, + { + "epoch": 24.37612323491656, + "grad_norm": 1.3270078897476196, + "learning_rate": 2.521138211382114e-05, + "loss": 0.3648, + "step": 18989 + }, + { + "epoch": 24.377406931964057, + "grad_norm": 1.5699089765548706, + "learning_rate": 2.5210954214805306e-05, + "loss": 0.3627, + "step": 18990 + }, + { + "epoch": 24.378690629011555, + "grad_norm": 1.9842652082443237, + "learning_rate": 2.521052631578947e-05, + "loss": 0.4808, + "step": 18991 + }, + { + "epoch": 24.37997432605905, + "grad_norm": 3.3381900787353516, + 
"learning_rate": 2.5210098416773643e-05, + "loss": 0.4738, + "step": 18992 + }, + { + "epoch": 24.381258023106547, + "grad_norm": 1.7654632329940796, + "learning_rate": 2.5209670517757808e-05, + "loss": 0.3963, + "step": 18993 + }, + { + "epoch": 24.382541720154045, + "grad_norm": 3.5901920795440674, + "learning_rate": 2.520924261874198e-05, + "loss": 0.4387, + "step": 18994 + }, + { + "epoch": 24.38382541720154, + "grad_norm": 2.5165438652038574, + "learning_rate": 2.5208814719726145e-05, + "loss": 0.5087, + "step": 18995 + }, + { + "epoch": 24.385109114249037, + "grad_norm": 2.340689182281494, + "learning_rate": 2.5208386820710313e-05, + "loss": 0.5616, + "step": 18996 + }, + { + "epoch": 24.386392811296535, + "grad_norm": 1.1791009902954102, + "learning_rate": 2.5207958921694482e-05, + "loss": 0.3652, + "step": 18997 + }, + { + "epoch": 24.387676508344033, + "grad_norm": 1.1833443641662598, + "learning_rate": 2.5207531022678647e-05, + "loss": 0.3427, + "step": 18998 + }, + { + "epoch": 24.388960205391527, + "grad_norm": 0.7625055909156799, + "learning_rate": 2.5207103123662815e-05, + "loss": 0.3311, + "step": 18999 + }, + { + "epoch": 24.390243902439025, + "grad_norm": 2.790663480758667, + "learning_rate": 2.5206675224646984e-05, + "loss": 0.3209, + "step": 19000 + }, + { + "epoch": 24.390243902439025, + "eval_cer": 0.2711963971020168, + "eval_loss": 0.4841902256011963, + "eval_runtime": 13.9027, + "eval_samples_per_second": 70.706, + "eval_steps_per_second": 0.504, + "eval_wer": 0.47236872277000685, + "step": 19000 + }, + { + "epoch": 24.391527599486523, + "grad_norm": 1.4305561780929565, + "learning_rate": 2.5206247325631152e-05, + "loss": 0.3535, + "step": 19001 + }, + { + "epoch": 24.392811296534017, + "grad_norm": 1.3054417371749878, + "learning_rate": 2.520581942661532e-05, + "loss": 0.3424, + "step": 19002 + }, + { + "epoch": 24.394094993581515, + "grad_norm": 1.7060319185256958, + "learning_rate": 2.520539152759949e-05, + "loss": 0.3375, + "step": 19003 
+ }, + { + "epoch": 24.395378690629013, + "grad_norm": 1.1691769361495972, + "learning_rate": 2.5204963628583654e-05, + "loss": 0.3601, + "step": 19004 + }, + { + "epoch": 24.396662387676507, + "grad_norm": 1.8571583032608032, + "learning_rate": 2.5204535729567822e-05, + "loss": 0.3503, + "step": 19005 + }, + { + "epoch": 24.397946084724005, + "grad_norm": 1.4846328496932983, + "learning_rate": 2.520410783055199e-05, + "loss": 0.3354, + "step": 19006 + }, + { + "epoch": 24.399229781771503, + "grad_norm": 1.3072043657302856, + "learning_rate": 2.5203679931536156e-05, + "loss": 0.3849, + "step": 19007 + }, + { + "epoch": 24.400513478818997, + "grad_norm": 1.5962846279144287, + "learning_rate": 2.5203252032520327e-05, + "loss": 0.3742, + "step": 19008 + }, + { + "epoch": 24.401797175866495, + "grad_norm": 1.5494643449783325, + "learning_rate": 2.5202824133504493e-05, + "loss": 0.3499, + "step": 19009 + }, + { + "epoch": 24.403080872913993, + "grad_norm": 2.281052589416504, + "learning_rate": 2.5202396234488664e-05, + "loss": 0.3689, + "step": 19010 + }, + { + "epoch": 24.40436456996149, + "grad_norm": 3.562075614929199, + "learning_rate": 2.520196833547283e-05, + "loss": 0.385, + "step": 19011 + }, + { + "epoch": 24.405648267008985, + "grad_norm": 1.7570500373840332, + "learning_rate": 2.5201540436456994e-05, + "loss": 0.3536, + "step": 19012 + }, + { + "epoch": 24.406931964056483, + "grad_norm": 0.8688177466392517, + "learning_rate": 2.5201112537441166e-05, + "loss": 0.3338, + "step": 19013 + }, + { + "epoch": 24.40821566110398, + "grad_norm": 1.309946060180664, + "learning_rate": 2.520068463842533e-05, + "loss": 0.3472, + "step": 19014 + }, + { + "epoch": 24.409499358151475, + "grad_norm": 1.3136787414550781, + "learning_rate": 2.52002567394095e-05, + "loss": 0.3693, + "step": 19015 + }, + { + "epoch": 24.410783055198973, + "grad_norm": 1.344904899597168, + "learning_rate": 2.5199828840393668e-05, + "loss": 0.3715, + "step": 19016 + }, + { + "epoch": 
24.41206675224647, + "grad_norm": 0.9876909852027893, + "learning_rate": 2.5199400941377836e-05, + "loss": 0.3409, + "step": 19017 + }, + { + "epoch": 24.413350449293965, + "grad_norm": 1.421380877494812, + "learning_rate": 2.5198973042362005e-05, + "loss": 0.377, + "step": 19018 + }, + { + "epoch": 24.414634146341463, + "grad_norm": 1.606821894645691, + "learning_rate": 2.519854514334617e-05, + "loss": 0.3571, + "step": 19019 + }, + { + "epoch": 24.41591784338896, + "grad_norm": 1.9227099418640137, + "learning_rate": 2.5198117244330338e-05, + "loss": 0.3732, + "step": 19020 + }, + { + "epoch": 24.41720154043646, + "grad_norm": 1.752004623413086, + "learning_rate": 2.5197689345314507e-05, + "loss": 0.3466, + "step": 19021 + }, + { + "epoch": 24.418485237483953, + "grad_norm": 0.9628397822380066, + "learning_rate": 2.5197261446298675e-05, + "loss": 0.3592, + "step": 19022 + }, + { + "epoch": 24.41976893453145, + "grad_norm": 1.0299464464187622, + "learning_rate": 2.519683354728284e-05, + "loss": 0.3435, + "step": 19023 + }, + { + "epoch": 24.42105263157895, + "grad_norm": 1.1660053730010986, + "learning_rate": 2.519640564826701e-05, + "loss": 0.3773, + "step": 19024 + }, + { + "epoch": 24.422336328626443, + "grad_norm": 1.3036344051361084, + "learning_rate": 2.5195977749251177e-05, + "loss": 0.3862, + "step": 19025 + }, + { + "epoch": 24.42362002567394, + "grad_norm": 2.8053977489471436, + "learning_rate": 2.5195549850235345e-05, + "loss": 0.3426, + "step": 19026 + }, + { + "epoch": 24.42490372272144, + "grad_norm": 1.2283787727355957, + "learning_rate": 2.5195121951219514e-05, + "loss": 0.3725, + "step": 19027 + }, + { + "epoch": 24.426187419768933, + "grad_norm": 1.3501222133636475, + "learning_rate": 2.519469405220368e-05, + "loss": 0.3775, + "step": 19028 + }, + { + "epoch": 24.42747111681643, + "grad_norm": 1.2594002485275269, + "learning_rate": 2.519426615318785e-05, + "loss": 0.394, + "step": 19029 + }, + { + "epoch": 24.42875481386393, + "grad_norm": 
1.299396276473999, + "learning_rate": 2.5193838254172016e-05, + "loss": 0.3774, + "step": 19030 + }, + { + "epoch": 24.430038510911427, + "grad_norm": 1.8416086435317993, + "learning_rate": 2.519341035515618e-05, + "loss": 0.351, + "step": 19031 + }, + { + "epoch": 24.43132220795892, + "grad_norm": 2.856895685195923, + "learning_rate": 2.5192982456140352e-05, + "loss": 0.3406, + "step": 19032 + }, + { + "epoch": 24.43260590500642, + "grad_norm": 1.8802833557128906, + "learning_rate": 2.5192554557124517e-05, + "loss": 0.4096, + "step": 19033 + }, + { + "epoch": 24.433889602053917, + "grad_norm": 1.5534279346466064, + "learning_rate": 2.519212665810869e-05, + "loss": 0.3664, + "step": 19034 + }, + { + "epoch": 24.43517329910141, + "grad_norm": 1.523136854171753, + "learning_rate": 2.5191698759092854e-05, + "loss": 0.3666, + "step": 19035 + }, + { + "epoch": 24.43645699614891, + "grad_norm": 1.380373239517212, + "learning_rate": 2.5191270860077023e-05, + "loss": 0.4117, + "step": 19036 + }, + { + "epoch": 24.437740693196407, + "grad_norm": 1.2565416097640991, + "learning_rate": 2.519084296106119e-05, + "loss": 0.4013, + "step": 19037 + }, + { + "epoch": 24.4390243902439, + "grad_norm": 1.3091847896575928, + "learning_rate": 2.5190415062045356e-05, + "loss": 0.4357, + "step": 19038 + }, + { + "epoch": 24.4403080872914, + "grad_norm": 5.763146877288818, + "learning_rate": 2.5189987163029525e-05, + "loss": 0.4208, + "step": 19039 + }, + { + "epoch": 24.441591784338897, + "grad_norm": 1.668666958808899, + "learning_rate": 2.5189559264013693e-05, + "loss": 0.3639, + "step": 19040 + }, + { + "epoch": 24.44287548138639, + "grad_norm": 2.6394197940826416, + "learning_rate": 2.518913136499786e-05, + "loss": 0.4074, + "step": 19041 + }, + { + "epoch": 24.44415917843389, + "grad_norm": 1.6867656707763672, + "learning_rate": 2.518870346598203e-05, + "loss": 0.4506, + "step": 19042 + }, + { + "epoch": 24.445442875481387, + "grad_norm": 7.403298377990723, + "learning_rate": 
2.5188275566966198e-05, + "loss": 0.3843, + "step": 19043 + }, + { + "epoch": 24.446726572528885, + "grad_norm": 3.667527914047241, + "learning_rate": 2.5187847667950363e-05, + "loss": 0.4609, + "step": 19044 + }, + { + "epoch": 24.44801026957638, + "grad_norm": 2.6866495609283447, + "learning_rate": 2.518741976893453e-05, + "loss": 0.4382, + "step": 19045 + }, + { + "epoch": 24.449293966623877, + "grad_norm": 3.246530294418335, + "learning_rate": 2.51869918699187e-05, + "loss": 0.5523, + "step": 19046 + }, + { + "epoch": 24.450577663671375, + "grad_norm": 1.5996947288513184, + "learning_rate": 2.5186563970902865e-05, + "loss": 0.3616, + "step": 19047 + }, + { + "epoch": 24.45186136071887, + "grad_norm": 1.2647076845169067, + "learning_rate": 2.5186136071887037e-05, + "loss": 0.3372, + "step": 19048 + }, + { + "epoch": 24.453145057766367, + "grad_norm": 0.9083801507949829, + "learning_rate": 2.5185708172871202e-05, + "loss": 0.3306, + "step": 19049 + }, + { + "epoch": 24.454428754813865, + "grad_norm": 1.1091150045394897, + "learning_rate": 2.5185280273855374e-05, + "loss": 0.3634, + "step": 19050 + }, + { + "epoch": 24.45571245186136, + "grad_norm": 1.296499490737915, + "learning_rate": 2.518485237483954e-05, + "loss": 0.3781, + "step": 19051 + }, + { + "epoch": 24.456996148908857, + "grad_norm": 1.5966546535491943, + "learning_rate": 2.5184424475823704e-05, + "loss": 0.3608, + "step": 19052 + }, + { + "epoch": 24.458279845956355, + "grad_norm": 1.1031718254089355, + "learning_rate": 2.5183996576807876e-05, + "loss": 0.3564, + "step": 19053 + }, + { + "epoch": 24.459563543003853, + "grad_norm": 1.898013949394226, + "learning_rate": 2.518356867779204e-05, + "loss": 0.3528, + "step": 19054 + }, + { + "epoch": 24.460847240051347, + "grad_norm": 0.8135574460029602, + "learning_rate": 2.518314077877621e-05, + "loss": 0.3456, + "step": 19055 + }, + { + "epoch": 24.462130937098845, + "grad_norm": 1.4764277935028076, + "learning_rate": 2.5182712879760377e-05, + "loss": 
0.3476, + "step": 19056 + }, + { + "epoch": 24.463414634146343, + "grad_norm": 0.9943291544914246, + "learning_rate": 2.5182284980744546e-05, + "loss": 0.3481, + "step": 19057 + }, + { + "epoch": 24.464698331193837, + "grad_norm": 1.2957544326782227, + "learning_rate": 2.5181857081728714e-05, + "loss": 0.3815, + "step": 19058 + }, + { + "epoch": 24.465982028241335, + "grad_norm": 1.4101455211639404, + "learning_rate": 2.518142918271288e-05, + "loss": 0.337, + "step": 19059 + }, + { + "epoch": 24.467265725288833, + "grad_norm": 2.653272867202759, + "learning_rate": 2.5181001283697048e-05, + "loss": 0.314, + "step": 19060 + }, + { + "epoch": 24.468549422336327, + "grad_norm": 1.2504020929336548, + "learning_rate": 2.5180573384681216e-05, + "loss": 0.3208, + "step": 19061 + }, + { + "epoch": 24.469833119383825, + "grad_norm": 1.085678219795227, + "learning_rate": 2.5180145485665384e-05, + "loss": 0.3591, + "step": 19062 + }, + { + "epoch": 24.471116816431323, + "grad_norm": 1.1866146326065063, + "learning_rate": 2.517971758664955e-05, + "loss": 0.345, + "step": 19063 + }, + { + "epoch": 24.47240051347882, + "grad_norm": 1.1656073331832886, + "learning_rate": 2.517928968763372e-05, + "loss": 0.3387, + "step": 19064 + }, + { + "epoch": 24.473684210526315, + "grad_norm": 1.080087661743164, + "learning_rate": 2.5178861788617886e-05, + "loss": 0.3804, + "step": 19065 + }, + { + "epoch": 24.474967907573813, + "grad_norm": 1.0893481969833374, + "learning_rate": 2.5178433889602055e-05, + "loss": 0.3685, + "step": 19066 + }, + { + "epoch": 24.47625160462131, + "grad_norm": 1.0458106994628906, + "learning_rate": 2.5178005990586223e-05, + "loss": 0.4158, + "step": 19067 + }, + { + "epoch": 24.477535301668805, + "grad_norm": 2.3221497535705566, + "learning_rate": 2.5177578091570388e-05, + "loss": 0.349, + "step": 19068 + }, + { + "epoch": 24.478818998716303, + "grad_norm": 1.2823764085769653, + "learning_rate": 2.517715019255456e-05, + "loss": 0.3784, + "step": 19069 + }, + { + 
"epoch": 24.4801026957638, + "grad_norm": 1.9807790517807007, + "learning_rate": 2.5176722293538725e-05, + "loss": 0.3923, + "step": 19070 + }, + { + "epoch": 24.481386392811295, + "grad_norm": 1.3996580839157104, + "learning_rate": 2.5176294394522893e-05, + "loss": 0.3723, + "step": 19071 + }, + { + "epoch": 24.482670089858793, + "grad_norm": 1.2285223007202148, + "learning_rate": 2.5175866495507062e-05, + "loss": 0.3565, + "step": 19072 + }, + { + "epoch": 24.48395378690629, + "grad_norm": 1.0277327299118042, + "learning_rate": 2.5175438596491227e-05, + "loss": 0.3349, + "step": 19073 + }, + { + "epoch": 24.485237483953785, + "grad_norm": 1.2748228311538696, + "learning_rate": 2.51750106974754e-05, + "loss": 0.3684, + "step": 19074 + }, + { + "epoch": 24.486521181001283, + "grad_norm": 1.331284523010254, + "learning_rate": 2.5174582798459564e-05, + "loss": 0.3528, + "step": 19075 + }, + { + "epoch": 24.48780487804878, + "grad_norm": 1.543972134590149, + "learning_rate": 2.5174154899443732e-05, + "loss": 0.3743, + "step": 19076 + }, + { + "epoch": 24.48908857509628, + "grad_norm": 1.0103319883346558, + "learning_rate": 2.51737270004279e-05, + "loss": 0.3875, + "step": 19077 + }, + { + "epoch": 24.490372272143773, + "grad_norm": 1.299625277519226, + "learning_rate": 2.517329910141207e-05, + "loss": 0.4022, + "step": 19078 + }, + { + "epoch": 24.49165596919127, + "grad_norm": 1.2305878400802612, + "learning_rate": 2.5172871202396234e-05, + "loss": 0.3925, + "step": 19079 + }, + { + "epoch": 24.49293966623877, + "grad_norm": 1.3259345293045044, + "learning_rate": 2.5172443303380402e-05, + "loss": 0.3654, + "step": 19080 + }, + { + "epoch": 24.494223363286263, + "grad_norm": 1.668575644493103, + "learning_rate": 2.517201540436457e-05, + "loss": 0.4002, + "step": 19081 + }, + { + "epoch": 24.49550706033376, + "grad_norm": 1.5606014728546143, + "learning_rate": 2.517158750534874e-05, + "loss": 0.3618, + "step": 19082 + }, + { + "epoch": 24.49679075738126, + "grad_norm": 
1.1427030563354492, + "learning_rate": 2.5171159606332908e-05, + "loss": 0.3452, + "step": 19083 + }, + { + "epoch": 24.498074454428753, + "grad_norm": 1.5904594659805298, + "learning_rate": 2.5170731707317073e-05, + "loss": 0.395, + "step": 19084 + }, + { + "epoch": 24.49935815147625, + "grad_norm": 1.1593852043151855, + "learning_rate": 2.517030380830124e-05, + "loss": 0.3498, + "step": 19085 + }, + { + "epoch": 24.50064184852375, + "grad_norm": 1.7771365642547607, + "learning_rate": 2.516987590928541e-05, + "loss": 0.3832, + "step": 19086 + }, + { + "epoch": 24.501925545571247, + "grad_norm": 1.4264038801193237, + "learning_rate": 2.5169448010269574e-05, + "loss": 0.3749, + "step": 19087 + }, + { + "epoch": 24.50320924261874, + "grad_norm": 0.9323105812072754, + "learning_rate": 2.5169020111253746e-05, + "loss": 0.3907, + "step": 19088 + }, + { + "epoch": 24.50449293966624, + "grad_norm": 1.4196839332580566, + "learning_rate": 2.516859221223791e-05, + "loss": 0.3728, + "step": 19089 + }, + { + "epoch": 24.505776636713737, + "grad_norm": 1.5244194269180298, + "learning_rate": 2.5168164313222083e-05, + "loss": 0.4084, + "step": 19090 + }, + { + "epoch": 24.50706033376123, + "grad_norm": 1.5896574258804321, + "learning_rate": 2.5167736414206248e-05, + "loss": 0.3912, + "step": 19091 + }, + { + "epoch": 24.50834403080873, + "grad_norm": 1.3713626861572266, + "learning_rate": 2.5167308515190413e-05, + "loss": 0.4225, + "step": 19092 + }, + { + "epoch": 24.509627727856227, + "grad_norm": 3.0708882808685303, + "learning_rate": 2.5166880616174585e-05, + "loss": 0.3919, + "step": 19093 + }, + { + "epoch": 24.51091142490372, + "grad_norm": 1.5708197355270386, + "learning_rate": 2.516645271715875e-05, + "loss": 0.4799, + "step": 19094 + }, + { + "epoch": 24.51219512195122, + "grad_norm": 1.3922686576843262, + "learning_rate": 2.516602481814292e-05, + "loss": 0.4459, + "step": 19095 + }, + { + "epoch": 24.513478818998717, + "grad_norm": 4.836024761199951, + "learning_rate": 
2.5165596919127087e-05, + "loss": 0.5713, + "step": 19096 + }, + { + "epoch": 24.514762516046215, + "grad_norm": 0.8324069976806641, + "learning_rate": 2.5165169020111255e-05, + "loss": 0.2883, + "step": 19097 + }, + { + "epoch": 24.51604621309371, + "grad_norm": 0.9348486661911011, + "learning_rate": 2.5164741121095424e-05, + "loss": 0.3234, + "step": 19098 + }, + { + "epoch": 24.517329910141207, + "grad_norm": 1.5823862552642822, + "learning_rate": 2.516431322207959e-05, + "loss": 0.3588, + "step": 19099 + }, + { + "epoch": 24.518613607188705, + "grad_norm": 1.4291456937789917, + "learning_rate": 2.5163885323063757e-05, + "loss": 0.3789, + "step": 19100 + }, + { + "epoch": 24.5198973042362, + "grad_norm": 1.0480154752731323, + "learning_rate": 2.5163457424047925e-05, + "loss": 0.3475, + "step": 19101 + }, + { + "epoch": 24.521181001283697, + "grad_norm": 0.8316459655761719, + "learning_rate": 2.5163029525032094e-05, + "loss": 0.3546, + "step": 19102 + }, + { + "epoch": 24.522464698331195, + "grad_norm": 1.5952627658843994, + "learning_rate": 2.516260162601626e-05, + "loss": 0.3766, + "step": 19103 + }, + { + "epoch": 24.52374839537869, + "grad_norm": 1.4955204725265503, + "learning_rate": 2.516217372700043e-05, + "loss": 0.3315, + "step": 19104 + }, + { + "epoch": 24.525032092426187, + "grad_norm": 1.3128633499145508, + "learning_rate": 2.5161745827984596e-05, + "loss": 0.3523, + "step": 19105 + }, + { + "epoch": 24.526315789473685, + "grad_norm": 1.0352898836135864, + "learning_rate": 2.5161317928968764e-05, + "loss": 0.369, + "step": 19106 + }, + { + "epoch": 24.527599486521183, + "grad_norm": 0.917407751083374, + "learning_rate": 2.5160890029952932e-05, + "loss": 0.3731, + "step": 19107 + }, + { + "epoch": 24.528883183568677, + "grad_norm": 1.2126792669296265, + "learning_rate": 2.5160462130937098e-05, + "loss": 0.3627, + "step": 19108 + }, + { + "epoch": 24.530166880616175, + "grad_norm": 2.3754937648773193, + "learning_rate": 2.516003423192127e-05, + "loss": 
0.3451, + "step": 19109 + }, + { + "epoch": 24.531450577663673, + "grad_norm": 1.2646886110305786, + "learning_rate": 2.5159606332905434e-05, + "loss": 0.376, + "step": 19110 + }, + { + "epoch": 24.532734274711167, + "grad_norm": 1.6202802658081055, + "learning_rate": 2.5159178433889603e-05, + "loss": 0.3494, + "step": 19111 + }, + { + "epoch": 24.534017971758665, + "grad_norm": 1.6412848234176636, + "learning_rate": 2.515875053487377e-05, + "loss": 0.3586, + "step": 19112 + }, + { + "epoch": 24.535301668806163, + "grad_norm": 1.2429019212722778, + "learning_rate": 2.5158322635857936e-05, + "loss": 0.3413, + "step": 19113 + }, + { + "epoch": 24.536585365853657, + "grad_norm": 1.2521014213562012, + "learning_rate": 2.5157894736842108e-05, + "loss": 0.3269, + "step": 19114 + }, + { + "epoch": 24.537869062901155, + "grad_norm": 2.5669829845428467, + "learning_rate": 2.5157466837826273e-05, + "loss": 0.3338, + "step": 19115 + }, + { + "epoch": 24.539152759948653, + "grad_norm": 1.0154547691345215, + "learning_rate": 2.515703893881044e-05, + "loss": 0.343, + "step": 19116 + }, + { + "epoch": 24.540436456996147, + "grad_norm": 1.0974730253219604, + "learning_rate": 2.515661103979461e-05, + "loss": 0.359, + "step": 19117 + }, + { + "epoch": 24.541720154043645, + "grad_norm": 1.3464128971099854, + "learning_rate": 2.5156183140778778e-05, + "loss": 0.3334, + "step": 19118 + }, + { + "epoch": 24.543003851091143, + "grad_norm": 1.9776843786239624, + "learning_rate": 2.5155755241762943e-05, + "loss": 0.3598, + "step": 19119 + }, + { + "epoch": 24.54428754813864, + "grad_norm": 1.179594874382019, + "learning_rate": 2.515532734274711e-05, + "loss": 0.3612, + "step": 19120 + }, + { + "epoch": 24.545571245186135, + "grad_norm": 1.303173542022705, + "learning_rate": 2.515489944373128e-05, + "loss": 0.3671, + "step": 19121 + }, + { + "epoch": 24.546854942233633, + "grad_norm": 2.105290174484253, + "learning_rate": 2.515447154471545e-05, + "loss": 0.3535, + "step": 19122 + }, + { + 
"epoch": 24.54813863928113, + "grad_norm": 1.1480886936187744, + "learning_rate": 2.5154043645699617e-05, + "loss": 0.34, + "step": 19123 + }, + { + "epoch": 24.549422336328625, + "grad_norm": 1.515450119972229, + "learning_rate": 2.5153615746683782e-05, + "loss": 0.3788, + "step": 19124 + }, + { + "epoch": 24.550706033376123, + "grad_norm": 1.4231594800949097, + "learning_rate": 2.5153187847667954e-05, + "loss": 0.3674, + "step": 19125 + }, + { + "epoch": 24.55198973042362, + "grad_norm": 0.9409602284431458, + "learning_rate": 2.515275994865212e-05, + "loss": 0.406, + "step": 19126 + }, + { + "epoch": 24.553273427471115, + "grad_norm": 1.7899972200393677, + "learning_rate": 2.5152332049636284e-05, + "loss": 0.3547, + "step": 19127 + }, + { + "epoch": 24.554557124518613, + "grad_norm": 1.6583341360092163, + "learning_rate": 2.5151904150620456e-05, + "loss": 0.3728, + "step": 19128 + }, + { + "epoch": 24.55584082156611, + "grad_norm": 1.2276240587234497, + "learning_rate": 2.515147625160462e-05, + "loss": 0.3665, + "step": 19129 + }, + { + "epoch": 24.55712451861361, + "grad_norm": 1.717236042022705, + "learning_rate": 2.5151048352588792e-05, + "loss": 0.414, + "step": 19130 + }, + { + "epoch": 24.558408215661103, + "grad_norm": 1.0063241720199585, + "learning_rate": 2.5150620453572957e-05, + "loss": 0.358, + "step": 19131 + }, + { + "epoch": 24.5596919127086, + "grad_norm": 1.2206172943115234, + "learning_rate": 2.5150192554557126e-05, + "loss": 0.4046, + "step": 19132 + }, + { + "epoch": 24.5609756097561, + "grad_norm": 1.0792003870010376, + "learning_rate": 2.5149764655541294e-05, + "loss": 0.363, + "step": 19133 + }, + { + "epoch": 24.562259306803593, + "grad_norm": 1.0880244970321655, + "learning_rate": 2.514933675652546e-05, + "loss": 0.3611, + "step": 19134 + }, + { + "epoch": 24.56354300385109, + "grad_norm": 2.1120247840881348, + "learning_rate": 2.5148908857509628e-05, + "loss": 0.3983, + "step": 19135 + }, + { + "epoch": 24.56482670089859, + "grad_norm": 
1.1395906209945679, + "learning_rate": 2.5148480958493796e-05, + "loss": 0.3841, + "step": 19136 + }, + { + "epoch": 24.566110397946083, + "grad_norm": 2.7594103813171387, + "learning_rate": 2.5148053059477965e-05, + "loss": 0.4272, + "step": 19137 + }, + { + "epoch": 24.56739409499358, + "grad_norm": 1.2110247611999512, + "learning_rate": 2.5147625160462133e-05, + "loss": 0.4224, + "step": 19138 + }, + { + "epoch": 24.56867779204108, + "grad_norm": 2.427371025085449, + "learning_rate": 2.51471972614463e-05, + "loss": 0.4344, + "step": 19139 + }, + { + "epoch": 24.569961489088577, + "grad_norm": 1.371015191078186, + "learning_rate": 2.5146769362430466e-05, + "loss": 0.4124, + "step": 19140 + }, + { + "epoch": 24.57124518613607, + "grad_norm": 1.1908739805221558, + "learning_rate": 2.5146341463414635e-05, + "loss": 0.4068, + "step": 19141 + }, + { + "epoch": 24.57252888318357, + "grad_norm": 1.6865266561508179, + "learning_rate": 2.5145913564398803e-05, + "loss": 0.4287, + "step": 19142 + }, + { + "epoch": 24.573812580231067, + "grad_norm": 11.434919357299805, + "learning_rate": 2.5145485665382968e-05, + "loss": 0.4283, + "step": 19143 + }, + { + "epoch": 24.57509627727856, + "grad_norm": 1.7316659688949585, + "learning_rate": 2.514505776636714e-05, + "loss": 0.4836, + "step": 19144 + }, + { + "epoch": 24.57637997432606, + "grad_norm": 1.4740126132965088, + "learning_rate": 2.5144629867351305e-05, + "loss": 0.5042, + "step": 19145 + }, + { + "epoch": 24.577663671373557, + "grad_norm": 2.780146360397339, + "learning_rate": 2.5144201968335473e-05, + "loss": 0.5372, + "step": 19146 + }, + { + "epoch": 24.57894736842105, + "grad_norm": 1.4097647666931152, + "learning_rate": 2.5143774069319642e-05, + "loss": 0.3644, + "step": 19147 + }, + { + "epoch": 24.58023106546855, + "grad_norm": 1.344905138015747, + "learning_rate": 2.5143346170303807e-05, + "loss": 0.3711, + "step": 19148 + }, + { + "epoch": 24.581514762516047, + "grad_norm": 0.8484278321266174, + "learning_rate": 
2.514291827128798e-05, + "loss": 0.3264, + "step": 19149 + }, + { + "epoch": 24.58279845956354, + "grad_norm": 0.9947144389152527, + "learning_rate": 2.5142490372272144e-05, + "loss": 0.3776, + "step": 19150 + }, + { + "epoch": 24.58408215661104, + "grad_norm": 1.3417471647262573, + "learning_rate": 2.5142062473256312e-05, + "loss": 0.3499, + "step": 19151 + }, + { + "epoch": 24.585365853658537, + "grad_norm": 1.182115912437439, + "learning_rate": 2.514163457424048e-05, + "loss": 0.3586, + "step": 19152 + }, + { + "epoch": 24.586649550706035, + "grad_norm": 1.0448468923568726, + "learning_rate": 2.5141206675224646e-05, + "loss": 0.3633, + "step": 19153 + }, + { + "epoch": 24.58793324775353, + "grad_norm": 1.4582260847091675, + "learning_rate": 2.5140778776208814e-05, + "loss": 0.3483, + "step": 19154 + }, + { + "epoch": 24.589216944801027, + "grad_norm": 0.959943950176239, + "learning_rate": 2.5140350877192982e-05, + "loss": 0.3747, + "step": 19155 + }, + { + "epoch": 24.590500641848525, + "grad_norm": 2.1072487831115723, + "learning_rate": 2.513992297817715e-05, + "loss": 0.3486, + "step": 19156 + }, + { + "epoch": 24.59178433889602, + "grad_norm": 1.0367511510849, + "learning_rate": 2.513949507916132e-05, + "loss": 0.3621, + "step": 19157 + }, + { + "epoch": 24.593068035943517, + "grad_norm": 0.9573931097984314, + "learning_rate": 2.5139067180145488e-05, + "loss": 0.3732, + "step": 19158 + }, + { + "epoch": 24.594351732991015, + "grad_norm": 0.8662066459655762, + "learning_rate": 2.5138639281129653e-05, + "loss": 0.3445, + "step": 19159 + }, + { + "epoch": 24.59563543003851, + "grad_norm": 1.0641403198242188, + "learning_rate": 2.513821138211382e-05, + "loss": 0.3619, + "step": 19160 + }, + { + "epoch": 24.596919127086007, + "grad_norm": 1.1389063596725464, + "learning_rate": 2.513778348309799e-05, + "loss": 0.3885, + "step": 19161 + }, + { + "epoch": 24.598202824133505, + "grad_norm": 1.7106647491455078, + "learning_rate": 2.5137355584082154e-05, + "loss": 
0.3596, + "step": 19162 + }, + { + "epoch": 24.599486521181003, + "grad_norm": 1.3804908990859985, + "learning_rate": 2.5136927685066326e-05, + "loss": 0.3675, + "step": 19163 + }, + { + "epoch": 24.600770218228497, + "grad_norm": 1.3409321308135986, + "learning_rate": 2.513649978605049e-05, + "loss": 0.3855, + "step": 19164 + }, + { + "epoch": 24.602053915275995, + "grad_norm": 1.1400989294052124, + "learning_rate": 2.5136071887034663e-05, + "loss": 0.3647, + "step": 19165 + }, + { + "epoch": 24.603337612323493, + "grad_norm": 3.2383017539978027, + "learning_rate": 2.5135643988018828e-05, + "loss": 0.3336, + "step": 19166 + }, + { + "epoch": 24.604621309370987, + "grad_norm": 1.0180705785751343, + "learning_rate": 2.5135216089002993e-05, + "loss": 0.3802, + "step": 19167 + }, + { + "epoch": 24.605905006418485, + "grad_norm": 1.209195852279663, + "learning_rate": 2.5134788189987165e-05, + "loss": 0.3518, + "step": 19168 + }, + { + "epoch": 24.607188703465983, + "grad_norm": 1.0127514600753784, + "learning_rate": 2.513436029097133e-05, + "loss": 0.4013, + "step": 19169 + }, + { + "epoch": 24.608472400513477, + "grad_norm": 2.7225239276885986, + "learning_rate": 2.51339323919555e-05, + "loss": 0.3531, + "step": 19170 + }, + { + "epoch": 24.609756097560975, + "grad_norm": 1.6053754091262817, + "learning_rate": 2.5133504492939667e-05, + "loss": 0.3669, + "step": 19171 + }, + { + "epoch": 24.611039794608473, + "grad_norm": 1.1407322883605957, + "learning_rate": 2.5133076593923835e-05, + "loss": 0.3775, + "step": 19172 + }, + { + "epoch": 24.61232349165597, + "grad_norm": 1.1504766941070557, + "learning_rate": 2.5132648694908004e-05, + "loss": 0.377, + "step": 19173 + }, + { + "epoch": 24.613607188703465, + "grad_norm": 4.862046718597412, + "learning_rate": 2.513222079589217e-05, + "loss": 0.3353, + "step": 19174 + }, + { + "epoch": 24.614890885750963, + "grad_norm": 1.138850450515747, + "learning_rate": 2.5131792896876337e-05, + "loss": 0.3757, + "step": 19175 + }, + { 
+ "epoch": 24.61617458279846, + "grad_norm": 0.9508501887321472, + "learning_rate": 2.5131364997860505e-05, + "loss": 0.3404, + "step": 19176 + }, + { + "epoch": 24.617458279845955, + "grad_norm": 2.421016216278076, + "learning_rate": 2.5130937098844674e-05, + "loss": 0.3763, + "step": 19177 + }, + { + "epoch": 24.618741976893453, + "grad_norm": 1.5770074129104614, + "learning_rate": 2.513050919982884e-05, + "loss": 0.3383, + "step": 19178 + }, + { + "epoch": 24.62002567394095, + "grad_norm": 2.783389091491699, + "learning_rate": 2.513008130081301e-05, + "loss": 0.3589, + "step": 19179 + }, + { + "epoch": 24.621309370988445, + "grad_norm": 3.8440816402435303, + "learning_rate": 2.5129653401797176e-05, + "loss": 0.353, + "step": 19180 + }, + { + "epoch": 24.622593068035943, + "grad_norm": 2.53501033782959, + "learning_rate": 2.5129225502781344e-05, + "loss": 0.4207, + "step": 19181 + }, + { + "epoch": 24.62387676508344, + "grad_norm": 4.019387722015381, + "learning_rate": 2.5128797603765513e-05, + "loss": 0.3598, + "step": 19182 + }, + { + "epoch": 24.625160462130935, + "grad_norm": 1.2213001251220703, + "learning_rate": 2.5128369704749678e-05, + "loss": 0.3656, + "step": 19183 + }, + { + "epoch": 24.626444159178433, + "grad_norm": 1.4540537595748901, + "learning_rate": 2.512794180573385e-05, + "loss": 0.4333, + "step": 19184 + }, + { + "epoch": 24.62772785622593, + "grad_norm": 2.6624155044555664, + "learning_rate": 2.5127513906718014e-05, + "loss": 0.4278, + "step": 19185 + }, + { + "epoch": 24.62901155327343, + "grad_norm": 3.9646008014678955, + "learning_rate": 2.5127086007702183e-05, + "loss": 0.4427, + "step": 19186 + }, + { + "epoch": 24.630295250320923, + "grad_norm": 2.5490224361419678, + "learning_rate": 2.512665810868635e-05, + "loss": 0.3393, + "step": 19187 + }, + { + "epoch": 24.63157894736842, + "grad_norm": 1.4403539896011353, + "learning_rate": 2.5126230209670516e-05, + "loss": 0.4294, + "step": 19188 + }, + { + "epoch": 24.63286264441592, + 
"grad_norm": 1.2493386268615723, + "learning_rate": 2.5125802310654688e-05, + "loss": 0.4074, + "step": 19189 + }, + { + "epoch": 24.634146341463413, + "grad_norm": 3.093449115753174, + "learning_rate": 2.5125374411638853e-05, + "loss": 0.3959, + "step": 19190 + }, + { + "epoch": 24.63543003851091, + "grad_norm": 2.251302719116211, + "learning_rate": 2.512494651262302e-05, + "loss": 0.3786, + "step": 19191 + }, + { + "epoch": 24.63671373555841, + "grad_norm": 1.8684751987457275, + "learning_rate": 2.512451861360719e-05, + "loss": 0.4114, + "step": 19192 + }, + { + "epoch": 24.637997432605903, + "grad_norm": 1.7767856121063232, + "learning_rate": 2.5124090714591358e-05, + "loss": 0.4756, + "step": 19193 + }, + { + "epoch": 24.6392811296534, + "grad_norm": 1.4060559272766113, + "learning_rate": 2.5123662815575523e-05, + "loss": 0.3987, + "step": 19194 + }, + { + "epoch": 24.6405648267009, + "grad_norm": 1.6472232341766357, + "learning_rate": 2.5123234916559692e-05, + "loss": 0.4844, + "step": 19195 + }, + { + "epoch": 24.641848523748397, + "grad_norm": 2.870227575302124, + "learning_rate": 2.512280701754386e-05, + "loss": 0.5809, + "step": 19196 + }, + { + "epoch": 24.64313222079589, + "grad_norm": 1.4246373176574707, + "learning_rate": 2.512237911852803e-05, + "loss": 0.3301, + "step": 19197 + }, + { + "epoch": 24.64441591784339, + "grad_norm": 3.6758549213409424, + "learning_rate": 2.5121951219512197e-05, + "loss": 0.3357, + "step": 19198 + }, + { + "epoch": 24.645699614890887, + "grad_norm": 2.2555899620056152, + "learning_rate": 2.5121523320496362e-05, + "loss": 0.3909, + "step": 19199 + }, + { + "epoch": 24.64698331193838, + "grad_norm": 1.1864978075027466, + "learning_rate": 2.5121095421480534e-05, + "loss": 0.359, + "step": 19200 + }, + { + "epoch": 24.64826700898588, + "grad_norm": 1.7203524112701416, + "learning_rate": 2.51206675224647e-05, + "loss": 0.3785, + "step": 19201 + }, + { + "epoch": 24.649550706033377, + "grad_norm": 1.296068549156189, + 
"learning_rate": 2.5120239623448864e-05, + "loss": 0.3611, + "step": 19202 + }, + { + "epoch": 24.65083440308087, + "grad_norm": 1.0800327062606812, + "learning_rate": 2.5119811724433036e-05, + "loss": 0.323, + "step": 19203 + }, + { + "epoch": 24.65211810012837, + "grad_norm": 1.8171820640563965, + "learning_rate": 2.51193838254172e-05, + "loss": 0.3496, + "step": 19204 + }, + { + "epoch": 24.653401797175867, + "grad_norm": 2.084603786468506, + "learning_rate": 2.5118955926401372e-05, + "loss": 0.3763, + "step": 19205 + }, + { + "epoch": 24.654685494223365, + "grad_norm": 1.3459765911102295, + "learning_rate": 2.5118528027385537e-05, + "loss": 0.3812, + "step": 19206 + }, + { + "epoch": 24.65596919127086, + "grad_norm": 0.8810021877288818, + "learning_rate": 2.5118100128369706e-05, + "loss": 0.3815, + "step": 19207 + }, + { + "epoch": 24.657252888318357, + "grad_norm": 0.9909038543701172, + "learning_rate": 2.5117672229353874e-05, + "loss": 0.3442, + "step": 19208 + }, + { + "epoch": 24.658536585365855, + "grad_norm": 1.0403169393539429, + "learning_rate": 2.511724433033804e-05, + "loss": 0.3526, + "step": 19209 + }, + { + "epoch": 24.65982028241335, + "grad_norm": 1.0971465110778809, + "learning_rate": 2.5116816431322208e-05, + "loss": 0.3713, + "step": 19210 + }, + { + "epoch": 24.661103979460847, + "grad_norm": 0.9807858467102051, + "learning_rate": 2.5116388532306376e-05, + "loss": 0.3642, + "step": 19211 + }, + { + "epoch": 24.662387676508345, + "grad_norm": 0.9640421271324158, + "learning_rate": 2.5115960633290545e-05, + "loss": 0.3901, + "step": 19212 + }, + { + "epoch": 24.66367137355584, + "grad_norm": 1.6195746660232544, + "learning_rate": 2.5115532734274713e-05, + "loss": 0.4098, + "step": 19213 + }, + { + "epoch": 24.664955070603337, + "grad_norm": 1.9469558000564575, + "learning_rate": 2.5115104835258878e-05, + "loss": 0.3432, + "step": 19214 + }, + { + "epoch": 24.666238767650835, + "grad_norm": 2.0161542892456055, + "learning_rate": 
2.5114676936243046e-05, + "loss": 0.3602, + "step": 19215 + }, + { + "epoch": 24.66752246469833, + "grad_norm": 1.553977131843567, + "learning_rate": 2.5114249037227215e-05, + "loss": 0.36, + "step": 19216 + }, + { + "epoch": 24.668806161745827, + "grad_norm": 1.0112435817718506, + "learning_rate": 2.5113821138211383e-05, + "loss": 0.366, + "step": 19217 + }, + { + "epoch": 24.670089858793325, + "grad_norm": 1.165292739868164, + "learning_rate": 2.5113393239195548e-05, + "loss": 0.3447, + "step": 19218 + }, + { + "epoch": 24.671373555840823, + "grad_norm": 1.631566047668457, + "learning_rate": 2.511296534017972e-05, + "loss": 0.3717, + "step": 19219 + }, + { + "epoch": 24.672657252888317, + "grad_norm": 1.209313154220581, + "learning_rate": 2.5112537441163885e-05, + "loss": 0.3837, + "step": 19220 + }, + { + "epoch": 24.673940949935815, + "grad_norm": 1.6054238080978394, + "learning_rate": 2.5112109542148054e-05, + "loss": 0.3489, + "step": 19221 + }, + { + "epoch": 24.675224646983313, + "grad_norm": 0.9930808544158936, + "learning_rate": 2.5111681643132222e-05, + "loss": 0.3532, + "step": 19222 + }, + { + "epoch": 24.676508344030808, + "grad_norm": 1.0204323530197144, + "learning_rate": 2.5111253744116387e-05, + "loss": 0.3539, + "step": 19223 + }, + { + "epoch": 24.677792041078305, + "grad_norm": 1.2155628204345703, + "learning_rate": 2.511082584510056e-05, + "loss": 0.348, + "step": 19224 + }, + { + "epoch": 24.679075738125803, + "grad_norm": 1.4220589399337769, + "learning_rate": 2.5110397946084724e-05, + "loss": 0.3457, + "step": 19225 + }, + { + "epoch": 24.680359435173298, + "grad_norm": 1.0552453994750977, + "learning_rate": 2.5109970047068892e-05, + "loss": 0.3214, + "step": 19226 + }, + { + "epoch": 24.681643132220795, + "grad_norm": 1.5926306247711182, + "learning_rate": 2.510954214805306e-05, + "loss": 0.3719, + "step": 19227 + }, + { + "epoch": 24.682926829268293, + "grad_norm": 1.6955304145812988, + "learning_rate": 2.5109114249037226e-05, + "loss": 
0.4035, + "step": 19228 + }, + { + "epoch": 24.68421052631579, + "grad_norm": 1.0978450775146484, + "learning_rate": 2.5108686350021397e-05, + "loss": 0.3667, + "step": 19229 + }, + { + "epoch": 24.685494223363285, + "grad_norm": 1.592156171798706, + "learning_rate": 2.5108258451005562e-05, + "loss": 0.3611, + "step": 19230 + }, + { + "epoch": 24.686777920410783, + "grad_norm": 1.3106865882873535, + "learning_rate": 2.510783055198973e-05, + "loss": 0.3609, + "step": 19231 + }, + { + "epoch": 24.68806161745828, + "grad_norm": 1.1123454570770264, + "learning_rate": 2.51074026529739e-05, + "loss": 0.4664, + "step": 19232 + }, + { + "epoch": 24.689345314505776, + "grad_norm": 1.4103494882583618, + "learning_rate": 2.5106974753958068e-05, + "loss": 0.379, + "step": 19233 + }, + { + "epoch": 24.690629011553273, + "grad_norm": 2.606245517730713, + "learning_rate": 2.5106546854942233e-05, + "loss": 0.3912, + "step": 19234 + }, + { + "epoch": 24.69191270860077, + "grad_norm": 1.1178666353225708, + "learning_rate": 2.51061189559264e-05, + "loss": 0.3825, + "step": 19235 + }, + { + "epoch": 24.693196405648266, + "grad_norm": 1.4598493576049805, + "learning_rate": 2.510569105691057e-05, + "loss": 0.3544, + "step": 19236 + }, + { + "epoch": 24.694480102695763, + "grad_norm": 2.596170425415039, + "learning_rate": 2.5105263157894738e-05, + "loss": 0.3837, + "step": 19237 + }, + { + "epoch": 24.69576379974326, + "grad_norm": 1.621458649635315, + "learning_rate": 2.5104835258878906e-05, + "loss": 0.411, + "step": 19238 + }, + { + "epoch": 24.69704749679076, + "grad_norm": 1.6331380605697632, + "learning_rate": 2.510440735986307e-05, + "loss": 0.3516, + "step": 19239 + }, + { + "epoch": 24.698331193838253, + "grad_norm": 1.4687491655349731, + "learning_rate": 2.5103979460847243e-05, + "loss": 0.376, + "step": 19240 + }, + { + "epoch": 24.69961489088575, + "grad_norm": 5.374337196350098, + "learning_rate": 2.5103551561831408e-05, + "loss": 0.3674, + "step": 19241 + }, + { + "epoch": 
24.70089858793325, + "grad_norm": 1.6302647590637207, + "learning_rate": 2.5103123662815573e-05, + "loss": 0.4011, + "step": 19242 + }, + { + "epoch": 24.702182284980744, + "grad_norm": 2.5033516883850098, + "learning_rate": 2.5102695763799745e-05, + "loss": 0.4204, + "step": 19243 + }, + { + "epoch": 24.70346598202824, + "grad_norm": 1.4782496690750122, + "learning_rate": 2.510226786478391e-05, + "loss": 0.4733, + "step": 19244 + }, + { + "epoch": 24.70474967907574, + "grad_norm": 10.130388259887695, + "learning_rate": 2.5101839965768082e-05, + "loss": 0.4775, + "step": 19245 + }, + { + "epoch": 24.706033376123234, + "grad_norm": 5.2630228996276855, + "learning_rate": 2.5101412066752247e-05, + "loss": 0.5208, + "step": 19246 + }, + { + "epoch": 24.70731707317073, + "grad_norm": 1.311601996421814, + "learning_rate": 2.5100984167736415e-05, + "loss": 0.3484, + "step": 19247 + }, + { + "epoch": 24.70860077021823, + "grad_norm": 1.7275996208190918, + "learning_rate": 2.5100556268720584e-05, + "loss": 0.3134, + "step": 19248 + }, + { + "epoch": 24.709884467265724, + "grad_norm": 0.9465470314025879, + "learning_rate": 2.510012836970475e-05, + "loss": 0.3358, + "step": 19249 + }, + { + "epoch": 24.71116816431322, + "grad_norm": 1.1917005777359009, + "learning_rate": 2.5099700470688917e-05, + "loss": 0.3633, + "step": 19250 + }, + { + "epoch": 24.71245186136072, + "grad_norm": 1.8324542045593262, + "learning_rate": 2.5099272571673086e-05, + "loss": 0.317, + "step": 19251 + }, + { + "epoch": 24.713735558408217, + "grad_norm": 2.527373790740967, + "learning_rate": 2.5098844672657254e-05, + "loss": 0.3681, + "step": 19252 + }, + { + "epoch": 24.71501925545571, + "grad_norm": 2.1460306644439697, + "learning_rate": 2.5098416773641422e-05, + "loss": 0.3962, + "step": 19253 + }, + { + "epoch": 24.71630295250321, + "grad_norm": 1.324420690536499, + "learning_rate": 2.509798887462559e-05, + "loss": 0.3636, + "step": 19254 + }, + { + "epoch": 24.717586649550707, + "grad_norm": 
1.7297542095184326, + "learning_rate": 2.5097560975609756e-05, + "loss": 0.3782, + "step": 19255 + }, + { + "epoch": 24.7188703465982, + "grad_norm": 0.9369808435440063, + "learning_rate": 2.5097133076593924e-05, + "loss": 0.3886, + "step": 19256 + }, + { + "epoch": 24.7201540436457, + "grad_norm": 1.3579012155532837, + "learning_rate": 2.5096705177578093e-05, + "loss": 0.3734, + "step": 19257 + }, + { + "epoch": 24.721437740693197, + "grad_norm": 1.6056088209152222, + "learning_rate": 2.5096277278562258e-05, + "loss": 0.3576, + "step": 19258 + }, + { + "epoch": 24.72272143774069, + "grad_norm": 1.745911717414856, + "learning_rate": 2.509584937954643e-05, + "loss": 0.375, + "step": 19259 + }, + { + "epoch": 24.72400513478819, + "grad_norm": 1.3409101963043213, + "learning_rate": 2.5095421480530594e-05, + "loss": 0.345, + "step": 19260 + }, + { + "epoch": 24.725288831835687, + "grad_norm": 0.889052152633667, + "learning_rate": 2.5094993581514766e-05, + "loss": 0.3449, + "step": 19261 + }, + { + "epoch": 24.726572528883185, + "grad_norm": 1.3542802333831787, + "learning_rate": 2.509456568249893e-05, + "loss": 0.3673, + "step": 19262 + }, + { + "epoch": 24.72785622593068, + "grad_norm": 1.0875903367996216, + "learning_rate": 2.5094137783483096e-05, + "loss": 0.3656, + "step": 19263 + }, + { + "epoch": 24.729139922978177, + "grad_norm": 1.7013472318649292, + "learning_rate": 2.5093709884467268e-05, + "loss": 0.3681, + "step": 19264 + }, + { + "epoch": 24.730423620025675, + "grad_norm": 1.0674742460250854, + "learning_rate": 2.5093281985451433e-05, + "loss": 0.3322, + "step": 19265 + }, + { + "epoch": 24.73170731707317, + "grad_norm": 1.9088510274887085, + "learning_rate": 2.50928540864356e-05, + "loss": 0.3411, + "step": 19266 + }, + { + "epoch": 24.732991014120667, + "grad_norm": 2.1643638610839844, + "learning_rate": 2.509242618741977e-05, + "loss": 0.3388, + "step": 19267 + }, + { + "epoch": 24.734274711168165, + "grad_norm": 4.144774913787842, + "learning_rate": 
2.509199828840394e-05, + "loss": 0.385, + "step": 19268 + }, + { + "epoch": 24.73555840821566, + "grad_norm": 1.1307318210601807, + "learning_rate": 2.5091570389388107e-05, + "loss": 0.4015, + "step": 19269 + }, + { + "epoch": 24.736842105263158, + "grad_norm": 1.1613926887512207, + "learning_rate": 2.5091142490372272e-05, + "loss": 0.369, + "step": 19270 + }, + { + "epoch": 24.738125802310655, + "grad_norm": 0.8585028648376465, + "learning_rate": 2.509071459135644e-05, + "loss": 0.358, + "step": 19271 + }, + { + "epoch": 24.739409499358153, + "grad_norm": 3.2163524627685547, + "learning_rate": 2.509028669234061e-05, + "loss": 0.3786, + "step": 19272 + }, + { + "epoch": 24.740693196405648, + "grad_norm": 0.9708927273750305, + "learning_rate": 2.5089858793324777e-05, + "loss": 0.3512, + "step": 19273 + }, + { + "epoch": 24.741976893453145, + "grad_norm": 1.956119179725647, + "learning_rate": 2.5089430894308942e-05, + "loss": 0.3609, + "step": 19274 + }, + { + "epoch": 24.743260590500643, + "grad_norm": 1.1136884689331055, + "learning_rate": 2.508900299529311e-05, + "loss": 0.4004, + "step": 19275 + }, + { + "epoch": 24.744544287548138, + "grad_norm": 0.8419523239135742, + "learning_rate": 2.508857509627728e-05, + "loss": 0.3447, + "step": 19276 + }, + { + "epoch": 24.745827984595635, + "grad_norm": 0.9587979316711426, + "learning_rate": 2.5088147197261447e-05, + "loss": 0.3964, + "step": 19277 + }, + { + "epoch": 24.747111681643133, + "grad_norm": 1.2522685527801514, + "learning_rate": 2.5087719298245616e-05, + "loss": 0.3812, + "step": 19278 + }, + { + "epoch": 24.748395378690628, + "grad_norm": 1.942284345626831, + "learning_rate": 2.508729139922978e-05, + "loss": 0.3588, + "step": 19279 + }, + { + "epoch": 24.749679075738126, + "grad_norm": 1.405226230621338, + "learning_rate": 2.5086863500213953e-05, + "loss": 0.3595, + "step": 19280 + }, + { + "epoch": 24.750962772785623, + "grad_norm": 2.0057060718536377, + "learning_rate": 2.5086435601198118e-05, + "loss": 
0.3872, + "step": 19281 + }, + { + "epoch": 24.752246469833118, + "grad_norm": 1.9391887187957764, + "learning_rate": 2.5086007702182283e-05, + "loss": 0.3491, + "step": 19282 + }, + { + "epoch": 24.753530166880616, + "grad_norm": 1.7119479179382324, + "learning_rate": 2.5085579803166454e-05, + "loss": 0.4023, + "step": 19283 + }, + { + "epoch": 24.754813863928113, + "grad_norm": 2.93296480178833, + "learning_rate": 2.508515190415062e-05, + "loss": 0.4072, + "step": 19284 + }, + { + "epoch": 24.75609756097561, + "grad_norm": 1.7350775003433228, + "learning_rate": 2.508472400513479e-05, + "loss": 0.3914, + "step": 19285 + }, + { + "epoch": 24.757381258023106, + "grad_norm": 1.1515060663223267, + "learning_rate": 2.5084296106118956e-05, + "loss": 0.3997, + "step": 19286 + }, + { + "epoch": 24.758664955070603, + "grad_norm": 2.416008234024048, + "learning_rate": 2.5083868207103125e-05, + "loss": 0.4182, + "step": 19287 + }, + { + "epoch": 24.7599486521181, + "grad_norm": 1.2745388746261597, + "learning_rate": 2.5083440308087293e-05, + "loss": 0.4008, + "step": 19288 + }, + { + "epoch": 24.761232349165596, + "grad_norm": 1.2322394847869873, + "learning_rate": 2.5083012409071458e-05, + "loss": 0.3986, + "step": 19289 + }, + { + "epoch": 24.762516046213094, + "grad_norm": 3.020662784576416, + "learning_rate": 2.5082584510055626e-05, + "loss": 0.4077, + "step": 19290 + }, + { + "epoch": 24.76379974326059, + "grad_norm": 3.1122186183929443, + "learning_rate": 2.5082156611039795e-05, + "loss": 0.3847, + "step": 19291 + }, + { + "epoch": 24.765083440308086, + "grad_norm": 3.070023775100708, + "learning_rate": 2.5081728712023963e-05, + "loss": 0.4172, + "step": 19292 + }, + { + "epoch": 24.766367137355584, + "grad_norm": 1.8503413200378418, + "learning_rate": 2.5081300813008132e-05, + "loss": 0.4525, + "step": 19293 + }, + { + "epoch": 24.76765083440308, + "grad_norm": 1.0906436443328857, + "learning_rate": 2.50808729139923e-05, + "loss": 0.4378, + "step": 19294 + }, + { + 
"epoch": 24.76893453145058, + "grad_norm": 1.834233045578003, + "learning_rate": 2.5080445014976465e-05, + "loss": 0.5279, + "step": 19295 + }, + { + "epoch": 24.770218228498074, + "grad_norm": 2.2007675170898438, + "learning_rate": 2.5080017115960634e-05, + "loss": 0.6207, + "step": 19296 + }, + { + "epoch": 24.77150192554557, + "grad_norm": 0.8292554020881653, + "learning_rate": 2.5079589216944802e-05, + "loss": 0.359, + "step": 19297 + }, + { + "epoch": 24.77278562259307, + "grad_norm": 0.8681576251983643, + "learning_rate": 2.5079161317928967e-05, + "loss": 0.3509, + "step": 19298 + }, + { + "epoch": 24.774069319640564, + "grad_norm": 0.671276330947876, + "learning_rate": 2.507873341891314e-05, + "loss": 0.3293, + "step": 19299 + }, + { + "epoch": 24.77535301668806, + "grad_norm": 11.295531272888184, + "learning_rate": 2.5078305519897304e-05, + "loss": 0.3946, + "step": 19300 + }, + { + "epoch": 24.77663671373556, + "grad_norm": 2.182105779647827, + "learning_rate": 2.5077877620881476e-05, + "loss": 0.3596, + "step": 19301 + }, + { + "epoch": 24.777920410783054, + "grad_norm": 1.9131258726119995, + "learning_rate": 2.507744972186564e-05, + "loss": 0.3667, + "step": 19302 + }, + { + "epoch": 24.77920410783055, + "grad_norm": 1.761521339416504, + "learning_rate": 2.5077021822849806e-05, + "loss": 0.4084, + "step": 19303 + }, + { + "epoch": 24.78048780487805, + "grad_norm": 0.9591111540794373, + "learning_rate": 2.5076593923833977e-05, + "loss": 0.3515, + "step": 19304 + }, + { + "epoch": 24.781771501925547, + "grad_norm": 0.9662957191467285, + "learning_rate": 2.5076166024818142e-05, + "loss": 0.3934, + "step": 19305 + }, + { + "epoch": 24.78305519897304, + "grad_norm": 1.3644757270812988, + "learning_rate": 2.507573812580231e-05, + "loss": 0.3855, + "step": 19306 + }, + { + "epoch": 24.78433889602054, + "grad_norm": 1.113578200340271, + "learning_rate": 2.507531022678648e-05, + "loss": 0.3783, + "step": 19307 + }, + { + "epoch": 24.785622593068037, + 
"grad_norm": 1.237287998199463, + "learning_rate": 2.5074882327770648e-05, + "loss": 0.3555, + "step": 19308 + }, + { + "epoch": 24.78690629011553, + "grad_norm": 1.564069390296936, + "learning_rate": 2.5074454428754816e-05, + "loss": 0.3846, + "step": 19309 + }, + { + "epoch": 24.78818998716303, + "grad_norm": 1.0528963804244995, + "learning_rate": 2.507402652973898e-05, + "loss": 0.3787, + "step": 19310 + }, + { + "epoch": 24.789473684210527, + "grad_norm": 1.6316550970077515, + "learning_rate": 2.507359863072315e-05, + "loss": 0.3763, + "step": 19311 + }, + { + "epoch": 24.79075738125802, + "grad_norm": 1.0733646154403687, + "learning_rate": 2.5073170731707318e-05, + "loss": 0.3698, + "step": 19312 + }, + { + "epoch": 24.79204107830552, + "grad_norm": 1.1082855463027954, + "learning_rate": 2.5072742832691486e-05, + "loss": 0.3448, + "step": 19313 + }, + { + "epoch": 24.793324775353017, + "grad_norm": 1.2617580890655518, + "learning_rate": 2.507231493367565e-05, + "loss": 0.3322, + "step": 19314 + }, + { + "epoch": 24.794608472400512, + "grad_norm": 1.1014986038208008, + "learning_rate": 2.5071887034659823e-05, + "loss": 0.3531, + "step": 19315 + }, + { + "epoch": 24.79589216944801, + "grad_norm": 1.1159075498580933, + "learning_rate": 2.5071459135643988e-05, + "loss": 0.3709, + "step": 19316 + }, + { + "epoch": 24.797175866495508, + "grad_norm": 1.3739038705825806, + "learning_rate": 2.5071031236628157e-05, + "loss": 0.3915, + "step": 19317 + }, + { + "epoch": 24.798459563543005, + "grad_norm": 2.029313564300537, + "learning_rate": 2.5070603337612325e-05, + "loss": 0.3465, + "step": 19318 + }, + { + "epoch": 24.7997432605905, + "grad_norm": 1.8849027156829834, + "learning_rate": 2.507017543859649e-05, + "loss": 0.3426, + "step": 19319 + }, + { + "epoch": 24.801026957637998, + "grad_norm": 0.846427321434021, + "learning_rate": 2.5069747539580662e-05, + "loss": 0.3527, + "step": 19320 + }, + { + "epoch": 24.802310654685495, + "grad_norm": 1.5697650909423828, + 
"learning_rate": 2.5069319640564827e-05, + "loss": 0.3613, + "step": 19321 + }, + { + "epoch": 24.80359435173299, + "grad_norm": 1.367322325706482, + "learning_rate": 2.5068891741548995e-05, + "loss": 0.3719, + "step": 19322 + }, + { + "epoch": 24.804878048780488, + "grad_norm": 1.8110421895980835, + "learning_rate": 2.5068463842533164e-05, + "loss": 0.3838, + "step": 19323 + }, + { + "epoch": 24.806161745827985, + "grad_norm": 1.0076795816421509, + "learning_rate": 2.506803594351733e-05, + "loss": 0.3572, + "step": 19324 + }, + { + "epoch": 24.80744544287548, + "grad_norm": 1.0083171129226685, + "learning_rate": 2.50676080445015e-05, + "loss": 0.3553, + "step": 19325 + }, + { + "epoch": 24.808729139922978, + "grad_norm": 1.6160260438919067, + "learning_rate": 2.5067180145485666e-05, + "loss": 0.3922, + "step": 19326 + }, + { + "epoch": 24.810012836970476, + "grad_norm": 1.4533827304840088, + "learning_rate": 2.5066752246469834e-05, + "loss": 0.3875, + "step": 19327 + }, + { + "epoch": 24.811296534017973, + "grad_norm": 2.8790969848632812, + "learning_rate": 2.5066324347454002e-05, + "loss": 0.4034, + "step": 19328 + }, + { + "epoch": 24.812580231065468, + "grad_norm": 1.5251432657241821, + "learning_rate": 2.506589644843817e-05, + "loss": 0.4097, + "step": 19329 + }, + { + "epoch": 24.813863928112966, + "grad_norm": 2.3319106101989746, + "learning_rate": 2.5065468549422336e-05, + "loss": 0.3547, + "step": 19330 + }, + { + "epoch": 24.815147625160463, + "grad_norm": 1.7642560005187988, + "learning_rate": 2.5065040650406504e-05, + "loss": 0.3857, + "step": 19331 + }, + { + "epoch": 24.816431322207958, + "grad_norm": 1.0624563694000244, + "learning_rate": 2.5064612751390673e-05, + "loss": 0.3766, + "step": 19332 + }, + { + "epoch": 24.817715019255456, + "grad_norm": 1.1726539134979248, + "learning_rate": 2.506418485237484e-05, + "loss": 0.3654, + "step": 19333 + }, + { + "epoch": 24.818998716302954, + "grad_norm": 1.8145298957824707, + "learning_rate": 
2.506375695335901e-05, + "loss": 0.4283, + "step": 19334 + }, + { + "epoch": 24.820282413350448, + "grad_norm": 1.234933614730835, + "learning_rate": 2.5063329054343175e-05, + "loss": 0.3572, + "step": 19335 + }, + { + "epoch": 24.821566110397946, + "grad_norm": 1.4633457660675049, + "learning_rate": 2.5062901155327343e-05, + "loss": 0.3409, + "step": 19336 + }, + { + "epoch": 24.822849807445444, + "grad_norm": 2.1058056354522705, + "learning_rate": 2.506247325631151e-05, + "loss": 0.3893, + "step": 19337 + }, + { + "epoch": 24.82413350449294, + "grad_norm": 1.0118579864501953, + "learning_rate": 2.5062045357295676e-05, + "loss": 0.4339, + "step": 19338 + }, + { + "epoch": 24.825417201540436, + "grad_norm": 1.9137790203094482, + "learning_rate": 2.5061617458279848e-05, + "loss": 0.4041, + "step": 19339 + }, + { + "epoch": 24.826700898587934, + "grad_norm": 1.1893881559371948, + "learning_rate": 2.5061189559264013e-05, + "loss": 0.3627, + "step": 19340 + }, + { + "epoch": 24.82798459563543, + "grad_norm": 1.5788075923919678, + "learning_rate": 2.5060761660248185e-05, + "loss": 0.3874, + "step": 19341 + }, + { + "epoch": 24.829268292682926, + "grad_norm": 2.405355215072632, + "learning_rate": 2.506033376123235e-05, + "loss": 0.4372, + "step": 19342 + }, + { + "epoch": 24.830551989730424, + "grad_norm": 1.4179645776748657, + "learning_rate": 2.5059905862216515e-05, + "loss": 0.4556, + "step": 19343 + }, + { + "epoch": 24.83183568677792, + "grad_norm": 1.6859139204025269, + "learning_rate": 2.5059477963200687e-05, + "loss": 0.3648, + "step": 19344 + }, + { + "epoch": 24.833119383825416, + "grad_norm": 1.7353296279907227, + "learning_rate": 2.5059050064184852e-05, + "loss": 0.4512, + "step": 19345 + }, + { + "epoch": 24.834403080872914, + "grad_norm": 4.208488941192627, + "learning_rate": 2.505862216516902e-05, + "loss": 0.598, + "step": 19346 + }, + { + "epoch": 24.83568677792041, + "grad_norm": 1.1140894889831543, + "learning_rate": 2.505819426615319e-05, + "loss": 
0.3497, + "step": 19347 + }, + { + "epoch": 24.836970474967906, + "grad_norm": 2.0502142906188965, + "learning_rate": 2.5057766367137357e-05, + "loss": 0.3218, + "step": 19348 + }, + { + "epoch": 24.838254172015404, + "grad_norm": 1.114441990852356, + "learning_rate": 2.5057338468121522e-05, + "loss": 0.3513, + "step": 19349 + }, + { + "epoch": 24.8395378690629, + "grad_norm": 0.9690033197402954, + "learning_rate": 2.505691056910569e-05, + "loss": 0.3615, + "step": 19350 + }, + { + "epoch": 24.8408215661104, + "grad_norm": 1.1911587715148926, + "learning_rate": 2.505648267008986e-05, + "loss": 0.3396, + "step": 19351 + }, + { + "epoch": 24.842105263157894, + "grad_norm": 1.1696275472640991, + "learning_rate": 2.5056054771074027e-05, + "loss": 0.3329, + "step": 19352 + }, + { + "epoch": 24.84338896020539, + "grad_norm": 0.8870454430580139, + "learning_rate": 2.5055626872058196e-05, + "loss": 0.3276, + "step": 19353 + }, + { + "epoch": 24.84467265725289, + "grad_norm": 1.3432289361953735, + "learning_rate": 2.505519897304236e-05, + "loss": 0.385, + "step": 19354 + }, + { + "epoch": 24.845956354300384, + "grad_norm": 1.2711082696914673, + "learning_rate": 2.5054771074026533e-05, + "loss": 0.3614, + "step": 19355 + }, + { + "epoch": 24.84724005134788, + "grad_norm": 0.8189542293548584, + "learning_rate": 2.5054343175010698e-05, + "loss": 0.3573, + "step": 19356 + }, + { + "epoch": 24.84852374839538, + "grad_norm": 1.9892404079437256, + "learning_rate": 2.5053915275994863e-05, + "loss": 0.3494, + "step": 19357 + }, + { + "epoch": 24.849807445442874, + "grad_norm": 1.054512619972229, + "learning_rate": 2.5053487376979034e-05, + "loss": 0.3823, + "step": 19358 + }, + { + "epoch": 24.85109114249037, + "grad_norm": 0.889880359172821, + "learning_rate": 2.50530594779632e-05, + "loss": 0.3621, + "step": 19359 + }, + { + "epoch": 24.85237483953787, + "grad_norm": 0.9658461809158325, + "learning_rate": 2.505263157894737e-05, + "loss": 0.3794, + "step": 19360 + }, + { + "epoch": 
24.853658536585368, + "grad_norm": 0.9562357664108276, + "learning_rate": 2.5052203679931536e-05, + "loss": 0.3386, + "step": 19361 + }, + { + "epoch": 24.854942233632862, + "grad_norm": 1.1227325201034546, + "learning_rate": 2.5051775780915705e-05, + "loss": 0.3808, + "step": 19362 + }, + { + "epoch": 24.85622593068036, + "grad_norm": 1.3068430423736572, + "learning_rate": 2.5051347881899873e-05, + "loss": 0.3753, + "step": 19363 + }, + { + "epoch": 24.857509627727858, + "grad_norm": 1.001336693763733, + "learning_rate": 2.5050919982884038e-05, + "loss": 0.343, + "step": 19364 + }, + { + "epoch": 24.858793324775352, + "grad_norm": 1.185324788093567, + "learning_rate": 2.5050492083868207e-05, + "loss": 0.3897, + "step": 19365 + }, + { + "epoch": 24.86007702182285, + "grad_norm": 0.870671808719635, + "learning_rate": 2.5050064184852375e-05, + "loss": 0.3816, + "step": 19366 + }, + { + "epoch": 24.861360718870348, + "grad_norm": 2.3741166591644287, + "learning_rate": 2.5049636285836543e-05, + "loss": 0.3481, + "step": 19367 + }, + { + "epoch": 24.862644415917842, + "grad_norm": 1.1090404987335205, + "learning_rate": 2.5049208386820712e-05, + "loss": 0.3886, + "step": 19368 + }, + { + "epoch": 24.86392811296534, + "grad_norm": 1.197148323059082, + "learning_rate": 2.504878048780488e-05, + "loss": 0.3429, + "step": 19369 + }, + { + "epoch": 24.865211810012838, + "grad_norm": 1.2100015878677368, + "learning_rate": 2.5048352588789045e-05, + "loss": 0.3407, + "step": 19370 + }, + { + "epoch": 24.866495507060336, + "grad_norm": 1.4208937883377075, + "learning_rate": 2.5047924689773214e-05, + "loss": 0.3454, + "step": 19371 + }, + { + "epoch": 24.86777920410783, + "grad_norm": 2.7938385009765625, + "learning_rate": 2.5047496790757382e-05, + "loss": 0.3607, + "step": 19372 + }, + { + "epoch": 24.869062901155328, + "grad_norm": 1.0073076486587524, + "learning_rate": 2.5047068891741547e-05, + "loss": 0.4051, + "step": 19373 + }, + { + "epoch": 24.870346598202826, + 
"grad_norm": 1.1058536767959595, + "learning_rate": 2.504664099272572e-05, + "loss": 0.3445, + "step": 19374 + }, + { + "epoch": 24.87163029525032, + "grad_norm": 0.8649793863296509, + "learning_rate": 2.5046213093709884e-05, + "loss": 0.3534, + "step": 19375 + }, + { + "epoch": 24.872913992297818, + "grad_norm": 2.100827693939209, + "learning_rate": 2.5045785194694056e-05, + "loss": 0.3642, + "step": 19376 + }, + { + "epoch": 24.874197689345316, + "grad_norm": 1.2714632749557495, + "learning_rate": 2.504535729567822e-05, + "loss": 0.3543, + "step": 19377 + }, + { + "epoch": 24.87548138639281, + "grad_norm": 0.9960393905639648, + "learning_rate": 2.5044929396662386e-05, + "loss": 0.357, + "step": 19378 + }, + { + "epoch": 24.876765083440308, + "grad_norm": 2.9220340251922607, + "learning_rate": 2.5044501497646558e-05, + "loss": 0.3543, + "step": 19379 + }, + { + "epoch": 24.878048780487806, + "grad_norm": 2.3800759315490723, + "learning_rate": 2.5044073598630723e-05, + "loss": 0.3697, + "step": 19380 + }, + { + "epoch": 24.8793324775353, + "grad_norm": 2.518632173538208, + "learning_rate": 2.504364569961489e-05, + "loss": 0.3761, + "step": 19381 + }, + { + "epoch": 24.880616174582798, + "grad_norm": 1.5219541788101196, + "learning_rate": 2.504321780059906e-05, + "loss": 0.3678, + "step": 19382 + }, + { + "epoch": 24.881899871630296, + "grad_norm": 1.1405383348464966, + "learning_rate": 2.5042789901583228e-05, + "loss": 0.3581, + "step": 19383 + }, + { + "epoch": 24.883183568677794, + "grad_norm": 1.6894820928573608, + "learning_rate": 2.5042362002567396e-05, + "loss": 0.3678, + "step": 19384 + }, + { + "epoch": 24.884467265725288, + "grad_norm": 2.6106252670288086, + "learning_rate": 2.504193410355156e-05, + "loss": 0.3894, + "step": 19385 + }, + { + "epoch": 24.885750962772786, + "grad_norm": 1.1964101791381836, + "learning_rate": 2.504150620453573e-05, + "loss": 0.3672, + "step": 19386 + }, + { + "epoch": 24.887034659820284, + "grad_norm": 1.5279972553253174, + 
"learning_rate": 2.5041078305519898e-05, + "loss": 0.4509, + "step": 19387 + }, + { + "epoch": 24.888318356867778, + "grad_norm": 1.8817484378814697, + "learning_rate": 2.5040650406504066e-05, + "loss": 0.3551, + "step": 19388 + }, + { + "epoch": 24.889602053915276, + "grad_norm": 1.911903738975525, + "learning_rate": 2.504022250748823e-05, + "loss": 0.3933, + "step": 19389 + }, + { + "epoch": 24.890885750962774, + "grad_norm": 1.8717663288116455, + "learning_rate": 2.5039794608472403e-05, + "loss": 0.3686, + "step": 19390 + }, + { + "epoch": 24.892169448010268, + "grad_norm": 3.687556743621826, + "learning_rate": 2.5039366709456568e-05, + "loss": 0.4153, + "step": 19391 + }, + { + "epoch": 24.893453145057766, + "grad_norm": 2.9268712997436523, + "learning_rate": 2.5038938810440737e-05, + "loss": 0.419, + "step": 19392 + }, + { + "epoch": 24.894736842105264, + "grad_norm": 1.7431076765060425, + "learning_rate": 2.5038510911424905e-05, + "loss": 0.471, + "step": 19393 + }, + { + "epoch": 24.89602053915276, + "grad_norm": 2.838961362838745, + "learning_rate": 2.503808301240907e-05, + "loss": 0.4604, + "step": 19394 + }, + { + "epoch": 24.897304236200256, + "grad_norm": 2.0652754306793213, + "learning_rate": 2.5037655113393242e-05, + "loss": 0.4385, + "step": 19395 + }, + { + "epoch": 24.898587933247754, + "grad_norm": 2.322929620742798, + "learning_rate": 2.5037227214377407e-05, + "loss": 0.5425, + "step": 19396 + }, + { + "epoch": 24.89987163029525, + "grad_norm": 1.0959465503692627, + "learning_rate": 2.5036799315361575e-05, + "loss": 0.3138, + "step": 19397 + }, + { + "epoch": 24.901155327342746, + "grad_norm": 1.5349321365356445, + "learning_rate": 2.5036371416345744e-05, + "loss": 0.3506, + "step": 19398 + }, + { + "epoch": 24.902439024390244, + "grad_norm": 1.0735654830932617, + "learning_rate": 2.503594351732991e-05, + "loss": 0.3349, + "step": 19399 + }, + { + "epoch": 24.90372272143774, + "grad_norm": 0.8867810368537903, + "learning_rate": 
2.503551561831408e-05, + "loss": 0.3533, + "step": 19400 + }, + { + "epoch": 24.905006418485236, + "grad_norm": 2.871272087097168, + "learning_rate": 2.5035087719298246e-05, + "loss": 0.3773, + "step": 19401 + }, + { + "epoch": 24.906290115532734, + "grad_norm": 0.9940610527992249, + "learning_rate": 2.5034659820282414e-05, + "loss": 0.3305, + "step": 19402 + }, + { + "epoch": 24.90757381258023, + "grad_norm": 2.1420557498931885, + "learning_rate": 2.5034231921266582e-05, + "loss": 0.3797, + "step": 19403 + }, + { + "epoch": 24.90885750962773, + "grad_norm": 1.7526681423187256, + "learning_rate": 2.5033804022250747e-05, + "loss": 0.3879, + "step": 19404 + }, + { + "epoch": 24.910141206675224, + "grad_norm": 1.4685298204421997, + "learning_rate": 2.5033376123234916e-05, + "loss": 0.3949, + "step": 19405 + }, + { + "epoch": 24.911424903722722, + "grad_norm": 1.1910954713821411, + "learning_rate": 2.5032948224219084e-05, + "loss": 0.3689, + "step": 19406 + }, + { + "epoch": 24.91270860077022, + "grad_norm": 1.0158421993255615, + "learning_rate": 2.5032520325203253e-05, + "loss": 0.3447, + "step": 19407 + }, + { + "epoch": 24.913992297817714, + "grad_norm": 1.4898210763931274, + "learning_rate": 2.503209242618742e-05, + "loss": 0.3474, + "step": 19408 + }, + { + "epoch": 24.915275994865212, + "grad_norm": 1.0729751586914062, + "learning_rate": 2.503166452717159e-05, + "loss": 0.3893, + "step": 19409 + }, + { + "epoch": 24.91655969191271, + "grad_norm": 1.9182095527648926, + "learning_rate": 2.5031236628155755e-05, + "loss": 0.3611, + "step": 19410 + }, + { + "epoch": 24.917843388960204, + "grad_norm": 1.1850450038909912, + "learning_rate": 2.5030808729139923e-05, + "loss": 0.3421, + "step": 19411 + }, + { + "epoch": 24.919127086007702, + "grad_norm": 1.1712844371795654, + "learning_rate": 2.503038083012409e-05, + "loss": 0.3594, + "step": 19412 + }, + { + "epoch": 24.9204107830552, + "grad_norm": 2.5825881958007812, + "learning_rate": 2.5029952931108256e-05, + "loss": 
0.3609, + "step": 19413 + }, + { + "epoch": 24.921694480102694, + "grad_norm": 1.8003225326538086, + "learning_rate": 2.5029525032092428e-05, + "loss": 0.3434, + "step": 19414 + }, + { + "epoch": 24.922978177150192, + "grad_norm": 1.1360234022140503, + "learning_rate": 2.5029097133076593e-05, + "loss": 0.3734, + "step": 19415 + }, + { + "epoch": 24.92426187419769, + "grad_norm": 1.819342851638794, + "learning_rate": 2.5028669234060765e-05, + "loss": 0.3751, + "step": 19416 + }, + { + "epoch": 24.925545571245188, + "grad_norm": 1.3727821111679077, + "learning_rate": 2.502824133504493e-05, + "loss": 0.3701, + "step": 19417 + }, + { + "epoch": 24.926829268292682, + "grad_norm": 0.9690228700637817, + "learning_rate": 2.5027813436029095e-05, + "loss": 0.3703, + "step": 19418 + }, + { + "epoch": 24.92811296534018, + "grad_norm": 0.8085091710090637, + "learning_rate": 2.5027385537013267e-05, + "loss": 0.3616, + "step": 19419 + }, + { + "epoch": 24.929396662387678, + "grad_norm": 1.831204891204834, + "learning_rate": 2.5026957637997432e-05, + "loss": 0.3438, + "step": 19420 + }, + { + "epoch": 24.930680359435172, + "grad_norm": 1.1705724000930786, + "learning_rate": 2.50265297389816e-05, + "loss": 0.4203, + "step": 19421 + }, + { + "epoch": 24.93196405648267, + "grad_norm": 1.7053289413452148, + "learning_rate": 2.502610183996577e-05, + "loss": 0.3655, + "step": 19422 + }, + { + "epoch": 24.933247753530168, + "grad_norm": 1.124563455581665, + "learning_rate": 2.5025673940949937e-05, + "loss": 0.3732, + "step": 19423 + }, + { + "epoch": 24.934531450577662, + "grad_norm": 1.0826568603515625, + "learning_rate": 2.5025246041934106e-05, + "loss": 0.3472, + "step": 19424 + }, + { + "epoch": 24.93581514762516, + "grad_norm": 1.0358012914657593, + "learning_rate": 2.502481814291827e-05, + "loss": 0.3414, + "step": 19425 + }, + { + "epoch": 24.937098844672658, + "grad_norm": 1.6882961988449097, + "learning_rate": 2.502439024390244e-05, + "loss": 0.3645, + "step": 19426 + }, + { + 
"epoch": 24.938382541720156, + "grad_norm": 5.872500896453857, + "learning_rate": 2.5023962344886607e-05, + "loss": 0.3212, + "step": 19427 + }, + { + "epoch": 24.93966623876765, + "grad_norm": 1.1832873821258545, + "learning_rate": 2.5023534445870776e-05, + "loss": 0.3844, + "step": 19428 + }, + { + "epoch": 24.940949935815148, + "grad_norm": 1.8704336881637573, + "learning_rate": 2.502310654685494e-05, + "loss": 0.3839, + "step": 19429 + }, + { + "epoch": 24.942233632862646, + "grad_norm": 1.451822280883789, + "learning_rate": 2.5022678647839113e-05, + "loss": 0.4159, + "step": 19430 + }, + { + "epoch": 24.94351732991014, + "grad_norm": 1.2180403470993042, + "learning_rate": 2.5022250748823278e-05, + "loss": 0.3422, + "step": 19431 + }, + { + "epoch": 24.944801026957638, + "grad_norm": 1.28790283203125, + "learning_rate": 2.5021822849807446e-05, + "loss": 0.3859, + "step": 19432 + }, + { + "epoch": 24.946084724005136, + "grad_norm": 1.3687363862991333, + "learning_rate": 2.5021394950791614e-05, + "loss": 0.3937, + "step": 19433 + }, + { + "epoch": 24.94736842105263, + "grad_norm": 1.5966867208480835, + "learning_rate": 2.502096705177578e-05, + "loss": 0.3679, + "step": 19434 + }, + { + "epoch": 24.948652118100128, + "grad_norm": 1.2798763513565063, + "learning_rate": 2.502053915275995e-05, + "loss": 0.4215, + "step": 19435 + }, + { + "epoch": 24.949935815147626, + "grad_norm": 6.146673679351807, + "learning_rate": 2.5020111253744116e-05, + "loss": 0.3935, + "step": 19436 + }, + { + "epoch": 24.951219512195124, + "grad_norm": 2.2638862133026123, + "learning_rate": 2.5019683354728285e-05, + "loss": 0.3711, + "step": 19437 + }, + { + "epoch": 24.952503209242618, + "grad_norm": 1.287667155265808, + "learning_rate": 2.5019255455712453e-05, + "loss": 0.4118, + "step": 19438 + }, + { + "epoch": 24.953786906290116, + "grad_norm": 1.8411328792572021, + "learning_rate": 2.5018827556696618e-05, + "loss": 0.4096, + "step": 19439 + }, + { + "epoch": 24.955070603337614, + 
"grad_norm": 1.0499598979949951, + "learning_rate": 2.501839965768079e-05, + "loss": 0.3622, + "step": 19440 + }, + { + "epoch": 24.956354300385108, + "grad_norm": 3.7071309089660645, + "learning_rate": 2.5017971758664955e-05, + "loss": 0.3948, + "step": 19441 + }, + { + "epoch": 24.957637997432606, + "grad_norm": 1.3765792846679688, + "learning_rate": 2.5017543859649123e-05, + "loss": 0.3876, + "step": 19442 + }, + { + "epoch": 24.958921694480104, + "grad_norm": 12.873449325561523, + "learning_rate": 2.5017115960633292e-05, + "loss": 0.4349, + "step": 19443 + }, + { + "epoch": 24.960205391527598, + "grad_norm": 2.6799001693725586, + "learning_rate": 2.501668806161746e-05, + "loss": 0.4777, + "step": 19444 + }, + { + "epoch": 24.961489088575096, + "grad_norm": 1.9788316488265991, + "learning_rate": 2.5016260162601625e-05, + "loss": 0.4348, + "step": 19445 + }, + { + "epoch": 24.962772785622594, + "grad_norm": 2.6838223934173584, + "learning_rate": 2.5015832263585794e-05, + "loss": 0.5638, + "step": 19446 + }, + { + "epoch": 24.964056482670088, + "grad_norm": 1.1453183889389038, + "learning_rate": 2.5015404364569962e-05, + "loss": 0.3509, + "step": 19447 + }, + { + "epoch": 24.965340179717586, + "grad_norm": 1.1598273515701294, + "learning_rate": 2.501497646555413e-05, + "loss": 0.3684, + "step": 19448 + }, + { + "epoch": 24.966623876765084, + "grad_norm": 1.1705290079116821, + "learning_rate": 2.50145485665383e-05, + "loss": 0.3204, + "step": 19449 + }, + { + "epoch": 24.96790757381258, + "grad_norm": 1.1804323196411133, + "learning_rate": 2.5014120667522464e-05, + "loss": 0.3536, + "step": 19450 + }, + { + "epoch": 24.969191270860076, + "grad_norm": 1.3482600450515747, + "learning_rate": 2.5013692768506636e-05, + "loss": 0.3342, + "step": 19451 + }, + { + "epoch": 24.970474967907574, + "grad_norm": 0.8506467342376709, + "learning_rate": 2.50132648694908e-05, + "loss": 0.3654, + "step": 19452 + }, + { + "epoch": 24.971758664955072, + "grad_norm": 
1.0139840841293335, + "learning_rate": 2.5012836970474966e-05, + "loss": 0.3618, + "step": 19453 + }, + { + "epoch": 24.973042362002566, + "grad_norm": 1.2944477796554565, + "learning_rate": 2.5012409071459138e-05, + "loss": 0.3307, + "step": 19454 + }, + { + "epoch": 24.974326059050064, + "grad_norm": 1.5793893337249756, + "learning_rate": 2.5011981172443303e-05, + "loss": 0.3859, + "step": 19455 + }, + { + "epoch": 24.975609756097562, + "grad_norm": 2.302180051803589, + "learning_rate": 2.5011553273427474e-05, + "loss": 0.3668, + "step": 19456 + }, + { + "epoch": 24.976893453145056, + "grad_norm": 1.361707329750061, + "learning_rate": 2.501112537441164e-05, + "loss": 0.3611, + "step": 19457 + }, + { + "epoch": 24.978177150192554, + "grad_norm": 0.9450500011444092, + "learning_rate": 2.5010697475395808e-05, + "loss": 0.3504, + "step": 19458 + }, + { + "epoch": 24.979460847240052, + "grad_norm": 1.5550987720489502, + "learning_rate": 2.5010269576379976e-05, + "loss": 0.3841, + "step": 19459 + }, + { + "epoch": 24.98074454428755, + "grad_norm": 8.806633949279785, + "learning_rate": 2.500984167736414e-05, + "loss": 0.3404, + "step": 19460 + }, + { + "epoch": 24.982028241335044, + "grad_norm": 1.384181261062622, + "learning_rate": 2.500941377834831e-05, + "loss": 0.3886, + "step": 19461 + }, + { + "epoch": 24.983311938382542, + "grad_norm": 0.980351984500885, + "learning_rate": 2.5008985879332478e-05, + "loss": 0.3257, + "step": 19462 + }, + { + "epoch": 24.98459563543004, + "grad_norm": 1.1128826141357422, + "learning_rate": 2.5008557980316647e-05, + "loss": 0.382, + "step": 19463 + }, + { + "epoch": 24.985879332477534, + "grad_norm": 1.2381949424743652, + "learning_rate": 2.5008130081300815e-05, + "loss": 0.3882, + "step": 19464 + }, + { + "epoch": 24.987163029525032, + "grad_norm": 1.6450177431106567, + "learning_rate": 2.500770218228498e-05, + "loss": 0.3732, + "step": 19465 + }, + { + "epoch": 24.98844672657253, + "grad_norm": 1.500456690788269, + 
"learning_rate": 2.500727428326915e-05, + "loss": 0.4113, + "step": 19466 + }, + { + "epoch": 24.989730423620024, + "grad_norm": 1.2608540058135986, + "learning_rate": 2.5006846384253317e-05, + "loss": 0.3888, + "step": 19467 + }, + { + "epoch": 24.991014120667522, + "grad_norm": 1.4906052350997925, + "learning_rate": 2.5006418485237485e-05, + "loss": 0.425, + "step": 19468 + }, + { + "epoch": 24.99229781771502, + "grad_norm": 2.0087695121765137, + "learning_rate": 2.500599058622165e-05, + "loss": 0.3556, + "step": 19469 + }, + { + "epoch": 24.993581514762518, + "grad_norm": 1.4770101308822632, + "learning_rate": 2.5005562687205822e-05, + "loss": 0.385, + "step": 19470 + }, + { + "epoch": 24.994865211810012, + "grad_norm": 2.111567974090576, + "learning_rate": 2.5005134788189987e-05, + "loss": 0.4278, + "step": 19471 + }, + { + "epoch": 24.99614890885751, + "grad_norm": 2.349965810775757, + "learning_rate": 2.5004706889174155e-05, + "loss": 0.437, + "step": 19472 + }, + { + "epoch": 24.997432605905008, + "grad_norm": 1.6129508018493652, + "learning_rate": 2.5004278990158324e-05, + "loss": 0.4228, + "step": 19473 + }, + { + "epoch": 24.998716302952502, + "grad_norm": 1.4464530944824219, + "learning_rate": 2.500385109114249e-05, + "loss": 0.4533, + "step": 19474 + }, + { + "epoch": 25.0, + "grad_norm": 2.3314881324768066, + "learning_rate": 2.500342319212666e-05, + "loss": 0.5117, + "step": 19475 + }, + { + "epoch": 25.001283697047498, + "grad_norm": 1.3157215118408203, + "learning_rate": 2.5002995293110826e-05, + "loss": 0.3122, + "step": 19476 + }, + { + "epoch": 25.002567394094992, + "grad_norm": 1.518775463104248, + "learning_rate": 2.5002567394094994e-05, + "loss": 0.3461, + "step": 19477 + }, + { + "epoch": 25.00385109114249, + "grad_norm": 0.8935828804969788, + "learning_rate": 2.5002139495079163e-05, + "loss": 0.333, + "step": 19478 + }, + { + "epoch": 25.005134788189988, + "grad_norm": 3.1510932445526123, + "learning_rate": 2.5001711596063328e-05, + "loss": 
0.3504, + "step": 19479 + }, + { + "epoch": 25.006418485237482, + "grad_norm": 1.2812944650650024, + "learning_rate": 2.50012836970475e-05, + "loss": 0.3394, + "step": 19480 + }, + { + "epoch": 25.00770218228498, + "grad_norm": 0.7974109649658203, + "learning_rate": 2.5000855798031664e-05, + "loss": 0.3361, + "step": 19481 + }, + { + "epoch": 25.008985879332478, + "grad_norm": 0.7501029968261719, + "learning_rate": 2.5000427899015833e-05, + "loss": 0.3402, + "step": 19482 + }, + { + "epoch": 25.010269576379976, + "grad_norm": 1.1023433208465576, + "learning_rate": 2.5e-05, + "loss": 0.3337, + "step": 19483 + }, + { + "epoch": 25.01155327342747, + "grad_norm": 1.167704701423645, + "learning_rate": 2.499957210098417e-05, + "loss": 0.3387, + "step": 19484 + }, + { + "epoch": 25.012836970474968, + "grad_norm": 0.8570122122764587, + "learning_rate": 2.4999144201968335e-05, + "loss": 0.366, + "step": 19485 + }, + { + "epoch": 25.014120667522466, + "grad_norm": 3.01073956489563, + "learning_rate": 2.4998716302952503e-05, + "loss": 0.3535, + "step": 19486 + }, + { + "epoch": 25.01540436456996, + "grad_norm": 0.8437581062316895, + "learning_rate": 2.499828840393667e-05, + "loss": 0.3403, + "step": 19487 + }, + { + "epoch": 25.016688061617458, + "grad_norm": 1.0796124935150146, + "learning_rate": 2.499786050492084e-05, + "loss": 0.3286, + "step": 19488 + }, + { + "epoch": 25.017971758664956, + "grad_norm": 1.6226909160614014, + "learning_rate": 2.4997432605905008e-05, + "loss": 0.3076, + "step": 19489 + }, + { + "epoch": 25.01925545571245, + "grad_norm": 0.9331632256507874, + "learning_rate": 2.4997004706889173e-05, + "loss": 0.3563, + "step": 19490 + }, + { + "epoch": 25.020539152759948, + "grad_norm": 1.5245864391326904, + "learning_rate": 2.4996576807873345e-05, + "loss": 0.3288, + "step": 19491 + }, + { + "epoch": 25.021822849807446, + "grad_norm": 1.038705825805664, + "learning_rate": 2.499614890885751e-05, + "loss": 0.3571, + "step": 19492 + }, + { + "epoch": 
25.023106546854944, + "grad_norm": 0.9760786890983582, + "learning_rate": 2.4995721009841675e-05, + "loss": 0.3179, + "step": 19493 + }, + { + "epoch": 25.024390243902438, + "grad_norm": 1.0980323553085327, + "learning_rate": 2.4995293110825847e-05, + "loss": 0.3212, + "step": 19494 + }, + { + "epoch": 25.025673940949936, + "grad_norm": 1.8694449663162231, + "learning_rate": 2.4994865211810012e-05, + "loss": 0.3666, + "step": 19495 + }, + { + "epoch": 25.026957637997434, + "grad_norm": 1.0962250232696533, + "learning_rate": 2.4994437312794184e-05, + "loss": 0.3496, + "step": 19496 + }, + { + "epoch": 25.028241335044928, + "grad_norm": 2.607191801071167, + "learning_rate": 2.499400941377835e-05, + "loss": 0.3416, + "step": 19497 + }, + { + "epoch": 25.029525032092426, + "grad_norm": 1.0932179689407349, + "learning_rate": 2.4993581514762517e-05, + "loss": 0.3462, + "step": 19498 + }, + { + "epoch": 25.030808729139924, + "grad_norm": 1.0578553676605225, + "learning_rate": 2.4993153615746686e-05, + "loss": 0.3121, + "step": 19499 + }, + { + "epoch": 25.03209242618742, + "grad_norm": 1.4707731008529663, + "learning_rate": 2.499272571673085e-05, + "loss": 0.3333, + "step": 19500 + }, + { + "epoch": 25.033376123234916, + "grad_norm": 0.946855366230011, + "learning_rate": 2.499229781771502e-05, + "loss": 0.3455, + "step": 19501 + }, + { + "epoch": 25.034659820282414, + "grad_norm": 1.0232905149459839, + "learning_rate": 2.4991869918699187e-05, + "loss": 0.3272, + "step": 19502 + }, + { + "epoch": 25.035943517329912, + "grad_norm": 0.9677678942680359, + "learning_rate": 2.4991442019683356e-05, + "loss": 0.3235, + "step": 19503 + }, + { + "epoch": 25.037227214377406, + "grad_norm": 0.8285080790519714, + "learning_rate": 2.4991014120667524e-05, + "loss": 0.3305, + "step": 19504 + }, + { + "epoch": 25.038510911424904, + "grad_norm": 1.1862460374832153, + "learning_rate": 2.4990586221651693e-05, + "loss": 0.3317, + "step": 19505 + }, + { + "epoch": 25.039794608472402, + 
"grad_norm": 1.4185622930526733, + "learning_rate": 2.4990158322635858e-05, + "loss": 0.3328, + "step": 19506 + }, + { + "epoch": 25.041078305519896, + "grad_norm": 0.9920966029167175, + "learning_rate": 2.4989730423620026e-05, + "loss": 0.3391, + "step": 19507 + }, + { + "epoch": 25.042362002567394, + "grad_norm": 2.350282907485962, + "learning_rate": 2.4989302524604195e-05, + "loss": 0.3658, + "step": 19508 + }, + { + "epoch": 25.043645699614892, + "grad_norm": 1.049832820892334, + "learning_rate": 2.498887462558836e-05, + "loss": 0.3778, + "step": 19509 + }, + { + "epoch": 25.044929396662386, + "grad_norm": 2.420133113861084, + "learning_rate": 2.498844672657253e-05, + "loss": 0.3907, + "step": 19510 + }, + { + "epoch": 25.046213093709884, + "grad_norm": 1.1598713397979736, + "learning_rate": 2.4988018827556696e-05, + "loss": 0.3568, + "step": 19511 + }, + { + "epoch": 25.047496790757382, + "grad_norm": 1.4536535739898682, + "learning_rate": 2.4987590928540868e-05, + "loss": 0.3466, + "step": 19512 + }, + { + "epoch": 25.048780487804876, + "grad_norm": 1.1526174545288086, + "learning_rate": 2.4987163029525033e-05, + "loss": 0.3532, + "step": 19513 + }, + { + "epoch": 25.050064184852374, + "grad_norm": 1.8847872018814087, + "learning_rate": 2.4986735130509198e-05, + "loss": 0.3679, + "step": 19514 + }, + { + "epoch": 25.051347881899872, + "grad_norm": 1.291656255722046, + "learning_rate": 2.498630723149337e-05, + "loss": 0.3664, + "step": 19515 + }, + { + "epoch": 25.05263157894737, + "grad_norm": 1.9212839603424072, + "learning_rate": 2.4985879332477535e-05, + "loss": 0.3473, + "step": 19516 + }, + { + "epoch": 25.053915275994864, + "grad_norm": 1.8866995573043823, + "learning_rate": 2.4985451433461703e-05, + "loss": 0.3907, + "step": 19517 + }, + { + "epoch": 25.055198973042362, + "grad_norm": 1.2179710865020752, + "learning_rate": 2.4985023534445872e-05, + "loss": 0.3744, + "step": 19518 + }, + { + "epoch": 25.05648267008986, + "grad_norm": 1.7582916021347046, 
+ "learning_rate": 2.498459563543004e-05, + "loss": 0.3626, + "step": 19519 + }, + { + "epoch": 25.057766367137354, + "grad_norm": 1.5438860654830933, + "learning_rate": 2.498416773641421e-05, + "loss": 0.396, + "step": 19520 + }, + { + "epoch": 25.059050064184852, + "grad_norm": 82.70990753173828, + "learning_rate": 2.4983739837398374e-05, + "loss": 0.4262, + "step": 19521 + }, + { + "epoch": 25.06033376123235, + "grad_norm": 1.5423285961151123, + "learning_rate": 2.4983311938382542e-05, + "loss": 0.439, + "step": 19522 + }, + { + "epoch": 25.061617458279844, + "grad_norm": 1.5154917240142822, + "learning_rate": 2.498288403936671e-05, + "loss": 0.4486, + "step": 19523 + }, + { + "epoch": 25.062901155327342, + "grad_norm": 3.098176956176758, + "learning_rate": 2.498245614035088e-05, + "loss": 0.4778, + "step": 19524 + }, + { + "epoch": 25.06418485237484, + "grad_norm": 2.793001413345337, + "learning_rate": 2.4982028241335044e-05, + "loss": 0.5438, + "step": 19525 + }, + { + "epoch": 25.065468549422338, + "grad_norm": 1.137363076210022, + "learning_rate": 2.4981600342319212e-05, + "loss": 0.3335, + "step": 19526 + }, + { + "epoch": 25.066752246469832, + "grad_norm": 0.9463800191879272, + "learning_rate": 2.498117244330338e-05, + "loss": 0.3296, + "step": 19527 + }, + { + "epoch": 25.06803594351733, + "grad_norm": 0.8026967644691467, + "learning_rate": 2.498074454428755e-05, + "loss": 0.3537, + "step": 19528 + }, + { + "epoch": 25.069319640564828, + "grad_norm": 1.397131323814392, + "learning_rate": 2.4980316645271718e-05, + "loss": 0.3536, + "step": 19529 + }, + { + "epoch": 25.070603337612322, + "grad_norm": 1.2895255088806152, + "learning_rate": 2.4979888746255883e-05, + "loss": 0.334, + "step": 19530 + }, + { + "epoch": 25.07188703465982, + "grad_norm": 1.122346043586731, + "learning_rate": 2.4979460847240054e-05, + "loss": 0.3452, + "step": 19531 + }, + { + "epoch": 25.073170731707318, + "grad_norm": 1.2488819360733032, + "learning_rate": 2.497903294822422e-05, 
+ "loss": 0.2992, + "step": 19532 + }, + { + "epoch": 25.074454428754812, + "grad_norm": 1.129670262336731, + "learning_rate": 2.4978605049208385e-05, + "loss": 0.3536, + "step": 19533 + }, + { + "epoch": 25.07573812580231, + "grad_norm": 1.2485637664794922, + "learning_rate": 2.4978177150192556e-05, + "loss": 0.3159, + "step": 19534 + }, + { + "epoch": 25.077021822849808, + "grad_norm": 1.0267378091812134, + "learning_rate": 2.497774925117672e-05, + "loss": 0.3569, + "step": 19535 + }, + { + "epoch": 25.078305519897306, + "grad_norm": 1.7990736961364746, + "learning_rate": 2.4977321352160893e-05, + "loss": 0.3124, + "step": 19536 + }, + { + "epoch": 25.0795892169448, + "grad_norm": 1.1845084428787231, + "learning_rate": 2.4976893453145058e-05, + "loss": 0.3583, + "step": 19537 + }, + { + "epoch": 25.080872913992298, + "grad_norm": 0.9263444542884827, + "learning_rate": 2.4976465554129227e-05, + "loss": 0.3242, + "step": 19538 + }, + { + "epoch": 25.082156611039796, + "grad_norm": 1.306204915046692, + "learning_rate": 2.4976037655113395e-05, + "loss": 0.3464, + "step": 19539 + }, + { + "epoch": 25.08344030808729, + "grad_norm": 2.1889679431915283, + "learning_rate": 2.497560975609756e-05, + "loss": 0.3323, + "step": 19540 + }, + { + "epoch": 25.084724005134788, + "grad_norm": 1.396630883216858, + "learning_rate": 2.497518185708173e-05, + "loss": 0.3332, + "step": 19541 + }, + { + "epoch": 25.086007702182286, + "grad_norm": 2.859403371810913, + "learning_rate": 2.4974753958065897e-05, + "loss": 0.348, + "step": 19542 + }, + { + "epoch": 25.08729139922978, + "grad_norm": 0.8239622116088867, + "learning_rate": 2.4974326059050065e-05, + "loss": 0.3283, + "step": 19543 + }, + { + "epoch": 25.088575096277278, + "grad_norm": 1.864782452583313, + "learning_rate": 2.4973898160034234e-05, + "loss": 0.3331, + "step": 19544 + }, + { + "epoch": 25.089858793324776, + "grad_norm": 1.2687857151031494, + "learning_rate": 2.4973470261018402e-05, + "loss": 0.3407, + "step": 19545 + 
}, + { + "epoch": 25.09114249037227, + "grad_norm": 1.2411764860153198, + "learning_rate": 2.4973042362002567e-05, + "loss": 0.3423, + "step": 19546 + }, + { + "epoch": 25.09242618741977, + "grad_norm": 6.9557600021362305, + "learning_rate": 2.4972614462986736e-05, + "loss": 0.319, + "step": 19547 + }, + { + "epoch": 25.093709884467266, + "grad_norm": 1.0212769508361816, + "learning_rate": 2.4972186563970904e-05, + "loss": 0.3341, + "step": 19548 + }, + { + "epoch": 25.094993581514764, + "grad_norm": 0.8763512372970581, + "learning_rate": 2.497175866495507e-05, + "loss": 0.3358, + "step": 19549 + }, + { + "epoch": 25.09627727856226, + "grad_norm": 1.0240668058395386, + "learning_rate": 2.497133076593924e-05, + "loss": 0.349, + "step": 19550 + }, + { + "epoch": 25.097560975609756, + "grad_norm": 1.4359817504882812, + "learning_rate": 2.4970902866923406e-05, + "loss": 0.3548, + "step": 19551 + }, + { + "epoch": 25.098844672657254, + "grad_norm": 0.9010595679283142, + "learning_rate": 2.4970474967907574e-05, + "loss": 0.3036, + "step": 19552 + }, + { + "epoch": 25.10012836970475, + "grad_norm": 0.9196829199790955, + "learning_rate": 2.4970047068891743e-05, + "loss": 0.3181, + "step": 19553 + }, + { + "epoch": 25.101412066752246, + "grad_norm": 1.1873266696929932, + "learning_rate": 2.4969619169875908e-05, + "loss": 0.3295, + "step": 19554 + }, + { + "epoch": 25.102695763799744, + "grad_norm": 1.6414523124694824, + "learning_rate": 2.496919127086008e-05, + "loss": 0.3441, + "step": 19555 + }, + { + "epoch": 25.10397946084724, + "grad_norm": 1.0090094804763794, + "learning_rate": 2.4968763371844244e-05, + "loss": 0.3844, + "step": 19556 + }, + { + "epoch": 25.105263157894736, + "grad_norm": 0.9371461272239685, + "learning_rate": 2.4968335472828413e-05, + "loss": 0.3611, + "step": 19557 + }, + { + "epoch": 25.106546854942234, + "grad_norm": 1.392160415649414, + "learning_rate": 2.496790757381258e-05, + "loss": 0.3412, + "step": 19558 + }, + { + "epoch": 
25.107830551989732, + "grad_norm": 1.1518566608428955, + "learning_rate": 2.496747967479675e-05, + "loss": 0.3475, + "step": 19559 + }, + { + "epoch": 25.109114249037226, + "grad_norm": 1.5663031339645386, + "learning_rate": 2.4967051775780915e-05, + "loss": 0.3278, + "step": 19560 + }, + { + "epoch": 25.110397946084724, + "grad_norm": 3.792989492416382, + "learning_rate": 2.4966623876765083e-05, + "loss": 0.3552, + "step": 19561 + }, + { + "epoch": 25.111681643132222, + "grad_norm": 1.8279145956039429, + "learning_rate": 2.496619597774925e-05, + "loss": 0.3542, + "step": 19562 + }, + { + "epoch": 25.112965340179716, + "grad_norm": 2.1433932781219482, + "learning_rate": 2.496576807873342e-05, + "loss": 0.3756, + "step": 19563 + }, + { + "epoch": 25.114249037227214, + "grad_norm": 1.1018650531768799, + "learning_rate": 2.496534017971759e-05, + "loss": 0.3244, + "step": 19564 + }, + { + "epoch": 25.115532734274712, + "grad_norm": 1.5932366847991943, + "learning_rate": 2.4964912280701753e-05, + "loss": 0.3119, + "step": 19565 + }, + { + "epoch": 25.116816431322206, + "grad_norm": 1.7577465772628784, + "learning_rate": 2.4964484381685925e-05, + "loss": 0.3472, + "step": 19566 + }, + { + "epoch": 25.118100128369704, + "grad_norm": 2.123899221420288, + "learning_rate": 2.496405648267009e-05, + "loss": 0.3598, + "step": 19567 + }, + { + "epoch": 25.119383825417202, + "grad_norm": 1.3103693723678589, + "learning_rate": 2.4963628583654255e-05, + "loss": 0.38, + "step": 19568 + }, + { + "epoch": 25.1206675224647, + "grad_norm": 1.4317740201950073, + "learning_rate": 2.4963200684638427e-05, + "loss": 0.34, + "step": 19569 + }, + { + "epoch": 25.121951219512194, + "grad_norm": 1.291837215423584, + "learning_rate": 2.4962772785622592e-05, + "loss": 0.4021, + "step": 19570 + }, + { + "epoch": 25.123234916559692, + "grad_norm": 3.2688727378845215, + "learning_rate": 2.4962344886606764e-05, + "loss": 0.3844, + "step": 19571 + }, + { + "epoch": 25.12451861360719, + "grad_norm": 
2.0995981693267822, + "learning_rate": 2.496191698759093e-05, + "loss": 0.372, + "step": 19572 + }, + { + "epoch": 25.125802310654684, + "grad_norm": 3.0465800762176514, + "learning_rate": 2.4961489088575097e-05, + "loss": 0.4198, + "step": 19573 + }, + { + "epoch": 25.127086007702182, + "grad_norm": 4.437832355499268, + "learning_rate": 2.4961061189559266e-05, + "loss": 0.4448, + "step": 19574 + }, + { + "epoch": 25.12836970474968, + "grad_norm": 1.8190079927444458, + "learning_rate": 2.496063329054343e-05, + "loss": 0.5326, + "step": 19575 + }, + { + "epoch": 25.129653401797174, + "grad_norm": 1.620910882949829, + "learning_rate": 2.49602053915276e-05, + "loss": 0.3389, + "step": 19576 + }, + { + "epoch": 25.130937098844672, + "grad_norm": 1.0648472309112549, + "learning_rate": 2.4959777492511768e-05, + "loss": 0.3328, + "step": 19577 + }, + { + "epoch": 25.13222079589217, + "grad_norm": 1.8406269550323486, + "learning_rate": 2.4959349593495936e-05, + "loss": 0.3491, + "step": 19578 + }, + { + "epoch": 25.133504492939664, + "grad_norm": 1.1060385704040527, + "learning_rate": 2.4958921694480104e-05, + "loss": 0.3621, + "step": 19579 + }, + { + "epoch": 25.134788189987162, + "grad_norm": 0.8299826383590698, + "learning_rate": 2.4958493795464273e-05, + "loss": 0.3386, + "step": 19580 + }, + { + "epoch": 25.13607188703466, + "grad_norm": 1.0272935628890991, + "learning_rate": 2.4958065896448438e-05, + "loss": 0.3429, + "step": 19581 + }, + { + "epoch": 25.137355584082158, + "grad_norm": 2.0899088382720947, + "learning_rate": 2.4957637997432606e-05, + "loss": 0.3721, + "step": 19582 + }, + { + "epoch": 25.138639281129652, + "grad_norm": 0.8664947748184204, + "learning_rate": 2.4957210098416775e-05, + "loss": 0.303, + "step": 19583 + }, + { + "epoch": 25.13992297817715, + "grad_norm": 1.5479896068572998, + "learning_rate": 2.495678219940094e-05, + "loss": 0.3453, + "step": 19584 + }, + { + "epoch": 25.141206675224648, + "grad_norm": 0.9028859734535217, + 
"learning_rate": 2.495635430038511e-05, + "loss": 0.3618, + "step": 19585 + }, + { + "epoch": 25.142490372272142, + "grad_norm": 1.850757360458374, + "learning_rate": 2.4955926401369276e-05, + "loss": 0.352, + "step": 19586 + }, + { + "epoch": 25.14377406931964, + "grad_norm": 0.9937478303909302, + "learning_rate": 2.4955498502353445e-05, + "loss": 0.3774, + "step": 19587 + }, + { + "epoch": 25.145057766367138, + "grad_norm": 0.8840941190719604, + "learning_rate": 2.4955070603337613e-05, + "loss": 0.3167, + "step": 19588 + }, + { + "epoch": 25.146341463414632, + "grad_norm": 1.9830255508422852, + "learning_rate": 2.4954642704321778e-05, + "loss": 0.3502, + "step": 19589 + }, + { + "epoch": 25.14762516046213, + "grad_norm": 0.8472638726234436, + "learning_rate": 2.495421480530595e-05, + "loss": 0.3302, + "step": 19590 + }, + { + "epoch": 25.14890885750963, + "grad_norm": 2.1230380535125732, + "learning_rate": 2.4953786906290115e-05, + "loss": 0.3592, + "step": 19591 + }, + { + "epoch": 25.150192554557126, + "grad_norm": 0.8269778490066528, + "learning_rate": 2.4953359007274284e-05, + "loss": 0.3545, + "step": 19592 + }, + { + "epoch": 25.15147625160462, + "grad_norm": 0.9788861870765686, + "learning_rate": 2.4952931108258452e-05, + "loss": 0.3146, + "step": 19593 + }, + { + "epoch": 25.15275994865212, + "grad_norm": 1.0814149379730225, + "learning_rate": 2.4952503209242617e-05, + "loss": 0.356, + "step": 19594 + }, + { + "epoch": 25.154043645699616, + "grad_norm": 1.3866263628005981, + "learning_rate": 2.495207531022679e-05, + "loss": 0.3365, + "step": 19595 + }, + { + "epoch": 25.15532734274711, + "grad_norm": 0.9690729975700378, + "learning_rate": 2.4951647411210954e-05, + "loss": 0.3481, + "step": 19596 + }, + { + "epoch": 25.15661103979461, + "grad_norm": 2.030031204223633, + "learning_rate": 2.4951219512195122e-05, + "loss": 0.3673, + "step": 19597 + }, + { + "epoch": 25.157894736842106, + "grad_norm": 0.8470256328582764, + "learning_rate": 
2.495079161317929e-05, + "loss": 0.3805, + "step": 19598 + }, + { + "epoch": 25.1591784338896, + "grad_norm": 1.2869502305984497, + "learning_rate": 2.495036371416346e-05, + "loss": 0.3217, + "step": 19599 + }, + { + "epoch": 25.1604621309371, + "grad_norm": 1.1872906684875488, + "learning_rate": 2.4949935815147624e-05, + "loss": 0.3659, + "step": 19600 + }, + { + "epoch": 25.161745827984596, + "grad_norm": 1.0134302377700806, + "learning_rate": 2.4949507916131792e-05, + "loss": 0.3713, + "step": 19601 + }, + { + "epoch": 25.163029525032094, + "grad_norm": 1.109819769859314, + "learning_rate": 2.494908001711596e-05, + "loss": 0.3731, + "step": 19602 + }, + { + "epoch": 25.16431322207959, + "grad_norm": 16.397369384765625, + "learning_rate": 2.494865211810013e-05, + "loss": 0.3455, + "step": 19603 + }, + { + "epoch": 25.165596919127086, + "grad_norm": 1.1850119829177856, + "learning_rate": 2.4948224219084298e-05, + "loss": 0.3435, + "step": 19604 + }, + { + "epoch": 25.166880616174584, + "grad_norm": 1.6562395095825195, + "learning_rate": 2.4947796320068463e-05, + "loss": 0.3582, + "step": 19605 + }, + { + "epoch": 25.16816431322208, + "grad_norm": 1.5069853067398071, + "learning_rate": 2.4947368421052635e-05, + "loss": 0.3612, + "step": 19606 + }, + { + "epoch": 25.169448010269576, + "grad_norm": 3.18229079246521, + "learning_rate": 2.49469405220368e-05, + "loss": 0.3462, + "step": 19607 + }, + { + "epoch": 25.170731707317074, + "grad_norm": 2.337594509124756, + "learning_rate": 2.4946512623020965e-05, + "loss": 0.3538, + "step": 19608 + }, + { + "epoch": 25.17201540436457, + "grad_norm": 1.4482029676437378, + "learning_rate": 2.4946084724005136e-05, + "loss": 0.3516, + "step": 19609 + }, + { + "epoch": 25.173299101412066, + "grad_norm": 1.9419304132461548, + "learning_rate": 2.49456568249893e-05, + "loss": 0.331, + "step": 19610 + }, + { + "epoch": 25.174582798459564, + "grad_norm": 1.6213536262512207, + "learning_rate": 2.4945228925973473e-05, + "loss": 0.3723, + 
"step": 19611 + }, + { + "epoch": 25.17586649550706, + "grad_norm": 1.2988784313201904, + "learning_rate": 2.4944801026957638e-05, + "loss": 0.3339, + "step": 19612 + }, + { + "epoch": 25.177150192554556, + "grad_norm": 7.71073055267334, + "learning_rate": 2.4944373127941807e-05, + "loss": 0.3822, + "step": 19613 + }, + { + "epoch": 25.178433889602054, + "grad_norm": 1.6097482442855835, + "learning_rate": 2.4943945228925975e-05, + "loss": 0.393, + "step": 19614 + }, + { + "epoch": 25.179717586649552, + "grad_norm": 1.6557788848876953, + "learning_rate": 2.494351732991014e-05, + "loss": 0.3688, + "step": 19615 + }, + { + "epoch": 25.181001283697046, + "grad_norm": 1.8023953437805176, + "learning_rate": 2.494308943089431e-05, + "loss": 0.3739, + "step": 19616 + }, + { + "epoch": 25.182284980744544, + "grad_norm": 4.022557258605957, + "learning_rate": 2.4942661531878477e-05, + "loss": 0.3579, + "step": 19617 + }, + { + "epoch": 25.183568677792042, + "grad_norm": 1.3665170669555664, + "learning_rate": 2.4942233632862645e-05, + "loss": 0.3864, + "step": 19618 + }, + { + "epoch": 25.184852374839537, + "grad_norm": 1.4474315643310547, + "learning_rate": 2.4941805733846814e-05, + "loss": 0.365, + "step": 19619 + }, + { + "epoch": 25.186136071887034, + "grad_norm": 1.6648030281066895, + "learning_rate": 2.4941377834830982e-05, + "loss": 0.4099, + "step": 19620 + }, + { + "epoch": 25.187419768934532, + "grad_norm": 1.398577332496643, + "learning_rate": 2.4940949935815147e-05, + "loss": 0.4056, + "step": 19621 + }, + { + "epoch": 25.188703465982027, + "grad_norm": 2.9496593475341797, + "learning_rate": 2.4940522036799316e-05, + "loss": 0.3977, + "step": 19622 + }, + { + "epoch": 25.189987163029524, + "grad_norm": 8.94730281829834, + "learning_rate": 2.4940094137783484e-05, + "loss": 0.4264, + "step": 19623 + }, + { + "epoch": 25.191270860077022, + "grad_norm": 2.1030471324920654, + "learning_rate": 2.493966623876765e-05, + "loss": 0.449, + "step": 19624 + }, + { + "epoch": 
25.19255455712452, + "grad_norm": 2.997907876968384, + "learning_rate": 2.493923833975182e-05, + "loss": 0.5421, + "step": 19625 + }, + { + "epoch": 25.193838254172015, + "grad_norm": 0.9613063335418701, + "learning_rate": 2.4938810440735986e-05, + "loss": 0.3419, + "step": 19626 + }, + { + "epoch": 25.195121951219512, + "grad_norm": 3.316364049911499, + "learning_rate": 2.4938382541720158e-05, + "loss": 0.3239, + "step": 19627 + }, + { + "epoch": 25.19640564826701, + "grad_norm": 2.5502209663391113, + "learning_rate": 2.4937954642704323e-05, + "loss": 0.3464, + "step": 19628 + }, + { + "epoch": 25.197689345314505, + "grad_norm": 2.5874063968658447, + "learning_rate": 2.4937526743688488e-05, + "loss": 0.3589, + "step": 19629 + }, + { + "epoch": 25.198973042362002, + "grad_norm": 1.3874883651733398, + "learning_rate": 2.493709884467266e-05, + "loss": 0.3535, + "step": 19630 + }, + { + "epoch": 25.2002567394095, + "grad_norm": 1.1518656015396118, + "learning_rate": 2.4936670945656824e-05, + "loss": 0.3533, + "step": 19631 + }, + { + "epoch": 25.201540436456995, + "grad_norm": 1.2607107162475586, + "learning_rate": 2.4936243046640993e-05, + "loss": 0.3399, + "step": 19632 + }, + { + "epoch": 25.202824133504492, + "grad_norm": 0.947281002998352, + "learning_rate": 2.493581514762516e-05, + "loss": 0.3258, + "step": 19633 + }, + { + "epoch": 25.20410783055199, + "grad_norm": 1.9612395763397217, + "learning_rate": 2.493538724860933e-05, + "loss": 0.3328, + "step": 19634 + }, + { + "epoch": 25.205391527599488, + "grad_norm": 1.5225865840911865, + "learning_rate": 2.4934959349593498e-05, + "loss": 0.3262, + "step": 19635 + }, + { + "epoch": 25.206675224646983, + "grad_norm": 1.486020565032959, + "learning_rate": 2.4934531450577663e-05, + "loss": 0.3318, + "step": 19636 + }, + { + "epoch": 25.20795892169448, + "grad_norm": 0.924623429775238, + "learning_rate": 2.493410355156183e-05, + "loss": 0.3545, + "step": 19637 + }, + { + "epoch": 25.20924261874198, + "grad_norm": 
1.6880924701690674, + "learning_rate": 2.4933675652546e-05, + "loss": 0.3658, + "step": 19638 + }, + { + "epoch": 25.210526315789473, + "grad_norm": 1.8729134798049927, + "learning_rate": 2.493324775353017e-05, + "loss": 0.3452, + "step": 19639 + }, + { + "epoch": 25.21181001283697, + "grad_norm": 2.428907871246338, + "learning_rate": 2.4932819854514333e-05, + "loss": 0.3274, + "step": 19640 + }, + { + "epoch": 25.21309370988447, + "grad_norm": 1.2759511470794678, + "learning_rate": 2.4932391955498505e-05, + "loss": 0.3685, + "step": 19641 + }, + { + "epoch": 25.214377406931963, + "grad_norm": 1.1504783630371094, + "learning_rate": 2.493196405648267e-05, + "loss": 0.3464, + "step": 19642 + }, + { + "epoch": 25.21566110397946, + "grad_norm": 1.0254271030426025, + "learning_rate": 2.493153615746684e-05, + "loss": 0.3545, + "step": 19643 + }, + { + "epoch": 25.21694480102696, + "grad_norm": 4.915134906768799, + "learning_rate": 2.4931108258451007e-05, + "loss": 0.3356, + "step": 19644 + }, + { + "epoch": 25.218228498074453, + "grad_norm": 1.0889551639556885, + "learning_rate": 2.4930680359435172e-05, + "loss": 0.368, + "step": 19645 + }, + { + "epoch": 25.21951219512195, + "grad_norm": 1.6924742460250854, + "learning_rate": 2.4930252460419344e-05, + "loss": 0.3085, + "step": 19646 + }, + { + "epoch": 25.22079589216945, + "grad_norm": 1.078324794769287, + "learning_rate": 2.492982456140351e-05, + "loss": 0.3461, + "step": 19647 + }, + { + "epoch": 25.222079589216946, + "grad_norm": 1.978713035583496, + "learning_rate": 2.4929396662387677e-05, + "loss": 0.3338, + "step": 19648 + }, + { + "epoch": 25.22336328626444, + "grad_norm": 1.3685306310653687, + "learning_rate": 2.4928968763371846e-05, + "loss": 0.3459, + "step": 19649 + }, + { + "epoch": 25.22464698331194, + "grad_norm": 1.2249021530151367, + "learning_rate": 2.492854086435601e-05, + "loss": 0.3702, + "step": 19650 + }, + { + "epoch": 25.225930680359436, + "grad_norm": 1.3676295280456543, + "learning_rate": 
2.4928112965340183e-05, + "loss": 0.3431, + "step": 19651 + }, + { + "epoch": 25.22721437740693, + "grad_norm": 1.969594955444336, + "learning_rate": 2.4927685066324348e-05, + "loss": 0.3644, + "step": 19652 + }, + { + "epoch": 25.22849807445443, + "grad_norm": 1.1106855869293213, + "learning_rate": 2.4927257167308516e-05, + "loss": 0.3385, + "step": 19653 + }, + { + "epoch": 25.229781771501926, + "grad_norm": 2.1079540252685547, + "learning_rate": 2.4926829268292684e-05, + "loss": 0.3266, + "step": 19654 + }, + { + "epoch": 25.23106546854942, + "grad_norm": 1.534924030303955, + "learning_rate": 2.492640136927685e-05, + "loss": 0.3509, + "step": 19655 + }, + { + "epoch": 25.23234916559692, + "grad_norm": 1.7081561088562012, + "learning_rate": 2.4925973470261018e-05, + "loss": 0.3531, + "step": 19656 + }, + { + "epoch": 25.233632862644416, + "grad_norm": 1.4258495569229126, + "learning_rate": 2.4925545571245186e-05, + "loss": 0.3489, + "step": 19657 + }, + { + "epoch": 25.234916559691914, + "grad_norm": 1.4878239631652832, + "learning_rate": 2.4925117672229355e-05, + "loss": 0.3751, + "step": 19658 + }, + { + "epoch": 25.23620025673941, + "grad_norm": 3.7688798904418945, + "learning_rate": 2.4924689773213523e-05, + "loss": 0.374, + "step": 19659 + }, + { + "epoch": 25.237483953786906, + "grad_norm": 1.9145609140396118, + "learning_rate": 2.492426187419769e-05, + "loss": 0.3307, + "step": 19660 + }, + { + "epoch": 25.238767650834404, + "grad_norm": 1.4606890678405762, + "learning_rate": 2.4923833975181857e-05, + "loss": 0.3387, + "step": 19661 + }, + { + "epoch": 25.2400513478819, + "grad_norm": 1.7392522096633911, + "learning_rate": 2.4923406076166025e-05, + "loss": 0.3878, + "step": 19662 + }, + { + "epoch": 25.241335044929397, + "grad_norm": 2.25160551071167, + "learning_rate": 2.4922978177150193e-05, + "loss": 0.3746, + "step": 19663 + }, + { + "epoch": 25.242618741976894, + "grad_norm": 1.3588976860046387, + "learning_rate": 2.492255027813436e-05, + "loss": 
0.341, + "step": 19664 + }, + { + "epoch": 25.24390243902439, + "grad_norm": 1.4347455501556396, + "learning_rate": 2.492212237911853e-05, + "loss": 0.3426, + "step": 19665 + }, + { + "epoch": 25.245186136071887, + "grad_norm": 1.1509666442871094, + "learning_rate": 2.4921694480102695e-05, + "loss": 0.3577, + "step": 19666 + }, + { + "epoch": 25.246469833119384, + "grad_norm": 2.947694778442383, + "learning_rate": 2.4921266581086867e-05, + "loss": 0.4133, + "step": 19667 + }, + { + "epoch": 25.247753530166882, + "grad_norm": 1.6126853227615356, + "learning_rate": 2.4920838682071032e-05, + "loss": 0.3751, + "step": 19668 + }, + { + "epoch": 25.249037227214377, + "grad_norm": 1.8060063123703003, + "learning_rate": 2.4920410783055197e-05, + "loss": 0.374, + "step": 19669 + }, + { + "epoch": 25.250320924261874, + "grad_norm": 4.418889999389648, + "learning_rate": 2.491998288403937e-05, + "loss": 0.4022, + "step": 19670 + }, + { + "epoch": 25.251604621309372, + "grad_norm": 4.679035186767578, + "learning_rate": 2.4919554985023534e-05, + "loss": 0.3779, + "step": 19671 + }, + { + "epoch": 25.252888318356867, + "grad_norm": 2.788640260696411, + "learning_rate": 2.4919127086007702e-05, + "loss": 0.4388, + "step": 19672 + }, + { + "epoch": 25.254172015404365, + "grad_norm": 1.5387146472930908, + "learning_rate": 2.491869918699187e-05, + "loss": 0.3832, + "step": 19673 + }, + { + "epoch": 25.255455712451862, + "grad_norm": 4.628873825073242, + "learning_rate": 2.491827128797604e-05, + "loss": 0.4858, + "step": 19674 + }, + { + "epoch": 25.256739409499357, + "grad_norm": 8.201194763183594, + "learning_rate": 2.4917843388960208e-05, + "loss": 0.564, + "step": 19675 + }, + { + "epoch": 25.258023106546855, + "grad_norm": 1.0105139017105103, + "learning_rate": 2.4917415489944373e-05, + "loss": 0.3353, + "step": 19676 + }, + { + "epoch": 25.259306803594352, + "grad_norm": 0.9101086854934692, + "learning_rate": 2.491698759092854e-05, + "loss": 0.3326, + "step": 19677 + }, + { + 
"epoch": 25.260590500641847, + "grad_norm": 1.3826278448104858, + "learning_rate": 2.491655969191271e-05, + "loss": 0.3347, + "step": 19678 + }, + { + "epoch": 25.261874197689345, + "grad_norm": 1.3182011842727661, + "learning_rate": 2.4916131792896878e-05, + "loss": 0.3625, + "step": 19679 + }, + { + "epoch": 25.263157894736842, + "grad_norm": 1.552649736404419, + "learning_rate": 2.4915703893881043e-05, + "loss": 0.3463, + "step": 19680 + }, + { + "epoch": 25.26444159178434, + "grad_norm": 2.3890998363494873, + "learning_rate": 2.4915275994865215e-05, + "loss": 0.3171, + "step": 19681 + }, + { + "epoch": 25.265725288831835, + "grad_norm": 1.199999213218689, + "learning_rate": 2.491484809584938e-05, + "loss": 0.3086, + "step": 19682 + }, + { + "epoch": 25.267008985879333, + "grad_norm": 1.7773430347442627, + "learning_rate": 2.4914420196833548e-05, + "loss": 0.3393, + "step": 19683 + }, + { + "epoch": 25.26829268292683, + "grad_norm": 1.828071117401123, + "learning_rate": 2.4913992297817716e-05, + "loss": 0.365, + "step": 19684 + }, + { + "epoch": 25.269576379974325, + "grad_norm": 0.6857482194900513, + "learning_rate": 2.491356439880188e-05, + "loss": 0.3675, + "step": 19685 + }, + { + "epoch": 25.270860077021823, + "grad_norm": 1.176141381263733, + "learning_rate": 2.4913136499786053e-05, + "loss": 0.3386, + "step": 19686 + }, + { + "epoch": 25.27214377406932, + "grad_norm": 1.2838280200958252, + "learning_rate": 2.4912708600770218e-05, + "loss": 0.3271, + "step": 19687 + }, + { + "epoch": 25.273427471116815, + "grad_norm": 0.9329628944396973, + "learning_rate": 2.4912280701754387e-05, + "loss": 0.3431, + "step": 19688 + }, + { + "epoch": 25.274711168164313, + "grad_norm": 1.1277347803115845, + "learning_rate": 2.4911852802738555e-05, + "loss": 0.3393, + "step": 19689 + }, + { + "epoch": 25.27599486521181, + "grad_norm": 1.4504579305648804, + "learning_rate": 2.491142490372272e-05, + "loss": 0.3683, + "step": 19690 + }, + { + "epoch": 25.27727856225931, + 
"grad_norm": 1.0209201574325562, + "learning_rate": 2.4910997004706892e-05, + "loss": 0.321, + "step": 19691 + }, + { + "epoch": 25.278562259306803, + "grad_norm": 1.3427996635437012, + "learning_rate": 2.4910569105691057e-05, + "loss": 0.3559, + "step": 19692 + }, + { + "epoch": 25.2798459563543, + "grad_norm": 1.2582809925079346, + "learning_rate": 2.4910141206675225e-05, + "loss": 0.3651, + "step": 19693 + }, + { + "epoch": 25.2811296534018, + "grad_norm": 1.0546317100524902, + "learning_rate": 2.4909713307659394e-05, + "loss": 0.3422, + "step": 19694 + }, + { + "epoch": 25.282413350449293, + "grad_norm": 0.9942190051078796, + "learning_rate": 2.4909285408643562e-05, + "loss": 0.3165, + "step": 19695 + }, + { + "epoch": 25.28369704749679, + "grad_norm": 1.3299273252487183, + "learning_rate": 2.4908857509627727e-05, + "loss": 0.3062, + "step": 19696 + }, + { + "epoch": 25.28498074454429, + "grad_norm": 1.4548516273498535, + "learning_rate": 2.4908429610611896e-05, + "loss": 0.3568, + "step": 19697 + }, + { + "epoch": 25.286264441591783, + "grad_norm": 0.9823828339576721, + "learning_rate": 2.4908001711596064e-05, + "loss": 0.3648, + "step": 19698 + }, + { + "epoch": 25.28754813863928, + "grad_norm": 2.553774833679199, + "learning_rate": 2.4907573812580232e-05, + "loss": 0.366, + "step": 19699 + }, + { + "epoch": 25.28883183568678, + "grad_norm": 0.8737365007400513, + "learning_rate": 2.49071459135644e-05, + "loss": 0.3466, + "step": 19700 + }, + { + "epoch": 25.290115532734276, + "grad_norm": 1.20404052734375, + "learning_rate": 2.4906718014548566e-05, + "loss": 0.3649, + "step": 19701 + }, + { + "epoch": 25.29139922978177, + "grad_norm": 1.259130597114563, + "learning_rate": 2.4906290115532738e-05, + "loss": 0.325, + "step": 19702 + }, + { + "epoch": 25.29268292682927, + "grad_norm": 1.3927432298660278, + "learning_rate": 2.4905862216516903e-05, + "loss": 0.3235, + "step": 19703 + }, + { + "epoch": 25.293966623876766, + "grad_norm": 1.5667179822921753, + 
"learning_rate": 2.4905434317501068e-05, + "loss": 0.3172, + "step": 19704 + }, + { + "epoch": 25.29525032092426, + "grad_norm": 1.1512902975082397, + "learning_rate": 2.490500641848524e-05, + "loss": 0.375, + "step": 19705 + }, + { + "epoch": 25.29653401797176, + "grad_norm": 1.139281988143921, + "learning_rate": 2.4904578519469405e-05, + "loss": 0.3378, + "step": 19706 + }, + { + "epoch": 25.297817715019256, + "grad_norm": 1.7119743824005127, + "learning_rate": 2.4904150620453576e-05, + "loss": 0.3436, + "step": 19707 + }, + { + "epoch": 25.29910141206675, + "grad_norm": 1.00285804271698, + "learning_rate": 2.490372272143774e-05, + "loss": 0.3219, + "step": 19708 + }, + { + "epoch": 25.30038510911425, + "grad_norm": 1.2408376932144165, + "learning_rate": 2.490329482242191e-05, + "loss": 0.3553, + "step": 19709 + }, + { + "epoch": 25.301668806161747, + "grad_norm": 1.1926299333572388, + "learning_rate": 2.4902866923406078e-05, + "loss": 0.3542, + "step": 19710 + }, + { + "epoch": 25.30295250320924, + "grad_norm": 2.1153969764709473, + "learning_rate": 2.4902439024390243e-05, + "loss": 0.3694, + "step": 19711 + }, + { + "epoch": 25.30423620025674, + "grad_norm": 1.4243955612182617, + "learning_rate": 2.490201112537441e-05, + "loss": 0.3579, + "step": 19712 + }, + { + "epoch": 25.305519897304237, + "grad_norm": 1.2686095237731934, + "learning_rate": 2.490158322635858e-05, + "loss": 0.3694, + "step": 19713 + }, + { + "epoch": 25.306803594351734, + "grad_norm": 1.3797109127044678, + "learning_rate": 2.490115532734275e-05, + "loss": 0.372, + "step": 19714 + }, + { + "epoch": 25.30808729139923, + "grad_norm": 4.423212051391602, + "learning_rate": 2.4900727428326917e-05, + "loss": 0.3571, + "step": 19715 + }, + { + "epoch": 25.309370988446727, + "grad_norm": 2.5307769775390625, + "learning_rate": 2.4900299529311082e-05, + "loss": 0.3455, + "step": 19716 + }, + { + "epoch": 25.310654685494224, + "grad_norm": 1.7851130962371826, + "learning_rate": 2.489987163029525e-05, + 
"loss": 0.4075, + "step": 19717 + }, + { + "epoch": 25.31193838254172, + "grad_norm": 1.7833925485610962, + "learning_rate": 2.489944373127942e-05, + "loss": 0.3774, + "step": 19718 + }, + { + "epoch": 25.313222079589217, + "grad_norm": 1.7091286182403564, + "learning_rate": 2.4899015832263587e-05, + "loss": 0.3766, + "step": 19719 + }, + { + "epoch": 25.314505776636715, + "grad_norm": 1.7492913007736206, + "learning_rate": 2.4898587933247752e-05, + "loss": 0.3674, + "step": 19720 + }, + { + "epoch": 25.31578947368421, + "grad_norm": 2.7567813396453857, + "learning_rate": 2.4898160034231924e-05, + "loss": 0.4094, + "step": 19721 + }, + { + "epoch": 25.317073170731707, + "grad_norm": 1.2454043626785278, + "learning_rate": 2.489773213521609e-05, + "loss": 0.4035, + "step": 19722 + }, + { + "epoch": 25.318356867779205, + "grad_norm": 2.140009880065918, + "learning_rate": 2.4897304236200257e-05, + "loss": 0.4619, + "step": 19723 + }, + { + "epoch": 25.319640564826702, + "grad_norm": 2.2828080654144287, + "learning_rate": 2.4896876337184426e-05, + "loss": 0.4619, + "step": 19724 + }, + { + "epoch": 25.320924261874197, + "grad_norm": 2.7256667613983154, + "learning_rate": 2.489644843816859e-05, + "loss": 0.5563, + "step": 19725 + }, + { + "epoch": 25.322207958921695, + "grad_norm": 1.2352097034454346, + "learning_rate": 2.4896020539152763e-05, + "loss": 0.3271, + "step": 19726 + }, + { + "epoch": 25.323491655969192, + "grad_norm": 1.2493140697479248, + "learning_rate": 2.4895592640136928e-05, + "loss": 0.3338, + "step": 19727 + }, + { + "epoch": 25.324775353016687, + "grad_norm": 1.656424880027771, + "learning_rate": 2.4895164741121096e-05, + "loss": 0.3571, + "step": 19728 + }, + { + "epoch": 25.326059050064185, + "grad_norm": 1.3514138460159302, + "learning_rate": 2.4894736842105264e-05, + "loss": 0.3392, + "step": 19729 + }, + { + "epoch": 25.327342747111683, + "grad_norm": 1.174019455909729, + "learning_rate": 2.489430894308943e-05, + "loss": 0.3308, + "step": 19730 
+ }, + { + "epoch": 25.328626444159177, + "grad_norm": 1.0761101245880127, + "learning_rate": 2.48938810440736e-05, + "loss": 0.3553, + "step": 19731 + }, + { + "epoch": 25.329910141206675, + "grad_norm": 6.094371795654297, + "learning_rate": 2.4893453145057766e-05, + "loss": 0.3344, + "step": 19732 + }, + { + "epoch": 25.331193838254173, + "grad_norm": 1.8409185409545898, + "learning_rate": 2.4893025246041935e-05, + "loss": 0.3385, + "step": 19733 + }, + { + "epoch": 25.33247753530167, + "grad_norm": 3.64056658744812, + "learning_rate": 2.4892597347026103e-05, + "loss": 0.3492, + "step": 19734 + }, + { + "epoch": 25.333761232349165, + "grad_norm": 1.3278653621673584, + "learning_rate": 2.489216944801027e-05, + "loss": 0.3655, + "step": 19735 + }, + { + "epoch": 25.335044929396663, + "grad_norm": 1.4490644931793213, + "learning_rate": 2.4891741548994437e-05, + "loss": 0.3535, + "step": 19736 + }, + { + "epoch": 25.33632862644416, + "grad_norm": 1.3218181133270264, + "learning_rate": 2.4891313649978605e-05, + "loss": 0.3584, + "step": 19737 + }, + { + "epoch": 25.337612323491655, + "grad_norm": 1.3274072408676147, + "learning_rate": 2.4890885750962773e-05, + "loss": 0.3461, + "step": 19738 + }, + { + "epoch": 25.338896020539153, + "grad_norm": 1.7614881992340088, + "learning_rate": 2.4890457851946942e-05, + "loss": 0.3621, + "step": 19739 + }, + { + "epoch": 25.34017971758665, + "grad_norm": 1.532659649848938, + "learning_rate": 2.489002995293111e-05, + "loss": 0.3454, + "step": 19740 + }, + { + "epoch": 25.341463414634145, + "grad_norm": 5.034818172454834, + "learning_rate": 2.4889602053915275e-05, + "loss": 0.345, + "step": 19741 + }, + { + "epoch": 25.342747111681643, + "grad_norm": 9.32753849029541, + "learning_rate": 2.4889174154899447e-05, + "loss": 0.3156, + "step": 19742 + }, + { + "epoch": 25.34403080872914, + "grad_norm": 1.1112291812896729, + "learning_rate": 2.4888746255883612e-05, + "loss": 0.3443, + "step": 19743 + }, + { + "epoch": 25.345314505776635, 
+ "grad_norm": 1.1889809370040894, + "learning_rate": 2.4888318356867777e-05, + "loss": 0.3267, + "step": 19744 + }, + { + "epoch": 25.346598202824133, + "grad_norm": 1.140423059463501, + "learning_rate": 2.488789045785195e-05, + "loss": 0.3331, + "step": 19745 + }, + { + "epoch": 25.34788189987163, + "grad_norm": 0.9691669344902039, + "learning_rate": 2.4887462558836114e-05, + "loss": 0.3287, + "step": 19746 + }, + { + "epoch": 25.34916559691913, + "grad_norm": 1.0959161520004272, + "learning_rate": 2.4887034659820286e-05, + "loss": 0.3587, + "step": 19747 + }, + { + "epoch": 25.350449293966623, + "grad_norm": 1.2477809190750122, + "learning_rate": 2.488660676080445e-05, + "loss": 0.333, + "step": 19748 + }, + { + "epoch": 25.35173299101412, + "grad_norm": 1.497391939163208, + "learning_rate": 2.488617886178862e-05, + "loss": 0.3408, + "step": 19749 + }, + { + "epoch": 25.35301668806162, + "grad_norm": 1.2660958766937256, + "learning_rate": 2.4885750962772788e-05, + "loss": 0.3453, + "step": 19750 + }, + { + "epoch": 25.354300385109113, + "grad_norm": 2.158660888671875, + "learning_rate": 2.4885323063756953e-05, + "loss": 0.3225, + "step": 19751 + }, + { + "epoch": 25.35558408215661, + "grad_norm": 1.020961046218872, + "learning_rate": 2.488489516474112e-05, + "loss": 0.3196, + "step": 19752 + }, + { + "epoch": 25.35686777920411, + "grad_norm": 1.242477536201477, + "learning_rate": 2.488446726572529e-05, + "loss": 0.4152, + "step": 19753 + }, + { + "epoch": 25.358151476251603, + "grad_norm": 2.782165050506592, + "learning_rate": 2.4884039366709458e-05, + "loss": 0.33, + "step": 19754 + }, + { + "epoch": 25.3594351732991, + "grad_norm": 1.3306732177734375, + "learning_rate": 2.4883611467693623e-05, + "loss": 0.4017, + "step": 19755 + }, + { + "epoch": 25.3607188703466, + "grad_norm": 1.2485066652297974, + "learning_rate": 2.4883183568677795e-05, + "loss": 0.351, + "step": 19756 + }, + { + "epoch": 25.362002567394097, + "grad_norm": 1.7208881378173828, + 
"learning_rate": 2.488275566966196e-05, + "loss": 0.3658, + "step": 19757 + }, + { + "epoch": 25.36328626444159, + "grad_norm": 1.2491587400436401, + "learning_rate": 2.4882327770646128e-05, + "loss": 0.3486, + "step": 19758 + }, + { + "epoch": 25.36456996148909, + "grad_norm": 1.2414288520812988, + "learning_rate": 2.4881899871630296e-05, + "loss": 0.4097, + "step": 19759 + }, + { + "epoch": 25.365853658536587, + "grad_norm": 2.1415019035339355, + "learning_rate": 2.488147197261446e-05, + "loss": 0.3355, + "step": 19760 + }, + { + "epoch": 25.36713735558408, + "grad_norm": 1.8876078128814697, + "learning_rate": 2.4881044073598633e-05, + "loss": 0.3459, + "step": 19761 + }, + { + "epoch": 25.36842105263158, + "grad_norm": 1.0363272428512573, + "learning_rate": 2.48806161745828e-05, + "loss": 0.3546, + "step": 19762 + }, + { + "epoch": 25.369704749679077, + "grad_norm": 1.077854871749878, + "learning_rate": 2.4880188275566967e-05, + "loss": 0.3514, + "step": 19763 + }, + { + "epoch": 25.37098844672657, + "grad_norm": 1.140440583229065, + "learning_rate": 2.4879760376551135e-05, + "loss": 0.3537, + "step": 19764 + }, + { + "epoch": 25.37227214377407, + "grad_norm": 4.579370975494385, + "learning_rate": 2.48793324775353e-05, + "loss": 0.3638, + "step": 19765 + }, + { + "epoch": 25.373555840821567, + "grad_norm": 3.6733129024505615, + "learning_rate": 2.4878904578519472e-05, + "loss": 0.3756, + "step": 19766 + }, + { + "epoch": 25.374839537869065, + "grad_norm": 1.6807620525360107, + "learning_rate": 2.4878476679503637e-05, + "loss": 0.3748, + "step": 19767 + }, + { + "epoch": 25.37612323491656, + "grad_norm": 2.375056743621826, + "learning_rate": 2.4878048780487805e-05, + "loss": 0.3843, + "step": 19768 + }, + { + "epoch": 25.377406931964057, + "grad_norm": 1.4868097305297852, + "learning_rate": 2.4877620881471974e-05, + "loss": 0.3664, + "step": 19769 + }, + { + "epoch": 25.378690629011555, + "grad_norm": 2.171642780303955, + "learning_rate": 2.4877192982456142e-05, 
+ "loss": 0.3424, + "step": 19770 + }, + { + "epoch": 25.37997432605905, + "grad_norm": 1.8250696659088135, + "learning_rate": 2.4876765083440307e-05, + "loss": 0.4085, + "step": 19771 + }, + { + "epoch": 25.381258023106547, + "grad_norm": 3.7832577228546143, + "learning_rate": 2.4876337184424476e-05, + "loss": 0.4653, + "step": 19772 + }, + { + "epoch": 25.382541720154045, + "grad_norm": 1.5832910537719727, + "learning_rate": 2.4875909285408644e-05, + "loss": 0.3946, + "step": 19773 + }, + { + "epoch": 25.38382541720154, + "grad_norm": 3.255093812942505, + "learning_rate": 2.4875481386392813e-05, + "loss": 0.4508, + "step": 19774 + }, + { + "epoch": 25.385109114249037, + "grad_norm": 4.826735973358154, + "learning_rate": 2.487505348737698e-05, + "loss": 0.5196, + "step": 19775 + }, + { + "epoch": 25.386392811296535, + "grad_norm": 1.3979135751724243, + "learning_rate": 2.4874625588361146e-05, + "loss": 0.3192, + "step": 19776 + }, + { + "epoch": 25.387676508344033, + "grad_norm": 0.8440456986427307, + "learning_rate": 2.4874197689345314e-05, + "loss": 0.3433, + "step": 19777 + }, + { + "epoch": 25.388960205391527, + "grad_norm": 1.5648256540298462, + "learning_rate": 2.4873769790329483e-05, + "loss": 0.3394, + "step": 19778 + }, + { + "epoch": 25.390243902439025, + "grad_norm": 1.6314008235931396, + "learning_rate": 2.4873341891313648e-05, + "loss": 0.3844, + "step": 19779 + }, + { + "epoch": 25.391527599486523, + "grad_norm": 1.0670710802078247, + "learning_rate": 2.487291399229782e-05, + "loss": 0.3448, + "step": 19780 + }, + { + "epoch": 25.392811296534017, + "grad_norm": 0.9385104179382324, + "learning_rate": 2.4872486093281985e-05, + "loss": 0.3247, + "step": 19781 + }, + { + "epoch": 25.394094993581515, + "grad_norm": 1.5084134340286255, + "learning_rate": 2.4872058194266156e-05, + "loss": 0.331, + "step": 19782 + }, + { + "epoch": 25.395378690629013, + "grad_norm": 1.787121057510376, + "learning_rate": 2.487163029525032e-05, + "loss": 0.3408, + "step": 
19783 + }, + { + "epoch": 25.396662387676507, + "grad_norm": 1.2398223876953125, + "learning_rate": 2.4871202396234486e-05, + "loss": 0.3553, + "step": 19784 + }, + { + "epoch": 25.397946084724005, + "grad_norm": 1.0203789472579956, + "learning_rate": 2.4870774497218658e-05, + "loss": 0.3538, + "step": 19785 + }, + { + "epoch": 25.399229781771503, + "grad_norm": 1.1050622463226318, + "learning_rate": 2.4870346598202823e-05, + "loss": 0.377, + "step": 19786 + }, + { + "epoch": 25.400513478818997, + "grad_norm": 1.5282920598983765, + "learning_rate": 2.4869918699186992e-05, + "loss": 0.3636, + "step": 19787 + }, + { + "epoch": 25.401797175866495, + "grad_norm": 1.6809849739074707, + "learning_rate": 2.486949080017116e-05, + "loss": 0.3195, + "step": 19788 + }, + { + "epoch": 25.403080872913993, + "grad_norm": 0.8751508593559265, + "learning_rate": 2.486906290115533e-05, + "loss": 0.3542, + "step": 19789 + }, + { + "epoch": 25.40436456996149, + "grad_norm": 1.153983473777771, + "learning_rate": 2.4868635002139497e-05, + "loss": 0.3326, + "step": 19790 + }, + { + "epoch": 25.405648267008985, + "grad_norm": 0.9963859915733337, + "learning_rate": 2.4868207103123662e-05, + "loss": 0.3532, + "step": 19791 + }, + { + "epoch": 25.406931964056483, + "grad_norm": 1.176095962524414, + "learning_rate": 2.486777920410783e-05, + "loss": 0.3906, + "step": 19792 + }, + { + "epoch": 25.40821566110398, + "grad_norm": 1.0026062726974487, + "learning_rate": 2.4867351305092e-05, + "loss": 0.3335, + "step": 19793 + }, + { + "epoch": 25.409499358151475, + "grad_norm": 1.5115879774093628, + "learning_rate": 2.4866923406076167e-05, + "loss": 0.3329, + "step": 19794 + }, + { + "epoch": 25.410783055198973, + "grad_norm": 1.264817476272583, + "learning_rate": 2.4866495507060332e-05, + "loss": 0.3489, + "step": 19795 + }, + { + "epoch": 25.41206675224647, + "grad_norm": 0.9801881313323975, + "learning_rate": 2.4866067608044504e-05, + "loss": 0.3507, + "step": 19796 + }, + { + "epoch": 
25.413350449293965, + "grad_norm": 1.088531255722046, + "learning_rate": 2.486563970902867e-05, + "loss": 0.3087, + "step": 19797 + }, + { + "epoch": 25.414634146341463, + "grad_norm": 1.251428246498108, + "learning_rate": 2.4865211810012837e-05, + "loss": 0.3716, + "step": 19798 + }, + { + "epoch": 25.41591784338896, + "grad_norm": 2.6263275146484375, + "learning_rate": 2.4864783910997006e-05, + "loss": 0.3375, + "step": 19799 + }, + { + "epoch": 25.41720154043646, + "grad_norm": 1.069765329360962, + "learning_rate": 2.486435601198117e-05, + "loss": 0.3144, + "step": 19800 + }, + { + "epoch": 25.418485237483953, + "grad_norm": 2.0673346519470215, + "learning_rate": 2.4863928112965343e-05, + "loss": 0.3678, + "step": 19801 + }, + { + "epoch": 25.41976893453145, + "grad_norm": 1.8735193014144897, + "learning_rate": 2.4863500213949508e-05, + "loss": 0.3412, + "step": 19802 + }, + { + "epoch": 25.42105263157895, + "grad_norm": 1.148242473602295, + "learning_rate": 2.4863072314933676e-05, + "loss": 0.3571, + "step": 19803 + }, + { + "epoch": 25.422336328626443, + "grad_norm": 1.3559916019439697, + "learning_rate": 2.4862644415917845e-05, + "loss": 0.345, + "step": 19804 + }, + { + "epoch": 25.42362002567394, + "grad_norm": 0.9929934144020081, + "learning_rate": 2.486221651690201e-05, + "loss": 0.3515, + "step": 19805 + }, + { + "epoch": 25.42490372272144, + "grad_norm": 1.796437382698059, + "learning_rate": 2.486178861788618e-05, + "loss": 0.3162, + "step": 19806 + }, + { + "epoch": 25.426187419768933, + "grad_norm": 1.6518033742904663, + "learning_rate": 2.4861360718870346e-05, + "loss": 0.3237, + "step": 19807 + }, + { + "epoch": 25.42747111681643, + "grad_norm": 1.6385760307312012, + "learning_rate": 2.4860932819854515e-05, + "loss": 0.3768, + "step": 19808 + }, + { + "epoch": 25.42875481386393, + "grad_norm": 1.1453418731689453, + "learning_rate": 2.4860504920838683e-05, + "loss": 0.3775, + "step": 19809 + }, + { + "epoch": 25.430038510911427, + "grad_norm": 
2.596205949783325, + "learning_rate": 2.486007702182285e-05, + "loss": 0.3702, + "step": 19810 + }, + { + "epoch": 25.43132220795892, + "grad_norm": 2.1171419620513916, + "learning_rate": 2.4859649122807017e-05, + "loss": 0.3771, + "step": 19811 + }, + { + "epoch": 25.43260590500642, + "grad_norm": 2.4824044704437256, + "learning_rate": 2.4859221223791185e-05, + "loss": 0.3915, + "step": 19812 + }, + { + "epoch": 25.433889602053917, + "grad_norm": 1.3145796060562134, + "learning_rate": 2.4858793324775353e-05, + "loss": 0.3789, + "step": 19813 + }, + { + "epoch": 25.43517329910141, + "grad_norm": 1.3349621295928955, + "learning_rate": 2.4858365425759522e-05, + "loss": 0.4075, + "step": 19814 + }, + { + "epoch": 25.43645699614891, + "grad_norm": 3.111600637435913, + "learning_rate": 2.485793752674369e-05, + "loss": 0.349, + "step": 19815 + }, + { + "epoch": 25.437740693196407, + "grad_norm": 1.5234016180038452, + "learning_rate": 2.4857509627727855e-05, + "loss": 0.3523, + "step": 19816 + }, + { + "epoch": 25.4390243902439, + "grad_norm": 1.210220456123352, + "learning_rate": 2.4857081728712027e-05, + "loss": 0.3815, + "step": 19817 + }, + { + "epoch": 25.4403080872914, + "grad_norm": 2.532917022705078, + "learning_rate": 2.4856653829696192e-05, + "loss": 0.3794, + "step": 19818 + }, + { + "epoch": 25.441591784338897, + "grad_norm": 2.00418758392334, + "learning_rate": 2.4856225930680357e-05, + "loss": 0.3656, + "step": 19819 + }, + { + "epoch": 25.44287548138639, + "grad_norm": 1.571722149848938, + "learning_rate": 2.485579803166453e-05, + "loss": 0.4053, + "step": 19820 + }, + { + "epoch": 25.44415917843389, + "grad_norm": 4.579723834991455, + "learning_rate": 2.4855370132648694e-05, + "loss": 0.3993, + "step": 19821 + }, + { + "epoch": 25.445442875481387, + "grad_norm": 2.023852825164795, + "learning_rate": 2.4854942233632866e-05, + "loss": 0.4161, + "step": 19822 + }, + { + "epoch": 25.446726572528885, + "grad_norm": 6.377770900726318, + "learning_rate": 
2.485451433461703e-05, + "loss": 0.3922, + "step": 19823 + }, + { + "epoch": 25.44801026957638, + "grad_norm": 3.825732946395874, + "learning_rate": 2.48540864356012e-05, + "loss": 0.4831, + "step": 19824 + }, + { + "epoch": 25.449293966623877, + "grad_norm": 3.3649775981903076, + "learning_rate": 2.4853658536585368e-05, + "loss": 0.5788, + "step": 19825 + }, + { + "epoch": 25.450577663671375, + "grad_norm": 1.1384776830673218, + "learning_rate": 2.4853230637569533e-05, + "loss": 0.3232, + "step": 19826 + }, + { + "epoch": 25.45186136071887, + "grad_norm": 0.8691200613975525, + "learning_rate": 2.48528027385537e-05, + "loss": 0.3136, + "step": 19827 + }, + { + "epoch": 25.453145057766367, + "grad_norm": 2.472712755203247, + "learning_rate": 2.485237483953787e-05, + "loss": 0.3178, + "step": 19828 + }, + { + "epoch": 25.454428754813865, + "grad_norm": 0.9634220600128174, + "learning_rate": 2.4851946940522038e-05, + "loss": 0.3413, + "step": 19829 + }, + { + "epoch": 25.45571245186136, + "grad_norm": 0.8663222193717957, + "learning_rate": 2.4851519041506206e-05, + "loss": 0.3432, + "step": 19830 + }, + { + "epoch": 25.456996148908857, + "grad_norm": 1.3590569496154785, + "learning_rate": 2.4851091142490375e-05, + "loss": 0.3505, + "step": 19831 + }, + { + "epoch": 25.458279845956355, + "grad_norm": 1.0523552894592285, + "learning_rate": 2.485066324347454e-05, + "loss": 0.354, + "step": 19832 + }, + { + "epoch": 25.459563543003853, + "grad_norm": 1.3568081855773926, + "learning_rate": 2.4850235344458708e-05, + "loss": 0.3519, + "step": 19833 + }, + { + "epoch": 25.460847240051347, + "grad_norm": 3.377800226211548, + "learning_rate": 2.4849807445442877e-05, + "loss": 0.3602, + "step": 19834 + }, + { + "epoch": 25.462130937098845, + "grad_norm": 4.817241668701172, + "learning_rate": 2.484937954642704e-05, + "loss": 0.3649, + "step": 19835 + }, + { + "epoch": 25.463414634146343, + "grad_norm": 0.9868814945220947, + "learning_rate": 2.4848951647411213e-05, + "loss": 
0.3501, + "step": 19836 + }, + { + "epoch": 25.464698331193837, + "grad_norm": 0.9512608051300049, + "learning_rate": 2.484852374839538e-05, + "loss": 0.3725, + "step": 19837 + }, + { + "epoch": 25.465982028241335, + "grad_norm": 2.27527117729187, + "learning_rate": 2.4848095849379547e-05, + "loss": 0.3358, + "step": 19838 + }, + { + "epoch": 25.467265725288833, + "grad_norm": 1.6026010513305664, + "learning_rate": 2.4847667950363715e-05, + "loss": 0.3863, + "step": 19839 + }, + { + "epoch": 25.468549422336327, + "grad_norm": 1.6606078147888184, + "learning_rate": 2.484724005134788e-05, + "loss": 0.3348, + "step": 19840 + }, + { + "epoch": 25.469833119383825, + "grad_norm": 1.483871340751648, + "learning_rate": 2.4846812152332052e-05, + "loss": 0.3266, + "step": 19841 + }, + { + "epoch": 25.471116816431323, + "grad_norm": 0.9993570446968079, + "learning_rate": 2.4846384253316217e-05, + "loss": 0.2702, + "step": 19842 + }, + { + "epoch": 25.47240051347882, + "grad_norm": 1.9536272287368774, + "learning_rate": 2.4845956354300385e-05, + "loss": 0.3357, + "step": 19843 + }, + { + "epoch": 25.473684210526315, + "grad_norm": 1.3259029388427734, + "learning_rate": 2.4845528455284554e-05, + "loss": 0.3522, + "step": 19844 + }, + { + "epoch": 25.474967907573813, + "grad_norm": 1.2810252904891968, + "learning_rate": 2.484510055626872e-05, + "loss": 0.3582, + "step": 19845 + }, + { + "epoch": 25.47625160462131, + "grad_norm": 2.3411777019500732, + "learning_rate": 2.484467265725289e-05, + "loss": 0.3471, + "step": 19846 + }, + { + "epoch": 25.477535301668805, + "grad_norm": 1.1833837032318115, + "learning_rate": 2.4844244758237056e-05, + "loss": 0.3584, + "step": 19847 + }, + { + "epoch": 25.478818998716303, + "grad_norm": 1.0218545198440552, + "learning_rate": 2.4843816859221224e-05, + "loss": 0.3202, + "step": 19848 + }, + { + "epoch": 25.4801026957638, + "grad_norm": 1.8478713035583496, + "learning_rate": 2.4843388960205393e-05, + "loss": 0.3793, + "step": 19849 + }, + { + 
"epoch": 25.481386392811295, + "grad_norm": 1.3101272583007812, + "learning_rate": 2.484296106118956e-05, + "loss": 0.3367, + "step": 19850 + }, + { + "epoch": 25.482670089858793, + "grad_norm": 0.9676716923713684, + "learning_rate": 2.4842533162173726e-05, + "loss": 0.3157, + "step": 19851 + }, + { + "epoch": 25.48395378690629, + "grad_norm": 1.1775033473968506, + "learning_rate": 2.4842105263157894e-05, + "loss": 0.3502, + "step": 19852 + }, + { + "epoch": 25.485237483953785, + "grad_norm": 1.083792805671692, + "learning_rate": 2.4841677364142063e-05, + "loss": 0.3273, + "step": 19853 + }, + { + "epoch": 25.486521181001283, + "grad_norm": 2.6746859550476074, + "learning_rate": 2.484124946512623e-05, + "loss": 0.36, + "step": 19854 + }, + { + "epoch": 25.48780487804878, + "grad_norm": 4.194845199584961, + "learning_rate": 2.48408215661104e-05, + "loss": 0.393, + "step": 19855 + }, + { + "epoch": 25.48908857509628, + "grad_norm": 1.9968817234039307, + "learning_rate": 2.4840393667094565e-05, + "loss": 0.3529, + "step": 19856 + }, + { + "epoch": 25.490372272143773, + "grad_norm": 1.9503381252288818, + "learning_rate": 2.4839965768078736e-05, + "loss": 0.3618, + "step": 19857 + }, + { + "epoch": 25.49165596919127, + "grad_norm": 2.8847768306732178, + "learning_rate": 2.48395378690629e-05, + "loss": 0.3369, + "step": 19858 + }, + { + "epoch": 25.49293966623877, + "grad_norm": 1.576572299003601, + "learning_rate": 2.4839109970047067e-05, + "loss": 0.3749, + "step": 19859 + }, + { + "epoch": 25.494223363286263, + "grad_norm": 2.4809329509735107, + "learning_rate": 2.483868207103124e-05, + "loss": 0.3839, + "step": 19860 + }, + { + "epoch": 25.49550706033376, + "grad_norm": 1.5421204566955566, + "learning_rate": 2.4838254172015403e-05, + "loss": 0.3653, + "step": 19861 + }, + { + "epoch": 25.49679075738126, + "grad_norm": 1.191011667251587, + "learning_rate": 2.4837826272999575e-05, + "loss": 0.3933, + "step": 19862 + }, + { + "epoch": 25.498074454428753, + "grad_norm": 
1.3213053941726685, + "learning_rate": 2.483739837398374e-05, + "loss": 0.3709, + "step": 19863 + }, + { + "epoch": 25.49935815147625, + "grad_norm": 2.5206055641174316, + "learning_rate": 2.483697047496791e-05, + "loss": 0.4363, + "step": 19864 + }, + { + "epoch": 25.50064184852375, + "grad_norm": 1.5030173063278198, + "learning_rate": 2.4836542575952077e-05, + "loss": 0.3711, + "step": 19865 + }, + { + "epoch": 25.501925545571247, + "grad_norm": 1.0808244943618774, + "learning_rate": 2.4836114676936242e-05, + "loss": 0.3992, + "step": 19866 + }, + { + "epoch": 25.50320924261874, + "grad_norm": 1.8674331903457642, + "learning_rate": 2.483568677792041e-05, + "loss": 0.3598, + "step": 19867 + }, + { + "epoch": 25.50449293966624, + "grad_norm": 1.2123463153839111, + "learning_rate": 2.483525887890458e-05, + "loss": 0.3807, + "step": 19868 + }, + { + "epoch": 25.505776636713737, + "grad_norm": 4.397462368011475, + "learning_rate": 2.4834830979888747e-05, + "loss": 0.4009, + "step": 19869 + }, + { + "epoch": 25.50706033376123, + "grad_norm": 1.2207083702087402, + "learning_rate": 2.4834403080872916e-05, + "loss": 0.421, + "step": 19870 + }, + { + "epoch": 25.50834403080873, + "grad_norm": 1.2112185955047607, + "learning_rate": 2.4833975181857084e-05, + "loss": 0.4034, + "step": 19871 + }, + { + "epoch": 25.509627727856227, + "grad_norm": 1.6544183492660522, + "learning_rate": 2.483354728284125e-05, + "loss": 0.4208, + "step": 19872 + }, + { + "epoch": 25.51091142490372, + "grad_norm": 3.02824068069458, + "learning_rate": 2.4833119383825418e-05, + "loss": 0.4725, + "step": 19873 + }, + { + "epoch": 25.51219512195122, + "grad_norm": 2.1293070316314697, + "learning_rate": 2.4832691484809586e-05, + "loss": 0.4814, + "step": 19874 + }, + { + "epoch": 25.513478818998717, + "grad_norm": 4.437043190002441, + "learning_rate": 2.483226358579375e-05, + "loss": 0.5358, + "step": 19875 + }, + { + "epoch": 25.514762516046215, + "grad_norm": 1.926580548286438, + "learning_rate": 
2.4831835686777923e-05, + "loss": 0.3499, + "step": 19876 + }, + { + "epoch": 25.51604621309371, + "grad_norm": 1.4288347959518433, + "learning_rate": 2.4831407787762088e-05, + "loss": 0.3424, + "step": 19877 + }, + { + "epoch": 25.517329910141207, + "grad_norm": 1.2981880903244019, + "learning_rate": 2.483097988874626e-05, + "loss": 0.3523, + "step": 19878 + }, + { + "epoch": 25.518613607188705, + "grad_norm": 0.8215571641921997, + "learning_rate": 2.4830551989730425e-05, + "loss": 0.3415, + "step": 19879 + }, + { + "epoch": 25.5198973042362, + "grad_norm": 0.9646615386009216, + "learning_rate": 2.483012409071459e-05, + "loss": 0.3422, + "step": 19880 + }, + { + "epoch": 25.521181001283697, + "grad_norm": 1.3024593591690063, + "learning_rate": 2.482969619169876e-05, + "loss": 0.321, + "step": 19881 + }, + { + "epoch": 25.522464698331195, + "grad_norm": 1.1598703861236572, + "learning_rate": 2.4829268292682926e-05, + "loss": 0.3233, + "step": 19882 + }, + { + "epoch": 25.52374839537869, + "grad_norm": 1.0377998352050781, + "learning_rate": 2.4828840393667095e-05, + "loss": 0.3398, + "step": 19883 + }, + { + "epoch": 25.525032092426187, + "grad_norm": 1.456946611404419, + "learning_rate": 2.4828412494651263e-05, + "loss": 0.3386, + "step": 19884 + }, + { + "epoch": 25.526315789473685, + "grad_norm": 0.8750092387199402, + "learning_rate": 2.482798459563543e-05, + "loss": 0.3631, + "step": 19885 + }, + { + "epoch": 25.527599486521183, + "grad_norm": 0.8183486461639404, + "learning_rate": 2.48275566966196e-05, + "loss": 0.2858, + "step": 19886 + }, + { + "epoch": 25.528883183568677, + "grad_norm": 3.688779830932617, + "learning_rate": 2.4827128797603765e-05, + "loss": 0.3493, + "step": 19887 + }, + { + "epoch": 25.530166880616175, + "grad_norm": 2.0556936264038086, + "learning_rate": 2.4826700898587934e-05, + "loss": 0.3542, + "step": 19888 + }, + { + "epoch": 25.531450577663673, + "grad_norm": 1.902559757232666, + "learning_rate": 2.4826272999572102e-05, + "loss": 
0.3468, + "step": 19889 + }, + { + "epoch": 25.532734274711167, + "grad_norm": 1.4951732158660889, + "learning_rate": 2.482584510055627e-05, + "loss": 0.3464, + "step": 19890 + }, + { + "epoch": 25.534017971758665, + "grad_norm": 1.5847545862197876, + "learning_rate": 2.4825417201540435e-05, + "loss": 0.3602, + "step": 19891 + }, + { + "epoch": 25.535301668806163, + "grad_norm": 0.9734326601028442, + "learning_rate": 2.4824989302524607e-05, + "loss": 0.3366, + "step": 19892 + }, + { + "epoch": 25.536585365853657, + "grad_norm": 0.8773960471153259, + "learning_rate": 2.4824561403508772e-05, + "loss": 0.3418, + "step": 19893 + }, + { + "epoch": 25.537869062901155, + "grad_norm": 1.6062679290771484, + "learning_rate": 2.482413350449294e-05, + "loss": 0.3693, + "step": 19894 + }, + { + "epoch": 25.539152759948653, + "grad_norm": 1.1968618631362915, + "learning_rate": 2.482370560547711e-05, + "loss": 0.3208, + "step": 19895 + }, + { + "epoch": 25.540436456996147, + "grad_norm": 1.4080660343170166, + "learning_rate": 2.4823277706461274e-05, + "loss": 0.3979, + "step": 19896 + }, + { + "epoch": 25.541720154043645, + "grad_norm": 1.6127586364746094, + "learning_rate": 2.4822849807445446e-05, + "loss": 0.3424, + "step": 19897 + }, + { + "epoch": 25.543003851091143, + "grad_norm": 2.5147225856781006, + "learning_rate": 2.482242190842961e-05, + "loss": 0.3231, + "step": 19898 + }, + { + "epoch": 25.54428754813864, + "grad_norm": 1.1040011644363403, + "learning_rate": 2.482199400941378e-05, + "loss": 0.3357, + "step": 19899 + }, + { + "epoch": 25.545571245186135, + "grad_norm": 1.1844515800476074, + "learning_rate": 2.4821566110397948e-05, + "loss": 0.3465, + "step": 19900 + }, + { + "epoch": 25.546854942233633, + "grad_norm": 2.0188891887664795, + "learning_rate": 2.4821138211382113e-05, + "loss": 0.3601, + "step": 19901 + }, + { + "epoch": 25.54813863928113, + "grad_norm": 1.3842990398406982, + "learning_rate": 2.4820710312366285e-05, + "loss": 0.3228, + "step": 19902 + }, + 
{ + "epoch": 25.549422336328625, + "grad_norm": 2.275472402572632, + "learning_rate": 2.482028241335045e-05, + "loss": 0.3374, + "step": 19903 + }, + { + "epoch": 25.550706033376123, + "grad_norm": 1.0543055534362793, + "learning_rate": 2.4819854514334618e-05, + "loss": 0.3402, + "step": 19904 + }, + { + "epoch": 25.55198973042362, + "grad_norm": 2.1846201419830322, + "learning_rate": 2.4819426615318786e-05, + "loss": 0.3693, + "step": 19905 + }, + { + "epoch": 25.553273427471115, + "grad_norm": 2.0953097343444824, + "learning_rate": 2.481899871630295e-05, + "loss": 0.3288, + "step": 19906 + }, + { + "epoch": 25.554557124518613, + "grad_norm": 2.51300311088562, + "learning_rate": 2.481857081728712e-05, + "loss": 0.3584, + "step": 19907 + }, + { + "epoch": 25.55584082156611, + "grad_norm": 1.4762814044952393, + "learning_rate": 2.4818142918271288e-05, + "loss": 0.3782, + "step": 19908 + }, + { + "epoch": 25.55712451861361, + "grad_norm": 2.3978421688079834, + "learning_rate": 2.4817715019255457e-05, + "loss": 0.3598, + "step": 19909 + }, + { + "epoch": 25.558408215661103, + "grad_norm": 2.2340452671051025, + "learning_rate": 2.4817287120239625e-05, + "loss": 0.3607, + "step": 19910 + }, + { + "epoch": 25.5596919127086, + "grad_norm": 1.4717779159545898, + "learning_rate": 2.4816859221223793e-05, + "loss": 0.3357, + "step": 19911 + }, + { + "epoch": 25.5609756097561, + "grad_norm": 1.1817455291748047, + "learning_rate": 2.481643132220796e-05, + "loss": 0.3655, + "step": 19912 + }, + { + "epoch": 25.562259306803593, + "grad_norm": 2.1792163848876953, + "learning_rate": 2.4816003423192127e-05, + "loss": 0.3815, + "step": 19913 + }, + { + "epoch": 25.56354300385109, + "grad_norm": 1.4382374286651611, + "learning_rate": 2.4815575524176295e-05, + "loss": 0.402, + "step": 19914 + }, + { + "epoch": 25.56482670089859, + "grad_norm": 1.2505056858062744, + "learning_rate": 2.481514762516046e-05, + "loss": 0.3576, + "step": 19915 + }, + { + "epoch": 25.566110397946083, + 
"grad_norm": 1.4935375452041626, + "learning_rate": 2.4814719726144632e-05, + "loss": 0.3464, + "step": 19916 + }, + { + "epoch": 25.56739409499358, + "grad_norm": 1.3815290927886963, + "learning_rate": 2.4814291827128797e-05, + "loss": 0.3959, + "step": 19917 + }, + { + "epoch": 25.56867779204108, + "grad_norm": 1.8121997117996216, + "learning_rate": 2.481386392811297e-05, + "loss": 0.3923, + "step": 19918 + }, + { + "epoch": 25.569961489088577, + "grad_norm": 2.202662229537964, + "learning_rate": 2.4813436029097134e-05, + "loss": 0.3709, + "step": 19919 + }, + { + "epoch": 25.57124518613607, + "grad_norm": 1.5885106325149536, + "learning_rate": 2.48130081300813e-05, + "loss": 0.422, + "step": 19920 + }, + { + "epoch": 25.57252888318357, + "grad_norm": 2.4182050228118896, + "learning_rate": 2.481258023106547e-05, + "loss": 0.3979, + "step": 19921 + }, + { + "epoch": 25.573812580231067, + "grad_norm": 2.0322911739349365, + "learning_rate": 2.4812152332049636e-05, + "loss": 0.4124, + "step": 19922 + }, + { + "epoch": 25.57509627727856, + "grad_norm": 2.897942543029785, + "learning_rate": 2.4811724433033804e-05, + "loss": 0.3984, + "step": 19923 + }, + { + "epoch": 25.57637997432606, + "grad_norm": 2.906074285507202, + "learning_rate": 2.4811296534017973e-05, + "loss": 0.4271, + "step": 19924 + }, + { + "epoch": 25.577663671373557, + "grad_norm": 4.426950931549072, + "learning_rate": 2.481086863500214e-05, + "loss": 0.5644, + "step": 19925 + }, + { + "epoch": 25.57894736842105, + "grad_norm": 1.2494840621948242, + "learning_rate": 2.481044073598631e-05, + "loss": 0.3131, + "step": 19926 + }, + { + "epoch": 25.58023106546855, + "grad_norm": 1.0186132192611694, + "learning_rate": 2.4810012836970474e-05, + "loss": 0.3291, + "step": 19927 + }, + { + "epoch": 25.581514762516047, + "grad_norm": 0.9970641136169434, + "learning_rate": 2.4809584937954643e-05, + "loss": 0.3373, + "step": 19928 + }, + { + "epoch": 25.58279845956354, + "grad_norm": 1.0748326778411865, + 
"learning_rate": 2.480915703893881e-05, + "loss": 0.3676, + "step": 19929 + }, + { + "epoch": 25.58408215661104, + "grad_norm": 1.3685176372528076, + "learning_rate": 2.480872913992298e-05, + "loss": 0.3424, + "step": 19930 + }, + { + "epoch": 25.585365853658537, + "grad_norm": 2.025991439819336, + "learning_rate": 2.4808301240907145e-05, + "loss": 0.3396, + "step": 19931 + }, + { + "epoch": 25.586649550706035, + "grad_norm": 1.325562596321106, + "learning_rate": 2.4807873341891317e-05, + "loss": 0.3773, + "step": 19932 + }, + { + "epoch": 25.58793324775353, + "grad_norm": 2.1182751655578613, + "learning_rate": 2.480744544287548e-05, + "loss": 0.3281, + "step": 19933 + }, + { + "epoch": 25.589216944801027, + "grad_norm": 1.0117439031600952, + "learning_rate": 2.480701754385965e-05, + "loss": 0.3463, + "step": 19934 + }, + { + "epoch": 25.590500641848525, + "grad_norm": 1.2701209783554077, + "learning_rate": 2.480658964484382e-05, + "loss": 0.3522, + "step": 19935 + }, + { + "epoch": 25.59178433889602, + "grad_norm": 2.226789712905884, + "learning_rate": 2.4806161745827983e-05, + "loss": 0.3347, + "step": 19936 + }, + { + "epoch": 25.593068035943517, + "grad_norm": 1.334750771522522, + "learning_rate": 2.4805733846812155e-05, + "loss": 0.3451, + "step": 19937 + }, + { + "epoch": 25.594351732991015, + "grad_norm": 1.1320852041244507, + "learning_rate": 2.480530594779632e-05, + "loss": 0.3263, + "step": 19938 + }, + { + "epoch": 25.59563543003851, + "grad_norm": 2.097853899002075, + "learning_rate": 2.480487804878049e-05, + "loss": 0.3602, + "step": 19939 + }, + { + "epoch": 25.596919127086007, + "grad_norm": 1.482618808746338, + "learning_rate": 2.4804450149764657e-05, + "loss": 0.3401, + "step": 19940 + }, + { + "epoch": 25.598202824133505, + "grad_norm": 1.6204712390899658, + "learning_rate": 2.4804022250748822e-05, + "loss": 0.3592, + "step": 19941 + }, + { + "epoch": 25.599486521181003, + "grad_norm": 1.0573650598526, + "learning_rate": 2.4803594351732994e-05, + 
"loss": 0.3686, + "step": 19942 + }, + { + "epoch": 25.600770218228497, + "grad_norm": 2.985786199569702, + "learning_rate": 2.480316645271716e-05, + "loss": 0.3211, + "step": 19943 + }, + { + "epoch": 25.602053915275995, + "grad_norm": 1.174121379852295, + "learning_rate": 2.4802738553701327e-05, + "loss": 0.3522, + "step": 19944 + }, + { + "epoch": 25.603337612323493, + "grad_norm": 1.4148585796356201, + "learning_rate": 2.4802310654685496e-05, + "loss": 0.3453, + "step": 19945 + }, + { + "epoch": 25.604621309370987, + "grad_norm": 1.6073710918426514, + "learning_rate": 2.4801882755669664e-05, + "loss": 0.3573, + "step": 19946 + }, + { + "epoch": 25.605905006418485, + "grad_norm": 0.8654749393463135, + "learning_rate": 2.480145485665383e-05, + "loss": 0.3204, + "step": 19947 + }, + { + "epoch": 25.607188703465983, + "grad_norm": 1.8573743104934692, + "learning_rate": 2.4801026957637998e-05, + "loss": 0.3612, + "step": 19948 + }, + { + "epoch": 25.608472400513477, + "grad_norm": 1.2013516426086426, + "learning_rate": 2.4800599058622166e-05, + "loss": 0.3386, + "step": 19949 + }, + { + "epoch": 25.609756097560975, + "grad_norm": 2.1526055335998535, + "learning_rate": 2.4800171159606334e-05, + "loss": 0.3292, + "step": 19950 + }, + { + "epoch": 25.611039794608473, + "grad_norm": 1.6499671936035156, + "learning_rate": 2.4799743260590503e-05, + "loss": 0.3519, + "step": 19951 + }, + { + "epoch": 25.61232349165597, + "grad_norm": 1.0026510953903198, + "learning_rate": 2.4799315361574668e-05, + "loss": 0.3198, + "step": 19952 + }, + { + "epoch": 25.613607188703465, + "grad_norm": 0.88084876537323, + "learning_rate": 2.479888746255884e-05, + "loss": 0.3381, + "step": 19953 + }, + { + "epoch": 25.614890885750963, + "grad_norm": 1.6067672967910767, + "learning_rate": 2.4798459563543005e-05, + "loss": 0.3254, + "step": 19954 + }, + { + "epoch": 25.61617458279846, + "grad_norm": 1.0379868745803833, + "learning_rate": 2.479803166452717e-05, + "loss": 0.3362, + "step": 19955 + 
}, + { + "epoch": 25.617458279845955, + "grad_norm": 1.3287127017974854, + "learning_rate": 2.479760376551134e-05, + "loss": 0.3476, + "step": 19956 + }, + { + "epoch": 25.618741976893453, + "grad_norm": 1.2583119869232178, + "learning_rate": 2.4797175866495506e-05, + "loss": 0.3835, + "step": 19957 + }, + { + "epoch": 25.62002567394095, + "grad_norm": 1.024986743927002, + "learning_rate": 2.4796747967479675e-05, + "loss": 0.3503, + "step": 19958 + }, + { + "epoch": 25.621309370988445, + "grad_norm": 1.72733736038208, + "learning_rate": 2.4796320068463843e-05, + "loss": 0.3605, + "step": 19959 + }, + { + "epoch": 25.622593068035943, + "grad_norm": 1.4108140468597412, + "learning_rate": 2.4795892169448012e-05, + "loss": 0.3502, + "step": 19960 + }, + { + "epoch": 25.62387676508344, + "grad_norm": 1.5432085990905762, + "learning_rate": 2.479546427043218e-05, + "loss": 0.3367, + "step": 19961 + }, + { + "epoch": 25.625160462130935, + "grad_norm": 1.5308159589767456, + "learning_rate": 2.4795036371416345e-05, + "loss": 0.3727, + "step": 19962 + }, + { + "epoch": 25.626444159178433, + "grad_norm": 1.065064549446106, + "learning_rate": 2.4794608472400514e-05, + "loss": 0.3192, + "step": 19963 + }, + { + "epoch": 25.62772785622593, + "grad_norm": 2.507354974746704, + "learning_rate": 2.4794180573384682e-05, + "loss": 0.3612, + "step": 19964 + }, + { + "epoch": 25.62901155327343, + "grad_norm": 1.3107413053512573, + "learning_rate": 2.479375267436885e-05, + "loss": 0.386, + "step": 19965 + }, + { + "epoch": 25.630295250320923, + "grad_norm": 1.1791936159133911, + "learning_rate": 2.4793324775353015e-05, + "loss": 0.3652, + "step": 19966 + }, + { + "epoch": 25.63157894736842, + "grad_norm": 4.39796257019043, + "learning_rate": 2.4792896876337184e-05, + "loss": 0.409, + "step": 19967 + }, + { + "epoch": 25.63286264441592, + "grad_norm": 1.145098328590393, + "learning_rate": 2.4792468977321352e-05, + "loss": 0.3988, + "step": 19968 + }, + { + "epoch": 25.634146341463413, + 
"grad_norm": 1.513686180114746, + "learning_rate": 2.479204107830552e-05, + "loss": 0.3736, + "step": 19969 + }, + { + "epoch": 25.63543003851091, + "grad_norm": 1.607159972190857, + "learning_rate": 2.479161317928969e-05, + "loss": 0.3851, + "step": 19970 + }, + { + "epoch": 25.63671373555841, + "grad_norm": 1.0046323537826538, + "learning_rate": 2.4791185280273854e-05, + "loss": 0.3944, + "step": 19971 + }, + { + "epoch": 25.637997432605903, + "grad_norm": 3.5917718410491943, + "learning_rate": 2.4790757381258026e-05, + "loss": 0.398, + "step": 19972 + }, + { + "epoch": 25.6392811296534, + "grad_norm": 1.5399214029312134, + "learning_rate": 2.479032948224219e-05, + "loss": 0.3963, + "step": 19973 + }, + { + "epoch": 25.6405648267009, + "grad_norm": 1.9753475189208984, + "learning_rate": 2.4789901583226356e-05, + "loss": 0.4359, + "step": 19974 + }, + { + "epoch": 25.641848523748397, + "grad_norm": 2.6792914867401123, + "learning_rate": 2.4789473684210528e-05, + "loss": 0.5308, + "step": 19975 + }, + { + "epoch": 25.64313222079589, + "grad_norm": 1.0546343326568604, + "learning_rate": 2.4789045785194693e-05, + "loss": 0.3164, + "step": 19976 + }, + { + "epoch": 25.64441591784339, + "grad_norm": 1.1650041341781616, + "learning_rate": 2.4788617886178865e-05, + "loss": 0.327, + "step": 19977 + }, + { + "epoch": 25.645699614890887, + "grad_norm": 3.3251681327819824, + "learning_rate": 2.478818998716303e-05, + "loss": 0.3282, + "step": 19978 + }, + { + "epoch": 25.64698331193838, + "grad_norm": 2.0327563285827637, + "learning_rate": 2.4787762088147198e-05, + "loss": 0.3397, + "step": 19979 + }, + { + "epoch": 25.64826700898588, + "grad_norm": 1.3010554313659668, + "learning_rate": 2.4787334189131366e-05, + "loss": 0.3489, + "step": 19980 + }, + { + "epoch": 25.649550706033377, + "grad_norm": 3.9389071464538574, + "learning_rate": 2.478690629011553e-05, + "loss": 0.3265, + "step": 19981 + }, + { + "epoch": 25.65083440308087, + "grad_norm": 1.4292216300964355, + 
"learning_rate": 2.47864783910997e-05, + "loss": 0.3805, + "step": 19982 + }, + { + "epoch": 25.65211810012837, + "grad_norm": 2.802269220352173, + "learning_rate": 2.4786050492083868e-05, + "loss": 0.3841, + "step": 19983 + }, + { + "epoch": 25.653401797175867, + "grad_norm": 1.7249614000320435, + "learning_rate": 2.4785622593068037e-05, + "loss": 0.3356, + "step": 19984 + }, + { + "epoch": 25.654685494223365, + "grad_norm": 2.178598165512085, + "learning_rate": 2.4785194694052205e-05, + "loss": 0.3291, + "step": 19985 + }, + { + "epoch": 25.65596919127086, + "grad_norm": 1.129634976387024, + "learning_rate": 2.4784766795036373e-05, + "loss": 0.3365, + "step": 19986 + }, + { + "epoch": 25.657252888318357, + "grad_norm": 2.1199662685394287, + "learning_rate": 2.478433889602054e-05, + "loss": 0.3512, + "step": 19987 + }, + { + "epoch": 25.658536585365855, + "grad_norm": 4.366046905517578, + "learning_rate": 2.4783910997004707e-05, + "loss": 0.3358, + "step": 19988 + }, + { + "epoch": 25.65982028241335, + "grad_norm": 1.0257598161697388, + "learning_rate": 2.4783483097988875e-05, + "loss": 0.3497, + "step": 19989 + }, + { + "epoch": 25.661103979460847, + "grad_norm": 2.189060926437378, + "learning_rate": 2.478305519897304e-05, + "loss": 0.3427, + "step": 19990 + }, + { + "epoch": 25.662387676508345, + "grad_norm": 1.0315234661102295, + "learning_rate": 2.4782627299957212e-05, + "loss": 0.3376, + "step": 19991 + }, + { + "epoch": 25.66367137355584, + "grad_norm": 1.3767919540405273, + "learning_rate": 2.4782199400941377e-05, + "loss": 0.3607, + "step": 19992 + }, + { + "epoch": 25.664955070603337, + "grad_norm": 1.5688999891281128, + "learning_rate": 2.478177150192555e-05, + "loss": 0.372, + "step": 19993 + }, + { + "epoch": 25.666238767650835, + "grad_norm": 2.9105143547058105, + "learning_rate": 2.4781343602909714e-05, + "loss": 0.3378, + "step": 19994 + }, + { + "epoch": 25.66752246469833, + "grad_norm": 1.4330902099609375, + "learning_rate": 2.478091570389388e-05, 
+ "loss": 0.3687, + "step": 19995 + }, + { + "epoch": 25.668806161745827, + "grad_norm": 1.4990637302398682, + "learning_rate": 2.478048780487805e-05, + "loss": 0.3505, + "step": 19996 + }, + { + "epoch": 25.670089858793325, + "grad_norm": 2.787112236022949, + "learning_rate": 2.4780059905862216e-05, + "loss": 0.3309, + "step": 19997 + }, + { + "epoch": 25.671373555840823, + "grad_norm": 1.1438698768615723, + "learning_rate": 2.4779632006846384e-05, + "loss": 0.3585, + "step": 19998 + }, + { + "epoch": 25.672657252888317, + "grad_norm": 1.6451563835144043, + "learning_rate": 2.4779204107830553e-05, + "loss": 0.3257, + "step": 19999 + }, + { + "epoch": 25.673940949935815, + "grad_norm": 1.8670560121536255, + "learning_rate": 2.477877620881472e-05, + "loss": 0.3633, + "step": 20000 + }, + { + "epoch": 25.673940949935815, + "eval_cer": 0.27200411200313296, + "eval_loss": 0.4731506407260895, + "eval_runtime": 13.5382, + "eval_samples_per_second": 72.61, + "eval_steps_per_second": 0.517, + "eval_wer": 0.46594817702361846, + "step": 20000 + }, + { + "epoch": 25.675224646983313, + "grad_norm": 1.7856608629226685, + "learning_rate": 2.477834830979889e-05, + "loss": 0.3286, + "step": 20001 + }, + { + "epoch": 25.676508344030808, + "grad_norm": 1.300216794013977, + "learning_rate": 2.4777920410783055e-05, + "loss": 0.3269, + "step": 20002 + }, + { + "epoch": 25.677792041078305, + "grad_norm": 0.8845502734184265, + "learning_rate": 2.4777492511767223e-05, + "loss": 0.3284, + "step": 20003 + }, + { + "epoch": 25.679075738125803, + "grad_norm": 2.4767444133758545, + "learning_rate": 2.477706461275139e-05, + "loss": 0.3314, + "step": 20004 + }, + { + "epoch": 25.680359435173298, + "grad_norm": 1.9911226034164429, + "learning_rate": 2.477663671373556e-05, + "loss": 0.3208, + "step": 20005 + }, + { + "epoch": 25.681643132220795, + "grad_norm": 1.6621403694152832, + "learning_rate": 2.4776208814719725e-05, + "loss": 0.3529, + "step": 20006 + }, + { + "epoch": 25.682926829268293, + 
"grad_norm": 1.0125560760498047, + "learning_rate": 2.4775780915703897e-05, + "loss": 0.3811, + "step": 20007 + }, + { + "epoch": 25.68421052631579, + "grad_norm": 1.373025894165039, + "learning_rate": 2.477535301668806e-05, + "loss": 0.3722, + "step": 20008 + }, + { + "epoch": 25.685494223363285, + "grad_norm": 4.556556224822998, + "learning_rate": 2.477492511767223e-05, + "loss": 0.3521, + "step": 20009 + }, + { + "epoch": 25.686777920410783, + "grad_norm": 14.381887435913086, + "learning_rate": 2.47744972186564e-05, + "loss": 0.3709, + "step": 20010 + }, + { + "epoch": 25.68806161745828, + "grad_norm": 1.7111968994140625, + "learning_rate": 2.4774069319640563e-05, + "loss": 0.3444, + "step": 20011 + }, + { + "epoch": 25.689345314505776, + "grad_norm": 1.5685083866119385, + "learning_rate": 2.4773641420624735e-05, + "loss": 0.3953, + "step": 20012 + }, + { + "epoch": 25.690629011553273, + "grad_norm": 1.5019203424453735, + "learning_rate": 2.47732135216089e-05, + "loss": 0.3684, + "step": 20013 + }, + { + "epoch": 25.69191270860077, + "grad_norm": 5.528083324432373, + "learning_rate": 2.477278562259307e-05, + "loss": 0.3566, + "step": 20014 + }, + { + "epoch": 25.693196405648266, + "grad_norm": 4.520783424377441, + "learning_rate": 2.4772357723577237e-05, + "loss": 0.3759, + "step": 20015 + }, + { + "epoch": 25.694480102695763, + "grad_norm": 1.9760363101959229, + "learning_rate": 2.4771929824561402e-05, + "loss": 0.3496, + "step": 20016 + }, + { + "epoch": 25.69576379974326, + "grad_norm": 1.5928210020065308, + "learning_rate": 2.4771501925545574e-05, + "loss": 0.3872, + "step": 20017 + }, + { + "epoch": 25.69704749679076, + "grad_norm": 2.6540133953094482, + "learning_rate": 2.477107402652974e-05, + "loss": 0.4234, + "step": 20018 + }, + { + "epoch": 25.698331193838253, + "grad_norm": 1.181037187576294, + "learning_rate": 2.4770646127513907e-05, + "loss": 0.3578, + "step": 20019 + }, + { + "epoch": 25.69961489088575, + "grad_norm": 4.694679260253906, + 
"learning_rate": 2.4770218228498076e-05, + "loss": 0.3674, + "step": 20020 + }, + { + "epoch": 25.70089858793325, + "grad_norm": 1.7051643133163452, + "learning_rate": 2.4769790329482244e-05, + "loss": 0.4403, + "step": 20021 + }, + { + "epoch": 25.702182284980744, + "grad_norm": 1.6772209405899048, + "learning_rate": 2.476936243046641e-05, + "loss": 0.4195, + "step": 20022 + }, + { + "epoch": 25.70346598202824, + "grad_norm": 2.1040923595428467, + "learning_rate": 2.4768934531450578e-05, + "loss": 0.4942, + "step": 20023 + }, + { + "epoch": 25.70474967907574, + "grad_norm": 3.020535707473755, + "learning_rate": 2.4768506632434746e-05, + "loss": 0.4827, + "step": 20024 + }, + { + "epoch": 25.706033376123234, + "grad_norm": 3.2068567276000977, + "learning_rate": 2.4768078733418914e-05, + "loss": 0.6499, + "step": 20025 + }, + { + "epoch": 25.70731707317073, + "grad_norm": 2.4666380882263184, + "learning_rate": 2.4767650834403083e-05, + "loss": 0.3264, + "step": 20026 + }, + { + "epoch": 25.70860077021823, + "grad_norm": 1.5407534837722778, + "learning_rate": 2.4767222935387248e-05, + "loss": 0.3018, + "step": 20027 + }, + { + "epoch": 25.709884467265724, + "grad_norm": 1.8507013320922852, + "learning_rate": 2.4766795036371416e-05, + "loss": 0.35, + "step": 20028 + }, + { + "epoch": 25.71116816431322, + "grad_norm": 1.923220157623291, + "learning_rate": 2.4766367137355585e-05, + "loss": 0.3148, + "step": 20029 + }, + { + "epoch": 25.71245186136072, + "grad_norm": 1.0013890266418457, + "learning_rate": 2.476593923833975e-05, + "loss": 0.3127, + "step": 20030 + }, + { + "epoch": 25.713735558408217, + "grad_norm": 0.9832391142845154, + "learning_rate": 2.476551133932392e-05, + "loss": 0.3375, + "step": 20031 + }, + { + "epoch": 25.71501925545571, + "grad_norm": 1.4217673540115356, + "learning_rate": 2.4765083440308087e-05, + "loss": 0.3391, + "step": 20032 + }, + { + "epoch": 25.71630295250321, + "grad_norm": 0.9095937013626099, + "learning_rate": 2.476465554129226e-05, 
+ "loss": 0.3037, + "step": 20033 + }, + { + "epoch": 25.717586649550707, + "grad_norm": 1.3613539934158325, + "learning_rate": 2.4764227642276423e-05, + "loss": 0.3235, + "step": 20034 + }, + { + "epoch": 25.7188703465982, + "grad_norm": 1.0250182151794434, + "learning_rate": 2.476379974326059e-05, + "loss": 0.3817, + "step": 20035 + }, + { + "epoch": 25.7201540436457, + "grad_norm": 1.0527143478393555, + "learning_rate": 2.476337184424476e-05, + "loss": 0.3496, + "step": 20036 + }, + { + "epoch": 25.721437740693197, + "grad_norm": 1.1213798522949219, + "learning_rate": 2.4762943945228925e-05, + "loss": 0.3094, + "step": 20037 + }, + { + "epoch": 25.72272143774069, + "grad_norm": 3.3464291095733643, + "learning_rate": 2.4762516046213094e-05, + "loss": 0.3499, + "step": 20038 + }, + { + "epoch": 25.72400513478819, + "grad_norm": 1.2922582626342773, + "learning_rate": 2.4762088147197262e-05, + "loss": 0.3606, + "step": 20039 + }, + { + "epoch": 25.725288831835687, + "grad_norm": 1.1738700866699219, + "learning_rate": 2.476166024818143e-05, + "loss": 0.3428, + "step": 20040 + }, + { + "epoch": 25.726572528883185, + "grad_norm": 2.5181994438171387, + "learning_rate": 2.47612323491656e-05, + "loss": 0.3695, + "step": 20041 + }, + { + "epoch": 25.72785622593068, + "grad_norm": 1.2172845602035522, + "learning_rate": 2.4760804450149764e-05, + "loss": 0.3179, + "step": 20042 + }, + { + "epoch": 25.729139922978177, + "grad_norm": 0.9885530471801758, + "learning_rate": 2.4760376551133932e-05, + "loss": 0.2956, + "step": 20043 + }, + { + "epoch": 25.730423620025675, + "grad_norm": 1.7370685338974, + "learning_rate": 2.47599486521181e-05, + "loss": 0.3441, + "step": 20044 + }, + { + "epoch": 25.73170731707317, + "grad_norm": 2.7485742568969727, + "learning_rate": 2.475952075310227e-05, + "loss": 0.3696, + "step": 20045 + }, + { + "epoch": 25.732991014120667, + "grad_norm": 1.237876057624817, + "learning_rate": 2.4759092854086434e-05, + "loss": 0.3494, + "step": 20046 + }, + { 
+ "epoch": 25.734274711168165, + "grad_norm": 2.004070520401001, + "learning_rate": 2.4758664955070606e-05, + "loss": 0.3404, + "step": 20047 + }, + { + "epoch": 25.73555840821566, + "grad_norm": 2.3574860095977783, + "learning_rate": 2.475823705605477e-05, + "loss": 0.3572, + "step": 20048 + }, + { + "epoch": 25.736842105263158, + "grad_norm": 1.0579568147659302, + "learning_rate": 2.475780915703894e-05, + "loss": 0.3638, + "step": 20049 + }, + { + "epoch": 25.738125802310655, + "grad_norm": 1.5889557600021362, + "learning_rate": 2.4757381258023108e-05, + "loss": 0.3671, + "step": 20050 + }, + { + "epoch": 25.739409499358153, + "grad_norm": 1.1781648397445679, + "learning_rate": 2.4756953359007273e-05, + "loss": 0.3543, + "step": 20051 + }, + { + "epoch": 25.740693196405648, + "grad_norm": 1.15861177444458, + "learning_rate": 2.4756525459991445e-05, + "loss": 0.332, + "step": 20052 + }, + { + "epoch": 25.741976893453145, + "grad_norm": 0.9993641376495361, + "learning_rate": 2.475609756097561e-05, + "loss": 0.319, + "step": 20053 + }, + { + "epoch": 25.743260590500643, + "grad_norm": 1.0656906366348267, + "learning_rate": 2.4755669661959778e-05, + "loss": 0.3226, + "step": 20054 + }, + { + "epoch": 25.744544287548138, + "grad_norm": 1.6825767755508423, + "learning_rate": 2.4755241762943946e-05, + "loss": 0.3325, + "step": 20055 + }, + { + "epoch": 25.745827984595635, + "grad_norm": 1.3181952238082886, + "learning_rate": 2.475481386392811e-05, + "loss": 0.3481, + "step": 20056 + }, + { + "epoch": 25.747111681643133, + "grad_norm": 1.9554437398910522, + "learning_rate": 2.4754385964912283e-05, + "loss": 0.3473, + "step": 20057 + }, + { + "epoch": 25.748395378690628, + "grad_norm": 3.8790817260742188, + "learning_rate": 2.475395806589645e-05, + "loss": 0.3511, + "step": 20058 + }, + { + "epoch": 25.749679075738126, + "grad_norm": 2.5325725078582764, + "learning_rate": 2.4753530166880617e-05, + "loss": 0.3383, + "step": 20059 + }, + { + "epoch": 25.750962772785623, + 
"grad_norm": 1.1944937705993652, + "learning_rate": 2.4753102267864785e-05, + "loss": 0.3777, + "step": 20060 + }, + { + "epoch": 25.752246469833118, + "grad_norm": 2.348328113555908, + "learning_rate": 2.4752674368848954e-05, + "loss": 0.3083, + "step": 20061 + }, + { + "epoch": 25.753530166880616, + "grad_norm": 1.209350824356079, + "learning_rate": 2.475224646983312e-05, + "loss": 0.4018, + "step": 20062 + }, + { + "epoch": 25.754813863928113, + "grad_norm": 1.8257280588150024, + "learning_rate": 2.4751818570817287e-05, + "loss": 0.3964, + "step": 20063 + }, + { + "epoch": 25.75609756097561, + "grad_norm": 2.511169672012329, + "learning_rate": 2.4751390671801455e-05, + "loss": 0.3811, + "step": 20064 + }, + { + "epoch": 25.757381258023106, + "grad_norm": 1.8000050783157349, + "learning_rate": 2.4750962772785624e-05, + "loss": 0.3695, + "step": 20065 + }, + { + "epoch": 25.758664955070603, + "grad_norm": 1.8943766355514526, + "learning_rate": 2.4750534873769792e-05, + "loss": 0.3498, + "step": 20066 + }, + { + "epoch": 25.7599486521181, + "grad_norm": 2.3845696449279785, + "learning_rate": 2.4750106974753957e-05, + "loss": 0.3813, + "step": 20067 + }, + { + "epoch": 25.761232349165596, + "grad_norm": 1.2168524265289307, + "learning_rate": 2.474967907573813e-05, + "loss": 0.3577, + "step": 20068 + }, + { + "epoch": 25.762516046213094, + "grad_norm": 3.19647216796875, + "learning_rate": 2.4749251176722294e-05, + "loss": 0.3612, + "step": 20069 + }, + { + "epoch": 25.76379974326059, + "grad_norm": 4.026616096496582, + "learning_rate": 2.474882327770646e-05, + "loss": 0.3669, + "step": 20070 + }, + { + "epoch": 25.765083440308086, + "grad_norm": 2.2537288665771484, + "learning_rate": 2.474839537869063e-05, + "loss": 0.4163, + "step": 20071 + }, + { + "epoch": 25.766367137355584, + "grad_norm": 2.286301612854004, + "learning_rate": 2.4747967479674796e-05, + "loss": 0.3727, + "step": 20072 + }, + { + "epoch": 25.76765083440308, + "grad_norm": 1.1532902717590332, + 
"learning_rate": 2.4747539580658968e-05, + "loss": 0.4518, + "step": 20073 + }, + { + "epoch": 25.76893453145058, + "grad_norm": 1.9046651124954224, + "learning_rate": 2.4747111681643133e-05, + "loss": 0.441, + "step": 20074 + }, + { + "epoch": 25.770218228498074, + "grad_norm": 1.8339108228683472, + "learning_rate": 2.47466837826273e-05, + "loss": 0.5645, + "step": 20075 + }, + { + "epoch": 25.77150192554557, + "grad_norm": 0.9637982249259949, + "learning_rate": 2.474625588361147e-05, + "loss": 0.307, + "step": 20076 + }, + { + "epoch": 25.77278562259307, + "grad_norm": 1.1869844198226929, + "learning_rate": 2.4745827984595635e-05, + "loss": 0.3218, + "step": 20077 + }, + { + "epoch": 25.774069319640564, + "grad_norm": 1.301024079322815, + "learning_rate": 2.4745400085579803e-05, + "loss": 0.3251, + "step": 20078 + }, + { + "epoch": 25.77535301668806, + "grad_norm": 0.8076383471488953, + "learning_rate": 2.474497218656397e-05, + "loss": 0.3214, + "step": 20079 + }, + { + "epoch": 25.77663671373556, + "grad_norm": 1.7861720323562622, + "learning_rate": 2.474454428754814e-05, + "loss": 0.3232, + "step": 20080 + }, + { + "epoch": 25.777920410783054, + "grad_norm": 0.9028770923614502, + "learning_rate": 2.4744116388532308e-05, + "loss": 0.3359, + "step": 20081 + }, + { + "epoch": 25.77920410783055, + "grad_norm": 1.0448116064071655, + "learning_rate": 2.4743688489516477e-05, + "loss": 0.3652, + "step": 20082 + }, + { + "epoch": 25.78048780487805, + "grad_norm": 2.0842480659484863, + "learning_rate": 2.474326059050064e-05, + "loss": 0.3441, + "step": 20083 + }, + { + "epoch": 25.781771501925547, + "grad_norm": 1.5647141933441162, + "learning_rate": 2.474283269148481e-05, + "loss": 0.3827, + "step": 20084 + }, + { + "epoch": 25.78305519897304, + "grad_norm": 1.546588659286499, + "learning_rate": 2.474240479246898e-05, + "loss": 0.343, + "step": 20085 + }, + { + "epoch": 25.78433889602054, + "grad_norm": 0.9022968411445618, + "learning_rate": 2.4741976893453144e-05, + 
"loss": 0.3257, + "step": 20086 + }, + { + "epoch": 25.785622593068037, + "grad_norm": 1.1219533681869507, + "learning_rate": 2.4741548994437315e-05, + "loss": 0.3865, + "step": 20087 + }, + { + "epoch": 25.78690629011553, + "grad_norm": 1.009944200515747, + "learning_rate": 2.474112109542148e-05, + "loss": 0.3566, + "step": 20088 + }, + { + "epoch": 25.78818998716303, + "grad_norm": 1.0480200052261353, + "learning_rate": 2.474069319640565e-05, + "loss": 0.3399, + "step": 20089 + }, + { + "epoch": 25.789473684210527, + "grad_norm": 1.2387903928756714, + "learning_rate": 2.4740265297389817e-05, + "loss": 0.321, + "step": 20090 + }, + { + "epoch": 25.79075738125802, + "grad_norm": 1.8342344760894775, + "learning_rate": 2.4739837398373982e-05, + "loss": 0.3673, + "step": 20091 + }, + { + "epoch": 25.79204107830552, + "grad_norm": 1.1695659160614014, + "learning_rate": 2.4739409499358154e-05, + "loss": 0.3586, + "step": 20092 + }, + { + "epoch": 25.793324775353017, + "grad_norm": 1.2816200256347656, + "learning_rate": 2.473898160034232e-05, + "loss": 0.3016, + "step": 20093 + }, + { + "epoch": 25.794608472400512, + "grad_norm": 1.0622012615203857, + "learning_rate": 2.4738553701326487e-05, + "loss": 0.3479, + "step": 20094 + }, + { + "epoch": 25.79589216944801, + "grad_norm": 1.284056305885315, + "learning_rate": 2.4738125802310656e-05, + "loss": 0.3857, + "step": 20095 + }, + { + "epoch": 25.797175866495508, + "grad_norm": 0.8394838571548462, + "learning_rate": 2.473769790329482e-05, + "loss": 0.3147, + "step": 20096 + }, + { + "epoch": 25.798459563543005, + "grad_norm": 1.607422947883606, + "learning_rate": 2.4737270004278993e-05, + "loss": 0.3726, + "step": 20097 + }, + { + "epoch": 25.7997432605905, + "grad_norm": 2.340146064758301, + "learning_rate": 2.4736842105263158e-05, + "loss": 0.3175, + "step": 20098 + }, + { + "epoch": 25.801026957637998, + "grad_norm": 0.9022497534751892, + "learning_rate": 2.4736414206247326e-05, + "loss": 0.3233, + "step": 20099 + }, + 
{ + "epoch": 25.802310654685495, + "grad_norm": 1.0313308238983154, + "learning_rate": 2.4735986307231495e-05, + "loss": 0.3607, + "step": 20100 + }, + { + "epoch": 25.80359435173299, + "grad_norm": 1.0860910415649414, + "learning_rate": 2.4735558408215663e-05, + "loss": 0.3517, + "step": 20101 + }, + { + "epoch": 25.804878048780488, + "grad_norm": 1.3024388551712036, + "learning_rate": 2.4735130509199828e-05, + "loss": 0.365, + "step": 20102 + }, + { + "epoch": 25.806161745827985, + "grad_norm": 1.241702914237976, + "learning_rate": 2.4734702610183996e-05, + "loss": 0.3632, + "step": 20103 + }, + { + "epoch": 25.80744544287548, + "grad_norm": 1.6780421733856201, + "learning_rate": 2.4734274711168165e-05, + "loss": 0.3468, + "step": 20104 + }, + { + "epoch": 25.808729139922978, + "grad_norm": 1.410438895225525, + "learning_rate": 2.4733846812152333e-05, + "loss": 0.317, + "step": 20105 + }, + { + "epoch": 25.810012836970476, + "grad_norm": 0.9331954717636108, + "learning_rate": 2.47334189131365e-05, + "loss": 0.3176, + "step": 20106 + }, + { + "epoch": 25.811296534017973, + "grad_norm": 1.4297561645507812, + "learning_rate": 2.4732991014120667e-05, + "loss": 0.4118, + "step": 20107 + }, + { + "epoch": 25.812580231065468, + "grad_norm": 1.4293664693832397, + "learning_rate": 2.473256311510484e-05, + "loss": 0.39, + "step": 20108 + }, + { + "epoch": 25.813863928112966, + "grad_norm": 1.1271628141403198, + "learning_rate": 2.4732135216089003e-05, + "loss": 0.3934, + "step": 20109 + }, + { + "epoch": 25.815147625160463, + "grad_norm": 1.5401053428649902, + "learning_rate": 2.473170731707317e-05, + "loss": 0.3387, + "step": 20110 + }, + { + "epoch": 25.816431322207958, + "grad_norm": 1.2158336639404297, + "learning_rate": 2.473127941805734e-05, + "loss": 0.3645, + "step": 20111 + }, + { + "epoch": 25.817715019255456, + "grad_norm": 1.1879158020019531, + "learning_rate": 2.4730851519041505e-05, + "loss": 0.3701, + "step": 20112 + }, + { + "epoch": 25.818998716302954, + 
"grad_norm": 1.5806398391723633, + "learning_rate": 2.4730423620025677e-05, + "loss": 0.3475, + "step": 20113 + }, + { + "epoch": 25.820282413350448, + "grad_norm": 1.559078574180603, + "learning_rate": 2.4729995721009842e-05, + "loss": 0.3859, + "step": 20114 + }, + { + "epoch": 25.821566110397946, + "grad_norm": 5.157236099243164, + "learning_rate": 2.472956782199401e-05, + "loss": 0.3457, + "step": 20115 + }, + { + "epoch": 25.822849807445444, + "grad_norm": 1.9228838682174683, + "learning_rate": 2.472913992297818e-05, + "loss": 0.4053, + "step": 20116 + }, + { + "epoch": 25.82413350449294, + "grad_norm": 1.4862704277038574, + "learning_rate": 2.4728712023962344e-05, + "loss": 0.3809, + "step": 20117 + }, + { + "epoch": 25.825417201540436, + "grad_norm": 1.6967871189117432, + "learning_rate": 2.4728284124946512e-05, + "loss": 0.3541, + "step": 20118 + }, + { + "epoch": 25.826700898587934, + "grad_norm": 2.383012533187866, + "learning_rate": 2.472785622593068e-05, + "loss": 0.3763, + "step": 20119 + }, + { + "epoch": 25.82798459563543, + "grad_norm": 1.5855085849761963, + "learning_rate": 2.472742832691485e-05, + "loss": 0.4008, + "step": 20120 + }, + { + "epoch": 25.829268292682926, + "grad_norm": 2.556593179702759, + "learning_rate": 2.4727000427899018e-05, + "loss": 0.3826, + "step": 20121 + }, + { + "epoch": 25.830551989730424, + "grad_norm": 1.6881576776504517, + "learning_rate": 2.4726572528883186e-05, + "loss": 0.3962, + "step": 20122 + }, + { + "epoch": 25.83183568677792, + "grad_norm": 1.8142993450164795, + "learning_rate": 2.472614462986735e-05, + "loss": 0.4617, + "step": 20123 + }, + { + "epoch": 25.833119383825416, + "grad_norm": 1.48975670337677, + "learning_rate": 2.472571673085152e-05, + "loss": 0.4712, + "step": 20124 + }, + { + "epoch": 25.834403080872914, + "grad_norm": 2.039823532104492, + "learning_rate": 2.4725288831835688e-05, + "loss": 0.4927, + "step": 20125 + }, + { + "epoch": 25.83568677792041, + "grad_norm": 1.5171129703521729, + 
"learning_rate": 2.4724860932819853e-05, + "loss": 0.3223, + "step": 20126 + }, + { + "epoch": 25.836970474967906, + "grad_norm": 1.4164289236068726, + "learning_rate": 2.4724433033804025e-05, + "loss": 0.3263, + "step": 20127 + }, + { + "epoch": 25.838254172015404, + "grad_norm": 0.8199660778045654, + "learning_rate": 2.472400513478819e-05, + "loss": 0.3353, + "step": 20128 + }, + { + "epoch": 25.8395378690629, + "grad_norm": 1.7520400285720825, + "learning_rate": 2.472357723577236e-05, + "loss": 0.3373, + "step": 20129 + }, + { + "epoch": 25.8408215661104, + "grad_norm": 4.4763970375061035, + "learning_rate": 2.4723149336756527e-05, + "loss": 0.3687, + "step": 20130 + }, + { + "epoch": 25.842105263157894, + "grad_norm": 1.0094122886657715, + "learning_rate": 2.472272143774069e-05, + "loss": 0.3112, + "step": 20131 + }, + { + "epoch": 25.84338896020539, + "grad_norm": 1.721562385559082, + "learning_rate": 2.4722293538724863e-05, + "loss": 0.3112, + "step": 20132 + }, + { + "epoch": 25.84467265725289, + "grad_norm": 1.061394453048706, + "learning_rate": 2.472186563970903e-05, + "loss": 0.3717, + "step": 20133 + }, + { + "epoch": 25.845956354300384, + "grad_norm": 4.1948771476745605, + "learning_rate": 2.4721437740693197e-05, + "loss": 0.3452, + "step": 20134 + }, + { + "epoch": 25.84724005134788, + "grad_norm": 1.1400072574615479, + "learning_rate": 2.4721009841677365e-05, + "loss": 0.3424, + "step": 20135 + }, + { + "epoch": 25.84852374839538, + "grad_norm": 1.489161491394043, + "learning_rate": 2.4720581942661534e-05, + "loss": 0.3264, + "step": 20136 + }, + { + "epoch": 25.849807445442874, + "grad_norm": 1.4180653095245361, + "learning_rate": 2.4720154043645702e-05, + "loss": 0.3492, + "step": 20137 + }, + { + "epoch": 25.85109114249037, + "grad_norm": 1.4744093418121338, + "learning_rate": 2.4719726144629867e-05, + "loss": 0.323, + "step": 20138 + }, + { + "epoch": 25.85237483953787, + "grad_norm": 2.36615252494812, + "learning_rate": 2.4719298245614035e-05, + 
"loss": 0.3429, + "step": 20139 + }, + { + "epoch": 25.853658536585368, + "grad_norm": 0.9668160080909729, + "learning_rate": 2.4718870346598204e-05, + "loss": 0.3731, + "step": 20140 + }, + { + "epoch": 25.854942233632862, + "grad_norm": 1.2251349687576294, + "learning_rate": 2.4718442447582372e-05, + "loss": 0.3627, + "step": 20141 + }, + { + "epoch": 25.85622593068036, + "grad_norm": 1.2326146364212036, + "learning_rate": 2.4718014548566537e-05, + "loss": 0.3794, + "step": 20142 + }, + { + "epoch": 25.857509627727858, + "grad_norm": 1.2082343101501465, + "learning_rate": 2.471758664955071e-05, + "loss": 0.2935, + "step": 20143 + }, + { + "epoch": 25.858793324775352, + "grad_norm": 1.2207612991333008, + "learning_rate": 2.4717158750534874e-05, + "loss": 0.3512, + "step": 20144 + }, + { + "epoch": 25.86007702182285, + "grad_norm": 3.1655237674713135, + "learning_rate": 2.4716730851519043e-05, + "loss": 0.3364, + "step": 20145 + }, + { + "epoch": 25.861360718870348, + "grad_norm": 1.0321296453475952, + "learning_rate": 2.471630295250321e-05, + "loss": 0.3782, + "step": 20146 + }, + { + "epoch": 25.862644415917842, + "grad_norm": 0.9049351215362549, + "learning_rate": 2.4715875053487376e-05, + "loss": 0.3317, + "step": 20147 + }, + { + "epoch": 25.86392811296534, + "grad_norm": 1.2855232954025269, + "learning_rate": 2.4715447154471548e-05, + "loss": 0.3502, + "step": 20148 + }, + { + "epoch": 25.865211810012838, + "grad_norm": 0.9559351205825806, + "learning_rate": 2.4715019255455713e-05, + "loss": 0.3189, + "step": 20149 + }, + { + "epoch": 25.866495507060336, + "grad_norm": 1.1502056121826172, + "learning_rate": 2.471459135643988e-05, + "loss": 0.3793, + "step": 20150 + }, + { + "epoch": 25.86777920410783, + "grad_norm": 1.6061375141143799, + "learning_rate": 2.471416345742405e-05, + "loss": 0.3677, + "step": 20151 + }, + { + "epoch": 25.869062901155328, + "grad_norm": 1.4084103107452393, + "learning_rate": 2.4713735558408215e-05, + "loss": 0.3667, + "step": 20152 
+ }, + { + "epoch": 25.870346598202826, + "grad_norm": 2.082956075668335, + "learning_rate": 2.4713307659392386e-05, + "loss": 0.3722, + "step": 20153 + }, + { + "epoch": 25.87163029525032, + "grad_norm": 1.3204678297042847, + "learning_rate": 2.471287976037655e-05, + "loss": 0.3568, + "step": 20154 + }, + { + "epoch": 25.872913992297818, + "grad_norm": 1.070723295211792, + "learning_rate": 2.471245186136072e-05, + "loss": 0.3646, + "step": 20155 + }, + { + "epoch": 25.874197689345316, + "grad_norm": 1.3646881580352783, + "learning_rate": 2.4712023962344888e-05, + "loss": 0.3549, + "step": 20156 + }, + { + "epoch": 25.87548138639281, + "grad_norm": 3.223914861679077, + "learning_rate": 2.4711596063329053e-05, + "loss": 0.3313, + "step": 20157 + }, + { + "epoch": 25.876765083440308, + "grad_norm": 1.3680262565612793, + "learning_rate": 2.4711168164313222e-05, + "loss": 0.3088, + "step": 20158 + }, + { + "epoch": 25.878048780487806, + "grad_norm": 1.413408637046814, + "learning_rate": 2.471074026529739e-05, + "loss": 0.3901, + "step": 20159 + }, + { + "epoch": 25.8793324775353, + "grad_norm": 2.195016622543335, + "learning_rate": 2.471031236628156e-05, + "loss": 0.3778, + "step": 20160 + }, + { + "epoch": 25.880616174582798, + "grad_norm": 1.413907766342163, + "learning_rate": 2.4709884467265724e-05, + "loss": 0.3952, + "step": 20161 + }, + { + "epoch": 25.881899871630296, + "grad_norm": 1.1812461614608765, + "learning_rate": 2.4709456568249895e-05, + "loss": 0.4058, + "step": 20162 + }, + { + "epoch": 25.883183568677794, + "grad_norm": 1.2400310039520264, + "learning_rate": 2.470902866923406e-05, + "loss": 0.3253, + "step": 20163 + }, + { + "epoch": 25.884467265725288, + "grad_norm": 1.4143246412277222, + "learning_rate": 2.470860077021823e-05, + "loss": 0.3669, + "step": 20164 + }, + { + "epoch": 25.885750962772786, + "grad_norm": 4.796844959259033, + "learning_rate": 2.4708172871202397e-05, + "loss": 0.36, + "step": 20165 + }, + { + "epoch": 25.887034659820284, + 
"grad_norm": 1.0515061616897583, + "learning_rate": 2.4707744972186562e-05, + "loss": 0.414, + "step": 20166 + }, + { + "epoch": 25.888318356867778, + "grad_norm": 1.0995432138442993, + "learning_rate": 2.4707317073170734e-05, + "loss": 0.3842, + "step": 20167 + }, + { + "epoch": 25.889602053915276, + "grad_norm": 0.9995795488357544, + "learning_rate": 2.47068891741549e-05, + "loss": 0.3604, + "step": 20168 + }, + { + "epoch": 25.890885750962774, + "grad_norm": 2.1603000164031982, + "learning_rate": 2.4706461275139067e-05, + "loss": 0.3934, + "step": 20169 + }, + { + "epoch": 25.892169448010268, + "grad_norm": 1.534339427947998, + "learning_rate": 2.4706033376123236e-05, + "loss": 0.3957, + "step": 20170 + }, + { + "epoch": 25.893453145057766, + "grad_norm": 1.6384044885635376, + "learning_rate": 2.47056054771074e-05, + "loss": 0.4254, + "step": 20171 + }, + { + "epoch": 25.894736842105264, + "grad_norm": 1.6374109983444214, + "learning_rate": 2.4705177578091573e-05, + "loss": 0.3998, + "step": 20172 + }, + { + "epoch": 25.89602053915276, + "grad_norm": 2.4492173194885254, + "learning_rate": 2.4704749679075738e-05, + "loss": 0.4715, + "step": 20173 + }, + { + "epoch": 25.897304236200256, + "grad_norm": 5.566817760467529, + "learning_rate": 2.4704321780059906e-05, + "loss": 0.5204, + "step": 20174 + }, + { + "epoch": 25.898587933247754, + "grad_norm": 2.3239986896514893, + "learning_rate": 2.4703893881044075e-05, + "loss": 0.5751, + "step": 20175 + }, + { + "epoch": 25.89987163029525, + "grad_norm": 1.8747860193252563, + "learning_rate": 2.4703465982028243e-05, + "loss": 0.3269, + "step": 20176 + }, + { + "epoch": 25.901155327342746, + "grad_norm": 1.0684547424316406, + "learning_rate": 2.4703038083012408e-05, + "loss": 0.3537, + "step": 20177 + }, + { + "epoch": 25.902439024390244, + "grad_norm": 0.8070211410522461, + "learning_rate": 2.4702610183996576e-05, + "loss": 0.3345, + "step": 20178 + }, + { + "epoch": 25.90372272143774, + "grad_norm": 0.9650954604148865, 
+ "learning_rate": 2.4702182284980745e-05, + "loss": 0.3456, + "step": 20179 + }, + { + "epoch": 25.905006418485236, + "grad_norm": 1.367666482925415, + "learning_rate": 2.4701754385964913e-05, + "loss": 0.348, + "step": 20180 + }, + { + "epoch": 25.906290115532734, + "grad_norm": 1.9419591426849365, + "learning_rate": 2.470132648694908e-05, + "loss": 0.3371, + "step": 20181 + }, + { + "epoch": 25.90757381258023, + "grad_norm": 3.5625579357147217, + "learning_rate": 2.4700898587933247e-05, + "loss": 0.3438, + "step": 20182 + }, + { + "epoch": 25.90885750962773, + "grad_norm": 1.2818447351455688, + "learning_rate": 2.470047068891742e-05, + "loss": 0.3238, + "step": 20183 + }, + { + "epoch": 25.910141206675224, + "grad_norm": 1.0095561742782593, + "learning_rate": 2.4700042789901584e-05, + "loss": 0.3579, + "step": 20184 + }, + { + "epoch": 25.911424903722722, + "grad_norm": 2.7583861351013184, + "learning_rate": 2.469961489088575e-05, + "loss": 0.3701, + "step": 20185 + }, + { + "epoch": 25.91270860077022, + "grad_norm": 1.3603695631027222, + "learning_rate": 2.469918699186992e-05, + "loss": 0.3518, + "step": 20186 + }, + { + "epoch": 25.913992297817714, + "grad_norm": 0.965028703212738, + "learning_rate": 2.4698759092854085e-05, + "loss": 0.3549, + "step": 20187 + }, + { + "epoch": 25.915275994865212, + "grad_norm": 0.8556326627731323, + "learning_rate": 2.4698331193838257e-05, + "loss": 0.3248, + "step": 20188 + }, + { + "epoch": 25.91655969191271, + "grad_norm": 0.8840798735618591, + "learning_rate": 2.4697903294822422e-05, + "loss": 0.3322, + "step": 20189 + }, + { + "epoch": 25.917843388960204, + "grad_norm": 1.6767805814743042, + "learning_rate": 2.469747539580659e-05, + "loss": 0.3519, + "step": 20190 + }, + { + "epoch": 25.919127086007702, + "grad_norm": 1.5322691202163696, + "learning_rate": 2.469704749679076e-05, + "loss": 0.3543, + "step": 20191 + }, + { + "epoch": 25.9204107830552, + "grad_norm": 1.5349771976470947, + "learning_rate": 
2.4696619597774924e-05, + "loss": 0.3516, + "step": 20192 + }, + { + "epoch": 25.921694480102694, + "grad_norm": 1.4433993101119995, + "learning_rate": 2.4696191698759092e-05, + "loss": 0.3137, + "step": 20193 + }, + { + "epoch": 25.922978177150192, + "grad_norm": 3.0354020595550537, + "learning_rate": 2.469576379974326e-05, + "loss": 0.3378, + "step": 20194 + }, + { + "epoch": 25.92426187419769, + "grad_norm": 1.7889914512634277, + "learning_rate": 2.469533590072743e-05, + "loss": 0.3745, + "step": 20195 + }, + { + "epoch": 25.925545571245188, + "grad_norm": 1.4037373065948486, + "learning_rate": 2.4694908001711598e-05, + "loss": 0.3252, + "step": 20196 + }, + { + "epoch": 25.926829268292682, + "grad_norm": 1.1640639305114746, + "learning_rate": 2.4694480102695766e-05, + "loss": 0.3543, + "step": 20197 + }, + { + "epoch": 25.92811296534018, + "grad_norm": 1.772193193435669, + "learning_rate": 2.469405220367993e-05, + "loss": 0.326, + "step": 20198 + }, + { + "epoch": 25.929396662387678, + "grad_norm": 3.5887985229492188, + "learning_rate": 2.46936243046641e-05, + "loss": 0.3377, + "step": 20199 + }, + { + "epoch": 25.930680359435172, + "grad_norm": 4.102062225341797, + "learning_rate": 2.4693196405648268e-05, + "loss": 0.3439, + "step": 20200 + }, + { + "epoch": 25.93196405648267, + "grad_norm": 3.0387704372406006, + "learning_rate": 2.4692768506632433e-05, + "loss": 0.3291, + "step": 20201 + }, + { + "epoch": 25.933247753530168, + "grad_norm": 1.1958848237991333, + "learning_rate": 2.4692340607616605e-05, + "loss": 0.3747, + "step": 20202 + }, + { + "epoch": 25.934531450577662, + "grad_norm": 1.6689748764038086, + "learning_rate": 2.469191270860077e-05, + "loss": 0.3284, + "step": 20203 + }, + { + "epoch": 25.93581514762516, + "grad_norm": 2.08377742767334, + "learning_rate": 2.469148480958494e-05, + "loss": 0.3338, + "step": 20204 + }, + { + "epoch": 25.937098844672658, + "grad_norm": 1.0808441638946533, + "learning_rate": 2.4691056910569107e-05, + "loss": 
0.4035, + "step": 20205 + }, + { + "epoch": 25.938382541720156, + "grad_norm": 1.0930628776550293, + "learning_rate": 2.469062901155327e-05, + "loss": 0.3598, + "step": 20206 + }, + { + "epoch": 25.93966623876765, + "grad_norm": 1.0209555625915527, + "learning_rate": 2.4690201112537443e-05, + "loss": 0.3563, + "step": 20207 + }, + { + "epoch": 25.940949935815148, + "grad_norm": 0.9619274735450745, + "learning_rate": 2.468977321352161e-05, + "loss": 0.3407, + "step": 20208 + }, + { + "epoch": 25.942233632862646, + "grad_norm": 1.408168077468872, + "learning_rate": 2.4689345314505777e-05, + "loss": 0.3397, + "step": 20209 + }, + { + "epoch": 25.94351732991014, + "grad_norm": 1.35371994972229, + "learning_rate": 2.4688917415489945e-05, + "loss": 0.3854, + "step": 20210 + }, + { + "epoch": 25.944801026957638, + "grad_norm": 1.9594837427139282, + "learning_rate": 2.4688489516474114e-05, + "loss": 0.413, + "step": 20211 + }, + { + "epoch": 25.946084724005136, + "grad_norm": 1.7550086975097656, + "learning_rate": 2.4688061617458282e-05, + "loss": 0.3846, + "step": 20212 + }, + { + "epoch": 25.94736842105263, + "grad_norm": 1.8161548376083374, + "learning_rate": 2.4687633718442447e-05, + "loss": 0.399, + "step": 20213 + }, + { + "epoch": 25.948652118100128, + "grad_norm": 1.5652687549591064, + "learning_rate": 2.4687205819426616e-05, + "loss": 0.3413, + "step": 20214 + }, + { + "epoch": 25.949935815147626, + "grad_norm": 2.6754512786865234, + "learning_rate": 2.4686777920410784e-05, + "loss": 0.3497, + "step": 20215 + }, + { + "epoch": 25.951219512195124, + "grad_norm": 1.4220821857452393, + "learning_rate": 2.4686350021394952e-05, + "loss": 0.3357, + "step": 20216 + }, + { + "epoch": 25.952503209242618, + "grad_norm": 1.516609787940979, + "learning_rate": 2.4685922122379117e-05, + "loss": 0.4318, + "step": 20217 + }, + { + "epoch": 25.953786906290116, + "grad_norm": 2.6106224060058594, + "learning_rate": 2.4685494223363286e-05, + "loss": 0.4002, + "step": 20218 + }, + { + 
"epoch": 25.955070603337614, + "grad_norm": 1.813154935836792, + "learning_rate": 2.4685066324347454e-05, + "loss": 0.4054, + "step": 20219 + }, + { + "epoch": 25.956354300385108, + "grad_norm": 4.315509796142578, + "learning_rate": 2.4684638425331623e-05, + "loss": 0.4049, + "step": 20220 + }, + { + "epoch": 25.957637997432606, + "grad_norm": 1.9315394163131714, + "learning_rate": 2.468421052631579e-05, + "loss": 0.3771, + "step": 20221 + }, + { + "epoch": 25.958921694480104, + "grad_norm": 1.516717791557312, + "learning_rate": 2.4683782627299956e-05, + "loss": 0.4163, + "step": 20222 + }, + { + "epoch": 25.960205391527598, + "grad_norm": 1.8652583360671997, + "learning_rate": 2.4683354728284128e-05, + "loss": 0.3831, + "step": 20223 + }, + { + "epoch": 25.961489088575096, + "grad_norm": 2.845959186553955, + "learning_rate": 2.4682926829268293e-05, + "loss": 0.4404, + "step": 20224 + }, + { + "epoch": 25.962772785622594, + "grad_norm": 3.75555419921875, + "learning_rate": 2.4682498930252458e-05, + "loss": 0.5094, + "step": 20225 + }, + { + "epoch": 25.964056482670088, + "grad_norm": 1.0519161224365234, + "learning_rate": 2.468207103123663e-05, + "loss": 0.3227, + "step": 20226 + }, + { + "epoch": 25.965340179717586, + "grad_norm": 1.026632308959961, + "learning_rate": 2.4681643132220795e-05, + "loss": 0.337, + "step": 20227 + }, + { + "epoch": 25.966623876765084, + "grad_norm": 0.9125587940216064, + "learning_rate": 2.4681215233204967e-05, + "loss": 0.3294, + "step": 20228 + }, + { + "epoch": 25.96790757381258, + "grad_norm": 1.1108100414276123, + "learning_rate": 2.468078733418913e-05, + "loss": 0.3583, + "step": 20229 + }, + { + "epoch": 25.969191270860076, + "grad_norm": 4.12135124206543, + "learning_rate": 2.46803594351733e-05, + "loss": 0.3477, + "step": 20230 + }, + { + "epoch": 25.970474967907574, + "grad_norm": 1.2913554906845093, + "learning_rate": 2.467993153615747e-05, + "loss": 0.3606, + "step": 20231 + }, + { + "epoch": 25.971758664955072, + 
"grad_norm": 1.7353445291519165, + "learning_rate": 2.4679503637141633e-05, + "loss": 0.3322, + "step": 20232 + }, + { + "epoch": 25.973042362002566, + "grad_norm": 0.9048962593078613, + "learning_rate": 2.4679075738125802e-05, + "loss": 0.3483, + "step": 20233 + }, + { + "epoch": 25.974326059050064, + "grad_norm": 1.5469377040863037, + "learning_rate": 2.467864783910997e-05, + "loss": 0.3418, + "step": 20234 + }, + { + "epoch": 25.975609756097562, + "grad_norm": 3.281214714050293, + "learning_rate": 2.467821994009414e-05, + "loss": 0.3382, + "step": 20235 + }, + { + "epoch": 25.976893453145056, + "grad_norm": 1.6648600101470947, + "learning_rate": 2.4677792041078307e-05, + "loss": 0.3552, + "step": 20236 + }, + { + "epoch": 25.978177150192554, + "grad_norm": 1.097916603088379, + "learning_rate": 2.4677364142062475e-05, + "loss": 0.367, + "step": 20237 + }, + { + "epoch": 25.979460847240052, + "grad_norm": 2.1887900829315186, + "learning_rate": 2.467693624304664e-05, + "loss": 0.3382, + "step": 20238 + }, + { + "epoch": 25.98074454428755, + "grad_norm": 2.8255772590637207, + "learning_rate": 2.467650834403081e-05, + "loss": 0.3646, + "step": 20239 + }, + { + "epoch": 25.982028241335044, + "grad_norm": 2.19219970703125, + "learning_rate": 2.4676080445014977e-05, + "loss": 0.3437, + "step": 20240 + }, + { + "epoch": 25.983311938382542, + "grad_norm": 1.0698264837265015, + "learning_rate": 2.4675652545999142e-05, + "loss": 0.3449, + "step": 20241 + }, + { + "epoch": 25.98459563543004, + "grad_norm": 2.8827648162841797, + "learning_rate": 2.4675224646983314e-05, + "loss": 0.3648, + "step": 20242 + }, + { + "epoch": 25.985879332477534, + "grad_norm": 5.684160232543945, + "learning_rate": 2.467479674796748e-05, + "loss": 0.3511, + "step": 20243 + }, + { + "epoch": 25.987163029525032, + "grad_norm": 3.2578275203704834, + "learning_rate": 2.467436884895165e-05, + "loss": 0.3877, + "step": 20244 + }, + { + "epoch": 25.98844672657253, + "grad_norm": 2.213578224182129, + 
"learning_rate": 2.4673940949935816e-05, + "loss": 0.3398, + "step": 20245 + }, + { + "epoch": 25.989730423620024, + "grad_norm": 2.7319042682647705, + "learning_rate": 2.467351305091998e-05, + "loss": 0.3443, + "step": 20246 + }, + { + "epoch": 25.991014120667522, + "grad_norm": 13.816896438598633, + "learning_rate": 2.4673085151904153e-05, + "loss": 0.3979, + "step": 20247 + }, + { + "epoch": 25.99229781771502, + "grad_norm": 2.7556395530700684, + "learning_rate": 2.4672657252888318e-05, + "loss": 0.3772, + "step": 20248 + }, + { + "epoch": 25.993581514762518, + "grad_norm": 5.473087310791016, + "learning_rate": 2.4672229353872486e-05, + "loss": 0.4098, + "step": 20249 + }, + { + "epoch": 25.994865211810012, + "grad_norm": 3.506030559539795, + "learning_rate": 2.4671801454856655e-05, + "loss": 0.3725, + "step": 20250 + }, + { + "epoch": 25.99614890885751, + "grad_norm": 2.8598368167877197, + "learning_rate": 2.4671373555840823e-05, + "loss": 0.4446, + "step": 20251 + }, + { + "epoch": 25.997432605905008, + "grad_norm": 2.1380488872528076, + "learning_rate": 2.467094565682499e-05, + "loss": 0.3887, + "step": 20252 + }, + { + "epoch": 25.998716302952502, + "grad_norm": 2.0023252964019775, + "learning_rate": 2.4670517757809156e-05, + "loss": 0.4635, + "step": 20253 + }, + { + "epoch": 26.0, + "grad_norm": 2.0982961654663086, + "learning_rate": 2.4670089858793325e-05, + "loss": 0.4935, + "step": 20254 + }, + { + "epoch": 26.001283697047498, + "grad_norm": 1.2618008852005005, + "learning_rate": 2.4669661959777493e-05, + "loss": 0.3264, + "step": 20255 + }, + { + "epoch": 26.002567394094992, + "grad_norm": 1.1189918518066406, + "learning_rate": 2.4669234060761662e-05, + "loss": 0.3358, + "step": 20256 + }, + { + "epoch": 26.00385109114249, + "grad_norm": 0.7842966318130493, + "learning_rate": 2.4668806161745827e-05, + "loss": 0.3159, + "step": 20257 + }, + { + "epoch": 26.005134788189988, + "grad_norm": 1.1281442642211914, + "learning_rate": 2.466837826273e-05, + 
"loss": 0.3206, + "step": 20258 + }, + { + "epoch": 26.006418485237482, + "grad_norm": 1.2803254127502441, + "learning_rate": 2.4667950363714164e-05, + "loss": 0.3264, + "step": 20259 + }, + { + "epoch": 26.00770218228498, + "grad_norm": 1.0624079704284668, + "learning_rate": 2.4667522464698332e-05, + "loss": 0.3, + "step": 20260 + }, + { + "epoch": 26.008985879332478, + "grad_norm": 1.4106693267822266, + "learning_rate": 2.46670945656825e-05, + "loss": 0.3032, + "step": 20261 + }, + { + "epoch": 26.010269576379976, + "grad_norm": 1.0002962350845337, + "learning_rate": 2.4666666666666665e-05, + "loss": 0.365, + "step": 20262 + }, + { + "epoch": 26.01155327342747, + "grad_norm": 0.8958387970924377, + "learning_rate": 2.4666238767650837e-05, + "loss": 0.32, + "step": 20263 + }, + { + "epoch": 26.012836970474968, + "grad_norm": 1.0949734449386597, + "learning_rate": 2.4665810868635002e-05, + "loss": 0.3491, + "step": 20264 + }, + { + "epoch": 26.014120667522466, + "grad_norm": 1.0828649997711182, + "learning_rate": 2.466538296961917e-05, + "loss": 0.3484, + "step": 20265 + }, + { + "epoch": 26.01540436456996, + "grad_norm": 0.9265836477279663, + "learning_rate": 2.466495507060334e-05, + "loss": 0.3387, + "step": 20266 + }, + { + "epoch": 26.016688061617458, + "grad_norm": 0.9540228247642517, + "learning_rate": 2.4664527171587504e-05, + "loss": 0.3089, + "step": 20267 + }, + { + "epoch": 26.017971758664956, + "grad_norm": 2.051003932952881, + "learning_rate": 2.4664099272571676e-05, + "loss": 0.3489, + "step": 20268 + }, + { + "epoch": 26.01925545571245, + "grad_norm": 2.761343002319336, + "learning_rate": 2.466367137355584e-05, + "loss": 0.3228, + "step": 20269 + }, + { + "epoch": 26.020539152759948, + "grad_norm": 0.9420179724693298, + "learning_rate": 2.466324347454001e-05, + "loss": 0.3132, + "step": 20270 + }, + { + "epoch": 26.021822849807446, + "grad_norm": 1.5213744640350342, + "learning_rate": 2.4662815575524178e-05, + "loss": 0.3017, + "step": 20271 + }, + { 
+ "epoch": 26.023106546854944, + "grad_norm": 0.7602954506874084, + "learning_rate": 2.4662387676508346e-05, + "loss": 0.3061, + "step": 20272 + }, + { + "epoch": 26.024390243902438, + "grad_norm": 1.5685738325119019, + "learning_rate": 2.466195977749251e-05, + "loss": 0.3267, + "step": 20273 + }, + { + "epoch": 26.025673940949936, + "grad_norm": 1.577979564666748, + "learning_rate": 2.466153187847668e-05, + "loss": 0.3534, + "step": 20274 + }, + { + "epoch": 26.026957637997434, + "grad_norm": 1.7931896448135376, + "learning_rate": 2.4661103979460848e-05, + "loss": 0.3296, + "step": 20275 + }, + { + "epoch": 26.028241335044928, + "grad_norm": 1.4474291801452637, + "learning_rate": 2.4660676080445016e-05, + "loss": 0.3131, + "step": 20276 + }, + { + "epoch": 26.029525032092426, + "grad_norm": 2.849170446395874, + "learning_rate": 2.4660248181429185e-05, + "loss": 0.3101, + "step": 20277 + }, + { + "epoch": 26.030808729139924, + "grad_norm": 0.8957362771034241, + "learning_rate": 2.465982028241335e-05, + "loss": 0.2885, + "step": 20278 + }, + { + "epoch": 26.03209242618742, + "grad_norm": 1.0659937858581543, + "learning_rate": 2.4659392383397518e-05, + "loss": 0.3012, + "step": 20279 + }, + { + "epoch": 26.033376123234916, + "grad_norm": 1.536697506904602, + "learning_rate": 2.4658964484381687e-05, + "loss": 0.3549, + "step": 20280 + }, + { + "epoch": 26.034659820282414, + "grad_norm": 1.0576688051223755, + "learning_rate": 2.465853658536585e-05, + "loss": 0.3348, + "step": 20281 + }, + { + "epoch": 26.035943517329912, + "grad_norm": 1.4396244287490845, + "learning_rate": 2.4658108686350023e-05, + "loss": 0.3097, + "step": 20282 + }, + { + "epoch": 26.037227214377406, + "grad_norm": 0.8746925592422485, + "learning_rate": 2.465768078733419e-05, + "loss": 0.2958, + "step": 20283 + }, + { + "epoch": 26.038510911424904, + "grad_norm": 1.525389552116394, + "learning_rate": 2.465725288831836e-05, + "loss": 0.3252, + "step": 20284 + }, + { + "epoch": 26.039794608472402, + 
"grad_norm": 0.9700495600700378, + "learning_rate": 2.4656824989302525e-05, + "loss": 0.354, + "step": 20285 + }, + { + "epoch": 26.041078305519896, + "grad_norm": 1.2893892526626587, + "learning_rate": 2.465639709028669e-05, + "loss": 0.3225, + "step": 20286 + }, + { + "epoch": 26.042362002567394, + "grad_norm": 1.050583004951477, + "learning_rate": 2.4655969191270862e-05, + "loss": 0.3617, + "step": 20287 + }, + { + "epoch": 26.043645699614892, + "grad_norm": 1.1763355731964111, + "learning_rate": 2.4655541292255027e-05, + "loss": 0.3399, + "step": 20288 + }, + { + "epoch": 26.044929396662386, + "grad_norm": 1.3361639976501465, + "learning_rate": 2.4655113393239196e-05, + "loss": 0.3356, + "step": 20289 + }, + { + "epoch": 26.046213093709884, + "grad_norm": 1.7797234058380127, + "learning_rate": 2.4654685494223364e-05, + "loss": 0.3421, + "step": 20290 + }, + { + "epoch": 26.047496790757382, + "grad_norm": 1.1903696060180664, + "learning_rate": 2.4654257595207532e-05, + "loss": 0.3091, + "step": 20291 + }, + { + "epoch": 26.048780487804876, + "grad_norm": 1.6767653226852417, + "learning_rate": 2.46538296961917e-05, + "loss": 0.3374, + "step": 20292 + }, + { + "epoch": 26.050064184852374, + "grad_norm": 2.460207939147949, + "learning_rate": 2.4653401797175866e-05, + "loss": 0.3551, + "step": 20293 + }, + { + "epoch": 26.051347881899872, + "grad_norm": 4.504824638366699, + "learning_rate": 2.4652973898160034e-05, + "loss": 0.3198, + "step": 20294 + }, + { + "epoch": 26.05263157894737, + "grad_norm": 1.8170413970947266, + "learning_rate": 2.4652545999144203e-05, + "loss": 0.3406, + "step": 20295 + }, + { + "epoch": 26.053915275994864, + "grad_norm": 2.867604970932007, + "learning_rate": 2.465211810012837e-05, + "loss": 0.3762, + "step": 20296 + }, + { + "epoch": 26.055198973042362, + "grad_norm": 1.4227502346038818, + "learning_rate": 2.4651690201112536e-05, + "loss": 0.3665, + "step": 20297 + }, + { + "epoch": 26.05648267008986, + "grad_norm": 4.42331600189209, + 
"learning_rate": 2.4651262302096708e-05, + "loss": 0.3904, + "step": 20298 + }, + { + "epoch": 26.057766367137354, + "grad_norm": 2.30592942237854, + "learning_rate": 2.4650834403080873e-05, + "loss": 0.3432, + "step": 20299 + }, + { + "epoch": 26.059050064184852, + "grad_norm": 1.93052077293396, + "learning_rate": 2.465040650406504e-05, + "loss": 0.3861, + "step": 20300 + }, + { + "epoch": 26.06033376123235, + "grad_norm": 3.2183375358581543, + "learning_rate": 2.464997860504921e-05, + "loss": 0.3794, + "step": 20301 + }, + { + "epoch": 26.061617458279844, + "grad_norm": 2.3084347248077393, + "learning_rate": 2.4649550706033375e-05, + "loss": 0.4028, + "step": 20302 + }, + { + "epoch": 26.062901155327342, + "grad_norm": 1.3283072710037231, + "learning_rate": 2.4649122807017547e-05, + "loss": 0.4254, + "step": 20303 + }, + { + "epoch": 26.06418485237484, + "grad_norm": 3.2664318084716797, + "learning_rate": 2.464869490800171e-05, + "loss": 0.5205, + "step": 20304 + }, + { + "epoch": 26.065468549422338, + "grad_norm": 0.9206898212432861, + "learning_rate": 2.464826700898588e-05, + "loss": 0.302, + "step": 20305 + }, + { + "epoch": 26.066752246469832, + "grad_norm": 0.913048505783081, + "learning_rate": 2.464783910997005e-05, + "loss": 0.3156, + "step": 20306 + }, + { + "epoch": 26.06803594351733, + "grad_norm": 1.7168322801589966, + "learning_rate": 2.4647411210954213e-05, + "loss": 0.3036, + "step": 20307 + }, + { + "epoch": 26.069319640564828, + "grad_norm": 1.3931418657302856, + "learning_rate": 2.4646983311938385e-05, + "loss": 0.345, + "step": 20308 + }, + { + "epoch": 26.070603337612322, + "grad_norm": 1.3254045248031616, + "learning_rate": 2.464655541292255e-05, + "loss": 0.2974, + "step": 20309 + }, + { + "epoch": 26.07188703465982, + "grad_norm": 0.878723680973053, + "learning_rate": 2.464612751390672e-05, + "loss": 0.313, + "step": 20310 + }, + { + "epoch": 26.073170731707318, + "grad_norm": 0.901147723197937, + "learning_rate": 2.4645699614890887e-05, + 
"loss": 0.3136, + "step": 20311 + }, + { + "epoch": 26.074454428754812, + "grad_norm": 0.9284752011299133, + "learning_rate": 2.4645271715875056e-05, + "loss": 0.3164, + "step": 20312 + }, + { + "epoch": 26.07573812580231, + "grad_norm": 1.0456087589263916, + "learning_rate": 2.464484381685922e-05, + "loss": 0.3525, + "step": 20313 + }, + { + "epoch": 26.077021822849808, + "grad_norm": 1.4584544897079468, + "learning_rate": 2.464441591784339e-05, + "loss": 0.34, + "step": 20314 + }, + { + "epoch": 26.078305519897306, + "grad_norm": 1.266237735748291, + "learning_rate": 2.4643988018827557e-05, + "loss": 0.3185, + "step": 20315 + }, + { + "epoch": 26.0795892169448, + "grad_norm": 1.1001968383789062, + "learning_rate": 2.4643560119811726e-05, + "loss": 0.308, + "step": 20316 + }, + { + "epoch": 26.080872913992298, + "grad_norm": 1.2332228422164917, + "learning_rate": 2.4643132220795894e-05, + "loss": 0.2891, + "step": 20317 + }, + { + "epoch": 26.082156611039796, + "grad_norm": 2.7895936965942383, + "learning_rate": 2.464270432178006e-05, + "loss": 0.3271, + "step": 20318 + }, + { + "epoch": 26.08344030808729, + "grad_norm": 2.3656556606292725, + "learning_rate": 2.464227642276423e-05, + "loss": 0.3589, + "step": 20319 + }, + { + "epoch": 26.084724005134788, + "grad_norm": 1.3228679895401, + "learning_rate": 2.4641848523748396e-05, + "loss": 0.3471, + "step": 20320 + }, + { + "epoch": 26.086007702182286, + "grad_norm": 1.2847394943237305, + "learning_rate": 2.464142062473256e-05, + "loss": 0.3192, + "step": 20321 + }, + { + "epoch": 26.08729139922978, + "grad_norm": 1.3516961336135864, + "learning_rate": 2.4640992725716733e-05, + "loss": 0.3177, + "step": 20322 + }, + { + "epoch": 26.088575096277278, + "grad_norm": 1.3378865718841553, + "learning_rate": 2.4640564826700898e-05, + "loss": 0.3011, + "step": 20323 + }, + { + "epoch": 26.089858793324776, + "grad_norm": 1.006395697593689, + "learning_rate": 2.464013692768507e-05, + "loss": 0.2975, + "step": 20324 + }, + { + 
"epoch": 26.09114249037227, + "grad_norm": 1.4036206007003784, + "learning_rate": 2.4639709028669235e-05, + "loss": 0.2823, + "step": 20325 + }, + { + "epoch": 26.09242618741977, + "grad_norm": 1.5506377220153809, + "learning_rate": 2.4639281129653403e-05, + "loss": 0.3175, + "step": 20326 + }, + { + "epoch": 26.093709884467266, + "grad_norm": 1.504970908164978, + "learning_rate": 2.463885323063757e-05, + "loss": 0.3209, + "step": 20327 + }, + { + "epoch": 26.094993581514764, + "grad_norm": 1.3646314144134521, + "learning_rate": 2.4638425331621737e-05, + "loss": 0.3185, + "step": 20328 + }, + { + "epoch": 26.09627727856226, + "grad_norm": 2.254749059677124, + "learning_rate": 2.4637997432605905e-05, + "loss": 0.3118, + "step": 20329 + }, + { + "epoch": 26.097560975609756, + "grad_norm": 0.9422313570976257, + "learning_rate": 2.4637569533590073e-05, + "loss": 0.3416, + "step": 20330 + }, + { + "epoch": 26.098844672657254, + "grad_norm": 0.8555966019630432, + "learning_rate": 2.4637141634574242e-05, + "loss": 0.366, + "step": 20331 + }, + { + "epoch": 26.10012836970475, + "grad_norm": 0.9821583032608032, + "learning_rate": 2.463671373555841e-05, + "loss": 0.3403, + "step": 20332 + }, + { + "epoch": 26.101412066752246, + "grad_norm": 1.4270461797714233, + "learning_rate": 2.463628583654258e-05, + "loss": 0.3357, + "step": 20333 + }, + { + "epoch": 26.102695763799744, + "grad_norm": 0.9827216267585754, + "learning_rate": 2.4635857937526744e-05, + "loss": 0.3235, + "step": 20334 + }, + { + "epoch": 26.10397946084724, + "grad_norm": 2.693164110183716, + "learning_rate": 2.4635430038510912e-05, + "loss": 0.3323, + "step": 20335 + }, + { + "epoch": 26.105263157894736, + "grad_norm": 1.215085744857788, + "learning_rate": 2.463500213949508e-05, + "loss": 0.3255, + "step": 20336 + }, + { + "epoch": 26.106546854942234, + "grad_norm": 1.4756065607070923, + "learning_rate": 2.4634574240479245e-05, + "loss": 0.404, + "step": 20337 + }, + { + "epoch": 26.107830551989732, + 
"grad_norm": 1.094771385192871, + "learning_rate": 2.4634146341463417e-05, + "loss": 0.3238, + "step": 20338 + }, + { + "epoch": 26.109114249037226, + "grad_norm": 1.6697261333465576, + "learning_rate": 2.4633718442447582e-05, + "loss": 0.3614, + "step": 20339 + }, + { + "epoch": 26.110397946084724, + "grad_norm": 1.4514501094818115, + "learning_rate": 2.463329054343175e-05, + "loss": 0.3396, + "step": 20340 + }, + { + "epoch": 26.111681643132222, + "grad_norm": 1.6104025840759277, + "learning_rate": 2.463286264441592e-05, + "loss": 0.3454, + "step": 20341 + }, + { + "epoch": 26.112965340179716, + "grad_norm": 1.3274213075637817, + "learning_rate": 2.4632434745400084e-05, + "loss": 0.3297, + "step": 20342 + }, + { + "epoch": 26.114249037227214, + "grad_norm": 3.2285962104797363, + "learning_rate": 2.4632006846384256e-05, + "loss": 0.3711, + "step": 20343 + }, + { + "epoch": 26.115532734274712, + "grad_norm": 1.5793393850326538, + "learning_rate": 2.463157894736842e-05, + "loss": 0.3308, + "step": 20344 + }, + { + "epoch": 26.116816431322206, + "grad_norm": 1.6779468059539795, + "learning_rate": 2.463115104835259e-05, + "loss": 0.3557, + "step": 20345 + }, + { + "epoch": 26.118100128369704, + "grad_norm": 4.158197402954102, + "learning_rate": 2.4630723149336758e-05, + "loss": 0.3665, + "step": 20346 + }, + { + "epoch": 26.119383825417202, + "grad_norm": 1.7064523696899414, + "learning_rate": 2.4630295250320923e-05, + "loss": 0.3271, + "step": 20347 + }, + { + "epoch": 26.1206675224647, + "grad_norm": 1.1132166385650635, + "learning_rate": 2.4629867351305095e-05, + "loss": 0.3762, + "step": 20348 + }, + { + "epoch": 26.121951219512194, + "grad_norm": 2.4776103496551514, + "learning_rate": 2.462943945228926e-05, + "loss": 0.3852, + "step": 20349 + }, + { + "epoch": 26.123234916559692, + "grad_norm": 2.6240479946136475, + "learning_rate": 2.4629011553273428e-05, + "loss": 0.3726, + "step": 20350 + }, + { + "epoch": 26.12451861360719, + "grad_norm": 2.231008291244507, + 
"learning_rate": 2.4628583654257596e-05, + "loss": 0.4232, + "step": 20351 + }, + { + "epoch": 26.125802310654684, + "grad_norm": 3.6770548820495605, + "learning_rate": 2.4628155755241765e-05, + "loss": 0.3926, + "step": 20352 + }, + { + "epoch": 26.127086007702182, + "grad_norm": 5.23264217376709, + "learning_rate": 2.462772785622593e-05, + "loss": 0.4413, + "step": 20353 + }, + { + "epoch": 26.12836970474968, + "grad_norm": 70.4671401977539, + "learning_rate": 2.4627299957210098e-05, + "loss": 0.5224, + "step": 20354 + }, + { + "epoch": 26.129653401797174, + "grad_norm": 1.5386396646499634, + "learning_rate": 2.4626872058194267e-05, + "loss": 0.3229, + "step": 20355 + }, + { + "epoch": 26.130937098844672, + "grad_norm": 1.063199758529663, + "learning_rate": 2.4626444159178435e-05, + "loss": 0.308, + "step": 20356 + }, + { + "epoch": 26.13222079589217, + "grad_norm": 1.2207690477371216, + "learning_rate": 2.4626016260162604e-05, + "loss": 0.3252, + "step": 20357 + }, + { + "epoch": 26.133504492939664, + "grad_norm": 2.2071335315704346, + "learning_rate": 2.462558836114677e-05, + "loss": 0.3242, + "step": 20358 + }, + { + "epoch": 26.134788189987162, + "grad_norm": 1.556933879852295, + "learning_rate": 2.462516046213094e-05, + "loss": 0.3167, + "step": 20359 + }, + { + "epoch": 26.13607188703466, + "grad_norm": 2.4270966053009033, + "learning_rate": 2.4624732563115105e-05, + "loss": 0.2916, + "step": 20360 + }, + { + "epoch": 26.137355584082158, + "grad_norm": 1.323380470275879, + "learning_rate": 2.462430466409927e-05, + "loss": 0.3566, + "step": 20361 + }, + { + "epoch": 26.138639281129652, + "grad_norm": 2.0109050273895264, + "learning_rate": 2.4623876765083442e-05, + "loss": 0.3094, + "step": 20362 + }, + { + "epoch": 26.13992297817715, + "grad_norm": 1.137552261352539, + "learning_rate": 2.4623448866067607e-05, + "loss": 0.3224, + "step": 20363 + }, + { + "epoch": 26.141206675224648, + "grad_norm": 1.4081555604934692, + "learning_rate": 2.4623020967051776e-05, 
+ "loss": 0.3217, + "step": 20364 + }, + { + "epoch": 26.142490372272142, + "grad_norm": 2.623530387878418, + "learning_rate": 2.4622593068035944e-05, + "loss": 0.2989, + "step": 20365 + }, + { + "epoch": 26.14377406931964, + "grad_norm": 0.8676619529724121, + "learning_rate": 2.4622165169020112e-05, + "loss": 0.3185, + "step": 20366 + }, + { + "epoch": 26.145057766367138, + "grad_norm": 0.9961982369422913, + "learning_rate": 2.462173727000428e-05, + "loss": 0.3424, + "step": 20367 + }, + { + "epoch": 26.146341463414632, + "grad_norm": 1.501783847808838, + "learning_rate": 2.4621309370988446e-05, + "loss": 0.3207, + "step": 20368 + }, + { + "epoch": 26.14762516046213, + "grad_norm": 1.289709448814392, + "learning_rate": 2.4620881471972614e-05, + "loss": 0.3249, + "step": 20369 + }, + { + "epoch": 26.14890885750963, + "grad_norm": 1.1183319091796875, + "learning_rate": 2.4620453572956783e-05, + "loss": 0.3422, + "step": 20370 + }, + { + "epoch": 26.150192554557126, + "grad_norm": 0.9498004913330078, + "learning_rate": 2.462002567394095e-05, + "loss": 0.3276, + "step": 20371 + }, + { + "epoch": 26.15147625160462, + "grad_norm": 2.5201241970062256, + "learning_rate": 2.4619597774925116e-05, + "loss": 0.3079, + "step": 20372 + }, + { + "epoch": 26.15275994865212, + "grad_norm": 4.198441028594971, + "learning_rate": 2.4619169875909288e-05, + "loss": 0.3388, + "step": 20373 + }, + { + "epoch": 26.154043645699616, + "grad_norm": 1.961341142654419, + "learning_rate": 2.4618741976893453e-05, + "loss": 0.3214, + "step": 20374 + }, + { + "epoch": 26.15532734274711, + "grad_norm": 0.7368990182876587, + "learning_rate": 2.461831407787762e-05, + "loss": 0.3293, + "step": 20375 + }, + { + "epoch": 26.15661103979461, + "grad_norm": 1.406110167503357, + "learning_rate": 2.461788617886179e-05, + "loss": 0.3109, + "step": 20376 + }, + { + "epoch": 26.157894736842106, + "grad_norm": 1.0216621160507202, + "learning_rate": 2.4617458279845955e-05, + "loss": 0.3648, + "step": 20377 + }, + 
{ + "epoch": 26.1591784338896, + "grad_norm": 1.19184410572052, + "learning_rate": 2.4617030380830127e-05, + "loss": 0.3026, + "step": 20378 + }, + { + "epoch": 26.1604621309371, + "grad_norm": 2.0646166801452637, + "learning_rate": 2.461660248181429e-05, + "loss": 0.3391, + "step": 20379 + }, + { + "epoch": 26.161745827984596, + "grad_norm": 1.0060876607894897, + "learning_rate": 2.461617458279846e-05, + "loss": 0.327, + "step": 20380 + }, + { + "epoch": 26.163029525032094, + "grad_norm": 1.2724003791809082, + "learning_rate": 2.461574668378263e-05, + "loss": 0.2817, + "step": 20381 + }, + { + "epoch": 26.16431322207959, + "grad_norm": 1.176300048828125, + "learning_rate": 2.4615318784766794e-05, + "loss": 0.3224, + "step": 20382 + }, + { + "epoch": 26.165596919127086, + "grad_norm": 0.957848846912384, + "learning_rate": 2.4614890885750965e-05, + "loss": 0.3216, + "step": 20383 + }, + { + "epoch": 26.166880616174584, + "grad_norm": 0.8987277150154114, + "learning_rate": 2.461446298673513e-05, + "loss": 0.3074, + "step": 20384 + }, + { + "epoch": 26.16816431322208, + "grad_norm": 1.226920247077942, + "learning_rate": 2.46140350877193e-05, + "loss": 0.3196, + "step": 20385 + }, + { + "epoch": 26.169448010269576, + "grad_norm": 1.7528092861175537, + "learning_rate": 2.4613607188703467e-05, + "loss": 0.317, + "step": 20386 + }, + { + "epoch": 26.170731707317074, + "grad_norm": 1.5418336391448975, + "learning_rate": 2.4613179289687636e-05, + "loss": 0.3736, + "step": 20387 + }, + { + "epoch": 26.17201540436457, + "grad_norm": 13.325982093811035, + "learning_rate": 2.46127513906718e-05, + "loss": 0.3228, + "step": 20388 + }, + { + "epoch": 26.173299101412066, + "grad_norm": 0.9358304738998413, + "learning_rate": 2.461232349165597e-05, + "loss": 0.3491, + "step": 20389 + }, + { + "epoch": 26.174582798459564, + "grad_norm": 1.6145291328430176, + "learning_rate": 2.4611895592640137e-05, + "loss": 0.3251, + "step": 20390 + }, + { + "epoch": 26.17586649550706, + "grad_norm": 
1.6549742221832275, + "learning_rate": 2.4611467693624306e-05, + "loss": 0.35, + "step": 20391 + }, + { + "epoch": 26.177150192554556, + "grad_norm": 1.1082837581634521, + "learning_rate": 2.4611039794608474e-05, + "loss": 0.3194, + "step": 20392 + }, + { + "epoch": 26.178433889602054, + "grad_norm": 1.2105515003204346, + "learning_rate": 2.461061189559264e-05, + "loss": 0.3259, + "step": 20393 + }, + { + "epoch": 26.179717586649552, + "grad_norm": 1.4839152097702026, + "learning_rate": 2.461018399657681e-05, + "loss": 0.3544, + "step": 20394 + }, + { + "epoch": 26.181001283697046, + "grad_norm": 1.4167171716690063, + "learning_rate": 2.4609756097560976e-05, + "loss": 0.3506, + "step": 20395 + }, + { + "epoch": 26.182284980744544, + "grad_norm": 1.6443288326263428, + "learning_rate": 2.460932819854514e-05, + "loss": 0.3698, + "step": 20396 + }, + { + "epoch": 26.183568677792042, + "grad_norm": 1.9289344549179077, + "learning_rate": 2.4608900299529313e-05, + "loss": 0.3472, + "step": 20397 + }, + { + "epoch": 26.184852374839537, + "grad_norm": 4.852816104888916, + "learning_rate": 2.4608472400513478e-05, + "loss": 0.3278, + "step": 20398 + }, + { + "epoch": 26.186136071887034, + "grad_norm": 1.3924307823181152, + "learning_rate": 2.460804450149765e-05, + "loss": 0.3906, + "step": 20399 + }, + { + "epoch": 26.187419768934532, + "grad_norm": 1.4383219480514526, + "learning_rate": 2.4607616602481815e-05, + "loss": 0.3907, + "step": 20400 + }, + { + "epoch": 26.188703465982027, + "grad_norm": 1.5265799760818481, + "learning_rate": 2.4607188703465983e-05, + "loss": 0.424, + "step": 20401 + }, + { + "epoch": 26.189987163029524, + "grad_norm": 1.1810061931610107, + "learning_rate": 2.460676080445015e-05, + "loss": 0.392, + "step": 20402 + }, + { + "epoch": 26.191270860077022, + "grad_norm": 5.656484127044678, + "learning_rate": 2.4606332905434317e-05, + "loss": 0.4366, + "step": 20403 + }, + { + "epoch": 26.19255455712452, + "grad_norm": 5.097414970397949, + 
"learning_rate": 2.4605905006418485e-05, + "loss": 0.631, + "step": 20404 + }, + { + "epoch": 26.193838254172015, + "grad_norm": 1.0157634019851685, + "learning_rate": 2.4605477107402653e-05, + "loss": 0.2904, + "step": 20405 + }, + { + "epoch": 26.195121951219512, + "grad_norm": 1.7035819292068481, + "learning_rate": 2.4605049208386822e-05, + "loss": 0.3024, + "step": 20406 + }, + { + "epoch": 26.19640564826701, + "grad_norm": 0.7560495734214783, + "learning_rate": 2.460462130937099e-05, + "loss": 0.3039, + "step": 20407 + }, + { + "epoch": 26.197689345314505, + "grad_norm": 0.8508793115615845, + "learning_rate": 2.4604193410355155e-05, + "loss": 0.3282, + "step": 20408 + }, + { + "epoch": 26.198973042362002, + "grad_norm": 0.9053365588188171, + "learning_rate": 2.4603765511339324e-05, + "loss": 0.3338, + "step": 20409 + }, + { + "epoch": 26.2002567394095, + "grad_norm": 1.0167876482009888, + "learning_rate": 2.4603337612323492e-05, + "loss": 0.3367, + "step": 20410 + }, + { + "epoch": 26.201540436456995, + "grad_norm": 0.7421622276306152, + "learning_rate": 2.460290971330766e-05, + "loss": 0.2822, + "step": 20411 + }, + { + "epoch": 26.202824133504492, + "grad_norm": 1.1582947969436646, + "learning_rate": 2.4602481814291826e-05, + "loss": 0.3219, + "step": 20412 + }, + { + "epoch": 26.20410783055199, + "grad_norm": 1.3098257780075073, + "learning_rate": 2.4602053915275997e-05, + "loss": 0.3145, + "step": 20413 + }, + { + "epoch": 26.205391527599488, + "grad_norm": 0.9133513569831848, + "learning_rate": 2.4601626016260162e-05, + "loss": 0.344, + "step": 20414 + }, + { + "epoch": 26.206675224646983, + "grad_norm": 0.9661781191825867, + "learning_rate": 2.460119811724433e-05, + "loss": 0.2979, + "step": 20415 + }, + { + "epoch": 26.20795892169448, + "grad_norm": 2.516923189163208, + "learning_rate": 2.46007702182285e-05, + "loss": 0.3345, + "step": 20416 + }, + { + "epoch": 26.20924261874198, + "grad_norm": 1.2762328386306763, + "learning_rate": 
2.4600342319212664e-05, + "loss": 0.3213, + "step": 20417 + }, + { + "epoch": 26.210526315789473, + "grad_norm": 1.8269621133804321, + "learning_rate": 2.4599914420196836e-05, + "loss": 0.3317, + "step": 20418 + }, + { + "epoch": 26.21181001283697, + "grad_norm": 1.0581777095794678, + "learning_rate": 2.4599486521181e-05, + "loss": 0.37, + "step": 20419 + }, + { + "epoch": 26.21309370988447, + "grad_norm": 2.3227691650390625, + "learning_rate": 2.459905862216517e-05, + "loss": 0.329, + "step": 20420 + }, + { + "epoch": 26.214377406931963, + "grad_norm": 2.029111862182617, + "learning_rate": 2.4598630723149338e-05, + "loss": 0.3061, + "step": 20421 + }, + { + "epoch": 26.21566110397946, + "grad_norm": 0.9357410073280334, + "learning_rate": 2.4598202824133503e-05, + "loss": 0.3329, + "step": 20422 + }, + { + "epoch": 26.21694480102696, + "grad_norm": 1.5840409994125366, + "learning_rate": 2.4597774925117675e-05, + "loss": 0.3552, + "step": 20423 + }, + { + "epoch": 26.218228498074453, + "grad_norm": 1.9711940288543701, + "learning_rate": 2.459734702610184e-05, + "loss": 0.3244, + "step": 20424 + }, + { + "epoch": 26.21951219512195, + "grad_norm": 0.998890221118927, + "learning_rate": 2.4596919127086008e-05, + "loss": 0.3051, + "step": 20425 + }, + { + "epoch": 26.22079589216945, + "grad_norm": 1.7972491979599, + "learning_rate": 2.4596491228070177e-05, + "loss": 0.3297, + "step": 20426 + }, + { + "epoch": 26.222079589216946, + "grad_norm": 2.3512163162231445, + "learning_rate": 2.4596063329054345e-05, + "loss": 0.2967, + "step": 20427 + }, + { + "epoch": 26.22336328626444, + "grad_norm": 1.2783933877944946, + "learning_rate": 2.459563543003851e-05, + "loss": 0.3254, + "step": 20428 + }, + { + "epoch": 26.22464698331194, + "grad_norm": 2.906883955001831, + "learning_rate": 2.459520753102268e-05, + "loss": 0.3449, + "step": 20429 + }, + { + "epoch": 26.225930680359436, + "grad_norm": 0.9011329412460327, + "learning_rate": 2.4594779632006847e-05, + "loss": 0.3129, + 
"step": 20430 + }, + { + "epoch": 26.22721437740693, + "grad_norm": 0.8732451796531677, + "learning_rate": 2.4594351732991015e-05, + "loss": 0.3382, + "step": 20431 + }, + { + "epoch": 26.22849807445443, + "grad_norm": 1.7300328016281128, + "learning_rate": 2.4593923833975184e-05, + "loss": 0.3257, + "step": 20432 + }, + { + "epoch": 26.229781771501926, + "grad_norm": 1.2263890504837036, + "learning_rate": 2.459349593495935e-05, + "loss": 0.3469, + "step": 20433 + }, + { + "epoch": 26.23106546854942, + "grad_norm": 1.751810073852539, + "learning_rate": 2.459306803594352e-05, + "loss": 0.3452, + "step": 20434 + }, + { + "epoch": 26.23234916559692, + "grad_norm": 1.090460181236267, + "learning_rate": 2.4592640136927685e-05, + "loss": 0.341, + "step": 20435 + }, + { + "epoch": 26.233632862644416, + "grad_norm": 2.018483877182007, + "learning_rate": 2.459221223791185e-05, + "loss": 0.3328, + "step": 20436 + }, + { + "epoch": 26.234916559691914, + "grad_norm": 1.0211843252182007, + "learning_rate": 2.4591784338896022e-05, + "loss": 0.3488, + "step": 20437 + }, + { + "epoch": 26.23620025673941, + "grad_norm": 2.5419442653656006, + "learning_rate": 2.4591356439880187e-05, + "loss": 0.3242, + "step": 20438 + }, + { + "epoch": 26.237483953786906, + "grad_norm": 1.256389856338501, + "learning_rate": 2.459092854086436e-05, + "loss": 0.3381, + "step": 20439 + }, + { + "epoch": 26.238767650834404, + "grad_norm": 1.9535013437271118, + "learning_rate": 2.4590500641848524e-05, + "loss": 0.3321, + "step": 20440 + }, + { + "epoch": 26.2400513478819, + "grad_norm": 1.2088125944137573, + "learning_rate": 2.4590072742832693e-05, + "loss": 0.3423, + "step": 20441 + }, + { + "epoch": 26.241335044929397, + "grad_norm": 1.0713081359863281, + "learning_rate": 2.458964484381686e-05, + "loss": 0.3304, + "step": 20442 + }, + { + "epoch": 26.242618741976894, + "grad_norm": 1.3313881158828735, + "learning_rate": 2.4589216944801026e-05, + "loss": 0.3541, + "step": 20443 + }, + { + "epoch": 
26.24390243902439, + "grad_norm": 2.4981133937835693, + "learning_rate": 2.4588789045785194e-05, + "loss": 0.3807, + "step": 20444 + }, + { + "epoch": 26.245186136071887, + "grad_norm": 1.565841555595398, + "learning_rate": 2.4588361146769363e-05, + "loss": 0.3712, + "step": 20445 + }, + { + "epoch": 26.246469833119384, + "grad_norm": 1.406556248664856, + "learning_rate": 2.458793324775353e-05, + "loss": 0.4005, + "step": 20446 + }, + { + "epoch": 26.247753530166882, + "grad_norm": 0.9863939881324768, + "learning_rate": 2.45875053487377e-05, + "loss": 0.3931, + "step": 20447 + }, + { + "epoch": 26.249037227214377, + "grad_norm": 2.9379796981811523, + "learning_rate": 2.4587077449721868e-05, + "loss": 0.334, + "step": 20448 + }, + { + "epoch": 26.250320924261874, + "grad_norm": 3.076519012451172, + "learning_rate": 2.4586649550706033e-05, + "loss": 0.3837, + "step": 20449 + }, + { + "epoch": 26.251604621309372, + "grad_norm": 1.4167402982711792, + "learning_rate": 2.45862216516902e-05, + "loss": 0.347, + "step": 20450 + }, + { + "epoch": 26.252888318356867, + "grad_norm": 4.5648956298828125, + "learning_rate": 2.458579375267437e-05, + "loss": 0.3739, + "step": 20451 + }, + { + "epoch": 26.254172015404365, + "grad_norm": 2.5313644409179688, + "learning_rate": 2.4585365853658535e-05, + "loss": 0.4236, + "step": 20452 + }, + { + "epoch": 26.255455712451862, + "grad_norm": 1.5091300010681152, + "learning_rate": 2.4584937954642707e-05, + "loss": 0.3943, + "step": 20453 + }, + { + "epoch": 26.256739409499357, + "grad_norm": 1.6499663591384888, + "learning_rate": 2.4584510055626872e-05, + "loss": 0.4715, + "step": 20454 + }, + { + "epoch": 26.258023106546855, + "grad_norm": 1.865749478340149, + "learning_rate": 2.4584082156611044e-05, + "loss": 0.3154, + "step": 20455 + }, + { + "epoch": 26.259306803594352, + "grad_norm": 0.8623282313346863, + "learning_rate": 2.458365425759521e-05, + "loss": 0.3127, + "step": 20456 + }, + { + "epoch": 26.260590500641847, + "grad_norm": 
1.3187527656555176, + "learning_rate": 2.4583226358579374e-05, + "loss": 0.3422, + "step": 20457 + }, + { + "epoch": 26.261874197689345, + "grad_norm": 1.9377652406692505, + "learning_rate": 2.4582798459563545e-05, + "loss": 0.3394, + "step": 20458 + }, + { + "epoch": 26.263157894736842, + "grad_norm": 2.322432279586792, + "learning_rate": 2.458237056054771e-05, + "loss": 0.3044, + "step": 20459 + }, + { + "epoch": 26.26444159178434, + "grad_norm": 1.35433030128479, + "learning_rate": 2.458194266153188e-05, + "loss": 0.3213, + "step": 20460 + }, + { + "epoch": 26.265725288831835, + "grad_norm": 0.8801272511482239, + "learning_rate": 2.4581514762516047e-05, + "loss": 0.3089, + "step": 20461 + }, + { + "epoch": 26.267008985879333, + "grad_norm": 1.041076421737671, + "learning_rate": 2.4581086863500216e-05, + "loss": 0.3267, + "step": 20462 + }, + { + "epoch": 26.26829268292683, + "grad_norm": 0.8225737810134888, + "learning_rate": 2.4580658964484384e-05, + "loss": 0.3247, + "step": 20463 + }, + { + "epoch": 26.269576379974325, + "grad_norm": 1.425408124923706, + "learning_rate": 2.458023106546855e-05, + "loss": 0.3404, + "step": 20464 + }, + { + "epoch": 26.270860077021823, + "grad_norm": 1.870413899421692, + "learning_rate": 2.4579803166452717e-05, + "loss": 0.3357, + "step": 20465 + }, + { + "epoch": 26.27214377406932, + "grad_norm": 0.8217566609382629, + "learning_rate": 2.4579375267436886e-05, + "loss": 0.3131, + "step": 20466 + }, + { + "epoch": 26.273427471116815, + "grad_norm": 4.745279312133789, + "learning_rate": 2.4578947368421054e-05, + "loss": 0.3245, + "step": 20467 + }, + { + "epoch": 26.274711168164313, + "grad_norm": 1.0634201765060425, + "learning_rate": 2.457851946940522e-05, + "loss": 0.3531, + "step": 20468 + }, + { + "epoch": 26.27599486521181, + "grad_norm": 1.9392316341400146, + "learning_rate": 2.4578091570389388e-05, + "loss": 0.3079, + "step": 20469 + }, + { + "epoch": 26.27727856225931, + "grad_norm": 1.2835685014724731, + "learning_rate": 
2.4577663671373556e-05, + "loss": 0.3163, + "step": 20470 + }, + { + "epoch": 26.278562259306803, + "grad_norm": 0.9008775949478149, + "learning_rate": 2.4577235772357725e-05, + "loss": 0.3379, + "step": 20471 + }, + { + "epoch": 26.2798459563543, + "grad_norm": 1.4924736022949219, + "learning_rate": 2.4576807873341893e-05, + "loss": 0.3113, + "step": 20472 + }, + { + "epoch": 26.2811296534018, + "grad_norm": 1.0385764837265015, + "learning_rate": 2.4576379974326058e-05, + "loss": 0.3161, + "step": 20473 + }, + { + "epoch": 26.282413350449293, + "grad_norm": 1.2555514574050903, + "learning_rate": 2.457595207531023e-05, + "loss": 0.322, + "step": 20474 + }, + { + "epoch": 26.28369704749679, + "grad_norm": 3.791447162628174, + "learning_rate": 2.4575524176294395e-05, + "loss": 0.3243, + "step": 20475 + }, + { + "epoch": 26.28498074454429, + "grad_norm": 1.1283797025680542, + "learning_rate": 2.457509627727856e-05, + "loss": 0.3179, + "step": 20476 + }, + { + "epoch": 26.286264441591783, + "grad_norm": 2.1673409938812256, + "learning_rate": 2.457466837826273e-05, + "loss": 0.3462, + "step": 20477 + }, + { + "epoch": 26.28754813863928, + "grad_norm": 0.9845602512359619, + "learning_rate": 2.4574240479246897e-05, + "loss": 0.3191, + "step": 20478 + }, + { + "epoch": 26.28883183568678, + "grad_norm": 3.2637152671813965, + "learning_rate": 2.457381258023107e-05, + "loss": 0.3281, + "step": 20479 + }, + { + "epoch": 26.290115532734276, + "grad_norm": 0.9236567616462708, + "learning_rate": 2.4573384681215233e-05, + "loss": 0.3227, + "step": 20480 + }, + { + "epoch": 26.29139922978177, + "grad_norm": 1.253109097480774, + "learning_rate": 2.4572956782199402e-05, + "loss": 0.3263, + "step": 20481 + }, + { + "epoch": 26.29268292682927, + "grad_norm": 1.4712164402008057, + "learning_rate": 2.457252888318357e-05, + "loss": 0.3214, + "step": 20482 + }, + { + "epoch": 26.293966623876766, + "grad_norm": 1.0424691438674927, + "learning_rate": 2.4572100984167735e-05, + "loss": 0.2984, 
+ "step": 20483 + }, + { + "epoch": 26.29525032092426, + "grad_norm": 1.4838050603866577, + "learning_rate": 2.4571673085151904e-05, + "loss": 0.3504, + "step": 20484 + }, + { + "epoch": 26.29653401797176, + "grad_norm": 1.2734159231185913, + "learning_rate": 2.4571245186136072e-05, + "loss": 0.3148, + "step": 20485 + }, + { + "epoch": 26.297817715019256, + "grad_norm": 0.9966868758201599, + "learning_rate": 2.457081728712024e-05, + "loss": 0.343, + "step": 20486 + }, + { + "epoch": 26.29910141206675, + "grad_norm": 1.4858314990997314, + "learning_rate": 2.457038938810441e-05, + "loss": 0.3171, + "step": 20487 + }, + { + "epoch": 26.30038510911425, + "grad_norm": 1.023308277130127, + "learning_rate": 2.4569961489088577e-05, + "loss": 0.332, + "step": 20488 + }, + { + "epoch": 26.301668806161747, + "grad_norm": 1.6367416381835938, + "learning_rate": 2.4569533590072742e-05, + "loss": 0.3059, + "step": 20489 + }, + { + "epoch": 26.30295250320924, + "grad_norm": 1.5537409782409668, + "learning_rate": 2.456910569105691e-05, + "loss": 0.3492, + "step": 20490 + }, + { + "epoch": 26.30423620025674, + "grad_norm": 1.1359070539474487, + "learning_rate": 2.456867779204108e-05, + "loss": 0.3691, + "step": 20491 + }, + { + "epoch": 26.305519897304237, + "grad_norm": 1.6346495151519775, + "learning_rate": 2.4568249893025244e-05, + "loss": 0.3481, + "step": 20492 + }, + { + "epoch": 26.306803594351734, + "grad_norm": 1.4297364950180054, + "learning_rate": 2.4567821994009416e-05, + "loss": 0.3305, + "step": 20493 + }, + { + "epoch": 26.30808729139923, + "grad_norm": 2.0207619667053223, + "learning_rate": 2.456739409499358e-05, + "loss": 0.3788, + "step": 20494 + }, + { + "epoch": 26.309370988446727, + "grad_norm": 1.6120244264602661, + "learning_rate": 2.4566966195977753e-05, + "loss": 0.3736, + "step": 20495 + }, + { + "epoch": 26.310654685494224, + "grad_norm": 3.330326795578003, + "learning_rate": 2.4566538296961918e-05, + "loss": 0.3289, + "step": 20496 + }, + { + "epoch": 
26.31193838254172, + "grad_norm": 1.0673346519470215, + "learning_rate": 2.4566110397946083e-05, + "loss": 0.3312, + "step": 20497 + }, + { + "epoch": 26.313222079589217, + "grad_norm": 1.7945384979248047, + "learning_rate": 2.4565682498930255e-05, + "loss": 0.3716, + "step": 20498 + }, + { + "epoch": 26.314505776636715, + "grad_norm": 1.1298997402191162, + "learning_rate": 2.456525459991442e-05, + "loss": 0.357, + "step": 20499 + }, + { + "epoch": 26.31578947368421, + "grad_norm": 1.1653203964233398, + "learning_rate": 2.4564826700898588e-05, + "loss": 0.3808, + "step": 20500 + }, + { + "epoch": 26.317073170731707, + "grad_norm": 2.274167776107788, + "learning_rate": 2.4564398801882757e-05, + "loss": 0.3837, + "step": 20501 + }, + { + "epoch": 26.318356867779205, + "grad_norm": 5.001097679138184, + "learning_rate": 2.4563970902866925e-05, + "loss": 0.4143, + "step": 20502 + }, + { + "epoch": 26.319640564826702, + "grad_norm": 4.0127177238464355, + "learning_rate": 2.4563543003851093e-05, + "loss": 0.4362, + "step": 20503 + }, + { + "epoch": 26.320924261874197, + "grad_norm": 2.62684965133667, + "learning_rate": 2.456311510483526e-05, + "loss": 0.4697, + "step": 20504 + }, + { + "epoch": 26.322207958921695, + "grad_norm": 1.3108888864517212, + "learning_rate": 2.4562687205819427e-05, + "loss": 0.2992, + "step": 20505 + }, + { + "epoch": 26.323491655969192, + "grad_norm": 0.8980812430381775, + "learning_rate": 2.4562259306803595e-05, + "loss": 0.3248, + "step": 20506 + }, + { + "epoch": 26.324775353016687, + "grad_norm": 1.6354469060897827, + "learning_rate": 2.4561831407787764e-05, + "loss": 0.319, + "step": 20507 + }, + { + "epoch": 26.326059050064185, + "grad_norm": 2.3133881092071533, + "learning_rate": 2.456140350877193e-05, + "loss": 0.332, + "step": 20508 + }, + { + "epoch": 26.327342747111683, + "grad_norm": 0.8993603587150574, + "learning_rate": 2.45609756097561e-05, + "loss": 0.3079, + "step": 20509 + }, + { + "epoch": 26.328626444159177, + "grad_norm": 
2.02964448928833, + "learning_rate": 2.4560547710740266e-05, + "loss": 0.3044, + "step": 20510 + }, + { + "epoch": 26.329910141206675, + "grad_norm": 1.0131721496582031, + "learning_rate": 2.4560119811724434e-05, + "loss": 0.3362, + "step": 20511 + }, + { + "epoch": 26.331193838254173, + "grad_norm": 4.545472145080566, + "learning_rate": 2.4559691912708602e-05, + "loss": 0.3253, + "step": 20512 + }, + { + "epoch": 26.33247753530167, + "grad_norm": 1.2823201417922974, + "learning_rate": 2.4559264013692767e-05, + "loss": 0.314, + "step": 20513 + }, + { + "epoch": 26.333761232349165, + "grad_norm": 1.6708768606185913, + "learning_rate": 2.455883611467694e-05, + "loss": 0.3304, + "step": 20514 + }, + { + "epoch": 26.335044929396663, + "grad_norm": 1.8660638332366943, + "learning_rate": 2.4558408215661104e-05, + "loss": 0.3525, + "step": 20515 + }, + { + "epoch": 26.33632862644416, + "grad_norm": 1.110939383506775, + "learning_rate": 2.4557980316645273e-05, + "loss": 0.3205, + "step": 20516 + }, + { + "epoch": 26.337612323491655, + "grad_norm": 2.2802228927612305, + "learning_rate": 2.455755241762944e-05, + "loss": 0.3101, + "step": 20517 + }, + { + "epoch": 26.338896020539153, + "grad_norm": 1.472245216369629, + "learning_rate": 2.4557124518613606e-05, + "loss": 0.3176, + "step": 20518 + }, + { + "epoch": 26.34017971758665, + "grad_norm": 1.581457257270813, + "learning_rate": 2.4556696619597778e-05, + "loss": 0.2968, + "step": 20519 + }, + { + "epoch": 26.341463414634145, + "grad_norm": 2.6628496646881104, + "learning_rate": 2.4556268720581943e-05, + "loss": 0.3471, + "step": 20520 + }, + { + "epoch": 26.342747111681643, + "grad_norm": 0.8787893056869507, + "learning_rate": 2.455584082156611e-05, + "loss": 0.3145, + "step": 20521 + }, + { + "epoch": 26.34403080872914, + "grad_norm": 1.1307094097137451, + "learning_rate": 2.455541292255028e-05, + "loss": 0.3105, + "step": 20522 + }, + { + "epoch": 26.345314505776635, + "grad_norm": 1.2909269332885742, + "learning_rate": 
2.4554985023534448e-05, + "loss": 0.3272, + "step": 20523 + }, + { + "epoch": 26.346598202824133, + "grad_norm": 1.1278407573699951, + "learning_rate": 2.4554557124518613e-05, + "loss": 0.3187, + "step": 20524 + }, + { + "epoch": 26.34788189987163, + "grad_norm": 0.9820616841316223, + "learning_rate": 2.455412922550278e-05, + "loss": 0.3386, + "step": 20525 + }, + { + "epoch": 26.34916559691913, + "grad_norm": 1.1363604068756104, + "learning_rate": 2.455370132648695e-05, + "loss": 0.2859, + "step": 20526 + }, + { + "epoch": 26.350449293966623, + "grad_norm": 1.3382759094238281, + "learning_rate": 2.455327342747112e-05, + "loss": 0.32, + "step": 20527 + }, + { + "epoch": 26.35173299101412, + "grad_norm": 1.0917701721191406, + "learning_rate": 2.4552845528455287e-05, + "loss": 0.3428, + "step": 20528 + }, + { + "epoch": 26.35301668806162, + "grad_norm": 0.9712410569190979, + "learning_rate": 2.4552417629439452e-05, + "loss": 0.3095, + "step": 20529 + }, + { + "epoch": 26.354300385109113, + "grad_norm": 1.2528551816940308, + "learning_rate": 2.455198973042362e-05, + "loss": 0.3123, + "step": 20530 + }, + { + "epoch": 26.35558408215661, + "grad_norm": 1.0714143514633179, + "learning_rate": 2.455156183140779e-05, + "loss": 0.3389, + "step": 20531 + }, + { + "epoch": 26.35686777920411, + "grad_norm": 1.0488197803497314, + "learning_rate": 2.4551133932391954e-05, + "loss": 0.3245, + "step": 20532 + }, + { + "epoch": 26.358151476251603, + "grad_norm": 1.4320048093795776, + "learning_rate": 2.4550706033376125e-05, + "loss": 0.3399, + "step": 20533 + }, + { + "epoch": 26.3594351732991, + "grad_norm": 1.178789734840393, + "learning_rate": 2.455027813436029e-05, + "loss": 0.3509, + "step": 20534 + }, + { + "epoch": 26.3607188703466, + "grad_norm": 1.2615529298782349, + "learning_rate": 2.4549850235344462e-05, + "loss": 0.3042, + "step": 20535 + }, + { + "epoch": 26.362002567394097, + "grad_norm": 1.236266851425171, + "learning_rate": 2.4549422336328627e-05, + "loss": 0.3316, + 
"step": 20536 + }, + { + "epoch": 26.36328626444159, + "grad_norm": 1.0361764430999756, + "learning_rate": 2.4548994437312792e-05, + "loss": 0.3151, + "step": 20537 + }, + { + "epoch": 26.36456996148909, + "grad_norm": 2.5787086486816406, + "learning_rate": 2.4548566538296964e-05, + "loss": 0.3897, + "step": 20538 + }, + { + "epoch": 26.365853658536587, + "grad_norm": 0.9979249835014343, + "learning_rate": 2.454813863928113e-05, + "loss": 0.3087, + "step": 20539 + }, + { + "epoch": 26.36713735558408, + "grad_norm": 15.058501243591309, + "learning_rate": 2.4547710740265298e-05, + "loss": 0.3428, + "step": 20540 + }, + { + "epoch": 26.36842105263158, + "grad_norm": 1.552610158920288, + "learning_rate": 2.4547282841249466e-05, + "loss": 0.3866, + "step": 20541 + }, + { + "epoch": 26.369704749679077, + "grad_norm": 3.025646924972534, + "learning_rate": 2.4546854942233634e-05, + "loss": 0.3097, + "step": 20542 + }, + { + "epoch": 26.37098844672657, + "grad_norm": 1.3039119243621826, + "learning_rate": 2.4546427043217803e-05, + "loss": 0.3442, + "step": 20543 + }, + { + "epoch": 26.37227214377407, + "grad_norm": 1.0174890756607056, + "learning_rate": 2.4545999144201968e-05, + "loss": 0.3427, + "step": 20544 + }, + { + "epoch": 26.373555840821567, + "grad_norm": 1.601650595664978, + "learning_rate": 2.4545571245186136e-05, + "loss": 0.3181, + "step": 20545 + }, + { + "epoch": 26.374839537869065, + "grad_norm": 1.0188263654708862, + "learning_rate": 2.4545143346170305e-05, + "loss": 0.3222, + "step": 20546 + }, + { + "epoch": 26.37612323491656, + "grad_norm": 1.3604671955108643, + "learning_rate": 2.4544715447154473e-05, + "loss": 0.3456, + "step": 20547 + }, + { + "epoch": 26.377406931964057, + "grad_norm": 2.378868579864502, + "learning_rate": 2.4544287548138638e-05, + "loss": 0.3682, + "step": 20548 + }, + { + "epoch": 26.378690629011555, + "grad_norm": 1.9012795686721802, + "learning_rate": 2.454385964912281e-05, + "loss": 0.3562, + "step": 20549 + }, + { + "epoch": 
26.37997432605905, + "grad_norm": 1.2303537130355835, + "learning_rate": 2.4543431750106975e-05, + "loss": 0.3991, + "step": 20550 + }, + { + "epoch": 26.381258023106547, + "grad_norm": 1.1358728408813477, + "learning_rate": 2.4543003851091143e-05, + "loss": 0.3388, + "step": 20551 + }, + { + "epoch": 26.382541720154045, + "grad_norm": 1.632067322731018, + "learning_rate": 2.4542575952075312e-05, + "loss": 0.441, + "step": 20552 + }, + { + "epoch": 26.38382541720154, + "grad_norm": 1.2833366394042969, + "learning_rate": 2.4542148053059477e-05, + "loss": 0.4883, + "step": 20553 + }, + { + "epoch": 26.385109114249037, + "grad_norm": 1.9073377847671509, + "learning_rate": 2.454172015404365e-05, + "loss": 0.5712, + "step": 20554 + }, + { + "epoch": 26.386392811296535, + "grad_norm": 1.6394264698028564, + "learning_rate": 2.4541292255027814e-05, + "loss": 0.3433, + "step": 20555 + }, + { + "epoch": 26.387676508344033, + "grad_norm": 4.021840572357178, + "learning_rate": 2.4540864356011982e-05, + "loss": 0.2803, + "step": 20556 + }, + { + "epoch": 26.388960205391527, + "grad_norm": 1.0246769189834595, + "learning_rate": 2.454043645699615e-05, + "loss": 0.3301, + "step": 20557 + }, + { + "epoch": 26.390243902439025, + "grad_norm": 1.020865797996521, + "learning_rate": 2.4540008557980315e-05, + "loss": 0.3429, + "step": 20558 + }, + { + "epoch": 26.391527599486523, + "grad_norm": 1.293338418006897, + "learning_rate": 2.4539580658964487e-05, + "loss": 0.3264, + "step": 20559 + }, + { + "epoch": 26.392811296534017, + "grad_norm": 1.632703185081482, + "learning_rate": 2.4539152759948652e-05, + "loss": 0.3374, + "step": 20560 + }, + { + "epoch": 26.394094993581515, + "grad_norm": 1.2542574405670166, + "learning_rate": 2.453872486093282e-05, + "loss": 0.3167, + "step": 20561 + }, + { + "epoch": 26.395378690629013, + "grad_norm": 1.1809147596359253, + "learning_rate": 2.453829696191699e-05, + "loss": 0.338, + "step": 20562 + }, + { + "epoch": 26.396662387676507, + "grad_norm": 
1.2451130151748657, + "learning_rate": 2.4537869062901157e-05, + "loss": 0.3114, + "step": 20563 + }, + { + "epoch": 26.397946084724005, + "grad_norm": 0.9408824443817139, + "learning_rate": 2.4537441163885322e-05, + "loss": 0.3539, + "step": 20564 + }, + { + "epoch": 26.399229781771503, + "grad_norm": 1.178775668144226, + "learning_rate": 2.453701326486949e-05, + "loss": 0.3206, + "step": 20565 + }, + { + "epoch": 26.400513478818997, + "grad_norm": 1.1279231309890747, + "learning_rate": 2.453658536585366e-05, + "loss": 0.3613, + "step": 20566 + }, + { + "epoch": 26.401797175866495, + "grad_norm": 1.3011746406555176, + "learning_rate": 2.4536157466837824e-05, + "loss": 0.3225, + "step": 20567 + }, + { + "epoch": 26.403080872913993, + "grad_norm": 1.3661795854568481, + "learning_rate": 2.4535729567821996e-05, + "loss": 0.3209, + "step": 20568 + }, + { + "epoch": 26.40436456996149, + "grad_norm": 1.1261297464370728, + "learning_rate": 2.453530166880616e-05, + "loss": 0.3207, + "step": 20569 + }, + { + "epoch": 26.405648267008985, + "grad_norm": 1.6539722681045532, + "learning_rate": 2.4534873769790333e-05, + "loss": 0.342, + "step": 20570 + }, + { + "epoch": 26.406931964056483, + "grad_norm": 2.611433506011963, + "learning_rate": 2.4534445870774498e-05, + "loss": 0.3345, + "step": 20571 + }, + { + "epoch": 26.40821566110398, + "grad_norm": 1.2206532955169678, + "learning_rate": 2.4534017971758663e-05, + "loss": 0.3021, + "step": 20572 + }, + { + "epoch": 26.409499358151475, + "grad_norm": 1.068124771118164, + "learning_rate": 2.4533590072742835e-05, + "loss": 0.3484, + "step": 20573 + }, + { + "epoch": 26.410783055198973, + "grad_norm": 1.3187135457992554, + "learning_rate": 2.4533162173727e-05, + "loss": 0.3553, + "step": 20574 + }, + { + "epoch": 26.41206675224647, + "grad_norm": 1.5373585224151611, + "learning_rate": 2.4532734274711168e-05, + "loss": 0.3256, + "step": 20575 + }, + { + "epoch": 26.413350449293965, + "grad_norm": 1.4215340614318848, + 
"learning_rate": 2.4532306375695337e-05, + "loss": 0.3551, + "step": 20576 + }, + { + "epoch": 26.414634146341463, + "grad_norm": 1.2787210941314697, + "learning_rate": 2.4531878476679505e-05, + "loss": 0.3466, + "step": 20577 + }, + { + "epoch": 26.41591784338896, + "grad_norm": 2.8778834342956543, + "learning_rate": 2.4531450577663673e-05, + "loss": 0.3485, + "step": 20578 + }, + { + "epoch": 26.41720154043646, + "grad_norm": 0.9132496118545532, + "learning_rate": 2.453102267864784e-05, + "loss": 0.3574, + "step": 20579 + }, + { + "epoch": 26.418485237483953, + "grad_norm": 1.7319204807281494, + "learning_rate": 2.4530594779632007e-05, + "loss": 0.3211, + "step": 20580 + }, + { + "epoch": 26.41976893453145, + "grad_norm": 1.3139883279800415, + "learning_rate": 2.4530166880616175e-05, + "loss": 0.3063, + "step": 20581 + }, + { + "epoch": 26.42105263157895, + "grad_norm": 0.7363197803497314, + "learning_rate": 2.4529738981600344e-05, + "loss": 0.2816, + "step": 20582 + }, + { + "epoch": 26.422336328626443, + "grad_norm": 1.1781541109085083, + "learning_rate": 2.452931108258451e-05, + "loss": 0.32, + "step": 20583 + }, + { + "epoch": 26.42362002567394, + "grad_norm": 1.8213491439819336, + "learning_rate": 2.452888318356868e-05, + "loss": 0.3548, + "step": 20584 + }, + { + "epoch": 26.42490372272144, + "grad_norm": 3.8088319301605225, + "learning_rate": 2.4528455284552846e-05, + "loss": 0.3093, + "step": 20585 + }, + { + "epoch": 26.426187419768933, + "grad_norm": 0.8559215068817139, + "learning_rate": 2.4528027385537014e-05, + "loss": 0.3138, + "step": 20586 + }, + { + "epoch": 26.42747111681643, + "grad_norm": 1.0518684387207031, + "learning_rate": 2.4527599486521182e-05, + "loss": 0.3311, + "step": 20587 + }, + { + "epoch": 26.42875481386393, + "grad_norm": 3.212092876434326, + "learning_rate": 2.4527171587505347e-05, + "loss": 0.3366, + "step": 20588 + }, + { + "epoch": 26.430038510911427, + "grad_norm": 2.2232396602630615, + "learning_rate": 
2.452674368848952e-05, + "loss": 0.3408, + "step": 20589 + }, + { + "epoch": 26.43132220795892, + "grad_norm": 1.1837841272354126, + "learning_rate": 2.4526315789473684e-05, + "loss": 0.3361, + "step": 20590 + }, + { + "epoch": 26.43260590500642, + "grad_norm": 1.0314278602600098, + "learning_rate": 2.4525887890457853e-05, + "loss": 0.3628, + "step": 20591 + }, + { + "epoch": 26.433889602053917, + "grad_norm": 1.3456770181655884, + "learning_rate": 2.452545999144202e-05, + "loss": 0.3951, + "step": 20592 + }, + { + "epoch": 26.43517329910141, + "grad_norm": 1.152553915977478, + "learning_rate": 2.4525032092426186e-05, + "loss": 0.3032, + "step": 20593 + }, + { + "epoch": 26.43645699614891, + "grad_norm": 1.1373229026794434, + "learning_rate": 2.4524604193410358e-05, + "loss": 0.3378, + "step": 20594 + }, + { + "epoch": 26.437740693196407, + "grad_norm": 1.8608859777450562, + "learning_rate": 2.4524176294394523e-05, + "loss": 0.3489, + "step": 20595 + }, + { + "epoch": 26.4390243902439, + "grad_norm": 1.4278992414474487, + "learning_rate": 2.452374839537869e-05, + "loss": 0.3516, + "step": 20596 + }, + { + "epoch": 26.4403080872914, + "grad_norm": 1.1520287990570068, + "learning_rate": 2.452332049636286e-05, + "loss": 0.3538, + "step": 20597 + }, + { + "epoch": 26.441591784338897, + "grad_norm": 3.026257276535034, + "learning_rate": 2.4522892597347025e-05, + "loss": 0.3498, + "step": 20598 + }, + { + "epoch": 26.44287548138639, + "grad_norm": 1.9389842748641968, + "learning_rate": 2.4522464698331193e-05, + "loss": 0.3321, + "step": 20599 + }, + { + "epoch": 26.44415917843389, + "grad_norm": 2.114692211151123, + "learning_rate": 2.452203679931536e-05, + "loss": 0.4257, + "step": 20600 + }, + { + "epoch": 26.445442875481387, + "grad_norm": 2.1902594566345215, + "learning_rate": 2.452160890029953e-05, + "loss": 0.371, + "step": 20601 + }, + { + "epoch": 26.446726572528885, + "grad_norm": 1.8809231519699097, + "learning_rate": 2.45211810012837e-05, + "loss": 0.4292, + 
"step": 20602 + }, + { + "epoch": 26.44801026957638, + "grad_norm": 2.181342840194702, + "learning_rate": 2.4520753102267867e-05, + "loss": 0.4317, + "step": 20603 + }, + { + "epoch": 26.449293966623877, + "grad_norm": 2.284548282623291, + "learning_rate": 2.4520325203252032e-05, + "loss": 0.5785, + "step": 20604 + }, + { + "epoch": 26.450577663671375, + "grad_norm": 1.3592246770858765, + "learning_rate": 2.45198973042362e-05, + "loss": 0.3237, + "step": 20605 + }, + { + "epoch": 26.45186136071887, + "grad_norm": 0.9475399255752563, + "learning_rate": 2.451946940522037e-05, + "loss": 0.3232, + "step": 20606 + }, + { + "epoch": 26.453145057766367, + "grad_norm": 1.0305829048156738, + "learning_rate": 2.4519041506204534e-05, + "loss": 0.3213, + "step": 20607 + }, + { + "epoch": 26.454428754813865, + "grad_norm": 1.3392016887664795, + "learning_rate": 2.4518613607188705e-05, + "loss": 0.3294, + "step": 20608 + }, + { + "epoch": 26.45571245186136, + "grad_norm": 1.0204051733016968, + "learning_rate": 2.451818570817287e-05, + "loss": 0.3323, + "step": 20609 + }, + { + "epoch": 26.456996148908857, + "grad_norm": 0.8414825797080994, + "learning_rate": 2.4517757809157042e-05, + "loss": 0.3448, + "step": 20610 + }, + { + "epoch": 26.458279845956355, + "grad_norm": 0.8205938935279846, + "learning_rate": 2.4517329910141207e-05, + "loss": 0.3037, + "step": 20611 + }, + { + "epoch": 26.459563543003853, + "grad_norm": 1.0746268033981323, + "learning_rate": 2.4516902011125372e-05, + "loss": 0.314, + "step": 20612 + }, + { + "epoch": 26.460847240051347, + "grad_norm": 0.9594523906707764, + "learning_rate": 2.4516474112109544e-05, + "loss": 0.3174, + "step": 20613 + }, + { + "epoch": 26.462130937098845, + "grad_norm": 1.3980649709701538, + "learning_rate": 2.451604621309371e-05, + "loss": 0.3128, + "step": 20614 + }, + { + "epoch": 26.463414634146343, + "grad_norm": 0.7978208065032959, + "learning_rate": 2.4515618314077878e-05, + "loss": 0.3303, + "step": 20615 + }, + { + "epoch": 
26.464698331193837, + "grad_norm": 2.0418007373809814, + "learning_rate": 2.4515190415062046e-05, + "loss": 0.3422, + "step": 20616 + }, + { + "epoch": 26.465982028241335, + "grad_norm": 0.8561674356460571, + "learning_rate": 2.4514762516046214e-05, + "loss": 0.3012, + "step": 20617 + }, + { + "epoch": 26.467265725288833, + "grad_norm": 2.6597790718078613, + "learning_rate": 2.4514334617030383e-05, + "loss": 0.3315, + "step": 20618 + }, + { + "epoch": 26.468549422336327, + "grad_norm": 2.3598318099975586, + "learning_rate": 2.4513906718014548e-05, + "loss": 0.3364, + "step": 20619 + }, + { + "epoch": 26.469833119383825, + "grad_norm": 3.296992540359497, + "learning_rate": 2.4513478818998716e-05, + "loss": 0.3452, + "step": 20620 + }, + { + "epoch": 26.471116816431323, + "grad_norm": 1.1006840467453003, + "learning_rate": 2.4513050919982885e-05, + "loss": 0.3304, + "step": 20621 + }, + { + "epoch": 26.47240051347882, + "grad_norm": 3.9135570526123047, + "learning_rate": 2.4512623020967053e-05, + "loss": 0.3132, + "step": 20622 + }, + { + "epoch": 26.473684210526315, + "grad_norm": 1.4389820098876953, + "learning_rate": 2.4512195121951218e-05, + "loss": 0.3138, + "step": 20623 + }, + { + "epoch": 26.474967907573813, + "grad_norm": 1.1625005006790161, + "learning_rate": 2.451176722293539e-05, + "loss": 0.339, + "step": 20624 + }, + { + "epoch": 26.47625160462131, + "grad_norm": 0.8698553442955017, + "learning_rate": 2.4511339323919555e-05, + "loss": 0.2883, + "step": 20625 + }, + { + "epoch": 26.477535301668805, + "grad_norm": 1.1635239124298096, + "learning_rate": 2.4510911424903723e-05, + "loss": 0.3206, + "step": 20626 + }, + { + "epoch": 26.478818998716303, + "grad_norm": 2.2785141468048096, + "learning_rate": 2.4510483525887892e-05, + "loss": 0.3281, + "step": 20627 + }, + { + "epoch": 26.4801026957638, + "grad_norm": 1.1430420875549316, + "learning_rate": 2.4510055626872057e-05, + "loss": 0.3443, + "step": 20628 + }, + { + "epoch": 26.481386392811295, + 
"grad_norm": 0.9196429252624512, + "learning_rate": 2.450962772785623e-05, + "loss": 0.313, + "step": 20629 + }, + { + "epoch": 26.482670089858793, + "grad_norm": 0.9427947402000427, + "learning_rate": 2.4509199828840394e-05, + "loss": 0.3162, + "step": 20630 + }, + { + "epoch": 26.48395378690629, + "grad_norm": 1.8598780632019043, + "learning_rate": 2.4508771929824562e-05, + "loss": 0.335, + "step": 20631 + }, + { + "epoch": 26.485237483953785, + "grad_norm": 2.3636741638183594, + "learning_rate": 2.450834403080873e-05, + "loss": 0.3409, + "step": 20632 + }, + { + "epoch": 26.486521181001283, + "grad_norm": 0.9754962921142578, + "learning_rate": 2.4507916131792895e-05, + "loss": 0.3212, + "step": 20633 + }, + { + "epoch": 26.48780487804878, + "grad_norm": 1.2647455930709839, + "learning_rate": 2.4507488232777067e-05, + "loss": 0.3249, + "step": 20634 + }, + { + "epoch": 26.48908857509628, + "grad_norm": 0.9070084095001221, + "learning_rate": 2.4507060333761232e-05, + "loss": 0.342, + "step": 20635 + }, + { + "epoch": 26.490372272143773, + "grad_norm": 2.0446717739105225, + "learning_rate": 2.45066324347454e-05, + "loss": 0.3274, + "step": 20636 + }, + { + "epoch": 26.49165596919127, + "grad_norm": 1.0437663793563843, + "learning_rate": 2.450620453572957e-05, + "loss": 0.3202, + "step": 20637 + }, + { + "epoch": 26.49293966623877, + "grad_norm": 1.9067327976226807, + "learning_rate": 2.4505776636713738e-05, + "loss": 0.331, + "step": 20638 + }, + { + "epoch": 26.494223363286263, + "grad_norm": 1.081188678741455, + "learning_rate": 2.4505348737697903e-05, + "loss": 0.3184, + "step": 20639 + }, + { + "epoch": 26.49550706033376, + "grad_norm": 2.185203790664673, + "learning_rate": 2.450492083868207e-05, + "loss": 0.3459, + "step": 20640 + }, + { + "epoch": 26.49679075738126, + "grad_norm": 1.1115564107894897, + "learning_rate": 2.450449293966624e-05, + "loss": 0.3751, + "step": 20641 + }, + { + "epoch": 26.498074454428753, + "grad_norm": 1.3162953853607178, + 
"learning_rate": 2.4504065040650408e-05, + "loss": 0.3596, + "step": 20642 + }, + { + "epoch": 26.49935815147625, + "grad_norm": 1.363154411315918, + "learning_rate": 2.4503637141634576e-05, + "loss": 0.3367, + "step": 20643 + }, + { + "epoch": 26.50064184852375, + "grad_norm": 4.854251384735107, + "learning_rate": 2.450320924261874e-05, + "loss": 0.3238, + "step": 20644 + }, + { + "epoch": 26.501925545571247, + "grad_norm": 1.2997920513153076, + "learning_rate": 2.4502781343602913e-05, + "loss": 0.355, + "step": 20645 + }, + { + "epoch": 26.50320924261874, + "grad_norm": 1.481009840965271, + "learning_rate": 2.4502353444587078e-05, + "loss": 0.3416, + "step": 20646 + }, + { + "epoch": 26.50449293966624, + "grad_norm": 2.4614386558532715, + "learning_rate": 2.4501925545571243e-05, + "loss": 0.3536, + "step": 20647 + }, + { + "epoch": 26.505776636713737, + "grad_norm": 1.3556357622146606, + "learning_rate": 2.4501497646555415e-05, + "loss": 0.3931, + "step": 20648 + }, + { + "epoch": 26.50706033376123, + "grad_norm": 3.078784942626953, + "learning_rate": 2.450106974753958e-05, + "loss": 0.3857, + "step": 20649 + }, + { + "epoch": 26.50834403080873, + "grad_norm": 2.63063907623291, + "learning_rate": 2.450064184852375e-05, + "loss": 0.3774, + "step": 20650 + }, + { + "epoch": 26.509627727856227, + "grad_norm": 2.4784064292907715, + "learning_rate": 2.4500213949507917e-05, + "loss": 0.3686, + "step": 20651 + }, + { + "epoch": 26.51091142490372, + "grad_norm": 10.244682312011719, + "learning_rate": 2.4499786050492085e-05, + "loss": 0.4634, + "step": 20652 + }, + { + "epoch": 26.51219512195122, + "grad_norm": 1.491782307624817, + "learning_rate": 2.4499358151476254e-05, + "loss": 0.4066, + "step": 20653 + }, + { + "epoch": 26.513478818998717, + "grad_norm": 4.774467945098877, + "learning_rate": 2.449893025246042e-05, + "loss": 0.5298, + "step": 20654 + }, + { + "epoch": 26.514762516046215, + "grad_norm": 4.458399295806885, + "learning_rate": 2.4498502353444587e-05, + 
"loss": 0.3273, + "step": 20655 + }, + { + "epoch": 26.51604621309371, + "grad_norm": 2.907571792602539, + "learning_rate": 2.4498074454428755e-05, + "loss": 0.3095, + "step": 20656 + }, + { + "epoch": 26.517329910141207, + "grad_norm": 1.1480225324630737, + "learning_rate": 2.4497646555412924e-05, + "loss": 0.3425, + "step": 20657 + }, + { + "epoch": 26.518613607188705, + "grad_norm": 1.079278588294983, + "learning_rate": 2.4497218656397092e-05, + "loss": 0.3353, + "step": 20658 + }, + { + "epoch": 26.5198973042362, + "grad_norm": 1.2826247215270996, + "learning_rate": 2.4496790757381257e-05, + "loss": 0.356, + "step": 20659 + }, + { + "epoch": 26.521181001283697, + "grad_norm": 1.113809585571289, + "learning_rate": 2.4496362858365426e-05, + "loss": 0.3515, + "step": 20660 + }, + { + "epoch": 26.522464698331195, + "grad_norm": 0.8953080177307129, + "learning_rate": 2.4495934959349594e-05, + "loss": 0.3238, + "step": 20661 + }, + { + "epoch": 26.52374839537869, + "grad_norm": 1.1650166511535645, + "learning_rate": 2.4495507060333762e-05, + "loss": 0.3134, + "step": 20662 + }, + { + "epoch": 26.525032092426187, + "grad_norm": 3.5938973426818848, + "learning_rate": 2.4495079161317927e-05, + "loss": 0.3832, + "step": 20663 + }, + { + "epoch": 26.526315789473685, + "grad_norm": 2.613783597946167, + "learning_rate": 2.44946512623021e-05, + "loss": 0.3194, + "step": 20664 + }, + { + "epoch": 26.527599486521183, + "grad_norm": 1.5262272357940674, + "learning_rate": 2.4494223363286264e-05, + "loss": 0.334, + "step": 20665 + }, + { + "epoch": 26.528883183568677, + "grad_norm": 1.1780091524124146, + "learning_rate": 2.4493795464270433e-05, + "loss": 0.3554, + "step": 20666 + }, + { + "epoch": 26.530166880616175, + "grad_norm": 1.1986843347549438, + "learning_rate": 2.44933675652546e-05, + "loss": 0.3176, + "step": 20667 + }, + { + "epoch": 26.531450577663673, + "grad_norm": 1.6040688753128052, + "learning_rate": 2.4492939666238766e-05, + "loss": 0.3403, + "step": 20668 + }, 
+ { + "epoch": 26.532734274711167, + "grad_norm": 0.9066311120986938, + "learning_rate": 2.4492511767222938e-05, + "loss": 0.3016, + "step": 20669 + }, + { + "epoch": 26.534017971758665, + "grad_norm": 1.6921807527542114, + "learning_rate": 2.4492083868207103e-05, + "loss": 0.3255, + "step": 20670 + }, + { + "epoch": 26.535301668806163, + "grad_norm": 1.676769733428955, + "learning_rate": 2.449165596919127e-05, + "loss": 0.3313, + "step": 20671 + }, + { + "epoch": 26.536585365853657, + "grad_norm": 2.4076077938079834, + "learning_rate": 2.449122807017544e-05, + "loss": 0.335, + "step": 20672 + }, + { + "epoch": 26.537869062901155, + "grad_norm": 0.9933673739433289, + "learning_rate": 2.4490800171159605e-05, + "loss": 0.3268, + "step": 20673 + }, + { + "epoch": 26.539152759948653, + "grad_norm": 1.2314889430999756, + "learning_rate": 2.4490372272143777e-05, + "loss": 0.3285, + "step": 20674 + }, + { + "epoch": 26.540436456996147, + "grad_norm": 1.3792505264282227, + "learning_rate": 2.448994437312794e-05, + "loss": 0.3503, + "step": 20675 + }, + { + "epoch": 26.541720154043645, + "grad_norm": 1.5679829120635986, + "learning_rate": 2.448951647411211e-05, + "loss": 0.301, + "step": 20676 + }, + { + "epoch": 26.543003851091143, + "grad_norm": 2.048908233642578, + "learning_rate": 2.448908857509628e-05, + "loss": 0.3341, + "step": 20677 + }, + { + "epoch": 26.54428754813864, + "grad_norm": 1.2330553531646729, + "learning_rate": 2.4488660676080447e-05, + "loss": 0.3152, + "step": 20678 + }, + { + "epoch": 26.545571245186135, + "grad_norm": 1.0285851955413818, + "learning_rate": 2.4488232777064612e-05, + "loss": 0.3509, + "step": 20679 + }, + { + "epoch": 26.546854942233633, + "grad_norm": 1.218829870223999, + "learning_rate": 2.448780487804878e-05, + "loss": 0.3465, + "step": 20680 + }, + { + "epoch": 26.54813863928113, + "grad_norm": 1.7027469873428345, + "learning_rate": 2.448737697903295e-05, + "loss": 0.3516, + "step": 20681 + }, + { + "epoch": 26.549422336328625, + 
"grad_norm": 1.587557077407837, + "learning_rate": 2.4486949080017117e-05, + "loss": 0.3224, + "step": 20682 + }, + { + "epoch": 26.550706033376123, + "grad_norm": 7.800666809082031, + "learning_rate": 2.4486521181001286e-05, + "loss": 0.3218, + "step": 20683 + }, + { + "epoch": 26.55198973042362, + "grad_norm": 1.8678398132324219, + "learning_rate": 2.448609328198545e-05, + "loss": 0.3712, + "step": 20684 + }, + { + "epoch": 26.553273427471115, + "grad_norm": 1.9880220890045166, + "learning_rate": 2.4485665382969622e-05, + "loss": 0.312, + "step": 20685 + }, + { + "epoch": 26.554557124518613, + "grad_norm": 1.1537531614303589, + "learning_rate": 2.4485237483953787e-05, + "loss": 0.3484, + "step": 20686 + }, + { + "epoch": 26.55584082156611, + "grad_norm": 1.2995104789733887, + "learning_rate": 2.4484809584937952e-05, + "loss": 0.3171, + "step": 20687 + }, + { + "epoch": 26.55712451861361, + "grad_norm": 1.8409223556518555, + "learning_rate": 2.4484381685922124e-05, + "loss": 0.3692, + "step": 20688 + }, + { + "epoch": 26.558408215661103, + "grad_norm": 3.8627984523773193, + "learning_rate": 2.448395378690629e-05, + "loss": 0.3314, + "step": 20689 + }, + { + "epoch": 26.5596919127086, + "grad_norm": 3.187316417694092, + "learning_rate": 2.448352588789046e-05, + "loss": 0.3375, + "step": 20690 + }, + { + "epoch": 26.5609756097561, + "grad_norm": 1.9177100658416748, + "learning_rate": 2.4483097988874626e-05, + "loss": 0.3505, + "step": 20691 + }, + { + "epoch": 26.562259306803593, + "grad_norm": 1.504603624343872, + "learning_rate": 2.4482670089858794e-05, + "loss": 0.3448, + "step": 20692 + }, + { + "epoch": 26.56354300385109, + "grad_norm": 3.7219502925872803, + "learning_rate": 2.4482242190842963e-05, + "loss": 0.361, + "step": 20693 + }, + { + "epoch": 26.56482670089859, + "grad_norm": 1.784618854522705, + "learning_rate": 2.4481814291827128e-05, + "loss": 0.3446, + "step": 20694 + }, + { + "epoch": 26.566110397946083, + "grad_norm": 1.2351970672607422, + 
"learning_rate": 2.4481386392811296e-05, + "loss": 0.309, + "step": 20695 + }, + { + "epoch": 26.56739409499358, + "grad_norm": 1.1024726629257202, + "learning_rate": 2.4480958493795465e-05, + "loss": 0.3691, + "step": 20696 + }, + { + "epoch": 26.56867779204108, + "grad_norm": 1.5668302774429321, + "learning_rate": 2.4480530594779633e-05, + "loss": 0.3586, + "step": 20697 + }, + { + "epoch": 26.569961489088577, + "grad_norm": 1.3848634958267212, + "learning_rate": 2.44801026957638e-05, + "loss": 0.3593, + "step": 20698 + }, + { + "epoch": 26.57124518613607, + "grad_norm": 2.546964406967163, + "learning_rate": 2.447967479674797e-05, + "loss": 0.359, + "step": 20699 + }, + { + "epoch": 26.57252888318357, + "grad_norm": 6.197642803192139, + "learning_rate": 2.4479246897732135e-05, + "loss": 0.3665, + "step": 20700 + }, + { + "epoch": 26.573812580231067, + "grad_norm": 2.0833964347839355, + "learning_rate": 2.4478818998716303e-05, + "loss": 0.3907, + "step": 20701 + }, + { + "epoch": 26.57509627727856, + "grad_norm": 1.7436636686325073, + "learning_rate": 2.4478391099700472e-05, + "loss": 0.4065, + "step": 20702 + }, + { + "epoch": 26.57637997432606, + "grad_norm": 2.4211933612823486, + "learning_rate": 2.4477963200684637e-05, + "loss": 0.4625, + "step": 20703 + }, + { + "epoch": 26.577663671373557, + "grad_norm": 1.7442152500152588, + "learning_rate": 2.447753530166881e-05, + "loss": 0.5004, + "step": 20704 + }, + { + "epoch": 26.57894736842105, + "grad_norm": 0.8101143836975098, + "learning_rate": 2.4477107402652974e-05, + "loss": 0.3247, + "step": 20705 + }, + { + "epoch": 26.58023106546855, + "grad_norm": 0.8563016057014465, + "learning_rate": 2.4476679503637145e-05, + "loss": 0.3208, + "step": 20706 + }, + { + "epoch": 26.581514762516047, + "grad_norm": 1.1231045722961426, + "learning_rate": 2.447625160462131e-05, + "loss": 0.3481, + "step": 20707 + }, + { + "epoch": 26.58279845956354, + "grad_norm": 0.9903632998466492, + "learning_rate": 2.4475823705605476e-05, 
+ "loss": 0.3799, + "step": 20708 + }, + { + "epoch": 26.58408215661104, + "grad_norm": 1.3981879949569702, + "learning_rate": 2.4475395806589647e-05, + "loss": 0.3325, + "step": 20709 + }, + { + "epoch": 26.585365853658537, + "grad_norm": 0.814812421798706, + "learning_rate": 2.4474967907573812e-05, + "loss": 0.2976, + "step": 20710 + }, + { + "epoch": 26.586649550706035, + "grad_norm": 3.474919319152832, + "learning_rate": 2.447454000855798e-05, + "loss": 0.2941, + "step": 20711 + }, + { + "epoch": 26.58793324775353, + "grad_norm": 1.8942193984985352, + "learning_rate": 2.447411210954215e-05, + "loss": 0.3238, + "step": 20712 + }, + { + "epoch": 26.589216944801027, + "grad_norm": 1.1574218273162842, + "learning_rate": 2.4473684210526318e-05, + "loss": 0.3183, + "step": 20713 + }, + { + "epoch": 26.590500641848525, + "grad_norm": 1.9939888715744019, + "learning_rate": 2.4473256311510486e-05, + "loss": 0.3157, + "step": 20714 + }, + { + "epoch": 26.59178433889602, + "grad_norm": 1.8160189390182495, + "learning_rate": 2.447282841249465e-05, + "loss": 0.347, + "step": 20715 + }, + { + "epoch": 26.593068035943517, + "grad_norm": 1.893416166305542, + "learning_rate": 2.447240051347882e-05, + "loss": 0.3431, + "step": 20716 + }, + { + "epoch": 26.594351732991015, + "grad_norm": 1.1715447902679443, + "learning_rate": 2.4471972614462988e-05, + "loss": 0.33, + "step": 20717 + }, + { + "epoch": 26.59563543003851, + "grad_norm": 1.4700919389724731, + "learning_rate": 2.4471544715447156e-05, + "loss": 0.3536, + "step": 20718 + }, + { + "epoch": 26.596919127086007, + "grad_norm": 1.293154239654541, + "learning_rate": 2.447111681643132e-05, + "loss": 0.3131, + "step": 20719 + }, + { + "epoch": 26.598202824133505, + "grad_norm": 1.3381481170654297, + "learning_rate": 2.447068891741549e-05, + "loss": 0.3778, + "step": 20720 + }, + { + "epoch": 26.599486521181003, + "grad_norm": 2.0680084228515625, + "learning_rate": 2.4470261018399658e-05, + "loss": 0.2999, + "step": 20721 + }, + 
{ + "epoch": 26.600770218228497, + "grad_norm": 1.661065936088562, + "learning_rate": 2.4469833119383826e-05, + "loss": 0.3141, + "step": 20722 + }, + { + "epoch": 26.602053915275995, + "grad_norm": 0.8286745548248291, + "learning_rate": 2.4469405220367995e-05, + "loss": 0.3027, + "step": 20723 + }, + { + "epoch": 26.603337612323493, + "grad_norm": 1.3627859354019165, + "learning_rate": 2.446897732135216e-05, + "loss": 0.3392, + "step": 20724 + }, + { + "epoch": 26.604621309370987, + "grad_norm": 1.5994644165039062, + "learning_rate": 2.4468549422336332e-05, + "loss": 0.3305, + "step": 20725 + }, + { + "epoch": 26.605905006418485, + "grad_norm": 1.2257095575332642, + "learning_rate": 2.4468121523320497e-05, + "loss": 0.3961, + "step": 20726 + }, + { + "epoch": 26.607188703465983, + "grad_norm": 1.0653188228607178, + "learning_rate": 2.4467693624304662e-05, + "loss": 0.3212, + "step": 20727 + }, + { + "epoch": 26.608472400513477, + "grad_norm": 1.2383027076721191, + "learning_rate": 2.4467265725288834e-05, + "loss": 0.336, + "step": 20728 + }, + { + "epoch": 26.609756097560975, + "grad_norm": 1.5229688882827759, + "learning_rate": 2.4466837826273e-05, + "loss": 0.3111, + "step": 20729 + }, + { + "epoch": 26.611039794608473, + "grad_norm": 1.1834065914154053, + "learning_rate": 2.446640992725717e-05, + "loss": 0.3613, + "step": 20730 + }, + { + "epoch": 26.61232349165597, + "grad_norm": 1.5929477214813232, + "learning_rate": 2.4465982028241335e-05, + "loss": 0.3261, + "step": 20731 + }, + { + "epoch": 26.613607188703465, + "grad_norm": 1.0795515775680542, + "learning_rate": 2.4465554129225504e-05, + "loss": 0.3362, + "step": 20732 + }, + { + "epoch": 26.614890885750963, + "grad_norm": 1.0496159791946411, + "learning_rate": 2.4465126230209672e-05, + "loss": 0.3078, + "step": 20733 + }, + { + "epoch": 26.61617458279846, + "grad_norm": 1.1013174057006836, + "learning_rate": 2.4464698331193837e-05, + "loss": 0.3132, + "step": 20734 + }, + { + "epoch": 26.617458279845955, 
+ "grad_norm": 3.1702606678009033, + "learning_rate": 2.4464270432178006e-05, + "loss": 0.3557, + "step": 20735 + }, + { + "epoch": 26.618741976893453, + "grad_norm": 1.7545692920684814, + "learning_rate": 2.4463842533162174e-05, + "loss": 0.353, + "step": 20736 + }, + { + "epoch": 26.62002567394095, + "grad_norm": 1.9837123155593872, + "learning_rate": 2.4463414634146343e-05, + "loss": 0.3884, + "step": 20737 + }, + { + "epoch": 26.621309370988445, + "grad_norm": 1.8921303749084473, + "learning_rate": 2.446298673513051e-05, + "loss": 0.3671, + "step": 20738 + }, + { + "epoch": 26.622593068035943, + "grad_norm": 1.7406141757965088, + "learning_rate": 2.446255883611468e-05, + "loss": 0.3854, + "step": 20739 + }, + { + "epoch": 26.62387676508344, + "grad_norm": 1.3388770818710327, + "learning_rate": 2.4462130937098844e-05, + "loss": 0.3457, + "step": 20740 + }, + { + "epoch": 26.625160462130935, + "grad_norm": 4.25235652923584, + "learning_rate": 2.4461703038083013e-05, + "loss": 0.3428, + "step": 20741 + }, + { + "epoch": 26.626444159178433, + "grad_norm": 1.3852797746658325, + "learning_rate": 2.446127513906718e-05, + "loss": 0.3706, + "step": 20742 + }, + { + "epoch": 26.62772785622593, + "grad_norm": 2.81321382522583, + "learning_rate": 2.4460847240051346e-05, + "loss": 0.3197, + "step": 20743 + }, + { + "epoch": 26.62901155327343, + "grad_norm": 1.0948907136917114, + "learning_rate": 2.4460419341035518e-05, + "loss": 0.3353, + "step": 20744 + }, + { + "epoch": 26.630295250320923, + "grad_norm": 1.2664055824279785, + "learning_rate": 2.4459991442019683e-05, + "loss": 0.3617, + "step": 20745 + }, + { + "epoch": 26.63157894736842, + "grad_norm": 1.207558512687683, + "learning_rate": 2.4459563543003855e-05, + "loss": 0.3425, + "step": 20746 + }, + { + "epoch": 26.63286264441592, + "grad_norm": 1.5189590454101562, + "learning_rate": 2.445913564398802e-05, + "loss": 0.4612, + "step": 20747 + }, + { + "epoch": 26.634146341463413, + "grad_norm": 3.681527853012085, + 
"learning_rate": 2.4458707744972185e-05, + "loss": 0.3486, + "step": 20748 + }, + { + "epoch": 26.63543003851091, + "grad_norm": 2.0373334884643555, + "learning_rate": 2.4458279845956357e-05, + "loss": 0.3614, + "step": 20749 + }, + { + "epoch": 26.63671373555841, + "grad_norm": 1.3376837968826294, + "learning_rate": 2.4457851946940522e-05, + "loss": 0.3734, + "step": 20750 + }, + { + "epoch": 26.637997432605903, + "grad_norm": 1.5557255744934082, + "learning_rate": 2.445742404792469e-05, + "loss": 0.3704, + "step": 20751 + }, + { + "epoch": 26.6392811296534, + "grad_norm": 1.765899419784546, + "learning_rate": 2.445699614890886e-05, + "loss": 0.4016, + "step": 20752 + }, + { + "epoch": 26.6405648267009, + "grad_norm": 4.759716510772705, + "learning_rate": 2.4456568249893027e-05, + "loss": 0.4338, + "step": 20753 + }, + { + "epoch": 26.641848523748397, + "grad_norm": 1.943234920501709, + "learning_rate": 2.4456140350877195e-05, + "loss": 0.54, + "step": 20754 + }, + { + "epoch": 26.64313222079589, + "grad_norm": 3.043231248855591, + "learning_rate": 2.445571245186136e-05, + "loss": 0.3387, + "step": 20755 + }, + { + "epoch": 26.64441591784339, + "grad_norm": 0.9014863967895508, + "learning_rate": 2.445528455284553e-05, + "loss": 0.3638, + "step": 20756 + }, + { + "epoch": 26.645699614890887, + "grad_norm": 1.1981242895126343, + "learning_rate": 2.4454856653829697e-05, + "loss": 0.3136, + "step": 20757 + }, + { + "epoch": 26.64698331193838, + "grad_norm": 1.0018032789230347, + "learning_rate": 2.4454428754813866e-05, + "loss": 0.3308, + "step": 20758 + }, + { + "epoch": 26.64826700898588, + "grad_norm": 1.2280926704406738, + "learning_rate": 2.445400085579803e-05, + "loss": 0.3144, + "step": 20759 + }, + { + "epoch": 26.649550706033377, + "grad_norm": 0.8417773842811584, + "learning_rate": 2.4453572956782202e-05, + "loss": 0.3169, + "step": 20760 + }, + { + "epoch": 26.65083440308087, + "grad_norm": 1.1297180652618408, + "learning_rate": 2.4453145057766367e-05, + 
"loss": 0.3305, + "step": 20761 + }, + { + "epoch": 26.65211810012837, + "grad_norm": 2.5662238597869873, + "learning_rate": 2.4452717158750536e-05, + "loss": 0.33, + "step": 20762 + }, + { + "epoch": 26.653401797175867, + "grad_norm": 1.3731694221496582, + "learning_rate": 2.4452289259734704e-05, + "loss": 0.3266, + "step": 20763 + }, + { + "epoch": 26.654685494223365, + "grad_norm": 1.2703962326049805, + "learning_rate": 2.445186136071887e-05, + "loss": 0.3437, + "step": 20764 + }, + { + "epoch": 26.65596919127086, + "grad_norm": 1.3822330236434937, + "learning_rate": 2.445143346170304e-05, + "loss": 0.2975, + "step": 20765 + }, + { + "epoch": 26.657252888318357, + "grad_norm": 1.2742308378219604, + "learning_rate": 2.4451005562687206e-05, + "loss": 0.361, + "step": 20766 + }, + { + "epoch": 26.658536585365855, + "grad_norm": 0.9785916209220886, + "learning_rate": 2.4450577663671375e-05, + "loss": 0.3323, + "step": 20767 + }, + { + "epoch": 26.65982028241335, + "grad_norm": 1.076230525970459, + "learning_rate": 2.4450149764655543e-05, + "loss": 0.3066, + "step": 20768 + }, + { + "epoch": 26.661103979460847, + "grad_norm": 0.8279434442520142, + "learning_rate": 2.4449721865639708e-05, + "loss": 0.3164, + "step": 20769 + }, + { + "epoch": 26.662387676508345, + "grad_norm": 1.5969491004943848, + "learning_rate": 2.4449293966623876e-05, + "loss": 0.3191, + "step": 20770 + }, + { + "epoch": 26.66367137355584, + "grad_norm": 1.539189100265503, + "learning_rate": 2.4448866067608045e-05, + "loss": 0.3358, + "step": 20771 + }, + { + "epoch": 26.664955070603337, + "grad_norm": 1.5474456548690796, + "learning_rate": 2.4448438168592213e-05, + "loss": 0.3111, + "step": 20772 + }, + { + "epoch": 26.666238767650835, + "grad_norm": 0.9195961952209473, + "learning_rate": 2.444801026957638e-05, + "loss": 0.3387, + "step": 20773 + }, + { + "epoch": 26.66752246469833, + "grad_norm": 1.4307600259780884, + "learning_rate": 2.444758237056055e-05, + "loss": 0.3263, + "step": 20774 + }, 
+ { + "epoch": 26.668806161745827, + "grad_norm": 0.767900288105011, + "learning_rate": 2.4447154471544715e-05, + "loss": 0.3261, + "step": 20775 + }, + { + "epoch": 26.670089858793325, + "grad_norm": 0.8477922677993774, + "learning_rate": 2.4446726572528883e-05, + "loss": 0.3269, + "step": 20776 + }, + { + "epoch": 26.671373555840823, + "grad_norm": 1.3898544311523438, + "learning_rate": 2.4446298673513052e-05, + "loss": 0.3507, + "step": 20777 + }, + { + "epoch": 26.672657252888317, + "grad_norm": 1.4726494550704956, + "learning_rate": 2.4445870774497217e-05, + "loss": 0.3204, + "step": 20778 + }, + { + "epoch": 26.673940949935815, + "grad_norm": 1.8341046571731567, + "learning_rate": 2.444544287548139e-05, + "loss": 0.3439, + "step": 20779 + }, + { + "epoch": 26.675224646983313, + "grad_norm": 1.8389606475830078, + "learning_rate": 2.4445014976465554e-05, + "loss": 0.2924, + "step": 20780 + }, + { + "epoch": 26.676508344030808, + "grad_norm": 1.1943163871765137, + "learning_rate": 2.4444587077449722e-05, + "loss": 0.3285, + "step": 20781 + }, + { + "epoch": 26.677792041078305, + "grad_norm": 1.6546121835708618, + "learning_rate": 2.444415917843389e-05, + "loss": 0.35, + "step": 20782 + }, + { + "epoch": 26.679075738125803, + "grad_norm": 1.3174890279769897, + "learning_rate": 2.4443731279418056e-05, + "loss": 0.3153, + "step": 20783 + }, + { + "epoch": 26.680359435173298, + "grad_norm": 2.2842066287994385, + "learning_rate": 2.4443303380402227e-05, + "loss": 0.3091, + "step": 20784 + }, + { + "epoch": 26.681643132220795, + "grad_norm": 3.521811008453369, + "learning_rate": 2.4442875481386392e-05, + "loss": 0.33, + "step": 20785 + }, + { + "epoch": 26.682926829268293, + "grad_norm": 1.5304280519485474, + "learning_rate": 2.444244758237056e-05, + "loss": 0.3529, + "step": 20786 + }, + { + "epoch": 26.68421052631579, + "grad_norm": 1.5302468538284302, + "learning_rate": 2.444201968335473e-05, + "loss": 0.3635, + "step": 20787 + }, + { + "epoch": 26.685494223363285, 
+ "grad_norm": 1.1290090084075928, + "learning_rate": 2.4441591784338894e-05, + "loss": 0.3208, + "step": 20788 + }, + { + "epoch": 26.686777920410783, + "grad_norm": 1.178033471107483, + "learning_rate": 2.4441163885323066e-05, + "loss": 0.3334, + "step": 20789 + }, + { + "epoch": 26.68806161745828, + "grad_norm": 1.3066424131393433, + "learning_rate": 2.444073598630723e-05, + "loss": 0.3759, + "step": 20790 + }, + { + "epoch": 26.689345314505776, + "grad_norm": 1.4304300546646118, + "learning_rate": 2.44403080872914e-05, + "loss": 0.3163, + "step": 20791 + }, + { + "epoch": 26.690629011553273, + "grad_norm": 2.418018102645874, + "learning_rate": 2.4439880188275568e-05, + "loss": 0.318, + "step": 20792 + }, + { + "epoch": 26.69191270860077, + "grad_norm": 2.298902988433838, + "learning_rate": 2.4439452289259736e-05, + "loss": 0.359, + "step": 20793 + }, + { + "epoch": 26.693196405648266, + "grad_norm": 1.3160479068756104, + "learning_rate": 2.44390243902439e-05, + "loss": 0.3381, + "step": 20794 + }, + { + "epoch": 26.694480102695763, + "grad_norm": 1.5375807285308838, + "learning_rate": 2.443859649122807e-05, + "loss": 0.3782, + "step": 20795 + }, + { + "epoch": 26.69576379974326, + "grad_norm": 2.2859818935394287, + "learning_rate": 2.4438168592212238e-05, + "loss": 0.4511, + "step": 20796 + }, + { + "epoch": 26.69704749679076, + "grad_norm": 1.9960017204284668, + "learning_rate": 2.4437740693196407e-05, + "loss": 0.3721, + "step": 20797 + }, + { + "epoch": 26.698331193838253, + "grad_norm": 8.140814781188965, + "learning_rate": 2.4437312794180575e-05, + "loss": 0.3311, + "step": 20798 + }, + { + "epoch": 26.69961489088575, + "grad_norm": 1.4330514669418335, + "learning_rate": 2.443688489516474e-05, + "loss": 0.365, + "step": 20799 + }, + { + "epoch": 26.70089858793325, + "grad_norm": 1.6450878381729126, + "learning_rate": 2.4436456996148912e-05, + "loss": 0.3754, + "step": 20800 + }, + { + "epoch": 26.702182284980744, + "grad_norm": 1.4142215251922607, + 
"learning_rate": 2.4436029097133077e-05, + "loss": 0.3689, + "step": 20801 + }, + { + "epoch": 26.70346598202824, + "grad_norm": 1.1609466075897217, + "learning_rate": 2.4435601198117242e-05, + "loss": 0.4228, + "step": 20802 + }, + { + "epoch": 26.70474967907574, + "grad_norm": 3.1039516925811768, + "learning_rate": 2.4435173299101414e-05, + "loss": 0.5076, + "step": 20803 + }, + { + "epoch": 26.706033376123234, + "grad_norm": 4.66117000579834, + "learning_rate": 2.443474540008558e-05, + "loss": 0.535, + "step": 20804 + }, + { + "epoch": 26.70731707317073, + "grad_norm": 1.044055700302124, + "learning_rate": 2.443431750106975e-05, + "loss": 0.3205, + "step": 20805 + }, + { + "epoch": 26.70860077021823, + "grad_norm": 0.9393153190612793, + "learning_rate": 2.4433889602053915e-05, + "loss": 0.3266, + "step": 20806 + }, + { + "epoch": 26.709884467265724, + "grad_norm": 0.9814698100090027, + "learning_rate": 2.4433461703038084e-05, + "loss": 0.3095, + "step": 20807 + }, + { + "epoch": 26.71116816431322, + "grad_norm": 1.2949206829071045, + "learning_rate": 2.4433033804022252e-05, + "loss": 0.3599, + "step": 20808 + }, + { + "epoch": 26.71245186136072, + "grad_norm": 1.221364140510559, + "learning_rate": 2.4432605905006417e-05, + "loss": 0.3353, + "step": 20809 + }, + { + "epoch": 26.713735558408217, + "grad_norm": 1.436446189880371, + "learning_rate": 2.4432178005990586e-05, + "loss": 0.3491, + "step": 20810 + }, + { + "epoch": 26.71501925545571, + "grad_norm": 0.8818352818489075, + "learning_rate": 2.4431750106974754e-05, + "loss": 0.3573, + "step": 20811 + }, + { + "epoch": 26.71630295250321, + "grad_norm": 0.9642578959465027, + "learning_rate": 2.4431322207958923e-05, + "loss": 0.3258, + "step": 20812 + }, + { + "epoch": 26.717586649550707, + "grad_norm": 0.8385055065155029, + "learning_rate": 2.443089430894309e-05, + "loss": 0.3566, + "step": 20813 + }, + { + "epoch": 26.7188703465982, + "grad_norm": 1.4125986099243164, + "learning_rate": 2.443046640992726e-05, + 
"loss": 0.3392, + "step": 20814 + }, + { + "epoch": 26.7201540436457, + "grad_norm": 4.444985389709473, + "learning_rate": 2.4430038510911424e-05, + "loss": 0.3464, + "step": 20815 + }, + { + "epoch": 26.721437740693197, + "grad_norm": 1.2668417692184448, + "learning_rate": 2.4429610611895593e-05, + "loss": 0.368, + "step": 20816 + }, + { + "epoch": 26.72272143774069, + "grad_norm": 1.1963804960250854, + "learning_rate": 2.442918271287976e-05, + "loss": 0.3091, + "step": 20817 + }, + { + "epoch": 26.72400513478819, + "grad_norm": 1.0641295909881592, + "learning_rate": 2.4428754813863926e-05, + "loss": 0.3374, + "step": 20818 + }, + { + "epoch": 26.725288831835687, + "grad_norm": 1.1583683490753174, + "learning_rate": 2.4428326914848098e-05, + "loss": 0.373, + "step": 20819 + }, + { + "epoch": 26.726572528883185, + "grad_norm": 1.3184388875961304, + "learning_rate": 2.4427899015832263e-05, + "loss": 0.347, + "step": 20820 + }, + { + "epoch": 26.72785622593068, + "grad_norm": 1.1578338146209717, + "learning_rate": 2.4427471116816435e-05, + "loss": 0.3747, + "step": 20821 + }, + { + "epoch": 26.729139922978177, + "grad_norm": 0.9210330247879028, + "learning_rate": 2.44270432178006e-05, + "loss": 0.312, + "step": 20822 + }, + { + "epoch": 26.730423620025675, + "grad_norm": 1.1968497037887573, + "learning_rate": 2.4426615318784765e-05, + "loss": 0.3131, + "step": 20823 + }, + { + "epoch": 26.73170731707317, + "grad_norm": 1.134635090827942, + "learning_rate": 2.4426187419768937e-05, + "loss": 0.3661, + "step": 20824 + }, + { + "epoch": 26.732991014120667, + "grad_norm": 1.4023860692977905, + "learning_rate": 2.4425759520753102e-05, + "loss": 0.3082, + "step": 20825 + }, + { + "epoch": 26.734274711168165, + "grad_norm": 1.4779610633850098, + "learning_rate": 2.442533162173727e-05, + "loss": 0.3197, + "step": 20826 + }, + { + "epoch": 26.73555840821566, + "grad_norm": 1.130188226699829, + "learning_rate": 2.442490372272144e-05, + "loss": 0.3644, + "step": 20827 + }, + { + 
"epoch": 26.736842105263158, + "grad_norm": 1.4563082456588745, + "learning_rate": 2.4424475823705607e-05, + "loss": 0.3054, + "step": 20828 + }, + { + "epoch": 26.738125802310655, + "grad_norm": 1.3133985996246338, + "learning_rate": 2.4424047924689775e-05, + "loss": 0.3464, + "step": 20829 + }, + { + "epoch": 26.739409499358153, + "grad_norm": 5.650814056396484, + "learning_rate": 2.442362002567394e-05, + "loss": 0.3297, + "step": 20830 + }, + { + "epoch": 26.740693196405648, + "grad_norm": 0.9196216464042664, + "learning_rate": 2.442319212665811e-05, + "loss": 0.3069, + "step": 20831 + }, + { + "epoch": 26.741976893453145, + "grad_norm": 2.870236873626709, + "learning_rate": 2.4422764227642277e-05, + "loss": 0.3401, + "step": 20832 + }, + { + "epoch": 26.743260590500643, + "grad_norm": 3.787611961364746, + "learning_rate": 2.4422336328626446e-05, + "loss": 0.3268, + "step": 20833 + }, + { + "epoch": 26.744544287548138, + "grad_norm": 1.3070250749588013, + "learning_rate": 2.442190842961061e-05, + "loss": 0.3639, + "step": 20834 + }, + { + "epoch": 26.745827984595635, + "grad_norm": 1.6075407266616821, + "learning_rate": 2.4421480530594782e-05, + "loss": 0.341, + "step": 20835 + }, + { + "epoch": 26.747111681643133, + "grad_norm": 1.2623043060302734, + "learning_rate": 2.4421052631578948e-05, + "loss": 0.3818, + "step": 20836 + }, + { + "epoch": 26.748395378690628, + "grad_norm": 1.5064960718154907, + "learning_rate": 2.4420624732563116e-05, + "loss": 0.3496, + "step": 20837 + }, + { + "epoch": 26.749679075738126, + "grad_norm": 1.0914922952651978, + "learning_rate": 2.4420196833547284e-05, + "loss": 0.3314, + "step": 20838 + }, + { + "epoch": 26.750962772785623, + "grad_norm": 1.2368230819702148, + "learning_rate": 2.441976893453145e-05, + "loss": 0.3374, + "step": 20839 + }, + { + "epoch": 26.752246469833118, + "grad_norm": 1.384232521057129, + "learning_rate": 2.441934103551562e-05, + "loss": 0.3312, + "step": 20840 + }, + { + "epoch": 26.753530166880616, + 
"grad_norm": 1.379010558128357, + "learning_rate": 2.4418913136499786e-05, + "loss": 0.3484, + "step": 20841 + }, + { + "epoch": 26.754813863928113, + "grad_norm": 1.4303299188613892, + "learning_rate": 2.441848523748395e-05, + "loss": 0.3882, + "step": 20842 + }, + { + "epoch": 26.75609756097561, + "grad_norm": 1.1938470602035522, + "learning_rate": 2.4418057338468123e-05, + "loss": 0.3451, + "step": 20843 + }, + { + "epoch": 26.757381258023106, + "grad_norm": 1.2297885417938232, + "learning_rate": 2.4417629439452288e-05, + "loss": 0.3697, + "step": 20844 + }, + { + "epoch": 26.758664955070603, + "grad_norm": 1.2900454998016357, + "learning_rate": 2.441720154043646e-05, + "loss": 0.3885, + "step": 20845 + }, + { + "epoch": 26.7599486521181, + "grad_norm": 2.4759373664855957, + "learning_rate": 2.4416773641420625e-05, + "loss": 0.3318, + "step": 20846 + }, + { + "epoch": 26.761232349165596, + "grad_norm": 1.3170634508132935, + "learning_rate": 2.4416345742404793e-05, + "loss": 0.3178, + "step": 20847 + }, + { + "epoch": 26.762516046213094, + "grad_norm": 1.9376816749572754, + "learning_rate": 2.441591784338896e-05, + "loss": 0.3774, + "step": 20848 + }, + { + "epoch": 26.76379974326059, + "grad_norm": 1.83211088180542, + "learning_rate": 2.4415489944373127e-05, + "loss": 0.4066, + "step": 20849 + }, + { + "epoch": 26.765083440308086, + "grad_norm": 1.6272547245025635, + "learning_rate": 2.4415062045357295e-05, + "loss": 0.3797, + "step": 20850 + }, + { + "epoch": 26.766367137355584, + "grad_norm": 2.853050708770752, + "learning_rate": 2.4414634146341464e-05, + "loss": 0.3991, + "step": 20851 + }, + { + "epoch": 26.76765083440308, + "grad_norm": 1.588840126991272, + "learning_rate": 2.4414206247325632e-05, + "loss": 0.4285, + "step": 20852 + }, + { + "epoch": 26.76893453145058, + "grad_norm": 1.9739195108413696, + "learning_rate": 2.44137783483098e-05, + "loss": 0.4292, + "step": 20853 + }, + { + "epoch": 26.770218228498074, + "grad_norm": 4.285040855407715, + 
"learning_rate": 2.441335044929397e-05, + "loss": 0.478, + "step": 20854 + }, + { + "epoch": 26.77150192554557, + "grad_norm": 1.0898326635360718, + "learning_rate": 2.4412922550278134e-05, + "loss": 0.3173, + "step": 20855 + }, + { + "epoch": 26.77278562259307, + "grad_norm": 1.0643384456634521, + "learning_rate": 2.4412494651262302e-05, + "loss": 0.3202, + "step": 20856 + }, + { + "epoch": 26.774069319640564, + "grad_norm": 0.7994303107261658, + "learning_rate": 2.441206675224647e-05, + "loss": 0.3161, + "step": 20857 + }, + { + "epoch": 26.77535301668806, + "grad_norm": 0.9141993522644043, + "learning_rate": 2.4411638853230636e-05, + "loss": 0.3452, + "step": 20858 + }, + { + "epoch": 26.77663671373556, + "grad_norm": 0.9192059636116028, + "learning_rate": 2.4411210954214807e-05, + "loss": 0.3385, + "step": 20859 + }, + { + "epoch": 26.777920410783054, + "grad_norm": 1.5230568647384644, + "learning_rate": 2.4410783055198972e-05, + "loss": 0.3435, + "step": 20860 + }, + { + "epoch": 26.77920410783055, + "grad_norm": 1.449508547782898, + "learning_rate": 2.4410355156183144e-05, + "loss": 0.3504, + "step": 20861 + }, + { + "epoch": 26.78048780487805, + "grad_norm": 1.6515092849731445, + "learning_rate": 2.440992725716731e-05, + "loss": 0.3374, + "step": 20862 + }, + { + "epoch": 26.781771501925547, + "grad_norm": 0.8839256167411804, + "learning_rate": 2.4409499358151474e-05, + "loss": 0.3319, + "step": 20863 + }, + { + "epoch": 26.78305519897304, + "grad_norm": 1.0568705797195435, + "learning_rate": 2.4409071459135646e-05, + "loss": 0.3605, + "step": 20864 + }, + { + "epoch": 26.78433889602054, + "grad_norm": 2.341325521469116, + "learning_rate": 2.440864356011981e-05, + "loss": 0.3194, + "step": 20865 + }, + { + "epoch": 26.785622593068037, + "grad_norm": 1.842786192893982, + "learning_rate": 2.440821566110398e-05, + "loss": 0.36, + "step": 20866 + }, + { + "epoch": 26.78690629011553, + "grad_norm": 1.5876784324645996, + "learning_rate": 2.4407787762088148e-05, + 
"loss": 0.3287, + "step": 20867 + }, + { + "epoch": 26.78818998716303, + "grad_norm": 1.2868624925613403, + "learning_rate": 2.4407359863072316e-05, + "loss": 0.3186, + "step": 20868 + }, + { + "epoch": 26.789473684210527, + "grad_norm": 1.4693397283554077, + "learning_rate": 2.4406931964056485e-05, + "loss": 0.3536, + "step": 20869 + }, + { + "epoch": 26.79075738125802, + "grad_norm": 1.3177235126495361, + "learning_rate": 2.440650406504065e-05, + "loss": 0.3399, + "step": 20870 + }, + { + "epoch": 26.79204107830552, + "grad_norm": 1.5688138008117676, + "learning_rate": 2.4406076166024818e-05, + "loss": 0.3161, + "step": 20871 + }, + { + "epoch": 26.793324775353017, + "grad_norm": 2.0330634117126465, + "learning_rate": 2.4405648267008987e-05, + "loss": 0.3015, + "step": 20872 + }, + { + "epoch": 26.794608472400512, + "grad_norm": 1.0704485177993774, + "learning_rate": 2.4405220367993155e-05, + "loss": 0.3407, + "step": 20873 + }, + { + "epoch": 26.79589216944801, + "grad_norm": 0.9582903981208801, + "learning_rate": 2.440479246897732e-05, + "loss": 0.311, + "step": 20874 + }, + { + "epoch": 26.797175866495508, + "grad_norm": 1.4726672172546387, + "learning_rate": 2.4404364569961492e-05, + "loss": 0.3759, + "step": 20875 + }, + { + "epoch": 26.798459563543005, + "grad_norm": 0.9884264469146729, + "learning_rate": 2.4403936670945657e-05, + "loss": 0.3181, + "step": 20876 + }, + { + "epoch": 26.7997432605905, + "grad_norm": 1.8583526611328125, + "learning_rate": 2.4403508771929825e-05, + "loss": 0.3185, + "step": 20877 + }, + { + "epoch": 26.801026957637998, + "grad_norm": 1.7308465242385864, + "learning_rate": 2.4403080872913994e-05, + "loss": 0.2867, + "step": 20878 + }, + { + "epoch": 26.802310654685495, + "grad_norm": 1.3532730340957642, + "learning_rate": 2.440265297389816e-05, + "loss": 0.336, + "step": 20879 + }, + { + "epoch": 26.80359435173299, + "grad_norm": 1.055752158164978, + "learning_rate": 2.440222507488233e-05, + "loss": 0.3232, + "step": 20880 + }, 
+ { + "epoch": 26.804878048780488, + "grad_norm": 1.2018519639968872, + "learning_rate": 2.4401797175866496e-05, + "loss": 0.3546, + "step": 20881 + }, + { + "epoch": 26.806161745827985, + "grad_norm": 1.7656073570251465, + "learning_rate": 2.4401369276850664e-05, + "loss": 0.3054, + "step": 20882 + }, + { + "epoch": 26.80744544287548, + "grad_norm": 2.6735944747924805, + "learning_rate": 2.4400941377834832e-05, + "loss": 0.3431, + "step": 20883 + }, + { + "epoch": 26.808729139922978, + "grad_norm": 2.741210460662842, + "learning_rate": 2.4400513478818997e-05, + "loss": 0.3093, + "step": 20884 + }, + { + "epoch": 26.810012836970476, + "grad_norm": 1.2256875038146973, + "learning_rate": 2.440008557980317e-05, + "loss": 0.3047, + "step": 20885 + }, + { + "epoch": 26.811296534017973, + "grad_norm": 1.3274853229522705, + "learning_rate": 2.4399657680787334e-05, + "loss": 0.3234, + "step": 20886 + }, + { + "epoch": 26.812580231065468, + "grad_norm": 1.3601747751235962, + "learning_rate": 2.4399229781771503e-05, + "loss": 0.3157, + "step": 20887 + }, + { + "epoch": 26.813863928112966, + "grad_norm": 1.3091844320297241, + "learning_rate": 2.439880188275567e-05, + "loss": 0.3241, + "step": 20888 + }, + { + "epoch": 26.815147625160463, + "grad_norm": 1.577842116355896, + "learning_rate": 2.439837398373984e-05, + "loss": 0.3589, + "step": 20889 + }, + { + "epoch": 26.816431322207958, + "grad_norm": 6.293832302093506, + "learning_rate": 2.4397946084724004e-05, + "loss": 0.3313, + "step": 20890 + }, + { + "epoch": 26.817715019255456, + "grad_norm": 1.494102954864502, + "learning_rate": 2.4397518185708173e-05, + "loss": 0.3435, + "step": 20891 + }, + { + "epoch": 26.818998716302954, + "grad_norm": 1.7544058561325073, + "learning_rate": 2.439709028669234e-05, + "loss": 0.3579, + "step": 20892 + }, + { + "epoch": 26.820282413350448, + "grad_norm": 1.6688807010650635, + "learning_rate": 2.439666238767651e-05, + "loss": 0.3496, + "step": 20893 + }, + { + "epoch": 
26.821566110397946, + "grad_norm": 1.4939593076705933, + "learning_rate": 2.4396234488660678e-05, + "loss": 0.3627, + "step": 20894 + }, + { + "epoch": 26.822849807445444, + "grad_norm": 3.531174898147583, + "learning_rate": 2.4395806589644843e-05, + "loss": 0.3888, + "step": 20895 + }, + { + "epoch": 26.82413350449294, + "grad_norm": 2.8009262084960938, + "learning_rate": 2.4395378690629015e-05, + "loss": 0.4079, + "step": 20896 + }, + { + "epoch": 26.825417201540436, + "grad_norm": 2.865934371948242, + "learning_rate": 2.439495079161318e-05, + "loss": 0.3564, + "step": 20897 + }, + { + "epoch": 26.826700898587934, + "grad_norm": 2.5853466987609863, + "learning_rate": 2.4394522892597345e-05, + "loss": 0.3933, + "step": 20898 + }, + { + "epoch": 26.82798459563543, + "grad_norm": 1.2448898553848267, + "learning_rate": 2.4394094993581517e-05, + "loss": 0.3723, + "step": 20899 + }, + { + "epoch": 26.829268292682926, + "grad_norm": 2.0512311458587646, + "learning_rate": 2.4393667094565682e-05, + "loss": 0.3937, + "step": 20900 + }, + { + "epoch": 26.830551989730424, + "grad_norm": 1.1867045164108276, + "learning_rate": 2.4393239195549854e-05, + "loss": 0.4121, + "step": 20901 + }, + { + "epoch": 26.83183568677792, + "grad_norm": 1.6862025260925293, + "learning_rate": 2.439281129653402e-05, + "loss": 0.4181, + "step": 20902 + }, + { + "epoch": 26.833119383825416, + "grad_norm": 1.6935912370681763, + "learning_rate": 2.4392383397518184e-05, + "loss": 0.3928, + "step": 20903 + }, + { + "epoch": 26.834403080872914, + "grad_norm": 4.122379302978516, + "learning_rate": 2.4391955498502355e-05, + "loss": 0.5267, + "step": 20904 + }, + { + "epoch": 26.83568677792041, + "grad_norm": 1.2690246105194092, + "learning_rate": 2.439152759948652e-05, + "loss": 0.3268, + "step": 20905 + }, + { + "epoch": 26.836970474967906, + "grad_norm": 2.88625431060791, + "learning_rate": 2.439109970047069e-05, + "loss": 0.3403, + "step": 20906 + }, + { + "epoch": 26.838254172015404, + "grad_norm": 
1.5683209896087646, + "learning_rate": 2.4390671801454857e-05, + "loss": 0.3408, + "step": 20907 + }, + { + "epoch": 26.8395378690629, + "grad_norm": 1.56614089012146, + "learning_rate": 2.4390243902439026e-05, + "loss": 0.3356, + "step": 20908 + }, + { + "epoch": 26.8408215661104, + "grad_norm": 2.425556182861328, + "learning_rate": 2.4389816003423194e-05, + "loss": 0.3542, + "step": 20909 + }, + { + "epoch": 26.842105263157894, + "grad_norm": 1.043408989906311, + "learning_rate": 2.438938810440736e-05, + "loss": 0.3029, + "step": 20910 + }, + { + "epoch": 26.84338896020539, + "grad_norm": 1.2138580083847046, + "learning_rate": 2.4388960205391528e-05, + "loss": 0.3527, + "step": 20911 + }, + { + "epoch": 26.84467265725289, + "grad_norm": 1.0834273099899292, + "learning_rate": 2.4388532306375696e-05, + "loss": 0.3195, + "step": 20912 + }, + { + "epoch": 26.845956354300384, + "grad_norm": 3.3616745471954346, + "learning_rate": 2.4388104407359864e-05, + "loss": 0.321, + "step": 20913 + }, + { + "epoch": 26.84724005134788, + "grad_norm": 1.0101135969161987, + "learning_rate": 2.438767650834403e-05, + "loss": 0.3246, + "step": 20914 + }, + { + "epoch": 26.84852374839538, + "grad_norm": 1.004429817199707, + "learning_rate": 2.43872486093282e-05, + "loss": 0.3506, + "step": 20915 + }, + { + "epoch": 26.849807445442874, + "grad_norm": 1.3683699369430542, + "learning_rate": 2.4386820710312366e-05, + "loss": 0.3426, + "step": 20916 + }, + { + "epoch": 26.85109114249037, + "grad_norm": 2.334221363067627, + "learning_rate": 2.4386392811296535e-05, + "loss": 0.3334, + "step": 20917 + }, + { + "epoch": 26.85237483953787, + "grad_norm": 1.2104699611663818, + "learning_rate": 2.4385964912280703e-05, + "loss": 0.3388, + "step": 20918 + }, + { + "epoch": 26.853658536585368, + "grad_norm": 1.0805597305297852, + "learning_rate": 2.4385537013264868e-05, + "loss": 0.3139, + "step": 20919 + }, + { + "epoch": 26.854942233632862, + "grad_norm": 1.5024363994598389, + "learning_rate": 
2.438510911424904e-05, + "loss": 0.372, + "step": 20920 + }, + { + "epoch": 26.85622593068036, + "grad_norm": 1.2855777740478516, + "learning_rate": 2.4384681215233205e-05, + "loss": 0.3441, + "step": 20921 + }, + { + "epoch": 26.857509627727858, + "grad_norm": 2.0619094371795654, + "learning_rate": 2.4384253316217373e-05, + "loss": 0.321, + "step": 20922 + }, + { + "epoch": 26.858793324775352, + "grad_norm": 0.9751520752906799, + "learning_rate": 2.4383825417201542e-05, + "loss": 0.3484, + "step": 20923 + }, + { + "epoch": 26.86007702182285, + "grad_norm": 3.5488619804382324, + "learning_rate": 2.4383397518185707e-05, + "loss": 0.3399, + "step": 20924 + }, + { + "epoch": 26.861360718870348, + "grad_norm": 0.8574868440628052, + "learning_rate": 2.438296961916988e-05, + "loss": 0.3321, + "step": 20925 + }, + { + "epoch": 26.862644415917842, + "grad_norm": 1.3631924390792847, + "learning_rate": 2.4382541720154044e-05, + "loss": 0.349, + "step": 20926 + }, + { + "epoch": 26.86392811296534, + "grad_norm": 5.337093830108643, + "learning_rate": 2.4382113821138212e-05, + "loss": 0.3307, + "step": 20927 + }, + { + "epoch": 26.865211810012838, + "grad_norm": 1.3014118671417236, + "learning_rate": 2.438168592212238e-05, + "loss": 0.3301, + "step": 20928 + }, + { + "epoch": 26.866495507060336, + "grad_norm": 1.0570133924484253, + "learning_rate": 2.438125802310655e-05, + "loss": 0.2954, + "step": 20929 + }, + { + "epoch": 26.86777920410783, + "grad_norm": 2.6531982421875, + "learning_rate": 2.4380830124090714e-05, + "loss": 0.2981, + "step": 20930 + }, + { + "epoch": 26.869062901155328, + "grad_norm": 1.2466833591461182, + "learning_rate": 2.4380402225074882e-05, + "loss": 0.3733, + "step": 20931 + }, + { + "epoch": 26.870346598202826, + "grad_norm": 0.8025670051574707, + "learning_rate": 2.437997432605905e-05, + "loss": 0.2968, + "step": 20932 + }, + { + "epoch": 26.87163029525032, + "grad_norm": 0.9416711330413818, + "learning_rate": 2.437954642704322e-05, + "loss": 0.3276, 
+ "step": 20933 + }, + { + "epoch": 26.872913992297818, + "grad_norm": 1.2399626970291138, + "learning_rate": 2.4379118528027387e-05, + "loss": 0.3339, + "step": 20934 + }, + { + "epoch": 26.874197689345316, + "grad_norm": 1.024865746498108, + "learning_rate": 2.4378690629011553e-05, + "loss": 0.3213, + "step": 20935 + }, + { + "epoch": 26.87548138639281, + "grad_norm": 1.327388882637024, + "learning_rate": 2.4378262729995724e-05, + "loss": 0.3167, + "step": 20936 + }, + { + "epoch": 26.876765083440308, + "grad_norm": 2.087181568145752, + "learning_rate": 2.437783483097989e-05, + "loss": 0.3514, + "step": 20937 + }, + { + "epoch": 26.878048780487806, + "grad_norm": 2.690804958343506, + "learning_rate": 2.4377406931964054e-05, + "loss": 0.3688, + "step": 20938 + }, + { + "epoch": 26.8793324775353, + "grad_norm": 1.4152181148529053, + "learning_rate": 2.4376979032948226e-05, + "loss": 0.3405, + "step": 20939 + }, + { + "epoch": 26.880616174582798, + "grad_norm": 2.624511480331421, + "learning_rate": 2.437655113393239e-05, + "loss": 0.3205, + "step": 20940 + }, + { + "epoch": 26.881899871630296, + "grad_norm": 1.8146923780441284, + "learning_rate": 2.4376123234916563e-05, + "loss": 0.3329, + "step": 20941 + }, + { + "epoch": 26.883183568677794, + "grad_norm": 1.553488850593567, + "learning_rate": 2.4375695335900728e-05, + "loss": 0.3272, + "step": 20942 + }, + { + "epoch": 26.884467265725288, + "grad_norm": 1.9248898029327393, + "learning_rate": 2.4375267436884896e-05, + "loss": 0.3277, + "step": 20943 + }, + { + "epoch": 26.885750962772786, + "grad_norm": 5.983360767364502, + "learning_rate": 2.4374839537869065e-05, + "loss": 0.3276, + "step": 20944 + }, + { + "epoch": 26.887034659820284, + "grad_norm": 1.9996572732925415, + "learning_rate": 2.437441163885323e-05, + "loss": 0.3175, + "step": 20945 + }, + { + "epoch": 26.888318356867778, + "grad_norm": 1.7071784734725952, + "learning_rate": 2.4373983739837398e-05, + "loss": 0.3671, + "step": 20946 + }, + { + "epoch": 
26.889602053915276, + "grad_norm": 2.098259449005127, + "learning_rate": 2.4373555840821567e-05, + "loss": 0.3546, + "step": 20947 + }, + { + "epoch": 26.890885750962774, + "grad_norm": 3.2582523822784424, + "learning_rate": 2.4373127941805735e-05, + "loss": 0.3256, + "step": 20948 + }, + { + "epoch": 26.892169448010268, + "grad_norm": 1.4967117309570312, + "learning_rate": 2.4372700042789904e-05, + "loss": 0.437, + "step": 20949 + }, + { + "epoch": 26.893453145057766, + "grad_norm": 1.9795291423797607, + "learning_rate": 2.4372272143774072e-05, + "loss": 0.3907, + "step": 20950 + }, + { + "epoch": 26.894736842105264, + "grad_norm": 1.8294874429702759, + "learning_rate": 2.4371844244758237e-05, + "loss": 0.3775, + "step": 20951 + }, + { + "epoch": 26.89602053915276, + "grad_norm": 2.9613444805145264, + "learning_rate": 2.4371416345742405e-05, + "loss": 0.3984, + "step": 20952 + }, + { + "epoch": 26.897304236200256, + "grad_norm": 2.0308902263641357, + "learning_rate": 2.4370988446726574e-05, + "loss": 0.4546, + "step": 20953 + }, + { + "epoch": 26.898587933247754, + "grad_norm": 2.2297325134277344, + "learning_rate": 2.437056054771074e-05, + "loss": 0.5194, + "step": 20954 + }, + { + "epoch": 26.89987163029525, + "grad_norm": 1.093476414680481, + "learning_rate": 2.437013264869491e-05, + "loss": 0.3146, + "step": 20955 + }, + { + "epoch": 26.901155327342746, + "grad_norm": 1.0552997589111328, + "learning_rate": 2.4369704749679076e-05, + "loss": 0.3218, + "step": 20956 + }, + { + "epoch": 26.902439024390244, + "grad_norm": 1.3966400623321533, + "learning_rate": 2.4369276850663247e-05, + "loss": 0.3243, + "step": 20957 + }, + { + "epoch": 26.90372272143774, + "grad_norm": 1.7212518453598022, + "learning_rate": 2.4368848951647412e-05, + "loss": 0.339, + "step": 20958 + }, + { + "epoch": 26.905006418485236, + "grad_norm": 1.2040334939956665, + "learning_rate": 2.4368421052631577e-05, + "loss": 0.3429, + "step": 20959 + }, + { + "epoch": 26.906290115532734, + 
"grad_norm": 0.956622838973999, + "learning_rate": 2.436799315361575e-05, + "loss": 0.3005, + "step": 20960 + }, + { + "epoch": 26.90757381258023, + "grad_norm": 1.5401211977005005, + "learning_rate": 2.4367565254599914e-05, + "loss": 0.322, + "step": 20961 + }, + { + "epoch": 26.90885750962773, + "grad_norm": 0.72726970911026, + "learning_rate": 2.4367137355584083e-05, + "loss": 0.3082, + "step": 20962 + }, + { + "epoch": 26.910141206675224, + "grad_norm": 1.37029230594635, + "learning_rate": 2.436670945656825e-05, + "loss": 0.3454, + "step": 20963 + }, + { + "epoch": 26.911424903722722, + "grad_norm": 1.465407133102417, + "learning_rate": 2.436628155755242e-05, + "loss": 0.3425, + "step": 20964 + }, + { + "epoch": 26.91270860077022, + "grad_norm": 1.0848565101623535, + "learning_rate": 2.4365853658536588e-05, + "loss": 0.3654, + "step": 20965 + }, + { + "epoch": 26.913992297817714, + "grad_norm": 1.4588470458984375, + "learning_rate": 2.4365425759520753e-05, + "loss": 0.3234, + "step": 20966 + }, + { + "epoch": 26.915275994865212, + "grad_norm": 0.9135245084762573, + "learning_rate": 2.436499786050492e-05, + "loss": 0.31, + "step": 20967 + }, + { + "epoch": 26.91655969191271, + "grad_norm": 0.9957262873649597, + "learning_rate": 2.436456996148909e-05, + "loss": 0.3149, + "step": 20968 + }, + { + "epoch": 26.917843388960204, + "grad_norm": 1.1300746202468872, + "learning_rate": 2.4364142062473258e-05, + "loss": 0.3469, + "step": 20969 + }, + { + "epoch": 26.919127086007702, + "grad_norm": 1.4145504236221313, + "learning_rate": 2.4363714163457423e-05, + "loss": 0.3281, + "step": 20970 + }, + { + "epoch": 26.9204107830552, + "grad_norm": 1.1122626066207886, + "learning_rate": 2.436328626444159e-05, + "loss": 0.3044, + "step": 20971 + }, + { + "epoch": 26.921694480102694, + "grad_norm": 1.7759273052215576, + "learning_rate": 2.436285836542576e-05, + "loss": 0.2929, + "step": 20972 + }, + { + "epoch": 26.922978177150192, + "grad_norm": 1.5961600542068481, + 
"learning_rate": 2.4362430466409925e-05, + "loss": 0.3483, + "step": 20973 + }, + { + "epoch": 26.92426187419769, + "grad_norm": 1.101108193397522, + "learning_rate": 2.4362002567394097e-05, + "loss": 0.3528, + "step": 20974 + }, + { + "epoch": 26.925545571245188, + "grad_norm": 1.4523717164993286, + "learning_rate": 2.4361574668378262e-05, + "loss": 0.3362, + "step": 20975 + }, + { + "epoch": 26.926829268292682, + "grad_norm": 1.7268255949020386, + "learning_rate": 2.4361146769362434e-05, + "loss": 0.331, + "step": 20976 + }, + { + "epoch": 26.92811296534018, + "grad_norm": 2.112178325653076, + "learning_rate": 2.43607188703466e-05, + "loss": 0.3323, + "step": 20977 + }, + { + "epoch": 26.929396662387678, + "grad_norm": 1.8284597396850586, + "learning_rate": 2.4360290971330764e-05, + "loss": 0.3167, + "step": 20978 + }, + { + "epoch": 26.930680359435172, + "grad_norm": 2.52108097076416, + "learning_rate": 2.4359863072314936e-05, + "loss": 0.3516, + "step": 20979 + }, + { + "epoch": 26.93196405648267, + "grad_norm": 2.4633545875549316, + "learning_rate": 2.43594351732991e-05, + "loss": 0.2889, + "step": 20980 + }, + { + "epoch": 26.933247753530168, + "grad_norm": 4.316934585571289, + "learning_rate": 2.435900727428327e-05, + "loss": 0.318, + "step": 20981 + }, + { + "epoch": 26.934531450577662, + "grad_norm": 3.504901647567749, + "learning_rate": 2.4358579375267437e-05, + "loss": 0.3744, + "step": 20982 + }, + { + "epoch": 26.93581514762516, + "grad_norm": 1.5004702806472778, + "learning_rate": 2.4358151476251606e-05, + "loss": 0.3586, + "step": 20983 + }, + { + "epoch": 26.937098844672658, + "grad_norm": 1.8340778350830078, + "learning_rate": 2.4357723577235774e-05, + "loss": 0.3463, + "step": 20984 + }, + { + "epoch": 26.938382541720156, + "grad_norm": 1.4099376201629639, + "learning_rate": 2.435729567821994e-05, + "loss": 0.3341, + "step": 20985 + }, + { + "epoch": 26.93966623876765, + "grad_norm": 6.4438676834106445, + "learning_rate": 2.4356867779204108e-05, + 
"loss": 0.3764, + "step": 20986 + }, + { + "epoch": 26.940949935815148, + "grad_norm": 1.7943097352981567, + "learning_rate": 2.4356439880188276e-05, + "loss": 0.3719, + "step": 20987 + }, + { + "epoch": 26.942233632862646, + "grad_norm": 8.145584106445312, + "learning_rate": 2.4356011981172444e-05, + "loss": 0.3658, + "step": 20988 + }, + { + "epoch": 26.94351732991014, + "grad_norm": 1.3107961416244507, + "learning_rate": 2.435558408215661e-05, + "loss": 0.3538, + "step": 20989 + }, + { + "epoch": 26.944801026957638, + "grad_norm": 1.3244843482971191, + "learning_rate": 2.435515618314078e-05, + "loss": 0.3305, + "step": 20990 + }, + { + "epoch": 26.946084724005136, + "grad_norm": 1.8671057224273682, + "learning_rate": 2.4354728284124946e-05, + "loss": 0.3572, + "step": 20991 + }, + { + "epoch": 26.94736842105263, + "grad_norm": 1.3921345472335815, + "learning_rate": 2.4354300385109115e-05, + "loss": 0.3841, + "step": 20992 + }, + { + "epoch": 26.948652118100128, + "grad_norm": 2.150524139404297, + "learning_rate": 2.4353872486093283e-05, + "loss": 0.3607, + "step": 20993 + }, + { + "epoch": 26.949935815147626, + "grad_norm": 6.342907905578613, + "learning_rate": 2.4353444587077448e-05, + "loss": 0.3524, + "step": 20994 + }, + { + "epoch": 26.951219512195124, + "grad_norm": 2.4568071365356445, + "learning_rate": 2.435301668806162e-05, + "loss": 0.3316, + "step": 20995 + }, + { + "epoch": 26.952503209242618, + "grad_norm": 4.503480434417725, + "learning_rate": 2.4352588789045785e-05, + "loss": 0.3479, + "step": 20996 + }, + { + "epoch": 26.953786906290116, + "grad_norm": 1.9099665880203247, + "learning_rate": 2.4352160890029953e-05, + "loss": 0.3898, + "step": 20997 + }, + { + "epoch": 26.955070603337614, + "grad_norm": 1.6102116107940674, + "learning_rate": 2.4351732991014122e-05, + "loss": 0.3431, + "step": 20998 + }, + { + "epoch": 26.956354300385108, + "grad_norm": 1.5452834367752075, + "learning_rate": 2.4351305091998287e-05, + "loss": 0.3467, + "step": 20999 
+ }, + { + "epoch": 26.957637997432606, + "grad_norm": 2.741219997406006, + "learning_rate": 2.435087719298246e-05, + "loss": 0.4002, + "step": 21000 + }, + { + "epoch": 26.957637997432606, + "eval_cer": 0.2664969649500685, + "eval_loss": 0.4831768572330475, + "eval_runtime": 14.4258, + "eval_samples_per_second": 68.142, + "eval_steps_per_second": 0.485, + "eval_wer": 0.4618206833295116, + "step": 21000 + }, + { + "epoch": 26.958921694480104, + "grad_norm": 3.1986143589019775, + "learning_rate": 2.4350449293966624e-05, + "loss": 0.4167, + "step": 21001 + }, + { + "epoch": 26.960205391527598, + "grad_norm": 1.793992042541504, + "learning_rate": 2.4350021394950792e-05, + "loss": 0.417, + "step": 21002 + }, + { + "epoch": 26.961489088575096, + "grad_norm": 2.3562915325164795, + "learning_rate": 2.434959349593496e-05, + "loss": 0.3869, + "step": 21003 + }, + { + "epoch": 26.962772785622594, + "grad_norm": 3.4144480228424072, + "learning_rate": 2.434916559691913e-05, + "loss": 0.4737, + "step": 21004 + }, + { + "epoch": 26.964056482670088, + "grad_norm": 1.0857322216033936, + "learning_rate": 2.4348737697903294e-05, + "loss": 0.2892, + "step": 21005 + }, + { + "epoch": 26.965340179717586, + "grad_norm": 1.3111330270767212, + "learning_rate": 2.4348309798887462e-05, + "loss": 0.3136, + "step": 21006 + }, + { + "epoch": 26.966623876765084, + "grad_norm": 1.5786182880401611, + "learning_rate": 2.434788189987163e-05, + "loss": 0.338, + "step": 21007 + }, + { + "epoch": 26.96790757381258, + "grad_norm": 1.649882435798645, + "learning_rate": 2.43474540008558e-05, + "loss": 0.3266, + "step": 21008 + }, + { + "epoch": 26.969191270860076, + "grad_norm": 1.0791327953338623, + "learning_rate": 2.4347026101839968e-05, + "loss": 0.3254, + "step": 21009 + }, + { + "epoch": 26.970474967907574, + "grad_norm": 2.5126631259918213, + "learning_rate": 2.4346598202824133e-05, + "loss": 0.3027, + "step": 21010 + }, + { + "epoch": 26.971758664955072, + "grad_norm": 1.4201337099075317, + 
"learning_rate": 2.4346170303808304e-05, + "loss": 0.3351, + "step": 21011 + }, + { + "epoch": 26.973042362002566, + "grad_norm": 1.694169521331787, + "learning_rate": 2.434574240479247e-05, + "loss": 0.3142, + "step": 21012 + }, + { + "epoch": 26.974326059050064, + "grad_norm": 1.135298252105713, + "learning_rate": 2.4345314505776634e-05, + "loss": 0.3364, + "step": 21013 + }, + { + "epoch": 26.975609756097562, + "grad_norm": 1.7191401720046997, + "learning_rate": 2.4344886606760806e-05, + "loss": 0.3492, + "step": 21014 + }, + { + "epoch": 26.976893453145056, + "grad_norm": 1.140726923942566, + "learning_rate": 2.434445870774497e-05, + "loss": 0.3211, + "step": 21015 + }, + { + "epoch": 26.978177150192554, + "grad_norm": 1.4382697343826294, + "learning_rate": 2.4344030808729143e-05, + "loss": 0.3098, + "step": 21016 + }, + { + "epoch": 26.979460847240052, + "grad_norm": 1.5307430028915405, + "learning_rate": 2.4343602909713308e-05, + "loss": 0.3406, + "step": 21017 + }, + { + "epoch": 26.98074454428755, + "grad_norm": 1.1481761932373047, + "learning_rate": 2.4343175010697476e-05, + "loss": 0.3358, + "step": 21018 + }, + { + "epoch": 26.982028241335044, + "grad_norm": 1.7377967834472656, + "learning_rate": 2.4342747111681645e-05, + "loss": 0.355, + "step": 21019 + }, + { + "epoch": 26.983311938382542, + "grad_norm": 1.6677101850509644, + "learning_rate": 2.434231921266581e-05, + "loss": 0.341, + "step": 21020 + }, + { + "epoch": 26.98459563543004, + "grad_norm": 3.0562918186187744, + "learning_rate": 2.434189131364998e-05, + "loss": 0.3427, + "step": 21021 + }, + { + "epoch": 26.985879332477534, + "grad_norm": 1.2763301134109497, + "learning_rate": 2.4341463414634147e-05, + "loss": 0.3081, + "step": 21022 + }, + { + "epoch": 26.987163029525032, + "grad_norm": 1.5962932109832764, + "learning_rate": 2.4341035515618315e-05, + "loss": 0.3048, + "step": 21023 + }, + { + "epoch": 26.98844672657253, + "grad_norm": 6.156511306762695, + "learning_rate": 
2.4340607616602484e-05, + "loss": 0.3707, + "step": 21024 + }, + { + "epoch": 26.989730423620024, + "grad_norm": 2.3358397483825684, + "learning_rate": 2.4340179717586652e-05, + "loss": 0.357, + "step": 21025 + }, + { + "epoch": 26.991014120667522, + "grad_norm": 1.1911120414733887, + "learning_rate": 2.4339751818570817e-05, + "loss": 0.3442, + "step": 21026 + }, + { + "epoch": 26.99229781771502, + "grad_norm": 2.377906084060669, + "learning_rate": 2.4339323919554985e-05, + "loss": 0.3555, + "step": 21027 + }, + { + "epoch": 26.993581514762518, + "grad_norm": 2.350461483001709, + "learning_rate": 2.4338896020539154e-05, + "loss": 0.3722, + "step": 21028 + }, + { + "epoch": 26.994865211810012, + "grad_norm": 2.6497721672058105, + "learning_rate": 2.433846812152332e-05, + "loss": 0.3574, + "step": 21029 + }, + { + "epoch": 26.99614890885751, + "grad_norm": 1.9527041912078857, + "learning_rate": 2.433804022250749e-05, + "loss": 0.3857, + "step": 21030 + }, + { + "epoch": 26.997432605905008, + "grad_norm": 1.327915072441101, + "learning_rate": 2.4337612323491656e-05, + "loss": 0.3865, + "step": 21031 + }, + { + "epoch": 26.998716302952502, + "grad_norm": 3.5364580154418945, + "learning_rate": 2.4337184424475824e-05, + "loss": 0.4921, + "step": 21032 + }, + { + "epoch": 27.0, + "grad_norm": 3.5974485874176025, + "learning_rate": 2.4336756525459992e-05, + "loss": 0.6405, + "step": 21033 + }, + { + "epoch": 27.001283697047498, + "grad_norm": 1.1431427001953125, + "learning_rate": 2.4336328626444158e-05, + "loss": 0.3013, + "step": 21034 + }, + { + "epoch": 27.002567394094992, + "grad_norm": 0.6255560517311096, + "learning_rate": 2.433590072742833e-05, + "loss": 0.2937, + "step": 21035 + }, + { + "epoch": 27.00385109114249, + "grad_norm": 1.0300949811935425, + "learning_rate": 2.4335472828412494e-05, + "loss": 0.2712, + "step": 21036 + }, + { + "epoch": 27.005134788189988, + "grad_norm": 1.4001173973083496, + "learning_rate": 2.4335044929396663e-05, + "loss": 0.3282, + 
"step": 21037 + }, + { + "epoch": 27.006418485237482, + "grad_norm": 0.9652799367904663, + "learning_rate": 2.433461703038083e-05, + "loss": 0.306, + "step": 21038 + }, + { + "epoch": 27.00770218228498, + "grad_norm": 1.6129086017608643, + "learning_rate": 2.4334189131364996e-05, + "loss": 0.2925, + "step": 21039 + }, + { + "epoch": 27.008985879332478, + "grad_norm": 1.0385158061981201, + "learning_rate": 2.4333761232349168e-05, + "loss": 0.3265, + "step": 21040 + }, + { + "epoch": 27.010269576379976, + "grad_norm": 1.3413934707641602, + "learning_rate": 2.4333333333333333e-05, + "loss": 0.3224, + "step": 21041 + }, + { + "epoch": 27.01155327342747, + "grad_norm": 1.050421953201294, + "learning_rate": 2.43329054343175e-05, + "loss": 0.2941, + "step": 21042 + }, + { + "epoch": 27.012836970474968, + "grad_norm": 1.1310317516326904, + "learning_rate": 2.433247753530167e-05, + "loss": 0.3101, + "step": 21043 + }, + { + "epoch": 27.014120667522466, + "grad_norm": 0.9799347519874573, + "learning_rate": 2.4332049636285838e-05, + "loss": 0.3345, + "step": 21044 + }, + { + "epoch": 27.01540436456996, + "grad_norm": 1.0178154706954956, + "learning_rate": 2.4331621737270003e-05, + "loss": 0.3028, + "step": 21045 + }, + { + "epoch": 27.016688061617458, + "grad_norm": 1.9230294227600098, + "learning_rate": 2.433119383825417e-05, + "loss": 0.331, + "step": 21046 + }, + { + "epoch": 27.017971758664956, + "grad_norm": 1.729784369468689, + "learning_rate": 2.433076593923834e-05, + "loss": 0.3313, + "step": 21047 + }, + { + "epoch": 27.01925545571245, + "grad_norm": 1.8932020664215088, + "learning_rate": 2.433033804022251e-05, + "loss": 0.2947, + "step": 21048 + }, + { + "epoch": 27.020539152759948, + "grad_norm": 2.2156262397766113, + "learning_rate": 2.4329910141206677e-05, + "loss": 0.2949, + "step": 21049 + }, + { + "epoch": 27.021822849807446, + "grad_norm": 1.3661056756973267, + "learning_rate": 2.4329482242190842e-05, + "loss": 0.3098, + "step": 21050 + }, + { + "epoch": 
27.023106546854944, + "grad_norm": 1.3851321935653687, + "learning_rate": 2.4329054343175014e-05, + "loss": 0.2741, + "step": 21051 + }, + { + "epoch": 27.024390243902438, + "grad_norm": 0.9871958494186401, + "learning_rate": 2.432862644415918e-05, + "loss": 0.3054, + "step": 21052 + }, + { + "epoch": 27.025673940949936, + "grad_norm": 8.49759578704834, + "learning_rate": 2.4328198545143344e-05, + "loss": 0.3121, + "step": 21053 + }, + { + "epoch": 27.026957637997434, + "grad_norm": 0.8892289400100708, + "learning_rate": 2.4327770646127516e-05, + "loss": 0.3205, + "step": 21054 + }, + { + "epoch": 27.028241335044928, + "grad_norm": 2.2606983184814453, + "learning_rate": 2.432734274711168e-05, + "loss": 0.2964, + "step": 21055 + }, + { + "epoch": 27.029525032092426, + "grad_norm": 1.8056972026824951, + "learning_rate": 2.4326914848095852e-05, + "loss": 0.3254, + "step": 21056 + }, + { + "epoch": 27.030808729139924, + "grad_norm": 1.8645951747894287, + "learning_rate": 2.4326486949080017e-05, + "loss": 0.2766, + "step": 21057 + }, + { + "epoch": 27.03209242618742, + "grad_norm": 1.7396278381347656, + "learning_rate": 2.4326059050064186e-05, + "loss": 0.292, + "step": 21058 + }, + { + "epoch": 27.033376123234916, + "grad_norm": 1.4442782402038574, + "learning_rate": 2.4325631151048354e-05, + "loss": 0.3531, + "step": 21059 + }, + { + "epoch": 27.034659820282414, + "grad_norm": 1.914863109588623, + "learning_rate": 2.432520325203252e-05, + "loss": 0.3273, + "step": 21060 + }, + { + "epoch": 27.035943517329912, + "grad_norm": 0.958122193813324, + "learning_rate": 2.4324775353016688e-05, + "loss": 0.3064, + "step": 21061 + }, + { + "epoch": 27.037227214377406, + "grad_norm": 1.6523677110671997, + "learning_rate": 2.4324347454000856e-05, + "loss": 0.3018, + "step": 21062 + }, + { + "epoch": 27.038510911424904, + "grad_norm": 5.3830246925354, + "learning_rate": 2.4323919554985025e-05, + "loss": 0.2926, + "step": 21063 + }, + { + "epoch": 27.039794608472402, + "grad_norm": 
1.4079780578613281, + "learning_rate": 2.4323491655969193e-05, + "loss": 0.301, + "step": 21064 + }, + { + "epoch": 27.041078305519896, + "grad_norm": 5.343571186065674, + "learning_rate": 2.432306375695336e-05, + "loss": 0.3025, + "step": 21065 + }, + { + "epoch": 27.042362002567394, + "grad_norm": 2.9610414505004883, + "learning_rate": 2.4322635857937526e-05, + "loss": 0.2998, + "step": 21066 + }, + { + "epoch": 27.043645699614892, + "grad_norm": 2.145155668258667, + "learning_rate": 2.4322207958921695e-05, + "loss": 0.328, + "step": 21067 + }, + { + "epoch": 27.044929396662386, + "grad_norm": 1.3839834928512573, + "learning_rate": 2.4321780059905863e-05, + "loss": 0.3031, + "step": 21068 + }, + { + "epoch": 27.046213093709884, + "grad_norm": 1.9290167093276978, + "learning_rate": 2.4321352160890028e-05, + "loss": 0.3085, + "step": 21069 + }, + { + "epoch": 27.047496790757382, + "grad_norm": 2.40486478805542, + "learning_rate": 2.43209242618742e-05, + "loss": 0.3456, + "step": 21070 + }, + { + "epoch": 27.048780487804876, + "grad_norm": 1.324682354927063, + "learning_rate": 2.4320496362858365e-05, + "loss": 0.3045, + "step": 21071 + }, + { + "epoch": 27.050064184852374, + "grad_norm": 1.1678647994995117, + "learning_rate": 2.4320068463842537e-05, + "loss": 0.3018, + "step": 21072 + }, + { + "epoch": 27.051347881899872, + "grad_norm": 1.122829794883728, + "learning_rate": 2.4319640564826702e-05, + "loss": 0.3454, + "step": 21073 + }, + { + "epoch": 27.05263157894737, + "grad_norm": 1.1766436100006104, + "learning_rate": 2.4319212665810867e-05, + "loss": 0.3366, + "step": 21074 + }, + { + "epoch": 27.053915275994864, + "grad_norm": 1.2485650777816772, + "learning_rate": 2.431878476679504e-05, + "loss": 0.3348, + "step": 21075 + }, + { + "epoch": 27.055198973042362, + "grad_norm": 1.3027141094207764, + "learning_rate": 2.4318356867779204e-05, + "loss": 0.3188, + "step": 21076 + }, + { + "epoch": 27.05648267008986, + "grad_norm": 1.6586519479751587, + 
"learning_rate": 2.4317928968763372e-05, + "loss": 0.3028, + "step": 21077 + }, + { + "epoch": 27.057766367137354, + "grad_norm": 2.834874391555786, + "learning_rate": 2.431750106974754e-05, + "loss": 0.3378, + "step": 21078 + }, + { + "epoch": 27.059050064184852, + "grad_norm": 2.3247761726379395, + "learning_rate": 2.431707317073171e-05, + "loss": 0.3597, + "step": 21079 + }, + { + "epoch": 27.06033376123235, + "grad_norm": 2.746018648147583, + "learning_rate": 2.4316645271715877e-05, + "loss": 0.3862, + "step": 21080 + }, + { + "epoch": 27.061617458279844, + "grad_norm": 5.918662071228027, + "learning_rate": 2.4316217372700042e-05, + "loss": 0.4164, + "step": 21081 + }, + { + "epoch": 27.062901155327342, + "grad_norm": 2.4558298587799072, + "learning_rate": 2.431578947368421e-05, + "loss": 0.4659, + "step": 21082 + }, + { + "epoch": 27.06418485237484, + "grad_norm": 2.0306971073150635, + "learning_rate": 2.431536157466838e-05, + "loss": 0.4814, + "step": 21083 + }, + { + "epoch": 27.065468549422338, + "grad_norm": 1.3047943115234375, + "learning_rate": 2.4314933675652548e-05, + "loss": 0.3139, + "step": 21084 + }, + { + "epoch": 27.066752246469832, + "grad_norm": 1.114835500717163, + "learning_rate": 2.4314505776636713e-05, + "loss": 0.2968, + "step": 21085 + }, + { + "epoch": 27.06803594351733, + "grad_norm": 1.1327219009399414, + "learning_rate": 2.4314077877620884e-05, + "loss": 0.2953, + "step": 21086 + }, + { + "epoch": 27.069319640564828, + "grad_norm": 1.06473708152771, + "learning_rate": 2.431364997860505e-05, + "loss": 0.3312, + "step": 21087 + }, + { + "epoch": 27.070603337612322, + "grad_norm": 1.1770200729370117, + "learning_rate": 2.4313222079589218e-05, + "loss": 0.2986, + "step": 21088 + }, + { + "epoch": 27.07188703465982, + "grad_norm": 0.8289997577667236, + "learning_rate": 2.4312794180573386e-05, + "loss": 0.3225, + "step": 21089 + }, + { + "epoch": 27.073170731707318, + "grad_norm": 1.153674840927124, + "learning_rate": 2.431236628155755e-05, 
+ "loss": 0.2805, + "step": 21090 + }, + { + "epoch": 27.074454428754812, + "grad_norm": 4.030437469482422, + "learning_rate": 2.4311938382541723e-05, + "loss": 0.2777, + "step": 21091 + }, + { + "epoch": 27.07573812580231, + "grad_norm": 0.8717454075813293, + "learning_rate": 2.4311510483525888e-05, + "loss": 0.3274, + "step": 21092 + }, + { + "epoch": 27.077021822849808, + "grad_norm": 1.5523406267166138, + "learning_rate": 2.4311082584510053e-05, + "loss": 0.3184, + "step": 21093 + }, + { + "epoch": 27.078305519897306, + "grad_norm": 1.1748650074005127, + "learning_rate": 2.4310654685494225e-05, + "loss": 0.3092, + "step": 21094 + }, + { + "epoch": 27.0795892169448, + "grad_norm": 0.9003386497497559, + "learning_rate": 2.431022678647839e-05, + "loss": 0.2757, + "step": 21095 + }, + { + "epoch": 27.080872913992298, + "grad_norm": 1.0721794366836548, + "learning_rate": 2.4309798887462562e-05, + "loss": 0.2737, + "step": 21096 + }, + { + "epoch": 27.082156611039796, + "grad_norm": 1.375417947769165, + "learning_rate": 2.4309370988446727e-05, + "loss": 0.3388, + "step": 21097 + }, + { + "epoch": 27.08344030808729, + "grad_norm": 1.8297021389007568, + "learning_rate": 2.4308943089430895e-05, + "loss": 0.2895, + "step": 21098 + }, + { + "epoch": 27.084724005134788, + "grad_norm": 1.3860431909561157, + "learning_rate": 2.4308515190415064e-05, + "loss": 0.3236, + "step": 21099 + }, + { + "epoch": 27.086007702182286, + "grad_norm": 2.0838184356689453, + "learning_rate": 2.430808729139923e-05, + "loss": 0.3015, + "step": 21100 + }, + { + "epoch": 27.08729139922978, + "grad_norm": 1.2049216032028198, + "learning_rate": 2.4307659392383397e-05, + "loss": 0.303, + "step": 21101 + }, + { + "epoch": 27.088575096277278, + "grad_norm": 1.9692326784133911, + "learning_rate": 2.4307231493367565e-05, + "loss": 0.3098, + "step": 21102 + }, + { + "epoch": 27.089858793324776, + "grad_norm": 1.054573893547058, + "learning_rate": 2.4306803594351734e-05, + "loss": 0.3263, + "step": 21103 
+ }, + { + "epoch": 27.09114249037227, + "grad_norm": 1.116349220275879, + "learning_rate": 2.4306375695335902e-05, + "loss": 0.3073, + "step": 21104 + }, + { + "epoch": 27.09242618741977, + "grad_norm": 2.285100221633911, + "learning_rate": 2.430594779632007e-05, + "loss": 0.2979, + "step": 21105 + }, + { + "epoch": 27.093709884467266, + "grad_norm": 1.1538281440734863, + "learning_rate": 2.4305519897304236e-05, + "loss": 0.3057, + "step": 21106 + }, + { + "epoch": 27.094993581514764, + "grad_norm": 1.012457251548767, + "learning_rate": 2.4305091998288404e-05, + "loss": 0.2841, + "step": 21107 + }, + { + "epoch": 27.09627727856226, + "grad_norm": 0.8732671737670898, + "learning_rate": 2.4304664099272573e-05, + "loss": 0.3276, + "step": 21108 + }, + { + "epoch": 27.097560975609756, + "grad_norm": 0.9138410687446594, + "learning_rate": 2.4304236200256738e-05, + "loss": 0.279, + "step": 21109 + }, + { + "epoch": 27.098844672657254, + "grad_norm": 3.1927151679992676, + "learning_rate": 2.430380830124091e-05, + "loss": 0.3128, + "step": 21110 + }, + { + "epoch": 27.10012836970475, + "grad_norm": 1.040499210357666, + "learning_rate": 2.4303380402225074e-05, + "loss": 0.3024, + "step": 21111 + }, + { + "epoch": 27.101412066752246, + "grad_norm": 1.0904381275177002, + "learning_rate": 2.4302952503209246e-05, + "loss": 0.3266, + "step": 21112 + }, + { + "epoch": 27.102695763799744, + "grad_norm": 1.148080587387085, + "learning_rate": 2.430252460419341e-05, + "loss": 0.3022, + "step": 21113 + }, + { + "epoch": 27.10397946084724, + "grad_norm": 1.2591753005981445, + "learning_rate": 2.4302096705177576e-05, + "loss": 0.3081, + "step": 21114 + }, + { + "epoch": 27.105263157894736, + "grad_norm": 3.39018177986145, + "learning_rate": 2.4301668806161748e-05, + "loss": 0.3183, + "step": 21115 + }, + { + "epoch": 27.106546854942234, + "grad_norm": 1.734795331954956, + "learning_rate": 2.4301240907145913e-05, + "loss": 0.3533, + "step": 21116 + }, + { + "epoch": 27.107830551989732, 
+ "grad_norm": 1.556699275970459, + "learning_rate": 2.430081300813008e-05, + "loss": 0.3226, + "step": 21117 + }, + { + "epoch": 27.109114249037226, + "grad_norm": 1.337669014930725, + "learning_rate": 2.430038510911425e-05, + "loss": 0.2815, + "step": 21118 + }, + { + "epoch": 27.110397946084724, + "grad_norm": 1.8611633777618408, + "learning_rate": 2.4299957210098418e-05, + "loss": 0.3219, + "step": 21119 + }, + { + "epoch": 27.111681643132222, + "grad_norm": 1.0203036069869995, + "learning_rate": 2.4299529311082587e-05, + "loss": 0.3059, + "step": 21120 + }, + { + "epoch": 27.112965340179716, + "grad_norm": 1.3386708498001099, + "learning_rate": 2.4299101412066752e-05, + "loss": 0.3187, + "step": 21121 + }, + { + "epoch": 27.114249037227214, + "grad_norm": 5.259277820587158, + "learning_rate": 2.429867351305092e-05, + "loss": 0.3358, + "step": 21122 + }, + { + "epoch": 27.115532734274712, + "grad_norm": 1.213955283164978, + "learning_rate": 2.429824561403509e-05, + "loss": 0.3374, + "step": 21123 + }, + { + "epoch": 27.116816431322206, + "grad_norm": 1.7164745330810547, + "learning_rate": 2.4297817715019257e-05, + "loss": 0.295, + "step": 21124 + }, + { + "epoch": 27.118100128369704, + "grad_norm": 1.5005295276641846, + "learning_rate": 2.4297389816003422e-05, + "loss": 0.3307, + "step": 21125 + }, + { + "epoch": 27.119383825417202, + "grad_norm": 1.1261998414993286, + "learning_rate": 2.4296961916987594e-05, + "loss": 0.3054, + "step": 21126 + }, + { + "epoch": 27.1206675224647, + "grad_norm": 1.36162269115448, + "learning_rate": 2.429653401797176e-05, + "loss": 0.3329, + "step": 21127 + }, + { + "epoch": 27.121951219512194, + "grad_norm": 1.824761986732483, + "learning_rate": 2.4296106118955927e-05, + "loss": 0.3429, + "step": 21128 + }, + { + "epoch": 27.123234916559692, + "grad_norm": 1.7619588375091553, + "learning_rate": 2.4295678219940096e-05, + "loss": 0.4018, + "step": 21129 + }, + { + "epoch": 27.12451861360719, + "grad_norm": 2.9861068725585938, + 
"learning_rate": 2.429525032092426e-05, + "loss": 0.3323, + "step": 21130 + }, + { + "epoch": 27.125802310654684, + "grad_norm": 1.2637755870819092, + "learning_rate": 2.4294822421908432e-05, + "loss": 0.3709, + "step": 21131 + }, + { + "epoch": 27.127086007702182, + "grad_norm": 6.603690147399902, + "learning_rate": 2.4294394522892597e-05, + "loss": 0.4344, + "step": 21132 + }, + { + "epoch": 27.12836970474968, + "grad_norm": 2.608802556991577, + "learning_rate": 2.4293966623876766e-05, + "loss": 0.5262, + "step": 21133 + }, + { + "epoch": 27.129653401797174, + "grad_norm": 2.393549680709839, + "learning_rate": 2.4293538724860934e-05, + "loss": 0.3077, + "step": 21134 + }, + { + "epoch": 27.130937098844672, + "grad_norm": 1.1607180833816528, + "learning_rate": 2.42931108258451e-05, + "loss": 0.2902, + "step": 21135 + }, + { + "epoch": 27.13222079589217, + "grad_norm": 1.3640152215957642, + "learning_rate": 2.429268292682927e-05, + "loss": 0.276, + "step": 21136 + }, + { + "epoch": 27.133504492939664, + "grad_norm": 1.5112930536270142, + "learning_rate": 2.4292255027813436e-05, + "loss": 0.3627, + "step": 21137 + }, + { + "epoch": 27.134788189987162, + "grad_norm": 2.1309261322021484, + "learning_rate": 2.4291827128797605e-05, + "loss": 0.29, + "step": 21138 + }, + { + "epoch": 27.13607188703466, + "grad_norm": 0.8260272145271301, + "learning_rate": 2.4291399229781773e-05, + "loss": 0.3093, + "step": 21139 + }, + { + "epoch": 27.137355584082158, + "grad_norm": 2.455303430557251, + "learning_rate": 2.429097133076594e-05, + "loss": 0.3308, + "step": 21140 + }, + { + "epoch": 27.138639281129652, + "grad_norm": 6.762977123260498, + "learning_rate": 2.4290543431750106e-05, + "loss": 0.3179, + "step": 21141 + }, + { + "epoch": 27.13992297817715, + "grad_norm": 2.577993154525757, + "learning_rate": 2.4290115532734275e-05, + "loss": 0.2989, + "step": 21142 + }, + { + "epoch": 27.141206675224648, + "grad_norm": 1.3965189456939697, + "learning_rate": 2.4289687633718443e-05, 
+ "loss": 0.3319, + "step": 21143 + }, + { + "epoch": 27.142490372272142, + "grad_norm": 0.918574869632721, + "learning_rate": 2.428925973470261e-05, + "loss": 0.3119, + "step": 21144 + }, + { + "epoch": 27.14377406931964, + "grad_norm": 3.9791653156280518, + "learning_rate": 2.428883183568678e-05, + "loss": 0.3164, + "step": 21145 + }, + { + "epoch": 27.145057766367138, + "grad_norm": 2.322662830352783, + "learning_rate": 2.4288403936670945e-05, + "loss": 0.3146, + "step": 21146 + }, + { + "epoch": 27.146341463414632, + "grad_norm": 1.4739408493041992, + "learning_rate": 2.4287976037655117e-05, + "loss": 0.3102, + "step": 21147 + }, + { + "epoch": 27.14762516046213, + "grad_norm": 1.547732949256897, + "learning_rate": 2.4287548138639282e-05, + "loss": 0.2892, + "step": 21148 + }, + { + "epoch": 27.14890885750963, + "grad_norm": 1.2170140743255615, + "learning_rate": 2.4287120239623447e-05, + "loss": 0.2954, + "step": 21149 + }, + { + "epoch": 27.150192554557126, + "grad_norm": 1.1906142234802246, + "learning_rate": 2.428669234060762e-05, + "loss": 0.3346, + "step": 21150 + }, + { + "epoch": 27.15147625160462, + "grad_norm": 0.8841050267219543, + "learning_rate": 2.4286264441591784e-05, + "loss": 0.2981, + "step": 21151 + }, + { + "epoch": 27.15275994865212, + "grad_norm": 0.9429501295089722, + "learning_rate": 2.4285836542575956e-05, + "loss": 0.3106, + "step": 21152 + }, + { + "epoch": 27.154043645699616, + "grad_norm": 1.2049700021743774, + "learning_rate": 2.428540864356012e-05, + "loss": 0.2858, + "step": 21153 + }, + { + "epoch": 27.15532734274711, + "grad_norm": 1.4285914897918701, + "learning_rate": 2.4284980744544286e-05, + "loss": 0.3247, + "step": 21154 + }, + { + "epoch": 27.15661103979461, + "grad_norm": 1.110754370689392, + "learning_rate": 2.4284552845528457e-05, + "loss": 0.309, + "step": 21155 + }, + { + "epoch": 27.157894736842106, + "grad_norm": 2.1719348430633545, + "learning_rate": 2.4284124946512622e-05, + "loss": 0.3318, + "step": 21156 + }, 
+ { + "epoch": 27.1591784338896, + "grad_norm": 2.971694231033325, + "learning_rate": 2.428369704749679e-05, + "loss": 0.3155, + "step": 21157 + }, + { + "epoch": 27.1604621309371, + "grad_norm": 1.5846232175827026, + "learning_rate": 2.428326914848096e-05, + "loss": 0.3297, + "step": 21158 + }, + { + "epoch": 27.161745827984596, + "grad_norm": 1.6780742406845093, + "learning_rate": 2.4282841249465128e-05, + "loss": 0.3002, + "step": 21159 + }, + { + "epoch": 27.163029525032094, + "grad_norm": 2.1692211627960205, + "learning_rate": 2.4282413350449296e-05, + "loss": 0.2964, + "step": 21160 + }, + { + "epoch": 27.16431322207959, + "grad_norm": 0.9447121620178223, + "learning_rate": 2.428198545143346e-05, + "loss": 0.3229, + "step": 21161 + }, + { + "epoch": 27.165596919127086, + "grad_norm": 0.9064248204231262, + "learning_rate": 2.428155755241763e-05, + "loss": 0.3012, + "step": 21162 + }, + { + "epoch": 27.166880616174584, + "grad_norm": 1.1319340467453003, + "learning_rate": 2.4281129653401798e-05, + "loss": 0.2779, + "step": 21163 + }, + { + "epoch": 27.16816431322208, + "grad_norm": 1.5606322288513184, + "learning_rate": 2.4280701754385966e-05, + "loss": 0.291, + "step": 21164 + }, + { + "epoch": 27.169448010269576, + "grad_norm": 2.004901885986328, + "learning_rate": 2.428027385537013e-05, + "loss": 0.3184, + "step": 21165 + }, + { + "epoch": 27.170731707317074, + "grad_norm": 0.9975634813308716, + "learning_rate": 2.4279845956354303e-05, + "loss": 0.3128, + "step": 21166 + }, + { + "epoch": 27.17201540436457, + "grad_norm": 1.635781168937683, + "learning_rate": 2.4279418057338468e-05, + "loss": 0.3245, + "step": 21167 + }, + { + "epoch": 27.173299101412066, + "grad_norm": 2.7293429374694824, + "learning_rate": 2.4278990158322637e-05, + "loss": 0.308, + "step": 21168 + }, + { + "epoch": 27.174582798459564, + "grad_norm": 1.0869120359420776, + "learning_rate": 2.4278562259306805e-05, + "loss": 0.3368, + "step": 21169 + }, + { + "epoch": 27.17586649550706, + 
"grad_norm": 1.6539075374603271, + "learning_rate": 2.427813436029097e-05, + "loss": 0.2989, + "step": 21170 + }, + { + "epoch": 27.177150192554556, + "grad_norm": 3.7975313663482666, + "learning_rate": 2.4277706461275142e-05, + "loss": 0.3244, + "step": 21171 + }, + { + "epoch": 27.178433889602054, + "grad_norm": 1.5058045387268066, + "learning_rate": 2.4277278562259307e-05, + "loss": 0.345, + "step": 21172 + }, + { + "epoch": 27.179717586649552, + "grad_norm": 1.757336139678955, + "learning_rate": 2.4276850663243475e-05, + "loss": 0.3404, + "step": 21173 + }, + { + "epoch": 27.181001283697046, + "grad_norm": 1.1136505603790283, + "learning_rate": 2.4276422764227644e-05, + "loss": 0.353, + "step": 21174 + }, + { + "epoch": 27.182284980744544, + "grad_norm": 1.1657135486602783, + "learning_rate": 2.427599486521181e-05, + "loss": 0.3337, + "step": 21175 + }, + { + "epoch": 27.183568677792042, + "grad_norm": 1.446928858757019, + "learning_rate": 2.4275566966195977e-05, + "loss": 0.388, + "step": 21176 + }, + { + "epoch": 27.184852374839537, + "grad_norm": 1.9375406503677368, + "learning_rate": 2.4275139067180146e-05, + "loss": 0.3653, + "step": 21177 + }, + { + "epoch": 27.186136071887034, + "grad_norm": 2.041018486022949, + "learning_rate": 2.4274711168164314e-05, + "loss": 0.3483, + "step": 21178 + }, + { + "epoch": 27.187419768934532, + "grad_norm": 1.6270060539245605, + "learning_rate": 2.4274283269148482e-05, + "loss": 0.3431, + "step": 21179 + }, + { + "epoch": 27.188703465982027, + "grad_norm": 2.0390021800994873, + "learning_rate": 2.427385537013265e-05, + "loss": 0.3569, + "step": 21180 + }, + { + "epoch": 27.189987163029524, + "grad_norm": 1.5487786531448364, + "learning_rate": 2.4273427471116816e-05, + "loss": 0.4051, + "step": 21181 + }, + { + "epoch": 27.191270860077022, + "grad_norm": 4.619960784912109, + "learning_rate": 2.4272999572100984e-05, + "loss": 0.3664, + "step": 21182 + }, + { + "epoch": 27.19255455712452, + "grad_norm": 3.9964652061462402, + 
"learning_rate": 2.4272571673085153e-05, + "loss": 0.4583, + "step": 21183 + }, + { + "epoch": 27.193838254172015, + "grad_norm": 1.2900108098983765, + "learning_rate": 2.4272143774069318e-05, + "loss": 0.3291, + "step": 21184 + }, + { + "epoch": 27.195121951219512, + "grad_norm": 1.6660860776901245, + "learning_rate": 2.427171587505349e-05, + "loss": 0.3116, + "step": 21185 + }, + { + "epoch": 27.19640564826701, + "grad_norm": 1.5764919519424438, + "learning_rate": 2.4271287976037654e-05, + "loss": 0.3136, + "step": 21186 + }, + { + "epoch": 27.197689345314505, + "grad_norm": 1.113426923751831, + "learning_rate": 2.4270860077021826e-05, + "loss": 0.3162, + "step": 21187 + }, + { + "epoch": 27.198973042362002, + "grad_norm": 5.4797892570495605, + "learning_rate": 2.427043217800599e-05, + "loss": 0.3224, + "step": 21188 + }, + { + "epoch": 27.2002567394095, + "grad_norm": 1.5178606510162354, + "learning_rate": 2.4270004278990156e-05, + "loss": 0.2997, + "step": 21189 + }, + { + "epoch": 27.201540436456995, + "grad_norm": 1.5035746097564697, + "learning_rate": 2.4269576379974328e-05, + "loss": 0.3245, + "step": 21190 + }, + { + "epoch": 27.202824133504492, + "grad_norm": 4.544338703155518, + "learning_rate": 2.4269148480958493e-05, + "loss": 0.3294, + "step": 21191 + }, + { + "epoch": 27.20410783055199, + "grad_norm": 1.1992954015731812, + "learning_rate": 2.426872058194266e-05, + "loss": 0.3173, + "step": 21192 + }, + { + "epoch": 27.205391527599488, + "grad_norm": 1.2322808504104614, + "learning_rate": 2.426829268292683e-05, + "loss": 0.2885, + "step": 21193 + }, + { + "epoch": 27.206675224646983, + "grad_norm": 1.4014146327972412, + "learning_rate": 2.4267864783911e-05, + "loss": 0.31, + "step": 21194 + }, + { + "epoch": 27.20795892169448, + "grad_norm": 2.511127471923828, + "learning_rate": 2.4267436884895167e-05, + "loss": 0.3521, + "step": 21195 + }, + { + "epoch": 27.20924261874198, + "grad_norm": 0.9387292265892029, + "learning_rate": 2.4267008985879332e-05, 
+ "loss": 0.3219, + "step": 21196 + }, + { + "epoch": 27.210526315789473, + "grad_norm": 0.9132069945335388, + "learning_rate": 2.42665810868635e-05, + "loss": 0.3021, + "step": 21197 + }, + { + "epoch": 27.21181001283697, + "grad_norm": 2.450946569442749, + "learning_rate": 2.426615318784767e-05, + "loss": 0.2981, + "step": 21198 + }, + { + "epoch": 27.21309370988447, + "grad_norm": 1.1185276508331299, + "learning_rate": 2.4265725288831837e-05, + "loss": 0.3336, + "step": 21199 + }, + { + "epoch": 27.214377406931963, + "grad_norm": 1.2548123598098755, + "learning_rate": 2.4265297389816002e-05, + "loss": 0.3045, + "step": 21200 + }, + { + "epoch": 27.21566110397946, + "grad_norm": 2.1281356811523438, + "learning_rate": 2.4264869490800174e-05, + "loss": 0.331, + "step": 21201 + }, + { + "epoch": 27.21694480102696, + "grad_norm": 1.0991476774215698, + "learning_rate": 2.426444159178434e-05, + "loss": 0.3063, + "step": 21202 + }, + { + "epoch": 27.218228498074453, + "grad_norm": 1.4393011331558228, + "learning_rate": 2.4264013692768507e-05, + "loss": 0.296, + "step": 21203 + }, + { + "epoch": 27.21951219512195, + "grad_norm": 4.925352096557617, + "learning_rate": 2.4263585793752676e-05, + "loss": 0.3361, + "step": 21204 + }, + { + "epoch": 27.22079589216945, + "grad_norm": 3.0475833415985107, + "learning_rate": 2.426315789473684e-05, + "loss": 0.3255, + "step": 21205 + }, + { + "epoch": 27.222079589216946, + "grad_norm": 1.640907883644104, + "learning_rate": 2.4262729995721013e-05, + "loss": 0.3055, + "step": 21206 + }, + { + "epoch": 27.22336328626444, + "grad_norm": 0.8787699341773987, + "learning_rate": 2.4262302096705178e-05, + "loss": 0.2961, + "step": 21207 + }, + { + "epoch": 27.22464698331194, + "grad_norm": 1.2991259098052979, + "learning_rate": 2.4261874197689346e-05, + "loss": 0.31, + "step": 21208 + }, + { + "epoch": 27.225930680359436, + "grad_norm": 1.024275541305542, + "learning_rate": 2.4261446298673514e-05, + "loss": 0.3291, + "step": 21209 + }, + { + 
"epoch": 27.22721437740693, + "grad_norm": 0.7575969696044922, + "learning_rate": 2.426101839965768e-05, + "loss": 0.3068, + "step": 21210 + }, + { + "epoch": 27.22849807445443, + "grad_norm": 0.9844152927398682, + "learning_rate": 2.426059050064185e-05, + "loss": 0.305, + "step": 21211 + }, + { + "epoch": 27.229781771501926, + "grad_norm": 1.422925591468811, + "learning_rate": 2.4260162601626016e-05, + "loss": 0.2832, + "step": 21212 + }, + { + "epoch": 27.23106546854942, + "grad_norm": 2.0454375743865967, + "learning_rate": 2.4259734702610185e-05, + "loss": 0.3205, + "step": 21213 + }, + { + "epoch": 27.23234916559692, + "grad_norm": 1.3097822666168213, + "learning_rate": 2.4259306803594353e-05, + "loss": 0.326, + "step": 21214 + }, + { + "epoch": 27.233632862644416, + "grad_norm": 3.4879517555236816, + "learning_rate": 2.4258878904578518e-05, + "loss": 0.3247, + "step": 21215 + }, + { + "epoch": 27.234916559691914, + "grad_norm": 3.2017977237701416, + "learning_rate": 2.4258451005562686e-05, + "loss": 0.3368, + "step": 21216 + }, + { + "epoch": 27.23620025673941, + "grad_norm": 1.1471014022827148, + "learning_rate": 2.4258023106546855e-05, + "loss": 0.3258, + "step": 21217 + }, + { + "epoch": 27.237483953786906, + "grad_norm": 1.8321338891983032, + "learning_rate": 2.4257595207531023e-05, + "loss": 0.3849, + "step": 21218 + }, + { + "epoch": 27.238767650834404, + "grad_norm": 1.4624934196472168, + "learning_rate": 2.4257167308515192e-05, + "loss": 0.3113, + "step": 21219 + }, + { + "epoch": 27.2400513478819, + "grad_norm": 1.4735649824142456, + "learning_rate": 2.425673940949936e-05, + "loss": 0.328, + "step": 21220 + }, + { + "epoch": 27.241335044929397, + "grad_norm": 4.104631423950195, + "learning_rate": 2.4256311510483525e-05, + "loss": 0.3511, + "step": 21221 + }, + { + "epoch": 27.242618741976894, + "grad_norm": 1.4558234214782715, + "learning_rate": 2.4255883611467694e-05, + "loss": 0.3379, + "step": 21222 + }, + { + "epoch": 27.24390243902439, + 
"grad_norm": 1.4347119331359863, + "learning_rate": 2.4255455712451862e-05, + "loss": 0.3233, + "step": 21223 + }, + { + "epoch": 27.245186136071887, + "grad_norm": 1.4418212175369263, + "learning_rate": 2.4255027813436027e-05, + "loss": 0.3504, + "step": 21224 + }, + { + "epoch": 27.246469833119384, + "grad_norm": 2.3885488510131836, + "learning_rate": 2.42545999144202e-05, + "loss": 0.3595, + "step": 21225 + }, + { + "epoch": 27.247753530166882, + "grad_norm": 1.3577202558517456, + "learning_rate": 2.4254172015404364e-05, + "loss": 0.3412, + "step": 21226 + }, + { + "epoch": 27.249037227214377, + "grad_norm": 1.7197819948196411, + "learning_rate": 2.4253744116388536e-05, + "loss": 0.3542, + "step": 21227 + }, + { + "epoch": 27.250320924261874, + "grad_norm": 1.6724027395248413, + "learning_rate": 2.42533162173727e-05, + "loss": 0.357, + "step": 21228 + }, + { + "epoch": 27.251604621309372, + "grad_norm": 1.2600321769714355, + "learning_rate": 2.4252888318356866e-05, + "loss": 0.3405, + "step": 21229 + }, + { + "epoch": 27.252888318356867, + "grad_norm": 2.45404314994812, + "learning_rate": 2.4252460419341037e-05, + "loss": 0.3794, + "step": 21230 + }, + { + "epoch": 27.254172015404365, + "grad_norm": 1.8193010091781616, + "learning_rate": 2.4252032520325202e-05, + "loss": 0.3437, + "step": 21231 + }, + { + "epoch": 27.255455712451862, + "grad_norm": 1.914097785949707, + "learning_rate": 2.425160462130937e-05, + "loss": 0.3927, + "step": 21232 + }, + { + "epoch": 27.256739409499357, + "grad_norm": 2.5034408569335938, + "learning_rate": 2.425117672229354e-05, + "loss": 0.5865, + "step": 21233 + }, + { + "epoch": 27.258023106546855, + "grad_norm": 0.9415966868400574, + "learning_rate": 2.4250748823277708e-05, + "loss": 0.2899, + "step": 21234 + }, + { + "epoch": 27.259306803594352, + "grad_norm": 1.4864357709884644, + "learning_rate": 2.4250320924261876e-05, + "loss": 0.301, + "step": 21235 + }, + { + "epoch": 27.260590500641847, + "grad_norm": 0.7937131524085999, + 
"learning_rate": 2.424989302524604e-05, + "loss": 0.3163, + "step": 21236 + }, + { + "epoch": 27.261874197689345, + "grad_norm": 1.1522393226623535, + "learning_rate": 2.424946512623021e-05, + "loss": 0.2967, + "step": 21237 + }, + { + "epoch": 27.263157894736842, + "grad_norm": 1.020194411277771, + "learning_rate": 2.4249037227214378e-05, + "loss": 0.3014, + "step": 21238 + }, + { + "epoch": 27.26444159178434, + "grad_norm": 1.29073166847229, + "learning_rate": 2.4248609328198546e-05, + "loss": 0.3349, + "step": 21239 + }, + { + "epoch": 27.265725288831835, + "grad_norm": 1.4916338920593262, + "learning_rate": 2.424818142918271e-05, + "loss": 0.2882, + "step": 21240 + }, + { + "epoch": 27.267008985879333, + "grad_norm": 1.0464178323745728, + "learning_rate": 2.4247753530166883e-05, + "loss": 0.3038, + "step": 21241 + }, + { + "epoch": 27.26829268292683, + "grad_norm": 2.2719197273254395, + "learning_rate": 2.4247325631151048e-05, + "loss": 0.2841, + "step": 21242 + }, + { + "epoch": 27.269576379974325, + "grad_norm": 1.2260392904281616, + "learning_rate": 2.4246897732135217e-05, + "loss": 0.3418, + "step": 21243 + }, + { + "epoch": 27.270860077021823, + "grad_norm": 2.7030420303344727, + "learning_rate": 2.4246469833119385e-05, + "loss": 0.3205, + "step": 21244 + }, + { + "epoch": 27.27214377406932, + "grad_norm": 1.2568227052688599, + "learning_rate": 2.424604193410355e-05, + "loss": 0.3278, + "step": 21245 + }, + { + "epoch": 27.273427471116815, + "grad_norm": 0.9047138690948486, + "learning_rate": 2.4245614035087722e-05, + "loss": 0.2853, + "step": 21246 + }, + { + "epoch": 27.274711168164313, + "grad_norm": 1.0603991746902466, + "learning_rate": 2.4245186136071887e-05, + "loss": 0.3307, + "step": 21247 + }, + { + "epoch": 27.27599486521181, + "grad_norm": 1.124699354171753, + "learning_rate": 2.4244758237056055e-05, + "loss": 0.3094, + "step": 21248 + }, + { + "epoch": 27.27727856225931, + "grad_norm": 1.1823800802230835, + "learning_rate": 
2.4244330338040224e-05, + "loss": 0.302, + "step": 21249 + }, + { + "epoch": 27.278562259306803, + "grad_norm": 1.1693103313446045, + "learning_rate": 2.424390243902439e-05, + "loss": 0.3132, + "step": 21250 + }, + { + "epoch": 27.2798459563543, + "grad_norm": 1.046883225440979, + "learning_rate": 2.424347454000856e-05, + "loss": 0.3062, + "step": 21251 + }, + { + "epoch": 27.2811296534018, + "grad_norm": 1.756804347038269, + "learning_rate": 2.4243046640992726e-05, + "loss": 0.2949, + "step": 21252 + }, + { + "epoch": 27.282413350449293, + "grad_norm": 1.353186011314392, + "learning_rate": 2.4242618741976894e-05, + "loss": 0.2936, + "step": 21253 + }, + { + "epoch": 27.28369704749679, + "grad_norm": 1.2128254175186157, + "learning_rate": 2.4242190842961062e-05, + "loss": 0.3205, + "step": 21254 + }, + { + "epoch": 27.28498074454429, + "grad_norm": 1.9802350997924805, + "learning_rate": 2.424176294394523e-05, + "loss": 0.2808, + "step": 21255 + }, + { + "epoch": 27.286264441591783, + "grad_norm": 1.1334327459335327, + "learning_rate": 2.4241335044929396e-05, + "loss": 0.3372, + "step": 21256 + }, + { + "epoch": 27.28754813863928, + "grad_norm": 1.6063635349273682, + "learning_rate": 2.4240907145913564e-05, + "loss": 0.3302, + "step": 21257 + }, + { + "epoch": 27.28883183568678, + "grad_norm": 1.1273475885391235, + "learning_rate": 2.4240479246897733e-05, + "loss": 0.2867, + "step": 21258 + }, + { + "epoch": 27.290115532734276, + "grad_norm": 1.083748459815979, + "learning_rate": 2.42400513478819e-05, + "loss": 0.3249, + "step": 21259 + }, + { + "epoch": 27.29139922978177, + "grad_norm": 1.314014196395874, + "learning_rate": 2.423962344886607e-05, + "loss": 0.3019, + "step": 21260 + }, + { + "epoch": 27.29268292682927, + "grad_norm": 0.8861761093139648, + "learning_rate": 2.4239195549850235e-05, + "loss": 0.288, + "step": 21261 + }, + { + "epoch": 27.293966623876766, + "grad_norm": 1.0014762878417969, + "learning_rate": 2.4238767650834406e-05, + "loss": 0.2925, + 
"step": 21262 + }, + { + "epoch": 27.29525032092426, + "grad_norm": 1.4798414707183838, + "learning_rate": 2.423833975181857e-05, + "loss": 0.3204, + "step": 21263 + }, + { + "epoch": 27.29653401797176, + "grad_norm": 2.8787646293640137, + "learning_rate": 2.4237911852802736e-05, + "loss": 0.3338, + "step": 21264 + }, + { + "epoch": 27.297817715019256, + "grad_norm": 1.5650650262832642, + "learning_rate": 2.4237483953786908e-05, + "loss": 0.313, + "step": 21265 + }, + { + "epoch": 27.29910141206675, + "grad_norm": 2.07369327545166, + "learning_rate": 2.4237056054771073e-05, + "loss": 0.3232, + "step": 21266 + }, + { + "epoch": 27.30038510911425, + "grad_norm": 1.1631555557250977, + "learning_rate": 2.4236628155755245e-05, + "loss": 0.3382, + "step": 21267 + }, + { + "epoch": 27.301668806161747, + "grad_norm": 1.9596434831619263, + "learning_rate": 2.423620025673941e-05, + "loss": 0.3166, + "step": 21268 + }, + { + "epoch": 27.30295250320924, + "grad_norm": 1.023789644241333, + "learning_rate": 2.423577235772358e-05, + "loss": 0.3205, + "step": 21269 + }, + { + "epoch": 27.30423620025674, + "grad_norm": 1.6570132970809937, + "learning_rate": 2.4235344458707747e-05, + "loss": 0.3232, + "step": 21270 + }, + { + "epoch": 27.305519897304237, + "grad_norm": 2.0217080116271973, + "learning_rate": 2.4234916559691912e-05, + "loss": 0.3401, + "step": 21271 + }, + { + "epoch": 27.306803594351734, + "grad_norm": 1.036037802696228, + "learning_rate": 2.423448866067608e-05, + "loss": 0.311, + "step": 21272 + }, + { + "epoch": 27.30808729139923, + "grad_norm": 1.1084742546081543, + "learning_rate": 2.423406076166025e-05, + "loss": 0.3202, + "step": 21273 + }, + { + "epoch": 27.309370988446727, + "grad_norm": 1.3475267887115479, + "learning_rate": 2.4233632862644417e-05, + "loss": 0.3514, + "step": 21274 + }, + { + "epoch": 27.310654685494224, + "grad_norm": 2.2709856033325195, + "learning_rate": 2.4233204963628586e-05, + "loss": 0.3788, + "step": 21275 + }, + { + "epoch": 
27.31193838254172, + "grad_norm": 2.4532339572906494, + "learning_rate": 2.423277706461275e-05, + "loss": 0.3562, + "step": 21276 + }, + { + "epoch": 27.313222079589217, + "grad_norm": 1.9991739988327026, + "learning_rate": 2.423234916559692e-05, + "loss": 0.3487, + "step": 21277 + }, + { + "epoch": 27.314505776636715, + "grad_norm": 4.955125331878662, + "learning_rate": 2.4231921266581087e-05, + "loss": 0.35, + "step": 21278 + }, + { + "epoch": 27.31578947368421, + "grad_norm": 1.6632332801818848, + "learning_rate": 2.4231493367565256e-05, + "loss": 0.3672, + "step": 21279 + }, + { + "epoch": 27.317073170731707, + "grad_norm": 4.242556095123291, + "learning_rate": 2.423106546854942e-05, + "loss": 0.3779, + "step": 21280 + }, + { + "epoch": 27.318356867779205, + "grad_norm": 2.61791729927063, + "learning_rate": 2.4230637569533593e-05, + "loss": 0.3698, + "step": 21281 + }, + { + "epoch": 27.319640564826702, + "grad_norm": 1.632374882698059, + "learning_rate": 2.4230209670517758e-05, + "loss": 0.453, + "step": 21282 + }, + { + "epoch": 27.320924261874197, + "grad_norm": 1.6800236701965332, + "learning_rate": 2.4229781771501926e-05, + "loss": 0.5111, + "step": 21283 + }, + { + "epoch": 27.322207958921695, + "grad_norm": 1.4669064283370972, + "learning_rate": 2.4229353872486094e-05, + "loss": 0.2969, + "step": 21284 + }, + { + "epoch": 27.323491655969192, + "grad_norm": 1.6646209955215454, + "learning_rate": 2.422892597347026e-05, + "loss": 0.2961, + "step": 21285 + }, + { + "epoch": 27.324775353016687, + "grad_norm": 2.0835251808166504, + "learning_rate": 2.422849807445443e-05, + "loss": 0.3179, + "step": 21286 + }, + { + "epoch": 27.326059050064185, + "grad_norm": 1.8553423881530762, + "learning_rate": 2.4228070175438596e-05, + "loss": 0.3139, + "step": 21287 + }, + { + "epoch": 27.327342747111683, + "grad_norm": 1.3663467168807983, + "learning_rate": 2.4227642276422765e-05, + "loss": 0.347, + "step": 21288 + }, + { + "epoch": 27.328626444159177, + "grad_norm": 
1.1474194526672363, + "learning_rate": 2.4227214377406933e-05, + "loss": 0.3254, + "step": 21289 + }, + { + "epoch": 27.329910141206675, + "grad_norm": 2.3778302669525146, + "learning_rate": 2.4226786478391098e-05, + "loss": 0.3352, + "step": 21290 + }, + { + "epoch": 27.331193838254173, + "grad_norm": 2.4473183155059814, + "learning_rate": 2.422635857937527e-05, + "loss": 0.3025, + "step": 21291 + }, + { + "epoch": 27.33247753530167, + "grad_norm": 1.181968092918396, + "learning_rate": 2.4225930680359435e-05, + "loss": 0.3281, + "step": 21292 + }, + { + "epoch": 27.333761232349165, + "grad_norm": 1.0256768465042114, + "learning_rate": 2.4225502781343603e-05, + "loss": 0.3366, + "step": 21293 + }, + { + "epoch": 27.335044929396663, + "grad_norm": 0.9108388423919678, + "learning_rate": 2.4225074882327772e-05, + "loss": 0.3007, + "step": 21294 + }, + { + "epoch": 27.33632862644416, + "grad_norm": 0.8813638091087341, + "learning_rate": 2.422464698331194e-05, + "loss": 0.335, + "step": 21295 + }, + { + "epoch": 27.337612323491655, + "grad_norm": 3.8044145107269287, + "learning_rate": 2.4224219084296105e-05, + "loss": 0.3165, + "step": 21296 + }, + { + "epoch": 27.338896020539153, + "grad_norm": 1.9070321321487427, + "learning_rate": 2.4223791185280274e-05, + "loss": 0.3027, + "step": 21297 + }, + { + "epoch": 27.34017971758665, + "grad_norm": 3.4327380657196045, + "learning_rate": 2.4223363286264442e-05, + "loss": 0.3552, + "step": 21298 + }, + { + "epoch": 27.341463414634145, + "grad_norm": 2.0935723781585693, + "learning_rate": 2.422293538724861e-05, + "loss": 0.3275, + "step": 21299 + }, + { + "epoch": 27.342747111681643, + "grad_norm": 1.1582516431808472, + "learning_rate": 2.422250748823278e-05, + "loss": 0.3446, + "step": 21300 + }, + { + "epoch": 27.34403080872914, + "grad_norm": 0.8126914501190186, + "learning_rate": 2.4222079589216944e-05, + "loss": 0.2974, + "step": 21301 + }, + { + "epoch": 27.345314505776635, + "grad_norm": 0.9549459218978882, + 
"learning_rate": 2.4221651690201116e-05, + "loss": 0.3007, + "step": 21302 + }, + { + "epoch": 27.346598202824133, + "grad_norm": 1.0365926027297974, + "learning_rate": 2.422122379118528e-05, + "loss": 0.3064, + "step": 21303 + }, + { + "epoch": 27.34788189987163, + "grad_norm": 1.8318923711776733, + "learning_rate": 2.4220795892169446e-05, + "loss": 0.3012, + "step": 21304 + }, + { + "epoch": 27.34916559691913, + "grad_norm": 2.0185303688049316, + "learning_rate": 2.4220367993153618e-05, + "loss": 0.2947, + "step": 21305 + }, + { + "epoch": 27.350449293966623, + "grad_norm": 0.9993926286697388, + "learning_rate": 2.4219940094137783e-05, + "loss": 0.3101, + "step": 21306 + }, + { + "epoch": 27.35173299101412, + "grad_norm": 0.973666787147522, + "learning_rate": 2.4219512195121954e-05, + "loss": 0.3035, + "step": 21307 + }, + { + "epoch": 27.35301668806162, + "grad_norm": 2.183102607727051, + "learning_rate": 2.421908429610612e-05, + "loss": 0.3539, + "step": 21308 + }, + { + "epoch": 27.354300385109113, + "grad_norm": 1.3282781839370728, + "learning_rate": 2.4218656397090288e-05, + "loss": 0.3156, + "step": 21309 + }, + { + "epoch": 27.35558408215661, + "grad_norm": 1.4469859600067139, + "learning_rate": 2.4218228498074456e-05, + "loss": 0.2974, + "step": 21310 + }, + { + "epoch": 27.35686777920411, + "grad_norm": 0.8562618494033813, + "learning_rate": 2.421780059905862e-05, + "loss": 0.2839, + "step": 21311 + }, + { + "epoch": 27.358151476251603, + "grad_norm": 1.2134367227554321, + "learning_rate": 2.421737270004279e-05, + "loss": 0.3217, + "step": 21312 + }, + { + "epoch": 27.3594351732991, + "grad_norm": 1.6746509075164795, + "learning_rate": 2.4216944801026958e-05, + "loss": 0.3002, + "step": 21313 + }, + { + "epoch": 27.3607188703466, + "grad_norm": 1.591570496559143, + "learning_rate": 2.4216516902011126e-05, + "loss": 0.324, + "step": 21314 + }, + { + "epoch": 27.362002567394097, + "grad_norm": 1.1704912185668945, + "learning_rate": 2.4216089002995295e-05, 
+ "loss": 0.3121, + "step": 21315 + }, + { + "epoch": 27.36328626444159, + "grad_norm": 2.4061429500579834, + "learning_rate": 2.4215661103979463e-05, + "loss": 0.3267, + "step": 21316 + }, + { + "epoch": 27.36456996148909, + "grad_norm": 1.4028834104537964, + "learning_rate": 2.421523320496363e-05, + "loss": 0.3018, + "step": 21317 + }, + { + "epoch": 27.365853658536587, + "grad_norm": 1.4843804836273193, + "learning_rate": 2.4214805305947797e-05, + "loss": 0.3389, + "step": 21318 + }, + { + "epoch": 27.36713735558408, + "grad_norm": 1.0176472663879395, + "learning_rate": 2.4214377406931965e-05, + "loss": 0.3089, + "step": 21319 + }, + { + "epoch": 27.36842105263158, + "grad_norm": 2.0004220008850098, + "learning_rate": 2.421394950791613e-05, + "loss": 0.3228, + "step": 21320 + }, + { + "epoch": 27.369704749679077, + "grad_norm": 1.3394428491592407, + "learning_rate": 2.4213521608900302e-05, + "loss": 0.3335, + "step": 21321 + }, + { + "epoch": 27.37098844672657, + "grad_norm": 1.9419615268707275, + "learning_rate": 2.4213093709884467e-05, + "loss": 0.3248, + "step": 21322 + }, + { + "epoch": 27.37227214377407, + "grad_norm": 1.5549839735031128, + "learning_rate": 2.421266581086864e-05, + "loss": 0.3438, + "step": 21323 + }, + { + "epoch": 27.373555840821567, + "grad_norm": 1.1688001155853271, + "learning_rate": 2.4212237911852804e-05, + "loss": 0.2849, + "step": 21324 + }, + { + "epoch": 27.374839537869065, + "grad_norm": 1.740372657775879, + "learning_rate": 2.421181001283697e-05, + "loss": 0.3438, + "step": 21325 + }, + { + "epoch": 27.37612323491656, + "grad_norm": 1.1264351606369019, + "learning_rate": 2.421138211382114e-05, + "loss": 0.3151, + "step": 21326 + }, + { + "epoch": 27.377406931964057, + "grad_norm": 1.0575757026672363, + "learning_rate": 2.4210954214805306e-05, + "loss": 0.3725, + "step": 21327 + }, + { + "epoch": 27.378690629011555, + "grad_norm": 4.230491638183594, + "learning_rate": 2.4210526315789474e-05, + "loss": 0.33, + "step": 21328 + }, 
+ { + "epoch": 27.37997432605905, + "grad_norm": 1.0996650457382202, + "learning_rate": 2.4210098416773642e-05, + "loss": 0.3535, + "step": 21329 + }, + { + "epoch": 27.381258023106547, + "grad_norm": 2.1789233684539795, + "learning_rate": 2.420967051775781e-05, + "loss": 0.3548, + "step": 21330 + }, + { + "epoch": 27.382541720154045, + "grad_norm": 1.890178918838501, + "learning_rate": 2.420924261874198e-05, + "loss": 0.3846, + "step": 21331 + }, + { + "epoch": 27.38382541720154, + "grad_norm": 1.7564893960952759, + "learning_rate": 2.4208814719726144e-05, + "loss": 0.3858, + "step": 21332 + }, + { + "epoch": 27.385109114249037, + "grad_norm": 1.834876298904419, + "learning_rate": 2.4208386820710313e-05, + "loss": 0.5373, + "step": 21333 + }, + { + "epoch": 27.386392811296535, + "grad_norm": 2.4426965713500977, + "learning_rate": 2.420795892169448e-05, + "loss": 0.2954, + "step": 21334 + }, + { + "epoch": 27.387676508344033, + "grad_norm": 2.505772829055786, + "learning_rate": 2.420753102267865e-05, + "loss": 0.3187, + "step": 21335 + }, + { + "epoch": 27.388960205391527, + "grad_norm": 1.076013207435608, + "learning_rate": 2.4207103123662815e-05, + "loss": 0.3519, + "step": 21336 + }, + { + "epoch": 27.390243902439025, + "grad_norm": 1.0073962211608887, + "learning_rate": 2.4206675224646983e-05, + "loss": 0.3318, + "step": 21337 + }, + { + "epoch": 27.391527599486523, + "grad_norm": 0.8635515570640564, + "learning_rate": 2.420624732563115e-05, + "loss": 0.2949, + "step": 21338 + }, + { + "epoch": 27.392811296534017, + "grad_norm": 1.9218027591705322, + "learning_rate": 2.420581942661532e-05, + "loss": 0.3084, + "step": 21339 + }, + { + "epoch": 27.394094993581515, + "grad_norm": 0.9154230952262878, + "learning_rate": 2.4205391527599488e-05, + "loss": 0.3281, + "step": 21340 + }, + { + "epoch": 27.395378690629013, + "grad_norm": 2.7325592041015625, + "learning_rate": 2.4204963628583653e-05, + "loss": 0.3029, + "step": 21341 + }, + { + "epoch": 27.396662387676507, 
+ "grad_norm": 0.9947226643562317, + "learning_rate": 2.4204535729567825e-05, + "loss": 0.3125, + "step": 21342 + }, + { + "epoch": 27.397946084724005, + "grad_norm": 0.8592631220817566, + "learning_rate": 2.420410783055199e-05, + "loss": 0.3259, + "step": 21343 + }, + { + "epoch": 27.399229781771503, + "grad_norm": 1.1406272649765015, + "learning_rate": 2.4203679931536155e-05, + "loss": 0.3577, + "step": 21344 + }, + { + "epoch": 27.400513478818997, + "grad_norm": 3.2642412185668945, + "learning_rate": 2.4203252032520327e-05, + "loss": 0.3467, + "step": 21345 + }, + { + "epoch": 27.401797175866495, + "grad_norm": 3.3461825847625732, + "learning_rate": 2.4202824133504492e-05, + "loss": 0.3056, + "step": 21346 + }, + { + "epoch": 27.403080872913993, + "grad_norm": 1.854394793510437, + "learning_rate": 2.4202396234488664e-05, + "loss": 0.3096, + "step": 21347 + }, + { + "epoch": 27.40436456996149, + "grad_norm": 0.9712712168693542, + "learning_rate": 2.420196833547283e-05, + "loss": 0.308, + "step": 21348 + }, + { + "epoch": 27.405648267008985, + "grad_norm": 1.3869636058807373, + "learning_rate": 2.4201540436456997e-05, + "loss": 0.3087, + "step": 21349 + }, + { + "epoch": 27.406931964056483, + "grad_norm": 2.1095688343048096, + "learning_rate": 2.4201112537441166e-05, + "loss": 0.3084, + "step": 21350 + }, + { + "epoch": 27.40821566110398, + "grad_norm": 1.4385334253311157, + "learning_rate": 2.420068463842533e-05, + "loss": 0.2823, + "step": 21351 + }, + { + "epoch": 27.409499358151475, + "grad_norm": 2.3278212547302246, + "learning_rate": 2.42002567394095e-05, + "loss": 0.2936, + "step": 21352 + }, + { + "epoch": 27.410783055198973, + "grad_norm": 1.7963098287582397, + "learning_rate": 2.4199828840393667e-05, + "loss": 0.2871, + "step": 21353 + }, + { + "epoch": 27.41206675224647, + "grad_norm": 1.359695553779602, + "learning_rate": 2.4199400941377836e-05, + "loss": 0.3222, + "step": 21354 + }, + { + "epoch": 27.413350449293965, + "grad_norm": 2.172588348388672, 
+ "learning_rate": 2.4198973042362004e-05, + "loss": 0.351, + "step": 21355 + }, + { + "epoch": 27.414634146341463, + "grad_norm": 1.6432684659957886, + "learning_rate": 2.4198545143346173e-05, + "loss": 0.3017, + "step": 21356 + }, + { + "epoch": 27.41591784338896, + "grad_norm": 1.0089912414550781, + "learning_rate": 2.4198117244330338e-05, + "loss": 0.2965, + "step": 21357 + }, + { + "epoch": 27.41720154043646, + "grad_norm": 1.8332682847976685, + "learning_rate": 2.4197689345314506e-05, + "loss": 0.325, + "step": 21358 + }, + { + "epoch": 27.418485237483953, + "grad_norm": 1.630669355392456, + "learning_rate": 2.4197261446298674e-05, + "loss": 0.2933, + "step": 21359 + }, + { + "epoch": 27.41976893453145, + "grad_norm": 1.423943042755127, + "learning_rate": 2.419683354728284e-05, + "loss": 0.3308, + "step": 21360 + }, + { + "epoch": 27.42105263157895, + "grad_norm": 1.6007981300354004, + "learning_rate": 2.419640564826701e-05, + "loss": 0.3016, + "step": 21361 + }, + { + "epoch": 27.422336328626443, + "grad_norm": 1.4136799573898315, + "learning_rate": 2.4195977749251176e-05, + "loss": 0.3028, + "step": 21362 + }, + { + "epoch": 27.42362002567394, + "grad_norm": 2.143033504486084, + "learning_rate": 2.4195549850235348e-05, + "loss": 0.3242, + "step": 21363 + }, + { + "epoch": 27.42490372272144, + "grad_norm": 1.7186815738677979, + "learning_rate": 2.4195121951219513e-05, + "loss": 0.3138, + "step": 21364 + }, + { + "epoch": 27.426187419768933, + "grad_norm": 2.462932586669922, + "learning_rate": 2.4194694052203678e-05, + "loss": 0.3196, + "step": 21365 + }, + { + "epoch": 27.42747111681643, + "grad_norm": 1.872620701789856, + "learning_rate": 2.419426615318785e-05, + "loss": 0.3352, + "step": 21366 + }, + { + "epoch": 27.42875481386393, + "grad_norm": 1.4227855205535889, + "learning_rate": 2.4193838254172015e-05, + "loss": 0.3316, + "step": 21367 + }, + { + "epoch": 27.430038510911427, + "grad_norm": 1.4516721963882446, + "learning_rate": 
2.4193410355156183e-05, + "loss": 0.3468, + "step": 21368 + }, + { + "epoch": 27.43132220795892, + "grad_norm": 1.0912505388259888, + "learning_rate": 2.4192982456140352e-05, + "loss": 0.3334, + "step": 21369 + }, + { + "epoch": 27.43260590500642, + "grad_norm": 1.0892363786697388, + "learning_rate": 2.419255455712452e-05, + "loss": 0.3341, + "step": 21370 + }, + { + "epoch": 27.433889602053917, + "grad_norm": 1.998653769493103, + "learning_rate": 2.419212665810869e-05, + "loss": 0.3921, + "step": 21371 + }, + { + "epoch": 27.43517329910141, + "grad_norm": 1.3291460275650024, + "learning_rate": 2.4191698759092854e-05, + "loss": 0.3272, + "step": 21372 + }, + { + "epoch": 27.43645699614891, + "grad_norm": 1.1476500034332275, + "learning_rate": 2.4191270860077022e-05, + "loss": 0.3131, + "step": 21373 + }, + { + "epoch": 27.437740693196407, + "grad_norm": 1.408254861831665, + "learning_rate": 2.419084296106119e-05, + "loss": 0.3455, + "step": 21374 + }, + { + "epoch": 27.4390243902439, + "grad_norm": 1.8624085187911987, + "learning_rate": 2.419041506204536e-05, + "loss": 0.3604, + "step": 21375 + }, + { + "epoch": 27.4403080872914, + "grad_norm": 1.2546228170394897, + "learning_rate": 2.4189987163029524e-05, + "loss": 0.3603, + "step": 21376 + }, + { + "epoch": 27.441591784338897, + "grad_norm": 2.095837116241455, + "learning_rate": 2.4189559264013696e-05, + "loss": 0.3519, + "step": 21377 + }, + { + "epoch": 27.44287548138639, + "grad_norm": 1.7469731569290161, + "learning_rate": 2.418913136499786e-05, + "loss": 0.3563, + "step": 21378 + }, + { + "epoch": 27.44415917843389, + "grad_norm": 1.6424514055252075, + "learning_rate": 2.4188703465982026e-05, + "loss": 0.3747, + "step": 21379 + }, + { + "epoch": 27.445442875481387, + "grad_norm": 2.5964643955230713, + "learning_rate": 2.4188275566966198e-05, + "loss": 0.3599, + "step": 21380 + }, + { + "epoch": 27.446726572528885, + "grad_norm": 3.1844751834869385, + "learning_rate": 2.4187847667950363e-05, + "loss": 0.4428, 
+ "step": 21381 + }, + { + "epoch": 27.44801026957638, + "grad_norm": 2.087603807449341, + "learning_rate": 2.4187419768934534e-05, + "loss": 0.4254, + "step": 21382 + }, + { + "epoch": 27.449293966623877, + "grad_norm": 2.5524795055389404, + "learning_rate": 2.41869918699187e-05, + "loss": 0.4897, + "step": 21383 + }, + { + "epoch": 27.450577663671375, + "grad_norm": 1.465704321861267, + "learning_rate": 2.4186563970902868e-05, + "loss": 0.2773, + "step": 21384 + }, + { + "epoch": 27.45186136071887, + "grad_norm": 2.353031635284424, + "learning_rate": 2.4186136071887036e-05, + "loss": 0.31, + "step": 21385 + }, + { + "epoch": 27.453145057766367, + "grad_norm": 1.124841570854187, + "learning_rate": 2.41857081728712e-05, + "loss": 0.3212, + "step": 21386 + }, + { + "epoch": 27.454428754813865, + "grad_norm": 1.4200438261032104, + "learning_rate": 2.418528027385537e-05, + "loss": 0.3103, + "step": 21387 + }, + { + "epoch": 27.45571245186136, + "grad_norm": 1.6528171300888062, + "learning_rate": 2.4184852374839538e-05, + "loss": 0.3195, + "step": 21388 + }, + { + "epoch": 27.456996148908857, + "grad_norm": 2.7219977378845215, + "learning_rate": 2.4184424475823707e-05, + "loss": 0.3215, + "step": 21389 + }, + { + "epoch": 27.458279845956355, + "grad_norm": 1.837236762046814, + "learning_rate": 2.4183996576807875e-05, + "loss": 0.2973, + "step": 21390 + }, + { + "epoch": 27.459563543003853, + "grad_norm": 1.2322471141815186, + "learning_rate": 2.4183568677792043e-05, + "loss": 0.3349, + "step": 21391 + }, + { + "epoch": 27.460847240051347, + "grad_norm": 1.407382845878601, + "learning_rate": 2.418314077877621e-05, + "loss": 0.3309, + "step": 21392 + }, + { + "epoch": 27.462130937098845, + "grad_norm": 1.1129554510116577, + "learning_rate": 2.4182712879760377e-05, + "loss": 0.3538, + "step": 21393 + }, + { + "epoch": 27.463414634146343, + "grad_norm": 1.4879707098007202, + "learning_rate": 2.4182284980744545e-05, + "loss": 0.3097, + "step": 21394 + }, + { + "epoch": 
27.464698331193837, + "grad_norm": 5.362621784210205, + "learning_rate": 2.418185708172871e-05, + "loss": 0.3083, + "step": 21395 + }, + { + "epoch": 27.465982028241335, + "grad_norm": 1.0732378959655762, + "learning_rate": 2.4181429182712882e-05, + "loss": 0.3101, + "step": 21396 + }, + { + "epoch": 27.467265725288833, + "grad_norm": 0.8627792596817017, + "learning_rate": 2.4181001283697047e-05, + "loss": 0.3135, + "step": 21397 + }, + { + "epoch": 27.468549422336327, + "grad_norm": 1.4120370149612427, + "learning_rate": 2.418057338468122e-05, + "loss": 0.3283, + "step": 21398 + }, + { + "epoch": 27.469833119383825, + "grad_norm": 1.787163257598877, + "learning_rate": 2.4180145485665384e-05, + "loss": 0.3196, + "step": 21399 + }, + { + "epoch": 27.471116816431323, + "grad_norm": 1.158926010131836, + "learning_rate": 2.417971758664955e-05, + "loss": 0.3003, + "step": 21400 + }, + { + "epoch": 27.47240051347882, + "grad_norm": 1.3648755550384521, + "learning_rate": 2.417928968763372e-05, + "loss": 0.3423, + "step": 21401 + }, + { + "epoch": 27.473684210526315, + "grad_norm": 0.9686069488525391, + "learning_rate": 2.4178861788617886e-05, + "loss": 0.3038, + "step": 21402 + }, + { + "epoch": 27.474967907573813, + "grad_norm": 1.088714838027954, + "learning_rate": 2.4178433889602054e-05, + "loss": 0.3332, + "step": 21403 + }, + { + "epoch": 27.47625160462131, + "grad_norm": 1.7799417972564697, + "learning_rate": 2.4178005990586223e-05, + "loss": 0.3056, + "step": 21404 + }, + { + "epoch": 27.477535301668805, + "grad_norm": 1.3906254768371582, + "learning_rate": 2.4177578091570388e-05, + "loss": 0.3403, + "step": 21405 + }, + { + "epoch": 27.478818998716303, + "grad_norm": 3.6715033054351807, + "learning_rate": 2.417715019255456e-05, + "loss": 0.3007, + "step": 21406 + }, + { + "epoch": 27.4801026957638, + "grad_norm": 1.7436586618423462, + "learning_rate": 2.4176722293538724e-05, + "loss": 0.3219, + "step": 21407 + }, + { + "epoch": 27.481386392811295, + "grad_norm": 
4.253046035766602, + "learning_rate": 2.4176294394522893e-05, + "loss": 0.3165, + "step": 21408 + }, + { + "epoch": 27.482670089858793, + "grad_norm": 1.9377782344818115, + "learning_rate": 2.417586649550706e-05, + "loss": 0.2973, + "step": 21409 + }, + { + "epoch": 27.48395378690629, + "grad_norm": 2.278919219970703, + "learning_rate": 2.417543859649123e-05, + "loss": 0.2992, + "step": 21410 + }, + { + "epoch": 27.485237483953785, + "grad_norm": 1.6850508451461792, + "learning_rate": 2.4175010697475395e-05, + "loss": 0.3245, + "step": 21411 + }, + { + "epoch": 27.486521181001283, + "grad_norm": 1.0245944261550903, + "learning_rate": 2.4174582798459563e-05, + "loss": 0.3125, + "step": 21412 + }, + { + "epoch": 27.48780487804878, + "grad_norm": 1.1634029150009155, + "learning_rate": 2.417415489944373e-05, + "loss": 0.3104, + "step": 21413 + }, + { + "epoch": 27.48908857509628, + "grad_norm": 1.305998682975769, + "learning_rate": 2.41737270004279e-05, + "loss": 0.334, + "step": 21414 + }, + { + "epoch": 27.490372272143773, + "grad_norm": 1.0826973915100098, + "learning_rate": 2.4173299101412068e-05, + "loss": 0.2967, + "step": 21415 + }, + { + "epoch": 27.49165596919127, + "grad_norm": 2.500277280807495, + "learning_rate": 2.4172871202396233e-05, + "loss": 0.3182, + "step": 21416 + }, + { + "epoch": 27.49293966623877, + "grad_norm": 1.8956663608551025, + "learning_rate": 2.4172443303380405e-05, + "loss": 0.3356, + "step": 21417 + }, + { + "epoch": 27.494223363286263, + "grad_norm": 1.5906107425689697, + "learning_rate": 2.417201540436457e-05, + "loss": 0.3446, + "step": 21418 + }, + { + "epoch": 27.49550706033376, + "grad_norm": 1.5047062635421753, + "learning_rate": 2.4171587505348735e-05, + "loss": 0.341, + "step": 21419 + }, + { + "epoch": 27.49679075738126, + "grad_norm": 1.7392088174819946, + "learning_rate": 2.4171159606332907e-05, + "loss": 0.3022, + "step": 21420 + }, + { + "epoch": 27.498074454428753, + "grad_norm": 1.1786237955093384, + "learning_rate": 
2.4170731707317072e-05, + "loss": 0.3349, + "step": 21421 + }, + { + "epoch": 27.49935815147625, + "grad_norm": 1.518707036972046, + "learning_rate": 2.4170303808301244e-05, + "loss": 0.3625, + "step": 21422 + }, + { + "epoch": 27.50064184852375, + "grad_norm": 1.243668794631958, + "learning_rate": 2.416987590928541e-05, + "loss": 0.3452, + "step": 21423 + }, + { + "epoch": 27.501925545571247, + "grad_norm": 2.4556126594543457, + "learning_rate": 2.4169448010269577e-05, + "loss": 0.3312, + "step": 21424 + }, + { + "epoch": 27.50320924261874, + "grad_norm": 1.3051817417144775, + "learning_rate": 2.4169020111253746e-05, + "loss": 0.3892, + "step": 21425 + }, + { + "epoch": 27.50449293966624, + "grad_norm": 5.197854042053223, + "learning_rate": 2.416859221223791e-05, + "loss": 0.354, + "step": 21426 + }, + { + "epoch": 27.505776636713737, + "grad_norm": 1.239782691001892, + "learning_rate": 2.416816431322208e-05, + "loss": 0.3537, + "step": 21427 + }, + { + "epoch": 27.50706033376123, + "grad_norm": 1.6324323415756226, + "learning_rate": 2.4167736414206247e-05, + "loss": 0.3624, + "step": 21428 + }, + { + "epoch": 27.50834403080873, + "grad_norm": 2.7665328979492188, + "learning_rate": 2.4167308515190416e-05, + "loss": 0.3725, + "step": 21429 + }, + { + "epoch": 27.509627727856227, + "grad_norm": 1.334524154663086, + "learning_rate": 2.4166880616174584e-05, + "loss": 0.3881, + "step": 21430 + }, + { + "epoch": 27.51091142490372, + "grad_norm": 1.4411859512329102, + "learning_rate": 2.4166452717158753e-05, + "loss": 0.3602, + "step": 21431 + }, + { + "epoch": 27.51219512195122, + "grad_norm": 2.0086007118225098, + "learning_rate": 2.4166024818142918e-05, + "loss": 0.4301, + "step": 21432 + }, + { + "epoch": 27.513478818998717, + "grad_norm": 1.455165982246399, + "learning_rate": 2.4165596919127086e-05, + "loss": 0.5495, + "step": 21433 + }, + { + "epoch": 27.514762516046215, + "grad_norm": 1.7422873973846436, + "learning_rate": 2.4165169020111255e-05, + "loss": 0.2944, 
+ "step": 21434 + }, + { + "epoch": 27.51604621309371, + "grad_norm": 1.1264641284942627, + "learning_rate": 2.416474112109542e-05, + "loss": 0.3001, + "step": 21435 + }, + { + "epoch": 27.517329910141207, + "grad_norm": 1.393190860748291, + "learning_rate": 2.416431322207959e-05, + "loss": 0.3158, + "step": 21436 + }, + { + "epoch": 27.518613607188705, + "grad_norm": 1.5211261510849, + "learning_rate": 2.4163885323063756e-05, + "loss": 0.3287, + "step": 21437 + }, + { + "epoch": 27.5198973042362, + "grad_norm": 1.0794179439544678, + "learning_rate": 2.4163457424047928e-05, + "loss": 0.321, + "step": 21438 + }, + { + "epoch": 27.521181001283697, + "grad_norm": 0.7603316307067871, + "learning_rate": 2.4163029525032093e-05, + "loss": 0.3109, + "step": 21439 + }, + { + "epoch": 27.522464698331195, + "grad_norm": 2.2379047870635986, + "learning_rate": 2.4162601626016258e-05, + "loss": 0.327, + "step": 21440 + }, + { + "epoch": 27.52374839537869, + "grad_norm": 2.523228168487549, + "learning_rate": 2.416217372700043e-05, + "loss": 0.3143, + "step": 21441 + }, + { + "epoch": 27.525032092426187, + "grad_norm": 1.5044922828674316, + "learning_rate": 2.4161745827984595e-05, + "loss": 0.3327, + "step": 21442 + }, + { + "epoch": 27.526315789473685, + "grad_norm": 1.2396037578582764, + "learning_rate": 2.4161317928968763e-05, + "loss": 0.3228, + "step": 21443 + }, + { + "epoch": 27.527599486521183, + "grad_norm": 1.2092950344085693, + "learning_rate": 2.4160890029952932e-05, + "loss": 0.3138, + "step": 21444 + }, + { + "epoch": 27.528883183568677, + "grad_norm": 13.556973457336426, + "learning_rate": 2.41604621309371e-05, + "loss": 0.3307, + "step": 21445 + }, + { + "epoch": 27.530166880616175, + "grad_norm": 2.29557728767395, + "learning_rate": 2.416003423192127e-05, + "loss": 0.2874, + "step": 21446 + }, + { + "epoch": 27.531450577663673, + "grad_norm": 5.024387359619141, + "learning_rate": 2.4159606332905434e-05, + "loss": 0.3426, + "step": 21447 + }, + { + "epoch": 
27.532734274711167, + "grad_norm": 1.0154484510421753, + "learning_rate": 2.4159178433889602e-05, + "loss": 0.3293, + "step": 21448 + }, + { + "epoch": 27.534017971758665, + "grad_norm": 1.6079936027526855, + "learning_rate": 2.415875053487377e-05, + "loss": 0.2966, + "step": 21449 + }, + { + "epoch": 27.535301668806163, + "grad_norm": 2.1325571537017822, + "learning_rate": 2.415832263585794e-05, + "loss": 0.3371, + "step": 21450 + }, + { + "epoch": 27.536585365853657, + "grad_norm": 1.0817365646362305, + "learning_rate": 2.4157894736842104e-05, + "loss": 0.3095, + "step": 21451 + }, + { + "epoch": 27.537869062901155, + "grad_norm": 4.216409683227539, + "learning_rate": 2.4157466837826276e-05, + "loss": 0.3232, + "step": 21452 + }, + { + "epoch": 27.539152759948653, + "grad_norm": 1.0365424156188965, + "learning_rate": 2.415703893881044e-05, + "loss": 0.3181, + "step": 21453 + }, + { + "epoch": 27.540436456996147, + "grad_norm": 1.0163062810897827, + "learning_rate": 2.415661103979461e-05, + "loss": 0.3135, + "step": 21454 + }, + { + "epoch": 27.541720154043645, + "grad_norm": 1.0008220672607422, + "learning_rate": 2.4156183140778778e-05, + "loss": 0.3199, + "step": 21455 + }, + { + "epoch": 27.543003851091143, + "grad_norm": 1.306530475616455, + "learning_rate": 2.4155755241762943e-05, + "loss": 0.2921, + "step": 21456 + }, + { + "epoch": 27.54428754813864, + "grad_norm": 1.4541828632354736, + "learning_rate": 2.4155327342747114e-05, + "loss": 0.3062, + "step": 21457 + }, + { + "epoch": 27.545571245186135, + "grad_norm": 1.8368452787399292, + "learning_rate": 2.415489944373128e-05, + "loss": 0.2999, + "step": 21458 + }, + { + "epoch": 27.546854942233633, + "grad_norm": 1.161112904548645, + "learning_rate": 2.4154471544715448e-05, + "loss": 0.356, + "step": 21459 + }, + { + "epoch": 27.54813863928113, + "grad_norm": 1.0380386114120483, + "learning_rate": 2.4154043645699616e-05, + "loss": 0.2759, + "step": 21460 + }, + { + "epoch": 27.549422336328625, + "grad_norm": 
1.5308825969696045, + "learning_rate": 2.415361574668378e-05, + "loss": 0.3218, + "step": 21461 + }, + { + "epoch": 27.550706033376123, + "grad_norm": 1.5118309259414673, + "learning_rate": 2.4153187847667953e-05, + "loss": 0.3293, + "step": 21462 + }, + { + "epoch": 27.55198973042362, + "grad_norm": 2.322317361831665, + "learning_rate": 2.4152759948652118e-05, + "loss": 0.298, + "step": 21463 + }, + { + "epoch": 27.553273427471115, + "grad_norm": 0.8505240082740784, + "learning_rate": 2.4152332049636287e-05, + "loss": 0.3034, + "step": 21464 + }, + { + "epoch": 27.554557124518613, + "grad_norm": 1.0516481399536133, + "learning_rate": 2.4151904150620455e-05, + "loss": 0.3158, + "step": 21465 + }, + { + "epoch": 27.55584082156611, + "grad_norm": 0.9289096593856812, + "learning_rate": 2.415147625160462e-05, + "loss": 0.2979, + "step": 21466 + }, + { + "epoch": 27.55712451861361, + "grad_norm": 1.4968171119689941, + "learning_rate": 2.415104835258879e-05, + "loss": 0.2983, + "step": 21467 + }, + { + "epoch": 27.558408215661103, + "grad_norm": 1.6085537672042847, + "learning_rate": 2.4150620453572957e-05, + "loss": 0.3101, + "step": 21468 + }, + { + "epoch": 27.5596919127086, + "grad_norm": 1.3258272409439087, + "learning_rate": 2.4150192554557125e-05, + "loss": 0.3172, + "step": 21469 + }, + { + "epoch": 27.5609756097561, + "grad_norm": 1.0944610834121704, + "learning_rate": 2.4149764655541294e-05, + "loss": 0.3432, + "step": 21470 + }, + { + "epoch": 27.562259306803593, + "grad_norm": 1.3417279720306396, + "learning_rate": 2.4149336756525462e-05, + "loss": 0.3073, + "step": 21471 + }, + { + "epoch": 27.56354300385109, + "grad_norm": 1.571533203125, + "learning_rate": 2.4148908857509627e-05, + "loss": 0.316, + "step": 21472 + }, + { + "epoch": 27.56482670089859, + "grad_norm": 1.3631035089492798, + "learning_rate": 2.4148480958493796e-05, + "loss": 0.3596, + "step": 21473 + }, + { + "epoch": 27.566110397946083, + "grad_norm": 2.735804319381714, + "learning_rate": 
2.4148053059477964e-05, + "loss": 0.3658, + "step": 21474 + }, + { + "epoch": 27.56739409499358, + "grad_norm": 1.587810754776001, + "learning_rate": 2.414762516046213e-05, + "loss": 0.3485, + "step": 21475 + }, + { + "epoch": 27.56867779204108, + "grad_norm": 3.7156026363372803, + "learning_rate": 2.41471972614463e-05, + "loss": 0.3142, + "step": 21476 + }, + { + "epoch": 27.569961489088577, + "grad_norm": 1.6817448139190674, + "learning_rate": 2.4146769362430466e-05, + "loss": 0.3906, + "step": 21477 + }, + { + "epoch": 27.57124518613607, + "grad_norm": 3.101588010787964, + "learning_rate": 2.4146341463414638e-05, + "loss": 0.3369, + "step": 21478 + }, + { + "epoch": 27.57252888318357, + "grad_norm": 1.8913935422897339, + "learning_rate": 2.4145913564398803e-05, + "loss": 0.3779, + "step": 21479 + }, + { + "epoch": 27.573812580231067, + "grad_norm": 2.146216630935669, + "learning_rate": 2.4145485665382968e-05, + "loss": 0.3428, + "step": 21480 + }, + { + "epoch": 27.57509627727856, + "grad_norm": 1.3900760412216187, + "learning_rate": 2.414505776636714e-05, + "loss": 0.3921, + "step": 21481 + }, + { + "epoch": 27.57637997432606, + "grad_norm": 5.385575294494629, + "learning_rate": 2.4144629867351304e-05, + "loss": 0.438, + "step": 21482 + }, + { + "epoch": 27.577663671373557, + "grad_norm": 3.9441771507263184, + "learning_rate": 2.4144201968335473e-05, + "loss": 0.5335, + "step": 21483 + }, + { + "epoch": 27.57894736842105, + "grad_norm": 1.9218111038208008, + "learning_rate": 2.414377406931964e-05, + "loss": 0.3689, + "step": 21484 + }, + { + "epoch": 27.58023106546855, + "grad_norm": 0.8779562711715698, + "learning_rate": 2.414334617030381e-05, + "loss": 0.3307, + "step": 21485 + }, + { + "epoch": 27.581514762516047, + "grad_norm": 1.2325330972671509, + "learning_rate": 2.4142918271287978e-05, + "loss": 0.3434, + "step": 21486 + }, + { + "epoch": 27.58279845956354, + "grad_norm": 1.477997064590454, + "learning_rate": 2.4142490372272143e-05, + "loss": 0.335, + 
"step": 21487 + }, + { + "epoch": 27.58408215661104, + "grad_norm": 2.1944477558135986, + "learning_rate": 2.414206247325631e-05, + "loss": 0.3349, + "step": 21488 + }, + { + "epoch": 27.585365853658537, + "grad_norm": 1.3933892250061035, + "learning_rate": 2.414163457424048e-05, + "loss": 0.2898, + "step": 21489 + }, + { + "epoch": 27.586649550706035, + "grad_norm": 1.3547239303588867, + "learning_rate": 2.414120667522465e-05, + "loss": 0.3275, + "step": 21490 + }, + { + "epoch": 27.58793324775353, + "grad_norm": 3.07384991645813, + "learning_rate": 2.4140778776208813e-05, + "loss": 0.2936, + "step": 21491 + }, + { + "epoch": 27.589216944801027, + "grad_norm": 1.0004627704620361, + "learning_rate": 2.4140350877192985e-05, + "loss": 0.2948, + "step": 21492 + }, + { + "epoch": 27.590500641848525, + "grad_norm": 0.8311821818351746, + "learning_rate": 2.413992297817715e-05, + "loss": 0.3151, + "step": 21493 + }, + { + "epoch": 27.59178433889602, + "grad_norm": 1.1455600261688232, + "learning_rate": 2.413949507916132e-05, + "loss": 0.3092, + "step": 21494 + }, + { + "epoch": 27.593068035943517, + "grad_norm": 0.8467093110084534, + "learning_rate": 2.4139067180145487e-05, + "loss": 0.3139, + "step": 21495 + }, + { + "epoch": 27.594351732991015, + "grad_norm": 1.084397792816162, + "learning_rate": 2.4138639281129652e-05, + "loss": 0.3473, + "step": 21496 + }, + { + "epoch": 27.59563543003851, + "grad_norm": 1.4955682754516602, + "learning_rate": 2.4138211382113824e-05, + "loss": 0.3319, + "step": 21497 + }, + { + "epoch": 27.596919127086007, + "grad_norm": 1.2030752897262573, + "learning_rate": 2.413778348309799e-05, + "loss": 0.303, + "step": 21498 + }, + { + "epoch": 27.598202824133505, + "grad_norm": 0.9637071490287781, + "learning_rate": 2.4137355584082157e-05, + "loss": 0.3475, + "step": 21499 + }, + { + "epoch": 27.599486521181003, + "grad_norm": 1.2813849449157715, + "learning_rate": 2.4136927685066326e-05, + "loss": 0.3166, + "step": 21500 + }, + { + "epoch": 
27.600770218228497, + "grad_norm": 1.4173450469970703, + "learning_rate": 2.413649978605049e-05, + "loss": 0.3148, + "step": 21501 + }, + { + "epoch": 27.602053915275995, + "grad_norm": 1.4337221384048462, + "learning_rate": 2.4136071887034663e-05, + "loss": 0.3228, + "step": 21502 + }, + { + "epoch": 27.603337612323493, + "grad_norm": 2.108581304550171, + "learning_rate": 2.4135643988018828e-05, + "loss": 0.319, + "step": 21503 + }, + { + "epoch": 27.604621309370987, + "grad_norm": 0.9275622367858887, + "learning_rate": 2.4135216089002996e-05, + "loss": 0.3014, + "step": 21504 + }, + { + "epoch": 27.605905006418485, + "grad_norm": 2.365949869155884, + "learning_rate": 2.4134788189987164e-05, + "loss": 0.3009, + "step": 21505 + }, + { + "epoch": 27.607188703465983, + "grad_norm": 1.4131718873977661, + "learning_rate": 2.4134360290971333e-05, + "loss": 0.3266, + "step": 21506 + }, + { + "epoch": 27.608472400513477, + "grad_norm": 7.766703128814697, + "learning_rate": 2.4133932391955498e-05, + "loss": 0.2951, + "step": 21507 + }, + { + "epoch": 27.609756097560975, + "grad_norm": 1.591921091079712, + "learning_rate": 2.4133504492939666e-05, + "loss": 0.3117, + "step": 21508 + }, + { + "epoch": 27.611039794608473, + "grad_norm": 1.823159098625183, + "learning_rate": 2.4133076593923835e-05, + "loss": 0.3148, + "step": 21509 + }, + { + "epoch": 27.61232349165597, + "grad_norm": 0.9895274043083191, + "learning_rate": 2.4132648694908003e-05, + "loss": 0.295, + "step": 21510 + }, + { + "epoch": 27.613607188703465, + "grad_norm": 1.137740969657898, + "learning_rate": 2.413222079589217e-05, + "loss": 0.3117, + "step": 21511 + }, + { + "epoch": 27.614890885750963, + "grad_norm": 1.5074342489242554, + "learning_rate": 2.4131792896876336e-05, + "loss": 0.3032, + "step": 21512 + }, + { + "epoch": 27.61617458279846, + "grad_norm": 1.627795696258545, + "learning_rate": 2.4131364997860508e-05, + "loss": 0.3468, + "step": 21513 + }, + { + "epoch": 27.617458279845955, + "grad_norm": 
1.457671046257019, + "learning_rate": 2.4130937098844673e-05, + "loss": 0.3212, + "step": 21514 + }, + { + "epoch": 27.618741976893453, + "grad_norm": 1.3914819955825806, + "learning_rate": 2.413050919982884e-05, + "loss": 0.3035, + "step": 21515 + }, + { + "epoch": 27.62002567394095, + "grad_norm": 1.3685612678527832, + "learning_rate": 2.413008130081301e-05, + "loss": 0.3672, + "step": 21516 + }, + { + "epoch": 27.621309370988445, + "grad_norm": 1.9230931997299194, + "learning_rate": 2.4129653401797175e-05, + "loss": 0.3218, + "step": 21517 + }, + { + "epoch": 27.622593068035943, + "grad_norm": 1.9921557903289795, + "learning_rate": 2.4129225502781347e-05, + "loss": 0.3403, + "step": 21518 + }, + { + "epoch": 27.62387676508344, + "grad_norm": 1.280705451965332, + "learning_rate": 2.4128797603765512e-05, + "loss": 0.3299, + "step": 21519 + }, + { + "epoch": 27.625160462130935, + "grad_norm": 1.6428145170211792, + "learning_rate": 2.412836970474968e-05, + "loss": 0.3331, + "step": 21520 + }, + { + "epoch": 27.626444159178433, + "grad_norm": 1.7468491792678833, + "learning_rate": 2.412794180573385e-05, + "loss": 0.362, + "step": 21521 + }, + { + "epoch": 27.62772785622593, + "grad_norm": 9.154139518737793, + "learning_rate": 2.4127513906718014e-05, + "loss": 0.3494, + "step": 21522 + }, + { + "epoch": 27.62901155327343, + "grad_norm": 2.265488386154175, + "learning_rate": 2.4127086007702182e-05, + "loss": 0.3444, + "step": 21523 + }, + { + "epoch": 27.630295250320923, + "grad_norm": 1.514542818069458, + "learning_rate": 2.412665810868635e-05, + "loss": 0.3385, + "step": 21524 + }, + { + "epoch": 27.63157894736842, + "grad_norm": 2.4469852447509766, + "learning_rate": 2.412623020967052e-05, + "loss": 0.335, + "step": 21525 + }, + { + "epoch": 27.63286264441592, + "grad_norm": 1.13747239112854, + "learning_rate": 2.4125802310654687e-05, + "loss": 0.343, + "step": 21526 + }, + { + "epoch": 27.634146341463413, + "grad_norm": 1.9624972343444824, + "learning_rate": 
2.4125374411638852e-05, + "loss": 0.3769, + "step": 21527 + }, + { + "epoch": 27.63543003851091, + "grad_norm": 4.505080699920654, + "learning_rate": 2.412494651262302e-05, + "loss": 0.3743, + "step": 21528 + }, + { + "epoch": 27.63671373555841, + "grad_norm": 1.7241770029067993, + "learning_rate": 2.412451861360719e-05, + "loss": 0.3623, + "step": 21529 + }, + { + "epoch": 27.637997432605903, + "grad_norm": 1.3853791952133179, + "learning_rate": 2.4124090714591358e-05, + "loss": 0.3912, + "step": 21530 + }, + { + "epoch": 27.6392811296534, + "grad_norm": 1.53256094455719, + "learning_rate": 2.4123662815575523e-05, + "loss": 0.3966, + "step": 21531 + }, + { + "epoch": 27.6405648267009, + "grad_norm": 10.900014877319336, + "learning_rate": 2.4123234916559695e-05, + "loss": 0.4336, + "step": 21532 + }, + { + "epoch": 27.641848523748397, + "grad_norm": 2.598047971725464, + "learning_rate": 2.412280701754386e-05, + "loss": 0.5196, + "step": 21533 + }, + { + "epoch": 27.64313222079589, + "grad_norm": 0.995383620262146, + "learning_rate": 2.4122379118528028e-05, + "loss": 0.3196, + "step": 21534 + }, + { + "epoch": 27.64441591784339, + "grad_norm": 0.9072656035423279, + "learning_rate": 2.4121951219512196e-05, + "loss": 0.2987, + "step": 21535 + }, + { + "epoch": 27.645699614890887, + "grad_norm": 1.433354139328003, + "learning_rate": 2.412152332049636e-05, + "loss": 0.3005, + "step": 21536 + }, + { + "epoch": 27.64698331193838, + "grad_norm": 2.364135503768921, + "learning_rate": 2.4121095421480533e-05, + "loss": 0.3242, + "step": 21537 + }, + { + "epoch": 27.64826700898588, + "grad_norm": 1.2233904600143433, + "learning_rate": 2.4120667522464698e-05, + "loss": 0.2954, + "step": 21538 + }, + { + "epoch": 27.649550706033377, + "grad_norm": 1.03350830078125, + "learning_rate": 2.4120239623448867e-05, + "loss": 0.3153, + "step": 21539 + }, + { + "epoch": 27.65083440308087, + "grad_norm": 1.2514859437942505, + "learning_rate": 2.4119811724433035e-05, + "loss": 0.3369, + 
"step": 21540 + }, + { + "epoch": 27.65211810012837, + "grad_norm": 2.084606647491455, + "learning_rate": 2.41193838254172e-05, + "loss": 0.326, + "step": 21541 + }, + { + "epoch": 27.653401797175867, + "grad_norm": 2.114854574203491, + "learning_rate": 2.4118955926401372e-05, + "loss": 0.3661, + "step": 21542 + }, + { + "epoch": 27.654685494223365, + "grad_norm": 0.9954802989959717, + "learning_rate": 2.4118528027385537e-05, + "loss": 0.333, + "step": 21543 + }, + { + "epoch": 27.65596919127086, + "grad_norm": 1.3115992546081543, + "learning_rate": 2.4118100128369705e-05, + "loss": 0.3162, + "step": 21544 + }, + { + "epoch": 27.657252888318357, + "grad_norm": 0.999255895614624, + "learning_rate": 2.4117672229353874e-05, + "loss": 0.3382, + "step": 21545 + }, + { + "epoch": 27.658536585365855, + "grad_norm": 1.954458475112915, + "learning_rate": 2.4117244330338042e-05, + "loss": 0.3012, + "step": 21546 + }, + { + "epoch": 27.65982028241335, + "grad_norm": 0.8525649905204773, + "learning_rate": 2.4116816431322207e-05, + "loss": 0.3151, + "step": 21547 + }, + { + "epoch": 27.661103979460847, + "grad_norm": 1.1180236339569092, + "learning_rate": 2.4116388532306376e-05, + "loss": 0.3224, + "step": 21548 + }, + { + "epoch": 27.662387676508345, + "grad_norm": 1.076082468032837, + "learning_rate": 2.4115960633290544e-05, + "loss": 0.2986, + "step": 21549 + }, + { + "epoch": 27.66367137355584, + "grad_norm": 1.0308339595794678, + "learning_rate": 2.4115532734274712e-05, + "loss": 0.2995, + "step": 21550 + }, + { + "epoch": 27.664955070603337, + "grad_norm": 0.824935257434845, + "learning_rate": 2.411510483525888e-05, + "loss": 0.3015, + "step": 21551 + }, + { + "epoch": 27.666238767650835, + "grad_norm": 1.2989004850387573, + "learning_rate": 2.4114676936243046e-05, + "loss": 0.342, + "step": 21552 + }, + { + "epoch": 27.66752246469833, + "grad_norm": 2.147799253463745, + "learning_rate": 2.4114249037227218e-05, + "loss": 0.3143, + "step": 21553 + }, + { + "epoch": 
27.668806161745827, + "grad_norm": 1.133109450340271, + "learning_rate": 2.4113821138211383e-05, + "loss": 0.3266, + "step": 21554 + }, + { + "epoch": 27.670089858793325, + "grad_norm": 1.2815955877304077, + "learning_rate": 2.4113393239195548e-05, + "loss": 0.3017, + "step": 21555 + }, + { + "epoch": 27.671373555840823, + "grad_norm": 1.7204657793045044, + "learning_rate": 2.411296534017972e-05, + "loss": 0.3447, + "step": 21556 + }, + { + "epoch": 27.672657252888317, + "grad_norm": 0.8345543742179871, + "learning_rate": 2.4112537441163884e-05, + "loss": 0.3068, + "step": 21557 + }, + { + "epoch": 27.673940949935815, + "grad_norm": 1.1421175003051758, + "learning_rate": 2.4112109542148056e-05, + "loss": 0.3165, + "step": 21558 + }, + { + "epoch": 27.675224646983313, + "grad_norm": 0.9922741651535034, + "learning_rate": 2.411168164313222e-05, + "loss": 0.3071, + "step": 21559 + }, + { + "epoch": 27.676508344030808, + "grad_norm": 1.7321656942367554, + "learning_rate": 2.411125374411639e-05, + "loss": 0.3111, + "step": 21560 + }, + { + "epoch": 27.677792041078305, + "grad_norm": 1.3501238822937012, + "learning_rate": 2.4110825845100558e-05, + "loss": 0.322, + "step": 21561 + }, + { + "epoch": 27.679075738125803, + "grad_norm": 1.1169884204864502, + "learning_rate": 2.4110397946084723e-05, + "loss": 0.3368, + "step": 21562 + }, + { + "epoch": 27.680359435173298, + "grad_norm": 2.03898024559021, + "learning_rate": 2.410997004706889e-05, + "loss": 0.3333, + "step": 21563 + }, + { + "epoch": 27.681643132220795, + "grad_norm": 1.3375203609466553, + "learning_rate": 2.410954214805306e-05, + "loss": 0.3177, + "step": 21564 + }, + { + "epoch": 27.682926829268293, + "grad_norm": 3.8747870922088623, + "learning_rate": 2.410911424903723e-05, + "loss": 0.3095, + "step": 21565 + }, + { + "epoch": 27.68421052631579, + "grad_norm": 1.186700701713562, + "learning_rate": 2.4108686350021397e-05, + "loss": 0.3314, + "step": 21566 + }, + { + "epoch": 27.685494223363285, + "grad_norm": 
0.9944971799850464, + "learning_rate": 2.4108258451005565e-05, + "loss": 0.3625, + "step": 21567 + }, + { + "epoch": 27.686777920410783, + "grad_norm": 1.6430511474609375, + "learning_rate": 2.410783055198973e-05, + "loss": 0.3228, + "step": 21568 + }, + { + "epoch": 27.68806161745828, + "grad_norm": 1.5226961374282837, + "learning_rate": 2.41074026529739e-05, + "loss": 0.3245, + "step": 21569 + }, + { + "epoch": 27.689345314505776, + "grad_norm": 1.1136066913604736, + "learning_rate": 2.4106974753958067e-05, + "loss": 0.3366, + "step": 21570 + }, + { + "epoch": 27.690629011553273, + "grad_norm": 1.7121676206588745, + "learning_rate": 2.4106546854942232e-05, + "loss": 0.3381, + "step": 21571 + }, + { + "epoch": 27.69191270860077, + "grad_norm": 1.2313172817230225, + "learning_rate": 2.4106118955926404e-05, + "loss": 0.2848, + "step": 21572 + }, + { + "epoch": 27.693196405648266, + "grad_norm": 1.2185947895050049, + "learning_rate": 2.410569105691057e-05, + "loss": 0.353, + "step": 21573 + }, + { + "epoch": 27.694480102695763, + "grad_norm": 1.0058231353759766, + "learning_rate": 2.410526315789474e-05, + "loss": 0.3379, + "step": 21574 + }, + { + "epoch": 27.69576379974326, + "grad_norm": 2.0037589073181152, + "learning_rate": 2.4104835258878906e-05, + "loss": 0.4082, + "step": 21575 + }, + { + "epoch": 27.69704749679076, + "grad_norm": 1.027636170387268, + "learning_rate": 2.410440735986307e-05, + "loss": 0.3224, + "step": 21576 + }, + { + "epoch": 27.698331193838253, + "grad_norm": 1.7690993547439575, + "learning_rate": 2.4103979460847243e-05, + "loss": 0.3596, + "step": 21577 + }, + { + "epoch": 27.69961489088575, + "grad_norm": 1.1951614618301392, + "learning_rate": 2.4103551561831408e-05, + "loss": 0.3183, + "step": 21578 + }, + { + "epoch": 27.70089858793325, + "grad_norm": 1.8776289224624634, + "learning_rate": 2.4103123662815576e-05, + "loss": 0.347, + "step": 21579 + }, + { + "epoch": 27.702182284980744, + "grad_norm": 1.8911809921264648, + "learning_rate": 
2.4102695763799744e-05, + "loss": 0.3533, + "step": 21580 + }, + { + "epoch": 27.70346598202824, + "grad_norm": 1.9297605752944946, + "learning_rate": 2.4102267864783913e-05, + "loss": 0.4435, + "step": 21581 + }, + { + "epoch": 27.70474967907574, + "grad_norm": 1.5993902683258057, + "learning_rate": 2.4101839965768078e-05, + "loss": 0.3936, + "step": 21582 + }, + { + "epoch": 27.706033376123234, + "grad_norm": 2.151665449142456, + "learning_rate": 2.4101412066752246e-05, + "loss": 0.5403, + "step": 21583 + }, + { + "epoch": 27.70731707317073, + "grad_norm": 0.8288568258285522, + "learning_rate": 2.4100984167736415e-05, + "loss": 0.3386, + "step": 21584 + }, + { + "epoch": 27.70860077021823, + "grad_norm": 1.0098735094070435, + "learning_rate": 2.4100556268720583e-05, + "loss": 0.3385, + "step": 21585 + }, + { + "epoch": 27.709884467265724, + "grad_norm": 0.8391786813735962, + "learning_rate": 2.410012836970475e-05, + "loss": 0.3183, + "step": 21586 + }, + { + "epoch": 27.71116816431322, + "grad_norm": 0.8652905821800232, + "learning_rate": 2.4099700470688917e-05, + "loss": 0.3333, + "step": 21587 + }, + { + "epoch": 27.71245186136072, + "grad_norm": 1.2292611598968506, + "learning_rate": 2.4099272571673085e-05, + "loss": 0.3023, + "step": 21588 + }, + { + "epoch": 27.713735558408217, + "grad_norm": 1.1559951305389404, + "learning_rate": 2.4098844672657253e-05, + "loss": 0.3218, + "step": 21589 + }, + { + "epoch": 27.71501925545571, + "grad_norm": 1.0840555429458618, + "learning_rate": 2.409841677364142e-05, + "loss": 0.3038, + "step": 21590 + }, + { + "epoch": 27.71630295250321, + "grad_norm": 6.570918083190918, + "learning_rate": 2.409798887462559e-05, + "loss": 0.3268, + "step": 21591 + }, + { + "epoch": 27.717586649550707, + "grad_norm": 1.0073662996292114, + "learning_rate": 2.4097560975609755e-05, + "loss": 0.3246, + "step": 21592 + }, + { + "epoch": 27.7188703465982, + "grad_norm": 0.955643355846405, + "learning_rate": 2.4097133076593927e-05, + "loss": 
0.3099, + "step": 21593 + }, + { + "epoch": 27.7201540436457, + "grad_norm": 0.9594277739524841, + "learning_rate": 2.4096705177578092e-05, + "loss": 0.2926, + "step": 21594 + }, + { + "epoch": 27.721437740693197, + "grad_norm": 0.8963575959205627, + "learning_rate": 2.4096277278562257e-05, + "loss": 0.2928, + "step": 21595 + }, + { + "epoch": 27.72272143774069, + "grad_norm": 1.7231253385543823, + "learning_rate": 2.409584937954643e-05, + "loss": 0.3288, + "step": 21596 + }, + { + "epoch": 27.72400513478819, + "grad_norm": 1.1285070180892944, + "learning_rate": 2.4095421480530594e-05, + "loss": 0.3237, + "step": 21597 + }, + { + "epoch": 27.725288831835687, + "grad_norm": 2.2009081840515137, + "learning_rate": 2.4094993581514762e-05, + "loss": 0.3279, + "step": 21598 + }, + { + "epoch": 27.726572528883185, + "grad_norm": 1.9969874620437622, + "learning_rate": 2.409456568249893e-05, + "loss": 0.3233, + "step": 21599 + }, + { + "epoch": 27.72785622593068, + "grad_norm": 0.9234591722488403, + "learning_rate": 2.40941377834831e-05, + "loss": 0.3141, + "step": 21600 + }, + { + "epoch": 27.729139922978177, + "grad_norm": 2.563279628753662, + "learning_rate": 2.4093709884467268e-05, + "loss": 0.2633, + "step": 21601 + }, + { + "epoch": 27.730423620025675, + "grad_norm": 2.1626100540161133, + "learning_rate": 2.4093281985451433e-05, + "loss": 0.3196, + "step": 21602 + }, + { + "epoch": 27.73170731707317, + "grad_norm": 1.9876898527145386, + "learning_rate": 2.40928540864356e-05, + "loss": 0.357, + "step": 21603 + }, + { + "epoch": 27.732991014120667, + "grad_norm": 1.1779347658157349, + "learning_rate": 2.409242618741977e-05, + "loss": 0.3098, + "step": 21604 + }, + { + "epoch": 27.734274711168165, + "grad_norm": 0.7992597818374634, + "learning_rate": 2.4091998288403938e-05, + "loss": 0.3089, + "step": 21605 + }, + { + "epoch": 27.73555840821566, + "grad_norm": 0.9435961842536926, + "learning_rate": 2.4091570389388103e-05, + "loss": 0.3403, + "step": 21606 + }, + { + 
"epoch": 27.736842105263158, + "grad_norm": 5.31666898727417, + "learning_rate": 2.4091142490372275e-05, + "loss": 0.3508, + "step": 21607 + }, + { + "epoch": 27.738125802310655, + "grad_norm": 1.1156895160675049, + "learning_rate": 2.409071459135644e-05, + "loss": 0.304, + "step": 21608 + }, + { + "epoch": 27.739409499358153, + "grad_norm": 1.0840173959732056, + "learning_rate": 2.4090286692340608e-05, + "loss": 0.3171, + "step": 21609 + }, + { + "epoch": 27.740693196405648, + "grad_norm": 2.493351697921753, + "learning_rate": 2.4089858793324776e-05, + "loss": 0.3434, + "step": 21610 + }, + { + "epoch": 27.741976893453145, + "grad_norm": 4.423862457275391, + "learning_rate": 2.408943089430894e-05, + "loss": 0.299, + "step": 21611 + }, + { + "epoch": 27.743260590500643, + "grad_norm": 4.145283222198486, + "learning_rate": 2.4089002995293113e-05, + "loss": 0.2978, + "step": 21612 + }, + { + "epoch": 27.744544287548138, + "grad_norm": 1.8557192087173462, + "learning_rate": 2.4088575096277278e-05, + "loss": 0.3239, + "step": 21613 + }, + { + "epoch": 27.745827984595635, + "grad_norm": 1.5494554042816162, + "learning_rate": 2.4088147197261447e-05, + "loss": 0.3172, + "step": 21614 + }, + { + "epoch": 27.747111681643133, + "grad_norm": 1.5109672546386719, + "learning_rate": 2.4087719298245615e-05, + "loss": 0.3241, + "step": 21615 + }, + { + "epoch": 27.748395378690628, + "grad_norm": 2.751864433288574, + "learning_rate": 2.408729139922978e-05, + "loss": 0.3121, + "step": 21616 + }, + { + "epoch": 27.749679075738126, + "grad_norm": 2.9035069942474365, + "learning_rate": 2.4086863500213952e-05, + "loss": 0.3365, + "step": 21617 + }, + { + "epoch": 27.750962772785623, + "grad_norm": 0.9775329828262329, + "learning_rate": 2.4086435601198117e-05, + "loss": 0.3679, + "step": 21618 + }, + { + "epoch": 27.752246469833118, + "grad_norm": 1.4541476964950562, + "learning_rate": 2.4086007702182285e-05, + "loss": 0.3656, + "step": 21619 + }, + { + "epoch": 27.753530166880616, + 
"grad_norm": 1.317460060119629, + "learning_rate": 2.4085579803166454e-05, + "loss": 0.3339, + "step": 21620 + }, + { + "epoch": 27.754813863928113, + "grad_norm": 1.5009150505065918, + "learning_rate": 2.4085151904150622e-05, + "loss": 0.3544, + "step": 21621 + }, + { + "epoch": 27.75609756097561, + "grad_norm": 1.1509301662445068, + "learning_rate": 2.4084724005134787e-05, + "loss": 0.308, + "step": 21622 + }, + { + "epoch": 27.757381258023106, + "grad_norm": 1.4320592880249023, + "learning_rate": 2.4084296106118956e-05, + "loss": 0.2941, + "step": 21623 + }, + { + "epoch": 27.758664955070603, + "grad_norm": 2.182749032974243, + "learning_rate": 2.4083868207103124e-05, + "loss": 0.3458, + "step": 21624 + }, + { + "epoch": 27.7599486521181, + "grad_norm": 1.825188159942627, + "learning_rate": 2.4083440308087292e-05, + "loss": 0.337, + "step": 21625 + }, + { + "epoch": 27.761232349165596, + "grad_norm": 1.279483437538147, + "learning_rate": 2.408301240907146e-05, + "loss": 0.3528, + "step": 21626 + }, + { + "epoch": 27.762516046213094, + "grad_norm": 8.021845817565918, + "learning_rate": 2.4082584510055626e-05, + "loss": 0.3411, + "step": 21627 + }, + { + "epoch": 27.76379974326059, + "grad_norm": 1.340144157409668, + "learning_rate": 2.4082156611039798e-05, + "loss": 0.3737, + "step": 21628 + }, + { + "epoch": 27.765083440308086, + "grad_norm": 1.378119945526123, + "learning_rate": 2.4081728712023963e-05, + "loss": 0.3976, + "step": 21629 + }, + { + "epoch": 27.766367137355584, + "grad_norm": 1.9186047315597534, + "learning_rate": 2.4081300813008128e-05, + "loss": 0.3486, + "step": 21630 + }, + { + "epoch": 27.76765083440308, + "grad_norm": 1.8994293212890625, + "learning_rate": 2.40808729139923e-05, + "loss": 0.4252, + "step": 21631 + }, + { + "epoch": 27.76893453145058, + "grad_norm": 2.4542694091796875, + "learning_rate": 2.4080445014976465e-05, + "loss": 0.434, + "step": 21632 + }, + { + "epoch": 27.770218228498074, + "grad_norm": 1.9358183145523071, + 
"learning_rate": 2.4080017115960636e-05, + "loss": 0.4989, + "step": 21633 + }, + { + "epoch": 27.77150192554557, + "grad_norm": 0.7924365997314453, + "learning_rate": 2.40795892169448e-05, + "loss": 0.3005, + "step": 21634 + }, + { + "epoch": 27.77278562259307, + "grad_norm": 1.8460345268249512, + "learning_rate": 2.407916131792897e-05, + "loss": 0.3305, + "step": 21635 + }, + { + "epoch": 27.774069319640564, + "grad_norm": 1.3298897743225098, + "learning_rate": 2.4078733418913138e-05, + "loss": 0.3501, + "step": 21636 + }, + { + "epoch": 27.77535301668806, + "grad_norm": 1.0404486656188965, + "learning_rate": 2.4078305519897303e-05, + "loss": 0.3062, + "step": 21637 + }, + { + "epoch": 27.77663671373556, + "grad_norm": 1.362786889076233, + "learning_rate": 2.407787762088147e-05, + "loss": 0.3013, + "step": 21638 + }, + { + "epoch": 27.777920410783054, + "grad_norm": 1.7872161865234375, + "learning_rate": 2.407744972186564e-05, + "loss": 0.2946, + "step": 21639 + }, + { + "epoch": 27.77920410783055, + "grad_norm": 2.279736280441284, + "learning_rate": 2.407702182284981e-05, + "loss": 0.3492, + "step": 21640 + }, + { + "epoch": 27.78048780487805, + "grad_norm": 1.0877851247787476, + "learning_rate": 2.4076593923833977e-05, + "loss": 0.3267, + "step": 21641 + }, + { + "epoch": 27.781771501925547, + "grad_norm": 1.7171471118927002, + "learning_rate": 2.4076166024818145e-05, + "loss": 0.3269, + "step": 21642 + }, + { + "epoch": 27.78305519897304, + "grad_norm": 0.9395342469215393, + "learning_rate": 2.407573812580231e-05, + "loss": 0.3104, + "step": 21643 + }, + { + "epoch": 27.78433889602054, + "grad_norm": 1.5922200679779053, + "learning_rate": 2.407531022678648e-05, + "loss": 0.3669, + "step": 21644 + }, + { + "epoch": 27.785622593068037, + "grad_norm": 2.2391092777252197, + "learning_rate": 2.4074882327770647e-05, + "loss": 0.331, + "step": 21645 + }, + { + "epoch": 27.78690629011553, + "grad_norm": 1.0106412172317505, + "learning_rate": 2.4074454428754812e-05, + 
"loss": 0.302, + "step": 21646 + }, + { + "epoch": 27.78818998716303, + "grad_norm": 0.8960406184196472, + "learning_rate": 2.4074026529738984e-05, + "loss": 0.3328, + "step": 21647 + }, + { + "epoch": 27.789473684210527, + "grad_norm": 1.31252121925354, + "learning_rate": 2.407359863072315e-05, + "loss": 0.292, + "step": 21648 + }, + { + "epoch": 27.79075738125802, + "grad_norm": 1.7645622491836548, + "learning_rate": 2.4073170731707317e-05, + "loss": 0.3275, + "step": 21649 + }, + { + "epoch": 27.79204107830552, + "grad_norm": 0.9341391324996948, + "learning_rate": 2.4072742832691486e-05, + "loss": 0.3311, + "step": 21650 + }, + { + "epoch": 27.793324775353017, + "grad_norm": 1.4792815446853638, + "learning_rate": 2.407231493367565e-05, + "loss": 0.3036, + "step": 21651 + }, + { + "epoch": 27.794608472400512, + "grad_norm": 4.089712619781494, + "learning_rate": 2.4071887034659823e-05, + "loss": 0.31, + "step": 21652 + }, + { + "epoch": 27.79589216944801, + "grad_norm": 1.1529022455215454, + "learning_rate": 2.4071459135643988e-05, + "loss": 0.3243, + "step": 21653 + }, + { + "epoch": 27.797175866495508, + "grad_norm": 1.1824688911437988, + "learning_rate": 2.4071031236628156e-05, + "loss": 0.3105, + "step": 21654 + }, + { + "epoch": 27.798459563543005, + "grad_norm": 3.814262628555298, + "learning_rate": 2.4070603337612324e-05, + "loss": 0.2946, + "step": 21655 + }, + { + "epoch": 27.7997432605905, + "grad_norm": 0.9383294582366943, + "learning_rate": 2.407017543859649e-05, + "loss": 0.2904, + "step": 21656 + }, + { + "epoch": 27.801026957637998, + "grad_norm": 1.3502800464630127, + "learning_rate": 2.406974753958066e-05, + "loss": 0.321, + "step": 21657 + }, + { + "epoch": 27.802310654685495, + "grad_norm": 3.4424829483032227, + "learning_rate": 2.4069319640564826e-05, + "loss": 0.3096, + "step": 21658 + }, + { + "epoch": 27.80359435173299, + "grad_norm": 1.758421778678894, + "learning_rate": 2.4068891741548995e-05, + "loss": 0.317, + "step": 21659 + }, + { + 
"epoch": 27.804878048780488, + "grad_norm": 1.882686734199524, + "learning_rate": 2.4068463842533163e-05, + "loss": 0.305, + "step": 21660 + }, + { + "epoch": 27.806161745827985, + "grad_norm": 2.824578046798706, + "learning_rate": 2.406803594351733e-05, + "loss": 0.3013, + "step": 21661 + }, + { + "epoch": 27.80744544287548, + "grad_norm": 2.2060556411743164, + "learning_rate": 2.4067608044501497e-05, + "loss": 0.3361, + "step": 21662 + }, + { + "epoch": 27.808729139922978, + "grad_norm": 1.327705979347229, + "learning_rate": 2.4067180145485665e-05, + "loss": 0.312, + "step": 21663 + }, + { + "epoch": 27.810012836970476, + "grad_norm": 1.392676591873169, + "learning_rate": 2.4066752246469833e-05, + "loss": 0.3336, + "step": 21664 + }, + { + "epoch": 27.811296534017973, + "grad_norm": 1.234197974205017, + "learning_rate": 2.4066324347454002e-05, + "loss": 0.3292, + "step": 21665 + }, + { + "epoch": 27.812580231065468, + "grad_norm": 1.295016884803772, + "learning_rate": 2.406589644843817e-05, + "loss": 0.2753, + "step": 21666 + }, + { + "epoch": 27.813863928112966, + "grad_norm": 1.6699413061141968, + "learning_rate": 2.4065468549422335e-05, + "loss": 0.3321, + "step": 21667 + }, + { + "epoch": 27.815147625160463, + "grad_norm": 1.0518803596496582, + "learning_rate": 2.4065040650406507e-05, + "loss": 0.3259, + "step": 21668 + }, + { + "epoch": 27.816431322207958, + "grad_norm": 1.9793487787246704, + "learning_rate": 2.4064612751390672e-05, + "loss": 0.3126, + "step": 21669 + }, + { + "epoch": 27.817715019255456, + "grad_norm": 1.8888660669326782, + "learning_rate": 2.4064184852374837e-05, + "loss": 0.3748, + "step": 21670 + }, + { + "epoch": 27.818998716302954, + "grad_norm": 5.48696231842041, + "learning_rate": 2.406375695335901e-05, + "loss": 0.3478, + "step": 21671 + }, + { + "epoch": 27.820282413350448, + "grad_norm": 5.786911964416504, + "learning_rate": 2.4063329054343174e-05, + "loss": 0.3353, + "step": 21672 + }, + { + "epoch": 27.821566110397946, + 
"grad_norm": 3.295888900756836, + "learning_rate": 2.4062901155327346e-05, + "loss": 0.3251, + "step": 21673 + }, + { + "epoch": 27.822849807445444, + "grad_norm": 1.553531527519226, + "learning_rate": 2.406247325631151e-05, + "loss": 0.3462, + "step": 21674 + }, + { + "epoch": 27.82413350449294, + "grad_norm": 1.7388527393341064, + "learning_rate": 2.406204535729568e-05, + "loss": 0.3715, + "step": 21675 + }, + { + "epoch": 27.825417201540436, + "grad_norm": 1.5269403457641602, + "learning_rate": 2.4061617458279848e-05, + "loss": 0.3216, + "step": 21676 + }, + { + "epoch": 27.826700898587934, + "grad_norm": 1.6712709665298462, + "learning_rate": 2.4061189559264013e-05, + "loss": 0.3574, + "step": 21677 + }, + { + "epoch": 27.82798459563543, + "grad_norm": 2.4103341102600098, + "learning_rate": 2.406076166024818e-05, + "loss": 0.3822, + "step": 21678 + }, + { + "epoch": 27.829268292682926, + "grad_norm": 2.513113498687744, + "learning_rate": 2.406033376123235e-05, + "loss": 0.39, + "step": 21679 + }, + { + "epoch": 27.830551989730424, + "grad_norm": 1.9243439435958862, + "learning_rate": 2.4059905862216518e-05, + "loss": 0.4098, + "step": 21680 + }, + { + "epoch": 27.83183568677792, + "grad_norm": 2.512550115585327, + "learning_rate": 2.4059477963200686e-05, + "loss": 0.4318, + "step": 21681 + }, + { + "epoch": 27.833119383825416, + "grad_norm": 1.7171351909637451, + "learning_rate": 2.4059050064184855e-05, + "loss": 0.4457, + "step": 21682 + }, + { + "epoch": 27.834403080872914, + "grad_norm": 1.6497254371643066, + "learning_rate": 2.405862216516902e-05, + "loss": 0.4725, + "step": 21683 + }, + { + "epoch": 27.83568677792041, + "grad_norm": 1.2662698030471802, + "learning_rate": 2.4058194266153188e-05, + "loss": 0.3077, + "step": 21684 + }, + { + "epoch": 27.836970474967906, + "grad_norm": 0.8722487688064575, + "learning_rate": 2.4057766367137357e-05, + "loss": 0.2974, + "step": 21685 + }, + { + "epoch": 27.838254172015404, + "grad_norm": 4.740320682525635, + 
"learning_rate": 2.405733846812152e-05, + "loss": 0.3143, + "step": 21686 + }, + { + "epoch": 27.8395378690629, + "grad_norm": 0.8446124196052551, + "learning_rate": 2.4056910569105693e-05, + "loss": 0.3296, + "step": 21687 + }, + { + "epoch": 27.8408215661104, + "grad_norm": 1.6222412586212158, + "learning_rate": 2.405648267008986e-05, + "loss": 0.3227, + "step": 21688 + }, + { + "epoch": 27.842105263157894, + "grad_norm": 0.9810665249824524, + "learning_rate": 2.405605477107403e-05, + "loss": 0.3078, + "step": 21689 + }, + { + "epoch": 27.84338896020539, + "grad_norm": 0.9895999431610107, + "learning_rate": 2.4055626872058195e-05, + "loss": 0.3299, + "step": 21690 + }, + { + "epoch": 27.84467265725289, + "grad_norm": 0.9083179831504822, + "learning_rate": 2.405519897304236e-05, + "loss": 0.3305, + "step": 21691 + }, + { + "epoch": 27.845956354300384, + "grad_norm": 1.229966163635254, + "learning_rate": 2.4054771074026532e-05, + "loss": 0.3435, + "step": 21692 + }, + { + "epoch": 27.84724005134788, + "grad_norm": 0.8470117449760437, + "learning_rate": 2.4054343175010697e-05, + "loss": 0.3231, + "step": 21693 + }, + { + "epoch": 27.84852374839538, + "grad_norm": 1.589989185333252, + "learning_rate": 2.4053915275994865e-05, + "loss": 0.319, + "step": 21694 + }, + { + "epoch": 27.849807445442874, + "grad_norm": 1.5653165578842163, + "learning_rate": 2.4053487376979034e-05, + "loss": 0.3386, + "step": 21695 + }, + { + "epoch": 27.85109114249037, + "grad_norm": 1.319693684577942, + "learning_rate": 2.4053059477963202e-05, + "loss": 0.306, + "step": 21696 + }, + { + "epoch": 27.85237483953787, + "grad_norm": 0.9469940662384033, + "learning_rate": 2.405263157894737e-05, + "loss": 0.3097, + "step": 21697 + }, + { + "epoch": 27.853658536585368, + "grad_norm": 0.9853553175926208, + "learning_rate": 2.4052203679931536e-05, + "loss": 0.3421, + "step": 21698 + }, + { + "epoch": 27.854942233632862, + "grad_norm": 0.9507312178611755, + "learning_rate": 2.4051775780915704e-05, + 
"loss": 0.3424, + "step": 21699 + }, + { + "epoch": 27.85622593068036, + "grad_norm": 1.2869329452514648, + "learning_rate": 2.4051347881899873e-05, + "loss": 0.3165, + "step": 21700 + }, + { + "epoch": 27.857509627727858, + "grad_norm": 0.7785537838935852, + "learning_rate": 2.405091998288404e-05, + "loss": 0.3294, + "step": 21701 + }, + { + "epoch": 27.858793324775352, + "grad_norm": 1.0652800798416138, + "learning_rate": 2.4050492083868206e-05, + "loss": 0.346, + "step": 21702 + }, + { + "epoch": 27.86007702182285, + "grad_norm": 1.4589183330535889, + "learning_rate": 2.4050064184852378e-05, + "loss": 0.343, + "step": 21703 + }, + { + "epoch": 27.861360718870348, + "grad_norm": 0.8870067596435547, + "learning_rate": 2.4049636285836543e-05, + "loss": 0.3216, + "step": 21704 + }, + { + "epoch": 27.862644415917842, + "grad_norm": 1.7203289270401, + "learning_rate": 2.404920838682071e-05, + "loss": 0.3078, + "step": 21705 + }, + { + "epoch": 27.86392811296534, + "grad_norm": 1.8039082288742065, + "learning_rate": 2.404878048780488e-05, + "loss": 0.3261, + "step": 21706 + }, + { + "epoch": 27.865211810012838, + "grad_norm": 1.6533174514770508, + "learning_rate": 2.4048352588789045e-05, + "loss": 0.3185, + "step": 21707 + }, + { + "epoch": 27.866495507060336, + "grad_norm": 1.6133942604064941, + "learning_rate": 2.4047924689773216e-05, + "loss": 0.3338, + "step": 21708 + }, + { + "epoch": 27.86777920410783, + "grad_norm": 1.1173405647277832, + "learning_rate": 2.404749679075738e-05, + "loss": 0.3203, + "step": 21709 + }, + { + "epoch": 27.869062901155328, + "grad_norm": 2.558969020843506, + "learning_rate": 2.404706889174155e-05, + "loss": 0.2915, + "step": 21710 + }, + { + "epoch": 27.870346598202826, + "grad_norm": 1.7728475332260132, + "learning_rate": 2.4046640992725718e-05, + "loss": 0.3112, + "step": 21711 + }, + { + "epoch": 27.87163029525032, + "grad_norm": 1.167654037475586, + "learning_rate": 2.4046213093709883e-05, + "loss": 0.3383, + "step": 21712 + }, + { 
+ "epoch": 27.872913992297818, + "grad_norm": 0.8605691194534302, + "learning_rate": 2.4045785194694055e-05, + "loss": 0.3197, + "step": 21713 + }, + { + "epoch": 27.874197689345316, + "grad_norm": 2.161029577255249, + "learning_rate": 2.404535729567822e-05, + "loss": 0.3084, + "step": 21714 + }, + { + "epoch": 27.87548138639281, + "grad_norm": 1.1176079511642456, + "learning_rate": 2.404492939666239e-05, + "loss": 0.3444, + "step": 21715 + }, + { + "epoch": 27.876765083440308, + "grad_norm": 7.314105987548828, + "learning_rate": 2.4044501497646557e-05, + "loss": 0.3732, + "step": 21716 + }, + { + "epoch": 27.878048780487806, + "grad_norm": 2.9271316528320312, + "learning_rate": 2.4044073598630722e-05, + "loss": 0.3077, + "step": 21717 + }, + { + "epoch": 27.8793324775353, + "grad_norm": 1.3661859035491943, + "learning_rate": 2.404364569961489e-05, + "loss": 0.3352, + "step": 21718 + }, + { + "epoch": 27.880616174582798, + "grad_norm": 1.3045190572738647, + "learning_rate": 2.404321780059906e-05, + "loss": 0.306, + "step": 21719 + }, + { + "epoch": 27.881899871630296, + "grad_norm": 1.1830672025680542, + "learning_rate": 2.4042789901583227e-05, + "loss": 0.3359, + "step": 21720 + }, + { + "epoch": 27.883183568677794, + "grad_norm": 1.1894904375076294, + "learning_rate": 2.4042362002567396e-05, + "loss": 0.3202, + "step": 21721 + }, + { + "epoch": 27.884467265725288, + "grad_norm": 1.5593101978302002, + "learning_rate": 2.4041934103551564e-05, + "loss": 0.3291, + "step": 21722 + }, + { + "epoch": 27.885750962772786, + "grad_norm": 1.1482309103012085, + "learning_rate": 2.404150620453573e-05, + "loss": 0.3257, + "step": 21723 + }, + { + "epoch": 27.887034659820284, + "grad_norm": 2.6490468978881836, + "learning_rate": 2.4041078305519897e-05, + "loss": 0.3405, + "step": 21724 + }, + { + "epoch": 27.888318356867778, + "grad_norm": 1.7570146322250366, + "learning_rate": 2.4040650406504066e-05, + "loss": 0.3438, + "step": 21725 + }, + { + "epoch": 27.889602053915276, + 
"grad_norm": 3.846703052520752, + "learning_rate": 2.404022250748823e-05, + "loss": 0.3252, + "step": 21726 + }, + { + "epoch": 27.890885750962774, + "grad_norm": 3.7986655235290527, + "learning_rate": 2.4039794608472403e-05, + "loss": 0.3727, + "step": 21727 + }, + { + "epoch": 27.892169448010268, + "grad_norm": 1.5889467000961304, + "learning_rate": 2.4039366709456568e-05, + "loss": 0.3847, + "step": 21728 + }, + { + "epoch": 27.893453145057766, + "grad_norm": 1.9025286436080933, + "learning_rate": 2.403893881044074e-05, + "loss": 0.4192, + "step": 21729 + }, + { + "epoch": 27.894736842105264, + "grad_norm": 2.544910430908203, + "learning_rate": 2.4038510911424905e-05, + "loss": 0.3877, + "step": 21730 + }, + { + "epoch": 27.89602053915276, + "grad_norm": 2.018488883972168, + "learning_rate": 2.403808301240907e-05, + "loss": 0.4406, + "step": 21731 + }, + { + "epoch": 27.897304236200256, + "grad_norm": 2.50102162361145, + "learning_rate": 2.403765511339324e-05, + "loss": 0.4663, + "step": 21732 + }, + { + "epoch": 27.898587933247754, + "grad_norm": 1.6342122554779053, + "learning_rate": 2.4037227214377406e-05, + "loss": 0.4859, + "step": 21733 + }, + { + "epoch": 27.89987163029525, + "grad_norm": 2.150434732437134, + "learning_rate": 2.4036799315361575e-05, + "loss": 0.2761, + "step": 21734 + }, + { + "epoch": 27.901155327342746, + "grad_norm": 1.2794666290283203, + "learning_rate": 2.4036371416345743e-05, + "loss": 0.3057, + "step": 21735 + }, + { + "epoch": 27.902439024390244, + "grad_norm": 2.5702803134918213, + "learning_rate": 2.403594351732991e-05, + "loss": 0.3068, + "step": 21736 + }, + { + "epoch": 27.90372272143774, + "grad_norm": 1.1059767007827759, + "learning_rate": 2.403551561831408e-05, + "loss": 0.3004, + "step": 21737 + }, + { + "epoch": 27.905006418485236, + "grad_norm": 1.5504900217056274, + "learning_rate": 2.4035087719298245e-05, + "loss": 0.3519, + "step": 21738 + }, + { + "epoch": 27.906290115532734, + "grad_norm": 2.31109619140625, + 
"learning_rate": 2.4034659820282413e-05, + "loss": 0.2816, + "step": 21739 + }, + { + "epoch": 27.90757381258023, + "grad_norm": 0.9188140034675598, + "learning_rate": 2.4034231921266582e-05, + "loss": 0.3047, + "step": 21740 + }, + { + "epoch": 27.90885750962773, + "grad_norm": 3.065546751022339, + "learning_rate": 2.403380402225075e-05, + "loss": 0.31, + "step": 21741 + }, + { + "epoch": 27.910141206675224, + "grad_norm": 1.296015739440918, + "learning_rate": 2.4033376123234915e-05, + "loss": 0.3064, + "step": 21742 + }, + { + "epoch": 27.911424903722722, + "grad_norm": 1.1199920177459717, + "learning_rate": 2.4032948224219087e-05, + "loss": 0.3439, + "step": 21743 + }, + { + "epoch": 27.91270860077022, + "grad_norm": 0.8805428147315979, + "learning_rate": 2.4032520325203252e-05, + "loss": 0.294, + "step": 21744 + }, + { + "epoch": 27.913992297817714, + "grad_norm": 0.9096043705940247, + "learning_rate": 2.403209242618742e-05, + "loss": 0.3351, + "step": 21745 + }, + { + "epoch": 27.915275994865212, + "grad_norm": 0.9973348379135132, + "learning_rate": 2.403166452717159e-05, + "loss": 0.3136, + "step": 21746 + }, + { + "epoch": 27.91655969191271, + "grad_norm": 2.0831117630004883, + "learning_rate": 2.4031236628155754e-05, + "loss": 0.3093, + "step": 21747 + }, + { + "epoch": 27.917843388960204, + "grad_norm": 0.9366073608398438, + "learning_rate": 2.4030808729139926e-05, + "loss": 0.3176, + "step": 21748 + }, + { + "epoch": 27.919127086007702, + "grad_norm": 2.4770827293395996, + "learning_rate": 2.403038083012409e-05, + "loss": 0.3589, + "step": 21749 + }, + { + "epoch": 27.9204107830552, + "grad_norm": 1.0194838047027588, + "learning_rate": 2.402995293110826e-05, + "loss": 0.3298, + "step": 21750 + }, + { + "epoch": 27.921694480102694, + "grad_norm": 1.0748698711395264, + "learning_rate": 2.4029525032092428e-05, + "loss": 0.299, + "step": 21751 + }, + { + "epoch": 27.922978177150192, + "grad_norm": 1.764724612236023, + "learning_rate": 2.4029097133076593e-05, 
+ "loss": 0.3145, + "step": 21752 + }, + { + "epoch": 27.92426187419769, + "grad_norm": 1.6129871606826782, + "learning_rate": 2.4028669234060764e-05, + "loss": 0.2858, + "step": 21753 + }, + { + "epoch": 27.925545571245188, + "grad_norm": 1.260746717453003, + "learning_rate": 2.402824133504493e-05, + "loss": 0.2959, + "step": 21754 + }, + { + "epoch": 27.926829268292682, + "grad_norm": 5.126245021820068, + "learning_rate": 2.4027813436029098e-05, + "loss": 0.3243, + "step": 21755 + }, + { + "epoch": 27.92811296534018, + "grad_norm": 1.432118535041809, + "learning_rate": 2.4027385537013266e-05, + "loss": 0.3445, + "step": 21756 + }, + { + "epoch": 27.929396662387678, + "grad_norm": 1.2055373191833496, + "learning_rate": 2.4026957637997435e-05, + "loss": 0.2767, + "step": 21757 + }, + { + "epoch": 27.930680359435172, + "grad_norm": 1.6921714544296265, + "learning_rate": 2.40265297389816e-05, + "loss": 0.3251, + "step": 21758 + }, + { + "epoch": 27.93196405648267, + "grad_norm": 0.9138566255569458, + "learning_rate": 2.4026101839965768e-05, + "loss": 0.3295, + "step": 21759 + }, + { + "epoch": 27.933247753530168, + "grad_norm": 1.1566600799560547, + "learning_rate": 2.4025673940949937e-05, + "loss": 0.343, + "step": 21760 + }, + { + "epoch": 27.934531450577662, + "grad_norm": 1.1050217151641846, + "learning_rate": 2.4025246041934105e-05, + "loss": 0.3189, + "step": 21761 + }, + { + "epoch": 27.93581514762516, + "grad_norm": 1.0713386535644531, + "learning_rate": 2.4024818142918273e-05, + "loss": 0.3136, + "step": 21762 + }, + { + "epoch": 27.937098844672658, + "grad_norm": 1.3178828954696655, + "learning_rate": 2.402439024390244e-05, + "loss": 0.3487, + "step": 21763 + }, + { + "epoch": 27.938382541720156, + "grad_norm": 1.5201456546783447, + "learning_rate": 2.402396234488661e-05, + "loss": 0.3132, + "step": 21764 + }, + { + "epoch": 27.93966623876765, + "grad_norm": 3.061206817626953, + "learning_rate": 2.4023534445870775e-05, + "loss": 0.3376, + "step": 21765 + }, 
+ { + "epoch": 27.940949935815148, + "grad_norm": 3.395836591720581, + "learning_rate": 2.402310654685494e-05, + "loss": 0.3206, + "step": 21766 + }, + { + "epoch": 27.942233632862646, + "grad_norm": 3.0681493282318115, + "learning_rate": 2.4022678647839112e-05, + "loss": 0.3329, + "step": 21767 + }, + { + "epoch": 27.94351732991014, + "grad_norm": 1.6690154075622559, + "learning_rate": 2.4022250748823277e-05, + "loss": 0.3586, + "step": 21768 + }, + { + "epoch": 27.944801026957638, + "grad_norm": 1.0189803838729858, + "learning_rate": 2.402182284980745e-05, + "loss": 0.3009, + "step": 21769 + }, + { + "epoch": 27.946084724005136, + "grad_norm": 2.7101287841796875, + "learning_rate": 2.4021394950791614e-05, + "loss": 0.3194, + "step": 21770 + }, + { + "epoch": 27.94736842105263, + "grad_norm": 2.1497368812561035, + "learning_rate": 2.4020967051775782e-05, + "loss": 0.3417, + "step": 21771 + }, + { + "epoch": 27.948652118100128, + "grad_norm": 1.3265001773834229, + "learning_rate": 2.402053915275995e-05, + "loss": 0.351, + "step": 21772 + }, + { + "epoch": 27.949935815147626, + "grad_norm": 1.3941296339035034, + "learning_rate": 2.4020111253744116e-05, + "loss": 0.3662, + "step": 21773 + }, + { + "epoch": 27.951219512195124, + "grad_norm": 1.006109595298767, + "learning_rate": 2.4019683354728284e-05, + "loss": 0.3328, + "step": 21774 + }, + { + "epoch": 27.952503209242618, + "grad_norm": 3.1672277450561523, + "learning_rate": 2.4019255455712453e-05, + "loss": 0.3641, + "step": 21775 + }, + { + "epoch": 27.953786906290116, + "grad_norm": 1.1232681274414062, + "learning_rate": 2.401882755669662e-05, + "loss": 0.3187, + "step": 21776 + }, + { + "epoch": 27.955070603337614, + "grad_norm": 1.2765902280807495, + "learning_rate": 2.401839965768079e-05, + "loss": 0.3475, + "step": 21777 + }, + { + "epoch": 27.956354300385108, + "grad_norm": 1.165855050086975, + "learning_rate": 2.4017971758664954e-05, + "loss": 0.3382, + "step": 21778 + }, + { + "epoch": 27.957637997432606, 
+ "grad_norm": 2.078815460205078, + "learning_rate": 2.4017543859649123e-05, + "loss": 0.3607, + "step": 21779 + }, + { + "epoch": 27.958921694480104, + "grad_norm": 2.5508742332458496, + "learning_rate": 2.401711596063329e-05, + "loss": 0.3718, + "step": 21780 + }, + { + "epoch": 27.960205391527598, + "grad_norm": 3.892212390899658, + "learning_rate": 2.401668806161746e-05, + "loss": 0.4012, + "step": 21781 + }, + { + "epoch": 27.961489088575096, + "grad_norm": 4.872045516967773, + "learning_rate": 2.4016260162601625e-05, + "loss": 0.4553, + "step": 21782 + }, + { + "epoch": 27.962772785622594, + "grad_norm": 3.4386680126190186, + "learning_rate": 2.4015832263585796e-05, + "loss": 0.5523, + "step": 21783 + }, + { + "epoch": 27.964056482670088, + "grad_norm": 0.9179022312164307, + "learning_rate": 2.401540436456996e-05, + "loss": 0.3082, + "step": 21784 + }, + { + "epoch": 27.965340179717586, + "grad_norm": 0.9910194873809814, + "learning_rate": 2.4014976465554127e-05, + "loss": 0.3151, + "step": 21785 + }, + { + "epoch": 27.966623876765084, + "grad_norm": 1.1756831407546997, + "learning_rate": 2.40145485665383e-05, + "loss": 0.3146, + "step": 21786 + }, + { + "epoch": 27.96790757381258, + "grad_norm": 1.6645222902297974, + "learning_rate": 2.4014120667522463e-05, + "loss": 0.2827, + "step": 21787 + }, + { + "epoch": 27.969191270860076, + "grad_norm": 1.1771669387817383, + "learning_rate": 2.4013692768506635e-05, + "loss": 0.3326, + "step": 21788 + }, + { + "epoch": 27.970474967907574, + "grad_norm": 0.8578036427497864, + "learning_rate": 2.40132648694908e-05, + "loss": 0.2981, + "step": 21789 + }, + { + "epoch": 27.971758664955072, + "grad_norm": 1.445390224456787, + "learning_rate": 2.401283697047497e-05, + "loss": 0.3074, + "step": 21790 + }, + { + "epoch": 27.973042362002566, + "grad_norm": 1.2807742357254028, + "learning_rate": 2.4012409071459137e-05, + "loss": 0.3232, + "step": 21791 + }, + { + "epoch": 27.974326059050064, + "grad_norm": 1.3084887266159058, + 
"learning_rate": 2.4011981172443302e-05, + "loss": 0.3554, + "step": 21792 + }, + { + "epoch": 27.975609756097562, + "grad_norm": 1.0595506429672241, + "learning_rate": 2.401155327342747e-05, + "loss": 0.3253, + "step": 21793 + }, + { + "epoch": 27.976893453145056, + "grad_norm": 1.2288445234298706, + "learning_rate": 2.401112537441164e-05, + "loss": 0.3171, + "step": 21794 + }, + { + "epoch": 27.978177150192554, + "grad_norm": 2.199955701828003, + "learning_rate": 2.4010697475395807e-05, + "loss": 0.3304, + "step": 21795 + }, + { + "epoch": 27.979460847240052, + "grad_norm": 0.939915657043457, + "learning_rate": 2.4010269576379976e-05, + "loss": 0.3566, + "step": 21796 + }, + { + "epoch": 27.98074454428755, + "grad_norm": 1.3645999431610107, + "learning_rate": 2.4009841677364144e-05, + "loss": 0.3203, + "step": 21797 + }, + { + "epoch": 27.982028241335044, + "grad_norm": 1.6096113920211792, + "learning_rate": 2.400941377834831e-05, + "loss": 0.3027, + "step": 21798 + }, + { + "epoch": 27.983311938382542, + "grad_norm": 1.2773491144180298, + "learning_rate": 2.4008985879332478e-05, + "loss": 0.3328, + "step": 21799 + }, + { + "epoch": 27.98459563543004, + "grad_norm": 3.891645908355713, + "learning_rate": 2.4008557980316646e-05, + "loss": 0.2959, + "step": 21800 + }, + { + "epoch": 27.985879332477534, + "grad_norm": 1.5309005975723267, + "learning_rate": 2.400813008130081e-05, + "loss": 0.3165, + "step": 21801 + }, + { + "epoch": 27.987163029525032, + "grad_norm": 1.4693866968154907, + "learning_rate": 2.4007702182284983e-05, + "loss": 0.3334, + "step": 21802 + }, + { + "epoch": 27.98844672657253, + "grad_norm": 1.268848180770874, + "learning_rate": 2.4007274283269148e-05, + "loss": 0.3446, + "step": 21803 + }, + { + "epoch": 27.989730423620024, + "grad_norm": 2.7363369464874268, + "learning_rate": 2.400684638425332e-05, + "loss": 0.3386, + "step": 21804 + }, + { + "epoch": 27.991014120667522, + "grad_norm": 1.073901653289795, + "learning_rate": 
2.4006418485237485e-05, + "loss": 0.2988, + "step": 21805 + }, + { + "epoch": 27.99229781771502, + "grad_norm": 1.2541943788528442, + "learning_rate": 2.400599058622165e-05, + "loss": 0.368, + "step": 21806 + }, + { + "epoch": 27.993581514762518, + "grad_norm": 2.980489492416382, + "learning_rate": 2.400556268720582e-05, + "loss": 0.3164, + "step": 21807 + }, + { + "epoch": 27.994865211810012, + "grad_norm": 0.9755175709724426, + "learning_rate": 2.4005134788189986e-05, + "loss": 0.3207, + "step": 21808 + }, + { + "epoch": 27.99614890885751, + "grad_norm": 2.169264316558838, + "learning_rate": 2.4004706889174155e-05, + "loss": 0.3685, + "step": 21809 + }, + { + "epoch": 27.997432605905008, + "grad_norm": 1.922028660774231, + "learning_rate": 2.4004278990158323e-05, + "loss": 0.3774, + "step": 21810 + }, + { + "epoch": 27.998716302952502, + "grad_norm": 1.3695383071899414, + "learning_rate": 2.400385109114249e-05, + "loss": 0.3985, + "step": 21811 + }, + { + "epoch": 28.0, + "grad_norm": 2.6769261360168457, + "learning_rate": 2.400342319212666e-05, + "loss": 0.518, + "step": 21812 + }, + { + "epoch": 28.001283697047498, + "grad_norm": 1.572811245918274, + "learning_rate": 2.4002995293110825e-05, + "loss": 0.3077, + "step": 21813 + }, + { + "epoch": 28.002567394094992, + "grad_norm": 0.8441378474235535, + "learning_rate": 2.4002567394094994e-05, + "loss": 0.2914, + "step": 21814 + }, + { + "epoch": 28.00385109114249, + "grad_norm": 0.8838710188865662, + "learning_rate": 2.4002139495079162e-05, + "loss": 0.2809, + "step": 21815 + }, + { + "epoch": 28.005134788189988, + "grad_norm": 1.035292148590088, + "learning_rate": 2.400171159606333e-05, + "loss": 0.3377, + "step": 21816 + }, + { + "epoch": 28.006418485237482, + "grad_norm": 1.600455403327942, + "learning_rate": 2.4001283697047495e-05, + "loss": 0.309, + "step": 21817 + }, + { + "epoch": 28.00770218228498, + "grad_norm": 0.8679885864257812, + "learning_rate": 2.4000855798031667e-05, + "loss": 0.3065, + "step": 
21818 + }, + { + "epoch": 28.008985879332478, + "grad_norm": 1.2178795337677002, + "learning_rate": 2.4000427899015832e-05, + "loss": 0.3303, + "step": 21819 + }, + { + "epoch": 28.010269576379976, + "grad_norm": 0.9715826511383057, + "learning_rate": 2.4e-05, + "loss": 0.2869, + "step": 21820 + }, + { + "epoch": 28.01155327342747, + "grad_norm": 1.2543113231658936, + "learning_rate": 2.399957210098417e-05, + "loss": 0.2953, + "step": 21821 + }, + { + "epoch": 28.012836970474968, + "grad_norm": 1.5374302864074707, + "learning_rate": 2.3999144201968334e-05, + "loss": 0.2885, + "step": 21822 + }, + { + "epoch": 28.014120667522466, + "grad_norm": 0.9458462595939636, + "learning_rate": 2.3998716302952506e-05, + "loss": 0.3235, + "step": 21823 + }, + { + "epoch": 28.01540436456996, + "grad_norm": 0.9165031313896179, + "learning_rate": 2.399828840393667e-05, + "loss": 0.2966, + "step": 21824 + }, + { + "epoch": 28.016688061617458, + "grad_norm": 1.6591325998306274, + "learning_rate": 2.399786050492084e-05, + "loss": 0.2928, + "step": 21825 + }, + { + "epoch": 28.017971758664956, + "grad_norm": 3.3865907192230225, + "learning_rate": 2.3997432605905008e-05, + "loss": 0.335, + "step": 21826 + }, + { + "epoch": 28.01925545571245, + "grad_norm": 1.014941930770874, + "learning_rate": 2.3997004706889173e-05, + "loss": 0.2994, + "step": 21827 + }, + { + "epoch": 28.020539152759948, + "grad_norm": 1.8728796243667603, + "learning_rate": 2.3996576807873345e-05, + "loss": 0.3016, + "step": 21828 + }, + { + "epoch": 28.021822849807446, + "grad_norm": 0.9411805868148804, + "learning_rate": 2.399614890885751e-05, + "loss": 0.323, + "step": 21829 + }, + { + "epoch": 28.023106546854944, + "grad_norm": 0.8021646738052368, + "learning_rate": 2.3995721009841678e-05, + "loss": 0.3014, + "step": 21830 + }, + { + "epoch": 28.024390243902438, + "grad_norm": 3.4971842765808105, + "learning_rate": 2.3995293110825846e-05, + "loss": 0.2679, + "step": 21831 + }, + { + "epoch": 28.025673940949936, + 
"grad_norm": 1.1303291320800781, + "learning_rate": 2.3994865211810015e-05, + "loss": 0.2995, + "step": 21832 + }, + { + "epoch": 28.026957637997434, + "grad_norm": 1.1470041275024414, + "learning_rate": 2.399443731279418e-05, + "loss": 0.3191, + "step": 21833 + }, + { + "epoch": 28.028241335044928, + "grad_norm": 1.3101272583007812, + "learning_rate": 2.3994009413778348e-05, + "loss": 0.2994, + "step": 21834 + }, + { + "epoch": 28.029525032092426, + "grad_norm": 1.889043927192688, + "learning_rate": 2.3993581514762517e-05, + "loss": 0.3287, + "step": 21835 + }, + { + "epoch": 28.030808729139924, + "grad_norm": 2.5263285636901855, + "learning_rate": 2.3993153615746685e-05, + "loss": 0.304, + "step": 21836 + }, + { + "epoch": 28.03209242618742, + "grad_norm": 2.4020371437072754, + "learning_rate": 2.3992725716730853e-05, + "loss": 0.3154, + "step": 21837 + }, + { + "epoch": 28.033376123234916, + "grad_norm": 1.5686008930206299, + "learning_rate": 2.399229781771502e-05, + "loss": 0.3036, + "step": 21838 + }, + { + "epoch": 28.034659820282414, + "grad_norm": 4.153070449829102, + "learning_rate": 2.3991869918699187e-05, + "loss": 0.2834, + "step": 21839 + }, + { + "epoch": 28.035943517329912, + "grad_norm": 1.3518259525299072, + "learning_rate": 2.3991442019683355e-05, + "loss": 0.2772, + "step": 21840 + }, + { + "epoch": 28.037227214377406, + "grad_norm": 1.701914668083191, + "learning_rate": 2.399101412066752e-05, + "loss": 0.2966, + "step": 21841 + }, + { + "epoch": 28.038510911424904, + "grad_norm": 0.9860066175460815, + "learning_rate": 2.3990586221651692e-05, + "loss": 0.293, + "step": 21842 + }, + { + "epoch": 28.039794608472402, + "grad_norm": 1.7866902351379395, + "learning_rate": 2.3990158322635857e-05, + "loss": 0.2975, + "step": 21843 + }, + { + "epoch": 28.041078305519896, + "grad_norm": 1.5584361553192139, + "learning_rate": 2.398973042362003e-05, + "loss": 0.3523, + "step": 21844 + }, + { + "epoch": 28.042362002567394, + "grad_norm": 1.2418341636657715, 
+ "learning_rate": 2.3989302524604194e-05, + "loss": 0.3349, + "step": 21845 + }, + { + "epoch": 28.043645699614892, + "grad_norm": 1.2816768884658813, + "learning_rate": 2.398887462558836e-05, + "loss": 0.3171, + "step": 21846 + }, + { + "epoch": 28.044929396662386, + "grad_norm": 1.4894787073135376, + "learning_rate": 2.398844672657253e-05, + "loss": 0.3019, + "step": 21847 + }, + { + "epoch": 28.046213093709884, + "grad_norm": 1.708516001701355, + "learning_rate": 2.3988018827556696e-05, + "loss": 0.3199, + "step": 21848 + }, + { + "epoch": 28.047496790757382, + "grad_norm": 1.7108383178710938, + "learning_rate": 2.3987590928540864e-05, + "loss": 0.2757, + "step": 21849 + }, + { + "epoch": 28.048780487804876, + "grad_norm": 3.0401480197906494, + "learning_rate": 2.3987163029525033e-05, + "loss": 0.2975, + "step": 21850 + }, + { + "epoch": 28.050064184852374, + "grad_norm": 1.7819159030914307, + "learning_rate": 2.39867351305092e-05, + "loss": 0.2999, + "step": 21851 + }, + { + "epoch": 28.051347881899872, + "grad_norm": 1.1615451574325562, + "learning_rate": 2.398630723149337e-05, + "loss": 0.324, + "step": 21852 + }, + { + "epoch": 28.05263157894737, + "grad_norm": 1.1908190250396729, + "learning_rate": 2.3985879332477534e-05, + "loss": 0.2837, + "step": 21853 + }, + { + "epoch": 28.053915275994864, + "grad_norm": 1.9622349739074707, + "learning_rate": 2.3985451433461703e-05, + "loss": 0.3515, + "step": 21854 + }, + { + "epoch": 28.055198973042362, + "grad_norm": 1.2832865715026855, + "learning_rate": 2.398502353444587e-05, + "loss": 0.3457, + "step": 21855 + }, + { + "epoch": 28.05648267008986, + "grad_norm": 1.463740587234497, + "learning_rate": 2.398459563543004e-05, + "loss": 0.2809, + "step": 21856 + }, + { + "epoch": 28.057766367137354, + "grad_norm": 6.3953633308410645, + "learning_rate": 2.3984167736414205e-05, + "loss": 0.3823, + "step": 21857 + }, + { + "epoch": 28.059050064184852, + "grad_norm": 1.2676633596420288, + "learning_rate": 
2.3983739837398377e-05, + "loss": 0.3325, + "step": 21858 + }, + { + "epoch": 28.06033376123235, + "grad_norm": 1.4038355350494385, + "learning_rate": 2.398331193838254e-05, + "loss": 0.4185, + "step": 21859 + }, + { + "epoch": 28.061617458279844, + "grad_norm": 2.5700438022613525, + "learning_rate": 2.398288403936671e-05, + "loss": 0.3383, + "step": 21860 + }, + { + "epoch": 28.062901155327342, + "grad_norm": 2.2555582523345947, + "learning_rate": 2.398245614035088e-05, + "loss": 0.4549, + "step": 21861 + }, + { + "epoch": 28.06418485237484, + "grad_norm": 1.4425755739212036, + "learning_rate": 2.3982028241335043e-05, + "loss": 0.4705, + "step": 21862 + }, + { + "epoch": 28.065468549422338, + "grad_norm": 1.322851300239563, + "learning_rate": 2.3981600342319215e-05, + "loss": 0.3206, + "step": 21863 + }, + { + "epoch": 28.066752246469832, + "grad_norm": 1.388502597808838, + "learning_rate": 2.398117244330338e-05, + "loss": 0.2915, + "step": 21864 + }, + { + "epoch": 28.06803594351733, + "grad_norm": 0.7794787883758545, + "learning_rate": 2.398074454428755e-05, + "loss": 0.2887, + "step": 21865 + }, + { + "epoch": 28.069319640564828, + "grad_norm": 0.703319787979126, + "learning_rate": 2.3980316645271717e-05, + "loss": 0.2901, + "step": 21866 + }, + { + "epoch": 28.070603337612322, + "grad_norm": 0.7982649207115173, + "learning_rate": 2.3979888746255882e-05, + "loss": 0.2895, + "step": 21867 + }, + { + "epoch": 28.07188703465982, + "grad_norm": 1.7209160327911377, + "learning_rate": 2.3979460847240054e-05, + "loss": 0.2906, + "step": 21868 + }, + { + "epoch": 28.073170731707318, + "grad_norm": 0.8801164627075195, + "learning_rate": 2.397903294822422e-05, + "loss": 0.2946, + "step": 21869 + }, + { + "epoch": 28.074454428754812, + "grad_norm": 0.8158016800880432, + "learning_rate": 2.3978605049208387e-05, + "loss": 0.2917, + "step": 21870 + }, + { + "epoch": 28.07573812580231, + "grad_norm": 2.773597240447998, + "learning_rate": 2.3978177150192556e-05, + "loss": 0.3, 
+ "step": 21871 + }, + { + "epoch": 28.077021822849808, + "grad_norm": 0.9049150347709656, + "learning_rate": 2.3977749251176724e-05, + "loss": 0.3012, + "step": 21872 + }, + { + "epoch": 28.078305519897306, + "grad_norm": 1.4100546836853027, + "learning_rate": 2.397732135216089e-05, + "loss": 0.3381, + "step": 21873 + }, + { + "epoch": 28.0795892169448, + "grad_norm": 2.5024611949920654, + "learning_rate": 2.3976893453145058e-05, + "loss": 0.3157, + "step": 21874 + }, + { + "epoch": 28.080872913992298, + "grad_norm": 1.5818930864334106, + "learning_rate": 2.3976465554129226e-05, + "loss": 0.3085, + "step": 21875 + }, + { + "epoch": 28.082156611039796, + "grad_norm": 0.8877540826797485, + "learning_rate": 2.3976037655113394e-05, + "loss": 0.2964, + "step": 21876 + }, + { + "epoch": 28.08344030808729, + "grad_norm": 1.2492259740829468, + "learning_rate": 2.3975609756097563e-05, + "loss": 0.3186, + "step": 21877 + }, + { + "epoch": 28.084724005134788, + "grad_norm": 0.7400742769241333, + "learning_rate": 2.3975181857081728e-05, + "loss": 0.3158, + "step": 21878 + }, + { + "epoch": 28.086007702182286, + "grad_norm": 1.0979822874069214, + "learning_rate": 2.39747539580659e-05, + "loss": 0.2956, + "step": 21879 + }, + { + "epoch": 28.08729139922978, + "grad_norm": 0.9454737901687622, + "learning_rate": 2.3974326059050065e-05, + "loss": 0.2983, + "step": 21880 + }, + { + "epoch": 28.088575096277278, + "grad_norm": 2.565433979034424, + "learning_rate": 2.397389816003423e-05, + "loss": 0.2879, + "step": 21881 + }, + { + "epoch": 28.089858793324776, + "grad_norm": 0.8592546582221985, + "learning_rate": 2.39734702610184e-05, + "loss": 0.2694, + "step": 21882 + }, + { + "epoch": 28.09114249037227, + "grad_norm": 0.875028133392334, + "learning_rate": 2.3973042362002567e-05, + "loss": 0.2994, + "step": 21883 + }, + { + "epoch": 28.09242618741977, + "grad_norm": 3.6720712184906006, + "learning_rate": 2.3972614462986738e-05, + "loss": 0.2875, + "step": 21884 + }, + { + "epoch": 
28.093709884467266, + "grad_norm": 0.8566001653671265, + "learning_rate": 2.3972186563970903e-05, + "loss": 0.2986, + "step": 21885 + }, + { + "epoch": 28.094993581514764, + "grad_norm": 6.467459201812744, + "learning_rate": 2.3971758664955072e-05, + "loss": 0.284, + "step": 21886 + }, + { + "epoch": 28.09627727856226, + "grad_norm": 1.047473669052124, + "learning_rate": 2.397133076593924e-05, + "loss": 0.2772, + "step": 21887 + }, + { + "epoch": 28.097560975609756, + "grad_norm": 1.8728280067443848, + "learning_rate": 2.3970902866923405e-05, + "loss": 0.2876, + "step": 21888 + }, + { + "epoch": 28.098844672657254, + "grad_norm": 1.1123273372650146, + "learning_rate": 2.3970474967907574e-05, + "loss": 0.2923, + "step": 21889 + }, + { + "epoch": 28.10012836970475, + "grad_norm": 1.2982426881790161, + "learning_rate": 2.3970047068891742e-05, + "loss": 0.3248, + "step": 21890 + }, + { + "epoch": 28.101412066752246, + "grad_norm": 8.847606658935547, + "learning_rate": 2.396961916987591e-05, + "loss": 0.3124, + "step": 21891 + }, + { + "epoch": 28.102695763799744, + "grad_norm": 1.4748141765594482, + "learning_rate": 2.396919127086008e-05, + "loss": 0.3124, + "step": 21892 + }, + { + "epoch": 28.10397946084724, + "grad_norm": 1.915488362312317, + "learning_rate": 2.3968763371844247e-05, + "loss": 0.3105, + "step": 21893 + }, + { + "epoch": 28.105263157894736, + "grad_norm": 1.23166024684906, + "learning_rate": 2.3968335472828412e-05, + "loss": 0.2785, + "step": 21894 + }, + { + "epoch": 28.106546854942234, + "grad_norm": 1.1400887966156006, + "learning_rate": 2.396790757381258e-05, + "loss": 0.3202, + "step": 21895 + }, + { + "epoch": 28.107830551989732, + "grad_norm": 1.8947583436965942, + "learning_rate": 2.396747967479675e-05, + "loss": 0.281, + "step": 21896 + }, + { + "epoch": 28.109114249037226, + "grad_norm": 0.9882593154907227, + "learning_rate": 2.3967051775780914e-05, + "loss": 0.2999, + "step": 21897 + }, + { + "epoch": 28.110397946084724, + "grad_norm": 
1.3261213302612305, + "learning_rate": 2.3966623876765086e-05, + "loss": 0.3249, + "step": 21898 + }, + { + "epoch": 28.111681643132222, + "grad_norm": 1.1361688375473022, + "learning_rate": 2.396619597774925e-05, + "loss": 0.3429, + "step": 21899 + }, + { + "epoch": 28.112965340179716, + "grad_norm": 1.689063549041748, + "learning_rate": 2.396576807873342e-05, + "loss": 0.3449, + "step": 21900 + }, + { + "epoch": 28.114249037227214, + "grad_norm": 1.3065745830535889, + "learning_rate": 2.3965340179717588e-05, + "loss": 0.2802, + "step": 21901 + }, + { + "epoch": 28.115532734274712, + "grad_norm": 0.9818000793457031, + "learning_rate": 2.3964912280701753e-05, + "loss": 0.3172, + "step": 21902 + }, + { + "epoch": 28.116816431322206, + "grad_norm": 1.7300443649291992, + "learning_rate": 2.3964484381685925e-05, + "loss": 0.3367, + "step": 21903 + }, + { + "epoch": 28.118100128369704, + "grad_norm": 2.980926513671875, + "learning_rate": 2.396405648267009e-05, + "loss": 0.3318, + "step": 21904 + }, + { + "epoch": 28.119383825417202, + "grad_norm": 2.176044464111328, + "learning_rate": 2.3963628583654258e-05, + "loss": 0.3202, + "step": 21905 + }, + { + "epoch": 28.1206675224647, + "grad_norm": 1.0977693796157837, + "learning_rate": 2.3963200684638426e-05, + "loss": 0.3237, + "step": 21906 + }, + { + "epoch": 28.121951219512194, + "grad_norm": 2.566824436187744, + "learning_rate": 2.396277278562259e-05, + "loss": 0.3405, + "step": 21907 + }, + { + "epoch": 28.123234916559692, + "grad_norm": 1.9980947971343994, + "learning_rate": 2.3962344886606763e-05, + "loss": 0.3564, + "step": 21908 + }, + { + "epoch": 28.12451861360719, + "grad_norm": 1.0850938558578491, + "learning_rate": 2.3961916987590928e-05, + "loss": 0.3906, + "step": 21909 + }, + { + "epoch": 28.125802310654684, + "grad_norm": 5.827768802642822, + "learning_rate": 2.3961489088575097e-05, + "loss": 0.3645, + "step": 21910 + }, + { + "epoch": 28.127086007702182, + "grad_norm": 1.8434332609176636, + 
"learning_rate": 2.3961061189559265e-05, + "loss": 0.419, + "step": 21911 + }, + { + "epoch": 28.12836970474968, + "grad_norm": 4.490200042724609, + "learning_rate": 2.3960633290543434e-05, + "loss": 0.4329, + "step": 21912 + }, + { + "epoch": 28.129653401797174, + "grad_norm": 1.1843626499176025, + "learning_rate": 2.39602053915276e-05, + "loss": 0.3009, + "step": 21913 + }, + { + "epoch": 28.130937098844672, + "grad_norm": 0.6087806224822998, + "learning_rate": 2.3959777492511767e-05, + "loss": 0.2671, + "step": 21914 + }, + { + "epoch": 28.13222079589217, + "grad_norm": 0.8863800168037415, + "learning_rate": 2.3959349593495935e-05, + "loss": 0.2941, + "step": 21915 + }, + { + "epoch": 28.133504492939664, + "grad_norm": 1.1093648672103882, + "learning_rate": 2.3958921694480104e-05, + "loss": 0.3085, + "step": 21916 + }, + { + "epoch": 28.134788189987162, + "grad_norm": 0.8373936414718628, + "learning_rate": 2.3958493795464272e-05, + "loss": 0.2633, + "step": 21917 + }, + { + "epoch": 28.13607188703466, + "grad_norm": 0.8485817909240723, + "learning_rate": 2.3958065896448437e-05, + "loss": 0.2994, + "step": 21918 + }, + { + "epoch": 28.137355584082158, + "grad_norm": 0.9768522381782532, + "learning_rate": 2.395763799743261e-05, + "loss": 0.2811, + "step": 21919 + }, + { + "epoch": 28.138639281129652, + "grad_norm": 0.7261384725570679, + "learning_rate": 2.3957210098416774e-05, + "loss": 0.3007, + "step": 21920 + }, + { + "epoch": 28.13992297817715, + "grad_norm": 1.0004626512527466, + "learning_rate": 2.395678219940094e-05, + "loss": 0.292, + "step": 21921 + }, + { + "epoch": 28.141206675224648, + "grad_norm": 1.2866548299789429, + "learning_rate": 2.395635430038511e-05, + "loss": 0.302, + "step": 21922 + }, + { + "epoch": 28.142490372272142, + "grad_norm": 0.9760018587112427, + "learning_rate": 2.3955926401369276e-05, + "loss": 0.315, + "step": 21923 + }, + { + "epoch": 28.14377406931964, + "grad_norm": 1.6213127374649048, + "learning_rate": 
2.3955498502353448e-05, + "loss": 0.3151, + "step": 21924 + }, + { + "epoch": 28.145057766367138, + "grad_norm": 1.2406246662139893, + "learning_rate": 2.3955070603337613e-05, + "loss": 0.2748, + "step": 21925 + }, + { + "epoch": 28.146341463414632, + "grad_norm": 0.792332649230957, + "learning_rate": 2.395464270432178e-05, + "loss": 0.2974, + "step": 21926 + }, + { + "epoch": 28.14762516046213, + "grad_norm": 0.8550714254379272, + "learning_rate": 2.395421480530595e-05, + "loss": 0.3066, + "step": 21927 + }, + { + "epoch": 28.14890885750963, + "grad_norm": 1.5791282653808594, + "learning_rate": 2.3953786906290115e-05, + "loss": 0.3014, + "step": 21928 + }, + { + "epoch": 28.150192554557126, + "grad_norm": 1.2648733854293823, + "learning_rate": 2.3953359007274283e-05, + "loss": 0.3082, + "step": 21929 + }, + { + "epoch": 28.15147625160462, + "grad_norm": 1.8017605543136597, + "learning_rate": 2.395293110825845e-05, + "loss": 0.2818, + "step": 21930 + }, + { + "epoch": 28.15275994865212, + "grad_norm": 1.6209312677383423, + "learning_rate": 2.395250320924262e-05, + "loss": 0.2799, + "step": 21931 + }, + { + "epoch": 28.154043645699616, + "grad_norm": 1.0488290786743164, + "learning_rate": 2.3952075310226788e-05, + "loss": 0.287, + "step": 21932 + }, + { + "epoch": 28.15532734274711, + "grad_norm": 1.4277397394180298, + "learning_rate": 2.3951647411210957e-05, + "loss": 0.2936, + "step": 21933 + }, + { + "epoch": 28.15661103979461, + "grad_norm": 1.1382383108139038, + "learning_rate": 2.395121951219512e-05, + "loss": 0.3109, + "step": 21934 + }, + { + "epoch": 28.157894736842106, + "grad_norm": 1.0703177452087402, + "learning_rate": 2.395079161317929e-05, + "loss": 0.2981, + "step": 21935 + }, + { + "epoch": 28.1591784338896, + "grad_norm": 0.9422547817230225, + "learning_rate": 2.395036371416346e-05, + "loss": 0.2885, + "step": 21936 + }, + { + "epoch": 28.1604621309371, + "grad_norm": 1.2659378051757812, + "learning_rate": 2.3949935815147623e-05, + "loss": 0.297, + 
"step": 21937 + }, + { + "epoch": 28.161745827984596, + "grad_norm": 3.0176045894622803, + "learning_rate": 2.3949507916131795e-05, + "loss": 0.2983, + "step": 21938 + }, + { + "epoch": 28.163029525032094, + "grad_norm": 1.6691951751708984, + "learning_rate": 2.394908001711596e-05, + "loss": 0.3328, + "step": 21939 + }, + { + "epoch": 28.16431322207959, + "grad_norm": 1.0815390348434448, + "learning_rate": 2.3948652118100132e-05, + "loss": 0.2932, + "step": 21940 + }, + { + "epoch": 28.165596919127086, + "grad_norm": 1.7739231586456299, + "learning_rate": 2.3948224219084297e-05, + "loss": 0.2941, + "step": 21941 + }, + { + "epoch": 28.166880616174584, + "grad_norm": 2.5626885890960693, + "learning_rate": 2.3947796320068462e-05, + "loss": 0.2759, + "step": 21942 + }, + { + "epoch": 28.16816431322208, + "grad_norm": 1.2966166734695435, + "learning_rate": 2.3947368421052634e-05, + "loss": 0.2917, + "step": 21943 + }, + { + "epoch": 28.169448010269576, + "grad_norm": 2.116143226623535, + "learning_rate": 2.39469405220368e-05, + "loss": 0.2896, + "step": 21944 + }, + { + "epoch": 28.170731707317074, + "grad_norm": 1.2169147729873657, + "learning_rate": 2.3946512623020967e-05, + "loss": 0.2699, + "step": 21945 + }, + { + "epoch": 28.17201540436457, + "grad_norm": 1.2182930707931519, + "learning_rate": 2.3946084724005136e-05, + "loss": 0.2972, + "step": 21946 + }, + { + "epoch": 28.173299101412066, + "grad_norm": 1.4250214099884033, + "learning_rate": 2.3945656824989304e-05, + "loss": 0.2842, + "step": 21947 + }, + { + "epoch": 28.174582798459564, + "grad_norm": 1.3466695547103882, + "learning_rate": 2.3945228925973473e-05, + "loss": 0.314, + "step": 21948 + }, + { + "epoch": 28.17586649550706, + "grad_norm": 1.30819833278656, + "learning_rate": 2.3944801026957638e-05, + "loss": 0.3156, + "step": 21949 + }, + { + "epoch": 28.177150192554556, + "grad_norm": 1.7521967887878418, + "learning_rate": 2.3944373127941806e-05, + "loss": 0.3089, + "step": 21950 + }, + { + "epoch": 
28.178433889602054, + "grad_norm": 1.104588270187378, + "learning_rate": 2.3943945228925974e-05, + "loss": 0.3104, + "step": 21951 + }, + { + "epoch": 28.179717586649552, + "grad_norm": 2.14359450340271, + "learning_rate": 2.3943517329910143e-05, + "loss": 0.2815, + "step": 21952 + }, + { + "epoch": 28.181001283697046, + "grad_norm": 1.7412770986557007, + "learning_rate": 2.3943089430894308e-05, + "loss": 0.3049, + "step": 21953 + }, + { + "epoch": 28.182284980744544, + "grad_norm": 1.585098147392273, + "learning_rate": 2.394266153187848e-05, + "loss": 0.3209, + "step": 21954 + }, + { + "epoch": 28.183568677792042, + "grad_norm": 3.094052791595459, + "learning_rate": 2.3942233632862645e-05, + "loss": 0.3608, + "step": 21955 + }, + { + "epoch": 28.184852374839537, + "grad_norm": 1.35159432888031, + "learning_rate": 2.3941805733846813e-05, + "loss": 0.3252, + "step": 21956 + }, + { + "epoch": 28.186136071887034, + "grad_norm": 1.5116161108016968, + "learning_rate": 2.394137783483098e-05, + "loss": 0.386, + "step": 21957 + }, + { + "epoch": 28.187419768934532, + "grad_norm": 1.349184513092041, + "learning_rate": 2.3940949935815147e-05, + "loss": 0.3313, + "step": 21958 + }, + { + "epoch": 28.188703465982027, + "grad_norm": 7.093800067901611, + "learning_rate": 2.394052203679932e-05, + "loss": 0.3307, + "step": 21959 + }, + { + "epoch": 28.189987163029524, + "grad_norm": 2.0152106285095215, + "learning_rate": 2.3940094137783483e-05, + "loss": 0.3597, + "step": 21960 + }, + { + "epoch": 28.191270860077022, + "grad_norm": 2.2459473609924316, + "learning_rate": 2.3939666238767652e-05, + "loss": 0.3608, + "step": 21961 + }, + { + "epoch": 28.19255455712452, + "grad_norm": 2.5341150760650635, + "learning_rate": 2.393923833975182e-05, + "loss": 0.4791, + "step": 21962 + }, + { + "epoch": 28.193838254172015, + "grad_norm": 1.624497890472412, + "learning_rate": 2.3938810440735985e-05, + "loss": 0.265, + "step": 21963 + }, + { + "epoch": 28.195121951219512, + "grad_norm": 
0.788688600063324, + "learning_rate": 2.3938382541720157e-05, + "loss": 0.2999, + "step": 21964 + }, + { + "epoch": 28.19640564826701, + "grad_norm": 0.9795841574668884, + "learning_rate": 2.3937954642704322e-05, + "loss": 0.3216, + "step": 21965 + }, + { + "epoch": 28.197689345314505, + "grad_norm": 1.2492026090621948, + "learning_rate": 2.393752674368849e-05, + "loss": 0.3026, + "step": 21966 + }, + { + "epoch": 28.198973042362002, + "grad_norm": 1.0448040962219238, + "learning_rate": 2.393709884467266e-05, + "loss": 0.3546, + "step": 21967 + }, + { + "epoch": 28.2002567394095, + "grad_norm": 0.8697553277015686, + "learning_rate": 2.3936670945656824e-05, + "loss": 0.2761, + "step": 21968 + }, + { + "epoch": 28.201540436456995, + "grad_norm": 2.063096046447754, + "learning_rate": 2.3936243046640992e-05, + "loss": 0.3097, + "step": 21969 + }, + { + "epoch": 28.202824133504492, + "grad_norm": 2.2504701614379883, + "learning_rate": 2.393581514762516e-05, + "loss": 0.3324, + "step": 21970 + }, + { + "epoch": 28.20410783055199, + "grad_norm": 1.1250488758087158, + "learning_rate": 2.393538724860933e-05, + "loss": 0.3232, + "step": 21971 + }, + { + "epoch": 28.205391527599488, + "grad_norm": 0.9233012199401855, + "learning_rate": 2.3934959349593498e-05, + "loss": 0.3125, + "step": 21972 + }, + { + "epoch": 28.206675224646983, + "grad_norm": 0.9836688041687012, + "learning_rate": 2.3934531450577666e-05, + "loss": 0.2898, + "step": 21973 + }, + { + "epoch": 28.20795892169448, + "grad_norm": 1.056261658668518, + "learning_rate": 2.393410355156183e-05, + "loss": 0.3013, + "step": 21974 + }, + { + "epoch": 28.20924261874198, + "grad_norm": 5.012901782989502, + "learning_rate": 2.3933675652546e-05, + "loss": 0.2988, + "step": 21975 + }, + { + "epoch": 28.210526315789473, + "grad_norm": 1.2043752670288086, + "learning_rate": 2.3933247753530168e-05, + "loss": 0.3098, + "step": 21976 + }, + { + "epoch": 28.21181001283697, + "grad_norm": 1.0859708786010742, + "learning_rate": 
2.3932819854514333e-05, + "loss": 0.3082, + "step": 21977 + }, + { + "epoch": 28.21309370988447, + "grad_norm": 4.264999866485596, + "learning_rate": 2.3932391955498505e-05, + "loss": 0.3426, + "step": 21978 + }, + { + "epoch": 28.214377406931963, + "grad_norm": 3.1924781799316406, + "learning_rate": 2.393196405648267e-05, + "loss": 0.3039, + "step": 21979 + }, + { + "epoch": 28.21566110397946, + "grad_norm": 1.1887788772583008, + "learning_rate": 2.393153615746684e-05, + "loss": 0.3095, + "step": 21980 + }, + { + "epoch": 28.21694480102696, + "grad_norm": 2.134885549545288, + "learning_rate": 2.3931108258451006e-05, + "loss": 0.2791, + "step": 21981 + }, + { + "epoch": 28.218228498074453, + "grad_norm": 1.0123831033706665, + "learning_rate": 2.393068035943517e-05, + "loss": 0.3, + "step": 21982 + }, + { + "epoch": 28.21951219512195, + "grad_norm": 1.2190390825271606, + "learning_rate": 2.3930252460419343e-05, + "loss": 0.3073, + "step": 21983 + }, + { + "epoch": 28.22079589216945, + "grad_norm": 2.475555658340454, + "learning_rate": 2.392982456140351e-05, + "loss": 0.3107, + "step": 21984 + }, + { + "epoch": 28.222079589216946, + "grad_norm": 1.3004974126815796, + "learning_rate": 2.3929396662387677e-05, + "loss": 0.3553, + "step": 21985 + }, + { + "epoch": 28.22336328626444, + "grad_norm": 0.9997683763504028, + "learning_rate": 2.3928968763371845e-05, + "loss": 0.3058, + "step": 21986 + }, + { + "epoch": 28.22464698331194, + "grad_norm": 2.3024885654449463, + "learning_rate": 2.3928540864356014e-05, + "loss": 0.3311, + "step": 21987 + }, + { + "epoch": 28.225930680359436, + "grad_norm": 1.0263317823410034, + "learning_rate": 2.392811296534018e-05, + "loss": 0.2928, + "step": 21988 + }, + { + "epoch": 28.22721437740693, + "grad_norm": 1.7205276489257812, + "learning_rate": 2.3927685066324347e-05, + "loss": 0.2989, + "step": 21989 + }, + { + "epoch": 28.22849807445443, + "grad_norm": 1.3103344440460205, + "learning_rate": 2.3927257167308515e-05, + "loss": 0.3101, + 
"step": 21990 + }, + { + "epoch": 28.229781771501926, + "grad_norm": 1.1596112251281738, + "learning_rate": 2.3926829268292684e-05, + "loss": 0.3023, + "step": 21991 + }, + { + "epoch": 28.23106546854942, + "grad_norm": 2.5909574031829834, + "learning_rate": 2.3926401369276852e-05, + "loss": 0.2699, + "step": 21992 + }, + { + "epoch": 28.23234916559692, + "grad_norm": 1.2841256856918335, + "learning_rate": 2.3925973470261017e-05, + "loss": 0.2596, + "step": 21993 + }, + { + "epoch": 28.233632862644416, + "grad_norm": 1.1519235372543335, + "learning_rate": 2.392554557124519e-05, + "loss": 0.2973, + "step": 21994 + }, + { + "epoch": 28.234916559691914, + "grad_norm": 3.2506191730499268, + "learning_rate": 2.3925117672229354e-05, + "loss": 0.2705, + "step": 21995 + }, + { + "epoch": 28.23620025673941, + "grad_norm": 1.3404608964920044, + "learning_rate": 2.392468977321352e-05, + "loss": 0.3245, + "step": 21996 + }, + { + "epoch": 28.237483953786906, + "grad_norm": 1.650022029876709, + "learning_rate": 2.392426187419769e-05, + "loss": 0.3309, + "step": 21997 + }, + { + "epoch": 28.238767650834404, + "grad_norm": 0.8338812589645386, + "learning_rate": 2.3923833975181856e-05, + "loss": 0.3047, + "step": 21998 + }, + { + "epoch": 28.2400513478819, + "grad_norm": 2.07100772857666, + "learning_rate": 2.3923406076166028e-05, + "loss": 0.3145, + "step": 21999 + }, + { + "epoch": 28.241335044929397, + "grad_norm": 1.0311462879180908, + "learning_rate": 2.3922978177150193e-05, + "loss": 0.2956, + "step": 22000 + }, + { + "epoch": 28.241335044929397, + "eval_cer": 0.264636773056589, + "eval_loss": 0.4756144881248474, + "eval_runtime": 14.3943, + "eval_samples_per_second": 68.291, + "eval_steps_per_second": 0.486, + "eval_wer": 0.4443934877321715, + "step": 22000 + }, + { + "epoch": 28.242618741976894, + "grad_norm": 1.3570033311843872, + "learning_rate": 2.392255027813436e-05, + "loss": 0.3143, + "step": 22001 + }, + { + "epoch": 28.24390243902439, + "grad_norm": 
2.040550470352173, + "learning_rate": 2.392212237911853e-05, + "loss": 0.2964, + "step": 22002 + }, + { + "epoch": 28.245186136071887, + "grad_norm": 1.410998821258545, + "learning_rate": 2.3921694480102695e-05, + "loss": 0.3417, + "step": 22003 + }, + { + "epoch": 28.246469833119384, + "grad_norm": 3.2969236373901367, + "learning_rate": 2.3921266581086863e-05, + "loss": 0.3182, + "step": 22004 + }, + { + "epoch": 28.247753530166882, + "grad_norm": 1.8563475608825684, + "learning_rate": 2.392083868207103e-05, + "loss": 0.3173, + "step": 22005 + }, + { + "epoch": 28.249037227214377, + "grad_norm": 2.1792116165161133, + "learning_rate": 2.39204107830552e-05, + "loss": 0.3205, + "step": 22006 + }, + { + "epoch": 28.250320924261874, + "grad_norm": 2.419741630554199, + "learning_rate": 2.3919982884039368e-05, + "loss": 0.361, + "step": 22007 + }, + { + "epoch": 28.251604621309372, + "grad_norm": 1.3180758953094482, + "learning_rate": 2.3919554985023537e-05, + "loss": 0.3758, + "step": 22008 + }, + { + "epoch": 28.252888318356867, + "grad_norm": 1.0256433486938477, + "learning_rate": 2.39191270860077e-05, + "loss": 0.3074, + "step": 22009 + }, + { + "epoch": 28.254172015404365, + "grad_norm": 1.2146871089935303, + "learning_rate": 2.391869918699187e-05, + "loss": 0.3607, + "step": 22010 + }, + { + "epoch": 28.255455712451862, + "grad_norm": 1.7384756803512573, + "learning_rate": 2.391827128797604e-05, + "loss": 0.4382, + "step": 22011 + }, + { + "epoch": 28.256739409499357, + "grad_norm": 2.6038525104522705, + "learning_rate": 2.3917843388960204e-05, + "loss": 0.4958, + "step": 22012 + }, + { + "epoch": 28.258023106546855, + "grad_norm": 1.193885326385498, + "learning_rate": 2.3917415489944375e-05, + "loss": 0.2966, + "step": 22013 + }, + { + "epoch": 28.259306803594352, + "grad_norm": 1.0488287210464478, + "learning_rate": 2.391698759092854e-05, + "loss": 0.282, + "step": 22014 + }, + { + "epoch": 28.260590500641847, + "grad_norm": 1.1846752166748047, + "learning_rate": 
2.3916559691912712e-05, + "loss": 0.2999, + "step": 22015 + }, + { + "epoch": 28.261874197689345, + "grad_norm": 0.8742940425872803, + "learning_rate": 2.3916131792896877e-05, + "loss": 0.2974, + "step": 22016 + }, + { + "epoch": 28.263157894736842, + "grad_norm": 1.3638677597045898, + "learning_rate": 2.3915703893881042e-05, + "loss": 0.3087, + "step": 22017 + }, + { + "epoch": 28.26444159178434, + "grad_norm": 0.7631314396858215, + "learning_rate": 2.3915275994865214e-05, + "loss": 0.2634, + "step": 22018 + }, + { + "epoch": 28.265725288831835, + "grad_norm": 1.1536232233047485, + "learning_rate": 2.391484809584938e-05, + "loss": 0.308, + "step": 22019 + }, + { + "epoch": 28.267008985879333, + "grad_norm": 0.7158194184303284, + "learning_rate": 2.3914420196833547e-05, + "loss": 0.2798, + "step": 22020 + }, + { + "epoch": 28.26829268292683, + "grad_norm": 0.9816306829452515, + "learning_rate": 2.3913992297817716e-05, + "loss": 0.3181, + "step": 22021 + }, + { + "epoch": 28.269576379974325, + "grad_norm": 1.3304288387298584, + "learning_rate": 2.3913564398801884e-05, + "loss": 0.3208, + "step": 22022 + }, + { + "epoch": 28.270860077021823, + "grad_norm": 1.342820405960083, + "learning_rate": 2.3913136499786053e-05, + "loss": 0.2933, + "step": 22023 + }, + { + "epoch": 28.27214377406932, + "grad_norm": 1.5044138431549072, + "learning_rate": 2.3912708600770218e-05, + "loss": 0.3157, + "step": 22024 + }, + { + "epoch": 28.273427471116815, + "grad_norm": 1.0138623714447021, + "learning_rate": 2.3912280701754386e-05, + "loss": 0.2898, + "step": 22025 + }, + { + "epoch": 28.274711168164313, + "grad_norm": 1.6218465566635132, + "learning_rate": 2.3911852802738555e-05, + "loss": 0.3003, + "step": 22026 + }, + { + "epoch": 28.27599486521181, + "grad_norm": 0.8566128611564636, + "learning_rate": 2.3911424903722723e-05, + "loss": 0.2839, + "step": 22027 + }, + { + "epoch": 28.27727856225931, + "grad_norm": 1.2155416011810303, + "learning_rate": 2.3910997004706888e-05, + 
"loss": 0.3392, + "step": 22028 + }, + { + "epoch": 28.278562259306803, + "grad_norm": 1.6139088869094849, + "learning_rate": 2.3910569105691056e-05, + "loss": 0.3027, + "step": 22029 + }, + { + "epoch": 28.2798459563543, + "grad_norm": 1.0369551181793213, + "learning_rate": 2.3910141206675225e-05, + "loss": 0.2954, + "step": 22030 + }, + { + "epoch": 28.2811296534018, + "grad_norm": 0.9111302495002747, + "learning_rate": 2.3909713307659393e-05, + "loss": 0.3292, + "step": 22031 + }, + { + "epoch": 28.282413350449293, + "grad_norm": 2.2784016132354736, + "learning_rate": 2.390928540864356e-05, + "loss": 0.2622, + "step": 22032 + }, + { + "epoch": 28.28369704749679, + "grad_norm": 0.8468539118766785, + "learning_rate": 2.3908857509627727e-05, + "loss": 0.2715, + "step": 22033 + }, + { + "epoch": 28.28498074454429, + "grad_norm": 1.0064284801483154, + "learning_rate": 2.39084296106119e-05, + "loss": 0.2993, + "step": 22034 + }, + { + "epoch": 28.286264441591783, + "grad_norm": 1.1503180265426636, + "learning_rate": 2.3908001711596063e-05, + "loss": 0.3009, + "step": 22035 + }, + { + "epoch": 28.28754813863928, + "grad_norm": 0.8446183204650879, + "learning_rate": 2.390757381258023e-05, + "loss": 0.2942, + "step": 22036 + }, + { + "epoch": 28.28883183568678, + "grad_norm": 1.0092442035675049, + "learning_rate": 2.39071459135644e-05, + "loss": 0.3371, + "step": 22037 + }, + { + "epoch": 28.290115532734276, + "grad_norm": 0.9995301961898804, + "learning_rate": 2.3906718014548565e-05, + "loss": 0.2967, + "step": 22038 + }, + { + "epoch": 28.29139922978177, + "grad_norm": 1.5035123825073242, + "learning_rate": 2.3906290115532737e-05, + "loss": 0.2934, + "step": 22039 + }, + { + "epoch": 28.29268292682927, + "grad_norm": 2.550726890563965, + "learning_rate": 2.3905862216516902e-05, + "loss": 0.3012, + "step": 22040 + }, + { + "epoch": 28.293966623876766, + "grad_norm": 2.4428861141204834, + "learning_rate": 2.390543431750107e-05, + "loss": 0.2998, + "step": 22041 + }, + { 
+ "epoch": 28.29525032092426, + "grad_norm": 1.8867162466049194, + "learning_rate": 2.390500641848524e-05, + "loss": 0.3085, + "step": 22042 + }, + { + "epoch": 28.29653401797176, + "grad_norm": 1.3883955478668213, + "learning_rate": 2.3904578519469404e-05, + "loss": 0.2963, + "step": 22043 + }, + { + "epoch": 28.297817715019256, + "grad_norm": 1.9824572801589966, + "learning_rate": 2.3904150620453572e-05, + "loss": 0.3019, + "step": 22044 + }, + { + "epoch": 28.29910141206675, + "grad_norm": 1.158097743988037, + "learning_rate": 2.390372272143774e-05, + "loss": 0.3446, + "step": 22045 + }, + { + "epoch": 28.30038510911425, + "grad_norm": 1.1375067234039307, + "learning_rate": 2.390329482242191e-05, + "loss": 0.3272, + "step": 22046 + }, + { + "epoch": 28.301668806161747, + "grad_norm": 1.5457173585891724, + "learning_rate": 2.3902866923406078e-05, + "loss": 0.3072, + "step": 22047 + }, + { + "epoch": 28.30295250320924, + "grad_norm": 2.0548055171966553, + "learning_rate": 2.3902439024390246e-05, + "loss": 0.2855, + "step": 22048 + }, + { + "epoch": 28.30423620025674, + "grad_norm": 1.0669281482696533, + "learning_rate": 2.390201112537441e-05, + "loss": 0.3001, + "step": 22049 + }, + { + "epoch": 28.305519897304237, + "grad_norm": 1.3845398426055908, + "learning_rate": 2.390158322635858e-05, + "loss": 0.2923, + "step": 22050 + }, + { + "epoch": 28.306803594351734, + "grad_norm": 1.1960487365722656, + "learning_rate": 2.3901155327342748e-05, + "loss": 0.3121, + "step": 22051 + }, + { + "epoch": 28.30808729139923, + "grad_norm": 2.372420310974121, + "learning_rate": 2.3900727428326913e-05, + "loss": 0.3497, + "step": 22052 + }, + { + "epoch": 28.309370988446727, + "grad_norm": 1.639732003211975, + "learning_rate": 2.3900299529311085e-05, + "loss": 0.2968, + "step": 22053 + }, + { + "epoch": 28.310654685494224, + "grad_norm": 1.1023929119110107, + "learning_rate": 2.389987163029525e-05, + "loss": 0.3446, + "step": 22054 + }, + { + "epoch": 28.31193838254172, + 
"grad_norm": 2.1641976833343506, + "learning_rate": 2.389944373127942e-05, + "loss": 0.3515, + "step": 22055 + }, + { + "epoch": 28.313222079589217, + "grad_norm": 2.8179330825805664, + "learning_rate": 2.3899015832263587e-05, + "loss": 0.3548, + "step": 22056 + }, + { + "epoch": 28.314505776636715, + "grad_norm": 1.1080154180526733, + "learning_rate": 2.389858793324775e-05, + "loss": 0.3076, + "step": 22057 + }, + { + "epoch": 28.31578947368421, + "grad_norm": 1.6451057195663452, + "learning_rate": 2.3898160034231923e-05, + "loss": 0.3401, + "step": 22058 + }, + { + "epoch": 28.317073170731707, + "grad_norm": 3.1453118324279785, + "learning_rate": 2.389773213521609e-05, + "loss": 0.3454, + "step": 22059 + }, + { + "epoch": 28.318356867779205, + "grad_norm": 6.865065097808838, + "learning_rate": 2.3897304236200257e-05, + "loss": 0.3678, + "step": 22060 + }, + { + "epoch": 28.319640564826702, + "grad_norm": 2.7741994857788086, + "learning_rate": 2.3896876337184425e-05, + "loss": 0.3748, + "step": 22061 + }, + { + "epoch": 28.320924261874197, + "grad_norm": 3.952997922897339, + "learning_rate": 2.3896448438168594e-05, + "loss": 0.4886, + "step": 22062 + }, + { + "epoch": 28.322207958921695, + "grad_norm": 0.936603844165802, + "learning_rate": 2.3896020539152762e-05, + "loss": 0.2944, + "step": 22063 + }, + { + "epoch": 28.323491655969192, + "grad_norm": 1.7230130434036255, + "learning_rate": 2.3895592640136927e-05, + "loss": 0.2753, + "step": 22064 + }, + { + "epoch": 28.324775353016687, + "grad_norm": 1.1325575113296509, + "learning_rate": 2.3895164741121095e-05, + "loss": 0.2824, + "step": 22065 + }, + { + "epoch": 28.326059050064185, + "grad_norm": 1.1601084470748901, + "learning_rate": 2.3894736842105264e-05, + "loss": 0.3176, + "step": 22066 + }, + { + "epoch": 28.327342747111683, + "grad_norm": 1.0900577306747437, + "learning_rate": 2.3894308943089432e-05, + "loss": 0.341, + "step": 22067 + }, + { + "epoch": 28.328626444159177, + "grad_norm": 
1.2331945896148682, + "learning_rate": 2.3893881044073597e-05, + "loss": 0.2684, + "step": 22068 + }, + { + "epoch": 28.329910141206675, + "grad_norm": 1.5939104557037354, + "learning_rate": 2.389345314505777e-05, + "loss": 0.2851, + "step": 22069 + }, + { + "epoch": 28.331193838254173, + "grad_norm": 2.460831642150879, + "learning_rate": 2.3893025246041934e-05, + "loss": 0.2939, + "step": 22070 + }, + { + "epoch": 28.33247753530167, + "grad_norm": 0.8851640820503235, + "learning_rate": 2.3892597347026103e-05, + "loss": 0.3256, + "step": 22071 + }, + { + "epoch": 28.333761232349165, + "grad_norm": 0.9065062999725342, + "learning_rate": 2.389216944801027e-05, + "loss": 0.306, + "step": 22072 + }, + { + "epoch": 28.335044929396663, + "grad_norm": 1.6996338367462158, + "learning_rate": 2.3891741548994436e-05, + "loss": 0.305, + "step": 22073 + }, + { + "epoch": 28.33632862644416, + "grad_norm": 1.089507818222046, + "learning_rate": 2.3891313649978608e-05, + "loss": 0.2951, + "step": 22074 + }, + { + "epoch": 28.337612323491655, + "grad_norm": 0.9849973320960999, + "learning_rate": 2.3890885750962773e-05, + "loss": 0.2901, + "step": 22075 + }, + { + "epoch": 28.338896020539153, + "grad_norm": 1.1427147388458252, + "learning_rate": 2.389045785194694e-05, + "loss": 0.2865, + "step": 22076 + }, + { + "epoch": 28.34017971758665, + "grad_norm": 3.298105239868164, + "learning_rate": 2.389002995293111e-05, + "loss": 0.3047, + "step": 22077 + }, + { + "epoch": 28.341463414634145, + "grad_norm": 0.9716641306877136, + "learning_rate": 2.3889602053915275e-05, + "loss": 0.2615, + "step": 22078 + }, + { + "epoch": 28.342747111681643, + "grad_norm": 6.970831394195557, + "learning_rate": 2.3889174154899446e-05, + "loss": 0.2868, + "step": 22079 + }, + { + "epoch": 28.34403080872914, + "grad_norm": 0.9082292914390564, + "learning_rate": 2.388874625588361e-05, + "loss": 0.2822, + "step": 22080 + }, + { + "epoch": 28.345314505776635, + "grad_norm": 1.1445831060409546, + "learning_rate": 
2.388831835686778e-05, + "loss": 0.3012, + "step": 22081 + }, + { + "epoch": 28.346598202824133, + "grad_norm": 3.223784923553467, + "learning_rate": 2.3887890457851948e-05, + "loss": 0.3489, + "step": 22082 + }, + { + "epoch": 28.34788189987163, + "grad_norm": 1.1113718748092651, + "learning_rate": 2.3887462558836117e-05, + "loss": 0.2779, + "step": 22083 + }, + { + "epoch": 28.34916559691913, + "grad_norm": 0.9685734510421753, + "learning_rate": 2.3887034659820282e-05, + "loss": 0.2931, + "step": 22084 + }, + { + "epoch": 28.350449293966623, + "grad_norm": 1.3766536712646484, + "learning_rate": 2.388660676080445e-05, + "loss": 0.2939, + "step": 22085 + }, + { + "epoch": 28.35173299101412, + "grad_norm": 0.9945179224014282, + "learning_rate": 2.388617886178862e-05, + "loss": 0.2794, + "step": 22086 + }, + { + "epoch": 28.35301668806162, + "grad_norm": 2.0204098224639893, + "learning_rate": 2.3885750962772787e-05, + "loss": 0.3095, + "step": 22087 + }, + { + "epoch": 28.354300385109113, + "grad_norm": 2.2280728816986084, + "learning_rate": 2.3885323063756955e-05, + "loss": 0.291, + "step": 22088 + }, + { + "epoch": 28.35558408215661, + "grad_norm": 1.857338786125183, + "learning_rate": 2.388489516474112e-05, + "loss": 0.2658, + "step": 22089 + }, + { + "epoch": 28.35686777920411, + "grad_norm": 0.9881101250648499, + "learning_rate": 2.388446726572529e-05, + "loss": 0.2846, + "step": 22090 + }, + { + "epoch": 28.358151476251603, + "grad_norm": 1.5625675916671753, + "learning_rate": 2.3884039366709457e-05, + "loss": 0.2805, + "step": 22091 + }, + { + "epoch": 28.3594351732991, + "grad_norm": 1.6337982416152954, + "learning_rate": 2.3883611467693622e-05, + "loss": 0.2947, + "step": 22092 + }, + { + "epoch": 28.3607188703466, + "grad_norm": 1.2361539602279663, + "learning_rate": 2.3883183568677794e-05, + "loss": 0.2939, + "step": 22093 + }, + { + "epoch": 28.362002567394097, + "grad_norm": 1.4821579456329346, + "learning_rate": 2.388275566966196e-05, + "loss": 0.2771, 
+ "step": 22094 + }, + { + "epoch": 28.36328626444159, + "grad_norm": 1.3844120502471924, + "learning_rate": 2.388232777064613e-05, + "loss": 0.299, + "step": 22095 + }, + { + "epoch": 28.36456996148909, + "grad_norm": 1.0666615962982178, + "learning_rate": 2.3881899871630296e-05, + "loss": 0.3076, + "step": 22096 + }, + { + "epoch": 28.365853658536587, + "grad_norm": 1.4661866426467896, + "learning_rate": 2.388147197261446e-05, + "loss": 0.3213, + "step": 22097 + }, + { + "epoch": 28.36713735558408, + "grad_norm": 1.0132389068603516, + "learning_rate": 2.3881044073598633e-05, + "loss": 0.3071, + "step": 22098 + }, + { + "epoch": 28.36842105263158, + "grad_norm": 1.457556128501892, + "learning_rate": 2.3880616174582798e-05, + "loss": 0.3271, + "step": 22099 + }, + { + "epoch": 28.369704749679077, + "grad_norm": 1.9333382844924927, + "learning_rate": 2.3880188275566966e-05, + "loss": 0.2793, + "step": 22100 + }, + { + "epoch": 28.37098844672657, + "grad_norm": 1.400619387626648, + "learning_rate": 2.3879760376551135e-05, + "loss": 0.3223, + "step": 22101 + }, + { + "epoch": 28.37227214377407, + "grad_norm": 1.1605415344238281, + "learning_rate": 2.3879332477535303e-05, + "loss": 0.2948, + "step": 22102 + }, + { + "epoch": 28.373555840821567, + "grad_norm": 1.0289288759231567, + "learning_rate": 2.387890457851947e-05, + "loss": 0.3463, + "step": 22103 + }, + { + "epoch": 28.374839537869065, + "grad_norm": 3.5675034523010254, + "learning_rate": 2.3878476679503636e-05, + "loss": 0.3359, + "step": 22104 + }, + { + "epoch": 28.37612323491656, + "grad_norm": 1.9931949377059937, + "learning_rate": 2.3878048780487805e-05, + "loss": 0.3604, + "step": 22105 + }, + { + "epoch": 28.377406931964057, + "grad_norm": 1.763211727142334, + "learning_rate": 2.3877620881471973e-05, + "loss": 0.3328, + "step": 22106 + }, + { + "epoch": 28.378690629011555, + "grad_norm": 1.8945770263671875, + "learning_rate": 2.387719298245614e-05, + "loss": 0.3483, + "step": 22107 + }, + { + "epoch": 
28.37997432605905, + "grad_norm": 1.7942837476730347, + "learning_rate": 2.3876765083440307e-05, + "loss": 0.3277, + "step": 22108 + }, + { + "epoch": 28.381258023106547, + "grad_norm": 2.0578014850616455, + "learning_rate": 2.387633718442448e-05, + "loss": 0.3194, + "step": 22109 + }, + { + "epoch": 28.382541720154045, + "grad_norm": 2.2666656970977783, + "learning_rate": 2.3875909285408644e-05, + "loss": 0.3991, + "step": 22110 + }, + { + "epoch": 28.38382541720154, + "grad_norm": 2.490402936935425, + "learning_rate": 2.3875481386392812e-05, + "loss": 0.3548, + "step": 22111 + }, + { + "epoch": 28.385109114249037, + "grad_norm": 2.583385467529297, + "learning_rate": 2.387505348737698e-05, + "loss": 0.5103, + "step": 22112 + }, + { + "epoch": 28.386392811296535, + "grad_norm": 1.1521458625793457, + "learning_rate": 2.3874625588361145e-05, + "loss": 0.2834, + "step": 22113 + }, + { + "epoch": 28.387676508344033, + "grad_norm": 5.276712417602539, + "learning_rate": 2.3874197689345317e-05, + "loss": 0.2981, + "step": 22114 + }, + { + "epoch": 28.388960205391527, + "grad_norm": 1.7419055700302124, + "learning_rate": 2.3873769790329482e-05, + "loss": 0.3104, + "step": 22115 + }, + { + "epoch": 28.390243902439025, + "grad_norm": 1.6024196147918701, + "learning_rate": 2.387334189131365e-05, + "loss": 0.3121, + "step": 22116 + }, + { + "epoch": 28.391527599486523, + "grad_norm": 2.5680975914001465, + "learning_rate": 2.387291399229782e-05, + "loss": 0.2978, + "step": 22117 + }, + { + "epoch": 28.392811296534017, + "grad_norm": 1.4354758262634277, + "learning_rate": 2.3872486093281984e-05, + "loss": 0.3005, + "step": 22118 + }, + { + "epoch": 28.394094993581515, + "grad_norm": 1.6592106819152832, + "learning_rate": 2.3872058194266156e-05, + "loss": 0.2942, + "step": 22119 + }, + { + "epoch": 28.395378690629013, + "grad_norm": 0.8905748128890991, + "learning_rate": 2.387163029525032e-05, + "loss": 0.2943, + "step": 22120 + }, + { + "epoch": 28.396662387676507, + 
"grad_norm": 1.019070029258728, + "learning_rate": 2.387120239623449e-05, + "loss": 0.2899, + "step": 22121 + }, + { + "epoch": 28.397946084724005, + "grad_norm": 3.408198833465576, + "learning_rate": 2.3870774497218658e-05, + "loss": 0.2902, + "step": 22122 + }, + { + "epoch": 28.399229781771503, + "grad_norm": 1.4894447326660156, + "learning_rate": 2.3870346598202826e-05, + "loss": 0.2952, + "step": 22123 + }, + { + "epoch": 28.400513478818997, + "grad_norm": 1.2674767971038818, + "learning_rate": 2.386991869918699e-05, + "loss": 0.3116, + "step": 22124 + }, + { + "epoch": 28.401797175866495, + "grad_norm": 0.9861820936203003, + "learning_rate": 2.386949080017116e-05, + "loss": 0.3015, + "step": 22125 + }, + { + "epoch": 28.403080872913993, + "grad_norm": 2.9955992698669434, + "learning_rate": 2.3869062901155328e-05, + "loss": 0.2862, + "step": 22126 + }, + { + "epoch": 28.40436456996149, + "grad_norm": 1.3264442682266235, + "learning_rate": 2.3868635002139496e-05, + "loss": 0.3091, + "step": 22127 + }, + { + "epoch": 28.405648267008985, + "grad_norm": 0.8295482397079468, + "learning_rate": 2.3868207103123665e-05, + "loss": 0.2937, + "step": 22128 + }, + { + "epoch": 28.406931964056483, + "grad_norm": 1.2983595132827759, + "learning_rate": 2.386777920410783e-05, + "loss": 0.2882, + "step": 22129 + }, + { + "epoch": 28.40821566110398, + "grad_norm": 2.4126601219177246, + "learning_rate": 2.3867351305092e-05, + "loss": 0.2885, + "step": 22130 + }, + { + "epoch": 28.409499358151475, + "grad_norm": 1.475412368774414, + "learning_rate": 2.3866923406076167e-05, + "loss": 0.3054, + "step": 22131 + }, + { + "epoch": 28.410783055198973, + "grad_norm": 0.9311888813972473, + "learning_rate": 2.386649550706033e-05, + "loss": 0.3198, + "step": 22132 + }, + { + "epoch": 28.41206675224647, + "grad_norm": 1.6352510452270508, + "learning_rate": 2.3866067608044503e-05, + "loss": 0.2969, + "step": 22133 + }, + { + "epoch": 28.413350449293965, + "grad_norm": 2.1146881580352783, + 
"learning_rate": 2.386563970902867e-05, + "loss": 0.3196, + "step": 22134 + }, + { + "epoch": 28.414634146341463, + "grad_norm": 1.1347984075546265, + "learning_rate": 2.386521181001284e-05, + "loss": 0.2893, + "step": 22135 + }, + { + "epoch": 28.41591784338896, + "grad_norm": 1.00575590133667, + "learning_rate": 2.3864783910997005e-05, + "loss": 0.2881, + "step": 22136 + }, + { + "epoch": 28.41720154043646, + "grad_norm": 0.9908257722854614, + "learning_rate": 2.3864356011981174e-05, + "loss": 0.292, + "step": 22137 + }, + { + "epoch": 28.418485237483953, + "grad_norm": 3.1550941467285156, + "learning_rate": 2.3863928112965342e-05, + "loss": 0.3321, + "step": 22138 + }, + { + "epoch": 28.41976893453145, + "grad_norm": 1.1330466270446777, + "learning_rate": 2.3863500213949507e-05, + "loss": 0.3168, + "step": 22139 + }, + { + "epoch": 28.42105263157895, + "grad_norm": 1.1012632846832275, + "learning_rate": 2.3863072314933676e-05, + "loss": 0.2865, + "step": 22140 + }, + { + "epoch": 28.422336328626443, + "grad_norm": 1.6529231071472168, + "learning_rate": 2.3862644415917844e-05, + "loss": 0.3221, + "step": 22141 + }, + { + "epoch": 28.42362002567394, + "grad_norm": 1.1190212965011597, + "learning_rate": 2.3862216516902012e-05, + "loss": 0.2986, + "step": 22142 + }, + { + "epoch": 28.42490372272144, + "grad_norm": 1.9189904928207397, + "learning_rate": 2.386178861788618e-05, + "loss": 0.315, + "step": 22143 + }, + { + "epoch": 28.426187419768933, + "grad_norm": 0.8851519227027893, + "learning_rate": 2.386136071887035e-05, + "loss": 0.3056, + "step": 22144 + }, + { + "epoch": 28.42747111681643, + "grad_norm": 1.2167861461639404, + "learning_rate": 2.3860932819854514e-05, + "loss": 0.3418, + "step": 22145 + }, + { + "epoch": 28.42875481386393, + "grad_norm": 1.6825650930404663, + "learning_rate": 2.3860504920838683e-05, + "loss": 0.3405, + "step": 22146 + }, + { + "epoch": 28.430038510911427, + "grad_norm": 1.7578907012939453, + "learning_rate": 2.386007702182285e-05, 
+ "loss": 0.2783, + "step": 22147 + }, + { + "epoch": 28.43132220795892, + "grad_norm": 1.3559329509735107, + "learning_rate": 2.3859649122807016e-05, + "loss": 0.3131, + "step": 22148 + }, + { + "epoch": 28.43260590500642, + "grad_norm": 1.5207241773605347, + "learning_rate": 2.3859221223791188e-05, + "loss": 0.3203, + "step": 22149 + }, + { + "epoch": 28.433889602053917, + "grad_norm": 1.3704733848571777, + "learning_rate": 2.3858793324775353e-05, + "loss": 0.2985, + "step": 22150 + }, + { + "epoch": 28.43517329910141, + "grad_norm": 2.2768378257751465, + "learning_rate": 2.385836542575952e-05, + "loss": 0.3125, + "step": 22151 + }, + { + "epoch": 28.43645699614891, + "grad_norm": 1.054979920387268, + "learning_rate": 2.385793752674369e-05, + "loss": 0.3438, + "step": 22152 + }, + { + "epoch": 28.437740693196407, + "grad_norm": 1.148616075515747, + "learning_rate": 2.3857509627727855e-05, + "loss": 0.3521, + "step": 22153 + }, + { + "epoch": 28.4390243902439, + "grad_norm": 1.1903942823410034, + "learning_rate": 2.3857081728712027e-05, + "loss": 0.3448, + "step": 22154 + }, + { + "epoch": 28.4403080872914, + "grad_norm": 1.817764401435852, + "learning_rate": 2.385665382969619e-05, + "loss": 0.3099, + "step": 22155 + }, + { + "epoch": 28.441591784338897, + "grad_norm": 1.8342422246932983, + "learning_rate": 2.385622593068036e-05, + "loss": 0.2813, + "step": 22156 + }, + { + "epoch": 28.44287548138639, + "grad_norm": 2.535047769546509, + "learning_rate": 2.385579803166453e-05, + "loss": 0.39, + "step": 22157 + }, + { + "epoch": 28.44415917843389, + "grad_norm": 2.4507858753204346, + "learning_rate": 2.3855370132648693e-05, + "loss": 0.3431, + "step": 22158 + }, + { + "epoch": 28.445442875481387, + "grad_norm": 1.3564865589141846, + "learning_rate": 2.3854942233632865e-05, + "loss": 0.3129, + "step": 22159 + }, + { + "epoch": 28.446726572528885, + "grad_norm": 1.443880319595337, + "learning_rate": 2.385451433461703e-05, + "loss": 0.3763, + "step": 22160 + }, + { + 
"epoch": 28.44801026957638, + "grad_norm": 2.855541944503784, + "learning_rate": 2.38540864356012e-05, + "loss": 0.3947, + "step": 22161 + }, + { + "epoch": 28.449293966623877, + "grad_norm": 2.0983834266662598, + "learning_rate": 2.3853658536585367e-05, + "loss": 0.4655, + "step": 22162 + }, + { + "epoch": 28.450577663671375, + "grad_norm": 1.5787303447723389, + "learning_rate": 2.3853230637569535e-05, + "loss": 0.2938, + "step": 22163 + }, + { + "epoch": 28.45186136071887, + "grad_norm": 0.8750435709953308, + "learning_rate": 2.38528027385537e-05, + "loss": 0.2938, + "step": 22164 + }, + { + "epoch": 28.453145057766367, + "grad_norm": 0.8742973208427429, + "learning_rate": 2.385237483953787e-05, + "loss": 0.295, + "step": 22165 + }, + { + "epoch": 28.454428754813865, + "grad_norm": 1.9464197158813477, + "learning_rate": 2.3851946940522037e-05, + "loss": 0.33, + "step": 22166 + }, + { + "epoch": 28.45571245186136, + "grad_norm": 1.334641456604004, + "learning_rate": 2.3851519041506206e-05, + "loss": 0.2878, + "step": 22167 + }, + { + "epoch": 28.456996148908857, + "grad_norm": 1.2322996854782104, + "learning_rate": 2.3851091142490374e-05, + "loss": 0.3105, + "step": 22168 + }, + { + "epoch": 28.458279845956355, + "grad_norm": 1.3508851528167725, + "learning_rate": 2.385066324347454e-05, + "loss": 0.3291, + "step": 22169 + }, + { + "epoch": 28.459563543003853, + "grad_norm": 0.9900733232498169, + "learning_rate": 2.385023534445871e-05, + "loss": 0.3128, + "step": 22170 + }, + { + "epoch": 28.460847240051347, + "grad_norm": 0.8766537308692932, + "learning_rate": 2.3849807445442876e-05, + "loss": 0.2988, + "step": 22171 + }, + { + "epoch": 28.462130937098845, + "grad_norm": 1.8247873783111572, + "learning_rate": 2.384937954642704e-05, + "loss": 0.3144, + "step": 22172 + }, + { + "epoch": 28.463414634146343, + "grad_norm": 0.8703067302703857, + "learning_rate": 2.3848951647411213e-05, + "loss": 0.3244, + "step": 22173 + }, + { + "epoch": 28.464698331193837, + 
"grad_norm": 0.9273424744606018, + "learning_rate": 2.3848523748395378e-05, + "loss": 0.3032, + "step": 22174 + }, + { + "epoch": 28.465982028241335, + "grad_norm": 0.8671573400497437, + "learning_rate": 2.384809584937955e-05, + "loss": 0.3162, + "step": 22175 + }, + { + "epoch": 28.467265725288833, + "grad_norm": 1.1092565059661865, + "learning_rate": 2.3847667950363715e-05, + "loss": 0.3052, + "step": 22176 + }, + { + "epoch": 28.468549422336327, + "grad_norm": 1.3122117519378662, + "learning_rate": 2.3847240051347883e-05, + "loss": 0.335, + "step": 22177 + }, + { + "epoch": 28.469833119383825, + "grad_norm": 8.283514022827148, + "learning_rate": 2.384681215233205e-05, + "loss": 0.3061, + "step": 22178 + }, + { + "epoch": 28.471116816431323, + "grad_norm": 2.0363028049468994, + "learning_rate": 2.3846384253316216e-05, + "loss": 0.2844, + "step": 22179 + }, + { + "epoch": 28.47240051347882, + "grad_norm": 1.1427124738693237, + "learning_rate": 2.3845956354300385e-05, + "loss": 0.2709, + "step": 22180 + }, + { + "epoch": 28.473684210526315, + "grad_norm": 1.1169447898864746, + "learning_rate": 2.3845528455284553e-05, + "loss": 0.3128, + "step": 22181 + }, + { + "epoch": 28.474967907573813, + "grad_norm": 1.4218556880950928, + "learning_rate": 2.3845100556268722e-05, + "loss": 0.334, + "step": 22182 + }, + { + "epoch": 28.47625160462131, + "grad_norm": 0.9641687273979187, + "learning_rate": 2.384467265725289e-05, + "loss": 0.3281, + "step": 22183 + }, + { + "epoch": 28.477535301668805, + "grad_norm": 1.1569561958312988, + "learning_rate": 2.384424475823706e-05, + "loss": 0.2923, + "step": 22184 + }, + { + "epoch": 28.478818998716303, + "grad_norm": 1.5350940227508545, + "learning_rate": 2.3843816859221224e-05, + "loss": 0.3199, + "step": 22185 + }, + { + "epoch": 28.4801026957638, + "grad_norm": 1.0217549800872803, + "learning_rate": 2.3843388960205392e-05, + "loss": 0.3307, + "step": 22186 + }, + { + "epoch": 28.481386392811295, + "grad_norm": 1.088158130645752, + 
"learning_rate": 2.384296106118956e-05, + "loss": 0.3145, + "step": 22187 + }, + { + "epoch": 28.482670089858793, + "grad_norm": 1.3648184537887573, + "learning_rate": 2.3842533162173725e-05, + "loss": 0.3356, + "step": 22188 + }, + { + "epoch": 28.48395378690629, + "grad_norm": 2.0327565670013428, + "learning_rate": 2.3842105263157897e-05, + "loss": 0.2999, + "step": 22189 + }, + { + "epoch": 28.485237483953785, + "grad_norm": 2.1574296951293945, + "learning_rate": 2.3841677364142062e-05, + "loss": 0.334, + "step": 22190 + }, + { + "epoch": 28.486521181001283, + "grad_norm": 1.364201545715332, + "learning_rate": 2.384124946512623e-05, + "loss": 0.3175, + "step": 22191 + }, + { + "epoch": 28.48780487804878, + "grad_norm": 1.0909526348114014, + "learning_rate": 2.38408215661104e-05, + "loss": 0.3551, + "step": 22192 + }, + { + "epoch": 28.48908857509628, + "grad_norm": 0.9656968712806702, + "learning_rate": 2.3840393667094564e-05, + "loss": 0.2896, + "step": 22193 + }, + { + "epoch": 28.490372272143773, + "grad_norm": 1.252185583114624, + "learning_rate": 2.3839965768078736e-05, + "loss": 0.3304, + "step": 22194 + }, + { + "epoch": 28.49165596919127, + "grad_norm": 2.0793888568878174, + "learning_rate": 2.38395378690629e-05, + "loss": 0.2957, + "step": 22195 + }, + { + "epoch": 28.49293966623877, + "grad_norm": 1.799634575843811, + "learning_rate": 2.383910997004707e-05, + "loss": 0.3186, + "step": 22196 + }, + { + "epoch": 28.494223363286263, + "grad_norm": 2.0974435806274414, + "learning_rate": 2.3838682071031238e-05, + "loss": 0.3166, + "step": 22197 + }, + { + "epoch": 28.49550706033376, + "grad_norm": 1.3855832815170288, + "learning_rate": 2.3838254172015406e-05, + "loss": 0.3512, + "step": 22198 + }, + { + "epoch": 28.49679075738126, + "grad_norm": 2.173262357711792, + "learning_rate": 2.383782627299957e-05, + "loss": 0.3156, + "step": 22199 + }, + { + "epoch": 28.498074454428753, + "grad_norm": 6.0755767822265625, + "learning_rate": 2.383739837398374e-05, + 
"loss": 0.3323, + "step": 22200 + }, + { + "epoch": 28.49935815147625, + "grad_norm": 1.0211403369903564, + "learning_rate": 2.3836970474967908e-05, + "loss": 0.3154, + "step": 22201 + }, + { + "epoch": 28.50064184852375, + "grad_norm": 1.1615917682647705, + "learning_rate": 2.3836542575952076e-05, + "loss": 0.3166, + "step": 22202 + }, + { + "epoch": 28.501925545571247, + "grad_norm": 2.469339370727539, + "learning_rate": 2.3836114676936245e-05, + "loss": 0.3177, + "step": 22203 + }, + { + "epoch": 28.50320924261874, + "grad_norm": 1.4674291610717773, + "learning_rate": 2.383568677792041e-05, + "loss": 0.3224, + "step": 22204 + }, + { + "epoch": 28.50449293966624, + "grad_norm": 1.2421133518218994, + "learning_rate": 2.383525887890458e-05, + "loss": 0.3504, + "step": 22205 + }, + { + "epoch": 28.505776636713737, + "grad_norm": 1.6845386028289795, + "learning_rate": 2.3834830979888747e-05, + "loss": 0.3389, + "step": 22206 + }, + { + "epoch": 28.50706033376123, + "grad_norm": 1.91936457157135, + "learning_rate": 2.383440308087291e-05, + "loss": 0.3622, + "step": 22207 + }, + { + "epoch": 28.50834403080873, + "grad_norm": 1.9852485656738281, + "learning_rate": 2.3833975181857083e-05, + "loss": 0.4108, + "step": 22208 + }, + { + "epoch": 28.509627727856227, + "grad_norm": 3.546316146850586, + "learning_rate": 2.383354728284125e-05, + "loss": 0.3702, + "step": 22209 + }, + { + "epoch": 28.51091142490372, + "grad_norm": 2.484304428100586, + "learning_rate": 2.383311938382542e-05, + "loss": 0.3744, + "step": 22210 + }, + { + "epoch": 28.51219512195122, + "grad_norm": 1.7604998350143433, + "learning_rate": 2.3832691484809585e-05, + "loss": 0.4438, + "step": 22211 + }, + { + "epoch": 28.513478818998717, + "grad_norm": 2.3096697330474854, + "learning_rate": 2.3832263585793754e-05, + "loss": 0.4801, + "step": 22212 + }, + { + "epoch": 28.514762516046215, + "grad_norm": 0.9762553572654724, + "learning_rate": 2.3831835686777922e-05, + "loss": 0.2818, + "step": 22213 + }, + { 
+ "epoch": 28.51604621309371, + "grad_norm": 1.6435277462005615, + "learning_rate": 2.3831407787762087e-05, + "loss": 0.3137, + "step": 22214 + }, + { + "epoch": 28.517329910141207, + "grad_norm": 2.206843376159668, + "learning_rate": 2.3830979888746256e-05, + "loss": 0.3168, + "step": 22215 + }, + { + "epoch": 28.518613607188705, + "grad_norm": 0.881407618522644, + "learning_rate": 2.3830551989730424e-05, + "loss": 0.2849, + "step": 22216 + }, + { + "epoch": 28.5198973042362, + "grad_norm": 1.5529978275299072, + "learning_rate": 2.3830124090714592e-05, + "loss": 0.3152, + "step": 22217 + }, + { + "epoch": 28.521181001283697, + "grad_norm": 0.9434942007064819, + "learning_rate": 2.382969619169876e-05, + "loss": 0.3316, + "step": 22218 + }, + { + "epoch": 28.522464698331195, + "grad_norm": 1.1270971298217773, + "learning_rate": 2.3829268292682926e-05, + "loss": 0.3132, + "step": 22219 + }, + { + "epoch": 28.52374839537869, + "grad_norm": 0.730718731880188, + "learning_rate": 2.3828840393667094e-05, + "loss": 0.2948, + "step": 22220 + }, + { + "epoch": 28.525032092426187, + "grad_norm": 1.1844286918640137, + "learning_rate": 2.3828412494651263e-05, + "loss": 0.2871, + "step": 22221 + }, + { + "epoch": 28.526315789473685, + "grad_norm": 2.999107599258423, + "learning_rate": 2.382798459563543e-05, + "loss": 0.3084, + "step": 22222 + }, + { + "epoch": 28.527599486521183, + "grad_norm": 1.4443827867507935, + "learning_rate": 2.3827556696619596e-05, + "loss": 0.3162, + "step": 22223 + }, + { + "epoch": 28.528883183568677, + "grad_norm": 1.0828361511230469, + "learning_rate": 2.3827128797603768e-05, + "loss": 0.3038, + "step": 22224 + }, + { + "epoch": 28.530166880616175, + "grad_norm": 1.825585126876831, + "learning_rate": 2.3826700898587933e-05, + "loss": 0.3188, + "step": 22225 + }, + { + "epoch": 28.531450577663673, + "grad_norm": 1.1187812089920044, + "learning_rate": 2.38262729995721e-05, + "loss": 0.3153, + "step": 22226 + }, + { + "epoch": 28.532734274711167, + 
"grad_norm": 1.3924936056137085, + "learning_rate": 2.382584510055627e-05, + "loss": 0.2938, + "step": 22227 + }, + { + "epoch": 28.534017971758665, + "grad_norm": 0.9029104113578796, + "learning_rate": 2.3825417201540435e-05, + "loss": 0.2878, + "step": 22228 + }, + { + "epoch": 28.535301668806163, + "grad_norm": 1.976366400718689, + "learning_rate": 2.3824989302524607e-05, + "loss": 0.2994, + "step": 22229 + }, + { + "epoch": 28.536585365853657, + "grad_norm": 1.0922315120697021, + "learning_rate": 2.382456140350877e-05, + "loss": 0.2975, + "step": 22230 + }, + { + "epoch": 28.537869062901155, + "grad_norm": 2.259208917617798, + "learning_rate": 2.382413350449294e-05, + "loss": 0.3173, + "step": 22231 + }, + { + "epoch": 28.539152759948653, + "grad_norm": 1.1861484050750732, + "learning_rate": 2.382370560547711e-05, + "loss": 0.3158, + "step": 22232 + }, + { + "epoch": 28.540436456996147, + "grad_norm": 0.859621524810791, + "learning_rate": 2.3823277706461273e-05, + "loss": 0.2986, + "step": 22233 + }, + { + "epoch": 28.541720154043645, + "grad_norm": 1.4512591361999512, + "learning_rate": 2.3822849807445445e-05, + "loss": 0.3014, + "step": 22234 + }, + { + "epoch": 28.543003851091143, + "grad_norm": 1.2503021955490112, + "learning_rate": 2.382242190842961e-05, + "loss": 0.3009, + "step": 22235 + }, + { + "epoch": 28.54428754813864, + "grad_norm": 0.9909793138504028, + "learning_rate": 2.382199400941378e-05, + "loss": 0.2974, + "step": 22236 + }, + { + "epoch": 28.545571245186135, + "grad_norm": 1.6014083623886108, + "learning_rate": 2.3821566110397947e-05, + "loss": 0.2968, + "step": 22237 + }, + { + "epoch": 28.546854942233633, + "grad_norm": 1.0650323629379272, + "learning_rate": 2.3821138211382116e-05, + "loss": 0.2815, + "step": 22238 + }, + { + "epoch": 28.54813863928113, + "grad_norm": 0.9986312985420227, + "learning_rate": 2.382071031236628e-05, + "loss": 0.2924, + "step": 22239 + }, + { + "epoch": 28.549422336328625, + "grad_norm": 1.079756736755371, + 
"learning_rate": 2.382028241335045e-05, + "loss": 0.3005, + "step": 22240 + }, + { + "epoch": 28.550706033376123, + "grad_norm": 1.0165456533432007, + "learning_rate": 2.3819854514334617e-05, + "loss": 0.3, + "step": 22241 + }, + { + "epoch": 28.55198973042362, + "grad_norm": 0.8663925528526306, + "learning_rate": 2.3819426615318786e-05, + "loss": 0.275, + "step": 22242 + }, + { + "epoch": 28.553273427471115, + "grad_norm": 1.7022382020950317, + "learning_rate": 2.3818998716302954e-05, + "loss": 0.3269, + "step": 22243 + }, + { + "epoch": 28.554557124518613, + "grad_norm": 2.1766841411590576, + "learning_rate": 2.381857081728712e-05, + "loss": 0.3304, + "step": 22244 + }, + { + "epoch": 28.55584082156611, + "grad_norm": 1.1251749992370605, + "learning_rate": 2.381814291827129e-05, + "loss": 0.3068, + "step": 22245 + }, + { + "epoch": 28.55712451861361, + "grad_norm": 1.4551854133605957, + "learning_rate": 2.3817715019255456e-05, + "loss": 0.3216, + "step": 22246 + }, + { + "epoch": 28.558408215661103, + "grad_norm": 1.8894281387329102, + "learning_rate": 2.381728712023962e-05, + "loss": 0.3278, + "step": 22247 + }, + { + "epoch": 28.5596919127086, + "grad_norm": 1.199860692024231, + "learning_rate": 2.3816859221223793e-05, + "loss": 0.3191, + "step": 22248 + }, + { + "epoch": 28.5609756097561, + "grad_norm": 1.1866042613983154, + "learning_rate": 2.3816431322207958e-05, + "loss": 0.2774, + "step": 22249 + }, + { + "epoch": 28.562259306803593, + "grad_norm": 1.1908921003341675, + "learning_rate": 2.381600342319213e-05, + "loss": 0.3193, + "step": 22250 + }, + { + "epoch": 28.56354300385109, + "grad_norm": 1.0516189336776733, + "learning_rate": 2.3815575524176295e-05, + "loss": 0.3127, + "step": 22251 + }, + { + "epoch": 28.56482670089859, + "grad_norm": 1.2632603645324707, + "learning_rate": 2.3815147625160463e-05, + "loss": 0.3189, + "step": 22252 + }, + { + "epoch": 28.566110397946083, + "grad_norm": 1.319783329963684, + "learning_rate": 2.381471972614463e-05, + 
"loss": 0.3438, + "step": 22253 + }, + { + "epoch": 28.56739409499358, + "grad_norm": 1.7349812984466553, + "learning_rate": 2.3814291827128797e-05, + "loss": 0.3221, + "step": 22254 + }, + { + "epoch": 28.56867779204108, + "grad_norm": 2.6299450397491455, + "learning_rate": 2.3813863928112965e-05, + "loss": 0.3345, + "step": 22255 + }, + { + "epoch": 28.569961489088577, + "grad_norm": 2.4615306854248047, + "learning_rate": 2.3813436029097133e-05, + "loss": 0.3249, + "step": 22256 + }, + { + "epoch": 28.57124518613607, + "grad_norm": 1.626952886581421, + "learning_rate": 2.3813008130081302e-05, + "loss": 0.3293, + "step": 22257 + }, + { + "epoch": 28.57252888318357, + "grad_norm": 1.4959079027175903, + "learning_rate": 2.381258023106547e-05, + "loss": 0.3003, + "step": 22258 + }, + { + "epoch": 28.573812580231067, + "grad_norm": 1.2698900699615479, + "learning_rate": 2.381215233204964e-05, + "loss": 0.2892, + "step": 22259 + }, + { + "epoch": 28.57509627727856, + "grad_norm": 1.6922367811203003, + "learning_rate": 2.3811724433033804e-05, + "loss": 0.3704, + "step": 22260 + }, + { + "epoch": 28.57637997432606, + "grad_norm": 1.7684766054153442, + "learning_rate": 2.3811296534017972e-05, + "loss": 0.438, + "step": 22261 + }, + { + "epoch": 28.577663671373557, + "grad_norm": 3.0076870918273926, + "learning_rate": 2.381086863500214e-05, + "loss": 0.4739, + "step": 22262 + }, + { + "epoch": 28.57894736842105, + "grad_norm": 1.9695299863815308, + "learning_rate": 2.3810440735986305e-05, + "loss": 0.3206, + "step": 22263 + }, + { + "epoch": 28.58023106546855, + "grad_norm": 0.9063071012496948, + "learning_rate": 2.3810012836970477e-05, + "loss": 0.3071, + "step": 22264 + }, + { + "epoch": 28.581514762516047, + "grad_norm": 1.3705488443374634, + "learning_rate": 2.3809584937954642e-05, + "loss": 0.3188, + "step": 22265 + }, + { + "epoch": 28.58279845956354, + "grad_norm": 7.711677551269531, + "learning_rate": 2.3809157038938814e-05, + "loss": 0.3219, + "step": 22266 + }, + 
{ + "epoch": 28.58408215661104, + "grad_norm": 0.857741117477417, + "learning_rate": 2.380872913992298e-05, + "loss": 0.3117, + "step": 22267 + }, + { + "epoch": 28.585365853658537, + "grad_norm": 0.7828433513641357, + "learning_rate": 2.3808301240907144e-05, + "loss": 0.2826, + "step": 22268 + }, + { + "epoch": 28.586649550706035, + "grad_norm": 1.1039127111434937, + "learning_rate": 2.3807873341891316e-05, + "loss": 0.3064, + "step": 22269 + }, + { + "epoch": 28.58793324775353, + "grad_norm": 1.0963099002838135, + "learning_rate": 2.380744544287548e-05, + "loss": 0.3068, + "step": 22270 + }, + { + "epoch": 28.589216944801027, + "grad_norm": 2.4488115310668945, + "learning_rate": 2.380701754385965e-05, + "loss": 0.3283, + "step": 22271 + }, + { + "epoch": 28.590500641848525, + "grad_norm": 0.9052548408508301, + "learning_rate": 2.3806589644843818e-05, + "loss": 0.3431, + "step": 22272 + }, + { + "epoch": 28.59178433889602, + "grad_norm": 1.2347650527954102, + "learning_rate": 2.3806161745827986e-05, + "loss": 0.2981, + "step": 22273 + }, + { + "epoch": 28.593068035943517, + "grad_norm": 1.0207911729812622, + "learning_rate": 2.3805733846812155e-05, + "loss": 0.321, + "step": 22274 + }, + { + "epoch": 28.594351732991015, + "grad_norm": 1.4152973890304565, + "learning_rate": 2.380530594779632e-05, + "loss": 0.3103, + "step": 22275 + }, + { + "epoch": 28.59563543003851, + "grad_norm": 1.6482553482055664, + "learning_rate": 2.3804878048780488e-05, + "loss": 0.2965, + "step": 22276 + }, + { + "epoch": 28.596919127086007, + "grad_norm": 0.8468120098114014, + "learning_rate": 2.3804450149764656e-05, + "loss": 0.2847, + "step": 22277 + }, + { + "epoch": 28.598202824133505, + "grad_norm": 0.9038289785385132, + "learning_rate": 2.3804022250748825e-05, + "loss": 0.3294, + "step": 22278 + }, + { + "epoch": 28.599486521181003, + "grad_norm": 1.162835955619812, + "learning_rate": 2.380359435173299e-05, + "loss": 0.3148, + "step": 22279 + }, + { + "epoch": 28.600770218228497, + 
"grad_norm": 0.726728618144989, + "learning_rate": 2.380316645271716e-05, + "loss": 0.2835, + "step": 22280 + }, + { + "epoch": 28.602053915275995, + "grad_norm": 0.984734058380127, + "learning_rate": 2.3802738553701327e-05, + "loss": 0.3215, + "step": 22281 + }, + { + "epoch": 28.603337612323493, + "grad_norm": 0.9699156880378723, + "learning_rate": 2.3802310654685495e-05, + "loss": 0.2996, + "step": 22282 + }, + { + "epoch": 28.604621309370987, + "grad_norm": 0.9361474514007568, + "learning_rate": 2.3801882755669664e-05, + "loss": 0.3136, + "step": 22283 + }, + { + "epoch": 28.605905006418485, + "grad_norm": 1.158187747001648, + "learning_rate": 2.380145485665383e-05, + "loss": 0.2674, + "step": 22284 + }, + { + "epoch": 28.607188703465983, + "grad_norm": 1.2803195714950562, + "learning_rate": 2.3801026957638e-05, + "loss": 0.2856, + "step": 22285 + }, + { + "epoch": 28.608472400513477, + "grad_norm": 1.5041279792785645, + "learning_rate": 2.3800599058622165e-05, + "loss": 0.2814, + "step": 22286 + }, + { + "epoch": 28.609756097560975, + "grad_norm": 0.882480263710022, + "learning_rate": 2.380017115960633e-05, + "loss": 0.3054, + "step": 22287 + }, + { + "epoch": 28.611039794608473, + "grad_norm": 6.66646146774292, + "learning_rate": 2.3799743260590502e-05, + "loss": 0.3082, + "step": 22288 + }, + { + "epoch": 28.61232349165597, + "grad_norm": 1.1041818857192993, + "learning_rate": 2.3799315361574667e-05, + "loss": 0.2968, + "step": 22289 + }, + { + "epoch": 28.613607188703465, + "grad_norm": 0.8416222929954529, + "learning_rate": 2.379888746255884e-05, + "loss": 0.2828, + "step": 22290 + }, + { + "epoch": 28.614890885750963, + "grad_norm": 0.9470923542976379, + "learning_rate": 2.3798459563543004e-05, + "loss": 0.2884, + "step": 22291 + }, + { + "epoch": 28.61617458279846, + "grad_norm": 1.1150728464126587, + "learning_rate": 2.3798031664527172e-05, + "loss": 0.2633, + "step": 22292 + }, + { + "epoch": 28.617458279845955, + "grad_norm": 1.4131830930709839, + 
"learning_rate": 2.379760376551134e-05, + "loss": 0.2875, + "step": 22293 + }, + { + "epoch": 28.618741976893453, + "grad_norm": 2.0489838123321533, + "learning_rate": 2.3797175866495506e-05, + "loss": 0.3144, + "step": 22294 + }, + { + "epoch": 28.62002567394095, + "grad_norm": 2.425067663192749, + "learning_rate": 2.3796747967479674e-05, + "loss": 0.3145, + "step": 22295 + }, + { + "epoch": 28.621309370988445, + "grad_norm": 1.447326898574829, + "learning_rate": 2.3796320068463843e-05, + "loss": 0.3111, + "step": 22296 + }, + { + "epoch": 28.622593068035943, + "grad_norm": 1.2979782819747925, + "learning_rate": 2.379589216944801e-05, + "loss": 0.3179, + "step": 22297 + }, + { + "epoch": 28.62387676508344, + "grad_norm": 1.0232205390930176, + "learning_rate": 2.379546427043218e-05, + "loss": 0.2965, + "step": 22298 + }, + { + "epoch": 28.625160462130935, + "grad_norm": 1.036124587059021, + "learning_rate": 2.3795036371416348e-05, + "loss": 0.3056, + "step": 22299 + }, + { + "epoch": 28.626444159178433, + "grad_norm": 1.1737852096557617, + "learning_rate": 2.3794608472400513e-05, + "loss": 0.3489, + "step": 22300 + }, + { + "epoch": 28.62772785622593, + "grad_norm": 1.9109163284301758, + "learning_rate": 2.379418057338468e-05, + "loss": 0.3082, + "step": 22301 + }, + { + "epoch": 28.62901155327343, + "grad_norm": 4.812931060791016, + "learning_rate": 2.379375267436885e-05, + "loss": 0.276, + "step": 22302 + }, + { + "epoch": 28.630295250320923, + "grad_norm": 1.2696865797042847, + "learning_rate": 2.3793324775353015e-05, + "loss": 0.3457, + "step": 22303 + }, + { + "epoch": 28.63157894736842, + "grad_norm": 1.432047963142395, + "learning_rate": 2.3792896876337187e-05, + "loss": 0.2755, + "step": 22304 + }, + { + "epoch": 28.63286264441592, + "grad_norm": 1.3213368654251099, + "learning_rate": 2.379246897732135e-05, + "loss": 0.3262, + "step": 22305 + }, + { + "epoch": 28.634146341463413, + "grad_norm": 0.9687851667404175, + "learning_rate": 2.3792041078305523e-05, 
+ "loss": 0.3404, + "step": 22306 + }, + { + "epoch": 28.63543003851091, + "grad_norm": 1.7116633653640747, + "learning_rate": 2.379161317928969e-05, + "loss": 0.3072, + "step": 22307 + }, + { + "epoch": 28.63671373555841, + "grad_norm": 1.2105060815811157, + "learning_rate": 2.3791185280273854e-05, + "loss": 0.3352, + "step": 22308 + }, + { + "epoch": 28.637997432605903, + "grad_norm": 1.4747873544692993, + "learning_rate": 2.3790757381258025e-05, + "loss": 0.357, + "step": 22309 + }, + { + "epoch": 28.6392811296534, + "grad_norm": 2.364773750305176, + "learning_rate": 2.379032948224219e-05, + "loss": 0.3747, + "step": 22310 + }, + { + "epoch": 28.6405648267009, + "grad_norm": 2.2892627716064453, + "learning_rate": 2.378990158322636e-05, + "loss": 0.3978, + "step": 22311 + }, + { + "epoch": 28.641848523748397, + "grad_norm": 3.687654972076416, + "learning_rate": 2.3789473684210527e-05, + "loss": 0.575, + "step": 22312 + }, + { + "epoch": 28.64313222079589, + "grad_norm": 1.1704188585281372, + "learning_rate": 2.3789045785194696e-05, + "loss": 0.3127, + "step": 22313 + }, + { + "epoch": 28.64441591784339, + "grad_norm": 0.7672255635261536, + "learning_rate": 2.3788617886178864e-05, + "loss": 0.3136, + "step": 22314 + }, + { + "epoch": 28.645699614890887, + "grad_norm": 0.8161812424659729, + "learning_rate": 2.378818998716303e-05, + "loss": 0.2774, + "step": 22315 + }, + { + "epoch": 28.64698331193838, + "grad_norm": 0.8012319207191467, + "learning_rate": 2.3787762088147197e-05, + "loss": 0.3127, + "step": 22316 + }, + { + "epoch": 28.64826700898588, + "grad_norm": 1.2861442565917969, + "learning_rate": 2.3787334189131366e-05, + "loss": 0.297, + "step": 22317 + }, + { + "epoch": 28.649550706033377, + "grad_norm": 0.9636515378952026, + "learning_rate": 2.3786906290115534e-05, + "loss": 0.3095, + "step": 22318 + }, + { + "epoch": 28.65083440308087, + "grad_norm": 0.9597885012626648, + "learning_rate": 2.37864783910997e-05, + "loss": 0.3133, + "step": 22319 + }, + { + 
"epoch": 28.65211810012837, + "grad_norm": 0.8257044553756714, + "learning_rate": 2.378605049208387e-05, + "loss": 0.2918, + "step": 22320 + }, + { + "epoch": 28.653401797175867, + "grad_norm": 1.5697615146636963, + "learning_rate": 2.3785622593068036e-05, + "loss": 0.2697, + "step": 22321 + }, + { + "epoch": 28.654685494223365, + "grad_norm": 0.6812276840209961, + "learning_rate": 2.3785194694052204e-05, + "loss": 0.2959, + "step": 22322 + }, + { + "epoch": 28.65596919127086, + "grad_norm": 2.3533225059509277, + "learning_rate": 2.3784766795036373e-05, + "loss": 0.2756, + "step": 22323 + }, + { + "epoch": 28.657252888318357, + "grad_norm": 0.8575966954231262, + "learning_rate": 2.3784338896020538e-05, + "loss": 0.3022, + "step": 22324 + }, + { + "epoch": 28.658536585365855, + "grad_norm": 4.334560871124268, + "learning_rate": 2.378391099700471e-05, + "loss": 0.2673, + "step": 22325 + }, + { + "epoch": 28.65982028241335, + "grad_norm": 2.359642505645752, + "learning_rate": 2.3783483097988875e-05, + "loss": 0.3125, + "step": 22326 + }, + { + "epoch": 28.661103979460847, + "grad_norm": 0.813757598400116, + "learning_rate": 2.3783055198973043e-05, + "loss": 0.2873, + "step": 22327 + }, + { + "epoch": 28.662387676508345, + "grad_norm": 17.938020706176758, + "learning_rate": 2.378262729995721e-05, + "loss": 0.3046, + "step": 22328 + }, + { + "epoch": 28.66367137355584, + "grad_norm": 0.8691705465316772, + "learning_rate": 2.3782199400941377e-05, + "loss": 0.2883, + "step": 22329 + }, + { + "epoch": 28.664955070603337, + "grad_norm": 1.752822756767273, + "learning_rate": 2.378177150192555e-05, + "loss": 0.2855, + "step": 22330 + }, + { + "epoch": 28.666238767650835, + "grad_norm": 1.8408385515213013, + "learning_rate": 2.3781343602909713e-05, + "loss": 0.3039, + "step": 22331 + }, + { + "epoch": 28.66752246469833, + "grad_norm": 1.2335487604141235, + "learning_rate": 2.3780915703893882e-05, + "loss": 0.3033, + "step": 22332 + }, + { + "epoch": 28.668806161745827, + 
"grad_norm": 1.4468436241149902, + "learning_rate": 2.378048780487805e-05, + "loss": 0.3181, + "step": 22333 + }, + { + "epoch": 28.670089858793325, + "grad_norm": 0.986322283744812, + "learning_rate": 2.378005990586222e-05, + "loss": 0.2934, + "step": 22334 + }, + { + "epoch": 28.671373555840823, + "grad_norm": 1.3846060037612915, + "learning_rate": 2.3779632006846384e-05, + "loss": 0.3066, + "step": 22335 + }, + { + "epoch": 28.672657252888317, + "grad_norm": 1.1058425903320312, + "learning_rate": 2.3779204107830552e-05, + "loss": 0.271, + "step": 22336 + }, + { + "epoch": 28.673940949935815, + "grad_norm": 1.0716018676757812, + "learning_rate": 2.377877620881472e-05, + "loss": 0.2883, + "step": 22337 + }, + { + "epoch": 28.675224646983313, + "grad_norm": 1.1417542695999146, + "learning_rate": 2.377834830979889e-05, + "loss": 0.2742, + "step": 22338 + }, + { + "epoch": 28.676508344030808, + "grad_norm": 2.576460123062134, + "learning_rate": 2.3777920410783057e-05, + "loss": 0.2865, + "step": 22339 + }, + { + "epoch": 28.677792041078305, + "grad_norm": 1.2134445905685425, + "learning_rate": 2.3777492511767222e-05, + "loss": 0.2669, + "step": 22340 + }, + { + "epoch": 28.679075738125803, + "grad_norm": 1.632873773574829, + "learning_rate": 2.377706461275139e-05, + "loss": 0.2749, + "step": 22341 + }, + { + "epoch": 28.680359435173298, + "grad_norm": 0.95890873670578, + "learning_rate": 2.377663671373556e-05, + "loss": 0.305, + "step": 22342 + }, + { + "epoch": 28.681643132220795, + "grad_norm": 1.3946800231933594, + "learning_rate": 2.3776208814719724e-05, + "loss": 0.285, + "step": 22343 + }, + { + "epoch": 28.682926829268293, + "grad_norm": 1.2150144577026367, + "learning_rate": 2.3775780915703896e-05, + "loss": 0.2761, + "step": 22344 + }, + { + "epoch": 28.68421052631579, + "grad_norm": 1.1950002908706665, + "learning_rate": 2.377535301668806e-05, + "loss": 0.3029, + "step": 22345 + }, + { + "epoch": 28.685494223363285, + "grad_norm": 1.9268536567687988, + 
"learning_rate": 2.3774925117672233e-05, + "loss": 0.333, + "step": 22346 + }, + { + "epoch": 28.686777920410783, + "grad_norm": 1.0640681982040405, + "learning_rate": 2.3774497218656398e-05, + "loss": 0.29, + "step": 22347 + }, + { + "epoch": 28.68806161745828, + "grad_norm": 1.2261232137680054, + "learning_rate": 2.3774069319640563e-05, + "loss": 0.2787, + "step": 22348 + }, + { + "epoch": 28.689345314505776, + "grad_norm": 2.2200582027435303, + "learning_rate": 2.3773641420624735e-05, + "loss": 0.3355, + "step": 22349 + }, + { + "epoch": 28.690629011553273, + "grad_norm": 7.824954032897949, + "learning_rate": 2.37732135216089e-05, + "loss": 0.3032, + "step": 22350 + }, + { + "epoch": 28.69191270860077, + "grad_norm": 2.3694040775299072, + "learning_rate": 2.3772785622593068e-05, + "loss": 0.3306, + "step": 22351 + }, + { + "epoch": 28.693196405648266, + "grad_norm": 0.9946658611297607, + "learning_rate": 2.3772357723577237e-05, + "loss": 0.3298, + "step": 22352 + }, + { + "epoch": 28.694480102695763, + "grad_norm": 2.9222919940948486, + "learning_rate": 2.3771929824561405e-05, + "loss": 0.3432, + "step": 22353 + }, + { + "epoch": 28.69576379974326, + "grad_norm": 2.673553705215454, + "learning_rate": 2.3771501925545573e-05, + "loss": 0.3419, + "step": 22354 + }, + { + "epoch": 28.69704749679076, + "grad_norm": 1.4028878211975098, + "learning_rate": 2.377107402652974e-05, + "loss": 0.3286, + "step": 22355 + }, + { + "epoch": 28.698331193838253, + "grad_norm": 1.4723727703094482, + "learning_rate": 2.3770646127513907e-05, + "loss": 0.3328, + "step": 22356 + }, + { + "epoch": 28.69961489088575, + "grad_norm": 1.5649527311325073, + "learning_rate": 2.3770218228498075e-05, + "loss": 0.3412, + "step": 22357 + }, + { + "epoch": 28.70089858793325, + "grad_norm": 2.0729832649230957, + "learning_rate": 2.3769790329482244e-05, + "loss": 0.343, + "step": 22358 + }, + { + "epoch": 28.702182284980744, + "grad_norm": 8.825000762939453, + "learning_rate": 2.376936243046641e-05, 
+ "loss": 0.3533, + "step": 22359 + }, + { + "epoch": 28.70346598202824, + "grad_norm": 1.406335473060608, + "learning_rate": 2.376893453145058e-05, + "loss": 0.378, + "step": 22360 + }, + { + "epoch": 28.70474967907574, + "grad_norm": 1.6568132638931274, + "learning_rate": 2.3768506632434745e-05, + "loss": 0.4398, + "step": 22361 + }, + { + "epoch": 28.706033376123234, + "grad_norm": 1.962081789970398, + "learning_rate": 2.3768078733418914e-05, + "loss": 0.4788, + "step": 22362 + }, + { + "epoch": 28.70731707317073, + "grad_norm": 0.8415127396583557, + "learning_rate": 2.3767650834403082e-05, + "loss": 0.3048, + "step": 22363 + }, + { + "epoch": 28.70860077021823, + "grad_norm": 1.598313570022583, + "learning_rate": 2.3767222935387247e-05, + "loss": 0.3044, + "step": 22364 + }, + { + "epoch": 28.709884467265724, + "grad_norm": 1.6730554103851318, + "learning_rate": 2.376679503637142e-05, + "loss": 0.3021, + "step": 22365 + }, + { + "epoch": 28.71116816431322, + "grad_norm": 1.2561485767364502, + "learning_rate": 2.3766367137355584e-05, + "loss": 0.3163, + "step": 22366 + }, + { + "epoch": 28.71245186136072, + "grad_norm": 0.8109434843063354, + "learning_rate": 2.3765939238339753e-05, + "loss": 0.2885, + "step": 22367 + }, + { + "epoch": 28.713735558408217, + "grad_norm": 0.7707194685935974, + "learning_rate": 2.376551133932392e-05, + "loss": 0.3009, + "step": 22368 + }, + { + "epoch": 28.71501925545571, + "grad_norm": 1.049349069595337, + "learning_rate": 2.3765083440308086e-05, + "loss": 0.2873, + "step": 22369 + }, + { + "epoch": 28.71630295250321, + "grad_norm": 0.9698308706283569, + "learning_rate": 2.3764655541292258e-05, + "loss": 0.3121, + "step": 22370 + }, + { + "epoch": 28.717586649550707, + "grad_norm": 1.5172538757324219, + "learning_rate": 2.3764227642276423e-05, + "loss": 0.3343, + "step": 22371 + }, + { + "epoch": 28.7188703465982, + "grad_norm": 1.0999373197555542, + "learning_rate": 2.376379974326059e-05, + "loss": 0.3164, + "step": 22372 + }, + { 
+ "epoch": 28.7201540436457, + "grad_norm": 1.184808611869812, + "learning_rate": 2.376337184424476e-05, + "loss": 0.3005, + "step": 22373 + }, + { + "epoch": 28.721437740693197, + "grad_norm": 1.703782081604004, + "learning_rate": 2.3762943945228928e-05, + "loss": 0.2881, + "step": 22374 + }, + { + "epoch": 28.72272143774069, + "grad_norm": 1.8960074186325073, + "learning_rate": 2.3762516046213093e-05, + "loss": 0.2996, + "step": 22375 + }, + { + "epoch": 28.72400513478819, + "grad_norm": 1.1619963645935059, + "learning_rate": 2.376208814719726e-05, + "loss": 0.2827, + "step": 22376 + }, + { + "epoch": 28.725288831835687, + "grad_norm": 1.7073860168457031, + "learning_rate": 2.376166024818143e-05, + "loss": 0.3357, + "step": 22377 + }, + { + "epoch": 28.726572528883185, + "grad_norm": 0.9972891211509705, + "learning_rate": 2.3761232349165598e-05, + "loss": 0.3292, + "step": 22378 + }, + { + "epoch": 28.72785622593068, + "grad_norm": 1.0192445516586304, + "learning_rate": 2.3760804450149767e-05, + "loss": 0.305, + "step": 22379 + }, + { + "epoch": 28.729139922978177, + "grad_norm": 1.803245186805725, + "learning_rate": 2.3760376551133932e-05, + "loss": 0.2896, + "step": 22380 + }, + { + "epoch": 28.730423620025675, + "grad_norm": 2.2918951511383057, + "learning_rate": 2.3759948652118104e-05, + "loss": 0.2765, + "step": 22381 + }, + { + "epoch": 28.73170731707317, + "grad_norm": 1.2074607610702515, + "learning_rate": 2.375952075310227e-05, + "loss": 0.294, + "step": 22382 + }, + { + "epoch": 28.732991014120667, + "grad_norm": 1.1637773513793945, + "learning_rate": 2.3759092854086434e-05, + "loss": 0.2847, + "step": 22383 + }, + { + "epoch": 28.734274711168165, + "grad_norm": 1.9878079891204834, + "learning_rate": 2.3758664955070605e-05, + "loss": 0.3053, + "step": 22384 + }, + { + "epoch": 28.73555840821566, + "grad_norm": 1.0848369598388672, + "learning_rate": 2.375823705605477e-05, + "loss": 0.2805, + "step": 22385 + }, + { + "epoch": 28.736842105263158, + 
"grad_norm": 1.3145140409469604, + "learning_rate": 2.3757809157038942e-05, + "loss": 0.2894, + "step": 22386 + }, + { + "epoch": 28.738125802310655, + "grad_norm": 1.2818305492401123, + "learning_rate": 2.3757381258023107e-05, + "loss": 0.2868, + "step": 22387 + }, + { + "epoch": 28.739409499358153, + "grad_norm": 1.1936547756195068, + "learning_rate": 2.3756953359007276e-05, + "loss": 0.2826, + "step": 22388 + }, + { + "epoch": 28.740693196405648, + "grad_norm": 0.862038254737854, + "learning_rate": 2.3756525459991444e-05, + "loss": 0.3096, + "step": 22389 + }, + { + "epoch": 28.741976893453145, + "grad_norm": 2.4928197860717773, + "learning_rate": 2.375609756097561e-05, + "loss": 0.2788, + "step": 22390 + }, + { + "epoch": 28.743260590500643, + "grad_norm": 3.6612021923065186, + "learning_rate": 2.3755669661959777e-05, + "loss": 0.3055, + "step": 22391 + }, + { + "epoch": 28.744544287548138, + "grad_norm": 1.9584739208221436, + "learning_rate": 2.3755241762943946e-05, + "loss": 0.3545, + "step": 22392 + }, + { + "epoch": 28.745827984595635, + "grad_norm": 1.4471690654754639, + "learning_rate": 2.3754813863928114e-05, + "loss": 0.3274, + "step": 22393 + }, + { + "epoch": 28.747111681643133, + "grad_norm": 1.0326467752456665, + "learning_rate": 2.375438596491228e-05, + "loss": 0.3387, + "step": 22394 + }, + { + "epoch": 28.748395378690628, + "grad_norm": 1.7261158227920532, + "learning_rate": 2.375395806589645e-05, + "loss": 0.2885, + "step": 22395 + }, + { + "epoch": 28.749679075738126, + "grad_norm": 1.3886901140213013, + "learning_rate": 2.3753530166880616e-05, + "loss": 0.3346, + "step": 22396 + }, + { + "epoch": 28.750962772785623, + "grad_norm": 1.8252695798873901, + "learning_rate": 2.3753102267864785e-05, + "loss": 0.328, + "step": 22397 + }, + { + "epoch": 28.752246469833118, + "grad_norm": 1.552993655204773, + "learning_rate": 2.3752674368848953e-05, + "loss": 0.3206, + "step": 22398 + }, + { + "epoch": 28.753530166880616, + "grad_norm": 
1.238069772720337, + "learning_rate": 2.3752246469833118e-05, + "loss": 0.3389, + "step": 22399 + }, + { + "epoch": 28.754813863928113, + "grad_norm": 1.383675456047058, + "learning_rate": 2.375181857081729e-05, + "loss": 0.3107, + "step": 22400 + }, + { + "epoch": 28.75609756097561, + "grad_norm": 7.591041088104248, + "learning_rate": 2.3751390671801455e-05, + "loss": 0.3472, + "step": 22401 + }, + { + "epoch": 28.757381258023106, + "grad_norm": 1.0895123481750488, + "learning_rate": 2.3750962772785623e-05, + "loss": 0.3079, + "step": 22402 + }, + { + "epoch": 28.758664955070603, + "grad_norm": 1.3776228427886963, + "learning_rate": 2.375053487376979e-05, + "loss": 0.3028, + "step": 22403 + }, + { + "epoch": 28.7599486521181, + "grad_norm": 2.021371603012085, + "learning_rate": 2.3750106974753957e-05, + "loss": 0.3596, + "step": 22404 + }, + { + "epoch": 28.761232349165596, + "grad_norm": 3.350107192993164, + "learning_rate": 2.374967907573813e-05, + "loss": 0.2824, + "step": 22405 + }, + { + "epoch": 28.762516046213094, + "grad_norm": 2.318948268890381, + "learning_rate": 2.3749251176722293e-05, + "loss": 0.2795, + "step": 22406 + }, + { + "epoch": 28.76379974326059, + "grad_norm": 3.2589261531829834, + "learning_rate": 2.3748823277706462e-05, + "loss": 0.3766, + "step": 22407 + }, + { + "epoch": 28.765083440308086, + "grad_norm": 1.1973087787628174, + "learning_rate": 2.374839537869063e-05, + "loss": 0.3369, + "step": 22408 + }, + { + "epoch": 28.766367137355584, + "grad_norm": 2.5022614002227783, + "learning_rate": 2.3747967479674795e-05, + "loss": 0.3894, + "step": 22409 + }, + { + "epoch": 28.76765083440308, + "grad_norm": 1.731655478477478, + "learning_rate": 2.3747539580658964e-05, + "loss": 0.356, + "step": 22410 + }, + { + "epoch": 28.76893453145058, + "grad_norm": 1.6697791814804077, + "learning_rate": 2.3747111681643132e-05, + "loss": 0.3653, + "step": 22411 + }, + { + "epoch": 28.770218228498074, + "grad_norm": 1.6236132383346558, + "learning_rate": 
2.37466837826273e-05, + "loss": 0.4976, + "step": 22412 + }, + { + "epoch": 28.77150192554557, + "grad_norm": 1.0630358457565308, + "learning_rate": 2.374625588361147e-05, + "loss": 0.2917, + "step": 22413 + }, + { + "epoch": 28.77278562259307, + "grad_norm": 0.825067400932312, + "learning_rate": 2.3745827984595637e-05, + "loss": 0.3292, + "step": 22414 + }, + { + "epoch": 28.774069319640564, + "grad_norm": 0.9103935360908508, + "learning_rate": 2.3745400085579802e-05, + "loss": 0.3121, + "step": 22415 + }, + { + "epoch": 28.77535301668806, + "grad_norm": 0.7161197066307068, + "learning_rate": 2.374497218656397e-05, + "loss": 0.2954, + "step": 22416 + }, + { + "epoch": 28.77663671373556, + "grad_norm": 0.9855989813804626, + "learning_rate": 2.374454428754814e-05, + "loss": 0.3045, + "step": 22417 + }, + { + "epoch": 28.777920410783054, + "grad_norm": 0.9802849292755127, + "learning_rate": 2.3744116388532304e-05, + "loss": 0.2868, + "step": 22418 + }, + { + "epoch": 28.77920410783055, + "grad_norm": 3.922443151473999, + "learning_rate": 2.3743688489516476e-05, + "loss": 0.2888, + "step": 22419 + }, + { + "epoch": 28.78048780487805, + "grad_norm": 2.45176100730896, + "learning_rate": 2.374326059050064e-05, + "loss": 0.2962, + "step": 22420 + }, + { + "epoch": 28.781771501925547, + "grad_norm": 1.4900747537612915, + "learning_rate": 2.3742832691484813e-05, + "loss": 0.3537, + "step": 22421 + }, + { + "epoch": 28.78305519897304, + "grad_norm": 1.8930779695510864, + "learning_rate": 2.3742404792468978e-05, + "loss": 0.3002, + "step": 22422 + }, + { + "epoch": 28.78433889602054, + "grad_norm": 0.8201322555541992, + "learning_rate": 2.3741976893453143e-05, + "loss": 0.2888, + "step": 22423 + }, + { + "epoch": 28.785622593068037, + "grad_norm": 1.289141058921814, + "learning_rate": 2.3741548994437315e-05, + "loss": 0.3234, + "step": 22424 + }, + { + "epoch": 28.78690629011553, + "grad_norm": 4.292053699493408, + "learning_rate": 2.374112109542148e-05, + "loss": 0.3553, + 
"step": 22425 + }, + { + "epoch": 28.78818998716303, + "grad_norm": 1.2538636922836304, + "learning_rate": 2.3740693196405648e-05, + "loss": 0.2907, + "step": 22426 + }, + { + "epoch": 28.789473684210527, + "grad_norm": 4.653135299682617, + "learning_rate": 2.3740265297389817e-05, + "loss": 0.2927, + "step": 22427 + }, + { + "epoch": 28.79075738125802, + "grad_norm": 0.8586333394050598, + "learning_rate": 2.3739837398373985e-05, + "loss": 0.3109, + "step": 22428 + }, + { + "epoch": 28.79204107830552, + "grad_norm": 1.0453754663467407, + "learning_rate": 2.3739409499358153e-05, + "loss": 0.3445, + "step": 22429 + }, + { + "epoch": 28.793324775353017, + "grad_norm": 0.9292091131210327, + "learning_rate": 2.373898160034232e-05, + "loss": 0.2765, + "step": 22430 + }, + { + "epoch": 28.794608472400512, + "grad_norm": 1.256537914276123, + "learning_rate": 2.3738553701326487e-05, + "loss": 0.323, + "step": 22431 + }, + { + "epoch": 28.79589216944801, + "grad_norm": 0.9502711892127991, + "learning_rate": 2.3738125802310655e-05, + "loss": 0.2951, + "step": 22432 + }, + { + "epoch": 28.797175866495508, + "grad_norm": 0.9147785305976868, + "learning_rate": 2.3737697903294824e-05, + "loss": 0.2959, + "step": 22433 + }, + { + "epoch": 28.798459563543005, + "grad_norm": 0.8717190027236938, + "learning_rate": 2.373727000427899e-05, + "loss": 0.2955, + "step": 22434 + }, + { + "epoch": 28.7997432605905, + "grad_norm": 1.3941832780838013, + "learning_rate": 2.373684210526316e-05, + "loss": 0.3031, + "step": 22435 + }, + { + "epoch": 28.801026957637998, + "grad_norm": 1.4922423362731934, + "learning_rate": 2.3736414206247326e-05, + "loss": 0.297, + "step": 22436 + }, + { + "epoch": 28.802310654685495, + "grad_norm": 1.1807913780212402, + "learning_rate": 2.3735986307231494e-05, + "loss": 0.2845, + "step": 22437 + }, + { + "epoch": 28.80359435173299, + "grad_norm": 1.4685529470443726, + "learning_rate": 2.3735558408215662e-05, + "loss": 0.337, + "step": 22438 + }, + { + "epoch": 
28.804878048780488, + "grad_norm": 0.9486554861068726, + "learning_rate": 2.3735130509199827e-05, + "loss": 0.2813, + "step": 22439 + }, + { + "epoch": 28.806161745827985, + "grad_norm": 1.0032589435577393, + "learning_rate": 2.3734702610184e-05, + "loss": 0.2859, + "step": 22440 + }, + { + "epoch": 28.80744544287548, + "grad_norm": 1.23661470413208, + "learning_rate": 2.3734274711168164e-05, + "loss": 0.289, + "step": 22441 + }, + { + "epoch": 28.808729139922978, + "grad_norm": 1.4988232851028442, + "learning_rate": 2.3733846812152333e-05, + "loss": 0.2765, + "step": 22442 + }, + { + "epoch": 28.810012836970476, + "grad_norm": 1.0506654977798462, + "learning_rate": 2.37334189131365e-05, + "loss": 0.3187, + "step": 22443 + }, + { + "epoch": 28.811296534017973, + "grad_norm": 1.5558315515518188, + "learning_rate": 2.3732991014120666e-05, + "loss": 0.293, + "step": 22444 + }, + { + "epoch": 28.812580231065468, + "grad_norm": 2.2439589500427246, + "learning_rate": 2.3732563115104838e-05, + "loss": 0.3438, + "step": 22445 + }, + { + "epoch": 28.813863928112966, + "grad_norm": 1.522552490234375, + "learning_rate": 2.3732135216089003e-05, + "loss": 0.3115, + "step": 22446 + }, + { + "epoch": 28.815147625160463, + "grad_norm": 1.252974271774292, + "learning_rate": 2.373170731707317e-05, + "loss": 0.309, + "step": 22447 + }, + { + "epoch": 28.816431322207958, + "grad_norm": 1.3538341522216797, + "learning_rate": 2.373127941805734e-05, + "loss": 0.3067, + "step": 22448 + }, + { + "epoch": 28.817715019255456, + "grad_norm": 1.155785083770752, + "learning_rate": 2.3730851519041508e-05, + "loss": 0.2836, + "step": 22449 + }, + { + "epoch": 28.818998716302954, + "grad_norm": 2.2276835441589355, + "learning_rate": 2.3730423620025673e-05, + "loss": 0.327, + "step": 22450 + }, + { + "epoch": 28.820282413350448, + "grad_norm": 1.4792449474334717, + "learning_rate": 2.372999572100984e-05, + "loss": 0.3099, + "step": 22451 + }, + { + "epoch": 28.821566110397946, + "grad_norm": 
1.468909740447998, + "learning_rate": 2.372956782199401e-05, + "loss": 0.2993, + "step": 22452 + }, + { + "epoch": 28.822849807445444, + "grad_norm": 3.784296989440918, + "learning_rate": 2.372913992297818e-05, + "loss": 0.3356, + "step": 22453 + }, + { + "epoch": 28.82413350449294, + "grad_norm": 1.6481993198394775, + "learning_rate": 2.3728712023962347e-05, + "loss": 0.3426, + "step": 22454 + }, + { + "epoch": 28.825417201540436, + "grad_norm": 1.8415783643722534, + "learning_rate": 2.3728284124946512e-05, + "loss": 0.341, + "step": 22455 + }, + { + "epoch": 28.826700898587934, + "grad_norm": 1.6710529327392578, + "learning_rate": 2.3727856225930684e-05, + "loss": 0.2996, + "step": 22456 + }, + { + "epoch": 28.82798459563543, + "grad_norm": 4.418908596038818, + "learning_rate": 2.372742832691485e-05, + "loss": 0.308, + "step": 22457 + }, + { + "epoch": 28.829268292682926, + "grad_norm": 1.51800537109375, + "learning_rate": 2.3727000427899014e-05, + "loss": 0.3541, + "step": 22458 + }, + { + "epoch": 28.830551989730424, + "grad_norm": 2.140943765640259, + "learning_rate": 2.3726572528883185e-05, + "loss": 0.3647, + "step": 22459 + }, + { + "epoch": 28.83183568677792, + "grad_norm": 1.7010012865066528, + "learning_rate": 2.372614462986735e-05, + "loss": 0.3763, + "step": 22460 + }, + { + "epoch": 28.833119383825416, + "grad_norm": 1.9874436855316162, + "learning_rate": 2.3725716730851522e-05, + "loss": 0.4129, + "step": 22461 + }, + { + "epoch": 28.834403080872914, + "grad_norm": 1.8645464181900024, + "learning_rate": 2.3725288831835687e-05, + "loss": 0.4666, + "step": 22462 + }, + { + "epoch": 28.83568677792041, + "grad_norm": 1.7285300493240356, + "learning_rate": 2.3724860932819856e-05, + "loss": 0.2857, + "step": 22463 + }, + { + "epoch": 28.836970474967906, + "grad_norm": 0.7717533707618713, + "learning_rate": 2.3724433033804024e-05, + "loss": 0.2922, + "step": 22464 + }, + { + "epoch": 28.838254172015404, + "grad_norm": 0.9007665514945984, + "learning_rate": 
2.372400513478819e-05, + "loss": 0.3033, + "step": 22465 + }, + { + "epoch": 28.8395378690629, + "grad_norm": 1.0590637922286987, + "learning_rate": 2.3723577235772358e-05, + "loss": 0.2859, + "step": 22466 + }, + { + "epoch": 28.8408215661104, + "grad_norm": 0.6890484690666199, + "learning_rate": 2.3723149336756526e-05, + "loss": 0.3058, + "step": 22467 + }, + { + "epoch": 28.842105263157894, + "grad_norm": 1.0946694612503052, + "learning_rate": 2.3722721437740694e-05, + "loss": 0.3093, + "step": 22468 + }, + { + "epoch": 28.84338896020539, + "grad_norm": 1.0488839149475098, + "learning_rate": 2.3722293538724863e-05, + "loss": 0.2846, + "step": 22469 + }, + { + "epoch": 28.84467265725289, + "grad_norm": 0.9601845741271973, + "learning_rate": 2.3721865639709028e-05, + "loss": 0.2894, + "step": 22470 + }, + { + "epoch": 28.845956354300384, + "grad_norm": 1.1259064674377441, + "learning_rate": 2.3721437740693196e-05, + "loss": 0.3218, + "step": 22471 + }, + { + "epoch": 28.84724005134788, + "grad_norm": 1.3910787105560303, + "learning_rate": 2.3721009841677365e-05, + "loss": 0.2981, + "step": 22472 + }, + { + "epoch": 28.84852374839538, + "grad_norm": 0.9329956769943237, + "learning_rate": 2.3720581942661533e-05, + "loss": 0.2675, + "step": 22473 + }, + { + "epoch": 28.849807445442874, + "grad_norm": 1.216722011566162, + "learning_rate": 2.3720154043645698e-05, + "loss": 0.3168, + "step": 22474 + }, + { + "epoch": 28.85109114249037, + "grad_norm": 1.2017563581466675, + "learning_rate": 2.371972614462987e-05, + "loss": 0.3066, + "step": 22475 + }, + { + "epoch": 28.85237483953787, + "grad_norm": 1.0482890605926514, + "learning_rate": 2.3719298245614035e-05, + "loss": 0.2846, + "step": 22476 + }, + { + "epoch": 28.853658536585368, + "grad_norm": 0.9365867376327515, + "learning_rate": 2.3718870346598203e-05, + "loss": 0.3199, + "step": 22477 + }, + { + "epoch": 28.854942233632862, + "grad_norm": 1.325595736503601, + "learning_rate": 2.3718442447582372e-05, + "loss": 
0.3252, + "step": 22478 + }, + { + "epoch": 28.85622593068036, + "grad_norm": 1.0982623100280762, + "learning_rate": 2.3718014548566537e-05, + "loss": 0.2731, + "step": 22479 + }, + { + "epoch": 28.857509627727858, + "grad_norm": 1.0098872184753418, + "learning_rate": 2.371758664955071e-05, + "loss": 0.2966, + "step": 22480 + }, + { + "epoch": 28.858793324775352, + "grad_norm": 1.6874719858169556, + "learning_rate": 2.3717158750534874e-05, + "loss": 0.308, + "step": 22481 + }, + { + "epoch": 28.86007702182285, + "grad_norm": 1.674340009689331, + "learning_rate": 2.3716730851519042e-05, + "loss": 0.2815, + "step": 22482 + }, + { + "epoch": 28.861360718870348, + "grad_norm": 1.3596419095993042, + "learning_rate": 2.371630295250321e-05, + "loss": 0.2967, + "step": 22483 + }, + { + "epoch": 28.862644415917842, + "grad_norm": 2.5812599658966064, + "learning_rate": 2.3715875053487375e-05, + "loss": 0.3192, + "step": 22484 + }, + { + "epoch": 28.86392811296534, + "grad_norm": 1.485813856124878, + "learning_rate": 2.3715447154471547e-05, + "loss": 0.2924, + "step": 22485 + }, + { + "epoch": 28.865211810012838, + "grad_norm": 1.1713759899139404, + "learning_rate": 2.3715019255455712e-05, + "loss": 0.2881, + "step": 22486 + }, + { + "epoch": 28.866495507060336, + "grad_norm": 0.8590601086616516, + "learning_rate": 2.371459135643988e-05, + "loss": 0.2937, + "step": 22487 + }, + { + "epoch": 28.86777920410783, + "grad_norm": 1.2281185388565063, + "learning_rate": 2.371416345742405e-05, + "loss": 0.2984, + "step": 22488 + }, + { + "epoch": 28.869062901155328, + "grad_norm": 2.2772276401519775, + "learning_rate": 2.3713735558408217e-05, + "loss": 0.2897, + "step": 22489 + }, + { + "epoch": 28.870346598202826, + "grad_norm": 1.0431004762649536, + "learning_rate": 2.3713307659392382e-05, + "loss": 0.3144, + "step": 22490 + }, + { + "epoch": 28.87163029525032, + "grad_norm": 1.3751797676086426, + "learning_rate": 2.371287976037655e-05, + "loss": 0.2951, + "step": 22491 + }, + { + 
"epoch": 28.872913992297818, + "grad_norm": 0.8353786468505859, + "learning_rate": 2.371245186136072e-05, + "loss": 0.2869, + "step": 22492 + }, + { + "epoch": 28.874197689345316, + "grad_norm": 0.9095918536186218, + "learning_rate": 2.3712023962344888e-05, + "loss": 0.3223, + "step": 22493 + }, + { + "epoch": 28.87548138639281, + "grad_norm": 0.8784189224243164, + "learning_rate": 2.3711596063329056e-05, + "loss": 0.3013, + "step": 22494 + }, + { + "epoch": 28.876765083440308, + "grad_norm": 11.408334732055664, + "learning_rate": 2.371116816431322e-05, + "loss": 0.3277, + "step": 22495 + }, + { + "epoch": 28.878048780487806, + "grad_norm": 2.0960216522216797, + "learning_rate": 2.3710740265297393e-05, + "loss": 0.3025, + "step": 22496 + }, + { + "epoch": 28.8793324775353, + "grad_norm": 0.9742480516433716, + "learning_rate": 2.3710312366281558e-05, + "loss": 0.3375, + "step": 22497 + }, + { + "epoch": 28.880616174582798, + "grad_norm": 0.7935982942581177, + "learning_rate": 2.3709884467265723e-05, + "loss": 0.3082, + "step": 22498 + }, + { + "epoch": 28.881899871630296, + "grad_norm": 1.7518529891967773, + "learning_rate": 2.3709456568249895e-05, + "loss": 0.2983, + "step": 22499 + }, + { + "epoch": 28.883183568677794, + "grad_norm": 1.0839051008224487, + "learning_rate": 2.370902866923406e-05, + "loss": 0.3273, + "step": 22500 + }, + { + "epoch": 28.884467265725288, + "grad_norm": 1.5440953969955444, + "learning_rate": 2.370860077021823e-05, + "loss": 0.3302, + "step": 22501 + }, + { + "epoch": 28.885750962772786, + "grad_norm": 2.0109078884124756, + "learning_rate": 2.3708172871202397e-05, + "loss": 0.2967, + "step": 22502 + }, + { + "epoch": 28.887034659820284, + "grad_norm": 1.7172839641571045, + "learning_rate": 2.3707744972186565e-05, + "loss": 0.3098, + "step": 22503 + }, + { + "epoch": 28.888318356867778, + "grad_norm": 1.8220970630645752, + "learning_rate": 2.3707317073170733e-05, + "loss": 0.3479, + "step": 22504 + }, + { + "epoch": 28.889602053915276, + 
"grad_norm": 1.6041462421417236, + "learning_rate": 2.37068891741549e-05, + "loss": 0.3129, + "step": 22505 + }, + { + "epoch": 28.890885750962774, + "grad_norm": 1.391182541847229, + "learning_rate": 2.3706461275139067e-05, + "loss": 0.3287, + "step": 22506 + }, + { + "epoch": 28.892169448010268, + "grad_norm": 2.980132579803467, + "learning_rate": 2.3706033376123235e-05, + "loss": 0.2991, + "step": 22507 + }, + { + "epoch": 28.893453145057766, + "grad_norm": 2.7620224952697754, + "learning_rate": 2.3705605477107404e-05, + "loss": 0.3121, + "step": 22508 + }, + { + "epoch": 28.894736842105264, + "grad_norm": 1.9149409532546997, + "learning_rate": 2.3705177578091572e-05, + "loss": 0.3541, + "step": 22509 + }, + { + "epoch": 28.89602053915276, + "grad_norm": 7.542996406555176, + "learning_rate": 2.370474967907574e-05, + "loss": 0.4323, + "step": 22510 + }, + { + "epoch": 28.897304236200256, + "grad_norm": 1.3325386047363281, + "learning_rate": 2.3704321780059906e-05, + "loss": 0.3876, + "step": 22511 + }, + { + "epoch": 28.898587933247754, + "grad_norm": 2.557023525238037, + "learning_rate": 2.3703893881044074e-05, + "loss": 0.4867, + "step": 22512 + }, + { + "epoch": 28.89987163029525, + "grad_norm": 0.8298783302307129, + "learning_rate": 2.3703465982028242e-05, + "loss": 0.2985, + "step": 22513 + }, + { + "epoch": 28.901155327342746, + "grad_norm": 0.9286243319511414, + "learning_rate": 2.3703038083012407e-05, + "loss": 0.3186, + "step": 22514 + }, + { + "epoch": 28.902439024390244, + "grad_norm": 1.2810313701629639, + "learning_rate": 2.370261018399658e-05, + "loss": 0.3038, + "step": 22515 + }, + { + "epoch": 28.90372272143774, + "grad_norm": 0.7145698070526123, + "learning_rate": 2.3702182284980744e-05, + "loss": 0.3024, + "step": 22516 + }, + { + "epoch": 28.905006418485236, + "grad_norm": 1.0219120979309082, + "learning_rate": 2.3701754385964916e-05, + "loss": 0.331, + "step": 22517 + }, + { + "epoch": 28.906290115532734, + "grad_norm": 0.8274914026260376, + 
"learning_rate": 2.370132648694908e-05, + "loss": 0.3185, + "step": 22518 + }, + { + "epoch": 28.90757381258023, + "grad_norm": 0.8504221439361572, + "learning_rate": 2.3700898587933246e-05, + "loss": 0.2957, + "step": 22519 + }, + { + "epoch": 28.90885750962773, + "grad_norm": 1.2997766733169556, + "learning_rate": 2.3700470688917418e-05, + "loss": 0.2922, + "step": 22520 + }, + { + "epoch": 28.910141206675224, + "grad_norm": 1.3515435457229614, + "learning_rate": 2.3700042789901583e-05, + "loss": 0.3039, + "step": 22521 + }, + { + "epoch": 28.911424903722722, + "grad_norm": 0.8916047811508179, + "learning_rate": 2.369961489088575e-05, + "loss": 0.3192, + "step": 22522 + }, + { + "epoch": 28.91270860077022, + "grad_norm": 0.9797864556312561, + "learning_rate": 2.369918699186992e-05, + "loss": 0.3173, + "step": 22523 + }, + { + "epoch": 28.913992297817714, + "grad_norm": 4.215210437774658, + "learning_rate": 2.3698759092854088e-05, + "loss": 0.3199, + "step": 22524 + }, + { + "epoch": 28.915275994865212, + "grad_norm": 1.1227295398712158, + "learning_rate": 2.3698331193838257e-05, + "loss": 0.2965, + "step": 22525 + }, + { + "epoch": 28.91655969191271, + "grad_norm": 6.413669586181641, + "learning_rate": 2.369790329482242e-05, + "loss": 0.2998, + "step": 22526 + }, + { + "epoch": 28.917843388960204, + "grad_norm": 1.0565673112869263, + "learning_rate": 2.369747539580659e-05, + "loss": 0.2731, + "step": 22527 + }, + { + "epoch": 28.919127086007702, + "grad_norm": 0.7722679972648621, + "learning_rate": 2.369704749679076e-05, + "loss": 0.2746, + "step": 22528 + }, + { + "epoch": 28.9204107830552, + "grad_norm": 1.5577783584594727, + "learning_rate": 2.3696619597774927e-05, + "loss": 0.3186, + "step": 22529 + }, + { + "epoch": 28.921694480102694, + "grad_norm": 1.5002872943878174, + "learning_rate": 2.3696191698759092e-05, + "loss": 0.298, + "step": 22530 + }, + { + "epoch": 28.922978177150192, + "grad_norm": 1.3738762140274048, + "learning_rate": 
2.369576379974326e-05, + "loss": 0.2938, + "step": 22531 + }, + { + "epoch": 28.92426187419769, + "grad_norm": 0.9305105209350586, + "learning_rate": 2.369533590072743e-05, + "loss": 0.2613, + "step": 22532 + }, + { + "epoch": 28.925545571245188, + "grad_norm": 1.3617801666259766, + "learning_rate": 2.3694908001711597e-05, + "loss": 0.3406, + "step": 22533 + }, + { + "epoch": 28.926829268292682, + "grad_norm": 0.9674723148345947, + "learning_rate": 2.3694480102695765e-05, + "loss": 0.301, + "step": 22534 + }, + { + "epoch": 28.92811296534018, + "grad_norm": 0.7979612946510315, + "learning_rate": 2.369405220367993e-05, + "loss": 0.3181, + "step": 22535 + }, + { + "epoch": 28.929396662387678, + "grad_norm": 1.4279751777648926, + "learning_rate": 2.3693624304664102e-05, + "loss": 0.3079, + "step": 22536 + }, + { + "epoch": 28.930680359435172, + "grad_norm": 1.27653968334198, + "learning_rate": 2.3693196405648267e-05, + "loss": 0.3003, + "step": 22537 + }, + { + "epoch": 28.93196405648267, + "grad_norm": 0.9240562319755554, + "learning_rate": 2.3692768506632432e-05, + "loss": 0.3233, + "step": 22538 + }, + { + "epoch": 28.933247753530168, + "grad_norm": 6.396643161773682, + "learning_rate": 2.3692340607616604e-05, + "loss": 0.3022, + "step": 22539 + }, + { + "epoch": 28.934531450577662, + "grad_norm": 1.9161336421966553, + "learning_rate": 2.369191270860077e-05, + "loss": 0.3085, + "step": 22540 + }, + { + "epoch": 28.93581514762516, + "grad_norm": 1.1948915719985962, + "learning_rate": 2.369148480958494e-05, + "loss": 0.3264, + "step": 22541 + }, + { + "epoch": 28.937098844672658, + "grad_norm": 1.2149553298950195, + "learning_rate": 2.3691056910569106e-05, + "loss": 0.3051, + "step": 22542 + }, + { + "epoch": 28.938382541720156, + "grad_norm": 1.0519615411758423, + "learning_rate": 2.3690629011553274e-05, + "loss": 0.2962, + "step": 22543 + }, + { + "epoch": 28.93966623876765, + "grad_norm": 1.3654793500900269, + "learning_rate": 2.3690201112537443e-05, + "loss": 
0.3181, + "step": 22544 + }, + { + "epoch": 28.940949935815148, + "grad_norm": 1.4769539833068848, + "learning_rate": 2.3689773213521608e-05, + "loss": 0.3245, + "step": 22545 + }, + { + "epoch": 28.942233632862646, + "grad_norm": 1.8268436193466187, + "learning_rate": 2.3689345314505776e-05, + "loss": 0.3465, + "step": 22546 + }, + { + "epoch": 28.94351732991014, + "grad_norm": 1.4571857452392578, + "learning_rate": 2.3688917415489945e-05, + "loss": 0.2951, + "step": 22547 + }, + { + "epoch": 28.944801026957638, + "grad_norm": 1.2931050062179565, + "learning_rate": 2.3688489516474113e-05, + "loss": 0.3108, + "step": 22548 + }, + { + "epoch": 28.946084724005136, + "grad_norm": 1.0788145065307617, + "learning_rate": 2.368806161745828e-05, + "loss": 0.3454, + "step": 22549 + }, + { + "epoch": 28.94736842105263, + "grad_norm": 4.516812801361084, + "learning_rate": 2.368763371844245e-05, + "loss": 0.3034, + "step": 22550 + }, + { + "epoch": 28.948652118100128, + "grad_norm": 2.56841778755188, + "learning_rate": 2.3687205819426615e-05, + "loss": 0.2677, + "step": 22551 + }, + { + "epoch": 28.949935815147626, + "grad_norm": 0.9818100333213806, + "learning_rate": 2.3686777920410783e-05, + "loss": 0.3177, + "step": 22552 + }, + { + "epoch": 28.951219512195124, + "grad_norm": 3.8905770778656006, + "learning_rate": 2.3686350021394952e-05, + "loss": 0.3773, + "step": 22553 + }, + { + "epoch": 28.952503209242618, + "grad_norm": 1.2962231636047363, + "learning_rate": 2.3685922122379117e-05, + "loss": 0.313, + "step": 22554 + }, + { + "epoch": 28.953786906290116, + "grad_norm": 7.239830017089844, + "learning_rate": 2.368549422336329e-05, + "loss": 0.3265, + "step": 22555 + }, + { + "epoch": 28.955070603337614, + "grad_norm": 3.625087261199951, + "learning_rate": 2.3685066324347454e-05, + "loss": 0.3358, + "step": 22556 + }, + { + "epoch": 28.956354300385108, + "grad_norm": 1.235275387763977, + "learning_rate": 2.3684638425331625e-05, + "loss": 0.3237, + "step": 22557 + }, + { + 
"epoch": 28.957637997432606, + "grad_norm": 1.9399970769882202, + "learning_rate": 2.368421052631579e-05, + "loss": 0.3541, + "step": 22558 + }, + { + "epoch": 28.958921694480104, + "grad_norm": 1.9840879440307617, + "learning_rate": 2.3683782627299955e-05, + "loss": 0.3651, + "step": 22559 + }, + { + "epoch": 28.960205391527598, + "grad_norm": 1.2248579263687134, + "learning_rate": 2.3683354728284127e-05, + "loss": 0.3508, + "step": 22560 + }, + { + "epoch": 28.961489088575096, + "grad_norm": 2.9877076148986816, + "learning_rate": 2.3682926829268292e-05, + "loss": 0.3757, + "step": 22561 + }, + { + "epoch": 28.962772785622594, + "grad_norm": 8.113962173461914, + "learning_rate": 2.368249893025246e-05, + "loss": 0.4828, + "step": 22562 + }, + { + "epoch": 28.964056482670088, + "grad_norm": 0.968981146812439, + "learning_rate": 2.368207103123663e-05, + "loss": 0.3078, + "step": 22563 + }, + { + "epoch": 28.965340179717586, + "grad_norm": 1.2192977666854858, + "learning_rate": 2.3681643132220798e-05, + "loss": 0.2939, + "step": 22564 + }, + { + "epoch": 28.966623876765084, + "grad_norm": 2.140226364135742, + "learning_rate": 2.3681215233204966e-05, + "loss": 0.306, + "step": 22565 + }, + { + "epoch": 28.96790757381258, + "grad_norm": 1.0896397829055786, + "learning_rate": 2.368078733418913e-05, + "loss": 0.3125, + "step": 22566 + }, + { + "epoch": 28.969191270860076, + "grad_norm": 0.711398720741272, + "learning_rate": 2.36803594351733e-05, + "loss": 0.3268, + "step": 22567 + }, + { + "epoch": 28.970474967907574, + "grad_norm": 0.8926868438720703, + "learning_rate": 2.3679931536157468e-05, + "loss": 0.2919, + "step": 22568 + }, + { + "epoch": 28.971758664955072, + "grad_norm": 0.9899234175682068, + "learning_rate": 2.3679503637141636e-05, + "loss": 0.3165, + "step": 22569 + }, + { + "epoch": 28.973042362002566, + "grad_norm": 0.7989745736122131, + "learning_rate": 2.36790757381258e-05, + "loss": 0.3102, + "step": 22570 + }, + { + "epoch": 28.974326059050064, + 
"grad_norm": 0.8461089730262756, + "learning_rate": 2.3678647839109973e-05, + "loss": 0.3068, + "step": 22571 + }, + { + "epoch": 28.975609756097562, + "grad_norm": 1.018578290939331, + "learning_rate": 2.3678219940094138e-05, + "loss": 0.2894, + "step": 22572 + }, + { + "epoch": 28.976893453145056, + "grad_norm": 1.4532890319824219, + "learning_rate": 2.3677792041078306e-05, + "loss": 0.3008, + "step": 22573 + }, + { + "epoch": 28.978177150192554, + "grad_norm": 0.7866982817649841, + "learning_rate": 2.3677364142062475e-05, + "loss": 0.2995, + "step": 22574 + }, + { + "epoch": 28.979460847240052, + "grad_norm": 1.5437932014465332, + "learning_rate": 2.367693624304664e-05, + "loss": 0.2954, + "step": 22575 + }, + { + "epoch": 28.98074454428755, + "grad_norm": 3.080054759979248, + "learning_rate": 2.367650834403081e-05, + "loss": 0.2966, + "step": 22576 + }, + { + "epoch": 28.982028241335044, + "grad_norm": 1.0700194835662842, + "learning_rate": 2.3676080445014977e-05, + "loss": 0.2824, + "step": 22577 + }, + { + "epoch": 28.983311938382542, + "grad_norm": 1.435107707977295, + "learning_rate": 2.3675652545999145e-05, + "loss": 0.2631, + "step": 22578 + }, + { + "epoch": 28.98459563543004, + "grad_norm": 1.222084879875183, + "learning_rate": 2.3675224646983314e-05, + "loss": 0.3394, + "step": 22579 + }, + { + "epoch": 28.985879332477534, + "grad_norm": 3.118412971496582, + "learning_rate": 2.367479674796748e-05, + "loss": 0.3057, + "step": 22580 + }, + { + "epoch": 28.987163029525032, + "grad_norm": 1.093835711479187, + "learning_rate": 2.367436884895165e-05, + "loss": 0.3187, + "step": 22581 + }, + { + "epoch": 28.98844672657253, + "grad_norm": 0.8575426340103149, + "learning_rate": 2.3673940949935815e-05, + "loss": 0.3377, + "step": 22582 + }, + { + "epoch": 28.989730423620024, + "grad_norm": 1.4527287483215332, + "learning_rate": 2.3673513050919984e-05, + "loss": 0.2912, + "step": 22583 + }, + { + "epoch": 28.991014120667522, + "grad_norm": 2.07446551322937, + 
"learning_rate": 2.3673085151904152e-05, + "loss": 0.3123, + "step": 22584 + }, + { + "epoch": 28.99229781771502, + "grad_norm": 1.2675129175186157, + "learning_rate": 2.367265725288832e-05, + "loss": 0.3377, + "step": 22585 + }, + { + "epoch": 28.993581514762518, + "grad_norm": 1.8993926048278809, + "learning_rate": 2.3672229353872486e-05, + "loss": 0.3334, + "step": 22586 + }, + { + "epoch": 28.994865211810012, + "grad_norm": 1.5579133033752441, + "learning_rate": 2.3671801454856654e-05, + "loss": 0.3277, + "step": 22587 + }, + { + "epoch": 28.99614890885751, + "grad_norm": 1.7254148721694946, + "learning_rate": 2.3671373555840822e-05, + "loss": 0.3207, + "step": 22588 + }, + { + "epoch": 28.997432605905008, + "grad_norm": 1.6527085304260254, + "learning_rate": 2.367094565682499e-05, + "loss": 0.3749, + "step": 22589 + }, + { + "epoch": 28.998716302952502, + "grad_norm": 1.7455133199691772, + "learning_rate": 2.367051775780916e-05, + "loss": 0.3983, + "step": 22590 + }, + { + "epoch": 29.0, + "grad_norm": 2.043522357940674, + "learning_rate": 2.3670089858793324e-05, + "loss": 0.5732, + "step": 22591 + }, + { + "epoch": 29.001283697047498, + "grad_norm": 1.257083535194397, + "learning_rate": 2.3669661959777493e-05, + "loss": 0.2706, + "step": 22592 + }, + { + "epoch": 29.002567394094992, + "grad_norm": 1.8157808780670166, + "learning_rate": 2.366923406076166e-05, + "loss": 0.2863, + "step": 22593 + }, + { + "epoch": 29.00385109114249, + "grad_norm": 0.7590872049331665, + "learning_rate": 2.3668806161745826e-05, + "loss": 0.2772, + "step": 22594 + }, + { + "epoch": 29.005134788189988, + "grad_norm": 1.6239266395568848, + "learning_rate": 2.3668378262729998e-05, + "loss": 0.3086, + "step": 22595 + }, + { + "epoch": 29.006418485237482, + "grad_norm": 0.9211810231208801, + "learning_rate": 2.3667950363714163e-05, + "loss": 0.3231, + "step": 22596 + }, + { + "epoch": 29.00770218228498, + "grad_norm": 0.8112150430679321, + "learning_rate": 2.366752246469833e-05, + 
"loss": 0.3102, + "step": 22597 + }, + { + "epoch": 29.008985879332478, + "grad_norm": 1.347033143043518, + "learning_rate": 2.36670945656825e-05, + "loss": 0.2955, + "step": 22598 + }, + { + "epoch": 29.010269576379976, + "grad_norm": 2.659727096557617, + "learning_rate": 2.3666666666666665e-05, + "loss": 0.2769, + "step": 22599 + }, + { + "epoch": 29.01155327342747, + "grad_norm": 0.8582963347434998, + "learning_rate": 2.3666238767650837e-05, + "loss": 0.2698, + "step": 22600 + }, + { + "epoch": 29.012836970474968, + "grad_norm": 0.94265216588974, + "learning_rate": 2.3665810868635e-05, + "loss": 0.2866, + "step": 22601 + }, + { + "epoch": 29.014120667522466, + "grad_norm": 0.9783278703689575, + "learning_rate": 2.366538296961917e-05, + "loss": 0.2813, + "step": 22602 + }, + { + "epoch": 29.01540436456996, + "grad_norm": 1.1049987077713013, + "learning_rate": 2.366495507060334e-05, + "loss": 0.2968, + "step": 22603 + }, + { + "epoch": 29.016688061617458, + "grad_norm": 1.0779467821121216, + "learning_rate": 2.3664527171587507e-05, + "loss": 0.3135, + "step": 22604 + }, + { + "epoch": 29.017971758664956, + "grad_norm": 0.8036814332008362, + "learning_rate": 2.3664099272571672e-05, + "loss": 0.3106, + "step": 22605 + }, + { + "epoch": 29.01925545571245, + "grad_norm": 0.7839330434799194, + "learning_rate": 2.366367137355584e-05, + "loss": 0.2934, + "step": 22606 + }, + { + "epoch": 29.020539152759948, + "grad_norm": 1.0295276641845703, + "learning_rate": 2.366324347454001e-05, + "loss": 0.3009, + "step": 22607 + }, + { + "epoch": 29.021822849807446, + "grad_norm": 0.9949512481689453, + "learning_rate": 2.3662815575524177e-05, + "loss": 0.278, + "step": 22608 + }, + { + "epoch": 29.023106546854944, + "grad_norm": 0.8356366753578186, + "learning_rate": 2.3662387676508346e-05, + "loss": 0.2664, + "step": 22609 + }, + { + "epoch": 29.024390243902438, + "grad_norm": 2.6212377548217773, + "learning_rate": 2.366195977749251e-05, + "loss": 0.2837, + "step": 22610 + }, + { 
+ "epoch": 29.025673940949936, + "grad_norm": 1.8308814764022827, + "learning_rate": 2.3661531878476682e-05, + "loss": 0.2739, + "step": 22611 + }, + { + "epoch": 29.026957637997434, + "grad_norm": 0.9741290211677551, + "learning_rate": 2.3661103979460847e-05, + "loss": 0.304, + "step": 22612 + }, + { + "epoch": 29.028241335044928, + "grad_norm": 1.182667851448059, + "learning_rate": 2.3660676080445012e-05, + "loss": 0.3054, + "step": 22613 + }, + { + "epoch": 29.029525032092426, + "grad_norm": 0.9559177756309509, + "learning_rate": 2.3660248181429184e-05, + "loss": 0.2685, + "step": 22614 + }, + { + "epoch": 29.030808729139924, + "grad_norm": 1.8850033283233643, + "learning_rate": 2.365982028241335e-05, + "loss": 0.2799, + "step": 22615 + }, + { + "epoch": 29.03209242618742, + "grad_norm": 1.0238115787506104, + "learning_rate": 2.365939238339752e-05, + "loss": 0.2567, + "step": 22616 + }, + { + "epoch": 29.033376123234916, + "grad_norm": 0.8624258637428284, + "learning_rate": 2.3658964484381686e-05, + "loss": 0.2736, + "step": 22617 + }, + { + "epoch": 29.034659820282414, + "grad_norm": 1.0084433555603027, + "learning_rate": 2.3658536585365854e-05, + "loss": 0.2605, + "step": 22618 + }, + { + "epoch": 29.035943517329912, + "grad_norm": 3.5898149013519287, + "learning_rate": 2.3658108686350023e-05, + "loss": 0.2835, + "step": 22619 + }, + { + "epoch": 29.037227214377406, + "grad_norm": 1.564121961593628, + "learning_rate": 2.3657680787334188e-05, + "loss": 0.2757, + "step": 22620 + }, + { + "epoch": 29.038510911424904, + "grad_norm": 1.8751773834228516, + "learning_rate": 2.3657252888318356e-05, + "loss": 0.29, + "step": 22621 + }, + { + "epoch": 29.039794608472402, + "grad_norm": 1.413305401802063, + "learning_rate": 2.3656824989302525e-05, + "loss": 0.2955, + "step": 22622 + }, + { + "epoch": 29.041078305519896, + "grad_norm": 1.0170778036117554, + "learning_rate": 2.3656397090286693e-05, + "loss": 0.2736, + "step": 22623 + }, + { + "epoch": 29.042362002567394, + 
"grad_norm": 1.1292946338653564, + "learning_rate": 2.365596919127086e-05, + "loss": 0.2905, + "step": 22624 + }, + { + "epoch": 29.043645699614892, + "grad_norm": 1.2066844701766968, + "learning_rate": 2.365554129225503e-05, + "loss": 0.3076, + "step": 22625 + }, + { + "epoch": 29.044929396662386, + "grad_norm": 1.5555615425109863, + "learning_rate": 2.3655113393239195e-05, + "loss": 0.2654, + "step": 22626 + }, + { + "epoch": 29.046213093709884, + "grad_norm": 1.2775846719741821, + "learning_rate": 2.3654685494223363e-05, + "loss": 0.2813, + "step": 22627 + }, + { + "epoch": 29.047496790757382, + "grad_norm": 1.1728627681732178, + "learning_rate": 2.3654257595207532e-05, + "loss": 0.3347, + "step": 22628 + }, + { + "epoch": 29.048780487804876, + "grad_norm": 1.4056998491287231, + "learning_rate": 2.3653829696191697e-05, + "loss": 0.29, + "step": 22629 + }, + { + "epoch": 29.050064184852374, + "grad_norm": 2.3502707481384277, + "learning_rate": 2.365340179717587e-05, + "loss": 0.2989, + "step": 22630 + }, + { + "epoch": 29.051347881899872, + "grad_norm": 1.2113457918167114, + "learning_rate": 2.3652973898160034e-05, + "loss": 0.2975, + "step": 22631 + }, + { + "epoch": 29.05263157894737, + "grad_norm": 1.1419187784194946, + "learning_rate": 2.3652545999144205e-05, + "loss": 0.3143, + "step": 22632 + }, + { + "epoch": 29.053915275994864, + "grad_norm": 12.2249174118042, + "learning_rate": 2.365211810012837e-05, + "loss": 0.3466, + "step": 22633 + }, + { + "epoch": 29.055198973042362, + "grad_norm": 1.6785509586334229, + "learning_rate": 2.3651690201112536e-05, + "loss": 0.2968, + "step": 22634 + }, + { + "epoch": 29.05648267008986, + "grad_norm": 1.3544297218322754, + "learning_rate": 2.3651262302096707e-05, + "loss": 0.3145, + "step": 22635 + }, + { + "epoch": 29.057766367137354, + "grad_norm": 1.6875001192092896, + "learning_rate": 2.3650834403080872e-05, + "loss": 0.3388, + "step": 22636 + }, + { + "epoch": 29.059050064184852, + "grad_norm": 2.0651400089263916, 
+ "learning_rate": 2.365040650406504e-05, + "loss": 0.3394, + "step": 22637 + }, + { + "epoch": 29.06033376123235, + "grad_norm": 1.3338027000427246, + "learning_rate": 2.364997860504921e-05, + "loss": 0.3243, + "step": 22638 + }, + { + "epoch": 29.061617458279844, + "grad_norm": 1.185336947441101, + "learning_rate": 2.3649550706033378e-05, + "loss": 0.3217, + "step": 22639 + }, + { + "epoch": 29.062901155327342, + "grad_norm": 1.6363613605499268, + "learning_rate": 2.3649122807017546e-05, + "loss": 0.3547, + "step": 22640 + }, + { + "epoch": 29.06418485237484, + "grad_norm": 1.9370115995407104, + "learning_rate": 2.364869490800171e-05, + "loss": 0.4691, + "step": 22641 + }, + { + "epoch": 29.065468549422338, + "grad_norm": 1.9914634227752686, + "learning_rate": 2.364826700898588e-05, + "loss": 0.2894, + "step": 22642 + }, + { + "epoch": 29.066752246469832, + "grad_norm": 1.1415815353393555, + "learning_rate": 2.3647839109970048e-05, + "loss": 0.326, + "step": 22643 + }, + { + "epoch": 29.06803594351733, + "grad_norm": 1.3101872205734253, + "learning_rate": 2.3647411210954216e-05, + "loss": 0.2795, + "step": 22644 + }, + { + "epoch": 29.069319640564828, + "grad_norm": 0.9119415283203125, + "learning_rate": 2.364698331193838e-05, + "loss": 0.2994, + "step": 22645 + }, + { + "epoch": 29.070603337612322, + "grad_norm": 0.9204085469245911, + "learning_rate": 2.3646555412922553e-05, + "loss": 0.3148, + "step": 22646 + }, + { + "epoch": 29.07188703465982, + "grad_norm": 1.7436981201171875, + "learning_rate": 2.3646127513906718e-05, + "loss": 0.2822, + "step": 22647 + }, + { + "epoch": 29.073170731707318, + "grad_norm": 0.9053673148155212, + "learning_rate": 2.3645699614890887e-05, + "loss": 0.288, + "step": 22648 + }, + { + "epoch": 29.074454428754812, + "grad_norm": 1.2195988893508911, + "learning_rate": 2.3645271715875055e-05, + "loss": 0.3209, + "step": 22649 + }, + { + "epoch": 29.07573812580231, + "grad_norm": 1.3159109354019165, + "learning_rate": 
2.364484381685922e-05, + "loss": 0.3087, + "step": 22650 + }, + { + "epoch": 29.077021822849808, + "grad_norm": 0.8222793340682983, + "learning_rate": 2.3644415917843392e-05, + "loss": 0.2712, + "step": 22651 + }, + { + "epoch": 29.078305519897306, + "grad_norm": 1.233634114265442, + "learning_rate": 2.3643988018827557e-05, + "loss": 0.2701, + "step": 22652 + }, + { + "epoch": 29.0795892169448, + "grad_norm": 1.0923166275024414, + "learning_rate": 2.3643560119811725e-05, + "loss": 0.3052, + "step": 22653 + }, + { + "epoch": 29.080872913992298, + "grad_norm": 0.8136759996414185, + "learning_rate": 2.3643132220795894e-05, + "loss": 0.2801, + "step": 22654 + }, + { + "epoch": 29.082156611039796, + "grad_norm": 2.741124153137207, + "learning_rate": 2.364270432178006e-05, + "loss": 0.3034, + "step": 22655 + }, + { + "epoch": 29.08344030808729, + "grad_norm": 1.2924968004226685, + "learning_rate": 2.364227642276423e-05, + "loss": 0.2948, + "step": 22656 + }, + { + "epoch": 29.084724005134788, + "grad_norm": 0.9652037024497986, + "learning_rate": 2.3641848523748395e-05, + "loss": 0.2891, + "step": 22657 + }, + { + "epoch": 29.086007702182286, + "grad_norm": 1.4121348857879639, + "learning_rate": 2.3641420624732564e-05, + "loss": 0.2919, + "step": 22658 + }, + { + "epoch": 29.08729139922978, + "grad_norm": 4.2878828048706055, + "learning_rate": 2.3640992725716732e-05, + "loss": 0.2639, + "step": 22659 + }, + { + "epoch": 29.088575096277278, + "grad_norm": 0.9767290353775024, + "learning_rate": 2.3640564826700897e-05, + "loss": 0.2697, + "step": 22660 + }, + { + "epoch": 29.089858793324776, + "grad_norm": 1.8802562952041626, + "learning_rate": 2.3640136927685066e-05, + "loss": 0.2847, + "step": 22661 + }, + { + "epoch": 29.09114249037227, + "grad_norm": 1.12565016746521, + "learning_rate": 2.3639709028669234e-05, + "loss": 0.2883, + "step": 22662 + }, + { + "epoch": 29.09242618741977, + "grad_norm": 1.8897771835327148, + "learning_rate": 2.3639281129653403e-05, + "loss": 
0.2852, + "step": 22663 + }, + { + "epoch": 29.093709884467266, + "grad_norm": 0.8160878419876099, + "learning_rate": 2.363885323063757e-05, + "loss": 0.2788, + "step": 22664 + }, + { + "epoch": 29.094993581514764, + "grad_norm": 3.259650707244873, + "learning_rate": 2.363842533162174e-05, + "loss": 0.2844, + "step": 22665 + }, + { + "epoch": 29.09627727856226, + "grad_norm": 1.0328476428985596, + "learning_rate": 2.3637997432605904e-05, + "loss": 0.2802, + "step": 22666 + }, + { + "epoch": 29.097560975609756, + "grad_norm": 3.7233667373657227, + "learning_rate": 2.3637569533590073e-05, + "loss": 0.2906, + "step": 22667 + }, + { + "epoch": 29.098844672657254, + "grad_norm": 1.6228963136672974, + "learning_rate": 2.363714163457424e-05, + "loss": 0.2872, + "step": 22668 + }, + { + "epoch": 29.10012836970475, + "grad_norm": 0.9091278314590454, + "learning_rate": 2.3636713735558406e-05, + "loss": 0.2569, + "step": 22669 + }, + { + "epoch": 29.101412066752246, + "grad_norm": 0.9617139101028442, + "learning_rate": 2.3636285836542578e-05, + "loss": 0.2876, + "step": 22670 + }, + { + "epoch": 29.102695763799744, + "grad_norm": 4.602855682373047, + "learning_rate": 2.3635857937526743e-05, + "loss": 0.3031, + "step": 22671 + }, + { + "epoch": 29.10397946084724, + "grad_norm": 1.1029322147369385, + "learning_rate": 2.3635430038510915e-05, + "loss": 0.2858, + "step": 22672 + }, + { + "epoch": 29.105263157894736, + "grad_norm": 2.1118664741516113, + "learning_rate": 2.363500213949508e-05, + "loss": 0.2851, + "step": 22673 + }, + { + "epoch": 29.106546854942234, + "grad_norm": 1.2781577110290527, + "learning_rate": 2.3634574240479245e-05, + "loss": 0.2626, + "step": 22674 + }, + { + "epoch": 29.107830551989732, + "grad_norm": 1.6954104900360107, + "learning_rate": 2.3634146341463417e-05, + "loss": 0.2828, + "step": 22675 + }, + { + "epoch": 29.109114249037226, + "grad_norm": 1.1069892644882202, + "learning_rate": 2.3633718442447582e-05, + "loss": 0.318, + "step": 22676 + }, + { 
+ "epoch": 29.110397946084724, + "grad_norm": 1.4752689599990845, + "learning_rate": 2.363329054343175e-05, + "loss": 0.2776, + "step": 22677 + }, + { + "epoch": 29.111681643132222, + "grad_norm": 1.1734113693237305, + "learning_rate": 2.363286264441592e-05, + "loss": 0.2911, + "step": 22678 + }, + { + "epoch": 29.112965340179716, + "grad_norm": 1.328092098236084, + "learning_rate": 2.3632434745400087e-05, + "loss": 0.3041, + "step": 22679 + }, + { + "epoch": 29.114249037227214, + "grad_norm": 1.3122217655181885, + "learning_rate": 2.3632006846384255e-05, + "loss": 0.2594, + "step": 22680 + }, + { + "epoch": 29.115532734274712, + "grad_norm": 1.6460716724395752, + "learning_rate": 2.363157894736842e-05, + "loss": 0.247, + "step": 22681 + }, + { + "epoch": 29.116816431322206, + "grad_norm": 1.1675440073013306, + "learning_rate": 2.363115104835259e-05, + "loss": 0.2939, + "step": 22682 + }, + { + "epoch": 29.118100128369704, + "grad_norm": 1.6964210271835327, + "learning_rate": 2.3630723149336757e-05, + "loss": 0.3121, + "step": 22683 + }, + { + "epoch": 29.119383825417202, + "grad_norm": 1.1382769346237183, + "learning_rate": 2.3630295250320926e-05, + "loss": 0.281, + "step": 22684 + }, + { + "epoch": 29.1206675224647, + "grad_norm": 3.505030632019043, + "learning_rate": 2.362986735130509e-05, + "loss": 0.3189, + "step": 22685 + }, + { + "epoch": 29.121951219512194, + "grad_norm": 2.14595890045166, + "learning_rate": 2.3629439452289262e-05, + "loss": 0.3468, + "step": 22686 + }, + { + "epoch": 29.123234916559692, + "grad_norm": 2.4946846961975098, + "learning_rate": 2.3629011553273427e-05, + "loss": 0.3202, + "step": 22687 + }, + { + "epoch": 29.12451861360719, + "grad_norm": 2.2061593532562256, + "learning_rate": 2.3628583654257596e-05, + "loss": 0.3356, + "step": 22688 + }, + { + "epoch": 29.125802310654684, + "grad_norm": 2.1713345050811768, + "learning_rate": 2.3628155755241764e-05, + "loss": 0.3453, + "step": 22689 + }, + { + "epoch": 29.127086007702182, + 
"grad_norm": 2.692843198776245, + "learning_rate": 2.362772785622593e-05, + "loss": 0.4148, + "step": 22690 + }, + { + "epoch": 29.12836970474968, + "grad_norm": 2.290494918823242, + "learning_rate": 2.36272999572101e-05, + "loss": 0.4524, + "step": 22691 + }, + { + "epoch": 29.129653401797174, + "grad_norm": 1.4294224977493286, + "learning_rate": 2.3626872058194266e-05, + "loss": 0.3197, + "step": 22692 + }, + { + "epoch": 29.130937098844672, + "grad_norm": 0.9294313192367554, + "learning_rate": 2.3626444159178435e-05, + "loss": 0.2908, + "step": 22693 + }, + { + "epoch": 29.13222079589217, + "grad_norm": 3.667747974395752, + "learning_rate": 2.3626016260162603e-05, + "loss": 0.2922, + "step": 22694 + }, + { + "epoch": 29.133504492939664, + "grad_norm": 1.7761284112930298, + "learning_rate": 2.3625588361146768e-05, + "loss": 0.3182, + "step": 22695 + }, + { + "epoch": 29.134788189987162, + "grad_norm": 1.6947847604751587, + "learning_rate": 2.362516046213094e-05, + "loss": 0.3102, + "step": 22696 + }, + { + "epoch": 29.13607188703466, + "grad_norm": 1.0900347232818604, + "learning_rate": 2.3624732563115105e-05, + "loss": 0.2992, + "step": 22697 + }, + { + "epoch": 29.137355584082158, + "grad_norm": 1.8561842441558838, + "learning_rate": 2.3624304664099273e-05, + "loss": 0.2794, + "step": 22698 + }, + { + "epoch": 29.138639281129652, + "grad_norm": 0.9423516988754272, + "learning_rate": 2.362387676508344e-05, + "loss": 0.302, + "step": 22699 + }, + { + "epoch": 29.13992297817715, + "grad_norm": 1.086221694946289, + "learning_rate": 2.362344886606761e-05, + "loss": 0.3134, + "step": 22700 + }, + { + "epoch": 29.141206675224648, + "grad_norm": 1.244053840637207, + "learning_rate": 2.3623020967051775e-05, + "loss": 0.324, + "step": 22701 + }, + { + "epoch": 29.142490372272142, + "grad_norm": 0.7863152623176575, + "learning_rate": 2.3622593068035943e-05, + "loss": 0.3034, + "step": 22702 + }, + { + "epoch": 29.14377406931964, + "grad_norm": 1.706860065460205, + 
"learning_rate": 2.3622165169020112e-05, + "loss": 0.3052, + "step": 22703 + }, + { + "epoch": 29.145057766367138, + "grad_norm": 1.1315478086471558, + "learning_rate": 2.362173727000428e-05, + "loss": 0.2699, + "step": 22704 + }, + { + "epoch": 29.146341463414632, + "grad_norm": 1.3878124952316284, + "learning_rate": 2.362130937098845e-05, + "loss": 0.3086, + "step": 22705 + }, + { + "epoch": 29.14762516046213, + "grad_norm": 1.0701824426651, + "learning_rate": 2.3620881471972614e-05, + "loss": 0.273, + "step": 22706 + }, + { + "epoch": 29.14890885750963, + "grad_norm": 1.12495756149292, + "learning_rate": 2.3620453572956786e-05, + "loss": 0.3044, + "step": 22707 + }, + { + "epoch": 29.150192554557126, + "grad_norm": 1.7782782316207886, + "learning_rate": 2.362002567394095e-05, + "loss": 0.2908, + "step": 22708 + }, + { + "epoch": 29.15147625160462, + "grad_norm": 2.0759592056274414, + "learning_rate": 2.3619597774925116e-05, + "loss": 0.3051, + "step": 22709 + }, + { + "epoch": 29.15275994865212, + "grad_norm": 2.098693609237671, + "learning_rate": 2.3619169875909287e-05, + "loss": 0.3007, + "step": 22710 + }, + { + "epoch": 29.154043645699616, + "grad_norm": 1.2518670558929443, + "learning_rate": 2.3618741976893452e-05, + "loss": 0.3001, + "step": 22711 + }, + { + "epoch": 29.15532734274711, + "grad_norm": 2.0873119831085205, + "learning_rate": 2.3618314077877624e-05, + "loss": 0.2845, + "step": 22712 + }, + { + "epoch": 29.15661103979461, + "grad_norm": 1.3597657680511475, + "learning_rate": 2.361788617886179e-05, + "loss": 0.3152, + "step": 22713 + }, + { + "epoch": 29.157894736842106, + "grad_norm": 1.3060715198516846, + "learning_rate": 2.3617458279845958e-05, + "loss": 0.2935, + "step": 22714 + }, + { + "epoch": 29.1591784338896, + "grad_norm": 0.8778021335601807, + "learning_rate": 2.3617030380830126e-05, + "loss": 0.2814, + "step": 22715 + }, + { + "epoch": 29.1604621309371, + "grad_norm": 1.2274465560913086, + "learning_rate": 2.361660248181429e-05, + 
"loss": 0.2946, + "step": 22716 + }, + { + "epoch": 29.161745827984596, + "grad_norm": 0.9575883746147156, + "learning_rate": 2.361617458279846e-05, + "loss": 0.3287, + "step": 22717 + }, + { + "epoch": 29.163029525032094, + "grad_norm": 1.5569591522216797, + "learning_rate": 2.3615746683782628e-05, + "loss": 0.2735, + "step": 22718 + }, + { + "epoch": 29.16431322207959, + "grad_norm": 0.9754843711853027, + "learning_rate": 2.3615318784766796e-05, + "loss": 0.2764, + "step": 22719 + }, + { + "epoch": 29.165596919127086, + "grad_norm": 1.7663557529449463, + "learning_rate": 2.3614890885750965e-05, + "loss": 0.262, + "step": 22720 + }, + { + "epoch": 29.166880616174584, + "grad_norm": 1.571447730064392, + "learning_rate": 2.361446298673513e-05, + "loss": 0.2597, + "step": 22721 + }, + { + "epoch": 29.16816431322208, + "grad_norm": 0.8945791125297546, + "learning_rate": 2.3614035087719298e-05, + "loss": 0.3029, + "step": 22722 + }, + { + "epoch": 29.169448010269576, + "grad_norm": 1.2733315229415894, + "learning_rate": 2.3613607188703467e-05, + "loss": 0.2968, + "step": 22723 + }, + { + "epoch": 29.170731707317074, + "grad_norm": 1.0786776542663574, + "learning_rate": 2.3613179289687635e-05, + "loss": 0.2768, + "step": 22724 + }, + { + "epoch": 29.17201540436457, + "grad_norm": 0.9399215579032898, + "learning_rate": 2.36127513906718e-05, + "loss": 0.2808, + "step": 22725 + }, + { + "epoch": 29.173299101412066, + "grad_norm": 1.5232893228530884, + "learning_rate": 2.3612323491655972e-05, + "loss": 0.2897, + "step": 22726 + }, + { + "epoch": 29.174582798459564, + "grad_norm": 0.9597460627555847, + "learning_rate": 2.3611895592640137e-05, + "loss": 0.3173, + "step": 22727 + }, + { + "epoch": 29.17586649550706, + "grad_norm": 2.3642022609710693, + "learning_rate": 2.3611467693624305e-05, + "loss": 0.2971, + "step": 22728 + }, + { + "epoch": 29.177150192554556, + "grad_norm": 2.7144436836242676, + "learning_rate": 2.3611039794608474e-05, + "loss": 0.3246, + "step": 22729 + 
}, + { + "epoch": 29.178433889602054, + "grad_norm": 1.1389597654342651, + "learning_rate": 2.361061189559264e-05, + "loss": 0.2876, + "step": 22730 + }, + { + "epoch": 29.179717586649552, + "grad_norm": 1.695380449295044, + "learning_rate": 2.361018399657681e-05, + "loss": 0.3371, + "step": 22731 + }, + { + "epoch": 29.181001283697046, + "grad_norm": 1.7366513013839722, + "learning_rate": 2.3609756097560975e-05, + "loss": 0.2747, + "step": 22732 + }, + { + "epoch": 29.182284980744544, + "grad_norm": 1.3189525604248047, + "learning_rate": 2.3609328198545144e-05, + "loss": 0.3065, + "step": 22733 + }, + { + "epoch": 29.183568677792042, + "grad_norm": 7.612799167633057, + "learning_rate": 2.3608900299529312e-05, + "loss": 0.3296, + "step": 22734 + }, + { + "epoch": 29.184852374839537, + "grad_norm": 4.056232929229736, + "learning_rate": 2.3608472400513477e-05, + "loss": 0.3183, + "step": 22735 + }, + { + "epoch": 29.186136071887034, + "grad_norm": 2.3733928203582764, + "learning_rate": 2.360804450149765e-05, + "loss": 0.3223, + "step": 22736 + }, + { + "epoch": 29.187419768934532, + "grad_norm": 2.7527360916137695, + "learning_rate": 2.3607616602481814e-05, + "loss": 0.3033, + "step": 22737 + }, + { + "epoch": 29.188703465982027, + "grad_norm": 1.62944495677948, + "learning_rate": 2.3607188703465983e-05, + "loss": 0.2931, + "step": 22738 + }, + { + "epoch": 29.189987163029524, + "grad_norm": 2.292322874069214, + "learning_rate": 2.360676080445015e-05, + "loss": 0.3621, + "step": 22739 + }, + { + "epoch": 29.191270860077022, + "grad_norm": 1.502423882484436, + "learning_rate": 2.360633290543432e-05, + "loss": 0.354, + "step": 22740 + }, + { + "epoch": 29.19255455712452, + "grad_norm": 1.9145833253860474, + "learning_rate": 2.3605905006418484e-05, + "loss": 0.4341, + "step": 22741 + }, + { + "epoch": 29.193838254172015, + "grad_norm": 0.8887741565704346, + "learning_rate": 2.3605477107402653e-05, + "loss": 0.2746, + "step": 22742 + }, + { + "epoch": 29.195121951219512, 
+ "grad_norm": 0.831527829170227, + "learning_rate": 2.360504920838682e-05, + "loss": 0.2746, + "step": 22743 + }, + { + "epoch": 29.19640564826701, + "grad_norm": 1.2803798913955688, + "learning_rate": 2.360462130937099e-05, + "loss": 0.2727, + "step": 22744 + }, + { + "epoch": 29.197689345314505, + "grad_norm": 1.0199189186096191, + "learning_rate": 2.3604193410355158e-05, + "loss": 0.2977, + "step": 22745 + }, + { + "epoch": 29.198973042362002, + "grad_norm": 0.9281680583953857, + "learning_rate": 2.3603765511339323e-05, + "loss": 0.3014, + "step": 22746 + }, + { + "epoch": 29.2002567394095, + "grad_norm": 0.8392531275749207, + "learning_rate": 2.3603337612323495e-05, + "loss": 0.2573, + "step": 22747 + }, + { + "epoch": 29.201540436456995, + "grad_norm": 0.9474509954452515, + "learning_rate": 2.360290971330766e-05, + "loss": 0.2834, + "step": 22748 + }, + { + "epoch": 29.202824133504492, + "grad_norm": 0.8865588307380676, + "learning_rate": 2.3602481814291825e-05, + "loss": 0.3031, + "step": 22749 + }, + { + "epoch": 29.20410783055199, + "grad_norm": 1.0912131071090698, + "learning_rate": 2.3602053915275997e-05, + "loss": 0.294, + "step": 22750 + }, + { + "epoch": 29.205391527599488, + "grad_norm": 2.4406325817108154, + "learning_rate": 2.3601626016260162e-05, + "loss": 0.2818, + "step": 22751 + }, + { + "epoch": 29.206675224646983, + "grad_norm": 1.5111764669418335, + "learning_rate": 2.3601198117244334e-05, + "loss": 0.3142, + "step": 22752 + }, + { + "epoch": 29.20795892169448, + "grad_norm": 0.8598983287811279, + "learning_rate": 2.36007702182285e-05, + "loss": 0.3034, + "step": 22753 + }, + { + "epoch": 29.20924261874198, + "grad_norm": 0.8674595952033997, + "learning_rate": 2.3600342319212667e-05, + "loss": 0.2581, + "step": 22754 + }, + { + "epoch": 29.210526315789473, + "grad_norm": 1.4740873575210571, + "learning_rate": 2.3599914420196835e-05, + "loss": 0.2885, + "step": 22755 + }, + { + "epoch": 29.21181001283697, + "grad_norm": 1.40244722366333, + 
"learning_rate": 2.3599486521181e-05, + "loss": 0.2794, + "step": 22756 + }, + { + "epoch": 29.21309370988447, + "grad_norm": 1.2119799852371216, + "learning_rate": 2.359905862216517e-05, + "loss": 0.3, + "step": 22757 + }, + { + "epoch": 29.214377406931963, + "grad_norm": 1.1758453845977783, + "learning_rate": 2.3598630723149337e-05, + "loss": 0.2876, + "step": 22758 + }, + { + "epoch": 29.21566110397946, + "grad_norm": 2.757626533508301, + "learning_rate": 2.3598202824133506e-05, + "loss": 0.2739, + "step": 22759 + }, + { + "epoch": 29.21694480102696, + "grad_norm": 1.3481202125549316, + "learning_rate": 2.3597774925117674e-05, + "loss": 0.2769, + "step": 22760 + }, + { + "epoch": 29.218228498074453, + "grad_norm": 0.9932312965393066, + "learning_rate": 2.3597347026101842e-05, + "loss": 0.3115, + "step": 22761 + }, + { + "epoch": 29.21951219512195, + "grad_norm": 0.9194758534431458, + "learning_rate": 2.3596919127086008e-05, + "loss": 0.2741, + "step": 22762 + }, + { + "epoch": 29.22079589216945, + "grad_norm": 1.6017111539840698, + "learning_rate": 2.3596491228070176e-05, + "loss": 0.296, + "step": 22763 + }, + { + "epoch": 29.222079589216946, + "grad_norm": 1.1951714754104614, + "learning_rate": 2.3596063329054344e-05, + "loss": 0.2839, + "step": 22764 + }, + { + "epoch": 29.22336328626444, + "grad_norm": 0.936883807182312, + "learning_rate": 2.359563543003851e-05, + "loss": 0.3093, + "step": 22765 + }, + { + "epoch": 29.22464698331194, + "grad_norm": 0.9420318603515625, + "learning_rate": 2.359520753102268e-05, + "loss": 0.2729, + "step": 22766 + }, + { + "epoch": 29.225930680359436, + "grad_norm": 1.895964503288269, + "learning_rate": 2.3594779632006846e-05, + "loss": 0.2919, + "step": 22767 + }, + { + "epoch": 29.22721437740693, + "grad_norm": 2.473931074142456, + "learning_rate": 2.3594351732991018e-05, + "loss": 0.2892, + "step": 22768 + }, + { + "epoch": 29.22849807445443, + "grad_norm": 1.0871610641479492, + "learning_rate": 2.3593923833975183e-05, + 
"loss": 0.2784, + "step": 22769 + }, + { + "epoch": 29.229781771501926, + "grad_norm": 1.6418216228485107, + "learning_rate": 2.3593495934959348e-05, + "loss": 0.3236, + "step": 22770 + }, + { + "epoch": 29.23106546854942, + "grad_norm": 1.6316471099853516, + "learning_rate": 2.359306803594352e-05, + "loss": 0.276, + "step": 22771 + }, + { + "epoch": 29.23234916559692, + "grad_norm": 0.9800453782081604, + "learning_rate": 2.3592640136927685e-05, + "loss": 0.2599, + "step": 22772 + }, + { + "epoch": 29.233632862644416, + "grad_norm": 1.3886895179748535, + "learning_rate": 2.3592212237911853e-05, + "loss": 0.2934, + "step": 22773 + }, + { + "epoch": 29.234916559691914, + "grad_norm": 1.4831839799880981, + "learning_rate": 2.359178433889602e-05, + "loss": 0.3034, + "step": 22774 + }, + { + "epoch": 29.23620025673941, + "grad_norm": 2.2974517345428467, + "learning_rate": 2.359135643988019e-05, + "loss": 0.3374, + "step": 22775 + }, + { + "epoch": 29.237483953786906, + "grad_norm": 1.027395248413086, + "learning_rate": 2.359092854086436e-05, + "loss": 0.2935, + "step": 22776 + }, + { + "epoch": 29.238767650834404, + "grad_norm": 1.6526962518692017, + "learning_rate": 2.3590500641848524e-05, + "loss": 0.3083, + "step": 22777 + }, + { + "epoch": 29.2400513478819, + "grad_norm": 1.2503613233566284, + "learning_rate": 2.3590072742832692e-05, + "loss": 0.3071, + "step": 22778 + }, + { + "epoch": 29.241335044929397, + "grad_norm": 1.198198914527893, + "learning_rate": 2.358964484381686e-05, + "loss": 0.2819, + "step": 22779 + }, + { + "epoch": 29.242618741976894, + "grad_norm": 1.5382989645004272, + "learning_rate": 2.358921694480103e-05, + "loss": 0.2929, + "step": 22780 + }, + { + "epoch": 29.24390243902439, + "grad_norm": 0.8793976306915283, + "learning_rate": 2.3588789045785194e-05, + "loss": 0.2616, + "step": 22781 + }, + { + "epoch": 29.245186136071887, + "grad_norm": 5.195614814758301, + "learning_rate": 2.3588361146769362e-05, + "loss": 0.3001, + "step": 22782 + }, + 
{ + "epoch": 29.246469833119384, + "grad_norm": 1.4638344049453735, + "learning_rate": 2.358793324775353e-05, + "loss": 0.3642, + "step": 22783 + }, + { + "epoch": 29.247753530166882, + "grad_norm": 3.093827486038208, + "learning_rate": 2.35875053487377e-05, + "loss": 0.3141, + "step": 22784 + }, + { + "epoch": 29.249037227214377, + "grad_norm": 1.8394978046417236, + "learning_rate": 2.3587077449721867e-05, + "loss": 0.2782, + "step": 22785 + }, + { + "epoch": 29.250320924261874, + "grad_norm": 2.638087272644043, + "learning_rate": 2.3586649550706032e-05, + "loss": 0.3, + "step": 22786 + }, + { + "epoch": 29.251604621309372, + "grad_norm": 4.531355857849121, + "learning_rate": 2.3586221651690204e-05, + "loss": 0.2794, + "step": 22787 + }, + { + "epoch": 29.252888318356867, + "grad_norm": 2.473978042602539, + "learning_rate": 2.358579375267437e-05, + "loss": 0.3493, + "step": 22788 + }, + { + "epoch": 29.254172015404365, + "grad_norm": 2.216092348098755, + "learning_rate": 2.3585365853658534e-05, + "loss": 0.3538, + "step": 22789 + }, + { + "epoch": 29.255455712451862, + "grad_norm": 1.7566983699798584, + "learning_rate": 2.3584937954642706e-05, + "loss": 0.3354, + "step": 22790 + }, + { + "epoch": 29.256739409499357, + "grad_norm": 4.4881768226623535, + "learning_rate": 2.358451005562687e-05, + "loss": 0.4682, + "step": 22791 + }, + { + "epoch": 29.258023106546855, + "grad_norm": 0.9677572846412659, + "learning_rate": 2.3584082156611043e-05, + "loss": 0.2873, + "step": 22792 + }, + { + "epoch": 29.259306803594352, + "grad_norm": 1.4901329278945923, + "learning_rate": 2.3583654257595208e-05, + "loss": 0.2961, + "step": 22793 + }, + { + "epoch": 29.260590500641847, + "grad_norm": 1.1558527946472168, + "learning_rate": 2.3583226358579376e-05, + "loss": 0.3001, + "step": 22794 + }, + { + "epoch": 29.261874197689345, + "grad_norm": 0.8652340769767761, + "learning_rate": 2.3582798459563545e-05, + "loss": 0.2949, + "step": 22795 + }, + { + "epoch": 29.263157894736842, + 
"grad_norm": 1.9272122383117676, + "learning_rate": 2.358237056054771e-05, + "loss": 0.2744, + "step": 22796 + }, + { + "epoch": 29.26444159178434, + "grad_norm": 0.7481434941291809, + "learning_rate": 2.3581942661531878e-05, + "loss": 0.2813, + "step": 22797 + }, + { + "epoch": 29.265725288831835, + "grad_norm": 0.9716150164604187, + "learning_rate": 2.3581514762516047e-05, + "loss": 0.2988, + "step": 22798 + }, + { + "epoch": 29.267008985879333, + "grad_norm": 1.1165400743484497, + "learning_rate": 2.3581086863500215e-05, + "loss": 0.3007, + "step": 22799 + }, + { + "epoch": 29.26829268292683, + "grad_norm": 1.340114951133728, + "learning_rate": 2.358065896448438e-05, + "loss": 0.2732, + "step": 22800 + }, + { + "epoch": 29.269576379974325, + "grad_norm": 0.8135131001472473, + "learning_rate": 2.3580231065468552e-05, + "loss": 0.3205, + "step": 22801 + }, + { + "epoch": 29.270860077021823, + "grad_norm": 1.2186647653579712, + "learning_rate": 2.3579803166452717e-05, + "loss": 0.2879, + "step": 22802 + }, + { + "epoch": 29.27214377406932, + "grad_norm": 0.9036030769348145, + "learning_rate": 2.3579375267436885e-05, + "loss": 0.2813, + "step": 22803 + }, + { + "epoch": 29.273427471116815, + "grad_norm": 1.2694262266159058, + "learning_rate": 2.3578947368421054e-05, + "loss": 0.2944, + "step": 22804 + }, + { + "epoch": 29.274711168164313, + "grad_norm": 1.552024245262146, + "learning_rate": 2.357851946940522e-05, + "loss": 0.2835, + "step": 22805 + }, + { + "epoch": 29.27599486521181, + "grad_norm": 0.9609408974647522, + "learning_rate": 2.357809157038939e-05, + "loss": 0.2929, + "step": 22806 + }, + { + "epoch": 29.27727856225931, + "grad_norm": 0.9229080677032471, + "learning_rate": 2.3577663671373556e-05, + "loss": 0.2798, + "step": 22807 + }, + { + "epoch": 29.278562259306803, + "grad_norm": 0.9030579924583435, + "learning_rate": 2.3577235772357724e-05, + "loss": 0.2784, + "step": 22808 + }, + { + "epoch": 29.2798459563543, + "grad_norm": 1.400951862335205, + 
"learning_rate": 2.3576807873341892e-05, + "loss": 0.2843, + "step": 22809 + }, + { + "epoch": 29.2811296534018, + "grad_norm": 1.3085460662841797, + "learning_rate": 2.3576379974326057e-05, + "loss": 0.2745, + "step": 22810 + }, + { + "epoch": 29.282413350449293, + "grad_norm": 1.2813105583190918, + "learning_rate": 2.357595207531023e-05, + "loss": 0.2878, + "step": 22811 + }, + { + "epoch": 29.28369704749679, + "grad_norm": 1.7216358184814453, + "learning_rate": 2.3575524176294394e-05, + "loss": 0.2927, + "step": 22812 + }, + { + "epoch": 29.28498074454429, + "grad_norm": 2.4176061153411865, + "learning_rate": 2.3575096277278563e-05, + "loss": 0.2765, + "step": 22813 + }, + { + "epoch": 29.286264441591783, + "grad_norm": 1.1525615453720093, + "learning_rate": 2.357466837826273e-05, + "loss": 0.2676, + "step": 22814 + }, + { + "epoch": 29.28754813863928, + "grad_norm": 1.1305291652679443, + "learning_rate": 2.35742404792469e-05, + "loss": 0.2601, + "step": 22815 + }, + { + "epoch": 29.28883183568678, + "grad_norm": 0.7632454037666321, + "learning_rate": 2.3573812580231064e-05, + "loss": 0.2783, + "step": 22816 + }, + { + "epoch": 29.290115532734276, + "grad_norm": 1.1304446458816528, + "learning_rate": 2.3573384681215233e-05, + "loss": 0.3249, + "step": 22817 + }, + { + "epoch": 29.29139922978177, + "grad_norm": 1.0207254886627197, + "learning_rate": 2.35729567821994e-05, + "loss": 0.2813, + "step": 22818 + }, + { + "epoch": 29.29268292682927, + "grad_norm": 3.966038703918457, + "learning_rate": 2.357252888318357e-05, + "loss": 0.2829, + "step": 22819 + }, + { + "epoch": 29.293966623876766, + "grad_norm": 1.1560972929000854, + "learning_rate": 2.3572100984167738e-05, + "loss": 0.2951, + "step": 22820 + }, + { + "epoch": 29.29525032092426, + "grad_norm": 1.0810890197753906, + "learning_rate": 2.3571673085151903e-05, + "loss": 0.282, + "step": 22821 + }, + { + "epoch": 29.29653401797176, + "grad_norm": 3.668856382369995, + "learning_rate": 2.3571245186136075e-05, + 
"loss": 0.2908, + "step": 22822 + }, + { + "epoch": 29.297817715019256, + "grad_norm": 1.3213425874710083, + "learning_rate": 2.357081728712024e-05, + "loss": 0.3107, + "step": 22823 + }, + { + "epoch": 29.29910141206675, + "grad_norm": 3.1768229007720947, + "learning_rate": 2.3570389388104405e-05, + "loss": 0.2852, + "step": 22824 + }, + { + "epoch": 29.30038510911425, + "grad_norm": 1.3345767259597778, + "learning_rate": 2.3569961489088577e-05, + "loss": 0.3176, + "step": 22825 + }, + { + "epoch": 29.301668806161747, + "grad_norm": 1.225582480430603, + "learning_rate": 2.3569533590072742e-05, + "loss": 0.2868, + "step": 22826 + }, + { + "epoch": 29.30295250320924, + "grad_norm": 3.07700777053833, + "learning_rate": 2.3569105691056914e-05, + "loss": 0.3108, + "step": 22827 + }, + { + "epoch": 29.30423620025674, + "grad_norm": 8.900459289550781, + "learning_rate": 2.356867779204108e-05, + "loss": 0.3151, + "step": 22828 + }, + { + "epoch": 29.305519897304237, + "grad_norm": 1.3900103569030762, + "learning_rate": 2.3568249893025247e-05, + "loss": 0.3147, + "step": 22829 + }, + { + "epoch": 29.306803594351734, + "grad_norm": 1.5650314092636108, + "learning_rate": 2.3567821994009415e-05, + "loss": 0.2814, + "step": 22830 + }, + { + "epoch": 29.30808729139923, + "grad_norm": 1.2303838729858398, + "learning_rate": 2.356739409499358e-05, + "loss": 0.2795, + "step": 22831 + }, + { + "epoch": 29.309370988446727, + "grad_norm": 1.1544866561889648, + "learning_rate": 2.356696619597775e-05, + "loss": 0.3003, + "step": 22832 + }, + { + "epoch": 29.310654685494224, + "grad_norm": 3.130871534347534, + "learning_rate": 2.3566538296961917e-05, + "loss": 0.3492, + "step": 22833 + }, + { + "epoch": 29.31193838254172, + "grad_norm": 1.4699487686157227, + "learning_rate": 2.3566110397946086e-05, + "loss": 0.2587, + "step": 22834 + }, + { + "epoch": 29.313222079589217, + "grad_norm": 3.033812999725342, + "learning_rate": 2.3565682498930254e-05, + "loss": 0.2599, + "step": 22835 + }, + 
{ + "epoch": 29.314505776636715, + "grad_norm": 1.413286566734314, + "learning_rate": 2.3565254599914423e-05, + "loss": 0.3304, + "step": 22836 + }, + { + "epoch": 29.31578947368421, + "grad_norm": 1.7799044847488403, + "learning_rate": 2.3564826700898588e-05, + "loss": 0.3619, + "step": 22837 + }, + { + "epoch": 29.317073170731707, + "grad_norm": 1.407713770866394, + "learning_rate": 2.3564398801882756e-05, + "loss": 0.3561, + "step": 22838 + }, + { + "epoch": 29.318356867779205, + "grad_norm": 1.3800522089004517, + "learning_rate": 2.3563970902866924e-05, + "loss": 0.3636, + "step": 22839 + }, + { + "epoch": 29.319640564826702, + "grad_norm": 2.3924784660339355, + "learning_rate": 2.356354300385109e-05, + "loss": 0.3942, + "step": 22840 + }, + { + "epoch": 29.320924261874197, + "grad_norm": 4.652464866638184, + "learning_rate": 2.356311510483526e-05, + "loss": 0.454, + "step": 22841 + }, + { + "epoch": 29.322207958921695, + "grad_norm": 0.6924782991409302, + "learning_rate": 2.3562687205819426e-05, + "loss": 0.2856, + "step": 22842 + }, + { + "epoch": 29.323491655969192, + "grad_norm": 0.8470931649208069, + "learning_rate": 2.3562259306803595e-05, + "loss": 0.2882, + "step": 22843 + }, + { + "epoch": 29.324775353016687, + "grad_norm": 0.714046061038971, + "learning_rate": 2.3561831407787763e-05, + "loss": 0.2875, + "step": 22844 + }, + { + "epoch": 29.326059050064185, + "grad_norm": 2.031283140182495, + "learning_rate": 2.3561403508771928e-05, + "loss": 0.2889, + "step": 22845 + }, + { + "epoch": 29.327342747111683, + "grad_norm": 0.9805073738098145, + "learning_rate": 2.35609756097561e-05, + "loss": 0.3007, + "step": 22846 + }, + { + "epoch": 29.328626444159177, + "grad_norm": 0.782789945602417, + "learning_rate": 2.3560547710740265e-05, + "loss": 0.2697, + "step": 22847 + }, + { + "epoch": 29.329910141206675, + "grad_norm": 1.6304361820220947, + "learning_rate": 2.3560119811724433e-05, + "loss": 0.2691, + "step": 22848 + }, + { + "epoch": 29.331193838254173, + 
"grad_norm": 0.8105974793434143, + "learning_rate": 2.3559691912708602e-05, + "loss": 0.2755, + "step": 22849 + }, + { + "epoch": 29.33247753530167, + "grad_norm": 1.6324506998062134, + "learning_rate": 2.3559264013692767e-05, + "loss": 0.3039, + "step": 22850 + }, + { + "epoch": 29.333761232349165, + "grad_norm": 2.390537977218628, + "learning_rate": 2.355883611467694e-05, + "loss": 0.3044, + "step": 22851 + }, + { + "epoch": 29.335044929396663, + "grad_norm": 1.088338017463684, + "learning_rate": 2.3558408215661104e-05, + "loss": 0.2763, + "step": 22852 + }, + { + "epoch": 29.33632862644416, + "grad_norm": 1.0748980045318604, + "learning_rate": 2.3557980316645272e-05, + "loss": 0.2965, + "step": 22853 + }, + { + "epoch": 29.337612323491655, + "grad_norm": 0.9690587520599365, + "learning_rate": 2.355755241762944e-05, + "loss": 0.2776, + "step": 22854 + }, + { + "epoch": 29.338896020539153, + "grad_norm": 0.9101088047027588, + "learning_rate": 2.355712451861361e-05, + "loss": 0.2841, + "step": 22855 + }, + { + "epoch": 29.34017971758665, + "grad_norm": 0.8698464035987854, + "learning_rate": 2.3556696619597774e-05, + "loss": 0.2814, + "step": 22856 + }, + { + "epoch": 29.341463414634145, + "grad_norm": 4.917407512664795, + "learning_rate": 2.3556268720581942e-05, + "loss": 0.2925, + "step": 22857 + }, + { + "epoch": 29.342747111681643, + "grad_norm": 0.9740419983863831, + "learning_rate": 2.355584082156611e-05, + "loss": 0.3005, + "step": 22858 + }, + { + "epoch": 29.34403080872914, + "grad_norm": 1.480771780014038, + "learning_rate": 2.355541292255028e-05, + "loss": 0.2913, + "step": 22859 + }, + { + "epoch": 29.345314505776635, + "grad_norm": 1.2615861892700195, + "learning_rate": 2.3554985023534447e-05, + "loss": 0.3024, + "step": 22860 + }, + { + "epoch": 29.346598202824133, + "grad_norm": 0.8964222073554993, + "learning_rate": 2.3554557124518613e-05, + "loss": 0.2782, + "step": 22861 + }, + { + "epoch": 29.34788189987163, + "grad_norm": 3.150984287261963, + 
"learning_rate": 2.3554129225502784e-05, + "loss": 0.2838, + "step": 22862 + }, + { + "epoch": 29.34916559691913, + "grad_norm": 0.8864907026290894, + "learning_rate": 2.355370132648695e-05, + "loss": 0.3104, + "step": 22863 + }, + { + "epoch": 29.350449293966623, + "grad_norm": 1.081555724143982, + "learning_rate": 2.3553273427471114e-05, + "loss": 0.2902, + "step": 22864 + }, + { + "epoch": 29.35173299101412, + "grad_norm": 1.001307725906372, + "learning_rate": 2.3552845528455286e-05, + "loss": 0.2574, + "step": 22865 + }, + { + "epoch": 29.35301668806162, + "grad_norm": 0.9801980257034302, + "learning_rate": 2.355241762943945e-05, + "loss": 0.2869, + "step": 22866 + }, + { + "epoch": 29.354300385109113, + "grad_norm": 1.0729820728302002, + "learning_rate": 2.3551989730423623e-05, + "loss": 0.3078, + "step": 22867 + }, + { + "epoch": 29.35558408215661, + "grad_norm": 1.4160715341567993, + "learning_rate": 2.3551561831407788e-05, + "loss": 0.2712, + "step": 22868 + }, + { + "epoch": 29.35686777920411, + "grad_norm": 2.293370485305786, + "learning_rate": 2.3551133932391956e-05, + "loss": 0.2788, + "step": 22869 + }, + { + "epoch": 29.358151476251603, + "grad_norm": 6.692941665649414, + "learning_rate": 2.3550706033376125e-05, + "loss": 0.2676, + "step": 22870 + }, + { + "epoch": 29.3594351732991, + "grad_norm": 1.3768975734710693, + "learning_rate": 2.355027813436029e-05, + "loss": 0.2934, + "step": 22871 + }, + { + "epoch": 29.3607188703466, + "grad_norm": 1.2429475784301758, + "learning_rate": 2.3549850235344458e-05, + "loss": 0.2846, + "step": 22872 + }, + { + "epoch": 29.362002567394097, + "grad_norm": 1.2213996648788452, + "learning_rate": 2.3549422336328627e-05, + "loss": 0.2934, + "step": 22873 + }, + { + "epoch": 29.36328626444159, + "grad_norm": 1.3143153190612793, + "learning_rate": 2.3548994437312795e-05, + "loss": 0.3116, + "step": 22874 + }, + { + "epoch": 29.36456996148909, + "grad_norm": 2.198885440826416, + "learning_rate": 2.3548566538296964e-05, + 
"loss": 0.3037, + "step": 22875 + }, + { + "epoch": 29.365853658536587, + "grad_norm": 2.036620616912842, + "learning_rate": 2.3548138639281132e-05, + "loss": 0.2904, + "step": 22876 + }, + { + "epoch": 29.36713735558408, + "grad_norm": 2.2309658527374268, + "learning_rate": 2.3547710740265297e-05, + "loss": 0.2564, + "step": 22877 + }, + { + "epoch": 29.36842105263158, + "grad_norm": 1.7376595735549927, + "learning_rate": 2.3547282841249465e-05, + "loss": 0.2806, + "step": 22878 + }, + { + "epoch": 29.369704749679077, + "grad_norm": 1.5947507619857788, + "learning_rate": 2.3546854942233634e-05, + "loss": 0.3068, + "step": 22879 + }, + { + "epoch": 29.37098844672657, + "grad_norm": 1.3914453983306885, + "learning_rate": 2.35464270432178e-05, + "loss": 0.3081, + "step": 22880 + }, + { + "epoch": 29.37227214377407, + "grad_norm": 1.9116407632827759, + "learning_rate": 2.354599914420197e-05, + "loss": 0.2825, + "step": 22881 + }, + { + "epoch": 29.373555840821567, + "grad_norm": 1.1450024843215942, + "learning_rate": 2.3545571245186136e-05, + "loss": 0.2909, + "step": 22882 + }, + { + "epoch": 29.374839537869065, + "grad_norm": 1.1889114379882812, + "learning_rate": 2.3545143346170307e-05, + "loss": 0.2985, + "step": 22883 + }, + { + "epoch": 29.37612323491656, + "grad_norm": 2.8912181854248047, + "learning_rate": 2.3544715447154472e-05, + "loss": 0.2957, + "step": 22884 + }, + { + "epoch": 29.377406931964057, + "grad_norm": 4.127355098724365, + "learning_rate": 2.3544287548138637e-05, + "loss": 0.2955, + "step": 22885 + }, + { + "epoch": 29.378690629011555, + "grad_norm": 4.558243751525879, + "learning_rate": 2.354385964912281e-05, + "loss": 0.3582, + "step": 22886 + }, + { + "epoch": 29.37997432605905, + "grad_norm": 1.0972084999084473, + "learning_rate": 2.3543431750106974e-05, + "loss": 0.3415, + "step": 22887 + }, + { + "epoch": 29.381258023106547, + "grad_norm": 3.181570291519165, + "learning_rate": 2.3543003851091143e-05, + "loss": 0.2915, + "step": 22888 + }, 
+ { + "epoch": 29.382541720154045, + "grad_norm": 1.5169209241867065, + "learning_rate": 2.354257595207531e-05, + "loss": 0.3569, + "step": 22889 + }, + { + "epoch": 29.38382541720154, + "grad_norm": 1.4149423837661743, + "learning_rate": 2.354214805305948e-05, + "loss": 0.3511, + "step": 22890 + }, + { + "epoch": 29.385109114249037, + "grad_norm": 6.702234745025635, + "learning_rate": 2.3541720154043648e-05, + "loss": 0.4273, + "step": 22891 + }, + { + "epoch": 29.386392811296535, + "grad_norm": 1.0551198720932007, + "learning_rate": 2.3541292255027813e-05, + "loss": 0.3073, + "step": 22892 + }, + { + "epoch": 29.387676508344033, + "grad_norm": 0.7092496752738953, + "learning_rate": 2.354086435601198e-05, + "loss": 0.2753, + "step": 22893 + }, + { + "epoch": 29.388960205391527, + "grad_norm": 0.7968276739120483, + "learning_rate": 2.354043645699615e-05, + "loss": 0.2937, + "step": 22894 + }, + { + "epoch": 29.390243902439025, + "grad_norm": 1.2176884412765503, + "learning_rate": 2.3540008557980318e-05, + "loss": 0.3103, + "step": 22895 + }, + { + "epoch": 29.391527599486523, + "grad_norm": 1.5438798666000366, + "learning_rate": 2.3539580658964483e-05, + "loss": 0.2777, + "step": 22896 + }, + { + "epoch": 29.392811296534017, + "grad_norm": 0.8670893311500549, + "learning_rate": 2.3539152759948655e-05, + "loss": 0.2828, + "step": 22897 + }, + { + "epoch": 29.394094993581515, + "grad_norm": 0.9837680459022522, + "learning_rate": 2.353872486093282e-05, + "loss": 0.2971, + "step": 22898 + }, + { + "epoch": 29.395378690629013, + "grad_norm": 1.4746116399765015, + "learning_rate": 2.353829696191699e-05, + "loss": 0.2707, + "step": 22899 + }, + { + "epoch": 29.396662387676507, + "grad_norm": 1.1022906303405762, + "learning_rate": 2.3537869062901157e-05, + "loss": 0.289, + "step": 22900 + }, + { + "epoch": 29.397946084724005, + "grad_norm": 1.4683855772018433, + "learning_rate": 2.3537441163885322e-05, + "loss": 0.3198, + "step": 22901 + }, + { + "epoch": 
29.399229781771503, + "grad_norm": 0.962337851524353, + "learning_rate": 2.3537013264869494e-05, + "loss": 0.2727, + "step": 22902 + }, + { + "epoch": 29.400513478818997, + "grad_norm": 1.1781870126724243, + "learning_rate": 2.353658536585366e-05, + "loss": 0.2746, + "step": 22903 + }, + { + "epoch": 29.401797175866495, + "grad_norm": 1.03955078125, + "learning_rate": 2.3536157466837827e-05, + "loss": 0.3367, + "step": 22904 + }, + { + "epoch": 29.403080872913993, + "grad_norm": 0.8468447327613831, + "learning_rate": 2.3535729567821996e-05, + "loss": 0.2909, + "step": 22905 + }, + { + "epoch": 29.40436456996149, + "grad_norm": 1.2257802486419678, + "learning_rate": 2.353530166880616e-05, + "loss": 0.2922, + "step": 22906 + }, + { + "epoch": 29.405648267008985, + "grad_norm": 2.0377604961395264, + "learning_rate": 2.3534873769790332e-05, + "loss": 0.3013, + "step": 22907 + }, + { + "epoch": 29.406931964056483, + "grad_norm": 2.2684545516967773, + "learning_rate": 2.3534445870774497e-05, + "loss": 0.2596, + "step": 22908 + }, + { + "epoch": 29.40821566110398, + "grad_norm": 1.504866123199463, + "learning_rate": 2.3534017971758666e-05, + "loss": 0.3038, + "step": 22909 + }, + { + "epoch": 29.409499358151475, + "grad_norm": 1.2823374271392822, + "learning_rate": 2.3533590072742834e-05, + "loss": 0.2898, + "step": 22910 + }, + { + "epoch": 29.410783055198973, + "grad_norm": 1.0037649869918823, + "learning_rate": 2.3533162173727e-05, + "loss": 0.2698, + "step": 22911 + }, + { + "epoch": 29.41206675224647, + "grad_norm": 1.103081226348877, + "learning_rate": 2.3532734274711168e-05, + "loss": 0.3081, + "step": 22912 + }, + { + "epoch": 29.413350449293965, + "grad_norm": 0.9165655970573425, + "learning_rate": 2.3532306375695336e-05, + "loss": 0.2656, + "step": 22913 + }, + { + "epoch": 29.414634146341463, + "grad_norm": 1.218154788017273, + "learning_rate": 2.3531878476679504e-05, + "loss": 0.2973, + "step": 22914 + }, + { + "epoch": 29.41591784338896, + "grad_norm": 
1.9764404296875, + "learning_rate": 2.3531450577663673e-05, + "loss": 0.2867, + "step": 22915 + }, + { + "epoch": 29.41720154043646, + "grad_norm": 1.9171992540359497, + "learning_rate": 2.353102267864784e-05, + "loss": 0.2807, + "step": 22916 + }, + { + "epoch": 29.418485237483953, + "grad_norm": 4.492321491241455, + "learning_rate": 2.3530594779632006e-05, + "loss": 0.2564, + "step": 22917 + }, + { + "epoch": 29.41976893453145, + "grad_norm": 0.9581020474433899, + "learning_rate": 2.3530166880616175e-05, + "loss": 0.2946, + "step": 22918 + }, + { + "epoch": 29.42105263157895, + "grad_norm": 1.8874684572219849, + "learning_rate": 2.3529738981600343e-05, + "loss": 0.3084, + "step": 22919 + }, + { + "epoch": 29.422336328626443, + "grad_norm": 0.9746703505516052, + "learning_rate": 2.3529311082584508e-05, + "loss": 0.2866, + "step": 22920 + }, + { + "epoch": 29.42362002567394, + "grad_norm": 1.1351954936981201, + "learning_rate": 2.352888318356868e-05, + "loss": 0.28, + "step": 22921 + }, + { + "epoch": 29.42490372272144, + "grad_norm": 1.394834280014038, + "learning_rate": 2.3528455284552845e-05, + "loss": 0.3293, + "step": 22922 + }, + { + "epoch": 29.426187419768933, + "grad_norm": 1.2275246381759644, + "learning_rate": 2.3528027385537017e-05, + "loss": 0.2677, + "step": 22923 + }, + { + "epoch": 29.42747111681643, + "grad_norm": 1.7367042303085327, + "learning_rate": 2.3527599486521182e-05, + "loss": 0.3002, + "step": 22924 + }, + { + "epoch": 29.42875481386393, + "grad_norm": 1.0354585647583008, + "learning_rate": 2.3527171587505347e-05, + "loss": 0.3067, + "step": 22925 + }, + { + "epoch": 29.430038510911427, + "grad_norm": 1.5368168354034424, + "learning_rate": 2.352674368848952e-05, + "loss": 0.3035, + "step": 22926 + }, + { + "epoch": 29.43132220795892, + "grad_norm": 2.0486767292022705, + "learning_rate": 2.3526315789473684e-05, + "loss": 0.3055, + "step": 22927 + }, + { + "epoch": 29.43260590500642, + "grad_norm": 1.1329145431518555, + "learning_rate": 
2.3525887890457852e-05, + "loss": 0.3204, + "step": 22928 + }, + { + "epoch": 29.433889602053917, + "grad_norm": 2.0436642169952393, + "learning_rate": 2.352545999144202e-05, + "loss": 0.3101, + "step": 22929 + }, + { + "epoch": 29.43517329910141, + "grad_norm": 1.1441915035247803, + "learning_rate": 2.352503209242619e-05, + "loss": 0.3028, + "step": 22930 + }, + { + "epoch": 29.43645699614891, + "grad_norm": 3.1168408393859863, + "learning_rate": 2.3524604193410357e-05, + "loss": 0.3327, + "step": 22931 + }, + { + "epoch": 29.437740693196407, + "grad_norm": 1.817253828048706, + "learning_rate": 2.3524176294394522e-05, + "loss": 0.3189, + "step": 22932 + }, + { + "epoch": 29.4390243902439, + "grad_norm": 3.0814316272735596, + "learning_rate": 2.352374839537869e-05, + "loss": 0.3411, + "step": 22933 + }, + { + "epoch": 29.4403080872914, + "grad_norm": 1.8251566886901855, + "learning_rate": 2.352332049636286e-05, + "loss": 0.3495, + "step": 22934 + }, + { + "epoch": 29.441591784338897, + "grad_norm": 11.762024879455566, + "learning_rate": 2.3522892597347028e-05, + "loss": 0.3499, + "step": 22935 + }, + { + "epoch": 29.44287548138639, + "grad_norm": 2.090623378753662, + "learning_rate": 2.3522464698331193e-05, + "loss": 0.3747, + "step": 22936 + }, + { + "epoch": 29.44415917843389, + "grad_norm": 3.5066146850585938, + "learning_rate": 2.3522036799315364e-05, + "loss": 0.3429, + "step": 22937 + }, + { + "epoch": 29.445442875481387, + "grad_norm": 1.573703408241272, + "learning_rate": 2.352160890029953e-05, + "loss": 0.3241, + "step": 22938 + }, + { + "epoch": 29.446726572528885, + "grad_norm": 4.171596527099609, + "learning_rate": 2.3521181001283698e-05, + "loss": 0.3622, + "step": 22939 + }, + { + "epoch": 29.44801026957638, + "grad_norm": 2.996790647506714, + "learning_rate": 2.3520753102267866e-05, + "loss": 0.4019, + "step": 22940 + }, + { + "epoch": 29.449293966623877, + "grad_norm": 2.3011739253997803, + "learning_rate": 2.352032520325203e-05, + "loss": 0.4519, + 
"step": 22941 + }, + { + "epoch": 29.450577663671375, + "grad_norm": 5.213837623596191, + "learning_rate": 2.3519897304236203e-05, + "loss": 0.3119, + "step": 22942 + }, + { + "epoch": 29.45186136071887, + "grad_norm": 1.1362802982330322, + "learning_rate": 2.3519469405220368e-05, + "loss": 0.263, + "step": 22943 + }, + { + "epoch": 29.453145057766367, + "grad_norm": 0.9381871223449707, + "learning_rate": 2.3519041506204536e-05, + "loss": 0.2793, + "step": 22944 + }, + { + "epoch": 29.454428754813865, + "grad_norm": 0.9623916149139404, + "learning_rate": 2.3518613607188705e-05, + "loss": 0.2788, + "step": 22945 + }, + { + "epoch": 29.45571245186136, + "grad_norm": 1.0971516370773315, + "learning_rate": 2.351818570817287e-05, + "loss": 0.3055, + "step": 22946 + }, + { + "epoch": 29.456996148908857, + "grad_norm": 0.8305585980415344, + "learning_rate": 2.3517757809157042e-05, + "loss": 0.2894, + "step": 22947 + }, + { + "epoch": 29.458279845956355, + "grad_norm": 0.8686585426330566, + "learning_rate": 2.3517329910141207e-05, + "loss": 0.2918, + "step": 22948 + }, + { + "epoch": 29.459563543003853, + "grad_norm": 0.9909194707870483, + "learning_rate": 2.3516902011125375e-05, + "loss": 0.2473, + "step": 22949 + }, + { + "epoch": 29.460847240051347, + "grad_norm": 0.9110826849937439, + "learning_rate": 2.3516474112109544e-05, + "loss": 0.2941, + "step": 22950 + }, + { + "epoch": 29.462130937098845, + "grad_norm": 2.2902684211730957, + "learning_rate": 2.3516046213093712e-05, + "loss": 0.3071, + "step": 22951 + }, + { + "epoch": 29.463414634146343, + "grad_norm": 1.4247524738311768, + "learning_rate": 2.3515618314077877e-05, + "loss": 0.2872, + "step": 22952 + }, + { + "epoch": 29.464698331193837, + "grad_norm": 1.0036002397537231, + "learning_rate": 2.3515190415062045e-05, + "loss": 0.2941, + "step": 22953 + }, + { + "epoch": 29.465982028241335, + "grad_norm": 1.2549511194229126, + "learning_rate": 2.3514762516046214e-05, + "loss": 0.2835, + "step": 22954 + }, + { + 
"epoch": 29.467265725288833, + "grad_norm": 0.9550581574440002, + "learning_rate": 2.3514334617030382e-05, + "loss": 0.2847, + "step": 22955 + }, + { + "epoch": 29.468549422336327, + "grad_norm": 1.0875808000564575, + "learning_rate": 2.351390671801455e-05, + "loss": 0.2797, + "step": 22956 + }, + { + "epoch": 29.469833119383825, + "grad_norm": 0.8742344975471497, + "learning_rate": 2.3513478818998716e-05, + "loss": 0.27, + "step": 22957 + }, + { + "epoch": 29.471116816431323, + "grad_norm": 1.4192922115325928, + "learning_rate": 2.3513050919982887e-05, + "loss": 0.2999, + "step": 22958 + }, + { + "epoch": 29.47240051347882, + "grad_norm": 1.6631557941436768, + "learning_rate": 2.3512623020967052e-05, + "loss": 0.3016, + "step": 22959 + }, + { + "epoch": 29.473684210526315, + "grad_norm": 1.8327491283416748, + "learning_rate": 2.3512195121951218e-05, + "loss": 0.2598, + "step": 22960 + }, + { + "epoch": 29.474967907573813, + "grad_norm": 1.0588136911392212, + "learning_rate": 2.351176722293539e-05, + "loss": 0.2969, + "step": 22961 + }, + { + "epoch": 29.47625160462131, + "grad_norm": 1.1972576379776, + "learning_rate": 2.3511339323919554e-05, + "loss": 0.2838, + "step": 22962 + }, + { + "epoch": 29.477535301668805, + "grad_norm": 4.206551551818848, + "learning_rate": 2.3510911424903726e-05, + "loss": 0.2734, + "step": 22963 + }, + { + "epoch": 29.478818998716303, + "grad_norm": 1.0744574069976807, + "learning_rate": 2.351048352588789e-05, + "loss": 0.2854, + "step": 22964 + }, + { + "epoch": 29.4801026957638, + "grad_norm": 0.9791396856307983, + "learning_rate": 2.351005562687206e-05, + "loss": 0.2921, + "step": 22965 + }, + { + "epoch": 29.481386392811295, + "grad_norm": 1.1208786964416504, + "learning_rate": 2.3509627727856228e-05, + "loss": 0.315, + "step": 22966 + }, + { + "epoch": 29.482670089858793, + "grad_norm": 1.1696418523788452, + "learning_rate": 2.3509199828840393e-05, + "loss": 0.3015, + "step": 22967 + }, + { + "epoch": 29.48395378690629, + 
"grad_norm": 0.835610032081604, + "learning_rate": 2.350877192982456e-05, + "loss": 0.2513, + "step": 22968 + }, + { + "epoch": 29.485237483953785, + "grad_norm": 1.0969524383544922, + "learning_rate": 2.350834403080873e-05, + "loss": 0.2727, + "step": 22969 + }, + { + "epoch": 29.486521181001283, + "grad_norm": 0.9650252461433411, + "learning_rate": 2.3507916131792898e-05, + "loss": 0.273, + "step": 22970 + }, + { + "epoch": 29.48780487804878, + "grad_norm": 0.8981412053108215, + "learning_rate": 2.3507488232777067e-05, + "loss": 0.2783, + "step": 22971 + }, + { + "epoch": 29.48908857509628, + "grad_norm": 1.071419358253479, + "learning_rate": 2.350706033376123e-05, + "loss": 0.3013, + "step": 22972 + }, + { + "epoch": 29.490372272143773, + "grad_norm": 10.14621639251709, + "learning_rate": 2.35066324347454e-05, + "loss": 0.2527, + "step": 22973 + }, + { + "epoch": 29.49165596919127, + "grad_norm": 1.1840341091156006, + "learning_rate": 2.350620453572957e-05, + "loss": 0.3065, + "step": 22974 + }, + { + "epoch": 29.49293966623877, + "grad_norm": 0.9949464797973633, + "learning_rate": 2.3505776636713737e-05, + "loss": 0.2936, + "step": 22975 + }, + { + "epoch": 29.494223363286263, + "grad_norm": 1.1507290601730347, + "learning_rate": 2.3505348737697902e-05, + "loss": 0.2861, + "step": 22976 + }, + { + "epoch": 29.49550706033376, + "grad_norm": 1.30077064037323, + "learning_rate": 2.3504920838682074e-05, + "loss": 0.2769, + "step": 22977 + }, + { + "epoch": 29.49679075738126, + "grad_norm": 1.3950018882751465, + "learning_rate": 2.350449293966624e-05, + "loss": 0.3277, + "step": 22978 + }, + { + "epoch": 29.498074454428753, + "grad_norm": 1.8214631080627441, + "learning_rate": 2.3504065040650407e-05, + "loss": 0.2781, + "step": 22979 + }, + { + "epoch": 29.49935815147625, + "grad_norm": 1.0705170631408691, + "learning_rate": 2.3503637141634576e-05, + "loss": 0.3017, + "step": 22980 + }, + { + "epoch": 29.50064184852375, + "grad_norm": 1.442576289176941, + 
"learning_rate": 2.350320924261874e-05, + "loss": 0.3149, + "step": 22981 + }, + { + "epoch": 29.501925545571247, + "grad_norm": 0.9819137454032898, + "learning_rate": 2.3502781343602912e-05, + "loss": 0.2623, + "step": 22982 + }, + { + "epoch": 29.50320924261874, + "grad_norm": 1.6856422424316406, + "learning_rate": 2.3502353444587077e-05, + "loss": 0.3246, + "step": 22983 + }, + { + "epoch": 29.50449293966624, + "grad_norm": 1.2100555896759033, + "learning_rate": 2.3501925545571246e-05, + "loss": 0.2899, + "step": 22984 + }, + { + "epoch": 29.505776636713737, + "grad_norm": 1.209345817565918, + "learning_rate": 2.3501497646555414e-05, + "loss": 0.3076, + "step": 22985 + }, + { + "epoch": 29.50706033376123, + "grad_norm": 2.0674362182617188, + "learning_rate": 2.350106974753958e-05, + "loss": 0.344, + "step": 22986 + }, + { + "epoch": 29.50834403080873, + "grad_norm": 1.3337733745574951, + "learning_rate": 2.350064184852375e-05, + "loss": 0.2913, + "step": 22987 + }, + { + "epoch": 29.509627727856227, + "grad_norm": 2.3490548133850098, + "learning_rate": 2.3500213949507916e-05, + "loss": 0.3369, + "step": 22988 + }, + { + "epoch": 29.51091142490372, + "grad_norm": 1.7937264442443848, + "learning_rate": 2.3499786050492085e-05, + "loss": 0.3659, + "step": 22989 + }, + { + "epoch": 29.51219512195122, + "grad_norm": 9.13563346862793, + "learning_rate": 2.3499358151476253e-05, + "loss": 0.3948, + "step": 22990 + }, + { + "epoch": 29.513478818998717, + "grad_norm": 2.075955867767334, + "learning_rate": 2.349893025246042e-05, + "loss": 0.4038, + "step": 22991 + }, + { + "epoch": 29.514762516046215, + "grad_norm": 1.1749451160430908, + "learning_rate": 2.3498502353444586e-05, + "loss": 0.2774, + "step": 22992 + }, + { + "epoch": 29.51604621309371, + "grad_norm": 2.178130626678467, + "learning_rate": 2.3498074454428755e-05, + "loss": 0.2849, + "step": 22993 + }, + { + "epoch": 29.517329910141207, + "grad_norm": 1.4920129776000977, + "learning_rate": 2.3497646555412923e-05, 
+ "loss": 0.2976, + "step": 22994 + }, + { + "epoch": 29.518613607188705, + "grad_norm": 1.5461089611053467, + "learning_rate": 2.349721865639709e-05, + "loss": 0.2818, + "step": 22995 + }, + { + "epoch": 29.5198973042362, + "grad_norm": 1.5151647329330444, + "learning_rate": 2.349679075738126e-05, + "loss": 0.2641, + "step": 22996 + }, + { + "epoch": 29.521181001283697, + "grad_norm": 2.902867555618286, + "learning_rate": 2.3496362858365425e-05, + "loss": 0.3133, + "step": 22997 + }, + { + "epoch": 29.522464698331195, + "grad_norm": 1.2185629606246948, + "learning_rate": 2.3495934959349597e-05, + "loss": 0.2738, + "step": 22998 + }, + { + "epoch": 29.52374839537869, + "grad_norm": 1.0319033861160278, + "learning_rate": 2.3495507060333762e-05, + "loss": 0.3078, + "step": 22999 + }, + { + "epoch": 29.525032092426187, + "grad_norm": 1.4930106401443481, + "learning_rate": 2.3495079161317927e-05, + "loss": 0.2779, + "step": 23000 + }, + { + "epoch": 29.525032092426187, + "eval_cer": 0.26799001370667713, + "eval_loss": 0.4837488830089569, + "eval_runtime": 13.9224, + "eval_samples_per_second": 70.606, + "eval_steps_per_second": 0.503, + "eval_wer": 0.4498968126576473, + "step": 23000 + }, + { + "epoch": 29.526315789473685, + "grad_norm": 1.6967192888259888, + "learning_rate": 2.34946512623021e-05, + "loss": 0.2724, + "step": 23001 + }, + { + "epoch": 29.527599486521183, + "grad_norm": 0.8968327045440674, + "learning_rate": 2.3494223363286264e-05, + "loss": 0.3089, + "step": 23002 + }, + { + "epoch": 29.528883183568677, + "grad_norm": 1.3706984519958496, + "learning_rate": 2.3493795464270432e-05, + "loss": 0.3059, + "step": 23003 + }, + { + "epoch": 29.530166880616175, + "grad_norm": 1.841687798500061, + "learning_rate": 2.34933675652546e-05, + "loss": 0.3017, + "step": 23004 + }, + { + "epoch": 29.531450577663673, + "grad_norm": 0.8843832015991211, + "learning_rate": 2.349293966623877e-05, + "loss": 0.2861, + "step": 23005 + }, + { + "epoch": 29.532734274711167, + 
"grad_norm": 1.061059594154358, + "learning_rate": 2.3492511767222937e-05, + "loss": 0.2884, + "step": 23006 + }, + { + "epoch": 29.534017971758665, + "grad_norm": 1.1382739543914795, + "learning_rate": 2.3492083868207102e-05, + "loss": 0.2885, + "step": 23007 + }, + { + "epoch": 29.535301668806163, + "grad_norm": 2.5821516513824463, + "learning_rate": 2.349165596919127e-05, + "loss": 0.2994, + "step": 23008 + }, + { + "epoch": 29.536585365853657, + "grad_norm": 1.3128408193588257, + "learning_rate": 2.349122807017544e-05, + "loss": 0.2783, + "step": 23009 + }, + { + "epoch": 29.537869062901155, + "grad_norm": 0.9233804941177368, + "learning_rate": 2.3490800171159608e-05, + "loss": 0.2789, + "step": 23010 + }, + { + "epoch": 29.539152759948653, + "grad_norm": 1.27265465259552, + "learning_rate": 2.3490372272143773e-05, + "loss": 0.3035, + "step": 23011 + }, + { + "epoch": 29.540436456996147, + "grad_norm": 0.918336033821106, + "learning_rate": 2.3489944373127944e-05, + "loss": 0.3053, + "step": 23012 + }, + { + "epoch": 29.541720154043645, + "grad_norm": 3.3874611854553223, + "learning_rate": 2.348951647411211e-05, + "loss": 0.3052, + "step": 23013 + }, + { + "epoch": 29.543003851091143, + "grad_norm": 1.3196195363998413, + "learning_rate": 2.3489088575096278e-05, + "loss": 0.3078, + "step": 23014 + }, + { + "epoch": 29.54428754813864, + "grad_norm": 2.258190155029297, + "learning_rate": 2.3488660676080446e-05, + "loss": 0.3081, + "step": 23015 + }, + { + "epoch": 29.545571245186135, + "grad_norm": 2.0347635746002197, + "learning_rate": 2.348823277706461e-05, + "loss": 0.3124, + "step": 23016 + }, + { + "epoch": 29.546854942233633, + "grad_norm": 1.0057013034820557, + "learning_rate": 2.3487804878048783e-05, + "loss": 0.2824, + "step": 23017 + }, + { + "epoch": 29.54813863928113, + "grad_norm": 1.0582342147827148, + "learning_rate": 2.3487376979032948e-05, + "loss": 0.2848, + "step": 23018 + }, + { + "epoch": 29.549422336328625, + "grad_norm": 1.3684574365615845, + 
"learning_rate": 2.3486949080017117e-05, + "loss": 0.3077, + "step": 23019 + }, + { + "epoch": 29.550706033376123, + "grad_norm": 1.0200246572494507, + "learning_rate": 2.3486521181001285e-05, + "loss": 0.2633, + "step": 23020 + }, + { + "epoch": 29.55198973042362, + "grad_norm": 1.7975490093231201, + "learning_rate": 2.348609328198545e-05, + "loss": 0.3167, + "step": 23021 + }, + { + "epoch": 29.553273427471115, + "grad_norm": 1.5561943054199219, + "learning_rate": 2.3485665382969622e-05, + "loss": 0.2862, + "step": 23022 + }, + { + "epoch": 29.554557124518613, + "grad_norm": 1.1965711116790771, + "learning_rate": 2.3485237483953787e-05, + "loss": 0.3121, + "step": 23023 + }, + { + "epoch": 29.55584082156611, + "grad_norm": 1.3360038995742798, + "learning_rate": 2.3484809584937955e-05, + "loss": 0.2945, + "step": 23024 + }, + { + "epoch": 29.55712451861361, + "grad_norm": 1.0624303817749023, + "learning_rate": 2.3484381685922124e-05, + "loss": 0.3117, + "step": 23025 + }, + { + "epoch": 29.558408215661103, + "grad_norm": 2.3357009887695312, + "learning_rate": 2.3483953786906292e-05, + "loss": 0.3049, + "step": 23026 + }, + { + "epoch": 29.5596919127086, + "grad_norm": 1.2845044136047363, + "learning_rate": 2.3483525887890457e-05, + "loss": 0.2864, + "step": 23027 + }, + { + "epoch": 29.5609756097561, + "grad_norm": 2.3792572021484375, + "learning_rate": 2.3483097988874625e-05, + "loss": 0.3168, + "step": 23028 + }, + { + "epoch": 29.562259306803593, + "grad_norm": 0.9613783955574036, + "learning_rate": 2.3482670089858794e-05, + "loss": 0.3031, + "step": 23029 + }, + { + "epoch": 29.56354300385109, + "grad_norm": 0.9435520172119141, + "learning_rate": 2.3482242190842962e-05, + "loss": 0.3146, + "step": 23030 + }, + { + "epoch": 29.56482670089859, + "grad_norm": 1.9535430669784546, + "learning_rate": 2.348181429182713e-05, + "loss": 0.3026, + "step": 23031 + }, + { + "epoch": 29.566110397946083, + "grad_norm": 1.364145040512085, + "learning_rate": 
2.3481386392811296e-05, + "loss": 0.3214, + "step": 23032 + }, + { + "epoch": 29.56739409499358, + "grad_norm": 5.233346939086914, + "learning_rate": 2.3480958493795464e-05, + "loss": 0.3084, + "step": 23033 + }, + { + "epoch": 29.56867779204108, + "grad_norm": 1.747430443763733, + "learning_rate": 2.3480530594779633e-05, + "loss": 0.3072, + "step": 23034 + }, + { + "epoch": 29.569961489088577, + "grad_norm": 1.2550048828125, + "learning_rate": 2.3480102695763798e-05, + "loss": 0.3203, + "step": 23035 + }, + { + "epoch": 29.57124518613607, + "grad_norm": 1.2518529891967773, + "learning_rate": 2.347967479674797e-05, + "loss": 0.3109, + "step": 23036 + }, + { + "epoch": 29.57252888318357, + "grad_norm": 1.5431374311447144, + "learning_rate": 2.3479246897732134e-05, + "loss": 0.3101, + "step": 23037 + }, + { + "epoch": 29.573812580231067, + "grad_norm": 10.582168579101562, + "learning_rate": 2.3478818998716306e-05, + "loss": 0.3545, + "step": 23038 + }, + { + "epoch": 29.57509627727856, + "grad_norm": 2.042525291442871, + "learning_rate": 2.347839109970047e-05, + "loss": 0.3181, + "step": 23039 + }, + { + "epoch": 29.57637997432606, + "grad_norm": 4.962934494018555, + "learning_rate": 2.3477963200684636e-05, + "loss": 0.4007, + "step": 23040 + }, + { + "epoch": 29.577663671373557, + "grad_norm": 2.102271795272827, + "learning_rate": 2.3477535301668808e-05, + "loss": 0.4349, + "step": 23041 + }, + { + "epoch": 29.57894736842105, + "grad_norm": 2.062110662460327, + "learning_rate": 2.3477107402652973e-05, + "loss": 0.2988, + "step": 23042 + }, + { + "epoch": 29.58023106546855, + "grad_norm": 1.1408966779708862, + "learning_rate": 2.347667950363714e-05, + "loss": 0.2883, + "step": 23043 + }, + { + "epoch": 29.581514762516047, + "grad_norm": 1.296051025390625, + "learning_rate": 2.347625160462131e-05, + "loss": 0.2931, + "step": 23044 + }, + { + "epoch": 29.58279845956354, + "grad_norm": 2.462003231048584, + "learning_rate": 2.347582370560548e-05, + "loss": 0.2975, + 
"step": 23045 + }, + { + "epoch": 29.58408215661104, + "grad_norm": 4.877047061920166, + "learning_rate": 2.3475395806589647e-05, + "loss": 0.2858, + "step": 23046 + }, + { + "epoch": 29.585365853658537, + "grad_norm": 1.1465215682983398, + "learning_rate": 2.3474967907573812e-05, + "loss": 0.2935, + "step": 23047 + }, + { + "epoch": 29.586649550706035, + "grad_norm": 1.2265411615371704, + "learning_rate": 2.347454000855798e-05, + "loss": 0.3372, + "step": 23048 + }, + { + "epoch": 29.58793324775353, + "grad_norm": 1.18027663230896, + "learning_rate": 2.347411210954215e-05, + "loss": 0.2946, + "step": 23049 + }, + { + "epoch": 29.589216944801027, + "grad_norm": 1.679772973060608, + "learning_rate": 2.3473684210526317e-05, + "loss": 0.3062, + "step": 23050 + }, + { + "epoch": 29.590500641848525, + "grad_norm": 1.3376728296279907, + "learning_rate": 2.3473256311510482e-05, + "loss": 0.2965, + "step": 23051 + }, + { + "epoch": 29.59178433889602, + "grad_norm": 1.4403475522994995, + "learning_rate": 2.3472828412494654e-05, + "loss": 0.2892, + "step": 23052 + }, + { + "epoch": 29.593068035943517, + "grad_norm": 1.0025118589401245, + "learning_rate": 2.347240051347882e-05, + "loss": 0.2788, + "step": 23053 + }, + { + "epoch": 29.594351732991015, + "grad_norm": 1.9090245962142944, + "learning_rate": 2.3471972614462987e-05, + "loss": 0.3173, + "step": 23054 + }, + { + "epoch": 29.59563543003851, + "grad_norm": 2.2772157192230225, + "learning_rate": 2.3471544715447156e-05, + "loss": 0.3196, + "step": 23055 + }, + { + "epoch": 29.596919127086007, + "grad_norm": 1.6685385704040527, + "learning_rate": 2.347111681643132e-05, + "loss": 0.3076, + "step": 23056 + }, + { + "epoch": 29.598202824133505, + "grad_norm": 2.5659523010253906, + "learning_rate": 2.3470688917415492e-05, + "loss": 0.291, + "step": 23057 + }, + { + "epoch": 29.599486521181003, + "grad_norm": 1.9979127645492554, + "learning_rate": 2.3470261018399657e-05, + "loss": 0.2743, + "step": 23058 + }, + { + "epoch": 
29.600770218228497, + "grad_norm": 1.2136873006820679, + "learning_rate": 2.3469833119383826e-05, + "loss": 0.2642, + "step": 23059 + }, + { + "epoch": 29.602053915275995, + "grad_norm": 0.8771042823791504, + "learning_rate": 2.3469405220367994e-05, + "loss": 0.2863, + "step": 23060 + }, + { + "epoch": 29.603337612323493, + "grad_norm": 0.9185723662376404, + "learning_rate": 2.346897732135216e-05, + "loss": 0.282, + "step": 23061 + }, + { + "epoch": 29.604621309370987, + "grad_norm": 1.0965285301208496, + "learning_rate": 2.346854942233633e-05, + "loss": 0.2992, + "step": 23062 + }, + { + "epoch": 29.605905006418485, + "grad_norm": 0.8068620562553406, + "learning_rate": 2.3468121523320496e-05, + "loss": 0.2639, + "step": 23063 + }, + { + "epoch": 29.607188703465983, + "grad_norm": 1.972267508506775, + "learning_rate": 2.3467693624304665e-05, + "loss": 0.2714, + "step": 23064 + }, + { + "epoch": 29.608472400513477, + "grad_norm": 1.446305513381958, + "learning_rate": 2.3467265725288833e-05, + "loss": 0.2864, + "step": 23065 + }, + { + "epoch": 29.609756097560975, + "grad_norm": 1.3628817796707153, + "learning_rate": 2.3466837826273e-05, + "loss": 0.272, + "step": 23066 + }, + { + "epoch": 29.611039794608473, + "grad_norm": 1.8015514612197876, + "learning_rate": 2.3466409927257166e-05, + "loss": 0.2744, + "step": 23067 + }, + { + "epoch": 29.61232349165597, + "grad_norm": 0.9412595629692078, + "learning_rate": 2.3465982028241335e-05, + "loss": 0.2794, + "step": 23068 + }, + { + "epoch": 29.613607188703465, + "grad_norm": 1.1228350400924683, + "learning_rate": 2.3465554129225503e-05, + "loss": 0.3112, + "step": 23069 + }, + { + "epoch": 29.614890885750963, + "grad_norm": 0.9813610315322876, + "learning_rate": 2.346512623020967e-05, + "loss": 0.2743, + "step": 23070 + }, + { + "epoch": 29.61617458279846, + "grad_norm": 0.9495212435722351, + "learning_rate": 2.346469833119384e-05, + "loss": 0.2868, + "step": 23071 + }, + { + "epoch": 29.617458279845955, + "grad_norm": 
0.7591753005981445, + "learning_rate": 2.3464270432178005e-05, + "loss": 0.2835, + "step": 23072 + }, + { + "epoch": 29.618741976893453, + "grad_norm": 5.16187858581543, + "learning_rate": 2.3463842533162177e-05, + "loss": 0.2991, + "step": 23073 + }, + { + "epoch": 29.62002567394095, + "grad_norm": 1.0247533321380615, + "learning_rate": 2.3463414634146342e-05, + "loss": 0.3056, + "step": 23074 + }, + { + "epoch": 29.621309370988445, + "grad_norm": 1.6122486591339111, + "learning_rate": 2.3462986735130507e-05, + "loss": 0.2706, + "step": 23075 + }, + { + "epoch": 29.622593068035943, + "grad_norm": 0.9312440752983093, + "learning_rate": 2.346255883611468e-05, + "loss": 0.2825, + "step": 23076 + }, + { + "epoch": 29.62387676508344, + "grad_norm": 1.6877723932266235, + "learning_rate": 2.3462130937098844e-05, + "loss": 0.2795, + "step": 23077 + }, + { + "epoch": 29.625160462130935, + "grad_norm": 0.9919809699058533, + "learning_rate": 2.3461703038083016e-05, + "loss": 0.3071, + "step": 23078 + }, + { + "epoch": 29.626444159178433, + "grad_norm": 1.2220913171768188, + "learning_rate": 2.346127513906718e-05, + "loss": 0.2722, + "step": 23079 + }, + { + "epoch": 29.62772785622593, + "grad_norm": 1.612348198890686, + "learning_rate": 2.346084724005135e-05, + "loss": 0.3133, + "step": 23080 + }, + { + "epoch": 29.62901155327343, + "grad_norm": 3.195359706878662, + "learning_rate": 2.3460419341035517e-05, + "loss": 0.305, + "step": 23081 + }, + { + "epoch": 29.630295250320923, + "grad_norm": 3.0318410396575928, + "learning_rate": 2.3459991442019682e-05, + "loss": 0.2844, + "step": 23082 + }, + { + "epoch": 29.63157894736842, + "grad_norm": 7.2202606201171875, + "learning_rate": 2.345956354300385e-05, + "loss": 0.3525, + "step": 23083 + }, + { + "epoch": 29.63286264441592, + "grad_norm": 1.5384888648986816, + "learning_rate": 2.345913564398802e-05, + "loss": 0.3333, + "step": 23084 + }, + { + "epoch": 29.634146341463413, + "grad_norm": 1.682904839515686, + "learning_rate": 
2.3458707744972188e-05, + "loss": 0.3044, + "step": 23085 + }, + { + "epoch": 29.63543003851091, + "grad_norm": 2.100858688354492, + "learning_rate": 2.3458279845956356e-05, + "loss": 0.3106, + "step": 23086 + }, + { + "epoch": 29.63671373555841, + "grad_norm": 1.8462039232254028, + "learning_rate": 2.3457851946940524e-05, + "loss": 0.33, + "step": 23087 + }, + { + "epoch": 29.637997432605903, + "grad_norm": 4.968445777893066, + "learning_rate": 2.345742404792469e-05, + "loss": 0.3155, + "step": 23088 + }, + { + "epoch": 29.6392811296534, + "grad_norm": 1.3253215551376343, + "learning_rate": 2.3456996148908858e-05, + "loss": 0.416, + "step": 23089 + }, + { + "epoch": 29.6405648267009, + "grad_norm": 2.008613348007202, + "learning_rate": 2.3456568249893026e-05, + "loss": 0.3868, + "step": 23090 + }, + { + "epoch": 29.641848523748397, + "grad_norm": 3.0543503761291504, + "learning_rate": 2.345614035087719e-05, + "loss": 0.5325, + "step": 23091 + }, + { + "epoch": 29.64313222079589, + "grad_norm": 0.7443810701370239, + "learning_rate": 2.3455712451861363e-05, + "loss": 0.2781, + "step": 23092 + }, + { + "epoch": 29.64441591784339, + "grad_norm": 0.7999264597892761, + "learning_rate": 2.3455284552845528e-05, + "loss": 0.2769, + "step": 23093 + }, + { + "epoch": 29.645699614890887, + "grad_norm": 1.108322024345398, + "learning_rate": 2.3454856653829697e-05, + "loss": 0.2937, + "step": 23094 + }, + { + "epoch": 29.64698331193838, + "grad_norm": 1.3505229949951172, + "learning_rate": 2.3454428754813865e-05, + "loss": 0.297, + "step": 23095 + }, + { + "epoch": 29.64826700898588, + "grad_norm": 1.028150200843811, + "learning_rate": 2.345400085579803e-05, + "loss": 0.287, + "step": 23096 + }, + { + "epoch": 29.649550706033377, + "grad_norm": 1.1151373386383057, + "learning_rate": 2.3453572956782202e-05, + "loss": 0.2769, + "step": 23097 + }, + { + "epoch": 29.65083440308087, + "grad_norm": 0.738835334777832, + "learning_rate": 2.3453145057766367e-05, + "loss": 0.2844, + 
"step": 23098 + }, + { + "epoch": 29.65211810012837, + "grad_norm": 0.9043592810630798, + "learning_rate": 2.3452717158750535e-05, + "loss": 0.3028, + "step": 23099 + }, + { + "epoch": 29.653401797175867, + "grad_norm": 1.2053086757659912, + "learning_rate": 2.3452289259734704e-05, + "loss": 0.3147, + "step": 23100 + }, + { + "epoch": 29.654685494223365, + "grad_norm": 0.8924965858459473, + "learning_rate": 2.345186136071887e-05, + "loss": 0.2765, + "step": 23101 + }, + { + "epoch": 29.65596919127086, + "grad_norm": 2.913194179534912, + "learning_rate": 2.345143346170304e-05, + "loss": 0.3123, + "step": 23102 + }, + { + "epoch": 29.657252888318357, + "grad_norm": 1.7076667547225952, + "learning_rate": 2.3451005562687206e-05, + "loss": 0.306, + "step": 23103 + }, + { + "epoch": 29.658536585365855, + "grad_norm": 1.0270012617111206, + "learning_rate": 2.3450577663671374e-05, + "loss": 0.296, + "step": 23104 + }, + { + "epoch": 29.65982028241335, + "grad_norm": 1.0629761219024658, + "learning_rate": 2.3450149764655542e-05, + "loss": 0.2779, + "step": 23105 + }, + { + "epoch": 29.661103979460847, + "grad_norm": 1.0119779109954834, + "learning_rate": 2.344972186563971e-05, + "loss": 0.2862, + "step": 23106 + }, + { + "epoch": 29.662387676508345, + "grad_norm": 0.9375263452529907, + "learning_rate": 2.3449293966623876e-05, + "loss": 0.2801, + "step": 23107 + }, + { + "epoch": 29.66367137355584, + "grad_norm": 3.277782440185547, + "learning_rate": 2.3448866067608044e-05, + "loss": 0.277, + "step": 23108 + }, + { + "epoch": 29.664955070603337, + "grad_norm": 0.9606741666793823, + "learning_rate": 2.3448438168592213e-05, + "loss": 0.284, + "step": 23109 + }, + { + "epoch": 29.666238767650835, + "grad_norm": 1.9442989826202393, + "learning_rate": 2.344801026957638e-05, + "loss": 0.2943, + "step": 23110 + }, + { + "epoch": 29.66752246469833, + "grad_norm": 1.3610129356384277, + "learning_rate": 2.344758237056055e-05, + "loss": 0.3102, + "step": 23111 + }, + { + "epoch": 
29.668806161745827, + "grad_norm": 1.1847275495529175, + "learning_rate": 2.3447154471544714e-05, + "loss": 0.2826, + "step": 23112 + }, + { + "epoch": 29.670089858793325, + "grad_norm": 0.8241307139396667, + "learning_rate": 2.3446726572528886e-05, + "loss": 0.272, + "step": 23113 + }, + { + "epoch": 29.671373555840823, + "grad_norm": 2.5457983016967773, + "learning_rate": 2.344629867351305e-05, + "loss": 0.3125, + "step": 23114 + }, + { + "epoch": 29.672657252888317, + "grad_norm": 2.268122434616089, + "learning_rate": 2.3445870774497216e-05, + "loss": 0.2791, + "step": 23115 + }, + { + "epoch": 29.673940949935815, + "grad_norm": 0.8443577289581299, + "learning_rate": 2.3445442875481388e-05, + "loss": 0.2835, + "step": 23116 + }, + { + "epoch": 29.675224646983313, + "grad_norm": 2.12610125541687, + "learning_rate": 2.3445014976465553e-05, + "loss": 0.3053, + "step": 23117 + }, + { + "epoch": 29.676508344030808, + "grad_norm": 3.662466526031494, + "learning_rate": 2.3444587077449725e-05, + "loss": 0.2798, + "step": 23118 + }, + { + "epoch": 29.677792041078305, + "grad_norm": 1.3868036270141602, + "learning_rate": 2.344415917843389e-05, + "loss": 0.2924, + "step": 23119 + }, + { + "epoch": 29.679075738125803, + "grad_norm": 2.333500623703003, + "learning_rate": 2.344373127941806e-05, + "loss": 0.2744, + "step": 23120 + }, + { + "epoch": 29.680359435173298, + "grad_norm": 3.483461618423462, + "learning_rate": 2.3443303380402227e-05, + "loss": 0.2989, + "step": 23121 + }, + { + "epoch": 29.681643132220795, + "grad_norm": 1.5607529878616333, + "learning_rate": 2.3442875481386392e-05, + "loss": 0.2506, + "step": 23122 + }, + { + "epoch": 29.682926829268293, + "grad_norm": 2.1216819286346436, + "learning_rate": 2.344244758237056e-05, + "loss": 0.2952, + "step": 23123 + }, + { + "epoch": 29.68421052631579, + "grad_norm": 2.2834725379943848, + "learning_rate": 2.344201968335473e-05, + "loss": 0.2734, + "step": 23124 + }, + { + "epoch": 29.685494223363285, + "grad_norm": 
5.157568454742432, + "learning_rate": 2.3441591784338897e-05, + "loss": 0.2991, + "step": 23125 + }, + { + "epoch": 29.686777920410783, + "grad_norm": 1.1913328170776367, + "learning_rate": 2.3441163885323065e-05, + "loss": 0.3003, + "step": 23126 + }, + { + "epoch": 29.68806161745828, + "grad_norm": 2.1188251972198486, + "learning_rate": 2.3440735986307234e-05, + "loss": 0.291, + "step": 23127 + }, + { + "epoch": 29.689345314505776, + "grad_norm": 2.5749969482421875, + "learning_rate": 2.34403080872914e-05, + "loss": 0.2792, + "step": 23128 + }, + { + "epoch": 29.690629011553273, + "grad_norm": 2.076164722442627, + "learning_rate": 2.3439880188275567e-05, + "loss": 0.2815, + "step": 23129 + }, + { + "epoch": 29.69191270860077, + "grad_norm": 1.3522794246673584, + "learning_rate": 2.3439452289259736e-05, + "loss": 0.2926, + "step": 23130 + }, + { + "epoch": 29.693196405648266, + "grad_norm": 2.1793994903564453, + "learning_rate": 2.34390243902439e-05, + "loss": 0.3074, + "step": 23131 + }, + { + "epoch": 29.694480102695763, + "grad_norm": 5.065293312072754, + "learning_rate": 2.3438596491228073e-05, + "loss": 0.3141, + "step": 23132 + }, + { + "epoch": 29.69576379974326, + "grad_norm": 1.1175018548965454, + "learning_rate": 2.3438168592212238e-05, + "loss": 0.3264, + "step": 23133 + }, + { + "epoch": 29.69704749679076, + "grad_norm": 1.8903146982192993, + "learning_rate": 2.343774069319641e-05, + "loss": 0.3, + "step": 23134 + }, + { + "epoch": 29.698331193838253, + "grad_norm": 1.365933895111084, + "learning_rate": 2.3437312794180574e-05, + "loss": 0.3278, + "step": 23135 + }, + { + "epoch": 29.69961489088575, + "grad_norm": 1.5876609086990356, + "learning_rate": 2.343688489516474e-05, + "loss": 0.2938, + "step": 23136 + }, + { + "epoch": 29.70089858793325, + "grad_norm": 1.734047770500183, + "learning_rate": 2.343645699614891e-05, + "loss": 0.3209, + "step": 23137 + }, + { + "epoch": 29.702182284980744, + "grad_norm": 1.3538868427276611, + "learning_rate": 
2.3436029097133076e-05, + "loss": 0.3354, + "step": 23138 + }, + { + "epoch": 29.70346598202824, + "grad_norm": 2.5953316688537598, + "learning_rate": 2.3435601198117245e-05, + "loss": 0.3549, + "step": 23139 + }, + { + "epoch": 29.70474967907574, + "grad_norm": 1.9738675355911255, + "learning_rate": 2.3435173299101413e-05, + "loss": 0.4186, + "step": 23140 + }, + { + "epoch": 29.706033376123234, + "grad_norm": 2.907496452331543, + "learning_rate": 2.343474540008558e-05, + "loss": 0.4954, + "step": 23141 + }, + { + "epoch": 29.70731707317073, + "grad_norm": 1.0015194416046143, + "learning_rate": 2.343431750106975e-05, + "loss": 0.2947, + "step": 23142 + }, + { + "epoch": 29.70860077021823, + "grad_norm": 0.9710833430290222, + "learning_rate": 2.3433889602053915e-05, + "loss": 0.2829, + "step": 23143 + }, + { + "epoch": 29.709884467265724, + "grad_norm": 0.7380360960960388, + "learning_rate": 2.3433461703038083e-05, + "loss": 0.3011, + "step": 23144 + }, + { + "epoch": 29.71116816431322, + "grad_norm": 1.524025797843933, + "learning_rate": 2.3433033804022252e-05, + "loss": 0.3156, + "step": 23145 + }, + { + "epoch": 29.71245186136072, + "grad_norm": 1.413356065750122, + "learning_rate": 2.343260590500642e-05, + "loss": 0.3006, + "step": 23146 + }, + { + "epoch": 29.713735558408217, + "grad_norm": 0.7185951471328735, + "learning_rate": 2.3432178005990585e-05, + "loss": 0.2997, + "step": 23147 + }, + { + "epoch": 29.71501925545571, + "grad_norm": 0.7380267977714539, + "learning_rate": 2.3431750106974757e-05, + "loss": 0.3137, + "step": 23148 + }, + { + "epoch": 29.71630295250321, + "grad_norm": 1.2825756072998047, + "learning_rate": 2.3431322207958922e-05, + "loss": 0.2717, + "step": 23149 + }, + { + "epoch": 29.717586649550707, + "grad_norm": 0.8831636905670166, + "learning_rate": 2.343089430894309e-05, + "loss": 0.3104, + "step": 23150 + }, + { + "epoch": 29.7188703465982, + "grad_norm": 0.7669031620025635, + "learning_rate": 2.343046640992726e-05, + "loss": 0.2733, 
+ "step": 23151 + }, + { + "epoch": 29.7201540436457, + "grad_norm": 1.1943086385726929, + "learning_rate": 2.3430038510911424e-05, + "loss": 0.287, + "step": 23152 + }, + { + "epoch": 29.721437740693197, + "grad_norm": 1.2037725448608398, + "learning_rate": 2.3429610611895596e-05, + "loss": 0.2919, + "step": 23153 + }, + { + "epoch": 29.72272143774069, + "grad_norm": 1.1601663827896118, + "learning_rate": 2.342918271287976e-05, + "loss": 0.3051, + "step": 23154 + }, + { + "epoch": 29.72400513478819, + "grad_norm": 1.2313412427902222, + "learning_rate": 2.342875481386393e-05, + "loss": 0.3284, + "step": 23155 + }, + { + "epoch": 29.725288831835687, + "grad_norm": 3.164065361022949, + "learning_rate": 2.3428326914848097e-05, + "loss": 0.2952, + "step": 23156 + }, + { + "epoch": 29.726572528883185, + "grad_norm": 1.6807318925857544, + "learning_rate": 2.3427899015832262e-05, + "loss": 0.3029, + "step": 23157 + }, + { + "epoch": 29.72785622593068, + "grad_norm": 1.3234614133834839, + "learning_rate": 2.3427471116816434e-05, + "loss": 0.2851, + "step": 23158 + }, + { + "epoch": 29.729139922978177, + "grad_norm": 1.0179132223129272, + "learning_rate": 2.34270432178006e-05, + "loss": 0.2672, + "step": 23159 + }, + { + "epoch": 29.730423620025675, + "grad_norm": 0.8920155167579651, + "learning_rate": 2.3426615318784768e-05, + "loss": 0.2672, + "step": 23160 + }, + { + "epoch": 29.73170731707317, + "grad_norm": 1.6841214895248413, + "learning_rate": 2.3426187419768936e-05, + "loss": 0.2809, + "step": 23161 + }, + { + "epoch": 29.732991014120667, + "grad_norm": 1.0767464637756348, + "learning_rate": 2.34257595207531e-05, + "loss": 0.293, + "step": 23162 + }, + { + "epoch": 29.734274711168165, + "grad_norm": 1.2112808227539062, + "learning_rate": 2.342533162173727e-05, + "loss": 0.2811, + "step": 23163 + }, + { + "epoch": 29.73555840821566, + "grad_norm": 1.5935227870941162, + "learning_rate": 2.3424903722721438e-05, + "loss": 0.2799, + "step": 23164 + }, + { + "epoch": 
29.736842105263158, + "grad_norm": 1.5760993957519531, + "learning_rate": 2.3424475823705606e-05, + "loss": 0.2984, + "step": 23165 + }, + { + "epoch": 29.738125802310655, + "grad_norm": 0.9947316646575928, + "learning_rate": 2.3424047924689775e-05, + "loss": 0.2644, + "step": 23166 + }, + { + "epoch": 29.739409499358153, + "grad_norm": 2.1615395545959473, + "learning_rate": 2.3423620025673943e-05, + "loss": 0.3523, + "step": 23167 + }, + { + "epoch": 29.740693196405648, + "grad_norm": 1.140594720840454, + "learning_rate": 2.3423192126658108e-05, + "loss": 0.2598, + "step": 23168 + }, + { + "epoch": 29.741976893453145, + "grad_norm": 1.1289020776748657, + "learning_rate": 2.3422764227642277e-05, + "loss": 0.2786, + "step": 23169 + }, + { + "epoch": 29.743260590500643, + "grad_norm": 1.9113500118255615, + "learning_rate": 2.3422336328626445e-05, + "loss": 0.2949, + "step": 23170 + }, + { + "epoch": 29.744544287548138, + "grad_norm": 1.2087326049804688, + "learning_rate": 2.342190842961061e-05, + "loss": 0.3083, + "step": 23171 + }, + { + "epoch": 29.745827984595635, + "grad_norm": 1.0547338724136353, + "learning_rate": 2.3421480530594782e-05, + "loss": 0.3034, + "step": 23172 + }, + { + "epoch": 29.747111681643133, + "grad_norm": 0.9512729048728943, + "learning_rate": 2.3421052631578947e-05, + "loss": 0.3013, + "step": 23173 + }, + { + "epoch": 29.748395378690628, + "grad_norm": 1.0491914749145508, + "learning_rate": 2.342062473256312e-05, + "loss": 0.2896, + "step": 23174 + }, + { + "epoch": 29.749679075738126, + "grad_norm": 1.2948553562164307, + "learning_rate": 2.3420196833547284e-05, + "loss": 0.2959, + "step": 23175 + }, + { + "epoch": 29.750962772785623, + "grad_norm": 1.6890290975570679, + "learning_rate": 2.341976893453145e-05, + "loss": 0.3044, + "step": 23176 + }, + { + "epoch": 29.752246469833118, + "grad_norm": 2.7377781867980957, + "learning_rate": 2.341934103551562e-05, + "loss": 0.3114, + "step": 23177 + }, + { + "epoch": 29.753530166880616, + 
"grad_norm": 1.6057606935501099, + "learning_rate": 2.3418913136499786e-05, + "loss": 0.2955, + "step": 23178 + }, + { + "epoch": 29.754813863928113, + "grad_norm": 1.369246244430542, + "learning_rate": 2.3418485237483954e-05, + "loss": 0.2775, + "step": 23179 + }, + { + "epoch": 29.75609756097561, + "grad_norm": 1.8129093647003174, + "learning_rate": 2.3418057338468122e-05, + "loss": 0.3134, + "step": 23180 + }, + { + "epoch": 29.757381258023106, + "grad_norm": 3.519124746322632, + "learning_rate": 2.341762943945229e-05, + "loss": 0.283, + "step": 23181 + }, + { + "epoch": 29.758664955070603, + "grad_norm": 1.4066835641860962, + "learning_rate": 2.341720154043646e-05, + "loss": 0.3133, + "step": 23182 + }, + { + "epoch": 29.7599486521181, + "grad_norm": 1.9363964796066284, + "learning_rate": 2.3416773641420624e-05, + "loss": 0.317, + "step": 23183 + }, + { + "epoch": 29.761232349165596, + "grad_norm": 1.957507610321045, + "learning_rate": 2.3416345742404793e-05, + "loss": 0.3059, + "step": 23184 + }, + { + "epoch": 29.762516046213094, + "grad_norm": 2.2985665798187256, + "learning_rate": 2.341591784338896e-05, + "loss": 0.312, + "step": 23185 + }, + { + "epoch": 29.76379974326059, + "grad_norm": 1.1632087230682373, + "learning_rate": 2.341548994437313e-05, + "loss": 0.2992, + "step": 23186 + }, + { + "epoch": 29.765083440308086, + "grad_norm": 3.213239908218384, + "learning_rate": 2.3415062045357295e-05, + "loss": 0.2993, + "step": 23187 + }, + { + "epoch": 29.766367137355584, + "grad_norm": 4.0280961990356445, + "learning_rate": 2.3414634146341466e-05, + "loss": 0.337, + "step": 23188 + }, + { + "epoch": 29.76765083440308, + "grad_norm": 3.867424726486206, + "learning_rate": 2.341420624732563e-05, + "loss": 0.3661, + "step": 23189 + }, + { + "epoch": 29.76893453145058, + "grad_norm": 2.8644323348999023, + "learning_rate": 2.34137783483098e-05, + "loss": 0.4224, + "step": 23190 + }, + { + "epoch": 29.770218228498074, + "grad_norm": 4.9568305015563965, + 
"learning_rate": 2.3413350449293968e-05, + "loss": 0.4914, + "step": 23191 + }, + { + "epoch": 29.77150192554557, + "grad_norm": 0.8097947239875793, + "learning_rate": 2.3412922550278133e-05, + "loss": 0.2666, + "step": 23192 + }, + { + "epoch": 29.77278562259307, + "grad_norm": 1.244235634803772, + "learning_rate": 2.3412494651262305e-05, + "loss": 0.3166, + "step": 23193 + }, + { + "epoch": 29.774069319640564, + "grad_norm": 0.8307353854179382, + "learning_rate": 2.341206675224647e-05, + "loss": 0.2578, + "step": 23194 + }, + { + "epoch": 29.77535301668806, + "grad_norm": 1.243048906326294, + "learning_rate": 2.341163885323064e-05, + "loss": 0.3049, + "step": 23195 + }, + { + "epoch": 29.77663671373556, + "grad_norm": 2.1594839096069336, + "learning_rate": 2.3411210954214807e-05, + "loss": 0.2855, + "step": 23196 + }, + { + "epoch": 29.777920410783054, + "grad_norm": 1.268166422843933, + "learning_rate": 2.3410783055198972e-05, + "loss": 0.2908, + "step": 23197 + }, + { + "epoch": 29.77920410783055, + "grad_norm": 0.8988884687423706, + "learning_rate": 2.341035515618314e-05, + "loss": 0.3064, + "step": 23198 + }, + { + "epoch": 29.78048780487805, + "grad_norm": 1.0349856615066528, + "learning_rate": 2.340992725716731e-05, + "loss": 0.2956, + "step": 23199 + }, + { + "epoch": 29.781771501925547, + "grad_norm": 1.1517860889434814, + "learning_rate": 2.3409499358151477e-05, + "loss": 0.308, + "step": 23200 + }, + { + "epoch": 29.78305519897304, + "grad_norm": 2.125460624694824, + "learning_rate": 2.3409071459135646e-05, + "loss": 0.322, + "step": 23201 + }, + { + "epoch": 29.78433889602054, + "grad_norm": 0.9228848218917847, + "learning_rate": 2.3408643560119814e-05, + "loss": 0.3058, + "step": 23202 + }, + { + "epoch": 29.785622593068037, + "grad_norm": 1.0206513404846191, + "learning_rate": 2.340821566110398e-05, + "loss": 0.2924, + "step": 23203 + }, + { + "epoch": 29.78690629011553, + "grad_norm": 2.670297145843506, + "learning_rate": 2.3407787762088147e-05, + 
"loss": 0.2991, + "step": 23204 + }, + { + "epoch": 29.78818998716303, + "grad_norm": 1.0668045282363892, + "learning_rate": 2.3407359863072316e-05, + "loss": 0.2784, + "step": 23205 + }, + { + "epoch": 29.789473684210527, + "grad_norm": 1.0890014171600342, + "learning_rate": 2.340693196405648e-05, + "loss": 0.2738, + "step": 23206 + }, + { + "epoch": 29.79075738125802, + "grad_norm": 1.2443901300430298, + "learning_rate": 2.3406504065040653e-05, + "loss": 0.3101, + "step": 23207 + }, + { + "epoch": 29.79204107830552, + "grad_norm": 2.5382778644561768, + "learning_rate": 2.3406076166024818e-05, + "loss": 0.3191, + "step": 23208 + }, + { + "epoch": 29.793324775353017, + "grad_norm": 0.9952653646469116, + "learning_rate": 2.340564826700899e-05, + "loss": 0.2495, + "step": 23209 + }, + { + "epoch": 29.794608472400512, + "grad_norm": 1.232662558555603, + "learning_rate": 2.3405220367993154e-05, + "loss": 0.3073, + "step": 23210 + }, + { + "epoch": 29.79589216944801, + "grad_norm": 2.169389247894287, + "learning_rate": 2.340479246897732e-05, + "loss": 0.2909, + "step": 23211 + }, + { + "epoch": 29.797175866495508, + "grad_norm": 1.7204073667526245, + "learning_rate": 2.340436456996149e-05, + "loss": 0.2893, + "step": 23212 + }, + { + "epoch": 29.798459563543005, + "grad_norm": 0.9467862248420715, + "learning_rate": 2.3403936670945656e-05, + "loss": 0.2945, + "step": 23213 + }, + { + "epoch": 29.7997432605905, + "grad_norm": 1.54688560962677, + "learning_rate": 2.3403508771929825e-05, + "loss": 0.3021, + "step": 23214 + }, + { + "epoch": 29.801026957637998, + "grad_norm": 1.4950366020202637, + "learning_rate": 2.3403080872913993e-05, + "loss": 0.2749, + "step": 23215 + }, + { + "epoch": 29.802310654685495, + "grad_norm": 1.4930044412612915, + "learning_rate": 2.340265297389816e-05, + "loss": 0.285, + "step": 23216 + }, + { + "epoch": 29.80359435173299, + "grad_norm": 1.3875116109848022, + "learning_rate": 2.340222507488233e-05, + "loss": 0.3165, + "step": 23217 + }, + { 
+ "epoch": 29.804878048780488, + "grad_norm": 1.308295726776123, + "learning_rate": 2.3401797175866495e-05, + "loss": 0.2748, + "step": 23218 + }, + { + "epoch": 29.806161745827985, + "grad_norm": 1.8326703310012817, + "learning_rate": 2.3401369276850663e-05, + "loss": 0.2629, + "step": 23219 + }, + { + "epoch": 29.80744544287548, + "grad_norm": 1.927560806274414, + "learning_rate": 2.3400941377834832e-05, + "loss": 0.2521, + "step": 23220 + }, + { + "epoch": 29.808729139922978, + "grad_norm": 1.2413939237594604, + "learning_rate": 2.3400513478819e-05, + "loss": 0.2727, + "step": 23221 + }, + { + "epoch": 29.810012836970476, + "grad_norm": 1.9685757160186768, + "learning_rate": 2.3400085579803165e-05, + "loss": 0.2918, + "step": 23222 + }, + { + "epoch": 29.811296534017973, + "grad_norm": 1.8091099262237549, + "learning_rate": 2.3399657680787334e-05, + "loss": 0.3361, + "step": 23223 + }, + { + "epoch": 29.812580231065468, + "grad_norm": 2.679340362548828, + "learning_rate": 2.3399229781771502e-05, + "loss": 0.3205, + "step": 23224 + }, + { + "epoch": 29.813863928112966, + "grad_norm": 1.922476053237915, + "learning_rate": 2.339880188275567e-05, + "loss": 0.314, + "step": 23225 + }, + { + "epoch": 29.815147625160463, + "grad_norm": 3.1994757652282715, + "learning_rate": 2.339837398373984e-05, + "loss": 0.3276, + "step": 23226 + }, + { + "epoch": 29.816431322207958, + "grad_norm": 1.798181176185608, + "learning_rate": 2.3397946084724004e-05, + "loss": 0.2722, + "step": 23227 + }, + { + "epoch": 29.817715019255456, + "grad_norm": 1.3282477855682373, + "learning_rate": 2.3397518185708176e-05, + "loss": 0.2905, + "step": 23228 + }, + { + "epoch": 29.818998716302954, + "grad_norm": 2.741978645324707, + "learning_rate": 2.339709028669234e-05, + "loss": 0.3231, + "step": 23229 + }, + { + "epoch": 29.820282413350448, + "grad_norm": 3.82755708694458, + "learning_rate": 2.3396662387676506e-05, + "loss": 0.2942, + "step": 23230 + }, + { + "epoch": 29.821566110397946, + 
"grad_norm": 1.347217321395874, + "learning_rate": 2.3396234488660678e-05, + "loss": 0.2762, + "step": 23231 + }, + { + "epoch": 29.822849807445444, + "grad_norm": 2.2936151027679443, + "learning_rate": 2.3395806589644843e-05, + "loss": 0.3174, + "step": 23232 + }, + { + "epoch": 29.82413350449294, + "grad_norm": 1.4826397895812988, + "learning_rate": 2.3395378690629014e-05, + "loss": 0.3136, + "step": 23233 + }, + { + "epoch": 29.825417201540436, + "grad_norm": 2.901057481765747, + "learning_rate": 2.339495079161318e-05, + "loss": 0.277, + "step": 23234 + }, + { + "epoch": 29.826700898587934, + "grad_norm": 3.297377109527588, + "learning_rate": 2.3394522892597348e-05, + "loss": 0.3245, + "step": 23235 + }, + { + "epoch": 29.82798459563543, + "grad_norm": 1.3479810953140259, + "learning_rate": 2.3394094993581516e-05, + "loss": 0.3095, + "step": 23236 + }, + { + "epoch": 29.829268292682926, + "grad_norm": 1.9875619411468506, + "learning_rate": 2.339366709456568e-05, + "loss": 0.3546, + "step": 23237 + }, + { + "epoch": 29.830551989730424, + "grad_norm": 1.8034520149230957, + "learning_rate": 2.339323919554985e-05, + "loss": 0.3596, + "step": 23238 + }, + { + "epoch": 29.83183568677792, + "grad_norm": 1.722825288772583, + "learning_rate": 2.3392811296534018e-05, + "loss": 0.3583, + "step": 23239 + }, + { + "epoch": 29.833119383825416, + "grad_norm": 2.9461729526519775, + "learning_rate": 2.3392383397518186e-05, + "loss": 0.3906, + "step": 23240 + }, + { + "epoch": 29.834403080872914, + "grad_norm": 2.107726812362671, + "learning_rate": 2.3391955498502355e-05, + "loss": 0.5037, + "step": 23241 + }, + { + "epoch": 29.83568677792041, + "grad_norm": 0.7516592741012573, + "learning_rate": 2.3391527599486523e-05, + "loss": 0.2951, + "step": 23242 + }, + { + "epoch": 29.836970474967906, + "grad_norm": 1.8023215532302856, + "learning_rate": 2.339109970047069e-05, + "loss": 0.2982, + "step": 23243 + }, + { + "epoch": 29.838254172015404, + "grad_norm": 1.0962549448013306, + 
"learning_rate": 2.3390671801454857e-05, + "loss": 0.3274, + "step": 23244 + }, + { + "epoch": 29.8395378690629, + "grad_norm": 1.0740201473236084, + "learning_rate": 2.3390243902439025e-05, + "loss": 0.2764, + "step": 23245 + }, + { + "epoch": 29.8408215661104, + "grad_norm": 1.1134703159332275, + "learning_rate": 2.338981600342319e-05, + "loss": 0.3014, + "step": 23246 + }, + { + "epoch": 29.842105263157894, + "grad_norm": 1.0721310377120972, + "learning_rate": 2.3389388104407362e-05, + "loss": 0.2839, + "step": 23247 + }, + { + "epoch": 29.84338896020539, + "grad_norm": 2.739820957183838, + "learning_rate": 2.3388960205391527e-05, + "loss": 0.2858, + "step": 23248 + }, + { + "epoch": 29.84467265725289, + "grad_norm": 1.0179420709609985, + "learning_rate": 2.33885323063757e-05, + "loss": 0.3131, + "step": 23249 + }, + { + "epoch": 29.845956354300384, + "grad_norm": 2.7474477291107178, + "learning_rate": 2.3388104407359864e-05, + "loss": 0.2817, + "step": 23250 + }, + { + "epoch": 29.84724005134788, + "grad_norm": 1.8800941705703735, + "learning_rate": 2.338767650834403e-05, + "loss": 0.3028, + "step": 23251 + }, + { + "epoch": 29.84852374839538, + "grad_norm": 2.559352397918701, + "learning_rate": 2.33872486093282e-05, + "loss": 0.3088, + "step": 23252 + }, + { + "epoch": 29.849807445442874, + "grad_norm": 1.0254689455032349, + "learning_rate": 2.3386820710312366e-05, + "loss": 0.2827, + "step": 23253 + }, + { + "epoch": 29.85109114249037, + "grad_norm": 1.3970415592193604, + "learning_rate": 2.3386392811296534e-05, + "loss": 0.2804, + "step": 23254 + }, + { + "epoch": 29.85237483953787, + "grad_norm": 1.0959594249725342, + "learning_rate": 2.3385964912280702e-05, + "loss": 0.2914, + "step": 23255 + }, + { + "epoch": 29.853658536585368, + "grad_norm": 2.9041125774383545, + "learning_rate": 2.338553701326487e-05, + "loss": 0.2796, + "step": 23256 + }, + { + "epoch": 29.854942233632862, + "grad_norm": 1.1146494150161743, + "learning_rate": 2.338510911424904e-05, + 
"loss": 0.2981, + "step": 23257 + }, + { + "epoch": 29.85622593068036, + "grad_norm": 1.8797125816345215, + "learning_rate": 2.3384681215233204e-05, + "loss": 0.309, + "step": 23258 + }, + { + "epoch": 29.857509627727858, + "grad_norm": 1.531172513961792, + "learning_rate": 2.3384253316217373e-05, + "loss": 0.2633, + "step": 23259 + }, + { + "epoch": 29.858793324775352, + "grad_norm": 2.1872944831848145, + "learning_rate": 2.338382541720154e-05, + "loss": 0.2832, + "step": 23260 + }, + { + "epoch": 29.86007702182285, + "grad_norm": 3.7988643646240234, + "learning_rate": 2.338339751818571e-05, + "loss": 0.3168, + "step": 23261 + }, + { + "epoch": 29.861360718870348, + "grad_norm": 1.1424472332000732, + "learning_rate": 2.3382969619169875e-05, + "loss": 0.2815, + "step": 23262 + }, + { + "epoch": 29.862644415917842, + "grad_norm": 0.9933681488037109, + "learning_rate": 2.3382541720154046e-05, + "loss": 0.306, + "step": 23263 + }, + { + "epoch": 29.86392811296534, + "grad_norm": 1.721043586730957, + "learning_rate": 2.338211382113821e-05, + "loss": 0.2699, + "step": 23264 + }, + { + "epoch": 29.865211810012838, + "grad_norm": 1.7026216983795166, + "learning_rate": 2.338168592212238e-05, + "loss": 0.273, + "step": 23265 + }, + { + "epoch": 29.866495507060336, + "grad_norm": 2.772531270980835, + "learning_rate": 2.3381258023106548e-05, + "loss": 0.2999, + "step": 23266 + }, + { + "epoch": 29.86777920410783, + "grad_norm": 1.1715632677078247, + "learning_rate": 2.3380830124090713e-05, + "loss": 0.2762, + "step": 23267 + }, + { + "epoch": 29.869062901155328, + "grad_norm": 1.3537489175796509, + "learning_rate": 2.3380402225074885e-05, + "loss": 0.2673, + "step": 23268 + }, + { + "epoch": 29.870346598202826, + "grad_norm": 2.2396762371063232, + "learning_rate": 2.337997432605905e-05, + "loss": 0.2713, + "step": 23269 + }, + { + "epoch": 29.87163029525032, + "grad_norm": 1.4601240158081055, + "learning_rate": 2.337954642704322e-05, + "loss": 0.2994, + "step": 23270 + }, + { 
+ "epoch": 29.872913992297818, + "grad_norm": 3.3572700023651123, + "learning_rate": 2.3379118528027387e-05, + "loss": 0.3228, + "step": 23271 + }, + { + "epoch": 29.874197689345316, + "grad_norm": 5.006834983825684, + "learning_rate": 2.3378690629011552e-05, + "loss": 0.2681, + "step": 23272 + }, + { + "epoch": 29.87548138639281, + "grad_norm": 1.4113283157348633, + "learning_rate": 2.3378262729995724e-05, + "loss": 0.2955, + "step": 23273 + }, + { + "epoch": 29.876765083440308, + "grad_norm": 1.0792052745819092, + "learning_rate": 2.337783483097989e-05, + "loss": 0.2781, + "step": 23274 + }, + { + "epoch": 29.878048780487806, + "grad_norm": 1.225459098815918, + "learning_rate": 2.3377406931964057e-05, + "loss": 0.3007, + "step": 23275 + }, + { + "epoch": 29.8793324775353, + "grad_norm": 7.623040676116943, + "learning_rate": 2.3376979032948226e-05, + "loss": 0.2859, + "step": 23276 + }, + { + "epoch": 29.880616174582798, + "grad_norm": 2.8395278453826904, + "learning_rate": 2.3376551133932394e-05, + "loss": 0.3226, + "step": 23277 + }, + { + "epoch": 29.881899871630296, + "grad_norm": 3.2168147563934326, + "learning_rate": 2.337612323491656e-05, + "loss": 0.347, + "step": 23278 + }, + { + "epoch": 29.883183568677794, + "grad_norm": 1.9953261613845825, + "learning_rate": 2.3375695335900727e-05, + "loss": 0.3127, + "step": 23279 + }, + { + "epoch": 29.884467265725288, + "grad_norm": 3.9373526573181152, + "learning_rate": 2.3375267436884896e-05, + "loss": 0.2919, + "step": 23280 + }, + { + "epoch": 29.885750962772786, + "grad_norm": 2.1319141387939453, + "learning_rate": 2.3374839537869064e-05, + "loss": 0.2899, + "step": 23281 + }, + { + "epoch": 29.887034659820284, + "grad_norm": 3.0183591842651367, + "learning_rate": 2.3374411638853233e-05, + "loss": 0.3108, + "step": 23282 + }, + { + "epoch": 29.888318356867778, + "grad_norm": 9.484999656677246, + "learning_rate": 2.3373983739837398e-05, + "loss": 0.3088, + "step": 23283 + }, + { + "epoch": 29.889602053915276, + 
"grad_norm": 3.635266065597534, + "learning_rate": 2.3373555840821566e-05, + "loss": 0.3205, + "step": 23284 + }, + { + "epoch": 29.890885750962774, + "grad_norm": 5.15251350402832, + "learning_rate": 2.3373127941805734e-05, + "loss": 0.3202, + "step": 23285 + }, + { + "epoch": 29.892169448010268, + "grad_norm": 5.528926849365234, + "learning_rate": 2.33727000427899e-05, + "loss": 0.3272, + "step": 23286 + }, + { + "epoch": 29.893453145057766, + "grad_norm": 2.483421564102173, + "learning_rate": 2.337227214377407e-05, + "loss": 0.3588, + "step": 23287 + }, + { + "epoch": 29.894736842105264, + "grad_norm": 3.187314033508301, + "learning_rate": 2.3371844244758236e-05, + "loss": 0.377, + "step": 23288 + }, + { + "epoch": 29.89602053915276, + "grad_norm": 2.4414384365081787, + "learning_rate": 2.3371416345742408e-05, + "loss": 0.3806, + "step": 23289 + }, + { + "epoch": 29.897304236200256, + "grad_norm": 2.1404800415039062, + "learning_rate": 2.3370988446726573e-05, + "loss": 0.3775, + "step": 23290 + }, + { + "epoch": 29.898587933247754, + "grad_norm": 2.3195605278015137, + "learning_rate": 2.3370560547710738e-05, + "loss": 0.4653, + "step": 23291 + }, + { + "epoch": 29.89987163029525, + "grad_norm": 1.242884874343872, + "learning_rate": 2.337013264869491e-05, + "loss": 0.3281, + "step": 23292 + }, + { + "epoch": 29.901155327342746, + "grad_norm": 0.8448021411895752, + "learning_rate": 2.3369704749679075e-05, + "loss": 0.2941, + "step": 23293 + }, + { + "epoch": 29.902439024390244, + "grad_norm": 0.7391595840454102, + "learning_rate": 2.3369276850663243e-05, + "loss": 0.3045, + "step": 23294 + }, + { + "epoch": 29.90372272143774, + "grad_norm": 0.9601650834083557, + "learning_rate": 2.3368848951647412e-05, + "loss": 0.3006, + "step": 23295 + }, + { + "epoch": 29.905006418485236, + "grad_norm": 1.4842609167099, + "learning_rate": 2.336842105263158e-05, + "loss": 0.3168, + "step": 23296 + }, + { + "epoch": 29.906290115532734, + "grad_norm": 1.5029406547546387, + 
"learning_rate": 2.336799315361575e-05, + "loss": 0.2692, + "step": 23297 + }, + { + "epoch": 29.90757381258023, + "grad_norm": 1.2900035381317139, + "learning_rate": 2.3367565254599914e-05, + "loss": 0.2995, + "step": 23298 + }, + { + "epoch": 29.90885750962773, + "grad_norm": 0.857740044593811, + "learning_rate": 2.3367137355584082e-05, + "loss": 0.3027, + "step": 23299 + }, + { + "epoch": 29.910141206675224, + "grad_norm": 0.9579412937164307, + "learning_rate": 2.336670945656825e-05, + "loss": 0.2672, + "step": 23300 + }, + { + "epoch": 29.911424903722722, + "grad_norm": 15.31470775604248, + "learning_rate": 2.336628155755242e-05, + "loss": 0.3013, + "step": 23301 + }, + { + "epoch": 29.91270860077022, + "grad_norm": 3.659087896347046, + "learning_rate": 2.3365853658536584e-05, + "loss": 0.3108, + "step": 23302 + }, + { + "epoch": 29.913992297817714, + "grad_norm": 0.8474793434143066, + "learning_rate": 2.3365425759520756e-05, + "loss": 0.2811, + "step": 23303 + }, + { + "epoch": 29.915275994865212, + "grad_norm": 2.2933614253997803, + "learning_rate": 2.336499786050492e-05, + "loss": 0.2614, + "step": 23304 + }, + { + "epoch": 29.91655969191271, + "grad_norm": 0.9142018556594849, + "learning_rate": 2.336456996148909e-05, + "loss": 0.3096, + "step": 23305 + }, + { + "epoch": 29.917843388960204, + "grad_norm": 1.9860299825668335, + "learning_rate": 2.3364142062473258e-05, + "loss": 0.2708, + "step": 23306 + }, + { + "epoch": 29.919127086007702, + "grad_norm": 2.1617586612701416, + "learning_rate": 2.3363714163457423e-05, + "loss": 0.2936, + "step": 23307 + }, + { + "epoch": 29.9204107830552, + "grad_norm": 1.5290327072143555, + "learning_rate": 2.3363286264441594e-05, + "loss": 0.2798, + "step": 23308 + }, + { + "epoch": 29.921694480102694, + "grad_norm": 0.900494396686554, + "learning_rate": 2.336285836542576e-05, + "loss": 0.3057, + "step": 23309 + }, + { + "epoch": 29.922978177150192, + "grad_norm": 2.3712351322174072, + "learning_rate": 
2.3362430466409928e-05, + "loss": 0.3197, + "step": 23310 + }, + { + "epoch": 29.92426187419769, + "grad_norm": 3.7234299182891846, + "learning_rate": 2.3362002567394096e-05, + "loss": 0.2566, + "step": 23311 + }, + { + "epoch": 29.925545571245188, + "grad_norm": 1.8354650735855103, + "learning_rate": 2.336157466837826e-05, + "loss": 0.2635, + "step": 23312 + }, + { + "epoch": 29.926829268292682, + "grad_norm": 4.495632171630859, + "learning_rate": 2.3361146769362433e-05, + "loss": 0.2855, + "step": 23313 + }, + { + "epoch": 29.92811296534018, + "grad_norm": 1.2817684412002563, + "learning_rate": 2.3360718870346598e-05, + "loss": 0.3014, + "step": 23314 + }, + { + "epoch": 29.929396662387678, + "grad_norm": 1.6211224794387817, + "learning_rate": 2.3360290971330767e-05, + "loss": 0.3007, + "step": 23315 + }, + { + "epoch": 29.930680359435172, + "grad_norm": 2.5867738723754883, + "learning_rate": 2.3359863072314935e-05, + "loss": 0.296, + "step": 23316 + }, + { + "epoch": 29.93196405648267, + "grad_norm": 2.299825429916382, + "learning_rate": 2.3359435173299103e-05, + "loss": 0.3007, + "step": 23317 + }, + { + "epoch": 29.933247753530168, + "grad_norm": 0.860439658164978, + "learning_rate": 2.335900727428327e-05, + "loss": 0.3196, + "step": 23318 + }, + { + "epoch": 29.934531450577662, + "grad_norm": 2.2469921112060547, + "learning_rate": 2.3358579375267437e-05, + "loss": 0.3011, + "step": 23319 + }, + { + "epoch": 29.93581514762516, + "grad_norm": 1.0827052593231201, + "learning_rate": 2.3358151476251605e-05, + "loss": 0.2689, + "step": 23320 + }, + { + "epoch": 29.937098844672658, + "grad_norm": 1.5344111919403076, + "learning_rate": 2.3357723577235774e-05, + "loss": 0.2873, + "step": 23321 + }, + { + "epoch": 29.938382541720156, + "grad_norm": 3.1998443603515625, + "learning_rate": 2.3357295678219942e-05, + "loss": 0.2893, + "step": 23322 + }, + { + "epoch": 29.93966623876765, + "grad_norm": 3.815157651901245, + "learning_rate": 2.3356867779204107e-05, + "loss": 
0.3008, + "step": 23323 + }, + { + "epoch": 29.940949935815148, + "grad_norm": 4.7550458908081055, + "learning_rate": 2.335643988018828e-05, + "loss": 0.3229, + "step": 23324 + }, + { + "epoch": 29.942233632862646, + "grad_norm": 1.538667917251587, + "learning_rate": 2.3356011981172444e-05, + "loss": 0.2974, + "step": 23325 + }, + { + "epoch": 29.94351732991014, + "grad_norm": 1.49657142162323, + "learning_rate": 2.335558408215661e-05, + "loss": 0.2913, + "step": 23326 + }, + { + "epoch": 29.944801026957638, + "grad_norm": 1.035172939300537, + "learning_rate": 2.335515618314078e-05, + "loss": 0.2655, + "step": 23327 + }, + { + "epoch": 29.946084724005136, + "grad_norm": 1.8562531471252441, + "learning_rate": 2.3354728284124946e-05, + "loss": 0.3034, + "step": 23328 + }, + { + "epoch": 29.94736842105263, + "grad_norm": 1.4033927917480469, + "learning_rate": 2.3354300385109118e-05, + "loss": 0.2764, + "step": 23329 + }, + { + "epoch": 29.948652118100128, + "grad_norm": 1.4914335012435913, + "learning_rate": 2.3353872486093283e-05, + "loss": 0.3102, + "step": 23330 + }, + { + "epoch": 29.949935815147626, + "grad_norm": 1.8869564533233643, + "learning_rate": 2.335344458707745e-05, + "loss": 0.3221, + "step": 23331 + }, + { + "epoch": 29.951219512195124, + "grad_norm": 3.0189967155456543, + "learning_rate": 2.335301668806162e-05, + "loss": 0.3441, + "step": 23332 + }, + { + "epoch": 29.952503209242618, + "grad_norm": 1.8876596689224243, + "learning_rate": 2.3352588789045784e-05, + "loss": 0.3566, + "step": 23333 + }, + { + "epoch": 29.953786906290116, + "grad_norm": 3.2052645683288574, + "learning_rate": 2.3352160890029953e-05, + "loss": 0.3316, + "step": 23334 + }, + { + "epoch": 29.955070603337614, + "grad_norm": 1.586047887802124, + "learning_rate": 2.335173299101412e-05, + "loss": 0.3169, + "step": 23335 + }, + { + "epoch": 29.956354300385108, + "grad_norm": 2.353081226348877, + "learning_rate": 2.335130509199829e-05, + "loss": 0.3206, + "step": 23336 + }, + { + 
"epoch": 29.957637997432606, + "grad_norm": 1.2371788024902344, + "learning_rate": 2.3350877192982458e-05, + "loss": 0.3417, + "step": 23337 + }, + { + "epoch": 29.958921694480104, + "grad_norm": 2.475446939468384, + "learning_rate": 2.3350449293966626e-05, + "loss": 0.3555, + "step": 23338 + }, + { + "epoch": 29.960205391527598, + "grad_norm": 2.1691389083862305, + "learning_rate": 2.335002139495079e-05, + "loss": 0.346, + "step": 23339 + }, + { + "epoch": 29.961489088575096, + "grad_norm": 9.529752731323242, + "learning_rate": 2.334959349593496e-05, + "loss": 0.3936, + "step": 23340 + }, + { + "epoch": 29.962772785622594, + "grad_norm": 3.9459068775177, + "learning_rate": 2.3349165596919128e-05, + "loss": 0.4821, + "step": 23341 + }, + { + "epoch": 29.964056482670088, + "grad_norm": 1.693493366241455, + "learning_rate": 2.3348737697903293e-05, + "loss": 0.2778, + "step": 23342 + }, + { + "epoch": 29.965340179717586, + "grad_norm": 0.8430227637290955, + "learning_rate": 2.3348309798887465e-05, + "loss": 0.3039, + "step": 23343 + }, + { + "epoch": 29.966623876765084, + "grad_norm": 0.8975194096565247, + "learning_rate": 2.334788189987163e-05, + "loss": 0.2958, + "step": 23344 + }, + { + "epoch": 29.96790757381258, + "grad_norm": 2.5841615200042725, + "learning_rate": 2.33474540008558e-05, + "loss": 0.2905, + "step": 23345 + }, + { + "epoch": 29.969191270860076, + "grad_norm": 1.68199622631073, + "learning_rate": 2.3347026101839967e-05, + "loss": 0.292, + "step": 23346 + }, + { + "epoch": 29.970474967907574, + "grad_norm": 2.0207297801971436, + "learning_rate": 2.3346598202824132e-05, + "loss": 0.3057, + "step": 23347 + }, + { + "epoch": 29.971758664955072, + "grad_norm": 1.2474273443222046, + "learning_rate": 2.3346170303808304e-05, + "loss": 0.3043, + "step": 23348 + }, + { + "epoch": 29.973042362002566, + "grad_norm": 0.9118398427963257, + "learning_rate": 2.334574240479247e-05, + "loss": 0.304, + "step": 23349 + }, + { + "epoch": 29.974326059050064, + 
"grad_norm": 3.0947561264038086, + "learning_rate": 2.3345314505776637e-05, + "loss": 0.3382, + "step": 23350 + }, + { + "epoch": 29.975609756097562, + "grad_norm": 1.2740681171417236, + "learning_rate": 2.3344886606760806e-05, + "loss": 0.3053, + "step": 23351 + }, + { + "epoch": 29.976893453145056, + "grad_norm": 1.5802431106567383, + "learning_rate": 2.334445870774497e-05, + "loss": 0.2719, + "step": 23352 + }, + { + "epoch": 29.978177150192554, + "grad_norm": 1.558254599571228, + "learning_rate": 2.3344030808729142e-05, + "loss": 0.3066, + "step": 23353 + }, + { + "epoch": 29.979460847240052, + "grad_norm": 1.2601393461227417, + "learning_rate": 2.3343602909713307e-05, + "loss": 0.2902, + "step": 23354 + }, + { + "epoch": 29.98074454428755, + "grad_norm": 1.1948944330215454, + "learning_rate": 2.3343175010697476e-05, + "loss": 0.2969, + "step": 23355 + }, + { + "epoch": 29.982028241335044, + "grad_norm": 6.325289249420166, + "learning_rate": 2.3342747111681644e-05, + "loss": 0.286, + "step": 23356 + }, + { + "epoch": 29.983311938382542, + "grad_norm": 1.6221530437469482, + "learning_rate": 2.3342319212665813e-05, + "loss": 0.2825, + "step": 23357 + }, + { + "epoch": 29.98459563543004, + "grad_norm": 1.0776640176773071, + "learning_rate": 2.3341891313649978e-05, + "loss": 0.2774, + "step": 23358 + }, + { + "epoch": 29.985879332477534, + "grad_norm": 1.0623666048049927, + "learning_rate": 2.3341463414634146e-05, + "loss": 0.2846, + "step": 23359 + }, + { + "epoch": 29.987163029525032, + "grad_norm": 2.59745454788208, + "learning_rate": 2.3341035515618315e-05, + "loss": 0.3114, + "step": 23360 + }, + { + "epoch": 29.98844672657253, + "grad_norm": 1.711427927017212, + "learning_rate": 2.3340607616602483e-05, + "loss": 0.3268, + "step": 23361 + }, + { + "epoch": 29.989730423620024, + "grad_norm": 2.516958236694336, + "learning_rate": 2.334017971758665e-05, + "loss": 0.2751, + "step": 23362 + }, + { + "epoch": 29.991014120667522, + "grad_norm": 2.553750991821289, + 
"learning_rate": 2.3339751818570816e-05, + "loss": 0.2909, + "step": 23363 + }, + { + "epoch": 29.99229781771502, + "grad_norm": 1.7149509191513062, + "learning_rate": 2.3339323919554988e-05, + "loss": 0.3023, + "step": 23364 + }, + { + "epoch": 29.993581514762518, + "grad_norm": 1.143154263496399, + "learning_rate": 2.3338896020539153e-05, + "loss": 0.3058, + "step": 23365 + }, + { + "epoch": 29.994865211810012, + "grad_norm": 1.9948153495788574, + "learning_rate": 2.3338468121523318e-05, + "loss": 0.342, + "step": 23366 + }, + { + "epoch": 29.99614890885751, + "grad_norm": 1.353908896446228, + "learning_rate": 2.333804022250749e-05, + "loss": 0.3469, + "step": 23367 + }, + { + "epoch": 29.997432605905008, + "grad_norm": 2.001152753829956, + "learning_rate": 2.3337612323491655e-05, + "loss": 0.3558, + "step": 23368 + }, + { + "epoch": 29.998716302952502, + "grad_norm": 3.4331960678100586, + "learning_rate": 2.3337184424475827e-05, + "loss": 0.3697, + "step": 23369 + }, + { + "epoch": 30.0, + "grad_norm": 4.7400102615356445, + "learning_rate": 2.3336756525459992e-05, + "loss": 0.5025, + "step": 23370 + }, + { + "epoch": 30.001283697047498, + "grad_norm": 0.8901658058166504, + "learning_rate": 2.333632862644416e-05, + "loss": 0.2937, + "step": 23371 + }, + { + "epoch": 30.002567394094992, + "grad_norm": 0.5640008449554443, + "learning_rate": 2.333590072742833e-05, + "loss": 0.2804, + "step": 23372 + }, + { + "epoch": 30.00385109114249, + "grad_norm": 1.0945172309875488, + "learning_rate": 2.3335472828412494e-05, + "loss": 0.268, + "step": 23373 + }, + { + "epoch": 30.005134788189988, + "grad_norm": 0.8603835701942444, + "learning_rate": 2.3335044929396662e-05, + "loss": 0.3024, + "step": 23374 + }, + { + "epoch": 30.006418485237482, + "grad_norm": 0.6966275572776794, + "learning_rate": 2.333461703038083e-05, + "loss": 0.2774, + "step": 23375 + }, + { + "epoch": 30.00770218228498, + "grad_norm": 1.2506870031356812, + "learning_rate": 2.3334189131365e-05, + "loss": 
0.2901, + "step": 23376 + }, + { + "epoch": 30.008985879332478, + "grad_norm": 1.4517238140106201, + "learning_rate": 2.3333761232349167e-05, + "loss": 0.2929, + "step": 23377 + }, + { + "epoch": 30.010269576379976, + "grad_norm": 1.797316551208496, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.2499, + "step": 23378 + }, + { + "epoch": 30.01155327342747, + "grad_norm": 1.50209641456604, + "learning_rate": 2.33329054343175e-05, + "loss": 0.2556, + "step": 23379 + }, + { + "epoch": 30.012836970474968, + "grad_norm": 1.4079558849334717, + "learning_rate": 2.333247753530167e-05, + "loss": 0.3034, + "step": 23380 + }, + { + "epoch": 30.014120667522466, + "grad_norm": 1.003326654434204, + "learning_rate": 2.3332049636285838e-05, + "loss": 0.2837, + "step": 23381 + }, + { + "epoch": 30.01540436456996, + "grad_norm": 1.089290738105774, + "learning_rate": 2.3331621737270003e-05, + "loss": 0.2905, + "step": 23382 + }, + { + "epoch": 30.016688061617458, + "grad_norm": 1.0605034828186035, + "learning_rate": 2.3331193838254174e-05, + "loss": 0.3137, + "step": 23383 + }, + { + "epoch": 30.017971758664956, + "grad_norm": 0.9377774000167847, + "learning_rate": 2.333076593923834e-05, + "loss": 0.2644, + "step": 23384 + }, + { + "epoch": 30.01925545571245, + "grad_norm": 0.837197482585907, + "learning_rate": 2.333033804022251e-05, + "loss": 0.2896, + "step": 23385 + }, + { + "epoch": 30.020539152759948, + "grad_norm": 0.9844673871994019, + "learning_rate": 2.3329910141206676e-05, + "loss": 0.268, + "step": 23386 + }, + { + "epoch": 30.021822849807446, + "grad_norm": 0.8857032656669617, + "learning_rate": 2.332948224219084e-05, + "loss": 0.2645, + "step": 23387 + }, + { + "epoch": 30.023106546854944, + "grad_norm": 2.644896984100342, + "learning_rate": 2.3329054343175013e-05, + "loss": 0.2647, + "step": 23388 + }, + { + "epoch": 30.024390243902438, + "grad_norm": 0.7503523826599121, + "learning_rate": 2.3328626444159178e-05, + "loss": 0.2684, + "step": 23389 + }, + { + 
"epoch": 30.025673940949936, + "grad_norm": 1.0754379034042358, + "learning_rate": 2.3328198545143347e-05, + "loss": 0.2728, + "step": 23390 + }, + { + "epoch": 30.026957637997434, + "grad_norm": 1.403730869293213, + "learning_rate": 2.3327770646127515e-05, + "loss": 0.2822, + "step": 23391 + }, + { + "epoch": 30.028241335044928, + "grad_norm": 2.315969944000244, + "learning_rate": 2.3327342747111683e-05, + "loss": 0.2946, + "step": 23392 + }, + { + "epoch": 30.029525032092426, + "grad_norm": 1.5643192529678345, + "learning_rate": 2.3326914848095852e-05, + "loss": 0.3259, + "step": 23393 + }, + { + "epoch": 30.030808729139924, + "grad_norm": 0.8423987030982971, + "learning_rate": 2.3326486949080017e-05, + "loss": 0.2772, + "step": 23394 + }, + { + "epoch": 30.03209242618742, + "grad_norm": 2.658200740814209, + "learning_rate": 2.3326059050064185e-05, + "loss": 0.271, + "step": 23395 + }, + { + "epoch": 30.033376123234916, + "grad_norm": 0.9784629344940186, + "learning_rate": 2.3325631151048354e-05, + "loss": 0.2861, + "step": 23396 + }, + { + "epoch": 30.034659820282414, + "grad_norm": 3.827990770339966, + "learning_rate": 2.3325203252032522e-05, + "loss": 0.2873, + "step": 23397 + }, + { + "epoch": 30.035943517329912, + "grad_norm": 1.2836955785751343, + "learning_rate": 2.3324775353016687e-05, + "loss": 0.2502, + "step": 23398 + }, + { + "epoch": 30.037227214377406, + "grad_norm": 1.7194023132324219, + "learning_rate": 2.332434745400086e-05, + "loss": 0.2649, + "step": 23399 + }, + { + "epoch": 30.038510911424904, + "grad_norm": 2.1680169105529785, + "learning_rate": 2.3323919554985024e-05, + "loss": 0.2801, + "step": 23400 + }, + { + "epoch": 30.039794608472402, + "grad_norm": 1.228145718574524, + "learning_rate": 2.332349165596919e-05, + "loss": 0.2967, + "step": 23401 + }, + { + "epoch": 30.041078305519896, + "grad_norm": 1.7630985975265503, + "learning_rate": 2.332306375695336e-05, + "loss": 0.2854, + "step": 23402 + }, + { + "epoch": 30.042362002567394, + 
"grad_norm": 1.607376217842102, + "learning_rate": 2.3322635857937526e-05, + "loss": 0.2681, + "step": 23403 + }, + { + "epoch": 30.043645699614892, + "grad_norm": 0.9138506650924683, + "learning_rate": 2.3322207958921698e-05, + "loss": 0.2729, + "step": 23404 + }, + { + "epoch": 30.044929396662386, + "grad_norm": 1.7009060382843018, + "learning_rate": 2.3321780059905863e-05, + "loss": 0.2782, + "step": 23405 + }, + { + "epoch": 30.046213093709884, + "grad_norm": 1.4110817909240723, + "learning_rate": 2.3321352160890028e-05, + "loss": 0.2754, + "step": 23406 + }, + { + "epoch": 30.047496790757382, + "grad_norm": 1.5954110622406006, + "learning_rate": 2.33209242618742e-05, + "loss": 0.2793, + "step": 23407 + }, + { + "epoch": 30.048780487804876, + "grad_norm": 1.2931767702102661, + "learning_rate": 2.3320496362858364e-05, + "loss": 0.3028, + "step": 23408 + }, + { + "epoch": 30.050064184852374, + "grad_norm": 1.4032326936721802, + "learning_rate": 2.3320068463842533e-05, + "loss": 0.2687, + "step": 23409 + }, + { + "epoch": 30.051347881899872, + "grad_norm": 2.735941171646118, + "learning_rate": 2.33196405648267e-05, + "loss": 0.2808, + "step": 23410 + }, + { + "epoch": 30.05263157894737, + "grad_norm": 2.2115423679351807, + "learning_rate": 2.331921266581087e-05, + "loss": 0.2857, + "step": 23411 + }, + { + "epoch": 30.053915275994864, + "grad_norm": null, + "learning_rate": 2.331921266581087e-05, + "loss": 0.3137, + "step": 23412 + }, + { + "epoch": 30.055198973042362, + "grad_norm": 2.316399574279785, + "learning_rate": 2.3318784766795038e-05, + "loss": 0.3243, + "step": 23413 + }, + { + "epoch": 30.05648267008986, + "grad_norm": 1.6000425815582275, + "learning_rate": 2.3318356867779203e-05, + "loss": 0.2873, + "step": 23414 + }, + { + "epoch": 30.057766367137354, + "grad_norm": 1.7377067804336548, + "learning_rate": 2.331792896876337e-05, + "loss": 0.2848, + "step": 23415 + }, + { + "epoch": 30.059050064184852, + "grad_norm": 1.6984915733337402, +
"learning_rate": 2.331750106974754e-05, + "loss": 0.3098, + "step": 23416 + }, + { + "epoch": 30.06033376123235, + "grad_norm": 2.144512891769409, + "learning_rate": 2.331707317073171e-05, + "loss": 0.3133, + "step": 23417 + }, + { + "epoch": 30.061617458279844, + "grad_norm": 1.8874834775924683, + "learning_rate": 2.3316645271715873e-05, + "loss": 0.3459, + "step": 23418 + }, + { + "epoch": 30.062901155327342, + "grad_norm": 1.8396916389465332, + "learning_rate": 2.3316217372700045e-05, + "loss": 0.3651, + "step": 23419 + }, + { + "epoch": 30.06418485237484, + "grad_norm": 4.195216178894043, + "learning_rate": 2.331578947368421e-05, + "loss": 0.496, + "step": 23420 + }, + { + "epoch": 30.065468549422338, + "grad_norm": 0.9168094396591187, + "learning_rate": 2.331536157466838e-05, + "loss": 0.2737, + "step": 23421 + }, + { + "epoch": 30.066752246469832, + "grad_norm": 1.1689908504486084, + "learning_rate": 2.3314933675652547e-05, + "loss": 0.2573, + "step": 23422 + }, + { + "epoch": 30.06803594351733, + "grad_norm": 0.893302857875824, + "learning_rate": 2.3314505776636712e-05, + "loss": 0.2791, + "step": 23423 + }, + { + "epoch": 30.069319640564828, + "grad_norm": 0.829454243183136, + "learning_rate": 2.3314077877620884e-05, + "loss": 0.2647, + "step": 23424 + }, + { + "epoch": 30.070603337612322, + "grad_norm": 1.035879373550415, + "learning_rate": 2.331364997860505e-05, + "loss": 0.2755, + "step": 23425 + }, + { + "epoch": 30.07188703465982, + "grad_norm": 0.6640985608100891, + "learning_rate": 2.3313222079589217e-05, + "loss": 0.2554, + "step": 23426 + }, + { + "epoch": 30.073170731707318, + "grad_norm": 0.8602825999259949, + "learning_rate": 2.3312794180573386e-05, + "loss": 0.2764, + "step": 23427 + }, + { + "epoch": 30.074454428754812, + "grad_norm": 1.0254254341125488, + "learning_rate": 2.331236628155755e-05, + "loss": 0.2818, + "step": 23428 + }, + { + "epoch": 30.07573812580231, + "grad_norm": 0.9814500212669373, + "learning_rate": 2.3311938382541723e-05, 
+ "loss": 0.2847, + "step": 23429 + }, + { + "epoch": 30.077021822849808, + "grad_norm": 1.6927266120910645, + "learning_rate": 2.3311510483525888e-05, + "loss": 0.2633, + "step": 23430 + }, + { + "epoch": 30.078305519897306, + "grad_norm": 2.4076268672943115, + "learning_rate": 2.3311082584510056e-05, + "loss": 0.2364, + "step": 23431 + }, + { + "epoch": 30.0795892169448, + "grad_norm": 0.7441576719284058, + "learning_rate": 2.3310654685494224e-05, + "loss": 0.2771, + "step": 23432 + }, + { + "epoch": 30.080872913992298, + "grad_norm": 1.4183927774429321, + "learning_rate": 2.3310226786478393e-05, + "loss": 0.2843, + "step": 23433 + }, + { + "epoch": 30.082156611039796, + "grad_norm": 2.1700000762939453, + "learning_rate": 2.3309798887462558e-05, + "loss": 0.2384, + "step": 23434 + }, + { + "epoch": 30.08344030808729, + "grad_norm": 1.7625564336776733, + "learning_rate": 2.3309370988446726e-05, + "loss": 0.2506, + "step": 23435 + }, + { + "epoch": 30.084724005134788, + "grad_norm": 0.9372240304946899, + "learning_rate": 2.3308943089430895e-05, + "loss": 0.3051, + "step": 23436 + }, + { + "epoch": 30.086007702182286, + "grad_norm": 1.4949666261672974, + "learning_rate": 2.3308515190415063e-05, + "loss": 0.253, + "step": 23437 + }, + { + "epoch": 30.08729139922978, + "grad_norm": 1.8102210760116577, + "learning_rate": 2.330808729139923e-05, + "loss": 0.2486, + "step": 23438 + }, + { + "epoch": 30.088575096277278, + "grad_norm": 0.9074148535728455, + "learning_rate": 2.3307659392383396e-05, + "loss": 0.2951, + "step": 23439 + }, + { + "epoch": 30.089858793324776, + "grad_norm": 1.2110475301742554, + "learning_rate": 2.3307231493367568e-05, + "loss": 0.2987, + "step": 23440 + }, + { + "epoch": 30.09114249037227, + "grad_norm": 0.9299782514572144, + "learning_rate": 2.3306803594351733e-05, + "loss": 0.2813, + "step": 23441 + }, + { + "epoch": 30.09242618741977, + "grad_norm": 0.9552693963050842, + "learning_rate": 2.33063756953359e-05, + "loss": 0.277, + "step": 23442 
+ }, + { + "epoch": 30.093709884467266, + "grad_norm": 0.98820561170578, + "learning_rate": 2.330594779632007e-05, + "loss": 0.2739, + "step": 23443 + }, + { + "epoch": 30.094993581514764, + "grad_norm": 0.9569026827812195, + "learning_rate": 2.3305519897304235e-05, + "loss": 0.2446, + "step": 23444 + }, + { + "epoch": 30.09627727856226, + "grad_norm": 0.7894458174705505, + "learning_rate": 2.3305091998288407e-05, + "loss": 0.2765, + "step": 23445 + }, + { + "epoch": 30.097560975609756, + "grad_norm": 0.9241060614585876, + "learning_rate": 2.3304664099272572e-05, + "loss": 0.2635, + "step": 23446 + }, + { + "epoch": 30.098844672657254, + "grad_norm": 0.874125599861145, + "learning_rate": 2.330423620025674e-05, + "loss": 0.2942, + "step": 23447 + }, + { + "epoch": 30.10012836970475, + "grad_norm": 1.6274718046188354, + "learning_rate": 2.330380830124091e-05, + "loss": 0.2637, + "step": 23448 + }, + { + "epoch": 30.101412066752246, + "grad_norm": 2.381925344467163, + "learning_rate": 2.3303380402225074e-05, + "loss": 0.3046, + "step": 23449 + }, + { + "epoch": 30.102695763799744, + "grad_norm": 1.0920346975326538, + "learning_rate": 2.3302952503209242e-05, + "loss": 0.2498, + "step": 23450 + }, + { + "epoch": 30.10397946084724, + "grad_norm": 1.3970366716384888, + "learning_rate": 2.330252460419341e-05, + "loss": 0.2936, + "step": 23451 + }, + { + "epoch": 30.105263157894736, + "grad_norm": 1.0820789337158203, + "learning_rate": 2.330209670517758e-05, + "loss": 0.2575, + "step": 23452 + }, + { + "epoch": 30.106546854942234, + "grad_norm": 2.09926176071167, + "learning_rate": 2.3301668806161747e-05, + "loss": 0.277, + "step": 23453 + }, + { + "epoch": 30.107830551989732, + "grad_norm": 1.3500362634658813, + "learning_rate": 2.3301240907145916e-05, + "loss": 0.2448, + "step": 23454 + }, + { + "epoch": 30.109114249037226, + "grad_norm": 1.3467270135879517, + "learning_rate": 2.330081300813008e-05, + "loss": 0.2645, + "step": 23455 + }, + { + "epoch": 30.110397946084724, 
+ "grad_norm": 0.9776456356048584, + "learning_rate": 2.330038510911425e-05, + "loss": 0.2965, + "step": 23456 + }, + { + "epoch": 30.111681643132222, + "grad_norm": 1.413366675376892, + "learning_rate": 2.3299957210098418e-05, + "loss": 0.2608, + "step": 23457 + }, + { + "epoch": 30.112965340179716, + "grad_norm": 1.1414510011672974, + "learning_rate": 2.3299529311082583e-05, + "loss": 0.3056, + "step": 23458 + }, + { + "epoch": 30.114249037227214, + "grad_norm": 1.3303356170654297, + "learning_rate": 2.3299101412066755e-05, + "loss": 0.2729, + "step": 23459 + }, + { + "epoch": 30.115532734274712, + "grad_norm": 1.5761014223098755, + "learning_rate": 2.329867351305092e-05, + "loss": 0.2731, + "step": 23460 + }, + { + "epoch": 30.116816431322206, + "grad_norm": 2.2228713035583496, + "learning_rate": 2.329824561403509e-05, + "loss": 0.2517, + "step": 23461 + }, + { + "epoch": 30.118100128369704, + "grad_norm": 3.300018072128296, + "learning_rate": 2.3297817715019256e-05, + "loss": 0.2877, + "step": 23462 + }, + { + "epoch": 30.119383825417202, + "grad_norm": 1.3858180046081543, + "learning_rate": 2.329738981600342e-05, + "loss": 0.3119, + "step": 23463 + }, + { + "epoch": 30.1206675224647, + "grad_norm": 1.5998998880386353, + "learning_rate": 2.3296961916987593e-05, + "loss": 0.29, + "step": 23464 + }, + { + "epoch": 30.121951219512194, + "grad_norm": 1.7556074857711792, + "learning_rate": 2.3296534017971758e-05, + "loss": 0.3233, + "step": 23465 + }, + { + "epoch": 30.123234916559692, + "grad_norm": 2.442430257797241, + "learning_rate": 2.3296106118955927e-05, + "loss": 0.3232, + "step": 23466 + }, + { + "epoch": 30.12451861360719, + "grad_norm": 1.123333215713501, + "learning_rate": 2.3295678219940095e-05, + "loss": 0.3055, + "step": 23467 + }, + { + "epoch": 30.125802310654684, + "grad_norm": 9.21809196472168, + "learning_rate": 2.329525032092426e-05, + "loss": 0.3266, + "step": 23468 + }, + { + "epoch": 30.127086007702182, + "grad_norm": 1.355927586555481, + 
"learning_rate": 2.3294822421908432e-05, + "loss": 0.3862, + "step": 23469 + }, + { + "epoch": 30.12836970474968, + "grad_norm": 3.2119972705841064, + "learning_rate": 2.3294394522892597e-05, + "loss": 0.4563, + "step": 23470 + }, + { + "epoch": 30.129653401797174, + "grad_norm": 1.24661123752594, + "learning_rate": 2.3293966623876765e-05, + "loss": 0.3043, + "step": 23471 + }, + { + "epoch": 30.130937098844672, + "grad_norm": 0.6643092632293701, + "learning_rate": 2.3293538724860934e-05, + "loss": 0.2867, + "step": 23472 + }, + { + "epoch": 30.13222079589217, + "grad_norm": 1.315665602684021, + "learning_rate": 2.3293110825845102e-05, + "loss": 0.2859, + "step": 23473 + }, + { + "epoch": 30.133504492939664, + "grad_norm": 1.4360874891281128, + "learning_rate": 2.3292682926829267e-05, + "loss": 0.3107, + "step": 23474 + }, + { + "epoch": 30.134788189987162, + "grad_norm": 1.0014824867248535, + "learning_rate": 2.3292255027813436e-05, + "loss": 0.3041, + "step": 23475 + }, + { + "epoch": 30.13607188703466, + "grad_norm": 0.8347403407096863, + "learning_rate": 2.3291827128797604e-05, + "loss": 0.2803, + "step": 23476 + }, + { + "epoch": 30.137355584082158, + "grad_norm": 0.7828074097633362, + "learning_rate": 2.3291399229781772e-05, + "loss": 0.2736, + "step": 23477 + }, + { + "epoch": 30.138639281129652, + "grad_norm": 1.1367138624191284, + "learning_rate": 2.329097133076594e-05, + "loss": 0.2938, + "step": 23478 + }, + { + "epoch": 30.13992297817715, + "grad_norm": 0.9500275254249573, + "learning_rate": 2.3290543431750106e-05, + "loss": 0.2976, + "step": 23479 + }, + { + "epoch": 30.141206675224648, + "grad_norm": 0.9933914542198181, + "learning_rate": 2.3290115532734278e-05, + "loss": 0.2893, + "step": 23480 + }, + { + "epoch": 30.142490372272142, + "grad_norm": 0.8969666957855225, + "learning_rate": 2.3289687633718443e-05, + "loss": 0.275, + "step": 23481 + }, + { + "epoch": 30.14377406931964, + "grad_norm": 1.3653028011322021, + "learning_rate": 
2.3289259734702608e-05, + "loss": 0.2824, + "step": 23482 + }, + { + "epoch": 30.145057766367138, + "grad_norm": 1.419648289680481, + "learning_rate": 2.328883183568678e-05, + "loss": 0.2423, + "step": 23483 + }, + { + "epoch": 30.146341463414632, + "grad_norm": 1.267340064048767, + "learning_rate": 2.3288403936670945e-05, + "loss": 0.258, + "step": 23484 + }, + { + "epoch": 30.14762516046213, + "grad_norm": 1.0717040300369263, + "learning_rate": 2.3287976037655116e-05, + "loss": 0.267, + "step": 23485 + }, + { + "epoch": 30.14890885750963, + "grad_norm": 1.146138310432434, + "learning_rate": 2.328754813863928e-05, + "loss": 0.2563, + "step": 23486 + }, + { + "epoch": 30.150192554557126, + "grad_norm": 1.449037790298462, + "learning_rate": 2.328712023962345e-05, + "loss": 0.3056, + "step": 23487 + }, + { + "epoch": 30.15147625160462, + "grad_norm": 1.4746943712234497, + "learning_rate": 2.3286692340607618e-05, + "loss": 0.2526, + "step": 23488 + }, + { + "epoch": 30.15275994865212, + "grad_norm": 6.0646772384643555, + "learning_rate": 2.3286264441591783e-05, + "loss": 0.2867, + "step": 23489 + }, + { + "epoch": 30.154043645699616, + "grad_norm": 1.1744219064712524, + "learning_rate": 2.328583654257595e-05, + "loss": 0.2859, + "step": 23490 + }, + { + "epoch": 30.15532734274711, + "grad_norm": 1.090391993522644, + "learning_rate": 2.328540864356012e-05, + "loss": 0.2385, + "step": 23491 + }, + { + "epoch": 30.15661103979461, + "grad_norm": 5.352470874786377, + "learning_rate": 2.328498074454429e-05, + "loss": 0.2942, + "step": 23492 + }, + { + "epoch": 30.157894736842106, + "grad_norm": 1.0360335111618042, + "learning_rate": 2.3284552845528457e-05, + "loss": 0.2718, + "step": 23493 + }, + { + "epoch": 30.1591784338896, + "grad_norm": 0.8920185565948486, + "learning_rate": 2.3284124946512625e-05, + "loss": 0.2594, + "step": 23494 + }, + { + "epoch": 30.1604621309371, + "grad_norm": 0.9191368818283081, + "learning_rate": 2.328369704749679e-05, + "loss": 0.2831, + 
"step": 23495 + }, + { + "epoch": 30.161745827984596, + "grad_norm": 2.5437779426574707, + "learning_rate": 2.328326914848096e-05, + "loss": 0.2695, + "step": 23496 + }, + { + "epoch": 30.163029525032094, + "grad_norm": 0.9053037166595459, + "learning_rate": 2.3282841249465127e-05, + "loss": 0.2856, + "step": 23497 + }, + { + "epoch": 30.16431322207959, + "grad_norm": 1.3617457151412964, + "learning_rate": 2.3282413350449292e-05, + "loss": 0.286, + "step": 23498 + }, + { + "epoch": 30.165596919127086, + "grad_norm": 1.1973059177398682, + "learning_rate": 2.3281985451433464e-05, + "loss": 0.2546, + "step": 23499 + }, + { + "epoch": 30.166880616174584, + "grad_norm": 1.862966775894165, + "learning_rate": 2.328155755241763e-05, + "loss": 0.2734, + "step": 23500 + }, + { + "epoch": 30.16816431322208, + "grad_norm": 1.236985683441162, + "learning_rate": 2.32811296534018e-05, + "loss": 0.2598, + "step": 23501 + }, + { + "epoch": 30.169448010269576, + "grad_norm": 1.0599677562713623, + "learning_rate": 2.3280701754385966e-05, + "loss": 0.2865, + "step": 23502 + }, + { + "epoch": 30.170731707317074, + "grad_norm": 1.8458220958709717, + "learning_rate": 2.328027385537013e-05, + "loss": 0.3323, + "step": 23503 + }, + { + "epoch": 30.17201540436457, + "grad_norm": 1.3335630893707275, + "learning_rate": 2.3279845956354303e-05, + "loss": 0.265, + "step": 23504 + }, + { + "epoch": 30.173299101412066, + "grad_norm": 1.0498218536376953, + "learning_rate": 2.3279418057338468e-05, + "loss": 0.2676, + "step": 23505 + }, + { + "epoch": 30.174582798459564, + "grad_norm": 1.158762812614441, + "learning_rate": 2.3278990158322636e-05, + "loss": 0.2696, + "step": 23506 + }, + { + "epoch": 30.17586649550706, + "grad_norm": 3.10868763923645, + "learning_rate": 2.3278562259306804e-05, + "loss": 0.3217, + "step": 23507 + }, + { + "epoch": 30.177150192554556, + "grad_norm": 1.307620882987976, + "learning_rate": 2.3278134360290973e-05, + "loss": 0.3054, + "step": 23508 + }, + { + "epoch": 
30.178433889602054, + "grad_norm": 1.6267307996749878, + "learning_rate": 2.327770646127514e-05, + "loss": 0.2645, + "step": 23509 + }, + { + "epoch": 30.179717586649552, + "grad_norm": 1.6583077907562256, + "learning_rate": 2.3277278562259306e-05, + "loss": 0.2863, + "step": 23510 + }, + { + "epoch": 30.181001283697046, + "grad_norm": 12.491923332214355, + "learning_rate": 2.3276850663243475e-05, + "loss": 0.2815, + "step": 23511 + }, + { + "epoch": 30.182284980744544, + "grad_norm": 3.5828964710235596, + "learning_rate": 2.3276422764227643e-05, + "loss": 0.288, + "step": 23512 + }, + { + "epoch": 30.183568677792042, + "grad_norm": 1.6245715618133545, + "learning_rate": 2.327599486521181e-05, + "loss": 0.3106, + "step": 23513 + }, + { + "epoch": 30.184852374839537, + "grad_norm": 2.789689779281616, + "learning_rate": 2.3275566966195977e-05, + "loss": 0.312, + "step": 23514 + }, + { + "epoch": 30.186136071887034, + "grad_norm": 13.724899291992188, + "learning_rate": 2.327513906718015e-05, + "loss": 0.3243, + "step": 23515 + }, + { + "epoch": 30.187419768934532, + "grad_norm": 1.8879165649414062, + "learning_rate": 2.3274711168164313e-05, + "loss": 0.3416, + "step": 23516 + }, + { + "epoch": 30.188703465982027, + "grad_norm": 5.298285961151123, + "learning_rate": 2.3274283269148482e-05, + "loss": 0.2981, + "step": 23517 + }, + { + "epoch": 30.189987163029524, + "grad_norm": 3.276736259460449, + "learning_rate": 2.327385537013265e-05, + "loss": 0.367, + "step": 23518 + }, + { + "epoch": 30.191270860077022, + "grad_norm": 3.404867649078369, + "learning_rate": 2.3273427471116815e-05, + "loss": 0.3523, + "step": 23519 + }, + { + "epoch": 30.19255455712452, + "grad_norm": 3.3142802715301514, + "learning_rate": 2.3272999572100987e-05, + "loss": 0.478, + "step": 23520 + }, + { + "epoch": 30.193838254172015, + "grad_norm": 2.16056489944458, + "learning_rate": 2.3272571673085152e-05, + "loss": 0.2983, + "step": 23521 + }, + { + "epoch": 30.195121951219512, + "grad_norm": 
0.8452741503715515, + "learning_rate": 2.327214377406932e-05, + "loss": 0.2915, + "step": 23522 + }, + { + "epoch": 30.19640564826701, + "grad_norm": 1.1240254640579224, + "learning_rate": 2.327171587505349e-05, + "loss": 0.293, + "step": 23523 + }, + { + "epoch": 30.197689345314505, + "grad_norm": 1.2719653844833374, + "learning_rate": 2.3271287976037654e-05, + "loss": 0.2993, + "step": 23524 + }, + { + "epoch": 30.198973042362002, + "grad_norm": 1.1153994798660278, + "learning_rate": 2.3270860077021826e-05, + "loss": 0.2812, + "step": 23525 + }, + { + "epoch": 30.2002567394095, + "grad_norm": 0.8130981922149658, + "learning_rate": 2.327043217800599e-05, + "loss": 0.2673, + "step": 23526 + }, + { + "epoch": 30.201540436456995, + "grad_norm": 0.9488996267318726, + "learning_rate": 2.327000427899016e-05, + "loss": 0.312, + "step": 23527 + }, + { + "epoch": 30.202824133504492, + "grad_norm": 1.0669796466827393, + "learning_rate": 2.3269576379974328e-05, + "loss": 0.2735, + "step": 23528 + }, + { + "epoch": 30.20410783055199, + "grad_norm": 1.3963032960891724, + "learning_rate": 2.3269148480958493e-05, + "loss": 0.294, + "step": 23529 + }, + { + "epoch": 30.205391527599488, + "grad_norm": 1.0382510423660278, + "learning_rate": 2.326872058194266e-05, + "loss": 0.2637, + "step": 23530 + }, + { + "epoch": 30.206675224646983, + "grad_norm": 1.2424137592315674, + "learning_rate": 2.326829268292683e-05, + "loss": 0.2705, + "step": 23531 + }, + { + "epoch": 30.20795892169448, + "grad_norm": 1.2808376550674438, + "learning_rate": 2.3267864783910998e-05, + "loss": 0.2702, + "step": 23532 + }, + { + "epoch": 30.20924261874198, + "grad_norm": 1.0070935487747192, + "learning_rate": 2.3267436884895166e-05, + "loss": 0.287, + "step": 23533 + }, + { + "epoch": 30.210526315789473, + "grad_norm": 1.3946306705474854, + "learning_rate": 2.3267008985879335e-05, + "loss": 0.3107, + "step": 23534 + }, + { + "epoch": 30.21181001283697, + "grad_norm": 0.7945297956466675, + "learning_rate": 
2.32665810868635e-05, + "loss": 0.2919, + "step": 23535 + }, + { + "epoch": 30.21309370988447, + "grad_norm": 2.637622833251953, + "learning_rate": 2.3266153187847668e-05, + "loss": 0.2385, + "step": 23536 + }, + { + "epoch": 30.214377406931963, + "grad_norm": 1.2084091901779175, + "learning_rate": 2.3265725288831836e-05, + "loss": 0.2818, + "step": 23537 + }, + { + "epoch": 30.21566110397946, + "grad_norm": 0.9297283291816711, + "learning_rate": 2.3265297389816e-05, + "loss": 0.2553, + "step": 23538 + }, + { + "epoch": 30.21694480102696, + "grad_norm": 1.1878914833068848, + "learning_rate": 2.3264869490800173e-05, + "loss": 0.2808, + "step": 23539 + }, + { + "epoch": 30.218228498074453, + "grad_norm": 7.649975299835205, + "learning_rate": 2.3264441591784338e-05, + "loss": 0.2425, + "step": 23540 + }, + { + "epoch": 30.21951219512195, + "grad_norm": 1.6687359809875488, + "learning_rate": 2.326401369276851e-05, + "loss": 0.2652, + "step": 23541 + }, + { + "epoch": 30.22079589216945, + "grad_norm": 1.0509660243988037, + "learning_rate": 2.3263585793752675e-05, + "loss": 0.2662, + "step": 23542 + }, + { + "epoch": 30.222079589216946, + "grad_norm": 1.0657504796981812, + "learning_rate": 2.326315789473684e-05, + "loss": 0.2493, + "step": 23543 + }, + { + "epoch": 30.22336328626444, + "grad_norm": 0.9837929010391235, + "learning_rate": 2.3262729995721012e-05, + "loss": 0.2661, + "step": 23544 + }, + { + "epoch": 30.22464698331194, + "grad_norm": 1.3871917724609375, + "learning_rate": 2.3262302096705177e-05, + "loss": 0.2665, + "step": 23545 + }, + { + "epoch": 30.225930680359436, + "grad_norm": 1.2112754583358765, + "learning_rate": 2.3261874197689345e-05, + "loss": 0.2694, + "step": 23546 + }, + { + "epoch": 30.22721437740693, + "grad_norm": 1.508649468421936, + "learning_rate": 2.3261446298673514e-05, + "loss": 0.2477, + "step": 23547 + }, + { + "epoch": 30.22849807445443, + "grad_norm": 1.0106343030929565, + "learning_rate": 2.3261018399657682e-05, + "loss": 0.2692, 
+ "step": 23548 + }, + { + "epoch": 30.229781771501926, + "grad_norm": 1.4143743515014648, + "learning_rate": 2.326059050064185e-05, + "loss": 0.2609, + "step": 23549 + }, + { + "epoch": 30.23106546854942, + "grad_norm": 1.510716199874878, + "learning_rate": 2.3260162601626016e-05, + "loss": 0.2548, + "step": 23550 + }, + { + "epoch": 30.23234916559692, + "grad_norm": 1.6838430166244507, + "learning_rate": 2.3259734702610184e-05, + "loss": 0.2719, + "step": 23551 + }, + { + "epoch": 30.233632862644416, + "grad_norm": 1.2738498449325562, + "learning_rate": 2.3259306803594352e-05, + "loss": 0.2649, + "step": 23552 + }, + { + "epoch": 30.234916559691914, + "grad_norm": 1.2063876390457153, + "learning_rate": 2.325887890457852e-05, + "loss": 0.3053, + "step": 23553 + }, + { + "epoch": 30.23620025673941, + "grad_norm": 1.1354001760482788, + "learning_rate": 2.3258451005562686e-05, + "loss": 0.247, + "step": 23554 + }, + { + "epoch": 30.237483953786906, + "grad_norm": 1.0719008445739746, + "learning_rate": 2.3258023106546858e-05, + "loss": 0.2902, + "step": 23555 + }, + { + "epoch": 30.238767650834404, + "grad_norm": 0.9675849080085754, + "learning_rate": 2.3257595207531023e-05, + "loss": 0.2434, + "step": 23556 + }, + { + "epoch": 30.2400513478819, + "grad_norm": 1.7526912689208984, + "learning_rate": 2.325716730851519e-05, + "loss": 0.3061, + "step": 23557 + }, + { + "epoch": 30.241335044929397, + "grad_norm": 1.2300750017166138, + "learning_rate": 2.325673940949936e-05, + "loss": 0.2591, + "step": 23558 + }, + { + "epoch": 30.242618741976894, + "grad_norm": 1.302398681640625, + "learning_rate": 2.3256311510483525e-05, + "loss": 0.2875, + "step": 23559 + }, + { + "epoch": 30.24390243902439, + "grad_norm": 3.0027668476104736, + "learning_rate": 2.3255883611467696e-05, + "loss": 0.2572, + "step": 23560 + }, + { + "epoch": 30.245186136071887, + "grad_norm": 1.8064664602279663, + "learning_rate": 2.325545571245186e-05, + "loss": 0.2576, + "step": 23561 + }, + { + "epoch": 
30.246469833119384, + "grad_norm": 1.1541861295700073, + "learning_rate": 2.325502781343603e-05, + "loss": 0.2839, + "step": 23562 + }, + { + "epoch": 30.247753530166882, + "grad_norm": 1.1440386772155762, + "learning_rate": 2.3254599914420198e-05, + "loss": 0.2683, + "step": 23563 + }, + { + "epoch": 30.249037227214377, + "grad_norm": 1.3812284469604492, + "learning_rate": 2.3254172015404363e-05, + "loss": 0.2846, + "step": 23564 + }, + { + "epoch": 30.250320924261874, + "grad_norm": 1.8603355884552002, + "learning_rate": 2.3253744116388535e-05, + "loss": 0.3135, + "step": 23565 + }, + { + "epoch": 30.251604621309372, + "grad_norm": 1.159193515777588, + "learning_rate": 2.32533162173727e-05, + "loss": 0.3304, + "step": 23566 + }, + { + "epoch": 30.252888318356867, + "grad_norm": 1.9383130073547363, + "learning_rate": 2.325288831835687e-05, + "loss": 0.3276, + "step": 23567 + }, + { + "epoch": 30.254172015404365, + "grad_norm": 2.1022391319274902, + "learning_rate": 2.3252460419341037e-05, + "loss": 0.3476, + "step": 23568 + }, + { + "epoch": 30.255455712451862, + "grad_norm": 1.4421592950820923, + "learning_rate": 2.3252032520325205e-05, + "loss": 0.4036, + "step": 23569 + }, + { + "epoch": 30.256739409499357, + "grad_norm": 1.8175170421600342, + "learning_rate": 2.325160462130937e-05, + "loss": 0.4022, + "step": 23570 + }, + { + "epoch": 30.258023106546855, + "grad_norm": 1.3175479173660278, + "learning_rate": 2.325117672229354e-05, + "loss": 0.2527, + "step": 23571 + }, + { + "epoch": 30.259306803594352, + "grad_norm": 1.420444369316101, + "learning_rate": 2.3250748823277707e-05, + "loss": 0.2944, + "step": 23572 + }, + { + "epoch": 30.260590500641847, + "grad_norm": 1.3606377840042114, + "learning_rate": 2.3250320924261876e-05, + "loss": 0.2814, + "step": 23573 + }, + { + "epoch": 30.261874197689345, + "grad_norm": 1.016093134880066, + "learning_rate": 2.3249893025246044e-05, + "loss": 0.2662, + "step": 23574 + }, + { + "epoch": 30.263157894736842, + 
"grad_norm": 1.0759234428405762, + "learning_rate": 2.324946512623021e-05, + "loss": 0.288, + "step": 23575 + }, + { + "epoch": 30.26444159178434, + "grad_norm": 0.7769531607627869, + "learning_rate": 2.324903722721438e-05, + "loss": 0.2621, + "step": 23576 + }, + { + "epoch": 30.265725288831835, + "grad_norm": 0.9211670756340027, + "learning_rate": 2.3248609328198546e-05, + "loss": 0.2716, + "step": 23577 + }, + { + "epoch": 30.267008985879333, + "grad_norm": 2.664754867553711, + "learning_rate": 2.324818142918271e-05, + "loss": 0.2553, + "step": 23578 + }, + { + "epoch": 30.26829268292683, + "grad_norm": 1.7108091115951538, + "learning_rate": 2.3247753530166883e-05, + "loss": 0.2805, + "step": 23579 + }, + { + "epoch": 30.269576379974325, + "grad_norm": 0.9969615340232849, + "learning_rate": 2.3247325631151048e-05, + "loss": 0.2925, + "step": 23580 + }, + { + "epoch": 30.270860077021823, + "grad_norm": 3.328977108001709, + "learning_rate": 2.324689773213522e-05, + "loss": 0.2602, + "step": 23581 + }, + { + "epoch": 30.27214377406932, + "grad_norm": 0.9372472167015076, + "learning_rate": 2.3246469833119384e-05, + "loss": 0.2878, + "step": 23582 + }, + { + "epoch": 30.273427471116815, + "grad_norm": 2.413886070251465, + "learning_rate": 2.3246041934103553e-05, + "loss": 0.3094, + "step": 23583 + }, + { + "epoch": 30.274711168164313, + "grad_norm": 0.9974322319030762, + "learning_rate": 2.324561403508772e-05, + "loss": 0.2841, + "step": 23584 + }, + { + "epoch": 30.27599486521181, + "grad_norm": 0.8016929030418396, + "learning_rate": 2.3245186136071886e-05, + "loss": 0.2872, + "step": 23585 + }, + { + "epoch": 30.27727856225931, + "grad_norm": 0.7739986181259155, + "learning_rate": 2.3244758237056055e-05, + "loss": 0.2737, + "step": 23586 + }, + { + "epoch": 30.278562259306803, + "grad_norm": 1.0010249614715576, + "learning_rate": 2.3244330338040223e-05, + "loss": 0.2895, + "step": 23587 + }, + { + "epoch": 30.2798459563543, + "grad_norm": 1.8894882202148438, + 
"learning_rate": 2.324390243902439e-05, + "loss": 0.2814, + "step": 23588 + }, + { + "epoch": 30.2811296534018, + "grad_norm": 1.6357067823410034, + "learning_rate": 2.324347454000856e-05, + "loss": 0.2768, + "step": 23589 + }, + { + "epoch": 30.282413350449293, + "grad_norm": 1.19562566280365, + "learning_rate": 2.324304664099273e-05, + "loss": 0.2879, + "step": 23590 + }, + { + "epoch": 30.28369704749679, + "grad_norm": 2.50925350189209, + "learning_rate": 2.3242618741976893e-05, + "loss": 0.2561, + "step": 23591 + }, + { + "epoch": 30.28498074454429, + "grad_norm": 3.1723897457122803, + "learning_rate": 2.3242190842961062e-05, + "loss": 0.263, + "step": 23592 + }, + { + "epoch": 30.286264441591783, + "grad_norm": 0.8940516114234924, + "learning_rate": 2.324176294394523e-05, + "loss": 0.2601, + "step": 23593 + }, + { + "epoch": 30.28754813863928, + "grad_norm": 1.024609088897705, + "learning_rate": 2.3241335044929395e-05, + "loss": 0.2721, + "step": 23594 + }, + { + "epoch": 30.28883183568678, + "grad_norm": 1.0893909931182861, + "learning_rate": 2.3240907145913567e-05, + "loss": 0.2606, + "step": 23595 + }, + { + "epoch": 30.290115532734276, + "grad_norm": 1.1947511434555054, + "learning_rate": 2.3240479246897732e-05, + "loss": 0.28, + "step": 23596 + }, + { + "epoch": 30.29139922978177, + "grad_norm": 1.137686014175415, + "learning_rate": 2.32400513478819e-05, + "loss": 0.2726, + "step": 23597 + }, + { + "epoch": 30.29268292682927, + "grad_norm": 1.6407665014266968, + "learning_rate": 2.323962344886607e-05, + "loss": 0.2707, + "step": 23598 + }, + { + "epoch": 30.293966623876766, + "grad_norm": 1.0702440738677979, + "learning_rate": 2.3239195549850234e-05, + "loss": 0.2498, + "step": 23599 + }, + { + "epoch": 30.29525032092426, + "grad_norm": 1.0752623081207275, + "learning_rate": 2.3238767650834406e-05, + "loss": 0.257, + "step": 23600 + }, + { + "epoch": 30.29653401797176, + "grad_norm": 0.8166197538375854, + "learning_rate": 2.323833975181857e-05, + "loss": 
0.3003, + "step": 23601 + }, + { + "epoch": 30.297817715019256, + "grad_norm": 1.2173534631729126, + "learning_rate": 2.323791185280274e-05, + "loss": 0.3033, + "step": 23602 + }, + { + "epoch": 30.29910141206675, + "grad_norm": 0.9798933863639832, + "learning_rate": 2.3237483953786908e-05, + "loss": 0.2749, + "step": 23603 + }, + { + "epoch": 30.30038510911425, + "grad_norm": 1.4695152044296265, + "learning_rate": 2.3237056054771073e-05, + "loss": 0.2809, + "step": 23604 + }, + { + "epoch": 30.301668806161747, + "grad_norm": 1.3777166604995728, + "learning_rate": 2.323662815575524e-05, + "loss": 0.3176, + "step": 23605 + }, + { + "epoch": 30.30295250320924, + "grad_norm": 1.1284596920013428, + "learning_rate": 2.323620025673941e-05, + "loss": 0.283, + "step": 23606 + }, + { + "epoch": 30.30423620025674, + "grad_norm": 2.7028284072875977, + "learning_rate": 2.3235772357723578e-05, + "loss": 0.3045, + "step": 23607 + }, + { + "epoch": 30.305519897304237, + "grad_norm": 1.4995739459991455, + "learning_rate": 2.3235344458707746e-05, + "loss": 0.327, + "step": 23608 + }, + { + "epoch": 30.306803594351734, + "grad_norm": 1.1612567901611328, + "learning_rate": 2.3234916559691915e-05, + "loss": 0.2788, + "step": 23609 + }, + { + "epoch": 30.30808729139923, + "grad_norm": 1.2269738912582397, + "learning_rate": 2.323448866067608e-05, + "loss": 0.276, + "step": 23610 + }, + { + "epoch": 30.309370988446727, + "grad_norm": 1.277748465538025, + "learning_rate": 2.3234060761660248e-05, + "loss": 0.2431, + "step": 23611 + }, + { + "epoch": 30.310654685494224, + "grad_norm": 2.9757254123687744, + "learning_rate": 2.3233632862644417e-05, + "loss": 0.343, + "step": 23612 + }, + { + "epoch": 30.31193838254172, + "grad_norm": 1.5968607664108276, + "learning_rate": 2.323320496362858e-05, + "loss": 0.3115, + "step": 23613 + }, + { + "epoch": 30.313222079589217, + "grad_norm": 4.2697038650512695, + "learning_rate": 2.3232777064612753e-05, + "loss": 0.2894, + "step": 23614 + }, + { + 
"epoch": 30.314505776636715, + "grad_norm": 1.2577219009399414, + "learning_rate": 2.323234916559692e-05, + "loss": 0.3146, + "step": 23615 + }, + { + "epoch": 30.31578947368421, + "grad_norm": 1.153220772743225, + "learning_rate": 2.323192126658109e-05, + "loss": 0.3118, + "step": 23616 + }, + { + "epoch": 30.317073170731707, + "grad_norm": 1.7008564472198486, + "learning_rate": 2.3231493367565255e-05, + "loss": 0.3316, + "step": 23617 + }, + { + "epoch": 30.318356867779205, + "grad_norm": 1.2030247449874878, + "learning_rate": 2.323106546854942e-05, + "loss": 0.3352, + "step": 23618 + }, + { + "epoch": 30.319640564826702, + "grad_norm": 1.8711698055267334, + "learning_rate": 2.3230637569533592e-05, + "loss": 0.3958, + "step": 23619 + }, + { + "epoch": 30.320924261874197, + "grad_norm": 1.8208836317062378, + "learning_rate": 2.3230209670517757e-05, + "loss": 0.4586, + "step": 23620 + }, + { + "epoch": 30.322207958921695, + "grad_norm": 0.6090402603149414, + "learning_rate": 2.3229781771501925e-05, + "loss": 0.2644, + "step": 23621 + }, + { + "epoch": 30.323491655969192, + "grad_norm": 0.5646403431892395, + "learning_rate": 2.3229353872486094e-05, + "loss": 0.2659, + "step": 23622 + }, + { + "epoch": 30.324775353016687, + "grad_norm": 1.2550069093704224, + "learning_rate": 2.3228925973470262e-05, + "loss": 0.2793, + "step": 23623 + }, + { + "epoch": 30.326059050064185, + "grad_norm": 2.8401920795440674, + "learning_rate": 2.322849807445443e-05, + "loss": 0.2749, + "step": 23624 + }, + { + "epoch": 30.327342747111683, + "grad_norm": 0.9711334705352783, + "learning_rate": 2.3228070175438596e-05, + "loss": 0.2715, + "step": 23625 + }, + { + "epoch": 30.328626444159177, + "grad_norm": 0.9144242405891418, + "learning_rate": 2.3227642276422764e-05, + "loss": 0.2853, + "step": 23626 + }, + { + "epoch": 30.329910141206675, + "grad_norm": 0.9009007215499878, + "learning_rate": 2.3227214377406933e-05, + "loss": 0.3027, + "step": 23627 + }, + { + "epoch": 30.331193838254173, 
+ "grad_norm": 0.7012345194816589, + "learning_rate": 2.32267864783911e-05, + "loss": 0.2432, + "step": 23628 + }, + { + "epoch": 30.33247753530167, + "grad_norm": 1.1519505977630615, + "learning_rate": 2.3226358579375266e-05, + "loss": 0.2998, + "step": 23629 + }, + { + "epoch": 30.333761232349165, + "grad_norm": 0.7228506207466125, + "learning_rate": 2.3225930680359438e-05, + "loss": 0.2874, + "step": 23630 + }, + { + "epoch": 30.335044929396663, + "grad_norm": 0.8932589292526245, + "learning_rate": 2.3225502781343603e-05, + "loss": 0.2637, + "step": 23631 + }, + { + "epoch": 30.33632862644416, + "grad_norm": 0.9383067488670349, + "learning_rate": 2.322507488232777e-05, + "loss": 0.2955, + "step": 23632 + }, + { + "epoch": 30.337612323491655, + "grad_norm": 6.033341884613037, + "learning_rate": 2.322464698331194e-05, + "loss": 0.2663, + "step": 23633 + }, + { + "epoch": 30.338896020539153, + "grad_norm": 4.114242076873779, + "learning_rate": 2.3224219084296105e-05, + "loss": 0.2808, + "step": 23634 + }, + { + "epoch": 30.34017971758665, + "grad_norm": 1.230980634689331, + "learning_rate": 2.3223791185280276e-05, + "loss": 0.2717, + "step": 23635 + }, + { + "epoch": 30.341463414634145, + "grad_norm": 1.3516846895217896, + "learning_rate": 2.322336328626444e-05, + "loss": 0.2795, + "step": 23636 + }, + { + "epoch": 30.342747111681643, + "grad_norm": 0.7553708553314209, + "learning_rate": 2.322293538724861e-05, + "loss": 0.2933, + "step": 23637 + }, + { + "epoch": 30.34403080872914, + "grad_norm": 1.25565505027771, + "learning_rate": 2.3222507488232778e-05, + "loss": 0.2811, + "step": 23638 + }, + { + "epoch": 30.345314505776635, + "grad_norm": 0.8983187079429626, + "learning_rate": 2.3222079589216943e-05, + "loss": 0.277, + "step": 23639 + }, + { + "epoch": 30.346598202824133, + "grad_norm": 0.9318846464157104, + "learning_rate": 2.3221651690201115e-05, + "loss": 0.2815, + "step": 23640 + }, + { + "epoch": 30.34788189987163, + "grad_norm": 0.8479331731796265, + 
"learning_rate": 2.322122379118528e-05, + "loss": 0.251, + "step": 23641 + }, + { + "epoch": 30.34916559691913, + "grad_norm": 1.0460609197616577, + "learning_rate": 2.322079589216945e-05, + "loss": 0.2746, + "step": 23642 + }, + { + "epoch": 30.350449293966623, + "grad_norm": 1.0062257051467896, + "learning_rate": 2.3220367993153617e-05, + "loss": 0.2819, + "step": 23643 + }, + { + "epoch": 30.35173299101412, + "grad_norm": 1.4820055961608887, + "learning_rate": 2.3219940094137785e-05, + "loss": 0.2571, + "step": 23644 + }, + { + "epoch": 30.35301668806162, + "grad_norm": 1.0841405391693115, + "learning_rate": 2.321951219512195e-05, + "loss": 0.2559, + "step": 23645 + }, + { + "epoch": 30.354300385109113, + "grad_norm": 4.512894630432129, + "learning_rate": 2.321908429610612e-05, + "loss": 0.2482, + "step": 23646 + }, + { + "epoch": 30.35558408215661, + "grad_norm": 0.9837606549263, + "learning_rate": 2.3218656397090287e-05, + "loss": 0.2597, + "step": 23647 + }, + { + "epoch": 30.35686777920411, + "grad_norm": 0.9002317786216736, + "learning_rate": 2.3218228498074456e-05, + "loss": 0.2518, + "step": 23648 + }, + { + "epoch": 30.358151476251603, + "grad_norm": 1.1270990371704102, + "learning_rate": 2.3217800599058624e-05, + "loss": 0.2677, + "step": 23649 + }, + { + "epoch": 30.3594351732991, + "grad_norm": 10.46330451965332, + "learning_rate": 2.321737270004279e-05, + "loss": 0.2833, + "step": 23650 + }, + { + "epoch": 30.3607188703466, + "grad_norm": 0.8252617120742798, + "learning_rate": 2.321694480102696e-05, + "loss": 0.2497, + "step": 23651 + }, + { + "epoch": 30.362002567394097, + "grad_norm": 1.8233321905136108, + "learning_rate": 2.3216516902011126e-05, + "loss": 0.2675, + "step": 23652 + }, + { + "epoch": 30.36328626444159, + "grad_norm": 0.8895518779754639, + "learning_rate": 2.321608900299529e-05, + "loss": 0.2655, + "step": 23653 + }, + { + "epoch": 30.36456996148909, + "grad_norm": 1.6831263303756714, + "learning_rate": 2.3215661103979463e-05, + 
"loss": 0.3123, + "step": 23654 + }, + { + "epoch": 30.365853658536587, + "grad_norm": 1.463683009147644, + "learning_rate": 2.3215233204963628e-05, + "loss": 0.2762, + "step": 23655 + }, + { + "epoch": 30.36713735558408, + "grad_norm": 1.8937863111495972, + "learning_rate": 2.32148053059478e-05, + "loss": 0.3057, + "step": 23656 + }, + { + "epoch": 30.36842105263158, + "grad_norm": 1.7406752109527588, + "learning_rate": 2.3214377406931965e-05, + "loss": 0.2812, + "step": 23657 + }, + { + "epoch": 30.369704749679077, + "grad_norm": 1.1059240102767944, + "learning_rate": 2.321394950791613e-05, + "loss": 0.2802, + "step": 23658 + }, + { + "epoch": 30.37098844672657, + "grad_norm": 1.1429593563079834, + "learning_rate": 2.32135216089003e-05, + "loss": 0.2539, + "step": 23659 + }, + { + "epoch": 30.37227214377407, + "grad_norm": 1.2332255840301514, + "learning_rate": 2.3213093709884466e-05, + "loss": 0.2813, + "step": 23660 + }, + { + "epoch": 30.373555840821567, + "grad_norm": 2.6571919918060303, + "learning_rate": 2.3212665810868635e-05, + "loss": 0.2828, + "step": 23661 + }, + { + "epoch": 30.374839537869065, + "grad_norm": 1.4650471210479736, + "learning_rate": 2.3212237911852803e-05, + "loss": 0.2972, + "step": 23662 + }, + { + "epoch": 30.37612323491656, + "grad_norm": 1.2031116485595703, + "learning_rate": 2.321181001283697e-05, + "loss": 0.3243, + "step": 23663 + }, + { + "epoch": 30.377406931964057, + "grad_norm": 3.4222323894500732, + "learning_rate": 2.321138211382114e-05, + "loss": 0.294, + "step": 23664 + }, + { + "epoch": 30.378690629011555, + "grad_norm": 4.87891960144043, + "learning_rate": 2.3210954214805305e-05, + "loss": 0.3016, + "step": 23665 + }, + { + "epoch": 30.37997432605905, + "grad_norm": 2.658616065979004, + "learning_rate": 2.3210526315789473e-05, + "loss": 0.3419, + "step": 23666 + }, + { + "epoch": 30.381258023106547, + "grad_norm": 2.5464468002319336, + "learning_rate": 2.3210098416773642e-05, + "loss": 0.2936, + "step": 23667 + }, + { 
+ "epoch": 30.382541720154045, + "grad_norm": 1.6104291677474976, + "learning_rate": 2.320967051775781e-05, + "loss": 0.3001, + "step": 23668 + }, + { + "epoch": 30.38382541720154, + "grad_norm": 3.642638921737671, + "learning_rate": 2.3209242618741975e-05, + "loss": 0.3759, + "step": 23669 + }, + { + "epoch": 30.385109114249037, + "grad_norm": 2.3862807750701904, + "learning_rate": 2.3208814719726147e-05, + "loss": 0.415, + "step": 23670 + }, + { + "epoch": 30.386392811296535, + "grad_norm": 1.0456098318099976, + "learning_rate": 2.3208386820710312e-05, + "loss": 0.2791, + "step": 23671 + }, + { + "epoch": 30.387676508344033, + "grad_norm": 0.7115697264671326, + "learning_rate": 2.320795892169448e-05, + "loss": 0.2727, + "step": 23672 + }, + { + "epoch": 30.388960205391527, + "grad_norm": 3.6207516193389893, + "learning_rate": 2.320753102267865e-05, + "loss": 0.2739, + "step": 23673 + }, + { + "epoch": 30.390243902439025, + "grad_norm": 0.8754916787147522, + "learning_rate": 2.3207103123662814e-05, + "loss": 0.2735, + "step": 23674 + }, + { + "epoch": 30.391527599486523, + "grad_norm": 1.5906023979187012, + "learning_rate": 2.3206675224646986e-05, + "loss": 0.2819, + "step": 23675 + }, + { + "epoch": 30.392811296534017, + "grad_norm": 0.9354396462440491, + "learning_rate": 2.320624732563115e-05, + "loss": 0.2728, + "step": 23676 + }, + { + "epoch": 30.394094993581515, + "grad_norm": 0.8812106847763062, + "learning_rate": 2.320581942661532e-05, + "loss": 0.266, + "step": 23677 + }, + { + "epoch": 30.395378690629013, + "grad_norm": 1.1867165565490723, + "learning_rate": 2.3205391527599488e-05, + "loss": 0.2867, + "step": 23678 + }, + { + "epoch": 30.396662387676507, + "grad_norm": 1.0088825225830078, + "learning_rate": 2.3204963628583653e-05, + "loss": 0.3117, + "step": 23679 + }, + { + "epoch": 30.397946084724005, + "grad_norm": 1.5610352754592896, + "learning_rate": 2.3204535729567824e-05, + "loss": 0.2875, + "step": 23680 + }, + { + "epoch": 30.399229781771503, + 
"grad_norm": 2.970334768295288, + "learning_rate": 2.320410783055199e-05, + "loss": 0.2628, + "step": 23681 + }, + { + "epoch": 30.400513478818997, + "grad_norm": 5.877843379974365, + "learning_rate": 2.3203679931536158e-05, + "loss": 0.2835, + "step": 23682 + }, + { + "epoch": 30.401797175866495, + "grad_norm": 1.0364270210266113, + "learning_rate": 2.3203252032520326e-05, + "loss": 0.2521, + "step": 23683 + }, + { + "epoch": 30.403080872913993, + "grad_norm": 2.5277960300445557, + "learning_rate": 2.3202824133504495e-05, + "loss": 0.2709, + "step": 23684 + }, + { + "epoch": 30.40436456996149, + "grad_norm": 1.7963566780090332, + "learning_rate": 2.320239623448866e-05, + "loss": 0.272, + "step": 23685 + }, + { + "epoch": 30.405648267008985, + "grad_norm": 1.354882836341858, + "learning_rate": 2.3201968335472828e-05, + "loss": 0.3063, + "step": 23686 + }, + { + "epoch": 30.406931964056483, + "grad_norm": 1.0157109498977661, + "learning_rate": 2.3201540436456997e-05, + "loss": 0.2777, + "step": 23687 + }, + { + "epoch": 30.40821566110398, + "grad_norm": 1.1494495868682861, + "learning_rate": 2.3201112537441165e-05, + "loss": 0.2682, + "step": 23688 + }, + { + "epoch": 30.409499358151475, + "grad_norm": 1.4702903032302856, + "learning_rate": 2.3200684638425333e-05, + "loss": 0.2647, + "step": 23689 + }, + { + "epoch": 30.410783055198973, + "grad_norm": 1.0217268466949463, + "learning_rate": 2.32002567394095e-05, + "loss": 0.3222, + "step": 23690 + }, + { + "epoch": 30.41206675224647, + "grad_norm": 0.949241578578949, + "learning_rate": 2.319982884039367e-05, + "loss": 0.2888, + "step": 23691 + }, + { + "epoch": 30.413350449293965, + "grad_norm": 1.139064073562622, + "learning_rate": 2.3199400941377835e-05, + "loss": 0.2693, + "step": 23692 + }, + { + "epoch": 30.414634146341463, + "grad_norm": 1.5907649993896484, + "learning_rate": 2.3198973042362e-05, + "loss": 0.2775, + "step": 23693 + }, + { + "epoch": 30.41591784338896, + "grad_norm": 1.1504346132278442, + 
"learning_rate": 2.3198545143346172e-05, + "loss": 0.2512, + "step": 23694 + }, + { + "epoch": 30.41720154043646, + "grad_norm": 4.276527404785156, + "learning_rate": 2.3198117244330337e-05, + "loss": 0.2835, + "step": 23695 + }, + { + "epoch": 30.418485237483953, + "grad_norm": 2.3448774814605713, + "learning_rate": 2.319768934531451e-05, + "loss": 0.2734, + "step": 23696 + }, + { + "epoch": 30.41976893453145, + "grad_norm": 1.4633934497833252, + "learning_rate": 2.3197261446298674e-05, + "loss": 0.2594, + "step": 23697 + }, + { + "epoch": 30.42105263157895, + "grad_norm": 1.1326895952224731, + "learning_rate": 2.3196833547282842e-05, + "loss": 0.2959, + "step": 23698 + }, + { + "epoch": 30.422336328626443, + "grad_norm": 1.2092325687408447, + "learning_rate": 2.319640564826701e-05, + "loss": 0.2334, + "step": 23699 + }, + { + "epoch": 30.42362002567394, + "grad_norm": 1.8955862522125244, + "learning_rate": 2.3195977749251176e-05, + "loss": 0.2798, + "step": 23700 + }, + { + "epoch": 30.42490372272144, + "grad_norm": 1.0649046897888184, + "learning_rate": 2.3195549850235344e-05, + "loss": 0.2954, + "step": 23701 + }, + { + "epoch": 30.426187419768933, + "grad_norm": 0.8232793211936951, + "learning_rate": 2.3195121951219513e-05, + "loss": 0.2932, + "step": 23702 + }, + { + "epoch": 30.42747111681643, + "grad_norm": 1.237231969833374, + "learning_rate": 2.319469405220368e-05, + "loss": 0.3285, + "step": 23703 + }, + { + "epoch": 30.42875481386393, + "grad_norm": 1.1516224145889282, + "learning_rate": 2.319426615318785e-05, + "loss": 0.2781, + "step": 23704 + }, + { + "epoch": 30.430038510911427, + "grad_norm": 1.2930099964141846, + "learning_rate": 2.3193838254172018e-05, + "loss": 0.2691, + "step": 23705 + }, + { + "epoch": 30.43132220795892, + "grad_norm": 3.592655897140503, + "learning_rate": 2.3193410355156183e-05, + "loss": 0.2861, + "step": 23706 + }, + { + "epoch": 30.43260590500642, + "grad_norm": 1.1147843599319458, + "learning_rate": 2.319298245614035e-05, 
+ "loss": 0.2848, + "step": 23707 + }, + { + "epoch": 30.433889602053917, + "grad_norm": 1.9898768663406372, + "learning_rate": 2.319255455712452e-05, + "loss": 0.2882, + "step": 23708 + }, + { + "epoch": 30.43517329910141, + "grad_norm": 3.306746006011963, + "learning_rate": 2.3192126658108685e-05, + "loss": 0.2599, + "step": 23709 + }, + { + "epoch": 30.43645699614891, + "grad_norm": 1.7824389934539795, + "learning_rate": 2.3191698759092856e-05, + "loss": 0.2934, + "step": 23710 + }, + { + "epoch": 30.437740693196407, + "grad_norm": 2.3372890949249268, + "learning_rate": 2.319127086007702e-05, + "loss": 0.2885, + "step": 23711 + }, + { + "epoch": 30.4390243902439, + "grad_norm": 1.2663432359695435, + "learning_rate": 2.3190842961061193e-05, + "loss": 0.2958, + "step": 23712 + }, + { + "epoch": 30.4403080872914, + "grad_norm": 4.003114700317383, + "learning_rate": 2.319041506204536e-05, + "loss": 0.3157, + "step": 23713 + }, + { + "epoch": 30.441591784338897, + "grad_norm": 5.521825790405273, + "learning_rate": 2.3189987163029523e-05, + "loss": 0.2879, + "step": 23714 + }, + { + "epoch": 30.44287548138639, + "grad_norm": 1.2923742532730103, + "learning_rate": 2.3189559264013695e-05, + "loss": 0.3238, + "step": 23715 + }, + { + "epoch": 30.44415917843389, + "grad_norm": 1.1774494647979736, + "learning_rate": 2.318913136499786e-05, + "loss": 0.3127, + "step": 23716 + }, + { + "epoch": 30.445442875481387, + "grad_norm": 1.4207537174224854, + "learning_rate": 2.318870346598203e-05, + "loss": 0.3528, + "step": 23717 + }, + { + "epoch": 30.446726572528885, + "grad_norm": 1.8428367376327515, + "learning_rate": 2.3188275566966197e-05, + "loss": 0.3174, + "step": 23718 + }, + { + "epoch": 30.44801026957638, + "grad_norm": 2.0520715713500977, + "learning_rate": 2.3187847667950362e-05, + "loss": 0.4253, + "step": 23719 + }, + { + "epoch": 30.449293966623877, + "grad_norm": 3.9613184928894043, + "learning_rate": 2.3187419768934534e-05, + "loss": 0.3968, + "step": 23720 + }, + 
{ + "epoch": 30.450577663671375, + "grad_norm": 1.5506579875946045, + "learning_rate": 2.31869918699187e-05, + "loss": 0.2726, + "step": 23721 + }, + { + "epoch": 30.45186136071887, + "grad_norm": 0.7460857033729553, + "learning_rate": 2.3186563970902867e-05, + "loss": 0.2826, + "step": 23722 + }, + { + "epoch": 30.453145057766367, + "grad_norm": 0.8760510683059692, + "learning_rate": 2.3186136071887036e-05, + "loss": 0.2942, + "step": 23723 + }, + { + "epoch": 30.454428754813865, + "grad_norm": 5.512377738952637, + "learning_rate": 2.3185708172871204e-05, + "loss": 0.278, + "step": 23724 + }, + { + "epoch": 30.45571245186136, + "grad_norm": 0.9851745963096619, + "learning_rate": 2.318528027385537e-05, + "loss": 0.2793, + "step": 23725 + }, + { + "epoch": 30.456996148908857, + "grad_norm": 0.8223485946655273, + "learning_rate": 2.3184852374839538e-05, + "loss": 0.2813, + "step": 23726 + }, + { + "epoch": 30.458279845956355, + "grad_norm": 1.0464279651641846, + "learning_rate": 2.3184424475823706e-05, + "loss": 0.2908, + "step": 23727 + }, + { + "epoch": 30.459563543003853, + "grad_norm": 1.2353087663650513, + "learning_rate": 2.3183996576807874e-05, + "loss": 0.3082, + "step": 23728 + }, + { + "epoch": 30.460847240051347, + "grad_norm": 1.4162728786468506, + "learning_rate": 2.3183568677792043e-05, + "loss": 0.2707, + "step": 23729 + }, + { + "epoch": 30.462130937098845, + "grad_norm": 0.7636311054229736, + "learning_rate": 2.3183140778776208e-05, + "loss": 0.2734, + "step": 23730 + }, + { + "epoch": 30.463414634146343, + "grad_norm": 0.7814857959747314, + "learning_rate": 2.318271287976038e-05, + "loss": 0.3061, + "step": 23731 + }, + { + "epoch": 30.464698331193837, + "grad_norm": 1.1726247072219849, + "learning_rate": 2.3182284980744545e-05, + "loss": 0.2671, + "step": 23732 + }, + { + "epoch": 30.465982028241335, + "grad_norm": 1.090773582458496, + "learning_rate": 2.318185708172871e-05, + "loss": 0.2558, + "step": 23733 + }, + { + "epoch": 30.467265725288833, 
+ "grad_norm": 0.8048742413520813, + "learning_rate": 2.318142918271288e-05, + "loss": 0.254, + "step": 23734 + }, + { + "epoch": 30.468549422336327, + "grad_norm": 1.4720081090927124, + "learning_rate": 2.3181001283697046e-05, + "loss": 0.2901, + "step": 23735 + }, + { + "epoch": 30.469833119383825, + "grad_norm": 1.1285406351089478, + "learning_rate": 2.3180573384681218e-05, + "loss": 0.2982, + "step": 23736 + }, + { + "epoch": 30.471116816431323, + "grad_norm": 1.3435378074645996, + "learning_rate": 2.3180145485665383e-05, + "loss": 0.2686, + "step": 23737 + }, + { + "epoch": 30.47240051347882, + "grad_norm": 2.0711238384246826, + "learning_rate": 2.317971758664955e-05, + "loss": 0.2694, + "step": 23738 + }, + { + "epoch": 30.473684210526315, + "grad_norm": 1.0083365440368652, + "learning_rate": 2.317928968763372e-05, + "loss": 0.2932, + "step": 23739 + }, + { + "epoch": 30.474967907573813, + "grad_norm": 1.068803071975708, + "learning_rate": 2.3178861788617885e-05, + "loss": 0.2517, + "step": 23740 + }, + { + "epoch": 30.47625160462131, + "grad_norm": 1.195717692375183, + "learning_rate": 2.3178433889602054e-05, + "loss": 0.2717, + "step": 23741 + }, + { + "epoch": 30.477535301668805, + "grad_norm": 2.213944673538208, + "learning_rate": 2.3178005990586222e-05, + "loss": 0.2359, + "step": 23742 + }, + { + "epoch": 30.478818998716303, + "grad_norm": 1.0887517929077148, + "learning_rate": 2.317757809157039e-05, + "loss": 0.2513, + "step": 23743 + }, + { + "epoch": 30.4801026957638, + "grad_norm": 2.0506458282470703, + "learning_rate": 2.317715019255456e-05, + "loss": 0.2858, + "step": 23744 + }, + { + "epoch": 30.481386392811295, + "grad_norm": 1.13584566116333, + "learning_rate": 2.3176722293538727e-05, + "loss": 0.2699, + "step": 23745 + }, + { + "epoch": 30.482670089858793, + "grad_norm": 1.2828787565231323, + "learning_rate": 2.3176294394522892e-05, + "loss": 0.2691, + "step": 23746 + }, + { + "epoch": 30.48395378690629, + "grad_norm": 0.9032615423202515, + 
"learning_rate": 2.317586649550706e-05, + "loss": 0.2438, + "step": 23747 + }, + { + "epoch": 30.485237483953785, + "grad_norm": 0.9505434036254883, + "learning_rate": 2.317543859649123e-05, + "loss": 0.2973, + "step": 23748 + }, + { + "epoch": 30.486521181001283, + "grad_norm": 1.0760269165039062, + "learning_rate": 2.3175010697475394e-05, + "loss": 0.2751, + "step": 23749 + }, + { + "epoch": 30.48780487804878, + "grad_norm": 0.9882152676582336, + "learning_rate": 2.3174582798459566e-05, + "loss": 0.276, + "step": 23750 + }, + { + "epoch": 30.48908857509628, + "grad_norm": 1.3444818258285522, + "learning_rate": 2.317415489944373e-05, + "loss": 0.3089, + "step": 23751 + }, + { + "epoch": 30.490372272143773, + "grad_norm": 0.8280643224716187, + "learning_rate": 2.3173727000427903e-05, + "loss": 0.2738, + "step": 23752 + }, + { + "epoch": 30.49165596919127, + "grad_norm": 1.0067763328552246, + "learning_rate": 2.3173299101412068e-05, + "loss": 0.2745, + "step": 23753 + }, + { + "epoch": 30.49293966623877, + "grad_norm": 1.030374526977539, + "learning_rate": 2.3172871202396233e-05, + "loss": 0.3197, + "step": 23754 + }, + { + "epoch": 30.494223363286263, + "grad_norm": 1.4350483417510986, + "learning_rate": 2.3172443303380405e-05, + "loss": 0.3043, + "step": 23755 + }, + { + "epoch": 30.49550706033376, + "grad_norm": 1.3019026517868042, + "learning_rate": 2.317201540436457e-05, + "loss": 0.2842, + "step": 23756 + }, + { + "epoch": 30.49679075738126, + "grad_norm": 1.3935357332229614, + "learning_rate": 2.3171587505348738e-05, + "loss": 0.2871, + "step": 23757 + }, + { + "epoch": 30.498074454428753, + "grad_norm": 2.6851541996002197, + "learning_rate": 2.3171159606332906e-05, + "loss": 0.2586, + "step": 23758 + }, + { + "epoch": 30.49935815147625, + "grad_norm": 1.712164044380188, + "learning_rate": 2.3170731707317075e-05, + "loss": 0.2849, + "step": 23759 + }, + { + "epoch": 30.50064184852375, + "grad_norm": 1.256703495979309, + "learning_rate": 
2.3170303808301243e-05, + "loss": 0.3047, + "step": 23760 + }, + { + "epoch": 30.501925545571247, + "grad_norm": 1.7856439352035522, + "learning_rate": 2.3169875909285408e-05, + "loss": 0.2855, + "step": 23761 + }, + { + "epoch": 30.50320924261874, + "grad_norm": 1.0493271350860596, + "learning_rate": 2.3169448010269577e-05, + "loss": 0.298, + "step": 23762 + }, + { + "epoch": 30.50449293966624, + "grad_norm": 1.6575336456298828, + "learning_rate": 2.3169020111253745e-05, + "loss": 0.2945, + "step": 23763 + }, + { + "epoch": 30.505776636713737, + "grad_norm": 1.4521026611328125, + "learning_rate": 2.3168592212237913e-05, + "loss": 0.3284, + "step": 23764 + }, + { + "epoch": 30.50706033376123, + "grad_norm": 1.5811336040496826, + "learning_rate": 2.316816431322208e-05, + "loss": 0.3315, + "step": 23765 + }, + { + "epoch": 30.50834403080873, + "grad_norm": 1.178334355354309, + "learning_rate": 2.316773641420625e-05, + "loss": 0.3265, + "step": 23766 + }, + { + "epoch": 30.509627727856227, + "grad_norm": 2.023864984512329, + "learning_rate": 2.3167308515190415e-05, + "loss": 0.326, + "step": 23767 + }, + { + "epoch": 30.51091142490372, + "grad_norm": 2.694711685180664, + "learning_rate": 2.3166880616174584e-05, + "loss": 0.3783, + "step": 23768 + }, + { + "epoch": 30.51219512195122, + "grad_norm": 3.2812397480010986, + "learning_rate": 2.3166452717158752e-05, + "loss": 0.3548, + "step": 23769 + }, + { + "epoch": 30.513478818998717, + "grad_norm": 3.7455861568450928, + "learning_rate": 2.3166024818142917e-05, + "loss": 0.4809, + "step": 23770 + }, + { + "epoch": 30.514762516046215, + "grad_norm": 1.0577069520950317, + "learning_rate": 2.316559691912709e-05, + "loss": 0.2684, + "step": 23771 + }, + { + "epoch": 30.51604621309371, + "grad_norm": 2.104494094848633, + "learning_rate": 2.3165169020111254e-05, + "loss": 0.28, + "step": 23772 + }, + { + "epoch": 30.517329910141207, + "grad_norm": 1.1065622568130493, + "learning_rate": 2.3164741121095422e-05, + "loss": 0.2971, 
+ "step": 23773 + }, + { + "epoch": 30.518613607188705, + "grad_norm": 0.7908625602722168, + "learning_rate": 2.316431322207959e-05, + "loss": 0.2905, + "step": 23774 + }, + { + "epoch": 30.5198973042362, + "grad_norm": 1.4305323362350464, + "learning_rate": 2.3163885323063756e-05, + "loss": 0.2618, + "step": 23775 + }, + { + "epoch": 30.521181001283697, + "grad_norm": 0.814999520778656, + "learning_rate": 2.3163457424047928e-05, + "loss": 0.2716, + "step": 23776 + }, + { + "epoch": 30.522464698331195, + "grad_norm": 0.7788211107254028, + "learning_rate": 2.3163029525032093e-05, + "loss": 0.2549, + "step": 23777 + }, + { + "epoch": 30.52374839537869, + "grad_norm": 0.7144928574562073, + "learning_rate": 2.316260162601626e-05, + "loss": 0.2767, + "step": 23778 + }, + { + "epoch": 30.525032092426187, + "grad_norm": 1.0649642944335938, + "learning_rate": 2.316217372700043e-05, + "loss": 0.276, + "step": 23779 + }, + { + "epoch": 30.526315789473685, + "grad_norm": 0.8593153357505798, + "learning_rate": 2.3161745827984594e-05, + "loss": 0.3109, + "step": 23780 + }, + { + "epoch": 30.527599486521183, + "grad_norm": 1.0382176637649536, + "learning_rate": 2.3161317928968763e-05, + "loss": 0.3237, + "step": 23781 + }, + { + "epoch": 30.528883183568677, + "grad_norm": 2.7345213890075684, + "learning_rate": 2.316089002995293e-05, + "loss": 0.3143, + "step": 23782 + }, + { + "epoch": 30.530166880616175, + "grad_norm": 0.9324472546577454, + "learning_rate": 2.31604621309371e-05, + "loss": 0.2918, + "step": 23783 + }, + { + "epoch": 30.531450577663673, + "grad_norm": 0.8867249488830566, + "learning_rate": 2.3160034231921268e-05, + "loss": 0.2624, + "step": 23784 + }, + { + "epoch": 30.532734274711167, + "grad_norm": 5.121676445007324, + "learning_rate": 2.3159606332905437e-05, + "loss": 0.3048, + "step": 23785 + }, + { + "epoch": 30.534017971758665, + "grad_norm": 1.6485730409622192, + "learning_rate": 2.31591784338896e-05, + "loss": 0.3044, + "step": 23786 + }, + { + "epoch": 
30.535301668806163, + "grad_norm": 0.7469176054000854, + "learning_rate": 2.315875053487377e-05, + "loss": 0.262, + "step": 23787 + }, + { + "epoch": 30.536585365853657, + "grad_norm": 0.8625405430793762, + "learning_rate": 2.315832263585794e-05, + "loss": 0.2827, + "step": 23788 + }, + { + "epoch": 30.537869062901155, + "grad_norm": 1.0741095542907715, + "learning_rate": 2.3157894736842103e-05, + "loss": 0.2687, + "step": 23789 + }, + { + "epoch": 30.539152759948653, + "grad_norm": 1.518500804901123, + "learning_rate": 2.3157466837826275e-05, + "loss": 0.2631, + "step": 23790 + }, + { + "epoch": 30.540436456996147, + "grad_norm": 0.9463768601417542, + "learning_rate": 2.315703893881044e-05, + "loss": 0.2829, + "step": 23791 + }, + { + "epoch": 30.541720154043645, + "grad_norm": 1.3658769130706787, + "learning_rate": 2.3156611039794612e-05, + "loss": 0.2985, + "step": 23792 + }, + { + "epoch": 30.543003851091143, + "grad_norm": 0.8877156972885132, + "learning_rate": 2.3156183140778777e-05, + "loss": 0.2644, + "step": 23793 + }, + { + "epoch": 30.54428754813864, + "grad_norm": 0.9652512073516846, + "learning_rate": 2.3155755241762942e-05, + "loss": 0.2645, + "step": 23794 + }, + { + "epoch": 30.545571245186135, + "grad_norm": 7.0813775062561035, + "learning_rate": 2.3155327342747114e-05, + "loss": 0.2719, + "step": 23795 + }, + { + "epoch": 30.546854942233633, + "grad_norm": 5.718926906585693, + "learning_rate": 2.315489944373128e-05, + "loss": 0.2801, + "step": 23796 + }, + { + "epoch": 30.54813863928113, + "grad_norm": 1.1597042083740234, + "learning_rate": 2.3154471544715447e-05, + "loss": 0.2786, + "step": 23797 + }, + { + "epoch": 30.549422336328625, + "grad_norm": 1.5672467947006226, + "learning_rate": 2.3154043645699616e-05, + "loss": 0.2692, + "step": 23798 + }, + { + "epoch": 30.550706033376123, + "grad_norm": 1.1154989004135132, + "learning_rate": 2.3153615746683784e-05, + "loss": 0.2679, + "step": 23799 + }, + { + "epoch": 30.55198973042362, + 
"grad_norm": 1.084285855293274, + "learning_rate": 2.3153187847667953e-05, + "loss": 0.2695, + "step": 23800 + }, + { + "epoch": 30.553273427471115, + "grad_norm": 1.2838913202285767, + "learning_rate": 2.3152759948652118e-05, + "loss": 0.2665, + "step": 23801 + }, + { + "epoch": 30.554557124518613, + "grad_norm": 0.9482032060623169, + "learning_rate": 2.3152332049636286e-05, + "loss": 0.2665, + "step": 23802 + }, + { + "epoch": 30.55584082156611, + "grad_norm": 0.9839780330657959, + "learning_rate": 2.3151904150620454e-05, + "loss": 0.2476, + "step": 23803 + }, + { + "epoch": 30.55712451861361, + "grad_norm": 0.9231642484664917, + "learning_rate": 2.3151476251604623e-05, + "loss": 0.2806, + "step": 23804 + }, + { + "epoch": 30.558408215661103, + "grad_norm": 1.62587308883667, + "learning_rate": 2.3151048352588788e-05, + "loss": 0.2973, + "step": 23805 + }, + { + "epoch": 30.5596919127086, + "grad_norm": 1.060386061668396, + "learning_rate": 2.315062045357296e-05, + "loss": 0.2821, + "step": 23806 + }, + { + "epoch": 30.5609756097561, + "grad_norm": 1.2224773168563843, + "learning_rate": 2.3150192554557125e-05, + "loss": 0.2793, + "step": 23807 + }, + { + "epoch": 30.562259306803593, + "grad_norm": 1.30599045753479, + "learning_rate": 2.314976465554129e-05, + "loss": 0.2748, + "step": 23808 + }, + { + "epoch": 30.56354300385109, + "grad_norm": 1.4051541090011597, + "learning_rate": 2.314933675652546e-05, + "loss": 0.296, + "step": 23809 + }, + { + "epoch": 30.56482670089859, + "grad_norm": 1.4090639352798462, + "learning_rate": 2.3148908857509627e-05, + "loss": 0.3053, + "step": 23810 + }, + { + "epoch": 30.566110397946083, + "grad_norm": 1.3263047933578491, + "learning_rate": 2.31484809584938e-05, + "loss": 0.3013, + "step": 23811 + }, + { + "epoch": 30.56739409499358, + "grad_norm": 1.4364782571792603, + "learning_rate": 2.3148053059477963e-05, + "loss": 0.3177, + "step": 23812 + }, + { + "epoch": 30.56867779204108, + "grad_norm": 1.07907235622406, + 
"learning_rate": 2.3147625160462132e-05, + "loss": 0.2881, + "step": 23813 + }, + { + "epoch": 30.569961489088577, + "grad_norm": 1.2345950603485107, + "learning_rate": 2.31471972614463e-05, + "loss": 0.2748, + "step": 23814 + }, + { + "epoch": 30.57124518613607, + "grad_norm": 2.0353846549987793, + "learning_rate": 2.3146769362430465e-05, + "loss": 0.2988, + "step": 23815 + }, + { + "epoch": 30.57252888318357, + "grad_norm": 2.241147041320801, + "learning_rate": 2.3146341463414634e-05, + "loss": 0.3004, + "step": 23816 + }, + { + "epoch": 30.573812580231067, + "grad_norm": 1.4846420288085938, + "learning_rate": 2.3145913564398802e-05, + "loss": 0.2981, + "step": 23817 + }, + { + "epoch": 30.57509627727856, + "grad_norm": 2.5748164653778076, + "learning_rate": 2.314548566538297e-05, + "loss": 0.323, + "step": 23818 + }, + { + "epoch": 30.57637997432606, + "grad_norm": 3.510800838470459, + "learning_rate": 2.314505776636714e-05, + "loss": 0.3879, + "step": 23819 + }, + { + "epoch": 30.577663671373557, + "grad_norm": 1.7699509859085083, + "learning_rate": 2.3144629867351307e-05, + "loss": 0.4983, + "step": 23820 + }, + { + "epoch": 30.57894736842105, + "grad_norm": 1.436477541923523, + "learning_rate": 2.3144201968335472e-05, + "loss": 0.3015, + "step": 23821 + }, + { + "epoch": 30.58023106546855, + "grad_norm": 1.4214632511138916, + "learning_rate": 2.314377406931964e-05, + "loss": 0.2656, + "step": 23822 + }, + { + "epoch": 30.581514762516047, + "grad_norm": 1.6652716398239136, + "learning_rate": 2.314334617030381e-05, + "loss": 0.2947, + "step": 23823 + }, + { + "epoch": 30.58279845956354, + "grad_norm": 1.0491694211959839, + "learning_rate": 2.3142918271287974e-05, + "loss": 0.2978, + "step": 23824 + }, + { + "epoch": 30.58408215661104, + "grad_norm": 0.8078650236129761, + "learning_rate": 2.3142490372272146e-05, + "loss": 0.3049, + "step": 23825 + }, + { + "epoch": 30.585365853658537, + "grad_norm": 0.976604163646698, + "learning_rate": 2.314206247325631e-05, + 
"loss": 0.2716, + "step": 23826 + }, + { + "epoch": 30.586649550706035, + "grad_norm": 1.47983980178833, + "learning_rate": 2.3141634574240483e-05, + "loss": 0.2711, + "step": 23827 + }, + { + "epoch": 30.58793324775353, + "grad_norm": 0.9352070689201355, + "learning_rate": 2.3141206675224648e-05, + "loss": 0.2959, + "step": 23828 + }, + { + "epoch": 30.589216944801027, + "grad_norm": 1.7903817892074585, + "learning_rate": 2.3140778776208813e-05, + "loss": 0.268, + "step": 23829 + }, + { + "epoch": 30.590500641848525, + "grad_norm": 0.9000784158706665, + "learning_rate": 2.3140350877192985e-05, + "loss": 0.3085, + "step": 23830 + }, + { + "epoch": 30.59178433889602, + "grad_norm": 1.2188304662704468, + "learning_rate": 2.313992297817715e-05, + "loss": 0.2807, + "step": 23831 + }, + { + "epoch": 30.593068035943517, + "grad_norm": 0.9138957262039185, + "learning_rate": 2.3139495079161318e-05, + "loss": 0.2724, + "step": 23832 + }, + { + "epoch": 30.594351732991015, + "grad_norm": 0.9558005928993225, + "learning_rate": 2.3139067180145486e-05, + "loss": 0.2824, + "step": 23833 + }, + { + "epoch": 30.59563543003851, + "grad_norm": 1.3584493398666382, + "learning_rate": 2.3138639281129655e-05, + "loss": 0.3071, + "step": 23834 + }, + { + "epoch": 30.596919127086007, + "grad_norm": 1.6417315006256104, + "learning_rate": 2.3138211382113823e-05, + "loss": 0.2537, + "step": 23835 + }, + { + "epoch": 30.598202824133505, + "grad_norm": 1.1718698740005493, + "learning_rate": 2.3137783483097988e-05, + "loss": 0.2594, + "step": 23836 + }, + { + "epoch": 30.599486521181003, + "grad_norm": 1.9402439594268799, + "learning_rate": 2.3137355584082157e-05, + "loss": 0.2842, + "step": 23837 + }, + { + "epoch": 30.600770218228497, + "grad_norm": 1.05172860622406, + "learning_rate": 2.3136927685066325e-05, + "loss": 0.261, + "step": 23838 + }, + { + "epoch": 30.602053915275995, + "grad_norm": 1.2964249849319458, + "learning_rate": 2.3136499786050494e-05, + "loss": 0.2551, + "step": 23839 + 
}, + { + "epoch": 30.603337612323493, + "grad_norm": 1.4143507480621338, + "learning_rate": 2.313607188703466e-05, + "loss": 0.2595, + "step": 23840 + }, + { + "epoch": 30.604621309370987, + "grad_norm": 1.104323148727417, + "learning_rate": 2.3135643988018827e-05, + "loss": 0.2843, + "step": 23841 + }, + { + "epoch": 30.605905006418485, + "grad_norm": 0.8270297646522522, + "learning_rate": 2.3135216089002995e-05, + "loss": 0.2646, + "step": 23842 + }, + { + "epoch": 30.607188703465983, + "grad_norm": 1.1138774156570435, + "learning_rate": 2.3134788189987164e-05, + "loss": 0.2887, + "step": 23843 + }, + { + "epoch": 30.608472400513477, + "grad_norm": 1.1341986656188965, + "learning_rate": 2.3134360290971332e-05, + "loss": 0.2782, + "step": 23844 + }, + { + "epoch": 30.609756097560975, + "grad_norm": 1.7171281576156616, + "learning_rate": 2.3133932391955497e-05, + "loss": 0.2786, + "step": 23845 + }, + { + "epoch": 30.611039794608473, + "grad_norm": 1.03096342086792, + "learning_rate": 2.313350449293967e-05, + "loss": 0.2754, + "step": 23846 + }, + { + "epoch": 30.61232349165597, + "grad_norm": 0.9277490377426147, + "learning_rate": 2.3133076593923834e-05, + "loss": 0.2815, + "step": 23847 + }, + { + "epoch": 30.613607188703465, + "grad_norm": 6.493689060211182, + "learning_rate": 2.3132648694908e-05, + "loss": 0.2803, + "step": 23848 + }, + { + "epoch": 30.614890885750963, + "grad_norm": 0.8519201874732971, + "learning_rate": 2.313222079589217e-05, + "loss": 0.2597, + "step": 23849 + }, + { + "epoch": 30.61617458279846, + "grad_norm": 0.8707610964775085, + "learning_rate": 2.3131792896876336e-05, + "loss": 0.2828, + "step": 23850 + }, + { + "epoch": 30.617458279845955, + "grad_norm": 2.337690830230713, + "learning_rate": 2.3131364997860508e-05, + "loss": 0.2612, + "step": 23851 + }, + { + "epoch": 30.618741976893453, + "grad_norm": 0.9421306252479553, + "learning_rate": 2.3130937098844673e-05, + "loss": 0.2757, + "step": 23852 + }, + { + "epoch": 30.62002567394095, 
+ "grad_norm": 0.8272692561149597, + "learning_rate": 2.313050919982884e-05, + "loss": 0.2601, + "step": 23853 + }, + { + "epoch": 30.621309370988445, + "grad_norm": 1.9792417287826538, + "learning_rate": 2.313008130081301e-05, + "loss": 0.2921, + "step": 23854 + }, + { + "epoch": 30.622593068035943, + "grad_norm": 1.0947761535644531, + "learning_rate": 2.3129653401797175e-05, + "loss": 0.2635, + "step": 23855 + }, + { + "epoch": 30.62387676508344, + "grad_norm": 1.1864169836044312, + "learning_rate": 2.3129225502781343e-05, + "loss": 0.3051, + "step": 23856 + }, + { + "epoch": 30.625160462130935, + "grad_norm": 2.681763172149658, + "learning_rate": 2.312879760376551e-05, + "loss": 0.312, + "step": 23857 + }, + { + "epoch": 30.626444159178433, + "grad_norm": 1.8440622091293335, + "learning_rate": 2.312836970474968e-05, + "loss": 0.3158, + "step": 23858 + }, + { + "epoch": 30.62772785622593, + "grad_norm": 1.1600130796432495, + "learning_rate": 2.3127941805733848e-05, + "loss": 0.2459, + "step": 23859 + }, + { + "epoch": 30.62901155327343, + "grad_norm": 1.6322221755981445, + "learning_rate": 2.3127513906718017e-05, + "loss": 0.3327, + "step": 23860 + }, + { + "epoch": 30.630295250320923, + "grad_norm": 0.9026054739952087, + "learning_rate": 2.312708600770218e-05, + "loss": 0.2856, + "step": 23861 + }, + { + "epoch": 30.63157894736842, + "grad_norm": 1.0915868282318115, + "learning_rate": 2.312665810868635e-05, + "loss": 0.3219, + "step": 23862 + }, + { + "epoch": 30.63286264441592, + "grad_norm": 1.5113028287887573, + "learning_rate": 2.312623020967052e-05, + "loss": 0.2813, + "step": 23863 + }, + { + "epoch": 30.634146341463413, + "grad_norm": 1.4860271215438843, + "learning_rate": 2.3125802310654683e-05, + "loss": 0.3038, + "step": 23864 + }, + { + "epoch": 30.63543003851091, + "grad_norm": 1.1386719942092896, + "learning_rate": 2.3125374411638855e-05, + "loss": 0.2874, + "step": 23865 + }, + { + "epoch": 30.63671373555841, + "grad_norm": 2.0408527851104736, + 
"learning_rate": 2.312494651262302e-05, + "loss": 0.3058, + "step": 23866 + }, + { + "epoch": 30.637997432605903, + "grad_norm": 7.059231281280518, + "learning_rate": 2.3124518613607192e-05, + "loss": 0.3249, + "step": 23867 + }, + { + "epoch": 30.6392811296534, + "grad_norm": 2.7842328548431396, + "learning_rate": 2.3124090714591357e-05, + "loss": 0.3244, + "step": 23868 + }, + { + "epoch": 30.6405648267009, + "grad_norm": 3.1241562366485596, + "learning_rate": 2.3123662815575522e-05, + "loss": 0.3287, + "step": 23869 + }, + { + "epoch": 30.641848523748397, + "grad_norm": 4.744751453399658, + "learning_rate": 2.3123234916559694e-05, + "loss": 0.4419, + "step": 23870 + }, + { + "epoch": 30.64313222079589, + "grad_norm": 0.9105860590934753, + "learning_rate": 2.312280701754386e-05, + "loss": 0.2975, + "step": 23871 + }, + { + "epoch": 30.64441591784339, + "grad_norm": 2.780817985534668, + "learning_rate": 2.3122379118528027e-05, + "loss": 0.2638, + "step": 23872 + }, + { + "epoch": 30.645699614890887, + "grad_norm": 2.386228561401367, + "learning_rate": 2.3121951219512196e-05, + "loss": 0.2777, + "step": 23873 + }, + { + "epoch": 30.64698331193838, + "grad_norm": 1.524272084236145, + "learning_rate": 2.3121523320496364e-05, + "loss": 0.2895, + "step": 23874 + }, + { + "epoch": 30.64826700898588, + "grad_norm": 1.1319327354431152, + "learning_rate": 2.3121095421480533e-05, + "loss": 0.2747, + "step": 23875 + }, + { + "epoch": 30.649550706033377, + "grad_norm": 2.548496961593628, + "learning_rate": 2.3120667522464698e-05, + "loss": 0.2895, + "step": 23876 + }, + { + "epoch": 30.65083440308087, + "grad_norm": 1.0593957901000977, + "learning_rate": 2.3120239623448866e-05, + "loss": 0.291, + "step": 23877 + }, + { + "epoch": 30.65211810012837, + "grad_norm": 1.120993733406067, + "learning_rate": 2.3119811724433034e-05, + "loss": 0.3088, + "step": 23878 + }, + { + "epoch": 30.653401797175867, + "grad_norm": 1.5805373191833496, + "learning_rate": 2.3119383825417203e-05, + 
"loss": 0.285, + "step": 23879 + }, + { + "epoch": 30.654685494223365, + "grad_norm": 0.829292893409729, + "learning_rate": 2.3118955926401368e-05, + "loss": 0.3058, + "step": 23880 + }, + { + "epoch": 30.65596919127086, + "grad_norm": 4.958008766174316, + "learning_rate": 2.311852802738554e-05, + "loss": 0.2612, + "step": 23881 + }, + { + "epoch": 30.657252888318357, + "grad_norm": 0.9986546635627747, + "learning_rate": 2.3118100128369705e-05, + "loss": 0.2717, + "step": 23882 + }, + { + "epoch": 30.658536585365855, + "grad_norm": 0.9189849495887756, + "learning_rate": 2.3117672229353873e-05, + "loss": 0.2816, + "step": 23883 + }, + { + "epoch": 30.65982028241335, + "grad_norm": 0.8213376402854919, + "learning_rate": 2.311724433033804e-05, + "loss": 0.2784, + "step": 23884 + }, + { + "epoch": 30.661103979460847, + "grad_norm": 0.9507074356079102, + "learning_rate": 2.3116816431322207e-05, + "loss": 0.2896, + "step": 23885 + }, + { + "epoch": 30.662387676508345, + "grad_norm": 3.5328519344329834, + "learning_rate": 2.311638853230638e-05, + "loss": 0.2774, + "step": 23886 + }, + { + "epoch": 30.66367137355584, + "grad_norm": 1.2273589372634888, + "learning_rate": 2.3115960633290543e-05, + "loss": 0.2848, + "step": 23887 + }, + { + "epoch": 30.664955070603337, + "grad_norm": 1.2843928337097168, + "learning_rate": 2.3115532734274712e-05, + "loss": 0.2794, + "step": 23888 + }, + { + "epoch": 30.666238767650835, + "grad_norm": 0.8475443720817566, + "learning_rate": 2.311510483525888e-05, + "loss": 0.2547, + "step": 23889 + }, + { + "epoch": 30.66752246469833, + "grad_norm": 1.0866678953170776, + "learning_rate": 2.3114676936243045e-05, + "loss": 0.3132, + "step": 23890 + }, + { + "epoch": 30.668806161745827, + "grad_norm": 1.1061594486236572, + "learning_rate": 2.3114249037227217e-05, + "loss": 0.2824, + "step": 23891 + }, + { + "epoch": 30.670089858793325, + "grad_norm": 1.274674415588379, + "learning_rate": 2.3113821138211382e-05, + "loss": 0.2511, + "step": 23892 + 
}, + { + "epoch": 30.671373555840823, + "grad_norm": 0.8895190358161926, + "learning_rate": 2.311339323919555e-05, + "loss": 0.28, + "step": 23893 + }, + { + "epoch": 30.672657252888317, + "grad_norm": 1.5234090089797974, + "learning_rate": 2.311296534017972e-05, + "loss": 0.2836, + "step": 23894 + }, + { + "epoch": 30.673940949935815, + "grad_norm": 2.2357964515686035, + "learning_rate": 2.3112537441163887e-05, + "loss": 0.2166, + "step": 23895 + }, + { + "epoch": 30.675224646983313, + "grad_norm": 3.3775575160980225, + "learning_rate": 2.3112109542148052e-05, + "loss": 0.2783, + "step": 23896 + }, + { + "epoch": 30.676508344030808, + "grad_norm": 1.1141538619995117, + "learning_rate": 2.311168164313222e-05, + "loss": 0.3253, + "step": 23897 + }, + { + "epoch": 30.677792041078305, + "grad_norm": 0.9694616794586182, + "learning_rate": 2.311125374411639e-05, + "loss": 0.324, + "step": 23898 + }, + { + "epoch": 30.679075738125803, + "grad_norm": 2.2747325897216797, + "learning_rate": 2.3110825845100558e-05, + "loss": 0.2335, + "step": 23899 + }, + { + "epoch": 30.680359435173298, + "grad_norm": 0.9256809949874878, + "learning_rate": 2.3110397946084726e-05, + "loss": 0.2519, + "step": 23900 + }, + { + "epoch": 30.681643132220795, + "grad_norm": 1.0274780988693237, + "learning_rate": 2.310997004706889e-05, + "loss": 0.2463, + "step": 23901 + }, + { + "epoch": 30.682926829268293, + "grad_norm": 1.537113904953003, + "learning_rate": 2.310954214805306e-05, + "loss": 0.2634, + "step": 23902 + }, + { + "epoch": 30.68421052631579, + "grad_norm": 1.0888983011245728, + "learning_rate": 2.3109114249037228e-05, + "loss": 0.2551, + "step": 23903 + }, + { + "epoch": 30.685494223363285, + "grad_norm": 1.6842371225357056, + "learning_rate": 2.3108686350021393e-05, + "loss": 0.2835, + "step": 23904 + }, + { + "epoch": 30.686777920410783, + "grad_norm": 1.2306448221206665, + "learning_rate": 2.3108258451005565e-05, + "loss": 0.2797, + "step": 23905 + }, + { + "epoch": 
30.68806161745828, + "grad_norm": 1.5689290761947632, + "learning_rate": 2.310783055198973e-05, + "loss": 0.268, + "step": 23906 + }, + { + "epoch": 30.689345314505776, + "grad_norm": 1.032928228378296, + "learning_rate": 2.31074026529739e-05, + "loss": 0.312, + "step": 23907 + }, + { + "epoch": 30.690629011553273, + "grad_norm": 1.208009958267212, + "learning_rate": 2.3106974753958066e-05, + "loss": 0.3307, + "step": 23908 + }, + { + "epoch": 30.69191270860077, + "grad_norm": 3.490577459335327, + "learning_rate": 2.310654685494223e-05, + "loss": 0.2907, + "step": 23909 + }, + { + "epoch": 30.693196405648266, + "grad_norm": 0.9887654185295105, + "learning_rate": 2.3106118955926403e-05, + "loss": 0.2745, + "step": 23910 + }, + { + "epoch": 30.694480102695763, + "grad_norm": 1.2257598638534546, + "learning_rate": 2.310569105691057e-05, + "loss": 0.3025, + "step": 23911 + }, + { + "epoch": 30.69576379974326, + "grad_norm": 2.2743680477142334, + "learning_rate": 2.3105263157894737e-05, + "loss": 0.2841, + "step": 23912 + }, + { + "epoch": 30.69704749679076, + "grad_norm": 1.6735479831695557, + "learning_rate": 2.3104835258878905e-05, + "loss": 0.3071, + "step": 23913 + }, + { + "epoch": 30.698331193838253, + "grad_norm": 1.262426495552063, + "learning_rate": 2.3104407359863074e-05, + "loss": 0.2779, + "step": 23914 + }, + { + "epoch": 30.69961489088575, + "grad_norm": 1.2828360795974731, + "learning_rate": 2.3103979460847242e-05, + "loss": 0.3029, + "step": 23915 + }, + { + "epoch": 30.70089858793325, + "grad_norm": 2.606274127960205, + "learning_rate": 2.3103551561831407e-05, + "loss": 0.339, + "step": 23916 + }, + { + "epoch": 30.702182284980744, + "grad_norm": 1.360792875289917, + "learning_rate": 2.3103123662815575e-05, + "loss": 0.2938, + "step": 23917 + }, + { + "epoch": 30.70346598202824, + "grad_norm": 1.396729826927185, + "learning_rate": 2.3102695763799744e-05, + "loss": 0.3159, + "step": 23918 + }, + { + "epoch": 30.70474967907574, + "grad_norm": 
1.553117275238037, + "learning_rate": 2.3102267864783912e-05, + "loss": 0.3119, + "step": 23919 + }, + { + "epoch": 30.706033376123234, + "grad_norm": 4.214175224304199, + "learning_rate": 2.3101839965768077e-05, + "loss": 0.4436, + "step": 23920 + }, + { + "epoch": 30.70731707317073, + "grad_norm": 1.1940630674362183, + "learning_rate": 2.310141206675225e-05, + "loss": 0.3028, + "step": 23921 + }, + { + "epoch": 30.70860077021823, + "grad_norm": 1.013588309288025, + "learning_rate": 2.3100984167736414e-05, + "loss": 0.2848, + "step": 23922 + }, + { + "epoch": 30.709884467265724, + "grad_norm": 0.9032706022262573, + "learning_rate": 2.3100556268720582e-05, + "loss": 0.2718, + "step": 23923 + }, + { + "epoch": 30.71116816431322, + "grad_norm": 0.9327160716056824, + "learning_rate": 2.310012836970475e-05, + "loss": 0.289, + "step": 23924 + }, + { + "epoch": 30.71245186136072, + "grad_norm": 1.749951720237732, + "learning_rate": 2.3099700470688916e-05, + "loss": 0.2657, + "step": 23925 + }, + { + "epoch": 30.713735558408217, + "grad_norm": 1.120348334312439, + "learning_rate": 2.3099272571673088e-05, + "loss": 0.2639, + "step": 23926 + }, + { + "epoch": 30.71501925545571, + "grad_norm": 2.1071829795837402, + "learning_rate": 2.3098844672657253e-05, + "loss": 0.2828, + "step": 23927 + }, + { + "epoch": 30.71630295250321, + "grad_norm": 1.3628013134002686, + "learning_rate": 2.309841677364142e-05, + "loss": 0.2839, + "step": 23928 + }, + { + "epoch": 30.717586649550707, + "grad_norm": 0.9098655581474304, + "learning_rate": 2.309798887462559e-05, + "loss": 0.271, + "step": 23929 + }, + { + "epoch": 30.7188703465982, + "grad_norm": 0.7998248338699341, + "learning_rate": 2.3097560975609755e-05, + "loss": 0.2768, + "step": 23930 + }, + { + "epoch": 30.7201540436457, + "grad_norm": 1.0109853744506836, + "learning_rate": 2.3097133076593926e-05, + "loss": 0.2758, + "step": 23931 + }, + { + "epoch": 30.721437740693197, + "grad_norm": 0.9981739521026611, + "learning_rate": 
2.309670517757809e-05, + "loss": 0.291, + "step": 23932 + }, + { + "epoch": 30.72272143774069, + "grad_norm": 1.274535059928894, + "learning_rate": 2.309627727856226e-05, + "loss": 0.2804, + "step": 23933 + }, + { + "epoch": 30.72400513478819, + "grad_norm": 1.5425204038619995, + "learning_rate": 2.3095849379546428e-05, + "loss": 0.3131, + "step": 23934 + }, + { + "epoch": 30.725288831835687, + "grad_norm": 1.7127337455749512, + "learning_rate": 2.3095421480530597e-05, + "loss": 0.2741, + "step": 23935 + }, + { + "epoch": 30.726572528883185, + "grad_norm": 1.6300485134124756, + "learning_rate": 2.309499358151476e-05, + "loss": 0.2713, + "step": 23936 + }, + { + "epoch": 30.72785622593068, + "grad_norm": 0.8896158933639526, + "learning_rate": 2.309456568249893e-05, + "loss": 0.3123, + "step": 23937 + }, + { + "epoch": 30.729139922978177, + "grad_norm": 4.850986957550049, + "learning_rate": 2.30941377834831e-05, + "loss": 0.2647, + "step": 23938 + }, + { + "epoch": 30.730423620025675, + "grad_norm": 1.4454841613769531, + "learning_rate": 2.3093709884467267e-05, + "loss": 0.2614, + "step": 23939 + }, + { + "epoch": 30.73170731707317, + "grad_norm": 1.093679666519165, + "learning_rate": 2.3093281985451435e-05, + "loss": 0.2762, + "step": 23940 + }, + { + "epoch": 30.732991014120667, + "grad_norm": 0.8700023889541626, + "learning_rate": 2.30928540864356e-05, + "loss": 0.3068, + "step": 23941 + }, + { + "epoch": 30.734274711168165, + "grad_norm": 2.0565502643585205, + "learning_rate": 2.3092426187419772e-05, + "loss": 0.2944, + "step": 23942 + }, + { + "epoch": 30.73555840821566, + "grad_norm": 1.280987024307251, + "learning_rate": 2.3091998288403937e-05, + "loss": 0.2814, + "step": 23943 + }, + { + "epoch": 30.736842105263158, + "grad_norm": 1.0123683214187622, + "learning_rate": 2.3091570389388102e-05, + "loss": 0.2821, + "step": 23944 + }, + { + "epoch": 30.738125802310655, + "grad_norm": 1.4423459768295288, + "learning_rate": 2.3091142490372274e-05, + "loss": 0.29, + 
"step": 23945 + }, + { + "epoch": 30.739409499358153, + "grad_norm": 1.0380781888961792, + "learning_rate": 2.309071459135644e-05, + "loss": 0.3014, + "step": 23946 + }, + { + "epoch": 30.740693196405648, + "grad_norm": 0.7383925318717957, + "learning_rate": 2.309028669234061e-05, + "loss": 0.278, + "step": 23947 + }, + { + "epoch": 30.741976893453145, + "grad_norm": 1.4422334432601929, + "learning_rate": 2.3089858793324776e-05, + "loss": 0.2614, + "step": 23948 + }, + { + "epoch": 30.743260590500643, + "grad_norm": 1.0773406028747559, + "learning_rate": 2.3089430894308944e-05, + "loss": 0.2721, + "step": 23949 + }, + { + "epoch": 30.744544287548138, + "grad_norm": 2.731238603591919, + "learning_rate": 2.3089002995293113e-05, + "loss": 0.2778, + "step": 23950 + }, + { + "epoch": 30.745827984595635, + "grad_norm": 2.171494960784912, + "learning_rate": 2.3088575096277278e-05, + "loss": 0.2745, + "step": 23951 + }, + { + "epoch": 30.747111681643133, + "grad_norm": 1.0758486986160278, + "learning_rate": 2.3088147197261446e-05, + "loss": 0.2736, + "step": 23952 + }, + { + "epoch": 30.748395378690628, + "grad_norm": 1.1311299800872803, + "learning_rate": 2.3087719298245615e-05, + "loss": 0.2927, + "step": 23953 + }, + { + "epoch": 30.749679075738126, + "grad_norm": 1.0449317693710327, + "learning_rate": 2.3087291399229783e-05, + "loss": 0.2943, + "step": 23954 + }, + { + "epoch": 30.750962772785623, + "grad_norm": 1.6379410028457642, + "learning_rate": 2.308686350021395e-05, + "loss": 0.3286, + "step": 23955 + }, + { + "epoch": 30.752246469833118, + "grad_norm": 1.3195322751998901, + "learning_rate": 2.308643560119812e-05, + "loss": 0.3154, + "step": 23956 + }, + { + "epoch": 30.753530166880616, + "grad_norm": 1.16964852809906, + "learning_rate": 2.3086007702182285e-05, + "loss": 0.2751, + "step": 23957 + }, + { + "epoch": 30.754813863928113, + "grad_norm": 2.273449420928955, + "learning_rate": 2.3085579803166453e-05, + "loss": 0.2874, + "step": 23958 + }, + { + "epoch": 
30.75609756097561, + "grad_norm": 1.829638957977295, + "learning_rate": 2.308515190415062e-05, + "loss": 0.2968, + "step": 23959 + }, + { + "epoch": 30.757381258023106, + "grad_norm": 2.4658079147338867, + "learning_rate": 2.3084724005134787e-05, + "loss": 0.2985, + "step": 23960 + }, + { + "epoch": 30.758664955070603, + "grad_norm": 1.0723971128463745, + "learning_rate": 2.308429610611896e-05, + "loss": 0.327, + "step": 23961 + }, + { + "epoch": 30.7599486521181, + "grad_norm": 1.9709850549697876, + "learning_rate": 2.3083868207103123e-05, + "loss": 0.3345, + "step": 23962 + }, + { + "epoch": 30.761232349165596, + "grad_norm": 1.236933946609497, + "learning_rate": 2.3083440308087292e-05, + "loss": 0.2767, + "step": 23963 + }, + { + "epoch": 30.762516046213094, + "grad_norm": 2.7836241722106934, + "learning_rate": 2.308301240907146e-05, + "loss": 0.3102, + "step": 23964 + }, + { + "epoch": 30.76379974326059, + "grad_norm": 2.175333261489868, + "learning_rate": 2.3082584510055625e-05, + "loss": 0.3415, + "step": 23965 + }, + { + "epoch": 30.765083440308086, + "grad_norm": 1.0911521911621094, + "learning_rate": 2.3082156611039797e-05, + "loss": 0.3229, + "step": 23966 + }, + { + "epoch": 30.766367137355584, + "grad_norm": 1.2427260875701904, + "learning_rate": 2.3081728712023962e-05, + "loss": 0.3182, + "step": 23967 + }, + { + "epoch": 30.76765083440308, + "grad_norm": 2.2033097743988037, + "learning_rate": 2.308130081300813e-05, + "loss": 0.3864, + "step": 23968 + }, + { + "epoch": 30.76893453145058, + "grad_norm": 1.9426037073135376, + "learning_rate": 2.30808729139923e-05, + "loss": 0.4221, + "step": 23969 + }, + { + "epoch": 30.770218228498074, + "grad_norm": 3.5072572231292725, + "learning_rate": 2.3080445014976464e-05, + "loss": 0.4517, + "step": 23970 + }, + { + "epoch": 30.77150192554557, + "grad_norm": 1.245342493057251, + "learning_rate": 2.3080017115960636e-05, + "loss": 0.2767, + "step": 23971 + }, + { + "epoch": 30.77278562259307, + "grad_norm": 
1.76409113407135, + "learning_rate": 2.30795892169448e-05, + "loss": 0.2786, + "step": 23972 + }, + { + "epoch": 30.774069319640564, + "grad_norm": 1.9881340265274048, + "learning_rate": 2.307916131792897e-05, + "loss": 0.2783, + "step": 23973 + }, + { + "epoch": 30.77535301668806, + "grad_norm": 0.9784388542175293, + "learning_rate": 2.3078733418913138e-05, + "loss": 0.2961, + "step": 23974 + }, + { + "epoch": 30.77663671373556, + "grad_norm": 1.0202693939208984, + "learning_rate": 2.3078305519897306e-05, + "loss": 0.2985, + "step": 23975 + }, + { + "epoch": 30.777920410783054, + "grad_norm": 0.9261816143989563, + "learning_rate": 2.307787762088147e-05, + "loss": 0.2845, + "step": 23976 + }, + { + "epoch": 30.77920410783055, + "grad_norm": 0.8294530510902405, + "learning_rate": 2.307744972186564e-05, + "loss": 0.2773, + "step": 23977 + }, + { + "epoch": 30.78048780487805, + "grad_norm": 0.9428588151931763, + "learning_rate": 2.3077021822849808e-05, + "loss": 0.2447, + "step": 23978 + }, + { + "epoch": 30.781771501925547, + "grad_norm": 1.8498371839523315, + "learning_rate": 2.3076593923833976e-05, + "loss": 0.2456, + "step": 23979 + }, + { + "epoch": 30.78305519897304, + "grad_norm": 0.7816689014434814, + "learning_rate": 2.3076166024818145e-05, + "loss": 0.3048, + "step": 23980 + }, + { + "epoch": 30.78433889602054, + "grad_norm": 1.3078945875167847, + "learning_rate": 2.307573812580231e-05, + "loss": 0.3197, + "step": 23981 + }, + { + "epoch": 30.785622593068037, + "grad_norm": 1.5410743951797485, + "learning_rate": 2.307531022678648e-05, + "loss": 0.297, + "step": 23982 + }, + { + "epoch": 30.78690629011553, + "grad_norm": 1.1693871021270752, + "learning_rate": 2.3074882327770647e-05, + "loss": 0.2746, + "step": 23983 + }, + { + "epoch": 30.78818998716303, + "grad_norm": 0.7912006974220276, + "learning_rate": 2.307445442875481e-05, + "loss": 0.3063, + "step": 23984 + }, + { + "epoch": 30.789473684210527, + "grad_norm": 1.671161413192749, + "learning_rate": 
2.3074026529738983e-05, + "loss": 0.3146, + "step": 23985 + }, + { + "epoch": 30.79075738125802, + "grad_norm": 1.203984260559082, + "learning_rate": 2.307359863072315e-05, + "loss": 0.3171, + "step": 23986 + }, + { + "epoch": 30.79204107830552, + "grad_norm": 1.160668969154358, + "learning_rate": 2.307317073170732e-05, + "loss": 0.2777, + "step": 23987 + }, + { + "epoch": 30.793324775353017, + "grad_norm": 0.9176746010780334, + "learning_rate": 2.3072742832691485e-05, + "loss": 0.3056, + "step": 23988 + }, + { + "epoch": 30.794608472400512, + "grad_norm": 1.0935914516448975, + "learning_rate": 2.3072314933675654e-05, + "loss": 0.2652, + "step": 23989 + }, + { + "epoch": 30.79589216944801, + "grad_norm": 0.9319625496864319, + "learning_rate": 2.3071887034659822e-05, + "loss": 0.29, + "step": 23990 + }, + { + "epoch": 30.797175866495508, + "grad_norm": 3.4157605171203613, + "learning_rate": 2.3071459135643987e-05, + "loss": 0.2573, + "step": 23991 + }, + { + "epoch": 30.798459563543005, + "grad_norm": 1.9895168542861938, + "learning_rate": 2.3071031236628155e-05, + "loss": 0.2678, + "step": 23992 + }, + { + "epoch": 30.7997432605905, + "grad_norm": 1.113601803779602, + "learning_rate": 2.3070603337612324e-05, + "loss": 0.2917, + "step": 23993 + }, + { + "epoch": 30.801026957637998, + "grad_norm": 1.3468101024627686, + "learning_rate": 2.3070175438596492e-05, + "loss": 0.2886, + "step": 23994 + }, + { + "epoch": 30.802310654685495, + "grad_norm": 0.8893680572509766, + "learning_rate": 2.306974753958066e-05, + "loss": 0.287, + "step": 23995 + }, + { + "epoch": 30.80359435173299, + "grad_norm": 0.9702313542366028, + "learning_rate": 2.306931964056483e-05, + "loss": 0.2867, + "step": 23996 + }, + { + "epoch": 30.804878048780488, + "grad_norm": 1.5984781980514526, + "learning_rate": 2.3068891741548994e-05, + "loss": 0.2595, + "step": 23997 + }, + { + "epoch": 30.806161745827985, + "grad_norm": 1.2543375492095947, + "learning_rate": 2.3068463842533163e-05, + "loss": 
0.2739, + "step": 23998 + }, + { + "epoch": 30.80744544287548, + "grad_norm": 1.72915780544281, + "learning_rate": 2.306803594351733e-05, + "loss": 0.305, + "step": 23999 + }, + { + "epoch": 30.808729139922978, + "grad_norm": 0.9267470240592957, + "learning_rate": 2.3067608044501496e-05, + "loss": 0.2933, + "step": 24000 + }, + { + "epoch": 30.808729139922978, + "eval_cer": 0.26617877423144704, + "eval_loss": 0.4563054144382477, + "eval_runtime": 13.8811, + "eval_samples_per_second": 70.816, + "eval_steps_per_second": 0.504, + "eval_wer": 0.44531070855308413, + "step": 24000 + }, + { + "epoch": 30.810012836970476, + "grad_norm": 1.120751142501831, + "learning_rate": 2.3067180145485668e-05, + "loss": 0.2951, + "step": 24001 + }, + { + "epoch": 30.811296534017973, + "grad_norm": 2.0492498874664307, + "learning_rate": 2.3066752246469833e-05, + "loss": 0.3152, + "step": 24002 + }, + { + "epoch": 30.812580231065468, + "grad_norm": 0.7226009964942932, + "learning_rate": 2.3066324347454005e-05, + "loss": 0.2599, + "step": 24003 + }, + { + "epoch": 30.813863928112966, + "grad_norm": 1.1725550889968872, + "learning_rate": 2.306589644843817e-05, + "loss": 0.2775, + "step": 24004 + }, + { + "epoch": 30.815147625160463, + "grad_norm": 2.083989143371582, + "learning_rate": 2.3065468549422335e-05, + "loss": 0.2902, + "step": 24005 + }, + { + "epoch": 30.816431322207958, + "grad_norm": 1.1655457019805908, + "learning_rate": 2.3065040650406506e-05, + "loss": 0.2599, + "step": 24006 + }, + { + "epoch": 30.817715019255456, + "grad_norm": 1.067756175994873, + "learning_rate": 2.306461275139067e-05, + "loss": 0.2866, + "step": 24007 + }, + { + "epoch": 30.818998716302954, + "grad_norm": 1.8020344972610474, + "learning_rate": 2.306418485237484e-05, + "loss": 0.286, + "step": 24008 + }, + { + "epoch": 30.820282413350448, + "grad_norm": 1.011031985282898, + "learning_rate": 2.306375695335901e-05, + "loss": 0.3111, + "step": 24009 + }, + { + "epoch": 30.821566110397946, + "grad_norm": 
1.5213960409164429, + "learning_rate": 2.3063329054343177e-05, + "loss": 0.3298, + "step": 24010 + }, + { + "epoch": 30.822849807445444, + "grad_norm": 1.0784341096878052, + "learning_rate": 2.3062901155327342e-05, + "loss": 0.3054, + "step": 24011 + }, + { + "epoch": 30.82413350449294, + "grad_norm": 1.5417932271957397, + "learning_rate": 2.306247325631151e-05, + "loss": 0.3077, + "step": 24012 + }, + { + "epoch": 30.825417201540436, + "grad_norm": 2.8045737743377686, + "learning_rate": 2.306204535729568e-05, + "loss": 0.3078, + "step": 24013 + }, + { + "epoch": 30.826700898587934, + "grad_norm": 1.574450969696045, + "learning_rate": 2.3061617458279847e-05, + "loss": 0.2679, + "step": 24014 + }, + { + "epoch": 30.82798459563543, + "grad_norm": 1.1649892330169678, + "learning_rate": 2.3061189559264015e-05, + "loss": 0.2863, + "step": 24015 + }, + { + "epoch": 30.829268292682926, + "grad_norm": 2.1576476097106934, + "learning_rate": 2.306076166024818e-05, + "loss": 0.3135, + "step": 24016 + }, + { + "epoch": 30.830551989730424, + "grad_norm": 3.602806568145752, + "learning_rate": 2.3060333761232352e-05, + "loss": 0.3305, + "step": 24017 + }, + { + "epoch": 30.83183568677792, + "grad_norm": 10.851988792419434, + "learning_rate": 2.3059905862216517e-05, + "loss": 0.3325, + "step": 24018 + }, + { + "epoch": 30.833119383825416, + "grad_norm": 2.2864267826080322, + "learning_rate": 2.3059477963200682e-05, + "loss": 0.3838, + "step": 24019 + }, + { + "epoch": 30.834403080872914, + "grad_norm": 3.722123622894287, + "learning_rate": 2.3059050064184854e-05, + "loss": 0.4288, + "step": 24020 + }, + { + "epoch": 30.83568677792041, + "grad_norm": 2.0779545307159424, + "learning_rate": 2.305862216516902e-05, + "loss": 0.2728, + "step": 24021 + }, + { + "epoch": 30.836970474967906, + "grad_norm": 0.7734099626541138, + "learning_rate": 2.305819426615319e-05, + "loss": 0.2847, + "step": 24022 + }, + { + "epoch": 30.838254172015404, + "grad_norm": 1.0278968811035156, + 
"learning_rate": 2.3057766367137356e-05, + "loss": 0.265, + "step": 24023 + }, + { + "epoch": 30.8395378690629, + "grad_norm": 0.8723217248916626, + "learning_rate": 2.3057338468121524e-05, + "loss": 0.3438, + "step": 24024 + }, + { + "epoch": 30.8408215661104, + "grad_norm": 3.2759952545166016, + "learning_rate": 2.3056910569105693e-05, + "loss": 0.2922, + "step": 24025 + }, + { + "epoch": 30.842105263157894, + "grad_norm": 1.4535596370697021, + "learning_rate": 2.3056482670089858e-05, + "loss": 0.2968, + "step": 24026 + }, + { + "epoch": 30.84338896020539, + "grad_norm": 2.3849594593048096, + "learning_rate": 2.3056054771074026e-05, + "loss": 0.2652, + "step": 24027 + }, + { + "epoch": 30.84467265725289, + "grad_norm": 0.8531550765037537, + "learning_rate": 2.3055626872058195e-05, + "loss": 0.2896, + "step": 24028 + }, + { + "epoch": 30.845956354300384, + "grad_norm": 1.5728914737701416, + "learning_rate": 2.3055198973042363e-05, + "loss": 0.3012, + "step": 24029 + }, + { + "epoch": 30.84724005134788, + "grad_norm": 1.3953498601913452, + "learning_rate": 2.305477107402653e-05, + "loss": 0.3037, + "step": 24030 + }, + { + "epoch": 30.84852374839538, + "grad_norm": 2.0165419578552246, + "learning_rate": 2.3054343175010696e-05, + "loss": 0.3217, + "step": 24031 + }, + { + "epoch": 30.849807445442874, + "grad_norm": 2.4122118949890137, + "learning_rate": 2.3053915275994865e-05, + "loss": 0.3025, + "step": 24032 + }, + { + "epoch": 30.85109114249037, + "grad_norm": 1.1541727781295776, + "learning_rate": 2.3053487376979033e-05, + "loss": 0.307, + "step": 24033 + }, + { + "epoch": 30.85237483953787, + "grad_norm": 0.8997744917869568, + "learning_rate": 2.30530594779632e-05, + "loss": 0.2834, + "step": 24034 + }, + { + "epoch": 30.853658536585368, + "grad_norm": 1.1995270252227783, + "learning_rate": 2.3052631578947367e-05, + "loss": 0.2727, + "step": 24035 + }, + { + "epoch": 30.854942233632862, + "grad_norm": 1.1833534240722656, + "learning_rate": 
2.305220367993154e-05, + "loss": 0.2785, + "step": 24036 + }, + { + "epoch": 30.85622593068036, + "grad_norm": 0.8382864594459534, + "learning_rate": 2.3051775780915704e-05, + "loss": 0.2654, + "step": 24037 + }, + { + "epoch": 30.857509627727858, + "grad_norm": 1.0945264101028442, + "learning_rate": 2.3051347881899872e-05, + "loss": 0.276, + "step": 24038 + }, + { + "epoch": 30.858793324775352, + "grad_norm": 1.0241910219192505, + "learning_rate": 2.305091998288404e-05, + "loss": 0.2766, + "step": 24039 + }, + { + "epoch": 30.86007702182285, + "grad_norm": 2.0009374618530273, + "learning_rate": 2.3050492083868205e-05, + "loss": 0.2583, + "step": 24040 + }, + { + "epoch": 30.861360718870348, + "grad_norm": 1.140234112739563, + "learning_rate": 2.3050064184852377e-05, + "loss": 0.2791, + "step": 24041 + }, + { + "epoch": 30.862644415917842, + "grad_norm": 2.381824254989624, + "learning_rate": 2.3049636285836542e-05, + "loss": 0.2683, + "step": 24042 + }, + { + "epoch": 30.86392811296534, + "grad_norm": 4.196374416351318, + "learning_rate": 2.304920838682071e-05, + "loss": 0.2814, + "step": 24043 + }, + { + "epoch": 30.865211810012838, + "grad_norm": 1.3084843158721924, + "learning_rate": 2.304878048780488e-05, + "loss": 0.3209, + "step": 24044 + }, + { + "epoch": 30.866495507060336, + "grad_norm": 1.8615342378616333, + "learning_rate": 2.3048352588789044e-05, + "loss": 0.3243, + "step": 24045 + }, + { + "epoch": 30.86777920410783, + "grad_norm": 0.8643006682395935, + "learning_rate": 2.3047924689773216e-05, + "loss": 0.2784, + "step": 24046 + }, + { + "epoch": 30.869062901155328, + "grad_norm": 3.29240083694458, + "learning_rate": 2.304749679075738e-05, + "loss": 0.2457, + "step": 24047 + }, + { + "epoch": 30.870346598202826, + "grad_norm": 1.0117981433868408, + "learning_rate": 2.304706889174155e-05, + "loss": 0.2445, + "step": 24048 + }, + { + "epoch": 30.87163029525032, + "grad_norm": 2.337400197982788, + "learning_rate": 2.3046640992725718e-05, + "loss": 0.2796, 
+ "step": 24049 + }, + { + "epoch": 30.872913992297818, + "grad_norm": 1.531105637550354, + "learning_rate": 2.3046213093709886e-05, + "loss": 0.2859, + "step": 24050 + }, + { + "epoch": 30.874197689345316, + "grad_norm": 1.0265806913375854, + "learning_rate": 2.304578519469405e-05, + "loss": 0.2572, + "step": 24051 + }, + { + "epoch": 30.87548138639281, + "grad_norm": 1.1195406913757324, + "learning_rate": 2.304535729567822e-05, + "loss": 0.29, + "step": 24052 + }, + { + "epoch": 30.876765083440308, + "grad_norm": 1.306898832321167, + "learning_rate": 2.3044929396662388e-05, + "loss": 0.2925, + "step": 24053 + }, + { + "epoch": 30.878048780487806, + "grad_norm": 1.567277431488037, + "learning_rate": 2.3044501497646556e-05, + "loss": 0.2698, + "step": 24054 + }, + { + "epoch": 30.8793324775353, + "grad_norm": 1.4480915069580078, + "learning_rate": 2.3044073598630725e-05, + "loss": 0.2986, + "step": 24055 + }, + { + "epoch": 30.880616174582798, + "grad_norm": 0.9437805414199829, + "learning_rate": 2.304364569961489e-05, + "loss": 0.2716, + "step": 24056 + }, + { + "epoch": 30.881899871630296, + "grad_norm": 1.3974076509475708, + "learning_rate": 2.304321780059906e-05, + "loss": 0.2916, + "step": 24057 + }, + { + "epoch": 30.883183568677794, + "grad_norm": 1.040382981300354, + "learning_rate": 2.3042789901583227e-05, + "loss": 0.2909, + "step": 24058 + }, + { + "epoch": 30.884467265725288, + "grad_norm": 3.3236188888549805, + "learning_rate": 2.304236200256739e-05, + "loss": 0.2812, + "step": 24059 + }, + { + "epoch": 30.885750962772786, + "grad_norm": 2.1757845878601074, + "learning_rate": 2.3041934103551563e-05, + "loss": 0.2639, + "step": 24060 + }, + { + "epoch": 30.887034659820284, + "grad_norm": 1.650761365890503, + "learning_rate": 2.304150620453573e-05, + "loss": 0.2972, + "step": 24061 + }, + { + "epoch": 30.888318356867778, + "grad_norm": 2.872159481048584, + "learning_rate": 2.30410783055199e-05, + "loss": 0.3381, + "step": 24062 + }, + { + "epoch": 
30.889602053915276, + "grad_norm": 1.404099941253662, + "learning_rate": 2.3040650406504065e-05, + "loss": 0.2885, + "step": 24063 + }, + { + "epoch": 30.890885750962774, + "grad_norm": 1.4698814153671265, + "learning_rate": 2.3040222507488234e-05, + "loss": 0.3307, + "step": 24064 + }, + { + "epoch": 30.892169448010268, + "grad_norm": 2.0626471042633057, + "learning_rate": 2.3039794608472402e-05, + "loss": 0.292, + "step": 24065 + }, + { + "epoch": 30.893453145057766, + "grad_norm": 1.391502857208252, + "learning_rate": 2.3039366709456567e-05, + "loss": 0.308, + "step": 24066 + }, + { + "epoch": 30.894736842105264, + "grad_norm": 3.9772307872772217, + "learning_rate": 2.3038938810440736e-05, + "loss": 0.3236, + "step": 24067 + }, + { + "epoch": 30.89602053915276, + "grad_norm": 1.6950666904449463, + "learning_rate": 2.3038510911424904e-05, + "loss": 0.3411, + "step": 24068 + }, + { + "epoch": 30.897304236200256, + "grad_norm": 1.9536489248275757, + "learning_rate": 2.3038083012409072e-05, + "loss": 0.4132, + "step": 24069 + }, + { + "epoch": 30.898587933247754, + "grad_norm": 2.361800193786621, + "learning_rate": 2.303765511339324e-05, + "loss": 0.463, + "step": 24070 + }, + { + "epoch": 30.89987163029525, + "grad_norm": 1.2409427165985107, + "learning_rate": 2.303722721437741e-05, + "loss": 0.2949, + "step": 24071 + }, + { + "epoch": 30.901155327342746, + "grad_norm": 0.9985019564628601, + "learning_rate": 2.3036799315361574e-05, + "loss": 0.2783, + "step": 24072 + }, + { + "epoch": 30.902439024390244, + "grad_norm": 0.8439863920211792, + "learning_rate": 2.3036371416345743e-05, + "loss": 0.2942, + "step": 24073 + }, + { + "epoch": 30.90372272143774, + "grad_norm": 2.539687395095825, + "learning_rate": 2.303594351732991e-05, + "loss": 0.3004, + "step": 24074 + }, + { + "epoch": 30.905006418485236, + "grad_norm": 1.5885621309280396, + "learning_rate": 2.3035515618314076e-05, + "loss": 0.2817, + "step": 24075 + }, + { + "epoch": 30.906290115532734, + "grad_norm": 
0.9459617137908936, + "learning_rate": 2.3035087719298248e-05, + "loss": 0.2943, + "step": 24076 + }, + { + "epoch": 30.90757381258023, + "grad_norm": 0.8632376194000244, + "learning_rate": 2.3034659820282413e-05, + "loss": 0.2983, + "step": 24077 + }, + { + "epoch": 30.90885750962773, + "grad_norm": 1.464884877204895, + "learning_rate": 2.3034231921266585e-05, + "loss": 0.2867, + "step": 24078 + }, + { + "epoch": 30.910141206675224, + "grad_norm": 0.9939081072807312, + "learning_rate": 2.303380402225075e-05, + "loss": 0.285, + "step": 24079 + }, + { + "epoch": 30.911424903722722, + "grad_norm": 0.8830277323722839, + "learning_rate": 2.3033376123234915e-05, + "loss": 0.3112, + "step": 24080 + }, + { + "epoch": 30.91270860077022, + "grad_norm": 1.2466673851013184, + "learning_rate": 2.3032948224219087e-05, + "loss": 0.2854, + "step": 24081 + }, + { + "epoch": 30.913992297817714, + "grad_norm": 1.5991188287734985, + "learning_rate": 2.303252032520325e-05, + "loss": 0.2773, + "step": 24082 + }, + { + "epoch": 30.915275994865212, + "grad_norm": 0.940104067325592, + "learning_rate": 2.303209242618742e-05, + "loss": 0.2836, + "step": 24083 + }, + { + "epoch": 30.91655969191271, + "grad_norm": 2.453810930252075, + "learning_rate": 2.303166452717159e-05, + "loss": 0.3064, + "step": 24084 + }, + { + "epoch": 30.917843388960204, + "grad_norm": 3.2438061237335205, + "learning_rate": 2.3031236628155757e-05, + "loss": 0.2907, + "step": 24085 + }, + { + "epoch": 30.919127086007702, + "grad_norm": 0.9031880497932434, + "learning_rate": 2.3030808729139925e-05, + "loss": 0.2886, + "step": 24086 + }, + { + "epoch": 30.9204107830552, + "grad_norm": 0.9307829141616821, + "learning_rate": 2.303038083012409e-05, + "loss": 0.3106, + "step": 24087 + }, + { + "epoch": 30.921694480102694, + "grad_norm": 5.1050705909729, + "learning_rate": 2.302995293110826e-05, + "loss": 0.2494, + "step": 24088 + }, + { + "epoch": 30.922978177150192, + "grad_norm": 2.6380155086517334, + "learning_rate": 
2.3029525032092427e-05, + "loss": 0.2708, + "step": 24089 + }, + { + "epoch": 30.92426187419769, + "grad_norm": 1.9151484966278076, + "learning_rate": 2.3029097133076595e-05, + "loss": 0.2783, + "step": 24090 + }, + { + "epoch": 30.925545571245188, + "grad_norm": 1.0791188478469849, + "learning_rate": 2.302866923406076e-05, + "loss": 0.2686, + "step": 24091 + }, + { + "epoch": 30.926829268292682, + "grad_norm": 1.2887779474258423, + "learning_rate": 2.302824133504493e-05, + "loss": 0.264, + "step": 24092 + }, + { + "epoch": 30.92811296534018, + "grad_norm": 2.504800796508789, + "learning_rate": 2.3027813436029097e-05, + "loss": 0.3161, + "step": 24093 + }, + { + "epoch": 30.929396662387678, + "grad_norm": 2.8038904666900635, + "learning_rate": 2.3027385537013266e-05, + "loss": 0.2926, + "step": 24094 + }, + { + "epoch": 30.930680359435172, + "grad_norm": 1.0747593641281128, + "learning_rate": 2.3026957637997434e-05, + "loss": 0.2817, + "step": 24095 + }, + { + "epoch": 30.93196405648267, + "grad_norm": 1.892763614654541, + "learning_rate": 2.30265297389816e-05, + "loss": 0.2718, + "step": 24096 + }, + { + "epoch": 30.933247753530168, + "grad_norm": 0.878660261631012, + "learning_rate": 2.302610183996577e-05, + "loss": 0.2783, + "step": 24097 + }, + { + "epoch": 30.934531450577662, + "grad_norm": 0.8982719779014587, + "learning_rate": 2.3025673940949936e-05, + "loss": 0.2644, + "step": 24098 + }, + { + "epoch": 30.93581514762516, + "grad_norm": 1.0425959825515747, + "learning_rate": 2.30252460419341e-05, + "loss": 0.2896, + "step": 24099 + }, + { + "epoch": 30.937098844672658, + "grad_norm": 0.8715181946754456, + "learning_rate": 2.3024818142918273e-05, + "loss": 0.2641, + "step": 24100 + }, + { + "epoch": 30.938382541720156, + "grad_norm": 3.628230571746826, + "learning_rate": 2.3024390243902438e-05, + "loss": 0.2993, + "step": 24101 + }, + { + "epoch": 30.93966623876765, + "grad_norm": 1.713996171951294, + "learning_rate": 2.302396234488661e-05, + "loss": 0.2577, 
+ "step": 24102 + }, + { + "epoch": 30.940949935815148, + "grad_norm": 1.948032021522522, + "learning_rate": 2.3023534445870775e-05, + "loss": 0.2906, + "step": 24103 + }, + { + "epoch": 30.942233632862646, + "grad_norm": 2.1450350284576416, + "learning_rate": 2.3023106546854943e-05, + "loss": 0.2865, + "step": 24104 + }, + { + "epoch": 30.94351732991014, + "grad_norm": 1.481985330581665, + "learning_rate": 2.302267864783911e-05, + "loss": 0.2688, + "step": 24105 + }, + { + "epoch": 30.944801026957638, + "grad_norm": 1.133149266242981, + "learning_rate": 2.3022250748823276e-05, + "loss": 0.2861, + "step": 24106 + }, + { + "epoch": 30.946084724005136, + "grad_norm": 1.5148981809616089, + "learning_rate": 2.3021822849807445e-05, + "loss": 0.2391, + "step": 24107 + }, + { + "epoch": 30.94736842105263, + "grad_norm": 1.327162504196167, + "learning_rate": 2.3021394950791613e-05, + "loss": 0.2948, + "step": 24108 + }, + { + "epoch": 30.948652118100128, + "grad_norm": 2.6722233295440674, + "learning_rate": 2.3020967051775782e-05, + "loss": 0.2731, + "step": 24109 + }, + { + "epoch": 30.949935815147626, + "grad_norm": 1.7352761030197144, + "learning_rate": 2.302053915275995e-05, + "loss": 0.2678, + "step": 24110 + }, + { + "epoch": 30.951219512195124, + "grad_norm": 2.513226270675659, + "learning_rate": 2.302011125374412e-05, + "loss": 0.3229, + "step": 24111 + }, + { + "epoch": 30.952503209242618, + "grad_norm": 1.6422749757766724, + "learning_rate": 2.3019683354728284e-05, + "loss": 0.3264, + "step": 24112 + }, + { + "epoch": 30.953786906290116, + "grad_norm": 1.743256688117981, + "learning_rate": 2.3019255455712452e-05, + "loss": 0.2785, + "step": 24113 + }, + { + "epoch": 30.955070603337614, + "grad_norm": 1.433147668838501, + "learning_rate": 2.301882755669662e-05, + "loss": 0.2737, + "step": 24114 + }, + { + "epoch": 30.956354300385108, + "grad_norm": 1.999433159828186, + "learning_rate": 2.3018399657680785e-05, + "loss": 0.2884, + "step": 24115 + }, + { + "epoch": 
30.957637997432606, + "grad_norm": 1.6307991743087769, + "learning_rate": 2.3017971758664957e-05, + "loss": 0.2951, + "step": 24116 + }, + { + "epoch": 30.958921694480104, + "grad_norm": 1.4241993427276611, + "learning_rate": 2.3017543859649122e-05, + "loss": 0.3487, + "step": 24117 + }, + { + "epoch": 30.960205391527598, + "grad_norm": 2.035835027694702, + "learning_rate": 2.3017115960633294e-05, + "loss": 0.3514, + "step": 24118 + }, + { + "epoch": 30.961489088575096, + "grad_norm": 3.100935220718384, + "learning_rate": 2.301668806161746e-05, + "loss": 0.3545, + "step": 24119 + }, + { + "epoch": 30.962772785622594, + "grad_norm": 3.3045849800109863, + "learning_rate": 2.3016260162601624e-05, + "loss": 0.5005, + "step": 24120 + }, + { + "epoch": 30.964056482670088, + "grad_norm": 1.401690125465393, + "learning_rate": 2.3015832263585796e-05, + "loss": 0.2638, + "step": 24121 + }, + { + "epoch": 30.965340179717586, + "grad_norm": 1.6614083051681519, + "learning_rate": 2.301540436456996e-05, + "loss": 0.2886, + "step": 24122 + }, + { + "epoch": 30.966623876765084, + "grad_norm": 0.9009038209915161, + "learning_rate": 2.301497646555413e-05, + "loss": 0.259, + "step": 24123 + }, + { + "epoch": 30.96790757381258, + "grad_norm": 1.0600122213363647, + "learning_rate": 2.3014548566538298e-05, + "loss": 0.2812, + "step": 24124 + }, + { + "epoch": 30.969191270860076, + "grad_norm": 0.7720555663108826, + "learning_rate": 2.3014120667522466e-05, + "loss": 0.292, + "step": 24125 + }, + { + "epoch": 30.970474967907574, + "grad_norm": 1.0948556661605835, + "learning_rate": 2.3013692768506635e-05, + "loss": 0.2913, + "step": 24126 + }, + { + "epoch": 30.971758664955072, + "grad_norm": 1.4827710390090942, + "learning_rate": 2.30132648694908e-05, + "loss": 0.2942, + "step": 24127 + }, + { + "epoch": 30.973042362002566, + "grad_norm": 0.9117857217788696, + "learning_rate": 2.3012836970474968e-05, + "loss": 0.2953, + "step": 24128 + }, + { + "epoch": 30.974326059050064, + "grad_norm": 
2.0287563800811768, + "learning_rate": 2.3012409071459136e-05, + "loss": 0.2915, + "step": 24129 + }, + { + "epoch": 30.975609756097562, + "grad_norm": 1.4567383527755737, + "learning_rate": 2.3011981172443305e-05, + "loss": 0.2986, + "step": 24130 + }, + { + "epoch": 30.976893453145056, + "grad_norm": 1.4641691446304321, + "learning_rate": 2.301155327342747e-05, + "loss": 0.2908, + "step": 24131 + }, + { + "epoch": 30.978177150192554, + "grad_norm": 4.909310817718506, + "learning_rate": 2.301112537441164e-05, + "loss": 0.2695, + "step": 24132 + }, + { + "epoch": 30.979460847240052, + "grad_norm": 0.9921309351921082, + "learning_rate": 2.3010697475395807e-05, + "loss": 0.2908, + "step": 24133 + }, + { + "epoch": 30.98074454428755, + "grad_norm": 1.3569415807724, + "learning_rate": 2.3010269576379975e-05, + "loss": 0.2716, + "step": 24134 + }, + { + "epoch": 30.982028241335044, + "grad_norm": 2.3838753700256348, + "learning_rate": 2.3009841677364143e-05, + "loss": 0.2818, + "step": 24135 + }, + { + "epoch": 30.983311938382542, + "grad_norm": 4.344663143157959, + "learning_rate": 2.300941377834831e-05, + "loss": 0.2988, + "step": 24136 + }, + { + "epoch": 30.98459563543004, + "grad_norm": 1.30229914188385, + "learning_rate": 2.300898587933248e-05, + "loss": 0.2829, + "step": 24137 + }, + { + "epoch": 30.985879332477534, + "grad_norm": 2.933413505554199, + "learning_rate": 2.3008557980316645e-05, + "loss": 0.2895, + "step": 24138 + }, + { + "epoch": 30.987163029525032, + "grad_norm": 0.9442101120948792, + "learning_rate": 2.3008130081300814e-05, + "loss": 0.3114, + "step": 24139 + }, + { + "epoch": 30.98844672657253, + "grad_norm": 1.3758878707885742, + "learning_rate": 2.3007702182284982e-05, + "loss": 0.2786, + "step": 24140 + }, + { + "epoch": 30.989730423620024, + "grad_norm": 1.3785651922225952, + "learning_rate": 2.3007274283269147e-05, + "loss": 0.2618, + "step": 24141 + }, + { + "epoch": 30.991014120667522, + "grad_norm": 2.0637705326080322, + "learning_rate": 
2.300684638425332e-05, + "loss": 0.3046, + "step": 24142 + }, + { + "epoch": 30.99229781771502, + "grad_norm": 4.505009651184082, + "learning_rate": 2.3006418485237484e-05, + "loss": 0.2593, + "step": 24143 + }, + { + "epoch": 30.993581514762518, + "grad_norm": 2.359455108642578, + "learning_rate": 2.3005990586221652e-05, + "loss": 0.3223, + "step": 24144 + }, + { + "epoch": 30.994865211810012, + "grad_norm": 1.2324939966201782, + "learning_rate": 2.300556268720582e-05, + "loss": 0.3012, + "step": 24145 + }, + { + "epoch": 30.99614890885751, + "grad_norm": 1.5892009735107422, + "learning_rate": 2.300513478818999e-05, + "loss": 0.3537, + "step": 24146 + }, + { + "epoch": 30.997432605905008, + "grad_norm": 1.372489333152771, + "learning_rate": 2.3004706889174154e-05, + "loss": 0.3075, + "step": 24147 + }, + { + "epoch": 30.998716302952502, + "grad_norm": 3.5688116550445557, + "learning_rate": 2.3004278990158323e-05, + "loss": 0.3772, + "step": 24148 + }, + { + "epoch": 31.0, + "grad_norm": 2.956050395965576, + "learning_rate": 2.300385109114249e-05, + "loss": 0.3961, + "step": 24149 + }, + { + "epoch": 31.001283697047498, + "grad_norm": 0.9571095705032349, + "learning_rate": 2.300342319212666e-05, + "loss": 0.2456, + "step": 24150 + }, + { + "epoch": 31.002567394094992, + "grad_norm": 1.096182942390442, + "learning_rate": 2.3002995293110828e-05, + "loss": 0.2781, + "step": 24151 + }, + { + "epoch": 31.00385109114249, + "grad_norm": 1.2371994256973267, + "learning_rate": 2.3002567394094993e-05, + "loss": 0.2836, + "step": 24152 + }, + { + "epoch": 31.005134788189988, + "grad_norm": 0.7289432883262634, + "learning_rate": 2.300213949507916e-05, + "loss": 0.2883, + "step": 24153 + }, + { + "epoch": 31.006418485237482, + "grad_norm": 0.6371375322341919, + "learning_rate": 2.300171159606333e-05, + "loss": 0.2426, + "step": 24154 + }, + { + "epoch": 31.00770218228498, + "grad_norm": 3.904757022857666, + "learning_rate": 2.3001283697047495e-05, + "loss": 0.2544, + "step": 
24155 + }, + { + "epoch": 31.008985879332478, + "grad_norm": 0.802646815776825, + "learning_rate": 2.3000855798031667e-05, + "loss": 0.2722, + "step": 24156 + }, + { + "epoch": 31.010269576379976, + "grad_norm": 0.9311628341674805, + "learning_rate": 2.300042789901583e-05, + "loss": 0.2418, + "step": 24157 + }, + { + "epoch": 31.01155327342747, + "grad_norm": 1.1150004863739014, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.2714, + "step": 24158 + }, + { + "epoch": 31.012836970474968, + "grad_norm": 1.2815908193588257, + "learning_rate": 2.299957210098417e-05, + "loss": 0.2729, + "step": 24159 + }, + { + "epoch": 31.014120667522466, + "grad_norm": 0.9743182063102722, + "learning_rate": 2.2999144201968333e-05, + "loss": 0.2507, + "step": 24160 + }, + { + "epoch": 31.01540436456996, + "grad_norm": 0.7854164242744446, + "learning_rate": 2.2998716302952505e-05, + "loss": 0.28, + "step": 24161 + }, + { + "epoch": 31.016688061617458, + "grad_norm": 0.9234583973884583, + "learning_rate": 2.299828840393667e-05, + "loss": 0.2506, + "step": 24162 + }, + { + "epoch": 31.017971758664956, + "grad_norm": 1.3321245908737183, + "learning_rate": 2.299786050492084e-05, + "loss": 0.2675, + "step": 24163 + }, + { + "epoch": 31.01925545571245, + "grad_norm": 1.3440020084381104, + "learning_rate": 2.2997432605905007e-05, + "loss": 0.2723, + "step": 24164 + }, + { + "epoch": 31.020539152759948, + "grad_norm": 0.8865834474563599, + "learning_rate": 2.2997004706889176e-05, + "loss": 0.2481, + "step": 24165 + }, + { + "epoch": 31.021822849807446, + "grad_norm": 1.4715756177902222, + "learning_rate": 2.2996576807873344e-05, + "loss": 0.2479, + "step": 24166 + }, + { + "epoch": 31.023106546854944, + "grad_norm": 1.1175259351730347, + "learning_rate": 2.299614890885751e-05, + "loss": 0.2566, + "step": 24167 + }, + { + "epoch": 31.024390243902438, + "grad_norm": 1.6246330738067627, + "learning_rate": 2.2995721009841677e-05, + "loss": 0.2699, + "step": 24168 + }, + { + "epoch": 
31.025673940949936, + "grad_norm": 0.6809710264205933, + "learning_rate": 2.2995293110825846e-05, + "loss": 0.2481, + "step": 24169 + }, + { + "epoch": 31.026957637997434, + "grad_norm": 1.440921664237976, + "learning_rate": 2.2994865211810014e-05, + "loss": 0.2289, + "step": 24170 + }, + { + "epoch": 31.028241335044928, + "grad_norm": 5.622574329376221, + "learning_rate": 2.299443731279418e-05, + "loss": 0.2623, + "step": 24171 + }, + { + "epoch": 31.029525032092426, + "grad_norm": 0.8069926500320435, + "learning_rate": 2.299400941377835e-05, + "loss": 0.2736, + "step": 24172 + }, + { + "epoch": 31.030808729139924, + "grad_norm": 1.3299084901809692, + "learning_rate": 2.2993581514762516e-05, + "loss": 0.2657, + "step": 24173 + }, + { + "epoch": 31.03209242618742, + "grad_norm": 1.2561041116714478, + "learning_rate": 2.2993153615746684e-05, + "loss": 0.2729, + "step": 24174 + }, + { + "epoch": 31.033376123234916, + "grad_norm": 1.2657424211502075, + "learning_rate": 2.2992725716730853e-05, + "loss": 0.244, + "step": 24175 + }, + { + "epoch": 31.034659820282414, + "grad_norm": 1.27524733543396, + "learning_rate": 2.2992297817715018e-05, + "loss": 0.238, + "step": 24176 + }, + { + "epoch": 31.035943517329912, + "grad_norm": 0.8116843700408936, + "learning_rate": 2.299186991869919e-05, + "loss": 0.2416, + "step": 24177 + }, + { + "epoch": 31.037227214377406, + "grad_norm": 1.3761926889419556, + "learning_rate": 2.2991442019683355e-05, + "loss": 0.2911, + "step": 24178 + }, + { + "epoch": 31.038510911424904, + "grad_norm": 0.9283111095428467, + "learning_rate": 2.2991014120667523e-05, + "loss": 0.2216, + "step": 24179 + }, + { + "epoch": 31.039794608472402, + "grad_norm": 1.308576226234436, + "learning_rate": 2.299058622165169e-05, + "loss": 0.2975, + "step": 24180 + }, + { + "epoch": 31.041078305519896, + "grad_norm": 1.8161120414733887, + "learning_rate": 2.2990158322635857e-05, + "loss": 0.274, + "step": 24181 + }, + { + "epoch": 31.042362002567394, + "grad_norm": 
2.0659074783325195, + "learning_rate": 2.298973042362003e-05, + "loss": 0.2283, + "step": 24182 + }, + { + "epoch": 31.043645699614892, + "grad_norm": 1.1312694549560547, + "learning_rate": 2.2989302524604193e-05, + "loss": 0.2687, + "step": 24183 + }, + { + "epoch": 31.044929396662386, + "grad_norm": 1.4816608428955078, + "learning_rate": 2.2988874625588362e-05, + "loss": 0.2723, + "step": 24184 + }, + { + "epoch": 31.046213093709884, + "grad_norm": 1.1704005002975464, + "learning_rate": 2.298844672657253e-05, + "loss": 0.2819, + "step": 24185 + }, + { + "epoch": 31.047496790757382, + "grad_norm": 1.8300930261611938, + "learning_rate": 2.29880188275567e-05, + "loss": 0.2306, + "step": 24186 + }, + { + "epoch": 31.048780487804876, + "grad_norm": 2.2400870323181152, + "learning_rate": 2.2987590928540864e-05, + "loss": 0.2522, + "step": 24187 + }, + { + "epoch": 31.050064184852374, + "grad_norm": 1.4077138900756836, + "learning_rate": 2.2987163029525032e-05, + "loss": 0.2888, + "step": 24188 + }, + { + "epoch": 31.051347881899872, + "grad_norm": 1.335968017578125, + "learning_rate": 2.29867351305092e-05, + "loss": 0.2839, + "step": 24189 + }, + { + "epoch": 31.05263157894737, + "grad_norm": 3.08390212059021, + "learning_rate": 2.298630723149337e-05, + "loss": 0.2767, + "step": 24190 + }, + { + "epoch": 31.053915275994864, + "grad_norm": 1.2704118490219116, + "learning_rate": 2.2985879332477537e-05, + "loss": 0.2638, + "step": 24191 + }, + { + "epoch": 31.055198973042362, + "grad_norm": 2.772965908050537, + "learning_rate": 2.2985451433461702e-05, + "loss": 0.3199, + "step": 24192 + }, + { + "epoch": 31.05648267008986, + "grad_norm": 1.2347325086593628, + "learning_rate": 2.2985023534445874e-05, + "loss": 0.2676, + "step": 24193 + }, + { + "epoch": 31.057766367137354, + "grad_norm": 2.5634236335754395, + "learning_rate": 2.298459563543004e-05, + "loss": 0.2915, + "step": 24194 + }, + { + "epoch": 31.059050064184852, + "grad_norm": 1.3790391683578491, + 
"learning_rate": 2.2984167736414204e-05, + "loss": 0.3452, + "step": 24195 + }, + { + "epoch": 31.06033376123235, + "grad_norm": 1.5055115222930908, + "learning_rate": 2.2983739837398376e-05, + "loss": 0.3084, + "step": 24196 + }, + { + "epoch": 31.061617458279844, + "grad_norm": 1.6920223236083984, + "learning_rate": 2.298331193838254e-05, + "loss": 0.3111, + "step": 24197 + }, + { + "epoch": 31.062901155327342, + "grad_norm": 1.746606707572937, + "learning_rate": 2.2982884039366713e-05, + "loss": 0.361, + "step": 24198 + }, + { + "epoch": 31.06418485237484, + "grad_norm": 2.903024435043335, + "learning_rate": 2.2982456140350878e-05, + "loss": 0.4576, + "step": 24199 + }, + { + "epoch": 31.065468549422338, + "grad_norm": 0.7077251076698303, + "learning_rate": 2.2982028241335046e-05, + "loss": 0.2545, + "step": 24200 + }, + { + "epoch": 31.066752246469832, + "grad_norm": 2.421304702758789, + "learning_rate": 2.2981600342319215e-05, + "loss": 0.2701, + "step": 24201 + }, + { + "epoch": 31.06803594351733, + "grad_norm": 1.4914307594299316, + "learning_rate": 2.298117244330338e-05, + "loss": 0.2682, + "step": 24202 + }, + { + "epoch": 31.069319640564828, + "grad_norm": 1.5970165729522705, + "learning_rate": 2.2980744544287548e-05, + "loss": 0.2811, + "step": 24203 + }, + { + "epoch": 31.070603337612322, + "grad_norm": 0.8237149715423584, + "learning_rate": 2.2980316645271716e-05, + "loss": 0.2791, + "step": 24204 + }, + { + "epoch": 31.07188703465982, + "grad_norm": 1.0852947235107422, + "learning_rate": 2.2979888746255885e-05, + "loss": 0.2389, + "step": 24205 + }, + { + "epoch": 31.073170731707318, + "grad_norm": 1.1342347860336304, + "learning_rate": 2.2979460847240053e-05, + "loss": 0.2847, + "step": 24206 + }, + { + "epoch": 31.074454428754812, + "grad_norm": 1.0413893461227417, + "learning_rate": 2.2979032948224222e-05, + "loss": 0.2499, + "step": 24207 + }, + { + "epoch": 31.07573812580231, + "grad_norm": 0.9216473698616028, + "learning_rate": 
2.2978605049208387e-05, + "loss": 0.2534, + "step": 24208 + }, + { + "epoch": 31.077021822849808, + "grad_norm": 1.3269596099853516, + "learning_rate": 2.2978177150192555e-05, + "loss": 0.2691, + "step": 24209 + }, + { + "epoch": 31.078305519897306, + "grad_norm": 1.019108772277832, + "learning_rate": 2.2977749251176724e-05, + "loss": 0.2675, + "step": 24210 + }, + { + "epoch": 31.0795892169448, + "grad_norm": 0.8744146823883057, + "learning_rate": 2.297732135216089e-05, + "loss": 0.2766, + "step": 24211 + }, + { + "epoch": 31.080872913992298, + "grad_norm": 1.0068342685699463, + "learning_rate": 2.297689345314506e-05, + "loss": 0.2611, + "step": 24212 + }, + { + "epoch": 31.082156611039796, + "grad_norm": 1.2442190647125244, + "learning_rate": 2.2976465554129225e-05, + "loss": 0.2768, + "step": 24213 + }, + { + "epoch": 31.08344030808729, + "grad_norm": 0.9368973970413208, + "learning_rate": 2.2976037655113394e-05, + "loss": 0.2625, + "step": 24214 + }, + { + "epoch": 31.084724005134788, + "grad_norm": 0.8138470649719238, + "learning_rate": 2.2975609756097562e-05, + "loss": 0.2511, + "step": 24215 + }, + { + "epoch": 31.086007702182286, + "grad_norm": 1.4711852073669434, + "learning_rate": 2.2975181857081727e-05, + "loss": 0.2756, + "step": 24216 + }, + { + "epoch": 31.08729139922978, + "grad_norm": 0.7484991550445557, + "learning_rate": 2.29747539580659e-05, + "loss": 0.2631, + "step": 24217 + }, + { + "epoch": 31.088575096277278, + "grad_norm": 2.5174450874328613, + "learning_rate": 2.2974326059050064e-05, + "loss": 0.2404, + "step": 24218 + }, + { + "epoch": 31.089858793324776, + "grad_norm": 0.7863960266113281, + "learning_rate": 2.2973898160034232e-05, + "loss": 0.2584, + "step": 24219 + }, + { + "epoch": 31.09114249037227, + "grad_norm": 0.9191019535064697, + "learning_rate": 2.29734702610184e-05, + "loss": 0.2494, + "step": 24220 + }, + { + "epoch": 31.09242618741977, + "grad_norm": 1.1601746082305908, + "learning_rate": 2.2973042362002566e-05, + "loss": 
0.2698, + "step": 24221 + }, + { + "epoch": 31.093709884467266, + "grad_norm": 0.7234481573104858, + "learning_rate": 2.2972614462986734e-05, + "loss": 0.2541, + "step": 24222 + }, + { + "epoch": 31.094993581514764, + "grad_norm": 2.493006467819214, + "learning_rate": 2.2972186563970903e-05, + "loss": 0.2673, + "step": 24223 + }, + { + "epoch": 31.09627727856226, + "grad_norm": 1.0217865705490112, + "learning_rate": 2.297175866495507e-05, + "loss": 0.2912, + "step": 24224 + }, + { + "epoch": 31.097560975609756, + "grad_norm": 1.1489558219909668, + "learning_rate": 2.297133076593924e-05, + "loss": 0.2401, + "step": 24225 + }, + { + "epoch": 31.098844672657254, + "grad_norm": 1.0960288047790527, + "learning_rate": 2.2970902866923408e-05, + "loss": 0.2789, + "step": 24226 + }, + { + "epoch": 31.10012836970475, + "grad_norm": 1.1585520505905151, + "learning_rate": 2.2970474967907573e-05, + "loss": 0.232, + "step": 24227 + }, + { + "epoch": 31.101412066752246, + "grad_norm": 1.2299108505249023, + "learning_rate": 2.297004706889174e-05, + "loss": 0.2699, + "step": 24228 + }, + { + "epoch": 31.102695763799744, + "grad_norm": 2.4678587913513184, + "learning_rate": 2.296961916987591e-05, + "loss": 0.2738, + "step": 24229 + }, + { + "epoch": 31.10397946084724, + "grad_norm": 1.3222986459732056, + "learning_rate": 2.2969191270860075e-05, + "loss": 0.2806, + "step": 24230 + }, + { + "epoch": 31.105263157894736, + "grad_norm": 0.9923749566078186, + "learning_rate": 2.2968763371844247e-05, + "loss": 0.2537, + "step": 24231 + }, + { + "epoch": 31.106546854942234, + "grad_norm": 1.9030256271362305, + "learning_rate": 2.296833547282841e-05, + "loss": 0.2995, + "step": 24232 + }, + { + "epoch": 31.107830551989732, + "grad_norm": 0.8472288250923157, + "learning_rate": 2.2967907573812583e-05, + "loss": 0.2549, + "step": 24233 + }, + { + "epoch": 31.109114249037226, + "grad_norm": 1.4575519561767578, + "learning_rate": 2.296747967479675e-05, + "loss": 0.2661, + "step": 24234 + }, + { + 
"epoch": 31.110397946084724, + "grad_norm": 1.0915582180023193, + "learning_rate": 2.2967051775780914e-05, + "loss": 0.2677, + "step": 24235 + }, + { + "epoch": 31.111681643132222, + "grad_norm": 2.46045184135437, + "learning_rate": 2.2966623876765085e-05, + "loss": 0.2738, + "step": 24236 + }, + { + "epoch": 31.112965340179716, + "grad_norm": 1.224183201789856, + "learning_rate": 2.296619597774925e-05, + "loss": 0.2749, + "step": 24237 + }, + { + "epoch": 31.114249037227214, + "grad_norm": 1.0490177869796753, + "learning_rate": 2.296576807873342e-05, + "loss": 0.2843, + "step": 24238 + }, + { + "epoch": 31.115532734274712, + "grad_norm": 1.490271806716919, + "learning_rate": 2.2965340179717587e-05, + "loss": 0.2497, + "step": 24239 + }, + { + "epoch": 31.116816431322206, + "grad_norm": 1.150285243988037, + "learning_rate": 2.2964912280701756e-05, + "loss": 0.3085, + "step": 24240 + }, + { + "epoch": 31.118100128369704, + "grad_norm": 0.9389891028404236, + "learning_rate": 2.2964484381685924e-05, + "loss": 0.3038, + "step": 24241 + }, + { + "epoch": 31.119383825417202, + "grad_norm": 1.5621421337127686, + "learning_rate": 2.296405648267009e-05, + "loss": 0.2762, + "step": 24242 + }, + { + "epoch": 31.1206675224647, + "grad_norm": 1.7804603576660156, + "learning_rate": 2.2963628583654257e-05, + "loss": 0.2938, + "step": 24243 + }, + { + "epoch": 31.121951219512194, + "grad_norm": 1.8013041019439697, + "learning_rate": 2.2963200684638426e-05, + "loss": 0.2767, + "step": 24244 + }, + { + "epoch": 31.123234916559692, + "grad_norm": 1.6136629581451416, + "learning_rate": 2.2962772785622594e-05, + "loss": 0.3115, + "step": 24245 + }, + { + "epoch": 31.12451861360719, + "grad_norm": 2.7182393074035645, + "learning_rate": 2.296234488660676e-05, + "loss": 0.3234, + "step": 24246 + }, + { + "epoch": 31.125802310654684, + "grad_norm": 1.2220313549041748, + "learning_rate": 2.296191698759093e-05, + "loss": 0.2937, + "step": 24247 + }, + { + "epoch": 31.127086007702182, + 
"grad_norm": 1.4371808767318726, + "learning_rate": 2.2961489088575096e-05, + "loss": 0.3317, + "step": 24248 + }, + { + "epoch": 31.12836970474968, + "grad_norm": 3.737452745437622, + "learning_rate": 2.2961061189559265e-05, + "loss": 0.4633, + "step": 24249 + }, + { + "epoch": 31.129653401797174, + "grad_norm": 1.724332571029663, + "learning_rate": 2.2960633290543433e-05, + "loss": 0.2904, + "step": 24250 + }, + { + "epoch": 31.130937098844672, + "grad_norm": 1.8656429052352905, + "learning_rate": 2.2960205391527598e-05, + "loss": 0.2806, + "step": 24251 + }, + { + "epoch": 31.13222079589217, + "grad_norm": 0.7410266995429993, + "learning_rate": 2.295977749251177e-05, + "loss": 0.2873, + "step": 24252 + }, + { + "epoch": 31.133504492939664, + "grad_norm": 1.9639743566513062, + "learning_rate": 2.2959349593495935e-05, + "loss": 0.2667, + "step": 24253 + }, + { + "epoch": 31.134788189987162, + "grad_norm": 1.2644072771072388, + "learning_rate": 2.2958921694480103e-05, + "loss": 0.2994, + "step": 24254 + }, + { + "epoch": 31.13607188703466, + "grad_norm": 0.7964535355567932, + "learning_rate": 2.295849379546427e-05, + "loss": 0.267, + "step": 24255 + }, + { + "epoch": 31.137355584082158, + "grad_norm": 1.113569736480713, + "learning_rate": 2.2958065896448437e-05, + "loss": 0.2804, + "step": 24256 + }, + { + "epoch": 31.138639281129652, + "grad_norm": 0.8183322548866272, + "learning_rate": 2.295763799743261e-05, + "loss": 0.2673, + "step": 24257 + }, + { + "epoch": 31.13992297817715, + "grad_norm": 1.1259865760803223, + "learning_rate": 2.2957210098416773e-05, + "loss": 0.2647, + "step": 24258 + }, + { + "epoch": 31.141206675224648, + "grad_norm": 1.330873727798462, + "learning_rate": 2.2956782199400942e-05, + "loss": 0.283, + "step": 24259 + }, + { + "epoch": 31.142490372272142, + "grad_norm": 1.091201663017273, + "learning_rate": 2.295635430038511e-05, + "loss": 0.2482, + "step": 24260 + }, + { + "epoch": 31.14377406931964, + "grad_norm": 4.767098426818848, + 
"learning_rate": 2.295592640136928e-05, + "loss": 0.2906, + "step": 24261 + }, + { + "epoch": 31.145057766367138, + "grad_norm": 0.9323571920394897, + "learning_rate": 2.2955498502353444e-05, + "loss": 0.2694, + "step": 24262 + }, + { + "epoch": 31.146341463414632, + "grad_norm": 1.7586513757705688, + "learning_rate": 2.2955070603337612e-05, + "loss": 0.2823, + "step": 24263 + }, + { + "epoch": 31.14762516046213, + "grad_norm": 0.6398201584815979, + "learning_rate": 2.295464270432178e-05, + "loss": 0.2549, + "step": 24264 + }, + { + "epoch": 31.14890885750963, + "grad_norm": 0.6647599339485168, + "learning_rate": 2.295421480530595e-05, + "loss": 0.2773, + "step": 24265 + }, + { + "epoch": 31.150192554557126, + "grad_norm": 0.6746947765350342, + "learning_rate": 2.2953786906290117e-05, + "loss": 0.2683, + "step": 24266 + }, + { + "epoch": 31.15147625160462, + "grad_norm": 1.94158935546875, + "learning_rate": 2.2953359007274282e-05, + "loss": 0.2564, + "step": 24267 + }, + { + "epoch": 31.15275994865212, + "grad_norm": 1.3048919439315796, + "learning_rate": 2.2952931108258454e-05, + "loss": 0.2938, + "step": 24268 + }, + { + "epoch": 31.154043645699616, + "grad_norm": 0.888405442237854, + "learning_rate": 2.295250320924262e-05, + "loss": 0.2583, + "step": 24269 + }, + { + "epoch": 31.15532734274711, + "grad_norm": 1.5625144243240356, + "learning_rate": 2.2952075310226784e-05, + "loss": 0.2459, + "step": 24270 + }, + { + "epoch": 31.15661103979461, + "grad_norm": 1.309580683708191, + "learning_rate": 2.2951647411210956e-05, + "loss": 0.2643, + "step": 24271 + }, + { + "epoch": 31.157894736842106, + "grad_norm": 1.2500039339065552, + "learning_rate": 2.295121951219512e-05, + "loss": 0.282, + "step": 24272 + }, + { + "epoch": 31.1591784338896, + "grad_norm": 1.4095343351364136, + "learning_rate": 2.2950791613179293e-05, + "loss": 0.2755, + "step": 24273 + }, + { + "epoch": 31.1604621309371, + "grad_norm": 1.3019297122955322, + "learning_rate": 2.2950363714163458e-05, + 
"loss": 0.2411, + "step": 24274 + }, + { + "epoch": 31.161745827984596, + "grad_norm": 0.7922349572181702, + "learning_rate": 2.2949935815147626e-05, + "loss": 0.2504, + "step": 24275 + }, + { + "epoch": 31.163029525032094, + "grad_norm": 1.1099367141723633, + "learning_rate": 2.2949507916131795e-05, + "loss": 0.2464, + "step": 24276 + }, + { + "epoch": 31.16431322207959, + "grad_norm": 1.105006456375122, + "learning_rate": 2.294908001711596e-05, + "loss": 0.296, + "step": 24277 + }, + { + "epoch": 31.165596919127086, + "grad_norm": 1.216029405593872, + "learning_rate": 2.2948652118100128e-05, + "loss": 0.2411, + "step": 24278 + }, + { + "epoch": 31.166880616174584, + "grad_norm": 1.737108588218689, + "learning_rate": 2.2948224219084297e-05, + "loss": 0.2738, + "step": 24279 + }, + { + "epoch": 31.16816431322208, + "grad_norm": 4.662249565124512, + "learning_rate": 2.2947796320068465e-05, + "loss": 0.2771, + "step": 24280 + }, + { + "epoch": 31.169448010269576, + "grad_norm": 0.7990480065345764, + "learning_rate": 2.2947368421052633e-05, + "loss": 0.2486, + "step": 24281 + }, + { + "epoch": 31.170731707317074, + "grad_norm": 1.4557315111160278, + "learning_rate": 2.29469405220368e-05, + "loss": 0.2904, + "step": 24282 + }, + { + "epoch": 31.17201540436457, + "grad_norm": 0.8510007262229919, + "learning_rate": 2.2946512623020967e-05, + "loss": 0.2988, + "step": 24283 + }, + { + "epoch": 31.173299101412066, + "grad_norm": 1.103589415550232, + "learning_rate": 2.2946084724005135e-05, + "loss": 0.2422, + "step": 24284 + }, + { + "epoch": 31.174582798459564, + "grad_norm": 1.0127218961715698, + "learning_rate": 2.2945656824989304e-05, + "loss": 0.2608, + "step": 24285 + }, + { + "epoch": 31.17586649550706, + "grad_norm": 1.4253709316253662, + "learning_rate": 2.294522892597347e-05, + "loss": 0.2939, + "step": 24286 + }, + { + "epoch": 31.177150192554556, + "grad_norm": 2.8052563667297363, + "learning_rate": 2.294480102695764e-05, + "loss": 0.2606, + "step": 24287 + }, + 
{ + "epoch": 31.178433889602054, + "grad_norm": 1.3047776222229004, + "learning_rate": 2.2944373127941805e-05, + "loss": 0.2587, + "step": 24288 + }, + { + "epoch": 31.179717586649552, + "grad_norm": 2.6394269466400146, + "learning_rate": 2.2943945228925974e-05, + "loss": 0.2528, + "step": 24289 + }, + { + "epoch": 31.181001283697046, + "grad_norm": 1.3651130199432373, + "learning_rate": 2.2943517329910142e-05, + "loss": 0.3032, + "step": 24290 + }, + { + "epoch": 31.182284980744544, + "grad_norm": 5.14259147644043, + "learning_rate": 2.2943089430894307e-05, + "loss": 0.2993, + "step": 24291 + }, + { + "epoch": 31.183568677792042, + "grad_norm": 2.1315977573394775, + "learning_rate": 2.294266153187848e-05, + "loss": 0.2764, + "step": 24292 + }, + { + "epoch": 31.184852374839537, + "grad_norm": 2.5433459281921387, + "learning_rate": 2.2942233632862644e-05, + "loss": 0.2828, + "step": 24293 + }, + { + "epoch": 31.186136071887034, + "grad_norm": 1.1673718690872192, + "learning_rate": 2.2941805733846813e-05, + "loss": 0.3071, + "step": 24294 + }, + { + "epoch": 31.187419768934532, + "grad_norm": 1.1580246686935425, + "learning_rate": 2.294137783483098e-05, + "loss": 0.3256, + "step": 24295 + }, + { + "epoch": 31.188703465982027, + "grad_norm": 1.1872895956039429, + "learning_rate": 2.2940949935815146e-05, + "loss": 0.3117, + "step": 24296 + }, + { + "epoch": 31.189987163029524, + "grad_norm": 2.126178503036499, + "learning_rate": 2.2940522036799318e-05, + "loss": 0.3517, + "step": 24297 + }, + { + "epoch": 31.191270860077022, + "grad_norm": 1.4382710456848145, + "learning_rate": 2.2940094137783483e-05, + "loss": 0.3886, + "step": 24298 + }, + { + "epoch": 31.19255455712452, + "grad_norm": 1.5692262649536133, + "learning_rate": 2.293966623876765e-05, + "loss": 0.4232, + "step": 24299 + }, + { + "epoch": 31.193838254172015, + "grad_norm": 1.077296257019043, + "learning_rate": 2.293923833975182e-05, + "loss": 0.2771, + "step": 24300 + }, + { + "epoch": 31.195121951219512, 
+ "grad_norm": 2.004162073135376, + "learning_rate": 2.2938810440735988e-05, + "loss": 0.2573, + "step": 24301 + }, + { + "epoch": 31.19640564826701, + "grad_norm": 0.9083272814750671, + "learning_rate": 2.2938382541720153e-05, + "loss": 0.2752, + "step": 24302 + }, + { + "epoch": 31.197689345314505, + "grad_norm": 0.8938255310058594, + "learning_rate": 2.293795464270432e-05, + "loss": 0.3075, + "step": 24303 + }, + { + "epoch": 31.198973042362002, + "grad_norm": 6.257546901702881, + "learning_rate": 2.293752674368849e-05, + "loss": 0.2787, + "step": 24304 + }, + { + "epoch": 31.2002567394095, + "grad_norm": 0.8238582015037537, + "learning_rate": 2.2937098844672658e-05, + "loss": 0.2785, + "step": 24305 + }, + { + "epoch": 31.201540436456995, + "grad_norm": 0.7142029404640198, + "learning_rate": 2.2936670945656827e-05, + "loss": 0.2664, + "step": 24306 + }, + { + "epoch": 31.202824133504492, + "grad_norm": 0.9468299150466919, + "learning_rate": 2.2936243046640992e-05, + "loss": 0.2799, + "step": 24307 + }, + { + "epoch": 31.20410783055199, + "grad_norm": 0.7978814840316772, + "learning_rate": 2.2935815147625164e-05, + "loss": 0.2661, + "step": 24308 + }, + { + "epoch": 31.205391527599488, + "grad_norm": 0.9685092568397522, + "learning_rate": 2.293538724860933e-05, + "loss": 0.2395, + "step": 24309 + }, + { + "epoch": 31.206675224646983, + "grad_norm": 1.0261427164077759, + "learning_rate": 2.2934959349593494e-05, + "loss": 0.3068, + "step": 24310 + }, + { + "epoch": 31.20795892169448, + "grad_norm": 0.9633488059043884, + "learning_rate": 2.2934531450577665e-05, + "loss": 0.2559, + "step": 24311 + }, + { + "epoch": 31.20924261874198, + "grad_norm": 0.8832012414932251, + "learning_rate": 2.293410355156183e-05, + "loss": 0.2656, + "step": 24312 + }, + { + "epoch": 31.210526315789473, + "grad_norm": 0.8621771931648254, + "learning_rate": 2.2933675652546002e-05, + "loss": 0.2624, + "step": 24313 + }, + { + "epoch": 31.21181001283697, + "grad_norm": 0.9335687756538391, + 
"learning_rate": 2.2933247753530167e-05, + "loss": 0.2823, + "step": 24314 + }, + { + "epoch": 31.21309370988447, + "grad_norm": 0.8930992484092712, + "learning_rate": 2.2932819854514336e-05, + "loss": 0.2664, + "step": 24315 + }, + { + "epoch": 31.214377406931963, + "grad_norm": 0.7601606249809265, + "learning_rate": 2.2932391955498504e-05, + "loss": 0.2574, + "step": 24316 + }, + { + "epoch": 31.21566110397946, + "grad_norm": 0.7981318235397339, + "learning_rate": 2.293196405648267e-05, + "loss": 0.2449, + "step": 24317 + }, + { + "epoch": 31.21694480102696, + "grad_norm": 0.9286803007125854, + "learning_rate": 2.2931536157466837e-05, + "loss": 0.2616, + "step": 24318 + }, + { + "epoch": 31.218228498074453, + "grad_norm": 2.2298245429992676, + "learning_rate": 2.2931108258451006e-05, + "loss": 0.2637, + "step": 24319 + }, + { + "epoch": 31.21951219512195, + "grad_norm": 1.1613174676895142, + "learning_rate": 2.2930680359435174e-05, + "loss": 0.3137, + "step": 24320 + }, + { + "epoch": 31.22079589216945, + "grad_norm": 0.7970431447029114, + "learning_rate": 2.2930252460419343e-05, + "loss": 0.2866, + "step": 24321 + }, + { + "epoch": 31.222079589216946, + "grad_norm": 1.117619276046753, + "learning_rate": 2.292982456140351e-05, + "loss": 0.261, + "step": 24322 + }, + { + "epoch": 31.22336328626444, + "grad_norm": 0.7980327606201172, + "learning_rate": 2.2929396662387676e-05, + "loss": 0.2532, + "step": 24323 + }, + { + "epoch": 31.22464698331194, + "grad_norm": 1.1152141094207764, + "learning_rate": 2.2928968763371845e-05, + "loss": 0.2556, + "step": 24324 + }, + { + "epoch": 31.225930680359436, + "grad_norm": 0.9391273856163025, + "learning_rate": 2.2928540864356013e-05, + "loss": 0.2607, + "step": 24325 + }, + { + "epoch": 31.22721437740693, + "grad_norm": 1.3158258199691772, + "learning_rate": 2.2928112965340178e-05, + "loss": 0.2871, + "step": 24326 + }, + { + "epoch": 31.22849807445443, + "grad_norm": 1.387863039970398, + "learning_rate": 
2.292768506632435e-05, + "loss": 0.2412, + "step": 24327 + }, + { + "epoch": 31.229781771501926, + "grad_norm": 1.3741284608840942, + "learning_rate": 2.2927257167308515e-05, + "loss": 0.2726, + "step": 24328 + }, + { + "epoch": 31.23106546854942, + "grad_norm": 1.0017774105072021, + "learning_rate": 2.2926829268292687e-05, + "loss": 0.2697, + "step": 24329 + }, + { + "epoch": 31.23234916559692, + "grad_norm": 1.551253080368042, + "learning_rate": 2.292640136927685e-05, + "loss": 0.251, + "step": 24330 + }, + { + "epoch": 31.233632862644416, + "grad_norm": 1.1856684684753418, + "learning_rate": 2.2925973470261017e-05, + "loss": 0.266, + "step": 24331 + }, + { + "epoch": 31.234916559691914, + "grad_norm": 1.2586008310317993, + "learning_rate": 2.292554557124519e-05, + "loss": 0.2495, + "step": 24332 + }, + { + "epoch": 31.23620025673941, + "grad_norm": 1.078082799911499, + "learning_rate": 2.2925117672229353e-05, + "loss": 0.3014, + "step": 24333 + }, + { + "epoch": 31.237483953786906, + "grad_norm": 0.7974229454994202, + "learning_rate": 2.2924689773213522e-05, + "loss": 0.2321, + "step": 24334 + }, + { + "epoch": 31.238767650834404, + "grad_norm": 0.8667670488357544, + "learning_rate": 2.292426187419769e-05, + "loss": 0.2302, + "step": 24335 + }, + { + "epoch": 31.2400513478819, + "grad_norm": 1.9507787227630615, + "learning_rate": 2.292383397518186e-05, + "loss": 0.2645, + "step": 24336 + }, + { + "epoch": 31.241335044929397, + "grad_norm": 1.2137701511383057, + "learning_rate": 2.2923406076166027e-05, + "loss": 0.3029, + "step": 24337 + }, + { + "epoch": 31.242618741976894, + "grad_norm": 1.581807017326355, + "learning_rate": 2.2922978177150192e-05, + "loss": 0.2964, + "step": 24338 + }, + { + "epoch": 31.24390243902439, + "grad_norm": 1.1370031833648682, + "learning_rate": 2.292255027813436e-05, + "loss": 0.2361, + "step": 24339 + }, + { + "epoch": 31.245186136071887, + "grad_norm": 2.5754499435424805, + "learning_rate": 2.292212237911853e-05, + "loss": 0.2891, 
+ "step": 24340 + }, + { + "epoch": 31.246469833119384, + "grad_norm": 1.0745832920074463, + "learning_rate": 2.2921694480102697e-05, + "loss": 0.2736, + "step": 24341 + }, + { + "epoch": 31.247753530166882, + "grad_norm": 1.159477710723877, + "learning_rate": 2.2921266581086862e-05, + "loss": 0.2913, + "step": 24342 + }, + { + "epoch": 31.249037227214377, + "grad_norm": 1.6949200630187988, + "learning_rate": 2.292083868207103e-05, + "loss": 0.268, + "step": 24343 + }, + { + "epoch": 31.250320924261874, + "grad_norm": 1.1677662134170532, + "learning_rate": 2.29204107830552e-05, + "loss": 0.3007, + "step": 24344 + }, + { + "epoch": 31.251604621309372, + "grad_norm": 1.2287726402282715, + "learning_rate": 2.2919982884039368e-05, + "loss": 0.3232, + "step": 24345 + }, + { + "epoch": 31.252888318356867, + "grad_norm": 1.2974791526794434, + "learning_rate": 2.2919554985023536e-05, + "loss": 0.2624, + "step": 24346 + }, + { + "epoch": 31.254172015404365, + "grad_norm": 1.5612341165542603, + "learning_rate": 2.29191270860077e-05, + "loss": 0.3446, + "step": 24347 + }, + { + "epoch": 31.255455712451862, + "grad_norm": 1.8704233169555664, + "learning_rate": 2.2918699186991873e-05, + "loss": 0.3612, + "step": 24348 + }, + { + "epoch": 31.256739409499357, + "grad_norm": 3.9062435626983643, + "learning_rate": 2.2918271287976038e-05, + "loss": 0.3772, + "step": 24349 + }, + { + "epoch": 31.258023106546855, + "grad_norm": 0.969697117805481, + "learning_rate": 2.2917843388960203e-05, + "loss": 0.2731, + "step": 24350 + }, + { + "epoch": 31.259306803594352, + "grad_norm": 1.1323199272155762, + "learning_rate": 2.2917415489944375e-05, + "loss": 0.2455, + "step": 24351 + }, + { + "epoch": 31.260590500641847, + "grad_norm": 1.2975749969482422, + "learning_rate": 2.291698759092854e-05, + "loss": 0.2863, + "step": 24352 + }, + { + "epoch": 31.261874197689345, + "grad_norm": 1.931359887123108, + "learning_rate": 2.291655969191271e-05, + "loss": 0.2523, + "step": 24353 + }, + { + 
"epoch": 31.263157894736842, + "grad_norm": 1.5392714738845825, + "learning_rate": 2.2916131792896877e-05, + "loss": 0.295, + "step": 24354 + }, + { + "epoch": 31.26444159178434, + "grad_norm": 1.5744212865829468, + "learning_rate": 2.2915703893881045e-05, + "loss": 0.2713, + "step": 24355 + }, + { + "epoch": 31.265725288831835, + "grad_norm": 1.457057237625122, + "learning_rate": 2.2915275994865213e-05, + "loss": 0.2484, + "step": 24356 + }, + { + "epoch": 31.267008985879333, + "grad_norm": 3.3051180839538574, + "learning_rate": 2.291484809584938e-05, + "loss": 0.2732, + "step": 24357 + }, + { + "epoch": 31.26829268292683, + "grad_norm": 0.9046299457550049, + "learning_rate": 2.2914420196833547e-05, + "loss": 0.2723, + "step": 24358 + }, + { + "epoch": 31.269576379974325, + "grad_norm": 1.4692922830581665, + "learning_rate": 2.2913992297817715e-05, + "loss": 0.2912, + "step": 24359 + }, + { + "epoch": 31.270860077021823, + "grad_norm": 1.3426737785339355, + "learning_rate": 2.2913564398801884e-05, + "loss": 0.2723, + "step": 24360 + }, + { + "epoch": 31.27214377406932, + "grad_norm": 1.1074600219726562, + "learning_rate": 2.2913136499786052e-05, + "loss": 0.2676, + "step": 24361 + }, + { + "epoch": 31.273427471116815, + "grad_norm": 0.843574583530426, + "learning_rate": 2.291270860077022e-05, + "loss": 0.2705, + "step": 24362 + }, + { + "epoch": 31.274711168164313, + "grad_norm": 2.6612205505371094, + "learning_rate": 2.2912280701754386e-05, + "loss": 0.2517, + "step": 24363 + }, + { + "epoch": 31.27599486521181, + "grad_norm": 0.7314371466636658, + "learning_rate": 2.2911852802738554e-05, + "loss": 0.2597, + "step": 24364 + }, + { + "epoch": 31.27727856225931, + "grad_norm": 1.0808398723602295, + "learning_rate": 2.2911424903722722e-05, + "loss": 0.28, + "step": 24365 + }, + { + "epoch": 31.278562259306803, + "grad_norm": 0.7449549436569214, + "learning_rate": 2.2910997004706887e-05, + "loss": 0.2468, + "step": 24366 + }, + { + "epoch": 31.2798459563543, + 
"grad_norm": 1.9964834451675415, + "learning_rate": 2.291056910569106e-05, + "loss": 0.2923, + "step": 24367 + }, + { + "epoch": 31.2811296534018, + "grad_norm": 1.2373747825622559, + "learning_rate": 2.2910141206675224e-05, + "loss": 0.2466, + "step": 24368 + }, + { + "epoch": 31.282413350449293, + "grad_norm": 1.1182372570037842, + "learning_rate": 2.2909713307659396e-05, + "loss": 0.2655, + "step": 24369 + }, + { + "epoch": 31.28369704749679, + "grad_norm": 0.8699478507041931, + "learning_rate": 2.290928540864356e-05, + "loss": 0.2904, + "step": 24370 + }, + { + "epoch": 31.28498074454429, + "grad_norm": 1.4281312227249146, + "learning_rate": 2.2908857509627726e-05, + "loss": 0.2576, + "step": 24371 + }, + { + "epoch": 31.286264441591783, + "grad_norm": 1.5710890293121338, + "learning_rate": 2.2908429610611898e-05, + "loss": 0.2672, + "step": 24372 + }, + { + "epoch": 31.28754813863928, + "grad_norm": 1.4636290073394775, + "learning_rate": 2.2908001711596063e-05, + "loss": 0.2788, + "step": 24373 + }, + { + "epoch": 31.28883183568678, + "grad_norm": 1.195112943649292, + "learning_rate": 2.290757381258023e-05, + "loss": 0.2695, + "step": 24374 + }, + { + "epoch": 31.290115532734276, + "grad_norm": 1.3002004623413086, + "learning_rate": 2.29071459135644e-05, + "loss": 0.2468, + "step": 24375 + }, + { + "epoch": 31.29139922978177, + "grad_norm": 0.9501375555992126, + "learning_rate": 2.2906718014548568e-05, + "loss": 0.2832, + "step": 24376 + }, + { + "epoch": 31.29268292682927, + "grad_norm": 0.9705412983894348, + "learning_rate": 2.2906290115532737e-05, + "loss": 0.2371, + "step": 24377 + }, + { + "epoch": 31.293966623876766, + "grad_norm": 1.7494432926177979, + "learning_rate": 2.29058622165169e-05, + "loss": 0.2888, + "step": 24378 + }, + { + "epoch": 31.29525032092426, + "grad_norm": 0.8025304675102234, + "learning_rate": 2.290543431750107e-05, + "loss": 0.2666, + "step": 24379 + }, + { + "epoch": 31.29653401797176, + "grad_norm": 1.0272047519683838, + 
"learning_rate": 2.290500641848524e-05, + "loss": 0.2493, + "step": 24380 + }, + { + "epoch": 31.297817715019256, + "grad_norm": 1.5174211263656616, + "learning_rate": 2.2904578519469407e-05, + "loss": 0.2466, + "step": 24381 + }, + { + "epoch": 31.29910141206675, + "grad_norm": 1.351871371269226, + "learning_rate": 2.2904150620453572e-05, + "loss": 0.2625, + "step": 24382 + }, + { + "epoch": 31.30038510911425, + "grad_norm": 1.649617314338684, + "learning_rate": 2.2903722721437744e-05, + "loss": 0.2662, + "step": 24383 + }, + { + "epoch": 31.301668806161747, + "grad_norm": 1.6259962320327759, + "learning_rate": 2.290329482242191e-05, + "loss": 0.2753, + "step": 24384 + }, + { + "epoch": 31.30295250320924, + "grad_norm": 1.3625130653381348, + "learning_rate": 2.2902866923406077e-05, + "loss": 0.2822, + "step": 24385 + }, + { + "epoch": 31.30423620025674, + "grad_norm": 1.1880899667739868, + "learning_rate": 2.2902439024390245e-05, + "loss": 0.3163, + "step": 24386 + }, + { + "epoch": 31.305519897304237, + "grad_norm": 3.944488048553467, + "learning_rate": 2.290201112537441e-05, + "loss": 0.2871, + "step": 24387 + }, + { + "epoch": 31.306803594351734, + "grad_norm": 1.9357410669326782, + "learning_rate": 2.2901583226358582e-05, + "loss": 0.282, + "step": 24388 + }, + { + "epoch": 31.30808729139923, + "grad_norm": 1.3945074081420898, + "learning_rate": 2.2901155327342747e-05, + "loss": 0.2794, + "step": 24389 + }, + { + "epoch": 31.309370988446727, + "grad_norm": 1.3075612783432007, + "learning_rate": 2.2900727428326916e-05, + "loss": 0.2839, + "step": 24390 + }, + { + "epoch": 31.310654685494224, + "grad_norm": 1.713897943496704, + "learning_rate": 2.2900299529311084e-05, + "loss": 0.2727, + "step": 24391 + }, + { + "epoch": 31.31193838254172, + "grad_norm": 3.9658913612365723, + "learning_rate": 2.289987163029525e-05, + "loss": 0.2753, + "step": 24392 + }, + { + "epoch": 31.313222079589217, + "grad_norm": 1.536424994468689, + "learning_rate": 2.289944373127942e-05, 
+ "loss": 0.272, + "step": 24393 + }, + { + "epoch": 31.314505776636715, + "grad_norm": 1.563348650932312, + "learning_rate": 2.2899015832263586e-05, + "loss": 0.2882, + "step": 24394 + }, + { + "epoch": 31.31578947368421, + "grad_norm": 1.0375663042068481, + "learning_rate": 2.2898587933247754e-05, + "loss": 0.2804, + "step": 24395 + }, + { + "epoch": 31.317073170731707, + "grad_norm": 1.2420846223831177, + "learning_rate": 2.2898160034231923e-05, + "loss": 0.2984, + "step": 24396 + }, + { + "epoch": 31.318356867779205, + "grad_norm": 1.792402744293213, + "learning_rate": 2.289773213521609e-05, + "loss": 0.3101, + "step": 24397 + }, + { + "epoch": 31.319640564826702, + "grad_norm": 2.248293161392212, + "learning_rate": 2.2897304236200256e-05, + "loss": 0.3693, + "step": 24398 + }, + { + "epoch": 31.320924261874197, + "grad_norm": 3.6308212280273438, + "learning_rate": 2.2896876337184425e-05, + "loss": 0.4098, + "step": 24399 + }, + { + "epoch": 31.322207958921695, + "grad_norm": 0.9456512331962585, + "learning_rate": 2.2896448438168593e-05, + "loss": 0.2706, + "step": 24400 + }, + { + "epoch": 31.323491655969192, + "grad_norm": 0.9107778668403625, + "learning_rate": 2.289602053915276e-05, + "loss": 0.2602, + "step": 24401 + }, + { + "epoch": 31.324775353016687, + "grad_norm": 1.226845383644104, + "learning_rate": 2.289559264013693e-05, + "loss": 0.2896, + "step": 24402 + }, + { + "epoch": 31.326059050064185, + "grad_norm": 0.9376072287559509, + "learning_rate": 2.2895164741121095e-05, + "loss": 0.2717, + "step": 24403 + }, + { + "epoch": 31.327342747111683, + "grad_norm": 1.0800113677978516, + "learning_rate": 2.2894736842105263e-05, + "loss": 0.2706, + "step": 24404 + }, + { + "epoch": 31.328626444159177, + "grad_norm": 0.6121296882629395, + "learning_rate": 2.2894308943089432e-05, + "loss": 0.2515, + "step": 24405 + }, + { + "epoch": 31.329910141206675, + "grad_norm": 0.6574236154556274, + "learning_rate": 2.2893881044073597e-05, + "loss": 0.2954, + "step": 
24406 + }, + { + "epoch": 31.331193838254173, + "grad_norm": 1.7192579507827759, + "learning_rate": 2.289345314505777e-05, + "loss": 0.2655, + "step": 24407 + }, + { + "epoch": 31.33247753530167, + "grad_norm": 0.8523129224777222, + "learning_rate": 2.2893025246041934e-05, + "loss": 0.2936, + "step": 24408 + }, + { + "epoch": 31.333761232349165, + "grad_norm": 0.9754425883293152, + "learning_rate": 2.2892597347026105e-05, + "loss": 0.2598, + "step": 24409 + }, + { + "epoch": 31.335044929396663, + "grad_norm": 1.4205384254455566, + "learning_rate": 2.289216944801027e-05, + "loss": 0.2706, + "step": 24410 + }, + { + "epoch": 31.33632862644416, + "grad_norm": 1.1696466207504272, + "learning_rate": 2.2891741548994435e-05, + "loss": 0.2677, + "step": 24411 + }, + { + "epoch": 31.337612323491655, + "grad_norm": 1.0047850608825684, + "learning_rate": 2.2891313649978607e-05, + "loss": 0.2699, + "step": 24412 + }, + { + "epoch": 31.338896020539153, + "grad_norm": 0.9079232811927795, + "learning_rate": 2.2890885750962772e-05, + "loss": 0.2594, + "step": 24413 + }, + { + "epoch": 31.34017971758665, + "grad_norm": 0.8429677486419678, + "learning_rate": 2.289045785194694e-05, + "loss": 0.2548, + "step": 24414 + }, + { + "epoch": 31.341463414634145, + "grad_norm": 0.9069638848304749, + "learning_rate": 2.289002995293111e-05, + "loss": 0.2629, + "step": 24415 + }, + { + "epoch": 31.342747111681643, + "grad_norm": 1.023458480834961, + "learning_rate": 2.2889602053915277e-05, + "loss": 0.2677, + "step": 24416 + }, + { + "epoch": 31.34403080872914, + "grad_norm": 0.7495404481887817, + "learning_rate": 2.2889174154899442e-05, + "loss": 0.2418, + "step": 24417 + }, + { + "epoch": 31.345314505776635, + "grad_norm": 1.153566598892212, + "learning_rate": 2.288874625588361e-05, + "loss": 0.2842, + "step": 24418 + }, + { + "epoch": 31.346598202824133, + "grad_norm": 1.1025954484939575, + "learning_rate": 2.288831835686778e-05, + "loss": 0.2368, + "step": 24419 + }, + { + "epoch": 
31.34788189987163, + "grad_norm": 1.0009958744049072, + "learning_rate": 2.2887890457851948e-05, + "loss": 0.2228, + "step": 24420 + }, + { + "epoch": 31.34916559691913, + "grad_norm": 1.0961533784866333, + "learning_rate": 2.2887462558836116e-05, + "loss": 0.2782, + "step": 24421 + }, + { + "epoch": 31.350449293966623, + "grad_norm": 1.038803219795227, + "learning_rate": 2.288703465982028e-05, + "loss": 0.2693, + "step": 24422 + }, + { + "epoch": 31.35173299101412, + "grad_norm": 1.4855785369873047, + "learning_rate": 2.2886606760804453e-05, + "loss": 0.26, + "step": 24423 + }, + { + "epoch": 31.35301668806162, + "grad_norm": 1.11879563331604, + "learning_rate": 2.2886178861788618e-05, + "loss": 0.2668, + "step": 24424 + }, + { + "epoch": 31.354300385109113, + "grad_norm": 1.0422650575637817, + "learning_rate": 2.2885750962772783e-05, + "loss": 0.2948, + "step": 24425 + }, + { + "epoch": 31.35558408215661, + "grad_norm": 2.0291266441345215, + "learning_rate": 2.2885323063756955e-05, + "loss": 0.2556, + "step": 24426 + }, + { + "epoch": 31.35686777920411, + "grad_norm": 0.7644889950752258, + "learning_rate": 2.288489516474112e-05, + "loss": 0.25, + "step": 24427 + }, + { + "epoch": 31.358151476251603, + "grad_norm": 1.2269418239593506, + "learning_rate": 2.288446726572529e-05, + "loss": 0.2512, + "step": 24428 + }, + { + "epoch": 31.3594351732991, + "grad_norm": 0.7579815983772278, + "learning_rate": 2.2884039366709457e-05, + "loss": 0.2336, + "step": 24429 + }, + { + "epoch": 31.3607188703466, + "grad_norm": 1.1419516801834106, + "learning_rate": 2.2883611467693625e-05, + "loss": 0.2601, + "step": 24430 + }, + { + "epoch": 31.362002567394097, + "grad_norm": 2.2303409576416016, + "learning_rate": 2.2883183568677793e-05, + "loss": 0.2734, + "step": 24431 + }, + { + "epoch": 31.36328626444159, + "grad_norm": 2.0451714992523193, + "learning_rate": 2.288275566966196e-05, + "loss": 0.3133, + "step": 24432 + }, + { + "epoch": 31.36456996148909, + "grad_norm": 
0.9229822158813477, + "learning_rate": 2.2882327770646127e-05, + "loss": 0.2324, + "step": 24433 + }, + { + "epoch": 31.365853658536587, + "grad_norm": 1.6608206033706665, + "learning_rate": 2.2881899871630295e-05, + "loss": 0.3133, + "step": 24434 + }, + { + "epoch": 31.36713735558408, + "grad_norm": 1.2744038105010986, + "learning_rate": 2.2881471972614464e-05, + "loss": 0.2689, + "step": 24435 + }, + { + "epoch": 31.36842105263158, + "grad_norm": 2.5816173553466797, + "learning_rate": 2.2881044073598632e-05, + "loss": 0.2556, + "step": 24436 + }, + { + "epoch": 31.369704749679077, + "grad_norm": 1.706933856010437, + "learning_rate": 2.28806161745828e-05, + "loss": 0.272, + "step": 24437 + }, + { + "epoch": 31.37098844672657, + "grad_norm": 1.220143437385559, + "learning_rate": 2.2880188275566966e-05, + "loss": 0.2437, + "step": 24438 + }, + { + "epoch": 31.37227214377407, + "grad_norm": 1.7259966135025024, + "learning_rate": 2.2879760376551134e-05, + "loss": 0.2632, + "step": 24439 + }, + { + "epoch": 31.373555840821567, + "grad_norm": 1.1391206979751587, + "learning_rate": 2.2879332477535302e-05, + "loss": 0.2809, + "step": 24440 + }, + { + "epoch": 31.374839537869065, + "grad_norm": 1.2250666618347168, + "learning_rate": 2.2878904578519467e-05, + "loss": 0.3047, + "step": 24441 + }, + { + "epoch": 31.37612323491656, + "grad_norm": 2.154484987258911, + "learning_rate": 2.287847667950364e-05, + "loss": 0.2977, + "step": 24442 + }, + { + "epoch": 31.377406931964057, + "grad_norm": 1.2629468441009521, + "learning_rate": 2.2878048780487804e-05, + "loss": 0.2957, + "step": 24443 + }, + { + "epoch": 31.378690629011555, + "grad_norm": 1.413493037223816, + "learning_rate": 2.2877620881471976e-05, + "loss": 0.2812, + "step": 24444 + }, + { + "epoch": 31.37997432605905, + "grad_norm": 1.355539321899414, + "learning_rate": 2.287719298245614e-05, + "loss": 0.3015, + "step": 24445 + }, + { + "epoch": 31.381258023106547, + "grad_norm": 1.7241601943969727, + "learning_rate": 
2.2876765083440306e-05, + "loss": 0.3015, + "step": 24446 + }, + { + "epoch": 31.382541720154045, + "grad_norm": 2.8347091674804688, + "learning_rate": 2.2876337184424478e-05, + "loss": 0.3614, + "step": 24447 + }, + { + "epoch": 31.38382541720154, + "grad_norm": 1.7019063234329224, + "learning_rate": 2.2875909285408643e-05, + "loss": 0.3859, + "step": 24448 + }, + { + "epoch": 31.385109114249037, + "grad_norm": 1.8262947797775269, + "learning_rate": 2.287548138639281e-05, + "loss": 0.4563, + "step": 24449 + }, + { + "epoch": 31.386392811296535, + "grad_norm": 0.801047146320343, + "learning_rate": 2.287505348737698e-05, + "loss": 0.2709, + "step": 24450 + }, + { + "epoch": 31.387676508344033, + "grad_norm": 0.8016226291656494, + "learning_rate": 2.2874625588361148e-05, + "loss": 0.2867, + "step": 24451 + }, + { + "epoch": 31.388960205391527, + "grad_norm": 1.2302192449569702, + "learning_rate": 2.2874197689345317e-05, + "loss": 0.253, + "step": 24452 + }, + { + "epoch": 31.390243902439025, + "grad_norm": 3.9215831756591797, + "learning_rate": 2.287376979032948e-05, + "loss": 0.2901, + "step": 24453 + }, + { + "epoch": 31.391527599486523, + "grad_norm": 1.092798113822937, + "learning_rate": 2.287334189131365e-05, + "loss": 0.2705, + "step": 24454 + }, + { + "epoch": 31.392811296534017, + "grad_norm": 4.474235534667969, + "learning_rate": 2.287291399229782e-05, + "loss": 0.2678, + "step": 24455 + }, + { + "epoch": 31.394094993581515, + "grad_norm": 0.9927306175231934, + "learning_rate": 2.2872486093281987e-05, + "loss": 0.2725, + "step": 24456 + }, + { + "epoch": 31.395378690629013, + "grad_norm": 1.1053814888000488, + "learning_rate": 2.2872058194266152e-05, + "loss": 0.2766, + "step": 24457 + }, + { + "epoch": 31.396662387676507, + "grad_norm": 2.862553834915161, + "learning_rate": 2.2871630295250324e-05, + "loss": 0.2793, + "step": 24458 + }, + { + "epoch": 31.397946084724005, + "grad_norm": 1.148634910583496, + "learning_rate": 2.287120239623449e-05, + "loss": 
0.2904, + "step": 24459 + }, + { + "epoch": 31.399229781771503, + "grad_norm": 1.1085007190704346, + "learning_rate": 2.2870774497218657e-05, + "loss": 0.2969, + "step": 24460 + }, + { + "epoch": 31.400513478818997, + "grad_norm": 1.1151763200759888, + "learning_rate": 2.2870346598202825e-05, + "loss": 0.2676, + "step": 24461 + }, + { + "epoch": 31.401797175866495, + "grad_norm": 1.2630308866500854, + "learning_rate": 2.286991869918699e-05, + "loss": 0.2969, + "step": 24462 + }, + { + "epoch": 31.403080872913993, + "grad_norm": 0.9178653359413147, + "learning_rate": 2.2869490800171162e-05, + "loss": 0.2579, + "step": 24463 + }, + { + "epoch": 31.40436456996149, + "grad_norm": 1.6265279054641724, + "learning_rate": 2.2869062901155327e-05, + "loss": 0.2782, + "step": 24464 + }, + { + "epoch": 31.405648267008985, + "grad_norm": 0.9863241314888, + "learning_rate": 2.2868635002139496e-05, + "loss": 0.2754, + "step": 24465 + }, + { + "epoch": 31.406931964056483, + "grad_norm": 0.8930013179779053, + "learning_rate": 2.2868207103123664e-05, + "loss": 0.2744, + "step": 24466 + }, + { + "epoch": 31.40821566110398, + "grad_norm": 1.225715160369873, + "learning_rate": 2.286777920410783e-05, + "loss": 0.2431, + "step": 24467 + }, + { + "epoch": 31.409499358151475, + "grad_norm": 1.6772270202636719, + "learning_rate": 2.2867351305092e-05, + "loss": 0.288, + "step": 24468 + }, + { + "epoch": 31.410783055198973, + "grad_norm": 1.577429175376892, + "learning_rate": 2.2866923406076166e-05, + "loss": 0.2542, + "step": 24469 + }, + { + "epoch": 31.41206675224647, + "grad_norm": 0.7475799918174744, + "learning_rate": 2.2866495507060334e-05, + "loss": 0.2684, + "step": 24470 + }, + { + "epoch": 31.413350449293965, + "grad_norm": 1.0838820934295654, + "learning_rate": 2.2866067608044503e-05, + "loss": 0.2878, + "step": 24471 + }, + { + "epoch": 31.414634146341463, + "grad_norm": 1.2674663066864014, + "learning_rate": 2.2865639709028668e-05, + "loss": 0.2281, + "step": 24472 + }, + { + 
"epoch": 31.41591784338896, + "grad_norm": 0.7763184309005737, + "learning_rate": 2.2865211810012836e-05, + "loss": 0.2484, + "step": 24473 + }, + { + "epoch": 31.41720154043646, + "grad_norm": 3.5818428993225098, + "learning_rate": 2.2864783910997005e-05, + "loss": 0.2655, + "step": 24474 + }, + { + "epoch": 31.418485237483953, + "grad_norm": 0.9258356690406799, + "learning_rate": 2.2864356011981173e-05, + "loss": 0.2477, + "step": 24475 + }, + { + "epoch": 31.41976893453145, + "grad_norm": 1.1265267133712769, + "learning_rate": 2.286392811296534e-05, + "loss": 0.2542, + "step": 24476 + }, + { + "epoch": 31.42105263157895, + "grad_norm": 0.8375381827354431, + "learning_rate": 2.286350021394951e-05, + "loss": 0.2391, + "step": 24477 + }, + { + "epoch": 31.422336328626443, + "grad_norm": 1.2077888250350952, + "learning_rate": 2.2863072314933675e-05, + "loss": 0.2473, + "step": 24478 + }, + { + "epoch": 31.42362002567394, + "grad_norm": 0.9855786561965942, + "learning_rate": 2.2862644415917843e-05, + "loss": 0.2752, + "step": 24479 + }, + { + "epoch": 31.42490372272144, + "grad_norm": 0.9823122620582581, + "learning_rate": 2.2862216516902012e-05, + "loss": 0.2798, + "step": 24480 + }, + { + "epoch": 31.426187419768933, + "grad_norm": 0.8916260004043579, + "learning_rate": 2.2861788617886177e-05, + "loss": 0.2519, + "step": 24481 + }, + { + "epoch": 31.42747111681643, + "grad_norm": 1.0957599878311157, + "learning_rate": 2.286136071887035e-05, + "loss": 0.2777, + "step": 24482 + }, + { + "epoch": 31.42875481386393, + "grad_norm": 1.2489211559295654, + "learning_rate": 2.2860932819854514e-05, + "loss": 0.2922, + "step": 24483 + }, + { + "epoch": 31.430038510911427, + "grad_norm": 1.0860891342163086, + "learning_rate": 2.2860504920838685e-05, + "loss": 0.2628, + "step": 24484 + }, + { + "epoch": 31.43132220795892, + "grad_norm": 1.5115270614624023, + "learning_rate": 2.286007702182285e-05, + "loss": 0.2541, + "step": 24485 + }, + { + "epoch": 31.43260590500642, + 
"grad_norm": 1.1522942781448364, + "learning_rate": 2.2859649122807015e-05, + "loss": 0.2509, + "step": 24486 + }, + { + "epoch": 31.433889602053917, + "grad_norm": 5.191856861114502, + "learning_rate": 2.2859221223791187e-05, + "loss": 0.2619, + "step": 24487 + }, + { + "epoch": 31.43517329910141, + "grad_norm": 1.1142237186431885, + "learning_rate": 2.2858793324775352e-05, + "loss": 0.2621, + "step": 24488 + }, + { + "epoch": 31.43645699614891, + "grad_norm": 1.6513755321502686, + "learning_rate": 2.285836542575952e-05, + "loss": 0.2537, + "step": 24489 + }, + { + "epoch": 31.437740693196407, + "grad_norm": 0.987590491771698, + "learning_rate": 2.285793752674369e-05, + "loss": 0.2537, + "step": 24490 + }, + { + "epoch": 31.4390243902439, + "grad_norm": 1.4394786357879639, + "learning_rate": 2.2857509627727858e-05, + "loss": 0.3112, + "step": 24491 + }, + { + "epoch": 31.4403080872914, + "grad_norm": 2.428417205810547, + "learning_rate": 2.2857081728712026e-05, + "loss": 0.2906, + "step": 24492 + }, + { + "epoch": 31.441591784338897, + "grad_norm": 1.8446568250656128, + "learning_rate": 2.285665382969619e-05, + "loss": 0.2812, + "step": 24493 + }, + { + "epoch": 31.44287548138639, + "grad_norm": 1.2729580402374268, + "learning_rate": 2.285622593068036e-05, + "loss": 0.2769, + "step": 24494 + }, + { + "epoch": 31.44415917843389, + "grad_norm": 1.2887940406799316, + "learning_rate": 2.2855798031664528e-05, + "loss": 0.297, + "step": 24495 + }, + { + "epoch": 31.445442875481387, + "grad_norm": 1.0980406999588013, + "learning_rate": 2.2855370132648696e-05, + "loss": 0.3292, + "step": 24496 + }, + { + "epoch": 31.446726572528885, + "grad_norm": 1.3109499216079712, + "learning_rate": 2.285494223363286e-05, + "loss": 0.3722, + "step": 24497 + }, + { + "epoch": 31.44801026957638, + "grad_norm": 2.9500510692596436, + "learning_rate": 2.2854514334617033e-05, + "loss": 0.3686, + "step": 24498 + }, + { + "epoch": 31.449293966623877, + "grad_norm": 2.9025280475616455, + 
"learning_rate": 2.2854086435601198e-05, + "loss": 0.5223, + "step": 24499 + }, + { + "epoch": 31.450577663671375, + "grad_norm": 0.655913770198822, + "learning_rate": 2.2853658536585366e-05, + "loss": 0.2641, + "step": 24500 + }, + { + "epoch": 31.45186136071887, + "grad_norm": 1.0917139053344727, + "learning_rate": 2.2853230637569535e-05, + "loss": 0.2994, + "step": 24501 + }, + { + "epoch": 31.453145057766367, + "grad_norm": 0.8373416066169739, + "learning_rate": 2.28528027385537e-05, + "loss": 0.2874, + "step": 24502 + }, + { + "epoch": 31.454428754813865, + "grad_norm": 1.2154113054275513, + "learning_rate": 2.285237483953787e-05, + "loss": 0.2687, + "step": 24503 + }, + { + "epoch": 31.45571245186136, + "grad_norm": 0.8398239016532898, + "learning_rate": 2.2851946940522037e-05, + "loss": 0.2915, + "step": 24504 + }, + { + "epoch": 31.456996148908857, + "grad_norm": 0.8678722381591797, + "learning_rate": 2.2851519041506205e-05, + "loss": 0.274, + "step": 24505 + }, + { + "epoch": 31.458279845956355, + "grad_norm": 1.5422238111495972, + "learning_rate": 2.2851091142490374e-05, + "loss": 0.2868, + "step": 24506 + }, + { + "epoch": 31.459563543003853, + "grad_norm": 1.136879563331604, + "learning_rate": 2.285066324347454e-05, + "loss": 0.2587, + "step": 24507 + }, + { + "epoch": 31.460847240051347, + "grad_norm": 1.212064504623413, + "learning_rate": 2.285023534445871e-05, + "loss": 0.2563, + "step": 24508 + }, + { + "epoch": 31.462130937098845, + "grad_norm": 2.471277952194214, + "learning_rate": 2.2849807445442875e-05, + "loss": 0.2712, + "step": 24509 + }, + { + "epoch": 31.463414634146343, + "grad_norm": 0.885866105556488, + "learning_rate": 2.2849379546427044e-05, + "loss": 0.2843, + "step": 24510 + }, + { + "epoch": 31.464698331193837, + "grad_norm": 2.1315925121307373, + "learning_rate": 2.2848951647411212e-05, + "loss": 0.2797, + "step": 24511 + }, + { + "epoch": 31.465982028241335, + "grad_norm": 1.041856050491333, + "learning_rate": 
2.284852374839538e-05, + "loss": 0.2811, + "step": 24512 + }, + { + "epoch": 31.467265725288833, + "grad_norm": 2.1068387031555176, + "learning_rate": 2.2848095849379546e-05, + "loss": 0.2746, + "step": 24513 + }, + { + "epoch": 31.468549422336327, + "grad_norm": 1.0149590969085693, + "learning_rate": 2.2847667950363714e-05, + "loss": 0.2441, + "step": 24514 + }, + { + "epoch": 31.469833119383825, + "grad_norm": 0.8729395270347595, + "learning_rate": 2.2847240051347882e-05, + "loss": 0.2534, + "step": 24515 + }, + { + "epoch": 31.471116816431323, + "grad_norm": 1.3821371793746948, + "learning_rate": 2.284681215233205e-05, + "loss": 0.2791, + "step": 24516 + }, + { + "epoch": 31.47240051347882, + "grad_norm": 0.8140355348587036, + "learning_rate": 2.284638425331622e-05, + "loss": 0.238, + "step": 24517 + }, + { + "epoch": 31.473684210526315, + "grad_norm": 0.9901864528656006, + "learning_rate": 2.2845956354300384e-05, + "loss": 0.2753, + "step": 24518 + }, + { + "epoch": 31.474967907573813, + "grad_norm": 1.0744035243988037, + "learning_rate": 2.2845528455284556e-05, + "loss": 0.2476, + "step": 24519 + }, + { + "epoch": 31.47625160462131, + "grad_norm": 1.1037081480026245, + "learning_rate": 2.284510055626872e-05, + "loss": 0.2504, + "step": 24520 + }, + { + "epoch": 31.477535301668805, + "grad_norm": 1.0455493927001953, + "learning_rate": 2.2844672657252886e-05, + "loss": 0.2365, + "step": 24521 + }, + { + "epoch": 31.478818998716303, + "grad_norm": 2.187432050704956, + "learning_rate": 2.2844244758237058e-05, + "loss": 0.2898, + "step": 24522 + }, + { + "epoch": 31.4801026957638, + "grad_norm": 2.595247268676758, + "learning_rate": 2.2843816859221223e-05, + "loss": 0.2585, + "step": 24523 + }, + { + "epoch": 31.481386392811295, + "grad_norm": 0.9974441528320312, + "learning_rate": 2.2843388960205395e-05, + "loss": 0.2735, + "step": 24524 + }, + { + "epoch": 31.482670089858793, + "grad_norm": 1.2376372814178467, + "learning_rate": 2.284296106118956e-05, + "loss": 
0.2638, + "step": 24525 + }, + { + "epoch": 31.48395378690629, + "grad_norm": 1.073360562324524, + "learning_rate": 2.2842533162173728e-05, + "loss": 0.2684, + "step": 24526 + }, + { + "epoch": 31.485237483953785, + "grad_norm": 1.8337135314941406, + "learning_rate": 2.2842105263157897e-05, + "loss": 0.2896, + "step": 24527 + }, + { + "epoch": 31.486521181001283, + "grad_norm": 1.9913170337677002, + "learning_rate": 2.284167736414206e-05, + "loss": 0.2704, + "step": 24528 + }, + { + "epoch": 31.48780487804878, + "grad_norm": 2.0179457664489746, + "learning_rate": 2.284124946512623e-05, + "loss": 0.265, + "step": 24529 + }, + { + "epoch": 31.48908857509628, + "grad_norm": 1.1548032760620117, + "learning_rate": 2.28408215661104e-05, + "loss": 0.2396, + "step": 24530 + }, + { + "epoch": 31.490372272143773, + "grad_norm": 2.2351701259613037, + "learning_rate": 2.2840393667094567e-05, + "loss": 0.2651, + "step": 24531 + }, + { + "epoch": 31.49165596919127, + "grad_norm": 2.6277072429656982, + "learning_rate": 2.2839965768078735e-05, + "loss": 0.2616, + "step": 24532 + }, + { + "epoch": 31.49293966623877, + "grad_norm": 1.9127217531204224, + "learning_rate": 2.28395378690629e-05, + "loss": 0.2743, + "step": 24533 + }, + { + "epoch": 31.494223363286263, + "grad_norm": 1.9294383525848389, + "learning_rate": 2.283910997004707e-05, + "loss": 0.3072, + "step": 24534 + }, + { + "epoch": 31.49550706033376, + "grad_norm": 0.9409891366958618, + "learning_rate": 2.2838682071031237e-05, + "loss": 0.2752, + "step": 24535 + }, + { + "epoch": 31.49679075738126, + "grad_norm": 1.7429680824279785, + "learning_rate": 2.2838254172015406e-05, + "loss": 0.3069, + "step": 24536 + }, + { + "epoch": 31.498074454428753, + "grad_norm": 1.0396512746810913, + "learning_rate": 2.283782627299957e-05, + "loss": 0.2674, + "step": 24537 + }, + { + "epoch": 31.49935815147625, + "grad_norm": 2.0761311054229736, + "learning_rate": 2.2837398373983742e-05, + "loss": 0.2855, + "step": 24538 + }, + { + 
"epoch": 31.50064184852375, + "grad_norm": 1.6396404504776, + "learning_rate": 2.2836970474967907e-05, + "loss": 0.2594, + "step": 24539 + }, + { + "epoch": 31.501925545571247, + "grad_norm": 1.5068587064743042, + "learning_rate": 2.2836542575952076e-05, + "loss": 0.3144, + "step": 24540 + }, + { + "epoch": 31.50320924261874, + "grad_norm": 1.4395657777786255, + "learning_rate": 2.2836114676936244e-05, + "loss": 0.2855, + "step": 24541 + }, + { + "epoch": 31.50449293966624, + "grad_norm": 1.1569396257400513, + "learning_rate": 2.283568677792041e-05, + "loss": 0.286, + "step": 24542 + }, + { + "epoch": 31.505776636713737, + "grad_norm": 1.7644215822219849, + "learning_rate": 2.283525887890458e-05, + "loss": 0.2759, + "step": 24543 + }, + { + "epoch": 31.50706033376123, + "grad_norm": 1.286939263343811, + "learning_rate": 2.2834830979888746e-05, + "loss": 0.2995, + "step": 24544 + }, + { + "epoch": 31.50834403080873, + "grad_norm": 3.3368985652923584, + "learning_rate": 2.2834403080872914e-05, + "loss": 0.3004, + "step": 24545 + }, + { + "epoch": 31.509627727856227, + "grad_norm": 1.5392502546310425, + "learning_rate": 2.2833975181857083e-05, + "loss": 0.3285, + "step": 24546 + }, + { + "epoch": 31.51091142490372, + "grad_norm": 4.537082195281982, + "learning_rate": 2.2833547282841248e-05, + "loss": 0.34, + "step": 24547 + }, + { + "epoch": 31.51219512195122, + "grad_norm": 1.68898606300354, + "learning_rate": 2.283311938382542e-05, + "loss": 0.3743, + "step": 24548 + }, + { + "epoch": 31.513478818998717, + "grad_norm": 2.2353076934814453, + "learning_rate": 2.2832691484809585e-05, + "loss": 0.4211, + "step": 24549 + }, + { + "epoch": 31.514762516046215, + "grad_norm": 0.887123167514801, + "learning_rate": 2.2832263585793753e-05, + "loss": 0.2714, + "step": 24550 + }, + { + "epoch": 31.51604621309371, + "grad_norm": 0.7353543043136597, + "learning_rate": 2.283183568677792e-05, + "loss": 0.2517, + "step": 24551 + }, + { + "epoch": 31.517329910141207, + "grad_norm": 
1.0815675258636475, + "learning_rate": 2.283140778776209e-05, + "loss": 0.2599, + "step": 24552 + }, + { + "epoch": 31.518613607188705, + "grad_norm": 2.4444570541381836, + "learning_rate": 2.2830979888746255e-05, + "loss": 0.2718, + "step": 24553 + }, + { + "epoch": 31.5198973042362, + "grad_norm": 1.0473061800003052, + "learning_rate": 2.2830551989730423e-05, + "loss": 0.2596, + "step": 24554 + }, + { + "epoch": 31.521181001283697, + "grad_norm": 1.063946008682251, + "learning_rate": 2.2830124090714592e-05, + "loss": 0.2744, + "step": 24555 + }, + { + "epoch": 31.522464698331195, + "grad_norm": 1.2086535692214966, + "learning_rate": 2.282969619169876e-05, + "loss": 0.277, + "step": 24556 + }, + { + "epoch": 31.52374839537869, + "grad_norm": 0.8336340188980103, + "learning_rate": 2.282926829268293e-05, + "loss": 0.3019, + "step": 24557 + }, + { + "epoch": 31.525032092426187, + "grad_norm": 0.9641695618629456, + "learning_rate": 2.2828840393667094e-05, + "loss": 0.2528, + "step": 24558 + }, + { + "epoch": 31.526315789473685, + "grad_norm": 0.9301766157150269, + "learning_rate": 2.2828412494651265e-05, + "loss": 0.2869, + "step": 24559 + }, + { + "epoch": 31.527599486521183, + "grad_norm": 1.3171173334121704, + "learning_rate": 2.282798459563543e-05, + "loss": 0.2669, + "step": 24560 + }, + { + "epoch": 31.528883183568677, + "grad_norm": 1.930260419845581, + "learning_rate": 2.2827556696619596e-05, + "loss": 0.2626, + "step": 24561 + }, + { + "epoch": 31.530166880616175, + "grad_norm": 2.059708595275879, + "learning_rate": 2.2827128797603767e-05, + "loss": 0.2662, + "step": 24562 + }, + { + "epoch": 31.531450577663673, + "grad_norm": 0.8209036588668823, + "learning_rate": 2.2826700898587932e-05, + "loss": 0.2815, + "step": 24563 + }, + { + "epoch": 31.532734274711167, + "grad_norm": 0.7401474714279175, + "learning_rate": 2.2826272999572104e-05, + "loss": 0.2672, + "step": 24564 + }, + { + "epoch": 31.534017971758665, + "grad_norm": 1.3145716190338135, + 
"learning_rate": 2.282584510055627e-05, + "loss": 0.2655, + "step": 24565 + }, + { + "epoch": 31.535301668806163, + "grad_norm": 0.9374001026153564, + "learning_rate": 2.2825417201540438e-05, + "loss": 0.2797, + "step": 24566 + }, + { + "epoch": 31.536585365853657, + "grad_norm": 2.67895245552063, + "learning_rate": 2.2824989302524606e-05, + "loss": 0.2696, + "step": 24567 + }, + { + "epoch": 31.537869062901155, + "grad_norm": 0.9365885257720947, + "learning_rate": 2.282456140350877e-05, + "loss": 0.2651, + "step": 24568 + }, + { + "epoch": 31.539152759948653, + "grad_norm": 1.405956745147705, + "learning_rate": 2.282413350449294e-05, + "loss": 0.2591, + "step": 24569 + }, + { + "epoch": 31.540436456996147, + "grad_norm": 0.9572981595993042, + "learning_rate": 2.2823705605477108e-05, + "loss": 0.2624, + "step": 24570 + }, + { + "epoch": 31.541720154043645, + "grad_norm": 2.1865243911743164, + "learning_rate": 2.2823277706461276e-05, + "loss": 0.2852, + "step": 24571 + }, + { + "epoch": 31.543003851091143, + "grad_norm": 1.128703236579895, + "learning_rate": 2.2822849807445445e-05, + "loss": 0.2444, + "step": 24572 + }, + { + "epoch": 31.54428754813864, + "grad_norm": 1.5162934064865112, + "learning_rate": 2.2822421908429613e-05, + "loss": 0.2622, + "step": 24573 + }, + { + "epoch": 31.545571245186135, + "grad_norm": 1.0664185285568237, + "learning_rate": 2.2821994009413778e-05, + "loss": 0.2569, + "step": 24574 + }, + { + "epoch": 31.546854942233633, + "grad_norm": 2.1551520824432373, + "learning_rate": 2.2821566110397947e-05, + "loss": 0.2518, + "step": 24575 + }, + { + "epoch": 31.54813863928113, + "grad_norm": 0.8078514337539673, + "learning_rate": 2.2821138211382115e-05, + "loss": 0.2509, + "step": 24576 + }, + { + "epoch": 31.549422336328625, + "grad_norm": 2.125939130783081, + "learning_rate": 2.282071031236628e-05, + "loss": 0.307, + "step": 24577 + }, + { + "epoch": 31.550706033376123, + "grad_norm": 1.2902804613113403, + "learning_rate": 
2.2820282413350452e-05, + "loss": 0.258, + "step": 24578 + }, + { + "epoch": 31.55198973042362, + "grad_norm": 1.1671916246414185, + "learning_rate": 2.2819854514334617e-05, + "loss": 0.2948, + "step": 24579 + }, + { + "epoch": 31.553273427471115, + "grad_norm": 1.0671331882476807, + "learning_rate": 2.281942661531879e-05, + "loss": 0.2425, + "step": 24580 + }, + { + "epoch": 31.554557124518613, + "grad_norm": 1.1865575313568115, + "learning_rate": 2.2818998716302954e-05, + "loss": 0.2596, + "step": 24581 + }, + { + "epoch": 31.55584082156611, + "grad_norm": 1.4860334396362305, + "learning_rate": 2.281857081728712e-05, + "loss": 0.2618, + "step": 24582 + }, + { + "epoch": 31.55712451861361, + "grad_norm": 2.0732438564300537, + "learning_rate": 2.281814291827129e-05, + "loss": 0.2378, + "step": 24583 + }, + { + "epoch": 31.558408215661103, + "grad_norm": 3.0475759506225586, + "learning_rate": 2.2817715019255455e-05, + "loss": 0.2471, + "step": 24584 + }, + { + "epoch": 31.5596919127086, + "grad_norm": 1.3314720392227173, + "learning_rate": 2.2817287120239624e-05, + "loss": 0.2751, + "step": 24585 + }, + { + "epoch": 31.5609756097561, + "grad_norm": 1.9861408472061157, + "learning_rate": 2.2816859221223792e-05, + "loss": 0.2926, + "step": 24586 + }, + { + "epoch": 31.562259306803593, + "grad_norm": 1.3099420070648193, + "learning_rate": 2.281643132220796e-05, + "loss": 0.2849, + "step": 24587 + }, + { + "epoch": 31.56354300385109, + "grad_norm": 2.6703126430511475, + "learning_rate": 2.281600342319213e-05, + "loss": 0.2887, + "step": 24588 + }, + { + "epoch": 31.56482670089859, + "grad_norm": 1.1524052619934082, + "learning_rate": 2.2815575524176294e-05, + "loss": 0.2739, + "step": 24589 + }, + { + "epoch": 31.566110397946083, + "grad_norm": 2.4178385734558105, + "learning_rate": 2.2815147625160463e-05, + "loss": 0.3059, + "step": 24590 + }, + { + "epoch": 31.56739409499358, + "grad_norm": 1.9167439937591553, + "learning_rate": 2.281471972614463e-05, + "loss": 
0.3091, + "step": 24591 + }, + { + "epoch": 31.56867779204108, + "grad_norm": 1.2759156227111816, + "learning_rate": 2.28142918271288e-05, + "loss": 0.303, + "step": 24592 + }, + { + "epoch": 31.569961489088577, + "grad_norm": 1.8497734069824219, + "learning_rate": 2.2813863928112964e-05, + "loss": 0.3341, + "step": 24593 + }, + { + "epoch": 31.57124518613607, + "grad_norm": 1.433345079421997, + "learning_rate": 2.2813436029097133e-05, + "loss": 0.3177, + "step": 24594 + }, + { + "epoch": 31.57252888318357, + "grad_norm": 1.7033582925796509, + "learning_rate": 2.28130081300813e-05, + "loss": 0.2922, + "step": 24595 + }, + { + "epoch": 31.573812580231067, + "grad_norm": 6.301262378692627, + "learning_rate": 2.281258023106547e-05, + "loss": 0.2958, + "step": 24596 + }, + { + "epoch": 31.57509627727856, + "grad_norm": 2.0313193798065186, + "learning_rate": 2.2812152332049638e-05, + "loss": 0.3202, + "step": 24597 + }, + { + "epoch": 31.57637997432606, + "grad_norm": 8.913219451904297, + "learning_rate": 2.2811724433033803e-05, + "loss": 0.354, + "step": 24598 + }, + { + "epoch": 31.577663671373557, + "grad_norm": 9.179020881652832, + "learning_rate": 2.2811296534017975e-05, + "loss": 0.4711, + "step": 24599 + }, + { + "epoch": 31.57894736842105, + "grad_norm": 1.0007190704345703, + "learning_rate": 2.281086863500214e-05, + "loss": 0.2907, + "step": 24600 + }, + { + "epoch": 31.58023106546855, + "grad_norm": 0.8369345664978027, + "learning_rate": 2.2810440735986305e-05, + "loss": 0.2779, + "step": 24601 + }, + { + "epoch": 31.581514762516047, + "grad_norm": 1.5476555824279785, + "learning_rate": 2.2810012836970477e-05, + "loss": 0.2699, + "step": 24602 + }, + { + "epoch": 31.58279845956354, + "grad_norm": 0.9777318239212036, + "learning_rate": 2.2809584937954642e-05, + "loss": 0.3174, + "step": 24603 + }, + { + "epoch": 31.58408215661104, + "grad_norm": 1.2163984775543213, + "learning_rate": 2.2809157038938814e-05, + "loss": 0.2681, + "step": 24604 + }, + { + "epoch": 
31.585365853658537, + "grad_norm": 1.519848346710205, + "learning_rate": 2.280872913992298e-05, + "loss": 0.2926, + "step": 24605 + }, + { + "epoch": 31.586649550706035, + "grad_norm": 0.8689730763435364, + "learning_rate": 2.2808301240907147e-05, + "loss": 0.2812, + "step": 24606 + }, + { + "epoch": 31.58793324775353, + "grad_norm": 0.7462111711502075, + "learning_rate": 2.2807873341891315e-05, + "loss": 0.274, + "step": 24607 + }, + { + "epoch": 31.589216944801027, + "grad_norm": 0.8085793852806091, + "learning_rate": 2.280744544287548e-05, + "loss": 0.2964, + "step": 24608 + }, + { + "epoch": 31.590500641848525, + "grad_norm": 2.6220626831054688, + "learning_rate": 2.280701754385965e-05, + "loss": 0.2813, + "step": 24609 + }, + { + "epoch": 31.59178433889602, + "grad_norm": 1.7413963079452515, + "learning_rate": 2.2806589644843817e-05, + "loss": 0.2723, + "step": 24610 + }, + { + "epoch": 31.593068035943517, + "grad_norm": 1.1972488164901733, + "learning_rate": 2.2806161745827986e-05, + "loss": 0.2771, + "step": 24611 + }, + { + "epoch": 31.594351732991015, + "grad_norm": 1.1688212156295776, + "learning_rate": 2.2805733846812154e-05, + "loss": 0.3196, + "step": 24612 + }, + { + "epoch": 31.59563543003851, + "grad_norm": 2.2594099044799805, + "learning_rate": 2.2805305947796322e-05, + "loss": 0.2875, + "step": 24613 + }, + { + "epoch": 31.596919127086007, + "grad_norm": 0.7900307774543762, + "learning_rate": 2.2804878048780487e-05, + "loss": 0.2824, + "step": 24614 + }, + { + "epoch": 31.598202824133505, + "grad_norm": 1.3491932153701782, + "learning_rate": 2.2804450149764656e-05, + "loss": 0.2878, + "step": 24615 + }, + { + "epoch": 31.599486521181003, + "grad_norm": 0.988039493560791, + "learning_rate": 2.2804022250748824e-05, + "loss": 0.2722, + "step": 24616 + }, + { + "epoch": 31.600770218228497, + "grad_norm": 1.0497101545333862, + "learning_rate": 2.280359435173299e-05, + "loss": 0.2683, + "step": 24617 + }, + { + "epoch": 31.602053915275995, + 
"grad_norm": 1.3377060890197754, + "learning_rate": 2.280316645271716e-05, + "loss": 0.265, + "step": 24618 + }, + { + "epoch": 31.603337612323493, + "grad_norm": 3.4895026683807373, + "learning_rate": 2.2802738553701326e-05, + "loss": 0.3036, + "step": 24619 + }, + { + "epoch": 31.604621309370987, + "grad_norm": 1.5936115980148315, + "learning_rate": 2.2802310654685495e-05, + "loss": 0.2546, + "step": 24620 + }, + { + "epoch": 31.605905006418485, + "grad_norm": 1.0114918947219849, + "learning_rate": 2.2801882755669663e-05, + "loss": 0.2754, + "step": 24621 + }, + { + "epoch": 31.607188703465983, + "grad_norm": 0.9647616744041443, + "learning_rate": 2.2801454856653828e-05, + "loss": 0.2792, + "step": 24622 + }, + { + "epoch": 31.608472400513477, + "grad_norm": 1.8113290071487427, + "learning_rate": 2.2801026957638e-05, + "loss": 0.2612, + "step": 24623 + }, + { + "epoch": 31.609756097560975, + "grad_norm": 1.6665295362472534, + "learning_rate": 2.2800599058622165e-05, + "loss": 0.2605, + "step": 24624 + }, + { + "epoch": 31.611039794608473, + "grad_norm": 1.6407945156097412, + "learning_rate": 2.2800171159606333e-05, + "loss": 0.2969, + "step": 24625 + }, + { + "epoch": 31.61232349165597, + "grad_norm": 1.0308113098144531, + "learning_rate": 2.27997432605905e-05, + "loss": 0.2722, + "step": 24626 + }, + { + "epoch": 31.613607188703465, + "grad_norm": 1.0180010795593262, + "learning_rate": 2.279931536157467e-05, + "loss": 0.2605, + "step": 24627 + }, + { + "epoch": 31.614890885750963, + "grad_norm": 1.0115634202957153, + "learning_rate": 2.2798887462558835e-05, + "loss": 0.2726, + "step": 24628 + }, + { + "epoch": 31.61617458279846, + "grad_norm": 4.1611104011535645, + "learning_rate": 2.2798459563543003e-05, + "loss": 0.2504, + "step": 24629 + }, + { + "epoch": 31.617458279845955, + "grad_norm": 2.935460329055786, + "learning_rate": 2.2798031664527172e-05, + "loss": 0.2905, + "step": 24630 + }, + { + "epoch": 31.618741976893453, + "grad_norm": 1.3947038650512695, + 
"learning_rate": 2.279760376551134e-05, + "loss": 0.2646, + "step": 24631 + }, + { + "epoch": 31.62002567394095, + "grad_norm": 1.1917494535446167, + "learning_rate": 2.279717586649551e-05, + "loss": 0.2643, + "step": 24632 + }, + { + "epoch": 31.621309370988445, + "grad_norm": 1.6920336484909058, + "learning_rate": 2.2796747967479674e-05, + "loss": 0.3076, + "step": 24633 + }, + { + "epoch": 31.622593068035943, + "grad_norm": 1.8208972215652466, + "learning_rate": 2.2796320068463846e-05, + "loss": 0.2758, + "step": 24634 + }, + { + "epoch": 31.62387676508344, + "grad_norm": 2.373762607574463, + "learning_rate": 2.279589216944801e-05, + "loss": 0.2826, + "step": 24635 + }, + { + "epoch": 31.625160462130935, + "grad_norm": 1.54831862449646, + "learning_rate": 2.2795464270432176e-05, + "loss": 0.2687, + "step": 24636 + }, + { + "epoch": 31.626444159178433, + "grad_norm": 23.503681182861328, + "learning_rate": 2.2795036371416347e-05, + "loss": 0.3048, + "step": 24637 + }, + { + "epoch": 31.62772785622593, + "grad_norm": 1.6954140663146973, + "learning_rate": 2.2794608472400512e-05, + "loss": 0.2764, + "step": 24638 + }, + { + "epoch": 31.62901155327343, + "grad_norm": 2.355748176574707, + "learning_rate": 2.2794180573384684e-05, + "loss": 0.3069, + "step": 24639 + }, + { + "epoch": 31.630295250320923, + "grad_norm": 1.4580289125442505, + "learning_rate": 2.279375267436885e-05, + "loss": 0.2964, + "step": 24640 + }, + { + "epoch": 31.63157894736842, + "grad_norm": 1.332787036895752, + "learning_rate": 2.2793324775353018e-05, + "loss": 0.2876, + "step": 24641 + }, + { + "epoch": 31.63286264441592, + "grad_norm": 1.2982145547866821, + "learning_rate": 2.2792896876337186e-05, + "loss": 0.3013, + "step": 24642 + }, + { + "epoch": 31.634146341463413, + "grad_norm": 5.201515197753906, + "learning_rate": 2.279246897732135e-05, + "loss": 0.3247, + "step": 24643 + }, + { + "epoch": 31.63543003851091, + "grad_norm": 2.1944851875305176, + "learning_rate": 2.279204107830552e-05, + 
"loss": 0.2861, + "step": 24644 + }, + { + "epoch": 31.63671373555841, + "grad_norm": 1.3150382041931152, + "learning_rate": 2.2791613179289688e-05, + "loss": 0.3, + "step": 24645 + }, + { + "epoch": 31.637997432605903, + "grad_norm": 1.7990156412124634, + "learning_rate": 2.2791185280273856e-05, + "loss": 0.3563, + "step": 24646 + }, + { + "epoch": 31.6392811296534, + "grad_norm": 5.20665979385376, + "learning_rate": 2.2790757381258025e-05, + "loss": 0.3415, + "step": 24647 + }, + { + "epoch": 31.6405648267009, + "grad_norm": 2.106966018676758, + "learning_rate": 2.2790329482242193e-05, + "loss": 0.3393, + "step": 24648 + }, + { + "epoch": 31.641848523748397, + "grad_norm": 1.7817097902297974, + "learning_rate": 2.2789901583226358e-05, + "loss": 0.4021, + "step": 24649 + }, + { + "epoch": 31.64313222079589, + "grad_norm": 1.119425892829895, + "learning_rate": 2.2789473684210527e-05, + "loss": 0.2744, + "step": 24650 + }, + { + "epoch": 31.64441591784339, + "grad_norm": 1.037516713142395, + "learning_rate": 2.2789045785194695e-05, + "loss": 0.2923, + "step": 24651 + }, + { + "epoch": 31.645699614890887, + "grad_norm": 1.2541817426681519, + "learning_rate": 2.278861788617886e-05, + "loss": 0.2842, + "step": 24652 + }, + { + "epoch": 31.64698331193838, + "grad_norm": 0.9942946434020996, + "learning_rate": 2.2788189987163032e-05, + "loss": 0.2853, + "step": 24653 + }, + { + "epoch": 31.64826700898588, + "grad_norm": 2.0132176876068115, + "learning_rate": 2.2787762088147197e-05, + "loss": 0.2811, + "step": 24654 + }, + { + "epoch": 31.649550706033377, + "grad_norm": 2.7247133255004883, + "learning_rate": 2.2787334189131365e-05, + "loss": 0.2867, + "step": 24655 + }, + { + "epoch": 31.65083440308087, + "grad_norm": 1.4929219484329224, + "learning_rate": 2.2786906290115534e-05, + "loss": 0.2808, + "step": 24656 + }, + { + "epoch": 31.65211810012837, + "grad_norm": 3.0464210510253906, + "learning_rate": 2.27864783910997e-05, + "loss": 0.2858, + "step": 24657 + }, + { + 
"epoch": 31.653401797175867, + "grad_norm": 3.0657260417938232, + "learning_rate": 2.278605049208387e-05, + "loss": 0.276, + "step": 24658 + }, + { + "epoch": 31.654685494223365, + "grad_norm": 1.0122562646865845, + "learning_rate": 2.2785622593068035e-05, + "loss": 0.3191, + "step": 24659 + }, + { + "epoch": 31.65596919127086, + "grad_norm": 1.2119810581207275, + "learning_rate": 2.2785194694052204e-05, + "loss": 0.2777, + "step": 24660 + }, + { + "epoch": 31.657252888318357, + "grad_norm": 1.0394561290740967, + "learning_rate": 2.2784766795036372e-05, + "loss": 0.3184, + "step": 24661 + }, + { + "epoch": 31.658536585365855, + "grad_norm": 1.7761918306350708, + "learning_rate": 2.2784338896020537e-05, + "loss": 0.2459, + "step": 24662 + }, + { + "epoch": 31.65982028241335, + "grad_norm": 1.0491000413894653, + "learning_rate": 2.278391099700471e-05, + "loss": 0.2513, + "step": 24663 + }, + { + "epoch": 31.661103979460847, + "grad_norm": 1.0034098625183105, + "learning_rate": 2.2783483097988874e-05, + "loss": 0.2786, + "step": 24664 + }, + { + "epoch": 31.662387676508345, + "grad_norm": 1.2512307167053223, + "learning_rate": 2.2783055198973043e-05, + "loss": 0.2874, + "step": 24665 + }, + { + "epoch": 31.66367137355584, + "grad_norm": 1.3225332498550415, + "learning_rate": 2.278262729995721e-05, + "loss": 0.2775, + "step": 24666 + }, + { + "epoch": 31.664955070603337, + "grad_norm": 0.8815563917160034, + "learning_rate": 2.278219940094138e-05, + "loss": 0.2605, + "step": 24667 + }, + { + "epoch": 31.666238767650835, + "grad_norm": 1.686132788658142, + "learning_rate": 2.2781771501925544e-05, + "loss": 0.2728, + "step": 24668 + }, + { + "epoch": 31.66752246469833, + "grad_norm": 0.8226271271705627, + "learning_rate": 2.2781343602909713e-05, + "loss": 0.2587, + "step": 24669 + }, + { + "epoch": 31.668806161745827, + "grad_norm": 0.9894349575042725, + "learning_rate": 2.278091570389388e-05, + "loss": 0.269, + "step": 24670 + }, + { + "epoch": 31.670089858793325, + 
"grad_norm": 0.887987494468689, + "learning_rate": 2.278048780487805e-05, + "loss": 0.2736, + "step": 24671 + }, + { + "epoch": 31.671373555840823, + "grad_norm": 0.9249944686889648, + "learning_rate": 2.2780059905862218e-05, + "loss": 0.2601, + "step": 24672 + }, + { + "epoch": 31.672657252888317, + "grad_norm": 1.2762634754180908, + "learning_rate": 2.2779632006846383e-05, + "loss": 0.282, + "step": 24673 + }, + { + "epoch": 31.673940949935815, + "grad_norm": 0.9126420021057129, + "learning_rate": 2.2779204107830555e-05, + "loss": 0.2665, + "step": 24674 + }, + { + "epoch": 31.675224646983313, + "grad_norm": 0.947234034538269, + "learning_rate": 2.277877620881472e-05, + "loss": 0.2777, + "step": 24675 + }, + { + "epoch": 31.676508344030808, + "grad_norm": 1.9370200634002686, + "learning_rate": 2.2778348309798885e-05, + "loss": 0.2799, + "step": 24676 + }, + { + "epoch": 31.677792041078305, + "grad_norm": 1.1730812788009644, + "learning_rate": 2.2777920410783057e-05, + "loss": 0.2632, + "step": 24677 + }, + { + "epoch": 31.679075738125803, + "grad_norm": 1.438099980354309, + "learning_rate": 2.2777492511767222e-05, + "loss": 0.2641, + "step": 24678 + }, + { + "epoch": 31.680359435173298, + "grad_norm": 1.4916439056396484, + "learning_rate": 2.2777064612751394e-05, + "loss": 0.2661, + "step": 24679 + }, + { + "epoch": 31.681643132220795, + "grad_norm": 2.6015262603759766, + "learning_rate": 2.277663671373556e-05, + "loss": 0.2646, + "step": 24680 + }, + { + "epoch": 31.682926829268293, + "grad_norm": 1.4524025917053223, + "learning_rate": 2.2776208814719727e-05, + "loss": 0.2898, + "step": 24681 + }, + { + "epoch": 31.68421052631579, + "grad_norm": 1.4625306129455566, + "learning_rate": 2.2775780915703895e-05, + "loss": 0.2608, + "step": 24682 + }, + { + "epoch": 31.685494223363285, + "grad_norm": 2.5283169746398926, + "learning_rate": 2.277535301668806e-05, + "loss": 0.27, + "step": 24683 + }, + { + "epoch": 31.686777920410783, + "grad_norm": 1.926061749458313, + 
"learning_rate": 2.277492511767223e-05, + "loss": 0.27, + "step": 24684 + }, + { + "epoch": 31.68806161745828, + "grad_norm": 2.4215102195739746, + "learning_rate": 2.2774497218656397e-05, + "loss": 0.2912, + "step": 24685 + }, + { + "epoch": 31.689345314505776, + "grad_norm": 1.2720510959625244, + "learning_rate": 2.2774069319640566e-05, + "loss": 0.2718, + "step": 24686 + }, + { + "epoch": 31.690629011553273, + "grad_norm": 1.6459369659423828, + "learning_rate": 2.2773641420624734e-05, + "loss": 0.2779, + "step": 24687 + }, + { + "epoch": 31.69191270860077, + "grad_norm": 1.8291471004486084, + "learning_rate": 2.2773213521608902e-05, + "loss": 0.2315, + "step": 24688 + }, + { + "epoch": 31.693196405648266, + "grad_norm": 1.691550612449646, + "learning_rate": 2.2772785622593068e-05, + "loss": 0.2859, + "step": 24689 + }, + { + "epoch": 31.694480102695763, + "grad_norm": 1.4638087749481201, + "learning_rate": 2.2772357723577236e-05, + "loss": 0.2489, + "step": 24690 + }, + { + "epoch": 31.69576379974326, + "grad_norm": 3.2180473804473877, + "learning_rate": 2.2771929824561404e-05, + "loss": 0.3039, + "step": 24691 + }, + { + "epoch": 31.69704749679076, + "grad_norm": 1.2400596141815186, + "learning_rate": 2.277150192554557e-05, + "loss": 0.2781, + "step": 24692 + }, + { + "epoch": 31.698331193838253, + "grad_norm": 2.2776732444763184, + "learning_rate": 2.277107402652974e-05, + "loss": 0.2789, + "step": 24693 + }, + { + "epoch": 31.69961489088575, + "grad_norm": 1.288403034210205, + "learning_rate": 2.2770646127513906e-05, + "loss": 0.2671, + "step": 24694 + }, + { + "epoch": 31.70089858793325, + "grad_norm": 2.038985013961792, + "learning_rate": 2.2770218228498078e-05, + "loss": 0.289, + "step": 24695 + }, + { + "epoch": 31.702182284980744, + "grad_norm": 1.7047431468963623, + "learning_rate": 2.2769790329482243e-05, + "loss": 0.3687, + "step": 24696 + }, + { + "epoch": 31.70346598202824, + "grad_norm": 1.4781428575515747, + "learning_rate": 
2.2769362430466408e-05, + "loss": 0.3106, + "step": 24697 + }, + { + "epoch": 31.70474967907574, + "grad_norm": 2.1333906650543213, + "learning_rate": 2.276893453145058e-05, + "loss": 0.3834, + "step": 24698 + }, + { + "epoch": 31.706033376123234, + "grad_norm": 1.6561052799224854, + "learning_rate": 2.2768506632434745e-05, + "loss": 0.4469, + "step": 24699 + }, + { + "epoch": 31.70731707317073, + "grad_norm": 0.9468320608139038, + "learning_rate": 2.2768078733418913e-05, + "loss": 0.2808, + "step": 24700 + }, + { + "epoch": 31.70860077021823, + "grad_norm": 0.6998728513717651, + "learning_rate": 2.276765083440308e-05, + "loss": 0.283, + "step": 24701 + }, + { + "epoch": 31.709884467265724, + "grad_norm": 0.9047815799713135, + "learning_rate": 2.276722293538725e-05, + "loss": 0.2799, + "step": 24702 + }, + { + "epoch": 31.71116816431322, + "grad_norm": 1.0400376319885254, + "learning_rate": 2.276679503637142e-05, + "loss": 0.277, + "step": 24703 + }, + { + "epoch": 31.71245186136072, + "grad_norm": 1.7514997720718384, + "learning_rate": 2.2766367137355584e-05, + "loss": 0.2572, + "step": 24704 + }, + { + "epoch": 31.713735558408217, + "grad_norm": 2.406463623046875, + "learning_rate": 2.2765939238339752e-05, + "loss": 0.299, + "step": 24705 + }, + { + "epoch": 31.71501925545571, + "grad_norm": 2.9210550785064697, + "learning_rate": 2.276551133932392e-05, + "loss": 0.2771, + "step": 24706 + }, + { + "epoch": 31.71630295250321, + "grad_norm": 0.9323055148124695, + "learning_rate": 2.276508344030809e-05, + "loss": 0.2847, + "step": 24707 + }, + { + "epoch": 31.717586649550707, + "grad_norm": 1.03038489818573, + "learning_rate": 2.2764655541292254e-05, + "loss": 0.2822, + "step": 24708 + }, + { + "epoch": 31.7188703465982, + "grad_norm": 5.689121723175049, + "learning_rate": 2.2764227642276426e-05, + "loss": 0.2713, + "step": 24709 + }, + { + "epoch": 31.7201540436457, + "grad_norm": 4.9874267578125, + "learning_rate": 2.276379974326059e-05, + "loss": 0.2657, + "step": 
24710 + }, + { + "epoch": 31.721437740693197, + "grad_norm": 0.9560267329216003, + "learning_rate": 2.276337184424476e-05, + "loss": 0.2919, + "step": 24711 + }, + { + "epoch": 31.72272143774069, + "grad_norm": 1.567643404006958, + "learning_rate": 2.2762943945228927e-05, + "loss": 0.2619, + "step": 24712 + }, + { + "epoch": 31.72400513478819, + "grad_norm": 1.2380813360214233, + "learning_rate": 2.2762516046213092e-05, + "loss": 0.2725, + "step": 24713 + }, + { + "epoch": 31.725288831835687, + "grad_norm": 1.0944617986679077, + "learning_rate": 2.2762088147197264e-05, + "loss": 0.2972, + "step": 24714 + }, + { + "epoch": 31.726572528883185, + "grad_norm": 0.9708751440048218, + "learning_rate": 2.276166024818143e-05, + "loss": 0.3009, + "step": 24715 + }, + { + "epoch": 31.72785622593068, + "grad_norm": 2.1169986724853516, + "learning_rate": 2.2761232349165598e-05, + "loss": 0.2701, + "step": 24716 + }, + { + "epoch": 31.729139922978177, + "grad_norm": 0.9180201292037964, + "learning_rate": 2.2760804450149766e-05, + "loss": 0.2626, + "step": 24717 + }, + { + "epoch": 31.730423620025675, + "grad_norm": 7.905758857727051, + "learning_rate": 2.276037655113393e-05, + "loss": 0.2343, + "step": 24718 + }, + { + "epoch": 31.73170731707317, + "grad_norm": 1.1760257482528687, + "learning_rate": 2.2759948652118103e-05, + "loss": 0.2817, + "step": 24719 + }, + { + "epoch": 31.732991014120667, + "grad_norm": 1.1041309833526611, + "learning_rate": 2.2759520753102268e-05, + "loss": 0.2727, + "step": 24720 + }, + { + "epoch": 31.734274711168165, + "grad_norm": 1.203149676322937, + "learning_rate": 2.2759092854086436e-05, + "loss": 0.2742, + "step": 24721 + }, + { + "epoch": 31.73555840821566, + "grad_norm": 1.0814515352249146, + "learning_rate": 2.2758664955070605e-05, + "loss": 0.2592, + "step": 24722 + }, + { + "epoch": 31.736842105263158, + "grad_norm": 0.9056252241134644, + "learning_rate": 2.275823705605477e-05, + "loss": 0.2438, + "step": 24723 + }, + { + "epoch": 
31.738125802310655, + "grad_norm": 2.172581672668457, + "learning_rate": 2.2757809157038938e-05, + "loss": 0.2595, + "step": 24724 + }, + { + "epoch": 31.739409499358153, + "grad_norm": 1.0629292726516724, + "learning_rate": 2.2757381258023107e-05, + "loss": 0.2598, + "step": 24725 + }, + { + "epoch": 31.740693196405648, + "grad_norm": 1.061692714691162, + "learning_rate": 2.2756953359007275e-05, + "loss": 0.2486, + "step": 24726 + }, + { + "epoch": 31.741976893453145, + "grad_norm": 1.1371783018112183, + "learning_rate": 2.2756525459991443e-05, + "loss": 0.2345, + "step": 24727 + }, + { + "epoch": 31.743260590500643, + "grad_norm": 1.0802209377288818, + "learning_rate": 2.2756097560975612e-05, + "loss": 0.2957, + "step": 24728 + }, + { + "epoch": 31.744544287548138, + "grad_norm": 4.245956897735596, + "learning_rate": 2.2755669661959777e-05, + "loss": 0.2452, + "step": 24729 + }, + { + "epoch": 31.745827984595635, + "grad_norm": 4.05888557434082, + "learning_rate": 2.2755241762943945e-05, + "loss": 0.2609, + "step": 24730 + }, + { + "epoch": 31.747111681643133, + "grad_norm": 2.1510403156280518, + "learning_rate": 2.2754813863928114e-05, + "loss": 0.2636, + "step": 24731 + }, + { + "epoch": 31.748395378690628, + "grad_norm": 2.5481600761413574, + "learning_rate": 2.275438596491228e-05, + "loss": 0.2839, + "step": 24732 + }, + { + "epoch": 31.749679075738126, + "grad_norm": 1.7912344932556152, + "learning_rate": 2.275395806589645e-05, + "loss": 0.27, + "step": 24733 + }, + { + "epoch": 31.750962772785623, + "grad_norm": 2.369760513305664, + "learning_rate": 2.2753530166880616e-05, + "loss": 0.2466, + "step": 24734 + }, + { + "epoch": 31.752246469833118, + "grad_norm": 2.7854130268096924, + "learning_rate": 2.2753102267864787e-05, + "loss": 0.2687, + "step": 24735 + }, + { + "epoch": 31.753530166880616, + "grad_norm": 1.2248461246490479, + "learning_rate": 2.2752674368848952e-05, + "loss": 0.2371, + "step": 24736 + }, + { + "epoch": 31.754813863928113, + 
"grad_norm": 1.6767195463180542, + "learning_rate": 2.2752246469833117e-05, + "loss": 0.3027, + "step": 24737 + }, + { + "epoch": 31.75609756097561, + "grad_norm": 2.0204660892486572, + "learning_rate": 2.275181857081729e-05, + "loss": 0.3063, + "step": 24738 + }, + { + "epoch": 31.757381258023106, + "grad_norm": 1.8848299980163574, + "learning_rate": 2.2751390671801454e-05, + "loss": 0.2602, + "step": 24739 + }, + { + "epoch": 31.758664955070603, + "grad_norm": 3.467895269393921, + "learning_rate": 2.2750962772785623e-05, + "loss": 0.283, + "step": 24740 + }, + { + "epoch": 31.7599486521181, + "grad_norm": 1.6549034118652344, + "learning_rate": 2.275053487376979e-05, + "loss": 0.2712, + "step": 24741 + }, + { + "epoch": 31.761232349165596, + "grad_norm": 2.8181276321411133, + "learning_rate": 2.275010697475396e-05, + "loss": 0.3273, + "step": 24742 + }, + { + "epoch": 31.762516046213094, + "grad_norm": 1.499377727508545, + "learning_rate": 2.2749679075738128e-05, + "loss": 0.3058, + "step": 24743 + }, + { + "epoch": 31.76379974326059, + "grad_norm": 2.607325553894043, + "learning_rate": 2.2749251176722293e-05, + "loss": 0.2884, + "step": 24744 + }, + { + "epoch": 31.765083440308086, + "grad_norm": 1.7091495990753174, + "learning_rate": 2.274882327770646e-05, + "loss": 0.3065, + "step": 24745 + }, + { + "epoch": 31.766367137355584, + "grad_norm": 2.5818095207214355, + "learning_rate": 2.274839537869063e-05, + "loss": 0.2824, + "step": 24746 + }, + { + "epoch": 31.76765083440308, + "grad_norm": 2.5765156745910645, + "learning_rate": 2.2747967479674798e-05, + "loss": 0.3119, + "step": 24747 + }, + { + "epoch": 31.76893453145058, + "grad_norm": 1.58547043800354, + "learning_rate": 2.2747539580658963e-05, + "loss": 0.375, + "step": 24748 + }, + { + "epoch": 31.770218228498074, + "grad_norm": 2.8183345794677734, + "learning_rate": 2.2747111681643135e-05, + "loss": 0.4483, + "step": 24749 + }, + { + "epoch": 31.77150192554557, + "grad_norm": 0.9593631029129028, + 
"learning_rate": 2.27466837826273e-05, + "loss": 0.2635, + "step": 24750 + }, + { + "epoch": 31.77278562259307, + "grad_norm": 0.7777299880981445, + "learning_rate": 2.274625588361147e-05, + "loss": 0.2521, + "step": 24751 + }, + { + "epoch": 31.774069319640564, + "grad_norm": 1.5969592332839966, + "learning_rate": 2.2745827984595637e-05, + "loss": 0.2681, + "step": 24752 + }, + { + "epoch": 31.77535301668806, + "grad_norm": 1.183383584022522, + "learning_rate": 2.2745400085579802e-05, + "loss": 0.2713, + "step": 24753 + }, + { + "epoch": 31.77663671373556, + "grad_norm": 1.0661410093307495, + "learning_rate": 2.2744972186563974e-05, + "loss": 0.2682, + "step": 24754 + }, + { + "epoch": 31.777920410783054, + "grad_norm": 0.6388439536094666, + "learning_rate": 2.274454428754814e-05, + "loss": 0.2535, + "step": 24755 + }, + { + "epoch": 31.77920410783055, + "grad_norm": 1.627012014389038, + "learning_rate": 2.2744116388532307e-05, + "loss": 0.2824, + "step": 24756 + }, + { + "epoch": 31.78048780487805, + "grad_norm": 6.097297191619873, + "learning_rate": 2.2743688489516475e-05, + "loss": 0.2454, + "step": 24757 + }, + { + "epoch": 31.781771501925547, + "grad_norm": 0.7602789998054504, + "learning_rate": 2.274326059050064e-05, + "loss": 0.2824, + "step": 24758 + }, + { + "epoch": 31.78305519897304, + "grad_norm": 1.5292601585388184, + "learning_rate": 2.2742832691484812e-05, + "loss": 0.2749, + "step": 24759 + }, + { + "epoch": 31.78433889602054, + "grad_norm": 10.075055122375488, + "learning_rate": 2.2742404792468977e-05, + "loss": 0.286, + "step": 24760 + }, + { + "epoch": 31.785622593068037, + "grad_norm": 1.2132899761199951, + "learning_rate": 2.2741976893453146e-05, + "loss": 0.3011, + "step": 24761 + }, + { + "epoch": 31.78690629011553, + "grad_norm": 1.6167601346969604, + "learning_rate": 2.2741548994437314e-05, + "loss": 0.2653, + "step": 24762 + }, + { + "epoch": 31.78818998716303, + "grad_norm": 0.9721375703811646, + "learning_rate": 2.2741121095421483e-05, 
+ "loss": 0.271, + "step": 24763 + }, + { + "epoch": 31.789473684210527, + "grad_norm": 1.4342113733291626, + "learning_rate": 2.2740693196405648e-05, + "loss": 0.3013, + "step": 24764 + }, + { + "epoch": 31.79075738125802, + "grad_norm": 1.1086887121200562, + "learning_rate": 2.2740265297389816e-05, + "loss": 0.2857, + "step": 24765 + }, + { + "epoch": 31.79204107830552, + "grad_norm": 0.8162440657615662, + "learning_rate": 2.2739837398373984e-05, + "loss": 0.2928, + "step": 24766 + }, + { + "epoch": 31.793324775353017, + "grad_norm": 1.0505075454711914, + "learning_rate": 2.2739409499358153e-05, + "loss": 0.2596, + "step": 24767 + }, + { + "epoch": 31.794608472400512, + "grad_norm": 1.0230432748794556, + "learning_rate": 2.273898160034232e-05, + "loss": 0.2643, + "step": 24768 + }, + { + "epoch": 31.79589216944801, + "grad_norm": 1.0017184019088745, + "learning_rate": 2.2738553701326486e-05, + "loss": 0.2707, + "step": 24769 + }, + { + "epoch": 31.797175866495508, + "grad_norm": 1.1102086305618286, + "learning_rate": 2.2738125802310658e-05, + "loss": 0.2729, + "step": 24770 + }, + { + "epoch": 31.798459563543005, + "grad_norm": 1.1978349685668945, + "learning_rate": 2.2737697903294823e-05, + "loss": 0.2531, + "step": 24771 + }, + { + "epoch": 31.7997432605905, + "grad_norm": 2.8327383995056152, + "learning_rate": 2.2737270004278988e-05, + "loss": 0.2509, + "step": 24772 + }, + { + "epoch": 31.801026957637998, + "grad_norm": 1.137129306793213, + "learning_rate": 2.273684210526316e-05, + "loss": 0.2426, + "step": 24773 + }, + { + "epoch": 31.802310654685495, + "grad_norm": 2.73872709274292, + "learning_rate": 2.2736414206247325e-05, + "loss": 0.2727, + "step": 24774 + }, + { + "epoch": 31.80359435173299, + "grad_norm": 1.3055462837219238, + "learning_rate": 2.2735986307231497e-05, + "loss": 0.2492, + "step": 24775 + }, + { + "epoch": 31.804878048780488, + "grad_norm": 1.1188238859176636, + "learning_rate": 2.2735558408215662e-05, + "loss": 0.2591, + "step": 24776 + 
}, + { + "epoch": 31.806161745827985, + "grad_norm": 1.2361979484558105, + "learning_rate": 2.273513050919983e-05, + "loss": 0.267, + "step": 24777 + }, + { + "epoch": 31.80744544287548, + "grad_norm": 1.5279141664505005, + "learning_rate": 2.2734702610184e-05, + "loss": 0.2732, + "step": 24778 + }, + { + "epoch": 31.808729139922978, + "grad_norm": 1.2322654724121094, + "learning_rate": 2.2734274711168164e-05, + "loss": 0.2903, + "step": 24779 + }, + { + "epoch": 31.810012836970476, + "grad_norm": 1.4955685138702393, + "learning_rate": 2.2733846812152332e-05, + "loss": 0.279, + "step": 24780 + }, + { + "epoch": 31.811296534017973, + "grad_norm": 1.855673909187317, + "learning_rate": 2.27334189131365e-05, + "loss": 0.2407, + "step": 24781 + }, + { + "epoch": 31.812580231065468, + "grad_norm": 0.8722140789031982, + "learning_rate": 2.273299101412067e-05, + "loss": 0.2556, + "step": 24782 + }, + { + "epoch": 31.813863928112966, + "grad_norm": 0.8922469615936279, + "learning_rate": 2.2732563115104837e-05, + "loss": 0.2646, + "step": 24783 + }, + { + "epoch": 31.815147625160463, + "grad_norm": 1.7002907991409302, + "learning_rate": 2.2732135216089002e-05, + "loss": 0.2974, + "step": 24784 + }, + { + "epoch": 31.816431322207958, + "grad_norm": 0.9518094062805176, + "learning_rate": 2.273170731707317e-05, + "loss": 0.2703, + "step": 24785 + }, + { + "epoch": 31.817715019255456, + "grad_norm": 1.1980552673339844, + "learning_rate": 2.273127941805734e-05, + "loss": 0.273, + "step": 24786 + }, + { + "epoch": 31.818998716302954, + "grad_norm": 10.870905876159668, + "learning_rate": 2.2730851519041507e-05, + "loss": 0.2974, + "step": 24787 + }, + { + "epoch": 31.820282413350448, + "grad_norm": 0.9375141859054565, + "learning_rate": 2.2730423620025673e-05, + "loss": 0.2485, + "step": 24788 + }, + { + "epoch": 31.821566110397946, + "grad_norm": 1.5204194784164429, + "learning_rate": 2.2729995721009844e-05, + "loss": 0.2866, + "step": 24789 + }, + { + "epoch": 31.822849807445444, 
+ "grad_norm": 1.3723193407058716, + "learning_rate": 2.272956782199401e-05, + "loss": 0.2701, + "step": 24790 + }, + { + "epoch": 31.82413350449294, + "grad_norm": 1.490473747253418, + "learning_rate": 2.2729139922978178e-05, + "loss": 0.327, + "step": 24791 + }, + { + "epoch": 31.825417201540436, + "grad_norm": 1.2698017358779907, + "learning_rate": 2.2728712023962346e-05, + "loss": 0.2732, + "step": 24792 + }, + { + "epoch": 31.826700898587934, + "grad_norm": 1.2539924383163452, + "learning_rate": 2.272828412494651e-05, + "loss": 0.2707, + "step": 24793 + }, + { + "epoch": 31.82798459563543, + "grad_norm": 1.8489779233932495, + "learning_rate": 2.2727856225930683e-05, + "loss": 0.3484, + "step": 24794 + }, + { + "epoch": 31.829268292682926, + "grad_norm": 1.6199735403060913, + "learning_rate": 2.2727428326914848e-05, + "loss": 0.3122, + "step": 24795 + }, + { + "epoch": 31.830551989730424, + "grad_norm": 1.6117702722549438, + "learning_rate": 2.2727000427899016e-05, + "loss": 0.3146, + "step": 24796 + }, + { + "epoch": 31.83183568677792, + "grad_norm": 1.5450761318206787, + "learning_rate": 2.2726572528883185e-05, + "loss": 0.3045, + "step": 24797 + }, + { + "epoch": 31.833119383825416, + "grad_norm": 1.5068745613098145, + "learning_rate": 2.272614462986735e-05, + "loss": 0.3393, + "step": 24798 + }, + { + "epoch": 31.834403080872914, + "grad_norm": 2.2341561317443848, + "learning_rate": 2.272571673085152e-05, + "loss": 0.4893, + "step": 24799 + }, + { + "epoch": 31.83568677792041, + "grad_norm": 0.7476813197135925, + "learning_rate": 2.2725288831835687e-05, + "loss": 0.274, + "step": 24800 + }, + { + "epoch": 31.836970474967906, + "grad_norm": 1.22001314163208, + "learning_rate": 2.2724860932819855e-05, + "loss": 0.2737, + "step": 24801 + }, + { + "epoch": 31.838254172015404, + "grad_norm": 1.0795319080352783, + "learning_rate": 2.2724433033804024e-05, + "loss": 0.2722, + "step": 24802 + }, + { + "epoch": 31.8395378690629, + "grad_norm": 1.512494683265686, + 
"learning_rate": 2.2724005134788192e-05, + "loss": 0.3005, + "step": 24803 + }, + { + "epoch": 31.8408215661104, + "grad_norm": 0.7932227253913879, + "learning_rate": 2.2723577235772357e-05, + "loss": 0.2665, + "step": 24804 + }, + { + "epoch": 31.842105263157894, + "grad_norm": 1.2308335304260254, + "learning_rate": 2.2723149336756525e-05, + "loss": 0.2432, + "step": 24805 + }, + { + "epoch": 31.84338896020539, + "grad_norm": 1.0905674695968628, + "learning_rate": 2.2722721437740694e-05, + "loss": 0.2573, + "step": 24806 + }, + { + "epoch": 31.84467265725289, + "grad_norm": 1.7942464351654053, + "learning_rate": 2.2722293538724862e-05, + "loss": 0.2708, + "step": 24807 + }, + { + "epoch": 31.845956354300384, + "grad_norm": 1.3114452362060547, + "learning_rate": 2.272186563970903e-05, + "loss": 0.3098, + "step": 24808 + }, + { + "epoch": 31.84724005134788, + "grad_norm": 0.8724583983421326, + "learning_rate": 2.2721437740693196e-05, + "loss": 0.2754, + "step": 24809 + }, + { + "epoch": 31.84852374839538, + "grad_norm": 1.0364115238189697, + "learning_rate": 2.2721009841677367e-05, + "loss": 0.2709, + "step": 24810 + }, + { + "epoch": 31.849807445442874, + "grad_norm": 0.8498751521110535, + "learning_rate": 2.2720581942661532e-05, + "loss": 0.2594, + "step": 24811 + }, + { + "epoch": 31.85109114249037, + "grad_norm": 1.3475908041000366, + "learning_rate": 2.2720154043645697e-05, + "loss": 0.2445, + "step": 24812 + }, + { + "epoch": 31.85237483953787, + "grad_norm": 1.1562660932540894, + "learning_rate": 2.271972614462987e-05, + "loss": 0.2758, + "step": 24813 + }, + { + "epoch": 31.853658536585368, + "grad_norm": 1.826114535331726, + "learning_rate": 2.2719298245614034e-05, + "loss": 0.289, + "step": 24814 + }, + { + "epoch": 31.854942233632862, + "grad_norm": 1.6335347890853882, + "learning_rate": 2.2718870346598206e-05, + "loss": 0.2788, + "step": 24815 + }, + { + "epoch": 31.85622593068036, + "grad_norm": 4.623518943786621, + "learning_rate": 
2.271844244758237e-05, + "loss": 0.2747, + "step": 24816 + }, + { + "epoch": 31.857509627727858, + "grad_norm": 0.6809067726135254, + "learning_rate": 2.271801454856654e-05, + "loss": 0.2466, + "step": 24817 + }, + { + "epoch": 31.858793324775352, + "grad_norm": 0.962794840335846, + "learning_rate": 2.2717586649550708e-05, + "loss": 0.2622, + "step": 24818 + }, + { + "epoch": 31.86007702182285, + "grad_norm": 1.3556995391845703, + "learning_rate": 2.2717158750534873e-05, + "loss": 0.271, + "step": 24819 + }, + { + "epoch": 31.861360718870348, + "grad_norm": 0.8927512764930725, + "learning_rate": 2.271673085151904e-05, + "loss": 0.2724, + "step": 24820 + }, + { + "epoch": 31.862644415917842, + "grad_norm": 1.2567381858825684, + "learning_rate": 2.271630295250321e-05, + "loss": 0.2731, + "step": 24821 + }, + { + "epoch": 31.86392811296534, + "grad_norm": 1.5086654424667358, + "learning_rate": 2.2715875053487378e-05, + "loss": 0.284, + "step": 24822 + }, + { + "epoch": 31.865211810012838, + "grad_norm": 1.0831756591796875, + "learning_rate": 2.2715447154471543e-05, + "loss": 0.2617, + "step": 24823 + }, + { + "epoch": 31.866495507060336, + "grad_norm": 0.9012628197669983, + "learning_rate": 2.2715019255455715e-05, + "loss": 0.3039, + "step": 24824 + }, + { + "epoch": 31.86777920410783, + "grad_norm": 1.4600870609283447, + "learning_rate": 2.271459135643988e-05, + "loss": 0.2734, + "step": 24825 + }, + { + "epoch": 31.869062901155328, + "grad_norm": 0.9684482216835022, + "learning_rate": 2.271416345742405e-05, + "loss": 0.2911, + "step": 24826 + }, + { + "epoch": 31.870346598202826, + "grad_norm": 0.8000003695487976, + "learning_rate": 2.2713735558408217e-05, + "loss": 0.252, + "step": 24827 + }, + { + "epoch": 31.87163029525032, + "grad_norm": 0.8044953346252441, + "learning_rate": 2.2713307659392382e-05, + "loss": 0.261, + "step": 24828 + }, + { + "epoch": 31.872913992297818, + "grad_norm": 1.1044986248016357, + "learning_rate": 2.2712879760376554e-05, + "loss": 
0.2906, + "step": 24829 + }, + { + "epoch": 31.874197689345316, + "grad_norm": 1.2570128440856934, + "learning_rate": 2.271245186136072e-05, + "loss": 0.2746, + "step": 24830 + }, + { + "epoch": 31.87548138639281, + "grad_norm": 1.4134302139282227, + "learning_rate": 2.2712023962344887e-05, + "loss": 0.2743, + "step": 24831 + }, + { + "epoch": 31.876765083440308, + "grad_norm": 2.3297407627105713, + "learning_rate": 2.2711596063329056e-05, + "loss": 0.2968, + "step": 24832 + }, + { + "epoch": 31.878048780487806, + "grad_norm": 1.1115401983261108, + "learning_rate": 2.271116816431322e-05, + "loss": 0.2615, + "step": 24833 + }, + { + "epoch": 31.8793324775353, + "grad_norm": 4.873526096343994, + "learning_rate": 2.2710740265297392e-05, + "loss": 0.2652, + "step": 24834 + }, + { + "epoch": 31.880616174582798, + "grad_norm": 1.1456501483917236, + "learning_rate": 2.2710312366281557e-05, + "loss": 0.2571, + "step": 24835 + }, + { + "epoch": 31.881899871630296, + "grad_norm": 0.9533454775810242, + "learning_rate": 2.2709884467265726e-05, + "loss": 0.2847, + "step": 24836 + }, + { + "epoch": 31.883183568677794, + "grad_norm": 1.2314341068267822, + "learning_rate": 2.2709456568249894e-05, + "loss": 0.2749, + "step": 24837 + }, + { + "epoch": 31.884467265725288, + "grad_norm": 1.3087831735610962, + "learning_rate": 2.2709028669234063e-05, + "loss": 0.2839, + "step": 24838 + }, + { + "epoch": 31.885750962772786, + "grad_norm": 1.3016254901885986, + "learning_rate": 2.2708600770218228e-05, + "loss": 0.2766, + "step": 24839 + }, + { + "epoch": 31.887034659820284, + "grad_norm": 1.2086830139160156, + "learning_rate": 2.2708172871202396e-05, + "loss": 0.2726, + "step": 24840 + }, + { + "epoch": 31.888318356867778, + "grad_norm": 1.437666416168213, + "learning_rate": 2.2707744972186564e-05, + "loss": 0.2964, + "step": 24841 + }, + { + "epoch": 31.889602053915276, + "grad_norm": 3.6334574222564697, + "learning_rate": 2.2707317073170733e-05, + "loss": 0.2846, + "step": 24842 + }, + 
{ + "epoch": 31.890885750962774, + "grad_norm": 3.692275047302246, + "learning_rate": 2.27068891741549e-05, + "loss": 0.3074, + "step": 24843 + }, + { + "epoch": 31.892169448010268, + "grad_norm": 23.06918716430664, + "learning_rate": 2.2706461275139066e-05, + "loss": 0.2803, + "step": 24844 + }, + { + "epoch": 31.893453145057766, + "grad_norm": 1.789393663406372, + "learning_rate": 2.2706033376123235e-05, + "loss": 0.3289, + "step": 24845 + }, + { + "epoch": 31.894736842105264, + "grad_norm": 1.4703947305679321, + "learning_rate": 2.2705605477107403e-05, + "loss": 0.29, + "step": 24846 + }, + { + "epoch": 31.89602053915276, + "grad_norm": 2.6879169940948486, + "learning_rate": 2.2705177578091568e-05, + "loss": 0.3478, + "step": 24847 + }, + { + "epoch": 31.897304236200256, + "grad_norm": 3.4496641159057617, + "learning_rate": 2.270474967907574e-05, + "loss": 0.4226, + "step": 24848 + }, + { + "epoch": 31.898587933247754, + "grad_norm": 2.54656982421875, + "learning_rate": 2.2704321780059905e-05, + "loss": 0.4668, + "step": 24849 + }, + { + "epoch": 31.89987163029525, + "grad_norm": 0.7347378730773926, + "learning_rate": 2.2703893881044077e-05, + "loss": 0.3046, + "step": 24850 + }, + { + "epoch": 31.901155327342746, + "grad_norm": 0.7566764950752258, + "learning_rate": 2.2703465982028242e-05, + "loss": 0.2614, + "step": 24851 + }, + { + "epoch": 31.902439024390244, + "grad_norm": 1.0178663730621338, + "learning_rate": 2.2703038083012407e-05, + "loss": 0.2868, + "step": 24852 + }, + { + "epoch": 31.90372272143774, + "grad_norm": 1.059462070465088, + "learning_rate": 2.270261018399658e-05, + "loss": 0.3033, + "step": 24853 + }, + { + "epoch": 31.905006418485236, + "grad_norm": 1.787040114402771, + "learning_rate": 2.2702182284980744e-05, + "loss": 0.2866, + "step": 24854 + }, + { + "epoch": 31.906290115532734, + "grad_norm": 0.9890152812004089, + "learning_rate": 2.2701754385964912e-05, + "loss": 0.2579, + "step": 24855 + }, + { + "epoch": 31.90757381258023, + 
"grad_norm": 1.0170001983642578, + "learning_rate": 2.270132648694908e-05, + "loss": 0.2693, + "step": 24856 + }, + { + "epoch": 31.90885750962773, + "grad_norm": 1.8490686416625977, + "learning_rate": 2.270089858793325e-05, + "loss": 0.2758, + "step": 24857 + }, + { + "epoch": 31.910141206675224, + "grad_norm": 1.1172876358032227, + "learning_rate": 2.2700470688917417e-05, + "loss": 0.2757, + "step": 24858 + }, + { + "epoch": 31.911424903722722, + "grad_norm": 1.2452491521835327, + "learning_rate": 2.2700042789901582e-05, + "loss": 0.2849, + "step": 24859 + }, + { + "epoch": 31.91270860077022, + "grad_norm": 1.0623054504394531, + "learning_rate": 2.269961489088575e-05, + "loss": 0.2722, + "step": 24860 + }, + { + "epoch": 31.913992297817714, + "grad_norm": 1.21195387840271, + "learning_rate": 2.269918699186992e-05, + "loss": 0.2859, + "step": 24861 + }, + { + "epoch": 31.915275994865212, + "grad_norm": 1.243373990058899, + "learning_rate": 2.2698759092854088e-05, + "loss": 0.2573, + "step": 24862 + }, + { + "epoch": 31.91655969191271, + "grad_norm": 3.120063543319702, + "learning_rate": 2.2698331193838253e-05, + "loss": 0.3026, + "step": 24863 + }, + { + "epoch": 31.917843388960204, + "grad_norm": 0.939220666885376, + "learning_rate": 2.2697903294822424e-05, + "loss": 0.2788, + "step": 24864 + }, + { + "epoch": 31.919127086007702, + "grad_norm": 0.9524410963058472, + "learning_rate": 2.269747539580659e-05, + "loss": 0.2751, + "step": 24865 + }, + { + "epoch": 31.9204107830552, + "grad_norm": 0.7550270557403564, + "learning_rate": 2.2697047496790758e-05, + "loss": 0.2546, + "step": 24866 + }, + { + "epoch": 31.921694480102694, + "grad_norm": 1.1240471601486206, + "learning_rate": 2.2696619597774926e-05, + "loss": 0.2758, + "step": 24867 + }, + { + "epoch": 31.922978177150192, + "grad_norm": 1.2850040197372437, + "learning_rate": 2.269619169875909e-05, + "loss": 0.258, + "step": 24868 + }, + { + "epoch": 31.92426187419769, + "grad_norm": 1.0279797315597534, + 
"learning_rate": 2.2695763799743263e-05, + "loss": 0.275, + "step": 24869 + }, + { + "epoch": 31.925545571245188, + "grad_norm": 3.0623176097869873, + "learning_rate": 2.2695335900727428e-05, + "loss": 0.2849, + "step": 24870 + }, + { + "epoch": 31.926829268292682, + "grad_norm": 2.6988368034362793, + "learning_rate": 2.2694908001711596e-05, + "loss": 0.2715, + "step": 24871 + }, + { + "epoch": 31.92811296534018, + "grad_norm": 0.9012576937675476, + "learning_rate": 2.2694480102695765e-05, + "loss": 0.2679, + "step": 24872 + }, + { + "epoch": 31.929396662387678, + "grad_norm": 0.792742133140564, + "learning_rate": 2.269405220367993e-05, + "loss": 0.255, + "step": 24873 + }, + { + "epoch": 31.930680359435172, + "grad_norm": 0.9323817491531372, + "learning_rate": 2.2693624304664102e-05, + "loss": 0.2669, + "step": 24874 + }, + { + "epoch": 31.93196405648267, + "grad_norm": 1.1119334697723389, + "learning_rate": 2.2693196405648267e-05, + "loss": 0.2838, + "step": 24875 + }, + { + "epoch": 31.933247753530168, + "grad_norm": 1.0799669027328491, + "learning_rate": 2.2692768506632435e-05, + "loss": 0.2582, + "step": 24876 + }, + { + "epoch": 31.934531450577662, + "grad_norm": 1.3146311044692993, + "learning_rate": 2.2692340607616604e-05, + "loss": 0.2553, + "step": 24877 + }, + { + "epoch": 31.93581514762516, + "grad_norm": 1.0202170610427856, + "learning_rate": 2.2691912708600772e-05, + "loss": 0.2473, + "step": 24878 + }, + { + "epoch": 31.937098844672658, + "grad_norm": 1.9913084506988525, + "learning_rate": 2.2691484809584937e-05, + "loss": 0.2519, + "step": 24879 + }, + { + "epoch": 31.938382541720156, + "grad_norm": 2.304715871810913, + "learning_rate": 2.2691056910569105e-05, + "loss": 0.2407, + "step": 24880 + }, + { + "epoch": 31.93966623876765, + "grad_norm": 0.8947903513908386, + "learning_rate": 2.2690629011553274e-05, + "loss": 0.2614, + "step": 24881 + }, + { + "epoch": 31.940949935815148, + "grad_norm": 1.5556317567825317, + "learning_rate": 
2.2690201112537442e-05, + "loss": 0.2644, + "step": 24882 + }, + { + "epoch": 31.942233632862646, + "grad_norm": 1.6825058460235596, + "learning_rate": 2.268977321352161e-05, + "loss": 0.2733, + "step": 24883 + }, + { + "epoch": 31.94351732991014, + "grad_norm": 2.1071977615356445, + "learning_rate": 2.2689345314505776e-05, + "loss": 0.2814, + "step": 24884 + }, + { + "epoch": 31.944801026957638, + "grad_norm": 1.4250046014785767, + "learning_rate": 2.2688917415489947e-05, + "loss": 0.2608, + "step": 24885 + }, + { + "epoch": 31.946084724005136, + "grad_norm": 1.756131649017334, + "learning_rate": 2.2688489516474112e-05, + "loss": 0.2856, + "step": 24886 + }, + { + "epoch": 31.94736842105263, + "grad_norm": 1.1583726406097412, + "learning_rate": 2.2688061617458278e-05, + "loss": 0.2624, + "step": 24887 + }, + { + "epoch": 31.948652118100128, + "grad_norm": 1.9094709157943726, + "learning_rate": 2.268763371844245e-05, + "loss": 0.299, + "step": 24888 + }, + { + "epoch": 31.949935815147626, + "grad_norm": 0.9638705849647522, + "learning_rate": 2.2687205819426614e-05, + "loss": 0.2732, + "step": 24889 + }, + { + "epoch": 31.951219512195124, + "grad_norm": 2.144026756286621, + "learning_rate": 2.2686777920410786e-05, + "loss": 0.2923, + "step": 24890 + }, + { + "epoch": 31.952503209242618, + "grad_norm": 1.2682665586471558, + "learning_rate": 2.268635002139495e-05, + "loss": 0.2852, + "step": 24891 + }, + { + "epoch": 31.953786906290116, + "grad_norm": 1.469787836074829, + "learning_rate": 2.268592212237912e-05, + "loss": 0.3007, + "step": 24892 + }, + { + "epoch": 31.955070603337614, + "grad_norm": 1.272187352180481, + "learning_rate": 2.2685494223363288e-05, + "loss": 0.2551, + "step": 24893 + }, + { + "epoch": 31.956354300385108, + "grad_norm": 2.111802339553833, + "learning_rate": 2.2685066324347453e-05, + "loss": 0.2799, + "step": 24894 + }, + { + "epoch": 31.957637997432606, + "grad_norm": 1.574783444404602, + "learning_rate": 2.268463842533162e-05, + "loss": 
0.3102, + "step": 24895 + }, + { + "epoch": 31.958921694480104, + "grad_norm": 1.3791242837905884, + "learning_rate": 2.268421052631579e-05, + "loss": 0.3393, + "step": 24896 + }, + { + "epoch": 31.960205391527598, + "grad_norm": 2.2218618392944336, + "learning_rate": 2.2683782627299958e-05, + "loss": 0.3481, + "step": 24897 + }, + { + "epoch": 31.961489088575096, + "grad_norm": 2.3598129749298096, + "learning_rate": 2.2683354728284127e-05, + "loss": 0.3075, + "step": 24898 + }, + { + "epoch": 31.962772785622594, + "grad_norm": 1.8631128072738647, + "learning_rate": 2.2682926829268295e-05, + "loss": 0.4907, + "step": 24899 + }, + { + "epoch": 31.964056482670088, + "grad_norm": 0.8525537252426147, + "learning_rate": 2.268249893025246e-05, + "loss": 0.2977, + "step": 24900 + }, + { + "epoch": 31.965340179717586, + "grad_norm": 0.9320783019065857, + "learning_rate": 2.268207103123663e-05, + "loss": 0.2897, + "step": 24901 + }, + { + "epoch": 31.966623876765084, + "grad_norm": 0.7593565583229065, + "learning_rate": 2.2681643132220797e-05, + "loss": 0.2765, + "step": 24902 + }, + { + "epoch": 31.96790757381258, + "grad_norm": 1.747301459312439, + "learning_rate": 2.2681215233204962e-05, + "loss": 0.2658, + "step": 24903 + }, + { + "epoch": 31.969191270860076, + "grad_norm": 3.8656723499298096, + "learning_rate": 2.2680787334189134e-05, + "loss": 0.2595, + "step": 24904 + }, + { + "epoch": 31.970474967907574, + "grad_norm": 3.653337240219116, + "learning_rate": 2.26803594351733e-05, + "loss": 0.2763, + "step": 24905 + }, + { + "epoch": 31.971758664955072, + "grad_norm": 0.8700554370880127, + "learning_rate": 2.2679931536157467e-05, + "loss": 0.2608, + "step": 24906 + }, + { + "epoch": 31.973042362002566, + "grad_norm": 0.8818567395210266, + "learning_rate": 2.2679503637141636e-05, + "loss": 0.2676, + "step": 24907 + }, + { + "epoch": 31.974326059050064, + "grad_norm": 0.8731329441070557, + "learning_rate": 2.26790757381258e-05, + "loss": 0.2481, + "step": 24908 + }, + { 
+ "epoch": 31.975609756097562, + "grad_norm": 1.5418338775634766, + "learning_rate": 2.2678647839109972e-05, + "loss": 0.2968, + "step": 24909 + }, + { + "epoch": 31.976893453145056, + "grad_norm": 1.5378667116165161, + "learning_rate": 2.2678219940094137e-05, + "loss": 0.2953, + "step": 24910 + }, + { + "epoch": 31.978177150192554, + "grad_norm": 1.1813647747039795, + "learning_rate": 2.2677792041078306e-05, + "loss": 0.2692, + "step": 24911 + }, + { + "epoch": 31.979460847240052, + "grad_norm": 0.7667885422706604, + "learning_rate": 2.2677364142062474e-05, + "loss": 0.268, + "step": 24912 + }, + { + "epoch": 31.98074454428755, + "grad_norm": 1.69395911693573, + "learning_rate": 2.267693624304664e-05, + "loss": 0.281, + "step": 24913 + }, + { + "epoch": 31.982028241335044, + "grad_norm": 0.8117942810058594, + "learning_rate": 2.267650834403081e-05, + "loss": 0.2653, + "step": 24914 + }, + { + "epoch": 31.983311938382542, + "grad_norm": 1.426858901977539, + "learning_rate": 2.2676080445014976e-05, + "loss": 0.2385, + "step": 24915 + }, + { + "epoch": 31.98459563543004, + "grad_norm": 1.1278209686279297, + "learning_rate": 2.2675652545999145e-05, + "loss": 0.2627, + "step": 24916 + }, + { + "epoch": 31.985879332477534, + "grad_norm": 0.8540711998939514, + "learning_rate": 2.2675224646983313e-05, + "loss": 0.2323, + "step": 24917 + }, + { + "epoch": 31.987163029525032, + "grad_norm": 0.9908294081687927, + "learning_rate": 2.267479674796748e-05, + "loss": 0.2791, + "step": 24918 + }, + { + "epoch": 31.98844672657253, + "grad_norm": 1.022910237312317, + "learning_rate": 2.2674368848951646e-05, + "loss": 0.2765, + "step": 24919 + }, + { + "epoch": 31.989730423620024, + "grad_norm": 1.0279189348220825, + "learning_rate": 2.2673940949935815e-05, + "loss": 0.2768, + "step": 24920 + }, + { + "epoch": 31.991014120667522, + "grad_norm": 2.2938084602355957, + "learning_rate": 2.2673513050919983e-05, + "loss": 0.2777, + "step": 24921 + }, + { + "epoch": 31.99229781771502, + 
"grad_norm": 4.600684642791748, + "learning_rate": 2.267308515190415e-05, + "loss": 0.278, + "step": 24922 + }, + { + "epoch": 31.993581514762518, + "grad_norm": 1.2624350786209106, + "learning_rate": 2.267265725288832e-05, + "loss": 0.2593, + "step": 24923 + }, + { + "epoch": 31.994865211810012, + "grad_norm": 1.0489487648010254, + "learning_rate": 2.2672229353872485e-05, + "loss": 0.2778, + "step": 24924 + }, + { + "epoch": 31.99614890885751, + "grad_norm": 1.4034883975982666, + "learning_rate": 2.2671801454856657e-05, + "loss": 0.2878, + "step": 24925 + }, + { + "epoch": 31.997432605905008, + "grad_norm": 2.0591185092926025, + "learning_rate": 2.2671373555840822e-05, + "loss": 0.2967, + "step": 24926 + }, + { + "epoch": 31.998716302952502, + "grad_norm": 1.7749468088150024, + "learning_rate": 2.2670945656824987e-05, + "loss": 0.3091, + "step": 24927 + }, + { + "epoch": 32.0, + "grad_norm": 2.2129271030426025, + "learning_rate": 2.267051775780916e-05, + "loss": 0.4326, + "step": 24928 + }, + { + "epoch": 32.0012836970475, + "grad_norm": 0.6298240423202515, + "learning_rate": 2.2670089858793324e-05, + "loss": 0.2532, + "step": 24929 + }, + { + "epoch": 32.002567394094996, + "grad_norm": 1.3414638042449951, + "learning_rate": 2.2669661959777496e-05, + "loss": 0.2368, + "step": 24930 + }, + { + "epoch": 32.003851091142494, + "grad_norm": 1.1658283472061157, + "learning_rate": 2.266923406076166e-05, + "loss": 0.2843, + "step": 24931 + }, + { + "epoch": 32.005134788189984, + "grad_norm": 1.090819239616394, + "learning_rate": 2.266880616174583e-05, + "loss": 0.2902, + "step": 24932 + }, + { + "epoch": 32.00641848523748, + "grad_norm": 2.2560577392578125, + "learning_rate": 2.2668378262729997e-05, + "loss": 0.2688, + "step": 24933 + }, + { + "epoch": 32.00770218228498, + "grad_norm": 0.9782401919364929, + "learning_rate": 2.2667950363714162e-05, + "loss": 0.2448, + "step": 24934 + }, + { + "epoch": 32.00898587933248, + "grad_norm": 0.903683066368103, + "learning_rate": 
2.266752246469833e-05, + "loss": 0.2556, + "step": 24935 + }, + { + "epoch": 32.010269576379976, + "grad_norm": 0.8718919157981873, + "learning_rate": 2.26670945656825e-05, + "loss": 0.2681, + "step": 24936 + }, + { + "epoch": 32.011553273427474, + "grad_norm": 0.9113541841506958, + "learning_rate": 2.2666666666666668e-05, + "loss": 0.2596, + "step": 24937 + }, + { + "epoch": 32.012836970474964, + "grad_norm": 0.9997109174728394, + "learning_rate": 2.2666238767650836e-05, + "loss": 0.2641, + "step": 24938 + }, + { + "epoch": 32.01412066752246, + "grad_norm": 0.8792356848716736, + "learning_rate": 2.2665810868635004e-05, + "loss": 0.2555, + "step": 24939 + }, + { + "epoch": 32.01540436456996, + "grad_norm": 1.0241312980651855, + "learning_rate": 2.266538296961917e-05, + "loss": 0.2636, + "step": 24940 + }, + { + "epoch": 32.01668806161746, + "grad_norm": 1.0140182971954346, + "learning_rate": 2.2664955070603338e-05, + "loss": 0.2576, + "step": 24941 + }, + { + "epoch": 32.017971758664956, + "grad_norm": 1.1770511865615845, + "learning_rate": 2.2664527171587506e-05, + "loss": 0.2815, + "step": 24942 + }, + { + "epoch": 32.019255455712454, + "grad_norm": 2.2232155799865723, + "learning_rate": 2.266409927257167e-05, + "loss": 0.2469, + "step": 24943 + }, + { + "epoch": 32.02053915275995, + "grad_norm": 1.2485902309417725, + "learning_rate": 2.2663671373555843e-05, + "loss": 0.2789, + "step": 24944 + }, + { + "epoch": 32.02182284980744, + "grad_norm": 1.645431637763977, + "learning_rate": 2.2663243474540008e-05, + "loss": 0.2594, + "step": 24945 + }, + { + "epoch": 32.02310654685494, + "grad_norm": 3.03269362449646, + "learning_rate": 2.266281557552418e-05, + "loss": 0.2559, + "step": 24946 + }, + { + "epoch": 32.02439024390244, + "grad_norm": 2.11716628074646, + "learning_rate": 2.2662387676508345e-05, + "loss": 0.2647, + "step": 24947 + }, + { + "epoch": 32.025673940949936, + "grad_norm": 0.9344730377197266, + "learning_rate": 2.266195977749251e-05, + "loss": 0.2572, 
+ "step": 24948 + }, + { + "epoch": 32.026957637997434, + "grad_norm": 1.2277547121047974, + "learning_rate": 2.2661531878476682e-05, + "loss": 0.2372, + "step": 24949 + }, + { + "epoch": 32.02824133504493, + "grad_norm": 2.1088404655456543, + "learning_rate": 2.2661103979460847e-05, + "loss": 0.2491, + "step": 24950 + }, + { + "epoch": 32.02952503209243, + "grad_norm": 0.9410003423690796, + "learning_rate": 2.2660676080445015e-05, + "loss": 0.2804, + "step": 24951 + }, + { + "epoch": 32.03080872913992, + "grad_norm": 0.7843480110168457, + "learning_rate": 2.2660248181429184e-05, + "loss": 0.2144, + "step": 24952 + }, + { + "epoch": 32.03209242618742, + "grad_norm": 0.7892906069755554, + "learning_rate": 2.2659820282413352e-05, + "loss": 0.2318, + "step": 24953 + }, + { + "epoch": 32.033376123234916, + "grad_norm": 2.9974958896636963, + "learning_rate": 2.265939238339752e-05, + "loss": 0.2284, + "step": 24954 + }, + { + "epoch": 32.034659820282414, + "grad_norm": 1.2109462022781372, + "learning_rate": 2.2658964484381685e-05, + "loss": 0.2769, + "step": 24955 + }, + { + "epoch": 32.03594351732991, + "grad_norm": 1.4993219375610352, + "learning_rate": 2.2658536585365854e-05, + "loss": 0.21, + "step": 24956 + }, + { + "epoch": 32.03722721437741, + "grad_norm": 0.8440960049629211, + "learning_rate": 2.2658108686350022e-05, + "loss": 0.2274, + "step": 24957 + }, + { + "epoch": 32.0385109114249, + "grad_norm": 2.307577610015869, + "learning_rate": 2.265768078733419e-05, + "loss": 0.2397, + "step": 24958 + }, + { + "epoch": 32.0397946084724, + "grad_norm": 0.9600238800048828, + "learning_rate": 2.2657252888318356e-05, + "loss": 0.2159, + "step": 24959 + }, + { + "epoch": 32.041078305519896, + "grad_norm": 1.0807201862335205, + "learning_rate": 2.2656824989302528e-05, + "loss": 0.2471, + "step": 24960 + }, + { + "epoch": 32.042362002567394, + "grad_norm": 0.9112295508384705, + "learning_rate": 2.2656397090286693e-05, + "loss": 0.2438, + "step": 24961 + }, + { + "epoch": 
32.04364569961489, + "grad_norm": 0.9265890717506409, + "learning_rate": 2.265596919127086e-05, + "loss": 0.2686, + "step": 24962 + }, + { + "epoch": 32.04492939666239, + "grad_norm": 1.271990418434143, + "learning_rate": 2.265554129225503e-05, + "loss": 0.2442, + "step": 24963 + }, + { + "epoch": 32.04621309370989, + "grad_norm": 0.9238169193267822, + "learning_rate": 2.2655113393239194e-05, + "loss": 0.2524, + "step": 24964 + }, + { + "epoch": 32.04749679075738, + "grad_norm": 1.0947322845458984, + "learning_rate": 2.2654685494223366e-05, + "loss": 0.2197, + "step": 24965 + }, + { + "epoch": 32.048780487804876, + "grad_norm": 1.5485247373580933, + "learning_rate": 2.265425759520753e-05, + "loss": 0.23, + "step": 24966 + }, + { + "epoch": 32.050064184852374, + "grad_norm": 1.2984882593154907, + "learning_rate": 2.26538296961917e-05, + "loss": 0.2822, + "step": 24967 + }, + { + "epoch": 32.05134788189987, + "grad_norm": 1.6802386045455933, + "learning_rate": 2.2653401797175868e-05, + "loss": 0.2669, + "step": 24968 + }, + { + "epoch": 32.05263157894737, + "grad_norm": 1.1495437622070312, + "learning_rate": 2.2652973898160033e-05, + "loss": 0.2396, + "step": 24969 + }, + { + "epoch": 32.05391527599487, + "grad_norm": 1.07505202293396, + "learning_rate": 2.2652545999144205e-05, + "loss": 0.2812, + "step": 24970 + }, + { + "epoch": 32.05519897304236, + "grad_norm": 1.5750179290771484, + "learning_rate": 2.265211810012837e-05, + "loss": 0.2823, + "step": 24971 + }, + { + "epoch": 32.056482670089856, + "grad_norm": 2.931206703186035, + "learning_rate": 2.265169020111254e-05, + "loss": 0.2655, + "step": 24972 + }, + { + "epoch": 32.057766367137354, + "grad_norm": 1.1458624601364136, + "learning_rate": 2.2651262302096707e-05, + "loss": 0.2602, + "step": 24973 + }, + { + "epoch": 32.05905006418485, + "grad_norm": 1.7508831024169922, + "learning_rate": 2.2650834403080872e-05, + "loss": 0.2953, + "step": 24974 + }, + { + "epoch": 32.06033376123235, + "grad_norm": 
1.4136881828308105, + "learning_rate": 2.265040650406504e-05, + "loss": 0.2787, + "step": 24975 + }, + { + "epoch": 32.06161745827985, + "grad_norm": 1.6370450258255005, + "learning_rate": 2.264997860504921e-05, + "loss": 0.2884, + "step": 24976 + }, + { + "epoch": 32.062901155327346, + "grad_norm": 2.212693452835083, + "learning_rate": 2.2649550706033377e-05, + "loss": 0.3921, + "step": 24977 + }, + { + "epoch": 32.06418485237484, + "grad_norm": 7.602260589599609, + "learning_rate": 2.2649122807017545e-05, + "loss": 0.3939, + "step": 24978 + }, + { + "epoch": 32.065468549422334, + "grad_norm": 2.0006628036499023, + "learning_rate": 2.2648694908001714e-05, + "loss": 0.2638, + "step": 24979 + }, + { + "epoch": 32.06675224646983, + "grad_norm": 0.8848739862442017, + "learning_rate": 2.264826700898588e-05, + "loss": 0.2554, + "step": 24980 + }, + { + "epoch": 32.06803594351733, + "grad_norm": 1.2460557222366333, + "learning_rate": 2.2647839109970047e-05, + "loss": 0.2526, + "step": 24981 + }, + { + "epoch": 32.06931964056483, + "grad_norm": 0.705451488494873, + "learning_rate": 2.2647411210954216e-05, + "loss": 0.2773, + "step": 24982 + }, + { + "epoch": 32.070603337612326, + "grad_norm": 0.8218221068382263, + "learning_rate": 2.264698331193838e-05, + "loss": 0.2869, + "step": 24983 + }, + { + "epoch": 32.071887034659824, + "grad_norm": 0.9208546876907349, + "learning_rate": 2.2646555412922552e-05, + "loss": 0.25, + "step": 24984 + }, + { + "epoch": 32.073170731707314, + "grad_norm": 1.3288673162460327, + "learning_rate": 2.2646127513906718e-05, + "loss": 0.2602, + "step": 24985 + }, + { + "epoch": 32.07445442875481, + "grad_norm": 0.9695228338241577, + "learning_rate": 2.264569961489089e-05, + "loss": 0.252, + "step": 24986 + }, + { + "epoch": 32.07573812580231, + "grad_norm": 0.8976972699165344, + "learning_rate": 2.2645271715875054e-05, + "loss": 0.2805, + "step": 24987 + }, + { + "epoch": 32.07702182284981, + "grad_norm": 1.0080853700637817, + "learning_rate": 
2.264484381685922e-05, + "loss": 0.2518, + "step": 24988 + }, + { + "epoch": 32.078305519897306, + "grad_norm": 1.756255030632019, + "learning_rate": 2.264441591784339e-05, + "loss": 0.258, + "step": 24989 + }, + { + "epoch": 32.079589216944804, + "grad_norm": 1.4016602039337158, + "learning_rate": 2.2643988018827556e-05, + "loss": 0.2416, + "step": 24990 + }, + { + "epoch": 32.080872913992295, + "grad_norm": 0.7447986602783203, + "learning_rate": 2.2643560119811725e-05, + "loss": 0.2433, + "step": 24991 + }, + { + "epoch": 32.08215661103979, + "grad_norm": 1.2263727188110352, + "learning_rate": 2.2643132220795893e-05, + "loss": 0.2384, + "step": 24992 + }, + { + "epoch": 32.08344030808729, + "grad_norm": 1.4095269441604614, + "learning_rate": 2.264270432178006e-05, + "loss": 0.2516, + "step": 24993 + }, + { + "epoch": 32.08472400513479, + "grad_norm": 1.0909405946731567, + "learning_rate": 2.264227642276423e-05, + "loss": 0.2619, + "step": 24994 + }, + { + "epoch": 32.086007702182286, + "grad_norm": 0.9835424423217773, + "learning_rate": 2.2641848523748395e-05, + "loss": 0.2661, + "step": 24995 + }, + { + "epoch": 32.087291399229784, + "grad_norm": 0.7842762470245361, + "learning_rate": 2.2641420624732563e-05, + "loss": 0.2302, + "step": 24996 + }, + { + "epoch": 32.08857509627728, + "grad_norm": 1.0115571022033691, + "learning_rate": 2.264099272571673e-05, + "loss": 0.2476, + "step": 24997 + }, + { + "epoch": 32.08985879332477, + "grad_norm": 1.0392539501190186, + "learning_rate": 2.26405648267009e-05, + "loss": 0.2606, + "step": 24998 + }, + { + "epoch": 32.09114249037227, + "grad_norm": 3.6952993869781494, + "learning_rate": 2.2640136927685065e-05, + "loss": 0.2293, + "step": 24999 + }, + { + "epoch": 32.09242618741977, + "grad_norm": 0.7526916265487671, + "learning_rate": 2.2639709028669237e-05, + "loss": 0.2412, + "step": 25000 + }, + { + "epoch": 32.09242618741977, + "eval_cer": 0.26221362835324064, + "eval_loss": 0.4680558145046234, + "eval_runtime": 
13.9488, + "eval_samples_per_second": 70.472, + "eval_steps_per_second": 0.502, + "eval_wer": 0.44095390965374914, + "step": 25000 + }, + { + "epoch": 32.093709884467266, + "grad_norm": 1.2851275205612183, + "learning_rate": 2.2639281129653402e-05, + "loss": 0.2912, + "step": 25001 + }, + { + "epoch": 32.094993581514764, + "grad_norm": 0.8655152320861816, + "learning_rate": 2.263885323063757e-05, + "loss": 0.2199, + "step": 25002 + }, + { + "epoch": 32.09627727856226, + "grad_norm": 0.8122624158859253, + "learning_rate": 2.263842533162174e-05, + "loss": 0.2532, + "step": 25003 + }, + { + "epoch": 32.09756097560975, + "grad_norm": 0.9288614392280579, + "learning_rate": 2.2637997432605904e-05, + "loss": 0.2549, + "step": 25004 + }, + { + "epoch": 32.09884467265725, + "grad_norm": 1.1791267395019531, + "learning_rate": 2.2637569533590076e-05, + "loss": 0.2296, + "step": 25005 + }, + { + "epoch": 32.10012836970475, + "grad_norm": 1.0981029272079468, + "learning_rate": 2.263714163457424e-05, + "loss": 0.2663, + "step": 25006 + }, + { + "epoch": 32.101412066752246, + "grad_norm": 0.9094778299331665, + "learning_rate": 2.263671373555841e-05, + "loss": 0.2379, + "step": 25007 + }, + { + "epoch": 32.102695763799744, + "grad_norm": 1.1331712007522583, + "learning_rate": 2.2636285836542577e-05, + "loss": 0.254, + "step": 25008 + }, + { + "epoch": 32.10397946084724, + "grad_norm": 1.1301748752593994, + "learning_rate": 2.2635857937526742e-05, + "loss": 0.232, + "step": 25009 + }, + { + "epoch": 32.10526315789474, + "grad_norm": 0.804068386554718, + "learning_rate": 2.2635430038510914e-05, + "loss": 0.2101, + "step": 25010 + }, + { + "epoch": 32.10654685494223, + "grad_norm": 1.0466266870498657, + "learning_rate": 2.263500213949508e-05, + "loss": 0.2457, + "step": 25011 + }, + { + "epoch": 32.10783055198973, + "grad_norm": 1.105418086051941, + "learning_rate": 2.2634574240479248e-05, + "loss": 0.2619, + "step": 25012 + }, + { + "epoch": 32.109114249037226, + "grad_norm": 
0.9764193892478943, + "learning_rate": 2.2634146341463416e-05, + "loss": 0.2193, + "step": 25013 + }, + { + "epoch": 32.110397946084724, + "grad_norm": 1.046762228012085, + "learning_rate": 2.2633718442447585e-05, + "loss": 0.2421, + "step": 25014 + }, + { + "epoch": 32.11168164313222, + "grad_norm": 1.0020734071731567, + "learning_rate": 2.263329054343175e-05, + "loss": 0.2432, + "step": 25015 + }, + { + "epoch": 32.11296534017972, + "grad_norm": 1.1053416728973389, + "learning_rate": 2.2632862644415918e-05, + "loss": 0.2595, + "step": 25016 + }, + { + "epoch": 32.11424903722722, + "grad_norm": 1.1182186603546143, + "learning_rate": 2.2632434745400086e-05, + "loss": 0.2501, + "step": 25017 + }, + { + "epoch": 32.11553273427471, + "grad_norm": 1.4299726486206055, + "learning_rate": 2.2632006846384255e-05, + "loss": 0.2396, + "step": 25018 + }, + { + "epoch": 32.116816431322206, + "grad_norm": 0.9285600781440735, + "learning_rate": 2.2631578947368423e-05, + "loss": 0.2225, + "step": 25019 + }, + { + "epoch": 32.118100128369704, + "grad_norm": 2.3433175086975098, + "learning_rate": 2.2631151048352588e-05, + "loss": 0.2704, + "step": 25020 + }, + { + "epoch": 32.1193838254172, + "grad_norm": 1.0983874797821045, + "learning_rate": 2.263072314933676e-05, + "loss": 0.2654, + "step": 25021 + }, + { + "epoch": 32.1206675224647, + "grad_norm": 1.117541790008545, + "learning_rate": 2.2630295250320925e-05, + "loss": 0.2757, + "step": 25022 + }, + { + "epoch": 32.1219512195122, + "grad_norm": 1.384263038635254, + "learning_rate": 2.262986735130509e-05, + "loss": 0.2802, + "step": 25023 + }, + { + "epoch": 32.12323491655969, + "grad_norm": 1.6624876260757446, + "learning_rate": 2.2629439452289262e-05, + "loss": 0.2541, + "step": 25024 + }, + { + "epoch": 32.12451861360719, + "grad_norm": 1.5705008506774902, + "learning_rate": 2.2629011553273427e-05, + "loss": 0.3309, + "step": 25025 + }, + { + "epoch": 32.125802310654684, + "grad_norm": 1.799283742904663, + "learning_rate": 
2.2628583654257595e-05, + "loss": 0.2779, + "step": 25026 + }, + { + "epoch": 32.12708600770218, + "grad_norm": 1.708066463470459, + "learning_rate": 2.2628155755241764e-05, + "loss": 0.3449, + "step": 25027 + }, + { + "epoch": 32.12836970474968, + "grad_norm": 1.764998435974121, + "learning_rate": 2.2627727856225932e-05, + "loss": 0.3959, + "step": 25028 + }, + { + "epoch": 32.12965340179718, + "grad_norm": 1.3869267702102661, + "learning_rate": 2.26272999572101e-05, + "loss": 0.2654, + "step": 25029 + }, + { + "epoch": 32.130937098844676, + "grad_norm": 0.6573295593261719, + "learning_rate": 2.2626872058194266e-05, + "loss": 0.2827, + "step": 25030 + }, + { + "epoch": 32.13222079589217, + "grad_norm": 1.6970850229263306, + "learning_rate": 2.2626444159178434e-05, + "loss": 0.2633, + "step": 25031 + }, + { + "epoch": 32.133504492939664, + "grad_norm": 0.8470454216003418, + "learning_rate": 2.2626016260162602e-05, + "loss": 0.277, + "step": 25032 + }, + { + "epoch": 32.13478818998716, + "grad_norm": 0.6987365484237671, + "learning_rate": 2.262558836114677e-05, + "loss": 0.248, + "step": 25033 + }, + { + "epoch": 32.13607188703466, + "grad_norm": 0.7568055391311646, + "learning_rate": 2.2625160462130936e-05, + "loss": 0.2621, + "step": 25034 + }, + { + "epoch": 32.13735558408216, + "grad_norm": 1.0997488498687744, + "learning_rate": 2.2624732563115104e-05, + "loss": 0.2527, + "step": 25035 + }, + { + "epoch": 32.138639281129656, + "grad_norm": 0.8159056901931763, + "learning_rate": 2.2624304664099273e-05, + "loss": 0.2456, + "step": 25036 + }, + { + "epoch": 32.13992297817715, + "grad_norm": 0.7723220586776733, + "learning_rate": 2.262387676508344e-05, + "loss": 0.2427, + "step": 25037 + }, + { + "epoch": 32.141206675224645, + "grad_norm": 1.296278476715088, + "learning_rate": 2.262344886606761e-05, + "loss": 0.2679, + "step": 25038 + }, + { + "epoch": 32.14249037227214, + "grad_norm": 1.342255711555481, + "learning_rate": 2.2623020967051774e-05, + "loss": 0.2655, + 
"step": 25039 + }, + { + "epoch": 32.14377406931964, + "grad_norm": 0.8226801753044128, + "learning_rate": 2.2622593068035946e-05, + "loss": 0.2618, + "step": 25040 + }, + { + "epoch": 32.14505776636714, + "grad_norm": 1.9239357709884644, + "learning_rate": 2.262216516902011e-05, + "loss": 0.2422, + "step": 25041 + }, + { + "epoch": 32.146341463414636, + "grad_norm": 0.9029273390769958, + "learning_rate": 2.2621737270004276e-05, + "loss": 0.2595, + "step": 25042 + }, + { + "epoch": 32.147625160462134, + "grad_norm": 1.5034425258636475, + "learning_rate": 2.2621309370988448e-05, + "loss": 0.2808, + "step": 25043 + }, + { + "epoch": 32.148908857509625, + "grad_norm": 1.002935528755188, + "learning_rate": 2.2620881471972613e-05, + "loss": 0.2275, + "step": 25044 + }, + { + "epoch": 32.15019255455712, + "grad_norm": 2.61906099319458, + "learning_rate": 2.2620453572956785e-05, + "loss": 0.2859, + "step": 25045 + }, + { + "epoch": 32.15147625160462, + "grad_norm": 0.8379622101783752, + "learning_rate": 2.262002567394095e-05, + "loss": 0.2278, + "step": 25046 + }, + { + "epoch": 32.15275994865212, + "grad_norm": 1.1773711442947388, + "learning_rate": 2.261959777492512e-05, + "loss": 0.2436, + "step": 25047 + }, + { + "epoch": 32.154043645699616, + "grad_norm": 0.898571252822876, + "learning_rate": 2.2619169875909287e-05, + "loss": 0.2541, + "step": 25048 + }, + { + "epoch": 32.155327342747114, + "grad_norm": 1.1838231086730957, + "learning_rate": 2.2618741976893452e-05, + "loss": 0.2427, + "step": 25049 + }, + { + "epoch": 32.15661103979461, + "grad_norm": 1.2136023044586182, + "learning_rate": 2.261831407787762e-05, + "loss": 0.2472, + "step": 25050 + }, + { + "epoch": 32.1578947368421, + "grad_norm": 2.441465377807617, + "learning_rate": 2.261788617886179e-05, + "loss": 0.2508, + "step": 25051 + }, + { + "epoch": 32.1591784338896, + "grad_norm": 1.1560988426208496, + "learning_rate": 2.2617458279845957e-05, + "loss": 0.2571, + "step": 25052 + }, + { + "epoch": 
32.1604621309371, + "grad_norm": 0.806360125541687, + "learning_rate": 2.2617030380830125e-05, + "loss": 0.2377, + "step": 25053 + }, + { + "epoch": 32.161745827984596, + "grad_norm": 2.2341623306274414, + "learning_rate": 2.2616602481814294e-05, + "loss": 0.2373, + "step": 25054 + }, + { + "epoch": 32.163029525032094, + "grad_norm": 0.8660416007041931, + "learning_rate": 2.261617458279846e-05, + "loss": 0.2531, + "step": 25055 + }, + { + "epoch": 32.16431322207959, + "grad_norm": 1.0125528573989868, + "learning_rate": 2.2615746683782627e-05, + "loss": 0.237, + "step": 25056 + }, + { + "epoch": 32.16559691912708, + "grad_norm": 0.8514198064804077, + "learning_rate": 2.2615318784766796e-05, + "loss": 0.2395, + "step": 25057 + }, + { + "epoch": 32.16688061617458, + "grad_norm": 0.9269129633903503, + "learning_rate": 2.261489088575096e-05, + "loss": 0.2684, + "step": 25058 + }, + { + "epoch": 32.16816431322208, + "grad_norm": 1.131461262702942, + "learning_rate": 2.2614462986735133e-05, + "loss": 0.2431, + "step": 25059 + }, + { + "epoch": 32.169448010269576, + "grad_norm": 2.0626988410949707, + "learning_rate": 2.2614035087719298e-05, + "loss": 0.2474, + "step": 25060 + }, + { + "epoch": 32.170731707317074, + "grad_norm": 1.814160943031311, + "learning_rate": 2.261360718870347e-05, + "loss": 0.2406, + "step": 25061 + }, + { + "epoch": 32.17201540436457, + "grad_norm": 0.7605133056640625, + "learning_rate": 2.2613179289687634e-05, + "loss": 0.2648, + "step": 25062 + }, + { + "epoch": 32.17329910141207, + "grad_norm": 1.0488680601119995, + "learning_rate": 2.26127513906718e-05, + "loss": 0.2438, + "step": 25063 + }, + { + "epoch": 32.17458279845956, + "grad_norm": 1.3759019374847412, + "learning_rate": 2.261232349165597e-05, + "loss": 0.2233, + "step": 25064 + }, + { + "epoch": 32.17586649550706, + "grad_norm": 1.1151542663574219, + "learning_rate": 2.2611895592640136e-05, + "loss": 0.2561, + "step": 25065 + }, + { + "epoch": 32.177150192554556, + "grad_norm": 
1.9462603330612183, + "learning_rate": 2.2611467693624305e-05, + "loss": 0.2612, + "step": 25066 + }, + { + "epoch": 32.178433889602054, + "grad_norm": 1.58324134349823, + "learning_rate": 2.2611039794608473e-05, + "loss": 0.2398, + "step": 25067 + }, + { + "epoch": 32.17971758664955, + "grad_norm": 1.9972444772720337, + "learning_rate": 2.261061189559264e-05, + "loss": 0.2955, + "step": 25068 + }, + { + "epoch": 32.18100128369705, + "grad_norm": 1.0717977285385132, + "learning_rate": 2.261018399657681e-05, + "loss": 0.2389, + "step": 25069 + }, + { + "epoch": 32.18228498074454, + "grad_norm": 2.1767706871032715, + "learning_rate": 2.2609756097560975e-05, + "loss": 0.2741, + "step": 25070 + }, + { + "epoch": 32.18356867779204, + "grad_norm": 0.9493649005889893, + "learning_rate": 2.2609328198545143e-05, + "loss": 0.254, + "step": 25071 + }, + { + "epoch": 32.18485237483954, + "grad_norm": 1.3877689838409424, + "learning_rate": 2.2608900299529312e-05, + "loss": 0.2619, + "step": 25072 + }, + { + "epoch": 32.186136071887034, + "grad_norm": 1.3931392431259155, + "learning_rate": 2.260847240051348e-05, + "loss": 0.2912, + "step": 25073 + }, + { + "epoch": 32.18741976893453, + "grad_norm": 1.1470637321472168, + "learning_rate": 2.2608044501497645e-05, + "loss": 0.268, + "step": 25074 + }, + { + "epoch": 32.18870346598203, + "grad_norm": 1.3553404808044434, + "learning_rate": 2.2607616602481817e-05, + "loss": 0.2865, + "step": 25075 + }, + { + "epoch": 32.18998716302953, + "grad_norm": 5.108689785003662, + "learning_rate": 2.2607188703465982e-05, + "loss": 0.3302, + "step": 25076 + }, + { + "epoch": 32.19127086007702, + "grad_norm": 6.155081748962402, + "learning_rate": 2.260676080445015e-05, + "loss": 0.3478, + "step": 25077 + }, + { + "epoch": 32.19255455712452, + "grad_norm": 1.7800540924072266, + "learning_rate": 2.260633290543432e-05, + "loss": 0.416, + "step": 25078 + }, + { + "epoch": 32.193838254172015, + "grad_norm": 0.8526046276092529, + "learning_rate": 
2.2605905006418484e-05, + "loss": 0.2781, + "step": 25079 + }, + { + "epoch": 32.19512195121951, + "grad_norm": 1.0036617517471313, + "learning_rate": 2.2605477107402656e-05, + "loss": 0.2795, + "step": 25080 + }, + { + "epoch": 32.19640564826701, + "grad_norm": 0.7286664247512817, + "learning_rate": 2.260504920838682e-05, + "loss": 0.276, + "step": 25081 + }, + { + "epoch": 32.19768934531451, + "grad_norm": 0.9375039339065552, + "learning_rate": 2.260462130937099e-05, + "loss": 0.2516, + "step": 25082 + }, + { + "epoch": 32.198973042362006, + "grad_norm": 1.0003156661987305, + "learning_rate": 2.2604193410355157e-05, + "loss": 0.2779, + "step": 25083 + }, + { + "epoch": 32.2002567394095, + "grad_norm": 0.7311239838600159, + "learning_rate": 2.2603765511339323e-05, + "loss": 0.2517, + "step": 25084 + }, + { + "epoch": 32.201540436456995, + "grad_norm": 0.9021604061126709, + "learning_rate": 2.2603337612323494e-05, + "loss": 0.2513, + "step": 25085 + }, + { + "epoch": 32.20282413350449, + "grad_norm": 1.1660178899765015, + "learning_rate": 2.260290971330766e-05, + "loss": 0.2579, + "step": 25086 + }, + { + "epoch": 32.20410783055199, + "grad_norm": 1.0583381652832031, + "learning_rate": 2.2602481814291828e-05, + "loss": 0.2558, + "step": 25087 + }, + { + "epoch": 32.20539152759949, + "grad_norm": 0.9210878610610962, + "learning_rate": 2.2602053915275996e-05, + "loss": 0.2648, + "step": 25088 + }, + { + "epoch": 32.206675224646986, + "grad_norm": 1.1330751180648804, + "learning_rate": 2.2601626016260165e-05, + "loss": 0.3046, + "step": 25089 + }, + { + "epoch": 32.20795892169448, + "grad_norm": 0.7914194464683533, + "learning_rate": 2.260119811724433e-05, + "loss": 0.2644, + "step": 25090 + }, + { + "epoch": 32.209242618741975, + "grad_norm": 1.0745368003845215, + "learning_rate": 2.2600770218228498e-05, + "loss": 0.2471, + "step": 25091 + }, + { + "epoch": 32.21052631578947, + "grad_norm": 1.2925316095352173, + "learning_rate": 2.2600342319212666e-05, + "loss": 
0.2506, + "step": 25092 + }, + { + "epoch": 32.21181001283697, + "grad_norm": 1.6989837884902954, + "learning_rate": 2.2599914420196835e-05, + "loss": 0.2844, + "step": 25093 + }, + { + "epoch": 32.21309370988447, + "grad_norm": 0.8537924885749817, + "learning_rate": 2.2599486521181003e-05, + "loss": 0.2402, + "step": 25094 + }, + { + "epoch": 32.214377406931966, + "grad_norm": 1.3187446594238281, + "learning_rate": 2.2599058622165168e-05, + "loss": 0.2438, + "step": 25095 + }, + { + "epoch": 32.215661103979464, + "grad_norm": 1.096817970275879, + "learning_rate": 2.2598630723149337e-05, + "loss": 0.2356, + "step": 25096 + }, + { + "epoch": 32.216944801026955, + "grad_norm": 1.8605204820632935, + "learning_rate": 2.2598202824133505e-05, + "loss": 0.2469, + "step": 25097 + }, + { + "epoch": 32.21822849807445, + "grad_norm": 0.9228200316429138, + "learning_rate": 2.259777492511767e-05, + "loss": 0.2472, + "step": 25098 + }, + { + "epoch": 32.21951219512195, + "grad_norm": 0.8411217927932739, + "learning_rate": 2.2597347026101842e-05, + "loss": 0.2452, + "step": 25099 + }, + { + "epoch": 32.22079589216945, + "grad_norm": 1.0897043943405151, + "learning_rate": 2.2596919127086007e-05, + "loss": 0.2651, + "step": 25100 + }, + { + "epoch": 32.222079589216946, + "grad_norm": 1.0442575216293335, + "learning_rate": 2.259649122807018e-05, + "loss": 0.2256, + "step": 25101 + }, + { + "epoch": 32.223363286264444, + "grad_norm": 3.4318079948425293, + "learning_rate": 2.2596063329054344e-05, + "loss": 0.2789, + "step": 25102 + }, + { + "epoch": 32.224646983311935, + "grad_norm": 1.16727876663208, + "learning_rate": 2.259563543003851e-05, + "loss": 0.2612, + "step": 25103 + }, + { + "epoch": 32.22593068035943, + "grad_norm": 0.8184317946434021, + "learning_rate": 2.259520753102268e-05, + "loss": 0.2522, + "step": 25104 + }, + { + "epoch": 32.22721437740693, + "grad_norm": 1.0911171436309814, + "learning_rate": 2.2594779632006846e-05, + "loss": 0.2315, + "step": 25105 + }, + { + 
"epoch": 32.22849807445443, + "grad_norm": 1.344092845916748, + "learning_rate": 2.2594351732991014e-05, + "loss": 0.2151, + "step": 25106 + }, + { + "epoch": 32.229781771501926, + "grad_norm": 1.6287747621536255, + "learning_rate": 2.2593923833975182e-05, + "loss": 0.2583, + "step": 25107 + }, + { + "epoch": 32.231065468549424, + "grad_norm": 1.2866123914718628, + "learning_rate": 2.259349593495935e-05, + "loss": 0.2768, + "step": 25108 + }, + { + "epoch": 32.23234916559692, + "grad_norm": 1.1600441932678223, + "learning_rate": 2.259306803594352e-05, + "loss": 0.2697, + "step": 25109 + }, + { + "epoch": 32.23363286264441, + "grad_norm": 1.2295873165130615, + "learning_rate": 2.2592640136927684e-05, + "loss": 0.2484, + "step": 25110 + }, + { + "epoch": 32.23491655969191, + "grad_norm": 2.413573980331421, + "learning_rate": 2.2592212237911853e-05, + "loss": 0.2392, + "step": 25111 + }, + { + "epoch": 32.23620025673941, + "grad_norm": 0.9524713754653931, + "learning_rate": 2.259178433889602e-05, + "loss": 0.254, + "step": 25112 + }, + { + "epoch": 32.23748395378691, + "grad_norm": 0.9069287776947021, + "learning_rate": 2.259135643988019e-05, + "loss": 0.2698, + "step": 25113 + }, + { + "epoch": 32.238767650834404, + "grad_norm": 1.2389286756515503, + "learning_rate": 2.2590928540864355e-05, + "loss": 0.2573, + "step": 25114 + }, + { + "epoch": 32.2400513478819, + "grad_norm": 1.1563125848770142, + "learning_rate": 2.2590500641848526e-05, + "loss": 0.2908, + "step": 25115 + }, + { + "epoch": 32.2413350449294, + "grad_norm": 0.9903520345687866, + "learning_rate": 2.259007274283269e-05, + "loss": 0.2436, + "step": 25116 + }, + { + "epoch": 32.24261874197689, + "grad_norm": 1.314612865447998, + "learning_rate": 2.258964484381686e-05, + "loss": 0.2502, + "step": 25117 + }, + { + "epoch": 32.24390243902439, + "grad_norm": 0.7937415838241577, + "learning_rate": 2.2589216944801028e-05, + "loss": 0.2481, + "step": 25118 + }, + { + "epoch": 32.24518613607189, + "grad_norm": 
1.0710864067077637, + "learning_rate": 2.2588789045785193e-05, + "loss": 0.2762, + "step": 25119 + }, + { + "epoch": 32.246469833119384, + "grad_norm": 1.4422513246536255, + "learning_rate": 2.2588361146769365e-05, + "loss": 0.271, + "step": 25120 + }, + { + "epoch": 32.24775353016688, + "grad_norm": 0.9798106551170349, + "learning_rate": 2.258793324775353e-05, + "loss": 0.2375, + "step": 25121 + }, + { + "epoch": 32.24903722721438, + "grad_norm": 1.2207022905349731, + "learning_rate": 2.25875053487377e-05, + "loss": 0.2847, + "step": 25122 + }, + { + "epoch": 32.25032092426187, + "grad_norm": 1.4102929830551147, + "learning_rate": 2.2587077449721867e-05, + "loss": 0.2753, + "step": 25123 + }, + { + "epoch": 32.25160462130937, + "grad_norm": 1.2099535465240479, + "learning_rate": 2.2586649550706032e-05, + "loss": 0.2732, + "step": 25124 + }, + { + "epoch": 32.25288831835687, + "grad_norm": 1.535720944404602, + "learning_rate": 2.2586221651690204e-05, + "loss": 0.2879, + "step": 25125 + }, + { + "epoch": 32.254172015404365, + "grad_norm": 1.4337477684020996, + "learning_rate": 2.258579375267437e-05, + "loss": 0.3446, + "step": 25126 + }, + { + "epoch": 32.25545571245186, + "grad_norm": 1.7251055240631104, + "learning_rate": 2.2585365853658537e-05, + "loss": 0.3685, + "step": 25127 + }, + { + "epoch": 32.25673940949936, + "grad_norm": 3.1663646697998047, + "learning_rate": 2.2584937954642706e-05, + "loss": 0.4277, + "step": 25128 + }, + { + "epoch": 32.25802310654686, + "grad_norm": 0.8855719566345215, + "learning_rate": 2.2584510055626874e-05, + "loss": 0.2817, + "step": 25129 + }, + { + "epoch": 32.25930680359435, + "grad_norm": 0.6415284872055054, + "learning_rate": 2.258408215661104e-05, + "loss": 0.2686, + "step": 25130 + }, + { + "epoch": 32.26059050064185, + "grad_norm": 1.0648812055587769, + "learning_rate": 2.2583654257595207e-05, + "loss": 0.28, + "step": 25131 + }, + { + "epoch": 32.261874197689345, + "grad_norm": 2.2227869033813477, + "learning_rate": 
2.2583226358579376e-05, + "loss": 0.2501, + "step": 25132 + }, + { + "epoch": 32.26315789473684, + "grad_norm": 1.2018183469772339, + "learning_rate": 2.2582798459563544e-05, + "loss": 0.2451, + "step": 25133 + }, + { + "epoch": 32.26444159178434, + "grad_norm": 0.7833295464515686, + "learning_rate": 2.2582370560547713e-05, + "loss": 0.2667, + "step": 25134 + }, + { + "epoch": 32.26572528883184, + "grad_norm": 1.1968883275985718, + "learning_rate": 2.2581942661531878e-05, + "loss": 0.2735, + "step": 25135 + }, + { + "epoch": 32.26700898587933, + "grad_norm": 1.75508451461792, + "learning_rate": 2.258151476251605e-05, + "loss": 0.2625, + "step": 25136 + }, + { + "epoch": 32.26829268292683, + "grad_norm": 0.9289259910583496, + "learning_rate": 2.2581086863500214e-05, + "loss": 0.2471, + "step": 25137 + }, + { + "epoch": 32.269576379974325, + "grad_norm": 1.247690200805664, + "learning_rate": 2.258065896448438e-05, + "loss": 0.2712, + "step": 25138 + }, + { + "epoch": 32.27086007702182, + "grad_norm": 1.1371188163757324, + "learning_rate": 2.258023106546855e-05, + "loss": 0.2694, + "step": 25139 + }, + { + "epoch": 32.27214377406932, + "grad_norm": 1.210923671722412, + "learning_rate": 2.2579803166452716e-05, + "loss": 0.2792, + "step": 25140 + }, + { + "epoch": 32.27342747111682, + "grad_norm": 0.7720856070518494, + "learning_rate": 2.2579375267436888e-05, + "loss": 0.2708, + "step": 25141 + }, + { + "epoch": 32.274711168164316, + "grad_norm": 0.9093643426895142, + "learning_rate": 2.2578947368421053e-05, + "loss": 0.2808, + "step": 25142 + }, + { + "epoch": 32.27599486521181, + "grad_norm": 0.856226921081543, + "learning_rate": 2.257851946940522e-05, + "loss": 0.2576, + "step": 25143 + }, + { + "epoch": 32.277278562259305, + "grad_norm": 1.7162213325500488, + "learning_rate": 2.257809157038939e-05, + "loss": 0.2709, + "step": 25144 + }, + { + "epoch": 32.2785622593068, + "grad_norm": 1.022560954093933, + "learning_rate": 2.2577663671373555e-05, + "loss": 0.2385, + 
"step": 25145 + }, + { + "epoch": 32.2798459563543, + "grad_norm": 1.3447507619857788, + "learning_rate": 2.2577235772357723e-05, + "loss": 0.2374, + "step": 25146 + }, + { + "epoch": 32.2811296534018, + "grad_norm": 2.995368003845215, + "learning_rate": 2.2576807873341892e-05, + "loss": 0.2341, + "step": 25147 + }, + { + "epoch": 32.282413350449296, + "grad_norm": 1.0285654067993164, + "learning_rate": 2.257637997432606e-05, + "loss": 0.2352, + "step": 25148 + }, + { + "epoch": 32.283697047496794, + "grad_norm": 1.6614331007003784, + "learning_rate": 2.257595207531023e-05, + "loss": 0.2312, + "step": 25149 + }, + { + "epoch": 32.284980744544285, + "grad_norm": 1.1304124593734741, + "learning_rate": 2.2575524176294397e-05, + "loss": 0.2153, + "step": 25150 + }, + { + "epoch": 32.28626444159178, + "grad_norm": 1.8500134944915771, + "learning_rate": 2.2575096277278562e-05, + "loss": 0.2785, + "step": 25151 + }, + { + "epoch": 32.28754813863928, + "grad_norm": 1.7732380628585815, + "learning_rate": 2.257466837826273e-05, + "loss": 0.2925, + "step": 25152 + }, + { + "epoch": 32.28883183568678, + "grad_norm": 1.1444388628005981, + "learning_rate": 2.25742404792469e-05, + "loss": 0.2638, + "step": 25153 + }, + { + "epoch": 32.290115532734276, + "grad_norm": 1.0652992725372314, + "learning_rate": 2.2573812580231064e-05, + "loss": 0.2625, + "step": 25154 + }, + { + "epoch": 32.291399229781774, + "grad_norm": 0.8854779601097107, + "learning_rate": 2.2573384681215236e-05, + "loss": 0.2413, + "step": 25155 + }, + { + "epoch": 32.292682926829265, + "grad_norm": 1.0179897546768188, + "learning_rate": 2.25729567821994e-05, + "loss": 0.2331, + "step": 25156 + }, + { + "epoch": 32.29396662387676, + "grad_norm": 1.198431372642517, + "learning_rate": 2.257252888318357e-05, + "loss": 0.2461, + "step": 25157 + }, + { + "epoch": 32.29525032092426, + "grad_norm": 2.0634052753448486, + "learning_rate": 2.2572100984167738e-05, + "loss": 0.2323, + "step": 25158 + }, + { + "epoch": 
32.29653401797176, + "grad_norm": 1.130423665046692, + "learning_rate": 2.2571673085151903e-05, + "loss": 0.2624, + "step": 25159 + }, + { + "epoch": 32.29781771501926, + "grad_norm": 1.376800775527954, + "learning_rate": 2.2571245186136074e-05, + "loss": 0.2432, + "step": 25160 + }, + { + "epoch": 32.299101412066754, + "grad_norm": 2.279665946960449, + "learning_rate": 2.257081728712024e-05, + "loss": 0.2659, + "step": 25161 + }, + { + "epoch": 32.30038510911425, + "grad_norm": 1.1500028371810913, + "learning_rate": 2.2570389388104408e-05, + "loss": 0.2458, + "step": 25162 + }, + { + "epoch": 32.30166880616174, + "grad_norm": 2.58484148979187, + "learning_rate": 2.2569961489088576e-05, + "loss": 0.24, + "step": 25163 + }, + { + "epoch": 32.30295250320924, + "grad_norm": 0.9110639095306396, + "learning_rate": 2.256953359007274e-05, + "loss": 0.2412, + "step": 25164 + }, + { + "epoch": 32.30423620025674, + "grad_norm": 0.9560433030128479, + "learning_rate": 2.2569105691056913e-05, + "loss": 0.2621, + "step": 25165 + }, + { + "epoch": 32.30551989730424, + "grad_norm": 1.034469723701477, + "learning_rate": 2.2568677792041078e-05, + "loss": 0.2714, + "step": 25166 + }, + { + "epoch": 32.306803594351734, + "grad_norm": 1.1445749998092651, + "learning_rate": 2.2568249893025246e-05, + "loss": 0.2589, + "step": 25167 + }, + { + "epoch": 32.30808729139923, + "grad_norm": 1.9152469635009766, + "learning_rate": 2.2567821994009415e-05, + "loss": 0.26, + "step": 25168 + }, + { + "epoch": 32.30937098844672, + "grad_norm": 1.8458609580993652, + "learning_rate": 2.2567394094993583e-05, + "loss": 0.2539, + "step": 25169 + }, + { + "epoch": 32.31065468549422, + "grad_norm": 1.2851721048355103, + "learning_rate": 2.256696619597775e-05, + "loss": 0.3037, + "step": 25170 + }, + { + "epoch": 32.31193838254172, + "grad_norm": 1.493652105331421, + "learning_rate": 2.2566538296961917e-05, + "loss": 0.2475, + "step": 25171 + }, + { + "epoch": 32.31322207958922, + "grad_norm": 
2.009321689605713, + "learning_rate": 2.2566110397946085e-05, + "loss": 0.2704, + "step": 25172 + }, + { + "epoch": 32.314505776636715, + "grad_norm": 1.623252034187317, + "learning_rate": 2.2565682498930254e-05, + "loss": 0.2871, + "step": 25173 + }, + { + "epoch": 32.31578947368421, + "grad_norm": 1.3761701583862305, + "learning_rate": 2.2565254599914422e-05, + "loss": 0.2883, + "step": 25174 + }, + { + "epoch": 32.31707317073171, + "grad_norm": 1.3446711301803589, + "learning_rate": 2.2564826700898587e-05, + "loss": 0.3159, + "step": 25175 + }, + { + "epoch": 32.3183568677792, + "grad_norm": 1.5683804750442505, + "learning_rate": 2.256439880188276e-05, + "loss": 0.2887, + "step": 25176 + }, + { + "epoch": 32.3196405648267, + "grad_norm": 3.63356351852417, + "learning_rate": 2.2563970902866924e-05, + "loss": 0.2995, + "step": 25177 + }, + { + "epoch": 32.3209242618742, + "grad_norm": 2.4181325435638428, + "learning_rate": 2.256354300385109e-05, + "loss": 0.4277, + "step": 25178 + }, + { + "epoch": 32.322207958921695, + "grad_norm": 0.5669265389442444, + "learning_rate": 2.256311510483526e-05, + "loss": 0.2554, + "step": 25179 + }, + { + "epoch": 32.32349165596919, + "grad_norm": 0.725139856338501, + "learning_rate": 2.2562687205819426e-05, + "loss": 0.2655, + "step": 25180 + }, + { + "epoch": 32.32477535301669, + "grad_norm": 1.5060265064239502, + "learning_rate": 2.2562259306803597e-05, + "loss": 0.2727, + "step": 25181 + }, + { + "epoch": 32.32605905006419, + "grad_norm": 4.01234769821167, + "learning_rate": 2.2561831407787762e-05, + "loss": 0.2731, + "step": 25182 + }, + { + "epoch": 32.32734274711168, + "grad_norm": 0.7557207942008972, + "learning_rate": 2.256140350877193e-05, + "loss": 0.2818, + "step": 25183 + }, + { + "epoch": 32.32862644415918, + "grad_norm": 1.3450847864151, + "learning_rate": 2.25609756097561e-05, + "loss": 0.262, + "step": 25184 + }, + { + "epoch": 32.329910141206675, + "grad_norm": 0.6977185606956482, + "learning_rate": 
2.2560547710740264e-05, + "loss": 0.2673, + "step": 25185 + }, + { + "epoch": 32.33119383825417, + "grad_norm": 0.6448099613189697, + "learning_rate": 2.2560119811724433e-05, + "loss": 0.2527, + "step": 25186 + }, + { + "epoch": 32.33247753530167, + "grad_norm": 3.037327766418457, + "learning_rate": 2.25596919127086e-05, + "loss": 0.2461, + "step": 25187 + }, + { + "epoch": 32.33376123234917, + "grad_norm": 2.1610050201416016, + "learning_rate": 2.255926401369277e-05, + "loss": 0.2818, + "step": 25188 + }, + { + "epoch": 32.33504492939666, + "grad_norm": 0.6507534980773926, + "learning_rate": 2.2558836114676938e-05, + "loss": 0.2351, + "step": 25189 + }, + { + "epoch": 32.33632862644416, + "grad_norm": 1.0001819133758545, + "learning_rate": 2.2558408215661106e-05, + "loss": 0.2695, + "step": 25190 + }, + { + "epoch": 32.337612323491655, + "grad_norm": 1.0036842823028564, + "learning_rate": 2.255798031664527e-05, + "loss": 0.2794, + "step": 25191 + }, + { + "epoch": 32.33889602053915, + "grad_norm": 1.296922206878662, + "learning_rate": 2.255755241762944e-05, + "loss": 0.2756, + "step": 25192 + }, + { + "epoch": 32.34017971758665, + "grad_norm": 1.3218179941177368, + "learning_rate": 2.2557124518613608e-05, + "loss": 0.2342, + "step": 25193 + }, + { + "epoch": 32.34146341463415, + "grad_norm": 1.17832350730896, + "learning_rate": 2.2556696619597773e-05, + "loss": 0.2747, + "step": 25194 + }, + { + "epoch": 32.342747111681646, + "grad_norm": 0.8745643496513367, + "learning_rate": 2.2556268720581945e-05, + "loss": 0.2308, + "step": 25195 + }, + { + "epoch": 32.34403080872914, + "grad_norm": 0.9187562465667725, + "learning_rate": 2.255584082156611e-05, + "loss": 0.2319, + "step": 25196 + }, + { + "epoch": 32.345314505776635, + "grad_norm": 1.1258437633514404, + "learning_rate": 2.2555412922550282e-05, + "loss": 0.2264, + "step": 25197 + }, + { + "epoch": 32.34659820282413, + "grad_norm": 1.0437613725662231, + "learning_rate": 2.2554985023534447e-05, + "loss": 0.2441, + 
"step": 25198 + }, + { + "epoch": 32.34788189987163, + "grad_norm": 1.188124179840088, + "learning_rate": 2.2554557124518612e-05, + "loss": 0.2375, + "step": 25199 + }, + { + "epoch": 32.34916559691913, + "grad_norm": 1.4415080547332764, + "learning_rate": 2.2554129225502784e-05, + "loss": 0.2404, + "step": 25200 + }, + { + "epoch": 32.350449293966626, + "grad_norm": 0.8126779794692993, + "learning_rate": 2.255370132648695e-05, + "loss": 0.2265, + "step": 25201 + }, + { + "epoch": 32.35173299101412, + "grad_norm": 1.203190565109253, + "learning_rate": 2.2553273427471117e-05, + "loss": 0.2723, + "step": 25202 + }, + { + "epoch": 32.353016688061615, + "grad_norm": 1.3099912405014038, + "learning_rate": 2.2552845528455286e-05, + "loss": 0.2551, + "step": 25203 + }, + { + "epoch": 32.35430038510911, + "grad_norm": 1.3941887617111206, + "learning_rate": 2.2552417629439454e-05, + "loss": 0.2281, + "step": 25204 + }, + { + "epoch": 32.35558408215661, + "grad_norm": 1.4964196681976318, + "learning_rate": 2.2551989730423622e-05, + "loss": 0.2356, + "step": 25205 + }, + { + "epoch": 32.35686777920411, + "grad_norm": 1.1334681510925293, + "learning_rate": 2.2551561831407787e-05, + "loss": 0.2427, + "step": 25206 + }, + { + "epoch": 32.35815147625161, + "grad_norm": 1.2181373834609985, + "learning_rate": 2.2551133932391956e-05, + "loss": 0.2552, + "step": 25207 + }, + { + "epoch": 32.359435173299104, + "grad_norm": 2.9738216400146484, + "learning_rate": 2.2550706033376124e-05, + "loss": 0.2569, + "step": 25208 + }, + { + "epoch": 32.360718870346595, + "grad_norm": 1.7139216661453247, + "learning_rate": 2.2550278134360293e-05, + "loss": 0.2618, + "step": 25209 + }, + { + "epoch": 32.36200256739409, + "grad_norm": 0.9403950572013855, + "learning_rate": 2.2549850235344458e-05, + "loss": 0.2611, + "step": 25210 + }, + { + "epoch": 32.36328626444159, + "grad_norm": 1.1092228889465332, + "learning_rate": 2.254942233632863e-05, + "loss": 0.2322, + "step": 25211 + }, + { + "epoch": 
32.36456996148909, + "grad_norm": 1.1196858882904053, + "learning_rate": 2.2548994437312795e-05, + "loss": 0.2805, + "step": 25212 + }, + { + "epoch": 32.36585365853659, + "grad_norm": 1.254733920097351, + "learning_rate": 2.2548566538296963e-05, + "loss": 0.2714, + "step": 25213 + }, + { + "epoch": 32.367137355584084, + "grad_norm": 1.3694576025009155, + "learning_rate": 2.254813863928113e-05, + "loss": 0.2498, + "step": 25214 + }, + { + "epoch": 32.36842105263158, + "grad_norm": 0.9084597229957581, + "learning_rate": 2.2547710740265296e-05, + "loss": 0.2852, + "step": 25215 + }, + { + "epoch": 32.36970474967907, + "grad_norm": 3.462059736251831, + "learning_rate": 2.2547282841249468e-05, + "loss": 0.3022, + "step": 25216 + }, + { + "epoch": 32.37098844672657, + "grad_norm": 2.068122625350952, + "learning_rate": 2.2546854942233633e-05, + "loss": 0.2753, + "step": 25217 + }, + { + "epoch": 32.37227214377407, + "grad_norm": 3.093527317047119, + "learning_rate": 2.25464270432178e-05, + "loss": 0.2733, + "step": 25218 + }, + { + "epoch": 32.37355584082157, + "grad_norm": 1.342772364616394, + "learning_rate": 2.254599914420197e-05, + "loss": 0.2271, + "step": 25219 + }, + { + "epoch": 32.374839537869065, + "grad_norm": 2.2763431072235107, + "learning_rate": 2.2545571245186135e-05, + "loss": 0.2491, + "step": 25220 + }, + { + "epoch": 32.37612323491656, + "grad_norm": 1.933643102645874, + "learning_rate": 2.2545143346170307e-05, + "loss": 0.3095, + "step": 25221 + }, + { + "epoch": 32.37740693196405, + "grad_norm": 2.18463397026062, + "learning_rate": 2.2544715447154472e-05, + "loss": 0.2961, + "step": 25222 + }, + { + "epoch": 32.37869062901155, + "grad_norm": 1.7238624095916748, + "learning_rate": 2.254428754813864e-05, + "loss": 0.2676, + "step": 25223 + }, + { + "epoch": 32.37997432605905, + "grad_norm": 3.109194040298462, + "learning_rate": 2.254385964912281e-05, + "loss": 0.2904, + "step": 25224 + }, + { + "epoch": 32.38125802310655, + "grad_norm": 
1.4901998043060303, + "learning_rate": 2.2543431750106974e-05, + "loss": 0.3078, + "step": 25225 + }, + { + "epoch": 32.382541720154045, + "grad_norm": 2.0342392921447754, + "learning_rate": 2.2543003851091142e-05, + "loss": 0.32, + "step": 25226 + }, + { + "epoch": 32.38382541720154, + "grad_norm": 2.7108495235443115, + "learning_rate": 2.254257595207531e-05, + "loss": 0.3784, + "step": 25227 + }, + { + "epoch": 32.38510911424904, + "grad_norm": 4.698432922363281, + "learning_rate": 2.254214805305948e-05, + "loss": 0.4144, + "step": 25228 + }, + { + "epoch": 32.38639281129653, + "grad_norm": 0.8106711506843567, + "learning_rate": 2.2541720154043644e-05, + "loss": 0.2577, + "step": 25229 + }, + { + "epoch": 32.38767650834403, + "grad_norm": 0.8759785890579224, + "learning_rate": 2.2541292255027816e-05, + "loss": 0.2665, + "step": 25230 + }, + { + "epoch": 32.38896020539153, + "grad_norm": 0.7843012809753418, + "learning_rate": 2.254086435601198e-05, + "loss": 0.2535, + "step": 25231 + }, + { + "epoch": 32.390243902439025, + "grad_norm": 1.35975182056427, + "learning_rate": 2.254043645699615e-05, + "loss": 0.2599, + "step": 25232 + }, + { + "epoch": 32.39152759948652, + "grad_norm": 1.0649182796478271, + "learning_rate": 2.2540008557980318e-05, + "loss": 0.262, + "step": 25233 + }, + { + "epoch": 32.39281129653402, + "grad_norm": 1.985458254814148, + "learning_rate": 2.2539580658964483e-05, + "loss": 0.2615, + "step": 25234 + }, + { + "epoch": 32.39409499358152, + "grad_norm": 1.3450533151626587, + "learning_rate": 2.2539152759948654e-05, + "loss": 0.2722, + "step": 25235 + }, + { + "epoch": 32.39537869062901, + "grad_norm": 2.310002326965332, + "learning_rate": 2.253872486093282e-05, + "loss": 0.2927, + "step": 25236 + }, + { + "epoch": 32.39666238767651, + "grad_norm": 1.0760318040847778, + "learning_rate": 2.2538296961916988e-05, + "loss": 0.2561, + "step": 25237 + }, + { + "epoch": 32.397946084724005, + "grad_norm": 0.8652054667472839, + "learning_rate": 
2.2537869062901156e-05, + "loss": 0.275, + "step": 25238 + }, + { + "epoch": 32.3992297817715, + "grad_norm": 0.9640243649482727, + "learning_rate": 2.253744116388532e-05, + "loss": 0.2527, + "step": 25239 + }, + { + "epoch": 32.400513478819, + "grad_norm": 1.0709750652313232, + "learning_rate": 2.2537013264869493e-05, + "loss": 0.294, + "step": 25240 + }, + { + "epoch": 32.4017971758665, + "grad_norm": 0.8946021795272827, + "learning_rate": 2.2536585365853658e-05, + "loss": 0.2628, + "step": 25241 + }, + { + "epoch": 32.40308087291399, + "grad_norm": 1.1141959428787231, + "learning_rate": 2.2536157466837827e-05, + "loss": 0.2484, + "step": 25242 + }, + { + "epoch": 32.40436456996149, + "grad_norm": 1.1268386840820312, + "learning_rate": 2.2535729567821995e-05, + "loss": 0.2805, + "step": 25243 + }, + { + "epoch": 32.405648267008985, + "grad_norm": 1.103713035583496, + "learning_rate": 2.2535301668806163e-05, + "loss": 0.2788, + "step": 25244 + }, + { + "epoch": 32.40693196405648, + "grad_norm": 1.6091747283935547, + "learning_rate": 2.253487376979033e-05, + "loss": 0.2634, + "step": 25245 + }, + { + "epoch": 32.40821566110398, + "grad_norm": 1.1219652891159058, + "learning_rate": 2.2534445870774497e-05, + "loss": 0.2534, + "step": 25246 + }, + { + "epoch": 32.40949935815148, + "grad_norm": 0.7999095916748047, + "learning_rate": 2.2534017971758665e-05, + "loss": 0.2543, + "step": 25247 + }, + { + "epoch": 32.410783055198976, + "grad_norm": 0.8603190779685974, + "learning_rate": 2.2533590072742834e-05, + "loss": 0.2684, + "step": 25248 + }, + { + "epoch": 32.41206675224647, + "grad_norm": 0.8240912556648254, + "learning_rate": 2.2533162173727002e-05, + "loss": 0.2762, + "step": 25249 + }, + { + "epoch": 32.413350449293965, + "grad_norm": 0.982511043548584, + "learning_rate": 2.2532734274711167e-05, + "loss": 0.2421, + "step": 25250 + }, + { + "epoch": 32.41463414634146, + "grad_norm": 2.762601613998413, + "learning_rate": 2.253230637569534e-05, + "loss": 0.253, + 
"step": 25251 + }, + { + "epoch": 32.41591784338896, + "grad_norm": 1.1452422142028809, + "learning_rate": 2.2531878476679504e-05, + "loss": 0.2337, + "step": 25252 + }, + { + "epoch": 32.41720154043646, + "grad_norm": 1.747379183769226, + "learning_rate": 2.253145057766367e-05, + "loss": 0.2709, + "step": 25253 + }, + { + "epoch": 32.41848523748396, + "grad_norm": 1.085935354232788, + "learning_rate": 2.253102267864784e-05, + "loss": 0.2615, + "step": 25254 + }, + { + "epoch": 32.41976893453145, + "grad_norm": 1.1847171783447266, + "learning_rate": 2.2530594779632006e-05, + "loss": 0.2583, + "step": 25255 + }, + { + "epoch": 32.421052631578945, + "grad_norm": 1.5380483865737915, + "learning_rate": 2.2530166880616178e-05, + "loss": 0.2504, + "step": 25256 + }, + { + "epoch": 32.42233632862644, + "grad_norm": 1.5108686685562134, + "learning_rate": 2.2529738981600343e-05, + "loss": 0.2706, + "step": 25257 + }, + { + "epoch": 32.42362002567394, + "grad_norm": 1.2098941802978516, + "learning_rate": 2.252931108258451e-05, + "loss": 0.2445, + "step": 25258 + }, + { + "epoch": 32.42490372272144, + "grad_norm": 0.9626901745796204, + "learning_rate": 2.252888318356868e-05, + "loss": 0.2282, + "step": 25259 + }, + { + "epoch": 32.42618741976894, + "grad_norm": 1.2370370626449585, + "learning_rate": 2.2528455284552844e-05, + "loss": 0.2441, + "step": 25260 + }, + { + "epoch": 32.427471116816434, + "grad_norm": 1.0440165996551514, + "learning_rate": 2.2528027385537013e-05, + "loss": 0.2588, + "step": 25261 + }, + { + "epoch": 32.428754813863925, + "grad_norm": 3.2924118041992188, + "learning_rate": 2.252759948652118e-05, + "loss": 0.2865, + "step": 25262 + }, + { + "epoch": 32.43003851091142, + "grad_norm": 2.822782516479492, + "learning_rate": 2.252717158750535e-05, + "loss": 0.2654, + "step": 25263 + }, + { + "epoch": 32.43132220795892, + "grad_norm": 1.1693260669708252, + "learning_rate": 2.2526743688489518e-05, + "loss": 0.2499, + "step": 25264 + }, + { + "epoch": 
32.43260590500642, + "grad_norm": 1.7591679096221924, + "learning_rate": 2.2526315789473686e-05, + "loss": 0.2757, + "step": 25265 + }, + { + "epoch": 32.43388960205392, + "grad_norm": 1.2974480390548706, + "learning_rate": 2.252588789045785e-05, + "loss": 0.2765, + "step": 25266 + }, + { + "epoch": 32.435173299101415, + "grad_norm": 3.6882200241088867, + "learning_rate": 2.252545999144202e-05, + "loss": 0.2709, + "step": 25267 + }, + { + "epoch": 32.436456996148905, + "grad_norm": 1.7693276405334473, + "learning_rate": 2.2525032092426188e-05, + "loss": 0.265, + "step": 25268 + }, + { + "epoch": 32.4377406931964, + "grad_norm": 1.1128206253051758, + "learning_rate": 2.2524604193410353e-05, + "loss": 0.2763, + "step": 25269 + }, + { + "epoch": 32.4390243902439, + "grad_norm": 3.2989261150360107, + "learning_rate": 2.2524176294394525e-05, + "loss": 0.2692, + "step": 25270 + }, + { + "epoch": 32.4403080872914, + "grad_norm": 1.5716723203659058, + "learning_rate": 2.252374839537869e-05, + "loss": 0.291, + "step": 25271 + }, + { + "epoch": 32.4415917843389, + "grad_norm": 1.8509724140167236, + "learning_rate": 2.2523320496362862e-05, + "loss": 0.2682, + "step": 25272 + }, + { + "epoch": 32.442875481386395, + "grad_norm": 1.12411630153656, + "learning_rate": 2.2522892597347027e-05, + "loss": 0.2813, + "step": 25273 + }, + { + "epoch": 32.44415917843389, + "grad_norm": 1.4999635219573975, + "learning_rate": 2.2522464698331192e-05, + "loss": 0.2869, + "step": 25274 + }, + { + "epoch": 32.44544287548138, + "grad_norm": 2.2866108417510986, + "learning_rate": 2.2522036799315364e-05, + "loss": 0.2718, + "step": 25275 + }, + { + "epoch": 32.44672657252888, + "grad_norm": 4.134405136108398, + "learning_rate": 2.252160890029953e-05, + "loss": 0.3111, + "step": 25276 + }, + { + "epoch": 32.44801026957638, + "grad_norm": 2.3324179649353027, + "learning_rate": 2.2521181001283697e-05, + "loss": 0.3245, + "step": 25277 + }, + { + "epoch": 32.44929396662388, + "grad_norm": 
3.4949021339416504, + "learning_rate": 2.2520753102267866e-05, + "loss": 0.3938, + "step": 25278 + }, + { + "epoch": 32.450577663671375, + "grad_norm": 1.3623195886611938, + "learning_rate": 2.2520325203252034e-05, + "loss": 0.2536, + "step": 25279 + }, + { + "epoch": 32.45186136071887, + "grad_norm": 0.7689550518989563, + "learning_rate": 2.2519897304236202e-05, + "loss": 0.27, + "step": 25280 + }, + { + "epoch": 32.45314505776637, + "grad_norm": 0.7302724123001099, + "learning_rate": 2.2519469405220367e-05, + "loss": 0.2718, + "step": 25281 + }, + { + "epoch": 32.45442875481386, + "grad_norm": 1.2300548553466797, + "learning_rate": 2.2519041506204536e-05, + "loss": 0.2676, + "step": 25282 + }, + { + "epoch": 32.45571245186136, + "grad_norm": 0.8694095611572266, + "learning_rate": 2.2518613607188704e-05, + "loss": 0.2804, + "step": 25283 + }, + { + "epoch": 32.45699614890886, + "grad_norm": 0.9202761650085449, + "learning_rate": 2.2518185708172873e-05, + "loss": 0.2743, + "step": 25284 + }, + { + "epoch": 32.458279845956355, + "grad_norm": 0.7518244385719299, + "learning_rate": 2.2517757809157038e-05, + "loss": 0.2802, + "step": 25285 + }, + { + "epoch": 32.45956354300385, + "grad_norm": 0.6915493011474609, + "learning_rate": 2.2517329910141206e-05, + "loss": 0.2407, + "step": 25286 + }, + { + "epoch": 32.46084724005135, + "grad_norm": 0.8528629541397095, + "learning_rate": 2.2516902011125375e-05, + "loss": 0.2329, + "step": 25287 + }, + { + "epoch": 32.46213093709884, + "grad_norm": 1.0269228219985962, + "learning_rate": 2.2516474112109543e-05, + "loss": 0.2576, + "step": 25288 + }, + { + "epoch": 32.46341463414634, + "grad_norm": 3.1349737644195557, + "learning_rate": 2.251604621309371e-05, + "loss": 0.2935, + "step": 25289 + }, + { + "epoch": 32.46469833119384, + "grad_norm": 2.656630277633667, + "learning_rate": 2.2515618314077876e-05, + "loss": 0.2477, + "step": 25290 + }, + { + "epoch": 32.465982028241335, + "grad_norm": 0.83622145652771, + "learning_rate": 
2.2515190415062048e-05, + "loss": 0.2488, + "step": 25291 + }, + { + "epoch": 32.46726572528883, + "grad_norm": 1.4492989778518677, + "learning_rate": 2.2514762516046213e-05, + "loss": 0.2662, + "step": 25292 + }, + { + "epoch": 32.46854942233633, + "grad_norm": 0.9699558615684509, + "learning_rate": 2.2514334617030378e-05, + "loss": 0.235, + "step": 25293 + }, + { + "epoch": 32.46983311938383, + "grad_norm": 0.9858879446983337, + "learning_rate": 2.251390671801455e-05, + "loss": 0.2532, + "step": 25294 + }, + { + "epoch": 32.47111681643132, + "grad_norm": 1.1427656412124634, + "learning_rate": 2.2513478818998715e-05, + "loss": 0.2589, + "step": 25295 + }, + { + "epoch": 32.47240051347882, + "grad_norm": 1.1952465772628784, + "learning_rate": 2.2513050919982887e-05, + "loss": 0.2334, + "step": 25296 + }, + { + "epoch": 32.473684210526315, + "grad_norm": 2.8752119541168213, + "learning_rate": 2.2512623020967052e-05, + "loss": 0.2376, + "step": 25297 + }, + { + "epoch": 32.47496790757381, + "grad_norm": 1.0516012907028198, + "learning_rate": 2.251219512195122e-05, + "loss": 0.2798, + "step": 25298 + }, + { + "epoch": 32.47625160462131, + "grad_norm": 0.8689924478530884, + "learning_rate": 2.251176722293539e-05, + "loss": 0.276, + "step": 25299 + }, + { + "epoch": 32.47753530166881, + "grad_norm": 0.7736949324607849, + "learning_rate": 2.2511339323919554e-05, + "loss": 0.2685, + "step": 25300 + }, + { + "epoch": 32.47881899871631, + "grad_norm": 2.6796741485595703, + "learning_rate": 2.2510911424903722e-05, + "loss": 0.2589, + "step": 25301 + }, + { + "epoch": 32.4801026957638, + "grad_norm": 2.5336272716522217, + "learning_rate": 2.251048352588789e-05, + "loss": 0.254, + "step": 25302 + }, + { + "epoch": 32.481386392811295, + "grad_norm": 1.303324818611145, + "learning_rate": 2.251005562687206e-05, + "loss": 0.2582, + "step": 25303 + }, + { + "epoch": 32.48267008985879, + "grad_norm": 1.3613452911376953, + "learning_rate": 2.2509627727856227e-05, + "loss": 0.2419, + 
"step": 25304 + }, + { + "epoch": 32.48395378690629, + "grad_norm": 1.9029353857040405, + "learning_rate": 2.2509199828840396e-05, + "loss": 0.2625, + "step": 25305 + }, + { + "epoch": 32.48523748395379, + "grad_norm": 1.0055863857269287, + "learning_rate": 2.250877192982456e-05, + "loss": 0.2406, + "step": 25306 + }, + { + "epoch": 32.48652118100129, + "grad_norm": 3.607515811920166, + "learning_rate": 2.250834403080873e-05, + "loss": 0.2073, + "step": 25307 + }, + { + "epoch": 32.48780487804878, + "grad_norm": 0.9417652487754822, + "learning_rate": 2.2507916131792898e-05, + "loss": 0.266, + "step": 25308 + }, + { + "epoch": 32.489088575096275, + "grad_norm": 0.9872702956199646, + "learning_rate": 2.2507488232777063e-05, + "loss": 0.2425, + "step": 25309 + }, + { + "epoch": 32.49037227214377, + "grad_norm": 2.92864727973938, + "learning_rate": 2.2507060333761234e-05, + "loss": 0.2676, + "step": 25310 + }, + { + "epoch": 32.49165596919127, + "grad_norm": 1.8244783878326416, + "learning_rate": 2.25066324347454e-05, + "loss": 0.2622, + "step": 25311 + }, + { + "epoch": 32.49293966623877, + "grad_norm": 1.2055691480636597, + "learning_rate": 2.250620453572957e-05, + "loss": 0.2495, + "step": 25312 + }, + { + "epoch": 32.49422336328627, + "grad_norm": 2.4374828338623047, + "learning_rate": 2.2505776636713736e-05, + "loss": 0.2458, + "step": 25313 + }, + { + "epoch": 32.495507060333765, + "grad_norm": 2.5838773250579834, + "learning_rate": 2.25053487376979e-05, + "loss": 0.2447, + "step": 25314 + }, + { + "epoch": 32.496790757381255, + "grad_norm": 1.547532320022583, + "learning_rate": 2.2504920838682073e-05, + "loss": 0.272, + "step": 25315 + }, + { + "epoch": 32.49807445442875, + "grad_norm": 1.9018480777740479, + "learning_rate": 2.2504492939666238e-05, + "loss": 0.273, + "step": 25316 + }, + { + "epoch": 32.49935815147625, + "grad_norm": 1.1291308403015137, + "learning_rate": 2.2504065040650407e-05, + "loss": 0.2486, + "step": 25317 + }, + { + "epoch": 
32.50064184852375, + "grad_norm": 1.2046961784362793, + "learning_rate": 2.2503637141634575e-05, + "loss": 0.2634, + "step": 25318 + }, + { + "epoch": 32.50192554557125, + "grad_norm": 0.955288827419281, + "learning_rate": 2.2503209242618743e-05, + "loss": 0.2518, + "step": 25319 + }, + { + "epoch": 32.503209242618745, + "grad_norm": 1.1609933376312256, + "learning_rate": 2.2502781343602912e-05, + "loss": 0.3047, + "step": 25320 + }, + { + "epoch": 32.504492939666235, + "grad_norm": 2.282679319381714, + "learning_rate": 2.2502353444587077e-05, + "loss": 0.275, + "step": 25321 + }, + { + "epoch": 32.50577663671373, + "grad_norm": 1.9064960479736328, + "learning_rate": 2.2501925545571245e-05, + "loss": 0.2559, + "step": 25322 + }, + { + "epoch": 32.50706033376123, + "grad_norm": 5.6279497146606445, + "learning_rate": 2.2501497646555414e-05, + "loss": 0.2824, + "step": 25323 + }, + { + "epoch": 32.50834403080873, + "grad_norm": 2.181222438812256, + "learning_rate": 2.2501069747539582e-05, + "loss": 0.2884, + "step": 25324 + }, + { + "epoch": 32.50962772785623, + "grad_norm": 2.7647318840026855, + "learning_rate": 2.2500641848523747e-05, + "loss": 0.3038, + "step": 25325 + }, + { + "epoch": 32.510911424903725, + "grad_norm": 3.2015836238861084, + "learning_rate": 2.250021394950792e-05, + "loss": 0.3212, + "step": 25326 + }, + { + "epoch": 32.51219512195122, + "grad_norm": 1.8434909582138062, + "learning_rate": 2.2499786050492084e-05, + "loss": 0.3092, + "step": 25327 + }, + { + "epoch": 32.51347881899871, + "grad_norm": 2.311401605606079, + "learning_rate": 2.2499358151476252e-05, + "loss": 0.4498, + "step": 25328 + }, + { + "epoch": 32.51476251604621, + "grad_norm": 1.0065182447433472, + "learning_rate": 2.249893025246042e-05, + "loss": 0.2627, + "step": 25329 + }, + { + "epoch": 32.51604621309371, + "grad_norm": 2.856015682220459, + "learning_rate": 2.2498502353444586e-05, + "loss": 0.2624, + "step": 25330 + }, + { + "epoch": 32.51732991014121, + "grad_norm": 
0.7356549501419067, + "learning_rate": 2.2498074454428758e-05, + "loss": 0.2467, + "step": 25331 + }, + { + "epoch": 32.518613607188705, + "grad_norm": 1.197121024131775, + "learning_rate": 2.2497646555412923e-05, + "loss": 0.2624, + "step": 25332 + }, + { + "epoch": 32.5198973042362, + "grad_norm": 2.0727241039276123, + "learning_rate": 2.249721865639709e-05, + "loss": 0.2575, + "step": 25333 + }, + { + "epoch": 32.52118100128369, + "grad_norm": 1.9653606414794922, + "learning_rate": 2.249679075738126e-05, + "loss": 0.251, + "step": 25334 + }, + { + "epoch": 32.52246469833119, + "grad_norm": 0.6774297952651978, + "learning_rate": 2.2496362858365424e-05, + "loss": 0.2642, + "step": 25335 + }, + { + "epoch": 32.52374839537869, + "grad_norm": 1.1536306142807007, + "learning_rate": 2.2495934959349596e-05, + "loss": 0.2633, + "step": 25336 + }, + { + "epoch": 32.52503209242619, + "grad_norm": 1.0083489418029785, + "learning_rate": 2.249550706033376e-05, + "loss": 0.2708, + "step": 25337 + }, + { + "epoch": 32.526315789473685, + "grad_norm": 0.8332233428955078, + "learning_rate": 2.249507916131793e-05, + "loss": 0.2569, + "step": 25338 + }, + { + "epoch": 32.52759948652118, + "grad_norm": 2.0493576526641846, + "learning_rate": 2.2494651262302098e-05, + "loss": 0.251, + "step": 25339 + }, + { + "epoch": 32.52888318356868, + "grad_norm": 1.278892159461975, + "learning_rate": 2.2494223363286267e-05, + "loss": 0.3137, + "step": 25340 + }, + { + "epoch": 32.53016688061617, + "grad_norm": 0.7878326177597046, + "learning_rate": 2.249379546427043e-05, + "loss": 0.2625, + "step": 25341 + }, + { + "epoch": 32.53145057766367, + "grad_norm": 0.7135564088821411, + "learning_rate": 2.24933675652546e-05, + "loss": 0.2573, + "step": 25342 + }, + { + "epoch": 32.53273427471117, + "grad_norm": 0.8071168065071106, + "learning_rate": 2.249293966623877e-05, + "loss": 0.2658, + "step": 25343 + }, + { + "epoch": 32.534017971758665, + "grad_norm": 0.697235643863678, + "learning_rate": 
2.2492511767222937e-05, + "loss": 0.269, + "step": 25344 + }, + { + "epoch": 32.53530166880616, + "grad_norm": 0.8643659353256226, + "learning_rate": 2.2492083868207105e-05, + "loss": 0.2489, + "step": 25345 + }, + { + "epoch": 32.53658536585366, + "grad_norm": 1.2218315601348877, + "learning_rate": 2.249165596919127e-05, + "loss": 0.2729, + "step": 25346 + }, + { + "epoch": 32.53786906290116, + "grad_norm": 1.04714035987854, + "learning_rate": 2.249122807017544e-05, + "loss": 0.2693, + "step": 25347 + }, + { + "epoch": 32.53915275994865, + "grad_norm": 1.6011083126068115, + "learning_rate": 2.2490800171159607e-05, + "loss": 0.2347, + "step": 25348 + }, + { + "epoch": 32.54043645699615, + "grad_norm": 0.774992048740387, + "learning_rate": 2.2490372272143772e-05, + "loss": 0.2313, + "step": 25349 + }, + { + "epoch": 32.541720154043645, + "grad_norm": 0.9162653684616089, + "learning_rate": 2.2489944373127944e-05, + "loss": 0.2442, + "step": 25350 + }, + { + "epoch": 32.54300385109114, + "grad_norm": 1.037571907043457, + "learning_rate": 2.248951647411211e-05, + "loss": 0.2777, + "step": 25351 + }, + { + "epoch": 32.54428754813864, + "grad_norm": 1.1344622373580933, + "learning_rate": 2.248908857509628e-05, + "loss": 0.2347, + "step": 25352 + }, + { + "epoch": 32.54557124518614, + "grad_norm": 2.094932794570923, + "learning_rate": 2.2488660676080446e-05, + "loss": 0.2221, + "step": 25353 + }, + { + "epoch": 32.54685494223363, + "grad_norm": 8.31696605682373, + "learning_rate": 2.248823277706461e-05, + "loss": 0.25, + "step": 25354 + }, + { + "epoch": 32.54813863928113, + "grad_norm": 1.1717710494995117, + "learning_rate": 2.2487804878048783e-05, + "loss": 0.2408, + "step": 25355 + }, + { + "epoch": 32.549422336328625, + "grad_norm": 1.2095063924789429, + "learning_rate": 2.2487376979032948e-05, + "loss": 0.2681, + "step": 25356 + }, + { + "epoch": 32.55070603337612, + "grad_norm": 0.9497855305671692, + "learning_rate": 2.2486949080017116e-05, + "loss": 0.2462, + 
"step": 25357 + }, + { + "epoch": 32.55198973042362, + "grad_norm": 2.0300002098083496, + "learning_rate": 2.2486521181001284e-05, + "loss": 0.2618, + "step": 25358 + }, + { + "epoch": 32.55327342747112, + "grad_norm": 1.8795323371887207, + "learning_rate": 2.2486093281985453e-05, + "loss": 0.2714, + "step": 25359 + }, + { + "epoch": 32.55455712451862, + "grad_norm": 2.308821439743042, + "learning_rate": 2.248566538296962e-05, + "loss": 0.2276, + "step": 25360 + }, + { + "epoch": 32.55584082156611, + "grad_norm": 2.597339391708374, + "learning_rate": 2.2485237483953786e-05, + "loss": 0.2622, + "step": 25361 + }, + { + "epoch": 32.557124518613605, + "grad_norm": 0.9309077858924866, + "learning_rate": 2.2484809584937955e-05, + "loss": 0.2507, + "step": 25362 + }, + { + "epoch": 32.5584082156611, + "grad_norm": 1.0796308517456055, + "learning_rate": 2.2484381685922123e-05, + "loss": 0.2708, + "step": 25363 + }, + { + "epoch": 32.5596919127086, + "grad_norm": 1.024604082107544, + "learning_rate": 2.248395378690629e-05, + "loss": 0.2382, + "step": 25364 + }, + { + "epoch": 32.5609756097561, + "grad_norm": 1.3056604862213135, + "learning_rate": 2.2483525887890456e-05, + "loss": 0.2197, + "step": 25365 + }, + { + "epoch": 32.5622593068036, + "grad_norm": 1.6791224479675293, + "learning_rate": 2.2483097988874628e-05, + "loss": 0.2386, + "step": 25366 + }, + { + "epoch": 32.563543003851095, + "grad_norm": 2.088155746459961, + "learning_rate": 2.2482670089858793e-05, + "loss": 0.2195, + "step": 25367 + }, + { + "epoch": 32.564826700898585, + "grad_norm": 1.2314494848251343, + "learning_rate": 2.2482242190842962e-05, + "loss": 0.2363, + "step": 25368 + }, + { + "epoch": 32.56611039794608, + "grad_norm": 1.0244022607803345, + "learning_rate": 2.248181429182713e-05, + "loss": 0.255, + "step": 25369 + }, + { + "epoch": 32.56739409499358, + "grad_norm": 2.2123355865478516, + "learning_rate": 2.2481386392811295e-05, + "loss": 0.2967, + "step": 25370 + }, + { + "epoch": 
32.56867779204108, + "grad_norm": 1.1503223180770874, + "learning_rate": 2.2480958493795467e-05, + "loss": 0.2498, + "step": 25371 + }, + { + "epoch": 32.56996148908858, + "grad_norm": 1.2139090299606323, + "learning_rate": 2.2480530594779632e-05, + "loss": 0.2709, + "step": 25372 + }, + { + "epoch": 32.571245186136075, + "grad_norm": 1.1466844081878662, + "learning_rate": 2.24801026957638e-05, + "loss": 0.2938, + "step": 25373 + }, + { + "epoch": 32.572528883183566, + "grad_norm": 4.553929328918457, + "learning_rate": 2.247967479674797e-05, + "loss": 0.2676, + "step": 25374 + }, + { + "epoch": 32.57381258023106, + "grad_norm": 1.409719705581665, + "learning_rate": 2.2479246897732134e-05, + "loss": 0.2786, + "step": 25375 + }, + { + "epoch": 32.57509627727856, + "grad_norm": 1.4455273151397705, + "learning_rate": 2.2478818998716306e-05, + "loss": 0.2987, + "step": 25376 + }, + { + "epoch": 32.57637997432606, + "grad_norm": 3.303725004196167, + "learning_rate": 2.247839109970047e-05, + "loss": 0.3358, + "step": 25377 + }, + { + "epoch": 32.57766367137356, + "grad_norm": 3.434710741043091, + "learning_rate": 2.247796320068464e-05, + "loss": 0.4327, + "step": 25378 + }, + { + "epoch": 32.578947368421055, + "grad_norm": 1.062109112739563, + "learning_rate": 2.2477535301668807e-05, + "loss": 0.2604, + "step": 25379 + }, + { + "epoch": 32.58023106546855, + "grad_norm": 0.8053621649742126, + "learning_rate": 2.2477107402652976e-05, + "loss": 0.2567, + "step": 25380 + }, + { + "epoch": 32.58151476251604, + "grad_norm": 1.163044810295105, + "learning_rate": 2.247667950363714e-05, + "loss": 0.2666, + "step": 25381 + }, + { + "epoch": 32.58279845956354, + "grad_norm": 1.8148167133331299, + "learning_rate": 2.247625160462131e-05, + "loss": 0.2663, + "step": 25382 + }, + { + "epoch": 32.58408215661104, + "grad_norm": 1.1105087995529175, + "learning_rate": 2.2475823705605478e-05, + "loss": 0.2388, + "step": 25383 + }, + { + "epoch": 32.58536585365854, + "grad_norm": 
1.2559940814971924, + "learning_rate": 2.2475395806589646e-05, + "loss": 0.2608, + "step": 25384 + }, + { + "epoch": 32.586649550706035, + "grad_norm": 0.8807612657546997, + "learning_rate": 2.2474967907573815e-05, + "loss": 0.2593, + "step": 25385 + }, + { + "epoch": 32.58793324775353, + "grad_norm": 1.2989484071731567, + "learning_rate": 2.247454000855798e-05, + "loss": 0.2539, + "step": 25386 + }, + { + "epoch": 32.589216944801024, + "grad_norm": 1.6048882007598877, + "learning_rate": 2.247411210954215e-05, + "loss": 0.2563, + "step": 25387 + }, + { + "epoch": 32.59050064184852, + "grad_norm": 2.5595903396606445, + "learning_rate": 2.2473684210526316e-05, + "loss": 0.2809, + "step": 25388 + }, + { + "epoch": 32.59178433889602, + "grad_norm": 0.8545824289321899, + "learning_rate": 2.247325631151048e-05, + "loss": 0.2761, + "step": 25389 + }, + { + "epoch": 32.59306803594352, + "grad_norm": 0.9660294651985168, + "learning_rate": 2.2472828412494653e-05, + "loss": 0.2398, + "step": 25390 + }, + { + "epoch": 32.594351732991015, + "grad_norm": 2.1444287300109863, + "learning_rate": 2.2472400513478818e-05, + "loss": 0.2578, + "step": 25391 + }, + { + "epoch": 32.59563543003851, + "grad_norm": 1.3700900077819824, + "learning_rate": 2.247197261446299e-05, + "loss": 0.2477, + "step": 25392 + }, + { + "epoch": 32.59691912708601, + "grad_norm": 1.1400256156921387, + "learning_rate": 2.2471544715447155e-05, + "loss": 0.291, + "step": 25393 + }, + { + "epoch": 32.5982028241335, + "grad_norm": 1.0603748559951782, + "learning_rate": 2.2471116816431323e-05, + "loss": 0.2619, + "step": 25394 + }, + { + "epoch": 32.599486521181, + "grad_norm": 0.808992326259613, + "learning_rate": 2.2470688917415492e-05, + "loss": 0.2563, + "step": 25395 + }, + { + "epoch": 32.6007702182285, + "grad_norm": 0.990426242351532, + "learning_rate": 2.2470261018399657e-05, + "loss": 0.2625, + "step": 25396 + }, + { + "epoch": 32.602053915275995, + "grad_norm": 1.2519989013671875, + "learning_rate": 
2.2469833119383825e-05, + "loss": 0.2532, + "step": 25397 + }, + { + "epoch": 32.60333761232349, + "grad_norm": 1.4491647481918335, + "learning_rate": 2.2469405220367994e-05, + "loss": 0.2308, + "step": 25398 + }, + { + "epoch": 32.60462130937099, + "grad_norm": 3.1741580963134766, + "learning_rate": 2.2468977321352162e-05, + "loss": 0.2657, + "step": 25399 + }, + { + "epoch": 32.60590500641848, + "grad_norm": 1.930168867111206, + "learning_rate": 2.246854942233633e-05, + "loss": 0.2639, + "step": 25400 + }, + { + "epoch": 32.60718870346598, + "grad_norm": 1.3127392530441284, + "learning_rate": 2.24681215233205e-05, + "loss": 0.2561, + "step": 25401 + }, + { + "epoch": 32.60847240051348, + "grad_norm": 1.0506552457809448, + "learning_rate": 2.2467693624304664e-05, + "loss": 0.2355, + "step": 25402 + }, + { + "epoch": 32.609756097560975, + "grad_norm": 1.1806666851043701, + "learning_rate": 2.2467265725288832e-05, + "loss": 0.259, + "step": 25403 + }, + { + "epoch": 32.61103979460847, + "grad_norm": 2.4016361236572266, + "learning_rate": 2.2466837826273e-05, + "loss": 0.2659, + "step": 25404 + }, + { + "epoch": 32.61232349165597, + "grad_norm": 2.203314781188965, + "learning_rate": 2.2466409927257166e-05, + "loss": 0.2274, + "step": 25405 + }, + { + "epoch": 32.61360718870347, + "grad_norm": 1.247710943222046, + "learning_rate": 2.2465982028241338e-05, + "loss": 0.2276, + "step": 25406 + }, + { + "epoch": 32.61489088575096, + "grad_norm": 1.1303536891937256, + "learning_rate": 2.2465554129225503e-05, + "loss": 0.2292, + "step": 25407 + }, + { + "epoch": 32.61617458279846, + "grad_norm": 0.8457940816879272, + "learning_rate": 2.246512623020967e-05, + "loss": 0.2749, + "step": 25408 + }, + { + "epoch": 32.617458279845955, + "grad_norm": 4.135504722595215, + "learning_rate": 2.246469833119384e-05, + "loss": 0.2559, + "step": 25409 + }, + { + "epoch": 32.61874197689345, + "grad_norm": 1.5812064409255981, + "learning_rate": 2.2464270432178005e-05, + "loss": 0.2394, + 
"step": 25410 + }, + { + "epoch": 32.62002567394095, + "grad_norm": 1.1121630668640137, + "learning_rate": 2.2463842533162176e-05, + "loss": 0.2891, + "step": 25411 + }, + { + "epoch": 32.62130937098845, + "grad_norm": 1.5603734254837036, + "learning_rate": 2.246341463414634e-05, + "loss": 0.2927, + "step": 25412 + }, + { + "epoch": 32.62259306803595, + "grad_norm": 1.748975157737732, + "learning_rate": 2.246298673513051e-05, + "loss": 0.2461, + "step": 25413 + }, + { + "epoch": 32.62387676508344, + "grad_norm": 1.1842870712280273, + "learning_rate": 2.2462558836114678e-05, + "loss": 0.2301, + "step": 25414 + }, + { + "epoch": 32.625160462130935, + "grad_norm": 1.394347906112671, + "learning_rate": 2.2462130937098843e-05, + "loss": 0.2874, + "step": 25415 + }, + { + "epoch": 32.62644415917843, + "grad_norm": 1.1709015369415283, + "learning_rate": 2.2461703038083015e-05, + "loss": 0.2664, + "step": 25416 + }, + { + "epoch": 32.62772785622593, + "grad_norm": 2.5481607913970947, + "learning_rate": 2.246127513906718e-05, + "loss": 0.2742, + "step": 25417 + }, + { + "epoch": 32.62901155327343, + "grad_norm": 1.6945527791976929, + "learning_rate": 2.246084724005135e-05, + "loss": 0.2618, + "step": 25418 + }, + { + "epoch": 32.63029525032093, + "grad_norm": 1.389259696006775, + "learning_rate": 2.2460419341035517e-05, + "loss": 0.2333, + "step": 25419 + }, + { + "epoch": 32.63157894736842, + "grad_norm": 4.523846626281738, + "learning_rate": 2.2459991442019685e-05, + "loss": 0.3042, + "step": 25420 + }, + { + "epoch": 32.632862644415916, + "grad_norm": 1.760924220085144, + "learning_rate": 2.245956354300385e-05, + "loss": 0.2468, + "step": 25421 + }, + { + "epoch": 32.63414634146341, + "grad_norm": 3.929018974304199, + "learning_rate": 2.245913564398802e-05, + "loss": 0.2681, + "step": 25422 + }, + { + "epoch": 32.63543003851091, + "grad_norm": 1.3067554235458374, + "learning_rate": 2.2458707744972187e-05, + "loss": 0.2968, + "step": 25423 + }, + { + "epoch": 
32.63671373555841, + "grad_norm": 4.06986141204834, + "learning_rate": 2.2458279845956355e-05, + "loss": 0.2562, + "step": 25424 + }, + { + "epoch": 32.63799743260591, + "grad_norm": 1.9465632438659668, + "learning_rate": 2.2457851946940524e-05, + "loss": 0.2827, + "step": 25425 + }, + { + "epoch": 32.639281129653405, + "grad_norm": 2.0325207710266113, + "learning_rate": 2.245742404792469e-05, + "loss": 0.3093, + "step": 25426 + }, + { + "epoch": 32.640564826700896, + "grad_norm": 2.4648208618164062, + "learning_rate": 2.245699614890886e-05, + "loss": 0.2938, + "step": 25427 + }, + { + "epoch": 32.64184852374839, + "grad_norm": 2.927765369415283, + "learning_rate": 2.2456568249893026e-05, + "loss": 0.455, + "step": 25428 + }, + { + "epoch": 32.64313222079589, + "grad_norm": 1.1756017208099365, + "learning_rate": 2.245614035087719e-05, + "loss": 0.2588, + "step": 25429 + }, + { + "epoch": 32.64441591784339, + "grad_norm": 0.7049117684364319, + "learning_rate": 2.2455712451861363e-05, + "loss": 0.2582, + "step": 25430 + }, + { + "epoch": 32.64569961489089, + "grad_norm": 1.0356026887893677, + "learning_rate": 2.2455284552845528e-05, + "loss": 0.2624, + "step": 25431 + }, + { + "epoch": 32.646983311938385, + "grad_norm": 0.9938268661499023, + "learning_rate": 2.2454856653829696e-05, + "loss": 0.2961, + "step": 25432 + }, + { + "epoch": 32.64826700898588, + "grad_norm": 0.8776508569717407, + "learning_rate": 2.2454428754813864e-05, + "loss": 0.269, + "step": 25433 + }, + { + "epoch": 32.649550706033374, + "grad_norm": 1.1586289405822754, + "learning_rate": 2.2454000855798033e-05, + "loss": 0.2448, + "step": 25434 + }, + { + "epoch": 32.65083440308087, + "grad_norm": 1.1530112028121948, + "learning_rate": 2.24535729567822e-05, + "loss": 0.2506, + "step": 25435 + }, + { + "epoch": 32.65211810012837, + "grad_norm": 0.6112149357795715, + "learning_rate": 2.2453145057766366e-05, + "loss": 0.2605, + "step": 25436 + }, + { + "epoch": 32.65340179717587, + "grad_norm": 
0.8695530891418457, + "learning_rate": 2.2452717158750535e-05, + "loss": 0.2793, + "step": 25437 + }, + { + "epoch": 32.654685494223365, + "grad_norm": 2.1661176681518555, + "learning_rate": 2.2452289259734703e-05, + "loss": 0.2748, + "step": 25438 + }, + { + "epoch": 32.65596919127086, + "grad_norm": 1.074995517730713, + "learning_rate": 2.245186136071887e-05, + "loss": 0.2639, + "step": 25439 + }, + { + "epoch": 32.657252888318354, + "grad_norm": 1.3093076944351196, + "learning_rate": 2.2451433461703037e-05, + "loss": 0.2526, + "step": 25440 + }, + { + "epoch": 32.65853658536585, + "grad_norm": 0.9670606255531311, + "learning_rate": 2.245100556268721e-05, + "loss": 0.263, + "step": 25441 + }, + { + "epoch": 32.65982028241335, + "grad_norm": 0.9371205568313599, + "learning_rate": 2.2450577663671373e-05, + "loss": 0.2733, + "step": 25442 + }, + { + "epoch": 32.66110397946085, + "grad_norm": 0.9177755117416382, + "learning_rate": 2.2450149764655542e-05, + "loss": 0.2574, + "step": 25443 + }, + { + "epoch": 32.662387676508345, + "grad_norm": 1.2301409244537354, + "learning_rate": 2.244972186563971e-05, + "loss": 0.2478, + "step": 25444 + }, + { + "epoch": 32.66367137355584, + "grad_norm": 0.8384509682655334, + "learning_rate": 2.2449293966623875e-05, + "loss": 0.259, + "step": 25445 + }, + { + "epoch": 32.66495507060334, + "grad_norm": 1.0086500644683838, + "learning_rate": 2.2448866067608047e-05, + "loss": 0.2467, + "step": 25446 + }, + { + "epoch": 32.66623876765083, + "grad_norm": 0.69861900806427, + "learning_rate": 2.2448438168592212e-05, + "loss": 0.2474, + "step": 25447 + }, + { + "epoch": 32.66752246469833, + "grad_norm": 0.9943105578422546, + "learning_rate": 2.244801026957638e-05, + "loss": 0.2694, + "step": 25448 + }, + { + "epoch": 32.66880616174583, + "grad_norm": 1.1131566762924194, + "learning_rate": 2.244758237056055e-05, + "loss": 0.2488, + "step": 25449 + }, + { + "epoch": 32.670089858793325, + "grad_norm": 1.209069013595581, + "learning_rate": 
2.2447154471544714e-05, + "loss": 0.2376, + "step": 25450 + }, + { + "epoch": 32.67137355584082, + "grad_norm": 1.1664915084838867, + "learning_rate": 2.2446726572528886e-05, + "loss": 0.2422, + "step": 25451 + }, + { + "epoch": 32.67265725288832, + "grad_norm": 0.9289917945861816, + "learning_rate": 2.244629867351305e-05, + "loss": 0.2337, + "step": 25452 + }, + { + "epoch": 32.67394094993581, + "grad_norm": 2.4367148876190186, + "learning_rate": 2.244587077449722e-05, + "loss": 0.2581, + "step": 25453 + }, + { + "epoch": 32.67522464698331, + "grad_norm": 2.0961711406707764, + "learning_rate": 2.2445442875481388e-05, + "loss": 0.2767, + "step": 25454 + }, + { + "epoch": 32.67650834403081, + "grad_norm": 1.2291816473007202, + "learning_rate": 2.2445014976465556e-05, + "loss": 0.2658, + "step": 25455 + }, + { + "epoch": 32.677792041078305, + "grad_norm": 0.9921278953552246, + "learning_rate": 2.244458707744972e-05, + "loss": 0.252, + "step": 25456 + }, + { + "epoch": 32.6790757381258, + "grad_norm": 1.0735474824905396, + "learning_rate": 2.244415917843389e-05, + "loss": 0.2294, + "step": 25457 + }, + { + "epoch": 32.6803594351733, + "grad_norm": 0.9101552963256836, + "learning_rate": 2.2443731279418058e-05, + "loss": 0.2559, + "step": 25458 + }, + { + "epoch": 32.6816431322208, + "grad_norm": 1.4136065244674683, + "learning_rate": 2.2443303380402226e-05, + "loss": 0.2373, + "step": 25459 + }, + { + "epoch": 32.68292682926829, + "grad_norm": 2.30497407913208, + "learning_rate": 2.2442875481386395e-05, + "loss": 0.2459, + "step": 25460 + }, + { + "epoch": 32.68421052631579, + "grad_norm": 1.5178372859954834, + "learning_rate": 2.244244758237056e-05, + "loss": 0.2992, + "step": 25461 + }, + { + "epoch": 32.685494223363285, + "grad_norm": 1.123503565788269, + "learning_rate": 2.244201968335473e-05, + "loss": 0.2746, + "step": 25462 + }, + { + "epoch": 32.68677792041078, + "grad_norm": 1.2400754690170288, + "learning_rate": 2.2441591784338896e-05, + "loss": 0.2696, + 
"step": 25463 + }, + { + "epoch": 32.68806161745828, + "grad_norm": 1.1800591945648193, + "learning_rate": 2.244116388532306e-05, + "loss": 0.2514, + "step": 25464 + }, + { + "epoch": 32.68934531450578, + "grad_norm": 4.528770923614502, + "learning_rate": 2.2440735986307233e-05, + "loss": 0.2503, + "step": 25465 + }, + { + "epoch": 32.69062901155327, + "grad_norm": 1.1865001916885376, + "learning_rate": 2.2440308087291398e-05, + "loss": 0.2776, + "step": 25466 + }, + { + "epoch": 32.69191270860077, + "grad_norm": 2.1183741092681885, + "learning_rate": 2.243988018827557e-05, + "loss": 0.2782, + "step": 25467 + }, + { + "epoch": 32.693196405648266, + "grad_norm": 2.4785847663879395, + "learning_rate": 2.2439452289259735e-05, + "loss": 0.2311, + "step": 25468 + }, + { + "epoch": 32.69448010269576, + "grad_norm": 1.0875860452651978, + "learning_rate": 2.2439024390243904e-05, + "loss": 0.251, + "step": 25469 + }, + { + "epoch": 32.69576379974326, + "grad_norm": 1.8646270036697388, + "learning_rate": 2.2438596491228072e-05, + "loss": 0.2916, + "step": 25470 + }, + { + "epoch": 32.69704749679076, + "grad_norm": 1.3706445693969727, + "learning_rate": 2.2438168592212237e-05, + "loss": 0.3075, + "step": 25471 + }, + { + "epoch": 32.69833119383826, + "grad_norm": 1.1670732498168945, + "learning_rate": 2.2437740693196405e-05, + "loss": 0.262, + "step": 25472 + }, + { + "epoch": 32.69961489088575, + "grad_norm": 1.8534107208251953, + "learning_rate": 2.2437312794180574e-05, + "loss": 0.2863, + "step": 25473 + }, + { + "epoch": 32.700898587933246, + "grad_norm": 3.375330686569214, + "learning_rate": 2.2436884895164742e-05, + "loss": 0.3107, + "step": 25474 + }, + { + "epoch": 32.70218228498074, + "grad_norm": 1.268032431602478, + "learning_rate": 2.243645699614891e-05, + "loss": 0.2722, + "step": 25475 + }, + { + "epoch": 32.70346598202824, + "grad_norm": 1.979894757270813, + "learning_rate": 2.2436029097133076e-05, + "loss": 0.338, + "step": 25476 + }, + { + "epoch": 
32.70474967907574, + "grad_norm": 4.726170063018799, + "learning_rate": 2.2435601198117244e-05, + "loss": 0.3256, + "step": 25477 + }, + { + "epoch": 32.70603337612324, + "grad_norm": 4.724859237670898, + "learning_rate": 2.2435173299101412e-05, + "loss": 0.4138, + "step": 25478 + }, + { + "epoch": 32.707317073170735, + "grad_norm": 1.1275279521942139, + "learning_rate": 2.243474540008558e-05, + "loss": 0.2917, + "step": 25479 + }, + { + "epoch": 32.708600770218226, + "grad_norm": 1.1139957904815674, + "learning_rate": 2.2434317501069746e-05, + "loss": 0.2366, + "step": 25480 + }, + { + "epoch": 32.709884467265724, + "grad_norm": 0.8924817442893982, + "learning_rate": 2.2433889602053918e-05, + "loss": 0.2612, + "step": 25481 + }, + { + "epoch": 32.71116816431322, + "grad_norm": 0.8969893455505371, + "learning_rate": 2.2433461703038083e-05, + "loss": 0.2779, + "step": 25482 + }, + { + "epoch": 32.71245186136072, + "grad_norm": 1.5252048969268799, + "learning_rate": 2.243303380402225e-05, + "loss": 0.2583, + "step": 25483 + }, + { + "epoch": 32.71373555840822, + "grad_norm": 1.4248387813568115, + "learning_rate": 2.243260590500642e-05, + "loss": 0.2958, + "step": 25484 + }, + { + "epoch": 32.715019255455715, + "grad_norm": 1.4849666357040405, + "learning_rate": 2.2432178005990585e-05, + "loss": 0.2786, + "step": 25485 + }, + { + "epoch": 32.716302952503206, + "grad_norm": 0.7603459358215332, + "learning_rate": 2.2431750106974756e-05, + "loss": 0.2941, + "step": 25486 + }, + { + "epoch": 32.717586649550704, + "grad_norm": 0.7887300252914429, + "learning_rate": 2.243132220795892e-05, + "loss": 0.2583, + "step": 25487 + }, + { + "epoch": 32.7188703465982, + "grad_norm": 0.9072586297988892, + "learning_rate": 2.243089430894309e-05, + "loss": 0.2703, + "step": 25488 + }, + { + "epoch": 32.7201540436457, + "grad_norm": 0.7540363669395447, + "learning_rate": 2.2430466409927258e-05, + "loss": 0.2856, + "step": 25489 + }, + { + "epoch": 32.7214377406932, + "grad_norm": 
1.0926693677902222, + "learning_rate": 2.2430038510911423e-05, + "loss": 0.2684, + "step": 25490 + }, + { + "epoch": 32.722721437740695, + "grad_norm": 0.9518724679946899, + "learning_rate": 2.2429610611895595e-05, + "loss": 0.2446, + "step": 25491 + }, + { + "epoch": 32.72400513478819, + "grad_norm": 0.8000520467758179, + "learning_rate": 2.242918271287976e-05, + "loss": 0.2662, + "step": 25492 + }, + { + "epoch": 32.725288831835684, + "grad_norm": 1.0125168561935425, + "learning_rate": 2.242875481386393e-05, + "loss": 0.2494, + "step": 25493 + }, + { + "epoch": 32.72657252888318, + "grad_norm": 0.8495882749557495, + "learning_rate": 2.2428326914848097e-05, + "loss": 0.2592, + "step": 25494 + }, + { + "epoch": 32.72785622593068, + "grad_norm": 1.7341103553771973, + "learning_rate": 2.2427899015832265e-05, + "loss": 0.246, + "step": 25495 + }, + { + "epoch": 32.72913992297818, + "grad_norm": 1.0655779838562012, + "learning_rate": 2.242747111681643e-05, + "loss": 0.2679, + "step": 25496 + }, + { + "epoch": 32.730423620025675, + "grad_norm": 1.5453846454620361, + "learning_rate": 2.24270432178006e-05, + "loss": 0.2641, + "step": 25497 + }, + { + "epoch": 32.73170731707317, + "grad_norm": 0.8342086672782898, + "learning_rate": 2.2426615318784767e-05, + "loss": 0.2631, + "step": 25498 + }, + { + "epoch": 32.73299101412067, + "grad_norm": 4.299309730529785, + "learning_rate": 2.2426187419768936e-05, + "loss": 0.2766, + "step": 25499 + }, + { + "epoch": 32.73427471116816, + "grad_norm": 1.5799551010131836, + "learning_rate": 2.2425759520753104e-05, + "loss": 0.2466, + "step": 25500 + }, + { + "epoch": 32.73555840821566, + "grad_norm": 1.090922474861145, + "learning_rate": 2.242533162173727e-05, + "loss": 0.2784, + "step": 25501 + }, + { + "epoch": 32.73684210526316, + "grad_norm": 2.0434937477111816, + "learning_rate": 2.242490372272144e-05, + "loss": 0.2607, + "step": 25502 + }, + { + "epoch": 32.738125802310655, + "grad_norm": 1.0298742055892944, + "learning_rate": 
2.2424475823705606e-05, + "loss": 0.2553, + "step": 25503 + }, + { + "epoch": 32.73940949935815, + "grad_norm": 2.2025909423828125, + "learning_rate": 2.242404792468977e-05, + "loss": 0.2483, + "step": 25504 + }, + { + "epoch": 32.74069319640565, + "grad_norm": 1.0206241607666016, + "learning_rate": 2.2423620025673943e-05, + "loss": 0.2516, + "step": 25505 + }, + { + "epoch": 32.74197689345314, + "grad_norm": 1.0769881010055542, + "learning_rate": 2.2423192126658108e-05, + "loss": 0.2513, + "step": 25506 + }, + { + "epoch": 32.74326059050064, + "grad_norm": 1.003766417503357, + "learning_rate": 2.242276422764228e-05, + "loss": 0.239, + "step": 25507 + }, + { + "epoch": 32.74454428754814, + "grad_norm": 1.2995465993881226, + "learning_rate": 2.2422336328626444e-05, + "loss": 0.2528, + "step": 25508 + }, + { + "epoch": 32.745827984595635, + "grad_norm": 0.9125510454177856, + "learning_rate": 2.2421908429610613e-05, + "loss": 0.2349, + "step": 25509 + }, + { + "epoch": 32.74711168164313, + "grad_norm": 1.0716032981872559, + "learning_rate": 2.242148053059478e-05, + "loss": 0.2536, + "step": 25510 + }, + { + "epoch": 32.74839537869063, + "grad_norm": 1.406395673751831, + "learning_rate": 2.2421052631578946e-05, + "loss": 0.2647, + "step": 25511 + }, + { + "epoch": 32.74967907573813, + "grad_norm": 0.9709333181381226, + "learning_rate": 2.2420624732563115e-05, + "loss": 0.2468, + "step": 25512 + }, + { + "epoch": 32.75096277278562, + "grad_norm": 2.25533390045166, + "learning_rate": 2.2420196833547283e-05, + "loss": 0.2775, + "step": 25513 + }, + { + "epoch": 32.75224646983312, + "grad_norm": 1.0320446491241455, + "learning_rate": 2.241976893453145e-05, + "loss": 0.2127, + "step": 25514 + }, + { + "epoch": 32.753530166880616, + "grad_norm": 1.7029578685760498, + "learning_rate": 2.241934103551562e-05, + "loss": 0.2573, + "step": 25515 + }, + { + "epoch": 32.75481386392811, + "grad_norm": 1.1280914545059204, + "learning_rate": 2.241891313649979e-05, + "loss": 0.2772, + 
"step": 25516 + }, + { + "epoch": 32.75609756097561, + "grad_norm": 1.6184433698654175, + "learning_rate": 2.2418485237483953e-05, + "loss": 0.2788, + "step": 25517 + }, + { + "epoch": 32.75738125802311, + "grad_norm": 1.2269914150238037, + "learning_rate": 2.2418057338468122e-05, + "loss": 0.2581, + "step": 25518 + }, + { + "epoch": 32.7586649550706, + "grad_norm": 2.3930788040161133, + "learning_rate": 2.241762943945229e-05, + "loss": 0.258, + "step": 25519 + }, + { + "epoch": 32.7599486521181, + "grad_norm": 1.2418919801712036, + "learning_rate": 2.2417201540436455e-05, + "loss": 0.3111, + "step": 25520 + }, + { + "epoch": 32.761232349165596, + "grad_norm": 9.769160270690918, + "learning_rate": 2.2416773641420627e-05, + "loss": 0.2758, + "step": 25521 + }, + { + "epoch": 32.76251604621309, + "grad_norm": 1.5597209930419922, + "learning_rate": 2.2416345742404792e-05, + "loss": 0.2321, + "step": 25522 + }, + { + "epoch": 32.76379974326059, + "grad_norm": 1.7733978033065796, + "learning_rate": 2.2415917843388964e-05, + "loss": 0.2904, + "step": 25523 + }, + { + "epoch": 32.76508344030809, + "grad_norm": 1.1329432725906372, + "learning_rate": 2.241548994437313e-05, + "loss": 0.2623, + "step": 25524 + }, + { + "epoch": 32.76636713735559, + "grad_norm": 1.723079800605774, + "learning_rate": 2.2415062045357294e-05, + "loss": 0.2887, + "step": 25525 + }, + { + "epoch": 32.76765083440308, + "grad_norm": 1.6643353700637817, + "learning_rate": 2.2414634146341466e-05, + "loss": 0.3118, + "step": 25526 + }, + { + "epoch": 32.768934531450576, + "grad_norm": 2.1061370372772217, + "learning_rate": 2.241420624732563e-05, + "loss": 0.324, + "step": 25527 + }, + { + "epoch": 32.770218228498074, + "grad_norm": 1.6273682117462158, + "learning_rate": 2.24137783483098e-05, + "loss": 0.3569, + "step": 25528 + }, + { + "epoch": 32.77150192554557, + "grad_norm": 0.9080908298492432, + "learning_rate": 2.2413350449293968e-05, + "loss": 0.251, + "step": 25529 + }, + { + "epoch": 
32.77278562259307, + "grad_norm": 4.341736316680908, + "learning_rate": 2.2412922550278136e-05, + "loss": 0.2625, + "step": 25530 + }, + { + "epoch": 32.77406931964057, + "grad_norm": 1.0505318641662598, + "learning_rate": 2.2412494651262304e-05, + "loss": 0.2617, + "step": 25531 + }, + { + "epoch": 32.775353016688065, + "grad_norm": 0.7469364404678345, + "learning_rate": 2.241206675224647e-05, + "loss": 0.2452, + "step": 25532 + }, + { + "epoch": 32.776636713735556, + "grad_norm": 0.7116473317146301, + "learning_rate": 2.2411638853230638e-05, + "loss": 0.2758, + "step": 25533 + }, + { + "epoch": 32.777920410783054, + "grad_norm": 1.2033376693725586, + "learning_rate": 2.2411210954214806e-05, + "loss": 0.2404, + "step": 25534 + }, + { + "epoch": 32.77920410783055, + "grad_norm": 1.2434982061386108, + "learning_rate": 2.2410783055198975e-05, + "loss": 0.3156, + "step": 25535 + }, + { + "epoch": 32.78048780487805, + "grad_norm": 0.9941635131835938, + "learning_rate": 2.241035515618314e-05, + "loss": 0.2539, + "step": 25536 + }, + { + "epoch": 32.78177150192555, + "grad_norm": 1.3004204034805298, + "learning_rate": 2.2409927257167308e-05, + "loss": 0.2718, + "step": 25537 + }, + { + "epoch": 32.783055198973045, + "grad_norm": 0.724666953086853, + "learning_rate": 2.2409499358151477e-05, + "loss": 0.2689, + "step": 25538 + }, + { + "epoch": 32.784338896020536, + "grad_norm": 1.0116708278656006, + "learning_rate": 2.2409071459135645e-05, + "loss": 0.2666, + "step": 25539 + }, + { + "epoch": 32.785622593068034, + "grad_norm": 4.793398380279541, + "learning_rate": 2.2408643560119813e-05, + "loss": 0.2539, + "step": 25540 + }, + { + "epoch": 32.78690629011553, + "grad_norm": 0.9926183819770813, + "learning_rate": 2.240821566110398e-05, + "loss": 0.2661, + "step": 25541 + }, + { + "epoch": 32.78818998716303, + "grad_norm": 1.3969634771347046, + "learning_rate": 2.240778776208815e-05, + "loss": 0.2665, + "step": 25542 + }, + { + "epoch": 32.78947368421053, + "grad_norm": 
0.8765381574630737, + "learning_rate": 2.2407359863072315e-05, + "loss": 0.263, + "step": 25543 + }, + { + "epoch": 32.790757381258025, + "grad_norm": 3.2493221759796143, + "learning_rate": 2.240693196405648e-05, + "loss": 0.257, + "step": 25544 + }, + { + "epoch": 32.79204107830552, + "grad_norm": 0.9761733412742615, + "learning_rate": 2.2406504065040652e-05, + "loss": 0.2626, + "step": 25545 + }, + { + "epoch": 32.793324775353014, + "grad_norm": 1.3181828260421753, + "learning_rate": 2.2406076166024817e-05, + "loss": 0.2473, + "step": 25546 + }, + { + "epoch": 32.79460847240051, + "grad_norm": 1.7185505628585815, + "learning_rate": 2.240564826700899e-05, + "loss": 0.2593, + "step": 25547 + }, + { + "epoch": 32.79589216944801, + "grad_norm": 1.4285547733306885, + "learning_rate": 2.2405220367993154e-05, + "loss": 0.2653, + "step": 25548 + }, + { + "epoch": 32.79717586649551, + "grad_norm": 1.0592565536499023, + "learning_rate": 2.2404792468977322e-05, + "loss": 0.241, + "step": 25549 + }, + { + "epoch": 32.798459563543005, + "grad_norm": 1.5966049432754517, + "learning_rate": 2.240436456996149e-05, + "loss": 0.257, + "step": 25550 + }, + { + "epoch": 32.7997432605905, + "grad_norm": 1.1280243396759033, + "learning_rate": 2.2403936670945656e-05, + "loss": 0.2441, + "step": 25551 + }, + { + "epoch": 32.801026957637994, + "grad_norm": 1.1528921127319336, + "learning_rate": 2.2403508771929824e-05, + "loss": 0.2807, + "step": 25552 + }, + { + "epoch": 32.80231065468549, + "grad_norm": 1.821148157119751, + "learning_rate": 2.2403080872913993e-05, + "loss": 0.2726, + "step": 25553 + }, + { + "epoch": 32.80359435173299, + "grad_norm": 4.178559303283691, + "learning_rate": 2.240265297389816e-05, + "loss": 0.2383, + "step": 25554 + }, + { + "epoch": 32.80487804878049, + "grad_norm": 0.9232017993927002, + "learning_rate": 2.240222507488233e-05, + "loss": 0.2639, + "step": 25555 + }, + { + "epoch": 32.806161745827985, + "grad_norm": 0.9396336674690247, + "learning_rate": 
2.2401797175866498e-05, + "loss": 0.2365, + "step": 25556 + }, + { + "epoch": 32.80744544287548, + "grad_norm": 0.9137239456176758, + "learning_rate": 2.2401369276850663e-05, + "loss": 0.248, + "step": 25557 + }, + { + "epoch": 32.80872913992298, + "grad_norm": 1.419310212135315, + "learning_rate": 2.240094137783483e-05, + "loss": 0.2311, + "step": 25558 + }, + { + "epoch": 32.81001283697047, + "grad_norm": 1.0073665380477905, + "learning_rate": 2.2400513478819e-05, + "loss": 0.2261, + "step": 25559 + }, + { + "epoch": 32.81129653401797, + "grad_norm": 1.5754222869873047, + "learning_rate": 2.2400085579803165e-05, + "loss": 0.2818, + "step": 25560 + }, + { + "epoch": 32.81258023106547, + "grad_norm": 1.243520736694336, + "learning_rate": 2.2399657680787336e-05, + "loss": 0.2387, + "step": 25561 + }, + { + "epoch": 32.813863928112966, + "grad_norm": 2.2468202114105225, + "learning_rate": 2.23992297817715e-05, + "loss": 0.2661, + "step": 25562 + }, + { + "epoch": 32.81514762516046, + "grad_norm": 1.151718258857727, + "learning_rate": 2.2398801882755673e-05, + "loss": 0.2803, + "step": 25563 + }, + { + "epoch": 32.81643132220796, + "grad_norm": 1.3835004568099976, + "learning_rate": 2.2398373983739838e-05, + "loss": 0.2441, + "step": 25564 + }, + { + "epoch": 32.81771501925546, + "grad_norm": 2.1559553146362305, + "learning_rate": 2.2397946084724003e-05, + "loss": 0.2911, + "step": 25565 + }, + { + "epoch": 32.81899871630295, + "grad_norm": 1.8120653629302979, + "learning_rate": 2.2397518185708175e-05, + "loss": 0.2734, + "step": 25566 + }, + { + "epoch": 32.82028241335045, + "grad_norm": 1.5367156267166138, + "learning_rate": 2.239709028669234e-05, + "loss": 0.3011, + "step": 25567 + }, + { + "epoch": 32.821566110397946, + "grad_norm": 1.759671688079834, + "learning_rate": 2.239666238767651e-05, + "loss": 0.2482, + "step": 25568 + }, + { + "epoch": 32.822849807445444, + "grad_norm": 1.360383152961731, + "learning_rate": 2.2396234488660677e-05, + "loss": 0.3084, + 
"step": 25569 + }, + { + "epoch": 32.82413350449294, + "grad_norm": 1.2140862941741943, + "learning_rate": 2.2395806589644845e-05, + "loss": 0.2817, + "step": 25570 + }, + { + "epoch": 32.82541720154044, + "grad_norm": 3.8203325271606445, + "learning_rate": 2.2395378690629014e-05, + "loss": 0.2434, + "step": 25571 + }, + { + "epoch": 32.82670089858793, + "grad_norm": 1.1888666152954102, + "learning_rate": 2.239495079161318e-05, + "loss": 0.2807, + "step": 25572 + }, + { + "epoch": 32.82798459563543, + "grad_norm": 1.2544095516204834, + "learning_rate": 2.2394522892597347e-05, + "loss": 0.2646, + "step": 25573 + }, + { + "epoch": 32.829268292682926, + "grad_norm": 1.7663235664367676, + "learning_rate": 2.2394094993581516e-05, + "loss": 0.3004, + "step": 25574 + }, + { + "epoch": 32.830551989730424, + "grad_norm": 4.087466716766357, + "learning_rate": 2.2393667094565684e-05, + "loss": 0.2894, + "step": 25575 + }, + { + "epoch": 32.83183568677792, + "grad_norm": 1.522929310798645, + "learning_rate": 2.239323919554985e-05, + "loss": 0.3672, + "step": 25576 + }, + { + "epoch": 32.83311938382542, + "grad_norm": 1.7493191957473755, + "learning_rate": 2.239281129653402e-05, + "loss": 0.3651, + "step": 25577 + }, + { + "epoch": 32.83440308087292, + "grad_norm": 2.8056893348693848, + "learning_rate": 2.2392383397518186e-05, + "loss": 0.4217, + "step": 25578 + }, + { + "epoch": 32.83568677792041, + "grad_norm": 1.3194265365600586, + "learning_rate": 2.2391955498502354e-05, + "loss": 0.2831, + "step": 25579 + }, + { + "epoch": 32.836970474967906, + "grad_norm": 0.8919063806533813, + "learning_rate": 2.2391527599486523e-05, + "loss": 0.2617, + "step": 25580 + }, + { + "epoch": 32.838254172015404, + "grad_norm": 0.8804743885993958, + "learning_rate": 2.2391099700470688e-05, + "loss": 0.2838, + "step": 25581 + }, + { + "epoch": 32.8395378690629, + "grad_norm": 0.9435869455337524, + "learning_rate": 2.239067180145486e-05, + "loss": 0.2727, + "step": 25582 + }, + { + "epoch": 
32.8408215661104, + "grad_norm": 0.8287382125854492, + "learning_rate": 2.2390243902439025e-05, + "loss": 0.2592, + "step": 25583 + }, + { + "epoch": 32.8421052631579, + "grad_norm": 2.5537590980529785, + "learning_rate": 2.2389816003423193e-05, + "loss": 0.2468, + "step": 25584 + }, + { + "epoch": 32.84338896020539, + "grad_norm": 1.7714617252349854, + "learning_rate": 2.238938810440736e-05, + "loss": 0.2554, + "step": 25585 + }, + { + "epoch": 32.844672657252886, + "grad_norm": 0.8506921529769897, + "learning_rate": 2.2388960205391526e-05, + "loss": 0.2312, + "step": 25586 + }, + { + "epoch": 32.845956354300384, + "grad_norm": 1.3341622352600098, + "learning_rate": 2.2388532306375698e-05, + "loss": 0.2649, + "step": 25587 + }, + { + "epoch": 32.84724005134788, + "grad_norm": 0.9027928113937378, + "learning_rate": 2.2388104407359863e-05, + "loss": 0.3073, + "step": 25588 + }, + { + "epoch": 32.84852374839538, + "grad_norm": 0.6787537932395935, + "learning_rate": 2.238767650834403e-05, + "loss": 0.2597, + "step": 25589 + }, + { + "epoch": 32.84980744544288, + "grad_norm": 0.9063683748245239, + "learning_rate": 2.23872486093282e-05, + "loss": 0.2714, + "step": 25590 + }, + { + "epoch": 32.851091142490375, + "grad_norm": 0.9661935567855835, + "learning_rate": 2.238682071031237e-05, + "loss": 0.2557, + "step": 25591 + }, + { + "epoch": 32.852374839537866, + "grad_norm": 0.8232955932617188, + "learning_rate": 2.2386392811296533e-05, + "loss": 0.2222, + "step": 25592 + }, + { + "epoch": 32.853658536585364, + "grad_norm": 1.4905421733856201, + "learning_rate": 2.2385964912280702e-05, + "loss": 0.2625, + "step": 25593 + }, + { + "epoch": 32.85494223363286, + "grad_norm": 3.632249116897583, + "learning_rate": 2.238553701326487e-05, + "loss": 0.254, + "step": 25594 + }, + { + "epoch": 32.85622593068036, + "grad_norm": 0.9233471751213074, + "learning_rate": 2.238510911424904e-05, + "loss": 0.2827, + "step": 25595 + }, + { + "epoch": 32.85750962772786, + "grad_norm": 
1.9581589698791504, + "learning_rate": 2.2384681215233207e-05, + "loss": 0.2308, + "step": 25596 + }, + { + "epoch": 32.858793324775355, + "grad_norm": 0.8499534726142883, + "learning_rate": 2.2384253316217372e-05, + "loss": 0.2777, + "step": 25597 + }, + { + "epoch": 32.86007702182285, + "grad_norm": 0.8953955173492432, + "learning_rate": 2.238382541720154e-05, + "loss": 0.2477, + "step": 25598 + }, + { + "epoch": 32.861360718870344, + "grad_norm": 0.7605447769165039, + "learning_rate": 2.238339751818571e-05, + "loss": 0.2356, + "step": 25599 + }, + { + "epoch": 32.86264441591784, + "grad_norm": 1.5794943571090698, + "learning_rate": 2.2382969619169874e-05, + "loss": 0.2674, + "step": 25600 + }, + { + "epoch": 32.86392811296534, + "grad_norm": 0.9270040392875671, + "learning_rate": 2.2382541720154046e-05, + "loss": 0.257, + "step": 25601 + }, + { + "epoch": 32.86521181001284, + "grad_norm": 1.8917142152786255, + "learning_rate": 2.238211382113821e-05, + "loss": 0.2572, + "step": 25602 + }, + { + "epoch": 32.866495507060336, + "grad_norm": 0.8210848569869995, + "learning_rate": 2.2381685922122383e-05, + "loss": 0.2496, + "step": 25603 + }, + { + "epoch": 32.86777920410783, + "grad_norm": 0.9820746779441833, + "learning_rate": 2.2381258023106548e-05, + "loss": 0.2595, + "step": 25604 + }, + { + "epoch": 32.869062901155324, + "grad_norm": 1.8542366027832031, + "learning_rate": 2.2380830124090713e-05, + "loss": 0.2673, + "step": 25605 + }, + { + "epoch": 32.87034659820282, + "grad_norm": 7.609198093414307, + "learning_rate": 2.2380402225074884e-05, + "loss": 0.2597, + "step": 25606 + }, + { + "epoch": 32.87163029525032, + "grad_norm": 1.3928003311157227, + "learning_rate": 2.237997432605905e-05, + "loss": 0.2315, + "step": 25607 + }, + { + "epoch": 32.87291399229782, + "grad_norm": 2.850282907485962, + "learning_rate": 2.2379546427043218e-05, + "loss": 0.3022, + "step": 25608 + }, + { + "epoch": 32.874197689345316, + "grad_norm": 1.0765771865844727, + "learning_rate": 
2.2379118528027386e-05, + "loss": 0.2773, + "step": 25609 + }, + { + "epoch": 32.87548138639281, + "grad_norm": 1.5177940130233765, + "learning_rate": 2.2378690629011555e-05, + "loss": 0.2371, + "step": 25610 + }, + { + "epoch": 32.87676508344031, + "grad_norm": 1.1714614629745483, + "learning_rate": 2.2378262729995723e-05, + "loss": 0.2806, + "step": 25611 + }, + { + "epoch": 32.8780487804878, + "grad_norm": 1.0901018381118774, + "learning_rate": 2.2377834830979888e-05, + "loss": 0.2495, + "step": 25612 + }, + { + "epoch": 32.8793324775353, + "grad_norm": 0.9908816814422607, + "learning_rate": 2.2377406931964057e-05, + "loss": 0.2769, + "step": 25613 + }, + { + "epoch": 32.8806161745828, + "grad_norm": 1.3909306526184082, + "learning_rate": 2.2376979032948225e-05, + "loss": 0.2812, + "step": 25614 + }, + { + "epoch": 32.881899871630296, + "grad_norm": 1.1820600032806396, + "learning_rate": 2.2376551133932393e-05, + "loss": 0.2901, + "step": 25615 + }, + { + "epoch": 32.883183568677794, + "grad_norm": 1.1522623300552368, + "learning_rate": 2.237612323491656e-05, + "loss": 0.2792, + "step": 25616 + }, + { + "epoch": 32.88446726572529, + "grad_norm": 1.584817886352539, + "learning_rate": 2.237569533590073e-05, + "loss": 0.2595, + "step": 25617 + }, + { + "epoch": 32.88575096277278, + "grad_norm": 2.3804681301116943, + "learning_rate": 2.2375267436884895e-05, + "loss": 0.265, + "step": 25618 + }, + { + "epoch": 32.88703465982028, + "grad_norm": 1.8522106409072876, + "learning_rate": 2.2374839537869064e-05, + "loss": 0.2774, + "step": 25619 + }, + { + "epoch": 32.88831835686778, + "grad_norm": 1.8170437812805176, + "learning_rate": 2.2374411638853232e-05, + "loss": 0.2846, + "step": 25620 + }, + { + "epoch": 32.889602053915276, + "grad_norm": 1.51823890209198, + "learning_rate": 2.2373983739837397e-05, + "loss": 0.3079, + "step": 25621 + }, + { + "epoch": 32.890885750962774, + "grad_norm": 1.6903605461120605, + "learning_rate": 2.237355584082157e-05, + "loss": 0.2527, 
+ "step": 25622 + }, + { + "epoch": 32.89216944801027, + "grad_norm": 1.2172590494155884, + "learning_rate": 2.2373127941805734e-05, + "loss": 0.2667, + "step": 25623 + }, + { + "epoch": 32.89345314505777, + "grad_norm": 2.859720230102539, + "learning_rate": 2.2372700042789902e-05, + "loss": 0.314, + "step": 25624 + }, + { + "epoch": 32.89473684210526, + "grad_norm": 1.6500605344772339, + "learning_rate": 2.237227214377407e-05, + "loss": 0.2982, + "step": 25625 + }, + { + "epoch": 32.89602053915276, + "grad_norm": 2.577510118484497, + "learning_rate": 2.2371844244758236e-05, + "loss": 0.3053, + "step": 25626 + }, + { + "epoch": 32.897304236200256, + "grad_norm": 2.22522234916687, + "learning_rate": 2.2371416345742408e-05, + "loss": 0.3617, + "step": 25627 + }, + { + "epoch": 32.898587933247754, + "grad_norm": 2.5502772331237793, + "learning_rate": 2.2370988446726573e-05, + "loss": 0.4828, + "step": 25628 + }, + { + "epoch": 32.89987163029525, + "grad_norm": 2.2058191299438477, + "learning_rate": 2.237056054771074e-05, + "loss": 0.2735, + "step": 25629 + }, + { + "epoch": 32.90115532734275, + "grad_norm": 0.7427037954330444, + "learning_rate": 2.237013264869491e-05, + "loss": 0.2756, + "step": 25630 + }, + { + "epoch": 32.90243902439025, + "grad_norm": 1.0741442441940308, + "learning_rate": 2.2369704749679078e-05, + "loss": 0.2802, + "step": 25631 + }, + { + "epoch": 32.90372272143774, + "grad_norm": 7.707030296325684, + "learning_rate": 2.2369276850663243e-05, + "loss": 0.2997, + "step": 25632 + }, + { + "epoch": 32.905006418485236, + "grad_norm": 0.9420067667961121, + "learning_rate": 2.236884895164741e-05, + "loss": 0.2698, + "step": 25633 + }, + { + "epoch": 32.906290115532734, + "grad_norm": 2.3732917308807373, + "learning_rate": 2.236842105263158e-05, + "loss": 0.289, + "step": 25634 + }, + { + "epoch": 32.90757381258023, + "grad_norm": 0.8090633153915405, + "learning_rate": 2.2367993153615745e-05, + "loss": 0.2606, + "step": 25635 + }, + { + "epoch": 
32.90885750962773, + "grad_norm": 1.1473668813705444, + "learning_rate": 2.2367565254599916e-05, + "loss": 0.2593, + "step": 25636 + }, + { + "epoch": 32.91014120667523, + "grad_norm": 1.3827368021011353, + "learning_rate": 2.236713735558408e-05, + "loss": 0.2882, + "step": 25637 + }, + { + "epoch": 32.91142490372272, + "grad_norm": 1.2029117345809937, + "learning_rate": 2.2366709456568253e-05, + "loss": 0.2732, + "step": 25638 + }, + { + "epoch": 32.912708600770216, + "grad_norm": 1.4673463106155396, + "learning_rate": 2.236628155755242e-05, + "loss": 0.2591, + "step": 25639 + }, + { + "epoch": 32.913992297817714, + "grad_norm": 0.8780547976493835, + "learning_rate": 2.2365853658536583e-05, + "loss": 0.2647, + "step": 25640 + }, + { + "epoch": 32.91527599486521, + "grad_norm": 1.8579773902893066, + "learning_rate": 2.2365425759520755e-05, + "loss": 0.2621, + "step": 25641 + }, + { + "epoch": 32.91655969191271, + "grad_norm": 3.910548686981201, + "learning_rate": 2.236499786050492e-05, + "loss": 0.2453, + "step": 25642 + }, + { + "epoch": 32.91784338896021, + "grad_norm": 2.3884921073913574, + "learning_rate": 2.236456996148909e-05, + "loss": 0.2647, + "step": 25643 + }, + { + "epoch": 32.919127086007705, + "grad_norm": 1.4109113216400146, + "learning_rate": 2.2364142062473257e-05, + "loss": 0.2541, + "step": 25644 + }, + { + "epoch": 32.920410783055196, + "grad_norm": 0.7677615880966187, + "learning_rate": 2.2363714163457425e-05, + "loss": 0.2524, + "step": 25645 + }, + { + "epoch": 32.921694480102694, + "grad_norm": 0.8912560343742371, + "learning_rate": 2.2363286264441594e-05, + "loss": 0.2789, + "step": 25646 + }, + { + "epoch": 32.92297817715019, + "grad_norm": 1.631169319152832, + "learning_rate": 2.236285836542576e-05, + "loss": 0.3007, + "step": 25647 + }, + { + "epoch": 32.92426187419769, + "grad_norm": 0.8256139755249023, + "learning_rate": 2.2362430466409927e-05, + "loss": 0.2572, + "step": 25648 + }, + { + "epoch": 32.92554557124519, + "grad_norm": 
1.016453742980957, + "learning_rate": 2.2362002567394096e-05, + "loss": 0.2605, + "step": 25649 + }, + { + "epoch": 32.926829268292686, + "grad_norm": 0.833114504814148, + "learning_rate": 2.2361574668378264e-05, + "loss": 0.2493, + "step": 25650 + }, + { + "epoch": 32.928112965340176, + "grad_norm": 2.547360897064209, + "learning_rate": 2.236114676936243e-05, + "loss": 0.2672, + "step": 25651 + }, + { + "epoch": 32.929396662387674, + "grad_norm": 0.8565446138381958, + "learning_rate": 2.23607188703466e-05, + "loss": 0.2172, + "step": 25652 + }, + { + "epoch": 32.93068035943517, + "grad_norm": 1.2042709589004517, + "learning_rate": 2.2360290971330766e-05, + "loss": 0.2728, + "step": 25653 + }, + { + "epoch": 32.93196405648267, + "grad_norm": 1.4223487377166748, + "learning_rate": 2.2359863072314934e-05, + "loss": 0.2476, + "step": 25654 + }, + { + "epoch": 32.93324775353017, + "grad_norm": 1.1547456979751587, + "learning_rate": 2.2359435173299103e-05, + "loss": 0.2677, + "step": 25655 + }, + { + "epoch": 32.934531450577666, + "grad_norm": 1.4136313199996948, + "learning_rate": 2.2359007274283268e-05, + "loss": 0.2974, + "step": 25656 + }, + { + "epoch": 32.93581514762516, + "grad_norm": 1.1364121437072754, + "learning_rate": 2.235857937526744e-05, + "loss": 0.25, + "step": 25657 + }, + { + "epoch": 32.937098844672654, + "grad_norm": 1.300106167793274, + "learning_rate": 2.2358151476251605e-05, + "loss": 0.2485, + "step": 25658 + }, + { + "epoch": 32.93838254172015, + "grad_norm": 0.9687952995300293, + "learning_rate": 2.235772357723577e-05, + "loss": 0.254, + "step": 25659 + }, + { + "epoch": 32.93966623876765, + "grad_norm": 1.899868130683899, + "learning_rate": 2.235729567821994e-05, + "loss": 0.255, + "step": 25660 + }, + { + "epoch": 32.94094993581515, + "grad_norm": 1.129441738128662, + "learning_rate": 2.2356867779204106e-05, + "loss": 0.2483, + "step": 25661 + }, + { + "epoch": 32.942233632862646, + "grad_norm": 0.9609972834587097, + "learning_rate": 
2.2356439880188278e-05, + "loss": 0.2776, + "step": 25662 + }, + { + "epoch": 32.943517329910144, + "grad_norm": 1.2115644216537476, + "learning_rate": 2.2356011981172443e-05, + "loss": 0.2701, + "step": 25663 + }, + { + "epoch": 32.94480102695764, + "grad_norm": 0.904895007610321, + "learning_rate": 2.235558408215661e-05, + "loss": 0.2376, + "step": 25664 + }, + { + "epoch": 32.94608472400513, + "grad_norm": 1.5698333978652954, + "learning_rate": 2.235515618314078e-05, + "loss": 0.259, + "step": 25665 + }, + { + "epoch": 32.94736842105263, + "grad_norm": 2.2279810905456543, + "learning_rate": 2.2354728284124945e-05, + "loss": 0.2423, + "step": 25666 + }, + { + "epoch": 32.94865211810013, + "grad_norm": 1.7971477508544922, + "learning_rate": 2.2354300385109114e-05, + "loss": 0.2353, + "step": 25667 + }, + { + "epoch": 32.949935815147626, + "grad_norm": 1.0758233070373535, + "learning_rate": 2.2353872486093282e-05, + "loss": 0.2632, + "step": 25668 + }, + { + "epoch": 32.951219512195124, + "grad_norm": 2.10520076751709, + "learning_rate": 2.235344458707745e-05, + "loss": 0.2864, + "step": 25669 + }, + { + "epoch": 32.95250320924262, + "grad_norm": 1.3609929084777832, + "learning_rate": 2.235301668806162e-05, + "loss": 0.2666, + "step": 25670 + }, + { + "epoch": 32.95378690629011, + "grad_norm": 1.4574062824249268, + "learning_rate": 2.2352588789045787e-05, + "loss": 0.3148, + "step": 25671 + }, + { + "epoch": 32.95507060333761, + "grad_norm": 1.806139349937439, + "learning_rate": 2.2352160890029952e-05, + "loss": 0.263, + "step": 25672 + }, + { + "epoch": 32.95635430038511, + "grad_norm": 1.8942872285842896, + "learning_rate": 2.235173299101412e-05, + "loss": 0.2864, + "step": 25673 + }, + { + "epoch": 32.957637997432606, + "grad_norm": 1.414963722229004, + "learning_rate": 2.235130509199829e-05, + "loss": 0.279, + "step": 25674 + }, + { + "epoch": 32.958921694480104, + "grad_norm": 1.140537977218628, + "learning_rate": 2.2350877192982454e-05, + "loss": 0.2883, + 
"step": 25675 + }, + { + "epoch": 32.9602053915276, + "grad_norm": 1.4821516275405884, + "learning_rate": 2.2350449293966626e-05, + "loss": 0.3119, + "step": 25676 + }, + { + "epoch": 32.9614890885751, + "grad_norm": 1.2016675472259521, + "learning_rate": 2.235002139495079e-05, + "loss": 0.3333, + "step": 25677 + }, + { + "epoch": 32.96277278562259, + "grad_norm": 3.9825384616851807, + "learning_rate": 2.2349593495934963e-05, + "loss": 0.4735, + "step": 25678 + }, + { + "epoch": 32.96405648267009, + "grad_norm": 1.034796118736267, + "learning_rate": 2.2349165596919128e-05, + "loss": 0.268, + "step": 25679 + }, + { + "epoch": 32.965340179717586, + "grad_norm": 1.638171911239624, + "learning_rate": 2.2348737697903293e-05, + "loss": 0.289, + "step": 25680 + }, + { + "epoch": 32.966623876765084, + "grad_norm": 1.0200146436691284, + "learning_rate": 2.2348309798887465e-05, + "loss": 0.2872, + "step": 25681 + }, + { + "epoch": 32.96790757381258, + "grad_norm": 0.8308331370353699, + "learning_rate": 2.234788189987163e-05, + "loss": 0.2678, + "step": 25682 + }, + { + "epoch": 32.96919127086008, + "grad_norm": 1.109042763710022, + "learning_rate": 2.2347454000855798e-05, + "loss": 0.2736, + "step": 25683 + }, + { + "epoch": 32.97047496790757, + "grad_norm": 2.5818278789520264, + "learning_rate": 2.2347026101839966e-05, + "loss": 0.2789, + "step": 25684 + }, + { + "epoch": 32.97175866495507, + "grad_norm": 2.0968356132507324, + "learning_rate": 2.2346598202824135e-05, + "loss": 0.2518, + "step": 25685 + }, + { + "epoch": 32.973042362002566, + "grad_norm": 0.895147979259491, + "learning_rate": 2.2346170303808303e-05, + "loss": 0.2457, + "step": 25686 + }, + { + "epoch": 32.974326059050064, + "grad_norm": 0.9355849027633667, + "learning_rate": 2.2345742404792468e-05, + "loss": 0.2823, + "step": 25687 + }, + { + "epoch": 32.97560975609756, + "grad_norm": 1.1553245782852173, + "learning_rate": 2.2345314505776637e-05, + "loss": 0.2516, + "step": 25688 + }, + { + "epoch": 
32.97689345314506, + "grad_norm": 1.2742347717285156, + "learning_rate": 2.2344886606760805e-05, + "loss": 0.2634, + "step": 25689 + }, + { + "epoch": 32.97817715019256, + "grad_norm": 2.030836343765259, + "learning_rate": 2.2344458707744973e-05, + "loss": 0.2729, + "step": 25690 + }, + { + "epoch": 32.97946084724005, + "grad_norm": 1.0270600318908691, + "learning_rate": 2.234403080872914e-05, + "loss": 0.258, + "step": 25691 + }, + { + "epoch": 32.980744544287546, + "grad_norm": 1.2038230895996094, + "learning_rate": 2.234360290971331e-05, + "loss": 0.2271, + "step": 25692 + }, + { + "epoch": 32.982028241335044, + "grad_norm": 0.8606629967689514, + "learning_rate": 2.2343175010697475e-05, + "loss": 0.2216, + "step": 25693 + }, + { + "epoch": 32.98331193838254, + "grad_norm": 1.4819597005844116, + "learning_rate": 2.2342747111681644e-05, + "loss": 0.2621, + "step": 25694 + }, + { + "epoch": 32.98459563543004, + "grad_norm": 1.1172529458999634, + "learning_rate": 2.2342319212665812e-05, + "loss": 0.2689, + "step": 25695 + }, + { + "epoch": 32.98587933247754, + "grad_norm": 1.0595709085464478, + "learning_rate": 2.2341891313649977e-05, + "loss": 0.263, + "step": 25696 + }, + { + "epoch": 32.987163029525036, + "grad_norm": 0.8962124586105347, + "learning_rate": 2.234146341463415e-05, + "loss": 0.2597, + "step": 25697 + }, + { + "epoch": 32.988446726572526, + "grad_norm": 1.5881876945495605, + "learning_rate": 2.2341035515618314e-05, + "loss": 0.2437, + "step": 25698 + }, + { + "epoch": 32.989730423620024, + "grad_norm": 1.3839019536972046, + "learning_rate": 2.2340607616602482e-05, + "loss": 0.2798, + "step": 25699 + }, + { + "epoch": 32.99101412066752, + "grad_norm": 1.207965612411499, + "learning_rate": 2.234017971758665e-05, + "loss": 0.3236, + "step": 25700 + }, + { + "epoch": 32.99229781771502, + "grad_norm": 1.9889343976974487, + "learning_rate": 2.2339751818570816e-05, + "loss": 0.2759, + "step": 25701 + }, + { + "epoch": 32.99358151476252, + "grad_norm": 
1.6331719160079956, + "learning_rate": 2.2339323919554988e-05, + "loss": 0.2712, + "step": 25702 + }, + { + "epoch": 32.994865211810016, + "grad_norm": 1.7851347923278809, + "learning_rate": 2.2338896020539153e-05, + "loss": 0.2881, + "step": 25703 + }, + { + "epoch": 32.996148908857506, + "grad_norm": 2.455361843109131, + "learning_rate": 2.233846812152332e-05, + "loss": 0.3158, + "step": 25704 + }, + { + "epoch": 32.997432605905004, + "grad_norm": 1.2874901294708252, + "learning_rate": 2.233804022250749e-05, + "loss": 0.2866, + "step": 25705 + }, + { + "epoch": 32.9987163029525, + "grad_norm": 1.886734962463379, + "learning_rate": 2.2337612323491658e-05, + "loss": 0.3406, + "step": 25706 + }, + { + "epoch": 33.0, + "grad_norm": 4.627223968505859, + "learning_rate": 2.2337184424475823e-05, + "loss": 0.4462, + "step": 25707 + }, + { + "epoch": 33.0012836970475, + "grad_norm": 0.8285253047943115, + "learning_rate": 2.233675652545999e-05, + "loss": 0.2508, + "step": 25708 + }, + { + "epoch": 33.002567394094996, + "grad_norm": 0.882004976272583, + "learning_rate": 2.233632862644416e-05, + "loss": 0.2721, + "step": 25709 + }, + { + "epoch": 33.003851091142494, + "grad_norm": 1.070132851600647, + "learning_rate": 2.2335900727428328e-05, + "loss": 0.2645, + "step": 25710 + }, + { + "epoch": 33.005134788189984, + "grad_norm": 3.335634708404541, + "learning_rate": 2.2335472828412497e-05, + "loss": 0.2678, + "step": 25711 + }, + { + "epoch": 33.00641848523748, + "grad_norm": 0.6989337801933289, + "learning_rate": 2.233504492939666e-05, + "loss": 0.2736, + "step": 25712 + }, + { + "epoch": 33.00770218228498, + "grad_norm": 0.7861772179603577, + "learning_rate": 2.2334617030380833e-05, + "loss": 0.2636, + "step": 25713 + }, + { + "epoch": 33.00898587933248, + "grad_norm": 0.8097612857818604, + "learning_rate": 2.2334189131365e-05, + "loss": 0.246, + "step": 25714 + }, + { + "epoch": 33.010269576379976, + "grad_norm": 0.9442679286003113, + "learning_rate": 
2.2333761232349163e-05, + "loss": 0.2507, + "step": 25715 + }, + { + "epoch": 33.011553273427474, + "grad_norm": 0.9496549963951111, + "learning_rate": 2.2333333333333335e-05, + "loss": 0.2712, + "step": 25716 + }, + { + "epoch": 33.012836970474964, + "grad_norm": 1.2022775411605835, + "learning_rate": 2.23329054343175e-05, + "loss": 0.2789, + "step": 25717 + }, + { + "epoch": 33.01412066752246, + "grad_norm": 1.1519570350646973, + "learning_rate": 2.2332477535301672e-05, + "loss": 0.2455, + "step": 25718 + }, + { + "epoch": 33.01540436456996, + "grad_norm": 0.932076096534729, + "learning_rate": 2.2332049636285837e-05, + "loss": 0.254, + "step": 25719 + }, + { + "epoch": 33.01668806161746, + "grad_norm": 0.8221721053123474, + "learning_rate": 2.2331621737270002e-05, + "loss": 0.2521, + "step": 25720 + }, + { + "epoch": 33.017971758664956, + "grad_norm": 0.723477840423584, + "learning_rate": 2.2331193838254174e-05, + "loss": 0.2382, + "step": 25721 + }, + { + "epoch": 33.019255455712454, + "grad_norm": 0.9046516418457031, + "learning_rate": 2.233076593923834e-05, + "loss": 0.2421, + "step": 25722 + }, + { + "epoch": 33.02053915275995, + "grad_norm": 0.7547033429145813, + "learning_rate": 2.2330338040222507e-05, + "loss": 0.251, + "step": 25723 + }, + { + "epoch": 33.02182284980744, + "grad_norm": 2.023008346557617, + "learning_rate": 2.2329910141206676e-05, + "loss": 0.2677, + "step": 25724 + }, + { + "epoch": 33.02310654685494, + "grad_norm": 2.4680588245391846, + "learning_rate": 2.2329482242190844e-05, + "loss": 0.2451, + "step": 25725 + }, + { + "epoch": 33.02439024390244, + "grad_norm": 1.199902892112732, + "learning_rate": 2.2329054343175013e-05, + "loss": 0.2523, + "step": 25726 + }, + { + "epoch": 33.025673940949936, + "grad_norm": 0.7325583100318909, + "learning_rate": 2.2328626444159178e-05, + "loss": 0.2464, + "step": 25727 + }, + { + "epoch": 33.026957637997434, + "grad_norm": 1.3193864822387695, + "learning_rate": 2.2328198545143346e-05, + "loss": 
0.2183, + "step": 25728 + }, + { + "epoch": 33.02824133504493, + "grad_norm": 2.166294574737549, + "learning_rate": 2.2327770646127514e-05, + "loss": 0.2709, + "step": 25729 + }, + { + "epoch": 33.02952503209243, + "grad_norm": 1.4913599491119385, + "learning_rate": 2.2327342747111683e-05, + "loss": 0.2418, + "step": 25730 + }, + { + "epoch": 33.03080872913992, + "grad_norm": 0.775074303150177, + "learning_rate": 2.2326914848095848e-05, + "loss": 0.2226, + "step": 25731 + }, + { + "epoch": 33.03209242618742, + "grad_norm": 1.004098892211914, + "learning_rate": 2.232648694908002e-05, + "loss": 0.2493, + "step": 25732 + }, + { + "epoch": 33.033376123234916, + "grad_norm": 1.2950844764709473, + "learning_rate": 2.2326059050064185e-05, + "loss": 0.233, + "step": 25733 + }, + { + "epoch": 33.034659820282414, + "grad_norm": 1.4031680822372437, + "learning_rate": 2.2325631151048353e-05, + "loss": 0.2396, + "step": 25734 + }, + { + "epoch": 33.03594351732991, + "grad_norm": 1.0732898712158203, + "learning_rate": 2.232520325203252e-05, + "loss": 0.265, + "step": 25735 + }, + { + "epoch": 33.03722721437741, + "grad_norm": 1.2773730754852295, + "learning_rate": 2.2324775353016687e-05, + "loss": 0.2322, + "step": 25736 + }, + { + "epoch": 33.0385109114249, + "grad_norm": 1.5044887065887451, + "learning_rate": 2.232434745400086e-05, + "loss": 0.2516, + "step": 25737 + }, + { + "epoch": 33.0397946084724, + "grad_norm": 4.311098098754883, + "learning_rate": 2.2323919554985023e-05, + "loss": 0.2523, + "step": 25738 + }, + { + "epoch": 33.041078305519896, + "grad_norm": 2.13118314743042, + "learning_rate": 2.2323491655969192e-05, + "loss": 0.1984, + "step": 25739 + }, + { + "epoch": 33.042362002567394, + "grad_norm": 4.520744800567627, + "learning_rate": 2.232306375695336e-05, + "loss": 0.253, + "step": 25740 + }, + { + "epoch": 33.04364569961489, + "grad_norm": 6.482609748840332, + "learning_rate": 2.2322635857937525e-05, + "loss": 0.2612, + "step": 25741 + }, + { + "epoch": 
33.04492939666239, + "grad_norm": 0.917134165763855, + "learning_rate": 2.2322207958921697e-05, + "loss": 0.2394, + "step": 25742 + }, + { + "epoch": 33.04621309370989, + "grad_norm": 1.317138433456421, + "learning_rate": 2.2321780059905862e-05, + "loss": 0.2295, + "step": 25743 + }, + { + "epoch": 33.04749679075738, + "grad_norm": 1.8637384176254272, + "learning_rate": 2.232135216089003e-05, + "loss": 0.2565, + "step": 25744 + }, + { + "epoch": 33.048780487804876, + "grad_norm": 1.0690326690673828, + "learning_rate": 2.23209242618742e-05, + "loss": 0.2191, + "step": 25745 + }, + { + "epoch": 33.050064184852374, + "grad_norm": 2.51638126373291, + "learning_rate": 2.2320496362858367e-05, + "loss": 0.2482, + "step": 25746 + }, + { + "epoch": 33.05134788189987, + "grad_norm": 1.5648841857910156, + "learning_rate": 2.2320068463842532e-05, + "loss": 0.2424, + "step": 25747 + }, + { + "epoch": 33.05263157894737, + "grad_norm": 1.174433708190918, + "learning_rate": 2.23196405648267e-05, + "loss": 0.2248, + "step": 25748 + }, + { + "epoch": 33.05391527599487, + "grad_norm": 1.9060251712799072, + "learning_rate": 2.231921266581087e-05, + "loss": 0.2388, + "step": 25749 + }, + { + "epoch": 33.05519897304236, + "grad_norm": 0.9488994479179382, + "learning_rate": 2.2318784766795037e-05, + "loss": 0.2343, + "step": 25750 + }, + { + "epoch": 33.056482670089856, + "grad_norm": 1.076009750366211, + "learning_rate": 2.2318356867779206e-05, + "loss": 0.2614, + "step": 25751 + }, + { + "epoch": 33.057766367137354, + "grad_norm": 1.3971774578094482, + "learning_rate": 2.231792896876337e-05, + "loss": 0.2722, + "step": 25752 + }, + { + "epoch": 33.05905006418485, + "grad_norm": 2.758925437927246, + "learning_rate": 2.2317501069747543e-05, + "loss": 0.2477, + "step": 25753 + }, + { + "epoch": 33.06033376123235, + "grad_norm": 1.5050342082977295, + "learning_rate": 2.2317073170731708e-05, + "loss": 0.2643, + "step": 25754 + }, + { + "epoch": 33.06161745827985, + "grad_norm": 
1.5168644189834595, + "learning_rate": 2.2316645271715873e-05, + "loss": 0.3347, + "step": 25755 + }, + { + "epoch": 33.062901155327346, + "grad_norm": 3.721977710723877, + "learning_rate": 2.2316217372700045e-05, + "loss": 0.3165, + "step": 25756 + }, + { + "epoch": 33.06418485237484, + "grad_norm": 2.7054996490478516, + "learning_rate": 2.231578947368421e-05, + "loss": 0.4392, + "step": 25757 + }, + { + "epoch": 33.065468549422334, + "grad_norm": 0.9540920257568359, + "learning_rate": 2.231536157466838e-05, + "loss": 0.2756, + "step": 25758 + }, + { + "epoch": 33.06675224646983, + "grad_norm": 0.8485674262046814, + "learning_rate": 2.2314933675652546e-05, + "loss": 0.2788, + "step": 25759 + }, + { + "epoch": 33.06803594351733, + "grad_norm": 0.7660641670227051, + "learning_rate": 2.2314505776636715e-05, + "loss": 0.2598, + "step": 25760 + }, + { + "epoch": 33.06931964056483, + "grad_norm": 0.7742941975593567, + "learning_rate": 2.2314077877620883e-05, + "loss": 0.2855, + "step": 25761 + }, + { + "epoch": 33.070603337612326, + "grad_norm": 0.809983491897583, + "learning_rate": 2.2313649978605048e-05, + "loss": 0.2394, + "step": 25762 + }, + { + "epoch": 33.071887034659824, + "grad_norm": 1.0052211284637451, + "learning_rate": 2.2313222079589217e-05, + "loss": 0.2572, + "step": 25763 + }, + { + "epoch": 33.073170731707314, + "grad_norm": 1.456093192100525, + "learning_rate": 2.2312794180573385e-05, + "loss": 0.2743, + "step": 25764 + }, + { + "epoch": 33.07445442875481, + "grad_norm": 1.2149603366851807, + "learning_rate": 2.2312366281557554e-05, + "loss": 0.2494, + "step": 25765 + }, + { + "epoch": 33.07573812580231, + "grad_norm": 0.9559177756309509, + "learning_rate": 2.2311938382541722e-05, + "loss": 0.2484, + "step": 25766 + }, + { + "epoch": 33.07702182284981, + "grad_norm": 0.9415400624275208, + "learning_rate": 2.231151048352589e-05, + "loss": 0.264, + "step": 25767 + }, + { + "epoch": 33.078305519897306, + "grad_norm": 0.8019266128540039, + 
"learning_rate": 2.2311082584510055e-05, + "loss": 0.2785, + "step": 25768 + }, + { + "epoch": 33.079589216944804, + "grad_norm": 0.941556990146637, + "learning_rate": 2.2310654685494224e-05, + "loss": 0.2529, + "step": 25769 + }, + { + "epoch": 33.080872913992295, + "grad_norm": 2.86624813079834, + "learning_rate": 2.2310226786478392e-05, + "loss": 0.2369, + "step": 25770 + }, + { + "epoch": 33.08215661103979, + "grad_norm": 0.8709390759468079, + "learning_rate": 2.2309798887462557e-05, + "loss": 0.2441, + "step": 25771 + }, + { + "epoch": 33.08344030808729, + "grad_norm": 0.9059685468673706, + "learning_rate": 2.230937098844673e-05, + "loss": 0.2481, + "step": 25772 + }, + { + "epoch": 33.08472400513479, + "grad_norm": 1.1193336248397827, + "learning_rate": 2.2308943089430894e-05, + "loss": 0.2567, + "step": 25773 + }, + { + "epoch": 33.086007702182286, + "grad_norm": 2.833888530731201, + "learning_rate": 2.2308515190415066e-05, + "loss": 0.2555, + "step": 25774 + }, + { + "epoch": 33.087291399229784, + "grad_norm": 1.5541690587997437, + "learning_rate": 2.230808729139923e-05, + "loss": 0.2449, + "step": 25775 + }, + { + "epoch": 33.08857509627728, + "grad_norm": 1.042811393737793, + "learning_rate": 2.2307659392383396e-05, + "loss": 0.263, + "step": 25776 + }, + { + "epoch": 33.08985879332477, + "grad_norm": 1.005939245223999, + "learning_rate": 2.2307231493367568e-05, + "loss": 0.2257, + "step": 25777 + }, + { + "epoch": 33.09114249037227, + "grad_norm": 0.956905722618103, + "learning_rate": 2.2306803594351733e-05, + "loss": 0.2389, + "step": 25778 + }, + { + "epoch": 33.09242618741977, + "grad_norm": 1.6711241006851196, + "learning_rate": 2.23063756953359e-05, + "loss": 0.2133, + "step": 25779 + }, + { + "epoch": 33.093709884467266, + "grad_norm": 1.1487349271774292, + "learning_rate": 2.230594779632007e-05, + "loss": 0.2352, + "step": 25780 + }, + { + "epoch": 33.094993581514764, + "grad_norm": 1.372687578201294, + "learning_rate": 2.2305519897304238e-05, + 
"loss": 0.2219, + "step": 25781 + }, + { + "epoch": 33.09627727856226, + "grad_norm": 0.932558536529541, + "learning_rate": 2.2305091998288406e-05, + "loss": 0.2286, + "step": 25782 + }, + { + "epoch": 33.09756097560975, + "grad_norm": 7.025850296020508, + "learning_rate": 2.230466409927257e-05, + "loss": 0.2311, + "step": 25783 + }, + { + "epoch": 33.09884467265725, + "grad_norm": 0.9184333682060242, + "learning_rate": 2.230423620025674e-05, + "loss": 0.2366, + "step": 25784 + }, + { + "epoch": 33.10012836970475, + "grad_norm": 1.9302434921264648, + "learning_rate": 2.2303808301240908e-05, + "loss": 0.2321, + "step": 25785 + }, + { + "epoch": 33.101412066752246, + "grad_norm": 1.0873854160308838, + "learning_rate": 2.2303380402225077e-05, + "loss": 0.2452, + "step": 25786 + }, + { + "epoch": 33.102695763799744, + "grad_norm": 1.026612401008606, + "learning_rate": 2.230295250320924e-05, + "loss": 0.2417, + "step": 25787 + }, + { + "epoch": 33.10397946084724, + "grad_norm": 3.023836135864258, + "learning_rate": 2.230252460419341e-05, + "loss": 0.2394, + "step": 25788 + }, + { + "epoch": 33.10526315789474, + "grad_norm": 1.1553572416305542, + "learning_rate": 2.230209670517758e-05, + "loss": 0.2176, + "step": 25789 + }, + { + "epoch": 33.10654685494223, + "grad_norm": 1.2015478610992432, + "learning_rate": 2.2301668806161747e-05, + "loss": 0.2401, + "step": 25790 + }, + { + "epoch": 33.10783055198973, + "grad_norm": 0.9611210227012634, + "learning_rate": 2.2301240907145915e-05, + "loss": 0.2383, + "step": 25791 + }, + { + "epoch": 33.109114249037226, + "grad_norm": 1.3099101781845093, + "learning_rate": 2.230081300813008e-05, + "loss": 0.2376, + "step": 25792 + }, + { + "epoch": 33.110397946084724, + "grad_norm": 1.0588974952697754, + "learning_rate": 2.2300385109114252e-05, + "loss": 0.2527, + "step": 25793 + }, + { + "epoch": 33.11168164313222, + "grad_norm": 1.4899343252182007, + "learning_rate": 2.2299957210098417e-05, + "loss": 0.2671, + "step": 25794 + }, + { + 
"epoch": 33.11296534017972, + "grad_norm": 1.1910202503204346, + "learning_rate": 2.2299529311082582e-05, + "loss": 0.2591, + "step": 25795 + }, + { + "epoch": 33.11424903722722, + "grad_norm": 1.199994683265686, + "learning_rate": 2.2299101412066754e-05, + "loss": 0.2417, + "step": 25796 + }, + { + "epoch": 33.11553273427471, + "grad_norm": 1.169448733329773, + "learning_rate": 2.229867351305092e-05, + "loss": 0.2194, + "step": 25797 + }, + { + "epoch": 33.116816431322206, + "grad_norm": 1.648597002029419, + "learning_rate": 2.229824561403509e-05, + "loss": 0.2696, + "step": 25798 + }, + { + "epoch": 33.118100128369704, + "grad_norm": 1.3802777528762817, + "learning_rate": 2.2297817715019256e-05, + "loss": 0.2781, + "step": 25799 + }, + { + "epoch": 33.1193838254172, + "grad_norm": 1.7803187370300293, + "learning_rate": 2.2297389816003424e-05, + "loss": 0.2543, + "step": 25800 + }, + { + "epoch": 33.1206675224647, + "grad_norm": 2.7578353881835938, + "learning_rate": 2.2296961916987593e-05, + "loss": 0.2538, + "step": 25801 + }, + { + "epoch": 33.1219512195122, + "grad_norm": 9.974509239196777, + "learning_rate": 2.2296534017971758e-05, + "loss": 0.2399, + "step": 25802 + }, + { + "epoch": 33.12323491655969, + "grad_norm": 2.2954299449920654, + "learning_rate": 2.2296106118955926e-05, + "loss": 0.2484, + "step": 25803 + }, + { + "epoch": 33.12451861360719, + "grad_norm": 1.293373942375183, + "learning_rate": 2.2295678219940094e-05, + "loss": 0.3, + "step": 25804 + }, + { + "epoch": 33.125802310654684, + "grad_norm": 1.4142744541168213, + "learning_rate": 2.2295250320924263e-05, + "loss": 0.2815, + "step": 25805 + }, + { + "epoch": 33.12708600770218, + "grad_norm": 1.406534194946289, + "learning_rate": 2.229482242190843e-05, + "loss": 0.2955, + "step": 25806 + }, + { + "epoch": 33.12836970474968, + "grad_norm": 2.335911989212036, + "learning_rate": 2.22943945228926e-05, + "loss": 0.431, + "step": 25807 + }, + { + "epoch": 33.12965340179718, + "grad_norm": 
0.7620497941970825, + "learning_rate": 2.2293966623876765e-05, + "loss": 0.2737, + "step": 25808 + }, + { + "epoch": 33.130937098844676, + "grad_norm": 0.666959822177887, + "learning_rate": 2.2293538724860933e-05, + "loss": 0.2523, + "step": 25809 + }, + { + "epoch": 33.13222079589217, + "grad_norm": 1.1224277019500732, + "learning_rate": 2.22931108258451e-05, + "loss": 0.261, + "step": 25810 + }, + { + "epoch": 33.133504492939664, + "grad_norm": 1.0332008600234985, + "learning_rate": 2.2292682926829267e-05, + "loss": 0.254, + "step": 25811 + }, + { + "epoch": 33.13478818998716, + "grad_norm": 1.0302597284317017, + "learning_rate": 2.229225502781344e-05, + "loss": 0.2309, + "step": 25812 + }, + { + "epoch": 33.13607188703466, + "grad_norm": 1.1793631315231323, + "learning_rate": 2.2291827128797603e-05, + "loss": 0.2517, + "step": 25813 + }, + { + "epoch": 33.13735558408216, + "grad_norm": 1.0247838497161865, + "learning_rate": 2.2291399229781775e-05, + "loss": 0.2474, + "step": 25814 + }, + { + "epoch": 33.138639281129656, + "grad_norm": 1.574146032333374, + "learning_rate": 2.229097133076594e-05, + "loss": 0.2429, + "step": 25815 + }, + { + "epoch": 33.13992297817715, + "grad_norm": 0.8116000890731812, + "learning_rate": 2.2290543431750105e-05, + "loss": 0.2488, + "step": 25816 + }, + { + "epoch": 33.141206675224645, + "grad_norm": 0.7498741745948792, + "learning_rate": 2.2290115532734277e-05, + "loss": 0.2686, + "step": 25817 + }, + { + "epoch": 33.14249037227214, + "grad_norm": 0.9298261404037476, + "learning_rate": 2.2289687633718442e-05, + "loss": 0.2516, + "step": 25818 + }, + { + "epoch": 33.14377406931964, + "grad_norm": 0.8555737733840942, + "learning_rate": 2.228925973470261e-05, + "loss": 0.247, + "step": 25819 + }, + { + "epoch": 33.14505776636714, + "grad_norm": 1.42972731590271, + "learning_rate": 2.228883183568678e-05, + "loss": 0.2729, + "step": 25820 + }, + { + "epoch": 33.146341463414636, + "grad_norm": 1.0278139114379883, + "learning_rate": 
2.2288403936670947e-05, + "loss": 0.259, + "step": 25821 + }, + { + "epoch": 33.147625160462134, + "grad_norm": 1.1024852991104126, + "learning_rate": 2.2287976037655116e-05, + "loss": 0.2824, + "step": 25822 + }, + { + "epoch": 33.148908857509625, + "grad_norm": 0.7674946784973145, + "learning_rate": 2.228754813863928e-05, + "loss": 0.2334, + "step": 25823 + }, + { + "epoch": 33.15019255455712, + "grad_norm": 1.1124807596206665, + "learning_rate": 2.228712023962345e-05, + "loss": 0.263, + "step": 25824 + }, + { + "epoch": 33.15147625160462, + "grad_norm": 0.7889492511749268, + "learning_rate": 2.2286692340607618e-05, + "loss": 0.2605, + "step": 25825 + }, + { + "epoch": 33.15275994865212, + "grad_norm": 1.1991219520568848, + "learning_rate": 2.2286264441591786e-05, + "loss": 0.2446, + "step": 25826 + }, + { + "epoch": 33.154043645699616, + "grad_norm": 0.9051690101623535, + "learning_rate": 2.228583654257595e-05, + "loss": 0.2283, + "step": 25827 + }, + { + "epoch": 33.155327342747114, + "grad_norm": 0.8562170267105103, + "learning_rate": 2.2285408643560123e-05, + "loss": 0.2339, + "step": 25828 + }, + { + "epoch": 33.15661103979461, + "grad_norm": 1.5260655879974365, + "learning_rate": 2.2284980744544288e-05, + "loss": 0.246, + "step": 25829 + }, + { + "epoch": 33.1578947368421, + "grad_norm": 0.7158058285713196, + "learning_rate": 2.2284552845528456e-05, + "loss": 0.2281, + "step": 25830 + }, + { + "epoch": 33.1591784338896, + "grad_norm": 0.9407867789268494, + "learning_rate": 2.2284124946512625e-05, + "loss": 0.2344, + "step": 25831 + }, + { + "epoch": 33.1604621309371, + "grad_norm": 0.8524138927459717, + "learning_rate": 2.228369704749679e-05, + "loss": 0.2576, + "step": 25832 + }, + { + "epoch": 33.161745827984596, + "grad_norm": 0.949292778968811, + "learning_rate": 2.228326914848096e-05, + "loss": 0.236, + "step": 25833 + }, + { + "epoch": 33.163029525032094, + "grad_norm": 1.5259757041931152, + "learning_rate": 2.2282841249465126e-05, + "loss": 0.2246, + 
"step": 25834 + }, + { + "epoch": 33.16431322207959, + "grad_norm": 0.9524937272071838, + "learning_rate": 2.2282413350449295e-05, + "loss": 0.2492, + "step": 25835 + }, + { + "epoch": 33.16559691912708, + "grad_norm": 1.7135359048843384, + "learning_rate": 2.2281985451433463e-05, + "loss": 0.2104, + "step": 25836 + }, + { + "epoch": 33.16688061617458, + "grad_norm": 1.814565658569336, + "learning_rate": 2.228155755241763e-05, + "loss": 0.2115, + "step": 25837 + }, + { + "epoch": 33.16816431322208, + "grad_norm": 1.616025447845459, + "learning_rate": 2.2281129653401797e-05, + "loss": 0.2025, + "step": 25838 + }, + { + "epoch": 33.169448010269576, + "grad_norm": 2.9353880882263184, + "learning_rate": 2.2280701754385965e-05, + "loss": 0.231, + "step": 25839 + }, + { + "epoch": 33.170731707317074, + "grad_norm": 1.6140586137771606, + "learning_rate": 2.2280273855370134e-05, + "loss": 0.2437, + "step": 25840 + }, + { + "epoch": 33.17201540436457, + "grad_norm": 1.0285284519195557, + "learning_rate": 2.2279845956354302e-05, + "loss": 0.2215, + "step": 25841 + }, + { + "epoch": 33.17329910141207, + "grad_norm": 1.5737736225128174, + "learning_rate": 2.227941805733847e-05, + "loss": 0.2417, + "step": 25842 + }, + { + "epoch": 33.17458279845956, + "grad_norm": 0.9588548541069031, + "learning_rate": 2.2278990158322635e-05, + "loss": 0.2339, + "step": 25843 + }, + { + "epoch": 33.17586649550706, + "grad_norm": 2.2027623653411865, + "learning_rate": 2.2278562259306804e-05, + "loss": 0.2528, + "step": 25844 + }, + { + "epoch": 33.177150192554556, + "grad_norm": 0.9163685441017151, + "learning_rate": 2.2278134360290972e-05, + "loss": 0.2271, + "step": 25845 + }, + { + "epoch": 33.178433889602054, + "grad_norm": 5.0120463371276855, + "learning_rate": 2.2277706461275137e-05, + "loss": 0.2181, + "step": 25846 + }, + { + "epoch": 33.17971758664955, + "grad_norm": 1.6554906368255615, + "learning_rate": 2.227727856225931e-05, + "loss": 0.255, + "step": 25847 + }, + { + "epoch": 
33.18100128369705, + "grad_norm": 1.4962903261184692, + "learning_rate": 2.2276850663243474e-05, + "loss": 0.267, + "step": 25848 + }, + { + "epoch": 33.18228498074454, + "grad_norm": 1.3319708108901978, + "learning_rate": 2.2276422764227643e-05, + "loss": 0.2891, + "step": 25849 + }, + { + "epoch": 33.18356867779204, + "grad_norm": 2.518714189529419, + "learning_rate": 2.227599486521181e-05, + "loss": 0.2657, + "step": 25850 + }, + { + "epoch": 33.18485237483954, + "grad_norm": 1.3339744806289673, + "learning_rate": 2.2275566966195976e-05, + "loss": 0.2446, + "step": 25851 + }, + { + "epoch": 33.186136071887034, + "grad_norm": 6.470946788787842, + "learning_rate": 2.2275139067180148e-05, + "loss": 0.2394, + "step": 25852 + }, + { + "epoch": 33.18741976893453, + "grad_norm": 1.44852876663208, + "learning_rate": 2.2274711168164313e-05, + "loss": 0.2717, + "step": 25853 + }, + { + "epoch": 33.18870346598203, + "grad_norm": 5.9979424476623535, + "learning_rate": 2.227428326914848e-05, + "loss": 0.2621, + "step": 25854 + }, + { + "epoch": 33.18998716302953, + "grad_norm": 7.000166416168213, + "learning_rate": 2.227385537013265e-05, + "loss": 0.3302, + "step": 25855 + }, + { + "epoch": 33.19127086007702, + "grad_norm": 1.5881175994873047, + "learning_rate": 2.2273427471116815e-05, + "loss": 0.3012, + "step": 25856 + }, + { + "epoch": 33.19255455712452, + "grad_norm": 2.7550597190856934, + "learning_rate": 2.2272999572100986e-05, + "loss": 0.4079, + "step": 25857 + }, + { + "epoch": 33.193838254172015, + "grad_norm": 0.7382052540779114, + "learning_rate": 2.227257167308515e-05, + "loss": 0.2511, + "step": 25858 + }, + { + "epoch": 33.19512195121951, + "grad_norm": 0.7747419476509094, + "learning_rate": 2.227214377406932e-05, + "loss": 0.2502, + "step": 25859 + }, + { + "epoch": 33.19640564826701, + "grad_norm": 0.6038382649421692, + "learning_rate": 2.2271715875053488e-05, + "loss": 0.2608, + "step": 25860 + }, + { + "epoch": 33.19768934531451, + "grad_norm": 
0.9425424337387085, + "learning_rate": 2.2271287976037657e-05, + "loss": 0.2727, + "step": 25861 + }, + { + "epoch": 33.198973042362006, + "grad_norm": 0.69896399974823, + "learning_rate": 2.227086007702182e-05, + "loss": 0.2506, + "step": 25862 + }, + { + "epoch": 33.2002567394095, + "grad_norm": 1.1166712045669556, + "learning_rate": 2.227043217800599e-05, + "loss": 0.2478, + "step": 25863 + }, + { + "epoch": 33.201540436456995, + "grad_norm": 1.6306116580963135, + "learning_rate": 2.227000427899016e-05, + "loss": 0.2767, + "step": 25864 + }, + { + "epoch": 33.20282413350449, + "grad_norm": 0.6909617781639099, + "learning_rate": 2.2269576379974327e-05, + "loss": 0.2485, + "step": 25865 + }, + { + "epoch": 33.20410783055199, + "grad_norm": 0.7492006421089172, + "learning_rate": 2.2269148480958495e-05, + "loss": 0.273, + "step": 25866 + }, + { + "epoch": 33.20539152759949, + "grad_norm": 0.8027096390724182, + "learning_rate": 2.226872058194266e-05, + "loss": 0.2669, + "step": 25867 + }, + { + "epoch": 33.206675224646986, + "grad_norm": 0.9410222768783569, + "learning_rate": 2.2268292682926832e-05, + "loss": 0.2695, + "step": 25868 + }, + { + "epoch": 33.20795892169448, + "grad_norm": 0.7762159109115601, + "learning_rate": 2.2267864783910997e-05, + "loss": 0.2599, + "step": 25869 + }, + { + "epoch": 33.209242618741975, + "grad_norm": 1.0159766674041748, + "learning_rate": 2.2267436884895162e-05, + "loss": 0.2542, + "step": 25870 + }, + { + "epoch": 33.21052631578947, + "grad_norm": 0.8032404780387878, + "learning_rate": 2.2267008985879334e-05, + "loss": 0.242, + "step": 25871 + }, + { + "epoch": 33.21181001283697, + "grad_norm": 0.8859509825706482, + "learning_rate": 2.22665810868635e-05, + "loss": 0.2378, + "step": 25872 + }, + { + "epoch": 33.21309370988447, + "grad_norm": 1.145815134048462, + "learning_rate": 2.226615318784767e-05, + "loss": 0.2411, + "step": 25873 + }, + { + "epoch": 33.214377406931966, + "grad_norm": 1.0562914609909058, + "learning_rate": 
2.2265725288831836e-05, + "loss": 0.2436, + "step": 25874 + }, + { + "epoch": 33.215661103979464, + "grad_norm": 0.8422690629959106, + "learning_rate": 2.2265297389816004e-05, + "loss": 0.2392, + "step": 25875 + }, + { + "epoch": 33.216944801026955, + "grad_norm": 1.1903492212295532, + "learning_rate": 2.2264869490800173e-05, + "loss": 0.2579, + "step": 25876 + }, + { + "epoch": 33.21822849807445, + "grad_norm": 2.8347365856170654, + "learning_rate": 2.2264441591784338e-05, + "loss": 0.2314, + "step": 25877 + }, + { + "epoch": 33.21951219512195, + "grad_norm": 2.760554075241089, + "learning_rate": 2.2264013692768506e-05, + "loss": 0.2602, + "step": 25878 + }, + { + "epoch": 33.22079589216945, + "grad_norm": 1.2661974430084229, + "learning_rate": 2.2263585793752675e-05, + "loss": 0.2401, + "step": 25879 + }, + { + "epoch": 33.222079589216946, + "grad_norm": 0.8186199069023132, + "learning_rate": 2.2263157894736843e-05, + "loss": 0.218, + "step": 25880 + }, + { + "epoch": 33.223363286264444, + "grad_norm": 0.7985092401504517, + "learning_rate": 2.226272999572101e-05, + "loss": 0.2283, + "step": 25881 + }, + { + "epoch": 33.224646983311935, + "grad_norm": 0.860457181930542, + "learning_rate": 2.226230209670518e-05, + "loss": 0.2015, + "step": 25882 + }, + { + "epoch": 33.22593068035943, + "grad_norm": 1.0082769393920898, + "learning_rate": 2.2261874197689345e-05, + "loss": 0.2354, + "step": 25883 + }, + { + "epoch": 33.22721437740693, + "grad_norm": 0.7886748313903809, + "learning_rate": 2.2261446298673513e-05, + "loss": 0.2402, + "step": 25884 + }, + { + "epoch": 33.22849807445443, + "grad_norm": 1.0452607870101929, + "learning_rate": 2.226101839965768e-05, + "loss": 0.2206, + "step": 25885 + }, + { + "epoch": 33.229781771501926, + "grad_norm": 0.9434070587158203, + "learning_rate": 2.2260590500641847e-05, + "loss": 0.2425, + "step": 25886 + }, + { + "epoch": 33.231065468549424, + "grad_norm": 1.0909819602966309, + "learning_rate": 2.226016260162602e-05, + "loss": 
0.2306, + "step": 25887 + }, + { + "epoch": 33.23234916559692, + "grad_norm": 1.5392299890518188, + "learning_rate": 2.2259734702610183e-05, + "loss": 0.2285, + "step": 25888 + }, + { + "epoch": 33.23363286264441, + "grad_norm": 1.5095270872116089, + "learning_rate": 2.2259306803594355e-05, + "loss": 0.2266, + "step": 25889 + }, + { + "epoch": 33.23491655969191, + "grad_norm": 3.7961223125457764, + "learning_rate": 2.225887890457852e-05, + "loss": 0.2114, + "step": 25890 + }, + { + "epoch": 33.23620025673941, + "grad_norm": 1.4872061014175415, + "learning_rate": 2.2258451005562685e-05, + "loss": 0.2689, + "step": 25891 + }, + { + "epoch": 33.23748395378691, + "grad_norm": 1.3217167854309082, + "learning_rate": 2.2258023106546857e-05, + "loss": 0.2292, + "step": 25892 + }, + { + "epoch": 33.238767650834404, + "grad_norm": 0.9681528210639954, + "learning_rate": 2.2257595207531022e-05, + "loss": 0.254, + "step": 25893 + }, + { + "epoch": 33.2400513478819, + "grad_norm": 1.4372068643569946, + "learning_rate": 2.225716730851519e-05, + "loss": 0.22, + "step": 25894 + }, + { + "epoch": 33.2413350449294, + "grad_norm": 1.0659334659576416, + "learning_rate": 2.225673940949936e-05, + "loss": 0.2556, + "step": 25895 + }, + { + "epoch": 33.24261874197689, + "grad_norm": 1.2981975078582764, + "learning_rate": 2.2256311510483527e-05, + "loss": 0.2479, + "step": 25896 + }, + { + "epoch": 33.24390243902439, + "grad_norm": 0.9150122404098511, + "learning_rate": 2.2255883611467696e-05, + "loss": 0.2163, + "step": 25897 + }, + { + "epoch": 33.24518613607189, + "grad_norm": 2.007810115814209, + "learning_rate": 2.225545571245186e-05, + "loss": 0.2422, + "step": 25898 + }, + { + "epoch": 33.246469833119384, + "grad_norm": 1.3578191995620728, + "learning_rate": 2.225502781343603e-05, + "loss": 0.2547, + "step": 25899 + }, + { + "epoch": 33.24775353016688, + "grad_norm": 1.3737108707427979, + "learning_rate": 2.2254599914420198e-05, + "loss": 0.2258, + "step": 25900 + }, + { + "epoch": 
33.24903722721438, + "grad_norm": 1.5619986057281494, + "learning_rate": 2.2254172015404366e-05, + "loss": 0.2477, + "step": 25901 + }, + { + "epoch": 33.25032092426187, + "grad_norm": 1.4981508255004883, + "learning_rate": 2.225374411638853e-05, + "loss": 0.2481, + "step": 25902 + }, + { + "epoch": 33.25160462130937, + "grad_norm": 2.7782142162323, + "learning_rate": 2.2253316217372703e-05, + "loss": 0.2479, + "step": 25903 + }, + { + "epoch": 33.25288831835687, + "grad_norm": 1.5702672004699707, + "learning_rate": 2.2252888318356868e-05, + "loss": 0.2834, + "step": 25904 + }, + { + "epoch": 33.254172015404365, + "grad_norm": 7.711489677429199, + "learning_rate": 2.2252460419341036e-05, + "loss": 0.2793, + "step": 25905 + }, + { + "epoch": 33.25545571245186, + "grad_norm": 2.6893434524536133, + "learning_rate": 2.2252032520325205e-05, + "loss": 0.3184, + "step": 25906 + }, + { + "epoch": 33.25673940949936, + "grad_norm": 1.4767669439315796, + "learning_rate": 2.225160462130937e-05, + "loss": 0.3608, + "step": 25907 + }, + { + "epoch": 33.25802310654686, + "grad_norm": 0.8533290028572083, + "learning_rate": 2.225117672229354e-05, + "loss": 0.268, + "step": 25908 + }, + { + "epoch": 33.25930680359435, + "grad_norm": 0.8411774635314941, + "learning_rate": 2.2250748823277707e-05, + "loss": 0.247, + "step": 25909 + }, + { + "epoch": 33.26059050064185, + "grad_norm": 0.8530167937278748, + "learning_rate": 2.225032092426187e-05, + "loss": 0.2626, + "step": 25910 + }, + { + "epoch": 33.261874197689345, + "grad_norm": 0.6629534363746643, + "learning_rate": 2.2249893025246043e-05, + "loss": 0.2398, + "step": 25911 + }, + { + "epoch": 33.26315789473684, + "grad_norm": 1.7649164199829102, + "learning_rate": 2.224946512623021e-05, + "loss": 0.2789, + "step": 25912 + }, + { + "epoch": 33.26444159178434, + "grad_norm": 1.1032696962356567, + "learning_rate": 2.224903722721438e-05, + "loss": 0.2559, + "step": 25913 + }, + { + "epoch": 33.26572528883184, + "grad_norm": 
8.380703926086426, + "learning_rate": 2.2248609328198545e-05, + "loss": 0.2508, + "step": 25914 + }, + { + "epoch": 33.26700898587933, + "grad_norm": 1.2139729261398315, + "learning_rate": 2.2248181429182714e-05, + "loss": 0.2619, + "step": 25915 + }, + { + "epoch": 33.26829268292683, + "grad_norm": 0.8895713686943054, + "learning_rate": 2.2247753530166882e-05, + "loss": 0.2211, + "step": 25916 + }, + { + "epoch": 33.269576379974325, + "grad_norm": 1.0600636005401611, + "learning_rate": 2.2247325631151047e-05, + "loss": 0.2729, + "step": 25917 + }, + { + "epoch": 33.27086007702182, + "grad_norm": 0.7083433270454407, + "learning_rate": 2.2246897732135215e-05, + "loss": 0.23, + "step": 25918 + }, + { + "epoch": 33.27214377406932, + "grad_norm": 2.0938596725463867, + "learning_rate": 2.2246469833119384e-05, + "loss": 0.242, + "step": 25919 + }, + { + "epoch": 33.27342747111682, + "grad_norm": 0.9771970510482788, + "learning_rate": 2.2246041934103552e-05, + "loss": 0.2579, + "step": 25920 + }, + { + "epoch": 33.274711168164316, + "grad_norm": 1.4030485153198242, + "learning_rate": 2.224561403508772e-05, + "loss": 0.2339, + "step": 25921 + }, + { + "epoch": 33.27599486521181, + "grad_norm": 0.7740535736083984, + "learning_rate": 2.224518613607189e-05, + "loss": 0.2285, + "step": 25922 + }, + { + "epoch": 33.277278562259305, + "grad_norm": 0.963095486164093, + "learning_rate": 2.2244758237056054e-05, + "loss": 0.2423, + "step": 25923 + }, + { + "epoch": 33.2785622593068, + "grad_norm": 0.8321102857589722, + "learning_rate": 2.2244330338040223e-05, + "loss": 0.2296, + "step": 25924 + }, + { + "epoch": 33.2798459563543, + "grad_norm": 0.7975884675979614, + "learning_rate": 2.224390243902439e-05, + "loss": 0.2231, + "step": 25925 + }, + { + "epoch": 33.2811296534018, + "grad_norm": 1.1535601615905762, + "learning_rate": 2.2243474540008556e-05, + "loss": 0.2223, + "step": 25926 + }, + { + "epoch": 33.282413350449296, + "grad_norm": 0.9934808015823364, + "learning_rate": 
2.2243046640992728e-05, + "loss": 0.2577, + "step": 25927 + }, + { + "epoch": 33.283697047496794, + "grad_norm": 2.1806252002716064, + "learning_rate": 2.2242618741976893e-05, + "loss": 0.2498, + "step": 25928 + }, + { + "epoch": 33.284980744544285, + "grad_norm": 0.9492600560188293, + "learning_rate": 2.2242190842961065e-05, + "loss": 0.2285, + "step": 25929 + }, + { + "epoch": 33.28626444159178, + "grad_norm": 1.7533692121505737, + "learning_rate": 2.224176294394523e-05, + "loss": 0.2674, + "step": 25930 + }, + { + "epoch": 33.28754813863928, + "grad_norm": 1.097866177558899, + "learning_rate": 2.2241335044929395e-05, + "loss": 0.2334, + "step": 25931 + }, + { + "epoch": 33.28883183568678, + "grad_norm": 1.1058177947998047, + "learning_rate": 2.2240907145913566e-05, + "loss": 0.2619, + "step": 25932 + }, + { + "epoch": 33.290115532734276, + "grad_norm": 2.3154196739196777, + "learning_rate": 2.224047924689773e-05, + "loss": 0.2599, + "step": 25933 + }, + { + "epoch": 33.291399229781774, + "grad_norm": 1.5538822412490845, + "learning_rate": 2.22400513478819e-05, + "loss": 0.2349, + "step": 25934 + }, + { + "epoch": 33.292682926829265, + "grad_norm": 1.5936870574951172, + "learning_rate": 2.223962344886607e-05, + "loss": 0.2142, + "step": 25935 + }, + { + "epoch": 33.29396662387676, + "grad_norm": 0.8781291246414185, + "learning_rate": 2.2239195549850237e-05, + "loss": 0.2161, + "step": 25936 + }, + { + "epoch": 33.29525032092426, + "grad_norm": 1.0902533531188965, + "learning_rate": 2.2238767650834405e-05, + "loss": 0.2458, + "step": 25937 + }, + { + "epoch": 33.29653401797176, + "grad_norm": 0.8225656747817993, + "learning_rate": 2.223833975181857e-05, + "loss": 0.2689, + "step": 25938 + }, + { + "epoch": 33.29781771501926, + "grad_norm": 1.3461238145828247, + "learning_rate": 2.223791185280274e-05, + "loss": 0.2538, + "step": 25939 + }, + { + "epoch": 33.299101412066754, + "grad_norm": 1.2593680620193481, + "learning_rate": 2.2237483953786907e-05, + "loss": 
0.259, + "step": 25940 + }, + { + "epoch": 33.30038510911425, + "grad_norm": 1.4448814392089844, + "learning_rate": 2.2237056054771075e-05, + "loss": 0.274, + "step": 25941 + }, + { + "epoch": 33.30166880616174, + "grad_norm": 0.874505877494812, + "learning_rate": 2.223662815575524e-05, + "loss": 0.2377, + "step": 25942 + }, + { + "epoch": 33.30295250320924, + "grad_norm": 2.776022434234619, + "learning_rate": 2.2236200256739412e-05, + "loss": 0.2238, + "step": 25943 + }, + { + "epoch": 33.30423620025674, + "grad_norm": 3.7154605388641357, + "learning_rate": 2.2235772357723577e-05, + "loss": 0.2207, + "step": 25944 + }, + { + "epoch": 33.30551989730424, + "grad_norm": 2.6668553352355957, + "learning_rate": 2.2235344458707746e-05, + "loss": 0.2579, + "step": 25945 + }, + { + "epoch": 33.306803594351734, + "grad_norm": 1.202694058418274, + "learning_rate": 2.2234916559691914e-05, + "loss": 0.2521, + "step": 25946 + }, + { + "epoch": 33.30808729139923, + "grad_norm": 2.564988851547241, + "learning_rate": 2.223448866067608e-05, + "loss": 0.2486, + "step": 25947 + }, + { + "epoch": 33.30937098844672, + "grad_norm": 1.6343523263931274, + "learning_rate": 2.223406076166025e-05, + "loss": 0.2913, + "step": 25948 + }, + { + "epoch": 33.31065468549422, + "grad_norm": 1.0506012439727783, + "learning_rate": 2.2233632862644416e-05, + "loss": 0.2414, + "step": 25949 + }, + { + "epoch": 33.31193838254172, + "grad_norm": 1.5826903581619263, + "learning_rate": 2.2233204963628584e-05, + "loss": 0.26, + "step": 25950 + }, + { + "epoch": 33.31322207958922, + "grad_norm": 1.0917675495147705, + "learning_rate": 2.2232777064612753e-05, + "loss": 0.2653, + "step": 25951 + }, + { + "epoch": 33.314505776636715, + "grad_norm": 3.3258631229400635, + "learning_rate": 2.2232349165596918e-05, + "loss": 0.3079, + "step": 25952 + }, + { + "epoch": 33.31578947368421, + "grad_norm": 1.3119075298309326, + "learning_rate": 2.223192126658109e-05, + "loss": 0.2913, + "step": 25953 + }, + { + "epoch": 
33.31707317073171, + "grad_norm": 1.4410310983657837, + "learning_rate": 2.2231493367565255e-05, + "loss": 0.2912, + "step": 25954 + }, + { + "epoch": 33.3183568677792, + "grad_norm": 13.782186508178711, + "learning_rate": 2.2231065468549423e-05, + "loss": 0.2964, + "step": 25955 + }, + { + "epoch": 33.3196405648267, + "grad_norm": 2.6963629722595215, + "learning_rate": 2.223063756953359e-05, + "loss": 0.3584, + "step": 25956 + }, + { + "epoch": 33.3209242618742, + "grad_norm": 5.459817409515381, + "learning_rate": 2.223020967051776e-05, + "loss": 0.4209, + "step": 25957 + }, + { + "epoch": 33.322207958921695, + "grad_norm": 0.786234438419342, + "learning_rate": 2.2229781771501925e-05, + "loss": 0.2685, + "step": 25958 + }, + { + "epoch": 33.32349165596919, + "grad_norm": 1.0194536447525024, + "learning_rate": 2.2229353872486093e-05, + "loss": 0.2641, + "step": 25959 + }, + { + "epoch": 33.32477535301669, + "grad_norm": 0.8567820191383362, + "learning_rate": 2.222892597347026e-05, + "loss": 0.2758, + "step": 25960 + }, + { + "epoch": 33.32605905006419, + "grad_norm": 0.9575208425521851, + "learning_rate": 2.222849807445443e-05, + "loss": 0.2867, + "step": 25961 + }, + { + "epoch": 33.32734274711168, + "grad_norm": 1.0009467601776123, + "learning_rate": 2.22280701754386e-05, + "loss": 0.2758, + "step": 25962 + }, + { + "epoch": 33.32862644415918, + "grad_norm": 1.0664595365524292, + "learning_rate": 2.2227642276422764e-05, + "loss": 0.2714, + "step": 25963 + }, + { + "epoch": 33.329910141206675, + "grad_norm": 1.034257173538208, + "learning_rate": 2.2227214377406935e-05, + "loss": 0.2588, + "step": 25964 + }, + { + "epoch": 33.33119383825417, + "grad_norm": 1.2971383333206177, + "learning_rate": 2.22267864783911e-05, + "loss": 0.2672, + "step": 25965 + }, + { + "epoch": 33.33247753530167, + "grad_norm": 1.1419517993927002, + "learning_rate": 2.2226358579375265e-05, + "loss": 0.2352, + "step": 25966 + }, + { + "epoch": 33.33376123234917, + "grad_norm": 
0.8642276525497437, + "learning_rate": 2.2225930680359437e-05, + "loss": 0.2718, + "step": 25967 + }, + { + "epoch": 33.33504492939666, + "grad_norm": 1.1912592649459839, + "learning_rate": 2.2225502781343602e-05, + "loss": 0.2521, + "step": 25968 + }, + { + "epoch": 33.33632862644416, + "grad_norm": 1.2506707906723022, + "learning_rate": 2.2225074882327774e-05, + "loss": 0.2489, + "step": 25969 + }, + { + "epoch": 33.337612323491655, + "grad_norm": 0.7831325531005859, + "learning_rate": 2.222464698331194e-05, + "loss": 0.2363, + "step": 25970 + }, + { + "epoch": 33.33889602053915, + "grad_norm": 1.4428902864456177, + "learning_rate": 2.2224219084296104e-05, + "loss": 0.272, + "step": 25971 + }, + { + "epoch": 33.34017971758665, + "grad_norm": 1.459628701210022, + "learning_rate": 2.2223791185280276e-05, + "loss": 0.239, + "step": 25972 + }, + { + "epoch": 33.34146341463415, + "grad_norm": 2.316859006881714, + "learning_rate": 2.222336328626444e-05, + "loss": 0.2434, + "step": 25973 + }, + { + "epoch": 33.342747111681646, + "grad_norm": 1.0979301929473877, + "learning_rate": 2.222293538724861e-05, + "loss": 0.2216, + "step": 25974 + }, + { + "epoch": 33.34403080872914, + "grad_norm": 1.1736605167388916, + "learning_rate": 2.2222507488232778e-05, + "loss": 0.2378, + "step": 25975 + }, + { + "epoch": 33.345314505776635, + "grad_norm": 3.0514779090881348, + "learning_rate": 2.2222079589216946e-05, + "loss": 0.2521, + "step": 25976 + }, + { + "epoch": 33.34659820282413, + "grad_norm": 1.2249537706375122, + "learning_rate": 2.2221651690201115e-05, + "loss": 0.2394, + "step": 25977 + }, + { + "epoch": 33.34788189987163, + "grad_norm": 0.8653139472007751, + "learning_rate": 2.222122379118528e-05, + "loss": 0.2224, + "step": 25978 + }, + { + "epoch": 33.34916559691913, + "grad_norm": 0.9015681743621826, + "learning_rate": 2.2220795892169448e-05, + "loss": 0.262, + "step": 25979 + }, + { + "epoch": 33.350449293966626, + "grad_norm": 1.6365593671798706, + "learning_rate": 
2.2220367993153616e-05, + "loss": 0.2533, + "step": 25980 + }, + { + "epoch": 33.35173299101412, + "grad_norm": 0.8508352041244507, + "learning_rate": 2.2219940094137785e-05, + "loss": 0.26, + "step": 25981 + }, + { + "epoch": 33.353016688061615, + "grad_norm": 1.8177522420883179, + "learning_rate": 2.221951219512195e-05, + "loss": 0.2402, + "step": 25982 + }, + { + "epoch": 33.35430038510911, + "grad_norm": 1.912964940071106, + "learning_rate": 2.221908429610612e-05, + "loss": 0.2534, + "step": 25983 + }, + { + "epoch": 33.35558408215661, + "grad_norm": 0.939687192440033, + "learning_rate": 2.2218656397090287e-05, + "loss": 0.2406, + "step": 25984 + }, + { + "epoch": 33.35686777920411, + "grad_norm": 1.0943385362625122, + "learning_rate": 2.2218228498074455e-05, + "loss": 0.2272, + "step": 25985 + }, + { + "epoch": 33.35815147625161, + "grad_norm": 1.5897088050842285, + "learning_rate": 2.2217800599058623e-05, + "loss": 0.2251, + "step": 25986 + }, + { + "epoch": 33.359435173299104, + "grad_norm": 3.2106173038482666, + "learning_rate": 2.221737270004279e-05, + "loss": 0.2173, + "step": 25987 + }, + { + "epoch": 33.360718870346595, + "grad_norm": 2.698381185531616, + "learning_rate": 2.221694480102696e-05, + "loss": 0.2378, + "step": 25988 + }, + { + "epoch": 33.36200256739409, + "grad_norm": 1.5486235618591309, + "learning_rate": 2.2216516902011125e-05, + "loss": 0.2601, + "step": 25989 + }, + { + "epoch": 33.36328626444159, + "grad_norm": 1.6794521808624268, + "learning_rate": 2.2216089002995294e-05, + "loss": 0.274, + "step": 25990 + }, + { + "epoch": 33.36456996148909, + "grad_norm": 1.5194361209869385, + "learning_rate": 2.2215661103979462e-05, + "loss": 0.2663, + "step": 25991 + }, + { + "epoch": 33.36585365853659, + "grad_norm": 1.2092442512512207, + "learning_rate": 2.2215233204963627e-05, + "loss": 0.2473, + "step": 25992 + }, + { + "epoch": 33.367137355584084, + "grad_norm": 2.1287689208984375, + "learning_rate": 2.22148053059478e-05, + "loss": 0.2219, + 
"step": 25993 + }, + { + "epoch": 33.36842105263158, + "grad_norm": 1.2364673614501953, + "learning_rate": 2.2214377406931964e-05, + "loss": 0.251, + "step": 25994 + }, + { + "epoch": 33.36970474967907, + "grad_norm": 5.196789264678955, + "learning_rate": 2.2213949507916132e-05, + "loss": 0.2805, + "step": 25995 + }, + { + "epoch": 33.37098844672657, + "grad_norm": 1.4208019971847534, + "learning_rate": 2.22135216089003e-05, + "loss": 0.2453, + "step": 25996 + }, + { + "epoch": 33.37227214377407, + "grad_norm": 0.8178467154502869, + "learning_rate": 2.221309370988447e-05, + "loss": 0.2343, + "step": 25997 + }, + { + "epoch": 33.37355584082157, + "grad_norm": 1.223642349243164, + "learning_rate": 2.2212665810868634e-05, + "loss": 0.2454, + "step": 25998 + }, + { + "epoch": 33.374839537869065, + "grad_norm": 1.4092527627944946, + "learning_rate": 2.2212237911852803e-05, + "loss": 0.3058, + "step": 25999 + }, + { + "epoch": 33.37612323491656, + "grad_norm": 2.2632758617401123, + "learning_rate": 2.221181001283697e-05, + "loss": 0.2966, + "step": 26000 + }, + { + "epoch": 33.37612323491656, + "eval_cer": 0.2606716271783826, + "eval_loss": 0.4803558588027954, + "eval_runtime": 13.7076, + "eval_samples_per_second": 71.712, + "eval_steps_per_second": 0.511, + "eval_wer": 0.44531070855308413, + "step": 26000 + }, + { + "epoch": 33.37740693196405, + "grad_norm": 1.5186848640441895, + "learning_rate": 2.221138211382114e-05, + "loss": 0.2484, + "step": 26001 + }, + { + "epoch": 33.37869062901155, + "grad_norm": 2.930828332901001, + "learning_rate": 2.2210954214805308e-05, + "loss": 0.248, + "step": 26002 + }, + { + "epoch": 33.37997432605905, + "grad_norm": 1.610700249671936, + "learning_rate": 2.2210526315789473e-05, + "loss": 0.2382, + "step": 26003 + }, + { + "epoch": 33.38125802310655, + "grad_norm": 1.9715927839279175, + "learning_rate": 2.2210098416773645e-05, + "loss": 0.2833, + "step": 26004 + }, + { + "epoch": 33.382541720154045, + "grad_norm": 1.463158369064331, + 
"learning_rate": 2.220967051775781e-05, + "loss": 0.2978, + "step": 26005 + }, + { + "epoch": 33.38382541720154, + "grad_norm": 1.3526612520217896, + "learning_rate": 2.2209242618741975e-05, + "loss": 0.2968, + "step": 26006 + }, + { + "epoch": 33.38510911424904, + "grad_norm": 2.5837132930755615, + "learning_rate": 2.2208814719726147e-05, + "loss": 0.4051, + "step": 26007 + }, + { + "epoch": 33.38639281129653, + "grad_norm": 0.7637943029403687, + "learning_rate": 2.220838682071031e-05, + "loss": 0.2559, + "step": 26008 + }, + { + "epoch": 33.38767650834403, + "grad_norm": 0.6400581002235413, + "learning_rate": 2.2207958921694483e-05, + "loss": 0.2403, + "step": 26009 + }, + { + "epoch": 33.38896020539153, + "grad_norm": 1.165169596672058, + "learning_rate": 2.220753102267865e-05, + "loss": 0.2602, + "step": 26010 + }, + { + "epoch": 33.390243902439025, + "grad_norm": 0.8768997192382812, + "learning_rate": 2.2207103123662817e-05, + "loss": 0.2574, + "step": 26011 + }, + { + "epoch": 33.39152759948652, + "grad_norm": 1.0726196765899658, + "learning_rate": 2.2206675224646985e-05, + "loss": 0.2825, + "step": 26012 + }, + { + "epoch": 33.39281129653402, + "grad_norm": 1.4755574464797974, + "learning_rate": 2.220624732563115e-05, + "loss": 0.2437, + "step": 26013 + }, + { + "epoch": 33.39409499358152, + "grad_norm": 0.9086391925811768, + "learning_rate": 2.220581942661532e-05, + "loss": 0.2488, + "step": 26014 + }, + { + "epoch": 33.39537869062901, + "grad_norm": 0.9677554965019226, + "learning_rate": 2.2205391527599487e-05, + "loss": 0.2487, + "step": 26015 + }, + { + "epoch": 33.39666238767651, + "grad_norm": 3.3578503131866455, + "learning_rate": 2.2204963628583655e-05, + "loss": 0.2424, + "step": 26016 + }, + { + "epoch": 33.397946084724005, + "grad_norm": 1.5393953323364258, + "learning_rate": 2.2204535729567824e-05, + "loss": 0.2534, + "step": 26017 + }, + { + "epoch": 33.3992297817715, + "grad_norm": 0.8855764865875244, + "learning_rate": 2.2204107830551992e-05, 
+ "loss": 0.2654, + "step": 26018 + }, + { + "epoch": 33.400513478819, + "grad_norm": 1.1493135690689087, + "learning_rate": 2.2203679931536157e-05, + "loss": 0.2368, + "step": 26019 + }, + { + "epoch": 33.4017971758665, + "grad_norm": 0.9304987192153931, + "learning_rate": 2.2203252032520326e-05, + "loss": 0.2332, + "step": 26020 + }, + { + "epoch": 33.40308087291399, + "grad_norm": 1.4003664255142212, + "learning_rate": 2.2202824133504494e-05, + "loss": 0.2822, + "step": 26021 + }, + { + "epoch": 33.40436456996149, + "grad_norm": 1.5298618078231812, + "learning_rate": 2.220239623448866e-05, + "loss": 0.2781, + "step": 26022 + }, + { + "epoch": 33.405648267008985, + "grad_norm": 2.2392892837524414, + "learning_rate": 2.220196833547283e-05, + "loss": 0.2343, + "step": 26023 + }, + { + "epoch": 33.40693196405648, + "grad_norm": 1.3141347169876099, + "learning_rate": 2.2201540436456996e-05, + "loss": 0.2855, + "step": 26024 + }, + { + "epoch": 33.40821566110398, + "grad_norm": 0.8273465633392334, + "learning_rate": 2.2201112537441168e-05, + "loss": 0.2356, + "step": 26025 + }, + { + "epoch": 33.40949935815148, + "grad_norm": 2.196780204772949, + "learning_rate": 2.2200684638425333e-05, + "loss": 0.2005, + "step": 26026 + }, + { + "epoch": 33.410783055198976, + "grad_norm": 1.3839654922485352, + "learning_rate": 2.2200256739409498e-05, + "loss": 0.2491, + "step": 26027 + }, + { + "epoch": 33.41206675224647, + "grad_norm": 1.414758324623108, + "learning_rate": 2.219982884039367e-05, + "loss": 0.2293, + "step": 26028 + }, + { + "epoch": 33.413350449293965, + "grad_norm": 0.9437862634658813, + "learning_rate": 2.2199400941377835e-05, + "loss": 0.2463, + "step": 26029 + }, + { + "epoch": 33.41463414634146, + "grad_norm": 1.02413010597229, + "learning_rate": 2.2198973042362003e-05, + "loss": 0.2432, + "step": 26030 + }, + { + "epoch": 33.41591784338896, + "grad_norm": 0.9398002624511719, + "learning_rate": 2.219854514334617e-05, + "loss": 0.2469, + "step": 26031 + }, + { + 
"epoch": 33.41720154043646, + "grad_norm": 1.6106865406036377, + "learning_rate": 2.2198117244330336e-05, + "loss": 0.2445, + "step": 26032 + }, + { + "epoch": 33.41848523748396, + "grad_norm": 1.1400336027145386, + "learning_rate": 2.2197689345314508e-05, + "loss": 0.2304, + "step": 26033 + }, + { + "epoch": 33.41976893453145, + "grad_norm": 1.0529842376708984, + "learning_rate": 2.2197261446298673e-05, + "loss": 0.2316, + "step": 26034 + }, + { + "epoch": 33.421052631578945, + "grad_norm": 1.7753340005874634, + "learning_rate": 2.2196833547282842e-05, + "loss": 0.264, + "step": 26035 + }, + { + "epoch": 33.42233632862644, + "grad_norm": 1.3140722513198853, + "learning_rate": 2.219640564826701e-05, + "loss": 0.2598, + "step": 26036 + }, + { + "epoch": 33.42362002567394, + "grad_norm": 1.0571494102478027, + "learning_rate": 2.219597774925118e-05, + "loss": 0.2308, + "step": 26037 + }, + { + "epoch": 33.42490372272144, + "grad_norm": 0.994232714176178, + "learning_rate": 2.2195549850235344e-05, + "loss": 0.2729, + "step": 26038 + }, + { + "epoch": 33.42618741976894, + "grad_norm": 1.1301181316375732, + "learning_rate": 2.2195121951219512e-05, + "loss": 0.2606, + "step": 26039 + }, + { + "epoch": 33.427471116816434, + "grad_norm": 1.0514299869537354, + "learning_rate": 2.219469405220368e-05, + "loss": 0.244, + "step": 26040 + }, + { + "epoch": 33.428754813863925, + "grad_norm": 1.574447751045227, + "learning_rate": 2.2194266153187845e-05, + "loss": 0.2384, + "step": 26041 + }, + { + "epoch": 33.43003851091142, + "grad_norm": 1.9797979593276978, + "learning_rate": 2.2193838254172017e-05, + "loss": 0.2348, + "step": 26042 + }, + { + "epoch": 33.43132220795892, + "grad_norm": 0.9535526037216187, + "learning_rate": 2.2193410355156182e-05, + "loss": 0.2297, + "step": 26043 + }, + { + "epoch": 33.43260590500642, + "grad_norm": 2.3648769855499268, + "learning_rate": 2.2192982456140354e-05, + "loss": 0.2428, + "step": 26044 + }, + { + "epoch": 33.43388960205392, + 
"grad_norm": 1.9250668287277222, + "learning_rate": 2.219255455712452e-05, + "loss": 0.2377, + "step": 26045 + }, + { + "epoch": 33.435173299101415, + "grad_norm": 1.5140000581741333, + "learning_rate": 2.2192126658108684e-05, + "loss": 0.2596, + "step": 26046 + }, + { + "epoch": 33.436456996148905, + "grad_norm": 3.357943296432495, + "learning_rate": 2.2191698759092856e-05, + "loss": 0.2314, + "step": 26047 + }, + { + "epoch": 33.4377406931964, + "grad_norm": 1.491422176361084, + "learning_rate": 2.219127086007702e-05, + "loss": 0.2499, + "step": 26048 + }, + { + "epoch": 33.4390243902439, + "grad_norm": 1.0330119132995605, + "learning_rate": 2.219084296106119e-05, + "loss": 0.2577, + "step": 26049 + }, + { + "epoch": 33.4403080872914, + "grad_norm": 1.3010642528533936, + "learning_rate": 2.2190415062045358e-05, + "loss": 0.2391, + "step": 26050 + }, + { + "epoch": 33.4415917843389, + "grad_norm": 1.3155934810638428, + "learning_rate": 2.2189987163029526e-05, + "loss": 0.2786, + "step": 26051 + }, + { + "epoch": 33.442875481386395, + "grad_norm": 1.6151231527328491, + "learning_rate": 2.2189559264013695e-05, + "loss": 0.2484, + "step": 26052 + }, + { + "epoch": 33.44415917843389, + "grad_norm": 1.0967477560043335, + "learning_rate": 2.218913136499786e-05, + "loss": 0.2714, + "step": 26053 + }, + { + "epoch": 33.44544287548138, + "grad_norm": 1.927687168121338, + "learning_rate": 2.2188703465982028e-05, + "loss": 0.2876, + "step": 26054 + }, + { + "epoch": 33.44672657252888, + "grad_norm": 1.724930763244629, + "learning_rate": 2.2188275566966196e-05, + "loss": 0.3054, + "step": 26055 + }, + { + "epoch": 33.44801026957638, + "grad_norm": 2.26261305809021, + "learning_rate": 2.2187847667950365e-05, + "loss": 0.3592, + "step": 26056 + }, + { + "epoch": 33.44929396662388, + "grad_norm": 2.7130470275878906, + "learning_rate": 2.218741976893453e-05, + "loss": 0.4252, + "step": 26057 + }, + { + "epoch": 33.450577663671375, + "grad_norm": 1.1892489194869995, + 
"learning_rate": 2.21869918699187e-05, + "loss": 0.2486, + "step": 26058 + }, + { + "epoch": 33.45186136071887, + "grad_norm": 1.6827244758605957, + "learning_rate": 2.2186563970902867e-05, + "loss": 0.2577, + "step": 26059 + }, + { + "epoch": 33.45314505776637, + "grad_norm": 0.9472147822380066, + "learning_rate": 2.2186136071887035e-05, + "loss": 0.2588, + "step": 26060 + }, + { + "epoch": 33.45442875481386, + "grad_norm": 1.6930487155914307, + "learning_rate": 2.2185708172871203e-05, + "loss": 0.2534, + "step": 26061 + }, + { + "epoch": 33.45571245186136, + "grad_norm": 0.900763750076294, + "learning_rate": 2.218528027385537e-05, + "loss": 0.2524, + "step": 26062 + }, + { + "epoch": 33.45699614890886, + "grad_norm": 1.0425853729248047, + "learning_rate": 2.218485237483954e-05, + "loss": 0.2562, + "step": 26063 + }, + { + "epoch": 33.458279845956355, + "grad_norm": 2.8922371864318848, + "learning_rate": 2.2184424475823705e-05, + "loss": 0.2389, + "step": 26064 + }, + { + "epoch": 33.45956354300385, + "grad_norm": 0.9180982708930969, + "learning_rate": 2.2183996576807874e-05, + "loss": 0.2506, + "step": 26065 + }, + { + "epoch": 33.46084724005135, + "grad_norm": 3.3438923358917236, + "learning_rate": 2.2183568677792042e-05, + "loss": 0.2708, + "step": 26066 + }, + { + "epoch": 33.46213093709884, + "grad_norm": 1.0184842348098755, + "learning_rate": 2.2183140778776207e-05, + "loss": 0.2775, + "step": 26067 + }, + { + "epoch": 33.46341463414634, + "grad_norm": 0.9224327206611633, + "learning_rate": 2.218271287976038e-05, + "loss": 0.2668, + "step": 26068 + }, + { + "epoch": 33.46469833119384, + "grad_norm": 0.9163581728935242, + "learning_rate": 2.2182284980744544e-05, + "loss": 0.259, + "step": 26069 + }, + { + "epoch": 33.465982028241335, + "grad_norm": 0.8141111135482788, + "learning_rate": 2.2181857081728712e-05, + "loss": 0.2624, + "step": 26070 + }, + { + "epoch": 33.46726572528883, + "grad_norm": 0.9215165376663208, + "learning_rate": 2.218142918271288e-05, + 
"loss": 0.233, + "step": 26071 + }, + { + "epoch": 33.46854942233633, + "grad_norm": 1.1757915019989014, + "learning_rate": 2.218100128369705e-05, + "loss": 0.2408, + "step": 26072 + }, + { + "epoch": 33.46983311938383, + "grad_norm": 1.5922397375106812, + "learning_rate": 2.2180573384681214e-05, + "loss": 0.2685, + "step": 26073 + }, + { + "epoch": 33.47111681643132, + "grad_norm": 1.133755087852478, + "learning_rate": 2.2180145485665383e-05, + "loss": 0.2396, + "step": 26074 + }, + { + "epoch": 33.47240051347882, + "grad_norm": 1.354722261428833, + "learning_rate": 2.217971758664955e-05, + "loss": 0.2334, + "step": 26075 + }, + { + "epoch": 33.473684210526315, + "grad_norm": 1.0776067972183228, + "learning_rate": 2.217928968763372e-05, + "loss": 0.2553, + "step": 26076 + }, + { + "epoch": 33.47496790757381, + "grad_norm": 0.7922641038894653, + "learning_rate": 2.2178861788617888e-05, + "loss": 0.2325, + "step": 26077 + }, + { + "epoch": 33.47625160462131, + "grad_norm": 0.9316034317016602, + "learning_rate": 2.2178433889602053e-05, + "loss": 0.263, + "step": 26078 + }, + { + "epoch": 33.47753530166881, + "grad_norm": 1.5070865154266357, + "learning_rate": 2.2178005990586225e-05, + "loss": 0.2533, + "step": 26079 + }, + { + "epoch": 33.47881899871631, + "grad_norm": 1.4679441452026367, + "learning_rate": 2.217757809157039e-05, + "loss": 0.2387, + "step": 26080 + }, + { + "epoch": 33.4801026957638, + "grad_norm": 2.127885580062866, + "learning_rate": 2.2177150192554555e-05, + "loss": 0.2217, + "step": 26081 + }, + { + "epoch": 33.481386392811295, + "grad_norm": 0.8890533447265625, + "learning_rate": 2.2176722293538727e-05, + "loss": 0.2339, + "step": 26082 + }, + { + "epoch": 33.48267008985879, + "grad_norm": 0.9479212164878845, + "learning_rate": 2.217629439452289e-05, + "loss": 0.248, + "step": 26083 + }, + { + "epoch": 33.48395378690629, + "grad_norm": 1.2474182844161987, + "learning_rate": 2.2175866495507063e-05, + "loss": 0.2274, + "step": 26084 + }, + { + 
"epoch": 33.48523748395379, + "grad_norm": 0.8007055521011353, + "learning_rate": 2.217543859649123e-05, + "loss": 0.2381, + "step": 26085 + }, + { + "epoch": 33.48652118100129, + "grad_norm": 1.2480003833770752, + "learning_rate": 2.2175010697475397e-05, + "loss": 0.2637, + "step": 26086 + }, + { + "epoch": 33.48780487804878, + "grad_norm": 1.8697879314422607, + "learning_rate": 2.2174582798459565e-05, + "loss": 0.2418, + "step": 26087 + }, + { + "epoch": 33.489088575096275, + "grad_norm": 0.8866709470748901, + "learning_rate": 2.217415489944373e-05, + "loss": 0.2487, + "step": 26088 + }, + { + "epoch": 33.49037227214377, + "grad_norm": 3.1011505126953125, + "learning_rate": 2.21737270004279e-05, + "loss": 0.2422, + "step": 26089 + }, + { + "epoch": 33.49165596919127, + "grad_norm": 1.399520754814148, + "learning_rate": 2.2173299101412067e-05, + "loss": 0.2373, + "step": 26090 + }, + { + "epoch": 33.49293966623877, + "grad_norm": 1.4800310134887695, + "learning_rate": 2.2172871202396236e-05, + "loss": 0.2443, + "step": 26091 + }, + { + "epoch": 33.49422336328627, + "grad_norm": 2.565221071243286, + "learning_rate": 2.2172443303380404e-05, + "loss": 0.2454, + "step": 26092 + }, + { + "epoch": 33.495507060333765, + "grad_norm": 1.6283074617385864, + "learning_rate": 2.217201540436457e-05, + "loss": 0.2575, + "step": 26093 + }, + { + "epoch": 33.496790757381255, + "grad_norm": 1.421424150466919, + "learning_rate": 2.2171587505348737e-05, + "loss": 0.2627, + "step": 26094 + }, + { + "epoch": 33.49807445442875, + "grad_norm": 2.3802380561828613, + "learning_rate": 2.2171159606332906e-05, + "loss": 0.2745, + "step": 26095 + }, + { + "epoch": 33.49935815147625, + "grad_norm": 1.3802005052566528, + "learning_rate": 2.2170731707317074e-05, + "loss": 0.259, + "step": 26096 + }, + { + "epoch": 33.50064184852375, + "grad_norm": 2.547182083129883, + "learning_rate": 2.217030380830124e-05, + "loss": 0.2442, + "step": 26097 + }, + { + "epoch": 33.50192554557125, + "grad_norm": 
2.9925692081451416, + "learning_rate": 2.216987590928541e-05, + "loss": 0.298, + "step": 26098 + }, + { + "epoch": 33.503209242618745, + "grad_norm": 1.514492392539978, + "learning_rate": 2.2169448010269576e-05, + "loss": 0.235, + "step": 26099 + }, + { + "epoch": 33.504492939666235, + "grad_norm": 1.238565444946289, + "learning_rate": 2.2169020111253744e-05, + "loss": 0.2133, + "step": 26100 + }, + { + "epoch": 33.50577663671373, + "grad_norm": 2.350001096725464, + "learning_rate": 2.2168592212237913e-05, + "loss": 0.2308, + "step": 26101 + }, + { + "epoch": 33.50706033376123, + "grad_norm": 1.9128646850585938, + "learning_rate": 2.2168164313222078e-05, + "loss": 0.2965, + "step": 26102 + }, + { + "epoch": 33.50834403080873, + "grad_norm": 3.549839735031128, + "learning_rate": 2.216773641420625e-05, + "loss": 0.284, + "step": 26103 + }, + { + "epoch": 33.50962772785623, + "grad_norm": 1.485061526298523, + "learning_rate": 2.2167308515190415e-05, + "loss": 0.291, + "step": 26104 + }, + { + "epoch": 33.510911424903725, + "grad_norm": 1.5310533046722412, + "learning_rate": 2.2166880616174583e-05, + "loss": 0.3013, + "step": 26105 + }, + { + "epoch": 33.51219512195122, + "grad_norm": 2.760104179382324, + "learning_rate": 2.216645271715875e-05, + "loss": 0.385, + "step": 26106 + }, + { + "epoch": 33.51347881899871, + "grad_norm": 2.644925117492676, + "learning_rate": 2.2166024818142917e-05, + "loss": 0.4294, + "step": 26107 + }, + { + "epoch": 33.51476251604621, + "grad_norm": 0.814385712146759, + "learning_rate": 2.216559691912709e-05, + "loss": 0.2617, + "step": 26108 + }, + { + "epoch": 33.51604621309371, + "grad_norm": 3.1041266918182373, + "learning_rate": 2.2165169020111253e-05, + "loss": 0.248, + "step": 26109 + }, + { + "epoch": 33.51732991014121, + "grad_norm": 0.8474292159080505, + "learning_rate": 2.2164741121095422e-05, + "loss": 0.2682, + "step": 26110 + }, + { + "epoch": 33.518613607188705, + "grad_norm": 1.9380545616149902, + "learning_rate": 
2.216431322207959e-05, + "loss": 0.2862, + "step": 26111 + }, + { + "epoch": 33.5198973042362, + "grad_norm": 1.5607129335403442, + "learning_rate": 2.216388532306376e-05, + "loss": 0.2757, + "step": 26112 + }, + { + "epoch": 33.52118100128369, + "grad_norm": 1.0298629999160767, + "learning_rate": 2.2163457424047924e-05, + "loss": 0.2661, + "step": 26113 + }, + { + "epoch": 33.52246469833119, + "grad_norm": 1.2042051553726196, + "learning_rate": 2.2163029525032092e-05, + "loss": 0.2799, + "step": 26114 + }, + { + "epoch": 33.52374839537869, + "grad_norm": 1.595040202140808, + "learning_rate": 2.216260162601626e-05, + "loss": 0.2381, + "step": 26115 + }, + { + "epoch": 33.52503209242619, + "grad_norm": 1.0809729099273682, + "learning_rate": 2.216217372700043e-05, + "loss": 0.2586, + "step": 26116 + }, + { + "epoch": 33.526315789473685, + "grad_norm": 0.8122671842575073, + "learning_rate": 2.2161745827984597e-05, + "loss": 0.2451, + "step": 26117 + }, + { + "epoch": 33.52759948652118, + "grad_norm": 1.1559898853302002, + "learning_rate": 2.2161317928968762e-05, + "loss": 0.2694, + "step": 26118 + }, + { + "epoch": 33.52888318356868, + "grad_norm": 1.3400086164474487, + "learning_rate": 2.2160890029952934e-05, + "loss": 0.2655, + "step": 26119 + }, + { + "epoch": 33.53016688061617, + "grad_norm": 1.165921926498413, + "learning_rate": 2.21604621309371e-05, + "loss": 0.2493, + "step": 26120 + }, + { + "epoch": 33.53145057766367, + "grad_norm": 0.860381543636322, + "learning_rate": 2.2160034231921264e-05, + "loss": 0.2494, + "step": 26121 + }, + { + "epoch": 33.53273427471117, + "grad_norm": 0.9062122702598572, + "learning_rate": 2.2159606332905436e-05, + "loss": 0.261, + "step": 26122 + }, + { + "epoch": 33.534017971758665, + "grad_norm": 1.0081065893173218, + "learning_rate": 2.21591784338896e-05, + "loss": 0.2583, + "step": 26123 + }, + { + "epoch": 33.53530166880616, + "grad_norm": 1.2938238382339478, + "learning_rate": 2.2158750534873773e-05, + "loss": 0.2402, + 
"step": 26124 + }, + { + "epoch": 33.53658536585366, + "grad_norm": 1.2112232446670532, + "learning_rate": 2.2158322635857938e-05, + "loss": 0.2609, + "step": 26125 + }, + { + "epoch": 33.53786906290116, + "grad_norm": 1.452523112297058, + "learning_rate": 2.2157894736842106e-05, + "loss": 0.2457, + "step": 26126 + }, + { + "epoch": 33.53915275994865, + "grad_norm": 1.652161955833435, + "learning_rate": 2.2157466837826275e-05, + "loss": 0.2572, + "step": 26127 + }, + { + "epoch": 33.54043645699615, + "grad_norm": 1.19569730758667, + "learning_rate": 2.215703893881044e-05, + "loss": 0.2243, + "step": 26128 + }, + { + "epoch": 33.541720154043645, + "grad_norm": 2.5293400287628174, + "learning_rate": 2.2156611039794608e-05, + "loss": 0.23, + "step": 26129 + }, + { + "epoch": 33.54300385109114, + "grad_norm": 2.2216694355010986, + "learning_rate": 2.2156183140778776e-05, + "loss": 0.238, + "step": 26130 + }, + { + "epoch": 33.54428754813864, + "grad_norm": 0.9565955400466919, + "learning_rate": 2.2155755241762945e-05, + "loss": 0.2395, + "step": 26131 + }, + { + "epoch": 33.54557124518614, + "grad_norm": 1.660977840423584, + "learning_rate": 2.2155327342747113e-05, + "loss": 0.2741, + "step": 26132 + }, + { + "epoch": 33.54685494223363, + "grad_norm": 2.2336926460266113, + "learning_rate": 2.2154899443731282e-05, + "loss": 0.234, + "step": 26133 + }, + { + "epoch": 33.54813863928113, + "grad_norm": 2.284557342529297, + "learning_rate": 2.2154471544715447e-05, + "loss": 0.2557, + "step": 26134 + }, + { + "epoch": 33.549422336328625, + "grad_norm": 1.1571968793869019, + "learning_rate": 2.2154043645699615e-05, + "loss": 0.2185, + "step": 26135 + }, + { + "epoch": 33.55070603337612, + "grad_norm": 1.1439096927642822, + "learning_rate": 2.2153615746683784e-05, + "loss": 0.2597, + "step": 26136 + }, + { + "epoch": 33.55198973042362, + "grad_norm": 1.3979593515396118, + "learning_rate": 2.215318784766795e-05, + "loss": 0.2223, + "step": 26137 + }, + { + "epoch": 
33.55327342747112, + "grad_norm": 0.7095888257026672, + "learning_rate": 2.215275994865212e-05, + "loss": 0.208, + "step": 26138 + }, + { + "epoch": 33.55455712451862, + "grad_norm": 1.3994444608688354, + "learning_rate": 2.2152332049636285e-05, + "loss": 0.2422, + "step": 26139 + }, + { + "epoch": 33.55584082156611, + "grad_norm": 1.5149738788604736, + "learning_rate": 2.2151904150620457e-05, + "loss": 0.2324, + "step": 26140 + }, + { + "epoch": 33.557124518613605, + "grad_norm": 1.4816769361495972, + "learning_rate": 2.2151476251604622e-05, + "loss": 0.2444, + "step": 26141 + }, + { + "epoch": 33.5584082156611, + "grad_norm": 1.418014645576477, + "learning_rate": 2.2151048352588787e-05, + "loss": 0.2378, + "step": 26142 + }, + { + "epoch": 33.5596919127086, + "grad_norm": 1.345968246459961, + "learning_rate": 2.215062045357296e-05, + "loss": 0.2478, + "step": 26143 + }, + { + "epoch": 33.5609756097561, + "grad_norm": 1.2985674142837524, + "learning_rate": 2.2150192554557124e-05, + "loss": 0.2358, + "step": 26144 + }, + { + "epoch": 33.5622593068036, + "grad_norm": 1.0789172649383545, + "learning_rate": 2.2149764655541292e-05, + "loss": 0.2448, + "step": 26145 + }, + { + "epoch": 33.563543003851095, + "grad_norm": 2.127582311630249, + "learning_rate": 2.214933675652546e-05, + "loss": 0.2367, + "step": 26146 + }, + { + "epoch": 33.564826700898585, + "grad_norm": 2.839526653289795, + "learning_rate": 2.214890885750963e-05, + "loss": 0.2358, + "step": 26147 + }, + { + "epoch": 33.56611039794608, + "grad_norm": 3.0244905948638916, + "learning_rate": 2.2148480958493798e-05, + "loss": 0.2282, + "step": 26148 + }, + { + "epoch": 33.56739409499358, + "grad_norm": 1.3656845092773438, + "learning_rate": 2.2148053059477963e-05, + "loss": 0.2628, + "step": 26149 + }, + { + "epoch": 33.56867779204108, + "grad_norm": 1.3336830139160156, + "learning_rate": 2.214762516046213e-05, + "loss": 0.2562, + "step": 26150 + }, + { + "epoch": 33.56996148908858, + "grad_norm": 
1.1604686975479126, + "learning_rate": 2.21471972614463e-05, + "loss": 0.2762, + "step": 26151 + }, + { + "epoch": 33.571245186136075, + "grad_norm": 1.1473323106765747, + "learning_rate": 2.2146769362430468e-05, + "loss": 0.2517, + "step": 26152 + }, + { + "epoch": 33.572528883183566, + "grad_norm": 2.891030788421631, + "learning_rate": 2.2146341463414633e-05, + "loss": 0.282, + "step": 26153 + }, + { + "epoch": 33.57381258023106, + "grad_norm": 4.929076194763184, + "learning_rate": 2.21459135643988e-05, + "loss": 0.3344, + "step": 26154 + }, + { + "epoch": 33.57509627727856, + "grad_norm": 2.3316965103149414, + "learning_rate": 2.214548566538297e-05, + "loss": 0.3028, + "step": 26155 + }, + { + "epoch": 33.57637997432606, + "grad_norm": 2.9857165813446045, + "learning_rate": 2.2145057766367138e-05, + "loss": 0.2799, + "step": 26156 + }, + { + "epoch": 33.57766367137356, + "grad_norm": 2.5098955631256104, + "learning_rate": 2.2144629867351307e-05, + "loss": 0.4251, + "step": 26157 + }, + { + "epoch": 33.578947368421055, + "grad_norm": 1.5995732545852661, + "learning_rate": 2.214420196833547e-05, + "loss": 0.2713, + "step": 26158 + }, + { + "epoch": 33.58023106546855, + "grad_norm": 0.8663820624351501, + "learning_rate": 2.2143774069319643e-05, + "loss": 0.2882, + "step": 26159 + }, + { + "epoch": 33.58151476251604, + "grad_norm": 1.070391058921814, + "learning_rate": 2.214334617030381e-05, + "loss": 0.2762, + "step": 26160 + }, + { + "epoch": 33.58279845956354, + "grad_norm": 1.2489691972732544, + "learning_rate": 2.2142918271287974e-05, + "loss": 0.2791, + "step": 26161 + }, + { + "epoch": 33.58408215661104, + "grad_norm": 0.8193411231040955, + "learning_rate": 2.2142490372272145e-05, + "loss": 0.2687, + "step": 26162 + }, + { + "epoch": 33.58536585365854, + "grad_norm": 1.730386734008789, + "learning_rate": 2.214206247325631e-05, + "loss": 0.2704, + "step": 26163 + }, + { + "epoch": 33.586649550706035, + "grad_norm": 0.8882891535758972, + "learning_rate": 
2.2141634574240482e-05, + "loss": 0.2574, + "step": 26164 + }, + { + "epoch": 33.58793324775353, + "grad_norm": 1.9640843868255615, + "learning_rate": 2.2141206675224647e-05, + "loss": 0.247, + "step": 26165 + }, + { + "epoch": 33.589216944801024, + "grad_norm": 1.019410490989685, + "learning_rate": 2.2140778776208816e-05, + "loss": 0.2545, + "step": 26166 + }, + { + "epoch": 33.59050064184852, + "grad_norm": 1.0477755069732666, + "learning_rate": 2.2140350877192984e-05, + "loss": 0.2668, + "step": 26167 + }, + { + "epoch": 33.59178433889602, + "grad_norm": 1.6938247680664062, + "learning_rate": 2.213992297817715e-05, + "loss": 0.2488, + "step": 26168 + }, + { + "epoch": 33.59306803594352, + "grad_norm": 1.2701637744903564, + "learning_rate": 2.2139495079161317e-05, + "loss": 0.2529, + "step": 26169 + }, + { + "epoch": 33.594351732991015, + "grad_norm": 1.534432291984558, + "learning_rate": 2.2139067180145486e-05, + "loss": 0.2544, + "step": 26170 + }, + { + "epoch": 33.59563543003851, + "grad_norm": 1.8722351789474487, + "learning_rate": 2.2138639281129654e-05, + "loss": 0.2549, + "step": 26171 + }, + { + "epoch": 33.59691912708601, + "grad_norm": 0.8120633959770203, + "learning_rate": 2.2138211382113823e-05, + "loss": 0.2144, + "step": 26172 + }, + { + "epoch": 33.5982028241335, + "grad_norm": 0.9258701801300049, + "learning_rate": 2.213778348309799e-05, + "loss": 0.2589, + "step": 26173 + }, + { + "epoch": 33.599486521181, + "grad_norm": 1.3283339738845825, + "learning_rate": 2.2137355584082156e-05, + "loss": 0.2494, + "step": 26174 + }, + { + "epoch": 33.6007702182285, + "grad_norm": 0.7328206896781921, + "learning_rate": 2.2136927685066325e-05, + "loss": 0.2248, + "step": 26175 + }, + { + "epoch": 33.602053915275995, + "grad_norm": 0.8894224762916565, + "learning_rate": 2.2136499786050493e-05, + "loss": 0.2549, + "step": 26176 + }, + { + "epoch": 33.60333761232349, + "grad_norm": 1.4713040590286255, + "learning_rate": 2.2136071887034658e-05, + "loss": 0.2468, 
+ "step": 26177 + }, + { + "epoch": 33.60462130937099, + "grad_norm": 1.252974033355713, + "learning_rate": 2.213564398801883e-05, + "loss": 0.2577, + "step": 26178 + }, + { + "epoch": 33.60590500641848, + "grad_norm": 1.521982192993164, + "learning_rate": 2.2135216089002995e-05, + "loss": 0.2169, + "step": 26179 + }, + { + "epoch": 33.60718870346598, + "grad_norm": 3.539811849594116, + "learning_rate": 2.2134788189987167e-05, + "loss": 0.2573, + "step": 26180 + }, + { + "epoch": 33.60847240051348, + "grad_norm": 1.5213472843170166, + "learning_rate": 2.213436029097133e-05, + "loss": 0.2258, + "step": 26181 + }, + { + "epoch": 33.609756097560975, + "grad_norm": 1.8879965543746948, + "learning_rate": 2.2133932391955497e-05, + "loss": 0.259, + "step": 26182 + }, + { + "epoch": 33.61103979460847, + "grad_norm": 1.6701116561889648, + "learning_rate": 2.213350449293967e-05, + "loss": 0.2633, + "step": 26183 + }, + { + "epoch": 33.61232349165597, + "grad_norm": 1.1968177556991577, + "learning_rate": 2.2133076593923833e-05, + "loss": 0.2353, + "step": 26184 + }, + { + "epoch": 33.61360718870347, + "grad_norm": 3.0039353370666504, + "learning_rate": 2.2132648694908002e-05, + "loss": 0.2383, + "step": 26185 + }, + { + "epoch": 33.61489088575096, + "grad_norm": 1.2830778360366821, + "learning_rate": 2.213222079589217e-05, + "loss": 0.2412, + "step": 26186 + }, + { + "epoch": 33.61617458279846, + "grad_norm": 1.291877031326294, + "learning_rate": 2.213179289687634e-05, + "loss": 0.2194, + "step": 26187 + }, + { + "epoch": 33.617458279845955, + "grad_norm": 1.6007680892944336, + "learning_rate": 2.2131364997860507e-05, + "loss": 0.2379, + "step": 26188 + }, + { + "epoch": 33.61874197689345, + "grad_norm": 1.6027045249938965, + "learning_rate": 2.2130937098844672e-05, + "loss": 0.2332, + "step": 26189 + }, + { + "epoch": 33.62002567394095, + "grad_norm": 2.615690231323242, + "learning_rate": 2.213050919982884e-05, + "loss": 0.2553, + "step": 26190 + }, + { + "epoch": 
33.62130937098845, + "grad_norm": 1.3018263578414917, + "learning_rate": 2.213008130081301e-05, + "loss": 0.2793, + "step": 26191 + }, + { + "epoch": 33.62259306803595, + "grad_norm": 1.5057036876678467, + "learning_rate": 2.2129653401797177e-05, + "loss": 0.272, + "step": 26192 + }, + { + "epoch": 33.62387676508344, + "grad_norm": 1.7047096490859985, + "learning_rate": 2.2129225502781342e-05, + "loss": 0.2315, + "step": 26193 + }, + { + "epoch": 33.625160462130935, + "grad_norm": 2.2100133895874023, + "learning_rate": 2.2128797603765514e-05, + "loss": 0.224, + "step": 26194 + }, + { + "epoch": 33.62644415917843, + "grad_norm": 1.6518193483352661, + "learning_rate": 2.212836970474968e-05, + "loss": 0.2408, + "step": 26195 + }, + { + "epoch": 33.62772785622593, + "grad_norm": 3.2556393146514893, + "learning_rate": 2.2127941805733848e-05, + "loss": 0.2489, + "step": 26196 + }, + { + "epoch": 33.62901155327343, + "grad_norm": 0.8379652500152588, + "learning_rate": 2.2127513906718016e-05, + "loss": 0.216, + "step": 26197 + }, + { + "epoch": 33.63029525032093, + "grad_norm": 1.099861741065979, + "learning_rate": 2.212708600770218e-05, + "loss": 0.2483, + "step": 26198 + }, + { + "epoch": 33.63157894736842, + "grad_norm": 1.2666070461273193, + "learning_rate": 2.2126658108686353e-05, + "loss": 0.2946, + "step": 26199 + }, + { + "epoch": 33.632862644415916, + "grad_norm": 1.676153540611267, + "learning_rate": 2.2126230209670518e-05, + "loss": 0.245, + "step": 26200 + }, + { + "epoch": 33.63414634146341, + "grad_norm": 2.7251667976379395, + "learning_rate": 2.2125802310654686e-05, + "loss": 0.2882, + "step": 26201 + }, + { + "epoch": 33.63543003851091, + "grad_norm": 1.3141652345657349, + "learning_rate": 2.2125374411638855e-05, + "loss": 0.2532, + "step": 26202 + }, + { + "epoch": 33.63671373555841, + "grad_norm": 1.9076987504959106, + "learning_rate": 2.212494651262302e-05, + "loss": 0.3101, + "step": 26203 + }, + { + "epoch": 33.63799743260591, + "grad_norm": 
1.0945662260055542, + "learning_rate": 2.212451861360719e-05, + "loss": 0.2543, + "step": 26204 + }, + { + "epoch": 33.639281129653405, + "grad_norm": 2.2872369289398193, + "learning_rate": 2.2124090714591357e-05, + "loss": 0.2765, + "step": 26205 + }, + { + "epoch": 33.640564826700896, + "grad_norm": 1.8603731393814087, + "learning_rate": 2.2123662815575525e-05, + "loss": 0.3558, + "step": 26206 + }, + { + "epoch": 33.64184852374839, + "grad_norm": 2.7227230072021484, + "learning_rate": 2.2123234916559693e-05, + "loss": 0.4566, + "step": 26207 + }, + { + "epoch": 33.64313222079589, + "grad_norm": 1.0318589210510254, + "learning_rate": 2.2122807017543862e-05, + "loss": 0.2778, + "step": 26208 + }, + { + "epoch": 33.64441591784339, + "grad_norm": 2.1632425785064697, + "learning_rate": 2.2122379118528027e-05, + "loss": 0.2647, + "step": 26209 + }, + { + "epoch": 33.64569961489089, + "grad_norm": 0.7737792134284973, + "learning_rate": 2.2121951219512195e-05, + "loss": 0.2529, + "step": 26210 + }, + { + "epoch": 33.646983311938385, + "grad_norm": 1.3802930116653442, + "learning_rate": 2.2121523320496364e-05, + "loss": 0.2656, + "step": 26211 + }, + { + "epoch": 33.64826700898588, + "grad_norm": 1.1646313667297363, + "learning_rate": 2.2121095421480532e-05, + "loss": 0.2534, + "step": 26212 + }, + { + "epoch": 33.649550706033374, + "grad_norm": 0.862859308719635, + "learning_rate": 2.21206675224647e-05, + "loss": 0.2198, + "step": 26213 + }, + { + "epoch": 33.65083440308087, + "grad_norm": 1.1098498106002808, + "learning_rate": 2.2120239623448865e-05, + "loss": 0.2648, + "step": 26214 + }, + { + "epoch": 33.65211810012837, + "grad_norm": 1.141960620880127, + "learning_rate": 2.2119811724433037e-05, + "loss": 0.2473, + "step": 26215 + }, + { + "epoch": 33.65340179717587, + "grad_norm": 0.6805545687675476, + "learning_rate": 2.2119383825417202e-05, + "loss": 0.2527, + "step": 26216 + }, + { + "epoch": 33.654685494223365, + "grad_norm": 1.2465431690216064, + 
"learning_rate": 2.2118955926401367e-05, + "loss": 0.2555, + "step": 26217 + }, + { + "epoch": 33.65596919127086, + "grad_norm": 1.4082571268081665, + "learning_rate": 2.211852802738554e-05, + "loss": 0.2367, + "step": 26218 + }, + { + "epoch": 33.657252888318354, + "grad_norm": 0.8538491725921631, + "learning_rate": 2.2118100128369704e-05, + "loss": 0.2678, + "step": 26219 + }, + { + "epoch": 33.65853658536585, + "grad_norm": 0.8462833166122437, + "learning_rate": 2.2117672229353876e-05, + "loss": 0.2493, + "step": 26220 + }, + { + "epoch": 33.65982028241335, + "grad_norm": 1.6989907026290894, + "learning_rate": 2.211724433033804e-05, + "loss": 0.2558, + "step": 26221 + }, + { + "epoch": 33.66110397946085, + "grad_norm": 0.7829534411430359, + "learning_rate": 2.2116816431322206e-05, + "loss": 0.2473, + "step": 26222 + }, + { + "epoch": 33.662387676508345, + "grad_norm": 1.5980048179626465, + "learning_rate": 2.2116388532306378e-05, + "loss": 0.2287, + "step": 26223 + }, + { + "epoch": 33.66367137355584, + "grad_norm": 0.8222840428352356, + "learning_rate": 2.2115960633290543e-05, + "loss": 0.2441, + "step": 26224 + }, + { + "epoch": 33.66495507060334, + "grad_norm": 0.9741033911705017, + "learning_rate": 2.211553273427471e-05, + "loss": 0.2516, + "step": 26225 + }, + { + "epoch": 33.66623876765083, + "grad_norm": 1.115181565284729, + "learning_rate": 2.211510483525888e-05, + "loss": 0.2621, + "step": 26226 + }, + { + "epoch": 33.66752246469833, + "grad_norm": 1.091841220855713, + "learning_rate": 2.2114676936243048e-05, + "loss": 0.2651, + "step": 26227 + }, + { + "epoch": 33.66880616174583, + "grad_norm": 5.907108306884766, + "learning_rate": 2.2114249037227216e-05, + "loss": 0.2316, + "step": 26228 + }, + { + "epoch": 33.670089858793325, + "grad_norm": 0.8672892451286316, + "learning_rate": 2.211382113821138e-05, + "loss": 0.279, + "step": 26229 + }, + { + "epoch": 33.67137355584082, + "grad_norm": 1.500654697418213, + "learning_rate": 2.211339323919555e-05, + 
"loss": 0.2618, + "step": 26230 + }, + { + "epoch": 33.67265725288832, + "grad_norm": 0.9205014109611511, + "learning_rate": 2.2112965340179718e-05, + "loss": 0.2698, + "step": 26231 + }, + { + "epoch": 33.67394094993581, + "grad_norm": 0.9182239770889282, + "learning_rate": 2.2112537441163887e-05, + "loss": 0.2392, + "step": 26232 + }, + { + "epoch": 33.67522464698331, + "grad_norm": 1.0026732683181763, + "learning_rate": 2.2112109542148052e-05, + "loss": 0.2499, + "step": 26233 + }, + { + "epoch": 33.67650834403081, + "grad_norm": 1.7005535364151, + "learning_rate": 2.2111681643132224e-05, + "loss": 0.2517, + "step": 26234 + }, + { + "epoch": 33.677792041078305, + "grad_norm": 1.580970287322998, + "learning_rate": 2.211125374411639e-05, + "loss": 0.2421, + "step": 26235 + }, + { + "epoch": 33.6790757381258, + "grad_norm": 1.9656000137329102, + "learning_rate": 2.2110825845100557e-05, + "loss": 0.2675, + "step": 26236 + }, + { + "epoch": 33.6803594351733, + "grad_norm": 1.1740103960037231, + "learning_rate": 2.2110397946084725e-05, + "loss": 0.2173, + "step": 26237 + }, + { + "epoch": 33.6816431322208, + "grad_norm": 2.4012255668640137, + "learning_rate": 2.210997004706889e-05, + "loss": 0.2639, + "step": 26238 + }, + { + "epoch": 33.68292682926829, + "grad_norm": 1.205381989479065, + "learning_rate": 2.2109542148053062e-05, + "loss": 0.253, + "step": 26239 + }, + { + "epoch": 33.68421052631579, + "grad_norm": 1.4401159286499023, + "learning_rate": 2.2109114249037227e-05, + "loss": 0.2572, + "step": 26240 + }, + { + "epoch": 33.685494223363285, + "grad_norm": 1.4578356742858887, + "learning_rate": 2.2108686350021396e-05, + "loss": 0.2282, + "step": 26241 + }, + { + "epoch": 33.68677792041078, + "grad_norm": 1.3610551357269287, + "learning_rate": 2.2108258451005564e-05, + "loss": 0.22, + "step": 26242 + }, + { + "epoch": 33.68806161745828, + "grad_norm": 0.9759379029273987, + "learning_rate": 2.210783055198973e-05, + "loss": 0.2384, + "step": 26243 + }, + { + 
"epoch": 33.68934531450578, + "grad_norm": 1.4781477451324463, + "learning_rate": 2.2107402652973897e-05, + "loss": 0.2329, + "step": 26244 + }, + { + "epoch": 33.69062901155327, + "grad_norm": 1.3895354270935059, + "learning_rate": 2.2106974753958066e-05, + "loss": 0.2665, + "step": 26245 + }, + { + "epoch": 33.69191270860077, + "grad_norm": 1.1566680669784546, + "learning_rate": 2.2106546854942234e-05, + "loss": 0.2695, + "step": 26246 + }, + { + "epoch": 33.693196405648266, + "grad_norm": 1.3406566381454468, + "learning_rate": 2.2106118955926403e-05, + "loss": 0.2651, + "step": 26247 + }, + { + "epoch": 33.69448010269576, + "grad_norm": 1.670141339302063, + "learning_rate": 2.210569105691057e-05, + "loss": 0.2718, + "step": 26248 + }, + { + "epoch": 33.69576379974326, + "grad_norm": 1.1902581453323364, + "learning_rate": 2.2105263157894736e-05, + "loss": 0.298, + "step": 26249 + }, + { + "epoch": 33.69704749679076, + "grad_norm": 1.155949354171753, + "learning_rate": 2.2104835258878905e-05, + "loss": 0.2604, + "step": 26250 + }, + { + "epoch": 33.69833119383826, + "grad_norm": 1.655573844909668, + "learning_rate": 2.2104407359863073e-05, + "loss": 0.2658, + "step": 26251 + }, + { + "epoch": 33.69961489088575, + "grad_norm": 2.310218334197998, + "learning_rate": 2.2103979460847238e-05, + "loss": 0.2414, + "step": 26252 + }, + { + "epoch": 33.700898587933246, + "grad_norm": 2.016258716583252, + "learning_rate": 2.210355156183141e-05, + "loss": 0.291, + "step": 26253 + }, + { + "epoch": 33.70218228498074, + "grad_norm": 1.5949383974075317, + "learning_rate": 2.2103123662815575e-05, + "loss": 0.304, + "step": 26254 + }, + { + "epoch": 33.70346598202824, + "grad_norm": 1.2144817113876343, + "learning_rate": 2.2102695763799747e-05, + "loss": 0.2777, + "step": 26255 + }, + { + "epoch": 33.70474967907574, + "grad_norm": 1.2574111223220825, + "learning_rate": 2.210226786478391e-05, + "loss": 0.3074, + "step": 26256 + }, + { + "epoch": 33.70603337612324, + "grad_norm": 
5.200290203094482, + "learning_rate": 2.2101839965768077e-05, + "loss": 0.4022, + "step": 26257 + }, + { + "epoch": 33.707317073170735, + "grad_norm": 1.3686028718948364, + "learning_rate": 2.210141206675225e-05, + "loss": 0.2554, + "step": 26258 + }, + { + "epoch": 33.708600770218226, + "grad_norm": 1.0370731353759766, + "learning_rate": 2.2100984167736413e-05, + "loss": 0.2633, + "step": 26259 + }, + { + "epoch": 33.709884467265724, + "grad_norm": 7.130343437194824, + "learning_rate": 2.2100556268720582e-05, + "loss": 0.251, + "step": 26260 + }, + { + "epoch": 33.71116816431322, + "grad_norm": 0.885666012763977, + "learning_rate": 2.210012836970475e-05, + "loss": 0.2644, + "step": 26261 + }, + { + "epoch": 33.71245186136072, + "grad_norm": 1.042997121810913, + "learning_rate": 2.209970047068892e-05, + "loss": 0.2439, + "step": 26262 + }, + { + "epoch": 33.71373555840822, + "grad_norm": 0.9701763987541199, + "learning_rate": 2.2099272571673087e-05, + "loss": 0.2535, + "step": 26263 + }, + { + "epoch": 33.715019255455715, + "grad_norm": 1.096292495727539, + "learning_rate": 2.2098844672657252e-05, + "loss": 0.2511, + "step": 26264 + }, + { + "epoch": 33.716302952503206, + "grad_norm": 1.9790304899215698, + "learning_rate": 2.209841677364142e-05, + "loss": 0.2668, + "step": 26265 + }, + { + "epoch": 33.717586649550704, + "grad_norm": 0.991277277469635, + "learning_rate": 2.209798887462559e-05, + "loss": 0.2713, + "step": 26266 + }, + { + "epoch": 33.7188703465982, + "grad_norm": 1.4814121723175049, + "learning_rate": 2.2097560975609757e-05, + "loss": 0.2689, + "step": 26267 + }, + { + "epoch": 33.7201540436457, + "grad_norm": 2.800797700881958, + "learning_rate": 2.2097133076593922e-05, + "loss": 0.2649, + "step": 26268 + }, + { + "epoch": 33.7214377406932, + "grad_norm": 1.1545462608337402, + "learning_rate": 2.2096705177578094e-05, + "loss": 0.274, + "step": 26269 + }, + { + "epoch": 33.722721437740695, + "grad_norm": 0.8060817122459412, + "learning_rate": 
2.209627727856226e-05, + "loss": 0.2616, + "step": 26270 + }, + { + "epoch": 33.72400513478819, + "grad_norm": 1.2330042123794556, + "learning_rate": 2.2095849379546428e-05, + "loss": 0.2592, + "step": 26271 + }, + { + "epoch": 33.725288831835684, + "grad_norm": 3.3584723472595215, + "learning_rate": 2.2095421480530596e-05, + "loss": 0.2508, + "step": 26272 + }, + { + "epoch": 33.72657252888318, + "grad_norm": 1.3855241537094116, + "learning_rate": 2.209499358151476e-05, + "loss": 0.2631, + "step": 26273 + }, + { + "epoch": 33.72785622593068, + "grad_norm": 1.2595609426498413, + "learning_rate": 2.2094565682498933e-05, + "loss": 0.2525, + "step": 26274 + }, + { + "epoch": 33.72913992297818, + "grad_norm": 1.3520668745040894, + "learning_rate": 2.2094137783483098e-05, + "loss": 0.2519, + "step": 26275 + }, + { + "epoch": 33.730423620025675, + "grad_norm": 0.8859642148017883, + "learning_rate": 2.2093709884467266e-05, + "loss": 0.2354, + "step": 26276 + }, + { + "epoch": 33.73170731707317, + "grad_norm": 0.807699978351593, + "learning_rate": 2.2093281985451435e-05, + "loss": 0.2633, + "step": 26277 + }, + { + "epoch": 33.73299101412067, + "grad_norm": 3.060314655303955, + "learning_rate": 2.20928540864356e-05, + "loss": 0.2422, + "step": 26278 + }, + { + "epoch": 33.73427471116816, + "grad_norm": 1.4116560220718384, + "learning_rate": 2.209242618741977e-05, + "loss": 0.2566, + "step": 26279 + }, + { + "epoch": 33.73555840821566, + "grad_norm": 1.0263214111328125, + "learning_rate": 2.2091998288403937e-05, + "loss": 0.2474, + "step": 26280 + }, + { + "epoch": 33.73684210526316, + "grad_norm": 1.0876494646072388, + "learning_rate": 2.2091570389388105e-05, + "loss": 0.2507, + "step": 26281 + }, + { + "epoch": 33.738125802310655, + "grad_norm": 2.3923940658569336, + "learning_rate": 2.2091142490372273e-05, + "loss": 0.237, + "step": 26282 + }, + { + "epoch": 33.73940949935815, + "grad_norm": 1.392951250076294, + "learning_rate": 2.209071459135644e-05, + "loss": 0.2556, + 
"step": 26283 + }, + { + "epoch": 33.74069319640565, + "grad_norm": 1.0864437818527222, + "learning_rate": 2.2090286692340607e-05, + "loss": 0.241, + "step": 26284 + }, + { + "epoch": 33.74197689345314, + "grad_norm": 1.2332284450531006, + "learning_rate": 2.2089858793324775e-05, + "loss": 0.2344, + "step": 26285 + }, + { + "epoch": 33.74326059050064, + "grad_norm": 0.8821853995323181, + "learning_rate": 2.2089430894308944e-05, + "loss": 0.2222, + "step": 26286 + }, + { + "epoch": 33.74454428754814, + "grad_norm": 1.649795651435852, + "learning_rate": 2.2089002995293112e-05, + "loss": 0.238, + "step": 26287 + }, + { + "epoch": 33.745827984595635, + "grad_norm": 1.2984846830368042, + "learning_rate": 2.208857509627728e-05, + "loss": 0.2441, + "step": 26288 + }, + { + "epoch": 33.74711168164313, + "grad_norm": 0.9657024145126343, + "learning_rate": 2.2088147197261446e-05, + "loss": 0.2347, + "step": 26289 + }, + { + "epoch": 33.74839537869063, + "grad_norm": 0.9885977506637573, + "learning_rate": 2.2087719298245614e-05, + "loss": 0.2742, + "step": 26290 + }, + { + "epoch": 33.74967907573813, + "grad_norm": 0.9578459858894348, + "learning_rate": 2.2087291399229782e-05, + "loss": 0.2488, + "step": 26291 + }, + { + "epoch": 33.75096277278562, + "grad_norm": 1.1926178932189941, + "learning_rate": 2.2086863500213947e-05, + "loss": 0.2578, + "step": 26292 + }, + { + "epoch": 33.75224646983312, + "grad_norm": 1.6065661907196045, + "learning_rate": 2.208643560119812e-05, + "loss": 0.237, + "step": 26293 + }, + { + "epoch": 33.753530166880616, + "grad_norm": 1.041281819343567, + "learning_rate": 2.2086007702182284e-05, + "loss": 0.2516, + "step": 26294 + }, + { + "epoch": 33.75481386392811, + "grad_norm": 1.241539478302002, + "learning_rate": 2.2085579803166456e-05, + "loss": 0.2836, + "step": 26295 + }, + { + "epoch": 33.75609756097561, + "grad_norm": 1.513052225112915, + "learning_rate": 2.208515190415062e-05, + "loss": 0.2456, + "step": 26296 + }, + { + "epoch": 
33.75738125802311, + "grad_norm": 1.0995088815689087, + "learning_rate": 2.2084724005134786e-05, + "loss": 0.26, + "step": 26297 + }, + { + "epoch": 33.7586649550706, + "grad_norm": 1.957861304283142, + "learning_rate": 2.2084296106118958e-05, + "loss": 0.2612, + "step": 26298 + }, + { + "epoch": 33.7599486521181, + "grad_norm": 1.5588761568069458, + "learning_rate": 2.2083868207103123e-05, + "loss": 0.2749, + "step": 26299 + }, + { + "epoch": 33.761232349165596, + "grad_norm": 1.1591860055923462, + "learning_rate": 2.208344030808729e-05, + "loss": 0.2765, + "step": 26300 + }, + { + "epoch": 33.76251604621309, + "grad_norm": 1.7180273532867432, + "learning_rate": 2.208301240907146e-05, + "loss": 0.2654, + "step": 26301 + }, + { + "epoch": 33.76379974326059, + "grad_norm": 1.1877845525741577, + "learning_rate": 2.2082584510055628e-05, + "loss": 0.2782, + "step": 26302 + }, + { + "epoch": 33.76508344030809, + "grad_norm": 1.0447806119918823, + "learning_rate": 2.2082156611039797e-05, + "loss": 0.2637, + "step": 26303 + }, + { + "epoch": 33.76636713735559, + "grad_norm": 1.4692697525024414, + "learning_rate": 2.208172871202396e-05, + "loss": 0.2636, + "step": 26304 + }, + { + "epoch": 33.76765083440308, + "grad_norm": 1.3174173831939697, + "learning_rate": 2.208130081300813e-05, + "loss": 0.308, + "step": 26305 + }, + { + "epoch": 33.768934531450576, + "grad_norm": 1.7572237253189087, + "learning_rate": 2.20808729139923e-05, + "loss": 0.299, + "step": 26306 + }, + { + "epoch": 33.770218228498074, + "grad_norm": 5.497968673706055, + "learning_rate": 2.2080445014976467e-05, + "loss": 0.4263, + "step": 26307 + }, + { + "epoch": 33.77150192554557, + "grad_norm": 0.9053803086280823, + "learning_rate": 2.2080017115960632e-05, + "loss": 0.2552, + "step": 26308 + }, + { + "epoch": 33.77278562259307, + "grad_norm": 0.6241247057914734, + "learning_rate": 2.2079589216944804e-05, + "loss": 0.2574, + "step": 26309 + }, + { + "epoch": 33.77406931964057, + "grad_norm": 
0.8685179352760315, + "learning_rate": 2.207916131792897e-05, + "loss": 0.2611, + "step": 26310 + }, + { + "epoch": 33.775353016688065, + "grad_norm": 0.8835586905479431, + "learning_rate": 2.2078733418913137e-05, + "loss": 0.2725, + "step": 26311 + }, + { + "epoch": 33.776636713735556, + "grad_norm": 0.9098129868507385, + "learning_rate": 2.2078305519897305e-05, + "loss": 0.2505, + "step": 26312 + }, + { + "epoch": 33.777920410783054, + "grad_norm": 0.7214182019233704, + "learning_rate": 2.207787762088147e-05, + "loss": 0.252, + "step": 26313 + }, + { + "epoch": 33.77920410783055, + "grad_norm": 0.7758545279502869, + "learning_rate": 2.2077449721865642e-05, + "loss": 0.2461, + "step": 26314 + }, + { + "epoch": 33.78048780487805, + "grad_norm": 2.971036434173584, + "learning_rate": 2.2077021822849807e-05, + "loss": 0.2584, + "step": 26315 + }, + { + "epoch": 33.78177150192555, + "grad_norm": 1.522864580154419, + "learning_rate": 2.2076593923833976e-05, + "loss": 0.2476, + "step": 26316 + }, + { + "epoch": 33.783055198973045, + "grad_norm": 1.009871482849121, + "learning_rate": 2.2076166024818144e-05, + "loss": 0.2323, + "step": 26317 + }, + { + "epoch": 33.784338896020536, + "grad_norm": 1.936385154724121, + "learning_rate": 2.207573812580231e-05, + "loss": 0.2498, + "step": 26318 + }, + { + "epoch": 33.785622593068034, + "grad_norm": 1.1909936666488647, + "learning_rate": 2.207531022678648e-05, + "loss": 0.2648, + "step": 26319 + }, + { + "epoch": 33.78690629011553, + "grad_norm": 0.9519599676132202, + "learning_rate": 2.2074882327770646e-05, + "loss": 0.2539, + "step": 26320 + }, + { + "epoch": 33.78818998716303, + "grad_norm": 4.486444473266602, + "learning_rate": 2.2074454428754814e-05, + "loss": 0.2584, + "step": 26321 + }, + { + "epoch": 33.78947368421053, + "grad_norm": 1.0634045600891113, + "learning_rate": 2.2074026529738983e-05, + "loss": 0.2641, + "step": 26322 + }, + { + "epoch": 33.790757381258025, + "grad_norm": 1.2888069152832031, + "learning_rate": 
2.207359863072315e-05, + "loss": 0.273, + "step": 26323 + }, + { + "epoch": 33.79204107830552, + "grad_norm": 1.5951837301254272, + "learning_rate": 2.2073170731707316e-05, + "loss": 0.2413, + "step": 26324 + }, + { + "epoch": 33.793324775353014, + "grad_norm": 0.937344491481781, + "learning_rate": 2.2072742832691485e-05, + "loss": 0.2188, + "step": 26325 + }, + { + "epoch": 33.79460847240051, + "grad_norm": 1.8648526668548584, + "learning_rate": 2.2072314933675653e-05, + "loss": 0.2453, + "step": 26326 + }, + { + "epoch": 33.79589216944801, + "grad_norm": 1.1539208889007568, + "learning_rate": 2.207188703465982e-05, + "loss": 0.252, + "step": 26327 + }, + { + "epoch": 33.79717586649551, + "grad_norm": 0.939935564994812, + "learning_rate": 2.207145913564399e-05, + "loss": 0.2495, + "step": 26328 + }, + { + "epoch": 33.798459563543005, + "grad_norm": 1.388474941253662, + "learning_rate": 2.2071031236628155e-05, + "loss": 0.2394, + "step": 26329 + }, + { + "epoch": 33.7997432605905, + "grad_norm": 0.7734745740890503, + "learning_rate": 2.2070603337612327e-05, + "loss": 0.2145, + "step": 26330 + }, + { + "epoch": 33.801026957637994, + "grad_norm": 1.5032458305358887, + "learning_rate": 2.2070175438596492e-05, + "loss": 0.2417, + "step": 26331 + }, + { + "epoch": 33.80231065468549, + "grad_norm": 1.033115029335022, + "learning_rate": 2.2069747539580657e-05, + "loss": 0.2507, + "step": 26332 + }, + { + "epoch": 33.80359435173299, + "grad_norm": 1.4424359798431396, + "learning_rate": 2.206931964056483e-05, + "loss": 0.2422, + "step": 26333 + }, + { + "epoch": 33.80487804878049, + "grad_norm": 1.1406331062316895, + "learning_rate": 2.2068891741548994e-05, + "loss": 0.2555, + "step": 26334 + }, + { + "epoch": 33.806161745827985, + "grad_norm": 0.8787803053855896, + "learning_rate": 2.2068463842533165e-05, + "loss": 0.2136, + "step": 26335 + }, + { + "epoch": 33.80744544287548, + "grad_norm": 0.8672524094581604, + "learning_rate": 2.206803594351733e-05, + "loss": 0.2273, + 
"step": 26336 + }, + { + "epoch": 33.80872913992298, + "grad_norm": 1.4914417266845703, + "learning_rate": 2.20676080445015e-05, + "loss": 0.2483, + "step": 26337 + }, + { + "epoch": 33.81001283697047, + "grad_norm": 3.518368721008301, + "learning_rate": 2.2067180145485667e-05, + "loss": 0.2507, + "step": 26338 + }, + { + "epoch": 33.81129653401797, + "grad_norm": 2.6931865215301514, + "learning_rate": 2.2066752246469832e-05, + "loss": 0.2311, + "step": 26339 + }, + { + "epoch": 33.81258023106547, + "grad_norm": 1.4647654294967651, + "learning_rate": 2.2066324347454e-05, + "loss": 0.2541, + "step": 26340 + }, + { + "epoch": 33.813863928112966, + "grad_norm": 0.9174366593360901, + "learning_rate": 2.206589644843817e-05, + "loss": 0.2304, + "step": 26341 + }, + { + "epoch": 33.81514762516046, + "grad_norm": 1.2361639738082886, + "learning_rate": 2.2065468549422337e-05, + "loss": 0.2581, + "step": 26342 + }, + { + "epoch": 33.81643132220796, + "grad_norm": 2.537550449371338, + "learning_rate": 2.2065040650406506e-05, + "loss": 0.2705, + "step": 26343 + }, + { + "epoch": 33.81771501925546, + "grad_norm": 2.8505685329437256, + "learning_rate": 2.206461275139067e-05, + "loss": 0.2608, + "step": 26344 + }, + { + "epoch": 33.81899871630295, + "grad_norm": 1.2636183500289917, + "learning_rate": 2.206418485237484e-05, + "loss": 0.2502, + "step": 26345 + }, + { + "epoch": 33.82028241335045, + "grad_norm": 1.567298173904419, + "learning_rate": 2.2063756953359008e-05, + "loss": 0.2627, + "step": 26346 + }, + { + "epoch": 33.821566110397946, + "grad_norm": 1.236162781715393, + "learning_rate": 2.2063329054343176e-05, + "loss": 0.2256, + "step": 26347 + }, + { + "epoch": 33.822849807445444, + "grad_norm": 1.4266993999481201, + "learning_rate": 2.206290115532734e-05, + "loss": 0.2616, + "step": 26348 + }, + { + "epoch": 33.82413350449294, + "grad_norm": 1.0907561779022217, + "learning_rate": 2.2062473256311513e-05, + "loss": 0.2859, + "step": 26349 + }, + { + "epoch": 
33.82541720154044, + "grad_norm": 2.2045252323150635, + "learning_rate": 2.2062045357295678e-05, + "loss": 0.2461, + "step": 26350 + }, + { + "epoch": 33.82670089858793, + "grad_norm": 1.7905784845352173, + "learning_rate": 2.2061617458279846e-05, + "loss": 0.2612, + "step": 26351 + }, + { + "epoch": 33.82798459563543, + "grad_norm": 1.5474824905395508, + "learning_rate": 2.2061189559264015e-05, + "loss": 0.2753, + "step": 26352 + }, + { + "epoch": 33.829268292682926, + "grad_norm": 3.5499520301818848, + "learning_rate": 2.206076166024818e-05, + "loss": 0.2858, + "step": 26353 + }, + { + "epoch": 33.830551989730424, + "grad_norm": 2.3891794681549072, + "learning_rate": 2.206033376123235e-05, + "loss": 0.262, + "step": 26354 + }, + { + "epoch": 33.83183568677792, + "grad_norm": 4.714771747589111, + "learning_rate": 2.2059905862216517e-05, + "loss": 0.3169, + "step": 26355 + }, + { + "epoch": 33.83311938382542, + "grad_norm": 2.3251566886901855, + "learning_rate": 2.2059477963200685e-05, + "loss": 0.3406, + "step": 26356 + }, + { + "epoch": 33.83440308087292, + "grad_norm": 3.299731731414795, + "learning_rate": 2.2059050064184853e-05, + "loss": 0.4134, + "step": 26357 + }, + { + "epoch": 33.83568677792041, + "grad_norm": 3.6517834663391113, + "learning_rate": 2.205862216516902e-05, + "loss": 0.264, + "step": 26358 + }, + { + "epoch": 33.836970474967906, + "grad_norm": 0.7421845197677612, + "learning_rate": 2.205819426615319e-05, + "loss": 0.2663, + "step": 26359 + }, + { + "epoch": 33.838254172015404, + "grad_norm": 0.9105878472328186, + "learning_rate": 2.2057766367137355e-05, + "loss": 0.2679, + "step": 26360 + }, + { + "epoch": 33.8395378690629, + "grad_norm": 1.7708991765975952, + "learning_rate": 2.2057338468121524e-05, + "loss": 0.2473, + "step": 26361 + }, + { + "epoch": 33.8408215661104, + "grad_norm": 1.1431612968444824, + "learning_rate": 2.2056910569105692e-05, + "loss": 0.2594, + "step": 26362 + }, + { + "epoch": 33.8421052631579, + "grad_norm": 
0.943376898765564, + "learning_rate": 2.205648267008986e-05, + "loss": 0.2527, + "step": 26363 + }, + { + "epoch": 33.84338896020539, + "grad_norm": 1.040061354637146, + "learning_rate": 2.2056054771074026e-05, + "loss": 0.2902, + "step": 26364 + }, + { + "epoch": 33.844672657252886, + "grad_norm": 0.7919501662254333, + "learning_rate": 2.2055626872058194e-05, + "loss": 0.2546, + "step": 26365 + }, + { + "epoch": 33.845956354300384, + "grad_norm": 0.9584279656410217, + "learning_rate": 2.2055198973042362e-05, + "loss": 0.2648, + "step": 26366 + }, + { + "epoch": 33.84724005134788, + "grad_norm": 1.0181103944778442, + "learning_rate": 2.205477107402653e-05, + "loss": 0.2605, + "step": 26367 + }, + { + "epoch": 33.84852374839538, + "grad_norm": 1.4398478269577026, + "learning_rate": 2.20543431750107e-05, + "loss": 0.2394, + "step": 26368 + }, + { + "epoch": 33.84980744544288, + "grad_norm": 1.0219496488571167, + "learning_rate": 2.2053915275994864e-05, + "loss": 0.2758, + "step": 26369 + }, + { + "epoch": 33.851091142490375, + "grad_norm": 0.846383273601532, + "learning_rate": 2.2053487376979036e-05, + "loss": 0.2448, + "step": 26370 + }, + { + "epoch": 33.852374839537866, + "grad_norm": 1.1035070419311523, + "learning_rate": 2.20530594779632e-05, + "loss": 0.2477, + "step": 26371 + }, + { + "epoch": 33.853658536585364, + "grad_norm": 1.030857801437378, + "learning_rate": 2.2052631578947366e-05, + "loss": 0.2588, + "step": 26372 + }, + { + "epoch": 33.85494223363286, + "grad_norm": 1.0230002403259277, + "learning_rate": 2.2052203679931538e-05, + "loss": 0.2436, + "step": 26373 + }, + { + "epoch": 33.85622593068036, + "grad_norm": 1.2101465463638306, + "learning_rate": 2.2051775780915703e-05, + "loss": 0.2476, + "step": 26374 + }, + { + "epoch": 33.85750962772786, + "grad_norm": 0.7191886305809021, + "learning_rate": 2.2051347881899875e-05, + "loss": 0.2332, + "step": 26375 + }, + { + "epoch": 33.858793324775355, + "grad_norm": 1.5564600229263306, + "learning_rate": 
2.205091998288404e-05, + "loss": 0.2665, + "step": 26376 + }, + { + "epoch": 33.86007702182285, + "grad_norm": 2.7542049884796143, + "learning_rate": 2.2050492083868208e-05, + "loss": 0.2152, + "step": 26377 + }, + { + "epoch": 33.861360718870344, + "grad_norm": 1.5933202505111694, + "learning_rate": 2.2050064184852377e-05, + "loss": 0.2909, + "step": 26378 + }, + { + "epoch": 33.86264441591784, + "grad_norm": 0.8481103181838989, + "learning_rate": 2.204963628583654e-05, + "loss": 0.2557, + "step": 26379 + }, + { + "epoch": 33.86392811296534, + "grad_norm": 1.0298470258712769, + "learning_rate": 2.204920838682071e-05, + "loss": 0.2407, + "step": 26380 + }, + { + "epoch": 33.86521181001284, + "grad_norm": 0.979902982711792, + "learning_rate": 2.204878048780488e-05, + "loss": 0.2367, + "step": 26381 + }, + { + "epoch": 33.866495507060336, + "grad_norm": 0.8705942630767822, + "learning_rate": 2.2048352588789047e-05, + "loss": 0.2421, + "step": 26382 + }, + { + "epoch": 33.86777920410783, + "grad_norm": 0.917461097240448, + "learning_rate": 2.2047924689773215e-05, + "loss": 0.2298, + "step": 26383 + }, + { + "epoch": 33.869062901155324, + "grad_norm": 1.7912819385528564, + "learning_rate": 2.2047496790757384e-05, + "loss": 0.2526, + "step": 26384 + }, + { + "epoch": 33.87034659820282, + "grad_norm": 1.1421349048614502, + "learning_rate": 2.204706889174155e-05, + "loss": 0.2269, + "step": 26385 + }, + { + "epoch": 33.87163029525032, + "grad_norm": 0.9765022993087769, + "learning_rate": 2.2046640992725717e-05, + "loss": 0.2439, + "step": 26386 + }, + { + "epoch": 33.87291399229782, + "grad_norm": 0.9590449333190918, + "learning_rate": 2.2046213093709885e-05, + "loss": 0.2453, + "step": 26387 + }, + { + "epoch": 33.874197689345316, + "grad_norm": 1.4986532926559448, + "learning_rate": 2.204578519469405e-05, + "loss": 0.2641, + "step": 26388 + }, + { + "epoch": 33.87548138639281, + "grad_norm": 1.2968089580535889, + "learning_rate": 2.2045357295678222e-05, + "loss": 
0.2508, + "step": 26389 + }, + { + "epoch": 33.87676508344031, + "grad_norm": 1.313023328781128, + "learning_rate": 2.2044929396662387e-05, + "loss": 0.2453, + "step": 26390 + }, + { + "epoch": 33.8780487804878, + "grad_norm": 0.8861151933670044, + "learning_rate": 2.204450149764656e-05, + "loss": 0.2575, + "step": 26391 + }, + { + "epoch": 33.8793324775353, + "grad_norm": 1.9479973316192627, + "learning_rate": 2.2044073598630724e-05, + "loss": 0.2464, + "step": 26392 + }, + { + "epoch": 33.8806161745828, + "grad_norm": 1.5282906293869019, + "learning_rate": 2.204364569961489e-05, + "loss": 0.2647, + "step": 26393 + }, + { + "epoch": 33.881899871630296, + "grad_norm": 1.4858198165893555, + "learning_rate": 2.204321780059906e-05, + "loss": 0.2715, + "step": 26394 + }, + { + "epoch": 33.883183568677794, + "grad_norm": 1.3280296325683594, + "learning_rate": 2.2042789901583226e-05, + "loss": 0.2479, + "step": 26395 + }, + { + "epoch": 33.88446726572529, + "grad_norm": 1.1866618394851685, + "learning_rate": 2.2042362002567394e-05, + "loss": 0.2729, + "step": 26396 + }, + { + "epoch": 33.88575096277278, + "grad_norm": 2.57637095451355, + "learning_rate": 2.2041934103551563e-05, + "loss": 0.276, + "step": 26397 + }, + { + "epoch": 33.88703465982028, + "grad_norm": 3.3578765392303467, + "learning_rate": 2.204150620453573e-05, + "loss": 0.2396, + "step": 26398 + }, + { + "epoch": 33.88831835686778, + "grad_norm": 2.543900966644287, + "learning_rate": 2.20410783055199e-05, + "loss": 0.2524, + "step": 26399 + }, + { + "epoch": 33.889602053915276, + "grad_norm": 1.7595877647399902, + "learning_rate": 2.2040650406504065e-05, + "loss": 0.2863, + "step": 26400 + }, + { + "epoch": 33.890885750962774, + "grad_norm": 2.652031898498535, + "learning_rate": 2.2040222507488233e-05, + "loss": 0.2587, + "step": 26401 + }, + { + "epoch": 33.89216944801027, + "grad_norm": 1.1659142971038818, + "learning_rate": 2.20397946084724e-05, + "loss": 0.2557, + "step": 26402 + }, + { + "epoch": 
33.89345314505777, + "grad_norm": 1.9232337474822998, + "learning_rate": 2.203936670945657e-05, + "loss": 0.2616, + "step": 26403 + }, + { + "epoch": 33.89473684210526, + "grad_norm": 1.9217156171798706, + "learning_rate": 2.2038938810440735e-05, + "loss": 0.2724, + "step": 26404 + }, + { + "epoch": 33.89602053915276, + "grad_norm": 1.3954577445983887, + "learning_rate": 2.2038510911424903e-05, + "loss": 0.277, + "step": 26405 + }, + { + "epoch": 33.897304236200256, + "grad_norm": 1.8620758056640625, + "learning_rate": 2.2038083012409072e-05, + "loss": 0.3582, + "step": 26406 + }, + { + "epoch": 33.898587933247754, + "grad_norm": 4.097865104675293, + "learning_rate": 2.203765511339324e-05, + "loss": 0.4049, + "step": 26407 + }, + { + "epoch": 33.89987163029525, + "grad_norm": 1.0517923831939697, + "learning_rate": 2.203722721437741e-05, + "loss": 0.2424, + "step": 26408 + }, + { + "epoch": 33.90115532734275, + "grad_norm": 1.296155571937561, + "learning_rate": 2.2036799315361574e-05, + "loss": 0.2617, + "step": 26409 + }, + { + "epoch": 33.90243902439025, + "grad_norm": 1.6606858968734741, + "learning_rate": 2.2036371416345745e-05, + "loss": 0.2583, + "step": 26410 + }, + { + "epoch": 33.90372272143774, + "grad_norm": 0.765740692615509, + "learning_rate": 2.203594351732991e-05, + "loss": 0.2466, + "step": 26411 + }, + { + "epoch": 33.905006418485236, + "grad_norm": 0.9670540690422058, + "learning_rate": 2.2035515618314075e-05, + "loss": 0.2613, + "step": 26412 + }, + { + "epoch": 33.906290115532734, + "grad_norm": 1.2334630489349365, + "learning_rate": 2.2035087719298247e-05, + "loss": 0.2523, + "step": 26413 + }, + { + "epoch": 33.90757381258023, + "grad_norm": 5.067219257354736, + "learning_rate": 2.2034659820282412e-05, + "loss": 0.2572, + "step": 26414 + }, + { + "epoch": 33.90885750962773, + "grad_norm": 1.9994508028030396, + "learning_rate": 2.2034231921266584e-05, + "loss": 0.2686, + "step": 26415 + }, + { + "epoch": 33.91014120667523, + "grad_norm": 
2.210462808609009, + "learning_rate": 2.203380402225075e-05, + "loss": 0.2617, + "step": 26416 + }, + { + "epoch": 33.91142490372272, + "grad_norm": 0.888323187828064, + "learning_rate": 2.2033376123234918e-05, + "loss": 0.2556, + "step": 26417 + }, + { + "epoch": 33.912708600770216, + "grad_norm": 2.090898036956787, + "learning_rate": 2.2032948224219086e-05, + "loss": 0.2562, + "step": 26418 + }, + { + "epoch": 33.913992297817714, + "grad_norm": 1.055356502532959, + "learning_rate": 2.203252032520325e-05, + "loss": 0.2719, + "step": 26419 + }, + { + "epoch": 33.91527599486521, + "grad_norm": 1.4015395641326904, + "learning_rate": 2.203209242618742e-05, + "loss": 0.2509, + "step": 26420 + }, + { + "epoch": 33.91655969191271, + "grad_norm": 1.6028155088424683, + "learning_rate": 2.2031664527171588e-05, + "loss": 0.2533, + "step": 26421 + }, + { + "epoch": 33.91784338896021, + "grad_norm": 0.9850968718528748, + "learning_rate": 2.2031236628155756e-05, + "loss": 0.2377, + "step": 26422 + }, + { + "epoch": 33.919127086007705, + "grad_norm": 1.3277947902679443, + "learning_rate": 2.2030808729139925e-05, + "loss": 0.2604, + "step": 26423 + }, + { + "epoch": 33.920410783055196, + "grad_norm": 1.5289493799209595, + "learning_rate": 2.2030380830124093e-05, + "loss": 0.2504, + "step": 26424 + }, + { + "epoch": 33.921694480102694, + "grad_norm": 1.2143256664276123, + "learning_rate": 2.2029952931108258e-05, + "loss": 0.2429, + "step": 26425 + }, + { + "epoch": 33.92297817715019, + "grad_norm": 1.1803311109542847, + "learning_rate": 2.2029525032092426e-05, + "loss": 0.2324, + "step": 26426 + }, + { + "epoch": 33.92426187419769, + "grad_norm": 1.1070951223373413, + "learning_rate": 2.2029097133076595e-05, + "loss": 0.2383, + "step": 26427 + }, + { + "epoch": 33.92554557124519, + "grad_norm": 3.217524528503418, + "learning_rate": 2.202866923406076e-05, + "loss": 0.2569, + "step": 26428 + }, + { + "epoch": 33.926829268292686, + "grad_norm": 1.7343310117721558, + "learning_rate": 
2.202824133504493e-05, + "loss": 0.2689, + "step": 26429 + }, + { + "epoch": 33.928112965340176, + "grad_norm": 0.8632462024688721, + "learning_rate": 2.2027813436029097e-05, + "loss": 0.2701, + "step": 26430 + }, + { + "epoch": 33.929396662387674, + "grad_norm": 0.9696222543716431, + "learning_rate": 2.202738553701327e-05, + "loss": 0.2391, + "step": 26431 + }, + { + "epoch": 33.93068035943517, + "grad_norm": 0.9330286979675293, + "learning_rate": 2.2026957637997434e-05, + "loss": 0.2282, + "step": 26432 + }, + { + "epoch": 33.93196405648267, + "grad_norm": 1.2184158563613892, + "learning_rate": 2.20265297389816e-05, + "loss": 0.2456, + "step": 26433 + }, + { + "epoch": 33.93324775353017, + "grad_norm": 1.1480432748794556, + "learning_rate": 2.202610183996577e-05, + "loss": 0.2344, + "step": 26434 + }, + { + "epoch": 33.934531450577666, + "grad_norm": 1.0485872030258179, + "learning_rate": 2.2025673940949935e-05, + "loss": 0.2191, + "step": 26435 + }, + { + "epoch": 33.93581514762516, + "grad_norm": 1.4681042432785034, + "learning_rate": 2.2025246041934104e-05, + "loss": 0.253, + "step": 26436 + }, + { + "epoch": 33.937098844672654, + "grad_norm": 1.0021346807479858, + "learning_rate": 2.2024818142918272e-05, + "loss": 0.2499, + "step": 26437 + }, + { + "epoch": 33.93838254172015, + "grad_norm": 0.9687042832374573, + "learning_rate": 2.202439024390244e-05, + "loss": 0.2328, + "step": 26438 + }, + { + "epoch": 33.93966623876765, + "grad_norm": 0.9803779721260071, + "learning_rate": 2.202396234488661e-05, + "loss": 0.2481, + "step": 26439 + }, + { + "epoch": 33.94094993581515, + "grad_norm": 1.261183738708496, + "learning_rate": 2.2023534445870774e-05, + "loss": 0.2416, + "step": 26440 + }, + { + "epoch": 33.942233632862646, + "grad_norm": 2.0830845832824707, + "learning_rate": 2.2023106546854942e-05, + "loss": 0.2696, + "step": 26441 + }, + { + "epoch": 33.943517329910144, + "grad_norm": 0.9750428199768066, + "learning_rate": 2.202267864783911e-05, + "loss": 
0.2489, + "step": 26442 + }, + { + "epoch": 33.94480102695764, + "grad_norm": 1.095155954360962, + "learning_rate": 2.202225074882328e-05, + "loss": 0.2431, + "step": 26443 + }, + { + "epoch": 33.94608472400513, + "grad_norm": 0.9159427285194397, + "learning_rate": 2.2021822849807444e-05, + "loss": 0.2408, + "step": 26444 + }, + { + "epoch": 33.94736842105263, + "grad_norm": 1.1742416620254517, + "learning_rate": 2.2021394950791616e-05, + "loss": 0.3108, + "step": 26445 + }, + { + "epoch": 33.94865211810013, + "grad_norm": 1.6676180362701416, + "learning_rate": 2.202096705177578e-05, + "loss": 0.2527, + "step": 26446 + }, + { + "epoch": 33.949935815147626, + "grad_norm": 1.2956368923187256, + "learning_rate": 2.2020539152759946e-05, + "loss": 0.2667, + "step": 26447 + }, + { + "epoch": 33.951219512195124, + "grad_norm": 1.3652236461639404, + "learning_rate": 2.2020111253744118e-05, + "loss": 0.2604, + "step": 26448 + }, + { + "epoch": 33.95250320924262, + "grad_norm": 1.1643226146697998, + "learning_rate": 2.2019683354728283e-05, + "loss": 0.2888, + "step": 26449 + }, + { + "epoch": 33.95378690629011, + "grad_norm": 1.4769396781921387, + "learning_rate": 2.2019255455712455e-05, + "loss": 0.2728, + "step": 26450 + }, + { + "epoch": 33.95507060333761, + "grad_norm": 2.108738660812378, + "learning_rate": 2.201882755669662e-05, + "loss": 0.2527, + "step": 26451 + }, + { + "epoch": 33.95635430038511, + "grad_norm": 1.471549391746521, + "learning_rate": 2.2018399657680788e-05, + "loss": 0.2768, + "step": 26452 + }, + { + "epoch": 33.957637997432606, + "grad_norm": 1.1668461561203003, + "learning_rate": 2.2017971758664957e-05, + "loss": 0.2602, + "step": 26453 + }, + { + "epoch": 33.958921694480104, + "grad_norm": 3.1388814449310303, + "learning_rate": 2.201754385964912e-05, + "loss": 0.2428, + "step": 26454 + }, + { + "epoch": 33.9602053915276, + "grad_norm": 4.410024166107178, + "learning_rate": 2.201711596063329e-05, + "loss": 0.3001, + "step": 26455 + }, + { + 
"epoch": 33.9614890885751, + "grad_norm": 2.2692975997924805, + "learning_rate": 2.201668806161746e-05, + "loss": 0.3758, + "step": 26456 + }, + { + "epoch": 33.96277278562259, + "grad_norm": 1.797163963317871, + "learning_rate": 2.2016260162601627e-05, + "loss": 0.4032, + "step": 26457 + }, + { + "epoch": 33.96405648267009, + "grad_norm": 1.2426434755325317, + "learning_rate": 2.2015832263585795e-05, + "loss": 0.2506, + "step": 26458 + }, + { + "epoch": 33.965340179717586, + "grad_norm": 0.8761852383613586, + "learning_rate": 2.2015404364569964e-05, + "loss": 0.2643, + "step": 26459 + }, + { + "epoch": 33.966623876765084, + "grad_norm": 0.8480100035667419, + "learning_rate": 2.201497646555413e-05, + "loss": 0.283, + "step": 26460 + }, + { + "epoch": 33.96790757381258, + "grad_norm": 0.9021245837211609, + "learning_rate": 2.2014548566538297e-05, + "loss": 0.263, + "step": 26461 + }, + { + "epoch": 33.96919127086008, + "grad_norm": 1.2290055751800537, + "learning_rate": 2.2014120667522466e-05, + "loss": 0.2807, + "step": 26462 + }, + { + "epoch": 33.97047496790757, + "grad_norm": 2.423037052154541, + "learning_rate": 2.201369276850663e-05, + "loss": 0.2451, + "step": 26463 + }, + { + "epoch": 33.97175866495507, + "grad_norm": 1.3704067468643188, + "learning_rate": 2.2013264869490802e-05, + "loss": 0.2616, + "step": 26464 + }, + { + "epoch": 33.973042362002566, + "grad_norm": 1.6558510065078735, + "learning_rate": 2.2012836970474967e-05, + "loss": 0.275, + "step": 26465 + }, + { + "epoch": 33.974326059050064, + "grad_norm": 0.7525787353515625, + "learning_rate": 2.2012409071459136e-05, + "loss": 0.2606, + "step": 26466 + }, + { + "epoch": 33.97560975609756, + "grad_norm": 1.396740436553955, + "learning_rate": 2.2011981172443304e-05, + "loss": 0.2538, + "step": 26467 + }, + { + "epoch": 33.97689345314506, + "grad_norm": 0.7719113826751709, + "learning_rate": 2.201155327342747e-05, + "loss": 0.2294, + "step": 26468 + }, + { + "epoch": 33.97817715019256, + "grad_norm": 
1.4326168298721313, + "learning_rate": 2.201112537441164e-05, + "loss": 0.2606, + "step": 26469 + }, + { + "epoch": 33.97946084724005, + "grad_norm": 1.309884786605835, + "learning_rate": 2.2010697475395806e-05, + "loss": 0.242, + "step": 26470 + }, + { + "epoch": 33.980744544287546, + "grad_norm": 1.3726356029510498, + "learning_rate": 2.2010269576379974e-05, + "loss": 0.2498, + "step": 26471 + }, + { + "epoch": 33.982028241335044, + "grad_norm": 1.0767567157745361, + "learning_rate": 2.2009841677364143e-05, + "loss": 0.2574, + "step": 26472 + }, + { + "epoch": 33.98331193838254, + "grad_norm": 0.8387306928634644, + "learning_rate": 2.2009413778348308e-05, + "loss": 0.2369, + "step": 26473 + }, + { + "epoch": 33.98459563543004, + "grad_norm": 1.886426568031311, + "learning_rate": 2.200898587933248e-05, + "loss": 0.2432, + "step": 26474 + }, + { + "epoch": 33.98587933247754, + "grad_norm": 2.317176342010498, + "learning_rate": 2.2008557980316645e-05, + "loss": 0.2444, + "step": 26475 + }, + { + "epoch": 33.987163029525036, + "grad_norm": 1.1950581073760986, + "learning_rate": 2.2008130081300813e-05, + "loss": 0.2688, + "step": 26476 + }, + { + "epoch": 33.988446726572526, + "grad_norm": 2.467707395553589, + "learning_rate": 2.200770218228498e-05, + "loss": 0.2489, + "step": 26477 + }, + { + "epoch": 33.989730423620024, + "grad_norm": 1.3136464357376099, + "learning_rate": 2.200727428326915e-05, + "loss": 0.2727, + "step": 26478 + }, + { + "epoch": 33.99101412066752, + "grad_norm": 1.4754101037979126, + "learning_rate": 2.2006846384253315e-05, + "loss": 0.2587, + "step": 26479 + }, + { + "epoch": 33.99229781771502, + "grad_norm": 1.0170823335647583, + "learning_rate": 2.2006418485237483e-05, + "loss": 0.2461, + "step": 26480 + }, + { + "epoch": 33.99358151476252, + "grad_norm": 1.1999714374542236, + "learning_rate": 2.2005990586221652e-05, + "loss": 0.2572, + "step": 26481 + }, + { + "epoch": 33.994865211810016, + "grad_norm": 1.557948112487793, + "learning_rate": 
2.200556268720582e-05, + "loss": 0.2754, + "step": 26482 + }, + { + "epoch": 33.996148908857506, + "grad_norm": 2.1930317878723145, + "learning_rate": 2.200513478818999e-05, + "loss": 0.2655, + "step": 26483 + }, + { + "epoch": 33.997432605905004, + "grad_norm": 1.5283879041671753, + "learning_rate": 2.2004706889174154e-05, + "loss": 0.3103, + "step": 26484 + }, + { + "epoch": 33.9987163029525, + "grad_norm": 1.3127723932266235, + "learning_rate": 2.2004278990158325e-05, + "loss": 0.3143, + "step": 26485 + }, + { + "epoch": 34.0, + "grad_norm": 1.9180911779403687, + "learning_rate": 2.200385109114249e-05, + "loss": 0.4231, + "step": 26486 + }, + { + "epoch": 34.0012836970475, + "grad_norm": 0.8933417797088623, + "learning_rate": 2.2003423192126656e-05, + "loss": 0.2402, + "step": 26487 + }, + { + "epoch": 34.002567394094996, + "grad_norm": 2.1864566802978516, + "learning_rate": 2.2002995293110827e-05, + "loss": 0.2588, + "step": 26488 + }, + { + "epoch": 34.003851091142494, + "grad_norm": 1.5225627422332764, + "learning_rate": 2.2002567394094992e-05, + "loss": 0.2733, + "step": 26489 + }, + { + "epoch": 34.005134788189984, + "grad_norm": 2.4132752418518066, + "learning_rate": 2.2002139495079164e-05, + "loss": 0.2318, + "step": 26490 + }, + { + "epoch": 34.00641848523748, + "grad_norm": 0.9021621942520142, + "learning_rate": 2.200171159606333e-05, + "loss": 0.2512, + "step": 26491 + }, + { + "epoch": 34.00770218228498, + "grad_norm": 0.7326313257217407, + "learning_rate": 2.2001283697047498e-05, + "loss": 0.2111, + "step": 26492 + }, + { + "epoch": 34.00898587933248, + "grad_norm": 1.1733566522598267, + "learning_rate": 2.2000855798031666e-05, + "loss": 0.2232, + "step": 26493 + }, + { + "epoch": 34.010269576379976, + "grad_norm": 0.9841551780700684, + "learning_rate": 2.200042789901583e-05, + "loss": 0.2363, + "step": 26494 + }, + { + "epoch": 34.011553273427474, + "grad_norm": 0.8797330260276794, + "learning_rate": 2.2e-05, + "loss": 0.2151, + "step": 26495 + }, + 
{ + "epoch": 34.012836970474964, + "grad_norm": 1.1060353517532349, + "learning_rate": 2.1999572100984168e-05, + "loss": 0.2343, + "step": 26496 + }, + { + "epoch": 34.01412066752246, + "grad_norm": 3.3892366886138916, + "learning_rate": 2.1999144201968336e-05, + "loss": 0.2645, + "step": 26497 + }, + { + "epoch": 34.01540436456996, + "grad_norm": 0.8808859586715698, + "learning_rate": 2.1998716302952505e-05, + "loss": 0.2386, + "step": 26498 + }, + { + "epoch": 34.01668806161746, + "grad_norm": 0.6872453689575195, + "learning_rate": 2.1998288403936673e-05, + "loss": 0.2225, + "step": 26499 + }, + { + "epoch": 34.017971758664956, + "grad_norm": 1.149714708328247, + "learning_rate": 2.1997860504920838e-05, + "loss": 0.2183, + "step": 26500 + }, + { + "epoch": 34.019255455712454, + "grad_norm": 2.9555675983428955, + "learning_rate": 2.1997432605905007e-05, + "loss": 0.2387, + "step": 26501 + }, + { + "epoch": 34.02053915275995, + "grad_norm": 0.736693263053894, + "learning_rate": 2.1997004706889175e-05, + "loss": 0.225, + "step": 26502 + }, + { + "epoch": 34.02182284980744, + "grad_norm": 1.079311490058899, + "learning_rate": 2.199657680787334e-05, + "loss": 0.2386, + "step": 26503 + }, + { + "epoch": 34.02310654685494, + "grad_norm": 0.917637825012207, + "learning_rate": 2.1996148908857512e-05, + "loss": 0.2529, + "step": 26504 + }, + { + "epoch": 34.02439024390244, + "grad_norm": 0.8622380495071411, + "learning_rate": 2.1995721009841677e-05, + "loss": 0.2139, + "step": 26505 + }, + { + "epoch": 34.025673940949936, + "grad_norm": 0.8116756081581116, + "learning_rate": 2.199529311082585e-05, + "loss": 0.1964, + "step": 26506 + }, + { + "epoch": 34.026957637997434, + "grad_norm": 0.6883530616760254, + "learning_rate": 2.1994865211810014e-05, + "loss": 0.2395, + "step": 26507 + }, + { + "epoch": 34.02824133504493, + "grad_norm": 1.3245006799697876, + "learning_rate": 2.199443731279418e-05, + "loss": 0.2273, + "step": 26508 + }, + { + "epoch": 34.02952503209243, + 
"grad_norm": 1.9797251224517822, + "learning_rate": 2.199400941377835e-05, + "loss": 0.224, + "step": 26509 + }, + { + "epoch": 34.03080872913992, + "grad_norm": 1.0353177785873413, + "learning_rate": 2.1993581514762515e-05, + "loss": 0.2205, + "step": 26510 + }, + { + "epoch": 34.03209242618742, + "grad_norm": 1.2620749473571777, + "learning_rate": 2.1993153615746684e-05, + "loss": 0.267, + "step": 26511 + }, + { + "epoch": 34.033376123234916, + "grad_norm": 1.132957100868225, + "learning_rate": 2.1992725716730852e-05, + "loss": 0.212, + "step": 26512 + }, + { + "epoch": 34.034659820282414, + "grad_norm": 0.7886394262313843, + "learning_rate": 2.199229781771502e-05, + "loss": 0.2143, + "step": 26513 + }, + { + "epoch": 34.03594351732991, + "grad_norm": 1.5941752195358276, + "learning_rate": 2.199186991869919e-05, + "loss": 0.2293, + "step": 26514 + }, + { + "epoch": 34.03722721437741, + "grad_norm": 1.5983613729476929, + "learning_rate": 2.1991442019683354e-05, + "loss": 0.2166, + "step": 26515 + }, + { + "epoch": 34.0385109114249, + "grad_norm": 2.460256338119507, + "learning_rate": 2.1991014120667523e-05, + "loss": 0.2024, + "step": 26516 + }, + { + "epoch": 34.0397946084724, + "grad_norm": 0.7245180606842041, + "learning_rate": 2.199058622165169e-05, + "loss": 0.2072, + "step": 26517 + }, + { + "epoch": 34.041078305519896, + "grad_norm": 0.9369678497314453, + "learning_rate": 2.199015832263586e-05, + "loss": 0.2174, + "step": 26518 + }, + { + "epoch": 34.042362002567394, + "grad_norm": 1.8424465656280518, + "learning_rate": 2.1989730423620024e-05, + "loss": 0.225, + "step": 26519 + }, + { + "epoch": 34.04364569961489, + "grad_norm": 1.0616463422775269, + "learning_rate": 2.1989302524604196e-05, + "loss": 0.2232, + "step": 26520 + }, + { + "epoch": 34.04492939666239, + "grad_norm": 3.4183285236358643, + "learning_rate": 2.198887462558836e-05, + "loss": 0.2296, + "step": 26521 + }, + { + "epoch": 34.04621309370989, + "grad_norm": 8.351058006286621, + 
"learning_rate": 2.198844672657253e-05, + "loss": 0.236, + "step": 26522 + }, + { + "epoch": 34.04749679075738, + "grad_norm": 1.6475485563278198, + "learning_rate": 2.1988018827556698e-05, + "loss": 0.2348, + "step": 26523 + }, + { + "epoch": 34.048780487804876, + "grad_norm": 1.4983922243118286, + "learning_rate": 2.1987590928540863e-05, + "loss": 0.2538, + "step": 26524 + }, + { + "epoch": 34.050064184852374, + "grad_norm": 1.1325085163116455, + "learning_rate": 2.1987163029525035e-05, + "loss": 0.2157, + "step": 26525 + }, + { + "epoch": 34.05134788189987, + "grad_norm": 1.0202314853668213, + "learning_rate": 2.19867351305092e-05, + "loss": 0.2499, + "step": 26526 + }, + { + "epoch": 34.05263157894737, + "grad_norm": 1.0715036392211914, + "learning_rate": 2.1986307231493368e-05, + "loss": 0.2112, + "step": 26527 + }, + { + "epoch": 34.05391527599487, + "grad_norm": 1.122023344039917, + "learning_rate": 2.1985879332477537e-05, + "loss": 0.2468, + "step": 26528 + }, + { + "epoch": 34.05519897304236, + "grad_norm": 2.4168508052825928, + "learning_rate": 2.1985451433461702e-05, + "loss": 0.2486, + "step": 26529 + }, + { + "epoch": 34.056482670089856, + "grad_norm": 5.0890116691589355, + "learning_rate": 2.1985023534445874e-05, + "loss": 0.2551, + "step": 26530 + }, + { + "epoch": 34.057766367137354, + "grad_norm": 1.8942584991455078, + "learning_rate": 2.198459563543004e-05, + "loss": 0.2524, + "step": 26531 + }, + { + "epoch": 34.05905006418485, + "grad_norm": 2.2805840969085693, + "learning_rate": 2.1984167736414207e-05, + "loss": 0.2577, + "step": 26532 + }, + { + "epoch": 34.06033376123235, + "grad_norm": 1.4066734313964844, + "learning_rate": 2.1983739837398375e-05, + "loss": 0.2143, + "step": 26533 + }, + { + "epoch": 34.06161745827985, + "grad_norm": 1.7610236406326294, + "learning_rate": 2.198331193838254e-05, + "loss": 0.3026, + "step": 26534 + }, + { + "epoch": 34.062901155327346, + "grad_norm": 4.802680969238281, + "learning_rate": 2.198288403936671e-05, 
+ "loss": 0.2965, + "step": 26535 + }, + { + "epoch": 34.06418485237484, + "grad_norm": 2.258913040161133, + "learning_rate": 2.1982456140350877e-05, + "loss": 0.4006, + "step": 26536 + }, + { + "epoch": 34.065468549422334, + "grad_norm": 1.7528151273727417, + "learning_rate": 2.1982028241335046e-05, + "loss": 0.2593, + "step": 26537 + }, + { + "epoch": 34.06675224646983, + "grad_norm": 1.0027649402618408, + "learning_rate": 2.1981600342319214e-05, + "loss": 0.2566, + "step": 26538 + }, + { + "epoch": 34.06803594351733, + "grad_norm": 0.8663975596427917, + "learning_rate": 2.1981172443303382e-05, + "loss": 0.2605, + "step": 26539 + }, + { + "epoch": 34.06931964056483, + "grad_norm": 0.8644137382507324, + "learning_rate": 2.1980744544287547e-05, + "loss": 0.2715, + "step": 26540 + }, + { + "epoch": 34.070603337612326, + "grad_norm": 1.2671928405761719, + "learning_rate": 2.1980316645271716e-05, + "loss": 0.2389, + "step": 26541 + }, + { + "epoch": 34.071887034659824, + "grad_norm": 2.7241203784942627, + "learning_rate": 2.1979888746255884e-05, + "loss": 0.229, + "step": 26542 + }, + { + "epoch": 34.073170731707314, + "grad_norm": 1.0154467821121216, + "learning_rate": 2.197946084724005e-05, + "loss": 0.2317, + "step": 26543 + }, + { + "epoch": 34.07445442875481, + "grad_norm": 0.9106881022453308, + "learning_rate": 2.197903294822422e-05, + "loss": 0.2604, + "step": 26544 + }, + { + "epoch": 34.07573812580231, + "grad_norm": 4.637685775756836, + "learning_rate": 2.1978605049208386e-05, + "loss": 0.268, + "step": 26545 + }, + { + "epoch": 34.07702182284981, + "grad_norm": 1.1081124544143677, + "learning_rate": 2.1978177150192558e-05, + "loss": 0.2439, + "step": 26546 + }, + { + "epoch": 34.078305519897306, + "grad_norm": 1.7729593515396118, + "learning_rate": 2.1977749251176723e-05, + "loss": 0.2581, + "step": 26547 + }, + { + "epoch": 34.079589216944804, + "grad_norm": 1.04374098777771, + "learning_rate": 2.1977321352160888e-05, + "loss": 0.252, + "step": 26548 + }, 
+ { + "epoch": 34.080872913992295, + "grad_norm": 1.3271242380142212, + "learning_rate": 2.197689345314506e-05, + "loss": 0.2412, + "step": 26549 + }, + { + "epoch": 34.08215661103979, + "grad_norm": 1.122990608215332, + "learning_rate": 2.1976465554129225e-05, + "loss": 0.2356, + "step": 26550 + }, + { + "epoch": 34.08344030808729, + "grad_norm": 1.0958877801895142, + "learning_rate": 2.1976037655113393e-05, + "loss": 0.225, + "step": 26551 + }, + { + "epoch": 34.08472400513479, + "grad_norm": 1.031859040260315, + "learning_rate": 2.197560975609756e-05, + "loss": 0.2389, + "step": 26552 + }, + { + "epoch": 34.086007702182286, + "grad_norm": 1.7657971382141113, + "learning_rate": 2.197518185708173e-05, + "loss": 0.2461, + "step": 26553 + }, + { + "epoch": 34.087291399229784, + "grad_norm": 0.8945644497871399, + "learning_rate": 2.19747539580659e-05, + "loss": 0.2331, + "step": 26554 + }, + { + "epoch": 34.08857509627728, + "grad_norm": 3.004403829574585, + "learning_rate": 2.1974326059050063e-05, + "loss": 0.2274, + "step": 26555 + }, + { + "epoch": 34.08985879332477, + "grad_norm": 1.3640551567077637, + "learning_rate": 2.1973898160034232e-05, + "loss": 0.243, + "step": 26556 + }, + { + "epoch": 34.09114249037227, + "grad_norm": 0.6819368004798889, + "learning_rate": 2.19734702610184e-05, + "loss": 0.213, + "step": 26557 + }, + { + "epoch": 34.09242618741977, + "grad_norm": 0.753057599067688, + "learning_rate": 2.197304236200257e-05, + "loss": 0.2479, + "step": 26558 + }, + { + "epoch": 34.093709884467266, + "grad_norm": 1.6311900615692139, + "learning_rate": 2.1972614462986734e-05, + "loss": 0.2322, + "step": 26559 + }, + { + "epoch": 34.094993581514764, + "grad_norm": 1.107992172241211, + "learning_rate": 2.1972186563970906e-05, + "loss": 0.225, + "step": 26560 + }, + { + "epoch": 34.09627727856226, + "grad_norm": 1.2090767621994019, + "learning_rate": 2.197175866495507e-05, + "loss": 0.2237, + "step": 26561 + }, + { + "epoch": 34.09756097560975, + "grad_norm": 
0.9608655571937561, + "learning_rate": 2.197133076593924e-05, + "loss": 0.2272, + "step": 26562 + }, + { + "epoch": 34.09884467265725, + "grad_norm": 0.8928223848342896, + "learning_rate": 2.1970902866923407e-05, + "loss": 0.2174, + "step": 26563 + }, + { + "epoch": 34.10012836970475, + "grad_norm": 1.1020598411560059, + "learning_rate": 2.1970474967907572e-05, + "loss": 0.2355, + "step": 26564 + }, + { + "epoch": 34.101412066752246, + "grad_norm": 0.962296724319458, + "learning_rate": 2.1970047068891744e-05, + "loss": 0.1856, + "step": 26565 + }, + { + "epoch": 34.102695763799744, + "grad_norm": 2.986448049545288, + "learning_rate": 2.196961916987591e-05, + "loss": 0.2105, + "step": 26566 + }, + { + "epoch": 34.10397946084724, + "grad_norm": 0.8258481025695801, + "learning_rate": 2.1969191270860078e-05, + "loss": 0.2366, + "step": 26567 + }, + { + "epoch": 34.10526315789474, + "grad_norm": 1.0925817489624023, + "learning_rate": 2.1968763371844246e-05, + "loss": 0.2349, + "step": 26568 + }, + { + "epoch": 34.10654685494223, + "grad_norm": 1.2203247547149658, + "learning_rate": 2.196833547282841e-05, + "loss": 0.2157, + "step": 26569 + }, + { + "epoch": 34.10783055198973, + "grad_norm": 1.091105580329895, + "learning_rate": 2.1967907573812583e-05, + "loss": 0.2422, + "step": 26570 + }, + { + "epoch": 34.109114249037226, + "grad_norm": 1.118507742881775, + "learning_rate": 2.1967479674796748e-05, + "loss": 0.1978, + "step": 26571 + }, + { + "epoch": 34.110397946084724, + "grad_norm": 1.1911885738372803, + "learning_rate": 2.1967051775780916e-05, + "loss": 0.2116, + "step": 26572 + }, + { + "epoch": 34.11168164313222, + "grad_norm": 3.0302159786224365, + "learning_rate": 2.1966623876765085e-05, + "loss": 0.238, + "step": 26573 + }, + { + "epoch": 34.11296534017972, + "grad_norm": 1.038577914237976, + "learning_rate": 2.1966195977749253e-05, + "loss": 0.2306, + "step": 26574 + }, + { + "epoch": 34.11424903722722, + "grad_norm": 4.7755937576293945, + "learning_rate": 
2.1965768078733418e-05, + "loss": 0.2438, + "step": 26575 + }, + { + "epoch": 34.11553273427471, + "grad_norm": 1.7113409042358398, + "learning_rate": 2.1965340179717587e-05, + "loss": 0.2362, + "step": 26576 + }, + { + "epoch": 34.116816431322206, + "grad_norm": 1.1191579103469849, + "learning_rate": 2.1964912280701755e-05, + "loss": 0.2396, + "step": 26577 + }, + { + "epoch": 34.118100128369704, + "grad_norm": 4.028477191925049, + "learning_rate": 2.1964484381685923e-05, + "loss": 0.1998, + "step": 26578 + }, + { + "epoch": 34.1193838254172, + "grad_norm": 1.4491393566131592, + "learning_rate": 2.1964056482670092e-05, + "loss": 0.21, + "step": 26579 + }, + { + "epoch": 34.1206675224647, + "grad_norm": 1.1789910793304443, + "learning_rate": 2.1963628583654257e-05, + "loss": 0.2386, + "step": 26580 + }, + { + "epoch": 34.1219512195122, + "grad_norm": 1.7300870418548584, + "learning_rate": 2.196320068463843e-05, + "loss": 0.2479, + "step": 26581 + }, + { + "epoch": 34.12323491655969, + "grad_norm": 2.0217528343200684, + "learning_rate": 2.1962772785622594e-05, + "loss": 0.2259, + "step": 26582 + }, + { + "epoch": 34.12451861360719, + "grad_norm": 1.246024489402771, + "learning_rate": 2.196234488660676e-05, + "loss": 0.2623, + "step": 26583 + }, + { + "epoch": 34.125802310654684, + "grad_norm": 1.1006180047988892, + "learning_rate": 2.196191698759093e-05, + "loss": 0.2654, + "step": 26584 + }, + { + "epoch": 34.12708600770218, + "grad_norm": 1.7996175289154053, + "learning_rate": 2.1961489088575095e-05, + "loss": 0.303, + "step": 26585 + }, + { + "epoch": 34.12836970474968, + "grad_norm": 2.5779356956481934, + "learning_rate": 2.1961061189559267e-05, + "loss": 0.3648, + "step": 26586 + }, + { + "epoch": 34.12965340179718, + "grad_norm": 1.4244813919067383, + "learning_rate": 2.1960633290543432e-05, + "loss": 0.2801, + "step": 26587 + }, + { + "epoch": 34.130937098844676, + "grad_norm": 0.7629460692405701, + "learning_rate": 2.19602053915276e-05, + "loss": 0.2619, + 
"step": 26588 + }, + { + "epoch": 34.13222079589217, + "grad_norm": 0.7025101184844971, + "learning_rate": 2.195977749251177e-05, + "loss": 0.2508, + "step": 26589 + }, + { + "epoch": 34.133504492939664, + "grad_norm": 1.1506705284118652, + "learning_rate": 2.1959349593495934e-05, + "loss": 0.2644, + "step": 26590 + }, + { + "epoch": 34.13478818998716, + "grad_norm": 0.8390849828720093, + "learning_rate": 2.1958921694480103e-05, + "loss": 0.2463, + "step": 26591 + }, + { + "epoch": 34.13607188703466, + "grad_norm": 1.3639777898788452, + "learning_rate": 2.195849379546427e-05, + "loss": 0.2577, + "step": 26592 + }, + { + "epoch": 34.13735558408216, + "grad_norm": 1.0572946071624756, + "learning_rate": 2.195806589644844e-05, + "loss": 0.2478, + "step": 26593 + }, + { + "epoch": 34.138639281129656, + "grad_norm": 1.1645363569259644, + "learning_rate": 2.1957637997432608e-05, + "loss": 0.2441, + "step": 26594 + }, + { + "epoch": 34.13992297817715, + "grad_norm": 1.0017963647842407, + "learning_rate": 2.1957210098416773e-05, + "loss": 0.2243, + "step": 26595 + }, + { + "epoch": 34.141206675224645, + "grad_norm": 0.8070797920227051, + "learning_rate": 2.195678219940094e-05, + "loss": 0.2275, + "step": 26596 + }, + { + "epoch": 34.14249037227214, + "grad_norm": 0.7425495386123657, + "learning_rate": 2.195635430038511e-05, + "loss": 0.2374, + "step": 26597 + }, + { + "epoch": 34.14377406931964, + "grad_norm": 1.281430959701538, + "learning_rate": 2.1955926401369278e-05, + "loss": 0.2542, + "step": 26598 + }, + { + "epoch": 34.14505776636714, + "grad_norm": 1.0704233646392822, + "learning_rate": 2.1955498502353443e-05, + "loss": 0.228, + "step": 26599 + }, + { + "epoch": 34.146341463414636, + "grad_norm": 1.9584053754806519, + "learning_rate": 2.1955070603337615e-05, + "loss": 0.246, + "step": 26600 + }, + { + "epoch": 34.147625160462134, + "grad_norm": 1.2599300146102905, + "learning_rate": 2.195464270432178e-05, + "loss": 0.2194, + "step": 26601 + }, + { + "epoch": 
34.148908857509625, + "grad_norm": 3.531564712524414, + "learning_rate": 2.195421480530595e-05, + "loss": 0.2561, + "step": 26602 + }, + { + "epoch": 34.15019255455712, + "grad_norm": 1.7819650173187256, + "learning_rate": 2.1953786906290117e-05, + "loss": 0.2439, + "step": 26603 + }, + { + "epoch": 34.15147625160462, + "grad_norm": 0.9235967993736267, + "learning_rate": 2.1953359007274282e-05, + "loss": 0.2181, + "step": 26604 + }, + { + "epoch": 34.15275994865212, + "grad_norm": 1.2267545461654663, + "learning_rate": 2.1952931108258454e-05, + "loss": 0.2183, + "step": 26605 + }, + { + "epoch": 34.154043645699616, + "grad_norm": 1.9723680019378662, + "learning_rate": 2.195250320924262e-05, + "loss": 0.2379, + "step": 26606 + }, + { + "epoch": 34.155327342747114, + "grad_norm": 1.621873140335083, + "learning_rate": 2.1952075310226787e-05, + "loss": 0.2339, + "step": 26607 + }, + { + "epoch": 34.15661103979461, + "grad_norm": 0.8159772753715515, + "learning_rate": 2.1951647411210955e-05, + "loss": 0.2247, + "step": 26608 + }, + { + "epoch": 34.1578947368421, + "grad_norm": 1.1636544466018677, + "learning_rate": 2.195121951219512e-05, + "loss": 0.2442, + "step": 26609 + }, + { + "epoch": 34.1591784338896, + "grad_norm": 0.9671518206596375, + "learning_rate": 2.1950791613179292e-05, + "loss": 0.2501, + "step": 26610 + }, + { + "epoch": 34.1604621309371, + "grad_norm": 1.143734335899353, + "learning_rate": 2.1950363714163457e-05, + "loss": 0.2359, + "step": 26611 + }, + { + "epoch": 34.161745827984596, + "grad_norm": 2.7157421112060547, + "learning_rate": 2.1949935815147626e-05, + "loss": 0.2216, + "step": 26612 + }, + { + "epoch": 34.163029525032094, + "grad_norm": 0.86585932970047, + "learning_rate": 2.1949507916131794e-05, + "loss": 0.2396, + "step": 26613 + }, + { + "epoch": 34.16431322207959, + "grad_norm": 1.385095238685608, + "learning_rate": 2.1949080017115963e-05, + "loss": 0.2379, + "step": 26614 + }, + { + "epoch": 34.16559691912708, + "grad_norm": 
1.6060351133346558, + "learning_rate": 2.1948652118100128e-05, + "loss": 0.2358, + "step": 26615 + }, + { + "epoch": 34.16688061617458, + "grad_norm": 1.8881523609161377, + "learning_rate": 2.1948224219084296e-05, + "loss": 0.2451, + "step": 26616 + }, + { + "epoch": 34.16816431322208, + "grad_norm": 1.2004921436309814, + "learning_rate": 2.1947796320068464e-05, + "loss": 0.2218, + "step": 26617 + }, + { + "epoch": 34.169448010269576, + "grad_norm": 1.0675997734069824, + "learning_rate": 2.1947368421052633e-05, + "loss": 0.229, + "step": 26618 + }, + { + "epoch": 34.170731707317074, + "grad_norm": 1.2162859439849854, + "learning_rate": 2.19469405220368e-05, + "loss": 0.222, + "step": 26619 + }, + { + "epoch": 34.17201540436457, + "grad_norm": 0.9572892189025879, + "learning_rate": 2.1946512623020966e-05, + "loss": 0.2334, + "step": 26620 + }, + { + "epoch": 34.17329910141207, + "grad_norm": 1.3067512512207031, + "learning_rate": 2.1946084724005138e-05, + "loss": 0.2127, + "step": 26621 + }, + { + "epoch": 34.17458279845956, + "grad_norm": 1.1433963775634766, + "learning_rate": 2.1945656824989303e-05, + "loss": 0.2477, + "step": 26622 + }, + { + "epoch": 34.17586649550706, + "grad_norm": 0.9261027574539185, + "learning_rate": 2.1945228925973468e-05, + "loss": 0.2531, + "step": 26623 + }, + { + "epoch": 34.177150192554556, + "grad_norm": 1.2305333614349365, + "learning_rate": 2.194480102695764e-05, + "loss": 0.2733, + "step": 26624 + }, + { + "epoch": 34.178433889602054, + "grad_norm": 1.3002574443817139, + "learning_rate": 2.1944373127941805e-05, + "loss": 0.2158, + "step": 26625 + }, + { + "epoch": 34.17971758664955, + "grad_norm": 1.6737303733825684, + "learning_rate": 2.1943945228925977e-05, + "loss": 0.2307, + "step": 26626 + }, + { + "epoch": 34.18100128369705, + "grad_norm": 2.6915740966796875, + "learning_rate": 2.194351732991014e-05, + "loss": 0.2469, + "step": 26627 + }, + { + "epoch": 34.18228498074454, + "grad_norm": 1.334643840789795, + "learning_rate": 
2.194308943089431e-05, + "loss": 0.2424, + "step": 26628 + }, + { + "epoch": 34.18356867779204, + "grad_norm": 2.1891629695892334, + "learning_rate": 2.194266153187848e-05, + "loss": 0.2601, + "step": 26629 + }, + { + "epoch": 34.18485237483954, + "grad_norm": 1.4853578805923462, + "learning_rate": 2.1942233632862644e-05, + "loss": 0.2357, + "step": 26630 + }, + { + "epoch": 34.186136071887034, + "grad_norm": 3.1237940788269043, + "learning_rate": 2.1941805733846812e-05, + "loss": 0.257, + "step": 26631 + }, + { + "epoch": 34.18741976893453, + "grad_norm": 1.0000249147415161, + "learning_rate": 2.194137783483098e-05, + "loss": 0.2645, + "step": 26632 + }, + { + "epoch": 34.18870346598203, + "grad_norm": 1.5133379697799683, + "learning_rate": 2.194094993581515e-05, + "loss": 0.2675, + "step": 26633 + }, + { + "epoch": 34.18998716302953, + "grad_norm": 2.880113124847412, + "learning_rate": 2.1940522036799317e-05, + "loss": 0.2802, + "step": 26634 + }, + { + "epoch": 34.19127086007702, + "grad_norm": 2.655792236328125, + "learning_rate": 2.1940094137783486e-05, + "loss": 0.3299, + "step": 26635 + }, + { + "epoch": 34.19255455712452, + "grad_norm": 4.3172831535339355, + "learning_rate": 2.193966623876765e-05, + "loss": 0.4144, + "step": 26636 + }, + { + "epoch": 34.193838254172015, + "grad_norm": 0.7617392539978027, + "learning_rate": 2.193923833975182e-05, + "loss": 0.2636, + "step": 26637 + }, + { + "epoch": 34.19512195121951, + "grad_norm": 0.6761654615402222, + "learning_rate": 2.1938810440735987e-05, + "loss": 0.2465, + "step": 26638 + }, + { + "epoch": 34.19640564826701, + "grad_norm": 0.959855854511261, + "learning_rate": 2.1938382541720152e-05, + "loss": 0.2479, + "step": 26639 + }, + { + "epoch": 34.19768934531451, + "grad_norm": 5.3078203201293945, + "learning_rate": 2.1937954642704324e-05, + "loss": 0.275, + "step": 26640 + }, + { + "epoch": 34.198973042362006, + "grad_norm": 1.0032658576965332, + "learning_rate": 2.193752674368849e-05, + "loss": 0.2921, + 
"step": 26641 + }, + { + "epoch": 34.2002567394095, + "grad_norm": 1.3103468418121338, + "learning_rate": 2.193709884467266e-05, + "loss": 0.2474, + "step": 26642 + }, + { + "epoch": 34.201540436456995, + "grad_norm": 0.9351796507835388, + "learning_rate": 2.1936670945656826e-05, + "loss": 0.2899, + "step": 26643 + }, + { + "epoch": 34.20282413350449, + "grad_norm": 0.9854410886764526, + "learning_rate": 2.193624304664099e-05, + "loss": 0.2255, + "step": 26644 + }, + { + "epoch": 34.20410783055199, + "grad_norm": 0.8244149684906006, + "learning_rate": 2.1935815147625163e-05, + "loss": 0.2414, + "step": 26645 + }, + { + "epoch": 34.20539152759949, + "grad_norm": 1.596512794494629, + "learning_rate": 2.1935387248609328e-05, + "loss": 0.249, + "step": 26646 + }, + { + "epoch": 34.206675224646986, + "grad_norm": 0.8638772368431091, + "learning_rate": 2.1934959349593496e-05, + "loss": 0.2429, + "step": 26647 + }, + { + "epoch": 34.20795892169448, + "grad_norm": 2.5545055866241455, + "learning_rate": 2.1934531450577665e-05, + "loss": 0.2704, + "step": 26648 + }, + { + "epoch": 34.209242618741975, + "grad_norm": 0.8720489144325256, + "learning_rate": 2.1934103551561833e-05, + "loss": 0.2294, + "step": 26649 + }, + { + "epoch": 34.21052631578947, + "grad_norm": 1.1465946435928345, + "learning_rate": 2.1933675652545998e-05, + "loss": 0.241, + "step": 26650 + }, + { + "epoch": 34.21181001283697, + "grad_norm": 1.121568202972412, + "learning_rate": 2.1933247753530167e-05, + "loss": 0.2593, + "step": 26651 + }, + { + "epoch": 34.21309370988447, + "grad_norm": 1.0653176307678223, + "learning_rate": 2.1932819854514335e-05, + "loss": 0.2241, + "step": 26652 + }, + { + "epoch": 34.214377406931966, + "grad_norm": 0.7611302137374878, + "learning_rate": 2.1932391955498503e-05, + "loss": 0.2535, + "step": 26653 + }, + { + "epoch": 34.215661103979464, + "grad_norm": 5.110604286193848, + "learning_rate": 2.1931964056482672e-05, + "loss": 0.221, + "step": 26654 + }, + { + "epoch": 
34.216944801026955, + "grad_norm": 1.8880810737609863, + "learning_rate": 2.1931536157466837e-05, + "loss": 0.2384, + "step": 26655 + }, + { + "epoch": 34.21822849807445, + "grad_norm": 1.2092190980911255, + "learning_rate": 2.1931108258451005e-05, + "loss": 0.2515, + "step": 26656 + }, + { + "epoch": 34.21951219512195, + "grad_norm": 0.947687029838562, + "learning_rate": 2.1930680359435174e-05, + "loss": 0.2078, + "step": 26657 + }, + { + "epoch": 34.22079589216945, + "grad_norm": 1.1406890153884888, + "learning_rate": 2.193025246041934e-05, + "loss": 0.2059, + "step": 26658 + }, + { + "epoch": 34.222079589216946, + "grad_norm": 1.137034296989441, + "learning_rate": 2.192982456140351e-05, + "loss": 0.2244, + "step": 26659 + }, + { + "epoch": 34.223363286264444, + "grad_norm": 7.1367316246032715, + "learning_rate": 2.1929396662387676e-05, + "loss": 0.2233, + "step": 26660 + }, + { + "epoch": 34.224646983311935, + "grad_norm": 1.8017542362213135, + "learning_rate": 2.1928968763371847e-05, + "loss": 0.2521, + "step": 26661 + }, + { + "epoch": 34.22593068035943, + "grad_norm": 1.4031466245651245, + "learning_rate": 2.1928540864356012e-05, + "loss": 0.228, + "step": 26662 + }, + { + "epoch": 34.22721437740693, + "grad_norm": 1.4058963060379028, + "learning_rate": 2.1928112965340177e-05, + "loss": 0.2008, + "step": 26663 + }, + { + "epoch": 34.22849807445443, + "grad_norm": 0.9016103148460388, + "learning_rate": 2.192768506632435e-05, + "loss": 0.2123, + "step": 26664 + }, + { + "epoch": 34.229781771501926, + "grad_norm": 0.9704306721687317, + "learning_rate": 2.1927257167308514e-05, + "loss": 0.2264, + "step": 26665 + }, + { + "epoch": 34.231065468549424, + "grad_norm": 1.5261800289154053, + "learning_rate": 2.1926829268292683e-05, + "loss": 0.2434, + "step": 26666 + }, + { + "epoch": 34.23234916559692, + "grad_norm": 1.1819968223571777, + "learning_rate": 2.192640136927685e-05, + "loss": 0.2101, + "step": 26667 + }, + { + "epoch": 34.23363286264441, + "grad_norm": 
1.5671707391738892, + "learning_rate": 2.192597347026102e-05, + "loss": 0.2379, + "step": 26668 + }, + { + "epoch": 34.23491655969191, + "grad_norm": 1.24955415725708, + "learning_rate": 2.1925545571245188e-05, + "loss": 0.231, + "step": 26669 + }, + { + "epoch": 34.23620025673941, + "grad_norm": 2.999852418899536, + "learning_rate": 2.1925117672229353e-05, + "loss": 0.2162, + "step": 26670 + }, + { + "epoch": 34.23748395378691, + "grad_norm": 1.2509537935256958, + "learning_rate": 2.192468977321352e-05, + "loss": 0.2345, + "step": 26671 + }, + { + "epoch": 34.238767650834404, + "grad_norm": 1.2039060592651367, + "learning_rate": 2.192426187419769e-05, + "loss": 0.1979, + "step": 26672 + }, + { + "epoch": 34.2400513478819, + "grad_norm": 1.3188252449035645, + "learning_rate": 2.1923833975181858e-05, + "loss": 0.2275, + "step": 26673 + }, + { + "epoch": 34.2413350449294, + "grad_norm": 0.9050871133804321, + "learning_rate": 2.1923406076166023e-05, + "loss": 0.2385, + "step": 26674 + }, + { + "epoch": 34.24261874197689, + "grad_norm": 1.3154734373092651, + "learning_rate": 2.1922978177150195e-05, + "loss": 0.2194, + "step": 26675 + }, + { + "epoch": 34.24390243902439, + "grad_norm": 1.1871635913848877, + "learning_rate": 2.192255027813436e-05, + "loss": 0.2444, + "step": 26676 + }, + { + "epoch": 34.24518613607189, + "grad_norm": 1.2946592569351196, + "learning_rate": 2.192212237911853e-05, + "loss": 0.2578, + "step": 26677 + }, + { + "epoch": 34.246469833119384, + "grad_norm": 1.327188491821289, + "learning_rate": 2.1921694480102697e-05, + "loss": 0.2487, + "step": 26678 + }, + { + "epoch": 34.24775353016688, + "grad_norm": 1.5054773092269897, + "learning_rate": 2.1921266581086862e-05, + "loss": 0.2275, + "step": 26679 + }, + { + "epoch": 34.24903722721438, + "grad_norm": 1.5486153364181519, + "learning_rate": 2.1920838682071034e-05, + "loss": 0.2298, + "step": 26680 + }, + { + "epoch": 34.25032092426187, + "grad_norm": 1.573864221572876, + "learning_rate": 
2.19204107830552e-05, + "loss": 0.2641, + "step": 26681 + }, + { + "epoch": 34.25160462130937, + "grad_norm": 1.725881814956665, + "learning_rate": 2.1919982884039367e-05, + "loss": 0.2691, + "step": 26682 + }, + { + "epoch": 34.25288831835687, + "grad_norm": 3.332562208175659, + "learning_rate": 2.1919554985023535e-05, + "loss": 0.2893, + "step": 26683 + }, + { + "epoch": 34.254172015404365, + "grad_norm": 1.4989219903945923, + "learning_rate": 2.19191270860077e-05, + "loss": 0.3075, + "step": 26684 + }, + { + "epoch": 34.25545571245186, + "grad_norm": 2.3923044204711914, + "learning_rate": 2.1918699186991872e-05, + "loss": 0.3036, + "step": 26685 + }, + { + "epoch": 34.25673940949936, + "grad_norm": 3.9170358180999756, + "learning_rate": 2.1918271287976037e-05, + "loss": 0.4107, + "step": 26686 + }, + { + "epoch": 34.25802310654686, + "grad_norm": 1.000723123550415, + "learning_rate": 2.1917843388960206e-05, + "loss": 0.2444, + "step": 26687 + }, + { + "epoch": 34.25930680359435, + "grad_norm": 1.3358696699142456, + "learning_rate": 2.1917415489944374e-05, + "loss": 0.2645, + "step": 26688 + }, + { + "epoch": 34.26059050064185, + "grad_norm": 0.7618299722671509, + "learning_rate": 2.1916987590928543e-05, + "loss": 0.2568, + "step": 26689 + }, + { + "epoch": 34.261874197689345, + "grad_norm": 0.9106769561767578, + "learning_rate": 2.1916559691912708e-05, + "loss": 0.2527, + "step": 26690 + }, + { + "epoch": 34.26315789473684, + "grad_norm": 0.8787811398506165, + "learning_rate": 2.1916131792896876e-05, + "loss": 0.2491, + "step": 26691 + }, + { + "epoch": 34.26444159178434, + "grad_norm": 0.9840806722640991, + "learning_rate": 2.1915703893881044e-05, + "loss": 0.2201, + "step": 26692 + }, + { + "epoch": 34.26572528883184, + "grad_norm": 1.2165638208389282, + "learning_rate": 2.1915275994865213e-05, + "loss": 0.2484, + "step": 26693 + }, + { + "epoch": 34.26700898587933, + "grad_norm": 1.28398859500885, + "learning_rate": 2.191484809584938e-05, + "loss": 0.2463, + 
"step": 26694 + }, + { + "epoch": 34.26829268292683, + "grad_norm": 0.9988111853599548, + "learning_rate": 2.1914420196833546e-05, + "loss": 0.2561, + "step": 26695 + }, + { + "epoch": 34.269576379974325, + "grad_norm": 0.8752924799919128, + "learning_rate": 2.1913992297817718e-05, + "loss": 0.2658, + "step": 26696 + }, + { + "epoch": 34.27086007702182, + "grad_norm": 1.0570342540740967, + "learning_rate": 2.1913564398801883e-05, + "loss": 0.2409, + "step": 26697 + }, + { + "epoch": 34.27214377406932, + "grad_norm": 0.8322709202766418, + "learning_rate": 2.1913136499786048e-05, + "loss": 0.255, + "step": 26698 + }, + { + "epoch": 34.27342747111682, + "grad_norm": 1.2963645458221436, + "learning_rate": 2.191270860077022e-05, + "loss": 0.2465, + "step": 26699 + }, + { + "epoch": 34.274711168164316, + "grad_norm": 0.8073360323905945, + "learning_rate": 2.1912280701754385e-05, + "loss": 0.2426, + "step": 26700 + }, + { + "epoch": 34.27599486521181, + "grad_norm": 1.3118184804916382, + "learning_rate": 2.1911852802738557e-05, + "loss": 0.2498, + "step": 26701 + }, + { + "epoch": 34.277278562259305, + "grad_norm": 1.2094932794570923, + "learning_rate": 2.1911424903722722e-05, + "loss": 0.2483, + "step": 26702 + }, + { + "epoch": 34.2785622593068, + "grad_norm": 1.0560379028320312, + "learning_rate": 2.191099700470689e-05, + "loss": 0.2225, + "step": 26703 + }, + { + "epoch": 34.2798459563543, + "grad_norm": 1.3678113222122192, + "learning_rate": 2.191056910569106e-05, + "loss": 0.2203, + "step": 26704 + }, + { + "epoch": 34.2811296534018, + "grad_norm": 0.8727612495422363, + "learning_rate": 2.1910141206675224e-05, + "loss": 0.2463, + "step": 26705 + }, + { + "epoch": 34.282413350449296, + "grad_norm": 2.605221748352051, + "learning_rate": 2.1909713307659392e-05, + "loss": 0.2674, + "step": 26706 + }, + { + "epoch": 34.283697047496794, + "grad_norm": 2.3235421180725098, + "learning_rate": 2.190928540864356e-05, + "loss": 0.2343, + "step": 26707 + }, + { + "epoch": 
34.284980744544285, + "grad_norm": 0.7626538276672363, + "learning_rate": 2.190885750962773e-05, + "loss": 0.2355, + "step": 26708 + }, + { + "epoch": 34.28626444159178, + "grad_norm": 0.8731454014778137, + "learning_rate": 2.1908429610611897e-05, + "loss": 0.226, + "step": 26709 + }, + { + "epoch": 34.28754813863928, + "grad_norm": 1.2951369285583496, + "learning_rate": 2.1908001711596066e-05, + "loss": 0.2444, + "step": 26710 + }, + { + "epoch": 34.28883183568678, + "grad_norm": 1.37227201461792, + "learning_rate": 2.190757381258023e-05, + "loss": 0.2325, + "step": 26711 + }, + { + "epoch": 34.290115532734276, + "grad_norm": 1.1716338396072388, + "learning_rate": 2.19071459135644e-05, + "loss": 0.2068, + "step": 26712 + }, + { + "epoch": 34.291399229781774, + "grad_norm": 1.092914342880249, + "learning_rate": 2.1906718014548568e-05, + "loss": 0.2474, + "step": 26713 + }, + { + "epoch": 34.292682926829265, + "grad_norm": 1.933782696723938, + "learning_rate": 2.1906290115532733e-05, + "loss": 0.224, + "step": 26714 + }, + { + "epoch": 34.29396662387676, + "grad_norm": 1.372043251991272, + "learning_rate": 2.1905862216516904e-05, + "loss": 0.2345, + "step": 26715 + }, + { + "epoch": 34.29525032092426, + "grad_norm": 1.64480721950531, + "learning_rate": 2.190543431750107e-05, + "loss": 0.2211, + "step": 26716 + }, + { + "epoch": 34.29653401797176, + "grad_norm": 0.9955276846885681, + "learning_rate": 2.1905006418485238e-05, + "loss": 0.2382, + "step": 26717 + }, + { + "epoch": 34.29781771501926, + "grad_norm": 2.240394115447998, + "learning_rate": 2.1904578519469406e-05, + "loss": 0.2264, + "step": 26718 + }, + { + "epoch": 34.299101412066754, + "grad_norm": 0.9802242517471313, + "learning_rate": 2.190415062045357e-05, + "loss": 0.2309, + "step": 26719 + }, + { + "epoch": 34.30038510911425, + "grad_norm": 1.3503931760787964, + "learning_rate": 2.1903722721437743e-05, + "loss": 0.2508, + "step": 26720 + }, + { + "epoch": 34.30166880616174, + "grad_norm": 
1.0958446264266968, + "learning_rate": 2.1903294822421908e-05, + "loss": 0.2286, + "step": 26721 + }, + { + "epoch": 34.30295250320924, + "grad_norm": 3.027435302734375, + "learning_rate": 2.1902866923406076e-05, + "loss": 0.2197, + "step": 26722 + }, + { + "epoch": 34.30423620025674, + "grad_norm": 1.4787853956222534, + "learning_rate": 2.1902439024390245e-05, + "loss": 0.2223, + "step": 26723 + }, + { + "epoch": 34.30551989730424, + "grad_norm": 1.263008713722229, + "learning_rate": 2.190201112537441e-05, + "loss": 0.2242, + "step": 26724 + }, + { + "epoch": 34.306803594351734, + "grad_norm": 1.0582200288772583, + "learning_rate": 2.190158322635858e-05, + "loss": 0.2342, + "step": 26725 + }, + { + "epoch": 34.30808729139923, + "grad_norm": 1.7657763957977295, + "learning_rate": 2.1901155327342747e-05, + "loss": 0.2429, + "step": 26726 + }, + { + "epoch": 34.30937098844672, + "grad_norm": 1.9652001857757568, + "learning_rate": 2.1900727428326915e-05, + "loss": 0.2541, + "step": 26727 + }, + { + "epoch": 34.31065468549422, + "grad_norm": 1.5178735256195068, + "learning_rate": 2.1900299529311084e-05, + "loss": 0.2457, + "step": 26728 + }, + { + "epoch": 34.31193838254172, + "grad_norm": 1.3568706512451172, + "learning_rate": 2.1899871630295252e-05, + "loss": 0.2683, + "step": 26729 + }, + { + "epoch": 34.31322207958922, + "grad_norm": 1.673532485961914, + "learning_rate": 2.1899443731279417e-05, + "loss": 0.2613, + "step": 26730 + }, + { + "epoch": 34.314505776636715, + "grad_norm": 1.3053488731384277, + "learning_rate": 2.1899015832263585e-05, + "loss": 0.2421, + "step": 26731 + }, + { + "epoch": 34.31578947368421, + "grad_norm": 1.7695732116699219, + "learning_rate": 2.1898587933247754e-05, + "loss": 0.2916, + "step": 26732 + }, + { + "epoch": 34.31707317073171, + "grad_norm": 1.415358543395996, + "learning_rate": 2.1898160034231922e-05, + "loss": 0.2562, + "step": 26733 + }, + { + "epoch": 34.3183568677792, + "grad_norm": 1.6004383563995361, + "learning_rate": 
2.189773213521609e-05, + "loss": 0.2908, + "step": 26734 + }, + { + "epoch": 34.3196405648267, + "grad_norm": 4.557618618011475, + "learning_rate": 2.1897304236200256e-05, + "loss": 0.3407, + "step": 26735 + }, + { + "epoch": 34.3209242618742, + "grad_norm": 2.094355583190918, + "learning_rate": 2.1896876337184427e-05, + "loss": 0.3978, + "step": 26736 + }, + { + "epoch": 34.322207958921695, + "grad_norm": 0.7811105847358704, + "learning_rate": 2.1896448438168592e-05, + "loss": 0.26, + "step": 26737 + }, + { + "epoch": 34.32349165596919, + "grad_norm": 0.7761232256889343, + "learning_rate": 2.1896020539152757e-05, + "loss": 0.2496, + "step": 26738 + }, + { + "epoch": 34.32477535301669, + "grad_norm": 0.7271261811256409, + "learning_rate": 2.189559264013693e-05, + "loss": 0.2471, + "step": 26739 + }, + { + "epoch": 34.32605905006419, + "grad_norm": 0.6983126997947693, + "learning_rate": 2.1895164741121094e-05, + "loss": 0.2613, + "step": 26740 + }, + { + "epoch": 34.32734274711168, + "grad_norm": 0.815265953540802, + "learning_rate": 2.1894736842105266e-05, + "loss": 0.2318, + "step": 26741 + }, + { + "epoch": 34.32862644415918, + "grad_norm": 0.9059035778045654, + "learning_rate": 2.189430894308943e-05, + "loss": 0.2433, + "step": 26742 + }, + { + "epoch": 34.329910141206675, + "grad_norm": 1.5336897373199463, + "learning_rate": 2.18938810440736e-05, + "loss": 0.2313, + "step": 26743 + }, + { + "epoch": 34.33119383825417, + "grad_norm": 0.8901498317718506, + "learning_rate": 2.1893453145057768e-05, + "loss": 0.2588, + "step": 26744 + }, + { + "epoch": 34.33247753530167, + "grad_norm": 0.7869769334793091, + "learning_rate": 2.1893025246041933e-05, + "loss": 0.2417, + "step": 26745 + }, + { + "epoch": 34.33376123234917, + "grad_norm": 1.1108272075653076, + "learning_rate": 2.18925973470261e-05, + "loss": 0.2558, + "step": 26746 + }, + { + "epoch": 34.33504492939666, + "grad_norm": 0.8780131340026855, + "learning_rate": 2.189216944801027e-05, + "loss": 0.2492, + 
"step": 26747 + }, + { + "epoch": 34.33632862644416, + "grad_norm": 0.7722861766815186, + "learning_rate": 2.1891741548994438e-05, + "loss": 0.2449, + "step": 26748 + }, + { + "epoch": 34.337612323491655, + "grad_norm": 0.993169903755188, + "learning_rate": 2.1891313649978607e-05, + "loss": 0.2374, + "step": 26749 + }, + { + "epoch": 34.33889602053915, + "grad_norm": 0.8771690130233765, + "learning_rate": 2.1890885750962775e-05, + "loss": 0.2555, + "step": 26750 + }, + { + "epoch": 34.34017971758665, + "grad_norm": 1.1091896295547485, + "learning_rate": 2.189045785194694e-05, + "loss": 0.2139, + "step": 26751 + }, + { + "epoch": 34.34146341463415, + "grad_norm": 1.3250010013580322, + "learning_rate": 2.189002995293111e-05, + "loss": 0.2536, + "step": 26752 + }, + { + "epoch": 34.342747111681646, + "grad_norm": 1.1770658493041992, + "learning_rate": 2.1889602053915277e-05, + "loss": 0.2338, + "step": 26753 + }, + { + "epoch": 34.34403080872914, + "grad_norm": 0.8908804655075073, + "learning_rate": 2.1889174154899442e-05, + "loss": 0.2394, + "step": 26754 + }, + { + "epoch": 34.345314505776635, + "grad_norm": 1.6910744905471802, + "learning_rate": 2.1888746255883614e-05, + "loss": 0.2096, + "step": 26755 + }, + { + "epoch": 34.34659820282413, + "grad_norm": 3.2826128005981445, + "learning_rate": 2.188831835686778e-05, + "loss": 0.2407, + "step": 26756 + }, + { + "epoch": 34.34788189987163, + "grad_norm": 1.073998212814331, + "learning_rate": 2.188789045785195e-05, + "loss": 0.2057, + "step": 26757 + }, + { + "epoch": 34.34916559691913, + "grad_norm": 8.845213890075684, + "learning_rate": 2.1887462558836116e-05, + "loss": 0.2306, + "step": 26758 + }, + { + "epoch": 34.350449293966626, + "grad_norm": 1.0361549854278564, + "learning_rate": 2.188703465982028e-05, + "loss": 0.2153, + "step": 26759 + }, + { + "epoch": 34.35173299101412, + "grad_norm": 1.132683515548706, + "learning_rate": 2.1886606760804452e-05, + "loss": 0.2337, + "step": 26760 + }, + { + "epoch": 
34.353016688061615, + "grad_norm": 1.076624870300293, + "learning_rate": 2.1886178861788617e-05, + "loss": 0.2374, + "step": 26761 + }, + { + "epoch": 34.35430038510911, + "grad_norm": 0.983758807182312, + "learning_rate": 2.1885750962772786e-05, + "loss": 0.2528, + "step": 26762 + }, + { + "epoch": 34.35558408215661, + "grad_norm": 0.9056354761123657, + "learning_rate": 2.1885323063756954e-05, + "loss": 0.2162, + "step": 26763 + }, + { + "epoch": 34.35686777920411, + "grad_norm": 1.058783769607544, + "learning_rate": 2.1884895164741123e-05, + "loss": 0.2195, + "step": 26764 + }, + { + "epoch": 34.35815147625161, + "grad_norm": 0.8869308829307556, + "learning_rate": 2.188446726572529e-05, + "loss": 0.2467, + "step": 26765 + }, + { + "epoch": 34.359435173299104, + "grad_norm": 0.9546904563903809, + "learning_rate": 2.1884039366709456e-05, + "loss": 0.2439, + "step": 26766 + }, + { + "epoch": 34.360718870346595, + "grad_norm": 2.110502004623413, + "learning_rate": 2.1883611467693624e-05, + "loss": 0.2287, + "step": 26767 + }, + { + "epoch": 34.36200256739409, + "grad_norm": 1.5052814483642578, + "learning_rate": 2.1883183568677793e-05, + "loss": 0.2173, + "step": 26768 + }, + { + "epoch": 34.36328626444159, + "grad_norm": 1.492672085762024, + "learning_rate": 2.188275566966196e-05, + "loss": 0.2352, + "step": 26769 + }, + { + "epoch": 34.36456996148909, + "grad_norm": 0.8241329193115234, + "learning_rate": 2.1882327770646126e-05, + "loss": 0.2293, + "step": 26770 + }, + { + "epoch": 34.36585365853659, + "grad_norm": 1.0422337055206299, + "learning_rate": 2.1881899871630298e-05, + "loss": 0.2184, + "step": 26771 + }, + { + "epoch": 34.367137355584084, + "grad_norm": 1.274850845336914, + "learning_rate": 2.1881471972614463e-05, + "loss": 0.2262, + "step": 26772 + }, + { + "epoch": 34.36842105263158, + "grad_norm": 7.494657516479492, + "learning_rate": 2.188104407359863e-05, + "loss": 0.2463, + "step": 26773 + }, + { + "epoch": 34.36970474967907, + "grad_norm": 
1.3707892894744873, + "learning_rate": 2.18806161745828e-05, + "loss": 0.2575, + "step": 26774 + }, + { + "epoch": 34.37098844672657, + "grad_norm": 2.062093496322632, + "learning_rate": 2.1880188275566965e-05, + "loss": 0.2386, + "step": 26775 + }, + { + "epoch": 34.37227214377407, + "grad_norm": 1.2357109785079956, + "learning_rate": 2.1879760376551137e-05, + "loss": 0.2277, + "step": 26776 + }, + { + "epoch": 34.37355584082157, + "grad_norm": 1.0299111604690552, + "learning_rate": 2.1879332477535302e-05, + "loss": 0.246, + "step": 26777 + }, + { + "epoch": 34.374839537869065, + "grad_norm": 1.6727744340896606, + "learning_rate": 2.187890457851947e-05, + "loss": 0.2655, + "step": 26778 + }, + { + "epoch": 34.37612323491656, + "grad_norm": 1.1385456323623657, + "learning_rate": 2.187847667950364e-05, + "loss": 0.235, + "step": 26779 + }, + { + "epoch": 34.37740693196405, + "grad_norm": 1.6687092781066895, + "learning_rate": 2.1878048780487804e-05, + "loss": 0.2659, + "step": 26780 + }, + { + "epoch": 34.37869062901155, + "grad_norm": 1.6927052736282349, + "learning_rate": 2.1877620881471975e-05, + "loss": 0.2271, + "step": 26781 + }, + { + "epoch": 34.37997432605905, + "grad_norm": 1.5788320302963257, + "learning_rate": 2.187719298245614e-05, + "loss": 0.2434, + "step": 26782 + }, + { + "epoch": 34.38125802310655, + "grad_norm": 3.512955904006958, + "learning_rate": 2.187676508344031e-05, + "loss": 0.2395, + "step": 26783 + }, + { + "epoch": 34.382541720154045, + "grad_norm": 1.528324842453003, + "learning_rate": 2.1876337184424477e-05, + "loss": 0.2852, + "step": 26784 + }, + { + "epoch": 34.38382541720154, + "grad_norm": 1.7474831342697144, + "learning_rate": 2.1875909285408642e-05, + "loss": 0.3511, + "step": 26785 + }, + { + "epoch": 34.38510911424904, + "grad_norm": 2.2670414447784424, + "learning_rate": 2.187548138639281e-05, + "loss": 0.4045, + "step": 26786 + }, + { + "epoch": 34.38639281129653, + "grad_norm": 0.6987996101379395, + "learning_rate": 
2.187505348737698e-05, + "loss": 0.2452, + "step": 26787 + }, + { + "epoch": 34.38767650834403, + "grad_norm": 0.5720242857933044, + "learning_rate": 2.1874625588361148e-05, + "loss": 0.2378, + "step": 26788 + }, + { + "epoch": 34.38896020539153, + "grad_norm": 1.092297077178955, + "learning_rate": 2.1874197689345316e-05, + "loss": 0.2372, + "step": 26789 + }, + { + "epoch": 34.390243902439025, + "grad_norm": 0.9969488978385925, + "learning_rate": 2.1873769790329484e-05, + "loss": 0.2469, + "step": 26790 + }, + { + "epoch": 34.39152759948652, + "grad_norm": 1.7381216287612915, + "learning_rate": 2.187334189131365e-05, + "loss": 0.2627, + "step": 26791 + }, + { + "epoch": 34.39281129653402, + "grad_norm": 1.3248196840286255, + "learning_rate": 2.1872913992297818e-05, + "loss": 0.2803, + "step": 26792 + }, + { + "epoch": 34.39409499358152, + "grad_norm": 1.1397064924240112, + "learning_rate": 2.1872486093281986e-05, + "loss": 0.2609, + "step": 26793 + }, + { + "epoch": 34.39537869062901, + "grad_norm": 0.808724045753479, + "learning_rate": 2.187205819426615e-05, + "loss": 0.2205, + "step": 26794 + }, + { + "epoch": 34.39666238767651, + "grad_norm": 1.0069986581802368, + "learning_rate": 2.1871630295250323e-05, + "loss": 0.2599, + "step": 26795 + }, + { + "epoch": 34.397946084724005, + "grad_norm": 1.2554954290390015, + "learning_rate": 2.1871202396234488e-05, + "loss": 0.2472, + "step": 26796 + }, + { + "epoch": 34.3992297817715, + "grad_norm": 1.3262642621994019, + "learning_rate": 2.187077449721866e-05, + "loss": 0.2582, + "step": 26797 + }, + { + "epoch": 34.400513478819, + "grad_norm": 1.4335119724273682, + "learning_rate": 2.1870346598202825e-05, + "loss": 0.2486, + "step": 26798 + }, + { + "epoch": 34.4017971758665, + "grad_norm": 1.1311126947402954, + "learning_rate": 2.186991869918699e-05, + "loss": 0.2527, + "step": 26799 + }, + { + "epoch": 34.40308087291399, + "grad_norm": 0.8677212595939636, + "learning_rate": 2.1869490800171162e-05, + "loss": 0.2434, + 
"step": 26800 + }, + { + "epoch": 34.40436456996149, + "grad_norm": 1.940248727798462, + "learning_rate": 2.1869062901155327e-05, + "loss": 0.2421, + "step": 26801 + }, + { + "epoch": 34.405648267008985, + "grad_norm": 1.0319669246673584, + "learning_rate": 2.1868635002139495e-05, + "loss": 0.2352, + "step": 26802 + }, + { + "epoch": 34.40693196405648, + "grad_norm": 4.06812047958374, + "learning_rate": 2.1868207103123664e-05, + "loss": 0.2266, + "step": 26803 + }, + { + "epoch": 34.40821566110398, + "grad_norm": 2.0213730335235596, + "learning_rate": 2.1867779204107832e-05, + "loss": 0.2561, + "step": 26804 + }, + { + "epoch": 34.40949935815148, + "grad_norm": 0.8493967652320862, + "learning_rate": 2.1867351305092e-05, + "loss": 0.2402, + "step": 26805 + }, + { + "epoch": 34.410783055198976, + "grad_norm": 0.8343647718429565, + "learning_rate": 2.1866923406076165e-05, + "loss": 0.2215, + "step": 26806 + }, + { + "epoch": 34.41206675224647, + "grad_norm": 0.9895411133766174, + "learning_rate": 2.1866495507060334e-05, + "loss": 0.2224, + "step": 26807 + }, + { + "epoch": 34.413350449293965, + "grad_norm": 1.981420874595642, + "learning_rate": 2.1866067608044502e-05, + "loss": 0.2276, + "step": 26808 + }, + { + "epoch": 34.41463414634146, + "grad_norm": 1.8836041688919067, + "learning_rate": 2.186563970902867e-05, + "loss": 0.2455, + "step": 26809 + }, + { + "epoch": 34.41591784338896, + "grad_norm": 0.9239197969436646, + "learning_rate": 2.1865211810012836e-05, + "loss": 0.1959, + "step": 26810 + }, + { + "epoch": 34.41720154043646, + "grad_norm": 0.8872314095497131, + "learning_rate": 2.1864783910997007e-05, + "loss": 0.235, + "step": 26811 + }, + { + "epoch": 34.41848523748396, + "grad_norm": 1.0573008060455322, + "learning_rate": 2.1864356011981173e-05, + "loss": 0.2536, + "step": 26812 + }, + { + "epoch": 34.41976893453145, + "grad_norm": 1.0403133630752563, + "learning_rate": 2.186392811296534e-05, + "loss": 0.2349, + "step": 26813 + }, + { + "epoch": 
34.421052631578945, + "grad_norm": 1.6607003211975098, + "learning_rate": 2.186350021394951e-05, + "loss": 0.2091, + "step": 26814 + }, + { + "epoch": 34.42233632862644, + "grad_norm": 0.8899844288825989, + "learning_rate": 2.1863072314933674e-05, + "loss": 0.2092, + "step": 26815 + }, + { + "epoch": 34.42362002567394, + "grad_norm": 0.8486402034759521, + "learning_rate": 2.1862644415917846e-05, + "loss": 0.2292, + "step": 26816 + }, + { + "epoch": 34.42490372272144, + "grad_norm": 2.2474398612976074, + "learning_rate": 2.186221651690201e-05, + "loss": 0.2187, + "step": 26817 + }, + { + "epoch": 34.42618741976894, + "grad_norm": 1.5266144275665283, + "learning_rate": 2.186178861788618e-05, + "loss": 0.2523, + "step": 26818 + }, + { + "epoch": 34.427471116816434, + "grad_norm": 1.2586348056793213, + "learning_rate": 2.1861360718870348e-05, + "loss": 0.2216, + "step": 26819 + }, + { + "epoch": 34.428754813863925, + "grad_norm": 1.3315807580947876, + "learning_rate": 2.1860932819854513e-05, + "loss": 0.2269, + "step": 26820 + }, + { + "epoch": 34.43003851091142, + "grad_norm": 1.275437593460083, + "learning_rate": 2.1860504920838685e-05, + "loss": 0.2406, + "step": 26821 + }, + { + "epoch": 34.43132220795892, + "grad_norm": 1.1293165683746338, + "learning_rate": 2.186007702182285e-05, + "loss": 0.2393, + "step": 26822 + }, + { + "epoch": 34.43260590500642, + "grad_norm": 1.3835549354553223, + "learning_rate": 2.1859649122807018e-05, + "loss": 0.2434, + "step": 26823 + }, + { + "epoch": 34.43388960205392, + "grad_norm": 1.6231398582458496, + "learning_rate": 2.1859221223791187e-05, + "loss": 0.2405, + "step": 26824 + }, + { + "epoch": 34.435173299101415, + "grad_norm": 6.359742164611816, + "learning_rate": 2.1858793324775355e-05, + "loss": 0.2444, + "step": 26825 + }, + { + "epoch": 34.436456996148905, + "grad_norm": 1.4152038097381592, + "learning_rate": 2.185836542575952e-05, + "loss": 0.2441, + "step": 26826 + }, + { + "epoch": 34.4377406931964, + "grad_norm": 
2.55112624168396, + "learning_rate": 2.185793752674369e-05, + "loss": 0.215, + "step": 26827 + }, + { + "epoch": 34.4390243902439, + "grad_norm": 1.113638997077942, + "learning_rate": 2.1857509627727857e-05, + "loss": 0.2574, + "step": 26828 + }, + { + "epoch": 34.4403080872914, + "grad_norm": 1.4666097164154053, + "learning_rate": 2.1857081728712025e-05, + "loss": 0.2388, + "step": 26829 + }, + { + "epoch": 34.4415917843389, + "grad_norm": 2.5273983478546143, + "learning_rate": 2.1856653829696194e-05, + "loss": 0.2429, + "step": 26830 + }, + { + "epoch": 34.442875481386395, + "grad_norm": 3.1035711765289307, + "learning_rate": 2.185622593068036e-05, + "loss": 0.2421, + "step": 26831 + }, + { + "epoch": 34.44415917843389, + "grad_norm": 1.3527039289474487, + "learning_rate": 2.185579803166453e-05, + "loss": 0.2594, + "step": 26832 + }, + { + "epoch": 34.44544287548138, + "grad_norm": 1.5480483770370483, + "learning_rate": 2.1855370132648696e-05, + "loss": 0.2871, + "step": 26833 + }, + { + "epoch": 34.44672657252888, + "grad_norm": 1.911351203918457, + "learning_rate": 2.185494223363286e-05, + "loss": 0.3154, + "step": 26834 + }, + { + "epoch": 34.44801026957638, + "grad_norm": 1.943251371383667, + "learning_rate": 2.1854514334617032e-05, + "loss": 0.3049, + "step": 26835 + }, + { + "epoch": 34.44929396662388, + "grad_norm": 2.788313627243042, + "learning_rate": 2.1854086435601197e-05, + "loss": 0.4248, + "step": 26836 + }, + { + "epoch": 34.450577663671375, + "grad_norm": 0.8661390542984009, + "learning_rate": 2.185365853658537e-05, + "loss": 0.2461, + "step": 26837 + }, + { + "epoch": 34.45186136071887, + "grad_norm": 1.2176425457000732, + "learning_rate": 2.1853230637569534e-05, + "loss": 0.2565, + "step": 26838 + }, + { + "epoch": 34.45314505776637, + "grad_norm": 0.914333701133728, + "learning_rate": 2.1852802738553703e-05, + "loss": 0.2474, + "step": 26839 + }, + { + "epoch": 34.45442875481386, + "grad_norm": 1.6258898973464966, + "learning_rate": 
2.185237483953787e-05, + "loss": 0.305, + "step": 26840 + }, + { + "epoch": 34.45571245186136, + "grad_norm": 1.0399502515792847, + "learning_rate": 2.1851946940522036e-05, + "loss": 0.2279, + "step": 26841 + }, + { + "epoch": 34.45699614890886, + "grad_norm": 1.7008960247039795, + "learning_rate": 2.1851519041506205e-05, + "loss": 0.2673, + "step": 26842 + }, + { + "epoch": 34.458279845956355, + "grad_norm": 1.1072216033935547, + "learning_rate": 2.1851091142490373e-05, + "loss": 0.2798, + "step": 26843 + }, + { + "epoch": 34.45956354300385, + "grad_norm": 0.9679515361785889, + "learning_rate": 2.185066324347454e-05, + "loss": 0.245, + "step": 26844 + }, + { + "epoch": 34.46084724005135, + "grad_norm": 1.415000081062317, + "learning_rate": 2.185023534445871e-05, + "loss": 0.26, + "step": 26845 + }, + { + "epoch": 34.46213093709884, + "grad_norm": 1.5811378955841064, + "learning_rate": 2.1849807445442875e-05, + "loss": 0.2522, + "step": 26846 + }, + { + "epoch": 34.46341463414634, + "grad_norm": 1.3065775632858276, + "learning_rate": 2.1849379546427043e-05, + "loss": 0.2478, + "step": 26847 + }, + { + "epoch": 34.46469833119384, + "grad_norm": 1.0403282642364502, + "learning_rate": 2.184895164741121e-05, + "loss": 0.2255, + "step": 26848 + }, + { + "epoch": 34.465982028241335, + "grad_norm": 2.1604480743408203, + "learning_rate": 2.184852374839538e-05, + "loss": 0.245, + "step": 26849 + }, + { + "epoch": 34.46726572528883, + "grad_norm": 1.341651439666748, + "learning_rate": 2.1848095849379545e-05, + "loss": 0.2654, + "step": 26850 + }, + { + "epoch": 34.46854942233633, + "grad_norm": 0.9901694655418396, + "learning_rate": 2.1847667950363717e-05, + "loss": 0.2291, + "step": 26851 + }, + { + "epoch": 34.46983311938383, + "grad_norm": 2.302581787109375, + "learning_rate": 2.1847240051347882e-05, + "loss": 0.2404, + "step": 26852 + }, + { + "epoch": 34.47111681643132, + "grad_norm": 1.2396408319473267, + "learning_rate": 2.1846812152332047e-05, + "loss": 0.2376, + 
"step": 26853 + }, + { + "epoch": 34.47240051347882, + "grad_norm": 1.292870283126831, + "learning_rate": 2.184638425331622e-05, + "loss": 0.2227, + "step": 26854 + }, + { + "epoch": 34.473684210526315, + "grad_norm": 0.979458749294281, + "learning_rate": 2.1845956354300384e-05, + "loss": 0.2417, + "step": 26855 + }, + { + "epoch": 34.47496790757381, + "grad_norm": 1.2712997198104858, + "learning_rate": 2.1845528455284556e-05, + "loss": 0.2316, + "step": 26856 + }, + { + "epoch": 34.47625160462131, + "grad_norm": 1.2815558910369873, + "learning_rate": 2.184510055626872e-05, + "loss": 0.2733, + "step": 26857 + }, + { + "epoch": 34.47753530166881, + "grad_norm": 1.0947073698043823, + "learning_rate": 2.184467265725289e-05, + "loss": 0.2547, + "step": 26858 + }, + { + "epoch": 34.47881899871631, + "grad_norm": 1.0828561782836914, + "learning_rate": 2.1844244758237057e-05, + "loss": 0.2621, + "step": 26859 + }, + { + "epoch": 34.4801026957638, + "grad_norm": 1.0096745491027832, + "learning_rate": 2.1843816859221222e-05, + "loss": 0.2172, + "step": 26860 + }, + { + "epoch": 34.481386392811295, + "grad_norm": 1.544000267982483, + "learning_rate": 2.184338896020539e-05, + "loss": 0.2328, + "step": 26861 + }, + { + "epoch": 34.48267008985879, + "grad_norm": 1.319144368171692, + "learning_rate": 2.184296106118956e-05, + "loss": 0.2163, + "step": 26862 + }, + { + "epoch": 34.48395378690629, + "grad_norm": 0.9633342623710632, + "learning_rate": 2.1842533162173728e-05, + "loss": 0.2233, + "step": 26863 + }, + { + "epoch": 34.48523748395379, + "grad_norm": 0.9300260543823242, + "learning_rate": 2.1842105263157896e-05, + "loss": 0.2143, + "step": 26864 + }, + { + "epoch": 34.48652118100129, + "grad_norm": 1.0870691537857056, + "learning_rate": 2.1841677364142064e-05, + "loss": 0.2172, + "step": 26865 + }, + { + "epoch": 34.48780487804878, + "grad_norm": 1.122404932975769, + "learning_rate": 2.184124946512623e-05, + "loss": 0.2239, + "step": 26866 + }, + { + "epoch": 
34.489088575096275, + "grad_norm": 1.9652793407440186, + "learning_rate": 2.1840821566110398e-05, + "loss": 0.2456, + "step": 26867 + }, + { + "epoch": 34.49037227214377, + "grad_norm": 1.0351752042770386, + "learning_rate": 2.1840393667094566e-05, + "loss": 0.2273, + "step": 26868 + }, + { + "epoch": 34.49165596919127, + "grad_norm": 1.6260206699371338, + "learning_rate": 2.183996576807873e-05, + "loss": 0.2547, + "step": 26869 + }, + { + "epoch": 34.49293966623877, + "grad_norm": 1.447245478630066, + "learning_rate": 2.1839537869062903e-05, + "loss": 0.2381, + "step": 26870 + }, + { + "epoch": 34.49422336328627, + "grad_norm": 1.0404260158538818, + "learning_rate": 2.1839109970047068e-05, + "loss": 0.2399, + "step": 26871 + }, + { + "epoch": 34.495507060333765, + "grad_norm": 1.5127826929092407, + "learning_rate": 2.183868207103124e-05, + "loss": 0.2403, + "step": 26872 + }, + { + "epoch": 34.496790757381255, + "grad_norm": 1.2456191778182983, + "learning_rate": 2.1838254172015405e-05, + "loss": 0.2388, + "step": 26873 + }, + { + "epoch": 34.49807445442875, + "grad_norm": 1.3091042041778564, + "learning_rate": 2.183782627299957e-05, + "loss": 0.2115, + "step": 26874 + }, + { + "epoch": 34.49935815147625, + "grad_norm": 1.2755450010299683, + "learning_rate": 2.1837398373983742e-05, + "loss": 0.2436, + "step": 26875 + }, + { + "epoch": 34.50064184852375, + "grad_norm": 1.4721481800079346, + "learning_rate": 2.1836970474967907e-05, + "loss": 0.2106, + "step": 26876 + }, + { + "epoch": 34.50192554557125, + "grad_norm": 1.8572800159454346, + "learning_rate": 2.1836542575952075e-05, + "loss": 0.2283, + "step": 26877 + }, + { + "epoch": 34.503209242618745, + "grad_norm": 1.5926593542099, + "learning_rate": 2.1836114676936244e-05, + "loss": 0.2561, + "step": 26878 + }, + { + "epoch": 34.504492939666235, + "grad_norm": 1.2579177618026733, + "learning_rate": 2.1835686777920412e-05, + "loss": 0.2393, + "step": 26879 + }, + { + "epoch": 34.50577663671373, + "grad_norm": 
1.9729092121124268, + "learning_rate": 2.183525887890458e-05, + "loss": 0.2136, + "step": 26880 + }, + { + "epoch": 34.50706033376123, + "grad_norm": 1.1880580186843872, + "learning_rate": 2.1834830979888745e-05, + "loss": 0.2271, + "step": 26881 + }, + { + "epoch": 34.50834403080873, + "grad_norm": 1.5298866033554077, + "learning_rate": 2.1834403080872914e-05, + "loss": 0.2887, + "step": 26882 + }, + { + "epoch": 34.50962772785623, + "grad_norm": 1.7676982879638672, + "learning_rate": 2.1833975181857082e-05, + "loss": 0.2804, + "step": 26883 + }, + { + "epoch": 34.510911424903725, + "grad_norm": 1.585776448249817, + "learning_rate": 2.183354728284125e-05, + "loss": 0.267, + "step": 26884 + }, + { + "epoch": 34.51219512195122, + "grad_norm": 2.4510281085968018, + "learning_rate": 2.1833119383825416e-05, + "loss": 0.2824, + "step": 26885 + }, + { + "epoch": 34.51347881899871, + "grad_norm": 2.544374704360962, + "learning_rate": 2.1832691484809588e-05, + "loss": 0.435, + "step": 26886 + }, + { + "epoch": 34.51476251604621, + "grad_norm": 0.8519017696380615, + "learning_rate": 2.1832263585793753e-05, + "loss": 0.2537, + "step": 26887 + }, + { + "epoch": 34.51604621309371, + "grad_norm": 0.9378626942634583, + "learning_rate": 2.183183568677792e-05, + "loss": 0.2334, + "step": 26888 + }, + { + "epoch": 34.51732991014121, + "grad_norm": 1.376774549484253, + "learning_rate": 2.183140778776209e-05, + "loss": 0.2436, + "step": 26889 + }, + { + "epoch": 34.518613607188705, + "grad_norm": 0.7780030965805054, + "learning_rate": 2.1830979888746254e-05, + "loss": 0.2402, + "step": 26890 + }, + { + "epoch": 34.5198973042362, + "grad_norm": 1.0881412029266357, + "learning_rate": 2.1830551989730426e-05, + "loss": 0.256, + "step": 26891 + }, + { + "epoch": 34.52118100128369, + "grad_norm": 0.7612770795822144, + "learning_rate": 2.183012409071459e-05, + "loss": 0.2194, + "step": 26892 + }, + { + "epoch": 34.52246469833119, + "grad_norm": 0.7656771540641785, + "learning_rate": 
2.182969619169876e-05, + "loss": 0.268, + "step": 26893 + }, + { + "epoch": 34.52374839537869, + "grad_norm": 0.9658423066139221, + "learning_rate": 2.1829268292682928e-05, + "loss": 0.2697, + "step": 26894 + }, + { + "epoch": 34.52503209242619, + "grad_norm": 0.9515830278396606, + "learning_rate": 2.1828840393667093e-05, + "loss": 0.2259, + "step": 26895 + }, + { + "epoch": 34.526315789473685, + "grad_norm": 1.0341224670410156, + "learning_rate": 2.1828412494651265e-05, + "loss": 0.2871, + "step": 26896 + }, + { + "epoch": 34.52759948652118, + "grad_norm": 0.889745831489563, + "learning_rate": 2.182798459563543e-05, + "loss": 0.2328, + "step": 26897 + }, + { + "epoch": 34.52888318356868, + "grad_norm": 1.4639642238616943, + "learning_rate": 2.18275566966196e-05, + "loss": 0.2444, + "step": 26898 + }, + { + "epoch": 34.53016688061617, + "grad_norm": 2.0947515964508057, + "learning_rate": 2.1827128797603767e-05, + "loss": 0.2684, + "step": 26899 + }, + { + "epoch": 34.53145057766367, + "grad_norm": 1.0452697277069092, + "learning_rate": 2.1826700898587935e-05, + "loss": 0.2353, + "step": 26900 + }, + { + "epoch": 34.53273427471117, + "grad_norm": 0.8986276984214783, + "learning_rate": 2.18262729995721e-05, + "loss": 0.2195, + "step": 26901 + }, + { + "epoch": 34.534017971758665, + "grad_norm": 1.0424423217773438, + "learning_rate": 2.182584510055627e-05, + "loss": 0.2313, + "step": 26902 + }, + { + "epoch": 34.53530166880616, + "grad_norm": 1.4389660358428955, + "learning_rate": 2.1825417201540437e-05, + "loss": 0.2497, + "step": 26903 + }, + { + "epoch": 34.53658536585366, + "grad_norm": 1.293015718460083, + "learning_rate": 2.1824989302524605e-05, + "loss": 0.2191, + "step": 26904 + }, + { + "epoch": 34.53786906290116, + "grad_norm": 1.608521819114685, + "learning_rate": 2.1824561403508774e-05, + "loss": 0.2191, + "step": 26905 + }, + { + "epoch": 34.53915275994865, + "grad_norm": 1.447640299797058, + "learning_rate": 2.182413350449294e-05, + "loss": 0.2288, + 
"step": 26906 + }, + { + "epoch": 34.54043645699615, + "grad_norm": 0.789340078830719, + "learning_rate": 2.1823705605477107e-05, + "loss": 0.2486, + "step": 26907 + }, + { + "epoch": 34.541720154043645, + "grad_norm": 1.7988457679748535, + "learning_rate": 2.1823277706461276e-05, + "loss": 0.2378, + "step": 26908 + }, + { + "epoch": 34.54300385109114, + "grad_norm": 0.9712130427360535, + "learning_rate": 2.182284980744544e-05, + "loss": 0.2456, + "step": 26909 + }, + { + "epoch": 34.54428754813864, + "grad_norm": 1.4510139226913452, + "learning_rate": 2.1822421908429612e-05, + "loss": 0.2227, + "step": 26910 + }, + { + "epoch": 34.54557124518614, + "grad_norm": 0.9951423406600952, + "learning_rate": 2.1821994009413778e-05, + "loss": 0.2476, + "step": 26911 + }, + { + "epoch": 34.54685494223363, + "grad_norm": 1.0223067998886108, + "learning_rate": 2.182156611039795e-05, + "loss": 0.2591, + "step": 26912 + }, + { + "epoch": 34.54813863928113, + "grad_norm": 1.4960460662841797, + "learning_rate": 2.1821138211382114e-05, + "loss": 0.2247, + "step": 26913 + }, + { + "epoch": 34.549422336328625, + "grad_norm": 2.6903460025787354, + "learning_rate": 2.182071031236628e-05, + "loss": 0.2293, + "step": 26914 + }, + { + "epoch": 34.55070603337612, + "grad_norm": null, + "learning_rate": 2.182071031236628e-05, + "loss": 0.2118, + "step": 26915 + }, + { + "epoch": 34.55198973042362, + "grad_norm": 2.866483688354492, + "learning_rate": 2.182028241335045e-05, + "loss": 0.2192, + "step": 26916 + }, + { + "epoch": 34.55327342747112, + "grad_norm": 3.3545212745666504, + "learning_rate": 2.1819854514334616e-05, + "loss": 0.2131, + "step": 26917 + }, + { + "epoch": 34.55455712451862, + "grad_norm": 1.104834794998169, + "learning_rate": 2.1819426615318785e-05, + "loss": 0.2137, + "step": 26918 + }, + { + "epoch": 34.55584082156611, + "grad_norm": 1.0286529064178467, + "learning_rate": 2.1818998716302953e-05, + "loss": 0.2206, + "step": 26919 + }, + { + "epoch": 
34.557124518613605, + "grad_norm": 2.191608428955078, + "learning_rate": 2.181857081728712e-05, + "loss": 0.2386, + "step": 26920 + }, + { + "epoch": 34.5584082156611, + "grad_norm": 1.3838465213775635, + "learning_rate": 2.181814291827129e-05, + "loss": 0.2644, + "step": 26921 + }, + { + "epoch": 34.5596919127086, + "grad_norm": 1.2764925956726074, + "learning_rate": 2.1817715019255455e-05, + "loss": 0.2268, + "step": 26922 + }, + { + "epoch": 34.5609756097561, + "grad_norm": 1.7440979480743408, + "learning_rate": 2.1817287120239623e-05, + "loss": 0.2404, + "step": 26923 + }, + { + "epoch": 34.5622593068036, + "grad_norm": 2.79585862159729, + "learning_rate": 2.181685922122379e-05, + "loss": 0.2535, + "step": 26924 + }, + { + "epoch": 34.563543003851095, + "grad_norm": 1.2033100128173828, + "learning_rate": 2.181643132220796e-05, + "loss": 0.2298, + "step": 26925 + }, + { + "epoch": 34.564826700898585, + "grad_norm": 4.338474273681641, + "learning_rate": 2.1816003423192125e-05, + "loss": 0.2181, + "step": 26926 + }, + { + "epoch": 34.56611039794608, + "grad_norm": 9.82397174835205, + "learning_rate": 2.1815575524176297e-05, + "loss": 0.2757, + "step": 26927 + }, + { + "epoch": 34.56739409499358, + "grad_norm": 1.9606715440750122, + "learning_rate": 2.1815147625160462e-05, + "loss": 0.2372, + "step": 26928 + }, + { + "epoch": 34.56867779204108, + "grad_norm": 1.7707042694091797, + "learning_rate": 2.181471972614463e-05, + "loss": 0.2687, + "step": 26929 + }, + { + "epoch": 34.56996148908858, + "grad_norm": 1.513737678527832, + "learning_rate": 2.18142918271288e-05, + "loss": 0.268, + "step": 26930 + }, + { + "epoch": 34.571245186136075, + "grad_norm": 1.3048195838928223, + "learning_rate": 2.1813863928112964e-05, + "loss": 0.2595, + "step": 26931 + }, + { + "epoch": 34.572528883183566, + "grad_norm": 2.1630494594573975, + "learning_rate": 2.1813436029097136e-05, + "loss": 0.2798, + "step": 26932 + }, + { + "epoch": 34.57381258023106, + "grad_norm": 
1.9222993850708008, + "learning_rate": 2.18130081300813e-05, + "loss": 0.2921, + "step": 26933 + }, + { + "epoch": 34.57509627727856, + "grad_norm": 1.8627973794937134, + "learning_rate": 2.181258023106547e-05, + "loss": 0.2867, + "step": 26934 + }, + { + "epoch": 34.57637997432606, + "grad_norm": 2.1226372718811035, + "learning_rate": 2.1812152332049637e-05, + "loss": 0.3088, + "step": 26935 + }, + { + "epoch": 34.57766367137356, + "grad_norm": 2.6299824714660645, + "learning_rate": 2.1811724433033802e-05, + "loss": 0.3764, + "step": 26936 + }, + { + "epoch": 34.578947368421055, + "grad_norm": 1.1322306394577026, + "learning_rate": 2.1811296534017974e-05, + "loss": 0.2486, + "step": 26937 + }, + { + "epoch": 34.58023106546855, + "grad_norm": 1.2381937503814697, + "learning_rate": 2.181086863500214e-05, + "loss": 0.2472, + "step": 26938 + }, + { + "epoch": 34.58151476251604, + "grad_norm": 4.100905895233154, + "learning_rate": 2.1810440735986308e-05, + "loss": 0.2815, + "step": 26939 + }, + { + "epoch": 34.58279845956354, + "grad_norm": 1.021045446395874, + "learning_rate": 2.1810012836970476e-05, + "loss": 0.2598, + "step": 26940 + }, + { + "epoch": 34.58408215661104, + "grad_norm": 1.1068871021270752, + "learning_rate": 2.1809584937954645e-05, + "loss": 0.2454, + "step": 26941 + }, + { + "epoch": 34.58536585365854, + "grad_norm": 0.7830415964126587, + "learning_rate": 2.180915703893881e-05, + "loss": 0.2398, + "step": 26942 + }, + { + "epoch": 34.586649550706035, + "grad_norm": 1.0596592426300049, + "learning_rate": 2.1808729139922978e-05, + "loss": 0.2348, + "step": 26943 + }, + { + "epoch": 34.58793324775353, + "grad_norm": 1.0969985723495483, + "learning_rate": 2.1808301240907146e-05, + "loss": 0.242, + "step": 26944 + }, + { + "epoch": 34.589216944801024, + "grad_norm": 0.7703793048858643, + "learning_rate": 2.1807873341891315e-05, + "loss": 0.2593, + "step": 26945 + }, + { + "epoch": 34.59050064184852, + "grad_norm": 1.2558131217956543, + "learning_rate": 
2.1807445442875483e-05, + "loss": 0.2489, + "step": 26946 + }, + { + "epoch": 34.59178433889602, + "grad_norm": 0.8638955354690552, + "learning_rate": 2.1807017543859648e-05, + "loss": 0.2354, + "step": 26947 + }, + { + "epoch": 34.59306803594352, + "grad_norm": 2.9240691661834717, + "learning_rate": 2.180658964484382e-05, + "loss": 0.2754, + "step": 26948 + }, + { + "epoch": 34.594351732991015, + "grad_norm": 0.7334431409835815, + "learning_rate": 2.1806161745827985e-05, + "loss": 0.2348, + "step": 26949 + }, + { + "epoch": 34.59563543003851, + "grad_norm": 1.3061953783035278, + "learning_rate": 2.180573384681215e-05, + "loss": 0.2494, + "step": 26950 + }, + { + "epoch": 34.59691912708601, + "grad_norm": 1.3653677701950073, + "learning_rate": 2.1805305947796322e-05, + "loss": 0.2399, + "step": 26951 + }, + { + "epoch": 34.5982028241335, + "grad_norm": 2.4488823413848877, + "learning_rate": 2.1804878048780487e-05, + "loss": 0.2282, + "step": 26952 + }, + { + "epoch": 34.599486521181, + "grad_norm": 0.9447447657585144, + "learning_rate": 2.180445014976466e-05, + "loss": 0.2435, + "step": 26953 + }, + { + "epoch": 34.6007702182285, + "grad_norm": 0.8171132802963257, + "learning_rate": 2.1804022250748824e-05, + "loss": 0.2241, + "step": 26954 + }, + { + "epoch": 34.602053915275995, + "grad_norm": 1.4479360580444336, + "learning_rate": 2.1803594351732992e-05, + "loss": 0.2469, + "step": 26955 + }, + { + "epoch": 34.60333761232349, + "grad_norm": 1.7186866998672485, + "learning_rate": 2.180316645271716e-05, + "loss": 0.2499, + "step": 26956 + }, + { + "epoch": 34.60462130937099, + "grad_norm": 1.3690478801727295, + "learning_rate": 2.1802738553701326e-05, + "loss": 0.2555, + "step": 26957 + }, + { + "epoch": 34.60590500641848, + "grad_norm": 1.3229848146438599, + "learning_rate": 2.1802310654685494e-05, + "loss": 0.2264, + "step": 26958 + }, + { + "epoch": 34.60718870346598, + "grad_norm": 1.1217838525772095, + "learning_rate": 2.1801882755669662e-05, + "loss": 0.2093, 
+ "step": 26959 + }, + { + "epoch": 34.60847240051348, + "grad_norm": 1.0842698812484741, + "learning_rate": 2.180145485665383e-05, + "loss": 0.2517, + "step": 26960 + }, + { + "epoch": 34.609756097560975, + "grad_norm": 0.923234760761261, + "learning_rate": 2.1801026957638e-05, + "loss": 0.2272, + "step": 26961 + }, + { + "epoch": 34.61103979460847, + "grad_norm": 2.9943606853485107, + "learning_rate": 2.1800599058622168e-05, + "loss": 0.2337, + "step": 26962 + }, + { + "epoch": 34.61232349165597, + "grad_norm": 1.2338920831680298, + "learning_rate": 2.1800171159606333e-05, + "loss": 0.2278, + "step": 26963 + }, + { + "epoch": 34.61360718870347, + "grad_norm": 1.4014875888824463, + "learning_rate": 2.17997432605905e-05, + "loss": 0.2155, + "step": 26964 + }, + { + "epoch": 34.61489088575096, + "grad_norm": 1.2414389848709106, + "learning_rate": 2.179931536157467e-05, + "loss": 0.235, + "step": 26965 + }, + { + "epoch": 34.61617458279846, + "grad_norm": 1.1419765949249268, + "learning_rate": 2.1798887462558834e-05, + "loss": 0.2261, + "step": 26966 + }, + { + "epoch": 34.617458279845955, + "grad_norm": 1.3546744585037231, + "learning_rate": 2.1798459563543006e-05, + "loss": 0.2417, + "step": 26967 + }, + { + "epoch": 34.61874197689345, + "grad_norm": 1.4434828758239746, + "learning_rate": 2.179803166452717e-05, + "loss": 0.219, + "step": 26968 + }, + { + "epoch": 34.62002567394095, + "grad_norm": 1.1068300008773804, + "learning_rate": 2.179760376551134e-05, + "loss": 0.2515, + "step": 26969 + }, + { + "epoch": 34.62130937098845, + "grad_norm": 1.2512918710708618, + "learning_rate": 2.1797175866495508e-05, + "loss": 0.2408, + "step": 26970 + }, + { + "epoch": 34.62259306803595, + "grad_norm": 1.070764183998108, + "learning_rate": 2.1796747967479673e-05, + "loss": 0.265, + "step": 26971 + }, + { + "epoch": 34.62387676508344, + "grad_norm": 1.0345118045806885, + "learning_rate": 2.1796320068463845e-05, + "loss": 0.2519, + "step": 26972 + }, + { + "epoch": 
34.625160462130935, + "grad_norm": 1.213051199913025, + "learning_rate": 2.179589216944801e-05, + "loss": 0.2466, + "step": 26973 + }, + { + "epoch": 34.62644415917843, + "grad_norm": 1.4393773078918457, + "learning_rate": 2.179546427043218e-05, + "loss": 0.2174, + "step": 26974 + }, + { + "epoch": 34.62772785622593, + "grad_norm": 1.6053279638290405, + "learning_rate": 2.1795036371416347e-05, + "loss": 0.2397, + "step": 26975 + }, + { + "epoch": 34.62901155327343, + "grad_norm": 1.388523817062378, + "learning_rate": 2.1794608472400512e-05, + "loss": 0.2143, + "step": 26976 + }, + { + "epoch": 34.63029525032093, + "grad_norm": 1.382746696472168, + "learning_rate": 2.1794180573384684e-05, + "loss": 0.2273, + "step": 26977 + }, + { + "epoch": 34.63157894736842, + "grad_norm": 1.2812368869781494, + "learning_rate": 2.179375267436885e-05, + "loss": 0.2739, + "step": 26978 + }, + { + "epoch": 34.632862644415916, + "grad_norm": 4.549227714538574, + "learning_rate": 2.1793324775353017e-05, + "loss": 0.2249, + "step": 26979 + }, + { + "epoch": 34.63414634146341, + "grad_norm": 1.2347488403320312, + "learning_rate": 2.1792896876337185e-05, + "loss": 0.1978, + "step": 26980 + }, + { + "epoch": 34.63543003851091, + "grad_norm": 3.1698548793792725, + "learning_rate": 2.1792468977321354e-05, + "loss": 0.2247, + "step": 26981 + }, + { + "epoch": 34.63671373555841, + "grad_norm": 1.957311749458313, + "learning_rate": 2.179204107830552e-05, + "loss": 0.2901, + "step": 26982 + }, + { + "epoch": 34.63799743260591, + "grad_norm": 2.1638922691345215, + "learning_rate": 2.1791613179289687e-05, + "loss": 0.2356, + "step": 26983 + }, + { + "epoch": 34.639281129653405, + "grad_norm": 4.4316534996032715, + "learning_rate": 2.1791185280273856e-05, + "loss": 0.276, + "step": 26984 + }, + { + "epoch": 34.640564826700896, + "grad_norm": 2.9145524501800537, + "learning_rate": 2.1790757381258024e-05, + "loss": 0.3479, + "step": 26985 + }, + { + "epoch": 34.64184852374839, + "grad_norm": 
2.1482300758361816, + "learning_rate": 2.1790329482242193e-05, + "loss": 0.3709, + "step": 26986 + }, + { + "epoch": 34.64313222079589, + "grad_norm": 1.1457219123840332, + "learning_rate": 2.1789901583226358e-05, + "loss": 0.2456, + "step": 26987 + }, + { + "epoch": 34.64441591784339, + "grad_norm": 2.596531629562378, + "learning_rate": 2.178947368421053e-05, + "loss": 0.2767, + "step": 26988 + }, + { + "epoch": 34.64569961489089, + "grad_norm": 0.7548315525054932, + "learning_rate": 2.1789045785194694e-05, + "loss": 0.2409, + "step": 26989 + }, + { + "epoch": 34.646983311938385, + "grad_norm": 0.9725293517112732, + "learning_rate": 2.178861788617886e-05, + "loss": 0.2504, + "step": 26990 + }, + { + "epoch": 34.64826700898588, + "grad_norm": 0.9680541753768921, + "learning_rate": 2.178818998716303e-05, + "loss": 0.2522, + "step": 26991 + }, + { + "epoch": 34.649550706033374, + "grad_norm": 1.7462489604949951, + "learning_rate": 2.1787762088147196e-05, + "loss": 0.2545, + "step": 26992 + }, + { + "epoch": 34.65083440308087, + "grad_norm": 0.9510042667388916, + "learning_rate": 2.1787334189131368e-05, + "loss": 0.2448, + "step": 26993 + }, + { + "epoch": 34.65211810012837, + "grad_norm": 0.9807847142219543, + "learning_rate": 2.1786906290115533e-05, + "loss": 0.2159, + "step": 26994 + }, + { + "epoch": 34.65340179717587, + "grad_norm": 1.8201614618301392, + "learning_rate": 2.17864783910997e-05, + "loss": 0.2556, + "step": 26995 + }, + { + "epoch": 34.654685494223365, + "grad_norm": 1.5028859376907349, + "learning_rate": 2.178605049208387e-05, + "loss": 0.2462, + "step": 26996 + }, + { + "epoch": 34.65596919127086, + "grad_norm": 0.8471458554267883, + "learning_rate": 2.1785622593068035e-05, + "loss": 0.2404, + "step": 26997 + }, + { + "epoch": 34.657252888318354, + "grad_norm": 0.8273284435272217, + "learning_rate": 2.1785194694052203e-05, + "loss": 0.2457, + "step": 26998 + }, + { + "epoch": 34.65853658536585, + "grad_norm": 0.9554068446159363, + "learning_rate": 
2.1784766795036372e-05, + "loss": 0.2458, + "step": 26999 + }, + { + "epoch": 34.65982028241335, + "grad_norm": 1.7972455024719238, + "learning_rate": 2.178433889602054e-05, + "loss": 0.2494, + "step": 27000 + }, + { + "epoch": 34.65982028241335, + "eval_cer": 0.26387801057372234, + "eval_loss": 0.47214630246162415, + "eval_runtime": 13.7189, + "eval_samples_per_second": 71.653, + "eval_steps_per_second": 0.51, + "eval_wer": 0.4417564778720477, + "step": 27000 + }, + { + "epoch": 34.66110397946085, + "grad_norm": 0.9784676432609558, + "learning_rate": 2.178391099700471e-05, + "loss": 0.2588, + "step": 27001 + }, + { + "epoch": 34.662387676508345, + "grad_norm": 1.025225281715393, + "learning_rate": 2.1783483097988877e-05, + "loss": 0.2526, + "step": 27002 + }, + { + "epoch": 34.66367137355584, + "grad_norm": 0.828098714351654, + "learning_rate": 2.1783055198973042e-05, + "loss": 0.2414, + "step": 27003 + }, + { + "epoch": 34.66495507060334, + "grad_norm": 1.3166276216506958, + "learning_rate": 2.178262729995721e-05, + "loss": 0.2372, + "step": 27004 + }, + { + "epoch": 34.66623876765083, + "grad_norm": 0.8032799959182739, + "learning_rate": 2.178219940094138e-05, + "loss": 0.2242, + "step": 27005 + }, + { + "epoch": 34.66752246469833, + "grad_norm": 1.1182178258895874, + "learning_rate": 2.1781771501925544e-05, + "loss": 0.2396, + "step": 27006 + }, + { + "epoch": 34.66880616174583, + "grad_norm": 0.7741937637329102, + "learning_rate": 2.1781343602909716e-05, + "loss": 0.2658, + "step": 27007 + }, + { + "epoch": 34.670089858793325, + "grad_norm": 0.9778304696083069, + "learning_rate": 2.178091570389388e-05, + "loss": 0.2322, + "step": 27008 + }, + { + "epoch": 34.67137355584082, + "grad_norm": 2.2343664169311523, + "learning_rate": 2.1780487804878052e-05, + "loss": 0.2274, + "step": 27009 + }, + { + "epoch": 34.67265725288832, + "grad_norm": 0.9999895691871643, + "learning_rate": 2.1780059905862217e-05, + "loss": 0.2326, + "step": 27010 + }, + { + "epoch": 
34.67394094993581, + "grad_norm": 0.9146766066551208, + "learning_rate": 2.1779632006846383e-05, + "loss": 0.2096, + "step": 27011 + }, + { + "epoch": 34.67522464698331, + "grad_norm": 1.214656949043274, + "learning_rate": 2.1779204107830554e-05, + "loss": 0.2478, + "step": 27012 + }, + { + "epoch": 34.67650834403081, + "grad_norm": 0.8257031440734863, + "learning_rate": 2.177877620881472e-05, + "loss": 0.2321, + "step": 27013 + }, + { + "epoch": 34.677792041078305, + "grad_norm": 1.8838082551956177, + "learning_rate": 2.1778348309798888e-05, + "loss": 0.2177, + "step": 27014 + }, + { + "epoch": 34.6790757381258, + "grad_norm": 1.6505999565124512, + "learning_rate": 2.1777920410783056e-05, + "loss": 0.2396, + "step": 27015 + }, + { + "epoch": 34.6803594351733, + "grad_norm": 1.1543993949890137, + "learning_rate": 2.1777492511767225e-05, + "loss": 0.2299, + "step": 27016 + }, + { + "epoch": 34.6816431322208, + "grad_norm": 2.1336710453033447, + "learning_rate": 2.1777064612751393e-05, + "loss": 0.2238, + "step": 27017 + }, + { + "epoch": 34.68292682926829, + "grad_norm": 1.0305002927780151, + "learning_rate": 2.1776636713735558e-05, + "loss": 0.2187, + "step": 27018 + }, + { + "epoch": 34.68421052631579, + "grad_norm": 1.1519534587860107, + "learning_rate": 2.1776208814719726e-05, + "loss": 0.2233, + "step": 27019 + }, + { + "epoch": 34.685494223363285, + "grad_norm": 1.7680211067199707, + "learning_rate": 2.1775780915703895e-05, + "loss": 0.2224, + "step": 27020 + }, + { + "epoch": 34.68677792041078, + "grad_norm": 1.6490626335144043, + "learning_rate": 2.1775353016688063e-05, + "loss": 0.2301, + "step": 27021 + }, + { + "epoch": 34.68806161745828, + "grad_norm": 1.11655855178833, + "learning_rate": 2.1774925117672228e-05, + "loss": 0.2344, + "step": 27022 + }, + { + "epoch": 34.68934531450578, + "grad_norm": 1.072237253189087, + "learning_rate": 2.17744972186564e-05, + "loss": 0.2221, + "step": 27023 + }, + { + "epoch": 34.69062901155327, + "grad_norm": 
3.3418779373168945, + "learning_rate": 2.1774069319640565e-05, + "loss": 0.2228, + "step": 27024 + }, + { + "epoch": 34.69191270860077, + "grad_norm": 1.4061110019683838, + "learning_rate": 2.1773641420624733e-05, + "loss": 0.2338, + "step": 27025 + }, + { + "epoch": 34.693196405648266, + "grad_norm": 2.699633836746216, + "learning_rate": 2.1773213521608902e-05, + "loss": 0.2395, + "step": 27026 + }, + { + "epoch": 34.69448010269576, + "grad_norm": 3.194814920425415, + "learning_rate": 2.1772785622593067e-05, + "loss": 0.2791, + "step": 27027 + }, + { + "epoch": 34.69576379974326, + "grad_norm": 2.306095600128174, + "learning_rate": 2.177235772357724e-05, + "loss": 0.2615, + "step": 27028 + }, + { + "epoch": 34.69704749679076, + "grad_norm": 4.4399824142456055, + "learning_rate": 2.1771929824561404e-05, + "loss": 0.2631, + "step": 27029 + }, + { + "epoch": 34.69833119383826, + "grad_norm": 1.2723360061645508, + "learning_rate": 2.1771501925545572e-05, + "loss": 0.2557, + "step": 27030 + }, + { + "epoch": 34.69961489088575, + "grad_norm": 2.828076124191284, + "learning_rate": 2.177107402652974e-05, + "loss": 0.2606, + "step": 27031 + }, + { + "epoch": 34.700898587933246, + "grad_norm": 2.47729754447937, + "learning_rate": 2.1770646127513906e-05, + "loss": 0.2643, + "step": 27032 + }, + { + "epoch": 34.70218228498074, + "grad_norm": 1.4463342428207397, + "learning_rate": 2.1770218228498077e-05, + "loss": 0.2606, + "step": 27033 + }, + { + "epoch": 34.70346598202824, + "grad_norm": 1.8638454675674438, + "learning_rate": 2.1769790329482242e-05, + "loss": 0.3079, + "step": 27034 + }, + { + "epoch": 34.70474967907574, + "grad_norm": 1.43392813205719, + "learning_rate": 2.176936243046641e-05, + "loss": 0.3803, + "step": 27035 + }, + { + "epoch": 34.70603337612324, + "grad_norm": 2.6517882347106934, + "learning_rate": 2.176893453145058e-05, + "loss": 0.4066, + "step": 27036 + }, + { + "epoch": 34.707317073170735, + "grad_norm": 1.3543063402175903, + "learning_rate": 
2.1768506632434744e-05, + "loss": 0.2661, + "step": 27037 + }, + { + "epoch": 34.708600770218226, + "grad_norm": 2.5257890224456787, + "learning_rate": 2.1768078733418913e-05, + "loss": 0.2414, + "step": 27038 + }, + { + "epoch": 34.709884467265724, + "grad_norm": 1.0410727262496948, + "learning_rate": 2.176765083440308e-05, + "loss": 0.2456, + "step": 27039 + }, + { + "epoch": 34.71116816431322, + "grad_norm": 0.7926271557807922, + "learning_rate": 2.176722293538725e-05, + "loss": 0.2843, + "step": 27040 + }, + { + "epoch": 34.71245186136072, + "grad_norm": 1.6392697095870972, + "learning_rate": 2.1766795036371418e-05, + "loss": 0.2497, + "step": 27041 + }, + { + "epoch": 34.71373555840822, + "grad_norm": 1.0647536516189575, + "learning_rate": 2.1766367137355586e-05, + "loss": 0.246, + "step": 27042 + }, + { + "epoch": 34.715019255455715, + "grad_norm": 0.8633779287338257, + "learning_rate": 2.176593923833975e-05, + "loss": 0.2645, + "step": 27043 + }, + { + "epoch": 34.716302952503206, + "grad_norm": 0.9388037919998169, + "learning_rate": 2.176551133932392e-05, + "loss": 0.226, + "step": 27044 + }, + { + "epoch": 34.717586649550704, + "grad_norm": 0.7565674185752869, + "learning_rate": 2.1765083440308088e-05, + "loss": 0.2267, + "step": 27045 + }, + { + "epoch": 34.7188703465982, + "grad_norm": 0.8940629959106445, + "learning_rate": 2.1764655541292253e-05, + "loss": 0.2555, + "step": 27046 + }, + { + "epoch": 34.7201540436457, + "grad_norm": 0.9111307263374329, + "learning_rate": 2.1764227642276425e-05, + "loss": 0.2175, + "step": 27047 + }, + { + "epoch": 34.7214377406932, + "grad_norm": 0.783967912197113, + "learning_rate": 2.176379974326059e-05, + "loss": 0.226, + "step": 27048 + }, + { + "epoch": 34.722721437740695, + "grad_norm": 0.8838589191436768, + "learning_rate": 2.1763371844244762e-05, + "loss": 0.2614, + "step": 27049 + }, + { + "epoch": 34.72400513478819, + "grad_norm": 0.885148286819458, + "learning_rate": 2.1762943945228927e-05, + "loss": 0.2256, + 
"step": 27050 + }, + { + "epoch": 34.725288831835684, + "grad_norm": 1.1911674737930298, + "learning_rate": 2.1762516046213092e-05, + "loss": 0.2606, + "step": 27051 + }, + { + "epoch": 34.72657252888318, + "grad_norm": 1.2155555486679077, + "learning_rate": 2.1762088147197264e-05, + "loss": 0.245, + "step": 27052 + }, + { + "epoch": 34.72785622593068, + "grad_norm": 1.1291264295578003, + "learning_rate": 2.176166024818143e-05, + "loss": 0.2376, + "step": 27053 + }, + { + "epoch": 34.72913992297818, + "grad_norm": 0.7810655832290649, + "learning_rate": 2.1761232349165597e-05, + "loss": 0.2156, + "step": 27054 + }, + { + "epoch": 34.730423620025675, + "grad_norm": 1.666796326637268, + "learning_rate": 2.1760804450149766e-05, + "loss": 0.2111, + "step": 27055 + }, + { + "epoch": 34.73170731707317, + "grad_norm": 0.785419225692749, + "learning_rate": 2.1760376551133934e-05, + "loss": 0.2418, + "step": 27056 + }, + { + "epoch": 34.73299101412067, + "grad_norm": 1.4230774641036987, + "learning_rate": 2.17599486521181e-05, + "loss": 0.2401, + "step": 27057 + }, + { + "epoch": 34.73427471116816, + "grad_norm": 1.7122511863708496, + "learning_rate": 2.1759520753102267e-05, + "loss": 0.2367, + "step": 27058 + }, + { + "epoch": 34.73555840821566, + "grad_norm": 0.8618189692497253, + "learning_rate": 2.1759092854086436e-05, + "loss": 0.2325, + "step": 27059 + }, + { + "epoch": 34.73684210526316, + "grad_norm": 0.9293550252914429, + "learning_rate": 2.1758664955070604e-05, + "loss": 0.2236, + "step": 27060 + }, + { + "epoch": 34.738125802310655, + "grad_norm": 1.2328602075576782, + "learning_rate": 2.1758237056054773e-05, + "loss": 0.2163, + "step": 27061 + }, + { + "epoch": 34.73940949935815, + "grad_norm": 1.0643458366394043, + "learning_rate": 2.1757809157038938e-05, + "loss": 0.2387, + "step": 27062 + }, + { + "epoch": 34.74069319640565, + "grad_norm": 1.8314698934555054, + "learning_rate": 2.175738125802311e-05, + "loss": 0.2174, + "step": 27063 + }, + { + "epoch": 
34.74197689345314, + "grad_norm": 0.9420031905174255, + "learning_rate": 2.1756953359007274e-05, + "loss": 0.208, + "step": 27064 + }, + { + "epoch": 34.74326059050064, + "grad_norm": 1.3506860733032227, + "learning_rate": 2.175652545999144e-05, + "loss": 0.241, + "step": 27065 + }, + { + "epoch": 34.74454428754814, + "grad_norm": 1.9610835313796997, + "learning_rate": 2.175609756097561e-05, + "loss": 0.2162, + "step": 27066 + }, + { + "epoch": 34.745827984595635, + "grad_norm": 1.4103788137435913, + "learning_rate": 2.1755669661959776e-05, + "loss": 0.2509, + "step": 27067 + }, + { + "epoch": 34.74711168164313, + "grad_norm": 2.0010290145874023, + "learning_rate": 2.1755241762943948e-05, + "loss": 0.2305, + "step": 27068 + }, + { + "epoch": 34.74839537869063, + "grad_norm": 2.3834707736968994, + "learning_rate": 2.1754813863928113e-05, + "loss": 0.3039, + "step": 27069 + }, + { + "epoch": 34.74967907573813, + "grad_norm": 0.9274929165840149, + "learning_rate": 2.175438596491228e-05, + "loss": 0.2275, + "step": 27070 + }, + { + "epoch": 34.75096277278562, + "grad_norm": 1.251046061515808, + "learning_rate": 2.175395806589645e-05, + "loss": 0.2659, + "step": 27071 + }, + { + "epoch": 34.75224646983312, + "grad_norm": 2.005720376968384, + "learning_rate": 2.1753530166880615e-05, + "loss": 0.2342, + "step": 27072 + }, + { + "epoch": 34.753530166880616, + "grad_norm": 1.0145350694656372, + "learning_rate": 2.1753102267864783e-05, + "loss": 0.2368, + "step": 27073 + }, + { + "epoch": 34.75481386392811, + "grad_norm": 1.7348045110702515, + "learning_rate": 2.1752674368848952e-05, + "loss": 0.2516, + "step": 27074 + }, + { + "epoch": 34.75609756097561, + "grad_norm": 1.646619200706482, + "learning_rate": 2.175224646983312e-05, + "loss": 0.2419, + "step": 27075 + }, + { + "epoch": 34.75738125802311, + "grad_norm": 1.4049686193466187, + "learning_rate": 2.175181857081729e-05, + "loss": 0.2614, + "step": 27076 + }, + { + "epoch": 34.7586649550706, + "grad_norm": 
1.713585615158081, + "learning_rate": 2.1751390671801457e-05, + "loss": 0.2471, + "step": 27077 + }, + { + "epoch": 34.7599486521181, + "grad_norm": 1.68833589553833, + "learning_rate": 2.1750962772785622e-05, + "loss": 0.2888, + "step": 27078 + }, + { + "epoch": 34.761232349165596, + "grad_norm": 1.9789385795593262, + "learning_rate": 2.175053487376979e-05, + "loss": 0.2551, + "step": 27079 + }, + { + "epoch": 34.76251604621309, + "grad_norm": 1.5607936382293701, + "learning_rate": 2.175010697475396e-05, + "loss": 0.2214, + "step": 27080 + }, + { + "epoch": 34.76379974326059, + "grad_norm": 2.5251359939575195, + "learning_rate": 2.1749679075738124e-05, + "loss": 0.2785, + "step": 27081 + }, + { + "epoch": 34.76508344030809, + "grad_norm": 1.6285425424575806, + "learning_rate": 2.1749251176722296e-05, + "loss": 0.2738, + "step": 27082 + }, + { + "epoch": 34.76636713735559, + "grad_norm": 1.3221633434295654, + "learning_rate": 2.174882327770646e-05, + "loss": 0.3082, + "step": 27083 + }, + { + "epoch": 34.76765083440308, + "grad_norm": 1.5781605243682861, + "learning_rate": 2.1748395378690633e-05, + "loss": 0.289, + "step": 27084 + }, + { + "epoch": 34.768934531450576, + "grad_norm": 4.189155578613281, + "learning_rate": 2.1747967479674798e-05, + "loss": 0.3429, + "step": 27085 + }, + { + "epoch": 34.770218228498074, + "grad_norm": 4.129974365234375, + "learning_rate": 2.1747539580658963e-05, + "loss": 0.4786, + "step": 27086 + }, + { + "epoch": 34.77150192554557, + "grad_norm": 0.6907979846000671, + "learning_rate": 2.1747111681643134e-05, + "loss": 0.258, + "step": 27087 + }, + { + "epoch": 34.77278562259307, + "grad_norm": 7.904734134674072, + "learning_rate": 2.17466837826273e-05, + "loss": 0.2627, + "step": 27088 + }, + { + "epoch": 34.77406931964057, + "grad_norm": 1.1931017637252808, + "learning_rate": 2.1746255883611468e-05, + "loss": 0.26, + "step": 27089 + }, + { + "epoch": 34.775353016688065, + "grad_norm": 0.658277690410614, + "learning_rate": 
2.1745827984595636e-05, + "loss": 0.2639, + "step": 27090 + }, + { + "epoch": 34.776636713735556, + "grad_norm": 0.7179901599884033, + "learning_rate": 2.1745400085579805e-05, + "loss": 0.224, + "step": 27091 + }, + { + "epoch": 34.777920410783054, + "grad_norm": 1.145542025566101, + "learning_rate": 2.1744972186563973e-05, + "loss": 0.2376, + "step": 27092 + }, + { + "epoch": 34.77920410783055, + "grad_norm": 1.3632173538208008, + "learning_rate": 2.1744544287548138e-05, + "loss": 0.2576, + "step": 27093 + }, + { + "epoch": 34.78048780487805, + "grad_norm": 0.7217596173286438, + "learning_rate": 2.1744116388532306e-05, + "loss": 0.2565, + "step": 27094 + }, + { + "epoch": 34.78177150192555, + "grad_norm": 0.8154395818710327, + "learning_rate": 2.1743688489516475e-05, + "loss": 0.2562, + "step": 27095 + }, + { + "epoch": 34.783055198973045, + "grad_norm": 0.9444825649261475, + "learning_rate": 2.1743260590500643e-05, + "loss": 0.2657, + "step": 27096 + }, + { + "epoch": 34.784338896020536, + "grad_norm": 0.9514458179473877, + "learning_rate": 2.174283269148481e-05, + "loss": 0.2535, + "step": 27097 + }, + { + "epoch": 34.785622593068034, + "grad_norm": 0.8668816685676575, + "learning_rate": 2.1742404792468977e-05, + "loss": 0.2418, + "step": 27098 + }, + { + "epoch": 34.78690629011553, + "grad_norm": 1.1780191659927368, + "learning_rate": 2.1741976893453145e-05, + "loss": 0.228, + "step": 27099 + }, + { + "epoch": 34.78818998716303, + "grad_norm": 1.9609919786453247, + "learning_rate": 2.1741548994437314e-05, + "loss": 0.2559, + "step": 27100 + }, + { + "epoch": 34.78947368421053, + "grad_norm": 0.7891112565994263, + "learning_rate": 2.1741121095421482e-05, + "loss": 0.2213, + "step": 27101 + }, + { + "epoch": 34.790757381258025, + "grad_norm": 0.9287220239639282, + "learning_rate": 2.1740693196405647e-05, + "loss": 0.2661, + "step": 27102 + }, + { + "epoch": 34.79204107830552, + "grad_norm": 0.7371464371681213, + "learning_rate": 2.174026529738982e-05, + "loss": 
0.2367, + "step": 27103 + }, + { + "epoch": 34.793324775353014, + "grad_norm": 0.9821146726608276, + "learning_rate": 2.1739837398373984e-05, + "loss": 0.2116, + "step": 27104 + }, + { + "epoch": 34.79460847240051, + "grad_norm": 0.8267303109169006, + "learning_rate": 2.173940949935815e-05, + "loss": 0.2432, + "step": 27105 + }, + { + "epoch": 34.79589216944801, + "grad_norm": 0.993463397026062, + "learning_rate": 2.173898160034232e-05, + "loss": 0.2279, + "step": 27106 + }, + { + "epoch": 34.79717586649551, + "grad_norm": 1.8561649322509766, + "learning_rate": 2.1738553701326486e-05, + "loss": 0.2208, + "step": 27107 + }, + { + "epoch": 34.798459563543005, + "grad_norm": 2.9088284969329834, + "learning_rate": 2.1738125802310657e-05, + "loss": 0.258, + "step": 27108 + }, + { + "epoch": 34.7997432605905, + "grad_norm": 1.1259171962738037, + "learning_rate": 2.1737697903294822e-05, + "loss": 0.2398, + "step": 27109 + }, + { + "epoch": 34.801026957637994, + "grad_norm": 1.2380632162094116, + "learning_rate": 2.173727000427899e-05, + "loss": 0.2351, + "step": 27110 + }, + { + "epoch": 34.80231065468549, + "grad_norm": 0.8780525326728821, + "learning_rate": 2.173684210526316e-05, + "loss": 0.2218, + "step": 27111 + }, + { + "epoch": 34.80359435173299, + "grad_norm": 1.8148083686828613, + "learning_rate": 2.1736414206247324e-05, + "loss": 0.2252, + "step": 27112 + }, + { + "epoch": 34.80487804878049, + "grad_norm": 0.9669862389564514, + "learning_rate": 2.1735986307231493e-05, + "loss": 0.2319, + "step": 27113 + }, + { + "epoch": 34.806161745827985, + "grad_norm": 0.8739175796508789, + "learning_rate": 2.173555840821566e-05, + "loss": 0.2278, + "step": 27114 + }, + { + "epoch": 34.80744544287548, + "grad_norm": 1.2092509269714355, + "learning_rate": 2.173513050919983e-05, + "loss": 0.253, + "step": 27115 + }, + { + "epoch": 34.80872913992298, + "grad_norm": 0.7463101148605347, + "learning_rate": 2.1734702610183998e-05, + "loss": 0.23, + "step": 27116 + }, + { + "epoch": 
34.81001283697047, + "grad_norm": 2.9749696254730225, + "learning_rate": 2.1734274711168166e-05, + "loss": 0.2383, + "step": 27117 + }, + { + "epoch": 34.81129653401797, + "grad_norm": 0.9014304876327515, + "learning_rate": 2.173384681215233e-05, + "loss": 0.2197, + "step": 27118 + }, + { + "epoch": 34.81258023106547, + "grad_norm": 2.2534024715423584, + "learning_rate": 2.17334189131365e-05, + "loss": 0.2598, + "step": 27119 + }, + { + "epoch": 34.813863928112966, + "grad_norm": 1.5137759447097778, + "learning_rate": 2.1732991014120668e-05, + "loss": 0.228, + "step": 27120 + }, + { + "epoch": 34.81514762516046, + "grad_norm": 1.1453430652618408, + "learning_rate": 2.1732563115104833e-05, + "loss": 0.2395, + "step": 27121 + }, + { + "epoch": 34.81643132220796, + "grad_norm": 3.512753486633301, + "learning_rate": 2.1732135216089005e-05, + "loss": 0.2617, + "step": 27122 + }, + { + "epoch": 34.81771501925546, + "grad_norm": 1.1453144550323486, + "learning_rate": 2.173170731707317e-05, + "loss": 0.2112, + "step": 27123 + }, + { + "epoch": 34.81899871630295, + "grad_norm": 0.9954224824905396, + "learning_rate": 2.1731279418057342e-05, + "loss": 0.2461, + "step": 27124 + }, + { + "epoch": 34.82028241335045, + "grad_norm": 1.2476915121078491, + "learning_rate": 2.1730851519041507e-05, + "loss": 0.2465, + "step": 27125 + }, + { + "epoch": 34.821566110397946, + "grad_norm": 3.2920022010803223, + "learning_rate": 2.1730423620025672e-05, + "loss": 0.2209, + "step": 27126 + }, + { + "epoch": 34.822849807445444, + "grad_norm": 2.107889413833618, + "learning_rate": 2.1729995721009844e-05, + "loss": 0.25, + "step": 27127 + }, + { + "epoch": 34.82413350449294, + "grad_norm": 1.2822985649108887, + "learning_rate": 2.172956782199401e-05, + "loss": 0.2499, + "step": 27128 + }, + { + "epoch": 34.82541720154044, + "grad_norm": 1.1199370622634888, + "learning_rate": 2.1729139922978177e-05, + "loss": 0.25, + "step": 27129 + }, + { + "epoch": 34.82670089858793, + "grad_norm": 
1.6716862916946411, + "learning_rate": 2.1728712023962346e-05, + "loss": 0.2329, + "step": 27130 + }, + { + "epoch": 34.82798459563543, + "grad_norm": 1.5311191082000732, + "learning_rate": 2.1728284124946514e-05, + "loss": 0.2854, + "step": 27131 + }, + { + "epoch": 34.829268292682926, + "grad_norm": 3.4590516090393066, + "learning_rate": 2.1727856225930682e-05, + "loss": 0.2687, + "step": 27132 + }, + { + "epoch": 34.830551989730424, + "grad_norm": 1.4508230686187744, + "learning_rate": 2.1727428326914847e-05, + "loss": 0.312, + "step": 27133 + }, + { + "epoch": 34.83183568677792, + "grad_norm": 2.1164000034332275, + "learning_rate": 2.1727000427899016e-05, + "loss": 0.2685, + "step": 27134 + }, + { + "epoch": 34.83311938382542, + "grad_norm": 1.312825083732605, + "learning_rate": 2.1726572528883184e-05, + "loss": 0.3454, + "step": 27135 + }, + { + "epoch": 34.83440308087292, + "grad_norm": 2.2717010974884033, + "learning_rate": 2.1726144629867353e-05, + "loss": 0.3811, + "step": 27136 + }, + { + "epoch": 34.83568677792041, + "grad_norm": 2.644649028778076, + "learning_rate": 2.1725716730851518e-05, + "loss": 0.2607, + "step": 27137 + }, + { + "epoch": 34.836970474967906, + "grad_norm": 0.7798623442649841, + "learning_rate": 2.172528883183569e-05, + "loss": 0.2602, + "step": 27138 + }, + { + "epoch": 34.838254172015404, + "grad_norm": 1.5688494443893433, + "learning_rate": 2.1724860932819855e-05, + "loss": 0.2407, + "step": 27139 + }, + { + "epoch": 34.8395378690629, + "grad_norm": 0.9627135396003723, + "learning_rate": 2.1724433033804023e-05, + "loss": 0.2396, + "step": 27140 + }, + { + "epoch": 34.8408215661104, + "grad_norm": 1.6673816442489624, + "learning_rate": 2.172400513478819e-05, + "loss": 0.2398, + "step": 27141 + }, + { + "epoch": 34.8421052631579, + "grad_norm": 0.976715087890625, + "learning_rate": 2.1723577235772356e-05, + "loss": 0.2508, + "step": 27142 + }, + { + "epoch": 34.84338896020539, + "grad_norm": 0.6019076108932495, + "learning_rate": 
2.1723149336756528e-05, + "loss": 0.2289, + "step": 27143 + }, + { + "epoch": 34.844672657252886, + "grad_norm": 0.6832377910614014, + "learning_rate": 2.1722721437740693e-05, + "loss": 0.2397, + "step": 27144 + }, + { + "epoch": 34.845956354300384, + "grad_norm": 1.0674384832382202, + "learning_rate": 2.172229353872486e-05, + "loss": 0.2451, + "step": 27145 + }, + { + "epoch": 34.84724005134788, + "grad_norm": 0.6111926436424255, + "learning_rate": 2.172186563970903e-05, + "loss": 0.2381, + "step": 27146 + }, + { + "epoch": 34.84852374839538, + "grad_norm": 4.474225044250488, + "learning_rate": 2.1721437740693195e-05, + "loss": 0.247, + "step": 27147 + }, + { + "epoch": 34.84980744544288, + "grad_norm": 1.7139081954956055, + "learning_rate": 2.1721009841677367e-05, + "loss": 0.2449, + "step": 27148 + }, + { + "epoch": 34.851091142490375, + "grad_norm": 1.2278422117233276, + "learning_rate": 2.1720581942661532e-05, + "loss": 0.2408, + "step": 27149 + }, + { + "epoch": 34.852374839537866, + "grad_norm": 1.4580799341201782, + "learning_rate": 2.17201540436457e-05, + "loss": 0.2193, + "step": 27150 + }, + { + "epoch": 34.853658536585364, + "grad_norm": 1.3707810640335083, + "learning_rate": 2.171972614462987e-05, + "loss": 0.249, + "step": 27151 + }, + { + "epoch": 34.85494223363286, + "grad_norm": 0.9454992413520813, + "learning_rate": 2.1719298245614037e-05, + "loss": 0.2459, + "step": 27152 + }, + { + "epoch": 34.85622593068036, + "grad_norm": 1.4017820358276367, + "learning_rate": 2.1718870346598202e-05, + "loss": 0.256, + "step": 27153 + }, + { + "epoch": 34.85750962772786, + "grad_norm": 0.9180291891098022, + "learning_rate": 2.171844244758237e-05, + "loss": 0.2434, + "step": 27154 + }, + { + "epoch": 34.858793324775355, + "grad_norm": 3.857348918914795, + "learning_rate": 2.171801454856654e-05, + "loss": 0.2371, + "step": 27155 + }, + { + "epoch": 34.86007702182285, + "grad_norm": 2.331040382385254, + "learning_rate": 2.1717586649550707e-05, + "loss": 0.2298, + 
"step": 27156 + }, + { + "epoch": 34.861360718870344, + "grad_norm": 0.9987649321556091, + "learning_rate": 2.1717158750534876e-05, + "loss": 0.2408, + "step": 27157 + }, + { + "epoch": 34.86264441591784, + "grad_norm": 1.369438648223877, + "learning_rate": 2.171673085151904e-05, + "loss": 0.2665, + "step": 27158 + }, + { + "epoch": 34.86392811296534, + "grad_norm": 1.1687341928482056, + "learning_rate": 2.171630295250321e-05, + "loss": 0.2102, + "step": 27159 + }, + { + "epoch": 34.86521181001284, + "grad_norm": 2.311986207962036, + "learning_rate": 2.1715875053487378e-05, + "loss": 0.2196, + "step": 27160 + }, + { + "epoch": 34.866495507060336, + "grad_norm": 0.8997687101364136, + "learning_rate": 2.1715447154471543e-05, + "loss": 0.2149, + "step": 27161 + }, + { + "epoch": 34.86777920410783, + "grad_norm": 0.7749794721603394, + "learning_rate": 2.1715019255455714e-05, + "loss": 0.2421, + "step": 27162 + }, + { + "epoch": 34.869062901155324, + "grad_norm": 1.8672436475753784, + "learning_rate": 2.171459135643988e-05, + "loss": 0.2291, + "step": 27163 + }, + { + "epoch": 34.87034659820282, + "grad_norm": 1.3098251819610596, + "learning_rate": 2.171416345742405e-05, + "loss": 0.2013, + "step": 27164 + }, + { + "epoch": 34.87163029525032, + "grad_norm": 1.301099181175232, + "learning_rate": 2.1713735558408216e-05, + "loss": 0.2239, + "step": 27165 + }, + { + "epoch": 34.87291399229782, + "grad_norm": 2.144829750061035, + "learning_rate": 2.171330765939238e-05, + "loss": 0.2434, + "step": 27166 + }, + { + "epoch": 34.874197689345316, + "grad_norm": 1.7438511848449707, + "learning_rate": 2.1712879760376553e-05, + "loss": 0.2571, + "step": 27167 + }, + { + "epoch": 34.87548138639281, + "grad_norm": 1.1660995483398438, + "learning_rate": 2.1712451861360718e-05, + "loss": 0.2259, + "step": 27168 + }, + { + "epoch": 34.87676508344031, + "grad_norm": 1.078679084777832, + "learning_rate": 2.1712023962344887e-05, + "loss": 0.2269, + "step": 27169 + }, + { + "epoch": 
34.8780487804878, + "grad_norm": 1.3809165954589844, + "learning_rate": 2.1711596063329055e-05, + "loss": 0.2122, + "step": 27170 + }, + { + "epoch": 34.8793324775353, + "grad_norm": 2.006873846054077, + "learning_rate": 2.1711168164313223e-05, + "loss": 0.2365, + "step": 27171 + }, + { + "epoch": 34.8806161745828, + "grad_norm": 1.5627107620239258, + "learning_rate": 2.1710740265297392e-05, + "loss": 0.2375, + "step": 27172 + }, + { + "epoch": 34.881899871630296, + "grad_norm": 2.3697621822357178, + "learning_rate": 2.1710312366281557e-05, + "loss": 0.2324, + "step": 27173 + }, + { + "epoch": 34.883183568677794, + "grad_norm": 2.2021114826202393, + "learning_rate": 2.1709884467265725e-05, + "loss": 0.2458, + "step": 27174 + }, + { + "epoch": 34.88446726572529, + "grad_norm": 1.3078064918518066, + "learning_rate": 2.1709456568249894e-05, + "loss": 0.2501, + "step": 27175 + }, + { + "epoch": 34.88575096277278, + "grad_norm": 1.1000605821609497, + "learning_rate": 2.1709028669234062e-05, + "loss": 0.238, + "step": 27176 + }, + { + "epoch": 34.88703465982028, + "grad_norm": 0.990807056427002, + "learning_rate": 2.1708600770218227e-05, + "loss": 0.2582, + "step": 27177 + }, + { + "epoch": 34.88831835686778, + "grad_norm": 1.7144988775253296, + "learning_rate": 2.17081728712024e-05, + "loss": 0.2593, + "step": 27178 + }, + { + "epoch": 34.889602053915276, + "grad_norm": 5.571520805358887, + "learning_rate": 2.1707744972186564e-05, + "loss": 0.2309, + "step": 27179 + }, + { + "epoch": 34.890885750962774, + "grad_norm": 2.411841630935669, + "learning_rate": 2.1707317073170732e-05, + "loss": 0.2559, + "step": 27180 + }, + { + "epoch": 34.89216944801027, + "grad_norm": 1.2379289865493774, + "learning_rate": 2.17068891741549e-05, + "loss": 0.2341, + "step": 27181 + }, + { + "epoch": 34.89345314505777, + "grad_norm": 1.7866284847259521, + "learning_rate": 2.1706461275139066e-05, + "loss": 0.2699, + "step": 27182 + }, + { + "epoch": 34.89473684210526, + "grad_norm": 
1.9392791986465454, + "learning_rate": 2.1706033376123238e-05, + "loss": 0.294, + "step": 27183 + }, + { + "epoch": 34.89602053915276, + "grad_norm": 3.5518510341644287, + "learning_rate": 2.1705605477107403e-05, + "loss": 0.3118, + "step": 27184 + }, + { + "epoch": 34.897304236200256, + "grad_norm": 1.6166898012161255, + "learning_rate": 2.170517757809157e-05, + "loss": 0.3216, + "step": 27185 + }, + { + "epoch": 34.898587933247754, + "grad_norm": 3.58622407913208, + "learning_rate": 2.170474967907574e-05, + "loss": 0.3707, + "step": 27186 + }, + { + "epoch": 34.89987163029525, + "grad_norm": 0.9832013249397278, + "learning_rate": 2.1704321780059904e-05, + "loss": 0.2357, + "step": 27187 + }, + { + "epoch": 34.90115532734275, + "grad_norm": 1.114431381225586, + "learning_rate": 2.1703893881044076e-05, + "loss": 0.2319, + "step": 27188 + }, + { + "epoch": 34.90243902439025, + "grad_norm": 0.7871573567390442, + "learning_rate": 2.170346598202824e-05, + "loss": 0.2424, + "step": 27189 + }, + { + "epoch": 34.90372272143774, + "grad_norm": 0.7690842747688293, + "learning_rate": 2.170303808301241e-05, + "loss": 0.2451, + "step": 27190 + }, + { + "epoch": 34.905006418485236, + "grad_norm": 0.8211820721626282, + "learning_rate": 2.1702610183996578e-05, + "loss": 0.2652, + "step": 27191 + }, + { + "epoch": 34.906290115532734, + "grad_norm": 1.0385832786560059, + "learning_rate": 2.1702182284980746e-05, + "loss": 0.2574, + "step": 27192 + }, + { + "epoch": 34.90757381258023, + "grad_norm": 1.3291795253753662, + "learning_rate": 2.170175438596491e-05, + "loss": 0.2858, + "step": 27193 + }, + { + "epoch": 34.90885750962773, + "grad_norm": 2.528930187225342, + "learning_rate": 2.170132648694908e-05, + "loss": 0.25, + "step": 27194 + }, + { + "epoch": 34.91014120667523, + "grad_norm": 0.8049925565719604, + "learning_rate": 2.1700898587933248e-05, + "loss": 0.2475, + "step": 27195 + }, + { + "epoch": 34.91142490372272, + "grad_norm": 1.552276372909546, + "learning_rate": 
2.1700470688917417e-05, + "loss": 0.2338, + "step": 27196 + }, + { + "epoch": 34.912708600770216, + "grad_norm": 1.0255684852600098, + "learning_rate": 2.1700042789901585e-05, + "loss": 0.2647, + "step": 27197 + }, + { + "epoch": 34.913992297817714, + "grad_norm": 1.6118648052215576, + "learning_rate": 2.169961489088575e-05, + "loss": 0.2467, + "step": 27198 + }, + { + "epoch": 34.91527599486521, + "grad_norm": 0.6873728036880493, + "learning_rate": 2.1699186991869922e-05, + "loss": 0.2436, + "step": 27199 + }, + { + "epoch": 34.91655969191271, + "grad_norm": 1.715806245803833, + "learning_rate": 2.1698759092854087e-05, + "loss": 0.2515, + "step": 27200 + }, + { + "epoch": 34.91784338896021, + "grad_norm": 1.7366271018981934, + "learning_rate": 2.1698331193838252e-05, + "loss": 0.251, + "step": 27201 + }, + { + "epoch": 34.919127086007705, + "grad_norm": 0.8641132116317749, + "learning_rate": 2.1697903294822424e-05, + "loss": 0.2222, + "step": 27202 + }, + { + "epoch": 34.920410783055196, + "grad_norm": 1.142215609550476, + "learning_rate": 2.169747539580659e-05, + "loss": 0.2338, + "step": 27203 + }, + { + "epoch": 34.921694480102694, + "grad_norm": 0.8926177024841309, + "learning_rate": 2.169704749679076e-05, + "loss": 0.256, + "step": 27204 + }, + { + "epoch": 34.92297817715019, + "grad_norm": 1.07593834400177, + "learning_rate": 2.1696619597774926e-05, + "loss": 0.2386, + "step": 27205 + }, + { + "epoch": 34.92426187419769, + "grad_norm": 2.239168167114258, + "learning_rate": 2.1696191698759094e-05, + "loss": 0.2546, + "step": 27206 + }, + { + "epoch": 34.92554557124519, + "grad_norm": 0.9121810793876648, + "learning_rate": 2.1695763799743262e-05, + "loss": 0.2072, + "step": 27207 + }, + { + "epoch": 34.926829268292686, + "grad_norm": 24.147031784057617, + "learning_rate": 2.1695335900727427e-05, + "loss": 0.2404, + "step": 27208 + }, + { + "epoch": 34.928112965340176, + "grad_norm": 0.7225857973098755, + "learning_rate": 2.1694908001711596e-05, + "loss": 
0.2497, + "step": 27209 + }, + { + "epoch": 34.929396662387674, + "grad_norm": 3.4035916328430176, + "learning_rate": 2.1694480102695764e-05, + "loss": 0.1985, + "step": 27210 + }, + { + "epoch": 34.93068035943517, + "grad_norm": 1.102866291999817, + "learning_rate": 2.1694052203679933e-05, + "loss": 0.2321, + "step": 27211 + }, + { + "epoch": 34.93196405648267, + "grad_norm": 2.5875120162963867, + "learning_rate": 2.16936243046641e-05, + "loss": 0.2464, + "step": 27212 + }, + { + "epoch": 34.93324775353017, + "grad_norm": 1.6273382902145386, + "learning_rate": 2.169319640564827e-05, + "loss": 0.241, + "step": 27213 + }, + { + "epoch": 34.934531450577666, + "grad_norm": 1.7342078685760498, + "learning_rate": 2.1692768506632435e-05, + "loss": 0.2293, + "step": 27214 + }, + { + "epoch": 34.93581514762516, + "grad_norm": 1.0372554063796997, + "learning_rate": 2.1692340607616603e-05, + "loss": 0.2396, + "step": 27215 + }, + { + "epoch": 34.937098844672654, + "grad_norm": 4.577152252197266, + "learning_rate": 2.169191270860077e-05, + "loss": 0.2336, + "step": 27216 + }, + { + "epoch": 34.93838254172015, + "grad_norm": 1.397215723991394, + "learning_rate": 2.1691484809584936e-05, + "loss": 0.2167, + "step": 27217 + }, + { + "epoch": 34.93966623876765, + "grad_norm": 1.117663025856018, + "learning_rate": 2.1691056910569108e-05, + "loss": 0.267, + "step": 27218 + }, + { + "epoch": 34.94094993581515, + "grad_norm": 0.8285253047943115, + "learning_rate": 2.1690629011553273e-05, + "loss": 0.1993, + "step": 27219 + }, + { + "epoch": 34.942233632862646, + "grad_norm": 5.022775650024414, + "learning_rate": 2.169020111253744e-05, + "loss": 0.2298, + "step": 27220 + }, + { + "epoch": 34.943517329910144, + "grad_norm": 1.3338795900344849, + "learning_rate": 2.168977321352161e-05, + "loss": 0.2665, + "step": 27221 + }, + { + "epoch": 34.94480102695764, + "grad_norm": 2.016791820526123, + "learning_rate": 2.1689345314505775e-05, + "loss": 0.2053, + "step": 27222 + }, + { + "epoch": 
34.94608472400513, + "grad_norm": 0.9434771537780762, + "learning_rate": 2.1688917415489947e-05, + "loss": 0.2279, + "step": 27223 + }, + { + "epoch": 34.94736842105263, + "grad_norm": 1.3275036811828613, + "learning_rate": 2.1688489516474112e-05, + "loss": 0.2396, + "step": 27224 + }, + { + "epoch": 34.94865211810013, + "grad_norm": 0.9451767802238464, + "learning_rate": 2.168806161745828e-05, + "loss": 0.2424, + "step": 27225 + }, + { + "epoch": 34.949935815147626, + "grad_norm": 0.8655239939689636, + "learning_rate": 2.168763371844245e-05, + "loss": 0.2485, + "step": 27226 + }, + { + "epoch": 34.951219512195124, + "grad_norm": 1.1937952041625977, + "learning_rate": 2.1687205819426614e-05, + "loss": 0.2403, + "step": 27227 + }, + { + "epoch": 34.95250320924262, + "grad_norm": 1.7608755826950073, + "learning_rate": 2.1686777920410786e-05, + "loss": 0.2664, + "step": 27228 + }, + { + "epoch": 34.95378690629011, + "grad_norm": 3.5575098991394043, + "learning_rate": 2.168635002139495e-05, + "loss": 0.27, + "step": 27229 + }, + { + "epoch": 34.95507060333761, + "grad_norm": 2.080124855041504, + "learning_rate": 2.168592212237912e-05, + "loss": 0.2532, + "step": 27230 + }, + { + "epoch": 34.95635430038511, + "grad_norm": 1.06232750415802, + "learning_rate": 2.1685494223363287e-05, + "loss": 0.301, + "step": 27231 + }, + { + "epoch": 34.957637997432606, + "grad_norm": 3.583592653274536, + "learning_rate": 2.1685066324347456e-05, + "loss": 0.2718, + "step": 27232 + }, + { + "epoch": 34.958921694480104, + "grad_norm": 1.3204187154769897, + "learning_rate": 2.168463842533162e-05, + "loss": 0.2183, + "step": 27233 + }, + { + "epoch": 34.9602053915276, + "grad_norm": 1.749007225036621, + "learning_rate": 2.168421052631579e-05, + "loss": 0.311, + "step": 27234 + }, + { + "epoch": 34.9614890885751, + "grad_norm": 2.294813394546509, + "learning_rate": 2.1683782627299958e-05, + "loss": 0.2966, + "step": 27235 + }, + { + "epoch": 34.96277278562259, + "grad_norm": 
3.2695629596710205, + "learning_rate": 2.1683354728284126e-05, + "loss": 0.4326, + "step": 27236 + }, + { + "epoch": 34.96405648267009, + "grad_norm": 0.6989214420318604, + "learning_rate": 2.1682926829268294e-05, + "loss": 0.2276, + "step": 27237 + }, + { + "epoch": 34.965340179717586, + "grad_norm": 0.9774410724639893, + "learning_rate": 2.168249893025246e-05, + "loss": 0.2457, + "step": 27238 + }, + { + "epoch": 34.966623876765084, + "grad_norm": 0.750769317150116, + "learning_rate": 2.168207103123663e-05, + "loss": 0.2482, + "step": 27239 + }, + { + "epoch": 34.96790757381258, + "grad_norm": 1.6885359287261963, + "learning_rate": 2.1681643132220796e-05, + "loss": 0.2534, + "step": 27240 + }, + { + "epoch": 34.96919127086008, + "grad_norm": 1.7569712400436401, + "learning_rate": 2.168121523320496e-05, + "loss": 0.2429, + "step": 27241 + }, + { + "epoch": 34.97047496790757, + "grad_norm": 1.5594805479049683, + "learning_rate": 2.1680787334189133e-05, + "loss": 0.2566, + "step": 27242 + }, + { + "epoch": 34.97175866495507, + "grad_norm": 0.6919602751731873, + "learning_rate": 2.1680359435173298e-05, + "loss": 0.2298, + "step": 27243 + }, + { + "epoch": 34.973042362002566, + "grad_norm": 1.2786120176315308, + "learning_rate": 2.167993153615747e-05, + "loss": 0.2246, + "step": 27244 + }, + { + "epoch": 34.974326059050064, + "grad_norm": 3.064354419708252, + "learning_rate": 2.1679503637141635e-05, + "loss": 0.2252, + "step": 27245 + }, + { + "epoch": 34.97560975609756, + "grad_norm": 2.604848623275757, + "learning_rate": 2.1679075738125803e-05, + "loss": 0.2717, + "step": 27246 + }, + { + "epoch": 34.97689345314506, + "grad_norm": 1.1702492237091064, + "learning_rate": 2.1678647839109972e-05, + "loss": 0.2247, + "step": 27247 + }, + { + "epoch": 34.97817715019256, + "grad_norm": 0.7760156989097595, + "learning_rate": 2.1678219940094137e-05, + "loss": 0.2197, + "step": 27248 + }, + { + "epoch": 34.97946084724005, + "grad_norm": 1.037898302078247, + "learning_rate": 
2.1677792041078305e-05, + "loss": 0.2194, + "step": 27249 + }, + { + "epoch": 34.980744544287546, + "grad_norm": 0.8560856580734253, + "learning_rate": 2.1677364142062474e-05, + "loss": 0.2172, + "step": 27250 + }, + { + "epoch": 34.982028241335044, + "grad_norm": 0.8790203928947449, + "learning_rate": 2.1676936243046642e-05, + "loss": 0.2538, + "step": 27251 + }, + { + "epoch": 34.98331193838254, + "grad_norm": 0.9969844222068787, + "learning_rate": 2.167650834403081e-05, + "loss": 0.2338, + "step": 27252 + }, + { + "epoch": 34.98459563543004, + "grad_norm": 0.9845762848854065, + "learning_rate": 2.167608044501498e-05, + "loss": 0.2349, + "step": 27253 + }, + { + "epoch": 34.98587933247754, + "grad_norm": 1.497455358505249, + "learning_rate": 2.1675652545999144e-05, + "loss": 0.2398, + "step": 27254 + }, + { + "epoch": 34.987163029525036, + "grad_norm": 1.142151951789856, + "learning_rate": 2.1675224646983312e-05, + "loss": 0.2515, + "step": 27255 + }, + { + "epoch": 34.988446726572526, + "grad_norm": 3.694715738296509, + "learning_rate": 2.167479674796748e-05, + "loss": 0.2401, + "step": 27256 + }, + { + "epoch": 34.989730423620024, + "grad_norm": 3.814056158065796, + "learning_rate": 2.1674368848951646e-05, + "loss": 0.2389, + "step": 27257 + }, + { + "epoch": 34.99101412066752, + "grad_norm": 1.2227396965026855, + "learning_rate": 2.1673940949935818e-05, + "loss": 0.2428, + "step": 27258 + }, + { + "epoch": 34.99229781771502, + "grad_norm": 1.652900218963623, + "learning_rate": 2.1673513050919983e-05, + "loss": 0.2226, + "step": 27259 + }, + { + "epoch": 34.99358151476252, + "grad_norm": 1.0490719079971313, + "learning_rate": 2.167308515190415e-05, + "loss": 0.2156, + "step": 27260 + }, + { + "epoch": 34.994865211810016, + "grad_norm": 2.4420907497406006, + "learning_rate": 2.167265725288832e-05, + "loss": 0.2328, + "step": 27261 + }, + { + "epoch": 34.996148908857506, + "grad_norm": 1.6893658638000488, + "learning_rate": 2.1672229353872484e-05, + "loss": 
0.2414, + "step": 27262 + }, + { + "epoch": 34.997432605905004, + "grad_norm": 7.092487335205078, + "learning_rate": 2.1671801454856656e-05, + "loss": 0.2511, + "step": 27263 + }, + { + "epoch": 34.9987163029525, + "grad_norm": 1.5582045316696167, + "learning_rate": 2.167137355584082e-05, + "loss": 0.3152, + "step": 27264 + }, + { + "epoch": 35.0, + "grad_norm": 2.234523057937622, + "learning_rate": 2.167094565682499e-05, + "loss": 0.3807, + "step": 27265 + }, + { + "epoch": 35.0012836970475, + "grad_norm": 1.1219643354415894, + "learning_rate": 2.1670517757809158e-05, + "loss": 0.2307, + "step": 27266 + }, + { + "epoch": 35.002567394094996, + "grad_norm": 0.6719288229942322, + "learning_rate": 2.1670089858793327e-05, + "loss": 0.2469, + "step": 27267 + }, + { + "epoch": 35.003851091142494, + "grad_norm": 1.250867247581482, + "learning_rate": 2.166966195977749e-05, + "loss": 0.2363, + "step": 27268 + }, + { + "epoch": 35.005134788189984, + "grad_norm": 1.4412637948989868, + "learning_rate": 2.166923406076166e-05, + "loss": 0.2507, + "step": 27269 + }, + { + "epoch": 35.00641848523748, + "grad_norm": 0.826976478099823, + "learning_rate": 2.166880616174583e-05, + "loss": 0.2328, + "step": 27270 + }, + { + "epoch": 35.00770218228498, + "grad_norm": 0.6154965162277222, + "learning_rate": 2.1668378262729997e-05, + "loss": 0.2433, + "step": 27271 + }, + { + "epoch": 35.00898587933248, + "grad_norm": 0.8265224099159241, + "learning_rate": 2.1667950363714165e-05, + "loss": 0.2615, + "step": 27272 + }, + { + "epoch": 35.010269576379976, + "grad_norm": 1.3919520378112793, + "learning_rate": 2.166752246469833e-05, + "loss": 0.2424, + "step": 27273 + }, + { + "epoch": 35.011553273427474, + "grad_norm": 1.509880781173706, + "learning_rate": 2.1667094565682502e-05, + "loss": 0.2127, + "step": 27274 + }, + { + "epoch": 35.012836970474964, + "grad_norm": 1.8704944849014282, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.2261, + "step": 27275 + }, + { + "epoch": 
35.01412066752246, + "grad_norm": 1.2189390659332275, + "learning_rate": 2.1666238767650832e-05, + "loss": 0.2132, + "step": 27276 + }, + { + "epoch": 35.01540436456996, + "grad_norm": 1.509908676147461, + "learning_rate": 2.1665810868635004e-05, + "loss": 0.2245, + "step": 27277 + }, + { + "epoch": 35.01668806161746, + "grad_norm": 1.129481315612793, + "learning_rate": 2.166538296961917e-05, + "loss": 0.2318, + "step": 27278 + }, + { + "epoch": 35.017971758664956, + "grad_norm": 1.322239637374878, + "learning_rate": 2.166495507060334e-05, + "loss": 0.2424, + "step": 27279 + }, + { + "epoch": 35.019255455712454, + "grad_norm": 1.0610411167144775, + "learning_rate": 2.1664527171587506e-05, + "loss": 0.2402, + "step": 27280 + }, + { + "epoch": 35.02053915275995, + "grad_norm": 1.1959775686264038, + "learning_rate": 2.1664099272571674e-05, + "loss": 0.2326, + "step": 27281 + }, + { + "epoch": 35.02182284980744, + "grad_norm": 0.9583576321601868, + "learning_rate": 2.1663671373555843e-05, + "loss": 0.2297, + "step": 27282 + }, + { + "epoch": 35.02310654685494, + "grad_norm": 0.7579723596572876, + "learning_rate": 2.1663243474540008e-05, + "loss": 0.2096, + "step": 27283 + }, + { + "epoch": 35.02439024390244, + "grad_norm": 2.8948066234588623, + "learning_rate": 2.1662815575524176e-05, + "loss": 0.2339, + "step": 27284 + }, + { + "epoch": 35.025673940949936, + "grad_norm": 1.1166223287582397, + "learning_rate": 2.1662387676508344e-05, + "loss": 0.2239, + "step": 27285 + }, + { + "epoch": 35.026957637997434, + "grad_norm": 1.0762686729431152, + "learning_rate": 2.1661959777492513e-05, + "loss": 0.2117, + "step": 27286 + }, + { + "epoch": 35.02824133504493, + "grad_norm": 1.999031662940979, + "learning_rate": 2.166153187847668e-05, + "loss": 0.2484, + "step": 27287 + }, + { + "epoch": 35.02952503209243, + "grad_norm": 1.2222363948822021, + "learning_rate": 2.1661103979460846e-05, + "loss": 0.1933, + "step": 27288 + }, + { + "epoch": 35.03080872913992, + "grad_norm": 
0.9582937359809875, + "learning_rate": 2.1660676080445015e-05, + "loss": 0.2176, + "step": 27289 + }, + { + "epoch": 35.03209242618742, + "grad_norm": 1.9278295040130615, + "learning_rate": 2.1660248181429183e-05, + "loss": 0.2068, + "step": 27290 + }, + { + "epoch": 35.033376123234916, + "grad_norm": 1.5769050121307373, + "learning_rate": 2.165982028241335e-05, + "loss": 0.2248, + "step": 27291 + }, + { + "epoch": 35.034659820282414, + "grad_norm": 0.8702406287193298, + "learning_rate": 2.1659392383397516e-05, + "loss": 0.222, + "step": 27292 + }, + { + "epoch": 35.03594351732991, + "grad_norm": 1.19981050491333, + "learning_rate": 2.1658964484381688e-05, + "loss": 0.2006, + "step": 27293 + }, + { + "epoch": 35.03722721437741, + "grad_norm": 0.9361575245857239, + "learning_rate": 2.1658536585365853e-05, + "loss": 0.2092, + "step": 27294 + }, + { + "epoch": 35.0385109114249, + "grad_norm": 0.958920419216156, + "learning_rate": 2.1658108686350022e-05, + "loss": 0.191, + "step": 27295 + }, + { + "epoch": 35.0397946084724, + "grad_norm": 0.9635913372039795, + "learning_rate": 2.165768078733419e-05, + "loss": 0.2382, + "step": 27296 + }, + { + "epoch": 35.041078305519896, + "grad_norm": 0.8774588704109192, + "learning_rate": 2.1657252888318355e-05, + "loss": 0.2092, + "step": 27297 + }, + { + "epoch": 35.042362002567394, + "grad_norm": 2.7922799587249756, + "learning_rate": 2.1656824989302527e-05, + "loss": 0.2296, + "step": 27298 + }, + { + "epoch": 35.04364569961489, + "grad_norm": 1.997383952140808, + "learning_rate": 2.1656397090286692e-05, + "loss": 0.257, + "step": 27299 + }, + { + "epoch": 35.04492939666239, + "grad_norm": 0.8932611346244812, + "learning_rate": 2.165596919127086e-05, + "loss": 0.2105, + "step": 27300 + }, + { + "epoch": 35.04621309370989, + "grad_norm": 1.3869991302490234, + "learning_rate": 2.165554129225503e-05, + "loss": 0.2261, + "step": 27301 + }, + { + "epoch": 35.04749679075738, + "grad_norm": 1.2045276165008545, + "learning_rate": 
2.1655113393239194e-05, + "loss": 0.2482, + "step": 27302 + }, + { + "epoch": 35.048780487804876, + "grad_norm": 1.3502455949783325, + "learning_rate": 2.1654685494223366e-05, + "loss": 0.2156, + "step": 27303 + }, + { + "epoch": 35.050064184852374, + "grad_norm": 1.9155899286270142, + "learning_rate": 2.165425759520753e-05, + "loss": 0.2245, + "step": 27304 + }, + { + "epoch": 35.05134788189987, + "grad_norm": 2.9415204524993896, + "learning_rate": 2.16538296961917e-05, + "loss": 0.198, + "step": 27305 + }, + { + "epoch": 35.05263157894737, + "grad_norm": 0.9182648062705994, + "learning_rate": 2.1653401797175867e-05, + "loss": 0.2066, + "step": 27306 + }, + { + "epoch": 35.05391527599487, + "grad_norm": 1.3263533115386963, + "learning_rate": 2.1652973898160036e-05, + "loss": 0.2238, + "step": 27307 + }, + { + "epoch": 35.05519897304236, + "grad_norm": 3.9727413654327393, + "learning_rate": 2.16525459991442e-05, + "loss": 0.2449, + "step": 27308 + }, + { + "epoch": 35.056482670089856, + "grad_norm": 1.7661532163619995, + "learning_rate": 2.165211810012837e-05, + "loss": 0.2238, + "step": 27309 + }, + { + "epoch": 35.057766367137354, + "grad_norm": 2.357701063156128, + "learning_rate": 2.1651690201112538e-05, + "loss": 0.2145, + "step": 27310 + }, + { + "epoch": 35.05905006418485, + "grad_norm": 1.6920533180236816, + "learning_rate": 2.1651262302096706e-05, + "loss": 0.2412, + "step": 27311 + }, + { + "epoch": 35.06033376123235, + "grad_norm": 1.603671908378601, + "learning_rate": 2.1650834403080875e-05, + "loss": 0.241, + "step": 27312 + }, + { + "epoch": 35.06161745827985, + "grad_norm": 1.328267216682434, + "learning_rate": 2.165040650406504e-05, + "loss": 0.2756, + "step": 27313 + }, + { + "epoch": 35.062901155327346, + "grad_norm": 1.5369588136672974, + "learning_rate": 2.164997860504921e-05, + "loss": 0.2826, + "step": 27314 + }, + { + "epoch": 35.06418485237484, + "grad_norm": 1.754418134689331, + "learning_rate": 2.1649550706033376e-05, + "loss": 0.3774, + 
"step": 27315 + }, + { + "epoch": 35.065468549422334, + "grad_norm": 1.0586425065994263, + "learning_rate": 2.164912280701754e-05, + "loss": 0.2366, + "step": 27316 + }, + { + "epoch": 35.06675224646983, + "grad_norm": 0.9727787971496582, + "learning_rate": 2.1648694908001713e-05, + "loss": 0.232, + "step": 27317 + }, + { + "epoch": 35.06803594351733, + "grad_norm": 0.7760636210441589, + "learning_rate": 2.1648267008985878e-05, + "loss": 0.2425, + "step": 27318 + }, + { + "epoch": 35.06931964056483, + "grad_norm": 0.8713462948799133, + "learning_rate": 2.164783910997005e-05, + "loss": 0.2467, + "step": 27319 + }, + { + "epoch": 35.070603337612326, + "grad_norm": 0.9318809509277344, + "learning_rate": 2.1647411210954215e-05, + "loss": 0.2382, + "step": 27320 + }, + { + "epoch": 35.071887034659824, + "grad_norm": 1.9909584522247314, + "learning_rate": 2.1646983311938383e-05, + "loss": 0.2514, + "step": 27321 + }, + { + "epoch": 35.073170731707314, + "grad_norm": 0.840162992477417, + "learning_rate": 2.1646555412922552e-05, + "loss": 0.2286, + "step": 27322 + }, + { + "epoch": 35.07445442875481, + "grad_norm": 1.2208929061889648, + "learning_rate": 2.1646127513906717e-05, + "loss": 0.2347, + "step": 27323 + }, + { + "epoch": 35.07573812580231, + "grad_norm": 1.786453127861023, + "learning_rate": 2.1645699614890885e-05, + "loss": 0.237, + "step": 27324 + }, + { + "epoch": 35.07702182284981, + "grad_norm": 0.884758472442627, + "learning_rate": 2.1645271715875054e-05, + "loss": 0.2435, + "step": 27325 + }, + { + "epoch": 35.078305519897306, + "grad_norm": 1.8584333658218384, + "learning_rate": 2.1644843816859222e-05, + "loss": 0.2135, + "step": 27326 + }, + { + "epoch": 35.079589216944804, + "grad_norm": 1.0732600688934326, + "learning_rate": 2.164441591784339e-05, + "loss": 0.245, + "step": 27327 + }, + { + "epoch": 35.080872913992295, + "grad_norm": 0.8216699361801147, + "learning_rate": 2.164398801882756e-05, + "loss": 0.2332, + "step": 27328 + }, + { + "epoch": 
35.08215661103979, + "grad_norm": 0.8190125226974487, + "learning_rate": 2.1643560119811724e-05, + "loss": 0.225, + "step": 27329 + }, + { + "epoch": 35.08344030808729, + "grad_norm": 1.0426971912384033, + "learning_rate": 2.1643132220795892e-05, + "loss": 0.2169, + "step": 27330 + }, + { + "epoch": 35.08472400513479, + "grad_norm": 0.7818315029144287, + "learning_rate": 2.164270432178006e-05, + "loss": 0.2465, + "step": 27331 + }, + { + "epoch": 35.086007702182286, + "grad_norm": 0.8558733463287354, + "learning_rate": 2.1642276422764226e-05, + "loss": 0.2319, + "step": 27332 + }, + { + "epoch": 35.087291399229784, + "grad_norm": 0.879565954208374, + "learning_rate": 2.1641848523748398e-05, + "loss": 0.2163, + "step": 27333 + }, + { + "epoch": 35.08857509627728, + "grad_norm": 0.8768479228019714, + "learning_rate": 2.1641420624732563e-05, + "loss": 0.2222, + "step": 27334 + }, + { + "epoch": 35.08985879332477, + "grad_norm": 1.015338659286499, + "learning_rate": 2.1640992725716734e-05, + "loss": 0.2009, + "step": 27335 + }, + { + "epoch": 35.09114249037227, + "grad_norm": 0.8765353560447693, + "learning_rate": 2.16405648267009e-05, + "loss": 0.1903, + "step": 27336 + }, + { + "epoch": 35.09242618741977, + "grad_norm": 0.7797988653182983, + "learning_rate": 2.1640136927685065e-05, + "loss": 0.2122, + "step": 27337 + }, + { + "epoch": 35.093709884467266, + "grad_norm": 1.4219365119934082, + "learning_rate": 2.1639709028669236e-05, + "loss": 0.2247, + "step": 27338 + }, + { + "epoch": 35.094993581514764, + "grad_norm": 1.0782577991485596, + "learning_rate": 2.16392811296534e-05, + "loss": 0.2093, + "step": 27339 + }, + { + "epoch": 35.09627727856226, + "grad_norm": 0.936269223690033, + "learning_rate": 2.163885323063757e-05, + "loss": 0.2187, + "step": 27340 + }, + { + "epoch": 35.09756097560975, + "grad_norm": 1.5100356340408325, + "learning_rate": 2.1638425331621738e-05, + "loss": 0.2127, + "step": 27341 + }, + { + "epoch": 35.09884467265725, + "grad_norm": 
1.3935738801956177, + "learning_rate": 2.1637997432605907e-05, + "loss": 0.1974, + "step": 27342 + }, + { + "epoch": 35.10012836970475, + "grad_norm": 1.031288504600525, + "learning_rate": 2.1637569533590075e-05, + "loss": 0.2411, + "step": 27343 + }, + { + "epoch": 35.101412066752246, + "grad_norm": 1.0350371599197388, + "learning_rate": 2.163714163457424e-05, + "loss": 0.2313, + "step": 27344 + }, + { + "epoch": 35.102695763799744, + "grad_norm": 1.8675237894058228, + "learning_rate": 2.163671373555841e-05, + "loss": 0.2152, + "step": 27345 + }, + { + "epoch": 35.10397946084724, + "grad_norm": 1.190153956413269, + "learning_rate": 2.1636285836542577e-05, + "loss": 0.1974, + "step": 27346 + }, + { + "epoch": 35.10526315789474, + "grad_norm": 1.540482759475708, + "learning_rate": 2.1635857937526745e-05, + "loss": 0.1689, + "step": 27347 + }, + { + "epoch": 35.10654685494223, + "grad_norm": 8.297982215881348, + "learning_rate": 2.163543003851091e-05, + "loss": 0.2279, + "step": 27348 + }, + { + "epoch": 35.10783055198973, + "grad_norm": 1.6609307527542114, + "learning_rate": 2.163500213949508e-05, + "loss": 0.2332, + "step": 27349 + }, + { + "epoch": 35.109114249037226, + "grad_norm": 0.9565829038619995, + "learning_rate": 2.1634574240479247e-05, + "loss": 0.2323, + "step": 27350 + }, + { + "epoch": 35.110397946084724, + "grad_norm": 1.4293491840362549, + "learning_rate": 2.1634146341463415e-05, + "loss": 0.2293, + "step": 27351 + }, + { + "epoch": 35.11168164313222, + "grad_norm": 1.1912986040115356, + "learning_rate": 2.1633718442447584e-05, + "loss": 0.2212, + "step": 27352 + }, + { + "epoch": 35.11296534017972, + "grad_norm": 1.8959197998046875, + "learning_rate": 2.163329054343175e-05, + "loss": 0.2286, + "step": 27353 + }, + { + "epoch": 35.11424903722722, + "grad_norm": 1.7604838609695435, + "learning_rate": 2.163286264441592e-05, + "loss": 0.2309, + "step": 27354 + }, + { + "epoch": 35.11553273427471, + "grad_norm": 2.3748886585235596, + "learning_rate": 
2.1632434745400086e-05, + "loss": 0.217, + "step": 27355 + }, + { + "epoch": 35.116816431322206, + "grad_norm": 2.4189252853393555, + "learning_rate": 2.163200684638425e-05, + "loss": 0.2269, + "step": 27356 + }, + { + "epoch": 35.118100128369704, + "grad_norm": 0.8640199899673462, + "learning_rate": 2.1631578947368423e-05, + "loss": 0.2423, + "step": 27357 + }, + { + "epoch": 35.1193838254172, + "grad_norm": 1.4537227153778076, + "learning_rate": 2.1631151048352588e-05, + "loss": 0.2649, + "step": 27358 + }, + { + "epoch": 35.1206675224647, + "grad_norm": 1.543215036392212, + "learning_rate": 2.163072314933676e-05, + "loss": 0.2091, + "step": 27359 + }, + { + "epoch": 35.1219512195122, + "grad_norm": 1.4160772562026978, + "learning_rate": 2.1630295250320924e-05, + "loss": 0.2309, + "step": 27360 + }, + { + "epoch": 35.12323491655969, + "grad_norm": 1.2591136693954468, + "learning_rate": 2.1629867351305093e-05, + "loss": 0.2424, + "step": 27361 + }, + { + "epoch": 35.12451861360719, + "grad_norm": 1.133438229560852, + "learning_rate": 2.162943945228926e-05, + "loss": 0.2315, + "step": 27362 + }, + { + "epoch": 35.125802310654684, + "grad_norm": 2.0017952919006348, + "learning_rate": 2.1629011553273426e-05, + "loss": 0.276, + "step": 27363 + }, + { + "epoch": 35.12708600770218, + "grad_norm": 3.5563018321990967, + "learning_rate": 2.1628583654257595e-05, + "loss": 0.2958, + "step": 27364 + }, + { + "epoch": 35.12836970474968, + "grad_norm": 1.857231855392456, + "learning_rate": 2.1628155755241763e-05, + "loss": 0.3567, + "step": 27365 + }, + { + "epoch": 35.12965340179718, + "grad_norm": 2.0517008304595947, + "learning_rate": 2.162772785622593e-05, + "loss": 0.2627, + "step": 27366 + }, + { + "epoch": 35.130937098844676, + "grad_norm": 1.114988923072815, + "learning_rate": 2.16272999572101e-05, + "loss": 0.2308, + "step": 27367 + }, + { + "epoch": 35.13222079589217, + "grad_norm": 0.7311031818389893, + "learning_rate": 2.162687205819427e-05, + "loss": 0.2571, + 
"step": 27368 + }, + { + "epoch": 35.133504492939664, + "grad_norm": 0.6764071583747864, + "learning_rate": 2.1626444159178433e-05, + "loss": 0.2367, + "step": 27369 + }, + { + "epoch": 35.13478818998716, + "grad_norm": 0.6495004892349243, + "learning_rate": 2.1626016260162602e-05, + "loss": 0.2502, + "step": 27370 + }, + { + "epoch": 35.13607188703466, + "grad_norm": 0.6916193962097168, + "learning_rate": 2.162558836114677e-05, + "loss": 0.2427, + "step": 27371 + }, + { + "epoch": 35.13735558408216, + "grad_norm": 0.652082622051239, + "learning_rate": 2.1625160462130935e-05, + "loss": 0.2167, + "step": 27372 + }, + { + "epoch": 35.138639281129656, + "grad_norm": 1.4164693355560303, + "learning_rate": 2.1624732563115107e-05, + "loss": 0.2227, + "step": 27373 + }, + { + "epoch": 35.13992297817715, + "grad_norm": 1.1451702117919922, + "learning_rate": 2.1624304664099272e-05, + "loss": 0.2569, + "step": 27374 + }, + { + "epoch": 35.141206675224645, + "grad_norm": 1.2899125814437866, + "learning_rate": 2.1623876765083444e-05, + "loss": 0.2555, + "step": 27375 + }, + { + "epoch": 35.14249037227214, + "grad_norm": 1.1851967573165894, + "learning_rate": 2.162344886606761e-05, + "loss": 0.2558, + "step": 27376 + }, + { + "epoch": 35.14377406931964, + "grad_norm": 2.216301918029785, + "learning_rate": 2.1623020967051774e-05, + "loss": 0.2283, + "step": 27377 + }, + { + "epoch": 35.14505776636714, + "grad_norm": 2.126852512359619, + "learning_rate": 2.1622593068035946e-05, + "loss": 0.2281, + "step": 27378 + }, + { + "epoch": 35.146341463414636, + "grad_norm": 0.6517059206962585, + "learning_rate": 2.162216516902011e-05, + "loss": 0.2148, + "step": 27379 + }, + { + "epoch": 35.147625160462134, + "grad_norm": 0.8841814994812012, + "learning_rate": 2.162173727000428e-05, + "loss": 0.2204, + "step": 27380 + }, + { + "epoch": 35.148908857509625, + "grad_norm": 1.021057367324829, + "learning_rate": 2.1621309370988448e-05, + "loss": 0.2181, + "step": 27381 + }, + { + "epoch": 
35.15019255455712, + "grad_norm": 0.9162527322769165, + "learning_rate": 2.1620881471972616e-05, + "loss": 0.2054, + "step": 27382 + }, + { + "epoch": 35.15147625160462, + "grad_norm": 1.0957114696502686, + "learning_rate": 2.1620453572956784e-05, + "loss": 0.2262, + "step": 27383 + }, + { + "epoch": 35.15275994865212, + "grad_norm": 0.7443256378173828, + "learning_rate": 2.162002567394095e-05, + "loss": 0.209, + "step": 27384 + }, + { + "epoch": 35.154043645699616, + "grad_norm": 2.991638660430908, + "learning_rate": 2.1619597774925118e-05, + "loss": 0.21, + "step": 27385 + }, + { + "epoch": 35.155327342747114, + "grad_norm": 2.61995792388916, + "learning_rate": 2.1619169875909286e-05, + "loss": 0.2458, + "step": 27386 + }, + { + "epoch": 35.15661103979461, + "grad_norm": 0.9666712284088135, + "learning_rate": 2.1618741976893455e-05, + "loss": 0.212, + "step": 27387 + }, + { + "epoch": 35.1578947368421, + "grad_norm": 0.6413071751594543, + "learning_rate": 2.161831407787762e-05, + "loss": 0.2059, + "step": 27388 + }, + { + "epoch": 35.1591784338896, + "grad_norm": 1.1392558813095093, + "learning_rate": 2.161788617886179e-05, + "loss": 0.2058, + "step": 27389 + }, + { + "epoch": 35.1604621309371, + "grad_norm": 2.3248095512390137, + "learning_rate": 2.1617458279845956e-05, + "loss": 0.2326, + "step": 27390 + }, + { + "epoch": 35.161745827984596, + "grad_norm": 1.2078468799591064, + "learning_rate": 2.1617030380830125e-05, + "loss": 0.2413, + "step": 27391 + }, + { + "epoch": 35.163029525032094, + "grad_norm": 1.1029361486434937, + "learning_rate": 2.1616602481814293e-05, + "loss": 0.1996, + "step": 27392 + }, + { + "epoch": 35.16431322207959, + "grad_norm": 1.447102665901184, + "learning_rate": 2.1616174582798458e-05, + "loss": 0.2041, + "step": 27393 + }, + { + "epoch": 35.16559691912708, + "grad_norm": 1.1796150207519531, + "learning_rate": 2.161574668378263e-05, + "loss": 0.2088, + "step": 27394 + }, + { + "epoch": 35.16688061617458, + "grad_norm": 
0.9720272421836853, + "learning_rate": 2.1615318784766795e-05, + "loss": 0.1798, + "step": 27395 + }, + { + "epoch": 35.16816431322208, + "grad_norm": 1.8108196258544922, + "learning_rate": 2.1614890885750964e-05, + "loss": 0.2333, + "step": 27396 + }, + { + "epoch": 35.169448010269576, + "grad_norm": 0.7296644449234009, + "learning_rate": 2.1614462986735132e-05, + "loss": 0.1741, + "step": 27397 + }, + { + "epoch": 35.170731707317074, + "grad_norm": 0.9372830986976624, + "learning_rate": 2.1614035087719297e-05, + "loss": 0.2137, + "step": 27398 + }, + { + "epoch": 35.17201540436457, + "grad_norm": 1.4439834356307983, + "learning_rate": 2.161360718870347e-05, + "loss": 0.2128, + "step": 27399 + }, + { + "epoch": 35.17329910141207, + "grad_norm": 1.1075782775878906, + "learning_rate": 2.1613179289687634e-05, + "loss": 0.2288, + "step": 27400 + }, + { + "epoch": 35.17458279845956, + "grad_norm": 1.2209383249282837, + "learning_rate": 2.1612751390671802e-05, + "loss": 0.2477, + "step": 27401 + }, + { + "epoch": 35.17586649550706, + "grad_norm": 9.049408912658691, + "learning_rate": 2.161232349165597e-05, + "loss": 0.2087, + "step": 27402 + }, + { + "epoch": 35.177150192554556, + "grad_norm": 2.374901533126831, + "learning_rate": 2.161189559264014e-05, + "loss": 0.2425, + "step": 27403 + }, + { + "epoch": 35.178433889602054, + "grad_norm": 1.53545343875885, + "learning_rate": 2.1611467693624304e-05, + "loss": 0.2469, + "step": 27404 + }, + { + "epoch": 35.17971758664955, + "grad_norm": 0.9871006608009338, + "learning_rate": 2.1611039794608472e-05, + "loss": 0.2234, + "step": 27405 + }, + { + "epoch": 35.18100128369705, + "grad_norm": 0.9351980686187744, + "learning_rate": 2.161061189559264e-05, + "loss": 0.1937, + "step": 27406 + }, + { + "epoch": 35.18228498074454, + "grad_norm": 1.0533335208892822, + "learning_rate": 2.161018399657681e-05, + "loss": 0.2258, + "step": 27407 + }, + { + "epoch": 35.18356867779204, + "grad_norm": 1.2938873767852783, + "learning_rate": 
2.1609756097560978e-05, + "loss": 0.2575, + "step": 27408 + }, + { + "epoch": 35.18485237483954, + "grad_norm": 3.9161388874053955, + "learning_rate": 2.1609328198545143e-05, + "loss": 0.242, + "step": 27409 + }, + { + "epoch": 35.186136071887034, + "grad_norm": 9.896136283874512, + "learning_rate": 2.160890029952931e-05, + "loss": 0.2663, + "step": 27410 + }, + { + "epoch": 35.18741976893453, + "grad_norm": 3.1053218841552734, + "learning_rate": 2.160847240051348e-05, + "loss": 0.2249, + "step": 27411 + }, + { + "epoch": 35.18870346598203, + "grad_norm": 1.8179086446762085, + "learning_rate": 2.1608044501497645e-05, + "loss": 0.2326, + "step": 27412 + }, + { + "epoch": 35.18998716302953, + "grad_norm": 1.98301362991333, + "learning_rate": 2.1607616602481816e-05, + "loss": 0.3434, + "step": 27413 + }, + { + "epoch": 35.19127086007702, + "grad_norm": 2.2430338859558105, + "learning_rate": 2.160718870346598e-05, + "loss": 0.3279, + "step": 27414 + }, + { + "epoch": 35.19255455712452, + "grad_norm": 1.618531346321106, + "learning_rate": 2.1606760804450153e-05, + "loss": 0.4105, + "step": 27415 + }, + { + "epoch": 35.193838254172015, + "grad_norm": 0.8976839184761047, + "learning_rate": 2.1606332905434318e-05, + "loss": 0.2599, + "step": 27416 + }, + { + "epoch": 35.19512195121951, + "grad_norm": 0.6692266464233398, + "learning_rate": 2.1605905006418483e-05, + "loss": 0.2414, + "step": 27417 + }, + { + "epoch": 35.19640564826701, + "grad_norm": 0.6739473938941956, + "learning_rate": 2.1605477107402655e-05, + "loss": 0.2372, + "step": 27418 + }, + { + "epoch": 35.19768934531451, + "grad_norm": 0.7454162240028381, + "learning_rate": 2.160504920838682e-05, + "loss": 0.225, + "step": 27419 + }, + { + "epoch": 35.198973042362006, + "grad_norm": 1.388862133026123, + "learning_rate": 2.160462130937099e-05, + "loss": 0.2618, + "step": 27420 + }, + { + "epoch": 35.2002567394095, + "grad_norm": 1.0528075695037842, + "learning_rate": 2.1604193410355157e-05, + "loss": 0.2292, + 
"step": 27421 + }, + { + "epoch": 35.201540436456995, + "grad_norm": 1.4550199508666992, + "learning_rate": 2.1603765511339325e-05, + "loss": 0.2763, + "step": 27422 + }, + { + "epoch": 35.20282413350449, + "grad_norm": 0.929513692855835, + "learning_rate": 2.1603337612323494e-05, + "loss": 0.2243, + "step": 27423 + }, + { + "epoch": 35.20410783055199, + "grad_norm": 0.8361841440200806, + "learning_rate": 2.160290971330766e-05, + "loss": 0.2453, + "step": 27424 + }, + { + "epoch": 35.20539152759949, + "grad_norm": 1.357720971107483, + "learning_rate": 2.1602481814291827e-05, + "loss": 0.2541, + "step": 27425 + }, + { + "epoch": 35.206675224646986, + "grad_norm": 0.8444256782531738, + "learning_rate": 2.1602053915275996e-05, + "loss": 0.2172, + "step": 27426 + }, + { + "epoch": 35.20795892169448, + "grad_norm": 1.4816453456878662, + "learning_rate": 2.1601626016260164e-05, + "loss": 0.2384, + "step": 27427 + }, + { + "epoch": 35.209242618741975, + "grad_norm": 1.052600622177124, + "learning_rate": 2.160119811724433e-05, + "loss": 0.2114, + "step": 27428 + }, + { + "epoch": 35.21052631578947, + "grad_norm": 0.7041911482810974, + "learning_rate": 2.16007702182285e-05, + "loss": 0.2319, + "step": 27429 + }, + { + "epoch": 35.21181001283697, + "grad_norm": 3.1063146591186523, + "learning_rate": 2.1600342319212666e-05, + "loss": 0.2324, + "step": 27430 + }, + { + "epoch": 35.21309370988447, + "grad_norm": 1.0227453708648682, + "learning_rate": 2.1599914420196834e-05, + "loss": 0.2296, + "step": 27431 + }, + { + "epoch": 35.214377406931966, + "grad_norm": 1.8272943496704102, + "learning_rate": 2.1599486521181003e-05, + "loss": 0.2554, + "step": 27432 + }, + { + "epoch": 35.215661103979464, + "grad_norm": 0.6808350682258606, + "learning_rate": 2.1599058622165168e-05, + "loss": 0.2095, + "step": 27433 + }, + { + "epoch": 35.216944801026955, + "grad_norm": 4.804802417755127, + "learning_rate": 2.159863072314934e-05, + "loss": 0.2101, + "step": 27434 + }, + { + "epoch": 
35.21822849807445, + "grad_norm": 0.9021160006523132, + "learning_rate": 2.1598202824133504e-05, + "loss": 0.2275, + "step": 27435 + }, + { + "epoch": 35.21951219512195, + "grad_norm": 0.9323638677597046, + "learning_rate": 2.1597774925117673e-05, + "loss": 0.2328, + "step": 27436 + }, + { + "epoch": 35.22079589216945, + "grad_norm": 0.9280344247817993, + "learning_rate": 2.159734702610184e-05, + "loss": 0.2068, + "step": 27437 + }, + { + "epoch": 35.222079589216946, + "grad_norm": 0.9375491738319397, + "learning_rate": 2.1596919127086006e-05, + "loss": 0.2344, + "step": 27438 + }, + { + "epoch": 35.223363286264444, + "grad_norm": 1.1885578632354736, + "learning_rate": 2.1596491228070178e-05, + "loss": 0.2263, + "step": 27439 + }, + { + "epoch": 35.224646983311935, + "grad_norm": 2.5567638874053955, + "learning_rate": 2.1596063329054343e-05, + "loss": 0.2471, + "step": 27440 + }, + { + "epoch": 35.22593068035943, + "grad_norm": 0.7502668499946594, + "learning_rate": 2.159563543003851e-05, + "loss": 0.2089, + "step": 27441 + }, + { + "epoch": 35.22721437740693, + "grad_norm": 1.2088367938995361, + "learning_rate": 2.159520753102268e-05, + "loss": 0.2258, + "step": 27442 + }, + { + "epoch": 35.22849807445443, + "grad_norm": 1.508440613746643, + "learning_rate": 2.159477963200685e-05, + "loss": 0.2205, + "step": 27443 + }, + { + "epoch": 35.229781771501926, + "grad_norm": 3.1771955490112305, + "learning_rate": 2.1594351732991013e-05, + "loss": 0.2133, + "step": 27444 + }, + { + "epoch": 35.231065468549424, + "grad_norm": 1.065145492553711, + "learning_rate": 2.1593923833975182e-05, + "loss": 0.2311, + "step": 27445 + }, + { + "epoch": 35.23234916559692, + "grad_norm": 1.2989346981048584, + "learning_rate": 2.159349593495935e-05, + "loss": 0.2168, + "step": 27446 + }, + { + "epoch": 35.23363286264441, + "grad_norm": 0.8808149099349976, + "learning_rate": 2.159306803594352e-05, + "loss": 0.2009, + "step": 27447 + }, + { + "epoch": 35.23491655969191, + "grad_norm": 
1.7638353109359741, + "learning_rate": 2.1592640136927687e-05, + "loss": 0.215, + "step": 27448 + }, + { + "epoch": 35.23620025673941, + "grad_norm": 1.1684459447860718, + "learning_rate": 2.1592212237911852e-05, + "loss": 0.1986, + "step": 27449 + }, + { + "epoch": 35.23748395378691, + "grad_norm": 1.4468886852264404, + "learning_rate": 2.1591784338896024e-05, + "loss": 0.1962, + "step": 27450 + }, + { + "epoch": 35.238767650834404, + "grad_norm": 2.205944776535034, + "learning_rate": 2.159135643988019e-05, + "loss": 0.2084, + "step": 27451 + }, + { + "epoch": 35.2400513478819, + "grad_norm": 0.9619829654693604, + "learning_rate": 2.1590928540864354e-05, + "loss": 0.225, + "step": 27452 + }, + { + "epoch": 35.2413350449294, + "grad_norm": 2.062488317489624, + "learning_rate": 2.1590500641848526e-05, + "loss": 0.2154, + "step": 27453 + }, + { + "epoch": 35.24261874197689, + "grad_norm": 2.213522434234619, + "learning_rate": 2.159007274283269e-05, + "loss": 0.2068, + "step": 27454 + }, + { + "epoch": 35.24390243902439, + "grad_norm": 1.0931291580200195, + "learning_rate": 2.1589644843816863e-05, + "loss": 0.2236, + "step": 27455 + }, + { + "epoch": 35.24518613607189, + "grad_norm": 2.448028326034546, + "learning_rate": 2.1589216944801028e-05, + "loss": 0.2325, + "step": 27456 + }, + { + "epoch": 35.246469833119384, + "grad_norm": 1.1254371404647827, + "learning_rate": 2.1588789045785196e-05, + "loss": 0.2292, + "step": 27457 + }, + { + "epoch": 35.24775353016688, + "grad_norm": 1.6285359859466553, + "learning_rate": 2.1588361146769364e-05, + "loss": 0.2333, + "step": 27458 + }, + { + "epoch": 35.24903722721438, + "grad_norm": 1.5161277055740356, + "learning_rate": 2.158793324775353e-05, + "loss": 0.2641, + "step": 27459 + }, + { + "epoch": 35.25032092426187, + "grad_norm": 1.0748440027236938, + "learning_rate": 2.1587505348737698e-05, + "loss": 0.2356, + "step": 27460 + }, + { + "epoch": 35.25160462130937, + "grad_norm": 1.0816514492034912, + "learning_rate": 
2.1587077449721866e-05, + "loss": 0.287, + "step": 27461 + }, + { + "epoch": 35.25288831835687, + "grad_norm": 2.251373052597046, + "learning_rate": 2.1586649550706035e-05, + "loss": 0.2232, + "step": 27462 + }, + { + "epoch": 35.254172015404365, + "grad_norm": 1.5330932140350342, + "learning_rate": 2.15862216516902e-05, + "loss": 0.2724, + "step": 27463 + }, + { + "epoch": 35.25545571245186, + "grad_norm": 1.2357630729675293, + "learning_rate": 2.158579375267437e-05, + "loss": 0.2509, + "step": 27464 + }, + { + "epoch": 35.25673940949936, + "grad_norm": 2.578472375869751, + "learning_rate": 2.1585365853658537e-05, + "loss": 0.3901, + "step": 27465 + }, + { + "epoch": 35.25802310654686, + "grad_norm": 0.852830708026886, + "learning_rate": 2.1584937954642705e-05, + "loss": 0.2276, + "step": 27466 + }, + { + "epoch": 35.25930680359435, + "grad_norm": 1.0553457736968994, + "learning_rate": 2.1584510055626873e-05, + "loss": 0.2477, + "step": 27467 + }, + { + "epoch": 35.26059050064185, + "grad_norm": 0.9957289695739746, + "learning_rate": 2.158408215661104e-05, + "loss": 0.2552, + "step": 27468 + }, + { + "epoch": 35.261874197689345, + "grad_norm": 1.0184102058410645, + "learning_rate": 2.158365425759521e-05, + "loss": 0.256, + "step": 27469 + }, + { + "epoch": 35.26315789473684, + "grad_norm": 0.9533007740974426, + "learning_rate": 2.1583226358579375e-05, + "loss": 0.2329, + "step": 27470 + }, + { + "epoch": 35.26444159178434, + "grad_norm": 1.0895047187805176, + "learning_rate": 2.1582798459563544e-05, + "loss": 0.2555, + "step": 27471 + }, + { + "epoch": 35.26572528883184, + "grad_norm": 0.9149969220161438, + "learning_rate": 2.1582370560547712e-05, + "loss": 0.2265, + "step": 27472 + }, + { + "epoch": 35.26700898587933, + "grad_norm": 1.0091865062713623, + "learning_rate": 2.1581942661531877e-05, + "loss": 0.2274, + "step": 27473 + }, + { + "epoch": 35.26829268292683, + "grad_norm": 0.8938236236572266, + "learning_rate": 2.158151476251605e-05, + "loss": 0.2373, + 
"step": 27474 + }, + { + "epoch": 35.269576379974325, + "grad_norm": 1.0922738313674927, + "learning_rate": 2.1581086863500214e-05, + "loss": 0.2236, + "step": 27475 + }, + { + "epoch": 35.27086007702182, + "grad_norm": 0.7095153331756592, + "learning_rate": 2.1580658964484382e-05, + "loss": 0.2385, + "step": 27476 + }, + { + "epoch": 35.27214377406932, + "grad_norm": 0.9312652945518494, + "learning_rate": 2.158023106546855e-05, + "loss": 0.2366, + "step": 27477 + }, + { + "epoch": 35.27342747111682, + "grad_norm": 1.512521505355835, + "learning_rate": 2.1579803166452716e-05, + "loss": 0.2092, + "step": 27478 + }, + { + "epoch": 35.274711168164316, + "grad_norm": 1.0892266035079956, + "learning_rate": 2.1579375267436884e-05, + "loss": 0.2571, + "step": 27479 + }, + { + "epoch": 35.27599486521181, + "grad_norm": 0.8279266953468323, + "learning_rate": 2.1578947368421053e-05, + "loss": 0.2419, + "step": 27480 + }, + { + "epoch": 35.277278562259305, + "grad_norm": 0.9187785387039185, + "learning_rate": 2.157851946940522e-05, + "loss": 0.2543, + "step": 27481 + }, + { + "epoch": 35.2785622593068, + "grad_norm": 1.0310370922088623, + "learning_rate": 2.157809157038939e-05, + "loss": 0.2163, + "step": 27482 + }, + { + "epoch": 35.2798459563543, + "grad_norm": 0.9803284406661987, + "learning_rate": 2.1577663671373558e-05, + "loss": 0.2159, + "step": 27483 + }, + { + "epoch": 35.2811296534018, + "grad_norm": 1.0454519987106323, + "learning_rate": 2.1577235772357723e-05, + "loss": 0.2079, + "step": 27484 + }, + { + "epoch": 35.282413350449296, + "grad_norm": 1.1016645431518555, + "learning_rate": 2.157680787334189e-05, + "loss": 0.2325, + "step": 27485 + }, + { + "epoch": 35.283697047496794, + "grad_norm": 0.7187449336051941, + "learning_rate": 2.157637997432606e-05, + "loss": 0.2201, + "step": 27486 + }, + { + "epoch": 35.284980744544285, + "grad_norm": 1.8556827306747437, + "learning_rate": 2.1575952075310225e-05, + "loss": 0.2486, + "step": 27487 + }, + { + "epoch": 
35.28626444159178, + "grad_norm": 0.9155417084693909, + "learning_rate": 2.1575524176294396e-05, + "loss": 0.2436, + "step": 27488 + }, + { + "epoch": 35.28754813863928, + "grad_norm": 0.9902904629707336, + "learning_rate": 2.157509627727856e-05, + "loss": 0.23, + "step": 27489 + }, + { + "epoch": 35.28883183568678, + "grad_norm": 0.9289954900741577, + "learning_rate": 2.1574668378262733e-05, + "loss": 0.2061, + "step": 27490 + }, + { + "epoch": 35.290115532734276, + "grad_norm": 0.9043207764625549, + "learning_rate": 2.1574240479246898e-05, + "loss": 0.2177, + "step": 27491 + }, + { + "epoch": 35.291399229781774, + "grad_norm": 1.0911235809326172, + "learning_rate": 2.1573812580231063e-05, + "loss": 0.2087, + "step": 27492 + }, + { + "epoch": 35.292682926829265, + "grad_norm": 1.3969924449920654, + "learning_rate": 2.1573384681215235e-05, + "loss": 0.2367, + "step": 27493 + }, + { + "epoch": 35.29396662387676, + "grad_norm": 0.861322283744812, + "learning_rate": 2.15729567821994e-05, + "loss": 0.2059, + "step": 27494 + }, + { + "epoch": 35.29525032092426, + "grad_norm": 1.624518871307373, + "learning_rate": 2.157252888318357e-05, + "loss": 0.2352, + "step": 27495 + }, + { + "epoch": 35.29653401797176, + "grad_norm": 1.0578752756118774, + "learning_rate": 2.1572100984167737e-05, + "loss": 0.2026, + "step": 27496 + }, + { + "epoch": 35.29781771501926, + "grad_norm": 1.0167160034179688, + "learning_rate": 2.1571673085151905e-05, + "loss": 0.2228, + "step": 27497 + }, + { + "epoch": 35.299101412066754, + "grad_norm": 1.181250810623169, + "learning_rate": 2.1571245186136074e-05, + "loss": 0.2518, + "step": 27498 + }, + { + "epoch": 35.30038510911425, + "grad_norm": 3.131925344467163, + "learning_rate": 2.157081728712024e-05, + "loss": 0.2334, + "step": 27499 + }, + { + "epoch": 35.30166880616174, + "grad_norm": 1.3914698362350464, + "learning_rate": 2.1570389388104407e-05, + "loss": 0.2368, + "step": 27500 + }, + { + "epoch": 35.30295250320924, + "grad_norm": 
1.738483190536499, + "learning_rate": 2.1569961489088576e-05, + "loss": 0.2335, + "step": 27501 + }, + { + "epoch": 35.30423620025674, + "grad_norm": 1.0868912935256958, + "learning_rate": 2.1569533590072744e-05, + "loss": 0.236, + "step": 27502 + }, + { + "epoch": 35.30551989730424, + "grad_norm": 2.0698606967926025, + "learning_rate": 2.156910569105691e-05, + "loss": 0.2497, + "step": 27503 + }, + { + "epoch": 35.306803594351734, + "grad_norm": 1.1352193355560303, + "learning_rate": 2.156867779204108e-05, + "loss": 0.222, + "step": 27504 + }, + { + "epoch": 35.30808729139923, + "grad_norm": 1.1075981855392456, + "learning_rate": 2.1568249893025246e-05, + "loss": 0.2109, + "step": 27505 + }, + { + "epoch": 35.30937098844672, + "grad_norm": 1.4212716817855835, + "learning_rate": 2.1567821994009414e-05, + "loss": 0.2357, + "step": 27506 + }, + { + "epoch": 35.31065468549422, + "grad_norm": 1.206220269203186, + "learning_rate": 2.1567394094993583e-05, + "loss": 0.2106, + "step": 27507 + }, + { + "epoch": 35.31193838254172, + "grad_norm": 1.8663625717163086, + "learning_rate": 2.1566966195977748e-05, + "loss": 0.2279, + "step": 27508 + }, + { + "epoch": 35.31322207958922, + "grad_norm": 0.9836003184318542, + "learning_rate": 2.156653829696192e-05, + "loss": 0.2071, + "step": 27509 + }, + { + "epoch": 35.314505776636715, + "grad_norm": 1.3815361261367798, + "learning_rate": 2.1566110397946085e-05, + "loss": 0.263, + "step": 27510 + }, + { + "epoch": 35.31578947368421, + "grad_norm": 4.417062282562256, + "learning_rate": 2.1565682498930253e-05, + "loss": 0.2428, + "step": 27511 + }, + { + "epoch": 35.31707317073171, + "grad_norm": 1.7174612283706665, + "learning_rate": 2.156525459991442e-05, + "loss": 0.3045, + "step": 27512 + }, + { + "epoch": 35.3183568677792, + "grad_norm": 2.6989729404449463, + "learning_rate": 2.1564826700898586e-05, + "loss": 0.2848, + "step": 27513 + }, + { + "epoch": 35.3196405648267, + "grad_norm": 1.4123249053955078, + "learning_rate": 
2.1564398801882758e-05, + "loss": 0.2871, + "step": 27514 + }, + { + "epoch": 35.3209242618742, + "grad_norm": 1.991551399230957, + "learning_rate": 2.1563970902866923e-05, + "loss": 0.4146, + "step": 27515 + }, + { + "epoch": 35.322207958921695, + "grad_norm": 0.8762668371200562, + "learning_rate": 2.156354300385109e-05, + "loss": 0.2711, + "step": 27516 + }, + { + "epoch": 35.32349165596919, + "grad_norm": 1.1608684062957764, + "learning_rate": 2.156311510483526e-05, + "loss": 0.2339, + "step": 27517 + }, + { + "epoch": 35.32477535301669, + "grad_norm": 1.7413582801818848, + "learning_rate": 2.156268720581943e-05, + "loss": 0.2273, + "step": 27518 + }, + { + "epoch": 35.32605905006419, + "grad_norm": 1.015153169631958, + "learning_rate": 2.1562259306803593e-05, + "loss": 0.2666, + "step": 27519 + }, + { + "epoch": 35.32734274711168, + "grad_norm": 0.8712536692619324, + "learning_rate": 2.1561831407787762e-05, + "loss": 0.2408, + "step": 27520 + }, + { + "epoch": 35.32862644415918, + "grad_norm": 1.210176944732666, + "learning_rate": 2.156140350877193e-05, + "loss": 0.2474, + "step": 27521 + }, + { + "epoch": 35.329910141206675, + "grad_norm": 0.7449585199356079, + "learning_rate": 2.15609756097561e-05, + "loss": 0.2475, + "step": 27522 + }, + { + "epoch": 35.33119383825417, + "grad_norm": 0.8935561180114746, + "learning_rate": 2.1560547710740267e-05, + "loss": 0.2509, + "step": 27523 + }, + { + "epoch": 35.33247753530167, + "grad_norm": 1.564718246459961, + "learning_rate": 2.1560119811724432e-05, + "loss": 0.2303, + "step": 27524 + }, + { + "epoch": 35.33376123234917, + "grad_norm": 0.8971218466758728, + "learning_rate": 2.1559691912708604e-05, + "loss": 0.2519, + "step": 27525 + }, + { + "epoch": 35.33504492939666, + "grad_norm": 0.8295077085494995, + "learning_rate": 2.155926401369277e-05, + "loss": 0.2483, + "step": 27526 + }, + { + "epoch": 35.33632862644416, + "grad_norm": 1.301755666732788, + "learning_rate": 2.1558836114676934e-05, + "loss": 0.2637, + 
"step": 27527 + }, + { + "epoch": 35.337612323491655, + "grad_norm": 1.4279061555862427, + "learning_rate": 2.1558408215661106e-05, + "loss": 0.2303, + "step": 27528 + }, + { + "epoch": 35.33889602053915, + "grad_norm": 1.2511413097381592, + "learning_rate": 2.155798031664527e-05, + "loss": 0.2418, + "step": 27529 + }, + { + "epoch": 35.34017971758665, + "grad_norm": 1.1018342971801758, + "learning_rate": 2.1557552417629443e-05, + "loss": 0.2406, + "step": 27530 + }, + { + "epoch": 35.34146341463415, + "grad_norm": 2.338345527648926, + "learning_rate": 2.1557124518613608e-05, + "loss": 0.2193, + "step": 27531 + }, + { + "epoch": 35.342747111681646, + "grad_norm": 1.2852860689163208, + "learning_rate": 2.1556696619597776e-05, + "loss": 0.266, + "step": 27532 + }, + { + "epoch": 35.34403080872914, + "grad_norm": 1.3826427459716797, + "learning_rate": 2.1556268720581944e-05, + "loss": 0.2147, + "step": 27533 + }, + { + "epoch": 35.345314505776635, + "grad_norm": 1.802885890007019, + "learning_rate": 2.155584082156611e-05, + "loss": 0.2391, + "step": 27534 + }, + { + "epoch": 35.34659820282413, + "grad_norm": 1.4059725999832153, + "learning_rate": 2.1555412922550278e-05, + "loss": 0.242, + "step": 27535 + }, + { + "epoch": 35.34788189987163, + "grad_norm": 1.5762912034988403, + "learning_rate": 2.1554985023534446e-05, + "loss": 0.2302, + "step": 27536 + }, + { + "epoch": 35.34916559691913, + "grad_norm": 0.7035233378410339, + "learning_rate": 2.1554557124518615e-05, + "loss": 0.2143, + "step": 27537 + }, + { + "epoch": 35.350449293966626, + "grad_norm": 1.9322112798690796, + "learning_rate": 2.1554129225502783e-05, + "loss": 0.2267, + "step": 27538 + }, + { + "epoch": 35.35173299101412, + "grad_norm": 0.7969146966934204, + "learning_rate": 2.1553701326486948e-05, + "loss": 0.2161, + "step": 27539 + }, + { + "epoch": 35.353016688061615, + "grad_norm": 1.0880028009414673, + "learning_rate": 2.1553273427471117e-05, + "loss": 0.2284, + "step": 27540 + }, + { + "epoch": 
35.35430038510911, + "grad_norm": 1.517198085784912, + "learning_rate": 2.1552845528455285e-05, + "loss": 0.2208, + "step": 27541 + }, + { + "epoch": 35.35558408215661, + "grad_norm": 1.6536881923675537, + "learning_rate": 2.1552417629439453e-05, + "loss": 0.2266, + "step": 27542 + }, + { + "epoch": 35.35686777920411, + "grad_norm": 1.4578322172164917, + "learning_rate": 2.155198973042362e-05, + "loss": 0.2027, + "step": 27543 + }, + { + "epoch": 35.35815147625161, + "grad_norm": 1.063530445098877, + "learning_rate": 2.155156183140779e-05, + "loss": 0.191, + "step": 27544 + }, + { + "epoch": 35.359435173299104, + "grad_norm": 1.0571985244750977, + "learning_rate": 2.1551133932391955e-05, + "loss": 0.2043, + "step": 27545 + }, + { + "epoch": 35.360718870346595, + "grad_norm": 0.918674886226654, + "learning_rate": 2.1550706033376124e-05, + "loss": 0.222, + "step": 27546 + }, + { + "epoch": 35.36200256739409, + "grad_norm": 1.080026388168335, + "learning_rate": 2.1550278134360292e-05, + "loss": 0.2115, + "step": 27547 + }, + { + "epoch": 35.36328626444159, + "grad_norm": 1.5668898820877075, + "learning_rate": 2.1549850235344457e-05, + "loss": 0.2268, + "step": 27548 + }, + { + "epoch": 35.36456996148909, + "grad_norm": 1.5467652082443237, + "learning_rate": 2.154942233632863e-05, + "loss": 0.2066, + "step": 27549 + }, + { + "epoch": 35.36585365853659, + "grad_norm": 1.0209980010986328, + "learning_rate": 2.1548994437312794e-05, + "loss": 0.226, + "step": 27550 + }, + { + "epoch": 35.367137355584084, + "grad_norm": 1.32029128074646, + "learning_rate": 2.1548566538296962e-05, + "loss": 0.1934, + "step": 27551 + }, + { + "epoch": 35.36842105263158, + "grad_norm": 1.2625755071640015, + "learning_rate": 2.154813863928113e-05, + "loss": 0.2211, + "step": 27552 + }, + { + "epoch": 35.36970474967907, + "grad_norm": 5.316233158111572, + "learning_rate": 2.1547710740265296e-05, + "loss": 0.2308, + "step": 27553 + }, + { + "epoch": 35.37098844672657, + "grad_norm": 
3.4197745323181152, + "learning_rate": 2.1547282841249468e-05, + "loss": 0.2501, + "step": 27554 + }, + { + "epoch": 35.37227214377407, + "grad_norm": 1.4961715936660767, + "learning_rate": 2.1546854942233633e-05, + "loss": 0.2248, + "step": 27555 + }, + { + "epoch": 35.37355584082157, + "grad_norm": 1.1702392101287842, + "learning_rate": 2.15464270432178e-05, + "loss": 0.2192, + "step": 27556 + }, + { + "epoch": 35.374839537869065, + "grad_norm": 1.1903401613235474, + "learning_rate": 2.154599914420197e-05, + "loss": 0.2991, + "step": 27557 + }, + { + "epoch": 35.37612323491656, + "grad_norm": 3.11189603805542, + "learning_rate": 2.1545571245186138e-05, + "loss": 0.2279, + "step": 27558 + }, + { + "epoch": 35.37740693196405, + "grad_norm": 3.6361680030822754, + "learning_rate": 2.1545143346170303e-05, + "loss": 0.2295, + "step": 27559 + }, + { + "epoch": 35.37869062901155, + "grad_norm": 1.4182080030441284, + "learning_rate": 2.154471544715447e-05, + "loss": 0.2363, + "step": 27560 + }, + { + "epoch": 35.37997432605905, + "grad_norm": 1.167954683303833, + "learning_rate": 2.154428754813864e-05, + "loss": 0.2291, + "step": 27561 + }, + { + "epoch": 35.38125802310655, + "grad_norm": 3.434035062789917, + "learning_rate": 2.1543859649122808e-05, + "loss": 0.2746, + "step": 27562 + }, + { + "epoch": 35.382541720154045, + "grad_norm": 4.627537250518799, + "learning_rate": 2.1543431750106976e-05, + "loss": 0.2742, + "step": 27563 + }, + { + "epoch": 35.38382541720154, + "grad_norm": 1.632441520690918, + "learning_rate": 2.154300385109114e-05, + "loss": 0.3761, + "step": 27564 + }, + { + "epoch": 35.38510911424904, + "grad_norm": 3.5108156204223633, + "learning_rate": 2.1542575952075313e-05, + "loss": 0.399, + "step": 27565 + }, + { + "epoch": 35.38639281129653, + "grad_norm": 2.2096927165985107, + "learning_rate": 2.154214805305948e-05, + "loss": 0.2619, + "step": 27566 + }, + { + "epoch": 35.38767650834403, + "grad_norm": 0.9403578042984009, + "learning_rate": 
2.1541720154043643e-05, + "loss": 0.2715, + "step": 27567 + }, + { + "epoch": 35.38896020539153, + "grad_norm": 0.8022010326385498, + "learning_rate": 2.1541292255027815e-05, + "loss": 0.2494, + "step": 27568 + }, + { + "epoch": 35.390243902439025, + "grad_norm": 1.0365689992904663, + "learning_rate": 2.154086435601198e-05, + "loss": 0.2563, + "step": 27569 + }, + { + "epoch": 35.39152759948652, + "grad_norm": 1.1972033977508545, + "learning_rate": 2.1540436456996152e-05, + "loss": 0.2419, + "step": 27570 + }, + { + "epoch": 35.39281129653402, + "grad_norm": 0.9281275272369385, + "learning_rate": 2.1540008557980317e-05, + "loss": 0.2252, + "step": 27571 + }, + { + "epoch": 35.39409499358152, + "grad_norm": 0.80852210521698, + "learning_rate": 2.1539580658964485e-05, + "loss": 0.2328, + "step": 27572 + }, + { + "epoch": 35.39537869062901, + "grad_norm": 1.0170377492904663, + "learning_rate": 2.1539152759948654e-05, + "loss": 0.2375, + "step": 27573 + }, + { + "epoch": 35.39666238767651, + "grad_norm": 2.244947910308838, + "learning_rate": 2.153872486093282e-05, + "loss": 0.2478, + "step": 27574 + }, + { + "epoch": 35.397946084724005, + "grad_norm": 0.7027885317802429, + "learning_rate": 2.1538296961916987e-05, + "loss": 0.2496, + "step": 27575 + }, + { + "epoch": 35.3992297817715, + "grad_norm": 0.8287565112113953, + "learning_rate": 2.1537869062901156e-05, + "loss": 0.2582, + "step": 27576 + }, + { + "epoch": 35.400513478819, + "grad_norm": 2.447354793548584, + "learning_rate": 2.1537441163885324e-05, + "loss": 0.2419, + "step": 27577 + }, + { + "epoch": 35.4017971758665, + "grad_norm": 0.9614822268486023, + "learning_rate": 2.1537013264869493e-05, + "loss": 0.251, + "step": 27578 + }, + { + "epoch": 35.40308087291399, + "grad_norm": 2.4071919918060303, + "learning_rate": 2.153658536585366e-05, + "loss": 0.2361, + "step": 27579 + }, + { + "epoch": 35.40436456996149, + "grad_norm": 1.0073580741882324, + "learning_rate": 2.1536157466837826e-05, + "loss": 0.2268, + 
"step": 27580 + }, + { + "epoch": 35.405648267008985, + "grad_norm": 1.090272307395935, + "learning_rate": 2.1535729567821994e-05, + "loss": 0.2152, + "step": 27581 + }, + { + "epoch": 35.40693196405648, + "grad_norm": 1.14096999168396, + "learning_rate": 2.1535301668806163e-05, + "loss": 0.2497, + "step": 27582 + }, + { + "epoch": 35.40821566110398, + "grad_norm": 1.8468093872070312, + "learning_rate": 2.1534873769790328e-05, + "loss": 0.2244, + "step": 27583 + }, + { + "epoch": 35.40949935815148, + "grad_norm": 0.8971882462501526, + "learning_rate": 2.15344458707745e-05, + "loss": 0.215, + "step": 27584 + }, + { + "epoch": 35.410783055198976, + "grad_norm": 1.7353765964508057, + "learning_rate": 2.1534017971758665e-05, + "loss": 0.2247, + "step": 27585 + }, + { + "epoch": 35.41206675224647, + "grad_norm": 1.446796178817749, + "learning_rate": 2.1533590072742836e-05, + "loss": 0.1996, + "step": 27586 + }, + { + "epoch": 35.413350449293965, + "grad_norm": 0.949928343296051, + "learning_rate": 2.1533162173727e-05, + "loss": 0.2261, + "step": 27587 + }, + { + "epoch": 35.41463414634146, + "grad_norm": 0.8299592137336731, + "learning_rate": 2.1532734274711166e-05, + "loss": 0.204, + "step": 27588 + }, + { + "epoch": 35.41591784338896, + "grad_norm": 2.055192470550537, + "learning_rate": 2.1532306375695338e-05, + "loss": 0.2195, + "step": 27589 + }, + { + "epoch": 35.41720154043646, + "grad_norm": 0.9503214359283447, + "learning_rate": 2.1531878476679503e-05, + "loss": 0.1927, + "step": 27590 + }, + { + "epoch": 35.41848523748396, + "grad_norm": 1.1318988800048828, + "learning_rate": 2.153145057766367e-05, + "loss": 0.2363, + "step": 27591 + }, + { + "epoch": 35.41976893453145, + "grad_norm": 1.1017286777496338, + "learning_rate": 2.153102267864784e-05, + "loss": 0.24, + "step": 27592 + }, + { + "epoch": 35.421052631578945, + "grad_norm": 1.2843700647354126, + "learning_rate": 2.153059477963201e-05, + "loss": 0.2045, + "step": 27593 + }, + { + "epoch": 
35.42233632862644, + "grad_norm": 1.1100231409072876, + "learning_rate": 2.1530166880616177e-05, + "loss": 0.2184, + "step": 27594 + }, + { + "epoch": 35.42362002567394, + "grad_norm": 1.0341315269470215, + "learning_rate": 2.1529738981600342e-05, + "loss": 0.245, + "step": 27595 + }, + { + "epoch": 35.42490372272144, + "grad_norm": 0.9916970729827881, + "learning_rate": 2.152931108258451e-05, + "loss": 0.2115, + "step": 27596 + }, + { + "epoch": 35.42618741976894, + "grad_norm": 1.3897812366485596, + "learning_rate": 2.152888318356868e-05, + "loss": 0.2381, + "step": 27597 + }, + { + "epoch": 35.427471116816434, + "grad_norm": 1.1150286197662354, + "learning_rate": 2.1528455284552847e-05, + "loss": 0.233, + "step": 27598 + }, + { + "epoch": 35.428754813863925, + "grad_norm": 1.3531653881072998, + "learning_rate": 2.1528027385537012e-05, + "loss": 0.2538, + "step": 27599 + }, + { + "epoch": 35.43003851091142, + "grad_norm": 1.1162668466567993, + "learning_rate": 2.152759948652118e-05, + "loss": 0.2387, + "step": 27600 + }, + { + "epoch": 35.43132220795892, + "grad_norm": 1.375411868095398, + "learning_rate": 2.152717158750535e-05, + "loss": 0.2067, + "step": 27601 + }, + { + "epoch": 35.43260590500642, + "grad_norm": 0.8934232592582703, + "learning_rate": 2.1526743688489517e-05, + "loss": 0.264, + "step": 27602 + }, + { + "epoch": 35.43388960205392, + "grad_norm": 2.8685386180877686, + "learning_rate": 2.1526315789473686e-05, + "loss": 0.2206, + "step": 27603 + }, + { + "epoch": 35.435173299101415, + "grad_norm": 1.338671326637268, + "learning_rate": 2.152588789045785e-05, + "loss": 0.2074, + "step": 27604 + }, + { + "epoch": 35.436456996148905, + "grad_norm": 1.5360654592514038, + "learning_rate": 2.1525459991442023e-05, + "loss": 0.2123, + "step": 27605 + }, + { + "epoch": 35.4377406931964, + "grad_norm": 1.0220234394073486, + "learning_rate": 2.1525032092426188e-05, + "loss": 0.2425, + "step": 27606 + }, + { + "epoch": 35.4390243902439, + "grad_norm": 
2.1412792205810547, + "learning_rate": 2.1524604193410353e-05, + "loss": 0.2513, + "step": 27607 + }, + { + "epoch": 35.4403080872914, + "grad_norm": 2.3726489543914795, + "learning_rate": 2.1524176294394525e-05, + "loss": 0.2152, + "step": 27608 + }, + { + "epoch": 35.4415917843389, + "grad_norm": 1.9715156555175781, + "learning_rate": 2.152374839537869e-05, + "loss": 0.26, + "step": 27609 + }, + { + "epoch": 35.442875481386395, + "grad_norm": 1.6579670906066895, + "learning_rate": 2.152332049636286e-05, + "loss": 0.2241, + "step": 27610 + }, + { + "epoch": 35.44415917843389, + "grad_norm": 1.648149013519287, + "learning_rate": 2.1522892597347026e-05, + "loss": 0.2388, + "step": 27611 + }, + { + "epoch": 35.44544287548138, + "grad_norm": 1.2951691150665283, + "learning_rate": 2.1522464698331195e-05, + "loss": 0.2484, + "step": 27612 + }, + { + "epoch": 35.44672657252888, + "grad_norm": 1.604701042175293, + "learning_rate": 2.1522036799315363e-05, + "loss": 0.2622, + "step": 27613 + }, + { + "epoch": 35.44801026957638, + "grad_norm": 2.377366065979004, + "learning_rate": 2.1521608900299528e-05, + "loss": 0.3684, + "step": 27614 + }, + { + "epoch": 35.44929396662388, + "grad_norm": 2.6908931732177734, + "learning_rate": 2.1521181001283697e-05, + "loss": 0.412, + "step": 27615 + }, + { + "epoch": 35.450577663671375, + "grad_norm": 1.245762825012207, + "learning_rate": 2.1520753102267865e-05, + "loss": 0.2454, + "step": 27616 + }, + { + "epoch": 35.45186136071887, + "grad_norm": 1.3935376405715942, + "learning_rate": 2.1520325203252033e-05, + "loss": 0.2506, + "step": 27617 + }, + { + "epoch": 35.45314505776637, + "grad_norm": 2.1277027130126953, + "learning_rate": 2.1519897304236202e-05, + "loss": 0.2635, + "step": 27618 + }, + { + "epoch": 35.45442875481386, + "grad_norm": 1.0357683897018433, + "learning_rate": 2.151946940522037e-05, + "loss": 0.2736, + "step": 27619 + }, + { + "epoch": 35.45571245186136, + "grad_norm": 0.8667138814926147, + "learning_rate": 
2.1519041506204535e-05, + "loss": 0.2529, + "step": 27620 + }, + { + "epoch": 35.45699614890886, + "grad_norm": 1.245971441268921, + "learning_rate": 2.1518613607188704e-05, + "loss": 0.2409, + "step": 27621 + }, + { + "epoch": 35.458279845956355, + "grad_norm": 0.9464114308357239, + "learning_rate": 2.1518185708172872e-05, + "loss": 0.2367, + "step": 27622 + }, + { + "epoch": 35.45956354300385, + "grad_norm": 1.3662045001983643, + "learning_rate": 2.1517757809157037e-05, + "loss": 0.2521, + "step": 27623 + }, + { + "epoch": 35.46084724005135, + "grad_norm": 0.8372938632965088, + "learning_rate": 2.151732991014121e-05, + "loss": 0.2358, + "step": 27624 + }, + { + "epoch": 35.46213093709884, + "grad_norm": 1.4439311027526855, + "learning_rate": 2.1516902011125374e-05, + "loss": 0.2626, + "step": 27625 + }, + { + "epoch": 35.46341463414634, + "grad_norm": 0.8738772869110107, + "learning_rate": 2.1516474112109546e-05, + "loss": 0.2255, + "step": 27626 + }, + { + "epoch": 35.46469833119384, + "grad_norm": 1.094473958015442, + "learning_rate": 2.151604621309371e-05, + "loss": 0.267, + "step": 27627 + }, + { + "epoch": 35.465982028241335, + "grad_norm": 1.020966649055481, + "learning_rate": 2.1515618314077876e-05, + "loss": 0.2178, + "step": 27628 + }, + { + "epoch": 35.46726572528883, + "grad_norm": 3.425997734069824, + "learning_rate": 2.1515190415062048e-05, + "loss": 0.2108, + "step": 27629 + }, + { + "epoch": 35.46854942233633, + "grad_norm": 2.9392848014831543, + "learning_rate": 2.1514762516046213e-05, + "loss": 0.2549, + "step": 27630 + }, + { + "epoch": 35.46983311938383, + "grad_norm": 2.0517830848693848, + "learning_rate": 2.151433461703038e-05, + "loss": 0.2396, + "step": 27631 + }, + { + "epoch": 35.47111681643132, + "grad_norm": 4.227936744689941, + "learning_rate": 2.151390671801455e-05, + "loss": 0.2203, + "step": 27632 + }, + { + "epoch": 35.47240051347882, + "grad_norm": 0.9817638993263245, + "learning_rate": 2.1513478818998718e-05, + "loss": 0.2261, + 
"step": 27633 + }, + { + "epoch": 35.473684210526315, + "grad_norm": 1.0046424865722656, + "learning_rate": 2.1513050919982886e-05, + "loss": 0.2515, + "step": 27634 + }, + { + "epoch": 35.47496790757381, + "grad_norm": 1.0590380430221558, + "learning_rate": 2.151262302096705e-05, + "loss": 0.2229, + "step": 27635 + }, + { + "epoch": 35.47625160462131, + "grad_norm": 1.9210079908370972, + "learning_rate": 2.151219512195122e-05, + "loss": 0.2281, + "step": 27636 + }, + { + "epoch": 35.47753530166881, + "grad_norm": 2.4265851974487305, + "learning_rate": 2.1511767222935388e-05, + "loss": 0.237, + "step": 27637 + }, + { + "epoch": 35.47881899871631, + "grad_norm": 0.8371517062187195, + "learning_rate": 2.1511339323919557e-05, + "loss": 0.2204, + "step": 27638 + }, + { + "epoch": 35.4801026957638, + "grad_norm": 2.8712973594665527, + "learning_rate": 2.151091142490372e-05, + "loss": 0.2278, + "step": 27639 + }, + { + "epoch": 35.481386392811295, + "grad_norm": 3.618130922317505, + "learning_rate": 2.1510483525887893e-05, + "loss": 0.2194, + "step": 27640 + }, + { + "epoch": 35.48267008985879, + "grad_norm": 1.3368473052978516, + "learning_rate": 2.151005562687206e-05, + "loss": 0.2275, + "step": 27641 + }, + { + "epoch": 35.48395378690629, + "grad_norm": 1.0750908851623535, + "learning_rate": 2.1509627727856227e-05, + "loss": 0.2165, + "step": 27642 + }, + { + "epoch": 35.48523748395379, + "grad_norm": 0.9687149524688721, + "learning_rate": 2.1509199828840395e-05, + "loss": 0.2251, + "step": 27643 + }, + { + "epoch": 35.48652118100129, + "grad_norm": 1.0284396409988403, + "learning_rate": 2.150877192982456e-05, + "loss": 0.2249, + "step": 27644 + }, + { + "epoch": 35.48780487804878, + "grad_norm": 1.4682925939559937, + "learning_rate": 2.1508344030808732e-05, + "loss": 0.2188, + "step": 27645 + }, + { + "epoch": 35.489088575096275, + "grad_norm": 1.6445376873016357, + "learning_rate": 2.1507916131792897e-05, + "loss": 0.223, + "step": 27646 + }, + { + "epoch": 
35.49037227214377, + "grad_norm": 1.04465651512146, + "learning_rate": 2.1507488232777065e-05, + "loss": 0.2254, + "step": 27647 + }, + { + "epoch": 35.49165596919127, + "grad_norm": 1.2218624353408813, + "learning_rate": 2.1507060333761234e-05, + "loss": 0.2555, + "step": 27648 + }, + { + "epoch": 35.49293966623877, + "grad_norm": 4.302952766418457, + "learning_rate": 2.15066324347454e-05, + "loss": 0.2099, + "step": 27649 + }, + { + "epoch": 35.49422336328627, + "grad_norm": 0.900077760219574, + "learning_rate": 2.150620453572957e-05, + "loss": 0.204, + "step": 27650 + }, + { + "epoch": 35.495507060333765, + "grad_norm": 1.2968767881393433, + "learning_rate": 2.1505776636713736e-05, + "loss": 0.2393, + "step": 27651 + }, + { + "epoch": 35.496790757381255, + "grad_norm": 1.1247138977050781, + "learning_rate": 2.1505348737697904e-05, + "loss": 0.234, + "step": 27652 + }, + { + "epoch": 35.49807445442875, + "grad_norm": 1.3504960536956787, + "learning_rate": 2.1504920838682073e-05, + "loss": 0.2262, + "step": 27653 + }, + { + "epoch": 35.49935815147625, + "grad_norm": 1.383286952972412, + "learning_rate": 2.150449293966624e-05, + "loss": 0.1995, + "step": 27654 + }, + { + "epoch": 35.50064184852375, + "grad_norm": 2.0938055515289307, + "learning_rate": 2.1504065040650406e-05, + "loss": 0.1978, + "step": 27655 + }, + { + "epoch": 35.50192554557125, + "grad_norm": 1.767579197883606, + "learning_rate": 2.1503637141634574e-05, + "loss": 0.2439, + "step": 27656 + }, + { + "epoch": 35.503209242618745, + "grad_norm": 2.064983606338501, + "learning_rate": 2.1503209242618743e-05, + "loss": 0.312, + "step": 27657 + }, + { + "epoch": 35.504492939666235, + "grad_norm": 1.1690818071365356, + "learning_rate": 2.1502781343602908e-05, + "loss": 0.2593, + "step": 27658 + }, + { + "epoch": 35.50577663671373, + "grad_norm": 1.2777588367462158, + "learning_rate": 2.150235344458708e-05, + "loss": 0.2434, + "step": 27659 + }, + { + "epoch": 35.50706033376123, + "grad_norm": 
1.1972328424453735, + "learning_rate": 2.1501925545571245e-05, + "loss": 0.2463, + "step": 27660 + }, + { + "epoch": 35.50834403080873, + "grad_norm": 1.85892915725708, + "learning_rate": 2.1501497646555413e-05, + "loss": 0.2388, + "step": 27661 + }, + { + "epoch": 35.50962772785623, + "grad_norm": 1.366420865058899, + "learning_rate": 2.150106974753958e-05, + "loss": 0.2378, + "step": 27662 + }, + { + "epoch": 35.510911424903725, + "grad_norm": 1.2575736045837402, + "learning_rate": 2.1500641848523747e-05, + "loss": 0.286, + "step": 27663 + }, + { + "epoch": 35.51219512195122, + "grad_norm": 2.79394268989563, + "learning_rate": 2.150021394950792e-05, + "loss": 0.2885, + "step": 27664 + }, + { + "epoch": 35.51347881899871, + "grad_norm": 2.4384195804595947, + "learning_rate": 2.1499786050492083e-05, + "loss": 0.3912, + "step": 27665 + }, + { + "epoch": 35.51476251604621, + "grad_norm": 0.8424442410469055, + "learning_rate": 2.1499358151476252e-05, + "loss": 0.2483, + "step": 27666 + }, + { + "epoch": 35.51604621309371, + "grad_norm": 0.8363544344902039, + "learning_rate": 2.149893025246042e-05, + "loss": 0.2476, + "step": 27667 + }, + { + "epoch": 35.51732991014121, + "grad_norm": 1.4531643390655518, + "learning_rate": 2.1498502353444585e-05, + "loss": 0.2504, + "step": 27668 + }, + { + "epoch": 35.518613607188705, + "grad_norm": 1.1131047010421753, + "learning_rate": 2.1498074454428757e-05, + "loss": 0.2471, + "step": 27669 + }, + { + "epoch": 35.5198973042362, + "grad_norm": 0.8629105687141418, + "learning_rate": 2.1497646555412922e-05, + "loss": 0.2392, + "step": 27670 + }, + { + "epoch": 35.52118100128369, + "grad_norm": 0.9941790103912354, + "learning_rate": 2.149721865639709e-05, + "loss": 0.2235, + "step": 27671 + }, + { + "epoch": 35.52246469833119, + "grad_norm": 1.6822525262832642, + "learning_rate": 2.149679075738126e-05, + "loss": 0.2338, + "step": 27672 + }, + { + "epoch": 35.52374839537869, + "grad_norm": 1.7836875915527344, + "learning_rate": 
2.1496362858365427e-05, + "loss": 0.235, + "step": 27673 + }, + { + "epoch": 35.52503209242619, + "grad_norm": 1.2922029495239258, + "learning_rate": 2.1495934959349592e-05, + "loss": 0.2457, + "step": 27674 + }, + { + "epoch": 35.526315789473685, + "grad_norm": 1.041500210762024, + "learning_rate": 2.149550706033376e-05, + "loss": 0.2475, + "step": 27675 + }, + { + "epoch": 35.52759948652118, + "grad_norm": 0.927107572555542, + "learning_rate": 2.149507916131793e-05, + "loss": 0.2369, + "step": 27676 + }, + { + "epoch": 35.52888318356868, + "grad_norm": 1.8342432975769043, + "learning_rate": 2.1494651262302098e-05, + "loss": 0.2732, + "step": 27677 + }, + { + "epoch": 35.53016688061617, + "grad_norm": 0.8675385117530823, + "learning_rate": 2.1494223363286266e-05, + "loss": 0.2134, + "step": 27678 + }, + { + "epoch": 35.53145057766367, + "grad_norm": 1.9101364612579346, + "learning_rate": 2.149379546427043e-05, + "loss": 0.2345, + "step": 27679 + }, + { + "epoch": 35.53273427471117, + "grad_norm": 1.1673331260681152, + "learning_rate": 2.1493367565254603e-05, + "loss": 0.2241, + "step": 27680 + }, + { + "epoch": 35.534017971758665, + "grad_norm": 0.8881629705429077, + "learning_rate": 2.1492939666238768e-05, + "loss": 0.2323, + "step": 27681 + }, + { + "epoch": 35.53530166880616, + "grad_norm": 1.022760033607483, + "learning_rate": 2.1492511767222933e-05, + "loss": 0.2241, + "step": 27682 + }, + { + "epoch": 35.53658536585366, + "grad_norm": 0.8466789126396179, + "learning_rate": 2.1492083868207105e-05, + "loss": 0.2183, + "step": 27683 + }, + { + "epoch": 35.53786906290116, + "grad_norm": 1.0509973764419556, + "learning_rate": 2.149165596919127e-05, + "loss": 0.2385, + "step": 27684 + }, + { + "epoch": 35.53915275994865, + "grad_norm": 0.7664573192596436, + "learning_rate": 2.149122807017544e-05, + "loss": 0.2264, + "step": 27685 + }, + { + "epoch": 35.54043645699615, + "grad_norm": 1.039275050163269, + "learning_rate": 2.1490800171159606e-05, + "loss": 0.2502, + 
"step": 27686 + }, + { + "epoch": 35.541720154043645, + "grad_norm": 1.5459243059158325, + "learning_rate": 2.1490372272143775e-05, + "loss": 0.2351, + "step": 27687 + }, + { + "epoch": 35.54300385109114, + "grad_norm": 2.231369972229004, + "learning_rate": 2.1489944373127943e-05, + "loss": 0.2222, + "step": 27688 + }, + { + "epoch": 35.54428754813864, + "grad_norm": 1.9269245862960815, + "learning_rate": 2.1489516474112108e-05, + "loss": 0.2047, + "step": 27689 + }, + { + "epoch": 35.54557124518614, + "grad_norm": 0.8354594707489014, + "learning_rate": 2.1489088575096277e-05, + "loss": 0.2234, + "step": 27690 + }, + { + "epoch": 35.54685494223363, + "grad_norm": 0.9109161496162415, + "learning_rate": 2.1488660676080445e-05, + "loss": 0.1989, + "step": 27691 + }, + { + "epoch": 35.54813863928113, + "grad_norm": 1.1537734270095825, + "learning_rate": 2.1488232777064614e-05, + "loss": 0.2102, + "step": 27692 + }, + { + "epoch": 35.549422336328625, + "grad_norm": 1.4355013370513916, + "learning_rate": 2.1487804878048782e-05, + "loss": 0.201, + "step": 27693 + }, + { + "epoch": 35.55070603337612, + "grad_norm": 1.091198444366455, + "learning_rate": 2.148737697903295e-05, + "loss": 0.2053, + "step": 27694 + }, + { + "epoch": 35.55198973042362, + "grad_norm": 1.34925377368927, + "learning_rate": 2.1486949080017115e-05, + "loss": 0.2004, + "step": 27695 + }, + { + "epoch": 35.55327342747112, + "grad_norm": 1.445784091949463, + "learning_rate": 2.1486521181001284e-05, + "loss": 0.2182, + "step": 27696 + }, + { + "epoch": 35.55455712451862, + "grad_norm": 0.7838452458381653, + "learning_rate": 2.1486093281985452e-05, + "loss": 0.199, + "step": 27697 + }, + { + "epoch": 35.55584082156611, + "grad_norm": 1.185180902481079, + "learning_rate": 2.1485665382969617e-05, + "loss": 0.2136, + "step": 27698 + }, + { + "epoch": 35.557124518613605, + "grad_norm": 1.784017562866211, + "learning_rate": 2.148523748395379e-05, + "loss": 0.2689, + "step": 27699 + }, + { + "epoch": 
35.5584082156611, + "grad_norm": 1.954878568649292, + "learning_rate": 2.1484809584937954e-05, + "loss": 0.2285, + "step": 27700 + }, + { + "epoch": 35.5596919127086, + "grad_norm": 1.1489107608795166, + "learning_rate": 2.1484381685922126e-05, + "loss": 0.222, + "step": 27701 + }, + { + "epoch": 35.5609756097561, + "grad_norm": 10.458537101745605, + "learning_rate": 2.148395378690629e-05, + "loss": 0.2073, + "step": 27702 + }, + { + "epoch": 35.5622593068036, + "grad_norm": 1.1746588945388794, + "learning_rate": 2.1483525887890456e-05, + "loss": 0.2361, + "step": 27703 + }, + { + "epoch": 35.563543003851095, + "grad_norm": 2.571479082107544, + "learning_rate": 2.1483097988874628e-05, + "loss": 0.2143, + "step": 27704 + }, + { + "epoch": 35.564826700898585, + "grad_norm": 1.457794189453125, + "learning_rate": 2.1482670089858793e-05, + "loss": 0.2554, + "step": 27705 + }, + { + "epoch": 35.56611039794608, + "grad_norm": 1.3649321794509888, + "learning_rate": 2.148224219084296e-05, + "loss": 0.216, + "step": 27706 + }, + { + "epoch": 35.56739409499358, + "grad_norm": 2.763249158859253, + "learning_rate": 2.148181429182713e-05, + "loss": 0.2603, + "step": 27707 + }, + { + "epoch": 35.56867779204108, + "grad_norm": 2.092899799346924, + "learning_rate": 2.1481386392811298e-05, + "loss": 0.2507, + "step": 27708 + }, + { + "epoch": 35.56996148908858, + "grad_norm": 0.969470202922821, + "learning_rate": 2.1480958493795466e-05, + "loss": 0.2234, + "step": 27709 + }, + { + "epoch": 35.571245186136075, + "grad_norm": 1.2068195343017578, + "learning_rate": 2.148053059477963e-05, + "loss": 0.2231, + "step": 27710 + }, + { + "epoch": 35.572528883183566, + "grad_norm": 1.2920782566070557, + "learning_rate": 2.14801026957638e-05, + "loss": 0.2629, + "step": 27711 + }, + { + "epoch": 35.57381258023106, + "grad_norm": 1.8499494791030884, + "learning_rate": 2.1479674796747968e-05, + "loss": 0.2437, + "step": 27712 + }, + { + "epoch": 35.57509627727856, + "grad_norm": 
3.9815094470977783, + "learning_rate": 2.1479246897732137e-05, + "loss": 0.2869, + "step": 27713 + }, + { + "epoch": 35.57637997432606, + "grad_norm": 1.2368178367614746, + "learning_rate": 2.14788189987163e-05, + "loss": 0.2977, + "step": 27714 + }, + { + "epoch": 35.57766367137356, + "grad_norm": 3.660209894180298, + "learning_rate": 2.1478391099700473e-05, + "loss": 0.4141, + "step": 27715 + }, + { + "epoch": 35.578947368421055, + "grad_norm": 1.5438709259033203, + "learning_rate": 2.147796320068464e-05, + "loss": 0.2572, + "step": 27716 + }, + { + "epoch": 35.58023106546855, + "grad_norm": 1.1734627485275269, + "learning_rate": 2.1477535301668807e-05, + "loss": 0.2528, + "step": 27717 + }, + { + "epoch": 35.58151476251604, + "grad_norm": 1.1270503997802734, + "learning_rate": 2.1477107402652975e-05, + "loss": 0.2822, + "step": 27718 + }, + { + "epoch": 35.58279845956354, + "grad_norm": 2.1095025539398193, + "learning_rate": 2.147667950363714e-05, + "loss": 0.2829, + "step": 27719 + }, + { + "epoch": 35.58408215661104, + "grad_norm": 1.4214649200439453, + "learning_rate": 2.1476251604621312e-05, + "loss": 0.2332, + "step": 27720 + }, + { + "epoch": 35.58536585365854, + "grad_norm": 0.7320100665092468, + "learning_rate": 2.1475823705605477e-05, + "loss": 0.2404, + "step": 27721 + }, + { + "epoch": 35.586649550706035, + "grad_norm": 0.9133366942405701, + "learning_rate": 2.1475395806589646e-05, + "loss": 0.2505, + "step": 27722 + }, + { + "epoch": 35.58793324775353, + "grad_norm": 1.0906535387039185, + "learning_rate": 2.1474967907573814e-05, + "loss": 0.2252, + "step": 27723 + }, + { + "epoch": 35.589216944801024, + "grad_norm": 0.9361691474914551, + "learning_rate": 2.147454000855798e-05, + "loss": 0.2159, + "step": 27724 + }, + { + "epoch": 35.59050064184852, + "grad_norm": 0.8712897896766663, + "learning_rate": 2.147411210954215e-05, + "loss": 0.2214, + "step": 27725 + }, + { + "epoch": 35.59178433889602, + "grad_norm": 1.3439860343933105, + "learning_rate": 
2.1473684210526316e-05, + "loss": 0.219, + "step": 27726 + }, + { + "epoch": 35.59306803594352, + "grad_norm": 3.0740628242492676, + "learning_rate": 2.1473256311510484e-05, + "loss": 0.225, + "step": 27727 + }, + { + "epoch": 35.594351732991015, + "grad_norm": 1.9690524339675903, + "learning_rate": 2.1472828412494653e-05, + "loss": 0.2256, + "step": 27728 + }, + { + "epoch": 35.59563543003851, + "grad_norm": 0.881428599357605, + "learning_rate": 2.1472400513478818e-05, + "loss": 0.2289, + "step": 27729 + }, + { + "epoch": 35.59691912708601, + "grad_norm": 1.2916897535324097, + "learning_rate": 2.1471972614462986e-05, + "loss": 0.2498, + "step": 27730 + }, + { + "epoch": 35.5982028241335, + "grad_norm": 1.0093764066696167, + "learning_rate": 2.1471544715447154e-05, + "loss": 0.2311, + "step": 27731 + }, + { + "epoch": 35.599486521181, + "grad_norm": 2.0976943969726562, + "learning_rate": 2.1471116816431323e-05, + "loss": 0.2101, + "step": 27732 + }, + { + "epoch": 35.6007702182285, + "grad_norm": 1.7617194652557373, + "learning_rate": 2.147068891741549e-05, + "loss": 0.2425, + "step": 27733 + }, + { + "epoch": 35.602053915275995, + "grad_norm": 1.06636381149292, + "learning_rate": 2.147026101839966e-05, + "loss": 0.2366, + "step": 27734 + }, + { + "epoch": 35.60333761232349, + "grad_norm": 1.660574197769165, + "learning_rate": 2.1469833119383825e-05, + "loss": 0.2631, + "step": 27735 + }, + { + "epoch": 35.60462130937099, + "grad_norm": 0.7576343417167664, + "learning_rate": 2.1469405220367993e-05, + "loss": 0.2197, + "step": 27736 + }, + { + "epoch": 35.60590500641848, + "grad_norm": 1.1404354572296143, + "learning_rate": 2.146897732135216e-05, + "loss": 0.2331, + "step": 27737 + }, + { + "epoch": 35.60718870346598, + "grad_norm": 1.3745968341827393, + "learning_rate": 2.1468549422336327e-05, + "loss": 0.2271, + "step": 27738 + }, + { + "epoch": 35.60847240051348, + "grad_norm": 2.537182569503784, + "learning_rate": 2.14681215233205e-05, + "loss": 0.2222, + 
"step": 27739 + }, + { + "epoch": 35.609756097560975, + "grad_norm": 1.0623453855514526, + "learning_rate": 2.1467693624304663e-05, + "loss": 0.2113, + "step": 27740 + }, + { + "epoch": 35.61103979460847, + "grad_norm": 1.3646000623703003, + "learning_rate": 2.1467265725288835e-05, + "loss": 0.2109, + "step": 27741 + }, + { + "epoch": 35.61232349165597, + "grad_norm": 2.4512054920196533, + "learning_rate": 2.1466837826273e-05, + "loss": 0.2529, + "step": 27742 + }, + { + "epoch": 35.61360718870347, + "grad_norm": 0.9819491505622864, + "learning_rate": 2.1466409927257165e-05, + "loss": 0.2159, + "step": 27743 + }, + { + "epoch": 35.61489088575096, + "grad_norm": 1.0368372201919556, + "learning_rate": 2.1465982028241337e-05, + "loss": 0.2032, + "step": 27744 + }, + { + "epoch": 35.61617458279846, + "grad_norm": 1.271806240081787, + "learning_rate": 2.1465554129225502e-05, + "loss": 0.2426, + "step": 27745 + }, + { + "epoch": 35.617458279845955, + "grad_norm": 1.2697869539260864, + "learning_rate": 2.146512623020967e-05, + "loss": 0.2309, + "step": 27746 + }, + { + "epoch": 35.61874197689345, + "grad_norm": 1.1477054357528687, + "learning_rate": 2.146469833119384e-05, + "loss": 0.1978, + "step": 27747 + }, + { + "epoch": 35.62002567394095, + "grad_norm": 1.0691629648208618, + "learning_rate": 2.1464270432178007e-05, + "loss": 0.225, + "step": 27748 + }, + { + "epoch": 35.62130937098845, + "grad_norm": 1.319244623184204, + "learning_rate": 2.1463842533162176e-05, + "loss": 0.2282, + "step": 27749 + }, + { + "epoch": 35.62259306803595, + "grad_norm": 2.0700669288635254, + "learning_rate": 2.146341463414634e-05, + "loss": 0.2332, + "step": 27750 + }, + { + "epoch": 35.62387676508344, + "grad_norm": 2.0526251792907715, + "learning_rate": 2.146298673513051e-05, + "loss": 0.2378, + "step": 27751 + }, + { + "epoch": 35.625160462130935, + "grad_norm": 1.4551441669464111, + "learning_rate": 2.1462558836114678e-05, + "loss": 0.2331, + "step": 27752 + }, + { + "epoch": 
35.62644415917843, + "grad_norm": 3.6470937728881836, + "learning_rate": 2.1462130937098846e-05, + "loss": 0.2297, + "step": 27753 + }, + { + "epoch": 35.62772785622593, + "grad_norm": 1.0820881128311157, + "learning_rate": 2.146170303808301e-05, + "loss": 0.2171, + "step": 27754 + }, + { + "epoch": 35.62901155327343, + "grad_norm": 1.5824939012527466, + "learning_rate": 2.1461275139067183e-05, + "loss": 0.2438, + "step": 27755 + }, + { + "epoch": 35.63029525032093, + "grad_norm": 2.1866002082824707, + "learning_rate": 2.1460847240051348e-05, + "loss": 0.2284, + "step": 27756 + }, + { + "epoch": 35.63157894736842, + "grad_norm": 1.535534143447876, + "learning_rate": 2.1460419341035516e-05, + "loss": 0.2656, + "step": 27757 + }, + { + "epoch": 35.632862644415916, + "grad_norm": 1.1101983785629272, + "learning_rate": 2.1459991442019685e-05, + "loss": 0.238, + "step": 27758 + }, + { + "epoch": 35.63414634146341, + "grad_norm": 1.1160744428634644, + "learning_rate": 2.145956354300385e-05, + "loss": 0.2266, + "step": 27759 + }, + { + "epoch": 35.63543003851091, + "grad_norm": 2.083157539367676, + "learning_rate": 2.145913564398802e-05, + "loss": 0.2171, + "step": 27760 + }, + { + "epoch": 35.63671373555841, + "grad_norm": 1.584236741065979, + "learning_rate": 2.1458707744972186e-05, + "loss": 0.2755, + "step": 27761 + }, + { + "epoch": 35.63799743260591, + "grad_norm": 3.2834091186523438, + "learning_rate": 2.1458279845956355e-05, + "loss": 0.2447, + "step": 27762 + }, + { + "epoch": 35.639281129653405, + "grad_norm": 4.315969944000244, + "learning_rate": 2.1457851946940523e-05, + "loss": 0.3085, + "step": 27763 + }, + { + "epoch": 35.640564826700896, + "grad_norm": 2.303914785385132, + "learning_rate": 2.145742404792469e-05, + "loss": 0.3051, + "step": 27764 + }, + { + "epoch": 35.64184852374839, + "grad_norm": 2.4277503490448, + "learning_rate": 2.145699614890886e-05, + "loss": 0.356, + "step": 27765 + }, + { + "epoch": 35.64313222079589, + "grad_norm": 
0.9937937259674072, + "learning_rate": 2.1456568249893025e-05, + "loss": 0.237, + "step": 27766 + }, + { + "epoch": 35.64441591784339, + "grad_norm": 0.6478154063224792, + "learning_rate": 2.1456140350877194e-05, + "loss": 0.2381, + "step": 27767 + }, + { + "epoch": 35.64569961489089, + "grad_norm": 1.5219248533248901, + "learning_rate": 2.1455712451861362e-05, + "loss": 0.2338, + "step": 27768 + }, + { + "epoch": 35.646983311938385, + "grad_norm": 1.0247161388397217, + "learning_rate": 2.145528455284553e-05, + "loss": 0.2548, + "step": 27769 + }, + { + "epoch": 35.64826700898588, + "grad_norm": 0.7745807766914368, + "learning_rate": 2.1454856653829695e-05, + "loss": 0.2404, + "step": 27770 + }, + { + "epoch": 35.649550706033374, + "grad_norm": 1.0155256986618042, + "learning_rate": 2.1454428754813864e-05, + "loss": 0.2312, + "step": 27771 + }, + { + "epoch": 35.65083440308087, + "grad_norm": 0.7271119356155396, + "learning_rate": 2.1454000855798032e-05, + "loss": 0.2271, + "step": 27772 + }, + { + "epoch": 35.65211810012837, + "grad_norm": 0.7368013858795166, + "learning_rate": 2.14535729567822e-05, + "loss": 0.2301, + "step": 27773 + }, + { + "epoch": 35.65340179717587, + "grad_norm": 1.8542029857635498, + "learning_rate": 2.145314505776637e-05, + "loss": 0.2472, + "step": 27774 + }, + { + "epoch": 35.654685494223365, + "grad_norm": 0.7490796446800232, + "learning_rate": 2.1452717158750534e-05, + "loss": 0.2395, + "step": 27775 + }, + { + "epoch": 35.65596919127086, + "grad_norm": 1.2196109294891357, + "learning_rate": 2.1452289259734706e-05, + "loss": 0.2331, + "step": 27776 + }, + { + "epoch": 35.657252888318354, + "grad_norm": 1.1463919878005981, + "learning_rate": 2.145186136071887e-05, + "loss": 0.2076, + "step": 27777 + }, + { + "epoch": 35.65853658536585, + "grad_norm": 1.1616162061691284, + "learning_rate": 2.1451433461703036e-05, + "loss": 0.2369, + "step": 27778 + }, + { + "epoch": 35.65982028241335, + "grad_norm": 0.7959775328636169, + "learning_rate": 
2.1451005562687208e-05, + "loss": 0.2282, + "step": 27779 + }, + { + "epoch": 35.66110397946085, + "grad_norm": 1.8512046337127686, + "learning_rate": 2.1450577663671373e-05, + "loss": 0.2249, + "step": 27780 + }, + { + "epoch": 35.662387676508345, + "grad_norm": 1.2141586542129517, + "learning_rate": 2.1450149764655545e-05, + "loss": 0.2702, + "step": 27781 + }, + { + "epoch": 35.66367137355584, + "grad_norm": 0.8019582033157349, + "learning_rate": 2.144972186563971e-05, + "loss": 0.2158, + "step": 27782 + }, + { + "epoch": 35.66495507060334, + "grad_norm": 0.7656897902488708, + "learning_rate": 2.1449293966623878e-05, + "loss": 0.2184, + "step": 27783 + }, + { + "epoch": 35.66623876765083, + "grad_norm": 0.7015665769577026, + "learning_rate": 2.1448866067608046e-05, + "loss": 0.2157, + "step": 27784 + }, + { + "epoch": 35.66752246469833, + "grad_norm": 1.4925228357315063, + "learning_rate": 2.144843816859221e-05, + "loss": 0.2228, + "step": 27785 + }, + { + "epoch": 35.66880616174583, + "grad_norm": 0.961584210395813, + "learning_rate": 2.144801026957638e-05, + "loss": 0.2366, + "step": 27786 + }, + { + "epoch": 35.670089858793325, + "grad_norm": 0.916471004486084, + "learning_rate": 2.1447582370560548e-05, + "loss": 0.2384, + "step": 27787 + }, + { + "epoch": 35.67137355584082, + "grad_norm": 1.1337517499923706, + "learning_rate": 2.1447154471544717e-05, + "loss": 0.195, + "step": 27788 + }, + { + "epoch": 35.67265725288832, + "grad_norm": 0.7925015091896057, + "learning_rate": 2.1446726572528885e-05, + "loss": 0.2088, + "step": 27789 + }, + { + "epoch": 35.67394094993581, + "grad_norm": 1.7496081590652466, + "learning_rate": 2.144629867351305e-05, + "loss": 0.2295, + "step": 27790 + }, + { + "epoch": 35.67522464698331, + "grad_norm": 0.9461548328399658, + "learning_rate": 2.144587077449722e-05, + "loss": 0.1968, + "step": 27791 + }, + { + "epoch": 35.67650834403081, + "grad_norm": 0.9478204846382141, + "learning_rate": 2.1445442875481387e-05, + "loss": 0.2184, 
+ "step": 27792 + }, + { + "epoch": 35.677792041078305, + "grad_norm": 0.8734074831008911, + "learning_rate": 2.1445014976465555e-05, + "loss": 0.2209, + "step": 27793 + }, + { + "epoch": 35.6790757381258, + "grad_norm": 0.9448955655097961, + "learning_rate": 2.144458707744972e-05, + "loss": 0.2293, + "step": 27794 + }, + { + "epoch": 35.6803594351733, + "grad_norm": 0.9977453947067261, + "learning_rate": 2.1444159178433892e-05, + "loss": 0.2265, + "step": 27795 + }, + { + "epoch": 35.6816431322208, + "grad_norm": 1.158745527267456, + "learning_rate": 2.1443731279418057e-05, + "loss": 0.2296, + "step": 27796 + }, + { + "epoch": 35.68292682926829, + "grad_norm": 0.9894996881484985, + "learning_rate": 2.1443303380402226e-05, + "loss": 0.219, + "step": 27797 + }, + { + "epoch": 35.68421052631579, + "grad_norm": 1.386974811553955, + "learning_rate": 2.1442875481386394e-05, + "loss": 0.2261, + "step": 27798 + }, + { + "epoch": 35.685494223363285, + "grad_norm": 1.7059710025787354, + "learning_rate": 2.144244758237056e-05, + "loss": 0.2393, + "step": 27799 + }, + { + "epoch": 35.68677792041078, + "grad_norm": 0.9798130393028259, + "learning_rate": 2.144201968335473e-05, + "loss": 0.2084, + "step": 27800 + }, + { + "epoch": 35.68806161745828, + "grad_norm": 0.8137903213500977, + "learning_rate": 2.1441591784338896e-05, + "loss": 0.2352, + "step": 27801 + }, + { + "epoch": 35.68934531450578, + "grad_norm": 1.966795802116394, + "learning_rate": 2.1441163885323064e-05, + "loss": 0.1938, + "step": 27802 + }, + { + "epoch": 35.69062901155327, + "grad_norm": 0.8860659599304199, + "learning_rate": 2.1440735986307233e-05, + "loss": 0.2476, + "step": 27803 + }, + { + "epoch": 35.69191270860077, + "grad_norm": 1.1265114545822144, + "learning_rate": 2.1440308087291398e-05, + "loss": 0.222, + "step": 27804 + }, + { + "epoch": 35.693196405648266, + "grad_norm": 1.270860195159912, + "learning_rate": 2.143988018827557e-05, + "loss": 0.203, + "step": 27805 + }, + { + "epoch": 
35.69448010269576, + "grad_norm": 4.74609899520874, + "learning_rate": 2.1439452289259735e-05, + "loss": 0.2574, + "step": 27806 + }, + { + "epoch": 35.69576379974326, + "grad_norm": 1.2225236892700195, + "learning_rate": 2.1439024390243903e-05, + "loss": 0.2519, + "step": 27807 + }, + { + "epoch": 35.69704749679076, + "grad_norm": 1.131652593612671, + "learning_rate": 2.143859649122807e-05, + "loss": 0.1904, + "step": 27808 + }, + { + "epoch": 35.69833119383826, + "grad_norm": 1.4132728576660156, + "learning_rate": 2.143816859221224e-05, + "loss": 0.2217, + "step": 27809 + }, + { + "epoch": 35.69961489088575, + "grad_norm": 3.3223469257354736, + "learning_rate": 2.1437740693196405e-05, + "loss": 0.2708, + "step": 27810 + }, + { + "epoch": 35.700898587933246, + "grad_norm": 1.1607578992843628, + "learning_rate": 2.1437312794180573e-05, + "loss": 0.2198, + "step": 27811 + }, + { + "epoch": 35.70218228498074, + "grad_norm": 1.731935739517212, + "learning_rate": 2.143688489516474e-05, + "loss": 0.243, + "step": 27812 + }, + { + "epoch": 35.70346598202824, + "grad_norm": 1.428223967552185, + "learning_rate": 2.143645699614891e-05, + "loss": 0.2586, + "step": 27813 + }, + { + "epoch": 35.70474967907574, + "grad_norm": 4.546213150024414, + "learning_rate": 2.143602909713308e-05, + "loss": 0.2918, + "step": 27814 + }, + { + "epoch": 35.70603337612324, + "grad_norm": 2.7691400051116943, + "learning_rate": 2.1435601198117243e-05, + "loss": 0.3627, + "step": 27815 + }, + { + "epoch": 35.707317073170735, + "grad_norm": 0.7979235649108887, + "learning_rate": 2.1435173299101415e-05, + "loss": 0.2423, + "step": 27816 + }, + { + "epoch": 35.708600770218226, + "grad_norm": 1.321698546409607, + "learning_rate": 2.143474540008558e-05, + "loss": 0.2572, + "step": 27817 + }, + { + "epoch": 35.709884467265724, + "grad_norm": 0.7449456453323364, + "learning_rate": 2.1434317501069745e-05, + "loss": 0.2214, + "step": 27818 + }, + { + "epoch": 35.71116816431322, + "grad_norm": 
0.8854155540466309, + "learning_rate": 2.1433889602053917e-05, + "loss": 0.2541, + "step": 27819 + }, + { + "epoch": 35.71245186136072, + "grad_norm": 0.7805168628692627, + "learning_rate": 2.1433461703038082e-05, + "loss": 0.248, + "step": 27820 + }, + { + "epoch": 35.71373555840822, + "grad_norm": 0.9718471169471741, + "learning_rate": 2.1433033804022254e-05, + "loss": 0.2294, + "step": 27821 + }, + { + "epoch": 35.715019255455715, + "grad_norm": 1.8489333391189575, + "learning_rate": 2.143260590500642e-05, + "loss": 0.2262, + "step": 27822 + }, + { + "epoch": 35.716302952503206, + "grad_norm": 0.8740512132644653, + "learning_rate": 2.1432178005990587e-05, + "loss": 0.2425, + "step": 27823 + }, + { + "epoch": 35.717586649550704, + "grad_norm": 0.959170401096344, + "learning_rate": 2.1431750106974756e-05, + "loss": 0.2204, + "step": 27824 + }, + { + "epoch": 35.7188703465982, + "grad_norm": 1.0560556650161743, + "learning_rate": 2.143132220795892e-05, + "loss": 0.2235, + "step": 27825 + }, + { + "epoch": 35.7201540436457, + "grad_norm": 1.2496769428253174, + "learning_rate": 2.143089430894309e-05, + "loss": 0.2275, + "step": 27826 + }, + { + "epoch": 35.7214377406932, + "grad_norm": 0.6250063180923462, + "learning_rate": 2.1430466409927258e-05, + "loss": 0.2211, + "step": 27827 + }, + { + "epoch": 35.722721437740695, + "grad_norm": 1.9648799896240234, + "learning_rate": 2.1430038510911426e-05, + "loss": 0.2379, + "step": 27828 + }, + { + "epoch": 35.72400513478819, + "grad_norm": 0.7554373741149902, + "learning_rate": 2.1429610611895594e-05, + "loss": 0.2423, + "step": 27829 + }, + { + "epoch": 35.725288831835684, + "grad_norm": 1.8052146434783936, + "learning_rate": 2.1429182712879763e-05, + "loss": 0.2495, + "step": 27830 + }, + { + "epoch": 35.72657252888318, + "grad_norm": 5.617584705352783, + "learning_rate": 2.1428754813863928e-05, + "loss": 0.2248, + "step": 27831 + }, + { + "epoch": 35.72785622593068, + "grad_norm": 2.972160816192627, + "learning_rate": 
2.1428326914848096e-05, + "loss": 0.2466, + "step": 27832 + }, + { + "epoch": 35.72913992297818, + "grad_norm": 2.1333799362182617, + "learning_rate": 2.1427899015832265e-05, + "loss": 0.2338, + "step": 27833 + }, + { + "epoch": 35.730423620025675, + "grad_norm": 0.6702451109886169, + "learning_rate": 2.142747111681643e-05, + "loss": 0.1936, + "step": 27834 + }, + { + "epoch": 35.73170731707317, + "grad_norm": 0.8377804160118103, + "learning_rate": 2.14270432178006e-05, + "loss": 0.1948, + "step": 27835 + }, + { + "epoch": 35.73299101412067, + "grad_norm": 0.7697460055351257, + "learning_rate": 2.1426615318784767e-05, + "loss": 0.2238, + "step": 27836 + }, + { + "epoch": 35.73427471116816, + "grad_norm": 1.1778767108917236, + "learning_rate": 2.142618741976894e-05, + "loss": 0.2318, + "step": 27837 + }, + { + "epoch": 35.73555840821566, + "grad_norm": 1.4099971055984497, + "learning_rate": 2.1425759520753103e-05, + "loss": 0.2264, + "step": 27838 + }, + { + "epoch": 35.73684210526316, + "grad_norm": 2.3987679481506348, + "learning_rate": 2.142533162173727e-05, + "loss": 0.215, + "step": 27839 + }, + { + "epoch": 35.738125802310655, + "grad_norm": 1.7317461967468262, + "learning_rate": 2.142490372272144e-05, + "loss": 0.2134, + "step": 27840 + }, + { + "epoch": 35.73940949935815, + "grad_norm": 1.3306493759155273, + "learning_rate": 2.1424475823705605e-05, + "loss": 0.2267, + "step": 27841 + }, + { + "epoch": 35.74069319640565, + "grad_norm": 1.014530897140503, + "learning_rate": 2.1424047924689774e-05, + "loss": 0.2181, + "step": 27842 + }, + { + "epoch": 35.74197689345314, + "grad_norm": 1.1569037437438965, + "learning_rate": 2.1423620025673942e-05, + "loss": 0.2023, + "step": 27843 + }, + { + "epoch": 35.74326059050064, + "grad_norm": 0.8899145126342773, + "learning_rate": 2.142319212665811e-05, + "loss": 0.2203, + "step": 27844 + }, + { + "epoch": 35.74454428754814, + "grad_norm": 1.2850931882858276, + "learning_rate": 2.142276422764228e-05, + "loss": 0.2439, + 
"step": 27845 + }, + { + "epoch": 35.745827984595635, + "grad_norm": 1.093224048614502, + "learning_rate": 2.1422336328626444e-05, + "loss": 0.2305, + "step": 27846 + }, + { + "epoch": 35.74711168164313, + "grad_norm": 2.177917003631592, + "learning_rate": 2.1421908429610612e-05, + "loss": 0.2177, + "step": 27847 + }, + { + "epoch": 35.74839537869063, + "grad_norm": 4.98413610458374, + "learning_rate": 2.142148053059478e-05, + "loss": 0.204, + "step": 27848 + }, + { + "epoch": 35.74967907573813, + "grad_norm": 0.9043108224868774, + "learning_rate": 2.142105263157895e-05, + "loss": 0.2156, + "step": 27849 + }, + { + "epoch": 35.75096277278562, + "grad_norm": 1.0845973491668701, + "learning_rate": 2.1420624732563114e-05, + "loss": 0.218, + "step": 27850 + }, + { + "epoch": 35.75224646983312, + "grad_norm": 1.6654963493347168, + "learning_rate": 2.1420196833547283e-05, + "loss": 0.2005, + "step": 27851 + }, + { + "epoch": 35.753530166880616, + "grad_norm": 1.214816689491272, + "learning_rate": 2.141976893453145e-05, + "loss": 0.2336, + "step": 27852 + }, + { + "epoch": 35.75481386392811, + "grad_norm": 1.3739421367645264, + "learning_rate": 2.141934103551562e-05, + "loss": 0.2423, + "step": 27853 + }, + { + "epoch": 35.75609756097561, + "grad_norm": 1.1501752138137817, + "learning_rate": 2.1418913136499788e-05, + "loss": 0.2424, + "step": 27854 + }, + { + "epoch": 35.75738125802311, + "grad_norm": 0.9310334920883179, + "learning_rate": 2.1418485237483953e-05, + "loss": 0.2163, + "step": 27855 + }, + { + "epoch": 35.7586649550706, + "grad_norm": 1.3005990982055664, + "learning_rate": 2.1418057338468125e-05, + "loss": 0.2152, + "step": 27856 + }, + { + "epoch": 35.7599486521181, + "grad_norm": 1.4988096952438354, + "learning_rate": 2.141762943945229e-05, + "loss": 0.2088, + "step": 27857 + }, + { + "epoch": 35.761232349165596, + "grad_norm": 1.4413738250732422, + "learning_rate": 2.1417201540436455e-05, + "loss": 0.2391, + "step": 27858 + }, + { + "epoch": 
35.76251604621309, + "grad_norm": 1.3619900941848755, + "learning_rate": 2.1416773641420626e-05, + "loss": 0.2444, + "step": 27859 + }, + { + "epoch": 35.76379974326059, + "grad_norm": 10.262596130371094, + "learning_rate": 2.141634574240479e-05, + "loss": 0.2445, + "step": 27860 + }, + { + "epoch": 35.76508344030809, + "grad_norm": 1.8691744804382324, + "learning_rate": 2.141591784338896e-05, + "loss": 0.2437, + "step": 27861 + }, + { + "epoch": 35.76636713735559, + "grad_norm": 1.318840503692627, + "learning_rate": 2.141548994437313e-05, + "loss": 0.281, + "step": 27862 + }, + { + "epoch": 35.76765083440308, + "grad_norm": 2.9846208095550537, + "learning_rate": 2.1415062045357297e-05, + "loss": 0.3006, + "step": 27863 + }, + { + "epoch": 35.768934531450576, + "grad_norm": 5.502079486846924, + "learning_rate": 2.1414634146341465e-05, + "loss": 0.3321, + "step": 27864 + }, + { + "epoch": 35.770218228498074, + "grad_norm": 3.412691354751587, + "learning_rate": 2.141420624732563e-05, + "loss": 0.3901, + "step": 27865 + }, + { + "epoch": 35.77150192554557, + "grad_norm": 0.6323177814483643, + "learning_rate": 2.14137783483098e-05, + "loss": 0.2397, + "step": 27866 + }, + { + "epoch": 35.77278562259307, + "grad_norm": 0.861529529094696, + "learning_rate": 2.1413350449293967e-05, + "loss": 0.2511, + "step": 27867 + }, + { + "epoch": 35.77406931964057, + "grad_norm": 1.0305705070495605, + "learning_rate": 2.1412922550278135e-05, + "loss": 0.2365, + "step": 27868 + }, + { + "epoch": 35.775353016688065, + "grad_norm": 1.1353182792663574, + "learning_rate": 2.14124946512623e-05, + "loss": 0.2555, + "step": 27869 + }, + { + "epoch": 35.776636713735556, + "grad_norm": 0.8145650029182434, + "learning_rate": 2.1412066752246472e-05, + "loss": 0.2598, + "step": 27870 + }, + { + "epoch": 35.777920410783054, + "grad_norm": 0.8757078647613525, + "learning_rate": 2.1411638853230637e-05, + "loss": 0.2297, + "step": 27871 + }, + { + "epoch": 35.77920410783055, + "grad_norm": 
0.8126514554023743, + "learning_rate": 2.1411210954214806e-05, + "loss": 0.222, + "step": 27872 + }, + { + "epoch": 35.78048780487805, + "grad_norm": 1.0101416110992432, + "learning_rate": 2.1410783055198974e-05, + "loss": 0.2374, + "step": 27873 + }, + { + "epoch": 35.78177150192555, + "grad_norm": 4.234798431396484, + "learning_rate": 2.141035515618314e-05, + "loss": 0.2283, + "step": 27874 + }, + { + "epoch": 35.783055198973045, + "grad_norm": 1.3335012197494507, + "learning_rate": 2.140992725716731e-05, + "loss": 0.2362, + "step": 27875 + }, + { + "epoch": 35.784338896020536, + "grad_norm": 0.8211933970451355, + "learning_rate": 2.1409499358151476e-05, + "loss": 0.2399, + "step": 27876 + }, + { + "epoch": 35.785622593068034, + "grad_norm": 0.8731926083564758, + "learning_rate": 2.1409071459135644e-05, + "loss": 0.2664, + "step": 27877 + }, + { + "epoch": 35.78690629011553, + "grad_norm": 3.3972842693328857, + "learning_rate": 2.1408643560119813e-05, + "loss": 0.2293, + "step": 27878 + }, + { + "epoch": 35.78818998716303, + "grad_norm": 1.0449943542480469, + "learning_rate": 2.1408215661103978e-05, + "loss": 0.2291, + "step": 27879 + }, + { + "epoch": 35.78947368421053, + "grad_norm": 2.4749767780303955, + "learning_rate": 2.140778776208815e-05, + "loss": 0.2152, + "step": 27880 + }, + { + "epoch": 35.790757381258025, + "grad_norm": 1.6296035051345825, + "learning_rate": 2.1407359863072315e-05, + "loss": 0.2246, + "step": 27881 + }, + { + "epoch": 35.79204107830552, + "grad_norm": 1.3653528690338135, + "learning_rate": 2.1406931964056483e-05, + "loss": 0.2329, + "step": 27882 + }, + { + "epoch": 35.793324775353014, + "grad_norm": 0.79472815990448, + "learning_rate": 2.140650406504065e-05, + "loss": 0.2252, + "step": 27883 + }, + { + "epoch": 35.79460847240051, + "grad_norm": 1.1371320486068726, + "learning_rate": 2.140607616602482e-05, + "loss": 0.2111, + "step": 27884 + }, + { + "epoch": 35.79589216944801, + "grad_norm": 1.6774357557296753, + "learning_rate": 
2.1405648267008985e-05, + "loss": 0.219, + "step": 27885 + }, + { + "epoch": 35.79717586649551, + "grad_norm": 2.680447578430176, + "learning_rate": 2.1405220367993153e-05, + "loss": 0.2097, + "step": 27886 + }, + { + "epoch": 35.798459563543005, + "grad_norm": 1.21336829662323, + "learning_rate": 2.140479246897732e-05, + "loss": 0.233, + "step": 27887 + }, + { + "epoch": 35.7997432605905, + "grad_norm": 0.9819955825805664, + "learning_rate": 2.140436456996149e-05, + "loss": 0.218, + "step": 27888 + }, + { + "epoch": 35.801026957637994, + "grad_norm": 2.4952826499938965, + "learning_rate": 2.140393667094566e-05, + "loss": 0.2146, + "step": 27889 + }, + { + "epoch": 35.80231065468549, + "grad_norm": 1.4420533180236816, + "learning_rate": 2.1403508771929824e-05, + "loss": 0.2251, + "step": 27890 + }, + { + "epoch": 35.80359435173299, + "grad_norm": 1.2950341701507568, + "learning_rate": 2.1403080872913995e-05, + "loss": 0.2189, + "step": 27891 + }, + { + "epoch": 35.80487804878049, + "grad_norm": 1.442190170288086, + "learning_rate": 2.140265297389816e-05, + "loss": 0.2165, + "step": 27892 + }, + { + "epoch": 35.806161745827985, + "grad_norm": 1.1371465921401978, + "learning_rate": 2.1402225074882325e-05, + "loss": 0.2176, + "step": 27893 + }, + { + "epoch": 35.80744544287548, + "grad_norm": 1.068698763847351, + "learning_rate": 2.1401797175866497e-05, + "loss": 0.218, + "step": 27894 + }, + { + "epoch": 35.80872913992298, + "grad_norm": 1.82309091091156, + "learning_rate": 2.1401369276850662e-05, + "loss": 0.2391, + "step": 27895 + }, + { + "epoch": 35.81001283697047, + "grad_norm": 1.0270930528640747, + "learning_rate": 2.1400941377834834e-05, + "loss": 0.2346, + "step": 27896 + }, + { + "epoch": 35.81129653401797, + "grad_norm": 1.1895548105239868, + "learning_rate": 2.1400513478819e-05, + "loss": 0.2133, + "step": 27897 + }, + { + "epoch": 35.81258023106547, + "grad_norm": 1.0975300073623657, + "learning_rate": 2.1400085579803167e-05, + "loss": 0.2135, + "step": 
27898 + }, + { + "epoch": 35.813863928112966, + "grad_norm": 1.2638393640518188, + "learning_rate": 2.1399657680787336e-05, + "loss": 0.2249, + "step": 27899 + }, + { + "epoch": 35.81514762516046, + "grad_norm": 1.156381607055664, + "learning_rate": 2.13992297817715e-05, + "loss": 0.215, + "step": 27900 + }, + { + "epoch": 35.81643132220796, + "grad_norm": 1.060963749885559, + "learning_rate": 2.139880188275567e-05, + "loss": 0.2185, + "step": 27901 + }, + { + "epoch": 35.81771501925546, + "grad_norm": 1.149152398109436, + "learning_rate": 2.1398373983739838e-05, + "loss": 0.2194, + "step": 27902 + }, + { + "epoch": 35.81899871630295, + "grad_norm": 1.1056009531021118, + "learning_rate": 2.1397946084724006e-05, + "loss": 0.2389, + "step": 27903 + }, + { + "epoch": 35.82028241335045, + "grad_norm": 2.2171671390533447, + "learning_rate": 2.1397518185708175e-05, + "loss": 0.2343, + "step": 27904 + }, + { + "epoch": 35.821566110397946, + "grad_norm": 1.8826123476028442, + "learning_rate": 2.1397090286692343e-05, + "loss": 0.2086, + "step": 27905 + }, + { + "epoch": 35.822849807445444, + "grad_norm": 5.123171329498291, + "learning_rate": 2.1396662387676508e-05, + "loss": 0.2264, + "step": 27906 + }, + { + "epoch": 35.82413350449294, + "grad_norm": 1.435542106628418, + "learning_rate": 2.1396234488660676e-05, + "loss": 0.256, + "step": 27907 + }, + { + "epoch": 35.82541720154044, + "grad_norm": 1.8239103555679321, + "learning_rate": 2.1395806589644845e-05, + "loss": 0.2188, + "step": 27908 + }, + { + "epoch": 35.82670089858793, + "grad_norm": 3.7337732315063477, + "learning_rate": 2.139537869062901e-05, + "loss": 0.2268, + "step": 27909 + }, + { + "epoch": 35.82798459563543, + "grad_norm": 1.297162652015686, + "learning_rate": 2.139495079161318e-05, + "loss": 0.2447, + "step": 27910 + }, + { + "epoch": 35.829268292682926, + "grad_norm": 1.4870936870574951, + "learning_rate": 2.1394522892597347e-05, + "loss": 0.2493, + "step": 27911 + }, + { + "epoch": 35.830551989730424, 
+ "grad_norm": 2.922628402709961, + "learning_rate": 2.1394094993581515e-05, + "loss": 0.2821, + "step": 27912 + }, + { + "epoch": 35.83183568677792, + "grad_norm": 1.567720890045166, + "learning_rate": 2.1393667094565683e-05, + "loss": 0.2942, + "step": 27913 + }, + { + "epoch": 35.83311938382542, + "grad_norm": 1.719438076019287, + "learning_rate": 2.139323919554985e-05, + "loss": 0.2304, + "step": 27914 + }, + { + "epoch": 35.83440308087292, + "grad_norm": 1.8787904977798462, + "learning_rate": 2.139281129653402e-05, + "loss": 0.3947, + "step": 27915 + }, + { + "epoch": 35.83568677792041, + "grad_norm": 1.1630808115005493, + "learning_rate": 2.1392383397518185e-05, + "loss": 0.2626, + "step": 27916 + }, + { + "epoch": 35.836970474967906, + "grad_norm": 0.8241047859191895, + "learning_rate": 2.1391955498502354e-05, + "loss": 0.2516, + "step": 27917 + }, + { + "epoch": 35.838254172015404, + "grad_norm": 1.1178170442581177, + "learning_rate": 2.1391527599486522e-05, + "loss": 0.2303, + "step": 27918 + }, + { + "epoch": 35.8395378690629, + "grad_norm": 0.7900044918060303, + "learning_rate": 2.1391099700470687e-05, + "loss": 0.2597, + "step": 27919 + }, + { + "epoch": 35.8408215661104, + "grad_norm": 1.6780961751937866, + "learning_rate": 2.139067180145486e-05, + "loss": 0.227, + "step": 27920 + }, + { + "epoch": 35.8421052631579, + "grad_norm": 2.8454864025115967, + "learning_rate": 2.1390243902439024e-05, + "loss": 0.2113, + "step": 27921 + }, + { + "epoch": 35.84338896020539, + "grad_norm": 0.8870652318000793, + "learning_rate": 2.1389816003423192e-05, + "loss": 0.2741, + "step": 27922 + }, + { + "epoch": 35.844672657252886, + "grad_norm": 1.8830513954162598, + "learning_rate": 2.138938810440736e-05, + "loss": 0.2137, + "step": 27923 + }, + { + "epoch": 35.845956354300384, + "grad_norm": 1.0053129196166992, + "learning_rate": 2.138896020539153e-05, + "loss": 0.2266, + "step": 27924 + }, + { + "epoch": 35.84724005134788, + "grad_norm": 0.8961975574493408, + 
"learning_rate": 2.1388532306375694e-05, + "loss": 0.2274, + "step": 27925 + }, + { + "epoch": 35.84852374839538, + "grad_norm": 1.1937849521636963, + "learning_rate": 2.1388104407359863e-05, + "loss": 0.2364, + "step": 27926 + }, + { + "epoch": 35.84980744544288, + "grad_norm": 1.6927999258041382, + "learning_rate": 2.138767650834403e-05, + "loss": 0.2299, + "step": 27927 + }, + { + "epoch": 35.851091142490375, + "grad_norm": 1.0067049264907837, + "learning_rate": 2.13872486093282e-05, + "loss": 0.2185, + "step": 27928 + }, + { + "epoch": 35.852374839537866, + "grad_norm": 1.5108602046966553, + "learning_rate": 2.1386820710312368e-05, + "loss": 0.2188, + "step": 27929 + }, + { + "epoch": 35.853658536585364, + "grad_norm": 0.7730516195297241, + "learning_rate": 2.1386392811296533e-05, + "loss": 0.2213, + "step": 27930 + }, + { + "epoch": 35.85494223363286, + "grad_norm": 1.3894320726394653, + "learning_rate": 2.1385964912280705e-05, + "loss": 0.2137, + "step": 27931 + }, + { + "epoch": 35.85622593068036, + "grad_norm": 1.3237104415893555, + "learning_rate": 2.138553701326487e-05, + "loss": 0.2224, + "step": 27932 + }, + { + "epoch": 35.85750962772786, + "grad_norm": 0.8412258625030518, + "learning_rate": 2.1385109114249035e-05, + "loss": 0.2271, + "step": 27933 + }, + { + "epoch": 35.858793324775355, + "grad_norm": 1.7995951175689697, + "learning_rate": 2.1384681215233207e-05, + "loss": 0.21, + "step": 27934 + }, + { + "epoch": 35.86007702182285, + "grad_norm": 4.211091995239258, + "learning_rate": 2.138425331621737e-05, + "loss": 0.2403, + "step": 27935 + }, + { + "epoch": 35.861360718870344, + "grad_norm": 1.5415927171707153, + "learning_rate": 2.1383825417201543e-05, + "loss": 0.2122, + "step": 27936 + }, + { + "epoch": 35.86264441591784, + "grad_norm": 0.9347494840621948, + "learning_rate": 2.138339751818571e-05, + "loss": 0.2168, + "step": 27937 + }, + { + "epoch": 35.86392811296534, + "grad_norm": 0.8259084820747375, + "learning_rate": 2.1382969619169877e-05, 
+ "loss": 0.2448, + "step": 27938 + }, + { + "epoch": 35.86521181001284, + "grad_norm": 1.8143088817596436, + "learning_rate": 2.1382541720154045e-05, + "loss": 0.1902, + "step": 27939 + }, + { + "epoch": 35.866495507060336, + "grad_norm": 0.8714454770088196, + "learning_rate": 2.138211382113821e-05, + "loss": 0.2151, + "step": 27940 + }, + { + "epoch": 35.86777920410783, + "grad_norm": 1.0373154878616333, + "learning_rate": 2.138168592212238e-05, + "loss": 0.2143, + "step": 27941 + }, + { + "epoch": 35.869062901155324, + "grad_norm": 0.8004540205001831, + "learning_rate": 2.1381258023106547e-05, + "loss": 0.2431, + "step": 27942 + }, + { + "epoch": 35.87034659820282, + "grad_norm": 1.0972601175308228, + "learning_rate": 2.1380830124090715e-05, + "loss": 0.1897, + "step": 27943 + }, + { + "epoch": 35.87163029525032, + "grad_norm": 0.8432507514953613, + "learning_rate": 2.1380402225074884e-05, + "loss": 0.2312, + "step": 27944 + }, + { + "epoch": 35.87291399229782, + "grad_norm": 1.3265219926834106, + "learning_rate": 2.1379974326059052e-05, + "loss": 0.234, + "step": 27945 + }, + { + "epoch": 35.874197689345316, + "grad_norm": 1.2012856006622314, + "learning_rate": 2.1379546427043217e-05, + "loss": 0.2203, + "step": 27946 + }, + { + "epoch": 35.87548138639281, + "grad_norm": 1.0362894535064697, + "learning_rate": 2.1379118528027386e-05, + "loss": 0.247, + "step": 27947 + }, + { + "epoch": 35.87676508344031, + "grad_norm": 1.056922435760498, + "learning_rate": 2.1378690629011554e-05, + "loss": 0.2322, + "step": 27948 + }, + { + "epoch": 35.8780487804878, + "grad_norm": 1.9104372262954712, + "learning_rate": 2.137826272999572e-05, + "loss": 0.2288, + "step": 27949 + }, + { + "epoch": 35.8793324775353, + "grad_norm": 1.2866945266723633, + "learning_rate": 2.137783483097989e-05, + "loss": 0.2222, + "step": 27950 + }, + { + "epoch": 35.8806161745828, + "grad_norm": 1.1824878454208374, + "learning_rate": 2.1377406931964056e-05, + "loss": 0.207, + "step": 27951 + }, + { + 
"epoch": 35.881899871630296, + "grad_norm": 1.06815505027771, + "learning_rate": 2.1376979032948228e-05, + "loss": 0.2393, + "step": 27952 + }, + { + "epoch": 35.883183568677794, + "grad_norm": 1.3910990953445435, + "learning_rate": 2.1376551133932393e-05, + "loss": 0.2306, + "step": 27953 + }, + { + "epoch": 35.88446726572529, + "grad_norm": 4.315130233764648, + "learning_rate": 2.1376123234916558e-05, + "loss": 0.2604, + "step": 27954 + }, + { + "epoch": 35.88575096277278, + "grad_norm": 1.9027869701385498, + "learning_rate": 2.137569533590073e-05, + "loss": 0.2301, + "step": 27955 + }, + { + "epoch": 35.88703465982028, + "grad_norm": 1.0706099271774292, + "learning_rate": 2.1375267436884895e-05, + "loss": 0.2185, + "step": 27956 + }, + { + "epoch": 35.88831835686778, + "grad_norm": 1.0039454698562622, + "learning_rate": 2.1374839537869063e-05, + "loss": 0.2411, + "step": 27957 + }, + { + "epoch": 35.889602053915276, + "grad_norm": 6.726043224334717, + "learning_rate": 2.137441163885323e-05, + "loss": 0.2334, + "step": 27958 + }, + { + "epoch": 35.890885750962774, + "grad_norm": 1.4601030349731445, + "learning_rate": 2.13739837398374e-05, + "loss": 0.2249, + "step": 27959 + }, + { + "epoch": 35.89216944801027, + "grad_norm": 1.6810286045074463, + "learning_rate": 2.1373555840821568e-05, + "loss": 0.222, + "step": 27960 + }, + { + "epoch": 35.89345314505777, + "grad_norm": 1.536829948425293, + "learning_rate": 2.1373127941805733e-05, + "loss": 0.2639, + "step": 27961 + }, + { + "epoch": 35.89473684210526, + "grad_norm": 2.251595973968506, + "learning_rate": 2.1372700042789902e-05, + "loss": 0.2493, + "step": 27962 + }, + { + "epoch": 35.89602053915276, + "grad_norm": 1.6842657327651978, + "learning_rate": 2.137227214377407e-05, + "loss": 0.2676, + "step": 27963 + }, + { + "epoch": 35.897304236200256, + "grad_norm": 2.6750071048736572, + "learning_rate": 2.137184424475824e-05, + "loss": 0.3235, + "step": 27964 + }, + { + "epoch": 35.898587933247754, + "grad_norm": 
1.683878779411316, + "learning_rate": 2.1371416345742404e-05, + "loss": 0.4007, + "step": 27965 + }, + { + "epoch": 35.89987163029525, + "grad_norm": 0.8743242621421814, + "learning_rate": 2.1370988446726575e-05, + "loss": 0.2389, + "step": 27966 + }, + { + "epoch": 35.90115532734275, + "grad_norm": 0.9954112768173218, + "learning_rate": 2.137056054771074e-05, + "loss": 0.2546, + "step": 27967 + }, + { + "epoch": 35.90243902439025, + "grad_norm": 0.9747099280357361, + "learning_rate": 2.137013264869491e-05, + "loss": 0.2363, + "step": 27968 + }, + { + "epoch": 35.90372272143774, + "grad_norm": 0.90157550573349, + "learning_rate": 2.1369704749679077e-05, + "loss": 0.2422, + "step": 27969 + }, + { + "epoch": 35.905006418485236, + "grad_norm": 6.305911540985107, + "learning_rate": 2.1369276850663242e-05, + "loss": 0.2348, + "step": 27970 + }, + { + "epoch": 35.906290115532734, + "grad_norm": 2.242292642593384, + "learning_rate": 2.1368848951647414e-05, + "loss": 0.2323, + "step": 27971 + }, + { + "epoch": 35.90757381258023, + "grad_norm": 1.2507244348526, + "learning_rate": 2.136842105263158e-05, + "loss": 0.2475, + "step": 27972 + }, + { + "epoch": 35.90885750962773, + "grad_norm": 0.93328857421875, + "learning_rate": 2.1367993153615747e-05, + "loss": 0.2447, + "step": 27973 + }, + { + "epoch": 35.91014120667523, + "grad_norm": 1.0819395780563354, + "learning_rate": 2.1367565254599916e-05, + "loss": 0.2194, + "step": 27974 + }, + { + "epoch": 35.91142490372272, + "grad_norm": 0.9460144639015198, + "learning_rate": 2.136713735558408e-05, + "loss": 0.2674, + "step": 27975 + }, + { + "epoch": 35.912708600770216, + "grad_norm": 0.9351019263267517, + "learning_rate": 2.1366709456568253e-05, + "loss": 0.2848, + "step": 27976 + }, + { + "epoch": 35.913992297817714, + "grad_norm": 1.4335883855819702, + "learning_rate": 2.1366281557552418e-05, + "loss": 0.2473, + "step": 27977 + }, + { + "epoch": 35.91527599486521, + "grad_norm": 1.1348087787628174, + "learning_rate": 
2.1365853658536586e-05, + "loss": 0.2282, + "step": 27978 + }, + { + "epoch": 35.91655969191271, + "grad_norm": 1.201847791671753, + "learning_rate": 2.1365425759520755e-05, + "loss": 0.2444, + "step": 27979 + }, + { + "epoch": 35.91784338896021, + "grad_norm": 3.1663856506347656, + "learning_rate": 2.136499786050492e-05, + "loss": 0.2208, + "step": 27980 + }, + { + "epoch": 35.919127086007705, + "grad_norm": 0.8232195377349854, + "learning_rate": 2.1364569961489088e-05, + "loss": 0.2168, + "step": 27981 + }, + { + "epoch": 35.920410783055196, + "grad_norm": 0.7182499170303345, + "learning_rate": 2.1364142062473256e-05, + "loss": 0.2308, + "step": 27982 + }, + { + "epoch": 35.921694480102694, + "grad_norm": 0.7984178066253662, + "learning_rate": 2.1363714163457425e-05, + "loss": 0.2121, + "step": 27983 + }, + { + "epoch": 35.92297817715019, + "grad_norm": 1.310491919517517, + "learning_rate": 2.1363286264441593e-05, + "loss": 0.2177, + "step": 27984 + }, + { + "epoch": 35.92426187419769, + "grad_norm": 0.8778331279754639, + "learning_rate": 2.136285836542576e-05, + "loss": 0.2451, + "step": 27985 + }, + { + "epoch": 35.92554557124519, + "grad_norm": 0.8740270137786865, + "learning_rate": 2.1362430466409927e-05, + "loss": 0.2295, + "step": 27986 + }, + { + "epoch": 35.926829268292686, + "grad_norm": 1.34989595413208, + "learning_rate": 2.1362002567394095e-05, + "loss": 0.2325, + "step": 27987 + }, + { + "epoch": 35.928112965340176, + "grad_norm": 1.4752424955368042, + "learning_rate": 2.1361574668378263e-05, + "loss": 0.217, + "step": 27988 + }, + { + "epoch": 35.929396662387674, + "grad_norm": 0.9951563477516174, + "learning_rate": 2.136114676936243e-05, + "loss": 0.2208, + "step": 27989 + }, + { + "epoch": 35.93068035943517, + "grad_norm": 0.9356452822685242, + "learning_rate": 2.13607188703466e-05, + "loss": 0.2292, + "step": 27990 + }, + { + "epoch": 35.93196405648267, + "grad_norm": 1.970565915107727, + "learning_rate": 2.1360290971330765e-05, + "loss": 0.2293, 
+ "step": 27991 + }, + { + "epoch": 35.93324775353017, + "grad_norm": 1.2127436399459839, + "learning_rate": 2.1359863072314937e-05, + "loss": 0.2204, + "step": 27992 + }, + { + "epoch": 35.934531450577666, + "grad_norm": 1.1107770204544067, + "learning_rate": 2.1359435173299102e-05, + "loss": 0.2142, + "step": 27993 + }, + { + "epoch": 35.93581514762516, + "grad_norm": 0.991125226020813, + "learning_rate": 2.1359007274283267e-05, + "loss": 0.217, + "step": 27994 + }, + { + "epoch": 35.937098844672654, + "grad_norm": 1.9005560874938965, + "learning_rate": 2.135857937526744e-05, + "loss": 0.2553, + "step": 27995 + }, + { + "epoch": 35.93838254172015, + "grad_norm": 0.8322476744651794, + "learning_rate": 2.1358151476251604e-05, + "loss": 0.2429, + "step": 27996 + }, + { + "epoch": 35.93966623876765, + "grad_norm": 2.7553718090057373, + "learning_rate": 2.1357723577235772e-05, + "loss": 0.2255, + "step": 27997 + }, + { + "epoch": 35.94094993581515, + "grad_norm": 1.2652349472045898, + "learning_rate": 2.135729567821994e-05, + "loss": 0.2359, + "step": 27998 + }, + { + "epoch": 35.942233632862646, + "grad_norm": 1.1702253818511963, + "learning_rate": 2.135686777920411e-05, + "loss": 0.2289, + "step": 27999 + }, + { + "epoch": 35.943517329910144, + "grad_norm": 1.899221658706665, + "learning_rate": 2.1356439880188278e-05, + "loss": 0.2091, + "step": 28000 + }, + { + "epoch": 35.943517329910144, + "eval_cer": 0.26147934207949874, + "eval_loss": 0.47819334268569946, + "eval_runtime": 14.2379, + "eval_samples_per_second": 69.041, + "eval_steps_per_second": 0.492, + "eval_wer": 0.4383168997936253, + "step": 28000 + }, + { + "epoch": 35.94480102695764, + "grad_norm": 2.067101001739502, + "learning_rate": 2.1356011981172443e-05, + "loss": 0.1998, + "step": 28001 + }, + { + "epoch": 35.94608472400513, + "grad_norm": 2.147486448287964, + "learning_rate": 2.135558408215661e-05, + "loss": 0.1968, + "step": 28002 + }, + { + "epoch": 35.94736842105263, + "grad_norm": 
1.8352488279342651, + "learning_rate": 2.135515618314078e-05, + "loss": 0.2408, + "step": 28003 + }, + { + "epoch": 35.94865211810013, + "grad_norm": 1.7755533456802368, + "learning_rate": 2.1354728284124948e-05, + "loss": 0.2447, + "step": 28004 + }, + { + "epoch": 35.949935815147626, + "grad_norm": 1.3900420665740967, + "learning_rate": 2.1354300385109113e-05, + "loss": 0.2325, + "step": 28005 + }, + { + "epoch": 35.951219512195124, + "grad_norm": 4.182671546936035, + "learning_rate": 2.1353872486093285e-05, + "loss": 0.2398, + "step": 28006 + }, + { + "epoch": 35.95250320924262, + "grad_norm": 1.5734343528747559, + "learning_rate": 2.135344458707745e-05, + "loss": 0.2703, + "step": 28007 + }, + { + "epoch": 35.95378690629011, + "grad_norm": 1.021859049797058, + "learning_rate": 2.1353016688061618e-05, + "loss": 0.2106, + "step": 28008 + }, + { + "epoch": 35.95507060333761, + "grad_norm": 1.3119826316833496, + "learning_rate": 2.1352588789045787e-05, + "loss": 0.2259, + "step": 28009 + }, + { + "epoch": 35.95635430038511, + "grad_norm": 1.4306347370147705, + "learning_rate": 2.135216089002995e-05, + "loss": 0.2531, + "step": 28010 + }, + { + "epoch": 35.957637997432606, + "grad_norm": 1.4687583446502686, + "learning_rate": 2.1351732991014123e-05, + "loss": 0.2573, + "step": 28011 + }, + { + "epoch": 35.958921694480104, + "grad_norm": 4.004704475402832, + "learning_rate": 2.135130509199829e-05, + "loss": 0.2467, + "step": 28012 + }, + { + "epoch": 35.9602053915276, + "grad_norm": 2.148235321044922, + "learning_rate": 2.1350877192982457e-05, + "loss": 0.2629, + "step": 28013 + }, + { + "epoch": 35.9614890885751, + "grad_norm": 2.4956507682800293, + "learning_rate": 2.1350449293966625e-05, + "loss": 0.2775, + "step": 28014 + }, + { + "epoch": 35.96277278562259, + "grad_norm": 22.17893409729004, + "learning_rate": 2.135002139495079e-05, + "loss": 0.383, + "step": 28015 + }, + { + "epoch": 35.96405648267009, + "grad_norm": 0.9569814801216125, + "learning_rate": 
2.1349593495934962e-05, + "loss": 0.2405, + "step": 28016 + }, + { + "epoch": 35.965340179717586, + "grad_norm": 1.4466253519058228, + "learning_rate": 2.1349165596919127e-05, + "loss": 0.2361, + "step": 28017 + }, + { + "epoch": 35.966623876765084, + "grad_norm": 1.6384869813919067, + "learning_rate": 2.1348737697903296e-05, + "loss": 0.2464, + "step": 28018 + }, + { + "epoch": 35.96790757381258, + "grad_norm": 1.1278693675994873, + "learning_rate": 2.1348309798887464e-05, + "loss": 0.2452, + "step": 28019 + }, + { + "epoch": 35.96919127086008, + "grad_norm": 0.8733402490615845, + "learning_rate": 2.1347881899871632e-05, + "loss": 0.2415, + "step": 28020 + }, + { + "epoch": 35.97047496790757, + "grad_norm": 1.301222324371338, + "learning_rate": 2.1347454000855797e-05, + "loss": 0.2463, + "step": 28021 + }, + { + "epoch": 35.97175866495507, + "grad_norm": 1.0729153156280518, + "learning_rate": 2.1347026101839966e-05, + "loss": 0.223, + "step": 28022 + }, + { + "epoch": 35.973042362002566, + "grad_norm": 1.12232506275177, + "learning_rate": 2.1346598202824134e-05, + "loss": 0.2465, + "step": 28023 + }, + { + "epoch": 35.974326059050064, + "grad_norm": 0.8279466032981873, + "learning_rate": 2.1346170303808303e-05, + "loss": 0.2422, + "step": 28024 + }, + { + "epoch": 35.97560975609756, + "grad_norm": 1.6930534839630127, + "learning_rate": 2.134574240479247e-05, + "loss": 0.2256, + "step": 28025 + }, + { + "epoch": 35.97689345314506, + "grad_norm": 0.880014955997467, + "learning_rate": 2.1345314505776636e-05, + "loss": 0.2325, + "step": 28026 + }, + { + "epoch": 35.97817715019256, + "grad_norm": 1.0474668741226196, + "learning_rate": 2.1344886606760808e-05, + "loss": 0.2423, + "step": 28027 + }, + { + "epoch": 35.97946084724005, + "grad_norm": 0.983155369758606, + "learning_rate": 2.1344458707744973e-05, + "loss": 0.2245, + "step": 28028 + }, + { + "epoch": 35.980744544287546, + "grad_norm": 0.9419406056404114, + "learning_rate": 2.1344030808729138e-05, + "loss": 
0.1971, + "step": 28029 + }, + { + "epoch": 35.982028241335044, + "grad_norm": 0.8079448342323303, + "learning_rate": 2.134360290971331e-05, + "loss": 0.2225, + "step": 28030 + }, + { + "epoch": 35.98331193838254, + "grad_norm": 0.8405678868293762, + "learning_rate": 2.1343175010697475e-05, + "loss": 0.2048, + "step": 28031 + }, + { + "epoch": 35.98459563543004, + "grad_norm": 1.1530309915542603, + "learning_rate": 2.1342747111681647e-05, + "loss": 0.2089, + "step": 28032 + }, + { + "epoch": 35.98587933247754, + "grad_norm": 3.8462536334991455, + "learning_rate": 2.134231921266581e-05, + "loss": 0.2067, + "step": 28033 + }, + { + "epoch": 35.987163029525036, + "grad_norm": 1.231486201286316, + "learning_rate": 2.134189131364998e-05, + "loss": 0.2056, + "step": 28034 + }, + { + "epoch": 35.988446726572526, + "grad_norm": 1.673322319984436, + "learning_rate": 2.134146341463415e-05, + "loss": 0.2268, + "step": 28035 + }, + { + "epoch": 35.989730423620024, + "grad_norm": 3.039827346801758, + "learning_rate": 2.1341035515618313e-05, + "loss": 0.202, + "step": 28036 + }, + { + "epoch": 35.99101412066752, + "grad_norm": 1.0606954097747803, + "learning_rate": 2.1340607616602482e-05, + "loss": 0.1937, + "step": 28037 + }, + { + "epoch": 35.99229781771502, + "grad_norm": 1.7260591983795166, + "learning_rate": 2.134017971758665e-05, + "loss": 0.2269, + "step": 28038 + }, + { + "epoch": 35.99358151476252, + "grad_norm": 1.118935465812683, + "learning_rate": 2.133975181857082e-05, + "loss": 0.2366, + "step": 28039 + }, + { + "epoch": 35.994865211810016, + "grad_norm": 1.7095075845718384, + "learning_rate": 2.1339323919554987e-05, + "loss": 0.2389, + "step": 28040 + }, + { + "epoch": 35.996148908857506, + "grad_norm": 1.960028052330017, + "learning_rate": 2.1338896020539152e-05, + "loss": 0.2384, + "step": 28041 + }, + { + "epoch": 35.997432605905004, + "grad_norm": 1.0980470180511475, + "learning_rate": 2.133846812152332e-05, + "loss": 0.2535, + "step": 28042 + }, + { + 
"epoch": 35.9987163029525, + "grad_norm": 1.2831542491912842, + "learning_rate": 2.133804022250749e-05, + "loss": 0.2901, + "step": 28043 + }, + { + "epoch": 36.0, + "grad_norm": 2.8856003284454346, + "learning_rate": 2.1337612323491657e-05, + "loss": 0.3595, + "step": 28044 + }, + { + "epoch": 36.0012836970475, + "grad_norm": 1.0554927587509155, + "learning_rate": 2.1337184424475822e-05, + "loss": 0.2587, + "step": 28045 + }, + { + "epoch": 36.002567394094996, + "grad_norm": 1.609710454940796, + "learning_rate": 2.1336756525459994e-05, + "loss": 0.2222, + "step": 28046 + }, + { + "epoch": 36.003851091142494, + "grad_norm": 0.7154846787452698, + "learning_rate": 2.133632862644416e-05, + "loss": 0.2282, + "step": 28047 + }, + { + "epoch": 36.005134788189984, + "grad_norm": 1.2070432901382446, + "learning_rate": 2.1335900727428328e-05, + "loss": 0.2616, + "step": 28048 + }, + { + "epoch": 36.00641848523748, + "grad_norm": 1.1230088472366333, + "learning_rate": 2.1335472828412496e-05, + "loss": 0.2333, + "step": 28049 + }, + { + "epoch": 36.00770218228498, + "grad_norm": 0.8574367761611938, + "learning_rate": 2.133504492939666e-05, + "loss": 0.2256, + "step": 28050 + }, + { + "epoch": 36.00898587933248, + "grad_norm": 1.4018754959106445, + "learning_rate": 2.1334617030380833e-05, + "loss": 0.2172, + "step": 28051 + }, + { + "epoch": 36.010269576379976, + "grad_norm": 1.7797973155975342, + "learning_rate": 2.1334189131364998e-05, + "loss": 0.2128, + "step": 28052 + }, + { + "epoch": 36.011553273427474, + "grad_norm": 0.7182095050811768, + "learning_rate": 2.1333761232349166e-05, + "loss": 0.209, + "step": 28053 + }, + { + "epoch": 36.012836970474964, + "grad_norm": 0.8244314193725586, + "learning_rate": 2.1333333333333335e-05, + "loss": 0.2099, + "step": 28054 + }, + { + "epoch": 36.01412066752246, + "grad_norm": 2.210347890853882, + "learning_rate": 2.13329054343175e-05, + "loss": 0.2085, + "step": 28055 + }, + { + "epoch": 36.01540436456996, + "grad_norm": 
1.1540671586990356, + "learning_rate": 2.133247753530167e-05, + "loss": 0.2508, + "step": 28056 + }, + { + "epoch": 36.01668806161746, + "grad_norm": 2.0695621967315674, + "learning_rate": 2.1332049636285836e-05, + "loss": 0.2076, + "step": 28057 + }, + { + "epoch": 36.017971758664956, + "grad_norm": 0.8949180245399475, + "learning_rate": 2.1331621737270005e-05, + "loss": 0.2145, + "step": 28058 + }, + { + "epoch": 36.019255455712454, + "grad_norm": 1.3327833414077759, + "learning_rate": 2.1331193838254173e-05, + "loss": 0.2, + "step": 28059 + }, + { + "epoch": 36.02053915275995, + "grad_norm": 1.2512515783309937, + "learning_rate": 2.1330765939238342e-05, + "loss": 0.2025, + "step": 28060 + }, + { + "epoch": 36.02182284980744, + "grad_norm": 1.742495059967041, + "learning_rate": 2.1330338040222507e-05, + "loss": 0.2295, + "step": 28061 + }, + { + "epoch": 36.02310654685494, + "grad_norm": 1.3636797666549683, + "learning_rate": 2.1329910141206675e-05, + "loss": 0.203, + "step": 28062 + }, + { + "epoch": 36.02439024390244, + "grad_norm": 1.1126031875610352, + "learning_rate": 2.1329482242190844e-05, + "loss": 0.2026, + "step": 28063 + }, + { + "epoch": 36.025673940949936, + "grad_norm": 2.912403106689453, + "learning_rate": 2.132905434317501e-05, + "loss": 0.2167, + "step": 28064 + }, + { + "epoch": 36.026957637997434, + "grad_norm": 1.3509589433670044, + "learning_rate": 2.132862644415918e-05, + "loss": 0.1911, + "step": 28065 + }, + { + "epoch": 36.02824133504493, + "grad_norm": 0.9710080623626709, + "learning_rate": 2.1328198545143345e-05, + "loss": 0.2225, + "step": 28066 + }, + { + "epoch": 36.02952503209243, + "grad_norm": 0.8638203740119934, + "learning_rate": 2.1327770646127517e-05, + "loss": 0.2113, + "step": 28067 + }, + { + "epoch": 36.03080872913992, + "grad_norm": 0.9913743138313293, + "learning_rate": 2.1327342747111682e-05, + "loss": 0.2032, + "step": 28068 + }, + { + "epoch": 36.03209242618742, + "grad_norm": 1.145994782447815, + "learning_rate": 
2.1326914848095847e-05, + "loss": 0.2266, + "step": 28069 + }, + { + "epoch": 36.033376123234916, + "grad_norm": 1.3297756910324097, + "learning_rate": 2.132648694908002e-05, + "loss": 0.2024, + "step": 28070 + }, + { + "epoch": 36.034659820282414, + "grad_norm": 0.9233511686325073, + "learning_rate": 2.1326059050064184e-05, + "loss": 0.1816, + "step": 28071 + }, + { + "epoch": 36.03594351732991, + "grad_norm": 0.9477116465568542, + "learning_rate": 2.1325631151048352e-05, + "loss": 0.2108, + "step": 28072 + }, + { + "epoch": 36.03722721437741, + "grad_norm": 0.9478037357330322, + "learning_rate": 2.132520325203252e-05, + "loss": 0.1888, + "step": 28073 + }, + { + "epoch": 36.0385109114249, + "grad_norm": 1.322953701019287, + "learning_rate": 2.132477535301669e-05, + "loss": 0.2172, + "step": 28074 + }, + { + "epoch": 36.0397946084724, + "grad_norm": 1.1219247579574585, + "learning_rate": 2.1324347454000858e-05, + "loss": 0.2269, + "step": 28075 + }, + { + "epoch": 36.041078305519896, + "grad_norm": 0.791713297367096, + "learning_rate": 2.1323919554985023e-05, + "loss": 0.1818, + "step": 28076 + }, + { + "epoch": 36.042362002567394, + "grad_norm": 1.2550849914550781, + "learning_rate": 2.132349165596919e-05, + "loss": 0.2015, + "step": 28077 + }, + { + "epoch": 36.04364569961489, + "grad_norm": 1.0181394815444946, + "learning_rate": 2.132306375695336e-05, + "loss": 0.197, + "step": 28078 + }, + { + "epoch": 36.04492939666239, + "grad_norm": 1.3449060916900635, + "learning_rate": 2.1322635857937528e-05, + "loss": 0.2078, + "step": 28079 + }, + { + "epoch": 36.04621309370989, + "grad_norm": 0.8610700368881226, + "learning_rate": 2.1322207958921693e-05, + "loss": 0.1947, + "step": 28080 + }, + { + "epoch": 36.04749679075738, + "grad_norm": 1.1891380548477173, + "learning_rate": 2.1321780059905865e-05, + "loss": 0.2287, + "step": 28081 + }, + { + "epoch": 36.048780487804876, + "grad_norm": 1.0282455682754517, + "learning_rate": 2.132135216089003e-05, + "loss": 0.2223, 
+ "step": 28082 + }, + { + "epoch": 36.050064184852374, + "grad_norm": 1.9124977588653564, + "learning_rate": 2.1320924261874198e-05, + "loss": 0.2105, + "step": 28083 + }, + { + "epoch": 36.05134788189987, + "grad_norm": 3.5174927711486816, + "learning_rate": 2.1320496362858367e-05, + "loss": 0.1896, + "step": 28084 + }, + { + "epoch": 36.05263157894737, + "grad_norm": 1.1020071506500244, + "learning_rate": 2.132006846384253e-05, + "loss": 0.219, + "step": 28085 + }, + { + "epoch": 36.05391527599487, + "grad_norm": 3.5940756797790527, + "learning_rate": 2.1319640564826703e-05, + "loss": 0.2258, + "step": 28086 + }, + { + "epoch": 36.05519897304236, + "grad_norm": 1.2564619779586792, + "learning_rate": 2.131921266581087e-05, + "loss": 0.2462, + "step": 28087 + }, + { + "epoch": 36.056482670089856, + "grad_norm": 2.0961568355560303, + "learning_rate": 2.1318784766795037e-05, + "loss": 0.2294, + "step": 28088 + }, + { + "epoch": 36.057766367137354, + "grad_norm": 1.990761160850525, + "learning_rate": 2.1318356867779205e-05, + "loss": 0.2211, + "step": 28089 + }, + { + "epoch": 36.05905006418485, + "grad_norm": 3.31673526763916, + "learning_rate": 2.131792896876337e-05, + "loss": 0.2282, + "step": 28090 + }, + { + "epoch": 36.06033376123235, + "grad_norm": 1.4996596574783325, + "learning_rate": 2.1317501069747542e-05, + "loss": 0.2643, + "step": 28091 + }, + { + "epoch": 36.06161745827985, + "grad_norm": 1.739857792854309, + "learning_rate": 2.1317073170731707e-05, + "loss": 0.3577, + "step": 28092 + }, + { + "epoch": 36.062901155327346, + "grad_norm": 1.5138062238693237, + "learning_rate": 2.1316645271715876e-05, + "loss": 0.2981, + "step": 28093 + }, + { + "epoch": 36.06418485237484, + "grad_norm": 2.3238303661346436, + "learning_rate": 2.1316217372700044e-05, + "loss": 0.3824, + "step": 28094 + }, + { + "epoch": 36.065468549422334, + "grad_norm": 0.7534722685813904, + "learning_rate": 2.1315789473684212e-05, + "loss": 0.2474, + "step": 28095 + }, + { + "epoch": 
36.06675224646983, + "grad_norm": 0.9640969038009644, + "learning_rate": 2.1315361574668377e-05, + "loss": 0.2237, + "step": 28096 + }, + { + "epoch": 36.06803594351733, + "grad_norm": 1.3019953966140747, + "learning_rate": 2.1314933675652546e-05, + "loss": 0.2287, + "step": 28097 + }, + { + "epoch": 36.06931964056483, + "grad_norm": 0.7119196057319641, + "learning_rate": 2.1314505776636714e-05, + "loss": 0.2241, + "step": 28098 + }, + { + "epoch": 36.070603337612326, + "grad_norm": 0.7900106906890869, + "learning_rate": 2.1314077877620883e-05, + "loss": 0.2375, + "step": 28099 + }, + { + "epoch": 36.071887034659824, + "grad_norm": 0.908337414264679, + "learning_rate": 2.131364997860505e-05, + "loss": 0.227, + "step": 28100 + }, + { + "epoch": 36.073170731707314, + "grad_norm": 1.3025681972503662, + "learning_rate": 2.1313222079589216e-05, + "loss": 0.2389, + "step": 28101 + }, + { + "epoch": 36.07445442875481, + "grad_norm": 1.5002896785736084, + "learning_rate": 2.1312794180573385e-05, + "loss": 0.21, + "step": 28102 + }, + { + "epoch": 36.07573812580231, + "grad_norm": 0.9649004936218262, + "learning_rate": 2.1312366281557553e-05, + "loss": 0.2508, + "step": 28103 + }, + { + "epoch": 36.07702182284981, + "grad_norm": 0.700559139251709, + "learning_rate": 2.1311938382541718e-05, + "loss": 0.2493, + "step": 28104 + }, + { + "epoch": 36.078305519897306, + "grad_norm": 0.74205082654953, + "learning_rate": 2.131151048352589e-05, + "loss": 0.2246, + "step": 28105 + }, + { + "epoch": 36.079589216944804, + "grad_norm": 0.7412319779396057, + "learning_rate": 2.1311082584510055e-05, + "loss": 0.228, + "step": 28106 + }, + { + "epoch": 36.080872913992295, + "grad_norm": 1.455890417098999, + "learning_rate": 2.1310654685494227e-05, + "loss": 0.2085, + "step": 28107 + }, + { + "epoch": 36.08215661103979, + "grad_norm": 1.9471423625946045, + "learning_rate": 2.131022678647839e-05, + "loss": 0.2352, + "step": 28108 + }, + { + "epoch": 36.08344030808729, + "grad_norm": 
1.2652733325958252, + "learning_rate": 2.1309798887462557e-05, + "loss": 0.2242, + "step": 28109 + }, + { + "epoch": 36.08472400513479, + "grad_norm": 0.7510931491851807, + "learning_rate": 2.130937098844673e-05, + "loss": 0.2445, + "step": 28110 + }, + { + "epoch": 36.086007702182286, + "grad_norm": 0.8524129986763, + "learning_rate": 2.1308943089430893e-05, + "loss": 0.2093, + "step": 28111 + }, + { + "epoch": 36.087291399229784, + "grad_norm": 0.8898112177848816, + "learning_rate": 2.1308515190415062e-05, + "loss": 0.2226, + "step": 28112 + }, + { + "epoch": 36.08857509627728, + "grad_norm": 6.2259602546691895, + "learning_rate": 2.130808729139923e-05, + "loss": 0.2055, + "step": 28113 + }, + { + "epoch": 36.08985879332477, + "grad_norm": 1.056425929069519, + "learning_rate": 2.13076593923834e-05, + "loss": 0.2265, + "step": 28114 + }, + { + "epoch": 36.09114249037227, + "grad_norm": 1.3418775796890259, + "learning_rate": 2.1307231493367567e-05, + "loss": 0.2003, + "step": 28115 + }, + { + "epoch": 36.09242618741977, + "grad_norm": 0.8103009462356567, + "learning_rate": 2.1306803594351732e-05, + "loss": 0.2132, + "step": 28116 + }, + { + "epoch": 36.093709884467266, + "grad_norm": 0.8983856439590454, + "learning_rate": 2.13063756953359e-05, + "loss": 0.1913, + "step": 28117 + }, + { + "epoch": 36.094993581514764, + "grad_norm": 0.935771107673645, + "learning_rate": 2.130594779632007e-05, + "loss": 0.1984, + "step": 28118 + }, + { + "epoch": 36.09627727856226, + "grad_norm": 0.9530712962150574, + "learning_rate": 2.1305519897304237e-05, + "loss": 0.2096, + "step": 28119 + }, + { + "epoch": 36.09756097560975, + "grad_norm": 0.86685711145401, + "learning_rate": 2.1305091998288402e-05, + "loss": 0.1988, + "step": 28120 + }, + { + "epoch": 36.09884467265725, + "grad_norm": 0.7434473037719727, + "learning_rate": 2.1304664099272574e-05, + "loss": 0.1772, + "step": 28121 + }, + { + "epoch": 36.10012836970475, + "grad_norm": 1.261565089225769, + "learning_rate": 
2.130423620025674e-05, + "loss": 0.1834, + "step": 28122 + }, + { + "epoch": 36.101412066752246, + "grad_norm": 1.3316192626953125, + "learning_rate": 2.1303808301240908e-05, + "loss": 0.1925, + "step": 28123 + }, + { + "epoch": 36.102695763799744, + "grad_norm": 1.6510390043258667, + "learning_rate": 2.1303380402225076e-05, + "loss": 0.1958, + "step": 28124 + }, + { + "epoch": 36.10397946084724, + "grad_norm": 1.1079580783843994, + "learning_rate": 2.130295250320924e-05, + "loss": 0.1925, + "step": 28125 + }, + { + "epoch": 36.10526315789474, + "grad_norm": 1.9141606092453003, + "learning_rate": 2.1302524604193413e-05, + "loss": 0.2133, + "step": 28126 + }, + { + "epoch": 36.10654685494223, + "grad_norm": 0.951404869556427, + "learning_rate": 2.1302096705177578e-05, + "loss": 0.2118, + "step": 28127 + }, + { + "epoch": 36.10783055198973, + "grad_norm": 0.954811692237854, + "learning_rate": 2.1301668806161746e-05, + "loss": 0.2126, + "step": 28128 + }, + { + "epoch": 36.109114249037226, + "grad_norm": 1.781097173690796, + "learning_rate": 2.1301240907145915e-05, + "loss": 0.1904, + "step": 28129 + }, + { + "epoch": 36.110397946084724, + "grad_norm": 1.8115276098251343, + "learning_rate": 2.130081300813008e-05, + "loss": 0.2118, + "step": 28130 + }, + { + "epoch": 36.11168164313222, + "grad_norm": 1.8889492750167847, + "learning_rate": 2.130038510911425e-05, + "loss": 0.2208, + "step": 28131 + }, + { + "epoch": 36.11296534017972, + "grad_norm": 2.454789161682129, + "learning_rate": 2.1299957210098417e-05, + "loss": 0.2146, + "step": 28132 + }, + { + "epoch": 36.11424903722722, + "grad_norm": 1.2427585124969482, + "learning_rate": 2.1299529311082585e-05, + "loss": 0.2111, + "step": 28133 + }, + { + "epoch": 36.11553273427471, + "grad_norm": 0.9017019271850586, + "learning_rate": 2.1299101412066753e-05, + "loss": 0.1953, + "step": 28134 + }, + { + "epoch": 36.116816431322206, + "grad_norm": 1.3586031198501587, + "learning_rate": 2.1298673513050922e-05, + "loss": 
0.1796, + "step": 28135 + }, + { + "epoch": 36.118100128369704, + "grad_norm": 2.1781809329986572, + "learning_rate": 2.1298245614035087e-05, + "loss": 0.2501, + "step": 28136 + }, + { + "epoch": 36.1193838254172, + "grad_norm": 1.3264796733856201, + "learning_rate": 2.1297817715019255e-05, + "loss": 0.2148, + "step": 28137 + }, + { + "epoch": 36.1206675224647, + "grad_norm": 1.663633942604065, + "learning_rate": 2.1297389816003424e-05, + "loss": 0.2101, + "step": 28138 + }, + { + "epoch": 36.1219512195122, + "grad_norm": 2.3116776943206787, + "learning_rate": 2.1296961916987592e-05, + "loss": 0.2135, + "step": 28139 + }, + { + "epoch": 36.12323491655969, + "grad_norm": 3.1087443828582764, + "learning_rate": 2.129653401797176e-05, + "loss": 0.227, + "step": 28140 + }, + { + "epoch": 36.12451861360719, + "grad_norm": 11.400135040283203, + "learning_rate": 2.1296106118955925e-05, + "loss": 0.2457, + "step": 28141 + }, + { + "epoch": 36.125802310654684, + "grad_norm": 5.416370868682861, + "learning_rate": 2.1295678219940097e-05, + "loss": 0.2986, + "step": 28142 + }, + { + "epoch": 36.12708600770218, + "grad_norm": 3.588717460632324, + "learning_rate": 2.1295250320924262e-05, + "loss": 0.2863, + "step": 28143 + }, + { + "epoch": 36.12836970474968, + "grad_norm": 5.154034614562988, + "learning_rate": 2.1294822421908427e-05, + "loss": 0.4352, + "step": 28144 + }, + { + "epoch": 36.12965340179718, + "grad_norm": 1.0289443731307983, + "learning_rate": 2.12943945228926e-05, + "loss": 0.2545, + "step": 28145 + }, + { + "epoch": 36.130937098844676, + "grad_norm": 0.9608951807022095, + "learning_rate": 2.1293966623876764e-05, + "loss": 0.2185, + "step": 28146 + }, + { + "epoch": 36.13222079589217, + "grad_norm": 0.681541383266449, + "learning_rate": 2.1293538724860936e-05, + "loss": 0.2437, + "step": 28147 + }, + { + "epoch": 36.133504492939664, + "grad_norm": 1.5987290143966675, + "learning_rate": 2.12931108258451e-05, + "loss": 0.2398, + "step": 28148 + }, + { + "epoch": 
36.13478818998716, + "grad_norm": 0.8438628315925598, + "learning_rate": 2.129268292682927e-05, + "loss": 0.2301, + "step": 28149 + }, + { + "epoch": 36.13607188703466, + "grad_norm": 1.180695652961731, + "learning_rate": 2.1292255027813438e-05, + "loss": 0.2217, + "step": 28150 + }, + { + "epoch": 36.13735558408216, + "grad_norm": 0.8941894769668579, + "learning_rate": 2.1291827128797603e-05, + "loss": 0.2463, + "step": 28151 + }, + { + "epoch": 36.138639281129656, + "grad_norm": 0.6821050643920898, + "learning_rate": 2.129139922978177e-05, + "loss": 0.2237, + "step": 28152 + }, + { + "epoch": 36.13992297817715, + "grad_norm": 2.1355154514312744, + "learning_rate": 2.129097133076594e-05, + "loss": 0.2238, + "step": 28153 + }, + { + "epoch": 36.141206675224645, + "grad_norm": 0.7442516088485718, + "learning_rate": 2.1290543431750108e-05, + "loss": 0.2153, + "step": 28154 + }, + { + "epoch": 36.14249037227214, + "grad_norm": 1.8972307443618774, + "learning_rate": 2.1290115532734276e-05, + "loss": 0.2149, + "step": 28155 + }, + { + "epoch": 36.14377406931964, + "grad_norm": 4.819853782653809, + "learning_rate": 2.1289687633718445e-05, + "loss": 0.2491, + "step": 28156 + }, + { + "epoch": 36.14505776636714, + "grad_norm": 1.258230209350586, + "learning_rate": 2.128925973470261e-05, + "loss": 0.2053, + "step": 28157 + }, + { + "epoch": 36.146341463414636, + "grad_norm": 1.8880252838134766, + "learning_rate": 2.1288831835686778e-05, + "loss": 0.2373, + "step": 28158 + }, + { + "epoch": 36.147625160462134, + "grad_norm": 1.010371446609497, + "learning_rate": 2.1288403936670947e-05, + "loss": 0.2372, + "step": 28159 + }, + { + "epoch": 36.148908857509625, + "grad_norm": 0.961105227470398, + "learning_rate": 2.1287976037655112e-05, + "loss": 0.2053, + "step": 28160 + }, + { + "epoch": 36.15019255455712, + "grad_norm": 1.4128613471984863, + "learning_rate": 2.1287548138639284e-05, + "loss": 0.2416, + "step": 28161 + }, + { + "epoch": 36.15147625160462, + "grad_norm": 
0.7341601848602295, + "learning_rate": 2.128712023962345e-05, + "loss": 0.2049, + "step": 28162 + }, + { + "epoch": 36.15275994865212, + "grad_norm": 1.5037556886672974, + "learning_rate": 2.1286692340607617e-05, + "loss": 0.212, + "step": 28163 + }, + { + "epoch": 36.154043645699616, + "grad_norm": 3.1277506351470947, + "learning_rate": 2.1286264441591785e-05, + "loss": 0.2026, + "step": 28164 + }, + { + "epoch": 36.155327342747114, + "grad_norm": 0.9981632828712463, + "learning_rate": 2.128583654257595e-05, + "loss": 0.2232, + "step": 28165 + }, + { + "epoch": 36.15661103979461, + "grad_norm": 1.2454036474227905, + "learning_rate": 2.1285408643560122e-05, + "loss": 0.2167, + "step": 28166 + }, + { + "epoch": 36.1578947368421, + "grad_norm": 1.083411693572998, + "learning_rate": 2.1284980744544287e-05, + "loss": 0.2144, + "step": 28167 + }, + { + "epoch": 36.1591784338896, + "grad_norm": 0.9064434170722961, + "learning_rate": 2.1284552845528456e-05, + "loss": 0.2062, + "step": 28168 + }, + { + "epoch": 36.1604621309371, + "grad_norm": 2.1713831424713135, + "learning_rate": 2.1284124946512624e-05, + "loss": 0.206, + "step": 28169 + }, + { + "epoch": 36.161745827984596, + "grad_norm": 4.276411056518555, + "learning_rate": 2.128369704749679e-05, + "loss": 0.2193, + "step": 28170 + }, + { + "epoch": 36.163029525032094, + "grad_norm": 1.0429829359054565, + "learning_rate": 2.128326914848096e-05, + "loss": 0.209, + "step": 28171 + }, + { + "epoch": 36.16431322207959, + "grad_norm": 2.685056447982788, + "learning_rate": 2.1282841249465126e-05, + "loss": 0.1847, + "step": 28172 + }, + { + "epoch": 36.16559691912708, + "grad_norm": 1.2743691205978394, + "learning_rate": 2.1282413350449294e-05, + "loss": 0.1979, + "step": 28173 + }, + { + "epoch": 36.16688061617458, + "grad_norm": 1.0616698265075684, + "learning_rate": 2.1281985451433463e-05, + "loss": 0.1989, + "step": 28174 + }, + { + "epoch": 36.16816431322208, + "grad_norm": 1.3176430463790894, + "learning_rate": 
2.128155755241763e-05, + "loss": 0.211, + "step": 28175 + }, + { + "epoch": 36.169448010269576, + "grad_norm": 1.3217964172363281, + "learning_rate": 2.1281129653401796e-05, + "loss": 0.2039, + "step": 28176 + }, + { + "epoch": 36.170731707317074, + "grad_norm": 0.8034620881080627, + "learning_rate": 2.1280701754385965e-05, + "loss": 0.1975, + "step": 28177 + }, + { + "epoch": 36.17201540436457, + "grad_norm": 1.7697256803512573, + "learning_rate": 2.1280273855370133e-05, + "loss": 0.2253, + "step": 28178 + }, + { + "epoch": 36.17329910141207, + "grad_norm": 2.022656202316284, + "learning_rate": 2.12798459563543e-05, + "loss": 0.2267, + "step": 28179 + }, + { + "epoch": 36.17458279845956, + "grad_norm": 1.2297382354736328, + "learning_rate": 2.127941805733847e-05, + "loss": 0.1919, + "step": 28180 + }, + { + "epoch": 36.17586649550706, + "grad_norm": 1.02559232711792, + "learning_rate": 2.1278990158322635e-05, + "loss": 0.233, + "step": 28181 + }, + { + "epoch": 36.177150192554556, + "grad_norm": 1.3061156272888184, + "learning_rate": 2.1278562259306807e-05, + "loss": 0.2047, + "step": 28182 + }, + { + "epoch": 36.178433889602054, + "grad_norm": 1.0447092056274414, + "learning_rate": 2.127813436029097e-05, + "loss": 0.1892, + "step": 28183 + }, + { + "epoch": 36.17971758664955, + "grad_norm": 2.573793649673462, + "learning_rate": 2.1277706461275137e-05, + "loss": 0.2344, + "step": 28184 + }, + { + "epoch": 36.18100128369705, + "grad_norm": 1.237776517868042, + "learning_rate": 2.127727856225931e-05, + "loss": 0.1982, + "step": 28185 + }, + { + "epoch": 36.18228498074454, + "grad_norm": 1.1589906215667725, + "learning_rate": 2.1276850663243473e-05, + "loss": 0.2172, + "step": 28186 + }, + { + "epoch": 36.18356867779204, + "grad_norm": 1.6632769107818604, + "learning_rate": 2.1276422764227645e-05, + "loss": 0.2425, + "step": 28187 + }, + { + "epoch": 36.18485237483954, + "grad_norm": 1.1787437200546265, + "learning_rate": 2.127599486521181e-05, + "loss": 0.2063, + 
"step": 28188 + }, + { + "epoch": 36.186136071887034, + "grad_norm": 6.290546417236328, + "learning_rate": 2.127556696619598e-05, + "loss": 0.2738, + "step": 28189 + }, + { + "epoch": 36.18741976893453, + "grad_norm": 2.109020471572876, + "learning_rate": 2.1275139067180147e-05, + "loss": 0.2086, + "step": 28190 + }, + { + "epoch": 36.18870346598203, + "grad_norm": 1.8971750736236572, + "learning_rate": 2.1274711168164312e-05, + "loss": 0.2619, + "step": 28191 + }, + { + "epoch": 36.18998716302953, + "grad_norm": 1.5571390390396118, + "learning_rate": 2.127428326914848e-05, + "loss": 0.2464, + "step": 28192 + }, + { + "epoch": 36.19127086007702, + "grad_norm": 1.7678229808807373, + "learning_rate": 2.127385537013265e-05, + "loss": 0.2868, + "step": 28193 + }, + { + "epoch": 36.19255455712452, + "grad_norm": 2.5027005672454834, + "learning_rate": 2.1273427471116817e-05, + "loss": 0.3637, + "step": 28194 + }, + { + "epoch": 36.193838254172015, + "grad_norm": 0.7340088486671448, + "learning_rate": 2.1272999572100986e-05, + "loss": 0.2484, + "step": 28195 + }, + { + "epoch": 36.19512195121951, + "grad_norm": 0.5994445085525513, + "learning_rate": 2.1272571673085154e-05, + "loss": 0.2306, + "step": 28196 + }, + { + "epoch": 36.19640564826701, + "grad_norm": 1.274267315864563, + "learning_rate": 2.127214377406932e-05, + "loss": 0.2341, + "step": 28197 + }, + { + "epoch": 36.19768934531451, + "grad_norm": 1.3326961994171143, + "learning_rate": 2.1271715875053488e-05, + "loss": 0.2416, + "step": 28198 + }, + { + "epoch": 36.198973042362006, + "grad_norm": 0.8475691676139832, + "learning_rate": 2.1271287976037656e-05, + "loss": 0.2135, + "step": 28199 + }, + { + "epoch": 36.2002567394095, + "grad_norm": 0.7454038262367249, + "learning_rate": 2.127086007702182e-05, + "loss": 0.2198, + "step": 28200 + }, + { + "epoch": 36.201540436456995, + "grad_norm": 1.165880799293518, + "learning_rate": 2.1270432178005993e-05, + "loss": 0.2146, + "step": 28201 + }, + { + "epoch": 
36.20282413350449, + "grad_norm": 0.7859717011451721, + "learning_rate": 2.1270004278990158e-05, + "loss": 0.2279, + "step": 28202 + }, + { + "epoch": 36.20410783055199, + "grad_norm": 1.6456594467163086, + "learning_rate": 2.126957637997433e-05, + "loss": 0.2016, + "step": 28203 + }, + { + "epoch": 36.20539152759949, + "grad_norm": 1.5772533416748047, + "learning_rate": 2.1269148480958495e-05, + "loss": 0.2214, + "step": 28204 + }, + { + "epoch": 36.206675224646986, + "grad_norm": 0.7380110025405884, + "learning_rate": 2.126872058194266e-05, + "loss": 0.2399, + "step": 28205 + }, + { + "epoch": 36.20795892169448, + "grad_norm": 0.7357990741729736, + "learning_rate": 2.126829268292683e-05, + "loss": 0.2583, + "step": 28206 + }, + { + "epoch": 36.209242618741975, + "grad_norm": 2.121933937072754, + "learning_rate": 2.1267864783910997e-05, + "loss": 0.2252, + "step": 28207 + }, + { + "epoch": 36.21052631578947, + "grad_norm": 0.9823404550552368, + "learning_rate": 2.1267436884895165e-05, + "loss": 0.2165, + "step": 28208 + }, + { + "epoch": 36.21181001283697, + "grad_norm": 1.575258731842041, + "learning_rate": 2.1267008985879333e-05, + "loss": 0.1991, + "step": 28209 + }, + { + "epoch": 36.21309370988447, + "grad_norm": 1.0515258312225342, + "learning_rate": 2.1266581086863502e-05, + "loss": 0.2131, + "step": 28210 + }, + { + "epoch": 36.214377406931966, + "grad_norm": 2.718648910522461, + "learning_rate": 2.126615318784767e-05, + "loss": 0.224, + "step": 28211 + }, + { + "epoch": 36.215661103979464, + "grad_norm": 2.9458019733428955, + "learning_rate": 2.1265725288831835e-05, + "loss": 0.2118, + "step": 28212 + }, + { + "epoch": 36.216944801026955, + "grad_norm": 1.2448325157165527, + "learning_rate": 2.1265297389816004e-05, + "loss": 0.2071, + "step": 28213 + }, + { + "epoch": 36.21822849807445, + "grad_norm": 0.8931396007537842, + "learning_rate": 2.1264869490800172e-05, + "loss": 0.2094, + "step": 28214 + }, + { + "epoch": 36.21951219512195, + "grad_norm": 
0.8033229112625122, + "learning_rate": 2.126444159178434e-05, + "loss": 0.2242, + "step": 28215 + }, + { + "epoch": 36.22079589216945, + "grad_norm": 0.9524738788604736, + "learning_rate": 2.1264013692768506e-05, + "loss": 0.2351, + "step": 28216 + }, + { + "epoch": 36.222079589216946, + "grad_norm": 0.8804325461387634, + "learning_rate": 2.1263585793752677e-05, + "loss": 0.1847, + "step": 28217 + }, + { + "epoch": 36.223363286264444, + "grad_norm": 0.9999890327453613, + "learning_rate": 2.1263157894736842e-05, + "loss": 0.1884, + "step": 28218 + }, + { + "epoch": 36.224646983311935, + "grad_norm": 1.2100061178207397, + "learning_rate": 2.126272999572101e-05, + "loss": 0.2217, + "step": 28219 + }, + { + "epoch": 36.22593068035943, + "grad_norm": 1.0324839353561401, + "learning_rate": 2.126230209670518e-05, + "loss": 0.1987, + "step": 28220 + }, + { + "epoch": 36.22721437740693, + "grad_norm": 1.0336257219314575, + "learning_rate": 2.1261874197689344e-05, + "loss": 0.1941, + "step": 28221 + }, + { + "epoch": 36.22849807445443, + "grad_norm": 1.4762656688690186, + "learning_rate": 2.1261446298673516e-05, + "loss": 0.2273, + "step": 28222 + }, + { + "epoch": 36.229781771501926, + "grad_norm": 1.0107452869415283, + "learning_rate": 2.126101839965768e-05, + "loss": 0.1853, + "step": 28223 + }, + { + "epoch": 36.231065468549424, + "grad_norm": 1.1920084953308105, + "learning_rate": 2.1260590500641846e-05, + "loss": 0.2036, + "step": 28224 + }, + { + "epoch": 36.23234916559692, + "grad_norm": 1.2032777070999146, + "learning_rate": 2.1260162601626018e-05, + "loss": 0.2121, + "step": 28225 + }, + { + "epoch": 36.23363286264441, + "grad_norm": 0.9388899207115173, + "learning_rate": 2.1259734702610183e-05, + "loss": 0.1939, + "step": 28226 + }, + { + "epoch": 36.23491655969191, + "grad_norm": 1.937773585319519, + "learning_rate": 2.1259306803594355e-05, + "loss": 0.1913, + "step": 28227 + }, + { + "epoch": 36.23620025673941, + "grad_norm": 0.8657045364379883, + 
"learning_rate": 2.125887890457852e-05, + "loss": 0.2265, + "step": 28228 + }, + { + "epoch": 36.23748395378691, + "grad_norm": 1.494686245918274, + "learning_rate": 2.1258451005562688e-05, + "loss": 0.225, + "step": 28229 + }, + { + "epoch": 36.238767650834404, + "grad_norm": 2.6124281883239746, + "learning_rate": 2.1258023106546857e-05, + "loss": 0.1795, + "step": 28230 + }, + { + "epoch": 36.2400513478819, + "grad_norm": 0.955078125, + "learning_rate": 2.125759520753102e-05, + "loss": 0.179, + "step": 28231 + }, + { + "epoch": 36.2413350449294, + "grad_norm": 1.0375319719314575, + "learning_rate": 2.125716730851519e-05, + "loss": 0.21, + "step": 28232 + }, + { + "epoch": 36.24261874197689, + "grad_norm": 2.85526967048645, + "learning_rate": 2.125673940949936e-05, + "loss": 0.2113, + "step": 28233 + }, + { + "epoch": 36.24390243902439, + "grad_norm": 1.7890105247497559, + "learning_rate": 2.1256311510483527e-05, + "loss": 0.2209, + "step": 28234 + }, + { + "epoch": 36.24518613607189, + "grad_norm": 1.2640917301177979, + "learning_rate": 2.1255883611467695e-05, + "loss": 0.2013, + "step": 28235 + }, + { + "epoch": 36.246469833119384, + "grad_norm": 1.8687995672225952, + "learning_rate": 2.1255455712451864e-05, + "loss": 0.2036, + "step": 28236 + }, + { + "epoch": 36.24775353016688, + "grad_norm": 1.145392894744873, + "learning_rate": 2.125502781343603e-05, + "loss": 0.2032, + "step": 28237 + }, + { + "epoch": 36.24903722721438, + "grad_norm": 0.9923801422119141, + "learning_rate": 2.1254599914420197e-05, + "loss": 0.2243, + "step": 28238 + }, + { + "epoch": 36.25032092426187, + "grad_norm": 4.7471747398376465, + "learning_rate": 2.1254172015404365e-05, + "loss": 0.2232, + "step": 28239 + }, + { + "epoch": 36.25160462130937, + "grad_norm": 1.5935159921646118, + "learning_rate": 2.125374411638853e-05, + "loss": 0.2588, + "step": 28240 + }, + { + "epoch": 36.25288831835687, + "grad_norm": 2.3214163780212402, + "learning_rate": 2.1253316217372702e-05, + "loss": 
0.2193, + "step": 28241 + }, + { + "epoch": 36.254172015404365, + "grad_norm": 2.058267593383789, + "learning_rate": 2.1252888318356867e-05, + "loss": 0.2442, + "step": 28242 + }, + { + "epoch": 36.25545571245186, + "grad_norm": 1.6965134143829346, + "learning_rate": 2.125246041934104e-05, + "loss": 0.2562, + "step": 28243 + }, + { + "epoch": 36.25673940949936, + "grad_norm": 1.8925198316574097, + "learning_rate": 2.1252032520325204e-05, + "loss": 0.3686, + "step": 28244 + }, + { + "epoch": 36.25802310654686, + "grad_norm": 1.4812043905258179, + "learning_rate": 2.125160462130937e-05, + "loss": 0.23, + "step": 28245 + }, + { + "epoch": 36.25930680359435, + "grad_norm": 0.9650840163230896, + "learning_rate": 2.125117672229354e-05, + "loss": 0.2428, + "step": 28246 + }, + { + "epoch": 36.26059050064185, + "grad_norm": 0.9624661207199097, + "learning_rate": 2.1250748823277706e-05, + "loss": 0.2277, + "step": 28247 + }, + { + "epoch": 36.261874197689345, + "grad_norm": 0.6011149883270264, + "learning_rate": 2.1250320924261874e-05, + "loss": 0.2381, + "step": 28248 + }, + { + "epoch": 36.26315789473684, + "grad_norm": 0.8791500926017761, + "learning_rate": 2.1249893025246043e-05, + "loss": 0.2416, + "step": 28249 + }, + { + "epoch": 36.26444159178434, + "grad_norm": 1.2845207452774048, + "learning_rate": 2.124946512623021e-05, + "loss": 0.2196, + "step": 28250 + }, + { + "epoch": 36.26572528883184, + "grad_norm": 0.9195432662963867, + "learning_rate": 2.124903722721438e-05, + "loss": 0.227, + "step": 28251 + }, + { + "epoch": 36.26700898587933, + "grad_norm": 2.853724718093872, + "learning_rate": 2.1248609328198545e-05, + "loss": 0.2202, + "step": 28252 + }, + { + "epoch": 36.26829268292683, + "grad_norm": 0.8052815794944763, + "learning_rate": 2.1248181429182713e-05, + "loss": 0.2298, + "step": 28253 + }, + { + "epoch": 36.269576379974325, + "grad_norm": 0.8764868378639221, + "learning_rate": 2.124775353016688e-05, + "loss": 0.229, + "step": 28254 + }, + { + "epoch": 
36.27086007702182, + "grad_norm": 1.284353256225586, + "learning_rate": 2.124732563115105e-05, + "loss": 0.2034, + "step": 28255 + }, + { + "epoch": 36.27214377406932, + "grad_norm": 1.3438328504562378, + "learning_rate": 2.1246897732135215e-05, + "loss": 0.2088, + "step": 28256 + }, + { + "epoch": 36.27342747111682, + "grad_norm": 0.8313202857971191, + "learning_rate": 2.1246469833119387e-05, + "loss": 0.2116, + "step": 28257 + }, + { + "epoch": 36.274711168164316, + "grad_norm": 1.2220300436019897, + "learning_rate": 2.1246041934103552e-05, + "loss": 0.2251, + "step": 28258 + }, + { + "epoch": 36.27599486521181, + "grad_norm": 0.9665923714637756, + "learning_rate": 2.124561403508772e-05, + "loss": 0.2, + "step": 28259 + }, + { + "epoch": 36.277278562259305, + "grad_norm": 0.7855207324028015, + "learning_rate": 2.124518613607189e-05, + "loss": 0.2223, + "step": 28260 + }, + { + "epoch": 36.2785622593068, + "grad_norm": 0.841589093208313, + "learning_rate": 2.1244758237056054e-05, + "loss": 0.2135, + "step": 28261 + }, + { + "epoch": 36.2798459563543, + "grad_norm": 1.1808801889419556, + "learning_rate": 2.1244330338040225e-05, + "loss": 0.1985, + "step": 28262 + }, + { + "epoch": 36.2811296534018, + "grad_norm": 0.9055915474891663, + "learning_rate": 2.124390243902439e-05, + "loss": 0.2075, + "step": 28263 + }, + { + "epoch": 36.282413350449296, + "grad_norm": 2.2154181003570557, + "learning_rate": 2.124347454000856e-05, + "loss": 0.2282, + "step": 28264 + }, + { + "epoch": 36.283697047496794, + "grad_norm": 0.7856523990631104, + "learning_rate": 2.1243046640992727e-05, + "loss": 0.2077, + "step": 28265 + }, + { + "epoch": 36.284980744544285, + "grad_norm": 0.9668746590614319, + "learning_rate": 2.1242618741976892e-05, + "loss": 0.2291, + "step": 28266 + }, + { + "epoch": 36.28626444159178, + "grad_norm": 1.0433120727539062, + "learning_rate": 2.124219084296106e-05, + "loss": 0.2302, + "step": 28267 + }, + { + "epoch": 36.28754813863928, + "grad_norm": 
0.8531848192214966, + "learning_rate": 2.124176294394523e-05, + "loss": 0.2064, + "step": 28268 + }, + { + "epoch": 36.28883183568678, + "grad_norm": 1.3363932371139526, + "learning_rate": 2.1241335044929397e-05, + "loss": 0.1917, + "step": 28269 + }, + { + "epoch": 36.290115532734276, + "grad_norm": 1.0181745290756226, + "learning_rate": 2.1240907145913566e-05, + "loss": 0.1932, + "step": 28270 + }, + { + "epoch": 36.291399229781774, + "grad_norm": 1.2228339910507202, + "learning_rate": 2.1240479246897734e-05, + "loss": 0.2155, + "step": 28271 + }, + { + "epoch": 36.292682926829265, + "grad_norm": 2.303602695465088, + "learning_rate": 2.12400513478819e-05, + "loss": 0.2301, + "step": 28272 + }, + { + "epoch": 36.29396662387676, + "grad_norm": 1.5442028045654297, + "learning_rate": 2.1239623448866068e-05, + "loss": 0.1901, + "step": 28273 + }, + { + "epoch": 36.29525032092426, + "grad_norm": 1.4847586154937744, + "learning_rate": 2.1239195549850236e-05, + "loss": 0.2145, + "step": 28274 + }, + { + "epoch": 36.29653401797176, + "grad_norm": 1.7965574264526367, + "learning_rate": 2.12387676508344e-05, + "loss": 0.2141, + "step": 28275 + }, + { + "epoch": 36.29781771501926, + "grad_norm": 1.2065902948379517, + "learning_rate": 2.1238339751818573e-05, + "loss": 0.199, + "step": 28276 + }, + { + "epoch": 36.299101412066754, + "grad_norm": 2.3055148124694824, + "learning_rate": 2.1237911852802738e-05, + "loss": 0.2305, + "step": 28277 + }, + { + "epoch": 36.30038510911425, + "grad_norm": 1.0280600786209106, + "learning_rate": 2.123748395378691e-05, + "loss": 0.1909, + "step": 28278 + }, + { + "epoch": 36.30166880616174, + "grad_norm": 1.814395546913147, + "learning_rate": 2.1237056054771075e-05, + "loss": 0.2286, + "step": 28279 + }, + { + "epoch": 36.30295250320924, + "grad_norm": 1.744775414466858, + "learning_rate": 2.123662815575524e-05, + "loss": 0.1836, + "step": 28280 + }, + { + "epoch": 36.30423620025674, + "grad_norm": 1.0893571376800537, + "learning_rate": 
2.123620025673941e-05, + "loss": 0.2319, + "step": 28281 + }, + { + "epoch": 36.30551989730424, + "grad_norm": 1.1510506868362427, + "learning_rate": 2.1235772357723577e-05, + "loss": 0.2303, + "step": 28282 + }, + { + "epoch": 36.306803594351734, + "grad_norm": 1.1604853868484497, + "learning_rate": 2.1235344458707745e-05, + "loss": 0.1948, + "step": 28283 + }, + { + "epoch": 36.30808729139923, + "grad_norm": 1.1424497365951538, + "learning_rate": 2.1234916559691913e-05, + "loss": 0.1912, + "step": 28284 + }, + { + "epoch": 36.30937098844672, + "grad_norm": 1.8912593126296997, + "learning_rate": 2.123448866067608e-05, + "loss": 0.1965, + "step": 28285 + }, + { + "epoch": 36.31065468549422, + "grad_norm": 1.0757997035980225, + "learning_rate": 2.123406076166025e-05, + "loss": 0.2396, + "step": 28286 + }, + { + "epoch": 36.31193838254172, + "grad_norm": 3.2846596240997314, + "learning_rate": 2.1233632862644415e-05, + "loss": 0.2241, + "step": 28287 + }, + { + "epoch": 36.31322207958922, + "grad_norm": 1.6762701272964478, + "learning_rate": 2.1233204963628584e-05, + "loss": 0.2111, + "step": 28288 + }, + { + "epoch": 36.314505776636715, + "grad_norm": 1.381805181503296, + "learning_rate": 2.1232777064612752e-05, + "loss": 0.2359, + "step": 28289 + }, + { + "epoch": 36.31578947368421, + "grad_norm": 2.444082021713257, + "learning_rate": 2.123234916559692e-05, + "loss": 0.203, + "step": 28290 + }, + { + "epoch": 36.31707317073171, + "grad_norm": 1.3304816484451294, + "learning_rate": 2.1231921266581086e-05, + "loss": 0.2263, + "step": 28291 + }, + { + "epoch": 36.3183568677792, + "grad_norm": 4.11466646194458, + "learning_rate": 2.1231493367565254e-05, + "loss": 0.258, + "step": 28292 + }, + { + "epoch": 36.3196405648267, + "grad_norm": 3.8417739868164062, + "learning_rate": 2.1231065468549422e-05, + "loss": 0.281, + "step": 28293 + }, + { + "epoch": 36.3209242618742, + "grad_norm": 2.27812123298645, + "learning_rate": 2.123063756953359e-05, + "loss": 0.387, + "step": 
28294 + }, + { + "epoch": 36.322207958921695, + "grad_norm": 0.69097900390625, + "learning_rate": 2.123020967051776e-05, + "loss": 0.2064, + "step": 28295 + }, + { + "epoch": 36.32349165596919, + "grad_norm": 1.2251179218292236, + "learning_rate": 2.1229781771501924e-05, + "loss": 0.2573, + "step": 28296 + }, + { + "epoch": 36.32477535301669, + "grad_norm": 0.8925751447677612, + "learning_rate": 2.1229353872486096e-05, + "loss": 0.2368, + "step": 28297 + }, + { + "epoch": 36.32605905006419, + "grad_norm": 0.7072308659553528, + "learning_rate": 2.122892597347026e-05, + "loss": 0.2169, + "step": 28298 + }, + { + "epoch": 36.32734274711168, + "grad_norm": 1.1052364110946655, + "learning_rate": 2.1228498074454426e-05, + "loss": 0.2371, + "step": 28299 + }, + { + "epoch": 36.32862644415918, + "grad_norm": 0.7951716184616089, + "learning_rate": 2.1228070175438598e-05, + "loss": 0.226, + "step": 28300 + }, + { + "epoch": 36.329910141206675, + "grad_norm": 1.2433669567108154, + "learning_rate": 2.1227642276422763e-05, + "loss": 0.2537, + "step": 28301 + }, + { + "epoch": 36.33119383825417, + "grad_norm": 1.3225141763687134, + "learning_rate": 2.1227214377406935e-05, + "loss": 0.2411, + "step": 28302 + }, + { + "epoch": 36.33247753530167, + "grad_norm": 2.0381834506988525, + "learning_rate": 2.12267864783911e-05, + "loss": 0.2223, + "step": 28303 + }, + { + "epoch": 36.33376123234917, + "grad_norm": 2.194922685623169, + "learning_rate": 2.1226358579375268e-05, + "loss": 0.2325, + "step": 28304 + }, + { + "epoch": 36.33504492939666, + "grad_norm": 1.9300546646118164, + "learning_rate": 2.1225930680359437e-05, + "loss": 0.2432, + "step": 28305 + }, + { + "epoch": 36.33632862644416, + "grad_norm": 1.6833522319793701, + "learning_rate": 2.12255027813436e-05, + "loss": 0.235, + "step": 28306 + }, + { + "epoch": 36.337612323491655, + "grad_norm": 0.7191252708435059, + "learning_rate": 2.122507488232777e-05, + "loss": 0.1905, + "step": 28307 + }, + { + "epoch": 36.33889602053915, 
+ "grad_norm": 1.2678672075271606, + "learning_rate": 2.122464698331194e-05, + "loss": 0.2146, + "step": 28308 + }, + { + "epoch": 36.34017971758665, + "grad_norm": 1.4102081060409546, + "learning_rate": 2.1224219084296107e-05, + "loss": 0.2364, + "step": 28309 + }, + { + "epoch": 36.34146341463415, + "grad_norm": 1.0569881200790405, + "learning_rate": 2.1223791185280275e-05, + "loss": 0.2097, + "step": 28310 + }, + { + "epoch": 36.342747111681646, + "grad_norm": 1.9398484230041504, + "learning_rate": 2.1223363286264444e-05, + "loss": 0.2204, + "step": 28311 + }, + { + "epoch": 36.34403080872914, + "grad_norm": 0.8760197758674622, + "learning_rate": 2.122293538724861e-05, + "loss": 0.2183, + "step": 28312 + }, + { + "epoch": 36.345314505776635, + "grad_norm": 1.4058276414871216, + "learning_rate": 2.1222507488232777e-05, + "loss": 0.2259, + "step": 28313 + }, + { + "epoch": 36.34659820282413, + "grad_norm": 1.6418157815933228, + "learning_rate": 2.1222079589216946e-05, + "loss": 0.2212, + "step": 28314 + }, + { + "epoch": 36.34788189987163, + "grad_norm": 1.1665621995925903, + "learning_rate": 2.122165169020111e-05, + "loss": 0.2054, + "step": 28315 + }, + { + "epoch": 36.34916559691913, + "grad_norm": 2.6581249237060547, + "learning_rate": 2.1221223791185282e-05, + "loss": 0.2331, + "step": 28316 + }, + { + "epoch": 36.350449293966626, + "grad_norm": 1.4825397729873657, + "learning_rate": 2.1220795892169447e-05, + "loss": 0.2018, + "step": 28317 + }, + { + "epoch": 36.35173299101412, + "grad_norm": 1.0113846063613892, + "learning_rate": 2.122036799315362e-05, + "loss": 0.1929, + "step": 28318 + }, + { + "epoch": 36.353016688061615, + "grad_norm": 2.412045955657959, + "learning_rate": 2.1219940094137784e-05, + "loss": 0.2309, + "step": 28319 + }, + { + "epoch": 36.35430038510911, + "grad_norm": 1.6025289297103882, + "learning_rate": 2.121951219512195e-05, + "loss": 0.2133, + "step": 28320 + }, + { + "epoch": 36.35558408215661, + "grad_norm": 2.2551074028015137, + 
"learning_rate": 2.121908429610612e-05, + "loss": 0.2098, + "step": 28321 + }, + { + "epoch": 36.35686777920411, + "grad_norm": 0.9378848075866699, + "learning_rate": 2.1218656397090286e-05, + "loss": 0.2094, + "step": 28322 + }, + { + "epoch": 36.35815147625161, + "grad_norm": 2.461719036102295, + "learning_rate": 2.1218228498074454e-05, + "loss": 0.2237, + "step": 28323 + }, + { + "epoch": 36.359435173299104, + "grad_norm": 3.30067777633667, + "learning_rate": 2.1217800599058623e-05, + "loss": 0.1943, + "step": 28324 + }, + { + "epoch": 36.360718870346595, + "grad_norm": 1.074063777923584, + "learning_rate": 2.121737270004279e-05, + "loss": 0.2087, + "step": 28325 + }, + { + "epoch": 36.36200256739409, + "grad_norm": 2.100208044052124, + "learning_rate": 2.121694480102696e-05, + "loss": 0.2059, + "step": 28326 + }, + { + "epoch": 36.36328626444159, + "grad_norm": 1.6848613023757935, + "learning_rate": 2.1216516902011125e-05, + "loss": 0.2089, + "step": 28327 + }, + { + "epoch": 36.36456996148909, + "grad_norm": 3.221808671951294, + "learning_rate": 2.1216089002995293e-05, + "loss": 0.2401, + "step": 28328 + }, + { + "epoch": 36.36585365853659, + "grad_norm": 0.9782899022102356, + "learning_rate": 2.121566110397946e-05, + "loss": 0.2023, + "step": 28329 + }, + { + "epoch": 36.367137355584084, + "grad_norm": 0.9896674752235413, + "learning_rate": 2.121523320496363e-05, + "loss": 0.2069, + "step": 28330 + }, + { + "epoch": 36.36842105263158, + "grad_norm": 2.0643203258514404, + "learning_rate": 2.1214805305947795e-05, + "loss": 0.2146, + "step": 28331 + }, + { + "epoch": 36.36970474967907, + "grad_norm": 1.061180591583252, + "learning_rate": 2.1214377406931967e-05, + "loss": 0.2199, + "step": 28332 + }, + { + "epoch": 36.37098844672657, + "grad_norm": 1.449852466583252, + "learning_rate": 2.1213949507916132e-05, + "loss": 0.2257, + "step": 28333 + }, + { + "epoch": 36.37227214377407, + "grad_norm": 1.6527397632598877, + "learning_rate": 2.12135216089003e-05, + 
"loss": 0.1818, + "step": 28334 + }, + { + "epoch": 36.37355584082157, + "grad_norm": 1.0159498453140259, + "learning_rate": 2.121309370988447e-05, + "loss": 0.2563, + "step": 28335 + }, + { + "epoch": 36.374839537869065, + "grad_norm": 1.602673053741455, + "learning_rate": 2.1212665810868634e-05, + "loss": 0.2595, + "step": 28336 + }, + { + "epoch": 36.37612323491656, + "grad_norm": 1.589532494544983, + "learning_rate": 2.1212237911852805e-05, + "loss": 0.2056, + "step": 28337 + }, + { + "epoch": 36.37740693196405, + "grad_norm": 1.3703200817108154, + "learning_rate": 2.121181001283697e-05, + "loss": 0.1997, + "step": 28338 + }, + { + "epoch": 36.37869062901155, + "grad_norm": 1.8572304248809814, + "learning_rate": 2.121138211382114e-05, + "loss": 0.2298, + "step": 28339 + }, + { + "epoch": 36.37997432605905, + "grad_norm": 1.5337930917739868, + "learning_rate": 2.1210954214805307e-05, + "loss": 0.274, + "step": 28340 + }, + { + "epoch": 36.38125802310655, + "grad_norm": 1.7025212049484253, + "learning_rate": 2.1210526315789472e-05, + "loss": 0.235, + "step": 28341 + }, + { + "epoch": 36.382541720154045, + "grad_norm": 1.3987689018249512, + "learning_rate": 2.1210098416773644e-05, + "loss": 0.2718, + "step": 28342 + }, + { + "epoch": 36.38382541720154, + "grad_norm": 1.4196205139160156, + "learning_rate": 2.120967051775781e-05, + "loss": 0.2903, + "step": 28343 + }, + { + "epoch": 36.38510911424904, + "grad_norm": 1.9628355503082275, + "learning_rate": 2.1209242618741978e-05, + "loss": 0.3514, + "step": 28344 + }, + { + "epoch": 36.38639281129653, + "grad_norm": 1.4177181720733643, + "learning_rate": 2.1208814719726146e-05, + "loss": 0.2479, + "step": 28345 + }, + { + "epoch": 36.38767650834403, + "grad_norm": 1.225976586341858, + "learning_rate": 2.120838682071031e-05, + "loss": 0.244, + "step": 28346 + }, + { + "epoch": 36.38896020539153, + "grad_norm": 1.3904500007629395, + "learning_rate": 2.120795892169448e-05, + "loss": 0.2385, + "step": 28347 + }, + { + 
"epoch": 36.390243902439025, + "grad_norm": 1.0683587789535522, + "learning_rate": 2.1207531022678648e-05, + "loss": 0.2627, + "step": 28348 + }, + { + "epoch": 36.39152759948652, + "grad_norm": 2.6771419048309326, + "learning_rate": 2.1207103123662816e-05, + "loss": 0.2437, + "step": 28349 + }, + { + "epoch": 36.39281129653402, + "grad_norm": 1.105738639831543, + "learning_rate": 2.1206675224646985e-05, + "loss": 0.2389, + "step": 28350 + }, + { + "epoch": 36.39409499358152, + "grad_norm": 0.9312185049057007, + "learning_rate": 2.1206247325631153e-05, + "loss": 0.2331, + "step": 28351 + }, + { + "epoch": 36.39537869062901, + "grad_norm": 0.7250770330429077, + "learning_rate": 2.1205819426615318e-05, + "loss": 0.2083, + "step": 28352 + }, + { + "epoch": 36.39666238767651, + "grad_norm": 1.5331472158432007, + "learning_rate": 2.1205391527599486e-05, + "loss": 0.2226, + "step": 28353 + }, + { + "epoch": 36.397946084724005, + "grad_norm": 1.8847719430923462, + "learning_rate": 2.1204963628583655e-05, + "loss": 0.2399, + "step": 28354 + }, + { + "epoch": 36.3992297817715, + "grad_norm": 1.6476173400878906, + "learning_rate": 2.120453572956782e-05, + "loss": 0.2406, + "step": 28355 + }, + { + "epoch": 36.400513478819, + "grad_norm": 0.8740406036376953, + "learning_rate": 2.120410783055199e-05, + "loss": 0.2543, + "step": 28356 + }, + { + "epoch": 36.4017971758665, + "grad_norm": 1.877996563911438, + "learning_rate": 2.1203679931536157e-05, + "loss": 0.2232, + "step": 28357 + }, + { + "epoch": 36.40308087291399, + "grad_norm": 0.7176106572151184, + "learning_rate": 2.120325203252033e-05, + "loss": 0.2064, + "step": 28358 + }, + { + "epoch": 36.40436456996149, + "grad_norm": 0.8437674641609192, + "learning_rate": 2.1202824133504494e-05, + "loss": 0.2269, + "step": 28359 + }, + { + "epoch": 36.405648267008985, + "grad_norm": 0.9856520295143127, + "learning_rate": 2.120239623448866e-05, + "loss": 0.226, + "step": 28360 + }, + { + "epoch": 36.40693196405648, + "grad_norm": 
1.2953306436538696, + "learning_rate": 2.120196833547283e-05, + "loss": 0.1959, + "step": 28361 + }, + { + "epoch": 36.40821566110398, + "grad_norm": 0.7623151540756226, + "learning_rate": 2.1201540436456995e-05, + "loss": 0.202, + "step": 28362 + }, + { + "epoch": 36.40949935815148, + "grad_norm": 1.4735398292541504, + "learning_rate": 2.1201112537441164e-05, + "loss": 0.1987, + "step": 28363 + }, + { + "epoch": 36.410783055198976, + "grad_norm": 0.8620578646659851, + "learning_rate": 2.1200684638425332e-05, + "loss": 0.2341, + "step": 28364 + }, + { + "epoch": 36.41206675224647, + "grad_norm": 0.723280668258667, + "learning_rate": 2.12002567394095e-05, + "loss": 0.1917, + "step": 28365 + }, + { + "epoch": 36.413350449293965, + "grad_norm": 0.8398385047912598, + "learning_rate": 2.119982884039367e-05, + "loss": 0.2164, + "step": 28366 + }, + { + "epoch": 36.41463414634146, + "grad_norm": 0.9545608758926392, + "learning_rate": 2.1199400941377834e-05, + "loss": 0.2008, + "step": 28367 + }, + { + "epoch": 36.41591784338896, + "grad_norm": 1.4634114503860474, + "learning_rate": 2.1198973042362002e-05, + "loss": 0.2046, + "step": 28368 + }, + { + "epoch": 36.41720154043646, + "grad_norm": 2.8666372299194336, + "learning_rate": 2.119854514334617e-05, + "loss": 0.2178, + "step": 28369 + }, + { + "epoch": 36.41848523748396, + "grad_norm": 1.0361047983169556, + "learning_rate": 2.119811724433034e-05, + "loss": 0.2267, + "step": 28370 + }, + { + "epoch": 36.41976893453145, + "grad_norm": 0.8382346630096436, + "learning_rate": 2.1197689345314504e-05, + "loss": 0.2039, + "step": 28371 + }, + { + "epoch": 36.421052631578945, + "grad_norm": 1.0078479051589966, + "learning_rate": 2.1197261446298676e-05, + "loss": 0.2248, + "step": 28372 + }, + { + "epoch": 36.42233632862644, + "grad_norm": 0.9655542373657227, + "learning_rate": 2.119683354728284e-05, + "loss": 0.1889, + "step": 28373 + }, + { + "epoch": 36.42362002567394, + "grad_norm": 1.54337739944458, + "learning_rate": 
2.119640564826701e-05, + "loss": 0.2311, + "step": 28374 + }, + { + "epoch": 36.42490372272144, + "grad_norm": 1.2187236547470093, + "learning_rate": 2.1195977749251178e-05, + "loss": 0.2106, + "step": 28375 + }, + { + "epoch": 36.42618741976894, + "grad_norm": 0.9994356036186218, + "learning_rate": 2.1195549850235343e-05, + "loss": 0.2201, + "step": 28376 + }, + { + "epoch": 36.427471116816434, + "grad_norm": 0.8114076852798462, + "learning_rate": 2.1195121951219515e-05, + "loss": 0.2141, + "step": 28377 + }, + { + "epoch": 36.428754813863925, + "grad_norm": 1.2996196746826172, + "learning_rate": 2.119469405220368e-05, + "loss": 0.2144, + "step": 28378 + }, + { + "epoch": 36.43003851091142, + "grad_norm": 1.296676516532898, + "learning_rate": 2.1194266153187848e-05, + "loss": 0.2155, + "step": 28379 + }, + { + "epoch": 36.43132220795892, + "grad_norm": 0.9760302901268005, + "learning_rate": 2.1193838254172017e-05, + "loss": 0.2197, + "step": 28380 + }, + { + "epoch": 36.43260590500642, + "grad_norm": 1.303961157798767, + "learning_rate": 2.119341035515618e-05, + "loss": 0.1912, + "step": 28381 + }, + { + "epoch": 36.43388960205392, + "grad_norm": 1.0607341527938843, + "learning_rate": 2.1192982456140353e-05, + "loss": 0.2634, + "step": 28382 + }, + { + "epoch": 36.435173299101415, + "grad_norm": 1.1141228675842285, + "learning_rate": 2.119255455712452e-05, + "loss": 0.197, + "step": 28383 + }, + { + "epoch": 36.436456996148905, + "grad_norm": 2.819167137145996, + "learning_rate": 2.1192126658108687e-05, + "loss": 0.213, + "step": 28384 + }, + { + "epoch": 36.4377406931964, + "grad_norm": 1.0680128335952759, + "learning_rate": 2.1191698759092855e-05, + "loss": 0.2264, + "step": 28385 + }, + { + "epoch": 36.4390243902439, + "grad_norm": 0.9758908748626709, + "learning_rate": 2.1191270860077024e-05, + "loss": 0.237, + "step": 28386 + }, + { + "epoch": 36.4403080872914, + "grad_norm": 0.885647714138031, + "learning_rate": 2.119084296106119e-05, + "loss": 0.1885, + 
"step": 28387 + }, + { + "epoch": 36.4415917843389, + "grad_norm": 3.186038017272949, + "learning_rate": 2.1190415062045357e-05, + "loss": 0.2015, + "step": 28388 + }, + { + "epoch": 36.442875481386395, + "grad_norm": 1.5443367958068848, + "learning_rate": 2.1189987163029526e-05, + "loss": 0.2438, + "step": 28389 + }, + { + "epoch": 36.44415917843389, + "grad_norm": 1.1437058448791504, + "learning_rate": 2.1189559264013694e-05, + "loss": 0.2456, + "step": 28390 + }, + { + "epoch": 36.44544287548138, + "grad_norm": 6.44773530960083, + "learning_rate": 2.1189131364997862e-05, + "loss": 0.2186, + "step": 28391 + }, + { + "epoch": 36.44672657252888, + "grad_norm": 2.1971583366394043, + "learning_rate": 2.1188703465982027e-05, + "loss": 0.2257, + "step": 28392 + }, + { + "epoch": 36.44801026957638, + "grad_norm": 1.2828255891799927, + "learning_rate": 2.11882755669662e-05, + "loss": 0.2891, + "step": 28393 + }, + { + "epoch": 36.44929396662388, + "grad_norm": 2.343381404876709, + "learning_rate": 2.1187847667950364e-05, + "loss": 0.3626, + "step": 28394 + }, + { + "epoch": 36.450577663671375, + "grad_norm": 1.725771188735962, + "learning_rate": 2.118741976893453e-05, + "loss": 0.2443, + "step": 28395 + }, + { + "epoch": 36.45186136071887, + "grad_norm": 0.7664968371391296, + "learning_rate": 2.11869918699187e-05, + "loss": 0.2452, + "step": 28396 + }, + { + "epoch": 36.45314505776637, + "grad_norm": 0.7323307991027832, + "learning_rate": 2.1186563970902866e-05, + "loss": 0.2363, + "step": 28397 + }, + { + "epoch": 36.45442875481386, + "grad_norm": 1.9444619417190552, + "learning_rate": 2.1186136071887038e-05, + "loss": 0.2429, + "step": 28398 + }, + { + "epoch": 36.45571245186136, + "grad_norm": 1.4324934482574463, + "learning_rate": 2.1185708172871203e-05, + "loss": 0.2305, + "step": 28399 + }, + { + "epoch": 36.45699614890886, + "grad_norm": 1.2786463499069214, + "learning_rate": 2.118528027385537e-05, + "loss": 0.2209, + "step": 28400 + }, + { + "epoch": 
36.458279845956355, + "grad_norm": 0.9072051644325256, + "learning_rate": 2.118485237483954e-05, + "loss": 0.2219, + "step": 28401 + }, + { + "epoch": 36.45956354300385, + "grad_norm": 0.8070627450942993, + "learning_rate": 2.1184424475823705e-05, + "loss": 0.214, + "step": 28402 + }, + { + "epoch": 36.46084724005135, + "grad_norm": 1.275738000869751, + "learning_rate": 2.1183996576807873e-05, + "loss": 0.2246, + "step": 28403 + }, + { + "epoch": 36.46213093709884, + "grad_norm": 0.9028812050819397, + "learning_rate": 2.118356867779204e-05, + "loss": 0.2248, + "step": 28404 + }, + { + "epoch": 36.46341463414634, + "grad_norm": 1.0893880128860474, + "learning_rate": 2.118314077877621e-05, + "loss": 0.2408, + "step": 28405 + }, + { + "epoch": 36.46469833119384, + "grad_norm": 1.165018916130066, + "learning_rate": 2.118271287976038e-05, + "loss": 0.2338, + "step": 28406 + }, + { + "epoch": 36.465982028241335, + "grad_norm": 0.7751673460006714, + "learning_rate": 2.1182284980744547e-05, + "loss": 0.2532, + "step": 28407 + }, + { + "epoch": 36.46726572528883, + "grad_norm": 1.1762573719024658, + "learning_rate": 2.1181857081728712e-05, + "loss": 0.2115, + "step": 28408 + }, + { + "epoch": 36.46854942233633, + "grad_norm": 0.8939087986946106, + "learning_rate": 2.118142918271288e-05, + "loss": 0.2319, + "step": 28409 + }, + { + "epoch": 36.46983311938383, + "grad_norm": 0.8440398573875427, + "learning_rate": 2.118100128369705e-05, + "loss": 0.2205, + "step": 28410 + }, + { + "epoch": 36.47111681643132, + "grad_norm": 0.8042082786560059, + "learning_rate": 2.1180573384681214e-05, + "loss": 0.2398, + "step": 28411 + }, + { + "epoch": 36.47240051347882, + "grad_norm": 1.2263439893722534, + "learning_rate": 2.1180145485665385e-05, + "loss": 0.2036, + "step": 28412 + }, + { + "epoch": 36.473684210526315, + "grad_norm": 0.8407712578773499, + "learning_rate": 2.117971758664955e-05, + "loss": 0.2207, + "step": 28413 + }, + { + "epoch": 36.47496790757381, + "grad_norm": 
1.0814040899276733, + "learning_rate": 2.117928968763372e-05, + "loss": 0.2244, + "step": 28414 + }, + { + "epoch": 36.47625160462131, + "grad_norm": 1.561684012413025, + "learning_rate": 2.1178861788617887e-05, + "loss": 0.193, + "step": 28415 + }, + { + "epoch": 36.47753530166881, + "grad_norm": 1.706999659538269, + "learning_rate": 2.1178433889602052e-05, + "loss": 0.2414, + "step": 28416 + }, + { + "epoch": 36.47881899871631, + "grad_norm": 1.291022777557373, + "learning_rate": 2.1178005990586224e-05, + "loss": 0.2229, + "step": 28417 + }, + { + "epoch": 36.4801026957638, + "grad_norm": 0.893376350402832, + "learning_rate": 2.117757809157039e-05, + "loss": 0.2044, + "step": 28418 + }, + { + "epoch": 36.481386392811295, + "grad_norm": 1.0046049356460571, + "learning_rate": 2.1177150192554558e-05, + "loss": 0.2128, + "step": 28419 + }, + { + "epoch": 36.48267008985879, + "grad_norm": 1.1531622409820557, + "learning_rate": 2.1176722293538726e-05, + "loss": 0.2167, + "step": 28420 + }, + { + "epoch": 36.48395378690629, + "grad_norm": 0.8087580800056458, + "learning_rate": 2.117629439452289e-05, + "loss": 0.2168, + "step": 28421 + }, + { + "epoch": 36.48523748395379, + "grad_norm": 0.8698473572731018, + "learning_rate": 2.1175866495507063e-05, + "loss": 0.1858, + "step": 28422 + }, + { + "epoch": 36.48652118100129, + "grad_norm": 1.8139574527740479, + "learning_rate": 2.1175438596491228e-05, + "loss": 0.1908, + "step": 28423 + }, + { + "epoch": 36.48780487804878, + "grad_norm": 2.5053703784942627, + "learning_rate": 2.1175010697475396e-05, + "loss": 0.2068, + "step": 28424 + }, + { + "epoch": 36.489088575096275, + "grad_norm": 7.960709571838379, + "learning_rate": 2.1174582798459565e-05, + "loss": 0.2098, + "step": 28425 + }, + { + "epoch": 36.49037227214377, + "grad_norm": 1.122437596321106, + "learning_rate": 2.1174154899443733e-05, + "loss": 0.2106, + "step": 28426 + }, + { + "epoch": 36.49165596919127, + "grad_norm": 1.0934782028198242, + "learning_rate": 
2.1173727000427898e-05, + "loss": 0.2185, + "step": 28427 + }, + { + "epoch": 36.49293966623877, + "grad_norm": 1.0440306663513184, + "learning_rate": 2.1173299101412067e-05, + "loss": 0.1994, + "step": 28428 + }, + { + "epoch": 36.49422336328627, + "grad_norm": 2.203707456588745, + "learning_rate": 2.1172871202396235e-05, + "loss": 0.2243, + "step": 28429 + }, + { + "epoch": 36.495507060333765, + "grad_norm": 1.917109489440918, + "learning_rate": 2.1172443303380403e-05, + "loss": 0.2172, + "step": 28430 + }, + { + "epoch": 36.496790757381255, + "grad_norm": 0.9782662391662598, + "learning_rate": 2.1172015404364572e-05, + "loss": 0.2281, + "step": 28431 + }, + { + "epoch": 36.49807445442875, + "grad_norm": 1.1293398141860962, + "learning_rate": 2.1171587505348737e-05, + "loss": 0.207, + "step": 28432 + }, + { + "epoch": 36.49935815147625, + "grad_norm": 1.1836858987808228, + "learning_rate": 2.117115960633291e-05, + "loss": 0.2324, + "step": 28433 + }, + { + "epoch": 36.50064184852375, + "grad_norm": 0.9800149202346802, + "learning_rate": 2.1170731707317074e-05, + "loss": 0.197, + "step": 28434 + }, + { + "epoch": 36.50192554557125, + "grad_norm": 5.2865424156188965, + "learning_rate": 2.117030380830124e-05, + "loss": 0.232, + "step": 28435 + }, + { + "epoch": 36.503209242618745, + "grad_norm": 1.6635432243347168, + "learning_rate": 2.116987590928541e-05, + "loss": 0.2214, + "step": 28436 + }, + { + "epoch": 36.504492939666235, + "grad_norm": 2.2130179405212402, + "learning_rate": 2.1169448010269575e-05, + "loss": 0.2348, + "step": 28437 + }, + { + "epoch": 36.50577663671373, + "grad_norm": 1.316657304763794, + "learning_rate": 2.1169020111253747e-05, + "loss": 0.2351, + "step": 28438 + }, + { + "epoch": 36.50706033376123, + "grad_norm": 1.3814165592193604, + "learning_rate": 2.1168592212237912e-05, + "loss": 0.2419, + "step": 28439 + }, + { + "epoch": 36.50834403080873, + "grad_norm": 1.293415904045105, + "learning_rate": 2.116816431322208e-05, + "loss": 0.2211, + 
"step": 28440 + }, + { + "epoch": 36.50962772785623, + "grad_norm": 1.118712306022644, + "learning_rate": 2.116773641420625e-05, + "loss": 0.2387, + "step": 28441 + }, + { + "epoch": 36.510911424903725, + "grad_norm": 1.2606884241104126, + "learning_rate": 2.1167308515190414e-05, + "loss": 0.2225, + "step": 28442 + }, + { + "epoch": 36.51219512195122, + "grad_norm": 1.4263205528259277, + "learning_rate": 2.1166880616174583e-05, + "loss": 0.2755, + "step": 28443 + }, + { + "epoch": 36.51347881899871, + "grad_norm": 1.755216121673584, + "learning_rate": 2.116645271715875e-05, + "loss": 0.355, + "step": 28444 + }, + { + "epoch": 36.51476251604621, + "grad_norm": 2.216844320297241, + "learning_rate": 2.116602481814292e-05, + "loss": 0.2362, + "step": 28445 + }, + { + "epoch": 36.51604621309371, + "grad_norm": 0.902845561504364, + "learning_rate": 2.1165596919127088e-05, + "loss": 0.2304, + "step": 28446 + }, + { + "epoch": 36.51732991014121, + "grad_norm": 1.0132689476013184, + "learning_rate": 2.1165169020111256e-05, + "loss": 0.2424, + "step": 28447 + }, + { + "epoch": 36.518613607188705, + "grad_norm": 0.7311301827430725, + "learning_rate": 2.116474112109542e-05, + "loss": 0.2628, + "step": 28448 + }, + { + "epoch": 36.5198973042362, + "grad_norm": 2.2439041137695312, + "learning_rate": 2.116431322207959e-05, + "loss": 0.249, + "step": 28449 + }, + { + "epoch": 36.52118100128369, + "grad_norm": 0.915645182132721, + "learning_rate": 2.1163885323063758e-05, + "loss": 0.2499, + "step": 28450 + }, + { + "epoch": 36.52246469833119, + "grad_norm": 0.8310260772705078, + "learning_rate": 2.1163457424047923e-05, + "loss": 0.2121, + "step": 28451 + }, + { + "epoch": 36.52374839537869, + "grad_norm": 0.9098817110061646, + "learning_rate": 2.1163029525032095e-05, + "loss": 0.2422, + "step": 28452 + }, + { + "epoch": 36.52503209242619, + "grad_norm": 0.81654953956604, + "learning_rate": 2.116260162601626e-05, + "loss": 0.2284, + "step": 28453 + }, + { + "epoch": 
36.526315789473685, + "grad_norm": 1.1478180885314941, + "learning_rate": 2.116217372700043e-05, + "loss": 0.2354, + "step": 28454 + }, + { + "epoch": 36.52759948652118, + "grad_norm": 1.2908750772476196, + "learning_rate": 2.1161745827984597e-05, + "loss": 0.2196, + "step": 28455 + }, + { + "epoch": 36.52888318356868, + "grad_norm": 0.9251888394355774, + "learning_rate": 2.1161317928968762e-05, + "loss": 0.2373, + "step": 28456 + }, + { + "epoch": 36.53016688061617, + "grad_norm": 0.7394933104515076, + "learning_rate": 2.1160890029952934e-05, + "loss": 0.2269, + "step": 28457 + }, + { + "epoch": 36.53145057766367, + "grad_norm": 0.8793061375617981, + "learning_rate": 2.11604621309371e-05, + "loss": 0.2222, + "step": 28458 + }, + { + "epoch": 36.53273427471117, + "grad_norm": 0.7973064184188843, + "learning_rate": 2.1160034231921267e-05, + "loss": 0.2179, + "step": 28459 + }, + { + "epoch": 36.534017971758665, + "grad_norm": 0.7145177125930786, + "learning_rate": 2.1159606332905435e-05, + "loss": 0.2196, + "step": 28460 + }, + { + "epoch": 36.53530166880616, + "grad_norm": 1.9284660816192627, + "learning_rate": 2.1159178433889604e-05, + "loss": 0.2365, + "step": 28461 + }, + { + "epoch": 36.53658536585366, + "grad_norm": 0.8359377384185791, + "learning_rate": 2.1158750534873772e-05, + "loss": 0.2175, + "step": 28462 + }, + { + "epoch": 36.53786906290116, + "grad_norm": 1.2463860511779785, + "learning_rate": 2.1158322635857937e-05, + "loss": 0.2023, + "step": 28463 + }, + { + "epoch": 36.53915275994865, + "grad_norm": 1.103556513786316, + "learning_rate": 2.1157894736842106e-05, + "loss": 0.2103, + "step": 28464 + }, + { + "epoch": 36.54043645699615, + "grad_norm": 1.4442058801651, + "learning_rate": 2.1157466837826274e-05, + "loss": 0.2099, + "step": 28465 + }, + { + "epoch": 36.541720154043645, + "grad_norm": 1.2101784944534302, + "learning_rate": 2.1157038938810442e-05, + "loss": 0.2058, + "step": 28466 + }, + { + "epoch": 36.54300385109114, + "grad_norm": 
2.1973984241485596, + "learning_rate": 2.1156611039794607e-05, + "loss": 0.2429, + "step": 28467 + }, + { + "epoch": 36.54428754813864, + "grad_norm": 3.464258909225464, + "learning_rate": 2.115618314077878e-05, + "loss": 0.2009, + "step": 28468 + }, + { + "epoch": 36.54557124518614, + "grad_norm": 0.9312631487846375, + "learning_rate": 2.1155755241762944e-05, + "loss": 0.2043, + "step": 28469 + }, + { + "epoch": 36.54685494223363, + "grad_norm": 1.0853825807571411, + "learning_rate": 2.115532734274711e-05, + "loss": 0.2131, + "step": 28470 + }, + { + "epoch": 36.54813863928113, + "grad_norm": 0.9780134558677673, + "learning_rate": 2.115489944373128e-05, + "loss": 0.2051, + "step": 28471 + }, + { + "epoch": 36.549422336328625, + "grad_norm": 1.0741766691207886, + "learning_rate": 2.1154471544715446e-05, + "loss": 0.1876, + "step": 28472 + }, + { + "epoch": 36.55070603337612, + "grad_norm": 1.8128478527069092, + "learning_rate": 2.1154043645699618e-05, + "loss": 0.2053, + "step": 28473 + }, + { + "epoch": 36.55198973042362, + "grad_norm": 2.3314709663391113, + "learning_rate": 2.1153615746683783e-05, + "loss": 0.1865, + "step": 28474 + }, + { + "epoch": 36.55327342747112, + "grad_norm": 1.0581690073013306, + "learning_rate": 2.1153187847667948e-05, + "loss": 0.2261, + "step": 28475 + }, + { + "epoch": 36.55455712451862, + "grad_norm": 1.7133220434188843, + "learning_rate": 2.115275994865212e-05, + "loss": 0.1947, + "step": 28476 + }, + { + "epoch": 36.55584082156611, + "grad_norm": 1.290266513824463, + "learning_rate": 2.1152332049636285e-05, + "loss": 0.2273, + "step": 28477 + }, + { + "epoch": 36.557124518613605, + "grad_norm": 2.1429073810577393, + "learning_rate": 2.1151904150620453e-05, + "loss": 0.216, + "step": 28478 + }, + { + "epoch": 36.5584082156611, + "grad_norm": 1.2718453407287598, + "learning_rate": 2.115147625160462e-05, + "loss": 0.2083, + "step": 28479 + }, + { + "epoch": 36.5596919127086, + "grad_norm": 1.1089800596237183, + "learning_rate": 
2.115104835258879e-05, + "loss": 0.2161, + "step": 28480 + }, + { + "epoch": 36.5609756097561, + "grad_norm": 1.2445261478424072, + "learning_rate": 2.115062045357296e-05, + "loss": 0.2051, + "step": 28481 + }, + { + "epoch": 36.5622593068036, + "grad_norm": 1.5490652322769165, + "learning_rate": 2.1150192554557123e-05, + "loss": 0.1765, + "step": 28482 + }, + { + "epoch": 36.563543003851095, + "grad_norm": 2.0389645099639893, + "learning_rate": 2.1149764655541292e-05, + "loss": 0.2715, + "step": 28483 + }, + { + "epoch": 36.564826700898585, + "grad_norm": 1.160561203956604, + "learning_rate": 2.114933675652546e-05, + "loss": 0.2182, + "step": 28484 + }, + { + "epoch": 36.56611039794608, + "grad_norm": 1.0295790433883667, + "learning_rate": 2.114890885750963e-05, + "loss": 0.1916, + "step": 28485 + }, + { + "epoch": 36.56739409499358, + "grad_norm": 1.2861613035202026, + "learning_rate": 2.1148480958493794e-05, + "loss": 0.2228, + "step": 28486 + }, + { + "epoch": 36.56867779204108, + "grad_norm": 1.3752753734588623, + "learning_rate": 2.1148053059477966e-05, + "loss": 0.1993, + "step": 28487 + }, + { + "epoch": 36.56996148908858, + "grad_norm": 1.0930485725402832, + "learning_rate": 2.114762516046213e-05, + "loss": 0.2011, + "step": 28488 + }, + { + "epoch": 36.571245186136075, + "grad_norm": 1.7419110536575317, + "learning_rate": 2.11471972614463e-05, + "loss": 0.2153, + "step": 28489 + }, + { + "epoch": 36.572528883183566, + "grad_norm": 2.0240015983581543, + "learning_rate": 2.1146769362430467e-05, + "loss": 0.2428, + "step": 28490 + }, + { + "epoch": 36.57381258023106, + "grad_norm": 2.2668442726135254, + "learning_rate": 2.1146341463414632e-05, + "loss": 0.2231, + "step": 28491 + }, + { + "epoch": 36.57509627727856, + "grad_norm": 1.35321843624115, + "learning_rate": 2.1145913564398804e-05, + "loss": 0.2812, + "step": 28492 + }, + { + "epoch": 36.57637997432606, + "grad_norm": 2.9475409984588623, + "learning_rate": 2.114548566538297e-05, + "loss": 0.3013, + 
"step": 28493 + }, + { + "epoch": 36.57766367137356, + "grad_norm": 1.9192661046981812, + "learning_rate": 2.1145057766367138e-05, + "loss": 0.3616, + "step": 28494 + }, + { + "epoch": 36.578947368421055, + "grad_norm": 1.2571847438812256, + "learning_rate": 2.1144629867351306e-05, + "loss": 0.2535, + "step": 28495 + }, + { + "epoch": 36.58023106546855, + "grad_norm": 1.0470986366271973, + "learning_rate": 2.114420196833547e-05, + "loss": 0.2508, + "step": 28496 + }, + { + "epoch": 36.58151476251604, + "grad_norm": 0.9324575662612915, + "learning_rate": 2.1143774069319643e-05, + "loss": 0.2503, + "step": 28497 + }, + { + "epoch": 36.58279845956354, + "grad_norm": 0.871974527835846, + "learning_rate": 2.1143346170303808e-05, + "loss": 0.2253, + "step": 28498 + }, + { + "epoch": 36.58408215661104, + "grad_norm": 0.7591220736503601, + "learning_rate": 2.1142918271287976e-05, + "loss": 0.2183, + "step": 28499 + }, + { + "epoch": 36.58536585365854, + "grad_norm": 0.7967252731323242, + "learning_rate": 2.1142490372272145e-05, + "loss": 0.2493, + "step": 28500 + }, + { + "epoch": 36.586649550706035, + "grad_norm": 3.3677866458892822, + "learning_rate": 2.1142062473256313e-05, + "loss": 0.2306, + "step": 28501 + }, + { + "epoch": 36.58793324775353, + "grad_norm": 0.95449298620224, + "learning_rate": 2.1141634574240478e-05, + "loss": 0.2255, + "step": 28502 + }, + { + "epoch": 36.589216944801024, + "grad_norm": 0.8243200778961182, + "learning_rate": 2.1141206675224647e-05, + "loss": 0.2283, + "step": 28503 + }, + { + "epoch": 36.59050064184852, + "grad_norm": 1.3603825569152832, + "learning_rate": 2.1140778776208815e-05, + "loss": 0.2404, + "step": 28504 + }, + { + "epoch": 36.59178433889602, + "grad_norm": 0.8358678817749023, + "learning_rate": 2.1140350877192983e-05, + "loss": 0.2234, + "step": 28505 + }, + { + "epoch": 36.59306803594352, + "grad_norm": 1.1060073375701904, + "learning_rate": 2.1139922978177152e-05, + "loss": 0.2122, + "step": 28506 + }, + { + "epoch": 
36.594351732991015, + "grad_norm": 1.5875771045684814, + "learning_rate": 2.1139495079161317e-05, + "loss": 0.2103, + "step": 28507 + }, + { + "epoch": 36.59563543003851, + "grad_norm": 1.6804735660552979, + "learning_rate": 2.113906718014549e-05, + "loss": 0.2279, + "step": 28508 + }, + { + "epoch": 36.59691912708601, + "grad_norm": 2.09652042388916, + "learning_rate": 2.1138639281129654e-05, + "loss": 0.2072, + "step": 28509 + }, + { + "epoch": 36.5982028241335, + "grad_norm": 1.69742751121521, + "learning_rate": 2.113821138211382e-05, + "loss": 0.2487, + "step": 28510 + }, + { + "epoch": 36.599486521181, + "grad_norm": 0.8214726448059082, + "learning_rate": 2.113778348309799e-05, + "loss": 0.2351, + "step": 28511 + }, + { + "epoch": 36.6007702182285, + "grad_norm": 1.3569928407669067, + "learning_rate": 2.1137355584082156e-05, + "loss": 0.211, + "step": 28512 + }, + { + "epoch": 36.602053915275995, + "grad_norm": 2.6892268657684326, + "learning_rate": 2.1136927685066327e-05, + "loss": 0.2101, + "step": 28513 + }, + { + "epoch": 36.60333761232349, + "grad_norm": 0.7445815801620483, + "learning_rate": 2.1136499786050492e-05, + "loss": 0.2101, + "step": 28514 + }, + { + "epoch": 36.60462130937099, + "grad_norm": 0.6994360685348511, + "learning_rate": 2.113607188703466e-05, + "loss": 0.199, + "step": 28515 + }, + { + "epoch": 36.60590500641848, + "grad_norm": 1.1082807779312134, + "learning_rate": 2.113564398801883e-05, + "loss": 0.213, + "step": 28516 + }, + { + "epoch": 36.60718870346598, + "grad_norm": 1.0124824047088623, + "learning_rate": 2.1135216089002994e-05, + "loss": 0.2163, + "step": 28517 + }, + { + "epoch": 36.60847240051348, + "grad_norm": 0.8430065512657166, + "learning_rate": 2.1134788189987163e-05, + "loss": 0.2148, + "step": 28518 + }, + { + "epoch": 36.609756097560975, + "grad_norm": 2.654282331466675, + "learning_rate": 2.113436029097133e-05, + "loss": 0.2309, + "step": 28519 + }, + { + "epoch": 36.61103979460847, + "grad_norm": 
5.470390319824219, + "learning_rate": 2.11339323919555e-05, + "loss": 0.2152, + "step": 28520 + }, + { + "epoch": 36.61232349165597, + "grad_norm": 1.4427783489227295, + "learning_rate": 2.1133504492939668e-05, + "loss": 0.1908, + "step": 28521 + }, + { + "epoch": 36.61360718870347, + "grad_norm": 0.9567577838897705, + "learning_rate": 2.1133076593923836e-05, + "loss": 0.2173, + "step": 28522 + }, + { + "epoch": 36.61489088575096, + "grad_norm": 2.48689603805542, + "learning_rate": 2.1132648694908e-05, + "loss": 0.2115, + "step": 28523 + }, + { + "epoch": 36.61617458279846, + "grad_norm": 1.6873468160629272, + "learning_rate": 2.113222079589217e-05, + "loss": 0.2371, + "step": 28524 + }, + { + "epoch": 36.617458279845955, + "grad_norm": 3.566288948059082, + "learning_rate": 2.1131792896876338e-05, + "loss": 0.2348, + "step": 28525 + }, + { + "epoch": 36.61874197689345, + "grad_norm": 1.1537243127822876, + "learning_rate": 2.1131364997860503e-05, + "loss": 0.2125, + "step": 28526 + }, + { + "epoch": 36.62002567394095, + "grad_norm": 1.0679932832717896, + "learning_rate": 2.1130937098844675e-05, + "loss": 0.2111, + "step": 28527 + }, + { + "epoch": 36.62130937098845, + "grad_norm": 0.9074884057044983, + "learning_rate": 2.113050919982884e-05, + "loss": 0.1965, + "step": 28528 + }, + { + "epoch": 36.62259306803595, + "grad_norm": 1.6388354301452637, + "learning_rate": 2.1130081300813012e-05, + "loss": 0.2029, + "step": 28529 + }, + { + "epoch": 36.62387676508344, + "grad_norm": 1.4381062984466553, + "learning_rate": 2.1129653401797177e-05, + "loss": 0.2467, + "step": 28530 + }, + { + "epoch": 36.625160462130935, + "grad_norm": 1.3680927753448486, + "learning_rate": 2.1129225502781342e-05, + "loss": 0.207, + "step": 28531 + }, + { + "epoch": 36.62644415917843, + "grad_norm": 1.827917456626892, + "learning_rate": 2.1128797603765514e-05, + "loss": 0.2551, + "step": 28532 + }, + { + "epoch": 36.62772785622593, + "grad_norm": 2.24221134185791, + "learning_rate": 
2.112836970474968e-05, + "loss": 0.2003, + "step": 28533 + }, + { + "epoch": 36.62901155327343, + "grad_norm": 1.4159162044525146, + "learning_rate": 2.1127941805733847e-05, + "loss": 0.1962, + "step": 28534 + }, + { + "epoch": 36.63029525032093, + "grad_norm": 1.6726628541946411, + "learning_rate": 2.1127513906718015e-05, + "loss": 0.2401, + "step": 28535 + }, + { + "epoch": 36.63157894736842, + "grad_norm": 3.4521656036376953, + "learning_rate": 2.112708600770218e-05, + "loss": 0.2581, + "step": 28536 + }, + { + "epoch": 36.632862644415916, + "grad_norm": 1.1315319538116455, + "learning_rate": 2.1126658108686352e-05, + "loss": 0.2194, + "step": 28537 + }, + { + "epoch": 36.63414634146341, + "grad_norm": 2.5930137634277344, + "learning_rate": 2.1126230209670517e-05, + "loss": 0.2378, + "step": 28538 + }, + { + "epoch": 36.63543003851091, + "grad_norm": 2.0588924884796143, + "learning_rate": 2.1125802310654686e-05, + "loss": 0.213, + "step": 28539 + }, + { + "epoch": 36.63671373555841, + "grad_norm": 3.0358569622039795, + "learning_rate": 2.1125374411638854e-05, + "loss": 0.2218, + "step": 28540 + }, + { + "epoch": 36.63799743260591, + "grad_norm": 1.4682703018188477, + "learning_rate": 2.1124946512623023e-05, + "loss": 0.2533, + "step": 28541 + }, + { + "epoch": 36.639281129653405, + "grad_norm": 1.7327880859375, + "learning_rate": 2.1124518613607188e-05, + "loss": 0.2839, + "step": 28542 + }, + { + "epoch": 36.640564826700896, + "grad_norm": 1.5962680578231812, + "learning_rate": 2.1124090714591356e-05, + "loss": 0.3217, + "step": 28543 + }, + { + "epoch": 36.64184852374839, + "grad_norm": 3.712763786315918, + "learning_rate": 2.1123662815575524e-05, + "loss": 0.3809, + "step": 28544 + }, + { + "epoch": 36.64313222079589, + "grad_norm": 0.9666699767112732, + "learning_rate": 2.1123234916559693e-05, + "loss": 0.2429, + "step": 28545 + }, + { + "epoch": 36.64441591784339, + "grad_norm": 0.8806714415550232, + "learning_rate": 2.112280701754386e-05, + "loss": 0.2419, 
+ "step": 28546 + }, + { + "epoch": 36.64569961489089, + "grad_norm": 0.9136018753051758, + "learning_rate": 2.1122379118528026e-05, + "loss": 0.2435, + "step": 28547 + }, + { + "epoch": 36.646983311938385, + "grad_norm": 1.0306084156036377, + "learning_rate": 2.1121951219512198e-05, + "loss": 0.2545, + "step": 28548 + }, + { + "epoch": 36.64826700898588, + "grad_norm": 0.9469505548477173, + "learning_rate": 2.1121523320496363e-05, + "loss": 0.2365, + "step": 28549 + }, + { + "epoch": 36.649550706033374, + "grad_norm": 0.8859641551971436, + "learning_rate": 2.1121095421480528e-05, + "loss": 0.2456, + "step": 28550 + }, + { + "epoch": 36.65083440308087, + "grad_norm": 1.7821584939956665, + "learning_rate": 2.11206675224647e-05, + "loss": 0.2307, + "step": 28551 + }, + { + "epoch": 36.65211810012837, + "grad_norm": 2.008071184158325, + "learning_rate": 2.1120239623448865e-05, + "loss": 0.2067, + "step": 28552 + }, + { + "epoch": 36.65340179717587, + "grad_norm": 0.7512043118476868, + "learning_rate": 2.1119811724433037e-05, + "loss": 0.2416, + "step": 28553 + }, + { + "epoch": 36.654685494223365, + "grad_norm": 0.8503357172012329, + "learning_rate": 2.11193838254172e-05, + "loss": 0.2207, + "step": 28554 + }, + { + "epoch": 36.65596919127086, + "grad_norm": 1.1943681240081787, + "learning_rate": 2.111895592640137e-05, + "loss": 0.2094, + "step": 28555 + }, + { + "epoch": 36.657252888318354, + "grad_norm": 1.310470700263977, + "learning_rate": 2.111852802738554e-05, + "loss": 0.2072, + "step": 28556 + }, + { + "epoch": 36.65853658536585, + "grad_norm": 1.4827085733413696, + "learning_rate": 2.1118100128369704e-05, + "loss": 0.2348, + "step": 28557 + }, + { + "epoch": 36.65982028241335, + "grad_norm": 2.088935613632202, + "learning_rate": 2.1117672229353872e-05, + "loss": 0.2594, + "step": 28558 + }, + { + "epoch": 36.66110397946085, + "grad_norm": 1.3429337739944458, + "learning_rate": 2.111724433033804e-05, + "loss": 0.2409, + "step": 28559 + }, + { + "epoch": 
36.662387676508345, + "grad_norm": 1.3167951107025146, + "learning_rate": 2.111681643132221e-05, + "loss": 0.2039, + "step": 28560 + }, + { + "epoch": 36.66367137355584, + "grad_norm": 0.7619811296463013, + "learning_rate": 2.1116388532306377e-05, + "loss": 0.2043, + "step": 28561 + }, + { + "epoch": 36.66495507060334, + "grad_norm": 0.8679265975952148, + "learning_rate": 2.1115960633290546e-05, + "loss": 0.1734, + "step": 28562 + }, + { + "epoch": 36.66623876765083, + "grad_norm": 1.0426665544509888, + "learning_rate": 2.111553273427471e-05, + "loss": 0.225, + "step": 28563 + }, + { + "epoch": 36.66752246469833, + "grad_norm": 1.026552677154541, + "learning_rate": 2.111510483525888e-05, + "loss": 0.2286, + "step": 28564 + }, + { + "epoch": 36.66880616174583, + "grad_norm": 1.1809580326080322, + "learning_rate": 2.1114676936243047e-05, + "loss": 0.2158, + "step": 28565 + }, + { + "epoch": 36.670089858793325, + "grad_norm": 1.1183985471725464, + "learning_rate": 2.1114249037227212e-05, + "loss": 0.2117, + "step": 28566 + }, + { + "epoch": 36.67137355584082, + "grad_norm": 1.0968776941299438, + "learning_rate": 2.1113821138211384e-05, + "loss": 0.2293, + "step": 28567 + }, + { + "epoch": 36.67265725288832, + "grad_norm": 2.43636417388916, + "learning_rate": 2.111339323919555e-05, + "loss": 0.2061, + "step": 28568 + }, + { + "epoch": 36.67394094993581, + "grad_norm": 1.5493943691253662, + "learning_rate": 2.111296534017972e-05, + "loss": 0.2014, + "step": 28569 + }, + { + "epoch": 36.67522464698331, + "grad_norm": 1.100367546081543, + "learning_rate": 2.1112537441163886e-05, + "loss": 0.2255, + "step": 28570 + }, + { + "epoch": 36.67650834403081, + "grad_norm": 1.1048665046691895, + "learning_rate": 2.111210954214805e-05, + "loss": 0.2271, + "step": 28571 + }, + { + "epoch": 36.677792041078305, + "grad_norm": 0.9435204863548279, + "learning_rate": 2.1111681643132223e-05, + "loss": 0.1986, + "step": 28572 + }, + { + "epoch": 36.6790757381258, + "grad_norm": 
1.0633294582366943, + "learning_rate": 2.1111253744116388e-05, + "loss": 0.218, + "step": 28573 + }, + { + "epoch": 36.6803594351733, + "grad_norm": 0.997121274471283, + "learning_rate": 2.1110825845100556e-05, + "loss": 0.2171, + "step": 28574 + }, + { + "epoch": 36.6816431322208, + "grad_norm": 0.7871102094650269, + "learning_rate": 2.1110397946084725e-05, + "loss": 0.2044, + "step": 28575 + }, + { + "epoch": 36.68292682926829, + "grad_norm": 1.998975396156311, + "learning_rate": 2.1109970047068893e-05, + "loss": 0.1857, + "step": 28576 + }, + { + "epoch": 36.68421052631579, + "grad_norm": 1.2900540828704834, + "learning_rate": 2.110954214805306e-05, + "loss": 0.1887, + "step": 28577 + }, + { + "epoch": 36.685494223363285, + "grad_norm": 1.5354840755462646, + "learning_rate": 2.1109114249037227e-05, + "loss": 0.2263, + "step": 28578 + }, + { + "epoch": 36.68677792041078, + "grad_norm": 1.1374311447143555, + "learning_rate": 2.1108686350021395e-05, + "loss": 0.2589, + "step": 28579 + }, + { + "epoch": 36.68806161745828, + "grad_norm": 1.2699000835418701, + "learning_rate": 2.1108258451005563e-05, + "loss": 0.2182, + "step": 28580 + }, + { + "epoch": 36.68934531450578, + "grad_norm": 1.142890453338623, + "learning_rate": 2.1107830551989732e-05, + "loss": 0.2394, + "step": 28581 + }, + { + "epoch": 36.69062901155327, + "grad_norm": 1.4607539176940918, + "learning_rate": 2.1107402652973897e-05, + "loss": 0.1965, + "step": 28582 + }, + { + "epoch": 36.69191270860077, + "grad_norm": 2.6440963745117188, + "learning_rate": 2.110697475395807e-05, + "loss": 0.2074, + "step": 28583 + }, + { + "epoch": 36.693196405648266, + "grad_norm": 1.0414977073669434, + "learning_rate": 2.1106546854942234e-05, + "loss": 0.2077, + "step": 28584 + }, + { + "epoch": 36.69448010269576, + "grad_norm": 3.8253562450408936, + "learning_rate": 2.1106118955926402e-05, + "loss": 0.2184, + "step": 28585 + }, + { + "epoch": 36.69576379974326, + "grad_norm": 1.4860200881958008, + "learning_rate": 
2.110569105691057e-05, + "loss": 0.2106, + "step": 28586 + }, + { + "epoch": 36.69704749679076, + "grad_norm": 1.2207190990447998, + "learning_rate": 2.1105263157894736e-05, + "loss": 0.2087, + "step": 28587 + }, + { + "epoch": 36.69833119383826, + "grad_norm": 1.8054546117782593, + "learning_rate": 2.1104835258878907e-05, + "loss": 0.2383, + "step": 28588 + }, + { + "epoch": 36.69961489088575, + "grad_norm": 2.0001890659332275, + "learning_rate": 2.1104407359863072e-05, + "loss": 0.2227, + "step": 28589 + }, + { + "epoch": 36.700898587933246, + "grad_norm": 2.0649523735046387, + "learning_rate": 2.110397946084724e-05, + "loss": 0.2941, + "step": 28590 + }, + { + "epoch": 36.70218228498074, + "grad_norm": 1.213036298751831, + "learning_rate": 2.110355156183141e-05, + "loss": 0.2237, + "step": 28591 + }, + { + "epoch": 36.70346598202824, + "grad_norm": 3.5708703994750977, + "learning_rate": 2.1103123662815574e-05, + "loss": 0.2758, + "step": 28592 + }, + { + "epoch": 36.70474967907574, + "grad_norm": 2.6870639324188232, + "learning_rate": 2.1102695763799746e-05, + "loss": 0.2962, + "step": 28593 + }, + { + "epoch": 36.70603337612324, + "grad_norm": 5.65285062789917, + "learning_rate": 2.110226786478391e-05, + "loss": 0.4576, + "step": 28594 + }, + { + "epoch": 36.707317073170735, + "grad_norm": 1.1216455698013306, + "learning_rate": 2.110183996576808e-05, + "loss": 0.238, + "step": 28595 + }, + { + "epoch": 36.708600770218226, + "grad_norm": 1.1735316514968872, + "learning_rate": 2.1101412066752248e-05, + "loss": 0.2411, + "step": 28596 + }, + { + "epoch": 36.709884467265724, + "grad_norm": 3.267461061477661, + "learning_rate": 2.1100984167736413e-05, + "loss": 0.2404, + "step": 28597 + }, + { + "epoch": 36.71116816431322, + "grad_norm": 3.865992307662964, + "learning_rate": 2.110055626872058e-05, + "loss": 0.2625, + "step": 28598 + }, + { + "epoch": 36.71245186136072, + "grad_norm": 1.8655588626861572, + "learning_rate": 2.110012836970475e-05, + "loss": 0.2328, + 
"step": 28599 + }, + { + "epoch": 36.71373555840822, + "grad_norm": 1.1660398244857788, + "learning_rate": 2.1099700470688918e-05, + "loss": 0.2275, + "step": 28600 + }, + { + "epoch": 36.715019255455715, + "grad_norm": 0.8582912683486938, + "learning_rate": 2.1099272571673087e-05, + "loss": 0.245, + "step": 28601 + }, + { + "epoch": 36.716302952503206, + "grad_norm": 0.7122206687927246, + "learning_rate": 2.1098844672657255e-05, + "loss": 0.2232, + "step": 28602 + }, + { + "epoch": 36.717586649550704, + "grad_norm": 1.2183659076690674, + "learning_rate": 2.109841677364142e-05, + "loss": 0.2444, + "step": 28603 + }, + { + "epoch": 36.7188703465982, + "grad_norm": 1.0916309356689453, + "learning_rate": 2.109798887462559e-05, + "loss": 0.2278, + "step": 28604 + }, + { + "epoch": 36.7201540436457, + "grad_norm": 1.0329896211624146, + "learning_rate": 2.1097560975609757e-05, + "loss": 0.2402, + "step": 28605 + }, + { + "epoch": 36.7214377406932, + "grad_norm": 1.1540385484695435, + "learning_rate": 2.1097133076593922e-05, + "loss": 0.2574, + "step": 28606 + }, + { + "epoch": 36.722721437740695, + "grad_norm": 2.506667375564575, + "learning_rate": 2.1096705177578094e-05, + "loss": 0.2357, + "step": 28607 + }, + { + "epoch": 36.72400513478819, + "grad_norm": 1.0994930267333984, + "learning_rate": 2.109627727856226e-05, + "loss": 0.2411, + "step": 28608 + }, + { + "epoch": 36.725288831835684, + "grad_norm": 0.9694913029670715, + "learning_rate": 2.109584937954643e-05, + "loss": 0.2359, + "step": 28609 + }, + { + "epoch": 36.72657252888318, + "grad_norm": 1.0111500024795532, + "learning_rate": 2.1095421480530595e-05, + "loss": 0.246, + "step": 28610 + }, + { + "epoch": 36.72785622593068, + "grad_norm": 3.89347243309021, + "learning_rate": 2.109499358151476e-05, + "loss": 0.2343, + "step": 28611 + }, + { + "epoch": 36.72913992297818, + "grad_norm": 0.8848824501037598, + "learning_rate": 2.1094565682498932e-05, + "loss": 0.2104, + "step": 28612 + }, + { + "epoch": 
36.730423620025675, + "grad_norm": 1.753208041191101, + "learning_rate": 2.1094137783483097e-05, + "loss": 0.2147, + "step": 28613 + }, + { + "epoch": 36.73170731707317, + "grad_norm": 1.0030696392059326, + "learning_rate": 2.1093709884467266e-05, + "loss": 0.2211, + "step": 28614 + }, + { + "epoch": 36.73299101412067, + "grad_norm": 1.553722620010376, + "learning_rate": 2.1093281985451434e-05, + "loss": 0.2351, + "step": 28615 + }, + { + "epoch": 36.73427471116816, + "grad_norm": 1.0488333702087402, + "learning_rate": 2.1092854086435603e-05, + "loss": 0.2162, + "step": 28616 + }, + { + "epoch": 36.73555840821566, + "grad_norm": 1.0848748683929443, + "learning_rate": 2.109242618741977e-05, + "loss": 0.2279, + "step": 28617 + }, + { + "epoch": 36.73684210526316, + "grad_norm": 0.9142143130302429, + "learning_rate": 2.1091998288403936e-05, + "loss": 0.1927, + "step": 28618 + }, + { + "epoch": 36.738125802310655, + "grad_norm": 1.9206219911575317, + "learning_rate": 2.1091570389388104e-05, + "loss": 0.2178, + "step": 28619 + }, + { + "epoch": 36.73940949935815, + "grad_norm": 1.2759644985198975, + "learning_rate": 2.1091142490372273e-05, + "loss": 0.2268, + "step": 28620 + }, + { + "epoch": 36.74069319640565, + "grad_norm": 0.9614579081535339, + "learning_rate": 2.109071459135644e-05, + "loss": 0.2348, + "step": 28621 + }, + { + "epoch": 36.74197689345314, + "grad_norm": 1.305519938468933, + "learning_rate": 2.1090286692340606e-05, + "loss": 0.1937, + "step": 28622 + }, + { + "epoch": 36.74326059050064, + "grad_norm": 2.103677272796631, + "learning_rate": 2.1089858793324778e-05, + "loss": 0.1981, + "step": 28623 + }, + { + "epoch": 36.74454428754814, + "grad_norm": 3.4280025959014893, + "learning_rate": 2.1089430894308943e-05, + "loss": 0.2213, + "step": 28624 + }, + { + "epoch": 36.745827984595635, + "grad_norm": 1.0343589782714844, + "learning_rate": 2.108900299529311e-05, + "loss": 0.2414, + "step": 28625 + }, + { + "epoch": 36.74711168164313, + "grad_norm": 
1.016921877861023, + "learning_rate": 2.108857509627728e-05, + "loss": 0.2315, + "step": 28626 + }, + { + "epoch": 36.74839537869063, + "grad_norm": 1.0404258966445923, + "learning_rate": 2.1088147197261445e-05, + "loss": 0.2109, + "step": 28627 + }, + { + "epoch": 36.74967907573813, + "grad_norm": 2.9751055240631104, + "learning_rate": 2.1087719298245617e-05, + "loss": 0.2194, + "step": 28628 + }, + { + "epoch": 36.75096277278562, + "grad_norm": 0.9677078127861023, + "learning_rate": 2.1087291399229782e-05, + "loss": 0.1904, + "step": 28629 + }, + { + "epoch": 36.75224646983312, + "grad_norm": 1.8887059688568115, + "learning_rate": 2.108686350021395e-05, + "loss": 0.2026, + "step": 28630 + }, + { + "epoch": 36.753530166880616, + "grad_norm": 1.738775610923767, + "learning_rate": 2.108643560119812e-05, + "loss": 0.1789, + "step": 28631 + }, + { + "epoch": 36.75481386392811, + "grad_norm": 1.3174004554748535, + "learning_rate": 2.1086007702182284e-05, + "loss": 0.2371, + "step": 28632 + }, + { + "epoch": 36.75609756097561, + "grad_norm": 1.4271847009658813, + "learning_rate": 2.1085579803166455e-05, + "loss": 0.2257, + "step": 28633 + }, + { + "epoch": 36.75738125802311, + "grad_norm": 1.042961835861206, + "learning_rate": 2.108515190415062e-05, + "loss": 0.2289, + "step": 28634 + }, + { + "epoch": 36.7586649550706, + "grad_norm": 1.8251997232437134, + "learning_rate": 2.108472400513479e-05, + "loss": 0.2441, + "step": 28635 + }, + { + "epoch": 36.7599486521181, + "grad_norm": 1.018147587776184, + "learning_rate": 2.1084296106118957e-05, + "loss": 0.2113, + "step": 28636 + }, + { + "epoch": 36.761232349165596, + "grad_norm": 11.75373649597168, + "learning_rate": 2.1083868207103126e-05, + "loss": 0.2145, + "step": 28637 + }, + { + "epoch": 36.76251604621309, + "grad_norm": 0.9439563751220703, + "learning_rate": 2.108344030808729e-05, + "loss": 0.1926, + "step": 28638 + }, + { + "epoch": 36.76379974326059, + "grad_norm": 1.7835053205490112, + "learning_rate": 
2.108301240907146e-05, + "loss": 0.2369, + "step": 28639 + }, + { + "epoch": 36.76508344030809, + "grad_norm": 1.3246134519577026, + "learning_rate": 2.1082584510055628e-05, + "loss": 0.2337, + "step": 28640 + }, + { + "epoch": 36.76636713735559, + "grad_norm": 2.741917848587036, + "learning_rate": 2.1082156611039796e-05, + "loss": 0.1959, + "step": 28641 + }, + { + "epoch": 36.76765083440308, + "grad_norm": 1.4979223012924194, + "learning_rate": 2.1081728712023964e-05, + "loss": 0.2685, + "step": 28642 + }, + { + "epoch": 36.768934531450576, + "grad_norm": 1.6306827068328857, + "learning_rate": 2.108130081300813e-05, + "loss": 0.2892, + "step": 28643 + }, + { + "epoch": 36.770218228498074, + "grad_norm": 3.989504814147949, + "learning_rate": 2.10808729139923e-05, + "loss": 0.3368, + "step": 28644 + }, + { + "epoch": 36.77150192554557, + "grad_norm": 2.800672769546509, + "learning_rate": 2.1080445014976466e-05, + "loss": 0.263, + "step": 28645 + }, + { + "epoch": 36.77278562259307, + "grad_norm": 1.5055961608886719, + "learning_rate": 2.108001711596063e-05, + "loss": 0.2252, + "step": 28646 + }, + { + "epoch": 36.77406931964057, + "grad_norm": 1.7797013521194458, + "learning_rate": 2.1079589216944803e-05, + "loss": 0.2527, + "step": 28647 + }, + { + "epoch": 36.775353016688065, + "grad_norm": 1.1400017738342285, + "learning_rate": 2.1079161317928968e-05, + "loss": 0.2615, + "step": 28648 + }, + { + "epoch": 36.776636713735556, + "grad_norm": 1.0036139488220215, + "learning_rate": 2.107873341891314e-05, + "loss": 0.2337, + "step": 28649 + }, + { + "epoch": 36.777920410783054, + "grad_norm": 0.8883529901504517, + "learning_rate": 2.1078305519897305e-05, + "loss": 0.2488, + "step": 28650 + }, + { + "epoch": 36.77920410783055, + "grad_norm": 1.1167778968811035, + "learning_rate": 2.1077877620881473e-05, + "loss": 0.2468, + "step": 28651 + }, + { + "epoch": 36.78048780487805, + "grad_norm": 0.8558018803596497, + "learning_rate": 2.107744972186564e-05, + "loss": 0.2566, 
+ "step": 28652 + }, + { + "epoch": 36.78177150192555, + "grad_norm": 0.7281935811042786, + "learning_rate": 2.1077021822849807e-05, + "loss": 0.2214, + "step": 28653 + }, + { + "epoch": 36.783055198973045, + "grad_norm": 0.8256838321685791, + "learning_rate": 2.1076593923833975e-05, + "loss": 0.2488, + "step": 28654 + }, + { + "epoch": 36.784338896020536, + "grad_norm": 1.6599390506744385, + "learning_rate": 2.1076166024818144e-05, + "loss": 0.2502, + "step": 28655 + }, + { + "epoch": 36.785622593068034, + "grad_norm": 0.7833580374717712, + "learning_rate": 2.1075738125802312e-05, + "loss": 0.2285, + "step": 28656 + }, + { + "epoch": 36.78690629011553, + "grad_norm": 0.8272777795791626, + "learning_rate": 2.107531022678648e-05, + "loss": 0.243, + "step": 28657 + }, + { + "epoch": 36.78818998716303, + "grad_norm": 0.8618758320808411, + "learning_rate": 2.1074882327770645e-05, + "loss": 0.2267, + "step": 28658 + }, + { + "epoch": 36.78947368421053, + "grad_norm": 1.6297398805618286, + "learning_rate": 2.1074454428754814e-05, + "loss": 0.239, + "step": 28659 + }, + { + "epoch": 36.790757381258025, + "grad_norm": 0.9946914315223694, + "learning_rate": 2.1074026529738982e-05, + "loss": 0.2155, + "step": 28660 + }, + { + "epoch": 36.79204107830552, + "grad_norm": 1.0143275260925293, + "learning_rate": 2.107359863072315e-05, + "loss": 0.2479, + "step": 28661 + }, + { + "epoch": 36.793324775353014, + "grad_norm": 0.8664084076881409, + "learning_rate": 2.1073170731707316e-05, + "loss": 0.2404, + "step": 28662 + }, + { + "epoch": 36.79460847240051, + "grad_norm": 0.9226168990135193, + "learning_rate": 2.1072742832691487e-05, + "loss": 0.2182, + "step": 28663 + }, + { + "epoch": 36.79589216944801, + "grad_norm": 0.9370124340057373, + "learning_rate": 2.1072314933675652e-05, + "loss": 0.2139, + "step": 28664 + }, + { + "epoch": 36.79717586649551, + "grad_norm": 0.9063311219215393, + "learning_rate": 2.107188703465982e-05, + "loss": 0.2439, + "step": 28665 + }, + { + "epoch": 
36.798459563543005, + "grad_norm": 1.036573886871338, + "learning_rate": 2.107145913564399e-05, + "loss": 0.2184, + "step": 28666 + }, + { + "epoch": 36.7997432605905, + "grad_norm": 1.0589231252670288, + "learning_rate": 2.1071031236628154e-05, + "loss": 0.2312, + "step": 28667 + }, + { + "epoch": 36.801026957637994, + "grad_norm": 0.9292938709259033, + "learning_rate": 2.1070603337612326e-05, + "loss": 0.2076, + "step": 28668 + }, + { + "epoch": 36.80231065468549, + "grad_norm": 1.5889567136764526, + "learning_rate": 2.107017543859649e-05, + "loss": 0.2146, + "step": 28669 + }, + { + "epoch": 36.80359435173299, + "grad_norm": 5.736649036407471, + "learning_rate": 2.106974753958066e-05, + "loss": 0.2017, + "step": 28670 + }, + { + "epoch": 36.80487804878049, + "grad_norm": 1.634810447692871, + "learning_rate": 2.1069319640564828e-05, + "loss": 0.2026, + "step": 28671 + }, + { + "epoch": 36.806161745827985, + "grad_norm": 0.8598349690437317, + "learning_rate": 2.1068891741548993e-05, + "loss": 0.2158, + "step": 28672 + }, + { + "epoch": 36.80744544287548, + "grad_norm": 1.1005162000656128, + "learning_rate": 2.106846384253316e-05, + "loss": 0.1994, + "step": 28673 + }, + { + "epoch": 36.80872913992298, + "grad_norm": 1.835774540901184, + "learning_rate": 2.106803594351733e-05, + "loss": 0.205, + "step": 28674 + }, + { + "epoch": 36.81001283697047, + "grad_norm": 0.9852920174598694, + "learning_rate": 2.1067608044501498e-05, + "loss": 0.2236, + "step": 28675 + }, + { + "epoch": 36.81129653401797, + "grad_norm": 1.0867279767990112, + "learning_rate": 2.1067180145485667e-05, + "loss": 0.2092, + "step": 28676 + }, + { + "epoch": 36.81258023106547, + "grad_norm": 2.5753533840179443, + "learning_rate": 2.1066752246469835e-05, + "loss": 0.2167, + "step": 28677 + }, + { + "epoch": 36.813863928112966, + "grad_norm": 1.033649206161499, + "learning_rate": 2.1066324347454e-05, + "loss": 0.2164, + "step": 28678 + }, + { + "epoch": 36.81514762516046, + "grad_norm": 
0.9906002283096313, + "learning_rate": 2.106589644843817e-05, + "loss": 0.2227, + "step": 28679 + }, + { + "epoch": 36.81643132220796, + "grad_norm": 1.0279779434204102, + "learning_rate": 2.1065468549422337e-05, + "loss": 0.1939, + "step": 28680 + }, + { + "epoch": 36.81771501925546, + "grad_norm": 1.3200942277908325, + "learning_rate": 2.1065040650406502e-05, + "loss": 0.2111, + "step": 28681 + }, + { + "epoch": 36.81899871630295, + "grad_norm": 2.6879289150238037, + "learning_rate": 2.1064612751390674e-05, + "loss": 0.2009, + "step": 28682 + }, + { + "epoch": 36.82028241335045, + "grad_norm": 1.364432692527771, + "learning_rate": 2.106418485237484e-05, + "loss": 0.2334, + "step": 28683 + }, + { + "epoch": 36.821566110397946, + "grad_norm": 1.6395645141601562, + "learning_rate": 2.106375695335901e-05, + "loss": 0.2188, + "step": 28684 + }, + { + "epoch": 36.822849807445444, + "grad_norm": 1.1142592430114746, + "learning_rate": 2.1063329054343176e-05, + "loss": 0.2318, + "step": 28685 + }, + { + "epoch": 36.82413350449294, + "grad_norm": 1.9727221727371216, + "learning_rate": 2.106290115532734e-05, + "loss": 0.2608, + "step": 28686 + }, + { + "epoch": 36.82541720154044, + "grad_norm": 1.7896463871002197, + "learning_rate": 2.1062473256311512e-05, + "loss": 0.2686, + "step": 28687 + }, + { + "epoch": 36.82670089858793, + "grad_norm": 1.6585602760314941, + "learning_rate": 2.1062045357295677e-05, + "loss": 0.2448, + "step": 28688 + }, + { + "epoch": 36.82798459563543, + "grad_norm": 3.3302221298217773, + "learning_rate": 2.1061617458279846e-05, + "loss": 0.2465, + "step": 28689 + }, + { + "epoch": 36.829268292682926, + "grad_norm": 1.5781731605529785, + "learning_rate": 2.1061189559264014e-05, + "loss": 0.2295, + "step": 28690 + }, + { + "epoch": 36.830551989730424, + "grad_norm": 2.8430683612823486, + "learning_rate": 2.1060761660248183e-05, + "loss": 0.2475, + "step": 28691 + }, + { + "epoch": 36.83183568677792, + "grad_norm": 1.8006545305252075, + 
"learning_rate": 2.106033376123235e-05, + "loss": 0.2974, + "step": 28692 + }, + { + "epoch": 36.83311938382542, + "grad_norm": 2.144533157348633, + "learning_rate": 2.1059905862216516e-05, + "loss": 0.2724, + "step": 28693 + }, + { + "epoch": 36.83440308087292, + "grad_norm": 3.259997844696045, + "learning_rate": 2.1059477963200684e-05, + "loss": 0.3292, + "step": 28694 + }, + { + "epoch": 36.83568677792041, + "grad_norm": 2.471513271331787, + "learning_rate": 2.1059050064184853e-05, + "loss": 0.2445, + "step": 28695 + }, + { + "epoch": 36.836970474967906, + "grad_norm": 1.242098093032837, + "learning_rate": 2.105862216516902e-05, + "loss": 0.2641, + "step": 28696 + }, + { + "epoch": 36.838254172015404, + "grad_norm": 1.0601736307144165, + "learning_rate": 2.1058194266153186e-05, + "loss": 0.2312, + "step": 28697 + }, + { + "epoch": 36.8395378690629, + "grad_norm": 1.0593715906143188, + "learning_rate": 2.1057766367137358e-05, + "loss": 0.2455, + "step": 28698 + }, + { + "epoch": 36.8408215661104, + "grad_norm": 3.3491098880767822, + "learning_rate": 2.1057338468121523e-05, + "loss": 0.2562, + "step": 28699 + }, + { + "epoch": 36.8421052631579, + "grad_norm": 0.9741585850715637, + "learning_rate": 2.105691056910569e-05, + "loss": 0.2397, + "step": 28700 + }, + { + "epoch": 36.84338896020539, + "grad_norm": 2.079249620437622, + "learning_rate": 2.105648267008986e-05, + "loss": 0.2286, + "step": 28701 + }, + { + "epoch": 36.844672657252886, + "grad_norm": 2.3317453861236572, + "learning_rate": 2.1056054771074025e-05, + "loss": 0.2307, + "step": 28702 + }, + { + "epoch": 36.845956354300384, + "grad_norm": 1.8417201042175293, + "learning_rate": 2.1055626872058197e-05, + "loss": 0.2468, + "step": 28703 + }, + { + "epoch": 36.84724005134788, + "grad_norm": 1.1340134143829346, + "learning_rate": 2.1055198973042362e-05, + "loss": 0.259, + "step": 28704 + }, + { + "epoch": 36.84852374839538, + "grad_norm": 0.8965432047843933, + "learning_rate": 2.105477107402653e-05, + 
"loss": 0.2413, + "step": 28705 + }, + { + "epoch": 36.84980744544288, + "grad_norm": 1.2183563709259033, + "learning_rate": 2.10543431750107e-05, + "loss": 0.2426, + "step": 28706 + }, + { + "epoch": 36.851091142490375, + "grad_norm": 0.9113839268684387, + "learning_rate": 2.1053915275994864e-05, + "loss": 0.2002, + "step": 28707 + }, + { + "epoch": 36.852374839537866, + "grad_norm": 0.9496136903762817, + "learning_rate": 2.1053487376979035e-05, + "loss": 0.2446, + "step": 28708 + }, + { + "epoch": 36.853658536585364, + "grad_norm": 1.4934990406036377, + "learning_rate": 2.10530594779632e-05, + "loss": 0.2249, + "step": 28709 + }, + { + "epoch": 36.85494223363286, + "grad_norm": 0.8985641002655029, + "learning_rate": 2.105263157894737e-05, + "loss": 0.2313, + "step": 28710 + }, + { + "epoch": 36.85622593068036, + "grad_norm": 1.5241756439208984, + "learning_rate": 2.1052203679931537e-05, + "loss": 0.2286, + "step": 28711 + }, + { + "epoch": 36.85750962772786, + "grad_norm": 1.0106463432312012, + "learning_rate": 2.1051775780915706e-05, + "loss": 0.187, + "step": 28712 + }, + { + "epoch": 36.858793324775355, + "grad_norm": 1.4456866979599, + "learning_rate": 2.105134788189987e-05, + "loss": 0.2292, + "step": 28713 + }, + { + "epoch": 36.86007702182285, + "grad_norm": 0.7853289246559143, + "learning_rate": 2.105091998288404e-05, + "loss": 0.2326, + "step": 28714 + }, + { + "epoch": 36.861360718870344, + "grad_norm": 1.1397380828857422, + "learning_rate": 2.1050492083868208e-05, + "loss": 0.2383, + "step": 28715 + }, + { + "epoch": 36.86264441591784, + "grad_norm": 2.4583725929260254, + "learning_rate": 2.1050064184852376e-05, + "loss": 0.2026, + "step": 28716 + }, + { + "epoch": 36.86392811296534, + "grad_norm": 1.6862860918045044, + "learning_rate": 2.1049636285836544e-05, + "loss": 0.2227, + "step": 28717 + }, + { + "epoch": 36.86521181001284, + "grad_norm": 1.4314295053482056, + "learning_rate": 2.104920838682071e-05, + "loss": 0.2343, + "step": 28718 + }, + { + 
"epoch": 36.866495507060336, + "grad_norm": 2.74774169921875, + "learning_rate": 2.1048780487804878e-05, + "loss": 0.2186, + "step": 28719 + }, + { + "epoch": 36.86777920410783, + "grad_norm": 1.3621546030044556, + "learning_rate": 2.1048352588789046e-05, + "loss": 0.2248, + "step": 28720 + }, + { + "epoch": 36.869062901155324, + "grad_norm": 0.9583876132965088, + "learning_rate": 2.104792468977321e-05, + "loss": 0.2159, + "step": 28721 + }, + { + "epoch": 36.87034659820282, + "grad_norm": 0.851274311542511, + "learning_rate": 2.1047496790757383e-05, + "loss": 0.1984, + "step": 28722 + }, + { + "epoch": 36.87163029525032, + "grad_norm": 0.8686825633049011, + "learning_rate": 2.1047068891741548e-05, + "loss": 0.1906, + "step": 28723 + }, + { + "epoch": 36.87291399229782, + "grad_norm": 1.7170826196670532, + "learning_rate": 2.104664099272572e-05, + "loss": 0.2008, + "step": 28724 + }, + { + "epoch": 36.874197689345316, + "grad_norm": 2.2431366443634033, + "learning_rate": 2.1046213093709885e-05, + "loss": 0.2221, + "step": 28725 + }, + { + "epoch": 36.87548138639281, + "grad_norm": 2.2230184078216553, + "learning_rate": 2.104578519469405e-05, + "loss": 0.2074, + "step": 28726 + }, + { + "epoch": 36.87676508344031, + "grad_norm": 1.220367431640625, + "learning_rate": 2.1045357295678222e-05, + "loss": 0.2394, + "step": 28727 + }, + { + "epoch": 36.8780487804878, + "grad_norm": 1.3298052549362183, + "learning_rate": 2.1044929396662387e-05, + "loss": 0.2167, + "step": 28728 + }, + { + "epoch": 36.8793324775353, + "grad_norm": 1.9759379625320435, + "learning_rate": 2.1044501497646555e-05, + "loss": 0.1999, + "step": 28729 + }, + { + "epoch": 36.8806161745828, + "grad_norm": 1.228448510169983, + "learning_rate": 2.1044073598630724e-05, + "loss": 0.218, + "step": 28730 + }, + { + "epoch": 36.881899871630296, + "grad_norm": 2.960395097732544, + "learning_rate": 2.1043645699614892e-05, + "loss": 0.2142, + "step": 28731 + }, + { + "epoch": 36.883183568677794, + "grad_norm": 
1.217335820198059, + "learning_rate": 2.104321780059906e-05, + "loss": 0.2342, + "step": 28732 + }, + { + "epoch": 36.88446726572529, + "grad_norm": 2.375084161758423, + "learning_rate": 2.1042789901583225e-05, + "loss": 0.2003, + "step": 28733 + }, + { + "epoch": 36.88575096277278, + "grad_norm": 1.128533959388733, + "learning_rate": 2.1042362002567394e-05, + "loss": 0.2404, + "step": 28734 + }, + { + "epoch": 36.88703465982028, + "grad_norm": 1.3611880540847778, + "learning_rate": 2.1041934103551562e-05, + "loss": 0.2408, + "step": 28735 + }, + { + "epoch": 36.88831835686778, + "grad_norm": 1.5045852661132812, + "learning_rate": 2.104150620453573e-05, + "loss": 0.2359, + "step": 28736 + }, + { + "epoch": 36.889602053915276, + "grad_norm": 1.2559640407562256, + "learning_rate": 2.1041078305519896e-05, + "loss": 0.2302, + "step": 28737 + }, + { + "epoch": 36.890885750962774, + "grad_norm": 3.2274975776672363, + "learning_rate": 2.1040650406504067e-05, + "loss": 0.2505, + "step": 28738 + }, + { + "epoch": 36.89216944801027, + "grad_norm": 1.932682752609253, + "learning_rate": 2.1040222507488233e-05, + "loss": 0.2665, + "step": 28739 + }, + { + "epoch": 36.89345314505777, + "grad_norm": 1.4965627193450928, + "learning_rate": 2.10397946084724e-05, + "loss": 0.2386, + "step": 28740 + }, + { + "epoch": 36.89473684210526, + "grad_norm": 1.6099271774291992, + "learning_rate": 2.103936670945657e-05, + "loss": 0.257, + "step": 28741 + }, + { + "epoch": 36.89602053915276, + "grad_norm": 1.4858975410461426, + "learning_rate": 2.1038938810440734e-05, + "loss": 0.2928, + "step": 28742 + }, + { + "epoch": 36.897304236200256, + "grad_norm": 2.365126848220825, + "learning_rate": 2.1038510911424906e-05, + "loss": 0.3388, + "step": 28743 + }, + { + "epoch": 36.898587933247754, + "grad_norm": 3.6446375846862793, + "learning_rate": 2.103808301240907e-05, + "loss": 0.3898, + "step": 28744 + }, + { + "epoch": 36.89987163029525, + "grad_norm": 0.6991501450538635, + "learning_rate": 
2.103765511339324e-05, + "loss": 0.2353, + "step": 28745 + }, + { + "epoch": 36.90115532734275, + "grad_norm": 0.847396194934845, + "learning_rate": 2.1037227214377408e-05, + "loss": 0.2462, + "step": 28746 + }, + { + "epoch": 36.90243902439025, + "grad_norm": 0.6827064156532288, + "learning_rate": 2.1036799315361573e-05, + "loss": 0.2445, + "step": 28747 + }, + { + "epoch": 36.90372272143774, + "grad_norm": 1.2426037788391113, + "learning_rate": 2.1036371416345745e-05, + "loss": 0.238, + "step": 28748 + }, + { + "epoch": 36.905006418485236, + "grad_norm": 0.6657384037971497, + "learning_rate": 2.103594351732991e-05, + "loss": 0.2314, + "step": 28749 + }, + { + "epoch": 36.906290115532734, + "grad_norm": 1.0469273328781128, + "learning_rate": 2.1035515618314078e-05, + "loss": 0.2239, + "step": 28750 + }, + { + "epoch": 36.90757381258023, + "grad_norm": 1.0954707860946655, + "learning_rate": 2.1035087719298247e-05, + "loss": 0.2475, + "step": 28751 + }, + { + "epoch": 36.90885750962773, + "grad_norm": 0.8188923001289368, + "learning_rate": 2.1034659820282415e-05, + "loss": 0.2279, + "step": 28752 + }, + { + "epoch": 36.91014120667523, + "grad_norm": 1.281478762626648, + "learning_rate": 2.103423192126658e-05, + "loss": 0.2392, + "step": 28753 + }, + { + "epoch": 36.91142490372272, + "grad_norm": 3.1862964630126953, + "learning_rate": 2.103380402225075e-05, + "loss": 0.2501, + "step": 28754 + }, + { + "epoch": 36.912708600770216, + "grad_norm": 0.8932477235794067, + "learning_rate": 2.1033376123234917e-05, + "loss": 0.2392, + "step": 28755 + }, + { + "epoch": 36.913992297817714, + "grad_norm": 1.5730094909667969, + "learning_rate": 2.1032948224219085e-05, + "loss": 0.2549, + "step": 28756 + }, + { + "epoch": 36.91527599486521, + "grad_norm": 1.6027148962020874, + "learning_rate": 2.1032520325203254e-05, + "loss": 0.2369, + "step": 28757 + }, + { + "epoch": 36.91655969191271, + "grad_norm": 2.6556236743927, + "learning_rate": 2.103209242618742e-05, + "loss": 0.23, + 
"step": 28758 + }, + { + "epoch": 36.91784338896021, + "grad_norm": 0.9223546385765076, + "learning_rate": 2.103166452717159e-05, + "loss": 0.23, + "step": 28759 + }, + { + "epoch": 36.919127086007705, + "grad_norm": 1.9172829389572144, + "learning_rate": 2.1031236628155756e-05, + "loss": 0.233, + "step": 28760 + }, + { + "epoch": 36.920410783055196, + "grad_norm": 1.0148828029632568, + "learning_rate": 2.103080872913992e-05, + "loss": 0.2193, + "step": 28761 + }, + { + "epoch": 36.921694480102694, + "grad_norm": 0.9759458303451538, + "learning_rate": 2.1030380830124092e-05, + "loss": 0.2186, + "step": 28762 + }, + { + "epoch": 36.92297817715019, + "grad_norm": 0.9750206470489502, + "learning_rate": 2.1029952931108257e-05, + "loss": 0.2244, + "step": 28763 + }, + { + "epoch": 36.92426187419769, + "grad_norm": 1.503240942955017, + "learning_rate": 2.102952503209243e-05, + "loss": 0.2169, + "step": 28764 + }, + { + "epoch": 36.92554557124519, + "grad_norm": 0.9147610664367676, + "learning_rate": 2.1029097133076594e-05, + "loss": 0.2161, + "step": 28765 + }, + { + "epoch": 36.926829268292686, + "grad_norm": 0.9199352860450745, + "learning_rate": 2.1028669234060763e-05, + "loss": 0.2275, + "step": 28766 + }, + { + "epoch": 36.928112965340176, + "grad_norm": 0.8329793214797974, + "learning_rate": 2.102824133504493e-05, + "loss": 0.1976, + "step": 28767 + }, + { + "epoch": 36.929396662387674, + "grad_norm": 0.7998455762863159, + "learning_rate": 2.1027813436029096e-05, + "loss": 0.2104, + "step": 28768 + }, + { + "epoch": 36.93068035943517, + "grad_norm": 1.1131805181503296, + "learning_rate": 2.1027385537013265e-05, + "loss": 0.2122, + "step": 28769 + }, + { + "epoch": 36.93196405648267, + "grad_norm": 0.9072822332382202, + "learning_rate": 2.1026957637997433e-05, + "loss": 0.1969, + "step": 28770 + }, + { + "epoch": 36.93324775353017, + "grad_norm": 0.9552128911018372, + "learning_rate": 2.10265297389816e-05, + "loss": 0.2028, + "step": 28771 + }, + { + "epoch": 
36.934531450577666, + "grad_norm": 1.7520058155059814, + "learning_rate": 2.102610183996577e-05, + "loss": 0.2017, + "step": 28772 + }, + { + "epoch": 36.93581514762516, + "grad_norm": 1.4896109104156494, + "learning_rate": 2.1025673940949938e-05, + "loss": 0.2327, + "step": 28773 + }, + { + "epoch": 36.937098844672654, + "grad_norm": 2.960442304611206, + "learning_rate": 2.1025246041934103e-05, + "loss": 0.1911, + "step": 28774 + }, + { + "epoch": 36.93838254172015, + "grad_norm": 2.7772979736328125, + "learning_rate": 2.102481814291827e-05, + "loss": 0.2057, + "step": 28775 + }, + { + "epoch": 36.93966623876765, + "grad_norm": 1.3335704803466797, + "learning_rate": 2.102439024390244e-05, + "loss": 0.2031, + "step": 28776 + }, + { + "epoch": 36.94094993581515, + "grad_norm": 2.2710154056549072, + "learning_rate": 2.1023962344886605e-05, + "loss": 0.2266, + "step": 28777 + }, + { + "epoch": 36.942233632862646, + "grad_norm": 1.0715360641479492, + "learning_rate": 2.1023534445870777e-05, + "loss": 0.2269, + "step": 28778 + }, + { + "epoch": 36.943517329910144, + "grad_norm": 0.9406909942626953, + "learning_rate": 2.1023106546854942e-05, + "loss": 0.213, + "step": 28779 + }, + { + "epoch": 36.94480102695764, + "grad_norm": 2.2069315910339355, + "learning_rate": 2.102267864783911e-05, + "loss": 0.1989, + "step": 28780 + }, + { + "epoch": 36.94608472400513, + "grad_norm": 1.3978350162506104, + "learning_rate": 2.102225074882328e-05, + "loss": 0.221, + "step": 28781 + }, + { + "epoch": 36.94736842105263, + "grad_norm": 1.0488064289093018, + "learning_rate": 2.1021822849807444e-05, + "loss": 0.2427, + "step": 28782 + }, + { + "epoch": 36.94865211810013, + "grad_norm": 1.0244994163513184, + "learning_rate": 2.1021394950791616e-05, + "loss": 0.1792, + "step": 28783 + }, + { + "epoch": 36.949935815147626, + "grad_norm": 2.5009875297546387, + "learning_rate": 2.102096705177578e-05, + "loss": 0.2354, + "step": 28784 + }, + { + "epoch": 36.951219512195124, + "grad_norm": 
2.1902401447296143, + "learning_rate": 2.102053915275995e-05, + "loss": 0.2863, + "step": 28785 + }, + { + "epoch": 36.95250320924262, + "grad_norm": 1.7932206392288208, + "learning_rate": 2.1020111253744117e-05, + "loss": 0.2168, + "step": 28786 + }, + { + "epoch": 36.95378690629011, + "grad_norm": 1.586036205291748, + "learning_rate": 2.1019683354728282e-05, + "loss": 0.2498, + "step": 28787 + }, + { + "epoch": 36.95507060333761, + "grad_norm": 2.3111443519592285, + "learning_rate": 2.1019255455712454e-05, + "loss": 0.2208, + "step": 28788 + }, + { + "epoch": 36.95635430038511, + "grad_norm": 1.5124355554580688, + "learning_rate": 2.101882755669662e-05, + "loss": 0.229, + "step": 28789 + }, + { + "epoch": 36.957637997432606, + "grad_norm": 1.3764277696609497, + "learning_rate": 2.1018399657680788e-05, + "loss": 0.2337, + "step": 28790 + }, + { + "epoch": 36.958921694480104, + "grad_norm": 1.3721240758895874, + "learning_rate": 2.1017971758664956e-05, + "loss": 0.2058, + "step": 28791 + }, + { + "epoch": 36.9602053915276, + "grad_norm": 2.4959161281585693, + "learning_rate": 2.1017543859649124e-05, + "loss": 0.2204, + "step": 28792 + }, + { + "epoch": 36.9614890885751, + "grad_norm": 1.6313270330429077, + "learning_rate": 2.101711596063329e-05, + "loss": 0.2583, + "step": 28793 + }, + { + "epoch": 36.96277278562259, + "grad_norm": 3.4925708770751953, + "learning_rate": 2.1016688061617458e-05, + "loss": 0.3651, + "step": 28794 + }, + { + "epoch": 36.96405648267009, + "grad_norm": 1.1377842426300049, + "learning_rate": 2.1016260162601626e-05, + "loss": 0.2299, + "step": 28795 + }, + { + "epoch": 36.965340179717586, + "grad_norm": 1.5269562005996704, + "learning_rate": 2.1015832263585795e-05, + "loss": 0.2257, + "step": 28796 + }, + { + "epoch": 36.966623876765084, + "grad_norm": 1.0214515924453735, + "learning_rate": 2.1015404364569963e-05, + "loss": 0.2346, + "step": 28797 + }, + { + "epoch": 36.96790757381258, + "grad_norm": 1.0834274291992188, + "learning_rate": 
2.1014976465554128e-05, + "loss": 0.2199, + "step": 28798 + }, + { + "epoch": 36.96919127086008, + "grad_norm": 2.7054152488708496, + "learning_rate": 2.10145485665383e-05, + "loss": 0.2352, + "step": 28799 + }, + { + "epoch": 36.97047496790757, + "grad_norm": 1.1531940698623657, + "learning_rate": 2.1014120667522465e-05, + "loss": 0.2278, + "step": 28800 + }, + { + "epoch": 36.97175866495507, + "grad_norm": 1.4090832471847534, + "learning_rate": 2.101369276850663e-05, + "loss": 0.224, + "step": 28801 + }, + { + "epoch": 36.973042362002566, + "grad_norm": 1.5177335739135742, + "learning_rate": 2.1013264869490802e-05, + "loss": 0.2179, + "step": 28802 + }, + { + "epoch": 36.974326059050064, + "grad_norm": 0.9341722130775452, + "learning_rate": 2.1012836970474967e-05, + "loss": 0.2151, + "step": 28803 + }, + { + "epoch": 36.97560975609756, + "grad_norm": 0.8982879519462585, + "learning_rate": 2.101240907145914e-05, + "loss": 0.2227, + "step": 28804 + }, + { + "epoch": 36.97689345314506, + "grad_norm": 2.476400852203369, + "learning_rate": 2.1011981172443304e-05, + "loss": 0.2337, + "step": 28805 + }, + { + "epoch": 36.97817715019256, + "grad_norm": 0.942144513130188, + "learning_rate": 2.1011553273427472e-05, + "loss": 0.2236, + "step": 28806 + }, + { + "epoch": 36.97946084724005, + "grad_norm": 1.0014622211456299, + "learning_rate": 2.101112537441164e-05, + "loss": 0.2086, + "step": 28807 + }, + { + "epoch": 36.980744544287546, + "grad_norm": 1.4171966314315796, + "learning_rate": 2.1010697475395805e-05, + "loss": 0.1944, + "step": 28808 + }, + { + "epoch": 36.982028241335044, + "grad_norm": 2.604792833328247, + "learning_rate": 2.1010269576379974e-05, + "loss": 0.2324, + "step": 28809 + }, + { + "epoch": 36.98331193838254, + "grad_norm": 0.9242556095123291, + "learning_rate": 2.1009841677364142e-05, + "loss": 0.2268, + "step": 28810 + }, + { + "epoch": 36.98459563543004, + "grad_norm": 1.0026923418045044, + "learning_rate": 2.100941377834831e-05, + "loss": 0.2342, 
+ "step": 28811 + }, + { + "epoch": 36.98587933247754, + "grad_norm": 0.8939149379730225, + "learning_rate": 2.100898587933248e-05, + "loss": 0.1968, + "step": 28812 + }, + { + "epoch": 36.987163029525036, + "grad_norm": 1.8506300449371338, + "learning_rate": 2.1008557980316648e-05, + "loss": 0.2539, + "step": 28813 + }, + { + "epoch": 36.988446726572526, + "grad_norm": 1.2336395978927612, + "learning_rate": 2.1008130081300813e-05, + "loss": 0.2442, + "step": 28814 + }, + { + "epoch": 36.989730423620024, + "grad_norm": 2.1378908157348633, + "learning_rate": 2.100770218228498e-05, + "loss": 0.2132, + "step": 28815 + }, + { + "epoch": 36.99101412066752, + "grad_norm": 1.4398900270462036, + "learning_rate": 2.100727428326915e-05, + "loss": 0.2721, + "step": 28816 + }, + { + "epoch": 36.99229781771502, + "grad_norm": 1.9815255403518677, + "learning_rate": 2.1006846384253314e-05, + "loss": 0.2232, + "step": 28817 + }, + { + "epoch": 36.99358151476252, + "grad_norm": 1.0315946340560913, + "learning_rate": 2.1006418485237486e-05, + "loss": 0.2297, + "step": 28818 + }, + { + "epoch": 36.994865211810016, + "grad_norm": 1.2611083984375, + "learning_rate": 2.100599058622165e-05, + "loss": 0.2197, + "step": 28819 + }, + { + "epoch": 36.996148908857506, + "grad_norm": 1.95431649684906, + "learning_rate": 2.1005562687205823e-05, + "loss": 0.2685, + "step": 28820 + }, + { + "epoch": 36.997432605905004, + "grad_norm": 1.5149873495101929, + "learning_rate": 2.1005134788189988e-05, + "loss": 0.2891, + "step": 28821 + }, + { + "epoch": 36.9987163029525, + "grad_norm": 1.2407238483428955, + "learning_rate": 2.1004706889174153e-05, + "loss": 0.2976, + "step": 28822 + }, + { + "epoch": 37.0, + "grad_norm": 2.8512086868286133, + "learning_rate": 2.1004278990158325e-05, + "loss": 0.438, + "step": 28823 + }, + { + "epoch": 37.0012836970475, + "grad_norm": 2.764052629470825, + "learning_rate": 2.100385109114249e-05, + "loss": 0.2256, + "step": 28824 + }, + { + "epoch": 37.002567394094996, + 
"grad_norm": 0.7026678323745728, + "learning_rate": 2.100342319212666e-05, + "loss": 0.2418, + "step": 28825 + }, + { + "epoch": 37.003851091142494, + "grad_norm": 0.8417608737945557, + "learning_rate": 2.1002995293110827e-05, + "loss": 0.247, + "step": 28826 + }, + { + "epoch": 37.005134788189984, + "grad_norm": 0.8648425936698914, + "learning_rate": 2.1002567394094995e-05, + "loss": 0.2322, + "step": 28827 + }, + { + "epoch": 37.00641848523748, + "grad_norm": 1.2741568088531494, + "learning_rate": 2.1002139495079164e-05, + "loss": 0.2354, + "step": 28828 + }, + { + "epoch": 37.00770218228498, + "grad_norm": 2.2728874683380127, + "learning_rate": 2.100171159606333e-05, + "loss": 0.2344, + "step": 28829 + }, + { + "epoch": 37.00898587933248, + "grad_norm": 1.2459641695022583, + "learning_rate": 2.1001283697047497e-05, + "loss": 0.2336, + "step": 28830 + }, + { + "epoch": 37.010269576379976, + "grad_norm": 1.134053111076355, + "learning_rate": 2.1000855798031665e-05, + "loss": 0.2099, + "step": 28831 + }, + { + "epoch": 37.011553273427474, + "grad_norm": 1.2703449726104736, + "learning_rate": 2.1000427899015834e-05, + "loss": 0.2444, + "step": 28832 + }, + { + "epoch": 37.012836970474964, + "grad_norm": 2.573197603225708, + "learning_rate": 2.1e-05, + "loss": 0.2101, + "step": 28833 + }, + { + "epoch": 37.01412066752246, + "grad_norm": 2.822570323944092, + "learning_rate": 2.099957210098417e-05, + "loss": 0.2336, + "step": 28834 + }, + { + "epoch": 37.01540436456996, + "grad_norm": 1.7542436122894287, + "learning_rate": 2.0999144201968336e-05, + "loss": 0.2185, + "step": 28835 + }, + { + "epoch": 37.01668806161746, + "grad_norm": 2.2829182147979736, + "learning_rate": 2.0998716302952504e-05, + "loss": 0.2319, + "step": 28836 + }, + { + "epoch": 37.017971758664956, + "grad_norm": 1.2568820714950562, + "learning_rate": 2.0998288403936672e-05, + "loss": 0.2199, + "step": 28837 + }, + { + "epoch": 37.019255455712454, + "grad_norm": 1.1272227764129639, + "learning_rate": 
2.0997860504920838e-05, + "loss": 0.2128, + "step": 28838 + }, + { + "epoch": 37.02053915275995, + "grad_norm": 0.8684880137443542, + "learning_rate": 2.099743260590501e-05, + "loss": 0.2323, + "step": 28839 + }, + { + "epoch": 37.02182284980744, + "grad_norm": 0.9709179401397705, + "learning_rate": 2.0997004706889174e-05, + "loss": 0.2342, + "step": 28840 + }, + { + "epoch": 37.02310654685494, + "grad_norm": 1.3001196384429932, + "learning_rate": 2.0996576807873343e-05, + "loss": 0.1756, + "step": 28841 + }, + { + "epoch": 37.02439024390244, + "grad_norm": 1.4088129997253418, + "learning_rate": 2.099614890885751e-05, + "loss": 0.217, + "step": 28842 + }, + { + "epoch": 37.025673940949936, + "grad_norm": 1.0600401163101196, + "learning_rate": 2.0995721009841676e-05, + "loss": 0.2256, + "step": 28843 + }, + { + "epoch": 37.026957637997434, + "grad_norm": 2.177889585494995, + "learning_rate": 2.0995293110825848e-05, + "loss": 0.2029, + "step": 28844 + }, + { + "epoch": 37.02824133504493, + "grad_norm": 0.9234833717346191, + "learning_rate": 2.0994865211810013e-05, + "loss": 0.1882, + "step": 28845 + }, + { + "epoch": 37.02952503209243, + "grad_norm": 1.5161316394805908, + "learning_rate": 2.099443731279418e-05, + "loss": 0.2005, + "step": 28846 + }, + { + "epoch": 37.03080872913992, + "grad_norm": 0.8363113403320312, + "learning_rate": 2.099400941377835e-05, + "loss": 0.1908, + "step": 28847 + }, + { + "epoch": 37.03209242618742, + "grad_norm": 1.1682361364364624, + "learning_rate": 2.0993581514762515e-05, + "loss": 0.1729, + "step": 28848 + }, + { + "epoch": 37.033376123234916, + "grad_norm": 1.907867431640625, + "learning_rate": 2.0993153615746683e-05, + "loss": 0.2084, + "step": 28849 + }, + { + "epoch": 37.034659820282414, + "grad_norm": 1.0504580736160278, + "learning_rate": 2.099272571673085e-05, + "loss": 0.2133, + "step": 28850 + }, + { + "epoch": 37.03594351732991, + "grad_norm": 1.1299914121627808, + "learning_rate": 2.099229781771502e-05, + "loss": 0.1942, 
+ "step": 28851 + }, + { + "epoch": 37.03722721437741, + "grad_norm": 3.8683671951293945, + "learning_rate": 2.099186991869919e-05, + "loss": 0.1906, + "step": 28852 + }, + { + "epoch": 37.0385109114249, + "grad_norm": 1.1921696662902832, + "learning_rate": 2.0991442019683357e-05, + "loss": 0.2119, + "step": 28853 + }, + { + "epoch": 37.0397946084724, + "grad_norm": 1.0416924953460693, + "learning_rate": 2.0991014120667522e-05, + "loss": 0.1978, + "step": 28854 + }, + { + "epoch": 37.041078305519896, + "grad_norm": 2.7721641063690186, + "learning_rate": 2.099058622165169e-05, + "loss": 0.1979, + "step": 28855 + }, + { + "epoch": 37.042362002567394, + "grad_norm": 1.0363701581954956, + "learning_rate": 2.099015832263586e-05, + "loss": 0.171, + "step": 28856 + }, + { + "epoch": 37.04364569961489, + "grad_norm": 1.3177038431167603, + "learning_rate": 2.0989730423620024e-05, + "loss": 0.207, + "step": 28857 + }, + { + "epoch": 37.04492939666239, + "grad_norm": 0.86019366979599, + "learning_rate": 2.0989302524604196e-05, + "loss": 0.1904, + "step": 28858 + }, + { + "epoch": 37.04621309370989, + "grad_norm": 1.552461862564087, + "learning_rate": 2.098887462558836e-05, + "loss": 0.1992, + "step": 28859 + }, + { + "epoch": 37.04749679075738, + "grad_norm": 1.1687573194503784, + "learning_rate": 2.0988446726572532e-05, + "loss": 0.1891, + "step": 28860 + }, + { + "epoch": 37.048780487804876, + "grad_norm": 1.470354676246643, + "learning_rate": 2.0988018827556697e-05, + "loss": 0.2083, + "step": 28861 + }, + { + "epoch": 37.050064184852374, + "grad_norm": 3.4159557819366455, + "learning_rate": 2.0987590928540862e-05, + "loss": 0.2054, + "step": 28862 + }, + { + "epoch": 37.05134788189987, + "grad_norm": 2.5420241355895996, + "learning_rate": 2.0987163029525034e-05, + "loss": 0.1897, + "step": 28863 + }, + { + "epoch": 37.05263157894737, + "grad_norm": 1.1881572008132935, + "learning_rate": 2.09867351305092e-05, + "loss": 0.2343, + "step": 28864 + }, + { + "epoch": 
37.05391527599487, + "grad_norm": 1.1163744926452637, + "learning_rate": 2.0986307231493368e-05, + "loss": 0.2016, + "step": 28865 + }, + { + "epoch": 37.05519897304236, + "grad_norm": 1.222241759300232, + "learning_rate": 2.0985879332477536e-05, + "loss": 0.1869, + "step": 28866 + }, + { + "epoch": 37.056482670089856, + "grad_norm": 2.6320173740386963, + "learning_rate": 2.0985451433461705e-05, + "loss": 0.1954, + "step": 28867 + }, + { + "epoch": 37.057766367137354, + "grad_norm": 2.7163779735565186, + "learning_rate": 2.0985023534445873e-05, + "loss": 0.2057, + "step": 28868 + }, + { + "epoch": 37.05905006418485, + "grad_norm": 1.3131564855575562, + "learning_rate": 2.0984595635430038e-05, + "loss": 0.2402, + "step": 28869 + }, + { + "epoch": 37.06033376123235, + "grad_norm": 1.4490134716033936, + "learning_rate": 2.0984167736414206e-05, + "loss": 0.2477, + "step": 28870 + }, + { + "epoch": 37.06161745827985, + "grad_norm": 1.3196196556091309, + "learning_rate": 2.0983739837398375e-05, + "loss": 0.2331, + "step": 28871 + }, + { + "epoch": 37.062901155327346, + "grad_norm": 1.4725606441497803, + "learning_rate": 2.0983311938382543e-05, + "loss": 0.3123, + "step": 28872 + }, + { + "epoch": 37.06418485237484, + "grad_norm": 5.357900142669678, + "learning_rate": 2.0982884039366708e-05, + "loss": 0.3563, + "step": 28873 + }, + { + "epoch": 37.065468549422334, + "grad_norm": 1.2754466533660889, + "learning_rate": 2.098245614035088e-05, + "loss": 0.2342, + "step": 28874 + }, + { + "epoch": 37.06675224646983, + "grad_norm": 1.2900711297988892, + "learning_rate": 2.0982028241335045e-05, + "loss": 0.2475, + "step": 28875 + }, + { + "epoch": 37.06803594351733, + "grad_norm": 0.8121231198310852, + "learning_rate": 2.098160034231921e-05, + "loss": 0.2278, + "step": 28876 + }, + { + "epoch": 37.06931964056483, + "grad_norm": 0.8921619057655334, + "learning_rate": 2.0981172443303382e-05, + "loss": 0.2421, + "step": 28877 + }, + { + "epoch": 37.070603337612326, + "grad_norm": 
0.9466673731803894, + "learning_rate": 2.0980744544287547e-05, + "loss": 0.2388, + "step": 28878 + }, + { + "epoch": 37.071887034659824, + "grad_norm": 1.3339117765426636, + "learning_rate": 2.098031664527172e-05, + "loss": 0.2136, + "step": 28879 + }, + { + "epoch": 37.073170731707314, + "grad_norm": 1.4990943670272827, + "learning_rate": 2.0979888746255884e-05, + "loss": 0.2358, + "step": 28880 + }, + { + "epoch": 37.07445442875481, + "grad_norm": 0.7544625401496887, + "learning_rate": 2.0979460847240052e-05, + "loss": 0.2405, + "step": 28881 + }, + { + "epoch": 37.07573812580231, + "grad_norm": 0.615300714969635, + "learning_rate": 2.097903294822422e-05, + "loss": 0.1889, + "step": 28882 + }, + { + "epoch": 37.07702182284981, + "grad_norm": 0.9333179593086243, + "learning_rate": 2.0978605049208386e-05, + "loss": 0.2108, + "step": 28883 + }, + { + "epoch": 37.078305519897306, + "grad_norm": 1.0034939050674438, + "learning_rate": 2.0978177150192554e-05, + "loss": 0.2172, + "step": 28884 + }, + { + "epoch": 37.079589216944804, + "grad_norm": 0.7350817322731018, + "learning_rate": 2.0977749251176722e-05, + "loss": 0.2198, + "step": 28885 + }, + { + "epoch": 37.080872913992295, + "grad_norm": 1.4053642749786377, + "learning_rate": 2.097732135216089e-05, + "loss": 0.2147, + "step": 28886 + }, + { + "epoch": 37.08215661103979, + "grad_norm": 0.8933840990066528, + "learning_rate": 2.097689345314506e-05, + "loss": 0.2239, + "step": 28887 + }, + { + "epoch": 37.08344030808729, + "grad_norm": 1.227540135383606, + "learning_rate": 2.0976465554129228e-05, + "loss": 0.2007, + "step": 28888 + }, + { + "epoch": 37.08472400513479, + "grad_norm": 0.9656389355659485, + "learning_rate": 2.0976037655113393e-05, + "loss": 0.1988, + "step": 28889 + }, + { + "epoch": 37.086007702182286, + "grad_norm": 2.904221296310425, + "learning_rate": 2.097560975609756e-05, + "loss": 0.2101, + "step": 28890 + }, + { + "epoch": 37.087291399229784, + "grad_norm": 0.8853569626808167, + 
"learning_rate": 2.097518185708173e-05, + "loss": 0.1905, + "step": 28891 + }, + { + "epoch": 37.08857509627728, + "grad_norm": 1.2258414030075073, + "learning_rate": 2.0974753958065894e-05, + "loss": 0.2071, + "step": 28892 + }, + { + "epoch": 37.08985879332477, + "grad_norm": 3.4257261753082275, + "learning_rate": 2.0974326059050066e-05, + "loss": 0.2124, + "step": 28893 + }, + { + "epoch": 37.09114249037227, + "grad_norm": 0.7750609517097473, + "learning_rate": 2.097389816003423e-05, + "loss": 0.2009, + "step": 28894 + }, + { + "epoch": 37.09242618741977, + "grad_norm": 1.2101683616638184, + "learning_rate": 2.0973470261018403e-05, + "loss": 0.2339, + "step": 28895 + }, + { + "epoch": 37.093709884467266, + "grad_norm": 0.6699429154396057, + "learning_rate": 2.0973042362002568e-05, + "loss": 0.1976, + "step": 28896 + }, + { + "epoch": 37.094993581514764, + "grad_norm": 0.9798194766044617, + "learning_rate": 2.0972614462986733e-05, + "loss": 0.2048, + "step": 28897 + }, + { + "epoch": 37.09627727856226, + "grad_norm": 1.3342311382293701, + "learning_rate": 2.0972186563970905e-05, + "loss": 0.2036, + "step": 28898 + }, + { + "epoch": 37.09756097560975, + "grad_norm": 0.8695948123931885, + "learning_rate": 2.097175866495507e-05, + "loss": 0.2005, + "step": 28899 + }, + { + "epoch": 37.09884467265725, + "grad_norm": 0.9985533356666565, + "learning_rate": 2.097133076593924e-05, + "loss": 0.1865, + "step": 28900 + }, + { + "epoch": 37.10012836970475, + "grad_norm": 2.4454355239868164, + "learning_rate": 2.0970902866923407e-05, + "loss": 0.1866, + "step": 28901 + }, + { + "epoch": 37.101412066752246, + "grad_norm": 6.060166358947754, + "learning_rate": 2.0970474967907575e-05, + "loss": 0.1836, + "step": 28902 + }, + { + "epoch": 37.102695763799744, + "grad_norm": 1.3365685939788818, + "learning_rate": 2.0970047068891744e-05, + "loss": 0.2283, + "step": 28903 + }, + { + "epoch": 37.10397946084724, + "grad_norm": 1.098956823348999, + "learning_rate": 
2.096961916987591e-05, + "loss": 0.1919, + "step": 28904 + }, + { + "epoch": 37.10526315789474, + "grad_norm": 1.1063669919967651, + "learning_rate": 2.0969191270860077e-05, + "loss": 0.1796, + "step": 28905 + }, + { + "epoch": 37.10654685494223, + "grad_norm": 0.9845638275146484, + "learning_rate": 2.0968763371844245e-05, + "loss": 0.1906, + "step": 28906 + }, + { + "epoch": 37.10783055198973, + "grad_norm": 1.2997727394104004, + "learning_rate": 2.0968335472828414e-05, + "loss": 0.2277, + "step": 28907 + }, + { + "epoch": 37.109114249037226, + "grad_norm": 0.941903293132782, + "learning_rate": 2.096790757381258e-05, + "loss": 0.2236, + "step": 28908 + }, + { + "epoch": 37.110397946084724, + "grad_norm": 1.5447239875793457, + "learning_rate": 2.0967479674796747e-05, + "loss": 0.1912, + "step": 28909 + }, + { + "epoch": 37.11168164313222, + "grad_norm": 1.4724018573760986, + "learning_rate": 2.0967051775780916e-05, + "loss": 0.1764, + "step": 28910 + }, + { + "epoch": 37.11296534017972, + "grad_norm": 5.905067443847656, + "learning_rate": 2.0966623876765084e-05, + "loss": 0.1888, + "step": 28911 + }, + { + "epoch": 37.11424903722722, + "grad_norm": 3.7154972553253174, + "learning_rate": 2.0966195977749253e-05, + "loss": 0.1925, + "step": 28912 + }, + { + "epoch": 37.11553273427471, + "grad_norm": 1.6836820840835571, + "learning_rate": 2.0965768078733418e-05, + "loss": 0.179, + "step": 28913 + }, + { + "epoch": 37.116816431322206, + "grad_norm": 2.7559943199157715, + "learning_rate": 2.096534017971759e-05, + "loss": 0.2071, + "step": 28914 + }, + { + "epoch": 37.118100128369704, + "grad_norm": 6.411684036254883, + "learning_rate": 2.0964912280701754e-05, + "loss": 0.2216, + "step": 28915 + }, + { + "epoch": 37.1193838254172, + "grad_norm": 2.4623665809631348, + "learning_rate": 2.096448438168592e-05, + "loss": 0.2116, + "step": 28916 + }, + { + "epoch": 37.1206675224647, + "grad_norm": 6.749063014984131, + "learning_rate": 2.096405648267009e-05, + "loss": 0.2126, + 
"step": 28917 + }, + { + "epoch": 37.1219512195122, + "grad_norm": 2.2940471172332764, + "learning_rate": 2.0963628583654256e-05, + "loss": 0.2144, + "step": 28918 + }, + { + "epoch": 37.12323491655969, + "grad_norm": 2.7192554473876953, + "learning_rate": 2.0963200684638428e-05, + "loss": 0.2431, + "step": 28919 + }, + { + "epoch": 37.12451861360719, + "grad_norm": 1.2463902235031128, + "learning_rate": 2.0962772785622593e-05, + "loss": 0.2494, + "step": 28920 + }, + { + "epoch": 37.125802310654684, + "grad_norm": 3.516998529434204, + "learning_rate": 2.096234488660676e-05, + "loss": 0.2504, + "step": 28921 + }, + { + "epoch": 37.12708600770218, + "grad_norm": 2.560046672821045, + "learning_rate": 2.096191698759093e-05, + "loss": 0.2671, + "step": 28922 + }, + { + "epoch": 37.12836970474968, + "grad_norm": 5.141798973083496, + "learning_rate": 2.0961489088575095e-05, + "loss": 0.3626, + "step": 28923 + }, + { + "epoch": 37.12965340179718, + "grad_norm": 4.136301040649414, + "learning_rate": 2.0961061189559263e-05, + "loss": 0.2157, + "step": 28924 + }, + { + "epoch": 37.130937098844676, + "grad_norm": 1.1470438241958618, + "learning_rate": 2.0960633290543432e-05, + "loss": 0.2341, + "step": 28925 + }, + { + "epoch": 37.13222079589217, + "grad_norm": 0.9026839733123779, + "learning_rate": 2.09602053915276e-05, + "loss": 0.2357, + "step": 28926 + }, + { + "epoch": 37.133504492939664, + "grad_norm": 0.975519597530365, + "learning_rate": 2.095977749251177e-05, + "loss": 0.246, + "step": 28927 + }, + { + "epoch": 37.13478818998716, + "grad_norm": 0.9361819624900818, + "learning_rate": 2.0959349593495937e-05, + "loss": 0.2139, + "step": 28928 + }, + { + "epoch": 37.13607188703466, + "grad_norm": 1.178544044494629, + "learning_rate": 2.0958921694480102e-05, + "loss": 0.2044, + "step": 28929 + }, + { + "epoch": 37.13735558408216, + "grad_norm": 0.8304630517959595, + "learning_rate": 2.095849379546427e-05, + "loss": 0.2404, + "step": 28930 + }, + { + "epoch": 
37.138639281129656, + "grad_norm": 2.589921474456787, + "learning_rate": 2.095806589644844e-05, + "loss": 0.212, + "step": 28931 + }, + { + "epoch": 37.13992297817715, + "grad_norm": 2.810096025466919, + "learning_rate": 2.0957637997432604e-05, + "loss": 0.2208, + "step": 28932 + }, + { + "epoch": 37.141206675224645, + "grad_norm": 1.2585068941116333, + "learning_rate": 2.0957210098416776e-05, + "loss": 0.2485, + "step": 28933 + }, + { + "epoch": 37.14249037227214, + "grad_norm": 1.5752325057983398, + "learning_rate": 2.095678219940094e-05, + "loss": 0.2536, + "step": 28934 + }, + { + "epoch": 37.14377406931964, + "grad_norm": 1.7015702724456787, + "learning_rate": 2.0956354300385112e-05, + "loss": 0.2021, + "step": 28935 + }, + { + "epoch": 37.14505776636714, + "grad_norm": 1.2362842559814453, + "learning_rate": 2.0955926401369277e-05, + "loss": 0.2142, + "step": 28936 + }, + { + "epoch": 37.146341463414636, + "grad_norm": 0.9315371513366699, + "learning_rate": 2.0955498502353443e-05, + "loss": 0.2227, + "step": 28937 + }, + { + "epoch": 37.147625160462134, + "grad_norm": 1.1380409002304077, + "learning_rate": 2.0955070603337614e-05, + "loss": 0.2082, + "step": 28938 + }, + { + "epoch": 37.148908857509625, + "grad_norm": 1.523278832435608, + "learning_rate": 2.095464270432178e-05, + "loss": 0.2121, + "step": 28939 + }, + { + "epoch": 37.15019255455712, + "grad_norm": 0.8129517436027527, + "learning_rate": 2.0954214805305948e-05, + "loss": 0.2173, + "step": 28940 + }, + { + "epoch": 37.15147625160462, + "grad_norm": 1.4991787672042847, + "learning_rate": 2.0953786906290116e-05, + "loss": 0.2047, + "step": 28941 + }, + { + "epoch": 37.15275994865212, + "grad_norm": 4.68625020980835, + "learning_rate": 2.0953359007274285e-05, + "loss": 0.1915, + "step": 28942 + }, + { + "epoch": 37.154043645699616, + "grad_norm": 2.213017225265503, + "learning_rate": 2.0952931108258453e-05, + "loss": 0.2081, + "step": 28943 + }, + { + "epoch": 37.155327342747114, + "grad_norm": 
1.427376627922058, + "learning_rate": 2.0952503209242618e-05, + "loss": 0.2145, + "step": 28944 + }, + { + "epoch": 37.15661103979461, + "grad_norm": 0.9083349704742432, + "learning_rate": 2.0952075310226786e-05, + "loss": 0.1944, + "step": 28945 + }, + { + "epoch": 37.1578947368421, + "grad_norm": 0.8563050627708435, + "learning_rate": 2.0951647411210955e-05, + "loss": 0.2232, + "step": 28946 + }, + { + "epoch": 37.1591784338896, + "grad_norm": 0.9630118012428284, + "learning_rate": 2.0951219512195123e-05, + "loss": 0.209, + "step": 28947 + }, + { + "epoch": 37.1604621309371, + "grad_norm": 0.900597333908081, + "learning_rate": 2.0950791613179288e-05, + "loss": 0.198, + "step": 28948 + }, + { + "epoch": 37.161745827984596, + "grad_norm": 1.2190001010894775, + "learning_rate": 2.095036371416346e-05, + "loss": 0.2052, + "step": 28949 + }, + { + "epoch": 37.163029525032094, + "grad_norm": 1.1238903999328613, + "learning_rate": 2.0949935815147625e-05, + "loss": 0.2088, + "step": 28950 + }, + { + "epoch": 37.16431322207959, + "grad_norm": 1.0526800155639648, + "learning_rate": 2.0949507916131793e-05, + "loss": 0.1899, + "step": 28951 + }, + { + "epoch": 37.16559691912708, + "grad_norm": 1.1323235034942627, + "learning_rate": 2.0949080017115962e-05, + "loss": 0.2211, + "step": 28952 + }, + { + "epoch": 37.16688061617458, + "grad_norm": 0.9122915863990784, + "learning_rate": 2.0948652118100127e-05, + "loss": 0.1992, + "step": 28953 + }, + { + "epoch": 37.16816431322208, + "grad_norm": 1.1422594785690308, + "learning_rate": 2.09482242190843e-05, + "loss": 0.2019, + "step": 28954 + }, + { + "epoch": 37.169448010269576, + "grad_norm": 1.024193525314331, + "learning_rate": 2.0947796320068464e-05, + "loss": 0.2086, + "step": 28955 + }, + { + "epoch": 37.170731707317074, + "grad_norm": 1.0573360919952393, + "learning_rate": 2.0947368421052632e-05, + "loss": 0.1963, + "step": 28956 + }, + { + "epoch": 37.17201540436457, + "grad_norm": 0.9789042472839355, + "learning_rate": 
2.09469405220368e-05, + "loss": 0.2111, + "step": 28957 + }, + { + "epoch": 37.17329910141207, + "grad_norm": 1.355027198791504, + "learning_rate": 2.0946512623020966e-05, + "loss": 0.1931, + "step": 28958 + }, + { + "epoch": 37.17458279845956, + "grad_norm": 2.135143518447876, + "learning_rate": 2.0946084724005137e-05, + "loss": 0.1927, + "step": 28959 + }, + { + "epoch": 37.17586649550706, + "grad_norm": 0.8968077301979065, + "learning_rate": 2.0945656824989302e-05, + "loss": 0.1922, + "step": 28960 + }, + { + "epoch": 37.177150192554556, + "grad_norm": 1.213683843612671, + "learning_rate": 2.094522892597347e-05, + "loss": 0.2185, + "step": 28961 + }, + { + "epoch": 37.178433889602054, + "grad_norm": 1.042436957359314, + "learning_rate": 2.094480102695764e-05, + "loss": 0.2085, + "step": 28962 + }, + { + "epoch": 37.17971758664955, + "grad_norm": 1.438951015472412, + "learning_rate": 2.0944373127941808e-05, + "loss": 0.2069, + "step": 28963 + }, + { + "epoch": 37.18100128369705, + "grad_norm": 1.0808039903640747, + "learning_rate": 2.0943945228925973e-05, + "loss": 0.2142, + "step": 28964 + }, + { + "epoch": 37.18228498074454, + "grad_norm": 2.2897188663482666, + "learning_rate": 2.094351732991014e-05, + "loss": 0.1983, + "step": 28965 + }, + { + "epoch": 37.18356867779204, + "grad_norm": 1.0690512657165527, + "learning_rate": 2.094308943089431e-05, + "loss": 0.1898, + "step": 28966 + }, + { + "epoch": 37.18485237483954, + "grad_norm": 1.2563233375549316, + "learning_rate": 2.0942661531878478e-05, + "loss": 0.2031, + "step": 28967 + }, + { + "epoch": 37.186136071887034, + "grad_norm": 1.6432501077651978, + "learning_rate": 2.0942233632862646e-05, + "loss": 0.2365, + "step": 28968 + }, + { + "epoch": 37.18741976893453, + "grad_norm": 1.369336485862732, + "learning_rate": 2.094180573384681e-05, + "loss": 0.2309, + "step": 28969 + }, + { + "epoch": 37.18870346598203, + "grad_norm": 1.6165845394134521, + "learning_rate": 2.094137783483098e-05, + "loss": 0.2131, + 
"step": 28970 + }, + { + "epoch": 37.18998716302953, + "grad_norm": 1.367179036140442, + "learning_rate": 2.0940949935815148e-05, + "loss": 0.2475, + "step": 28971 + }, + { + "epoch": 37.19127086007702, + "grad_norm": 1.4448814392089844, + "learning_rate": 2.0940522036799313e-05, + "loss": 0.2843, + "step": 28972 + }, + { + "epoch": 37.19255455712452, + "grad_norm": 2.3049728870391846, + "learning_rate": 2.0940094137783485e-05, + "loss": 0.3736, + "step": 28973 + }, + { + "epoch": 37.193838254172015, + "grad_norm": 1.7484652996063232, + "learning_rate": 2.093966623876765e-05, + "loss": 0.2454, + "step": 28974 + }, + { + "epoch": 37.19512195121951, + "grad_norm": 0.933415412902832, + "learning_rate": 2.0939238339751822e-05, + "loss": 0.231, + "step": 28975 + }, + { + "epoch": 37.19640564826701, + "grad_norm": 0.9895921349525452, + "learning_rate": 2.0938810440735987e-05, + "loss": 0.2375, + "step": 28976 + }, + { + "epoch": 37.19768934531451, + "grad_norm": 0.8483465313911438, + "learning_rate": 2.0938382541720152e-05, + "loss": 0.2313, + "step": 28977 + }, + { + "epoch": 37.198973042362006, + "grad_norm": 1.0310146808624268, + "learning_rate": 2.0937954642704324e-05, + "loss": 0.2188, + "step": 28978 + }, + { + "epoch": 37.2002567394095, + "grad_norm": 2.9836652278900146, + "learning_rate": 2.093752674368849e-05, + "loss": 0.227, + "step": 28979 + }, + { + "epoch": 37.201540436456995, + "grad_norm": 1.5439280271530151, + "learning_rate": 2.0937098844672657e-05, + "loss": 0.2364, + "step": 28980 + }, + { + "epoch": 37.20282413350449, + "grad_norm": 0.9405295252799988, + "learning_rate": 2.0936670945656826e-05, + "loss": 0.2097, + "step": 28981 + }, + { + "epoch": 37.20410783055199, + "grad_norm": 2.122469186782837, + "learning_rate": 2.0936243046640994e-05, + "loss": 0.2192, + "step": 28982 + }, + { + "epoch": 37.20539152759949, + "grad_norm": 2.121951103210449, + "learning_rate": 2.0935815147625162e-05, + "loss": 0.2456, + "step": 28983 + }, + { + "epoch": 
37.206675224646986, + "grad_norm": 0.884428083896637, + "learning_rate": 2.0935387248609327e-05, + "loss": 0.2221, + "step": 28984 + }, + { + "epoch": 37.20795892169448, + "grad_norm": 0.9140594601631165, + "learning_rate": 2.0934959349593496e-05, + "loss": 0.2343, + "step": 28985 + }, + { + "epoch": 37.209242618741975, + "grad_norm": 0.9948635697364807, + "learning_rate": 2.0934531450577664e-05, + "loss": 0.2164, + "step": 28986 + }, + { + "epoch": 37.21052631578947, + "grad_norm": 1.0092976093292236, + "learning_rate": 2.0934103551561833e-05, + "loss": 0.2157, + "step": 28987 + }, + { + "epoch": 37.21181001283697, + "grad_norm": 1.0345585346221924, + "learning_rate": 2.0933675652545998e-05, + "loss": 0.2117, + "step": 28988 + }, + { + "epoch": 37.21309370988447, + "grad_norm": 1.0878204107284546, + "learning_rate": 2.093324775353017e-05, + "loss": 0.2268, + "step": 28989 + }, + { + "epoch": 37.214377406931966, + "grad_norm": 0.9014996290206909, + "learning_rate": 2.0932819854514334e-05, + "loss": 0.2284, + "step": 28990 + }, + { + "epoch": 37.215661103979464, + "grad_norm": 0.9922034740447998, + "learning_rate": 2.0932391955498503e-05, + "loss": 0.1948, + "step": 28991 + }, + { + "epoch": 37.216944801026955, + "grad_norm": 0.8115811944007874, + "learning_rate": 2.093196405648267e-05, + "loss": 0.2028, + "step": 28992 + }, + { + "epoch": 37.21822849807445, + "grad_norm": 2.287372350692749, + "learning_rate": 2.0931536157466836e-05, + "loss": 0.2106, + "step": 28993 + }, + { + "epoch": 37.21951219512195, + "grad_norm": 0.6827536821365356, + "learning_rate": 2.0931108258451008e-05, + "loss": 0.2117, + "step": 28994 + }, + { + "epoch": 37.22079589216945, + "grad_norm": 1.5654457807540894, + "learning_rate": 2.0930680359435173e-05, + "loss": 0.2212, + "step": 28995 + }, + { + "epoch": 37.222079589216946, + "grad_norm": 2.1321473121643066, + "learning_rate": 2.093025246041934e-05, + "loss": 0.1837, + "step": 28996 + }, + { + "epoch": 37.223363286264444, + "grad_norm": 
0.9839359521865845, + "learning_rate": 2.092982456140351e-05, + "loss": 0.193, + "step": 28997 + }, + { + "epoch": 37.224646983311935, + "grad_norm": 1.5340062379837036, + "learning_rate": 2.0929396662387675e-05, + "loss": 0.2182, + "step": 28998 + }, + { + "epoch": 37.22593068035943, + "grad_norm": 0.9980356097221375, + "learning_rate": 2.0928968763371847e-05, + "loss": 0.2077, + "step": 28999 + }, + { + "epoch": 37.22721437740693, + "grad_norm": 0.8505688309669495, + "learning_rate": 2.0928540864356012e-05, + "loss": 0.2055, + "step": 29000 + }, + { + "epoch": 37.22721437740693, + "eval_cer": 0.26015762678676324, + "eval_loss": 0.49147847294807434, + "eval_runtime": 13.7114, + "eval_samples_per_second": 71.692, + "eval_steps_per_second": 0.511, + "eval_wer": 0.43705572116487046, + "step": 29000 + }, + { + "epoch": 37.22721437740693, + "step": 29000, + "total_flos": 9.916619460929043e+20, + "train_loss": 0.5429986819732806, + "train_runtime": 120211.6675, + "train_samples_per_second": 207.31, + "train_steps_per_second": 0.648 + } + ], + "logging_steps": 1.0, + "max_steps": 77900, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 1000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 5 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.916619460929043e+20, + "train_batch_size": 160, + "trial_name": null, + "trial_params": null +}