diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4934 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8057726999398676, + "eval_steps": 500, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004008819402685909, + "grad_norm": 0.7079163193702698, + "learning_rate": 1.2016021361815755e-06, + "loss": 0.4167, + "step": 10 + }, + { + "epoch": 0.008017638805371818, + "grad_norm": 0.7350050210952759, + "learning_rate": 2.5367156208277703e-06, + "loss": 0.4259, + "step": 20 + }, + { + "epoch": 0.012026458208057728, + "grad_norm": 0.8004947900772095, + "learning_rate": 3.871829105473966e-06, + "loss": 0.4058, + "step": 30 + }, + { + "epoch": 0.016035277610743637, + "grad_norm": 0.7146145701408386, + "learning_rate": 5.206942590120161e-06, + "loss": 0.3923, + "step": 40 + }, + { + "epoch": 0.020044097013429546, + "grad_norm": 0.7212072610855103, + "learning_rate": 6.542056074766355e-06, + "loss": 0.3844, + "step": 50 + }, + { + "epoch": 0.024052916416115455, + "grad_norm": 0.6941127181053162, + "learning_rate": 7.87716955941255e-06, + "loss": 0.3679, + "step": 60 + }, + { + "epoch": 0.028061735818801364, + "grad_norm": 0.5748901963233948, + "learning_rate": 9.212283044058744e-06, + "loss": 0.3518, + "step": 70 + }, + { + "epoch": 0.032070555221487274, + "grad_norm": 0.5205843448638916, + "learning_rate": 1.054739652870494e-05, + "loss": 0.3631, + "step": 80 + }, + { + "epoch": 0.03607937462417318, + "grad_norm": 0.5671967267990112, + "learning_rate": 1.1882510013351136e-05, + "loss": 0.3526, + "step": 90 + }, + { + "epoch": 0.04008819402685909, + "grad_norm": 0.6912459135055542, + "learning_rate": 1.321762349799733e-05, + "loss": 0.3186, + "step": 100 + }, + { + "epoch": 0.044097013429545, + "grad_norm": 0.6880580186843872, + "learning_rate": 1.4552736982643526e-05, + "loss": 0.3127, + "step": 110 + }, + { + "epoch": 0.04810583283223091, + "grad_norm": 0.7009742259979248, + "learning_rate": 1.588785046728972e-05, + "loss": 0.3092, + "step": 120 + }, + { + "epoch": 0.05211465223491682, + "grad_norm": 0.7727891802787781, + "learning_rate": 1.7222963951935918e-05, + "loss": 0.3138, + "step": 130 + }, + { + "epoch": 0.05612347163760273, + "grad_norm": 0.6901352405548096, + "learning_rate": 1.855807743658211e-05, + "loss": 0.3014, + "step": 140 + }, + { + "epoch": 0.06013229104028864, + "grad_norm": 0.8163438439369202, + "learning_rate": 1.9893190921228304e-05, + "loss": 0.312, + "step": 150 + }, + { + "epoch": 0.06414111044297455, + "grad_norm": 0.8670288324356079, + "learning_rate": 2.12283044058745e-05, + "loss": 0.2875, + "step": 160 + }, + { + "epoch": 0.06814992984566046, + "grad_norm": 1.0734412670135498, + "learning_rate": 2.2563417890520698e-05, + "loss": 0.2919, + "step": 170 + }, + { + "epoch": 0.07215874924834637, + "grad_norm": 0.704152524471283, + "learning_rate": 2.389853137516689e-05, + "loss": 0.3094, + "step": 180 + }, + { + "epoch": 0.07616756865103227, + "grad_norm": 0.7793599963188171, + "learning_rate": 2.5233644859813084e-05, + "loss": 0.2992, + "step": 190 + }, + { + "epoch": 0.08017638805371818, + "grad_norm": 0.8480731248855591, + "learning_rate": 2.656875834445928e-05, + "loss": 0.2957, + "step": 200 + }, + { + "epoch": 0.0841852074564041, + "grad_norm": 0.8737421631813049, + "learning_rate": 2.7903871829105478e-05, + "loss": 0.2794, + "step": 210 + }, + { + "epoch": 0.08819402685909, + "grad_norm": 0.8049966096878052, + "learning_rate": 2.923898531375167e-05, + "loss": 0.2767, + "step": 220 + }, + { + "epoch": 0.09220284626177591, + "grad_norm": 0.8555333614349365, + "learning_rate": 3.0574098798397864e-05, + "loss": 0.2899, + "step": 230 + }, + { + "epoch": 0.09621166566446182, + "grad_norm": 0.8982564806938171, + "learning_rate": 3.190921228304406e-05, + "loss": 0.2661, + "step": 240 + }, + { + "epoch": 0.10022048506714773, + "grad_norm": 0.7863436937332153, + "learning_rate": 3.324432576769025e-05, + "loss": 0.284, + "step": 250 + }, + { + "epoch": 0.10422930446983364, + "grad_norm": 0.861031711101532, + "learning_rate": 3.457943925233645e-05, + "loss": 0.266, + "step": 260 + }, + { + "epoch": 0.10823812387251955, + "grad_norm": 0.7962524890899658, + "learning_rate": 3.5914552736982644e-05, + "loss": 0.2589, + "step": 270 + }, + { + "epoch": 0.11224694327520546, + "grad_norm": 0.7888882756233215, + "learning_rate": 3.7249666221628844e-05, + "loss": 0.2692, + "step": 280 + }, + { + "epoch": 0.11625576267789137, + "grad_norm": 1.0324134826660156, + "learning_rate": 3.858477970627504e-05, + "loss": 0.2785, + "step": 290 + }, + { + "epoch": 0.12026458208057728, + "grad_norm": 0.8519011735916138, + "learning_rate": 3.991989319092123e-05, + "loss": 0.2837, + "step": 300 + }, + { + "epoch": 0.12427340148326319, + "grad_norm": 0.9894006848335266, + "learning_rate": 4.1255006675567424e-05, + "loss": 0.2729, + "step": 310 + }, + { + "epoch": 0.1282822208859491, + "grad_norm": 0.8675716519355774, + "learning_rate": 4.259012016021362e-05, + "loss": 0.2637, + "step": 320 + }, + { + "epoch": 0.132291040288635, + "grad_norm": 0.8296481370925903, + "learning_rate": 4.392523364485982e-05, + "loss": 0.2597, + "step": 330 + }, + { + "epoch": 0.1362998596913209, + "grad_norm": 0.7083739042282104, + "learning_rate": 4.526034712950601e-05, + "loss": 0.2731, + "step": 340 + }, + { + "epoch": 0.14030867909400682, + "grad_norm": 0.8215944766998291, + "learning_rate": 4.6595460614152204e-05, + "loss": 0.2655, + "step": 350 + }, + { + "epoch": 0.14431749849669273, + "grad_norm": 0.7739771008491516, + "learning_rate": 4.79305740987984e-05, + "loss": 0.2661, + "step": 360 + }, + { + "epoch": 0.14832631789937864, + "grad_norm": 0.6515551805496216, + "learning_rate": 4.92656875834446e-05, + "loss": 0.272, + "step": 370 + }, + { + "epoch": 0.15233513730206455, + "grad_norm": 0.7235403060913086, + "learning_rate": 5.0600801068090784e-05, + "loss": 0.295, + "step": 380 + }, + { + "epoch": 0.15634395670475046, + "grad_norm": 0.7624292373657227, + "learning_rate": 5.1935914552736984e-05, + "loss": 0.2622, + "step": 390 + }, + { + "epoch": 0.16035277610743637, + "grad_norm": 0.7019667029380798, + "learning_rate": 5.327102803738318e-05, + "loss": 0.2562, + "step": 400 + }, + { + "epoch": 0.16436159551012228, + "grad_norm": 0.7686800360679626, + "learning_rate": 5.460614152202938e-05, + "loss": 0.2726, + "step": 410 + }, + { + "epoch": 0.1683704149128082, + "grad_norm": 0.6799090504646301, + "learning_rate": 5.594125500667558e-05, + "loss": 0.266, + "step": 420 + }, + { + "epoch": 0.1723792343154941, + "grad_norm": 0.6165328025817871, + "learning_rate": 5.7276368491321764e-05, + "loss": 0.2706, + "step": 430 + }, + { + "epoch": 0.17638805371818, + "grad_norm": 2.7531023025512695, + "learning_rate": 5.8611481975967965e-05, + "loss": 0.2645, + "step": 440 + }, + { + "epoch": 0.1803968731208659, + "grad_norm": 0.7134599685668945, + "learning_rate": 5.994659546061415e-05, + "loss": 0.2823, + "step": 450 + }, + { + "epoch": 0.18440569252355182, + "grad_norm": 0.8196555376052856, + "learning_rate": 6.128170894526035e-05, + "loss": 0.2568, + "step": 460 + }, + { + "epoch": 0.18841451192623773, + "grad_norm": 0.7205436825752258, + "learning_rate": 6.261682242990654e-05, + "loss": 0.2638, + "step": 470 + }, + { + "epoch": 0.19242333132892364, + "grad_norm": 0.6776229739189148, + "learning_rate": 6.395193591455274e-05, + "loss": 0.2565, + "step": 480 + }, + { + "epoch": 0.19643215073160955, + "grad_norm": 0.5640079379081726, + "learning_rate": 6.528704939919892e-05, + "loss": 0.2652, + "step": 490 + }, + { + "epoch": 0.20044097013429546, + "grad_norm": 0.6904841661453247, + "learning_rate": 6.662216288384512e-05, + "loss": 0.2761, + "step": 500 + }, + { + "epoch": 0.20444978953698137, + "grad_norm": 0.602130651473999, + "learning_rate": 6.795727636849132e-05, + "loss": 0.2628, + "step": 510 + }, + { + "epoch": 0.20845860893966728, + "grad_norm": 0.5913508534431458, + "learning_rate": 6.929238985313752e-05, + "loss": 0.2655, + "step": 520 + }, + { + "epoch": 0.2124674283423532, + "grad_norm": 0.6655270457267761, + "learning_rate": 7.062750333778372e-05, + "loss": 0.2453, + "step": 530 + }, + { + "epoch": 0.2164762477450391, + "grad_norm": 0.7957231998443604, + "learning_rate": 7.196261682242991e-05, + "loss": 0.2572, + "step": 540 + }, + { + "epoch": 0.220485067147725, + "grad_norm": 0.6448558568954468, + "learning_rate": 7.329773030707611e-05, + "loss": 0.2674, + "step": 550 + }, + { + "epoch": 0.22449388655041091, + "grad_norm": 0.6882309317588806, + "learning_rate": 7.46328437917223e-05, + "loss": 0.2634, + "step": 560 + }, + { + "epoch": 0.22850270595309682, + "grad_norm": 0.5508169531822205, + "learning_rate": 7.59679572763685e-05, + "loss": 0.2821, + "step": 570 + }, + { + "epoch": 0.23251152535578273, + "grad_norm": 0.567504346370697, + "learning_rate": 7.73030707610147e-05, + "loss": 0.2623, + "step": 580 + }, + { + "epoch": 0.23652034475846864, + "grad_norm": 0.5799248218536377, + "learning_rate": 7.863818424566088e-05, + "loss": 0.2574, + "step": 590 + }, + { + "epoch": 0.24052916416115455, + "grad_norm": 0.6579704880714417, + "learning_rate": 7.997329773030708e-05, + "loss": 0.2639, + "step": 600 + }, + { + "epoch": 0.24453798356384046, + "grad_norm": 0.6886210441589355, + "learning_rate": 8.130841121495327e-05, + "loss": 0.2604, + "step": 610 + }, + { + "epoch": 0.24854680296652637, + "grad_norm": 0.702531635761261, + "learning_rate": 8.264352469959947e-05, + "loss": 0.2549, + "step": 620 + }, + { + "epoch": 0.25255562236921225, + "grad_norm": 0.605786144733429, + "learning_rate": 8.397863818424566e-05, + "loss": 0.2515, + "step": 630 + }, + { + "epoch": 0.2565644417718982, + "grad_norm": 0.7157173752784729, + "learning_rate": 8.531375166889186e-05, + "loss": 0.2565, + "step": 640 + }, + { + "epoch": 0.26057326117458407, + "grad_norm": 0.552195131778717, + "learning_rate": 8.664886515353804e-05, + "loss": 0.2675, + "step": 650 + }, + { + "epoch": 0.26458208057727, + "grad_norm": 0.7387903928756714, + "learning_rate": 8.798397863818424e-05, + "loss": 0.2593, + "step": 660 + }, + { + "epoch": 0.2685908999799559, + "grad_norm": 0.5697731971740723, + "learning_rate": 8.931909212283044e-05, + "loss": 0.2525, + "step": 670 + }, + { + "epoch": 0.2725997193826418, + "grad_norm": 0.5313478708267212, + "learning_rate": 9.065420560747664e-05, + "loss": 0.2503, + "step": 680 + }, + { + "epoch": 0.2766085387853277, + "grad_norm": 0.5595772862434387, + "learning_rate": 9.198931909212284e-05, + "loss": 0.2455, + "step": 690 + }, + { + "epoch": 0.28061735818801364, + "grad_norm": 0.6393229365348816, + "learning_rate": 9.332443257676903e-05, + "loss": 0.2541, + "step": 700 + }, + { + "epoch": 0.2846261775906995, + "grad_norm": 0.6859897375106812, + "learning_rate": 9.465954606141523e-05, + "loss": 0.2552, + "step": 710 + }, + { + "epoch": 0.28863499699338546, + "grad_norm": 0.5158377289772034, + "learning_rate": 9.599465954606142e-05, + "loss": 0.2658, + "step": 720 + }, + { + "epoch": 0.29264381639607134, + "grad_norm": 0.5928187966346741, + "learning_rate": 9.732977303070762e-05, + "loss": 0.2515, + "step": 730 + }, + { + "epoch": 0.2966526357987573, + "grad_norm": 0.5400727391242981, + "learning_rate": 9.86648865153538e-05, + "loss": 0.255, + "step": 740 + }, + { + "epoch": 0.30066145520144316, + "grad_norm": 0.6557461023330688, + "learning_rate": 0.0001, + "loss": 0.266, + "step": 750 + }, + { + "epoch": 0.3046702746041291, + "grad_norm": 0.5242008566856384, + "learning_rate": 9.999945572080073e-05, + "loss": 0.2581, + "step": 760 + }, + { + "epoch": 0.308679094006815, + "grad_norm": 0.6318579912185669, + "learning_rate": 9.999782289505249e-05, + "loss": 0.2487, + "step": 770 + }, + { + "epoch": 0.3126879134095009, + "grad_norm": 0.5616021156311035, + "learning_rate": 9.999510155830382e-05, + "loss": 0.2477, + "step": 780 + }, + { + "epoch": 0.3166967328121868, + "grad_norm": 0.6462226510047913, + "learning_rate": 9.999129176980139e-05, + "loss": 0.2534, + "step": 790 + }, + { + "epoch": 0.32070555221487274, + "grad_norm": 0.6198021769523621, + "learning_rate": 9.998639361248875e-05, + "loss": 0.2526, + "step": 800 + }, + { + "epoch": 0.3247143716175586, + "grad_norm": 0.7665322422981262, + "learning_rate": 9.99804071930045e-05, + "loss": 0.243, + "step": 810 + }, + { + "epoch": 0.32872319102024455, + "grad_norm": 0.4766557514667511, + "learning_rate": 9.997333264168e-05, + "loss": 0.2535, + "step": 820 + }, + { + "epoch": 0.33273201042293044, + "grad_norm": 0.5588070154190063, + "learning_rate": 9.996517011253648e-05, + "loss": 0.2597, + "step": 830 + }, + { + "epoch": 0.3367408298256164, + "grad_norm": 0.5454280376434326, + "learning_rate": 9.995591978328171e-05, + "loss": 0.244, + "step": 840 + }, + { + "epoch": 0.34074964922830225, + "grad_norm": 0.49733200669288635, + "learning_rate": 9.994558185530623e-05, + "loss": 0.2537, + "step": 850 + }, + { + "epoch": 0.3447584686309882, + "grad_norm": 0.5581687688827515, + "learning_rate": 9.993415655367875e-05, + "loss": 0.2404, + "step": 860 + }, + { + "epoch": 0.34876728803367407, + "grad_norm": 0.6027563810348511, + "learning_rate": 9.992164412714143e-05, + "loss": 0.2482, + "step": 870 + }, + { + "epoch": 0.35277610743636, + "grad_norm": 0.5277743935585022, + "learning_rate": 9.990804484810444e-05, + "loss": 0.2495, + "step": 880 + }, + { + "epoch": 0.3567849268390459, + "grad_norm": 0.4644189774990082, + "learning_rate": 9.989335901263996e-05, + "loss": 0.2484, + "step": 890 + }, + { + "epoch": 0.3607937462417318, + "grad_norm": 0.5761633515357971, + "learning_rate": 9.987758694047575e-05, + "loss": 0.2449, + "step": 900 + }, + { + "epoch": 0.3648025656444177, + "grad_norm": 0.5238028168678284, + "learning_rate": 9.986072897498829e-05, + "loss": 0.2492, + "step": 910 + }, + { + "epoch": 0.36881138504710365, + "grad_norm": 0.7435324788093567, + "learning_rate": 9.984278548319515e-05, + "loss": 0.2329, + "step": 920 + }, + { + "epoch": 0.3728202044497895, + "grad_norm": 0.5836918950080872, + "learning_rate": 9.982375685574712e-05, + "loss": 0.2366, + "step": 930 + }, + { + "epoch": 0.37682902385247546, + "grad_norm": 0.5967078804969788, + "learning_rate": 9.980364350691962e-05, + "loss": 0.2323, + "step": 940 + }, + { + "epoch": 0.38083784325516135, + "grad_norm": 0.5183435082435608, + "learning_rate": 9.978244587460376e-05, + "loss": 0.2496, + "step": 950 + }, + { + "epoch": 0.3848466626578473, + "grad_norm": 0.5637623071670532, + "learning_rate": 9.976016442029675e-05, + "loss": 0.2469, + "step": 960 + }, + { + "epoch": 0.38885548206053316, + "grad_norm": 0.5092435479164124, + "learning_rate": 9.973679962909189e-05, + "loss": 0.2423, + "step": 970 + }, + { + "epoch": 0.3928643014632191, + "grad_norm": 0.589830219745636, + "learning_rate": 9.971235200966795e-05, + "loss": 0.2327, + "step": 980 + }, + { + "epoch": 0.396873120865905, + "grad_norm": 0.47097843885421753, + "learning_rate": 9.968682209427817e-05, + "loss": 0.2597, + "step": 990 + }, + { + "epoch": 0.4008819402685909, + "grad_norm": 0.5486937761306763, + "learning_rate": 9.966021043873864e-05, + "loss": 0.2471, + "step": 1000 + }, + { + "epoch": 0.4048907596712768, + "grad_norm": 0.7114580273628235, + "learning_rate": 9.963251762241616e-05, + "loss": 0.2438, + "step": 1010 + }, + { + "epoch": 0.40889957907396274, + "grad_norm": 0.6333823800086975, + "learning_rate": 9.96037442482157e-05, + "loss": 0.2264, + "step": 1020 + }, + { + "epoch": 0.4129083984766486, + "grad_norm": 0.6404210329055786, + "learning_rate": 9.95738909425672e-05, + "loss": 0.2388, + "step": 1030 + }, + { + "epoch": 0.41691721787933456, + "grad_norm": 0.4633651673793793, + "learning_rate": 9.954295835541203e-05, + "loss": 0.2438, + "step": 1040 + }, + { + "epoch": 0.42092603728202044, + "grad_norm": 0.5816488265991211, + "learning_rate": 9.951094716018871e-05, + "loss": 0.2397, + "step": 1050 + }, + { + "epoch": 0.4249348566847064, + "grad_norm": 0.4773317277431488, + "learning_rate": 9.947785805381836e-05, + "loss": 0.2549, + "step": 1060 + }, + { + "epoch": 0.42894367608739226, + "grad_norm": 0.6512565016746521, + "learning_rate": 9.944369175668948e-05, + "loss": 0.2341, + "step": 1070 + }, + { + "epoch": 0.4329524954900782, + "grad_norm": 0.5367722511291504, + "learning_rate": 9.940844901264225e-05, + "loss": 0.2331, + "step": 1080 + }, + { + "epoch": 0.4369613148927641, + "grad_norm": 0.46036213636398315, + "learning_rate": 9.937213058895237e-05, + "loss": 0.2506, + "step": 1090 + }, + { + "epoch": 0.44097013429545, + "grad_norm": 0.5731292366981506, + "learning_rate": 9.933473727631435e-05, + "loss": 0.2458, + "step": 1100 + }, + { + "epoch": 0.4449789536981359, + "grad_norm": 0.4861133396625519, + "learning_rate": 9.929626988882428e-05, + "loss": 0.2385, + "step": 1110 + }, + { + "epoch": 0.44898777310082183, + "grad_norm": 0.5132921934127808, + "learning_rate": 9.925672926396212e-05, + "loss": 0.239, + "step": 1120 + }, + { + "epoch": 0.4529965925035077, + "grad_norm": 0.5186300873756409, + "learning_rate": 9.921611626257344e-05, + "loss": 0.2342, + "step": 1130 + }, + { + "epoch": 0.45700541190619365, + "grad_norm": 0.5068562626838684, + "learning_rate": 9.917443176885073e-05, + "loss": 0.2377, + "step": 1140 + }, + { + "epoch": 0.46101423130887953, + "grad_norm": 0.5708321928977966, + "learning_rate": 9.913167669031409e-05, + "loss": 0.2245, + "step": 1150 + }, + { + "epoch": 0.46502305071156547, + "grad_norm": 0.5989340543746948, + "learning_rate": 9.908785195779153e-05, + "loss": 0.2235, + "step": 1160 + }, + { + "epoch": 0.46903187011425135, + "grad_norm": 0.5365070700645447, + "learning_rate": 9.904295852539867e-05, + "loss": 0.2434, + "step": 1170 + }, + { + "epoch": 0.4730406895169373, + "grad_norm": 0.625773549079895, + "learning_rate": 9.899699737051793e-05, + "loss": 0.23, + "step": 1180 + }, + { + "epoch": 0.47704950891962317, + "grad_norm": 0.5225462317466736, + "learning_rate": 9.894996949377738e-05, + "loss": 0.2219, + "step": 1190 + }, + { + "epoch": 0.4810583283223091, + "grad_norm": 0.6289816498756409, + "learning_rate": 9.890187591902879e-05, + "loss": 0.2428, + "step": 1200 + }, + { + "epoch": 0.485067147724995, + "grad_norm": 0.5729430317878723, + "learning_rate": 9.885271769332547e-05, + "loss": 0.2267, + "step": 1210 + }, + { + "epoch": 0.4890759671276809, + "grad_norm": 0.4762110412120819, + "learning_rate": 9.880249588689941e-05, + "loss": 0.2306, + "step": 1220 + }, + { + "epoch": 0.4930847865303668, + "grad_norm": 0.5188407301902771, + "learning_rate": 9.875121159313797e-05, + "loss": 0.2389, + "step": 1230 + }, + { + "epoch": 0.49709360593305274, + "grad_norm": 0.630754828453064, + "learning_rate": 9.869886592856016e-05, + "loss": 0.2262, + "step": 1240 + }, + { + "epoch": 0.5011024253357387, + "grad_norm": 0.5617574453353882, + "learning_rate": 9.864546003279222e-05, + "loss": 0.2362, + "step": 1250 + }, + { + "epoch": 0.5051112447384245, + "grad_norm": 0.6316712498664856, + "learning_rate": 9.859099506854285e-05, + "loss": 0.2265, + "step": 1260 + }, + { + "epoch": 0.5091200641411104, + "grad_norm": 0.4762302339076996, + "learning_rate": 9.8535472221578e-05, + "loss": 0.2286, + "step": 1270 + }, + { + "epoch": 0.5131288835437964, + "grad_norm": 0.6005476117134094, + "learning_rate": 9.847889270069483e-05, + "loss": 0.2217, + "step": 1280 + }, + { + "epoch": 0.5171377029464823, + "grad_norm": 0.5312756299972534, + "learning_rate": 9.842125773769563e-05, + "loss": 0.2285, + "step": 1290 + }, + { + "epoch": 0.5211465223491681, + "grad_norm": 0.7248687744140625, + "learning_rate": 9.836256858736086e-05, + "loss": 0.2354, + "step": 1300 + }, + { + "epoch": 0.5251553417518541, + "grad_norm": 0.5596895813941956, + "learning_rate": 9.830282652742186e-05, + "loss": 0.2286, + "step": 1310 + }, + { + "epoch": 0.52916416115454, + "grad_norm": 0.6484787464141846, + "learning_rate": 9.824203285853305e-05, + "loss": 0.2325, + "step": 1320 + }, + { + "epoch": 0.533172980557226, + "grad_norm": 0.5286840200424194, + "learning_rate": 9.81801889042436e-05, + "loss": 0.2213, + "step": 1330 + }, + { + "epoch": 0.5371817999599118, + "grad_norm": 0.5632983446121216, + "learning_rate": 9.811729601096865e-05, + "loss": 0.2262, + "step": 1340 + }, + { + "epoch": 0.5411906193625977, + "grad_norm": 0.6314755082130432, + "learning_rate": 9.805335554795993e-05, + "loss": 0.226, + "step": 1350 + }, + { + "epoch": 0.5451994387652837, + "grad_norm": 0.5536089539527893, + "learning_rate": 9.798836890727601e-05, + "loss": 0.2363, + "step": 1360 + }, + { + "epoch": 0.5492082581679696, + "grad_norm": 0.5642661452293396, + "learning_rate": 9.792233750375193e-05, + "loss": 0.2367, + "step": 1370 + }, + { + "epoch": 0.5532170775706554, + "grad_norm": 0.4720064103603363, + "learning_rate": 9.785526277496851e-05, + "loss": 0.2278, + "step": 1380 + }, + { + "epoch": 0.5572258969733414, + "grad_norm": 0.568137526512146, + "learning_rate": 9.778714618122091e-05, + "loss": 0.2135, + "step": 1390 + }, + { + "epoch": 0.5612347163760273, + "grad_norm": 0.5233467221260071, + "learning_rate": 9.771798920548693e-05, + "loss": 0.2243, + "step": 1400 + }, + { + "epoch": 0.5652435357787132, + "grad_norm": 0.5088178515434265, + "learning_rate": 9.764779335339473e-05, + "loss": 0.2438, + "step": 1410 + }, + { + "epoch": 0.569252355181399, + "grad_norm": 0.6083818078041077, + "learning_rate": 9.757656015318998e-05, + "loss": 0.2223, + "step": 1420 + }, + { + "epoch": 0.573261174584085, + "grad_norm": 0.5877081155776978, + "learning_rate": 9.750429115570264e-05, + "loss": 0.2298, + "step": 1430 + }, + { + "epoch": 0.5772699939867709, + "grad_norm": 0.6110019683837891, + "learning_rate": 9.743098793431321e-05, + "loss": 0.2323, + "step": 1440 + }, + { + "epoch": 0.5812788133894567, + "grad_norm": 0.5051080584526062, + "learning_rate": 9.735665208491842e-05, + "loss": 0.2436, + "step": 1450 + }, + { + "epoch": 0.5852876327921427, + "grad_norm": 0.5243321657180786, + "learning_rate": 9.728128522589655e-05, + "loss": 0.2338, + "step": 1460 + }, + { + "epoch": 0.5892964521948286, + "grad_norm": 0.6249774694442749, + "learning_rate": 9.720488899807214e-05, + "loss": 0.226, + "step": 1470 + }, + { + "epoch": 0.5933052715975146, + "grad_norm": 0.5004896521568298, + "learning_rate": 9.71274650646803e-05, + "loss": 0.2144, + "step": 1480 + }, + { + "epoch": 0.5973140910002004, + "grad_norm": 0.6254176497459412, + "learning_rate": 9.704901511133048e-05, + "loss": 0.219, + "step": 1490 + }, + { + "epoch": 0.6013229104028863, + "grad_norm": 0.5976850390434265, + "learning_rate": 9.696954084596979e-05, + "loss": 0.2323, + "step": 1500 + }, + { + "epoch": 0.6053317298055723, + "grad_norm": 0.588320791721344, + "learning_rate": 9.688904399884583e-05, + "loss": 0.2049, + "step": 1510 + }, + { + "epoch": 0.6093405492082582, + "grad_norm": 0.655425488948822, + "learning_rate": 9.680752632246896e-05, + "loss": 0.224, + "step": 1520 + }, + { + "epoch": 0.613349368610944, + "grad_norm": 0.6558622121810913, + "learning_rate": 9.672498959157422e-05, + "loss": 0.2201, + "step": 1530 + }, + { + "epoch": 0.61735818801363, + "grad_norm": 0.6564059853553772, + "learning_rate": 9.664143560308263e-05, + "loss": 0.2075, + "step": 1540 + }, + { + "epoch": 0.6213670074163159, + "grad_norm": 0.573246419429779, + "learning_rate": 9.655686617606212e-05, + "loss": 0.2091, + "step": 1550 + }, + { + "epoch": 0.6253758268190018, + "grad_norm": 0.6015535593032837, + "learning_rate": 9.647128315168788e-05, + "loss": 0.2221, + "step": 1560 + }, + { + "epoch": 0.6293846462216877, + "grad_norm": 0.6874203085899353, + "learning_rate": 9.638468839320232e-05, + "loss": 0.213, + "step": 1570 + }, + { + "epoch": 0.6333934656243736, + "grad_norm": 0.5189663171768188, + "learning_rate": 9.629708378587445e-05, + "loss": 0.2161, + "step": 1580 + }, + { + "epoch": 0.6374022850270595, + "grad_norm": 0.571725070476532, + "learning_rate": 9.62084712369589e-05, + "loss": 0.2236, + "step": 1590 + }, + { + "epoch": 0.6414111044297455, + "grad_norm": 0.6262040734291077, + "learning_rate": 9.61188526756544e-05, + "loss": 0.2346, + "step": 1600 + }, + { + "epoch": 0.6454199238324313, + "grad_norm": 0.6156971454620361, + "learning_rate": 9.602823005306164e-05, + "loss": 0.2089, + "step": 1610 + }, + { + "epoch": 0.6494287432351172, + "grad_norm": 0.5515331625938416, + "learning_rate": 9.5936605342141e-05, + "loss": 0.2225, + "step": 1620 + }, + { + "epoch": 0.6534375626378032, + "grad_norm": 0.6686428785324097, + "learning_rate": 9.584398053766941e-05, + "loss": 0.2189, + "step": 1630 + }, + { + "epoch": 0.6574463820404891, + "grad_norm": 0.5298424959182739, + "learning_rate": 9.575035765619708e-05, + "loss": 0.2297, + "step": 1640 + }, + { + "epoch": 0.6614552014431749, + "grad_norm": 0.6391364932060242, + "learning_rate": 9.565573873600349e-05, + "loss": 0.2441, + "step": 1650 + }, + { + "epoch": 0.6654640208458609, + "grad_norm": 0.6574255228042603, + "learning_rate": 9.556012583705303e-05, + "loss": 0.2329, + "step": 1660 + }, + { + "epoch": 0.6694728402485468, + "grad_norm": 0.5856221914291382, + "learning_rate": 9.546352104095019e-05, + "loss": 0.2001, + "step": 1670 + }, + { + "epoch": 0.6734816596512327, + "grad_norm": 0.6181838512420654, + "learning_rate": 9.536592645089421e-05, + "loss": 0.2255, + "step": 1680 + }, + { + "epoch": 0.6774904790539186, + "grad_norm": 0.635492742061615, + "learning_rate": 9.52673441916333e-05, + "loss": 0.1973, + "step": 1690 + }, + { + "epoch": 0.6814992984566045, + "grad_norm": 0.6166744232177734, + "learning_rate": 9.51677764094184e-05, + "loss": 0.2248, + "step": 1700 + }, + { + "epoch": 0.6855081178592904, + "grad_norm": 0.6294150352478027, + "learning_rate": 9.506722527195639e-05, + "loss": 0.2123, + "step": 1710 + }, + { + "epoch": 0.6895169372619764, + "grad_norm": 0.6106050610542297, + "learning_rate": 9.496569296836301e-05, + "loss": 0.208, + "step": 1720 + }, + { + "epoch": 0.6935257566646622, + "grad_norm": 0.6652440428733826, + "learning_rate": 9.486318170911508e-05, + "loss": 0.2112, + "step": 1730 + }, + { + "epoch": 0.6975345760673481, + "grad_norm": 0.5508642792701721, + "learning_rate": 9.475969372600246e-05, + "loss": 0.2299, + "step": 1740 + }, + { + "epoch": 0.7015433954700341, + "grad_norm": 0.5851196050643921, + "learning_rate": 9.465523127207938e-05, + "loss": 0.2283, + "step": 1750 + }, + { + "epoch": 0.70555221487272, + "grad_norm": 0.6574164628982544, + "learning_rate": 9.454979662161547e-05, + "loss": 0.2149, + "step": 1760 + }, + { + "epoch": 0.7095610342754058, + "grad_norm": 0.562202513217926, + "learning_rate": 9.444339207004626e-05, + "loss": 0.2162, + "step": 1770 + }, + { + "epoch": 0.7135698536780918, + "grad_norm": 0.5654606223106384, + "learning_rate": 9.433601993392308e-05, + "loss": 0.2283, + "step": 1780 + }, + { + "epoch": 0.7175786730807777, + "grad_norm": 0.5194072127342224, + "learning_rate": 9.422768255086274e-05, + "loss": 0.2266, + "step": 1790 + }, + { + "epoch": 0.7215874924834637, + "grad_norm": 0.651335597038269, + "learning_rate": 9.411838227949663e-05, + "loss": 0.1999, + "step": 1800 + }, + { + "epoch": 0.7255963118861495, + "grad_norm": 0.6659877300262451, + "learning_rate": 9.400812149941932e-05, + "loss": 0.2148, + "step": 1810 + }, + { + "epoch": 0.7296051312888354, + "grad_norm": 0.6771412491798401, + "learning_rate": 9.389690261113672e-05, + "loss": 0.2233, + "step": 1820 + }, + { + "epoch": 0.7336139506915214, + "grad_norm": 0.8170326948165894, + "learning_rate": 9.378472803601397e-05, + "loss": 0.2282, + "step": 1830 + }, + { + "epoch": 0.7376227700942073, + "grad_norm": 0.6430959701538086, + "learning_rate": 9.36716002162226e-05, + "loss": 0.2036, + "step": 1840 + }, + { + "epoch": 0.7416315894968931, + "grad_norm": 0.6288866996765137, + "learning_rate": 9.355752161468731e-05, + "loss": 0.2223, + "step": 1850 + }, + { + "epoch": 0.745640408899579, + "grad_norm": 0.7772784233093262, + "learning_rate": 9.344249471503259e-05, + "loss": 0.2183, + "step": 1860 + }, + { + "epoch": 0.749649228302265, + "grad_norm": 0.6505045890808105, + "learning_rate": 9.332652202152833e-05, + "loss": 0.2126, + "step": 1870 + }, + { + "epoch": 0.7536580477049509, + "grad_norm": 0.5706261992454529, + "learning_rate": 9.320960605903553e-05, + "loss": 0.2107, + "step": 1880 + }, + { + "epoch": 0.7576668671076368, + "grad_norm": 0.5667653679847717, + "learning_rate": 9.309174937295126e-05, + "loss": 0.2036, + "step": 1890 + }, + { + "epoch": 0.7616756865103227, + "grad_norm": 0.6292815208435059, + "learning_rate": 9.297295452915323e-05, + "loss": 0.2038, + "step": 1900 + }, + { + "epoch": 0.7656845059130086, + "grad_norm": 0.6061923503875732, + "learning_rate": 9.285322411394393e-05, + "loss": 0.2183, + "step": 1910 + }, + { + "epoch": 0.7696933253156946, + "grad_norm": 0.7514089941978455, + "learning_rate": 9.273256073399434e-05, + "loss": 0.2135, + "step": 1920 + }, + { + "epoch": 0.7737021447183804, + "grad_norm": 0.6030351519584656, + "learning_rate": 9.261096701628718e-05, + "loss": 0.2098, + "step": 1930 + }, + { + "epoch": 0.7777109641210663, + "grad_norm": 0.7148683667182922, + "learning_rate": 9.248844560805969e-05, + "loss": 0.2085, + "step": 1940 + }, + { + "epoch": 0.7817197835237523, + "grad_norm": 0.7136949300765991, + "learning_rate": 9.236499917674606e-05, + "loss": 0.1998, + "step": 1950 + }, + { + "epoch": 0.7857286029264382, + "grad_norm": 0.7132196426391602, + "learning_rate": 9.224063040991924e-05, + "loss": 0.2082, + "step": 1960 + }, + { + "epoch": 0.789737422329124, + "grad_norm": 0.5503913164138794, + "learning_rate": 9.211534201523255e-05, + "loss": 0.2238, + "step": 1970 + }, + { + "epoch": 0.79374624173181, + "grad_norm": 0.7679104804992676, + "learning_rate": 9.198913672036072e-05, + "loss": 0.1971, + "step": 1980 + }, + { + "epoch": 0.7977550611344959, + "grad_norm": 0.9002260565757751, + "learning_rate": 9.186201727294036e-05, + "loss": 0.1998, + "step": 1990 + }, + { + "epoch": 0.8017638805371818, + "grad_norm": 0.5790923833847046, + "learning_rate": 9.173398644051035e-05, + "loss": 0.2113, + "step": 2000 + }, + { + "epoch": 0.8057726999398677, + "grad_norm": 0.6548293828964233, + "learning_rate": 9.160504701045145e-05, + "loss": 0.1969, + "step": 2010 + }, + { + "epoch": 0.8097815193425536, + "grad_norm": 0.6647776961326599, + "learning_rate": 9.147520178992563e-05, + "loss": 0.1979, + "step": 2020 + }, + { + "epoch": 0.8137903387452395, + "grad_norm": 0.6299743056297302, + "learning_rate": 9.134445360581503e-05, + "loss": 0.206, + "step": 2030 + }, + { + "epoch": 0.8177991581479255, + "grad_norm": 0.6221920847892761, + "learning_rate": 9.121280530466027e-05, + "loss": 0.1889, + "step": 2040 + }, + { + "epoch": 0.8218079775506113, + "grad_norm": 0.6568713784217834, + "learning_rate": 9.108025975259869e-05, + "loss": 0.2094, + "step": 2050 + }, + { + "epoch": 0.8258167969532972, + "grad_norm": 0.8146998882293701, + "learning_rate": 9.094681983530173e-05, + "loss": 0.2159, + "step": 2060 + }, + { + "epoch": 0.8298256163559832, + "grad_norm": 0.6871969103813171, + "learning_rate": 9.081248845791227e-05, + "loss": 0.1827, + "step": 2070 + }, + { + "epoch": 0.8338344357586691, + "grad_norm": 0.7771655321121216, + "learning_rate": 9.067726854498127e-05, + "loss": 0.1995, + "step": 2080 + }, + { + "epoch": 0.8378432551613549, + "grad_norm": 0.8692470192909241, + "learning_rate": 9.054116304040416e-05, + "loss": 0.202, + "step": 2090 + }, + { + "epoch": 0.8418520745640409, + "grad_norm": 0.5309840440750122, + "learning_rate": 9.040417490735676e-05, + "loss": 0.2159, + "step": 2100 + }, + { + "epoch": 0.8458608939667268, + "grad_norm": 0.6645334362983704, + "learning_rate": 9.026630712823072e-05, + "loss": 0.2175, + "step": 2110 + }, + { + "epoch": 0.8498697133694127, + "grad_norm": 0.613962709903717, + "learning_rate": 9.012756270456861e-05, + "loss": 0.2081, + "step": 2120 + }, + { + "epoch": 0.8538785327720986, + "grad_norm": 0.6764446496963501, + "learning_rate": 8.99879446569986e-05, + "loss": 0.213, + "step": 2130 + }, + { + "epoch": 0.8578873521747845, + "grad_norm": 0.6048487424850464, + "learning_rate": 8.984745602516865e-05, + "loss": 0.1879, + "step": 2140 + }, + { + "epoch": 0.8618961715774704, + "grad_norm": 0.5892179608345032, + "learning_rate": 8.970609986768035e-05, + "loss": 0.1827, + "step": 2150 + }, + { + "epoch": 0.8659049909801564, + "grad_norm": 0.7431573867797852, + "learning_rate": 8.956387926202234e-05, + "loss": 0.2055, + "step": 2160 + }, + { + "epoch": 0.8699138103828422, + "grad_norm": 0.6326702833175659, + "learning_rate": 8.942079730450335e-05, + "loss": 0.206, + "step": 2170 + }, + { + "epoch": 0.8739226297855281, + "grad_norm": 0.6847805976867676, + "learning_rate": 8.927685711018467e-05, + "loss": 0.2161, + "step": 2180 + }, + { + "epoch": 0.8779314491882141, + "grad_norm": 0.636877179145813, + "learning_rate": 8.913206181281248e-05, + "loss": 0.2014, + "step": 2190 + }, + { + "epoch": 0.8819402685909, + "grad_norm": 0.756361722946167, + "learning_rate": 8.89864145647495e-05, + "loss": 0.2063, + "step": 2200 + }, + { + "epoch": 0.8859490879935858, + "grad_norm": 0.5681055784225464, + "learning_rate": 8.883991853690646e-05, + "loss": 0.1997, + "step": 2210 + }, + { + "epoch": 0.8899579073962718, + "grad_norm": 0.6439403891563416, + "learning_rate": 8.869257691867296e-05, + "loss": 0.2029, + "step": 2220 + }, + { + "epoch": 0.8939667267989577, + "grad_norm": 0.6258695721626282, + "learning_rate": 8.854439291784813e-05, + "loss": 0.2062, + "step": 2230 + }, + { + "epoch": 0.8979755462016437, + "grad_norm": 0.6915255188941956, + "learning_rate": 8.839536976057075e-05, + "loss": 0.2008, + "step": 2240 + }, + { + "epoch": 0.9019843656043295, + "grad_norm": 0.7225965857505798, + "learning_rate": 8.824551069124898e-05, + "loss": 0.1915, + "step": 2250 + }, + { + "epoch": 0.9059931850070154, + "grad_norm": 0.784816563129425, + "learning_rate": 8.809481897248983e-05, + "loss": 0.1897, + "step": 2260 + }, + { + "epoch": 0.9100020044097014, + "grad_norm": 0.7496415972709656, + "learning_rate": 8.7943297885028e-05, + "loss": 0.198, + "step": 2270 + }, + { + "epoch": 0.9140108238123873, + "grad_norm": 0.6198856830596924, + "learning_rate": 8.779095072765453e-05, + "loss": 0.2055, + "step": 2280 + }, + { + "epoch": 0.9180196432150731, + "grad_norm": 0.6876329183578491, + "learning_rate": 8.763778081714498e-05, + "loss": 0.1969, + "step": 2290 + }, + { + "epoch": 0.9220284626177591, + "grad_norm": 0.7026522159576416, + "learning_rate": 8.748379148818722e-05, + "loss": 0.1811, + "step": 2300 + }, + { + "epoch": 0.926037282020445, + "grad_norm": 0.6701675057411194, + "learning_rate": 8.732898609330875e-05, + "loss": 0.1902, + "step": 2310 + }, + { + "epoch": 0.9300461014231309, + "grad_norm": 0.6713166236877441, + "learning_rate": 8.717336800280386e-05, + "loss": 0.2093, + "step": 2320 + }, + { + "epoch": 0.9340549208258168, + "grad_norm": 0.7247043251991272, + "learning_rate": 8.701694060466014e-05, + "loss": 0.1916, + "step": 2330 + }, + { + "epoch": 0.9380637402285027, + "grad_norm": 0.6550298929214478, + "learning_rate": 8.685970730448475e-05, + "loss": 0.2034, + "step": 2340 + }, + { + "epoch": 0.9420725596311886, + "grad_norm": 0.7075363397598267, + "learning_rate": 8.670167152543026e-05, + "loss": 0.1823, + "step": 2350 + }, + { + "epoch": 0.9460813790338746, + "grad_norm": 0.7122249603271484, + "learning_rate": 8.654283670812017e-05, + "loss": 0.1941, + "step": 2360 + }, + { + "epoch": 0.9500901984365604, + "grad_norm": 0.6687220335006714, + "learning_rate": 8.638320631057397e-05, + "loss": 0.1933, + "step": 2370 + }, + { + "epoch": 0.9540990178392463, + "grad_norm": 0.635455310344696, + "learning_rate": 8.622278380813186e-05, + "loss": 0.1967, + "step": 2380 + }, + { + "epoch": 0.9581078372419323, + "grad_norm": 0.7970702052116394, + "learning_rate": 8.606157269337906e-05, + "loss": 0.1901, + "step": 2390 + }, + { + "epoch": 0.9621166566446182, + "grad_norm": 0.7364137768745422, + "learning_rate": 8.589957647606988e-05, + "loss": 0.1945, + "step": 2400 + }, + { + "epoch": 0.966125476047304, + "grad_norm": 0.7844299674034119, + "learning_rate": 8.573679868305114e-05, + "loss": 0.1821, + "step": 2410 + }, + { + "epoch": 0.97013429544999, + "grad_norm": 0.8092600703239441, + "learning_rate": 8.557324285818552e-05, + "loss": 0.1934, + "step": 2420 + }, + { + "epoch": 0.9741431148526759, + "grad_norm": 0.66877281665802, + "learning_rate": 8.540891256227437e-05, + "loss": 0.2021, + "step": 2430 + }, + { + "epoch": 0.9781519342553618, + "grad_norm": 0.7711961269378662, + "learning_rate": 8.524381137298014e-05, + "loss": 0.1801, + "step": 2440 + }, + { + "epoch": 0.9821607536580477, + "grad_norm": 0.6817704439163208, + "learning_rate": 8.507794288474856e-05, + "loss": 0.1928, + "step": 2450 + }, + { + "epoch": 0.9861695730607336, + "grad_norm": 0.8401746153831482, + "learning_rate": 8.491131070873038e-05, + "loss": 0.1884, + "step": 2460 + }, + { + "epoch": 0.9901783924634195, + "grad_norm": 0.7808353900909424, + "learning_rate": 8.474391847270265e-05, + "loss": 0.1966, + "step": 2470 + }, + { + "epoch": 0.9941872118661055, + "grad_norm": 0.6367965340614319, + "learning_rate": 8.45757698209899e-05, + "loss": 0.1892, + "step": 2480 + }, + { + "epoch": 0.9981960312687913, + "grad_norm": 0.7107962369918823, + "learning_rate": 8.440686841438462e-05, + "loss": 0.1961, + "step": 2490 + }, + { + "epoch": 1.002004409701343, + "grad_norm": 0.593016505241394, + "learning_rate": 8.423721793006775e-05, + "loss": 0.1773, + "step": 2500 + }, + { + "epoch": 1.006013229104029, + "grad_norm": 0.7551445364952087, + "learning_rate": 8.406682206152845e-05, + "loss": 0.1733, + "step": 2510 + }, + { + "epoch": 1.0100220485067148, + "grad_norm": 0.8561877608299255, + "learning_rate": 8.389568451848382e-05, + "loss": 0.1594, + "step": 2520 + }, + { + "epoch": 1.0140308679094008, + "grad_norm": 0.8644078969955444, + "learning_rate": 8.372380902679804e-05, + "loss": 0.179, + "step": 2530 + }, + { + "epoch": 1.0180396873120865, + "grad_norm": 0.778167188167572, + "learning_rate": 8.355119932840129e-05, + "loss": 0.1616, + "step": 2540 + }, + { + "epoch": 1.0220485067147724, + "grad_norm": 0.7065404057502747, + "learning_rate": 8.337785918120837e-05, + "loss": 0.1768, + "step": 2550 + }, + { + "epoch": 1.0260573261174584, + "grad_norm": 0.8743630051612854, + "learning_rate": 8.320379235903668e-05, + "loss": 0.1687, + "step": 2560 + }, + { + "epoch": 1.0300661455201443, + "grad_norm": 1.0897860527038574, + "learning_rate": 8.302900265152427e-05, + "loss": 0.1558, + "step": 2570 + }, + { + "epoch": 1.0340749649228302, + "grad_norm": 0.7313379645347595, + "learning_rate": 8.285349386404722e-05, + "loss": 0.16, + "step": 2580 + }, + { + "epoch": 1.0380837843255162, + "grad_norm": 0.8040058016777039, + "learning_rate": 8.267726981763682e-05, + "loss": 0.1571, + "step": 2590 + }, + { + "epoch": 1.0420926037282021, + "grad_norm": 0.8637468218803406, + "learning_rate": 8.250033434889637e-05, + "loss": 0.16, + "step": 2600 + }, + { + "epoch": 1.046101423130888, + "grad_norm": 0.7505359053611755, + "learning_rate": 8.232269130991769e-05, + "loss": 0.1597, + "step": 2610 + }, + { + "epoch": 1.0501102425335738, + "grad_norm": 0.8430061340332031, + "learning_rate": 8.214434456819725e-05, + "loss": 0.1723, + "step": 2620 + }, + { + "epoch": 1.0541190619362597, + "grad_norm": 0.66597580909729, + "learning_rate": 8.196529800655188e-05, + "loss": 0.1751, + "step": 2630 + }, + { + "epoch": 1.0581278813389456, + "grad_norm": 0.8823577761650085, + "learning_rate": 8.178555552303437e-05, + "loss": 0.1701, + "step": 2640 + }, + { + "epoch": 1.0621367007416316, + "grad_norm": 0.9401513338088989, + "learning_rate": 8.160512103084851e-05, + "loss": 0.1564, + "step": 2650 + }, + { + "epoch": 1.0661455201443175, + "grad_norm": 0.7342818379402161, + "learning_rate": 8.142399845826394e-05, + "loss": 0.1507, + "step": 2660 + }, + { + "epoch": 1.0701543395470035, + "grad_norm": 0.8487102389335632, + "learning_rate": 8.12421917485306e-05, + "loss": 0.1633, + "step": 2670 + }, + { + "epoch": 1.0741631589496894, + "grad_norm": 0.8836720585823059, + "learning_rate": 8.105970485979295e-05, + "loss": 0.1682, + "step": 2680 + }, + { + "epoch": 1.0781719783523753, + "grad_norm": 0.6858396530151367, + "learning_rate": 8.087654176500366e-05, + "loss": 0.1723, + "step": 2690 + }, + { + "epoch": 1.082180797755061, + "grad_norm": 1.028981328010559, + "learning_rate": 8.069270645183722e-05, + "loss": 0.1555, + "step": 2700 + }, + { + "epoch": 1.086189617157747, + "grad_norm": 0.9475600719451904, + "learning_rate": 8.050820292260313e-05, + "loss": 0.1591, + "step": 2710 + }, + { + "epoch": 1.090198436560433, + "grad_norm": 0.683160126209259, + "learning_rate": 8.032303519415874e-05, + "loss": 0.1703, + "step": 2720 + }, + { + "epoch": 1.0942072559631189, + "grad_norm": 0.8751930594444275, + "learning_rate": 8.013720729782173e-05, + "loss": 0.1489, + "step": 2730 + }, + { + "epoch": 1.0982160753658048, + "grad_norm": 0.8032315373420715, + "learning_rate": 7.995072327928243e-05, + "loss": 0.1439, + "step": 2740 + }, + { + "epoch": 1.1022248947684907, + "grad_norm": 0.7631738185882568, + "learning_rate": 7.976358719851579e-05, + "loss": 0.1676, + "step": 2750 + }, + { + "epoch": 1.1062337141711767, + "grad_norm": 0.7207862734794617, + "learning_rate": 7.957580312969283e-05, + "loss": 0.1494, + "step": 2760 + }, + { + "epoch": 1.1102425335738626, + "grad_norm": 0.6857604384422302, + "learning_rate": 7.938737516109207e-05, + "loss": 0.1594, + "step": 2770 + }, + { + "epoch": 1.1142513529765483, + "grad_norm": 1.0340170860290527, + "learning_rate": 7.919830739501043e-05, + "loss": 0.1621, + "step": 2780 + }, + { + "epoch": 1.1182601723792343, + "grad_norm": 0.7190383672714233, + "learning_rate": 7.900860394767402e-05, + "loss": 0.1638, + "step": 2790 + }, + { + "epoch": 1.1222689917819202, + "grad_norm": 0.8485333919525146, + "learning_rate": 7.881826894914846e-05, + "loss": 0.1619, + "step": 2800 + }, + { + "epoch": 1.1262778111846061, + "grad_norm": 0.8466002345085144, + "learning_rate": 7.862730654324899e-05, + "loss": 0.1448, + "step": 2810 + }, + { + "epoch": 1.130286630587292, + "grad_norm": 0.7490071058273315, + "learning_rate": 7.843572088745019e-05, + "loss": 0.1649, + "step": 2820 + }, + { + "epoch": 1.134295449989978, + "grad_norm": 0.7291231751441956, + "learning_rate": 7.824351615279557e-05, + "loss": 0.1604, + "step": 2830 + }, + { + "epoch": 1.138304269392664, + "grad_norm": 1.1249662637710571, + "learning_rate": 7.80506965238067e-05, + "loss": 0.1383, + "step": 2840 + }, + { + "epoch": 1.1423130887953499, + "grad_norm": 0.8020785450935364, + "learning_rate": 7.785726619839212e-05, + "loss": 0.1565, + "step": 2850 + }, + { + "epoch": 1.1463219081980356, + "grad_norm": 0.9652583599090576, + "learning_rate": 7.766322938775589e-05, + "loss": 0.1513, + "step": 2860 + }, + { + "epoch": 1.1503307276007215, + "grad_norm": 0.8806086182594299, + "learning_rate": 7.746859031630605e-05, + "loss": 0.1607, + "step": 2870 + }, + { + "epoch": 1.1543395470034075, + "grad_norm": 0.9319799542427063, + "learning_rate": 7.72733532215625e-05, + "loss": 0.1588, + "step": 2880 + }, + { + "epoch": 1.1583483664060934, + "grad_norm": 0.9107722640037537, + "learning_rate": 7.707752235406485e-05, + "loss": 0.1445, + "step": 2890 + }, + { + "epoch": 1.1623571858087793, + "grad_norm": 0.9413526654243469, + "learning_rate": 7.688110197727975e-05, + "loss": 0.1589, + "step": 2900 + }, + { + "epoch": 1.1663660052114653, + "grad_norm": 0.9594728350639343, + "learning_rate": 7.668409636750828e-05, + "loss": 0.1584, + "step": 2910 + }, + { + "epoch": 1.1703748246141512, + "grad_norm": 0.7484379410743713, + "learning_rate": 7.648650981379264e-05, + "loss": 0.1582, + "step": 2920 + }, + { + "epoch": 1.174383644016837, + "grad_norm": 0.840965747833252, + "learning_rate": 7.628834661782288e-05, + "loss": 0.1563, + "step": 2930 + }, + { + "epoch": 1.1783924634195229, + "grad_norm": 0.7504467368125916, + "learning_rate": 7.608961109384321e-05, + "loss": 0.145, + "step": 2940 + }, + { + "epoch": 1.1824012828222088, + "grad_norm": 1.0212056636810303, + "learning_rate": 7.589030756855813e-05, + "loss": 0.1562, + "step": 2950 + }, + { + "epoch": 1.1864101022248947, + "grad_norm": 0.9360294342041016, + "learning_rate": 7.569044038103813e-05, + "loss": 0.156, + "step": 2960 + }, + { + "epoch": 1.1904189216275807, + "grad_norm": 0.649131178855896, + "learning_rate": 7.549001388262535e-05, + "loss": 0.1713, + "step": 2970 + }, + { + "epoch": 1.1944277410302666, + "grad_norm": 1.106505274772644, + "learning_rate": 7.528903243683874e-05, + "loss": 0.1475, + "step": 2980 + }, + { + "epoch": 1.1984365604329525, + "grad_norm": 0.8083673119544983, + "learning_rate": 7.508750041927914e-05, + "loss": 0.1512, + "step": 2990 + }, + { + "epoch": 1.2024453798356385, + "grad_norm": 0.7395840287208557, + "learning_rate": 7.488542221753394e-05, + "loss": 0.1481, + "step": 3000 + }, + { + "epoch": 1.2064541992383244, + "grad_norm": 0.923462986946106, + "learning_rate": 7.46828022310816e-05, + "loss": 0.1537, + "step": 3010 + }, + { + "epoch": 1.2104630186410101, + "grad_norm": 0.8510660529136658, + "learning_rate": 7.44796448711959e-05, + "loss": 0.1525, + "step": 3020 + }, + { + "epoch": 1.214471838043696, + "grad_norm": 0.881767749786377, + "learning_rate": 7.427595456084981e-05, + "loss": 0.1641, + "step": 3030 + }, + { + "epoch": 1.218480657446382, + "grad_norm": 0.8366743326187134, + "learning_rate": 7.407173573461934e-05, + "loss": 0.1502, + "step": 3040 + }, + { + "epoch": 1.222489476849068, + "grad_norm": 0.8755321502685547, + "learning_rate": 7.386699283858683e-05, + "loss": 0.1495, + "step": 3050 + }, + { + "epoch": 1.2264982962517539, + "grad_norm": 0.841222882270813, + "learning_rate": 7.366173033024428e-05, + "loss": 0.1423, + "step": 3060 + }, + { + "epoch": 1.2305071156544398, + "grad_norm": 0.8285235166549683, + "learning_rate": 7.345595267839621e-05, + "loss": 0.1632, + "step": 3070 + }, + { + "epoch": 1.2345159350571258, + "grad_norm": 0.764156699180603, + "learning_rate": 7.324966436306246e-05, + "loss": 0.1466, + "step": 3080 + }, + { + "epoch": 1.2385247544598115, + "grad_norm": 1.1134533882141113, + "learning_rate": 7.30428698753806e-05, + "loss": 0.1393, + "step": 3090 + }, + { + "epoch": 1.2425335738624974, + "grad_norm": 0.8127875328063965, + "learning_rate": 7.283557371750813e-05, + "loss": 0.1597, + "step": 3100 + }, + { + "epoch": 1.2465423932651833, + "grad_norm": 0.8257074356079102, + "learning_rate": 7.262778040252455e-05, + "loss": 0.1659, + "step": 3110 + }, + { + "epoch": 1.2505512126678693, + "grad_norm": 0.7807098031044006, + "learning_rate": 7.2419494454333e-05, + "loss": 0.1476, + "step": 3120 + }, + { + "epoch": 1.2545600320705552, + "grad_norm": 0.7114003300666809, + "learning_rate": 7.221072040756188e-05, + "loss": 0.1467, + "step": 3130 + }, + { + "epoch": 1.2585688514732412, + "grad_norm": 0.7870392203330994, + "learning_rate": 7.2001462807466e-05, + "loss": 0.1471, + "step": 3140 + }, + { + "epoch": 1.262577670875927, + "grad_norm": 0.6909427046775818, + "learning_rate": 7.179172620982774e-05, + "loss": 0.1575, + "step": 3150 + }, + { + "epoch": 1.266586490278613, + "grad_norm": 0.8754594922065735, + "learning_rate": 7.158151518085776e-05, + "loss": 0.155, + "step": 3160 + }, + { + "epoch": 1.270595309681299, + "grad_norm": 0.7454276084899902, + "learning_rate": 7.137083429709573e-05, + "loss": 0.1431, + "step": 3170 + }, + { + "epoch": 1.274604129083985, + "grad_norm": 0.9142866134643555, + "learning_rate": 7.115968814531052e-05, + "loss": 0.1342, + "step": 3180 + }, + { + "epoch": 1.2786129484866706, + "grad_norm": 0.8666753768920898, + "learning_rate": 7.09480813224005e-05, + "loss": 0.142, + "step": 3190 + }, + { + "epoch": 1.2826217678893566, + "grad_norm": 0.8461101651191711, + "learning_rate": 7.073601843529333e-05, + "loss": 0.1396, + "step": 3200 + }, + { + "epoch": 1.2866305872920425, + "grad_norm": 0.8602980375289917, + "learning_rate": 7.052350410084574e-05, + "loss": 0.1435, + "step": 3210 + }, + { + "epoch": 1.2906394066947284, + "grad_norm": 1.0527535676956177, + "learning_rate": 7.031054294574303e-05, + "loss": 0.1474, + "step": 3220 + }, + { + "epoch": 1.2946482260974144, + "grad_norm": 0.84455806016922, + "learning_rate": 7.009713960639826e-05, + "loss": 0.1565, + "step": 3230 + }, + { + "epoch": 1.2986570455001, + "grad_norm": 0.7223050594329834, + "learning_rate": 6.98832987288514e-05, + "loss": 0.1482, + "step": 3240 + }, + { + "epoch": 1.302665864902786, + "grad_norm": 0.8750767111778259, + "learning_rate": 6.966902496866807e-05, + "loss": 0.1611, + "step": 3250 + }, + { + "epoch": 1.306674684305472, + "grad_norm": 0.7444009184837341, + "learning_rate": 6.945432299083834e-05, + "loss": 0.1647, + "step": 3260 + }, + { + "epoch": 1.310683503708158, + "grad_norm": 1.013881802558899, + "learning_rate": 6.9239197469675e-05, + "loss": 0.1412, + "step": 3270 + }, + { + "epoch": 1.3146923231108438, + "grad_norm": 0.8479213118553162, + "learning_rate": 6.902365308871193e-05, + "loss": 0.1369, + "step": 3280 + }, + { + "epoch": 1.3187011425135298, + "grad_norm": 0.8772777318954468, + "learning_rate": 6.880769454060201e-05, + "loss": 0.1501, + "step": 3290 + }, + { + "epoch": 1.3227099619162157, + "grad_norm": 0.8388547301292419, + "learning_rate": 6.859132652701514e-05, + "loss": 0.1402, + "step": 3300 + }, + { + "epoch": 1.3267187813189016, + "grad_norm": 0.8197916746139526, + "learning_rate": 6.837455375853561e-05, + "loss": 0.1351, + "step": 3310 + }, + { + "epoch": 1.3307276007215876, + "grad_norm": 0.9061885476112366, + "learning_rate": 6.815738095455984e-05, + "loss": 0.139, + "step": 3320 + }, + { + "epoch": 1.3347364201242735, + "grad_norm": 0.721653938293457, + "learning_rate": 6.793981284319339e-05, + "loss": 0.1556, + "step": 3330 + }, + { + "epoch": 1.3387452395269592, + "grad_norm": 0.9494278430938721, + "learning_rate": 6.772185416114814e-05, + "loss": 0.1423, + "step": 3340 + }, + { + "epoch": 1.3427540589296452, + "grad_norm": 0.8513092994689941, + "learning_rate": 6.750350965363919e-05, + "loss": 0.1393, + "step": 3350 + }, + { + "epoch": 1.346762878332331, + "grad_norm": 0.8258860111236572, + "learning_rate": 6.728478407428151e-05, + "loss": 0.146, + "step": 3360 + }, + { + "epoch": 1.350771697735017, + "grad_norm": 0.8146616220474243, + "learning_rate": 6.706568218498639e-05, + "loss": 0.148, + "step": 3370 + }, + { + "epoch": 1.354780517137703, + "grad_norm": 0.9726580381393433, + "learning_rate": 6.684620875585787e-05, + "loss": 0.1404, + "step": 3380 + }, + { + "epoch": 1.358789336540389, + "grad_norm": 1.0220385789871216, + "learning_rate": 6.662636856508887e-05, + "loss": 0.1504, + "step": 3390 + }, + { + "epoch": 1.3627981559430746, + "grad_norm": 0.9221115708351135, + "learning_rate": 6.640616639885708e-05, + "loss": 0.1407, + "step": 3400 + }, + { + "epoch": 1.3668069753457606, + "grad_norm": 0.9321884512901306, + "learning_rate": 6.618560705122086e-05, + "loss": 0.1286, + "step": 3410 + }, + { + "epoch": 1.3708157947484465, + "grad_norm": 0.8789135217666626, + "learning_rate": 6.596469532401483e-05, + "loss": 0.1478, + "step": 3420 + }, + { + "epoch": 1.3748246141511324, + "grad_norm": 0.8220512270927429, + "learning_rate": 6.574343602674528e-05, + "loss": 0.1439, + "step": 3430 + }, + { + "epoch": 1.3788334335538184, + "grad_norm": 1.0369560718536377, + "learning_rate": 6.552183397648555e-05, + "loss": 0.1323, + "step": 3440 + }, + { + "epoch": 1.3828422529565043, + "grad_norm": 1.0133991241455078, + "learning_rate": 6.529989399777109e-05, + "loss": 0.1472, + "step": 3450 + }, + { + "epoch": 1.3868510723591903, + "grad_norm": 0.9306389093399048, + "learning_rate": 6.507762092249448e-05, + "loss": 0.1446, + "step": 3460 + }, + { + "epoch": 1.3908598917618762, + "grad_norm": 1.021039366722107, + "learning_rate": 6.485501958980016e-05, + "loss": 0.1341, + "step": 3470 + }, + { + "epoch": 1.3948687111645621, + "grad_norm": 0.7612369656562805, + "learning_rate": 6.463209484597913e-05, + "loss": 0.1437, + "step": 3480 + }, + { + "epoch": 1.398877530567248, + "grad_norm": 0.7720378041267395, + "learning_rate": 6.440885154436344e-05, + "loss": 0.1184, + "step": 3490 + }, + { + "epoch": 1.4028863499699338, + "grad_norm": 0.9269343614578247, + "learning_rate": 6.418529454522051e-05, + "loss": 0.1474, + "step": 3500 + }, + { + "epoch": 1.4068951693726197, + "grad_norm": 0.8597378730773926, + "learning_rate": 6.396142871564731e-05, + "loss": 0.1395, + "step": 3510 + }, + { + "epoch": 1.4109039887753057, + "grad_norm": 0.9362756013870239, + "learning_rate": 6.373725892946443e-05, + "loss": 0.1476, + "step": 3520 + }, + { + "epoch": 1.4149128081779916, + "grad_norm": 0.8636417388916016, + "learning_rate": 6.351279006710994e-05, + "loss": 0.1333, + "step": 3530 + }, + { + "epoch": 1.4189216275806775, + "grad_norm": 0.9320933818817139, + "learning_rate": 6.328802701553313e-05, + "loss": 0.1464, + "step": 3540 + }, + { + "epoch": 1.4229304469833635, + "grad_norm": 1.1692008972167969, + "learning_rate": 6.306297466808818e-05, + "loss": 0.1515, + "step": 3550 + }, + { + "epoch": 1.4269392663860492, + "grad_norm": 0.7800849676132202, + "learning_rate": 6.283763792442751e-05, + "loss": 0.1414, + "step": 3560 + }, + { + "epoch": 1.4309480857887351, + "grad_norm": 1.0798330307006836, + "learning_rate": 6.261202169039526e-05, + "loss": 0.1478, + "step": 3570 + }, + { + "epoch": 1.434956905191421, + "grad_norm": 0.8681895136833191, + "learning_rate": 6.23861308779203e-05, + "loss": 0.1413, + "step": 3580 + }, + { + "epoch": 1.438965724594107, + "grad_norm": 1.3371766805648804, + "learning_rate": 6.21599704049095e-05, + "loss": 0.132, + "step": 3590 + }, + { + "epoch": 1.442974543996793, + "grad_norm": 0.923513650894165, + "learning_rate": 6.19335451951405e-05, + "loss": 0.1435, + "step": 3600 + }, + { + "epoch": 1.4469833633994789, + "grad_norm": 0.9107206463813782, + "learning_rate": 6.170686017815456e-05, + "loss": 0.1219, + "step": 3610 + }, + { + "epoch": 1.4509921828021648, + "grad_norm": 0.9753092527389526, + "learning_rate": 6.147992028914926e-05, + "loss": 0.1426, + "step": 3620 + }, + { + "epoch": 1.4550010022048507, + "grad_norm": 0.9150570631027222, + "learning_rate": 6.125273046887106e-05, + "loss": 0.1342, + "step": 3630 + }, + { + "epoch": 1.4590098216075367, + "grad_norm": 1.0572060346603394, + "learning_rate": 6.10252956635077e-05, + "loss": 0.1274, + "step": 3640 + }, + { + "epoch": 1.4630186410102226, + "grad_norm": 0.7989734411239624, + "learning_rate": 6.079762082458049e-05, + "loss": 0.1385, + "step": 3650 + }, + { + "epoch": 1.4670274604129083, + "grad_norm": 0.8875731229782104, + "learning_rate": 6.056971090883665e-05, + "loss": 0.1413, + "step": 3660 + }, + { + "epoch": 1.4710362798155943, + "grad_norm": 0.9534810185432434, + "learning_rate": 6.0341570878141184e-05, + "loss": 0.1267, + "step": 3670 + }, + { + "epoch": 1.4750450992182802, + "grad_norm": 0.7729069590568542, + "learning_rate": 6.0113205699369056e-05, + "loss": 0.1469, + "step": 3680 + }, + { + "epoch": 1.4790539186209661, + "grad_norm": 0.6528967022895813, + "learning_rate": 5.988462034429692e-05, + "loss": 0.1314, + "step": 3690 + }, + { + "epoch": 1.483062738023652, + "grad_norm": 1.0471932888031006, + "learning_rate": 5.965581978949494e-05, + "loss": 0.1294, + "step": 3700 + }, + { + "epoch": 1.487071557426338, + "grad_norm": 0.8370137810707092, + "learning_rate": 5.942680901621842e-05, + "loss": 0.1507, + "step": 3710 + }, + { + "epoch": 1.4910803768290237, + "grad_norm": 0.7025067210197449, + "learning_rate": 5.9197593010299377e-05, + "loss": 0.1386, + "step": 3720 + }, + { + "epoch": 1.4950891962317097, + "grad_norm": 0.9664121866226196, + "learning_rate": 5.8968176762037985e-05, + "loss": 0.145, + "step": 3730 + }, + { + "epoch": 1.4990980156343956, + "grad_norm": 0.8898931741714478, + "learning_rate": 5.87385652660939e-05, + "loss": 0.1386, + "step": 3740 + }, + { + "epoch": 1.5031068350370815, + "grad_norm": 0.750616192817688, + "learning_rate": 5.850876352137759e-05, + "loss": 0.153, + "step": 3750 + }, + { + "epoch": 1.5071156544397675, + "grad_norm": 1.0957409143447876, + "learning_rate": 5.827877653094144e-05, + "loss": 0.1329, + "step": 3760 + }, + { + "epoch": 1.5111244738424534, + "grad_norm": 0.8789597749710083, + "learning_rate": 5.8048609301870816e-05, + "loss": 0.1329, + "step": 3770 + }, + { + "epoch": 1.5151332932451393, + "grad_norm": 0.7944477200508118, + "learning_rate": 5.781826684517515e-05, + "loss": 0.1256, + "step": 3780 + }, + { + "epoch": 1.5191421126478253, + "grad_norm": 0.8657981753349304, + "learning_rate": 5.758775417567878e-05, + "loss": 0.1266, + "step": 3790 + }, + { + "epoch": 1.5231509320505112, + "grad_norm": 0.8267760276794434, + "learning_rate": 5.73570763119117e-05, + "loss": 0.1269, + "step": 3800 + }, + { + "epoch": 1.5271597514531972, + "grad_norm": 0.9449699521064758, + "learning_rate": 5.7126238276000474e-05, + "loss": 0.1331, + "step": 3810 + }, + { + "epoch": 1.531168570855883, + "grad_norm": 1.0582398176193237, + "learning_rate": 5.689524509355873e-05, + "loss": 0.1277, + "step": 3820 + }, + { + "epoch": 1.5351773902585688, + "grad_norm": 0.8139535784721375, + "learning_rate": 5.6664101793577865e-05, + "loss": 0.1275, + "step": 3830 + }, + { + "epoch": 1.5391862096612547, + "grad_norm": 0.7074098587036133, + "learning_rate": 5.643281340831745e-05, + "loss": 0.1307, + "step": 3840 + }, + { + "epoch": 1.5431950290639407, + "grad_norm": 0.858897864818573, + "learning_rate": 5.6201384973195825e-05, + "loss": 0.1296, + "step": 3850 + }, + { + "epoch": 1.5472038484666266, + "grad_norm": 0.984902560710907, + "learning_rate": 5.596982152668029e-05, + "loss": 0.1315, + "step": 3860 + }, + { + "epoch": 1.5512126678693123, + "grad_norm": 0.9450563192367554, + "learning_rate": 5.5738128110177523e-05, + "loss": 0.1275, + "step": 3870 + }, + { + "epoch": 1.5552214872719983, + "grad_norm": 1.13248610496521, + "learning_rate": 5.550630976792385e-05, + "loss": 0.1364, + "step": 3880 + }, + { + "epoch": 1.5592303066746842, + "grad_norm": 0.9023851752281189, + "learning_rate": 5.5274371546875304e-05, + "loss": 0.1262, + "step": 3890 + }, + { + "epoch": 1.5632391260773701, + "grad_norm": 0.9542123079299927, + "learning_rate": 5.5042318496597876e-05, + "loss": 0.1398, + "step": 3900 + }, + { + "epoch": 1.567247945480056, + "grad_norm": 0.8645676374435425, + "learning_rate": 5.4810155669157495e-05, + "loss": 0.1356, + "step": 3910 + }, + { + "epoch": 1.571256764882742, + "grad_norm": 0.8348353505134583, + "learning_rate": 5.457788811901008e-05, + "loss": 0.1431, + "step": 3920 + }, + { + "epoch": 1.575265584285428, + "grad_norm": 0.8592683672904968, + "learning_rate": 5.434552090289145e-05, + "loss": 0.1243, + "step": 3930 + }, + { + "epoch": 1.579274403688114, + "grad_norm": 0.9037445187568665, + "learning_rate": 5.411305907970734e-05, + "loss": 0.1201, + "step": 3940 + }, + { + "epoch": 1.5832832230907998, + "grad_norm": 0.7110516428947449, + "learning_rate": 5.3880507710423134e-05, + "loss": 0.1331, + "step": 3950 + }, + { + "epoch": 1.5872920424934858, + "grad_norm": 0.8847816586494446, + "learning_rate": 5.3647871857953735e-05, + "loss": 0.1224, + "step": 3960 + }, + { + "epoch": 1.5913008618961717, + "grad_norm": 0.9340296983718872, + "learning_rate": 5.341515658705339e-05, + "loss": 0.1315, + "step": 3970 + }, + { + "epoch": 1.5953096812988576, + "grad_norm": 0.9499775767326355, + "learning_rate": 5.318236696420534e-05, + "loss": 0.1338, + "step": 3980 + }, + { + "epoch": 1.5993185007015434, + "grad_norm": 0.9325523972511292, + "learning_rate": 5.294950805751158e-05, + "loss": 0.1277, + "step": 3990 + }, + { + "epoch": 1.6033273201042293, + "grad_norm": 0.9514039158821106, + "learning_rate": 5.271658493658245e-05, + "loss": 0.1287, + "step": 4000 + }, + { + "epoch": 1.6073361395069152, + "grad_norm": 1.022368311882019, + "learning_rate": 5.248360267242637e-05, + "loss": 0.1363, + "step": 4010 + }, + { + "epoch": 1.6113449589096012, + "grad_norm": 0.8409161567687988, + "learning_rate": 5.2250566337339326e-05, + "loss": 0.1341, + "step": 4020 + }, + { + "epoch": 1.6153537783122869, + "grad_norm": 1.0613347291946411, + "learning_rate": 5.201748100479452e-05, + "loss": 0.1329, + "step": 4030 + }, + { + "epoch": 1.6193625977149728, + "grad_norm": 0.8661359548568726, + "learning_rate": 5.178435174933188e-05, + "loss": 0.119, + "step": 4040 + }, + { + "epoch": 1.6233714171176588, + "grad_norm": 0.9642584919929504, + "learning_rate": 5.15511836464476e-05, + "loss": 0.1279, + "step": 4050 + }, + { + "epoch": 1.6273802365203447, + "grad_norm": 0.9616632461547852, + "learning_rate": 5.131798177248357e-05, + "loss": 0.1294, + "step": 4060 + }, + { + "epoch": 1.6313890559230306, + "grad_norm": 1.1416373252868652, + "learning_rate": 5.108475120451702e-05, + "loss": 0.1394, + "step": 4070 + }, + { + "epoch": 1.6353978753257166, + "grad_norm": 0.9488154649734497, + "learning_rate": 5.085149702024977e-05, + "loss": 0.1222, + "step": 4080 + }, + { + "epoch": 1.6394066947284025, + "grad_norm": 1.030707597732544, + "learning_rate": 5.061822429789788e-05, + "loss": 0.1304, + "step": 4090 + }, + { + "epoch": 1.6434155141310884, + "grad_norm": 1.0803980827331543, + "learning_rate": 5.038493811608095e-05, + "loss": 0.1326, + "step": 4100 + }, + { + "epoch": 1.6474243335337744, + "grad_norm": 0.8971238136291504, + "learning_rate": 5.015164355371164e-05, + "loss": 0.1163, + "step": 4110 + }, + { + "epoch": 1.6514331529364603, + "grad_norm": 0.7943403124809265, + "learning_rate": 4.9918345689885035e-05, + "loss": 0.1268, + "step": 4120 + }, + { + "epoch": 1.6554419723391463, + "grad_norm": 1.109113097190857, + "learning_rate": 4.968504960376815e-05, + "loss": 0.1289, + "step": 4130 + }, + { + "epoch": 1.6594507917418322, + "grad_norm": 1.1698325872421265, + "learning_rate": 4.945176037448923e-05, + "loss": 0.1138, + "step": 4140 + }, + { + "epoch": 1.663459611144518, + "grad_norm": 1.1132344007492065, + "learning_rate": 4.9218483081027284e-05, + "loss": 0.1244, + "step": 4150 + }, + { + "epoch": 1.6674684305472038, + "grad_norm": 0.8619892001152039, + "learning_rate": 4.8985222802101475e-05, + "loss": 0.1296, + "step": 4160 + }, + { + "epoch": 1.6714772499498898, + "grad_norm": 1.010392427444458, + "learning_rate": 4.875198461606047e-05, + "loss": 0.1307, + "step": 4170 + }, + { + "epoch": 1.6754860693525757, + "grad_norm": 0.8872926831245422, + "learning_rate": 4.851877360077203e-05, + "loss": 0.1241, + "step": 4180 + }, + { + "epoch": 1.6794948887552614, + "grad_norm": 1.035994052886963, + "learning_rate": 4.828559483351233e-05, + "loss": 0.112, + "step": 4190 + }, + { + "epoch": 1.6835037081579474, + "grad_norm": 1.1755554676055908, + "learning_rate": 4.805245339085548e-05, + "loss": 0.1198, + "step": 4200 + }, + { + "epoch": 1.6875125275606333, + "grad_norm": 1.008541226387024, + "learning_rate": 4.781935434856299e-05, + "loss": 0.1348, + "step": 4210 + }, + { + "epoch": 1.6915213469633192, + "grad_norm": 1.0429742336273193, + "learning_rate": 4.758630278147327e-05, + "loss": 0.1205, + "step": 4220 + }, + { + "epoch": 1.6955301663660052, + "grad_norm": 0.8936703205108643, + "learning_rate": 4.735330376339111e-05, + "loss": 0.119, + "step": 4230 + }, + { + "epoch": 1.6995389857686911, + "grad_norm": 0.9886868596076965, + "learning_rate": 4.712036236697728e-05, + "loss": 0.1084, + "step": 4240 + }, + { + "epoch": 1.703547805171377, + "grad_norm": 0.9149814248085022, + "learning_rate": 4.6887483663638084e-05, + "loss": 0.1303, + "step": 4250 + }, + { + "epoch": 1.707556624574063, + "grad_norm": 0.9031015634536743, + "learning_rate": 4.665467272341484e-05, + "loss": 0.109, + "step": 4260 + }, + { + "epoch": 1.711565443976749, + "grad_norm": 1.041288137435913, + "learning_rate": 4.6421934614873654e-05, + "loss": 0.1246, + "step": 4270 + }, + { + "epoch": 1.7155742633794349, + "grad_norm": 0.9827173352241516, + "learning_rate": 4.6189274404994984e-05, + "loss": 0.1252, + "step": 4280 + }, + { + "epoch": 1.7195830827821208, + "grad_norm": 1.0415915250778198, + "learning_rate": 4.595669715906333e-05, + "loss": 0.1122, + "step": 4290 + }, + { + "epoch": 1.7235919021848067, + "grad_norm": 1.0126681327819824, + "learning_rate": 4.572420794055698e-05, + "loss": 0.1213, + "step": 4300 + }, + { + "epoch": 1.7276007215874924, + "grad_norm": 0.9639745354652405, + "learning_rate": 4.549181181103778e-05, + "loss": 0.1279, + "step": 4310 + }, + { + "epoch": 1.7316095409901784, + "grad_norm": 1.1144078969955444, + "learning_rate": 4.5259513830040875e-05, + "loss": 0.1189, + "step": 4320 + }, + { + "epoch": 1.7356183603928643, + "grad_norm": 1.139124870300293, + "learning_rate": 4.502731905496463e-05, + "loss": 0.1112, + "step": 4330 + }, + { + "epoch": 1.7396271797955503, + "grad_norm": 1.0518343448638916, + "learning_rate": 4.479523254096055e-05, + "loss": 0.1321, + "step": 4340 + }, + { + "epoch": 1.743635999198236, + "grad_norm": 0.7808403968811035, + "learning_rate": 4.456325934082302e-05, + "loss": 0.1391, + "step": 4350 + }, + { + "epoch": 1.747644818600922, + "grad_norm": 1.047770619392395, + "learning_rate": 4.433140450487962e-05, + "loss": 0.1302, + "step": 4360 + }, + { + "epoch": 1.7516536380036078, + "grad_norm": 0.9837223291397095, + "learning_rate": 4.409967308088091e-05, + "loss": 0.1193, + "step": 4370 + }, + { + "epoch": 1.7556624574062938, + "grad_norm": 1.0093597173690796, + "learning_rate": 4.3868070113890626e-05, + "loss": 0.1163, + "step": 4380 + }, + { + "epoch": 1.7596712768089797, + "grad_norm": 1.1313358545303345, + "learning_rate": 4.36366006461759e-05, + "loss": 0.1274, + "step": 4390 + }, + { + "epoch": 1.7636800962116657, + "grad_norm": 0.9579795598983765, + "learning_rate": 4.340526971709735e-05, + "loss": 0.1103, + "step": 4400 + }, + { + "epoch": 1.7676889156143516, + "grad_norm": 1.0444706678390503, + "learning_rate": 4.317408236299952e-05, + "loss": 0.1121, + "step": 4410 + }, + { + "epoch": 1.7716977350170375, + "grad_norm": 0.9483968019485474, + "learning_rate": 4.2943043617101134e-05, + "loss": 0.1086, + "step": 4420 + }, + { + "epoch": 1.7757065544197235, + "grad_norm": 1.0954207181930542, + "learning_rate": 4.2712158509385495e-05, + "loss": 0.1166, + "step": 4430 + }, + { + "epoch": 1.7797153738224094, + "grad_norm": 1.169009804725647, + "learning_rate": 4.2481432066491114e-05, + "loss": 0.1164, + "step": 4440 + }, + { + "epoch": 1.7837241932250953, + "grad_norm": 0.9690777063369751, + "learning_rate": 4.2250869311602124e-05, + "loss": 0.1237, + "step": 4450 + }, + { + "epoch": 1.7877330126277813, + "grad_norm": 1.0763111114501953, + "learning_rate": 4.2020475264338966e-05, + "loss": 0.1382, + "step": 4460 + }, + { + "epoch": 1.791741832030467, + "grad_norm": 0.924728274345398, + "learning_rate": 4.179025494064916e-05, + "loss": 0.104, + "step": 4470 + }, + { + "epoch": 1.795750651433153, + "grad_norm": 0.9748139977455139, + "learning_rate": 4.156021335269806e-05, + "loss": 0.1071, + "step": 4480 + }, + { + "epoch": 1.7997594708358389, + "grad_norm": 1.1556870937347412, + "learning_rate": 4.133035550875968e-05, + "loss": 0.1137, + "step": 4490 + }, + { + "epoch": 1.8037682902385248, + "grad_norm": 1.1552350521087646, + "learning_rate": 4.110068641310775e-05, + "loss": 0.1207, + "step": 4500 + }, + { + "epoch": 1.8077771096412105, + "grad_norm": 1.115271806716919, + "learning_rate": 4.0871211065906786e-05, + "loss": 0.1205, + "step": 4510 + }, + { + "epoch": 1.8117859290438965, + "grad_norm": 0.9051127433776855, + "learning_rate": 4.0641934463103054e-05, + "loss": 0.1123, + "step": 4520 + }, + { + "epoch": 1.8157947484465824, + "grad_norm": 1.0964293479919434, + "learning_rate": 4.0412861596316013e-05, + "loss": 0.1092, + "step": 4530 + }, + { + "epoch": 1.8198035678492683, + "grad_norm": 1.308677315711975, + "learning_rate": 4.0183997452729534e-05, + "loss": 0.1182, + "step": 4540 + }, + { + "epoch": 1.8238123872519543, + "grad_norm": 0.9863505959510803, + "learning_rate": 3.99553470149833e-05, + "loss": 0.1138, + "step": 4550 + }, + { + "epoch": 1.8278212066546402, + "grad_norm": 0.9477949142456055, + "learning_rate": 3.9726915261064426e-05, + "loss": 0.123, + "step": 4560 + }, + { + "epoch": 1.8318300260573261, + "grad_norm": 1.130746841430664, + "learning_rate": 3.9498707164198984e-05, + "loss": 0.1096, + "step": 4570 + }, + { + "epoch": 1.835838845460012, + "grad_norm": 1.0901241302490234, + "learning_rate": 3.927072769274377e-05, + "loss": 0.1062, + "step": 4580 + }, + { + "epoch": 1.839847664862698, + "grad_norm": 0.79862380027771, + "learning_rate": 3.904298181007817e-05, + "loss": 0.1117, + "step": 4590 + }, + { + "epoch": 1.843856484265384, + "grad_norm": 0.8396957516670227, + "learning_rate": 3.881547447449606e-05, + "loss": 0.1247, + "step": 4600 + }, + { + "epoch": 1.84786530366807, + "grad_norm": 1.0613499879837036, + "learning_rate": 3.858821063909782e-05, + "loss": 0.1101, + "step": 4610 + }, + { + "epoch": 1.8518741230707558, + "grad_norm": 1.147533655166626, + "learning_rate": 3.8361195251682614e-05, + "loss": 0.1141, + "step": 4620 + }, + { + "epoch": 1.8558829424734415, + "grad_norm": 1.1135718822479248, + "learning_rate": 3.8134433254640576e-05, + "loss": 0.1266, + "step": 4630 + }, + { + "epoch": 1.8598917618761275, + "grad_norm": 1.0798869132995605, + "learning_rate": 3.790792958484522e-05, + "loss": 0.1132, + "step": 4640 + }, + { + "epoch": 1.8639005812788134, + "grad_norm": 0.9285503029823303, + "learning_rate": 3.7681689173545984e-05, + "loss": 0.1059, + "step": 4650 + }, + { + "epoch": 1.8679094006814991, + "grad_norm": 1.1934738159179688, + "learning_rate": 3.745571694626088e-05, + "loss": 0.1013, + "step": 4660 + }, + { + "epoch": 1.871918220084185, + "grad_norm": 1.0734087228775024, + "learning_rate": 3.7230017822669204e-05, + "loss": 0.1056, + "step": 4670 + }, + { + "epoch": 1.875927039486871, + "grad_norm": 0.9423579573631287, + "learning_rate": 3.700459671650452e-05, + "loss": 0.1193, + "step": 4680 + }, + { + "epoch": 1.879935858889557, + "grad_norm": 0.9041392803192139, + "learning_rate": 3.677945853544755e-05, + "loss": 0.1098, + "step": 4690 + }, + { + "epoch": 1.8839446782922429, + "grad_norm": 1.1040509939193726, + "learning_rate": 3.6554608181019465e-05, + "loss": 0.1195, + "step": 4700 + }, + { + "epoch": 1.8879534976949288, + "grad_norm": 1.2079628705978394, + "learning_rate": 3.633005054847514e-05, + "loss": 0.12, + "step": 4710 + }, + { + "epoch": 1.8919623170976148, + "grad_norm": 0.9661321640014648, + "learning_rate": 3.6105790526696445e-05, + "loss": 0.1128, + "step": 4720 + }, + { + "epoch": 1.8959711365003007, + "grad_norm": 1.2310171127319336, + "learning_rate": 3.588183299808604e-05, + "loss": 0.1165, + "step": 4730 + }, + { + "epoch": 1.8999799559029866, + "grad_norm": 0.9907431602478027, + "learning_rate": 3.565818283846089e-05, + "loss": 0.1037, + "step": 4740 + }, + { + "epoch": 1.9039887753056726, + "grad_norm": 0.9235789775848389, + "learning_rate": 3.543484491694615e-05, + "loss": 0.0974, + "step": 4750 + }, + { + "epoch": 1.9079975947083585, + "grad_norm": 1.1032791137695312, + "learning_rate": 3.521182409586925e-05, + "loss": 0.1223, + "step": 4760 + }, + { + "epoch": 1.9120064141110444, + "grad_norm": 1.138131856918335, + "learning_rate": 3.4989125230653965e-05, + "loss": 0.1085, + "step": 4770 + }, + { + "epoch": 1.9160152335137302, + "grad_norm": 1.0244325399398804, + "learning_rate": 3.476675316971466e-05, + "loss": 0.0997, + "step": 4780 + }, + { + "epoch": 1.920024052916416, + "grad_norm": 1.141847014427185, + "learning_rate": 3.454471275435083e-05, + "loss": 0.1054, + "step": 4790 + }, + { + "epoch": 1.924032872319102, + "grad_norm": 0.9330345988273621, + "learning_rate": 3.4323008818641696e-05, + "loss": 0.1065, + "step": 4800 + }, + { + "epoch": 1.928041691721788, + "grad_norm": 0.9627101421356201, + "learning_rate": 3.410164618934082e-05, + "loss": 0.0913, + "step": 4810 + }, + { + "epoch": 1.9320505111244737, + "grad_norm": 0.9817176461219788, + "learning_rate": 3.388062968577124e-05, + "loss": 0.1243, + "step": 4820 + }, + { + "epoch": 1.9360593305271596, + "grad_norm": 1.1931806802749634, + "learning_rate": 3.3659964119720356e-05, + "loss": 0.1068, + "step": 4830 + }, + { + "epoch": 1.9400681499298456, + "grad_norm": 1.1554603576660156, + "learning_rate": 3.3439654295335274e-05, + "loss": 0.1116, + "step": 4840 + }, + { + "epoch": 1.9440769693325315, + "grad_norm": 1.0284534692764282, + "learning_rate": 3.321970500901819e-05, + "loss": 0.1021, + "step": 4850 + }, + { + "epoch": 1.9480857887352174, + "grad_norm": 0.9820400476455688, + "learning_rate": 3.3000121049321956e-05, + "loss": 0.093, + "step": 4860 + }, + { + "epoch": 1.9520946081379034, + "grad_norm": 0.9649590849876404, + "learning_rate": 3.2780907196845845e-05, + "loss": 0.105, + "step": 4870 + }, + { + "epoch": 1.9561034275405893, + "grad_norm": 1.1404318809509277, + "learning_rate": 3.256206822413145e-05, + "loss": 0.1028, + "step": 4880 + }, + { + "epoch": 1.9601122469432752, + "grad_norm": 0.8916597366333008, + "learning_rate": 3.234360889555884e-05, + "loss": 0.114, + "step": 4890 + }, + { + "epoch": 1.9641210663459612, + "grad_norm": 1.0824633836746216, + "learning_rate": 3.2125533967242704e-05, + "loss": 0.1047, + "step": 4900 + }, + { + "epoch": 1.9681298857486471, + "grad_norm": 1.3187285661697388, + "learning_rate": 3.190784818692897e-05, + "loss": 0.1035, + "step": 4910 + }, + { + "epoch": 1.972138705151333, + "grad_norm": 1.2455309629440308, + "learning_rate": 3.169055629389132e-05, + "loss": 0.1032, + "step": 4920 + }, + { + "epoch": 1.976147524554019, + "grad_norm": 0.8298673629760742, + "learning_rate": 3.147366301882805e-05, + "loss": 0.1028, + "step": 4930 + }, + { + "epoch": 1.9801563439567047, + "grad_norm": 1.0020873546600342, + "learning_rate": 3.1257173083759086e-05, + "loss": 0.1167, + "step": 4940 + }, + { + "epoch": 1.9841651633593906, + "grad_norm": 1.1114490032196045, + "learning_rate": 3.104109120192317e-05, + "loss": 0.0998, + "step": 4950 + }, + { + "epoch": 1.9881739827620766, + "grad_norm": 1.0216310024261475, + "learning_rate": 3.082542207767523e-05, + "loss": 0.1189, + "step": 4960 + }, + { + "epoch": 1.9921828021647625, + "grad_norm": 1.2210197448730469, + "learning_rate": 3.0610170406384045e-05, + "loss": 0.1088, + "step": 4970 + }, + { + "epoch": 1.9961916215674482, + "grad_norm": 1.0631357431411743, + "learning_rate": 3.0395340874329837e-05, + "loss": 0.1098, + "step": 4980 + }, + { + "epoch": 2.0, + "grad_norm": 1.2607094049453735, + "learning_rate": 3.0180938158602483e-05, + "loss": 0.1112, + "step": 4990 + }, + { + "epoch": 2.004008819402686, + "grad_norm": 1.0189579725265503, + "learning_rate": 2.996696692699952e-05, + "loss": 0.0646, + "step": 5000 + }, + { + "epoch": 2.008017638805372, + "grad_norm": 1.3740028142929077, + "learning_rate": 2.9753431837924545e-05, + "loss": 0.083, + "step": 5010 + }, + { + "epoch": 2.012026458208058, + "grad_norm": 1.5331498384475708, + "learning_rate": 2.9540337540285868e-05, + "loss": 0.0717, + "step": 5020 + }, + { + "epoch": 2.0160352776107437, + "grad_norm": 1.4585199356079102, + "learning_rate": 2.9327688673395236e-05, + "loss": 0.071, + "step": 5030 + }, + { + "epoch": 2.0200440970134297, + "grad_norm": 1.2551562786102295, + "learning_rate": 2.911548986686683e-05, + "loss": 0.0805, + "step": 5040 + }, + { + "epoch": 2.0240529164161156, + "grad_norm": 0.9197141528129578, + "learning_rate": 2.890374574051654e-05, + "loss": 0.0747, + "step": 5050 + }, + { + "epoch": 2.0280617358188016, + "grad_norm": 1.359002947807312, + "learning_rate": 2.869246090426131e-05, + "loss": 0.0746, + "step": 5060 + }, + { + "epoch": 2.0320705552214875, + "grad_norm": 1.1969698667526245, + "learning_rate": 2.8481639958018758e-05, + "loss": 0.0703, + "step": 5070 + }, + { + "epoch": 2.036079374624173, + "grad_norm": 1.179650902748108, + "learning_rate": 2.827128749160715e-05, + "loss": 0.0744, + "step": 5080 + }, + { + "epoch": 2.040088194026859, + "grad_norm": 1.3521727323532104, + "learning_rate": 2.8061408084645358e-05, + "loss": 0.0712, + "step": 5090 + }, + { + "epoch": 2.044097013429545, + "grad_norm": 1.1871998310089111, + "learning_rate": 2.78520063064532e-05, + "loss": 0.0617, + "step": 5100 + }, + { + "epoch": 2.048105832832231, + "grad_norm": 1.5966202020645142, + "learning_rate": 2.7643086715951964e-05, + "loss": 0.0822, + "step": 5110 + }, + { + "epoch": 2.0521146522349167, + "grad_norm": 1.5227017402648926, + "learning_rate": 2.7434653861565175e-05, + "loss": 0.0782, + "step": 5120 + }, + { + "epoch": 2.0561234716376027, + "grad_norm": 1.2805331945419312, + "learning_rate": 2.7226712281119448e-05, + "loss": 0.065, + "step": 5130 + }, + { + "epoch": 2.0601322910402886, + "grad_norm": 1.0091133117675781, + "learning_rate": 2.701926650174592e-05, + "loss": 0.0771, + "step": 5140 + }, + { + "epoch": 2.0641411104429745, + "grad_norm": 1.114707589149475, + "learning_rate": 2.6812321039781507e-05, + "loss": 0.0796, + "step": 5150 + }, + { + "epoch": 2.0681499298456605, + "grad_norm": 1.2408510446548462, + "learning_rate": 2.6605880400670573e-05, + "loss": 0.0624, + "step": 5160 + }, + { + "epoch": 2.0721587492483464, + "grad_norm": 1.6427772045135498, + "learning_rate": 2.639994907886697e-05, + "loss": 0.0682, + "step": 5170 + }, + { + "epoch": 2.0761675686510324, + "grad_norm": 1.5963749885559082, + "learning_rate": 2.61945315577361e-05, + "loss": 0.0594, + "step": 5180 + }, + { + "epoch": 2.0801763880537183, + "grad_norm": 1.3586746454238892, + "learning_rate": 2.5989632309457318e-05, + "loss": 0.0764, + "step": 5190 + }, + { + "epoch": 2.0841852074564042, + "grad_norm": 1.2391273975372314, + "learning_rate": 2.5785255794926573e-05, + "loss": 0.0554, + "step": 5200 + }, + { + "epoch": 2.08819402685909, + "grad_norm": 1.079005241394043, + "learning_rate": 2.558140646365929e-05, + "loss": 0.0618, + "step": 5210 + }, + { + "epoch": 2.092202846261776, + "grad_norm": 1.494195580482483, + "learning_rate": 2.537808875369351e-05, + "loss": 0.0745, + "step": 5220 + }, + { + "epoch": 2.0962116656644616, + "grad_norm": 1.301558256149292, + "learning_rate": 2.5175307091493255e-05, + "loss": 0.0661, + "step": 5230 + }, + { + "epoch": 2.1002204850671475, + "grad_norm": 1.9365872144699097, + "learning_rate": 2.497306589185212e-05, + "loss": 0.0726, + "step": 5240 + }, + { + "epoch": 2.1042293044698335, + "grad_norm": 1.3354028463363647, + "learning_rate": 2.4771369557797264e-05, + "loss": 0.0742, + "step": 5250 + }, + { + "epoch": 2.1082381238725194, + "grad_norm": 1.2166097164154053, + "learning_rate": 2.4570222480493437e-05, + "loss": 0.0763, + "step": 5260 + }, + { + "epoch": 2.1122469432752053, + "grad_norm": 1.8834477663040161, + "learning_rate": 2.4369629039147458e-05, + "loss": 0.0657, + "step": 5270 + }, + { + "epoch": 2.1162557626778913, + "grad_norm": 1.174580693244934, + "learning_rate": 2.416959360091283e-05, + "loss": 0.0725, + "step": 5280 + }, + { + "epoch": 2.120264582080577, + "grad_norm": 1.636049747467041, + "learning_rate": 2.397012052079469e-05, + "loss": 0.0677, + "step": 5290 + }, + { + "epoch": 2.124273401483263, + "grad_norm": 1.0291727781295776, + "learning_rate": 2.3771214141554932e-05, + "loss": 0.072, + "step": 5300 + }, + { + "epoch": 2.128282220885949, + "grad_norm": 1.4774961471557617, + "learning_rate": 2.3572878793617785e-05, + "loss": 0.0626, + "step": 5310 + }, + { + "epoch": 2.132291040288635, + "grad_norm": 1.3647609949111938, + "learning_rate": 2.3375118794975436e-05, + "loss": 0.0822, + "step": 5320 + }, + { + "epoch": 2.136299859691321, + "grad_norm": 1.2686548233032227, + "learning_rate": 2.3177938451093994e-05, + "loss": 0.0654, + "step": 5330 + }, + { + "epoch": 2.140308679094007, + "grad_norm": 1.0904122591018677, + "learning_rate": 2.298134205481986e-05, + "loss": 0.0788, + "step": 5340 + }, + { + "epoch": 2.144317498496693, + "grad_norm": 1.5554347038269043, + "learning_rate": 2.278533388628621e-05, + "loss": 0.0618, + "step": 5350 + }, + { + "epoch": 2.148326317899379, + "grad_norm": 1.4490818977355957, + "learning_rate": 2.2589918212819787e-05, + "loss": 0.0714, + "step": 5360 + }, + { + "epoch": 2.1523351373020647, + "grad_norm": 1.056925892829895, + "learning_rate": 2.2395099288848066e-05, + "loss": 0.0787, + "step": 5370 + }, + { + "epoch": 2.1563439567047507, + "grad_norm": 1.642364740371704, + "learning_rate": 2.2200881355806565e-05, + "loss": 0.0766, + "step": 5380 + }, + { + "epoch": 2.1603527761074366, + "grad_norm": 1.0628970861434937, + "learning_rate": 2.2007268642046476e-05, + "loss": 0.0557, + "step": 5390 + }, + { + "epoch": 2.164361595510122, + "grad_norm": 0.9546886086463928, + "learning_rate": 2.181426536274277e-05, + "loss": 0.0591, + "step": 5400 + }, + { + "epoch": 2.168370414912808, + "grad_norm": 1.6997793912887573, + "learning_rate": 2.1621875719802258e-05, + "loss": 0.069, + "step": 5410 + }, + { + "epoch": 2.172379234315494, + "grad_norm": 1.2331558465957642, + "learning_rate": 2.1430103901772135e-05, + "loss": 0.0765, + "step": 5420 + }, + { + "epoch": 2.17638805371818, + "grad_norm": 1.4007397890090942, + "learning_rate": 2.1238954083748887e-05, + "loss": 0.0759, + "step": 5430 + }, + { + "epoch": 2.180396873120866, + "grad_norm": 1.1436303853988647, + "learning_rate": 2.1048430427287304e-05, + "loss": 0.0681, + "step": 5440 + }, + { + "epoch": 2.1844056925235518, + "grad_norm": 0.89713454246521, + "learning_rate": 2.085853708030991e-05, + "loss": 0.0701, + "step": 5450 + }, + { + "epoch": 2.1884145119262377, + "grad_norm": 1.5042600631713867, + "learning_rate": 2.0669278177016664e-05, + "loss": 0.0654, + "step": 5460 + }, + { + "epoch": 2.1924233313289236, + "grad_norm": 1.211078405380249, + "learning_rate": 2.0480657837794963e-05, + "loss": 0.069, + "step": 5470 + }, + { + "epoch": 2.1964321507316096, + "grad_norm": 0.9574674367904663, + "learning_rate": 2.0292680169129828e-05, + "loss": 0.0623, + "step": 5480 + }, + { + "epoch": 2.2004409701342955, + "grad_norm": 1.1876091957092285, + "learning_rate": 2.0105349263514728e-05, + "loss": 0.0637, + "step": 5490 + }, + { + "epoch": 2.2044497895369815, + "grad_norm": 1.3990014791488647, + "learning_rate": 1.991866919936226e-05, + "loss": 0.0659, + "step": 5500 + }, + { + "epoch": 2.2084586089396674, + "grad_norm": 0.9827179312705994, + "learning_rate": 1.9732644040915427e-05, + "loss": 0.0603, + "step": 5510 + }, + { + "epoch": 2.2124674283423533, + "grad_norm": 1.0140836238861084, + "learning_rate": 1.9547277838159222e-05, + "loss": 0.0574, + "step": 5520 + }, + { + "epoch": 2.2164762477450393, + "grad_norm": 1.0066441297531128, + "learning_rate": 1.936257462673238e-05, + "loss": 0.0693, + "step": 5530 + }, + { + "epoch": 2.220485067147725, + "grad_norm": 1.0355478525161743, + "learning_rate": 1.9178538427839537e-05, + "loss": 0.0623, + "step": 5540 + }, + { + "epoch": 2.2244938865504107, + "grad_norm": 1.0241667032241821, + "learning_rate": 1.8995173248163716e-05, + "loss": 0.0575, + "step": 5550 + }, + { + "epoch": 2.2285027059530966, + "grad_norm": 1.0945931673049927, + "learning_rate": 1.8812483079779008e-05, + "loss": 0.0617, + "step": 5560 + }, + { + "epoch": 2.2325115253557826, + "grad_norm": 1.2159384489059448, + "learning_rate": 1.863047190006375e-05, + "loss": 0.0764, + "step": 5570 + }, + { + "epoch": 2.2365203447584685, + "grad_norm": 1.3158025741577148, + "learning_rate": 1.8449143671613962e-05, + "loss": 0.0663, + "step": 5580 + }, + { + "epoch": 2.2405291641611544, + "grad_norm": 1.1542671918869019, + "learning_rate": 1.8268502342156918e-05, + "loss": 0.064, + "step": 5590 + }, + { + "epoch": 2.2445379835638404, + "grad_norm": 1.233852744102478, + "learning_rate": 1.808855184446535e-05, + "loss": 0.0708, + "step": 5600 + }, + { + "epoch": 2.2485468029665263, + "grad_norm": 1.0461921691894531, + "learning_rate": 1.7909296096271783e-05, + "loss": 0.0611, + "step": 5610 + }, + { + "epoch": 2.2525556223692123, + "grad_norm": 1.2904634475708008, + "learning_rate": 1.773073900018321e-05, + "loss": 0.0598, + "step": 5620 + }, + { + "epoch": 2.256564441771898, + "grad_norm": 1.213394284248352, + "learning_rate": 1.7552884443596168e-05, + "loss": 0.0608, + "step": 5630 + }, + { + "epoch": 2.260573261174584, + "grad_norm": 1.203125, + "learning_rate": 1.73757362986121e-05, + "loss": 0.0638, + "step": 5640 + }, + { + "epoch": 2.26458208057727, + "grad_norm": 1.0718966722488403, + "learning_rate": 1.7199298421952987e-05, + "loss": 0.0628, + "step": 5650 + }, + { + "epoch": 2.268590899979956, + "grad_norm": 1.5006955862045288, + "learning_rate": 1.7023574654877482e-05, + "loss": 0.0591, + "step": 5660 + }, + { + "epoch": 2.272599719382642, + "grad_norm": 1.0694504976272583, + "learning_rate": 1.684856882309729e-05, + "loss": 0.0699, + "step": 5670 + }, + { + "epoch": 2.276608538785328, + "grad_norm": 1.068630337715149, + "learning_rate": 1.6674284736693713e-05, + "loss": 0.0599, + "step": 5680 + }, + { + "epoch": 2.280617358188014, + "grad_norm": 0.9531617760658264, + "learning_rate": 1.6500726190034888e-05, + "loss": 0.0595, + "step": 5690 + }, + { + "epoch": 2.2846261775906997, + "grad_norm": 1.1300429105758667, + "learning_rate": 1.6327896961693086e-05, + "loss": 0.0704, + "step": 5700 + }, + { + "epoch": 2.2886349969933857, + "grad_norm": 1.248582124710083, + "learning_rate": 1.6155800814362475e-05, + "loss": 0.0591, + "step": 5710 + }, + { + "epoch": 2.292643816396071, + "grad_norm": 1.2277759313583374, + "learning_rate": 1.598444149477718e-05, + "loss": 0.0644, + "step": 5720 + }, + { + "epoch": 2.296652635798757, + "grad_norm": 1.4432833194732666, + "learning_rate": 1.5813822733629745e-05, + "loss": 0.0715, + "step": 5730 + }, + { + "epoch": 2.300661455201443, + "grad_norm": 1.1492823362350464, + "learning_rate": 1.5643948245489836e-05, + "loss": 0.0525, + "step": 5740 + }, + { + "epoch": 2.304670274604129, + "grad_norm": 1.4520362615585327, + "learning_rate": 1.547482172872351e-05, + "loss": 0.0536, + "step": 5750 + }, + { + "epoch": 2.308679094006815, + "grad_norm": 1.236132025718689, + "learning_rate": 1.530644686541258e-05, + "loss": 0.0584, + "step": 5760 + }, + { + "epoch": 2.312687913409501, + "grad_norm": 1.4177806377410889, + "learning_rate": 1.5138827321274435e-05, + "loss": 0.0597, + "step": 5770 + }, + { + "epoch": 2.316696732812187, + "grad_norm": 1.0297455787658691, + "learning_rate": 1.497196674558235e-05, + "loss": 0.0627, + "step": 5780 + }, + { + "epoch": 2.3207055522148727, + "grad_norm": 1.1963504552841187, + "learning_rate": 1.4805868771085946e-05, + "loss": 0.0627, + "step": 5790 + }, + { + "epoch": 2.3247143716175587, + "grad_norm": 1.5588128566741943, + "learning_rate": 1.4640537013932121e-05, + "loss": 0.0609, + "step": 5800 + }, + { + "epoch": 2.3287231910202446, + "grad_norm": 1.5374112129211426, + "learning_rate": 1.4475975073586345e-05, + "loss": 0.0716, + "step": 5810 + }, + { + "epoch": 2.3327320104229305, + "grad_norm": 1.6463807821273804, + "learning_rate": 1.431218653275424e-05, + "loss": 0.0737, + "step": 5820 + }, + { + "epoch": 2.3367408298256165, + "grad_norm": 1.3641928434371948, + "learning_rate": 1.4149174957303629e-05, + "loss": 0.0672, + "step": 5830 + }, + { + "epoch": 2.3407496492283024, + "grad_norm": 1.259701132774353, + "learning_rate": 1.398694389618696e-05, + "loss": 0.0759, + "step": 5840 + }, + { + "epoch": 2.3447584686309884, + "grad_norm": 1.2060563564300537, + "learning_rate": 1.3825496881363864e-05, + "loss": 0.0628, + "step": 5850 + }, + { + "epoch": 2.348767288033674, + "grad_norm": 1.3083685636520386, + "learning_rate": 1.3664837427724431e-05, + "loss": 0.0578, + "step": 5860 + }, + { + "epoch": 2.35277610743636, + "grad_norm": 1.3214398622512817, + "learning_rate": 1.3504969033012615e-05, + "loss": 0.06, + "step": 5870 + }, + { + "epoch": 2.3567849268390457, + "grad_norm": 1.1904963254928589, + "learning_rate": 1.3345895177750094e-05, + "loss": 0.0617, + "step": 5880 + }, + { + "epoch": 2.3607937462417317, + "grad_norm": 1.1517525911331177, + "learning_rate": 1.3187619325160483e-05, + "loss": 0.0528, + "step": 5890 + }, + { + "epoch": 2.3648025656444176, + "grad_norm": 1.424729824066162, + "learning_rate": 1.3030144921093979e-05, + "loss": 0.0652, + "step": 5900 + }, + { + "epoch": 2.3688113850471035, + "grad_norm": 1.4582880735397339, + "learning_rate": 1.2873475393952245e-05, + "loss": 0.0641, + "step": 5910 + }, + { + "epoch": 2.3728202044497895, + "grad_norm": 1.2188777923583984, + "learning_rate": 1.2717614154613877e-05, + "loss": 0.067, + "step": 5920 + }, + { + "epoch": 2.3768290238524754, + "grad_norm": 1.2932417392730713, + "learning_rate": 1.2562564596360144e-05, + "loss": 0.0535, + "step": 5930 + }, + { + "epoch": 2.3808378432551613, + "grad_norm": 1.2565412521362305, + "learning_rate": 1.2408330094800974e-05, + "loss": 0.0642, + "step": 5940 + }, + { + "epoch": 2.3848466626578473, + "grad_norm": 1.1354554891586304, + "learning_rate": 1.225491400780162e-05, + "loss": 0.0518, + "step": 5950 + }, + { + "epoch": 2.388855482060533, + "grad_norm": 1.0824904441833496, + "learning_rate": 1.2102319675409491e-05, + "loss": 0.0593, + "step": 5960 + }, + { + "epoch": 2.392864301463219, + "grad_norm": 1.3248436450958252, + "learning_rate": 1.1950550419781414e-05, + "loss": 0.0606, + "step": 5970 + }, + { + "epoch": 2.396873120865905, + "grad_norm": 1.3530750274658203, + "learning_rate": 1.1799609545111363e-05, + "loss": 0.058, + "step": 5980 + }, + { + "epoch": 2.400881940268591, + "grad_norm": 1.5529499053955078, + "learning_rate": 1.1649500337558478e-05, + "loss": 0.066, + "step": 5990 + }, + { + "epoch": 2.404890759671277, + "grad_norm": 0.9849441647529602, + "learning_rate": 1.15002260651755e-05, + "loss": 0.0657, + "step": 6000 + }, + { + "epoch": 2.408899579073963, + "grad_norm": 1.6223032474517822, + "learning_rate": 1.1351789977837696e-05, + "loss": 0.0687, + "step": 6010 + }, + { + "epoch": 2.412908398476649, + "grad_norm": 1.4085158109664917, + "learning_rate": 1.1204195307172094e-05, + "loss": 0.0608, + "step": 6020 + }, + { + "epoch": 2.416917217879335, + "grad_norm": 1.329626441001892, + "learning_rate": 1.1057445266487016e-05, + "loss": 0.0619, + "step": 6030 + }, + { + "epoch": 2.4209260372820203, + "grad_norm": 1.2898280620574951, + "learning_rate": 1.091154305070226e-05, + "loss": 0.0653, + "step": 6040 + }, + { + "epoch": 2.424934856684706, + "grad_norm": 1.290812611579895, + "learning_rate": 1.0766491836279486e-05, + "loss": 0.0636, + "step": 6050 + }, + { + "epoch": 2.428943676087392, + "grad_norm": 1.226360559463501, + "learning_rate": 1.0622294781153036e-05, + "loss": 0.0486, + "step": 6060 + }, + { + "epoch": 2.432952495490078, + "grad_norm": 1.4300650358200073, + "learning_rate": 1.047895502466122e-05, + "loss": 0.0711, + "step": 6070 + }, + { + "epoch": 2.436961314892764, + "grad_norm": 1.4043900966644287, + "learning_rate": 1.0336475687477964e-05, + "loss": 0.0625, + "step": 6080 + }, + { + "epoch": 2.44097013429545, + "grad_norm": 1.0565260648727417, + "learning_rate": 1.0194859871544831e-05, + "loss": 0.0561, + "step": 6090 + }, + { + "epoch": 2.444978953698136, + "grad_norm": 1.1184768676757812, + "learning_rate": 1.0054110660003551e-05, + "loss": 0.0584, + "step": 6100 + }, + { + "epoch": 2.448987773100822, + "grad_norm": 1.2461347579956055, + "learning_rate": 9.914231117128841e-06, + "loss": 0.0709, + "step": 6110 + }, + { + "epoch": 2.4529965925035078, + "grad_norm": 1.2430334091186523, + "learning_rate": 9.77522428826173e-06, + "loss": 0.0606, + "step": 6120 + }, + { + "epoch": 2.4570054119061937, + "grad_norm": 1.2371201515197754, + "learning_rate": 9.637093199743236e-06, + "loss": 0.0627, + "step": 6130 + }, + { + "epoch": 2.4610142313088796, + "grad_norm": 1.1069798469543457, + "learning_rate": 9.499840858848497e-06, + "loss": 0.0564, + "step": 6140 + }, + { + "epoch": 2.4650230507115656, + "grad_norm": 1.2197552919387817, + "learning_rate": 9.363470253721268e-06, + "loss": 0.0611, + "step": 6150 + }, + { + "epoch": 2.4690318701142515, + "grad_norm": 1.005348563194275, + "learning_rate": 9.227984353308926e-06, + "loss": 0.0513, + "step": 6160 + }, + { + "epoch": 2.4730406895169375, + "grad_norm": 1.237045407295227, + "learning_rate": 9.09338610729773e-06, + "loss": 0.0598, + "step": 6170 + }, + { + "epoch": 2.477049508919623, + "grad_norm": 1.4536449909210205, + "learning_rate": 8.959678446048725e-06, + "loss": 0.0587, + "step": 6180 + }, + { + "epoch": 2.481058328322309, + "grad_norm": 1.0354303121566772, + "learning_rate": 8.826864280533853e-06, + "loss": 0.0589, + "step": 6190 + }, + { + "epoch": 2.485067147724995, + "grad_norm": 1.427465796470642, + "learning_rate": 8.694946502272628e-06, + "loss": 0.0482, + "step": 6200 + }, + { + "epoch": 2.4890759671276808, + "grad_norm": 0.8941324353218079, + "learning_rate": 8.563927983269154e-06, + "loss": 0.0635, + "step": 6210 + }, + { + "epoch": 2.4930847865303667, + "grad_norm": 1.0785568952560425, + "learning_rate": 8.433811575949618e-06, + "loss": 0.0622, + "step": 6220 + }, + { + "epoch": 2.4970936059330526, + "grad_norm": 1.0514103174209595, + "learning_rate": 8.304600113100181e-06, + "loss": 0.0566, + "step": 6230 + }, + { + "epoch": 2.5011024253357386, + "grad_norm": 1.5485719442367554, + "learning_rate": 8.1762964078053e-06, + "loss": 0.051, + "step": 6240 + }, + { + "epoch": 2.5051112447384245, + "grad_norm": 1.6941360235214233, + "learning_rate": 8.048903253386515e-06, + "loss": 0.0497, + "step": 6250 + }, + { + "epoch": 2.5091200641411104, + "grad_norm": 1.2973785400390625, + "learning_rate": 7.922423423341551e-06, + "loss": 0.0544, + "step": 6260 + }, + { + "epoch": 2.5131288835437964, + "grad_norm": 1.2612277269363403, + "learning_rate": 7.796859671284045e-06, + "loss": 0.0614, + "step": 6270 + }, + { + "epoch": 2.5171377029464823, + "grad_norm": 1.012540340423584, + "learning_rate": 7.672214730883565e-06, + "loss": 0.0655, + "step": 6280 + }, + { + "epoch": 2.5211465223491683, + "grad_norm": 1.5126792192459106, + "learning_rate": 7.548491315806011e-06, + "loss": 0.055, + "step": 6290 + }, + { + "epoch": 2.525155341751854, + "grad_norm": 1.1852082014083862, + "learning_rate": 7.425692119654648e-06, + "loss": 0.0621, + "step": 6300 + }, + { + "epoch": 2.52916416115454, + "grad_norm": 1.29707670211792, + "learning_rate": 7.3038198159114005e-06, + "loss": 0.0605, + "step": 6310 + }, + { + "epoch": 2.533172980557226, + "grad_norm": 1.53734290599823, + "learning_rate": 7.1828770578786616e-06, + "loss": 0.0581, + "step": 6320 + }, + { + "epoch": 2.537181799959912, + "grad_norm": 1.385482668876648, + "learning_rate": 7.062866478621538e-06, + "loss": 0.0601, + "step": 6330 + }, + { + "epoch": 2.541190619362598, + "grad_norm": 1.2017461061477661, + "learning_rate": 6.943790690910512e-06, + "loss": 0.0504, + "step": 6340 + }, + { + "epoch": 2.545199438765284, + "grad_norm": 1.387803077697754, + "learning_rate": 6.825652287164541e-06, + "loss": 0.0574, + "step": 6350 + }, + { + "epoch": 2.54920825816797, + "grad_norm": 1.3186421394348145, + "learning_rate": 6.708453839394657e-06, + "loss": 0.0585, + "step": 6360 + }, + { + "epoch": 2.5532170775706553, + "grad_norm": 1.398294448852539, + "learning_rate": 6.592197899147984e-06, + "loss": 0.0694, + "step": 6370 + }, + { + "epoch": 2.5572258969733412, + "grad_norm": 1.121980905532837, + "learning_rate": 6.476886997452092e-06, + "loss": 0.0513, + "step": 6380 + }, + { + "epoch": 2.561234716376027, + "grad_norm": 1.4234181642532349, + "learning_rate": 6.362523644760016e-06, + "loss": 0.0546, + "step": 6390 + }, + { + "epoch": 2.565243535778713, + "grad_norm": 1.428277611732483, + "learning_rate": 6.24911033089548e-06, + "loss": 0.0598, + "step": 6400 + }, + { + "epoch": 2.569252355181399, + "grad_norm": 1.793627381324768, + "learning_rate": 6.1366495249988275e-06, + "loss": 0.0624, + "step": 6410 + }, + { + "epoch": 2.573261174584085, + "grad_norm": 1.2697495222091675, + "learning_rate": 6.0251436754731495e-06, + "loss": 0.058, + "step": 6420 + }, + { + "epoch": 2.577269993986771, + "grad_norm": 1.402320384979248, + "learning_rate": 5.914595209931006e-06, + "loss": 0.0523, + "step": 6430 + }, + { + "epoch": 2.581278813389457, + "grad_norm": 1.0470867156982422, + "learning_rate": 5.805006535141621e-06, + "loss": 0.0645, + "step": 6440 + }, + { + "epoch": 2.585287632792143, + "grad_norm": 1.0179369449615479, + "learning_rate": 5.6963800369784385e-06, + "loss": 0.0579, + "step": 6450 + }, + { + "epoch": 2.5892964521948287, + "grad_norm": 1.298664927482605, + "learning_rate": 5.588718080367195e-06, + "loss": 0.0596, + "step": 6460 + }, + { + "epoch": 2.5933052715975147, + "grad_norm": 1.2758408784866333, + "learning_rate": 5.4820230092344385e-06, + "loss": 0.0635, + "step": 6470 + }, + { + "epoch": 2.5973140910002, + "grad_norm": 1.3012737035751343, + "learning_rate": 5.376297146456488e-06, + "loss": 0.0542, + "step": 6480 + }, + { + "epoch": 2.601322910402886, + "grad_norm": 1.1228282451629639, + "learning_rate": 5.271542793808837e-06, + "loss": 0.0547, + "step": 6490 + }, + { + "epoch": 2.605331729805572, + "grad_norm": 1.2128888368606567, + "learning_rate": 5.1677622319161125e-06, + "loss": 0.0582, + "step": 6500 + }, + { + "epoch": 2.609340549208258, + "grad_norm": 1.4961844682693481, + "learning_rate": 5.064957720202374e-06, + "loss": 0.0548, + "step": 6510 + }, + { + "epoch": 2.613349368610944, + "grad_norm": 1.5457652807235718, + "learning_rate": 4.963131496841878e-06, + "loss": 0.069, + "step": 6520 + }, + { + "epoch": 2.61735818801363, + "grad_norm": 0.805304229259491, + "learning_rate": 4.862285778710462e-06, + "loss": 0.0454, + "step": 6530 + }, + { + "epoch": 2.621367007416316, + "grad_norm": 1.2888411283493042, + "learning_rate": 4.762422761337182e-06, + "loss": 0.0531, + "step": 6540 + }, + { + "epoch": 2.6253758268190017, + "grad_norm": 1.2126528024673462, + "learning_rate": 4.663544618856575e-06, + "loss": 0.0688, + "step": 6550 + }, + { + "epoch": 2.6293846462216877, + "grad_norm": 1.269785761833191, + "learning_rate": 4.565653503961281e-06, + "loss": 0.0551, + "step": 6560 + }, + { + "epoch": 2.6333934656243736, + "grad_norm": 1.2016196250915527, + "learning_rate": 4.468751547855215e-06, + "loss": 0.0692, + "step": 6570 + }, + { + "epoch": 2.6374022850270595, + "grad_norm": 1.2045531272888184, + "learning_rate": 4.372840860207123e-06, + "loss": 0.0468, + "step": 6580 + }, + { + "epoch": 2.6414111044297455, + "grad_norm": 0.6648709177970886, + "learning_rate": 4.2779235291047105e-06, + "loss": 0.053, + "step": 6590 + }, + { + "epoch": 2.6454199238324314, + "grad_norm": 1.4050956964492798, + "learning_rate": 4.184001621009137e-06, + "loss": 0.054, + "step": 6600 + }, + { + "epoch": 2.6494287432351173, + "grad_norm": 1.4628591537475586, + "learning_rate": 4.091077180710029e-06, + "loss": 0.0633, + "step": 6610 + }, + { + "epoch": 2.6534375626378033, + "grad_norm": 0.8363884091377258, + "learning_rate": 3.9991522312809945e-06, + "loss": 0.0523, + "step": 6620 + }, + { + "epoch": 2.657446382040489, + "grad_norm": 1.3806829452514648, + "learning_rate": 3.908228774035544e-06, + "loss": 0.057, + "step": 6630 + }, + { + "epoch": 2.661455201443175, + "grad_norm": 1.4262441396713257, + "learning_rate": 3.818308788483533e-06, + "loss": 0.0458, + "step": 6640 + }, + { + "epoch": 2.665464020845861, + "grad_norm": 1.5106767416000366, + "learning_rate": 3.72939423228808e-06, + "loss": 0.0636, + "step": 6650 + }, + { + "epoch": 2.669472840248547, + "grad_norm": 1.71439790725708, + "learning_rate": 3.6414870412229184e-06, + "loss": 0.0601, + "step": 6660 + }, + { + "epoch": 2.673481659651233, + "grad_norm": 1.167715072631836, + "learning_rate": 3.5545891291302704e-06, + "loss": 0.0486, + "step": 6670 + }, + { + "epoch": 2.6774904790539185, + "grad_norm": 1.4082682132720947, + "learning_rate": 3.4687023878791857e-06, + "loss": 0.0543, + "step": 6680 + }, + { + "epoch": 2.6814992984566044, + "grad_norm": 1.1505839824676514, + "learning_rate": 3.3838286873243197e-06, + "loss": 0.0512, + "step": 6690 + }, + { + "epoch": 2.6855081178592903, + "grad_norm": 1.6079833507537842, + "learning_rate": 3.2999698752652685e-06, + "loss": 0.0567, + "step": 6700 + }, + { + "epoch": 2.6895169372619763, + "grad_norm": 1.8533159494400024, + "learning_rate": 3.2171277774063204e-06, + "loss": 0.0588, + "step": 6710 + }, + { + "epoch": 2.693525756664662, + "grad_norm": 1.6835551261901855, + "learning_rate": 3.1353041973166965e-06, + "loss": 0.0619, + "step": 6720 + }, + { + "epoch": 2.697534576067348, + "grad_norm": 1.7107609510421753, + "learning_rate": 3.054500916391312e-06, + "loss": 0.0581, + "step": 6730 + }, + { + "epoch": 2.701543395470034, + "grad_norm": 1.131338119506836, + "learning_rate": 2.9747196938119614e-06, + "loss": 0.0501, + "step": 6740 + }, + { + "epoch": 2.70555221487272, + "grad_norm": 1.9655126333236694, + "learning_rate": 2.8959622665090338e-06, + "loss": 0.049, + "step": 6750 + }, + { + "epoch": 2.709561034275406, + "grad_norm": 1.6402628421783447, + "learning_rate": 2.818230349123724e-06, + "loss": 0.0671, + "step": 6760 + }, + { + "epoch": 2.713569853678092, + "grad_norm": 1.2333426475524902, + "learning_rate": 2.741525633970665e-06, + "loss": 0.0526, + "step": 6770 + }, + { + "epoch": 2.717578673080778, + "grad_norm": 1.035477638244629, + "learning_rate": 2.665849791001074e-06, + "loss": 0.0479, + "step": 6780 + }, + { + "epoch": 2.7215874924834638, + "grad_norm": 1.5815109014511108, + "learning_rate": 2.591204467766456e-06, + "loss": 0.0659, + "step": 6790 + }, + { + "epoch": 2.7255963118861493, + "grad_norm": 1.1497759819030762, + "learning_rate": 2.517591289382676e-06, + "loss": 0.0502, + "step": 6800 + }, + { + "epoch": 2.729605131288835, + "grad_norm": 1.2210969924926758, + "learning_rate": 2.4450118584946002e-06, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 2.733613950691521, + "grad_norm": 1.4234390258789062, + "learning_rate": 2.373467755241221e-06, + "loss": 0.0577, + "step": 6820 + }, + { + "epoch": 2.737622770094207, + "grad_norm": 1.0173630714416504, + "learning_rate": 2.302960537221227e-06, + "loss": 0.0418, + "step": 6830 + }, + { + "epoch": 2.741631589496893, + "grad_norm": 1.2278695106506348, + "learning_rate": 2.2334917394590873e-06, + "loss": 0.0429, + "step": 6840 + }, + { + "epoch": 2.745640408899579, + "grad_norm": 1.5566179752349854, + "learning_rate": 2.1650628743716874e-06, + "loss": 0.0504, + "step": 6850 + }, + { + "epoch": 2.749649228302265, + "grad_norm": 1.4304202795028687, + "learning_rate": 2.097675431735341e-06, + "loss": 0.0547, + "step": 6860 + }, + { + "epoch": 2.753658047704951, + "grad_norm": 1.333284854888916, + "learning_rate": 2.0313308786533647e-06, + "loss": 0.0629, + "step": 6870 + }, + { + "epoch": 2.7576668671076368, + "grad_norm": 1.5218358039855957, + "learning_rate": 1.966030659524182e-06, + "loss": 0.0514, + "step": 6880 + }, + { + "epoch": 2.7616756865103227, + "grad_norm": 1.650320291519165, + "learning_rate": 1.9017761960098302e-06, + "loss": 0.0574, + "step": 6890 + }, + { + "epoch": 2.7656845059130086, + "grad_norm": 1.31277334690094, + "learning_rate": 1.838568887005021e-06, + "loss": 0.0574, + "step": 6900 + }, + { + "epoch": 2.7696933253156946, + "grad_norm": 1.2339560985565186, + "learning_rate": 1.776410108606702e-06, + "loss": 0.0568, + "step": 6910 + }, + { + "epoch": 2.7737021447183805, + "grad_norm": 1.1014797687530518, + "learning_rate": 1.7153012140840808e-06, + "loss": 0.0633, + "step": 6920 + }, + { + "epoch": 2.7777109641210664, + "grad_norm": 1.3516331911087036, + "learning_rate": 1.6552435338491544e-06, + "loss": 0.0444, + "step": 6930 + }, + { + "epoch": 2.7817197835237524, + "grad_norm": 1.180059552192688, + "learning_rate": 1.596238375427772e-06, + "loss": 0.0529, + "step": 6940 + }, + { + "epoch": 2.7857286029264383, + "grad_norm": 1.1580662727355957, + "learning_rate": 1.538287023431162e-06, + "loss": 0.0567, + "step": 6950 + }, + { + "epoch": 2.7897374223291243, + "grad_norm": 1.3882676362991333, + "learning_rate": 1.4813907395279214e-06, + "loss": 0.0631, + "step": 6960 + }, + { + "epoch": 2.79374624173181, + "grad_norm": 0.8586792349815369, + "learning_rate": 1.4255507624166109e-06, + "loss": 0.0487, + "step": 6970 + }, + { + "epoch": 2.797755061134496, + "grad_norm": 1.3401018381118774, + "learning_rate": 1.3707683077987588e-06, + "loss": 0.059, + "step": 6980 + }, + { + "epoch": 2.801763880537182, + "grad_norm": 1.160061240196228, + "learning_rate": 1.3170445683523769e-06, + "loss": 0.0511, + "step": 6990 + }, + { + "epoch": 2.8057726999398676, + "grad_norm": 1.5262839794158936, + "learning_rate": 1.264380713706037e-06, + "loss": 0.0571, + "step": 7000 + } + ], + "logging_steps": 10, + "max_steps": 7482, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.328918701667516e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}