diff --git "a/checkpoint-1695/trainer_state.json" "b/checkpoint-1695/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1695/trainer_state.json" @@ -0,0 +1,11982 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9986731534719153, + "eval_steps": 142, + "global_step": 1695, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.588052749633789, + "learning_rate": 2.9999999999999997e-05, + "loss": 3.3182, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.3362529277801514, + "eval_runtime": 14.4233, + "eval_samples_per_second": 33.071, + "eval_steps_per_second": 8.32, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.520856857299805, + "learning_rate": 5.9999999999999995e-05, + "loss": 3.2788, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 4.619396209716797, + "learning_rate": 8.999999999999999e-05, + "loss": 3.3097, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 4.416432857513428, + "learning_rate": 0.00011999999999999999, + "loss": 2.9162, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 3.6663408279418945, + "learning_rate": 0.00015, + "loss": 2.0914, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 2.739701747894287, + "learning_rate": 0.00017999999999999998, + "loss": 0.9915, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 1.6202051639556885, + "learning_rate": 0.00020999999999999998, + "loss": 0.4153, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 0.975229799747467, + "learning_rate": 0.00023999999999999998, + "loss": 0.1806, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 1.136542558670044, + "learning_rate": 0.00027, + "loss": 0.1403, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 3.98671555519104, + "learning_rate": 0.0003, + "loss": 0.386, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.3339874744415283, + "learning_rate": 0.0002999997392879692, + "loss": 0.1334, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.4813332259654999, + "learning_rate": 0.0002999989571527831, + "loss": 0.1525, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.3785192370414734, + "learning_rate": 0.0002999976535971604, + "loss": 0.1408, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.1563730090856552, + "learning_rate": 0.00029999582862563263, + "loss": 0.137, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 0.121751569211483, + "learning_rate": 0.00029999348224454364, + "loss": 0.1371, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.22550074756145477, + "learning_rate": 0.0002999906144620498, + "loss": 0.1512, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.2235211282968521, + "learning_rate": 0.00029998722528811996, + "loss": 0.1483, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 1.4022941589355469, + "learning_rate": 0.0002999833147345355, + "loss": 0.1124, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.16621588170528412, + "learning_rate": 0.0002999788828148901, + "loss": 0.1414, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 0.06893815100193024, + "learning_rate": 0.00029997392954458983, + "loss": 0.1364, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.3331284821033478, + "learning_rate": 0.000299968454940853, + "loss": 0.1433, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.1405547559261322, + "learning_rate": 0.0002999624590227103, + "loss": 0.1291, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.14127376675605774, + "learning_rate": 0.00029995594181100437, + "loss": 0.1298, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.20128677785396576, + "learning_rate": 0.00029994890332839025, + "loss": 0.1347, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.31639915704727173, + "learning_rate": 0.0002999413435993347, + "loss": 0.1288, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 0.8167548775672913, + "learning_rate": 0.00029993326265011667, + "loss": 0.1785, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 0.1112348735332489, + "learning_rate": 0.0002999246605088267, + "loss": 0.1168, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.06746704876422882, + "learning_rate": 0.0002999155372053673, + "loss": 0.1238, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 0.6724908947944641, + "learning_rate": 0.0002999058927714525, + "loss": 0.2079, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.1440785676240921, + "learning_rate": 0.00029989572724060796, + "loss": 0.1283, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.11361633986234665, + "learning_rate": 0.00029988504064817065, + "loss": 0.14, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 0.10072154551744461, + "learning_rate": 0.00029987383303128884, + "loss": 0.1389, + "step": 32 + }, + { + "epoch": 0.06, + "grad_norm": 0.07634437829256058, + "learning_rate": 0.00029986210442892213, + "loss": 0.1373, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.16817660629749298, + "learning_rate": 0.0002998498548818408, + "loss": 0.1385, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 0.11250842362642288, + "learning_rate": 0.00029983708443262654, + "loss": 0.1389, + "step": 35 + }, + { + "epoch": 0.06, + "grad_norm": 0.10796725004911423, + "learning_rate": 0.0002998237931256712, + "loss": 0.1414, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.04835154488682747, + "learning_rate": 0.0002998099810071777, + "loss": 0.1348, + "step": 37 + }, + { + "epoch": 0.07, + "grad_norm": 0.29721006751060486, + "learning_rate": 0.00029979564812515906, + "loss": 0.1374, + "step": 38 + }, + { + "epoch": 0.07, + "grad_norm": 0.13514482975006104, + "learning_rate": 0.0002997807945294387, + "loss": 0.1395, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 0.2045493870973587, + "learning_rate": 0.0002997654202716501, + "loss": 0.1258, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 0.17623500525951385, + "learning_rate": 0.0002997495254052367, + "loss": 0.1399, + "step": 41 + }, + { + "epoch": 0.07, + "grad_norm": 0.15333342552185059, + "learning_rate": 0.00029973310998545157, + "loss": 0.1407, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.11854752153158188, + "learning_rate": 0.0002997161740693573, + "loss": 0.1365, + "step": 43 + }, + { + "epoch": 0.08, + "grad_norm": 0.34156811237335205, + "learning_rate": 0.00029969871771582594, + "loss": 0.1064, + "step": 44 + }, + { + "epoch": 0.08, + "grad_norm": 0.07359682768583298, + "learning_rate": 0.0002996807409855385, + "loss": 0.1267, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.2271728217601776, + "learning_rate": 0.00029966224394098494, + "loss": 0.1378, + "step": 46 + }, + { + "epoch": 0.08, + "grad_norm": 0.21861566603183746, + "learning_rate": 0.0002996432266464641, + "loss": 0.1415, + "step": 47 + }, + { + "epoch": 0.08, + "grad_norm": 0.11931619793176651, + "learning_rate": 0.00029962368916808306, + "loss": 0.1375, + "step": 48 + }, + { + "epoch": 0.09, + "grad_norm": 0.19349679350852966, + "learning_rate": 0.00029960363157375717, + "loss": 0.132, + "step": 49 + }, + { + "epoch": 0.09, + "grad_norm": 0.35250943899154663, + "learning_rate": 0.00029958305393320997, + "loss": 0.1513, + "step": 50 + }, + { + "epoch": 0.09, + "grad_norm": 0.10226023942232132, + "learning_rate": 0.00029956195631797257, + "loss": 0.1332, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.34944722056388855, + "learning_rate": 0.00029954033880138364, + "loss": 0.1512, + "step": 52 + }, + { + "epoch": 0.09, + "grad_norm": 0.24920719861984253, + "learning_rate": 0.00029951820145858915, + "loss": 0.1433, + "step": 53 + }, + { + "epoch": 0.1, + "grad_norm": 0.5471547842025757, + "learning_rate": 0.0002994955443665421, + "loss": 0.1453, + "step": 54 + }, + { + "epoch": 0.1, + "grad_norm": 0.04468518868088722, + "learning_rate": 0.00029947236760400215, + "loss": 0.1328, + "step": 55 + }, + { + "epoch": 0.1, + "grad_norm": 0.1944192349910736, + "learning_rate": 0.00029944867125153543, + "loss": 0.1319, + "step": 56 + }, + { + "epoch": 0.1, + "grad_norm": 0.10784479230642319, + "learning_rate": 0.0002994244553915143, + "loss": 0.1394, + "step": 57 + }, + { + "epoch": 0.1, + "grad_norm": 0.16469427943229675, + "learning_rate": 0.0002993997201081169, + "loss": 0.1448, + "step": 58 + }, + { + "epoch": 0.1, + "grad_norm": 0.07228899002075195, + "learning_rate": 0.00029937446548732716, + "loss": 0.1302, + "step": 59 + }, + { + "epoch": 0.11, + "grad_norm": 0.17821665108203888, + "learning_rate": 0.0002993486916169341, + "loss": 0.1365, + "step": 60 + }, + { + "epoch": 0.11, + "grad_norm": 0.1656705141067505, + "learning_rate": 0.0002993223985865318, + "loss": 0.1232, + "step": 61 + }, + { + "epoch": 0.11, + "grad_norm": 0.25225961208343506, + "learning_rate": 0.0002992955864875192, + "loss": 0.1308, + "step": 62 + }, + { + "epoch": 0.11, + "grad_norm": 0.26701709628105164, + "learning_rate": 0.00029926825541309925, + "loss": 0.1126, + "step": 63 + }, + { + "epoch": 0.11, + "grad_norm": 0.30766233801841736, + "learning_rate": 0.0002992404054582793, + "loss": 0.1051, + "step": 64 + }, + { + "epoch": 0.11, + "grad_norm": 1.0783226490020752, + "learning_rate": 0.00029921203671987023, + "loss": 0.1031, + "step": 65 + }, + { + "epoch": 0.12, + "grad_norm": 0.797533392906189, + "learning_rate": 0.0002991831492964863, + "loss": 0.1243, + "step": 66 + }, + { + "epoch": 0.12, + "grad_norm": 0.3874399662017822, + "learning_rate": 0.0002991537432885449, + "loss": 0.098, + "step": 67 + }, + { + "epoch": 0.12, + "grad_norm": 0.5988464951515198, + "learning_rate": 0.0002991238187982659, + "loss": 0.0899, + "step": 68 + }, + { + "epoch": 0.12, + "grad_norm": 0.5161219835281372, + "learning_rate": 0.00029909337592967173, + "loss": 0.0893, + "step": 69 + }, + { + "epoch": 0.12, + "grad_norm": 0.9815970659255981, + "learning_rate": 0.0002990624147885866, + "loss": 0.1132, + "step": 70 + }, + { + "epoch": 0.13, + "grad_norm": 1.014042615890503, + "learning_rate": 0.00029903093548263655, + "loss": 0.1526, + "step": 71 + }, + { + "epoch": 0.13, + "grad_norm": 0.4545246958732605, + "learning_rate": 0.00029899893812124857, + "loss": 0.1303, + "step": 72 + }, + { + "epoch": 0.13, + "grad_norm": 0.4577489197254181, + "learning_rate": 0.00029896642281565067, + "loss": 0.0745, + "step": 73 + }, + { + "epoch": 0.13, + "grad_norm": 0.49255430698394775, + "learning_rate": 0.00029893338967887124, + "loss": 0.0924, + "step": 74 + }, + { + "epoch": 0.13, + "grad_norm": 0.39145103096961975, + "learning_rate": 0.0002988998388257388, + "loss": 0.1121, + "step": 75 + }, + { + "epoch": 0.13, + "grad_norm": 0.2706741392612457, + "learning_rate": 0.00029886577037288147, + "loss": 0.0727, + "step": 76 + }, + { + "epoch": 0.14, + "grad_norm": 0.1395532339811325, + "learning_rate": 0.0002988311844387266, + "loss": 0.0509, + "step": 77 + }, + { + "epoch": 0.14, + "grad_norm": 0.4092063307762146, + "learning_rate": 0.0002987960811435006, + "loss": 0.0785, + "step": 78 + }, + { + "epoch": 0.14, + "grad_norm": 0.4469921886920929, + "learning_rate": 0.000298760460609228, + "loss": 0.123, + "step": 79 + }, + { + "epoch": 0.14, + "grad_norm": 0.7060185670852661, + "learning_rate": 0.00029872432295973154, + "loss": 0.1112, + "step": 80 + }, + { + "epoch": 0.14, + "grad_norm": 0.5608384013175964, + "learning_rate": 0.00029868766832063154, + "loss": 0.1248, + "step": 81 + }, + { + "epoch": 0.15, + "grad_norm": 0.2733088731765747, + "learning_rate": 0.0002986504968193454, + "loss": 0.08, + "step": 82 + }, + { + "epoch": 0.15, + "grad_norm": 0.28215891122817993, + "learning_rate": 0.0002986128085850871, + "loss": 0.0603, + "step": 83 + }, + { + "epoch": 0.15, + "grad_norm": 0.28432655334472656, + "learning_rate": 0.0002985746037488671, + "loss": 0.1094, + "step": 84 + }, + { + "epoch": 0.15, + "grad_norm": 0.3250674903392792, + "learning_rate": 0.00029853588244349154, + "loss": 0.0937, + "step": 85 + }, + { + "epoch": 0.15, + "grad_norm": 0.31528908014297485, + "learning_rate": 0.00029849664480356187, + "loss": 0.0984, + "step": 86 + }, + { + "epoch": 0.15, + "grad_norm": 0.4400915503501892, + "learning_rate": 0.00029845689096547436, + "loss": 0.1054, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.2794625759124756, + "learning_rate": 0.0002984166210674198, + "loss": 0.103, + "step": 88 + }, + { + "epoch": 0.16, + "grad_norm": 0.24721817672252655, + "learning_rate": 0.00029837583524938287, + "loss": 0.0763, + "step": 89 + }, + { + "epoch": 0.16, + "grad_norm": 0.17728295922279358, + "learning_rate": 0.00029833453365314146, + "loss": 0.0799, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 0.42136892676353455, + "learning_rate": 0.00029829271642226664, + "loss": 0.1157, + "step": 91 + }, + { + "epoch": 0.16, + "grad_norm": 0.26066917181015015, + "learning_rate": 0.0002982503837021218, + "loss": 0.0754, + "step": 92 + }, + { + "epoch": 0.16, + "grad_norm": 0.3443453013896942, + "learning_rate": 0.00029820753563986226, + "loss": 0.1262, + "step": 93 + }, + { + "epoch": 0.17, + "grad_norm": 1.1035971641540527, + "learning_rate": 0.0002981641723844348, + "loss": 0.2149, + "step": 94 + }, + { + "epoch": 0.17, + "grad_norm": 0.4937790036201477, + "learning_rate": 0.00029812029408657695, + "loss": 0.0519, + "step": 95 + }, + { + "epoch": 0.17, + "grad_norm": 0.25491034984588623, + "learning_rate": 0.00029807590089881683, + "loss": 0.0463, + "step": 96 + }, + { + "epoch": 0.17, + "grad_norm": 0.8630254864692688, + "learning_rate": 0.00029803099297547216, + "loss": 0.1097, + "step": 97 + }, + { + "epoch": 0.17, + "grad_norm": 0.7856675386428833, + "learning_rate": 0.0002979855704726502, + "loss": 0.1249, + "step": 98 + }, + { + "epoch": 0.18, + "grad_norm": 0.41131559014320374, + "learning_rate": 0.00029793963354824685, + "loss": 0.0578, + "step": 99 + }, + { + "epoch": 0.18, + "grad_norm": 0.2710832357406616, + "learning_rate": 0.00029789318236194616, + "loss": 0.0695, + "step": 100 + }, + { + "epoch": 0.18, + "grad_norm": 0.37780898809432983, + "learning_rate": 0.0002978462170752199, + "loss": 0.1595, + "step": 101 + }, + { + "epoch": 0.18, + "grad_norm": 0.6756994128227234, + "learning_rate": 0.00029779873785132696, + "loss": 0.1593, + "step": 102 + }, + { + "epoch": 0.18, + "grad_norm": 0.3440113067626953, + "learning_rate": 0.0002977507448553128, + "loss": 0.1084, + "step": 103 + }, + { + "epoch": 0.18, + "grad_norm": 0.46663540601730347, + "learning_rate": 0.0002977022382540087, + "loss": 0.1467, + "step": 104 + }, + { + "epoch": 0.19, + "grad_norm": 0.29806986451148987, + "learning_rate": 0.0002976532182160314, + "loss": 0.114, + "step": 105 + }, + { + "epoch": 0.19, + "grad_norm": 0.17679756879806519, + "learning_rate": 0.0002976036849117824, + "loss": 0.1148, + "step": 106 + }, + { + "epoch": 0.19, + "grad_norm": 0.17152459919452667, + "learning_rate": 0.0002975536385134475, + "loss": 0.1135, + "step": 107 + }, + { + "epoch": 0.19, + "grad_norm": 0.16383178532123566, + "learning_rate": 0.00029750307919499595, + "loss": 0.0987, + "step": 108 + }, + { + "epoch": 0.19, + "grad_norm": 0.27941015362739563, + "learning_rate": 0.00029745200713217996, + "loss": 0.141, + "step": 109 + }, + { + "epoch": 0.19, + "grad_norm": 0.1865178793668747, + "learning_rate": 0.0002974004225025344, + "loss": 0.1066, + "step": 110 + }, + { + "epoch": 0.2, + "grad_norm": 0.09909848123788834, + "learning_rate": 0.0002973483254853756, + "loss": 0.0829, + "step": 111 + }, + { + "epoch": 0.2, + "grad_norm": 0.17434453964233398, + "learning_rate": 0.0002972957162618011, + "loss": 0.0908, + "step": 112 + }, + { + "epoch": 0.2, + "grad_norm": 0.1075962707400322, + "learning_rate": 0.0002972425950146891, + "loss": 0.1005, + "step": 113 + }, + { + "epoch": 0.2, + "grad_norm": 0.23516638576984406, + "learning_rate": 0.00029718896192869755, + "loss": 0.107, + "step": 114 + }, + { + "epoch": 0.2, + "grad_norm": 0.1640479862689972, + "learning_rate": 0.00029713481719026365, + "loss": 0.0947, + "step": 115 + }, + { + "epoch": 0.21, + "grad_norm": 0.1907346248626709, + "learning_rate": 0.00029708016098760315, + "loss": 0.0757, + "step": 116 + }, + { + "epoch": 0.21, + "grad_norm": 0.26523101329803467, + "learning_rate": 0.0002970249935107099, + "loss": 0.1213, + "step": 117 + }, + { + "epoch": 0.21, + "grad_norm": 0.4174201190471649, + "learning_rate": 0.0002969693149513548, + "loss": 0.1036, + "step": 118 + }, + { + "epoch": 0.21, + "grad_norm": 0.5090858340263367, + "learning_rate": 0.00029691312550308546, + "loss": 0.1229, + "step": 119 + }, + { + "epoch": 0.21, + "grad_norm": 0.5807089805603027, + "learning_rate": 0.00029685642536122543, + "loss": 0.1017, + "step": 120 + }, + { + "epoch": 0.21, + "grad_norm": 0.20680812001228333, + "learning_rate": 0.00029679921472287353, + "loss": 0.0656, + "step": 121 + }, + { + "epoch": 0.22, + "grad_norm": 0.8702337145805359, + "learning_rate": 0.0002967414937869031, + "loss": 0.2062, + "step": 122 + }, + { + "epoch": 0.22, + "grad_norm": 0.8191081881523132, + "learning_rate": 0.00029668326275396133, + "loss": 0.1474, + "step": 123 + }, + { + "epoch": 0.22, + "grad_norm": 0.3332749307155609, + "learning_rate": 0.0002966245218264687, + "loss": 0.0991, + "step": 124 + }, + { + "epoch": 0.22, + "grad_norm": 0.4073905050754547, + "learning_rate": 0.000296565271208618, + "loss": 0.1113, + "step": 125 + }, + { + "epoch": 0.22, + "grad_norm": 1.1542918682098389, + "learning_rate": 0.00029650551110637393, + "loss": 0.1672, + "step": 126 + }, + { + "epoch": 0.22, + "grad_norm": 1.1717615127563477, + "learning_rate": 0.0002964452417274723, + "loss": 0.1665, + "step": 127 + }, + { + "epoch": 0.23, + "grad_norm": 0.24656681716442108, + "learning_rate": 0.00029638446328141894, + "loss": 0.113, + "step": 128 + }, + { + "epoch": 0.23, + "grad_norm": 0.12859101593494415, + "learning_rate": 0.00029632317597948964, + "loss": 0.111, + "step": 129 + }, + { + "epoch": 0.23, + "grad_norm": 0.0825003981590271, + "learning_rate": 0.0002962613800347288, + "loss": 0.1235, + "step": 130 + }, + { + "epoch": 0.23, + "grad_norm": 0.14479093253612518, + "learning_rate": 0.0002961990756619491, + "loss": 0.1031, + "step": 131 + }, + { + "epoch": 0.23, + "grad_norm": 0.2117856740951538, + "learning_rate": 0.0002961362630777305, + "loss": 0.0995, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.1228349581360817, + "learning_rate": 0.00029607294250041965, + "loss": 0.0804, + "step": 133 + }, + { + "epoch": 0.24, + "grad_norm": 0.18281131982803345, + "learning_rate": 0.000296009114150129, + "loss": 0.0843, + "step": 134 + }, + { + "epoch": 0.24, + "grad_norm": 0.2280908077955246, + "learning_rate": 0.0002959447782487361, + "loss": 0.1361, + "step": 135 + }, + { + "epoch": 0.24, + "grad_norm": 0.21089224517345428, + "learning_rate": 0.0002958799350198829, + "loss": 0.136, + "step": 136 + }, + { + "epoch": 0.24, + "grad_norm": 0.4448394477367401, + "learning_rate": 0.00029581458468897485, + "loss": 0.1293, + "step": 137 + }, + { + "epoch": 0.24, + "grad_norm": 0.35630327463150024, + "learning_rate": 0.0002957487274831803, + "loss": 0.0892, + "step": 138 + }, + { + "epoch": 0.25, + "grad_norm": 0.3235824406147003, + "learning_rate": 0.00029568236363142924, + "loss": 0.0862, + "step": 139 + }, + { + "epoch": 0.25, + "grad_norm": 0.5782188177108765, + "learning_rate": 0.0002956154933644133, + "loss": 0.1067, + "step": 140 + }, + { + "epoch": 0.25, + "grad_norm": 0.0916428491473198, + "learning_rate": 0.00029554811691458405, + "loss": 0.0717, + "step": 141 + }, + { + "epoch": 0.25, + "grad_norm": 0.10682029277086258, + "learning_rate": 0.00029548023451615295, + "loss": 0.0729, + "step": 142 + }, + { + "epoch": 0.25, + "eval_loss": 0.09042291343212128, + "eval_runtime": 14.7658, + "eval_samples_per_second": 32.304, + "eval_steps_per_second": 8.127, + "step": 142 + }, + { + "epoch": 0.25, + "grad_norm": 0.10448458790779114, + "learning_rate": 0.00029541184640509015, + "loss": 0.1013, + "step": 143 + }, + { + "epoch": 0.25, + "grad_norm": 0.20653600990772247, + "learning_rate": 0.00029534295281912355, + "loss": 0.1109, + "step": 144 + }, + { + "epoch": 0.26, + "grad_norm": 0.09114749729633331, + "learning_rate": 0.00029527355399773845, + "loss": 0.098, + "step": 145 + }, + { + "epoch": 0.26, + "grad_norm": 0.13702392578125, + "learning_rate": 0.0002952036501821762, + "loss": 0.0434, + "step": 146 + }, + { + "epoch": 0.26, + "grad_norm": 0.21909640729427338, + "learning_rate": 0.00029513324161543366, + "loss": 0.1072, + "step": 147 + }, + { + "epoch": 0.26, + "grad_norm": 0.1765926033258438, + "learning_rate": 0.00029506232854226237, + "loss": 0.0912, + "step": 148 + }, + { + "epoch": 0.26, + "grad_norm": 0.26851925253868103, + "learning_rate": 0.00029499091120916755, + "loss": 0.1134, + "step": 149 + }, + { + "epoch": 0.27, + "grad_norm": 0.15720008313655853, + "learning_rate": 0.0002949189898644072, + "loss": 0.0953, + "step": 150 + }, + { + "epoch": 0.27, + "grad_norm": 0.31068509817123413, + "learning_rate": 0.0002948465647579916, + "loss": 0.1179, + "step": 151 + }, + { + "epoch": 0.27, + "grad_norm": 0.3737366497516632, + "learning_rate": 0.00029477363614168194, + "loss": 0.0745, + "step": 152 + }, + { + "epoch": 0.27, + "grad_norm": 0.295796275138855, + "learning_rate": 0.0002947002042689898, + "loss": 0.1448, + "step": 153 + }, + { + "epoch": 0.27, + "grad_norm": 0.21946462988853455, + "learning_rate": 0.0002946262693951762, + "loss": 0.0938, + "step": 154 + }, + { + "epoch": 0.27, + "grad_norm": 0.1442556530237198, + "learning_rate": 0.00029455183177725053, + "loss": 0.0778, + "step": 155 + }, + { + "epoch": 0.28, + "grad_norm": 0.15714137256145477, + "learning_rate": 0.00029447689167396996, + "loss": 0.1192, + "step": 156 + }, + { + "epoch": 0.28, + "grad_norm": 0.1749090850353241, + "learning_rate": 0.0002944014493458383, + "loss": 0.1065, + "step": 157 + }, + { + "epoch": 0.28, + "grad_norm": 0.4440750777721405, + "learning_rate": 0.0002943255050551051, + "loss": 0.1143, + "step": 158 + }, + { + "epoch": 0.28, + "grad_norm": 0.3883216083049774, + "learning_rate": 0.0002942490590657651, + "loss": 0.1257, + "step": 159 + }, + { + "epoch": 0.28, + "grad_norm": 0.1770515888929367, + "learning_rate": 0.00029417211164355664, + "loss": 0.0917, + "step": 160 + }, + { + "epoch": 0.28, + "grad_norm": 0.13733014464378357, + "learning_rate": 0.0002940946630559613, + "loss": 0.0694, + "step": 161 + }, + { + "epoch": 0.29, + "grad_norm": 0.10783812403678894, + "learning_rate": 0.0002940167135722029, + "loss": 0.053, + "step": 162 + }, + { + "epoch": 0.29, + "grad_norm": 0.4282841086387634, + "learning_rate": 0.0002939382634632463, + "loss": 0.1514, + "step": 163 + }, + { + "epoch": 0.29, + "grad_norm": 0.2022620588541031, + "learning_rate": 0.00029385931300179673, + "loss": 0.093, + "step": 164 + }, + { + "epoch": 0.29, + "grad_norm": 0.09893293678760529, + "learning_rate": 0.0002937798624622985, + "loss": 0.0482, + "step": 165 + }, + { + "epoch": 0.29, + "grad_norm": 0.24493345618247986, + "learning_rate": 0.0002936999121209346, + "loss": 0.1104, + "step": 166 + }, + { + "epoch": 0.3, + "grad_norm": 0.558118999004364, + "learning_rate": 0.0002936194622556251, + "loss": 0.1593, + "step": 167 + }, + { + "epoch": 0.3, + "grad_norm": 0.30332863330841064, + "learning_rate": 0.00029353851314602674, + "loss": 0.1431, + "step": 168 + }, + { + "epoch": 0.3, + "grad_norm": 0.22926372289657593, + "learning_rate": 0.00029345706507353153, + "loss": 0.0903, + "step": 169 + }, + { + "epoch": 0.3, + "grad_norm": 0.1272840052843094, + "learning_rate": 0.0002933751183212661, + "loss": 0.0581, + "step": 170 + }, + { + "epoch": 0.3, + "grad_norm": 0.14229409396648407, + "learning_rate": 0.0002932926731740905, + "loss": 0.0786, + "step": 171 + }, + { + "epoch": 0.3, + "grad_norm": 0.15394841134548187, + "learning_rate": 0.00029320972991859725, + "loss": 0.0891, + "step": 172 + }, + { + "epoch": 0.31, + "grad_norm": 0.14996570348739624, + "learning_rate": 0.00029312628884311045, + "loss": 0.0697, + "step": 173 + }, + { + "epoch": 0.31, + "grad_norm": 0.17957860231399536, + "learning_rate": 0.0002930423502376846, + "loss": 0.0597, + "step": 174 + }, + { + "epoch": 0.31, + "grad_norm": 0.2494930922985077, + "learning_rate": 0.00029295791439410383, + "loss": 0.1173, + "step": 175 + }, + { + "epoch": 0.31, + "grad_norm": 0.25140491127967834, + "learning_rate": 0.0002928729816058807, + "loss": 0.1015, + "step": 176 + }, + { + "epoch": 0.31, + "grad_norm": 0.23032979667186737, + "learning_rate": 0.00029278755216825505, + "loss": 0.1271, + "step": 177 + }, + { + "epoch": 0.31, + "grad_norm": 0.18228159844875336, + "learning_rate": 0.0002927016263781935, + "loss": 0.0758, + "step": 178 + }, + { + "epoch": 0.32, + "grad_norm": 0.18998126685619354, + "learning_rate": 0.0002926152045343877, + "loss": 0.0658, + "step": 179 + }, + { + "epoch": 0.32, + "grad_norm": 0.2079799771308899, + "learning_rate": 0.00029252828693725403, + "loss": 0.1376, + "step": 180 + }, + { + "epoch": 0.32, + "grad_norm": 0.20679551362991333, + "learning_rate": 0.00029244087388893185, + "loss": 0.0989, + "step": 181 + }, + { + "epoch": 0.32, + "grad_norm": 0.13382777571678162, + "learning_rate": 0.000292352965693283, + "loss": 0.0739, + "step": 182 + }, + { + "epoch": 0.32, + "grad_norm": 0.1363653689622879, + "learning_rate": 0.00029226456265589045, + "loss": 0.0781, + "step": 183 + }, + { + "epoch": 0.33, + "grad_norm": 0.21474190056324005, + "learning_rate": 0.00029217566508405737, + "loss": 0.0994, + "step": 184 + }, + { + "epoch": 0.33, + "grad_norm": 0.1632685363292694, + "learning_rate": 0.000292086273286806, + "loss": 0.0938, + "step": 185 + }, + { + "epoch": 0.33, + "grad_norm": 0.18359023332595825, + "learning_rate": 0.0002919963875748765, + "loss": 0.0837, + "step": 186 + }, + { + "epoch": 0.33, + "grad_norm": 0.21232351660728455, + "learning_rate": 0.000291906008260726, + "loss": 0.1021, + "step": 187 + }, + { + "epoch": 0.33, + "grad_norm": 0.1560080200433731, + "learning_rate": 0.0002918151356585276, + "loss": 0.0775, + "step": 188 + }, + { + "epoch": 0.33, + "grad_norm": 0.206360325217247, + "learning_rate": 0.00029172377008416893, + "loss": 0.0859, + "step": 189 + }, + { + "epoch": 0.34, + "grad_norm": 0.16424915194511414, + "learning_rate": 0.0002916319118552515, + "loss": 0.1071, + "step": 190 + }, + { + "epoch": 0.34, + "grad_norm": 0.15049318969249725, + "learning_rate": 0.00029153956129108913, + "loss": 0.0837, + "step": 191 + }, + { + "epoch": 0.34, + "grad_norm": 0.08157264441251755, + "learning_rate": 0.0002914467187127073, + "loss": 0.063, + "step": 192 + }, + { + "epoch": 0.34, + "grad_norm": 0.2039349228143692, + "learning_rate": 0.00029135338444284166, + "loss": 0.1077, + "step": 193 + }, + { + "epoch": 0.34, + "grad_norm": 0.1816401183605194, + "learning_rate": 0.00029125955880593705, + "loss": 0.1035, + "step": 194 + }, + { + "epoch": 0.34, + "grad_norm": 0.13115471601486206, + "learning_rate": 0.0002911652421281465, + "loss": 0.0862, + "step": 195 + }, + { + "epoch": 0.35, + "grad_norm": 0.20337167382240295, + "learning_rate": 0.00029107043473732995, + "loss": 0.0724, + "step": 196 + }, + { + "epoch": 0.35, + "grad_norm": 0.12583452463150024, + "learning_rate": 0.000290975136963053, + "loss": 0.0701, + "step": 197 + }, + { + "epoch": 0.35, + "grad_norm": 0.4595123827457428, + "learning_rate": 0.0002908793491365861, + "loss": 0.1763, + "step": 198 + }, + { + "epoch": 0.35, + "grad_norm": 0.17089878022670746, + "learning_rate": 0.00029078307159090294, + "loss": 0.0478, + "step": 199 + }, + { + "epoch": 0.35, + "grad_norm": 0.22770017385482788, + "learning_rate": 0.00029068630466067995, + "loss": 0.0551, + "step": 200 + }, + { + "epoch": 0.36, + "grad_norm": 0.16812334954738617, + "learning_rate": 0.00029058904868229426, + "loss": 0.0829, + "step": 201 + }, + { + "epoch": 0.36, + "grad_norm": 0.15563331544399261, + "learning_rate": 0.0002904913039938234, + "loss": 0.0551, + "step": 202 + }, + { + "epoch": 0.36, + "grad_norm": 0.14948880672454834, + "learning_rate": 0.00029039307093504355, + "loss": 0.1255, + "step": 203 + }, + { + "epoch": 0.36, + "grad_norm": 0.4740292429924011, + "learning_rate": 0.0002902943498474286, + "loss": 0.1865, + "step": 204 + }, + { + "epoch": 0.36, + "grad_norm": 0.2227841019630432, + "learning_rate": 0.00029019514107414887, + "loss": 0.0801, + "step": 205 + }, + { + "epoch": 0.36, + "grad_norm": 0.3277839422225952, + "learning_rate": 0.00029009544496006996, + "loss": 0.0785, + "step": 206 + }, + { + "epoch": 0.37, + "grad_norm": 0.30582112073898315, + "learning_rate": 0.0002899952618517515, + "loss": 0.0802, + "step": 207 + }, + { + "epoch": 0.37, + "grad_norm": 0.2919937074184418, + "learning_rate": 0.00028989459209744617, + "loss": 0.0533, + "step": 208 + }, + { + "epoch": 0.37, + "grad_norm": 0.31071627140045166, + "learning_rate": 0.00028979343604709816, + "loss": 0.1474, + "step": 209 + }, + { + "epoch": 0.37, + "grad_norm": 0.14612245559692383, + "learning_rate": 0.000289691794052342, + "loss": 0.1118, + "step": 210 + }, + { + "epoch": 0.37, + "grad_norm": 0.2461383044719696, + "learning_rate": 0.0002895896664665017, + "loss": 0.155, + "step": 211 + }, + { + "epoch": 0.38, + "grad_norm": 0.2300606220960617, + "learning_rate": 0.0002894870536445891, + "loss": 0.1023, + "step": 212 + }, + { + "epoch": 0.38, + "grad_norm": 0.3330621123313904, + "learning_rate": 0.0002893839559433028, + "loss": 0.1851, + "step": 213 + }, + { + "epoch": 0.38, + "grad_norm": 0.08405052870512009, + "learning_rate": 0.00028928037372102694, + "loss": 0.1162, + "step": 214 + }, + { + "epoch": 0.38, + "grad_norm": 0.08986380696296692, + "learning_rate": 0.00028917630733783, + "loss": 0.1033, + "step": 215 + }, + { + "epoch": 0.38, + "grad_norm": 0.08752616494894028, + "learning_rate": 0.00028907175715546334, + "loss": 0.1081, + "step": 216 + }, + { + "epoch": 0.38, + "grad_norm": 0.15312182903289795, + "learning_rate": 0.00028896672353736027, + "loss": 0.1084, + "step": 217 + }, + { + "epoch": 0.39, + "grad_norm": 0.1530596911907196, + "learning_rate": 0.00028886120684863437, + "loss": 0.1143, + "step": 218 + }, + { + "epoch": 0.39, + "grad_norm": 0.19303447008132935, + "learning_rate": 0.00028875520745607865, + "loss": 0.1322, + "step": 219 + }, + { + "epoch": 0.39, + "grad_norm": 0.2029002606868744, + "learning_rate": 0.00028864872572816405, + "loss": 0.1, + "step": 220 + }, + { + "epoch": 0.39, + "grad_norm": 0.0897076427936554, + "learning_rate": 0.00028854176203503806, + "loss": 0.0964, + "step": 221 + }, + { + "epoch": 0.39, + "grad_norm": 0.14917291700839996, + "learning_rate": 0.00028843431674852363, + "loss": 0.078, + "step": 222 + }, + { + "epoch": 0.39, + "grad_norm": 0.1606171429157257, + "learning_rate": 0.00028832639024211785, + "loss": 0.0844, + "step": 223 + }, + { + "epoch": 0.4, + "grad_norm": 0.15833866596221924, + "learning_rate": 0.0002882179828909905, + "loss": 0.103, + "step": 224 + }, + { + "epoch": 0.4, + "grad_norm": 0.2102007120847702, + "learning_rate": 0.00028810909507198304, + "loss": 0.0783, + "step": 225 + }, + { + "epoch": 0.4, + "grad_norm": 0.14001865684986115, + "learning_rate": 0.00028799972716360693, + "loss": 0.0999, + "step": 226 + }, + { + "epoch": 0.4, + "grad_norm": 0.07050840556621552, + "learning_rate": 0.00028788987954604255, + "loss": 0.0504, + "step": 227 + }, + { + "epoch": 0.4, + "grad_norm": 0.1809457242488861, + "learning_rate": 0.0002877795526011379, + "loss": 0.063, + "step": 228 + }, + { + "epoch": 0.41, + "grad_norm": 0.19804933667182922, + "learning_rate": 0.00028766874671240706, + "loss": 0.0819, + "step": 229 + }, + { + "epoch": 0.41, + "grad_norm": 0.8398628830909729, + "learning_rate": 0.0002875574622650291, + "loss": 0.0495, + "step": 230 + }, + { + "epoch": 0.41, + "grad_norm": 0.28259506821632385, + "learning_rate": 0.0002874456996458466, + "loss": 0.147, + "step": 231 + }, + { + "epoch": 0.41, + "grad_norm": 0.5770221948623657, + "learning_rate": 0.00028733345924336444, + "loss": 0.0941, + "step": 232 + }, + { + "epoch": 0.41, + "grad_norm": 0.2959941625595093, + "learning_rate": 0.0002872207414477482, + "loss": 0.15, + "step": 233 + }, + { + "epoch": 0.41, + "grad_norm": 0.41449636220932007, + "learning_rate": 0.0002871075466508229, + "loss": 0.1244, + "step": 234 + }, + { + "epoch": 0.42, + "grad_norm": 0.20178668200969696, + "learning_rate": 0.000286993875246072, + "loss": 0.1241, + "step": 235 + }, + { + "epoch": 0.42, + "grad_norm": 0.2657439708709717, + "learning_rate": 0.0002868797276286355, + "loss": 0.108, + "step": 236 + }, + { + "epoch": 0.42, + "grad_norm": 0.26215022802352905, + "learning_rate": 0.0002867651041953087, + "loss": 0.0814, + "step": 237 + }, + { + "epoch": 0.42, + "grad_norm": 0.17194584012031555, + "learning_rate": 0.00028665000534454116, + "loss": 0.0937, + "step": 238 + }, + { + "epoch": 0.42, + "grad_norm": 0.14481855928897858, + "learning_rate": 0.00028653443147643495, + "loss": 0.0866, + "step": 239 + }, + { + "epoch": 0.42, + "grad_norm": 0.2787633240222931, + "learning_rate": 0.00028641838299274335, + "loss": 0.0711, + "step": 240 + }, + { + "epoch": 0.43, + "grad_norm": 0.20350907742977142, + "learning_rate": 0.0002863018602968695, + "loss": 0.146, + "step": 241 + }, + { + "epoch": 0.43, + "grad_norm": 0.29709184169769287, + "learning_rate": 0.0002861848637938649, + "loss": 0.1273, + "step": 242 + }, + { + "epoch": 0.43, + "grad_norm": 0.15158367156982422, + "learning_rate": 0.00028606739389042834, + "loss": 0.0778, + "step": 243 + }, + { + "epoch": 0.43, + "grad_norm": 0.13614635169506073, + "learning_rate": 0.0002859494509949039, + "loss": 0.0609, + "step": 244 + }, + { + "epoch": 0.43, + "grad_norm": 0.20077449083328247, + "learning_rate": 0.00028583103551728004, + "loss": 0.097, + "step": 245 + }, + { + "epoch": 0.44, + "grad_norm": 0.08916395157575607, + "learning_rate": 0.00028571214786918806, + "loss": 0.0704, + "step": 246 + }, + { + "epoch": 0.44, + "grad_norm": 0.18170690536499023, + "learning_rate": 0.00028559278846390033, + "loss": 0.0662, + "step": 247 + }, + { + "epoch": 0.44, + "grad_norm": 0.20803312957286835, + "learning_rate": 0.00028547295771632936, + "loss": 0.0683, + "step": 248 + }, + { + "epoch": 0.44, + "grad_norm": 0.40586671233177185, + "learning_rate": 0.0002853526560430261, + "loss": 0.0909, + "step": 249 + }, + { + "epoch": 0.44, + "grad_norm": 0.16656708717346191, + "learning_rate": 0.0002852318838621784, + "loss": 0.0793, + "step": 250 + }, + { + "epoch": 0.44, + "grad_norm": 0.24462048709392548, + "learning_rate": 0.00028511064159360977, + "loss": 0.12, + "step": 251 + }, + { + "epoch": 0.45, + "grad_norm": 0.32009223103523254, + "learning_rate": 0.00028498892965877776, + "loss": 0.1005, + "step": 252 + }, + { + "epoch": 0.45, + "grad_norm": 0.17075763642787933, + "learning_rate": 0.0002848667484807726, + "loss": 0.088, + "step": 253 + }, + { + "epoch": 0.45, + "grad_norm": 0.4186219573020935, + "learning_rate": 0.00028474409848431556, + "loss": 0.1242, + "step": 254 + }, + { + "epoch": 0.45, + "grad_norm": 0.2534870505332947, + "learning_rate": 0.0002846209800957579, + "loss": 0.0672, + "step": 255 + }, + { + "epoch": 0.45, + "grad_norm": 0.22438766062259674, + "learning_rate": 0.00028449739374307876, + "loss": 0.0628, + "step": 256 + }, + { + "epoch": 0.45, + "grad_norm": 0.3011839687824249, + "learning_rate": 0.00028437333985588414, + "loss": 0.0731, + "step": 257 + }, + { + "epoch": 0.46, + "grad_norm": 0.3355525732040405, + "learning_rate": 0.00028424881886540525, + "loss": 0.1556, + "step": 258 + }, + { + "epoch": 0.46, + "grad_norm": 0.3344305455684662, + "learning_rate": 0.00028412383120449705, + "loss": 0.1138, + "step": 259 + }, + { + "epoch": 0.46, + "grad_norm": 0.12889593839645386, + "learning_rate": 0.00028399837730763667, + "loss": 0.0386, + "step": 260 + }, + { + "epoch": 0.46, + "grad_norm": 0.1852964162826538, + "learning_rate": 0.000283872457610922, + "loss": 0.0777, + "step": 261 + }, + { + "epoch": 0.46, + "grad_norm": 0.15753400325775146, + "learning_rate": 0.00028374607255207007, + "loss": 0.0519, + "step": 262 + }, + { + "epoch": 0.47, + "grad_norm": 0.33515694737434387, + "learning_rate": 0.00028361922257041575, + "loss": 0.1075, + "step": 263 + }, + { + "epoch": 0.47, + "grad_norm": 0.4648870825767517, + "learning_rate": 0.00028349190810690974, + "loss": 0.1426, + "step": 264 + }, + { + "epoch": 0.47, + "grad_norm": 0.47806718945503235, + "learning_rate": 0.0002833641296041176, + "loss": 0.1695, + "step": 265 + }, + { + "epoch": 0.47, + "grad_norm": 0.13909588754177094, + "learning_rate": 0.000283235887506218, + "loss": 0.0683, + "step": 266 + }, + { + "epoch": 0.47, + "grad_norm": 0.19695843756198883, + "learning_rate": 0.0002831071822590009, + "loss": 0.072, + "step": 267 + }, + { + "epoch": 0.47, + "grad_norm": 0.10471002012491226, + "learning_rate": 0.00028297801430986647, + "loss": 0.0725, + "step": 268 + }, + { + "epoch": 0.48, + "grad_norm": 0.11718752980232239, + "learning_rate": 0.0002828483841078232, + "loss": 0.0799, + "step": 269 + }, + { + "epoch": 0.48, + "grad_norm": 0.18288259208202362, + "learning_rate": 0.0002827182921034865, + "loss": 0.1348, + "step": 270 + }, + { + "epoch": 0.48, + "grad_norm": 0.13153032958507538, + "learning_rate": 0.000282587738749077, + "loss": 0.0865, + "step": 271 + }, + { + "epoch": 0.48, + "grad_norm": 0.2975251376628876, + "learning_rate": 0.00028245672449841915, + "loss": 0.118, + "step": 272 + }, + { + "epoch": 0.48, + "grad_norm": 0.23405657708644867, + "learning_rate": 0.00028232524980693945, + "loss": 0.1012, + "step": 273 + }, + { + "epoch": 0.48, + "grad_norm": 0.15189243853092194, + "learning_rate": 0.000282193315131665, + "loss": 0.1089, + "step": 274 + }, + { + "epoch": 0.49, + "grad_norm": 0.11026138812303543, + "learning_rate": 0.0002820609209312219, + "loss": 0.0626, + "step": 275 + }, + { + "epoch": 0.49, + "grad_norm": 0.11485940217971802, + "learning_rate": 0.0002819280676658337, + "loss": 0.0767, + "step": 276 + }, + { + "epoch": 0.49, + "grad_norm": 0.3087170422077179, + "learning_rate": 0.0002817947557973196, + "loss": 0.0892, + "step": 277 + }, + { + "epoch": 0.49, + "grad_norm": 0.07682602852582932, + "learning_rate": 0.00028166098578909313, + "loss": 0.0467, + "step": 278 + }, + { + "epoch": 0.49, + "grad_norm": 0.34041598439216614, + "learning_rate": 0.0002815267581061602, + "loss": 0.0758, + "step": 279 + }, + { + "epoch": 0.5, + "grad_norm": 0.24435921013355255, + "learning_rate": 0.0002813920732151177, + "loss": 0.0772, + "step": 280 + }, + { + "epoch": 0.5, + "grad_norm": 0.2376328855752945, + "learning_rate": 0.0002812569315841521, + "loss": 0.0935, + "step": 281 + }, + { + "epoch": 0.5, + "grad_norm": 0.0582437627017498, + "learning_rate": 0.0002811213336830373, + "loss": 0.0224, + "step": 282 + }, + { + "epoch": 0.5, + "grad_norm": 0.19927099347114563, + "learning_rate": 0.00028098527998313334, + "loss": 0.1243, + "step": 283 + }, + { + "epoch": 0.5, + "grad_norm": 0.15403445065021515, + "learning_rate": 0.00028084877095738473, + "loss": 0.0278, + "step": 284 + }, + { + "epoch": 0.5, + "eval_loss": 0.078863725066185, + "eval_runtime": 14.7476, + "eval_samples_per_second": 32.344, + "eval_steps_per_second": 8.137, + "step": 284 + }, + { + "epoch": 0.5, + "grad_norm": 0.240775465965271, + "learning_rate": 0.0002807118070803187, + "loss": 0.0359, + "step": 285 + }, + { + "epoch": 0.51, + "grad_norm": 0.38222241401672363, + "learning_rate": 0.0002805743888280437, + "loss": 0.1152, + "step": 286 + }, + { + "epoch": 0.51, + "grad_norm": 0.20165561139583588, + "learning_rate": 0.0002804365166782476, + "loss": 0.0426, + "step": 287 + }, + { + "epoch": 0.51, + "grad_norm": 0.4264529049396515, + "learning_rate": 0.00028029819111019614, + "loss": 0.1029, + "step": 288 + }, + { + "epoch": 0.51, + "grad_norm": 0.22892819344997406, + "learning_rate": 0.00028015941260473113, + "loss": 0.0956, + "step": 289 + }, + { + "epoch": 0.51, + "grad_norm": 0.29828211665153503, + "learning_rate": 0.00028002018164426893, + "loss": 0.0788, + "step": 290 + }, + { + "epoch": 0.51, + "grad_norm": 0.19386076927185059, + "learning_rate": 0.00027988049871279874, + "loss": 0.0889, + "step": 291 + }, + { + "epoch": 0.52, + "grad_norm": 0.30981379747390747, + "learning_rate": 0.0002797403642958808, + "loss": 0.1692, + "step": 292 + }, + { + "epoch": 0.52, + "grad_norm": 0.36764079332351685, + "learning_rate": 0.00027959977888064477, + "loss": 0.0837, + "step": 293 + }, + { + "epoch": 0.52, + "grad_norm": 0.1701124757528305, + "learning_rate": 0.00027945874295578826, + "loss": 0.1026, + "step": 294 + }, + { + "epoch": 0.52, + "grad_norm": 0.11988788843154907, + "learning_rate": 0.0002793172570115746, + "loss": 0.0993, + "step": 295 + }, + { + "epoch": 0.52, + "grad_norm": 0.15950341522693634, + "learning_rate": 0.00027917532153983176, + "loss": 0.0597, + "step": 296 + }, + { + "epoch": 0.53, + "grad_norm": 0.18084204196929932, + "learning_rate": 0.0002790329370339501, + "loss": 0.0776, + "step": 297 + }, + { + "epoch": 0.53, + "grad_norm": 0.1948510855436325, + "learning_rate": 0.000278890103988881, + "loss": 0.0899, + "step": 298 + }, + { + "epoch": 0.53, + "grad_norm": 0.16357307136058807, + "learning_rate": 0.0002787468229011351, + "loss": 0.109, + "step": 299 + }, + { + "epoch": 0.53, + "grad_norm": 0.15433883666992188, + "learning_rate": 0.0002786030942687805, + "loss": 0.0884, + "step": 300 + }, + { + "epoch": 0.53, + "grad_norm": 0.11737463623285294, + "learning_rate": 0.00027845891859144085, + "loss": 0.0868, + "step": 301 + }, + { + "epoch": 0.53, + "grad_norm": 0.1610182225704193, + "learning_rate": 0.000278314296370294, + "loss": 0.1315, + "step": 302 + }, + { + "epoch": 0.54, + "grad_norm": 0.1955747902393341, + "learning_rate": 0.00027816922810806996, + "loss": 0.0893, + "step": 303 + }, + { + "epoch": 0.54, + "grad_norm": 0.25150609016418457, + "learning_rate": 0.0002780237143090493, + "loss": 0.1179, + "step": 304 + }, + { + "epoch": 0.54, + "grad_norm": 0.24072997272014618, + "learning_rate": 0.0002778777554790614, + "loss": 0.0967, + "step": 305 + }, + { + "epoch": 0.54, + "grad_norm": 0.2026025652885437, + "learning_rate": 0.00027773135212548245, + "loss": 0.0809, + "step": 306 + }, + { + "epoch": 0.54, + "grad_norm": 0.17768602073192596, + "learning_rate": 0.000277584504757234, + "loss": 0.0726, + "step": 307 + }, + { + "epoch": 0.54, + "grad_norm": 0.22531519830226898, + "learning_rate": 0.00027743721388478115, + "loss": 0.0797, + "step": 308 + }, + { + "epoch": 0.55, + "grad_norm": 0.22690407931804657, + "learning_rate": 0.00027728948002013053, + "loss": 0.068, + "step": 309 + }, + { + "epoch": 0.55, + "grad_norm": 0.3111308217048645, + "learning_rate": 0.00027714130367682875, + "loss": 0.1222, + "step": 310 + }, + { + "epoch": 0.55, + "grad_norm": 0.31476208567619324, + "learning_rate": 0.0002769926853699606, + "loss": 0.123, + "step": 311 + }, + { + "epoch": 0.55, + "grad_norm": 0.19214238226413727, + "learning_rate": 0.0002768436256161471, + "loss": 0.0778, + "step": 312 + }, + { + "epoch": 0.55, + "grad_norm": 0.21092087030410767, + "learning_rate": 0.0002766941249335439, + "loss": 0.0908, + "step": 313 + }, + { + "epoch": 0.56, + "grad_norm": 0.24106797575950623, + "learning_rate": 0.00027654418384183925, + "loss": 0.1054, + "step": 314 + }, + { + "epoch": 0.56, + "grad_norm": 0.24553944170475006, + "learning_rate": 0.0002763938028622526, + "loss": 0.0687, + "step": 315 + }, + { + "epoch": 0.56, + "grad_norm": 0.18032313883304596, + "learning_rate": 0.0002762429825175323, + "loss": 0.0984, + "step": 316 + }, + { + "epoch": 0.56, + "grad_norm": 0.2341250777244568, + "learning_rate": 0.000276091723331954, + "loss": 0.0861, + "step": 317 + }, + { + "epoch": 0.56, + "grad_norm": 0.23054689168930054, + "learning_rate": 0.000275940025831319, + "loss": 0.1023, + "step": 318 + }, + { + "epoch": 0.56, + "grad_norm": 0.15571041405200958, + "learning_rate": 0.000275787890542952, + "loss": 0.061, + "step": 319 + }, + { + "epoch": 0.57, + "grad_norm": 0.16762854158878326, + "learning_rate": 0.0002756353179956998, + "loss": 0.1124, + "step": 320 + }, + { + "epoch": 0.57, + "grad_norm": 0.16507089138031006, + "learning_rate": 0.000275482308719929, + "loss": 0.1121, + "step": 321 + }, + { + "epoch": 0.57, + "grad_norm": 0.166361466050148, + "learning_rate": 0.00027532886324752433, + "loss": 0.1243, + "step": 322 + }, + { + "epoch": 0.57, + "grad_norm": 0.21798831224441528, + "learning_rate": 0.000275174982111887, + "loss": 0.1074, + "step": 323 + }, + { + "epoch": 0.57, + "grad_norm": 0.19063718616962433, + "learning_rate": 0.0002750206658479324, + "loss": 0.1058, + "step": 324 + }, + { + "epoch": 0.57, + "grad_norm": 0.20267756283283234, + "learning_rate": 0.00027486591499208867, + "loss": 0.0875, + "step": 325 + }, + { + "epoch": 0.58, + "grad_norm": 0.11513421684503555, + "learning_rate": 0.0002747107300822946, + "loss": 0.0674, + "step": 326 + }, + { + "epoch": 0.58, + "grad_norm": 0.16988199949264526, + "learning_rate": 0.0002745551116579978, + "loss": 0.089, + "step": 327 + }, + { + "epoch": 0.58, + "grad_norm": 0.20574574172496796, + "learning_rate": 0.0002743990602601529, + "loss": 0.0906, + "step": 328 + }, + { + "epoch": 0.58, + "grad_norm": 0.14412453770637512, + "learning_rate": 0.00027424257643121966, + "loss": 0.0996, + "step": 329 + }, + { + "epoch": 0.58, + "grad_norm": 0.1305454671382904, + "learning_rate": 0.00027408566071516084, + "loss": 0.0636, + "step": 330 + }, + { + "epoch": 0.59, + "grad_norm": 0.18056617677211761, + "learning_rate": 0.00027392831365744073, + "loss": 0.1004, + "step": 331 + }, + { + "epoch": 0.59, + "grad_norm": 0.15762409567832947, + "learning_rate": 0.00027377053580502297, + "loss": 0.0938, + "step": 332 + }, + { + "epoch": 0.59, + "grad_norm": 0.2165631651878357, + "learning_rate": 0.00027361232770636856, + "loss": 0.0829, + "step": 333 + }, + { + "epoch": 0.59, + "grad_norm": 0.21345216035842896, + "learning_rate": 0.0002734536899114343, + "loss": 0.1053, + "step": 334 + }, + { + "epoch": 0.59, + "grad_norm": 0.22907692193984985, + "learning_rate": 0.00027329462297167066, + "loss": 0.1451, + "step": 335 + }, + { + "epoch": 0.59, + "grad_norm": 0.24929089844226837, + "learning_rate": 0.0002731351274400198, + "loss": 0.0909, + "step": 336 + }, + { + "epoch": 0.6, + "grad_norm": 0.13142186403274536, + "learning_rate": 0.00027297520387091376, + "loss": 0.0523, + "step": 337 + }, + { + "epoch": 0.6, + "grad_norm": 0.27698859572410583, + "learning_rate": 0.0002728148528202725, + "loss": 0.0865, + "step": 338 + }, + { + "epoch": 0.6, + "grad_norm": 0.2599867880344391, + "learning_rate": 0.000272654074845502, + "loss": 0.0654, + "step": 339 + }, + { + "epoch": 0.6, + "grad_norm": 0.22103819251060486, + "learning_rate": 0.0002724928705054924, + "loss": 0.1108, + "step": 340 + }, + { + "epoch": 0.6, + "grad_norm": 0.2073899507522583, + "learning_rate": 0.0002723312403606157, + "loss": 0.0928, + "step": 341 + }, + { + "epoch": 0.61, + "grad_norm": 0.23784543573856354, + "learning_rate": 0.00027216918497272426, + "loss": 0.095, + "step": 342 + }, + { + "epoch": 0.61, + "grad_norm": 0.16584208607673645, + "learning_rate": 0.0002720067049051486, + "loss": 0.0521, + "step": 343 + }, + { + "epoch": 0.61, + "grad_norm": 0.1800609678030014, + "learning_rate": 0.0002718438007226955, + "loss": 0.0737, + "step": 344 + }, + { + "epoch": 0.61, + "grad_norm": 0.1386508196592331, + "learning_rate": 0.0002716804729916461, + "loss": 0.0522, + "step": 345 + }, + { + "epoch": 0.61, + "grad_norm": 0.22217071056365967, + "learning_rate": 0.0002715167222797537, + "loss": 0.1045, + "step": 346 + }, + { + "epoch": 0.61, + "grad_norm": 0.26573020219802856, + "learning_rate": 0.0002713525491562421, + "loss": 0.0719, + "step": 347 + }, + { + "epoch": 0.62, + "grad_norm": 0.20932160317897797, + "learning_rate": 0.00027118795419180336, + "loss": 0.1289, + "step": 348 + }, + { + "epoch": 0.62, + "grad_norm": 0.19539090991020203, + "learning_rate": 0.000271022937958596, + "loss": 0.0606, + "step": 349 + }, + { + "epoch": 0.62, + "grad_norm": 0.15271329879760742, + "learning_rate": 0.00027085750103024295, + "loss": 0.0343, + "step": 350 + }, + { + "epoch": 0.62, + "grad_norm": 0.25894200801849365, + "learning_rate": 0.00027069164398182944, + "loss": 0.0762, + "step": 351 + }, + { + "epoch": 0.62, + "grad_norm": 0.16486695408821106, + "learning_rate": 0.00027052536738990125, + "loss": 0.0618, + "step": 352 + }, + { + "epoch": 0.62, + "grad_norm": 0.24119453132152557, + "learning_rate": 0.00027035867183246244, + "loss": 0.1013, + "step": 353 + }, + { + "epoch": 0.63, + "grad_norm": 0.35628098249435425, + "learning_rate": 0.00027019155788897355, + "loss": 0.0878, + "step": 354 + }, + { + "epoch": 0.63, + "grad_norm": 0.14005188643932343, + "learning_rate": 0.0002700240261403494, + "loss": 0.0432, + "step": 355 + }, + { + "epoch": 0.63, + "grad_norm": 0.13526731729507446, + "learning_rate": 0.0002698560771689572, + "loss": 0.0513, + "step": 356 + }, + { + "epoch": 0.63, + "grad_norm": 0.2159578949213028, + "learning_rate": 0.0002696877115586146, + "loss": 0.0831, + "step": 357 + }, + { + "epoch": 0.63, + "grad_norm": 0.26970624923706055, + "learning_rate": 0.00026951892989458744, + "loss": 0.1336, + "step": 358 + }, + { + "epoch": 0.64, + "grad_norm": 0.17370541393756866, + "learning_rate": 0.00026934973276358787, + "loss": 0.073, + "step": 359 + }, + { + "epoch": 0.64, + "grad_norm": 0.19606231153011322, + "learning_rate": 0.0002691801207537722, + "loss": 0.0718, + "step": 360 + }, + { + "epoch": 0.64, + "grad_norm": 0.2545556426048279, + "learning_rate": 0.0002690100944547391, + "loss": 0.1326, + "step": 361 + }, + { + "epoch": 0.64, + "grad_norm": 0.15419915318489075, + "learning_rate": 0.0002688396544575271, + "loss": 0.0391, + "step": 362 + }, + { + "epoch": 0.64, + "grad_norm": 0.2486819475889206, + "learning_rate": 0.0002686688013546131, + "loss": 0.1099, + "step": 363 + }, + { + "epoch": 0.64, + "grad_norm": 0.12814386188983917, + "learning_rate": 0.0002684975357399099, + "loss": 0.0437, + "step": 364 + }, + { + "epoch": 0.65, + "grad_norm": 0.09224840253591537, + "learning_rate": 0.00026832585820876407, + "loss": 0.0487, + "step": 365 + }, + { + "epoch": 0.65, + "grad_norm": 0.16062654554843903, + "learning_rate": 0.00026815376935795444, + "loss": 0.0549, + "step": 366 + }, + { + "epoch": 0.65, + "grad_norm": 0.267597496509552, + "learning_rate": 0.0002679812697856894, + "loss": 0.117, + "step": 367 + }, + { + "epoch": 0.65, + "grad_norm": 0.23974187672138214, + "learning_rate": 0.0002678083600916051, + "loss": 0.0536, + "step": 368 + }, + { + "epoch": 0.65, + "grad_norm": 0.1853066086769104, + "learning_rate": 0.0002676350408767634, + "loss": 0.0392, + "step": 369 + }, + { + "epoch": 0.65, + "grad_norm": 0.4043184220790863, + "learning_rate": 0.00026746131274364975, + "loss": 0.1079, + "step": 370 + }, + { + "epoch": 0.66, + "grad_norm": 0.19574593007564545, + "learning_rate": 0.0002672871762961709, + "loss": 0.0866, + "step": 371 + }, + { + "epoch": 0.66, + "grad_norm": 0.34985387325286865, + "learning_rate": 0.00026711263213965314, + "loss": 0.1326, + "step": 372 + }, + { + "epoch": 0.66, + "grad_norm": 0.3007662892341614, + "learning_rate": 0.0002669376808808399, + "loss": 0.0793, + "step": 373 + }, + { + "epoch": 0.66, + "grad_norm": 0.2934662103652954, + "learning_rate": 0.00026676232312788993, + "loss": 0.1095, + "step": 374 + }, + { + "epoch": 0.66, + "grad_norm": 0.19983640313148499, + "learning_rate": 0.0002665865594903748, + "loss": 0.063, + "step": 375 + }, + { + "epoch": 0.67, + "grad_norm": 0.20034237205982208, + "learning_rate": 0.0002664103905792772, + "loss": 0.0927, + "step": 376 + }, + { + "epoch": 0.67, + "grad_norm": 0.24908727407455444, + "learning_rate": 0.0002662338170069884, + "loss": 0.0869, + "step": 377 + }, + { + "epoch": 0.67, + "grad_norm": 0.14556318521499634, + "learning_rate": 0.0002660568393873066, + "loss": 0.0679, + "step": 378 + }, + { + "epoch": 0.67, + "grad_norm": 0.14582981169223785, + "learning_rate": 0.0002658794583354343, + "loss": 0.062, + "step": 379 + }, + { + "epoch": 0.67, + "grad_norm": 0.21219995617866516, + "learning_rate": 0.00026570167446797656, + "loss": 0.1065, + "step": 380 + }, + { + "epoch": 0.67, + "grad_norm": 0.1637437492609024, + "learning_rate": 0.0002655234884029385, + "loss": 0.0754, + "step": 381 + }, + { + "epoch": 0.68, + "grad_norm": 0.3341504633426666, + "learning_rate": 0.00026534490075972363, + "loss": 0.1185, + "step": 382 + }, + { + "epoch": 0.68, + "grad_norm": 0.28448009490966797, + "learning_rate": 0.00026516591215913115, + "loss": 0.1244, + "step": 383 + }, + { + "epoch": 0.68, + "grad_norm": 0.10381971299648285, + "learning_rate": 0.0002649865232233541, + "loss": 0.0538, + "step": 384 + }, + { + "epoch": 0.68, + "grad_norm": 0.30114489793777466, + "learning_rate": 0.00026480673457597733, + "loss": 0.1083, + "step": 385 + }, + { + "epoch": 0.68, + "grad_norm": 0.25989478826522827, + "learning_rate": 0.00026462654684197487, + "loss": 0.123, + "step": 386 + }, + { + "epoch": 0.68, + "grad_norm": 0.14556318521499634, + "learning_rate": 0.00026444596064770833, + "loss": 0.121, + "step": 387 + }, + { + "epoch": 0.69, + "grad_norm": 0.11710379272699356, + "learning_rate": 0.0002642649766209242, + "loss": 0.0585, + "step": 388 + }, + { + "epoch": 0.69, + "grad_norm": 0.16899935901165009, + "learning_rate": 0.000264083595390752, + "loss": 0.0905, + "step": 389 + }, + { + "epoch": 0.69, + "grad_norm": 0.14508718252182007, + "learning_rate": 0.00026390181758770205, + "loss": 0.0807, + "step": 390 + }, + { + "epoch": 0.69, + "grad_norm": 0.3146648705005646, + "learning_rate": 0.000263719643843663, + "loss": 0.1388, + "step": 391 + }, + { + "epoch": 0.69, + "grad_norm": 0.18293187022209167, + "learning_rate": 0.0002635370747919002, + "loss": 0.094, + "step": 392 + }, + { + "epoch": 0.7, + "grad_norm": 0.1427004188299179, + "learning_rate": 0.0002633541110670528, + "loss": 0.0749, + "step": 393 + }, + { + "epoch": 0.7, + "grad_norm": 0.2598584294319153, + "learning_rate": 0.0002631707533051321, + "loss": 0.1237, + "step": 394 + }, + { + "epoch": 0.7, + "grad_norm": 0.19870835542678833, + "learning_rate": 0.0002629870021435192, + "loss": 0.0817, + "step": 395 + }, + { + "epoch": 0.7, + "grad_norm": 0.2234540730714798, + "learning_rate": 0.00026280285822096247, + "loss": 0.1058, + "step": 396 + }, + { + "epoch": 0.7, + "grad_norm": 0.1785740852355957, + "learning_rate": 0.0002626183221775758, + "loss": 0.0676, + "step": 397 + }, + { + "epoch": 0.7, + "grad_norm": 0.11512486636638641, + "learning_rate": 0.000262433394654836, + "loss": 0.0893, + "step": 398 + }, + { + "epoch": 0.71, + "grad_norm": 0.35373592376708984, + "learning_rate": 0.00026224807629558094, + "loss": 0.1077, + "step": 399 + }, + { + "epoch": 0.71, + "grad_norm": 0.2722702920436859, + "learning_rate": 0.0002620623677440068, + "loss": 0.0925, + "step": 400 + }, + { + "epoch": 0.71, + "grad_norm": 0.4532068073749542, + "learning_rate": 0.0002618762696456664, + "loss": 0.1217, + "step": 401 + }, + { + "epoch": 0.71, + "grad_norm": 0.19945968687534332, + "learning_rate": 0.0002616897826474666, + "loss": 0.0899, + "step": 402 + }, + { + "epoch": 0.71, + "grad_norm": 0.25592172145843506, + "learning_rate": 0.00026150290739766606, + "loss": 0.0519, + "step": 403 + }, + { + "epoch": 0.71, + "grad_norm": 0.16839353740215302, + "learning_rate": 0.00026131564454587314, + "loss": 0.0814, + "step": 404 + }, + { + "epoch": 0.72, + "grad_norm": 0.28912433981895447, + "learning_rate": 0.0002611279947430436, + "loss": 0.0737, + "step": 405 + }, + { + "epoch": 0.72, + "grad_norm": 0.053034551441669464, + "learning_rate": 0.0002609399586414782, + "loss": 0.0157, + "step": 406 + }, + { + "epoch": 0.72, + "grad_norm": 0.34488940238952637, + "learning_rate": 0.0002607515368948206, + "loss": 0.0951, + "step": 407 + }, + { + "epoch": 0.72, + "grad_norm": 0.3413633704185486, + "learning_rate": 0.00026056273015805494, + "loss": 0.0712, + "step": 408 + }, + { + "epoch": 0.72, + "grad_norm": 0.4603371024131775, + "learning_rate": 0.0002603735390875039, + "loss": 0.0946, + "step": 409 + }, + { + "epoch": 0.73, + "grad_norm": 0.243332177400589, + "learning_rate": 0.0002601839643408259, + "loss": 0.041, + "step": 410 + }, + { + "epoch": 0.73, + "grad_norm": 0.27589449286460876, + "learning_rate": 0.0002599940065770131, + "loss": 0.0841, + "step": 411 + }, + { + "epoch": 0.73, + "grad_norm": 0.3786303997039795, + "learning_rate": 0.0002598036664563893, + "loss": 0.1031, + "step": 412 + }, + { + "epoch": 0.73, + "grad_norm": 0.23370252549648285, + "learning_rate": 0.00025961294464060716, + "loss": 0.0509, + "step": 413 + }, + { + "epoch": 0.73, + "grad_norm": 0.3785135746002197, + "learning_rate": 0.00025942184179264635, + "loss": 0.1116, + "step": 414 + }, + { + "epoch": 0.73, + "grad_norm": 0.3081932067871094, + "learning_rate": 0.0002592303585768111, + "loss": 0.08, + "step": 415 + }, + { + "epoch": 0.74, + "grad_norm": 0.13592147827148438, + "learning_rate": 0.00025903849565872767, + "loss": 0.0269, + "step": 416 + }, + { + "epoch": 0.74, + "grad_norm": 0.29880547523498535, + "learning_rate": 0.0002588462537053424, + "loss": 0.0596, + "step": 417 + }, + { + "epoch": 0.74, + "grad_norm": 0.11771093308925629, + "learning_rate": 0.00025865363338491913, + "loss": 0.0549, + "step": 418 + }, + { + "epoch": 0.74, + "grad_norm": 0.2312057763338089, + "learning_rate": 0.00025846063536703705, + "loss": 0.0744, + "step": 419 + }, + { + "epoch": 0.74, + "grad_norm": 0.33669716119766235, + "learning_rate": 0.00025826726032258815, + "loss": 0.059, + "step": 420 + }, + { + "epoch": 0.74, + "grad_norm": 0.2426741123199463, + "learning_rate": 0.00025807350892377513, + "loss": 0.0996, + "step": 421 + }, + { + "epoch": 0.75, + "grad_norm": 0.4019070863723755, + "learning_rate": 0.000257879381844109, + "loss": 0.0864, + "step": 422 + }, + { + "epoch": 0.75, + "grad_norm": 0.09328150004148483, + "learning_rate": 0.00025768487975840653, + "loss": 0.0328, + "step": 423 + }, + { + "epoch": 0.75, + "grad_norm": 0.40782174468040466, + "learning_rate": 0.00025749000334278826, + "loss": 0.0446, + "step": 424 + }, + { + "epoch": 0.75, + "grad_norm": 0.21026232838630676, + "learning_rate": 0.00025729475327467574, + "loss": 0.096, + "step": 425 + }, + { + "epoch": 0.75, + "grad_norm": 0.3340080678462982, + "learning_rate": 0.00025709913023278967, + "loss": 0.1243, + "step": 426 + }, + { + "epoch": 0.75, + "eval_loss": 0.07237789034843445, + "eval_runtime": 14.7609, + "eval_samples_per_second": 32.315, + "eval_steps_per_second": 8.13, + "step": 426 + }, + { + "epoch": 0.76, + "grad_norm": 0.2728588879108429, + "learning_rate": 0.00025690313489714706, + "loss": 0.1162, + "step": 427 + }, + { + "epoch": 0.76, + "grad_norm": 0.22674015164375305, + "learning_rate": 0.00025670676794905915, + "loss": 0.0822, + "step": 428 + }, + { + "epoch": 0.76, + "grad_norm": 0.13643956184387207, + "learning_rate": 0.0002565100300711289, + "loss": 0.0652, + "step": 429 + }, + { + "epoch": 0.76, + "grad_norm": 0.1212250292301178, + "learning_rate": 0.0002563129219472488, + "loss": 0.0515, + "step": 430 + }, + { + "epoch": 0.76, + "grad_norm": 0.15632915496826172, + "learning_rate": 0.0002561154442625983, + "loss": 0.105, + "step": 431 + }, + { + "epoch": 0.76, + "grad_norm": 0.14310358464717865, + "learning_rate": 0.00025591759770364145, + "loss": 0.078, + "step": 432 + }, + { + "epoch": 0.77, + "grad_norm": 0.1461591273546219, + "learning_rate": 0.00025571938295812475, + "loss": 0.0626, + "step": 433 + }, + { + "epoch": 0.77, + "grad_norm": 0.12564843893051147, + "learning_rate": 0.00025552080071507423, + "loss": 0.1015, + "step": 434 + }, + { + "epoch": 0.77, + "grad_norm": 0.18360354006290436, + "learning_rate": 0.00025532185166479384, + "loss": 0.125, + "step": 435 + }, + { + "epoch": 0.77, + "grad_norm": 0.26957467198371887, + "learning_rate": 0.00025512253649886236, + "loss": 0.1049, + "step": 436 + }, + { + "epoch": 0.77, + "grad_norm": 0.18892253935337067, + "learning_rate": 0.00025492285591013116, + "loss": 0.0907, + "step": 437 + }, + { + "epoch": 0.77, + "grad_norm": 0.25057581067085266, + "learning_rate": 0.0002547228105927221, + "loss": 0.1081, + "step": 438 + }, + { + "epoch": 0.78, + "grad_norm": 0.13890071213245392, + "learning_rate": 0.00025452240124202477, + "loss": 0.0865, + "step": 439 + }, + { + "epoch": 0.78, + "grad_norm": 0.17899222671985626, + "learning_rate": 0.0002543216285546942, + "loss": 0.0437, + "step": 440 + }, + { + "epoch": 0.78, + "grad_norm": 0.26809147000312805, + "learning_rate": 0.00025412049322864845, + "loss": 0.0507, + "step": 441 + }, + { + "epoch": 0.78, + "grad_norm": 0.19062833487987518, + "learning_rate": 0.0002539189959630662, + "loss": 0.0516, + "step": 442 + }, + { + "epoch": 0.78, + "grad_norm": 0.23480018973350525, + "learning_rate": 0.0002537171374583843, + "loss": 0.0672, + "step": 443 + }, + { + "epoch": 0.79, + "grad_norm": 0.10694094747304916, + "learning_rate": 0.0002535149184162952, + "loss": 0.0248, + "step": 444 + }, + { + "epoch": 0.79, + "grad_norm": 0.2778875231742859, + "learning_rate": 0.0002533123395397448, + "loss": 0.1295, + "step": 445 + }, + { + "epoch": 0.79, + "grad_norm": 0.30790090560913086, + "learning_rate": 0.00025310940153292974, + "loss": 0.0518, + "step": 446 + }, + { + "epoch": 0.79, + "grad_norm": 0.2887485921382904, + "learning_rate": 0.00025290610510129513, + "loss": 0.0739, + "step": 447 + }, + { + "epoch": 0.79, + "grad_norm": 0.3820766508579254, + "learning_rate": 0.00025270245095153197, + "loss": 0.0873, + "step": 448 + }, + { + "epoch": 0.79, + "grad_norm": 0.09947656840085983, + "learning_rate": 0.00025249843979157467, + "loss": 0.0137, + "step": 449 + }, + { + "epoch": 0.8, + "grad_norm": 0.2997414469718933, + "learning_rate": 0.00025229407233059883, + "loss": 0.1314, + "step": 450 + }, + { + "epoch": 0.8, + "grad_norm": 0.24817639589309692, + "learning_rate": 0.00025208934927901857, + "loss": 0.1275, + "step": 451 + }, + { + "epoch": 0.8, + "grad_norm": 0.28992751240730286, + "learning_rate": 0.0002518842713484839, + "loss": 0.1294, + "step": 452 + }, + { + "epoch": 0.8, + "grad_norm": 0.20057415962219238, + "learning_rate": 0.00025167883925187874, + "loss": 0.0559, + "step": 453 + }, + { + "epoch": 0.8, + "grad_norm": 0.18815775215625763, + "learning_rate": 0.000251473053703318, + "loss": 0.0922, + "step": 454 + }, + { + "epoch": 0.8, + "grad_norm": 0.2501707375049591, + "learning_rate": 0.00025126691541814514, + "loss": 0.0712, + "step": 455 + }, + { + "epoch": 0.81, + "grad_norm": 0.19835379719734192, + "learning_rate": 0.00025106042511293005, + "loss": 0.0974, + "step": 456 + }, + { + "epoch": 0.81, + "grad_norm": 0.1476050615310669, + "learning_rate": 0.0002508535835054661, + "loss": 0.1005, + "step": 457 + }, + { + "epoch": 0.81, + "grad_norm": 0.18511542677879333, + "learning_rate": 0.0002506463913147679, + "loss": 0.1343, + "step": 458 + }, + { + "epoch": 0.81, + "grad_norm": 0.1092979907989502, + "learning_rate": 0.0002504388492610687, + "loss": 0.0728, + "step": 459 + }, + { + "epoch": 0.81, + "grad_norm": 0.11496451497077942, + "learning_rate": 0.00025023095806581797, + "loss": 0.0949, + "step": 460 + }, + { + "epoch": 0.82, + "grad_norm": 0.10526807606220245, + "learning_rate": 0.0002500227184516789, + "loss": 0.0874, + "step": 461 + }, + { + "epoch": 0.82, + "grad_norm": 0.12002858519554138, + "learning_rate": 0.0002498141311425258, + "loss": 0.0597, + "step": 462 + }, + { + "epoch": 0.82, + "grad_norm": 0.18872810900211334, + "learning_rate": 0.00024960519686344164, + "loss": 0.0938, + "step": 463 + }, + { + "epoch": 0.82, + "grad_norm": 0.15304633975028992, + "learning_rate": 0.0002493959163407154, + "loss": 0.0942, + "step": 464 + }, + { + "epoch": 0.82, + "grad_norm": 0.13661184906959534, + "learning_rate": 0.0002491862903018398, + "loss": 0.104, + "step": 465 + }, + { + "epoch": 0.82, + "grad_norm": 0.12348087877035141, + "learning_rate": 0.00024897631947550853, + "loss": 0.0914, + "step": 466 + }, + { + "epoch": 0.83, + "grad_norm": 0.17801643908023834, + "learning_rate": 0.00024876600459161396, + "loss": 0.1041, + "step": 467 + }, + { + "epoch": 0.83, + "grad_norm": 0.16505847871303558, + "learning_rate": 0.00024855534638124424, + "loss": 0.0657, + "step": 468 + }, + { + "epoch": 0.83, + "grad_norm": 0.17176857590675354, + "learning_rate": 0.0002483443455766812, + "loss": 0.0809, + "step": 469 + }, + { + "epoch": 0.83, + "grad_norm": 0.20494180917739868, + "learning_rate": 0.0002481330029113975, + "loss": 0.0939, + "step": 470 + }, + { + "epoch": 0.83, + "grad_norm": 0.08561734110116959, + "learning_rate": 0.000247921319120054, + "loss": 0.0316, + "step": 471 + }, + { + "epoch": 0.84, + "grad_norm": 0.17116233706474304, + "learning_rate": 0.0002477092949384977, + "loss": 0.0593, + "step": 472 + }, + { + "epoch": 0.84, + "grad_norm": 0.19205418229103088, + "learning_rate": 0.00024749693110375854, + "loss": 0.0684, + "step": 473 + }, + { + "epoch": 0.84, + "grad_norm": 0.1300123929977417, + "learning_rate": 0.0002472842283540473, + "loss": 0.0581, + "step": 474 + }, + { + "epoch": 0.84, + "grad_norm": 0.24078501760959625, + "learning_rate": 0.0002470711874287529, + "loss": 0.0483, + "step": 475 + }, + { + "epoch": 0.84, + "grad_norm": 0.3071228861808777, + "learning_rate": 0.00024685780906843975, + "loss": 0.0961, + "step": 476 + }, + { + "epoch": 0.84, + "grad_norm": 0.24373668432235718, + "learning_rate": 0.0002466440940148452, + "loss": 0.1139, + "step": 477 + }, + { + "epoch": 0.85, + "grad_norm": 0.22678449749946594, + "learning_rate": 0.00024643004301087715, + "loss": 0.0324, + "step": 478 + }, + { + "epoch": 0.85, + "grad_norm": 0.34472745656967163, + "learning_rate": 0.00024621565680061117, + "loss": 0.1041, + "step": 479 + }, + { + "epoch": 0.85, + "grad_norm": 0.16587162017822266, + "learning_rate": 0.0002460009361292881, + "loss": 0.0992, + "step": 480 + }, + { + "epoch": 0.85, + "grad_norm": 0.42953774333000183, + "learning_rate": 0.0002457858817433115, + "loss": 0.1284, + "step": 481 + }, + { + "epoch": 0.85, + "grad_norm": 0.6054782867431641, + "learning_rate": 0.00024557049439024486, + "loss": 0.1034, + "step": 482 + }, + { + "epoch": 0.85, + "grad_norm": 0.207829549908638, + "learning_rate": 0.0002453547748188092, + "loss": 0.0928, + "step": 483 + }, + { + "epoch": 0.86, + "grad_norm": 0.36941850185394287, + "learning_rate": 0.00024513872377888036, + "loss": 0.0765, + "step": 484 + }, + { + "epoch": 0.86, + "grad_norm": 0.20771794021129608, + "learning_rate": 0.0002449223420214864, + "loss": 0.1001, + "step": 485 + }, + { + "epoch": 0.86, + "grad_norm": 0.16417552530765533, + "learning_rate": 0.00024470563029880497, + "loss": 0.0783, + "step": 486 + }, + { + "epoch": 0.86, + "grad_norm": 0.12799693644046783, + "learning_rate": 0.0002444885893641609, + "loss": 0.0753, + "step": 487 + }, + { + "epoch": 0.86, + "grad_norm": 0.21585993468761444, + "learning_rate": 0.00024427121997202313, + "loss": 0.1241, + "step": 488 + }, + { + "epoch": 0.87, + "grad_norm": 0.1481488198041916, + "learning_rate": 0.00024405352287800266, + "loss": 0.1086, + "step": 489 + }, + { + "epoch": 0.87, + "grad_norm": 0.18564724922180176, + "learning_rate": 0.00024383549883884949, + "loss": 0.0801, + "step": 490 + }, + { + "epoch": 0.87, + "grad_norm": 0.16308890283107758, + "learning_rate": 0.00024361714861245015, + "loss": 0.0884, + "step": 491 + }, + { + "epoch": 0.87, + "grad_norm": 0.10771831125020981, + "learning_rate": 0.00024339847295782503, + "loss": 0.0507, + "step": 492 + }, + { + "epoch": 0.87, + "grad_norm": 0.11360620707273483, + "learning_rate": 0.00024317947263512578, + "loss": 0.0541, + "step": 493 + }, + { + "epoch": 0.87, + "grad_norm": 0.17256328463554382, + "learning_rate": 0.00024296014840563264, + "loss": 0.0797, + "step": 494 + }, + { + "epoch": 0.88, + "grad_norm": 0.14872509241104126, + "learning_rate": 0.00024274050103175192, + "loss": 0.098, + "step": 495 + }, + { + "epoch": 0.88, + "grad_norm": 0.10749954730272293, + "learning_rate": 0.00024252053127701297, + "loss": 0.0629, + "step": 496 + }, + { + "epoch": 0.88, + "grad_norm": 0.22221991419792175, + "learning_rate": 0.00024230023990606608, + "loss": 0.0737, + "step": 497 + }, + { + "epoch": 0.88, + "grad_norm": 0.1851089596748352, + "learning_rate": 0.00024207962768467927, + "loss": 0.0961, + "step": 498 + }, + { + "epoch": 0.88, + "grad_norm": 0.1685558706521988, + "learning_rate": 0.0002418586953797361, + "loss": 0.0863, + "step": 499 + }, + { + "epoch": 0.88, + "grad_norm": 0.14281867444515228, + "learning_rate": 0.00024163744375923268, + "loss": 0.0334, + "step": 500 + }, + { + "epoch": 0.89, + "grad_norm": 0.24673086404800415, + "learning_rate": 0.00024141587359227513, + "loss": 0.1468, + "step": 501 + }, + { + "epoch": 0.89, + "grad_norm": 0.21355701982975006, + "learning_rate": 0.00024119398564907685, + "loss": 0.1145, + "step": 502 + }, + { + "epoch": 0.89, + "grad_norm": 0.16007691621780396, + "learning_rate": 0.00024097178070095598, + "loss": 0.0799, + "step": 503 + }, + { + "epoch": 0.89, + "grad_norm": 0.3183180093765259, + "learning_rate": 0.0002407492595203326, + "loss": 0.1405, + "step": 504 + }, + { + "epoch": 0.89, + "grad_norm": 0.08811590075492859, + "learning_rate": 0.0002405264228807259, + "loss": 0.0359, + "step": 505 + }, + { + "epoch": 0.9, + "grad_norm": 0.1502489596605301, + "learning_rate": 0.0002403032715567519, + "loss": 0.0763, + "step": 506 + }, + { + "epoch": 0.9, + "grad_norm": 0.07442978769540787, + "learning_rate": 0.00024007980632412032, + "loss": 0.0365, + "step": 507 + }, + { + "epoch": 0.9, + "grad_norm": 0.2458060383796692, + "learning_rate": 0.00023985602795963227, + "loss": 0.0724, + "step": 508 + }, + { + "epoch": 0.9, + "grad_norm": 0.1589912623167038, + "learning_rate": 0.0002396319372411771, + "loss": 0.0701, + "step": 509 + }, + { + "epoch": 0.9, + "grad_norm": 0.1447596549987793, + "learning_rate": 0.00023940753494773018, + "loss": 0.0411, + "step": 510 + }, + { + "epoch": 0.9, + "grad_norm": 0.20220863819122314, + "learning_rate": 0.00023918282185934984, + "loss": 0.082, + "step": 511 + }, + { + "epoch": 0.91, + "grad_norm": 0.08220729231834412, + "learning_rate": 0.00023895779875717483, + "loss": 0.0423, + "step": 512 + }, + { + "epoch": 0.91, + "grad_norm": 0.18194489181041718, + "learning_rate": 0.0002387324664234216, + "loss": 0.058, + "step": 513 + }, + { + "epoch": 0.91, + "grad_norm": 0.2533247470855713, + "learning_rate": 0.00023850682564138142, + "loss": 0.08, + "step": 514 + }, + { + "epoch": 0.91, + "grad_norm": 0.4217647910118103, + "learning_rate": 0.00023828087719541784, + "loss": 0.1557, + "step": 515 + }, + { + "epoch": 0.91, + "grad_norm": 0.1498800665140152, + "learning_rate": 0.00023805462187096398, + "loss": 0.0298, + "step": 516 + }, + { + "epoch": 0.91, + "grad_norm": 0.1469077467918396, + "learning_rate": 0.0002378280604545196, + "loss": 0.0392, + "step": 517 + }, + { + "epoch": 0.92, + "grad_norm": 0.23052054643630981, + "learning_rate": 0.0002376011937336485, + "loss": 0.1275, + "step": 518 + }, + { + "epoch": 0.92, + "grad_norm": 0.2608712315559387, + "learning_rate": 0.0002373740224969758, + "loss": 0.0984, + "step": 519 + }, + { + "epoch": 0.92, + "grad_norm": 0.3406033515930176, + "learning_rate": 0.00023714654753418518, + "loss": 0.1668, + "step": 520 + }, + { + "epoch": 0.92, + "grad_norm": 0.1383407562971115, + "learning_rate": 0.0002369187696360161, + "loss": 0.0319, + "step": 521 + }, + { + "epoch": 0.92, + "grad_norm": 0.2986023724079132, + "learning_rate": 0.00023669068959426105, + "loss": 0.0797, + "step": 522 + }, + { + "epoch": 0.93, + "grad_norm": 0.18390725553035736, + "learning_rate": 0.00023646230820176289, + "loss": 0.0514, + "step": 523 + }, + { + "epoch": 0.93, + "grad_norm": 0.3618876338005066, + "learning_rate": 0.00023623362625241188, + "loss": 0.1465, + "step": 524 + }, + { + "epoch": 0.93, + "grad_norm": 0.23905062675476074, + "learning_rate": 0.00023600464454114325, + "loss": 0.0669, + "step": 525 + }, + { + "epoch": 0.93, + "grad_norm": 0.24474327266216278, + "learning_rate": 0.00023577536386393415, + "loss": 0.1048, + "step": 526 + }, + { + "epoch": 0.93, + "grad_norm": 0.11233604699373245, + "learning_rate": 0.000235545785017801, + "loss": 0.0653, + "step": 527 + }, + { + "epoch": 0.93, + "grad_norm": 0.12456522136926651, + "learning_rate": 0.0002353159088007966, + "loss": 0.0855, + "step": 528 + }, + { + "epoch": 0.94, + "grad_norm": 0.14804989099502563, + "learning_rate": 0.00023508573601200764, + "loss": 0.0554, + "step": 529 + }, + { + "epoch": 0.94, + "grad_norm": 0.1283787339925766, + "learning_rate": 0.00023485526745155167, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.94, + "grad_norm": 0.16255222260951996, + "learning_rate": 0.00023462450392057436, + "loss": 0.1088, + "step": 531 + }, + { + "epoch": 0.94, + "grad_norm": 0.18897709250450134, + "learning_rate": 0.0002343934462212467, + "loss": 0.0673, + "step": 532 + }, + { + "epoch": 0.94, + "grad_norm": 0.1460375338792801, + "learning_rate": 0.00023416209515676235, + "loss": 0.0923, + "step": 533 + }, + { + "epoch": 0.94, + "grad_norm": 0.13687273859977722, + "learning_rate": 0.0002339304515313348, + "loss": 0.1142, + "step": 534 + }, + { + "epoch": 0.95, + "grad_norm": 0.11255405843257904, + "learning_rate": 0.00023369851615019432, + "loss": 0.0513, + "step": 535 + }, + { + "epoch": 0.95, + "grad_norm": 0.18131427466869354, + "learning_rate": 0.0002334662898195856, + "loss": 0.1058, + "step": 536 + }, + { + "epoch": 0.95, + "grad_norm": 0.19648568332195282, + "learning_rate": 0.0002332337733467646, + "loss": 0.0721, + "step": 537 + }, + { + "epoch": 0.95, + "grad_norm": 0.1530655175447464, + "learning_rate": 0.00023300096753999582, + "loss": 0.0698, + "step": 538 + }, + { + "epoch": 0.95, + "grad_norm": 0.20368973910808563, + "learning_rate": 0.00023276787320854965, + "loss": 0.0925, + "step": 539 + }, + { + "epoch": 0.96, + "grad_norm": 0.14580555260181427, + "learning_rate": 0.00023253449116269936, + "loss": 0.052, + "step": 540 + }, + { + "epoch": 0.96, + "grad_norm": 0.20287400484085083, + "learning_rate": 0.00023230082221371832, + "loss": 0.0714, + "step": 541 + }, + { + "epoch": 0.96, + "grad_norm": 0.2907051742076874, + "learning_rate": 0.00023206686717387737, + "loss": 0.1166, + "step": 542 + }, + { + "epoch": 0.96, + "grad_norm": 0.3000693619251251, + "learning_rate": 0.00023183262685644174, + "loss": 0.1326, + "step": 543 + }, + { + "epoch": 0.96, + "grad_norm": 0.0760418102145195, + "learning_rate": 0.0002315981020756683, + "loss": 0.0591, + "step": 544 + }, + { + "epoch": 0.96, + "grad_norm": 0.144688680768013, + "learning_rate": 0.00023136329364680284, + "loss": 0.074, + "step": 545 + }, + { + "epoch": 0.97, + "grad_norm": 0.14553490281105042, + "learning_rate": 0.00023112820238607712, + "loss": 0.0609, + "step": 546 + }, + { + "epoch": 0.97, + "grad_norm": 0.13767151534557343, + "learning_rate": 0.0002308928291107061, + "loss": 0.0694, + "step": 547 + }, + { + "epoch": 0.97, + "grad_norm": 0.1333126723766327, + "learning_rate": 0.00023065717463888503, + "loss": 0.0509, + "step": 548 + }, + { + "epoch": 0.97, + "grad_norm": 0.1710643619298935, + "learning_rate": 0.00023042123978978663, + "loss": 0.0957, + "step": 549 + }, + { + "epoch": 0.97, + "grad_norm": 0.14117231965065002, + "learning_rate": 0.00023018502538355823, + "loss": 0.0372, + "step": 550 + }, + { + "epoch": 0.97, + "grad_norm": 0.24988646805286407, + "learning_rate": 0.0002299485322413191, + "loss": 0.0711, + "step": 551 + }, + { + "epoch": 0.98, + "grad_norm": 0.225083127617836, + "learning_rate": 0.00022971176118515731, + "loss": 0.0913, + "step": 552 + }, + { + "epoch": 0.98, + "grad_norm": 0.2095450758934021, + "learning_rate": 0.00022947471303812704, + "loss": 0.0485, + "step": 553 + }, + { + "epoch": 0.98, + "grad_norm": 0.2737296521663666, + "learning_rate": 0.0002292373886242456, + "loss": 0.0643, + "step": 554 + }, + { + "epoch": 0.98, + "grad_norm": 0.15709654986858368, + "learning_rate": 0.0002289997887684908, + "loss": 0.0327, + "step": 555 + }, + { + "epoch": 0.98, + "grad_norm": 0.17161165177822113, + "learning_rate": 0.00022876191429679785, + "loss": 0.0356, + "step": 556 + }, + { + "epoch": 0.99, + "grad_norm": 0.4428029954433441, + "learning_rate": 0.00022852376603605656, + "loss": 0.0976, + "step": 557 + }, + { + "epoch": 0.99, + "grad_norm": 0.3834541440010071, + "learning_rate": 0.00022828534481410845, + "loss": 0.1103, + "step": 558 + }, + { + "epoch": 0.99, + "grad_norm": 0.4154812693595886, + "learning_rate": 0.00022804665145974396, + "loss": 0.1864, + "step": 559 + }, + { + "epoch": 0.99, + "grad_norm": 0.2642601430416107, + "learning_rate": 0.0002278076868026995, + "loss": 0.0944, + "step": 560 + }, + { + "epoch": 0.99, + "grad_norm": 0.166983962059021, + "learning_rate": 0.0002275684516736545, + "loss": 0.0799, + "step": 561 + }, + { + "epoch": 0.99, + "grad_norm": 0.1374656707048416, + "learning_rate": 0.0002273289469042287, + "loss": 0.0509, + "step": 562 + }, + { + "epoch": 1.0, + "grad_norm": 0.12492585182189941, + "learning_rate": 0.00022708917332697905, + "loss": 0.0491, + "step": 563 + }, + { + "epoch": 1.0, + "grad_norm": 0.0952799916267395, + "learning_rate": 0.000226849131775397, + "loss": 0.0561, + "step": 564 + }, + { + "epoch": 1.0, + "grad_norm": 0.16703763604164124, + "learning_rate": 0.00022660882308390544, + "loss": 0.0882, + "step": 565 + }, + { + "epoch": 1.0, + "grad_norm": 0.17010749876499176, + "learning_rate": 0.00022636824808785602, + "loss": 0.0465, + "step": 566 + }, + { + "epoch": 1.0, + "grad_norm": 0.15894334018230438, + "learning_rate": 0.0002261274076235259, + "loss": 0.0787, + "step": 567 + }, + { + "epoch": 1.0, + "grad_norm": 0.3054545819759369, + "learning_rate": 0.00022588630252811528, + "loss": 0.0808, + "step": 568 + }, + { + "epoch": 1.0, + "eval_loss": 0.09363073855638504, + "eval_runtime": 14.6975, + "eval_samples_per_second": 32.455, + "eval_steps_per_second": 8.165, + "step": 568 + }, + { + "epoch": 1.01, + "grad_norm": 0.07171467691659927, + "learning_rate": 0.00022564493363974413, + "loss": 0.0222, + "step": 569 + }, + { + "epoch": 1.01, + "grad_norm": 0.17009137570858002, + "learning_rate": 0.00022540330179744933, + "loss": 0.0542, + "step": 570 + }, + { + "epoch": 1.01, + "grad_norm": 0.07475418597459793, + "learning_rate": 0.000225161407841182, + "loss": 0.0386, + "step": 571 + }, + { + "epoch": 1.01, + "grad_norm": 0.20071469247341156, + "learning_rate": 0.0002249192526118043, + "loss": 0.0566, + "step": 572 + }, + { + "epoch": 1.01, + "grad_norm": 0.40051212906837463, + "learning_rate": 0.0002246768369510867, + "loss": 0.067, + "step": 573 + }, + { + "epoch": 1.02, + "grad_norm": 0.23527373373508453, + "learning_rate": 0.0002244341617017049, + "loss": 0.0945, + "step": 574 + }, + { + "epoch": 1.02, + "grad_norm": 0.4186932146549225, + "learning_rate": 0.00022419122770723699, + "loss": 0.0524, + "step": 575 + }, + { + "epoch": 1.02, + "grad_norm": 0.18408560752868652, + "learning_rate": 0.00022394803581216047, + "loss": 0.0421, + "step": 576 + }, + { + "epoch": 1.02, + "grad_norm": 0.25409191846847534, + "learning_rate": 0.00022370458686184942, + "loss": 0.0583, + "step": 577 + }, + { + "epoch": 1.02, + "grad_norm": 0.15828213095664978, + "learning_rate": 0.00022346088170257147, + "loss": 0.027, + "step": 578 + }, + { + "epoch": 1.02, + "grad_norm": 0.3453011214733124, + "learning_rate": 0.00022321692118148478, + "loss": 0.0444, + "step": 579 + }, + { + "epoch": 1.03, + "grad_norm": 0.2533039450645447, + "learning_rate": 0.00022297270614663533, + "loss": 0.0376, + "step": 580 + }, + { + "epoch": 1.03, + "grad_norm": 0.2254706472158432, + "learning_rate": 0.00022272823744695365, + "loss": 0.0226, + "step": 581 + }, + { + "epoch": 1.03, + "grad_norm": 0.27252522110939026, + "learning_rate": 0.00022248351593225223, + "loss": 0.0645, + "step": 582 + }, + { + "epoch": 1.03, + "grad_norm": 0.27917397022247314, + "learning_rate": 0.00022223854245322235, + "loss": 0.0407, + "step": 583 + }, + { + "epoch": 1.03, + "grad_norm": 0.13697022199630737, + "learning_rate": 0.00022199331786143097, + "loss": 0.0603, + "step": 584 + }, + { + "epoch": 1.03, + "grad_norm": 0.5482609272003174, + "learning_rate": 0.00022174784300931828, + "loss": 0.0724, + "step": 585 + }, + { + "epoch": 1.04, + "grad_norm": 0.2505982220172882, + "learning_rate": 0.00022150211875019416, + "loss": 0.0255, + "step": 586 + }, + { + "epoch": 1.04, + "grad_norm": 0.10142415761947632, + "learning_rate": 0.00022125614593823553, + "loss": 0.014, + "step": 587 + }, + { + "epoch": 1.04, + "grad_norm": 0.2175186574459076, + "learning_rate": 0.00022100992542848342, + "loss": 0.0366, + "step": 588 + }, + { + "epoch": 1.04, + "grad_norm": 0.32991528511047363, + "learning_rate": 0.00022076345807683974, + "loss": 0.06, + "step": 589 + }, + { + "epoch": 1.04, + "grad_norm": 0.4545266330242157, + "learning_rate": 0.00022051674474006457, + "loss": 0.0535, + "step": 590 + }, + { + "epoch": 1.05, + "grad_norm": 0.2949434518814087, + "learning_rate": 0.00022026978627577302, + "loss": 0.0379, + "step": 591 + }, + { + "epoch": 1.05, + "grad_norm": 0.9262584447860718, + "learning_rate": 0.0002200225835424324, + "loss": 0.197, + "step": 592 + }, + { + "epoch": 1.05, + "grad_norm": 0.1416931003332138, + "learning_rate": 0.00021977513739935894, + "loss": 0.0142, + "step": 593 + }, + { + "epoch": 1.05, + "grad_norm": 0.23968350887298584, + "learning_rate": 0.00021952744870671527, + "loss": 0.0445, + "step": 594 + }, + { + "epoch": 1.05, + "grad_norm": 0.18966619670391083, + "learning_rate": 0.00021927951832550696, + "loss": 0.0303, + "step": 595 + }, + { + "epoch": 1.05, + "grad_norm": 0.12732306122779846, + "learning_rate": 0.00021903134711757973, + "loss": 0.0853, + "step": 596 + }, + { + "epoch": 1.06, + "grad_norm": 0.34209054708480835, + "learning_rate": 0.00021878293594561655, + "loss": 0.1223, + "step": 597 + }, + { + "epoch": 1.06, + "grad_norm": 0.30927759408950806, + "learning_rate": 0.00021853428567313453, + "loss": 0.1215, + "step": 598 + }, + { + "epoch": 1.06, + "grad_norm": 0.14330783486366272, + "learning_rate": 0.00021828539716448186, + "loss": 0.0478, + "step": 599 + }, + { + "epoch": 1.06, + "grad_norm": 0.09433628618717194, + "learning_rate": 0.00021803627128483494, + "loss": 0.0409, + "step": 600 + }, + { + "epoch": 1.06, + "grad_norm": 0.15185511112213135, + "learning_rate": 0.00021778690890019532, + "loss": 0.0794, + "step": 601 + }, + { + "epoch": 1.07, + "grad_norm": 0.12092185020446777, + "learning_rate": 0.00021753731087738654, + "loss": 0.0358, + "step": 602 + }, + { + "epoch": 1.07, + "grad_norm": 0.09449411928653717, + "learning_rate": 0.00021728747808405152, + "loss": 0.0626, + "step": 603 + }, + { + "epoch": 1.07, + "grad_norm": 0.1485297530889511, + "learning_rate": 0.00021703741138864906, + "loss": 0.0685, + "step": 604 + }, + { + "epoch": 1.07, + "grad_norm": 0.16232629120349884, + "learning_rate": 0.00021678711166045106, + "loss": 0.051, + "step": 605 + }, + { + "epoch": 1.07, + "grad_norm": 0.18323880434036255, + "learning_rate": 0.00021653657976953953, + "loss": 0.0704, + "step": 606 + }, + { + "epoch": 1.07, + "grad_norm": 0.13198308646678925, + "learning_rate": 0.00021628581658680355, + "loss": 0.0746, + "step": 607 + }, + { + "epoch": 1.08, + "grad_norm": 0.14159634709358215, + "learning_rate": 0.0002160348229839362, + "loss": 0.0431, + "step": 608 + }, + { + "epoch": 1.08, + "grad_norm": 0.07700249552726746, + "learning_rate": 0.00021578359983343148, + "loss": 0.0224, + "step": 609 + }, + { + "epoch": 1.08, + "grad_norm": 0.16824749112129211, + "learning_rate": 0.00021553214800858127, + "loss": 0.0411, + "step": 610 + }, + { + "epoch": 1.08, + "grad_norm": 0.20787864923477173, + "learning_rate": 0.0002152804683834726, + "loss": 0.0366, + "step": 611 + }, + { + "epoch": 1.08, + "grad_norm": 0.21740445494651794, + "learning_rate": 0.0002150285618329841, + "loss": 0.0903, + "step": 612 + }, + { + "epoch": 1.08, + "grad_norm": 0.22452165186405182, + "learning_rate": 0.0002147764292327834, + "loss": 0.0356, + "step": 613 + }, + { + "epoch": 1.09, + "grad_norm": 0.26688578724861145, + "learning_rate": 0.0002145240714593239, + "loss": 0.0325, + "step": 614 + }, + { + "epoch": 1.09, + "grad_norm": 0.3858661949634552, + "learning_rate": 0.00021427148938984157, + "loss": 0.0674, + "step": 615 + }, + { + "epoch": 1.09, + "grad_norm": 0.43455496430397034, + "learning_rate": 0.00021401868390235232, + "loss": 0.0612, + "step": 616 + }, + { + "epoch": 1.09, + "grad_norm": 0.20362605154514313, + "learning_rate": 0.0002137656558756486, + "loss": 0.0206, + "step": 617 + }, + { + "epoch": 1.09, + "grad_norm": 0.5878596305847168, + "learning_rate": 0.00021351240618929632, + "loss": 0.1207, + "step": 618 + }, + { + "epoch": 1.1, + "grad_norm": 0.1784306913614273, + "learning_rate": 0.000213258935723632, + "loss": 0.0421, + "step": 619 + }, + { + "epoch": 1.1, + "grad_norm": 0.47718602418899536, + "learning_rate": 0.00021300524535975977, + "loss": 0.1035, + "step": 620 + }, + { + "epoch": 1.1, + "grad_norm": 0.3729299008846283, + "learning_rate": 0.00021275133597954793, + "loss": 0.1177, + "step": 621 + }, + { + "epoch": 1.1, + "grad_norm": 0.22808651626110077, + "learning_rate": 0.00021249720846562624, + "loss": 0.0381, + "step": 622 + }, + { + "epoch": 1.1, + "grad_norm": 0.2834159731864929, + "learning_rate": 0.00021224286370138268, + "loss": 0.0334, + "step": 623 + }, + { + "epoch": 1.1, + "grad_norm": 0.19477446377277374, + "learning_rate": 0.00021198830257096053, + "loss": 0.0438, + "step": 624 + }, + { + "epoch": 1.11, + "grad_norm": 0.05400063097476959, + "learning_rate": 0.00021173352595925502, + "loss": 0.0107, + "step": 625 + }, + { + "epoch": 1.11, + "grad_norm": 0.15404914319515228, + "learning_rate": 0.00021147853475191058, + "loss": 0.0239, + "step": 626 + }, + { + "epoch": 1.11, + "grad_norm": 0.2577894330024719, + "learning_rate": 0.00021122332983531747, + "loss": 0.0344, + "step": 627 + }, + { + "epoch": 1.11, + "grad_norm": 0.7029692530632019, + "learning_rate": 0.00021096791209660902, + "loss": 0.1042, + "step": 628 + }, + { + "epoch": 1.11, + "grad_norm": 0.09395071119070053, + "learning_rate": 0.00021071228242365824, + "loss": 0.0125, + "step": 629 + }, + { + "epoch": 1.11, + "grad_norm": 0.3859443664550781, + "learning_rate": 0.00021045644170507484, + "loss": 0.0751, + "step": 630 + }, + { + "epoch": 1.12, + "grad_norm": 0.20036044716835022, + "learning_rate": 0.00021020039083020217, + "loss": 0.0379, + "step": 631 + }, + { + "epoch": 1.12, + "grad_norm": 0.22928644716739655, + "learning_rate": 0.00020994413068911423, + "loss": 0.0532, + "step": 632 + }, + { + "epoch": 1.12, + "grad_norm": 0.2149987667798996, + "learning_rate": 0.00020968766217261233, + "loss": 0.0379, + "step": 633 + }, + { + "epoch": 1.12, + "grad_norm": 0.47850465774536133, + "learning_rate": 0.00020943098617222215, + "loss": 0.0885, + "step": 634 + }, + { + "epoch": 1.12, + "grad_norm": 0.2132718861103058, + "learning_rate": 0.00020917410358019072, + "loss": 0.0439, + "step": 635 + }, + { + "epoch": 1.13, + "grad_norm": 0.06894449889659882, + "learning_rate": 0.000208917015289483, + "loss": 0.0147, + "step": 636 + }, + { + "epoch": 1.13, + "grad_norm": 0.2504015564918518, + "learning_rate": 0.00020865972219377932, + "loss": 0.0551, + "step": 637 + }, + { + "epoch": 1.13, + "grad_norm": 0.21898828446865082, + "learning_rate": 0.0002084022251874716, + "loss": 0.0768, + "step": 638 + }, + { + "epoch": 1.13, + "grad_norm": 0.14049361646175385, + "learning_rate": 0.00020814452516566075, + "loss": 0.0204, + "step": 639 + }, + { + "epoch": 1.13, + "grad_norm": 0.17026256024837494, + "learning_rate": 0.00020788662302415352, + "loss": 0.027, + "step": 640 + }, + { + "epoch": 1.13, + "grad_norm": 0.2352689951658249, + "learning_rate": 0.00020762851965945892, + "loss": 0.0789, + "step": 641 + }, + { + "epoch": 1.14, + "grad_norm": 0.31628891825675964, + "learning_rate": 0.00020737021596878578, + "loss": 0.0518, + "step": 642 + }, + { + "epoch": 1.14, + "grad_norm": 0.26351508498191833, + "learning_rate": 0.00020711171285003915, + "loss": 0.0499, + "step": 643 + }, + { + "epoch": 1.14, + "grad_norm": 0.18696492910385132, + "learning_rate": 0.00020685301120181734, + "loss": 0.0487, + "step": 644 + }, + { + "epoch": 1.14, + "grad_norm": 0.1336275190114975, + "learning_rate": 0.0002065941119234087, + "loss": 0.0441, + "step": 645 + }, + { + "epoch": 1.14, + "grad_norm": 0.17000997066497803, + "learning_rate": 0.00020633501591478884, + "loss": 0.0391, + "step": 646 + }, + { + "epoch": 1.14, + "grad_norm": 0.2624777853488922, + "learning_rate": 0.0002060757240766168, + "loss": 0.0736, + "step": 647 + }, + { + "epoch": 1.15, + "grad_norm": 0.18875586986541748, + "learning_rate": 0.00020581623731023277, + "loss": 0.0449, + "step": 648 + }, + { + "epoch": 1.15, + "grad_norm": 0.2557006776332855, + "learning_rate": 0.00020555655651765433, + "loss": 0.0296, + "step": 649 + }, + { + "epoch": 1.15, + "grad_norm": 0.27907809615135193, + "learning_rate": 0.0002052966826015735, + "loss": 0.0263, + "step": 650 + }, + { + "epoch": 1.15, + "grad_norm": 0.25195273756980896, + "learning_rate": 0.00020503661646535388, + "loss": 0.0199, + "step": 651 + }, + { + "epoch": 1.15, + "grad_norm": 0.34424659609794617, + "learning_rate": 0.0002047763590130269, + "loss": 0.1449, + "step": 652 + }, + { + "epoch": 1.16, + "grad_norm": 0.28247949481010437, + "learning_rate": 0.0002045159111492893, + "loss": 0.0371, + "step": 653 + }, + { + "epoch": 1.16, + "grad_norm": 0.15257826447486877, + "learning_rate": 0.00020425527377949956, + "loss": 0.0171, + "step": 654 + }, + { + "epoch": 1.16, + "grad_norm": 0.2261778712272644, + "learning_rate": 0.0002039944478096751, + "loss": 0.0405, + "step": 655 + }, + { + "epoch": 1.16, + "grad_norm": 0.2752448618412018, + "learning_rate": 0.00020373343414648877, + "loss": 0.0707, + "step": 656 + }, + { + "epoch": 1.16, + "grad_norm": 0.3369431495666504, + "learning_rate": 0.00020347223369726586, + "loss": 0.1337, + "step": 657 + }, + { + "epoch": 1.16, + "grad_norm": 0.34381601214408875, + "learning_rate": 0.00020321084736998118, + "loss": 0.1116, + "step": 658 + }, + { + "epoch": 1.17, + "grad_norm": 0.4304129183292389, + "learning_rate": 0.00020294927607325539, + "loss": 0.1044, + "step": 659 + }, + { + "epoch": 1.17, + "grad_norm": 0.186915785074234, + "learning_rate": 0.0002026875207163523, + "loss": 0.0438, + "step": 660 + }, + { + "epoch": 1.17, + "grad_norm": 0.15853986144065857, + "learning_rate": 0.0002024255822091757, + "loss": 0.0171, + "step": 661 + }, + { + "epoch": 1.17, + "grad_norm": 0.22487328946590424, + "learning_rate": 0.00020216346146226558, + "loss": 0.0545, + "step": 662 + }, + { + "epoch": 1.17, + "grad_norm": 0.1993045210838318, + "learning_rate": 0.00020190115938679593, + "loss": 0.0937, + "step": 663 + }, + { + "epoch": 1.17, + "grad_norm": 0.0910387709736824, + "learning_rate": 0.00020163867689457072, + "loss": 0.0179, + "step": 664 + }, + { + "epoch": 1.18, + "grad_norm": 0.22309695184230804, + "learning_rate": 0.00020137601489802123, + "loss": 0.0758, + "step": 665 + }, + { + "epoch": 1.18, + "grad_norm": 0.1319982260465622, + "learning_rate": 0.00020111317431020272, + "loss": 0.0369, + "step": 666 + }, + { + "epoch": 1.18, + "grad_norm": 0.09432458132505417, + "learning_rate": 0.0002008501560447911, + "loss": 0.0291, + "step": 667 + }, + { + "epoch": 1.18, + "grad_norm": 0.09810501337051392, + "learning_rate": 0.00020058696101608027, + "loss": 0.0252, + "step": 668 + }, + { + "epoch": 1.18, + "grad_norm": 0.26495859026908875, + "learning_rate": 0.0002003235901389782, + "loss": 0.0555, + "step": 669 + }, + { + "epoch": 1.19, + "grad_norm": 0.6845064163208008, + "learning_rate": 0.00020006004432900442, + "loss": 0.0952, + "step": 670 + }, + { + "epoch": 1.19, + "grad_norm": 0.186613067984581, + "learning_rate": 0.0001997963245022863, + "loss": 0.0419, + "step": 671 + }, + { + "epoch": 1.19, + "grad_norm": 0.22056430578231812, + "learning_rate": 0.00019953243157555647, + "loss": 0.041, + "step": 672 + }, + { + "epoch": 1.19, + "grad_norm": 0.30706262588500977, + "learning_rate": 0.00019926836646614885, + "loss": 0.0507, + "step": 673 + }, + { + "epoch": 1.19, + "grad_norm": 0.21987871825695038, + "learning_rate": 0.00019900413009199625, + "loss": 0.0397, + "step": 674 + }, + { + "epoch": 1.19, + "grad_norm": 0.2214607149362564, + "learning_rate": 0.00019873972337162667, + "loss": 0.0349, + "step": 675 + }, + { + "epoch": 1.2, + "grad_norm": 0.37210360169410706, + "learning_rate": 0.00019847514722416026, + "loss": 0.075, + "step": 676 + }, + { + "epoch": 1.2, + "grad_norm": 0.2434106171131134, + "learning_rate": 0.0001982104025693062, + "loss": 0.0284, + "step": 677 + }, + { + "epoch": 1.2, + "grad_norm": 0.22016049921512604, + "learning_rate": 0.00019794549032735932, + "loss": 0.032, + "step": 678 + }, + { + "epoch": 1.2, + "grad_norm": 0.07571116834878922, + "learning_rate": 0.0001976804114191971, + "loss": 0.0079, + "step": 679 + }, + { + "epoch": 1.2, + "grad_norm": 0.35380011796951294, + "learning_rate": 0.00019741516676627632, + "loss": 0.1136, + "step": 680 + }, + { + "epoch": 1.2, + "grad_norm": 0.2167171835899353, + "learning_rate": 0.00019714975729062998, + "loss": 0.0448, + "step": 681 + }, + { + "epoch": 1.21, + "grad_norm": 0.37797045707702637, + "learning_rate": 0.00019688418391486398, + "loss": 0.1041, + "step": 682 + }, + { + "epoch": 1.21, + "grad_norm": 0.6682038903236389, + "learning_rate": 0.00019661844756215397, + "loss": 0.0392, + "step": 683 + }, + { + "epoch": 1.21, + "grad_norm": 0.15291425585746765, + "learning_rate": 0.0001963525491562421, + "loss": 0.0154, + "step": 684 + }, + { + "epoch": 1.21, + "grad_norm": 0.276319295167923, + "learning_rate": 0.00019608648962143394, + "loss": 0.0474, + "step": 685 + }, + { + "epoch": 1.21, + "grad_norm": 0.35119596123695374, + "learning_rate": 0.00019582026988259506, + "loss": 0.0549, + "step": 686 + }, + { + "epoch": 1.22, + "grad_norm": 0.36834511160850525, + "learning_rate": 0.0001955538908651481, + "loss": 0.0458, + "step": 687 + }, + { + "epoch": 1.22, + "grad_norm": 0.32360026240348816, + "learning_rate": 0.00019528735349506906, + "loss": 0.029, + "step": 688 + }, + { + "epoch": 1.22, + "grad_norm": 0.3501654863357544, + "learning_rate": 0.00019502065869888473, + "loss": 0.0687, + "step": 689 + }, + { + "epoch": 1.22, + "grad_norm": 0.260672390460968, + "learning_rate": 0.000194753807403669, + "loss": 0.0546, + "step": 690 + }, + { + "epoch": 1.22, + "grad_norm": 0.3679283559322357, + "learning_rate": 0.00019448680053703971, + "loss": 0.075, + "step": 691 + }, + { + "epoch": 1.22, + "grad_norm": 0.3225049376487732, + "learning_rate": 0.00019421963902715563, + "loss": 0.0821, + "step": 692 + }, + { + "epoch": 1.23, + "grad_norm": 0.20469306409358978, + "learning_rate": 0.0001939523238027129, + "loss": 0.0775, + "step": 693 + }, + { + "epoch": 1.23, + "grad_norm": 0.22201740741729736, + "learning_rate": 0.0001936848557929423, + "loss": 0.0265, + "step": 694 + }, + { + "epoch": 1.23, + "grad_norm": 0.2975887358188629, + "learning_rate": 0.0001934172359276054, + "loss": 0.0892, + "step": 695 + }, + { + "epoch": 1.23, + "grad_norm": 0.2806803584098816, + "learning_rate": 0.00019314946513699187, + "loss": 0.0394, + "step": 696 + }, + { + "epoch": 1.23, + "grad_norm": 0.23834392428398132, + "learning_rate": 0.0001928815443519158, + "loss": 0.0469, + "step": 697 + }, + { + "epoch": 1.23, + "grad_norm": 0.2856217920780182, + "learning_rate": 0.000192613474503713, + "loss": 0.1068, + "step": 698 + }, + { + "epoch": 1.24, + "grad_norm": 0.18297094106674194, + "learning_rate": 0.00019234525652423714, + "loss": 0.0521, + "step": 699 + }, + { + "epoch": 1.24, + "grad_norm": 0.26084813475608826, + "learning_rate": 0.00019207689134585694, + "loss": 0.0729, + "step": 700 + }, + { + "epoch": 1.24, + "grad_norm": 0.14558996260166168, + "learning_rate": 0.00019180837990145287, + "loss": 0.0275, + "step": 701 + }, + { + "epoch": 1.24, + "grad_norm": 0.128230482339859, + "learning_rate": 0.0001915397231244137, + "loss": 0.0406, + "step": 702 + }, + { + "epoch": 1.24, + "grad_norm": 0.057518381625413895, + "learning_rate": 0.0001912709219486336, + "loss": 0.0127, + "step": 703 + }, + { + "epoch": 1.25, + "grad_norm": 0.16675953567028046, + "learning_rate": 0.0001910019773085085, + "loss": 0.0455, + "step": 704 + }, + { + "epoch": 1.25, + "grad_norm": 0.14262209832668304, + "learning_rate": 0.0001907328901389331, + "loss": 0.0444, + "step": 705 + }, + { + "epoch": 1.25, + "grad_norm": 0.1892000138759613, + "learning_rate": 0.00019046366137529758, + "loss": 0.0485, + "step": 706 + }, + { + "epoch": 1.25, + "grad_norm": 0.22300001978874207, + "learning_rate": 0.0001901942919534843, + "loss": 0.0533, + "step": 707 + }, + { + "epoch": 1.25, + "grad_norm": 0.19573500752449036, + "learning_rate": 0.00018992478280986468, + "loss": 0.0825, + "step": 708 + }, + { + "epoch": 1.25, + "grad_norm": 0.2073289453983307, + "learning_rate": 0.00018965513488129559, + "loss": 0.0695, + "step": 709 + }, + { + "epoch": 1.26, + "grad_norm": 0.17569467425346375, + "learning_rate": 0.00018938534910511648, + "loss": 0.0474, + "step": 710 + }, + { + "epoch": 1.26, + "eval_loss": 0.07800926268100739, + "eval_runtime": 14.7232, + "eval_samples_per_second": 32.398, + "eval_steps_per_second": 8.15, + "step": 710 + }, + { + "epoch": 1.26, + "grad_norm": 0.2483418583869934, + "learning_rate": 0.00018911542641914612, + "loss": 0.0522, + "step": 711 + }, + { + "epoch": 1.26, + "grad_norm": 0.22553446888923645, + "learning_rate": 0.00018884536776167898, + "loss": 0.0624, + "step": 712 + }, + { + "epoch": 1.26, + "grad_norm": 0.2794416546821594, + "learning_rate": 0.00018857517407148232, + "loss": 0.0591, + "step": 713 + }, + { + "epoch": 1.26, + "grad_norm": 0.4945233166217804, + "learning_rate": 0.00018830484628779267, + "loss": 0.1097, + "step": 714 + }, + { + "epoch": 1.26, + "grad_norm": 0.22541265189647675, + "learning_rate": 0.00018803438535031287, + "loss": 0.11, + "step": 715 + }, + { + "epoch": 1.27, + "grad_norm": 0.1390594094991684, + "learning_rate": 0.00018776379219920844, + "loss": 0.0238, + "step": 716 + }, + { + "epoch": 1.27, + "grad_norm": 0.35142239928245544, + "learning_rate": 0.00018749306777510462, + "loss": 0.105, + "step": 717 + }, + { + "epoch": 1.27, + "grad_norm": 0.23132985830307007, + "learning_rate": 0.00018722221301908294, + "loss": 0.0271, + "step": 718 + }, + { + "epoch": 1.27, + "grad_norm": 0.11399706453084946, + "learning_rate": 0.00018695122887267787, + "loss": 0.0383, + "step": 719 + }, + { + "epoch": 1.27, + "grad_norm": 0.27700984477996826, + "learning_rate": 0.00018668011627787396, + "loss": 0.0477, + "step": 720 + }, + { + "epoch": 1.28, + "grad_norm": 0.13391317427158356, + "learning_rate": 0.00018640887617710195, + "loss": 0.028, + "step": 721 + }, + { + "epoch": 1.28, + "grad_norm": 0.23833520710468292, + "learning_rate": 0.00018613750951323603, + "loss": 0.0725, + "step": 722 + }, + { + "epoch": 1.28, + "grad_norm": 0.12027294188737869, + "learning_rate": 0.0001858660172295901, + "loss": 0.0181, + "step": 723 + }, + { + "epoch": 1.28, + "grad_norm": 0.2397276610136032, + "learning_rate": 0.00018559440026991506, + "loss": 0.0548, + "step": 724 + }, + { + "epoch": 1.28, + "grad_norm": 0.19492855668067932, + "learning_rate": 0.00018532265957839494, + "loss": 0.0349, + "step": 725 + }, + { + "epoch": 1.28, + "grad_norm": 0.36717554926872253, + "learning_rate": 0.000185050796099644, + "loss": 0.103, + "step": 726 + }, + { + "epoch": 1.29, + "grad_norm": 0.19141976535320282, + "learning_rate": 0.0001847788107787033, + "loss": 0.0441, + "step": 727 + }, + { + "epoch": 1.29, + "grad_norm": 0.19812935590744019, + "learning_rate": 0.00018450670456103739, + "loss": 0.0388, + "step": 728 + }, + { + "epoch": 1.29, + "grad_norm": 0.14811943471431732, + "learning_rate": 0.00018423447839253126, + "loss": 0.0184, + "step": 729 + }, + { + "epoch": 1.29, + "grad_norm": 0.2438066601753235, + "learning_rate": 0.0001839621332194866, + "loss": 0.0228, + "step": 730 + }, + { + "epoch": 1.29, + "grad_norm": 0.299697607755661, + "learning_rate": 0.00018368966998861898, + "loss": 0.0392, + "step": 731 + }, + { + "epoch": 1.3, + "grad_norm": 0.26123788952827454, + "learning_rate": 0.00018341708964705433, + "loss": 0.0569, + "step": 732 + }, + { + "epoch": 1.3, + "grad_norm": 0.1523085981607437, + "learning_rate": 0.00018314439314232557, + "loss": 0.0488, + "step": 733 + }, + { + "epoch": 1.3, + "grad_norm": 0.17505189776420593, + "learning_rate": 0.0001828715814223696, + "loss": 0.0342, + "step": 734 + }, + { + "epoch": 1.3, + "grad_norm": 0.2612646222114563, + "learning_rate": 0.0001825986554355236, + "loss": 0.0413, + "step": 735 + }, + { + "epoch": 1.3, + "grad_norm": 0.2304815948009491, + "learning_rate": 0.00018232561613052212, + "loss": 0.0361, + "step": 736 + }, + { + "epoch": 1.3, + "grad_norm": 0.5065823793411255, + "learning_rate": 0.00018205246445649362, + "loss": 0.0506, + "step": 737 + }, + { + "epoch": 1.31, + "grad_norm": 0.252676397562027, + "learning_rate": 0.00018177920136295712, + "loss": 0.0422, + "step": 738 + }, + { + "epoch": 1.31, + "grad_norm": 0.5118948817253113, + "learning_rate": 0.00018150582779981897, + "loss": 0.0794, + "step": 739 + }, + { + "epoch": 1.31, + "grad_norm": 0.20689886808395386, + "learning_rate": 0.00018123234471736942, + "loss": 0.0196, + "step": 740 + }, + { + "epoch": 1.31, + "grad_norm": 0.21955521404743195, + "learning_rate": 0.00018095875306627977, + "loss": 0.0205, + "step": 741 + }, + { + "epoch": 1.31, + "grad_norm": 0.2635563910007477, + "learning_rate": 0.00018068505379759825, + "loss": 0.026, + "step": 742 + }, + { + "epoch": 1.31, + "grad_norm": 0.4259761869907379, + "learning_rate": 0.00018041124786274756, + "loss": 0.0669, + "step": 743 + }, + { + "epoch": 1.32, + "grad_norm": 0.24592095613479614, + "learning_rate": 0.000180137336213521, + "loss": 0.054, + "step": 744 + }, + { + "epoch": 1.32, + "grad_norm": 0.448553204536438, + "learning_rate": 0.0001798633198020794, + "loss": 0.0244, + "step": 745 + }, + { + "epoch": 1.32, + "grad_norm": 0.16729894280433655, + "learning_rate": 0.0001795891995809478, + "loss": 0.0597, + "step": 746 + }, + { + "epoch": 1.32, + "grad_norm": 0.2796671390533447, + "learning_rate": 0.000179314976503012, + "loss": 0.0281, + "step": 747 + }, + { + "epoch": 1.32, + "grad_norm": 0.6444141864776611, + "learning_rate": 0.00017904065152151544, + "loss": 0.1028, + "step": 748 + }, + { + "epoch": 1.33, + "grad_norm": 0.5876030921936035, + "learning_rate": 0.00017876622559005577, + "loss": 0.0896, + "step": 749 + }, + { + "epoch": 1.33, + "grad_norm": 0.7038650512695312, + "learning_rate": 0.00017849169966258156, + "loss": 0.0294, + "step": 750 + }, + { + "epoch": 1.33, + "grad_norm": 0.22545520961284637, + "learning_rate": 0.00017821707469338892, + "loss": 0.0491, + "step": 751 + }, + { + "epoch": 1.33, + "grad_norm": 0.25605642795562744, + "learning_rate": 0.00017794235163711835, + "loss": 0.0343, + "step": 752 + }, + { + "epoch": 1.33, + "grad_norm": 0.07614222168922424, + "learning_rate": 0.0001776675314487512, + "loss": 0.0149, + "step": 753 + }, + { + "epoch": 1.33, + "grad_norm": 0.37879428267478943, + "learning_rate": 0.0001773926150836066, + "loss": 0.1085, + "step": 754 + }, + { + "epoch": 1.34, + "grad_norm": 0.18689392507076263, + "learning_rate": 0.00017711760349733792, + "loss": 0.0424, + "step": 755 + }, + { + "epoch": 1.34, + "grad_norm": 0.14959782361984253, + "learning_rate": 0.00017684249764592948, + "loss": 0.0421, + "step": 756 + }, + { + "epoch": 1.34, + "grad_norm": 0.35820502042770386, + "learning_rate": 0.0001765672984856934, + "loss": 0.0542, + "step": 757 + }, + { + "epoch": 1.34, + "grad_norm": 0.2389790266752243, + "learning_rate": 0.00017629200697326613, + "loss": 0.0545, + "step": 758 + }, + { + "epoch": 1.34, + "grad_norm": 0.25757715106010437, + "learning_rate": 0.00017601662406560508, + "loss": 0.0549, + "step": 759 + }, + { + "epoch": 1.34, + "grad_norm": 0.25923579931259155, + "learning_rate": 0.00017574115071998547, + "loss": 0.0401, + "step": 760 + }, + { + "epoch": 1.35, + "grad_norm": 0.1554333120584488, + "learning_rate": 0.0001754655878939968, + "loss": 0.0321, + "step": 761 + }, + { + "epoch": 1.35, + "grad_norm": 0.1587299108505249, + "learning_rate": 0.00017518993654553963, + "loss": 0.0495, + "step": 762 + }, + { + "epoch": 1.35, + "grad_norm": 0.3896007835865021, + "learning_rate": 0.00017491419763282227, + "loss": 0.1092, + "step": 763 + }, + { + "epoch": 1.35, + "grad_norm": 0.20783177018165588, + "learning_rate": 0.00017463837211435744, + "loss": 0.0281, + "step": 764 + }, + { + "epoch": 1.35, + "grad_norm": 0.19074185192584991, + "learning_rate": 0.00017436246094895894, + "loss": 0.0401, + "step": 765 + }, + { + "epoch": 1.36, + "grad_norm": 0.18701818585395813, + "learning_rate": 0.00017408646509573813, + "loss": 0.0332, + "step": 766 + }, + { + "epoch": 1.36, + "grad_norm": 0.20894934237003326, + "learning_rate": 0.00017381038551410097, + "loss": 0.0626, + "step": 767 + }, + { + "epoch": 1.36, + "grad_norm": 0.16977371275424957, + "learning_rate": 0.00017353422316374427, + "loss": 0.0475, + "step": 768 + }, + { + "epoch": 1.36, + "grad_norm": 0.16158726811408997, + "learning_rate": 0.00017325797900465278, + "loss": 0.024, + "step": 769 + }, + { + "epoch": 1.36, + "grad_norm": 0.8635274767875671, + "learning_rate": 0.00017298165399709536, + "loss": 0.1246, + "step": 770 + }, + { + "epoch": 1.36, + "grad_norm": 0.24966581165790558, + "learning_rate": 0.0001727052491016222, + "loss": 0.041, + "step": 771 + }, + { + "epoch": 1.37, + "grad_norm": 0.21421608328819275, + "learning_rate": 0.00017242876527906096, + "loss": 0.0525, + "step": 772 + }, + { + "epoch": 1.37, + "grad_norm": 0.14579136669635773, + "learning_rate": 0.0001721522034905138, + "loss": 0.0229, + "step": 773 + }, + { + "epoch": 1.37, + "grad_norm": 0.22795641422271729, + "learning_rate": 0.0001718755646973539, + "loss": 0.0443, + "step": 774 + }, + { + "epoch": 1.37, + "grad_norm": 0.3049260675907135, + "learning_rate": 0.00017159884986122194, + "loss": 0.0606, + "step": 775 + }, + { + "epoch": 1.37, + "grad_norm": 0.16482001543045044, + "learning_rate": 0.00017132205994402328, + "loss": 0.0439, + "step": 776 + }, + { + "epoch": 1.37, + "grad_norm": 0.2296137511730194, + "learning_rate": 0.00017104519590792396, + "loss": 0.0303, + "step": 777 + }, + { + "epoch": 1.38, + "grad_norm": 0.19414740800857544, + "learning_rate": 0.00017076825871534772, + "loss": 0.0474, + "step": 778 + }, + { + "epoch": 1.38, + "grad_norm": 0.2893425524234772, + "learning_rate": 0.00017049124932897277, + "loss": 0.0582, + "step": 779 + }, + { + "epoch": 1.38, + "grad_norm": 0.11354421079158783, + "learning_rate": 0.00017021416871172815, + "loss": 0.0187, + "step": 780 + }, + { + "epoch": 1.38, + "grad_norm": 0.19562894105911255, + "learning_rate": 0.00016993701782679055, + "loss": 0.033, + "step": 781 + }, + { + "epoch": 1.38, + "grad_norm": 0.13439138233661652, + "learning_rate": 0.0001696597976375808, + "loss": 0.0192, + "step": 782 + }, + { + "epoch": 1.39, + "grad_norm": 0.2628462016582489, + "learning_rate": 0.00016938250910776084, + "loss": 0.0378, + "step": 783 + }, + { + "epoch": 1.39, + "grad_norm": 0.10226990282535553, + "learning_rate": 0.0001691051532012301, + "loss": 0.014, + "step": 784 + }, + { + "epoch": 1.39, + "grad_norm": 0.43657186627388, + "learning_rate": 0.0001688277308821221, + "loss": 0.0589, + "step": 785 + }, + { + "epoch": 1.39, + "grad_norm": 0.3169918954372406, + "learning_rate": 0.00016855024311480146, + "loss": 0.0591, + "step": 786 + }, + { + "epoch": 1.39, + "grad_norm": 0.4922636151313782, + "learning_rate": 0.00016827269086386003, + "loss": 0.0697, + "step": 787 + }, + { + "epoch": 1.39, + "grad_norm": 0.08833907544612885, + "learning_rate": 0.00016799507509411405, + "loss": 0.0116, + "step": 788 + }, + { + "epoch": 1.4, + "grad_norm": 0.3545498251914978, + "learning_rate": 0.00016771739677060043, + "loss": 0.0632, + "step": 789 + }, + { + "epoch": 1.4, + "grad_norm": 0.37221550941467285, + "learning_rate": 0.00016743965685857358, + "loss": 0.04, + "step": 790 + }, + { + "epoch": 1.4, + "grad_norm": 0.3635517954826355, + "learning_rate": 0.000167161856323502, + "loss": 0.0725, + "step": 791 + }, + { + "epoch": 1.4, + "grad_norm": 0.09715539216995239, + "learning_rate": 0.00016688399613106484, + "loss": 0.0089, + "step": 792 + }, + { + "epoch": 1.4, + "grad_norm": 0.07484902441501617, + "learning_rate": 0.0001666060772471488, + "loss": 0.007, + "step": 793 + }, + { + "epoch": 1.4, + "grad_norm": 0.1089821457862854, + "learning_rate": 0.00016632810063784448, + "loss": 0.0121, + "step": 794 + }, + { + "epoch": 1.41, + "grad_norm": 0.09242980927228928, + "learning_rate": 0.00016605006726944313, + "loss": 0.0101, + "step": 795 + }, + { + "epoch": 1.41, + "grad_norm": 0.7516760230064392, + "learning_rate": 0.0001657719781084333, + "loss": 0.0543, + "step": 796 + }, + { + "epoch": 1.41, + "grad_norm": 0.717779278755188, + "learning_rate": 0.0001654938341214976, + "loss": 0.0535, + "step": 797 + }, + { + "epoch": 1.41, + "grad_norm": 0.4909898340702057, + "learning_rate": 0.0001652156362755091, + "loss": 0.0752, + "step": 798 + }, + { + "epoch": 1.41, + "grad_norm": 0.6857799887657166, + "learning_rate": 0.00016493738553752818, + "loss": 0.1142, + "step": 799 + }, + { + "epoch": 1.42, + "grad_norm": 0.28686514496803284, + "learning_rate": 0.00016465908287479906, + "loss": 0.0321, + "step": 800 + }, + { + "epoch": 1.42, + "grad_norm": 0.1494898945093155, + "learning_rate": 0.0001643807292547463, + "loss": 0.0089, + "step": 801 + }, + { + "epoch": 1.42, + "grad_norm": 0.34240686893463135, + "learning_rate": 0.00016410232564497185, + "loss": 0.0526, + "step": 802 + }, + { + "epoch": 1.42, + "grad_norm": 0.17216762900352478, + "learning_rate": 0.00016382387301325135, + "loss": 0.0124, + "step": 803 + }, + { + "epoch": 1.42, + "grad_norm": 0.18440192937850952, + "learning_rate": 0.0001635453723275307, + "loss": 0.0629, + "step": 804 + }, + { + "epoch": 1.42, + "grad_norm": 0.17247916758060455, + "learning_rate": 0.00016326682455592304, + "loss": 0.0342, + "step": 805 + }, + { + "epoch": 1.43, + "grad_norm": 0.23954929411411285, + "learning_rate": 0.0001629882306667051, + "loss": 0.0399, + "step": 806 + }, + { + "epoch": 1.43, + "grad_norm": 0.11504701524972916, + "learning_rate": 0.0001627095916283139, + "loss": 0.0146, + "step": 807 + }, + { + "epoch": 1.43, + "grad_norm": 0.28793883323669434, + "learning_rate": 0.00016243090840934344, + "loss": 0.1013, + "step": 808 + }, + { + "epoch": 1.43, + "grad_norm": 0.4467115104198456, + "learning_rate": 0.00016215218197854127, + "loss": 0.0562, + "step": 809 + }, + { + "epoch": 1.43, + "grad_norm": 0.1595917046070099, + "learning_rate": 0.00016187341330480522, + "loss": 0.04, + "step": 810 + }, + { + "epoch": 1.43, + "grad_norm": 0.1741933822631836, + "learning_rate": 0.00016159460335717993, + "loss": 0.0374, + "step": 811 + }, + { + "epoch": 1.44, + "grad_norm": 0.16274484992027283, + "learning_rate": 0.00016131575310485345, + "loss": 0.03, + "step": 812 + }, + { + "epoch": 1.44, + "grad_norm": 0.20965149998664856, + "learning_rate": 0.00016103686351715403, + "loss": 0.0486, + "step": 813 + }, + { + "epoch": 1.44, + "grad_norm": 0.28146764636039734, + "learning_rate": 0.00016075793556354653, + "loss": 0.0546, + "step": 814 + }, + { + "epoch": 1.44, + "grad_norm": 0.7190961241722107, + "learning_rate": 0.00016047897021362938, + "loss": 0.0832, + "step": 815 + }, + { + "epoch": 1.44, + "grad_norm": 0.3478730618953705, + "learning_rate": 0.00016019996843713084, + "loss": 0.043, + "step": 816 + }, + { + "epoch": 1.45, + "grad_norm": 0.3473072648048401, + "learning_rate": 0.00015992093120390581, + "loss": 0.0579, + "step": 817 + }, + { + "epoch": 1.45, + "grad_norm": 0.21544550359249115, + "learning_rate": 0.00015964185948393246, + "loss": 0.0515, + "step": 818 + }, + { + "epoch": 1.45, + "grad_norm": 0.317604124546051, + "learning_rate": 0.00015936275424730892, + "loss": 0.0678, + "step": 819 + }, + { + "epoch": 1.45, + "grad_norm": 0.23350350558757782, + "learning_rate": 0.00015908361646424973, + "loss": 0.0382, + "step": 820 + }, + { + "epoch": 1.45, + "grad_norm": 0.1787886768579483, + "learning_rate": 0.0001588044471050826, + "loss": 0.0306, + "step": 821 + }, + { + "epoch": 1.45, + "grad_norm": 0.148136705160141, + "learning_rate": 0.00015852524714024503, + "loss": 0.0308, + "step": 822 + }, + { + "epoch": 1.46, + "grad_norm": 0.20286808907985687, + "learning_rate": 0.00015824601754028082, + "loss": 0.0512, + "step": 823 + }, + { + "epoch": 1.46, + "grad_norm": 0.2771085202693939, + "learning_rate": 0.0001579667592758369, + "loss": 0.0424, + "step": 824 + }, + { + "epoch": 1.46, + "grad_norm": 0.230366513133049, + "learning_rate": 0.00015768747331765975, + "loss": 0.0329, + "step": 825 + }, + { + "epoch": 1.46, + "grad_norm": 0.3995148539543152, + "learning_rate": 0.00015740816063659224, + "loss": 0.0668, + "step": 826 + }, + { + "epoch": 1.46, + "grad_norm": 0.41903263330459595, + "learning_rate": 0.00015712882220357, + "loss": 0.123, + "step": 827 + }, + { + "epoch": 1.46, + "grad_norm": 0.10426704585552216, + "learning_rate": 0.00015684945898961823, + "loss": 0.0139, + "step": 828 + }, + { + "epoch": 1.47, + "grad_norm": 0.28077027201652527, + "learning_rate": 0.00015657007196584832, + "loss": 0.0649, + "step": 829 + }, + { + "epoch": 1.47, + "grad_norm": 0.4622378647327423, + "learning_rate": 0.0001562906621034543, + "loss": 0.0728, + "step": 830 + }, + { + "epoch": 1.47, + "grad_norm": 0.23062212765216827, + "learning_rate": 0.00015601123037370973, + "loss": 0.0187, + "step": 831 + }, + { + "epoch": 1.47, + "grad_norm": 0.28703922033309937, + "learning_rate": 0.00015573177774796414, + "loss": 0.0486, + "step": 832 + }, + { + "epoch": 1.47, + "grad_norm": 0.3227209448814392, + "learning_rate": 0.00015545230519763966, + "loss": 0.0474, + "step": 833 + }, + { + "epoch": 1.48, + "grad_norm": 0.19180624186992645, + "learning_rate": 0.00015517281369422766, + "loss": 0.0612, + "step": 834 + }, + { + "epoch": 1.48, + "grad_norm": 0.3641510605812073, + "learning_rate": 0.00015489330420928552, + "loss": 0.0584, + "step": 835 + }, + { + "epoch": 1.48, + "grad_norm": 0.34426817297935486, + "learning_rate": 0.00015461377771443298, + "loss": 0.0735, + "step": 836 + }, + { + "epoch": 1.48, + "grad_norm": 0.21041937172412872, + "learning_rate": 0.00015433423518134904, + "loss": 0.045, + "step": 837 + }, + { + "epoch": 1.48, + "grad_norm": 0.2598513066768646, + "learning_rate": 0.00015405467758176835, + "loss": 0.0411, + "step": 838 + }, + { + "epoch": 1.48, + "grad_norm": 0.29700911045074463, + "learning_rate": 0.000153775105887478, + "loss": 0.0624, + "step": 839 + }, + { + "epoch": 1.49, + "grad_norm": 0.1756899207830429, + "learning_rate": 0.00015349552107031395, + "loss": 0.0299, + "step": 840 + }, + { + "epoch": 1.49, + "grad_norm": 0.19117477536201477, + "learning_rate": 0.00015321592410215797, + "loss": 0.0951, + "step": 841 + }, + { + "epoch": 1.49, + "grad_norm": 0.11488031595945358, + "learning_rate": 0.00015293631595493398, + "loss": 0.0201, + "step": 842 + }, + { + "epoch": 1.49, + "grad_norm": 0.29568618535995483, + "learning_rate": 0.00015265669760060465, + "loss": 0.0463, + "step": 843 + }, + { + "epoch": 1.49, + "grad_norm": 0.24253536760807037, + "learning_rate": 0.0001523770700111683, + "loss": 0.0494, + "step": 844 + }, + { + "epoch": 1.49, + "grad_norm": 0.1810959130525589, + "learning_rate": 0.00015209743415865533, + "loss": 0.0326, + "step": 845 + }, + { + "epoch": 1.5, + "grad_norm": 0.2873782217502594, + "learning_rate": 0.00015181779101512477, + "loss": 0.0411, + "step": 846 + }, + { + "epoch": 1.5, + "grad_norm": 0.05380136892199516, + "learning_rate": 0.00015153814155266103, + "loss": 0.0099, + "step": 847 + }, + { + "epoch": 1.5, + "grad_norm": 0.07258699834346771, + "learning_rate": 0.0001512584867433705, + "loss": 0.009, + "step": 848 + }, + { + "epoch": 1.5, + "grad_norm": 0.11949209123849869, + "learning_rate": 0.00015097882755937822, + "loss": 0.0145, + "step": 849 + }, + { + "epoch": 1.5, + "grad_norm": 0.2874244749546051, + "learning_rate": 0.00015069916497282432, + "loss": 0.0368, + "step": 850 + }, + { + "epoch": 1.51, + "grad_norm": 0.23363856971263885, + "learning_rate": 0.00015041949995586078, + "loss": 0.0658, + "step": 851 + }, + { + "epoch": 1.51, + "grad_norm": 0.2872653305530548, + "learning_rate": 0.00015013983348064814, + "loss": 0.0379, + "step": 852 + }, + { + "epoch": 1.51, + "eval_loss": 0.08633331954479218, + "eval_runtime": 14.7153, + "eval_samples_per_second": 32.415, + "eval_steps_per_second": 8.155, + "step": 852 + }, + { + "epoch": 1.51, + "grad_norm": 0.08994939923286438, + "learning_rate": 0.00014986016651935183, + "loss": 0.0097, + "step": 853 + }, + { + "epoch": 1.51, + "grad_norm": 0.44967612624168396, + "learning_rate": 0.00014958050004413922, + "loss": 0.0501, + "step": 854 + }, + { + "epoch": 1.51, + "grad_norm": 0.5344903469085693, + "learning_rate": 0.0001493008350271757, + "loss": 0.0284, + "step": 855 + }, + { + "epoch": 1.51, + "grad_norm": 0.5308881998062134, + "learning_rate": 0.00014902117244062178, + "loss": 0.0467, + "step": 856 + }, + { + "epoch": 1.52, + "grad_norm": 0.42243242263793945, + "learning_rate": 0.0001487415132566295, + "loss": 0.0418, + "step": 857 + }, + { + "epoch": 1.52, + "grad_norm": 0.4274597764015198, + "learning_rate": 0.00014846185844733894, + "loss": 0.0898, + "step": 858 + }, + { + "epoch": 1.52, + "grad_norm": 0.7866750955581665, + "learning_rate": 0.0001481822089848752, + "loss": 0.0868, + "step": 859 + }, + { + "epoch": 1.52, + "grad_norm": 0.3989715874195099, + "learning_rate": 0.00014790256584134467, + "loss": 0.0539, + "step": 860 + }, + { + "epoch": 1.52, + "grad_norm": 0.275583416223526, + "learning_rate": 0.00014762292998883166, + "loss": 0.0577, + "step": 861 + }, + { + "epoch": 1.52, + "grad_norm": 0.31740522384643555, + "learning_rate": 0.00014734330239939535, + "loss": 0.0169, + "step": 862 + }, + { + "epoch": 1.53, + "grad_norm": 0.23814305663108826, + "learning_rate": 0.00014706368404506602, + "loss": 0.031, + "step": 863 + }, + { + "epoch": 1.53, + "grad_norm": 0.21870914101600647, + "learning_rate": 0.000146784075897842, + "loss": 0.0239, + "step": 864 + }, + { + "epoch": 1.53, + "grad_norm": 0.08485715836286545, + "learning_rate": 0.00014650447892968605, + "loss": 0.0093, + "step": 865 + }, + { + "epoch": 1.53, + "grad_norm": 0.1792098879814148, + "learning_rate": 0.000146224894112522, + "loss": 0.0296, + "step": 866 + }, + { + "epoch": 1.53, + "grad_norm": 0.2863057255744934, + "learning_rate": 0.00014594532241823165, + "loss": 0.0521, + "step": 867 + }, + { + "epoch": 1.54, + "grad_norm": 0.477017343044281, + "learning_rate": 0.00014566576481865097, + "loss": 0.08, + "step": 868 + }, + { + "epoch": 1.54, + "grad_norm": 0.3662970960140228, + "learning_rate": 0.00014538622228556697, + "loss": 0.0751, + "step": 869 + }, + { + "epoch": 1.54, + "grad_norm": 0.3300603926181793, + "learning_rate": 0.00014510669579071448, + "loss": 0.075, + "step": 870 + }, + { + "epoch": 1.54, + "grad_norm": 0.04433250427246094, + "learning_rate": 0.0001448271863057723, + "loss": 0.0065, + "step": 871 + }, + { + "epoch": 1.54, + "grad_norm": 0.2359628975391388, + "learning_rate": 0.0001445476948023603, + "loss": 0.0521, + "step": 872 + }, + { + "epoch": 1.54, + "grad_norm": 0.14717410504817963, + "learning_rate": 0.0001442682222520359, + "loss": 0.0216, + "step": 873 + }, + { + "epoch": 1.55, + "grad_norm": 0.22751399874687195, + "learning_rate": 0.00014398876962629025, + "loss": 0.0403, + "step": 874 + }, + { + "epoch": 1.55, + "grad_norm": 0.6166908144950867, + "learning_rate": 0.0001437093378965457, + "loss": 0.0649, + "step": 875 + }, + { + "epoch": 1.55, + "grad_norm": 0.2543468475341797, + "learning_rate": 0.0001434299280341517, + "loss": 0.0781, + "step": 876 + }, + { + "epoch": 1.55, + "grad_norm": 0.15570877492427826, + "learning_rate": 0.00014315054101038174, + "loss": 0.0314, + "step": 877 + }, + { + "epoch": 1.55, + "grad_norm": 0.13937588036060333, + "learning_rate": 0.00014287117779643, + "loss": 0.0317, + "step": 878 + }, + { + "epoch": 1.56, + "grad_norm": 0.3276956081390381, + "learning_rate": 0.00014259183936340774, + "loss": 0.0869, + "step": 879 + }, + { + "epoch": 1.56, + "grad_norm": 0.271882027387619, + "learning_rate": 0.00014231252668234023, + "loss": 0.0329, + "step": 880 + }, + { + "epoch": 1.56, + "grad_norm": 0.18096867203712463, + "learning_rate": 0.0001420332407241631, + "loss": 0.0375, + "step": 881 + }, + { + "epoch": 1.56, + "grad_norm": 0.1566796898841858, + "learning_rate": 0.0001417539824597192, + "loss": 0.0333, + "step": 882 + }, + { + "epoch": 1.56, + "grad_norm": 0.16603341698646545, + "learning_rate": 0.00014147475285975503, + "loss": 0.033, + "step": 883 + }, + { + "epoch": 1.56, + "grad_norm": 0.1516629457473755, + "learning_rate": 0.00014119555289491737, + "loss": 0.0305, + "step": 884 + }, + { + "epoch": 1.57, + "grad_norm": 0.23150323331356049, + "learning_rate": 0.00014091638353575024, + "loss": 0.0261, + "step": 885 + }, + { + "epoch": 1.57, + "grad_norm": 0.1631498783826828, + "learning_rate": 0.00014063724575269105, + "loss": 0.0334, + "step": 886 + }, + { + "epoch": 1.57, + "grad_norm": 0.2955824136734009, + "learning_rate": 0.00014035814051606752, + "loss": 0.0552, + "step": 887 + }, + { + "epoch": 1.57, + "grad_norm": 0.24213866889476776, + "learning_rate": 0.00014007906879609424, + "loss": 0.0279, + "step": 888 + }, + { + "epoch": 1.57, + "grad_norm": 0.25006791949272156, + "learning_rate": 0.00013980003156286914, + "loss": 0.0522, + "step": 889 + }, + { + "epoch": 1.57, + "grad_norm": 0.22357212007045746, + "learning_rate": 0.00013952102978637056, + "loss": 0.0396, + "step": 890 + }, + { + "epoch": 1.58, + "grad_norm": 0.3780936598777771, + "learning_rate": 0.00013924206443645344, + "loss": 0.0745, + "step": 891 + }, + { + "epoch": 1.58, + "grad_norm": 0.42298954725265503, + "learning_rate": 0.00013896313648284597, + "loss": 0.1081, + "step": 892 + }, + { + "epoch": 1.58, + "grad_norm": 0.3097688555717468, + "learning_rate": 0.00013868424689514658, + "loss": 0.0757, + "step": 893 + }, + { + "epoch": 1.58, + "grad_norm": 0.15456968545913696, + "learning_rate": 0.0001384053966428201, + "loss": 0.029, + "step": 894 + }, + { + "epoch": 1.58, + "grad_norm": 0.2668134272098541, + "learning_rate": 0.00013812658669519473, + "loss": 0.0398, + "step": 895 + }, + { + "epoch": 1.59, + "grad_norm": 0.39498788118362427, + "learning_rate": 0.00013784781802145873, + "loss": 0.0756, + "step": 896 + }, + { + "epoch": 1.59, + "grad_norm": 0.782438337802887, + "learning_rate": 0.00013756909159065656, + "loss": 0.0827, + "step": 897 + }, + { + "epoch": 1.59, + "grad_norm": 0.5805864334106445, + "learning_rate": 0.00013729040837168608, + "loss": 0.0536, + "step": 898 + }, + { + "epoch": 1.59, + "grad_norm": 0.42258474230766296, + "learning_rate": 0.00013701176933329494, + "loss": 0.093, + "step": 899 + }, + { + "epoch": 1.59, + "grad_norm": 0.3261685073375702, + "learning_rate": 0.00013673317544407693, + "loss": 0.0751, + "step": 900 + }, + { + "epoch": 1.59, + "grad_norm": 0.5293501019477844, + "learning_rate": 0.0001364546276724693, + "loss": 0.1383, + "step": 901 + }, + { + "epoch": 1.6, + "grad_norm": 0.14614896476268768, + "learning_rate": 0.00013617612698674865, + "loss": 0.0263, + "step": 902 + }, + { + "epoch": 1.6, + "grad_norm": 0.12985162436962128, + "learning_rate": 0.00013589767435502812, + "loss": 0.027, + "step": 903 + }, + { + "epoch": 1.6, + "grad_norm": 0.12051744014024734, + "learning_rate": 0.0001356192707452537, + "loss": 0.0301, + "step": 904 + }, + { + "epoch": 1.6, + "grad_norm": 0.2052602469921112, + "learning_rate": 0.00013534091712520097, + "loss": 0.0843, + "step": 905 + }, + { + "epoch": 1.6, + "grad_norm": 0.3517119586467743, + "learning_rate": 0.0001350626144624718, + "loss": 0.0769, + "step": 906 + }, + { + "epoch": 1.6, + "grad_norm": 0.182536780834198, + "learning_rate": 0.00013478436372449086, + "loss": 0.0798, + "step": 907 + }, + { + "epoch": 1.61, + "grad_norm": 0.31057825684547424, + "learning_rate": 0.0001345061658785024, + "loss": 0.0799, + "step": 908 + }, + { + "epoch": 1.61, + "grad_norm": 0.2310357391834259, + "learning_rate": 0.0001342280218915667, + "loss": 0.0506, + "step": 909 + }, + { + "epoch": 1.61, + "grad_norm": 0.2205679565668106, + "learning_rate": 0.00013394993273055687, + "loss": 0.0501, + "step": 910 + }, + { + "epoch": 1.61, + "grad_norm": 0.09648188203573227, + "learning_rate": 0.0001336718993621555, + "loss": 0.0253, + "step": 911 + }, + { + "epoch": 1.61, + "grad_norm": 0.14384910464286804, + "learning_rate": 0.00013339392275285117, + "loss": 0.0474, + "step": 912 + }, + { + "epoch": 1.62, + "grad_norm": 0.22472958266735077, + "learning_rate": 0.0001331160038689351, + "loss": 0.0715, + "step": 913 + }, + { + "epoch": 1.62, + "grad_norm": 0.17647628486156464, + "learning_rate": 0.000132838143676498, + "loss": 0.0379, + "step": 914 + }, + { + "epoch": 1.62, + "grad_norm": 0.17897732555866241, + "learning_rate": 0.0001325603431414264, + "loss": 0.0407, + "step": 915 + }, + { + "epoch": 1.62, + "grad_norm": 0.15975071489810944, + "learning_rate": 0.00013228260322939954, + "loss": 0.0386, + "step": 916 + }, + { + "epoch": 1.62, + "grad_norm": 0.20889371633529663, + "learning_rate": 0.00013200492490588595, + "loss": 0.0395, + "step": 917 + }, + { + "epoch": 1.62, + "grad_norm": 0.19162769615650177, + "learning_rate": 0.00013172730913613994, + "loss": 0.0348, + "step": 918 + }, + { + "epoch": 1.63, + "grad_norm": 0.11488895863294601, + "learning_rate": 0.00013144975688519857, + "loss": 0.0299, + "step": 919 + }, + { + "epoch": 1.63, + "grad_norm": 0.2982803285121918, + "learning_rate": 0.0001311722691178779, + "loss": 0.0772, + "step": 920 + }, + { + "epoch": 1.63, + "grad_norm": 0.28049570322036743, + "learning_rate": 0.00013089484679876988, + "loss": 0.0604, + "step": 921 + }, + { + "epoch": 1.63, + "grad_norm": 0.534457266330719, + "learning_rate": 0.00013061749089223913, + "loss": 0.0674, + "step": 922 + }, + { + "epoch": 1.63, + "grad_norm": 0.14333413541316986, + "learning_rate": 0.00013034020236241917, + "loss": 0.0243, + "step": 923 + }, + { + "epoch": 1.63, + "grad_norm": 0.17806164920330048, + "learning_rate": 0.00013006298217320945, + "loss": 0.0284, + "step": 924 + }, + { + "epoch": 1.64, + "grad_norm": 0.1662513166666031, + "learning_rate": 0.00012978583128827185, + "loss": 0.0204, + "step": 925 + }, + { + "epoch": 1.64, + "grad_norm": 0.6015653014183044, + "learning_rate": 0.00012950875067102718, + "loss": 0.0641, + "step": 926 + }, + { + "epoch": 1.64, + "grad_norm": 0.21260066330432892, + "learning_rate": 0.00012923174128465226, + "loss": 0.0294, + "step": 927 + }, + { + "epoch": 1.64, + "grad_norm": 0.14397118985652924, + "learning_rate": 0.00012895480409207607, + "loss": 0.0159, + "step": 928 + }, + { + "epoch": 1.64, + "grad_norm": 0.34683769941329956, + "learning_rate": 0.0001286779400559767, + "loss": 0.0346, + "step": 929 + }, + { + "epoch": 1.65, + "grad_norm": 0.02308204025030136, + "learning_rate": 0.00012840115013877803, + "loss": 0.0024, + "step": 930 + }, + { + "epoch": 1.65, + "grad_norm": 0.22624412178993225, + "learning_rate": 0.00012812443530264608, + "loss": 0.039, + "step": 931 + }, + { + "epoch": 1.65, + "grad_norm": 0.4369746744632721, + "learning_rate": 0.0001278477965094862, + "loss": 0.0878, + "step": 932 + }, + { + "epoch": 1.65, + "grad_norm": 0.4473649561405182, + "learning_rate": 0.00012757123472093904, + "loss": 0.0528, + "step": 933 + }, + { + "epoch": 1.65, + "grad_norm": 0.3166683316230774, + "learning_rate": 0.00012729475089837778, + "loss": 0.0266, + "step": 934 + }, + { + "epoch": 1.65, + "grad_norm": 0.29405805468559265, + "learning_rate": 0.00012701834600290464, + "loss": 0.0471, + "step": 935 + }, + { + "epoch": 1.66, + "grad_norm": 0.07540614157915115, + "learning_rate": 0.00012674202099534723, + "loss": 0.0136, + "step": 936 + }, + { + "epoch": 1.66, + "grad_norm": 0.7350433468818665, + "learning_rate": 0.00012646577683625567, + "loss": 0.0554, + "step": 937 + }, + { + "epoch": 1.66, + "grad_norm": 0.7809972167015076, + "learning_rate": 0.00012618961448589903, + "loss": 0.0659, + "step": 938 + }, + { + "epoch": 1.66, + "grad_norm": 0.3870202302932739, + "learning_rate": 0.00012591353490426182, + "loss": 0.031, + "step": 939 + }, + { + "epoch": 1.66, + "grad_norm": 0.39360955357551575, + "learning_rate": 0.00012563753905104106, + "loss": 0.0467, + "step": 940 + }, + { + "epoch": 1.66, + "grad_norm": 0.3007020354270935, + "learning_rate": 0.00012536162788564254, + "loss": 0.0369, + "step": 941 + }, + { + "epoch": 1.67, + "grad_norm": 0.0672825500369072, + "learning_rate": 0.00012508580236717768, + "loss": 0.0096, + "step": 942 + }, + { + "epoch": 1.67, + "grad_norm": 0.2634739875793457, + "learning_rate": 0.00012481006345446037, + "loss": 0.0515, + "step": 943 + }, + { + "epoch": 1.67, + "grad_norm": 0.24073559045791626, + "learning_rate": 0.00012453441210600322, + "loss": 0.028, + "step": 944 + }, + { + "epoch": 1.67, + "grad_norm": 0.20282107591629028, + "learning_rate": 0.00012425884928001453, + "loss": 0.0176, + "step": 945 + }, + { + "epoch": 1.67, + "grad_norm": 0.19443388283252716, + "learning_rate": 0.0001239833759343949, + "loss": 0.0242, + "step": 946 + }, + { + "epoch": 1.68, + "grad_norm": 0.26304328441619873, + "learning_rate": 0.00012370799302673384, + "loss": 0.018, + "step": 947 + }, + { + "epoch": 1.68, + "grad_norm": 0.29775339365005493, + "learning_rate": 0.00012343270151430659, + "loss": 0.0694, + "step": 948 + }, + { + "epoch": 1.68, + "grad_norm": 0.6493348479270935, + "learning_rate": 0.00012315750235407052, + "loss": 0.0985, + "step": 949 + }, + { + "epoch": 1.68, + "grad_norm": 0.24055354297161102, + "learning_rate": 0.0001228823965026621, + "loss": 0.0411, + "step": 950 + }, + { + "epoch": 1.68, + "grad_norm": 0.24912531673908234, + "learning_rate": 0.0001226073849163934, + "loss": 0.0299, + "step": 951 + }, + { + "epoch": 1.68, + "grad_norm": 0.17892080545425415, + "learning_rate": 0.00012233246855124875, + "loss": 0.0348, + "step": 952 + }, + { + "epoch": 1.69, + "grad_norm": 0.3222208321094513, + "learning_rate": 0.00012205764836288167, + "loss": 0.0402, + "step": 953 + }, + { + "epoch": 1.69, + "grad_norm": 0.17411519587039948, + "learning_rate": 0.00012178292530661106, + "loss": 0.0269, + "step": 954 + }, + { + "epoch": 1.69, + "grad_norm": 0.315654456615448, + "learning_rate": 0.00012150830033741843, + "loss": 0.0273, + "step": 955 + }, + { + "epoch": 1.69, + "grad_norm": 0.23225267231464386, + "learning_rate": 0.00012123377440994423, + "loss": 0.0316, + "step": 956 + }, + { + "epoch": 1.69, + "grad_norm": 0.31462836265563965, + "learning_rate": 0.00012095934847848452, + "loss": 0.071, + "step": 957 + }, + { + "epoch": 1.69, + "grad_norm": 0.36353588104248047, + "learning_rate": 0.00012068502349698805, + "loss": 0.0391, + "step": 958 + }, + { + "epoch": 1.7, + "grad_norm": 0.24388179183006287, + "learning_rate": 0.0001204108004190522, + "loss": 0.03, + "step": 959 + }, + { + "epoch": 1.7, + "grad_norm": 0.5372264981269836, + "learning_rate": 0.00012013668019792057, + "loss": 0.0585, + "step": 960 + }, + { + "epoch": 1.7, + "grad_norm": 0.09891072660684586, + "learning_rate": 0.000119862663786479, + "loss": 0.0116, + "step": 961 + }, + { + "epoch": 1.7, + "grad_norm": 0.37838393449783325, + "learning_rate": 0.0001195887521372524, + "loss": 0.0611, + "step": 962 + }, + { + "epoch": 1.7, + "grad_norm": 0.13470232486724854, + "learning_rate": 0.00011931494620240171, + "loss": 0.0211, + "step": 963 + }, + { + "epoch": 1.71, + "grad_norm": 0.5314339995384216, + "learning_rate": 0.00011904124693372024, + "loss": 0.0631, + "step": 964 + }, + { + "epoch": 1.71, + "grad_norm": 0.4563555121421814, + "learning_rate": 0.00011876765528263052, + "loss": 0.1134, + "step": 965 + }, + { + "epoch": 1.71, + "grad_norm": 0.2271527349948883, + "learning_rate": 0.00011849417220018107, + "loss": 0.0882, + "step": 966 + }, + { + "epoch": 1.71, + "grad_norm": 0.33432772755622864, + "learning_rate": 0.0001182207986370429, + "loss": 0.0908, + "step": 967 + }, + { + "epoch": 1.71, + "grad_norm": 0.28085678815841675, + "learning_rate": 0.00011794753554350634, + "loss": 0.0761, + "step": 968 + }, + { + "epoch": 1.71, + "grad_norm": 0.23504915833473206, + "learning_rate": 0.00011767438386947787, + "loss": 0.037, + "step": 969 + }, + { + "epoch": 1.72, + "grad_norm": 0.11846155673265457, + "learning_rate": 0.00011740134456447641, + "loss": 0.0245, + "step": 970 + }, + { + "epoch": 1.72, + "grad_norm": 0.16506996750831604, + "learning_rate": 0.00011712841857763042, + "loss": 0.0159, + "step": 971 + }, + { + "epoch": 1.72, + "grad_norm": 0.3428071141242981, + "learning_rate": 0.0001168556068576744, + "loss": 0.0586, + "step": 972 + }, + { + "epoch": 1.72, + "grad_norm": 0.41366422176361084, + "learning_rate": 0.00011658291035294564, + "loss": 0.0328, + "step": 973 + }, + { + "epoch": 1.72, + "grad_norm": 0.2286631166934967, + "learning_rate": 0.00011631033001138099, + "loss": 0.0615, + "step": 974 + }, + { + "epoch": 1.72, + "grad_norm": 0.2114325612783432, + "learning_rate": 0.0001160378667805134, + "loss": 0.0431, + "step": 975 + }, + { + "epoch": 1.73, + "grad_norm": 0.2964124083518982, + "learning_rate": 0.00011576552160746875, + "loss": 0.0921, + "step": 976 + }, + { + "epoch": 1.73, + "grad_norm": 0.23333175480365753, + "learning_rate": 0.0001154932954389626, + "loss": 0.0419, + "step": 977 + }, + { + "epoch": 1.73, + "grad_norm": 0.22366467118263245, + "learning_rate": 0.0001152211892212967, + "loss": 0.0426, + "step": 978 + }, + { + "epoch": 1.73, + "grad_norm": 0.10860833525657654, + "learning_rate": 0.00011494920390035602, + "loss": 0.0185, + "step": 979 + }, + { + "epoch": 1.73, + "grad_norm": 0.3171096444129944, + "learning_rate": 0.00011467734042160505, + "loss": 0.0547, + "step": 980 + }, + { + "epoch": 1.74, + "grad_norm": 0.18347975611686707, + "learning_rate": 0.00011440559973008493, + "loss": 0.0315, + "step": 981 + }, + { + "epoch": 1.74, + "grad_norm": 0.3099198639392853, + "learning_rate": 0.00011413398277040988, + "loss": 0.0552, + "step": 982 + }, + { + "epoch": 1.74, + "grad_norm": 0.166449636220932, + "learning_rate": 0.00011386249048676397, + "loss": 0.0314, + "step": 983 + }, + { + "epoch": 1.74, + "grad_norm": 0.1382310539484024, + "learning_rate": 0.00011359112382289807, + "loss": 0.0204, + "step": 984 + }, + { + "epoch": 1.74, + "grad_norm": 0.33030879497528076, + "learning_rate": 0.00011331988372212604, + "loss": 0.0351, + "step": 985 + }, + { + "epoch": 1.74, + "grad_norm": 0.10897009819746017, + "learning_rate": 0.00011304877112732208, + "loss": 0.0175, + "step": 986 + }, + { + "epoch": 1.75, + "grad_norm": 0.40048226714134216, + "learning_rate": 0.0001127777869809171, + "loss": 0.1135, + "step": 987 + }, + { + "epoch": 1.75, + "grad_norm": 0.22304043173789978, + "learning_rate": 0.00011250693222489535, + "loss": 0.0309, + "step": 988 + }, + { + "epoch": 1.75, + "grad_norm": 0.22760216891765594, + "learning_rate": 0.00011223620780079154, + "loss": 0.0207, + "step": 989 + }, + { + "epoch": 1.75, + "grad_norm": 0.19811508059501648, + "learning_rate": 0.00011196561464968713, + "loss": 0.0393, + "step": 990 + }, + { + "epoch": 1.75, + "grad_norm": 0.4455642104148865, + "learning_rate": 0.00011169515371220728, + "loss": 0.1054, + "step": 991 + }, + { + "epoch": 1.75, + "grad_norm": 0.0268537737429142, + "learning_rate": 0.00011142482592851769, + "loss": 0.0042, + "step": 992 + }, + { + "epoch": 1.76, + "grad_norm": 0.11751615256071091, + "learning_rate": 0.000111154632238321, + "loss": 0.0148, + "step": 993 + }, + { + "epoch": 1.76, + "grad_norm": 0.28696444630622864, + "learning_rate": 0.00011088457358085382, + "loss": 0.0545, + "step": 994 + }, + { + "epoch": 1.76, + "eval_loss": 0.0792541652917862, + "eval_runtime": 14.6647, + "eval_samples_per_second": 32.527, + "eval_steps_per_second": 8.183, + "step": 994 + }, + { + "epoch": 1.76, + "grad_norm": 0.1033347100019455, + "learning_rate": 0.00011061465089488348, + "loss": 0.0119, + "step": 995 + }, + { + "epoch": 1.76, + "grad_norm": 0.3465028703212738, + "learning_rate": 0.00011034486511870443, + "loss": 0.0479, + "step": 996 + }, + { + "epoch": 1.76, + "grad_norm": 0.4551747143268585, + "learning_rate": 0.00011007521719013536, + "loss": 0.0604, + "step": 997 + }, + { + "epoch": 1.77, + "grad_norm": 0.10942043364048004, + "learning_rate": 0.00010980570804651567, + "loss": 0.013, + "step": 998 + }, + { + "epoch": 1.77, + "grad_norm": 0.41811010241508484, + "learning_rate": 0.00010953633862470239, + "loss": 0.0966, + "step": 999 + }, + { + "epoch": 1.77, + "grad_norm": 0.12761569023132324, + "learning_rate": 0.00010926710986106691, + "loss": 0.0173, + "step": 1000 + }, + { + "epoch": 1.77, + "grad_norm": 0.263926237821579, + "learning_rate": 0.0001089980226914915, + "loss": 0.0825, + "step": 1001 + }, + { + "epoch": 1.77, + "grad_norm": 0.41034916043281555, + "learning_rate": 0.00010872907805136636, + "loss": 0.0674, + "step": 1002 + }, + { + "epoch": 1.77, + "grad_norm": 0.23882801830768585, + "learning_rate": 0.00010846027687558626, + "loss": 0.0398, + "step": 1003 + }, + { + "epoch": 1.78, + "grad_norm": 0.2567130923271179, + "learning_rate": 0.0001081916200985471, + "loss": 0.0408, + "step": 1004 + }, + { + "epoch": 1.78, + "grad_norm": 0.16284267604351044, + "learning_rate": 0.00010792310865414305, + "loss": 0.0243, + "step": 1005 + }, + { + "epoch": 1.78, + "grad_norm": 0.29053768515586853, + "learning_rate": 0.00010765474347576287, + "loss": 0.0765, + "step": 1006 + }, + { + "epoch": 1.78, + "grad_norm": 0.4428679943084717, + "learning_rate": 0.00010738652549628698, + "loss": 0.0808, + "step": 1007 + }, + { + "epoch": 1.78, + "grad_norm": 0.3464658856391907, + "learning_rate": 0.00010711845564808416, + "loss": 0.0588, + "step": 1008 + }, + { + "epoch": 1.79, + "grad_norm": 0.2980837821960449, + "learning_rate": 0.00010685053486300813, + "loss": 0.0558, + "step": 1009 + }, + { + "epoch": 1.79, + "grad_norm": 0.11548680812120438, + "learning_rate": 0.0001065827640723946, + "loss": 0.0243, + "step": 1010 + }, + { + "epoch": 1.79, + "grad_norm": 0.11409750580787659, + "learning_rate": 0.0001063151442070577, + "loss": 0.0194, + "step": 1011 + }, + { + "epoch": 1.79, + "grad_norm": 0.24823527038097382, + "learning_rate": 0.00010604767619728706, + "loss": 0.0569, + "step": 1012 + }, + { + "epoch": 1.79, + "grad_norm": 0.15851640701293945, + "learning_rate": 0.00010578036097284441, + "loss": 0.0304, + "step": 1013 + }, + { + "epoch": 1.79, + "grad_norm": 0.09604015946388245, + "learning_rate": 0.00010551319946296026, + "loss": 0.0193, + "step": 1014 + }, + { + "epoch": 1.8, + "grad_norm": 0.39140328764915466, + "learning_rate": 0.00010524619259633097, + "loss": 0.0595, + "step": 1015 + }, + { + "epoch": 1.8, + "grad_norm": 0.12756693363189697, + "learning_rate": 0.00010497934130111524, + "loss": 0.0214, + "step": 1016 + }, + { + "epoch": 1.8, + "grad_norm": 0.23306307196617126, + "learning_rate": 0.00010471264650493093, + "loss": 0.0436, + "step": 1017 + }, + { + "epoch": 1.8, + "grad_norm": 0.1584581732749939, + "learning_rate": 0.00010444610913485194, + "loss": 0.0232, + "step": 1018 + }, + { + "epoch": 1.8, + "grad_norm": 0.13806113600730896, + "learning_rate": 0.00010417973011740491, + "loss": 0.0207, + "step": 1019 + }, + { + "epoch": 1.8, + "grad_norm": 0.36593613028526306, + "learning_rate": 0.00010391351037856604, + "loss": 0.1585, + "step": 1020 + }, + { + "epoch": 1.81, + "grad_norm": 0.22285635769367218, + "learning_rate": 0.0001036474508437579, + "loss": 0.0265, + "step": 1021 + }, + { + "epoch": 1.81, + "grad_norm": 0.05805375054478645, + "learning_rate": 0.00010338155243784604, + "loss": 0.0091, + "step": 1022 + }, + { + "epoch": 1.81, + "grad_norm": 0.14522014558315277, + "learning_rate": 0.00010311581608513602, + "loss": 0.021, + "step": 1023 + }, + { + "epoch": 1.81, + "grad_norm": 0.2817407250404358, + "learning_rate": 0.00010285024270937002, + "loss": 0.0402, + "step": 1024 + }, + { + "epoch": 1.81, + "grad_norm": 0.323799729347229, + "learning_rate": 0.00010258483323372363, + "loss": 0.0892, + "step": 1025 + }, + { + "epoch": 1.82, + "grad_norm": 0.26789921522140503, + "learning_rate": 0.0001023195885808029, + "loss": 0.0513, + "step": 1026 + }, + { + "epoch": 1.82, + "grad_norm": 0.17003491520881653, + "learning_rate": 0.00010205450967264066, + "loss": 0.0254, + "step": 1027 + }, + { + "epoch": 1.82, + "grad_norm": 0.20790556073188782, + "learning_rate": 0.00010178959743069378, + "loss": 0.0548, + "step": 1028 + }, + { + "epoch": 1.82, + "grad_norm": 0.5027781128883362, + "learning_rate": 0.00010152485277583974, + "loss": 0.0664, + "step": 1029 + }, + { + "epoch": 1.82, + "grad_norm": 0.06910550594329834, + "learning_rate": 0.0001012602766283733, + "loss": 0.0044, + "step": 1030 + }, + { + "epoch": 1.82, + "grad_norm": 0.4314959943294525, + "learning_rate": 0.00010099586990800374, + "loss": 0.0266, + "step": 1031 + }, + { + "epoch": 1.83, + "grad_norm": 0.11808943003416061, + "learning_rate": 0.00010073163353385115, + "loss": 0.0138, + "step": 1032 + }, + { + "epoch": 1.83, + "grad_norm": 0.3632063865661621, + "learning_rate": 0.00010046756842444354, + "loss": 0.0569, + "step": 1033 + }, + { + "epoch": 1.83, + "grad_norm": 0.36305737495422363, + "learning_rate": 0.00010020367549771366, + "loss": 0.0413, + "step": 1034 + }, + { + "epoch": 1.83, + "grad_norm": 0.4507397711277008, + "learning_rate": 9.993995567099555e-05, + "loss": 0.0685, + "step": 1035 + }, + { + "epoch": 1.83, + "grad_norm": 0.1823229193687439, + "learning_rate": 9.96764098610218e-05, + "loss": 0.0154, + "step": 1036 + }, + { + "epoch": 1.83, + "grad_norm": 0.4844795763492584, + "learning_rate": 9.941303898391971e-05, + "loss": 0.0643, + "step": 1037 + }, + { + "epoch": 1.84, + "grad_norm": 0.44893908500671387, + "learning_rate": 9.914984395520884e-05, + "loss": 0.0849, + "step": 1038 + }, + { + "epoch": 1.84, + "grad_norm": 0.6581425666809082, + "learning_rate": 9.888682568979734e-05, + "loss": 0.1001, + "step": 1039 + }, + { + "epoch": 1.84, + "grad_norm": 0.19968827068805695, + "learning_rate": 9.862398510197874e-05, + "loss": 0.0261, + "step": 1040 + }, + { + "epoch": 1.84, + "grad_norm": 0.4013736844062805, + "learning_rate": 9.836132310542926e-05, + "loss": 0.0443, + "step": 1041 + }, + { + "epoch": 1.84, + "grad_norm": 0.35353174805641174, + "learning_rate": 9.809884061320407e-05, + "loss": 0.0757, + "step": 1042 + }, + { + "epoch": 1.85, + "grad_norm": 0.3231182396411896, + "learning_rate": 9.783653853773438e-05, + "loss": 0.0618, + "step": 1043 + }, + { + "epoch": 1.85, + "grad_norm": 0.2694651186466217, + "learning_rate": 9.757441779082433e-05, + "loss": 0.0273, + "step": 1044 + }, + { + "epoch": 1.85, + "grad_norm": 0.18274915218353271, + "learning_rate": 9.731247928364765e-05, + "loss": 0.0308, + "step": 1045 + }, + { + "epoch": 1.85, + "grad_norm": 0.38829588890075684, + "learning_rate": 9.705072392674457e-05, + "loss": 0.0639, + "step": 1046 + }, + { + "epoch": 1.85, + "grad_norm": 0.33814239501953125, + "learning_rate": 9.678915263001884e-05, + "loss": 0.0469, + "step": 1047 + }, + { + "epoch": 1.85, + "grad_norm": 0.36192476749420166, + "learning_rate": 9.65277663027341e-05, + "loss": 0.0441, + "step": 1048 + }, + { + "epoch": 1.86, + "grad_norm": 0.613652229309082, + "learning_rate": 9.626656585351127e-05, + "loss": 0.0616, + "step": 1049 + }, + { + "epoch": 1.86, + "grad_norm": 0.4053266644477844, + "learning_rate": 9.60055521903249e-05, + "loss": 0.052, + "step": 1050 + }, + { + "epoch": 1.86, + "grad_norm": 0.1685592234134674, + "learning_rate": 9.574472622050037e-05, + "loss": 0.0216, + "step": 1051 + }, + { + "epoch": 1.86, + "grad_norm": 0.3046499788761139, + "learning_rate": 9.548408885071069e-05, + "loss": 0.0416, + "step": 1052 + }, + { + "epoch": 1.86, + "grad_norm": 0.3596743643283844, + "learning_rate": 9.522364098697309e-05, + "loss": 0.0679, + "step": 1053 + }, + { + "epoch": 1.86, + "grad_norm": 0.5623247623443604, + "learning_rate": 9.496338353464612e-05, + "loss": 0.0837, + "step": 1054 + }, + { + "epoch": 1.87, + "grad_norm": 0.10884875059127808, + "learning_rate": 9.470331739842645e-05, + "loss": 0.0117, + "step": 1055 + }, + { + "epoch": 1.87, + "grad_norm": 0.31701669096946716, + "learning_rate": 9.444344348234564e-05, + "loss": 0.0323, + "step": 1056 + }, + { + "epoch": 1.87, + "grad_norm": 0.4528666138648987, + "learning_rate": 9.418376268976726e-05, + "loss": 0.1085, + "step": 1057 + }, + { + "epoch": 1.87, + "grad_norm": 0.28140756487846375, + "learning_rate": 9.392427592338317e-05, + "loss": 0.0216, + "step": 1058 + }, + { + "epoch": 1.87, + "grad_norm": 0.20548555254936218, + "learning_rate": 9.366498408521118e-05, + "loss": 0.0252, + "step": 1059 + }, + { + "epoch": 1.88, + "grad_norm": 0.17361027002334595, + "learning_rate": 9.340588807659126e-05, + "loss": 0.0318, + "step": 1060 + }, + { + "epoch": 1.88, + "grad_norm": 0.2615595757961273, + "learning_rate": 9.314698879818263e-05, + "loss": 0.0254, + "step": 1061 + }, + { + "epoch": 1.88, + "grad_norm": 0.22027018666267395, + "learning_rate": 9.288828714996086e-05, + "loss": 0.057, + "step": 1062 + }, + { + "epoch": 1.88, + "grad_norm": 0.38918057084083557, + "learning_rate": 9.262978403121419e-05, + "loss": 0.0337, + "step": 1063 + }, + { + "epoch": 1.88, + "grad_norm": 0.19435515999794006, + "learning_rate": 9.237148034054107e-05, + "loss": 0.0304, + "step": 1064 + }, + { + "epoch": 1.88, + "grad_norm": 0.3071884512901306, + "learning_rate": 9.211337697584653e-05, + "loss": 0.0357, + "step": 1065 + }, + { + "epoch": 1.89, + "grad_norm": 0.17016515135765076, + "learning_rate": 9.185547483433921e-05, + "loss": 0.018, + "step": 1066 + }, + { + "epoch": 1.89, + "grad_norm": 0.46096545457839966, + "learning_rate": 9.159777481252838e-05, + "loss": 0.0846, + "step": 1067 + }, + { + "epoch": 1.89, + "grad_norm": 0.294148325920105, + "learning_rate": 9.134027780622068e-05, + "loss": 0.0521, + "step": 1068 + }, + { + "epoch": 1.89, + "grad_norm": 0.19079583883285522, + "learning_rate": 9.108298471051694e-05, + "loss": 0.0218, + "step": 1069 + }, + { + "epoch": 1.89, + "grad_norm": 0.4090532958507538, + "learning_rate": 9.08258964198093e-05, + "loss": 0.0768, + "step": 1070 + }, + { + "epoch": 1.89, + "grad_norm": 0.11493656784296036, + "learning_rate": 9.056901382777784e-05, + "loss": 0.0153, + "step": 1071 + }, + { + "epoch": 1.9, + "grad_norm": 0.37453579902648926, + "learning_rate": 9.031233782738764e-05, + "loss": 0.0453, + "step": 1072 + }, + { + "epoch": 1.9, + "grad_norm": 0.36182692646980286, + "learning_rate": 9.005586931088577e-05, + "loss": 0.0753, + "step": 1073 + }, + { + "epoch": 1.9, + "grad_norm": 0.33530041575431824, + "learning_rate": 8.979960916979781e-05, + "loss": 0.0513, + "step": 1074 + }, + { + "epoch": 1.9, + "grad_norm": 0.35975998640060425, + "learning_rate": 8.95435582949252e-05, + "loss": 0.0459, + "step": 1075 + }, + { + "epoch": 1.9, + "grad_norm": 0.33220142126083374, + "learning_rate": 8.928771757634177e-05, + "loss": 0.0689, + "step": 1076 + }, + { + "epoch": 1.91, + "grad_norm": 0.19460581243038177, + "learning_rate": 8.903208790339093e-05, + "loss": 0.0303, + "step": 1077 + }, + { + "epoch": 1.91, + "grad_norm": 0.2682279944419861, + "learning_rate": 8.87766701646825e-05, + "loss": 0.0288, + "step": 1078 + }, + { + "epoch": 1.91, + "grad_norm": 0.31635788083076477, + "learning_rate": 8.852146524808943e-05, + "loss": 0.0559, + "step": 1079 + }, + { + "epoch": 1.91, + "grad_norm": 0.14291277527809143, + "learning_rate": 8.826647404074496e-05, + "loss": 0.0248, + "step": 1080 + }, + { + "epoch": 1.91, + "grad_norm": 0.2888137400150299, + "learning_rate": 8.801169742903948e-05, + "loss": 0.0913, + "step": 1081 + }, + { + "epoch": 1.91, + "grad_norm": 0.17214493453502655, + "learning_rate": 8.775713629861726e-05, + "loss": 0.029, + "step": 1082 + }, + { + "epoch": 1.92, + "grad_norm": 0.2867276668548584, + "learning_rate": 8.750279153437373e-05, + "loss": 0.0378, + "step": 1083 + }, + { + "epoch": 1.92, + "grad_norm": 0.210460364818573, + "learning_rate": 8.724866402045208e-05, + "loss": 0.0699, + "step": 1084 + }, + { + "epoch": 1.92, + "grad_norm": 0.3305147588253021, + "learning_rate": 8.69947546402402e-05, + "loss": 0.0423, + "step": 1085 + }, + { + "epoch": 1.92, + "grad_norm": 0.1408224254846573, + "learning_rate": 8.674106427636799e-05, + "loss": 0.0174, + "step": 1086 + }, + { + "epoch": 1.92, + "grad_norm": 0.1414298713207245, + "learning_rate": 8.648759381070368e-05, + "loss": 0.0207, + "step": 1087 + }, + { + "epoch": 1.92, + "grad_norm": 0.26083138585090637, + "learning_rate": 8.623434412435143e-05, + "loss": 0.0345, + "step": 1088 + }, + { + "epoch": 1.93, + "grad_norm": 0.05916636809706688, + "learning_rate": 8.598131609764768e-05, + "loss": 0.0066, + "step": 1089 + }, + { + "epoch": 1.93, + "grad_norm": 0.26731768250465393, + "learning_rate": 8.572851061015842e-05, + "loss": 0.0863, + "step": 1090 + }, + { + "epoch": 1.93, + "grad_norm": 0.21286404132843018, + "learning_rate": 8.547592854067616e-05, + "loss": 0.0262, + "step": 1091 + }, + { + "epoch": 1.93, + "grad_norm": 0.1005811020731926, + "learning_rate": 8.522357076721658e-05, + "loss": 0.0115, + "step": 1092 + }, + { + "epoch": 1.93, + "grad_norm": 0.24411919713020325, + "learning_rate": 8.49714381670159e-05, + "loss": 0.0517, + "step": 1093 + }, + { + "epoch": 1.94, + "grad_norm": 0.1755681186914444, + "learning_rate": 8.471953161652742e-05, + "loss": 0.0183, + "step": 1094 + }, + { + "epoch": 1.94, + "grad_norm": 0.2896975874900818, + "learning_rate": 8.446785199141869e-05, + "loss": 0.0555, + "step": 1095 + }, + { + "epoch": 1.94, + "grad_norm": 0.28867337107658386, + "learning_rate": 8.421640016656856e-05, + "loss": 0.0239, + "step": 1096 + }, + { + "epoch": 1.94, + "grad_norm": 0.15457966923713684, + "learning_rate": 8.396517701606377e-05, + "loss": 0.0137, + "step": 1097 + }, + { + "epoch": 1.94, + "grad_norm": 0.21171018481254578, + "learning_rate": 8.371418341319642e-05, + "loss": 0.0228, + "step": 1098 + }, + { + "epoch": 1.94, + "grad_norm": 0.3345678746700287, + "learning_rate": 8.346342023046048e-05, + "loss": 0.079, + "step": 1099 + }, + { + "epoch": 1.95, + "grad_norm": 0.2039625495672226, + "learning_rate": 8.321288833954895e-05, + "loss": 0.0498, + "step": 1100 + }, + { + "epoch": 1.95, + "grad_norm": 0.36770549416542053, + "learning_rate": 8.296258861135097e-05, + "loss": 0.0346, + "step": 1101 + }, + { + "epoch": 1.95, + "grad_norm": 0.212180495262146, + "learning_rate": 8.271252191594845e-05, + "loss": 0.0505, + "step": 1102 + }, + { + "epoch": 1.95, + "grad_norm": 0.45844799280166626, + "learning_rate": 8.246268912261342e-05, + "loss": 0.0539, + "step": 1103 + }, + { + "epoch": 1.95, + "grad_norm": 0.26958250999450684, + "learning_rate": 8.221309109980472e-05, + "loss": 0.0254, + "step": 1104 + }, + { + "epoch": 1.95, + "grad_norm": 0.3047500252723694, + "learning_rate": 8.196372871516502e-05, + "loss": 0.0519, + "step": 1105 + }, + { + "epoch": 1.96, + "grad_norm": 0.5248620510101318, + "learning_rate": 8.171460283551809e-05, + "loss": 0.0376, + "step": 1106 + }, + { + "epoch": 1.96, + "grad_norm": 0.3489890396595001, + "learning_rate": 8.146571432686543e-05, + "loss": 0.038, + "step": 1107 + }, + { + "epoch": 1.96, + "grad_norm": 0.04020179435610771, + "learning_rate": 8.121706405438345e-05, + "loss": 0.0049, + "step": 1108 + }, + { + "epoch": 1.96, + "grad_norm": 0.20535163581371307, + "learning_rate": 8.09686528824203e-05, + "loss": 0.0163, + "step": 1109 + }, + { + "epoch": 1.96, + "grad_norm": 0.5453580021858215, + "learning_rate": 8.072048167449306e-05, + "loss": 0.0463, + "step": 1110 + }, + { + "epoch": 1.97, + "grad_norm": 0.3431243896484375, + "learning_rate": 8.047255129328466e-05, + "loss": 0.0576, + "step": 1111 + }, + { + "epoch": 1.97, + "grad_norm": 0.29239121079444885, + "learning_rate": 8.022486260064099e-05, + "loss": 0.037, + "step": 1112 + }, + { + "epoch": 1.97, + "grad_norm": 0.5388960838317871, + "learning_rate": 7.997741645756762e-05, + "loss": 0.1145, + "step": 1113 + }, + { + "epoch": 1.97, + "grad_norm": 0.3808930814266205, + "learning_rate": 7.9730213724227e-05, + "loss": 0.047, + "step": 1114 + }, + { + "epoch": 1.97, + "grad_norm": 0.39548593759536743, + "learning_rate": 7.948325525993545e-05, + "loss": 0.0197, + "step": 1115 + }, + { + "epoch": 1.97, + "grad_norm": 0.03463061898946762, + "learning_rate": 7.923654192316021e-05, + "loss": 0.0043, + "step": 1116 + }, + { + "epoch": 1.98, + "grad_norm": 0.12383898347616196, + "learning_rate": 7.899007457151657e-05, + "loss": 0.0132, + "step": 1117 + }, + { + "epoch": 1.98, + "grad_norm": 0.3975280821323395, + "learning_rate": 7.874385406176444e-05, + "loss": 0.0338, + "step": 1118 + }, + { + "epoch": 1.98, + "grad_norm": 0.17554999887943268, + "learning_rate": 7.849788124980581e-05, + "loss": 0.0146, + "step": 1119 + }, + { + "epoch": 1.98, + "grad_norm": 0.1743941754102707, + "learning_rate": 7.825215699068171e-05, + "loss": 0.0142, + "step": 1120 + }, + { + "epoch": 1.98, + "grad_norm": 0.15461145341396332, + "learning_rate": 7.800668213856896e-05, + "loss": 0.0145, + "step": 1121 + }, + { + "epoch": 1.98, + "grad_norm": 0.25048068165779114, + "learning_rate": 7.776145754677773e-05, + "loss": 0.0269, + "step": 1122 + }, + { + "epoch": 1.99, + "grad_norm": 0.10131586343050003, + "learning_rate": 7.751648406774778e-05, + "loss": 0.0126, + "step": 1123 + }, + { + "epoch": 1.99, + "grad_norm": 0.7245112061500549, + "learning_rate": 7.727176255304633e-05, + "loss": 0.1092, + "step": 1124 + }, + { + "epoch": 1.99, + "grad_norm": 0.3287739157676697, + "learning_rate": 7.702729385336469e-05, + "loss": 0.0286, + "step": 1125 + }, + { + "epoch": 1.99, + "grad_norm": 0.3910561203956604, + "learning_rate": 7.678307881851516e-05, + "loss": 0.0365, + "step": 1126 + }, + { + "epoch": 1.99, + "grad_norm": 0.31922951340675354, + "learning_rate": 7.653911829742857e-05, + "loss": 0.1163, + "step": 1127 + }, + { + "epoch": 2.0, + "grad_norm": 0.2800161838531494, + "learning_rate": 7.629541313815056e-05, + "loss": 0.0653, + "step": 1128 + }, + { + "epoch": 2.0, + "grad_norm": 0.11548816412687302, + "learning_rate": 7.605196418783948e-05, + "loss": 0.0081, + "step": 1129 + }, + { + "epoch": 2.0, + "grad_norm": 0.5253295302391052, + "learning_rate": 7.580877229276301e-05, + "loss": 0.0402, + "step": 1130 + }, + { + "epoch": 2.0, + "grad_norm": 0.06854914873838425, + "learning_rate": 7.55658382982951e-05, + "loss": 0.0072, + "step": 1131 + }, + { + "epoch": 2.0, + "grad_norm": 0.06590896844863892, + "learning_rate": 7.532316304891326e-05, + "loss": 0.0086, + "step": 1132 + }, + { + "epoch": 2.0, + "grad_norm": 0.05458898842334747, + "learning_rate": 7.508074738819568e-05, + "loss": 0.0066, + "step": 1133 + }, + { + "epoch": 2.01, + "grad_norm": 0.05792460963129997, + "learning_rate": 7.483859215881798e-05, + "loss": 0.0077, + "step": 1134 + }, + { + "epoch": 2.01, + "grad_norm": 0.06486326456069946, + "learning_rate": 7.459669820255067e-05, + "loss": 0.0061, + "step": 1135 + }, + { + "epoch": 2.01, + "grad_norm": 0.03994043543934822, + "learning_rate": 7.43550663602559e-05, + "loss": 0.0044, + "step": 1136 + }, + { + "epoch": 2.01, + "eval_loss": 0.08185647428035736, + "eval_runtime": 14.6532, + "eval_samples_per_second": 32.553, + "eval_steps_per_second": 8.189, + "step": 1136 + }, + { + "epoch": 2.01, + "grad_norm": 0.052452631294727325, + "learning_rate": 7.411369747188468e-05, + "loss": 0.0055, + "step": 1137 + }, + { + "epoch": 2.01, + "grad_norm": 0.04694485291838646, + "learning_rate": 7.387259237647407e-05, + "loss": 0.006, + "step": 1138 + }, + { + "epoch": 2.02, + "grad_norm": 0.15115123987197876, + "learning_rate": 7.363175191214394e-05, + "loss": 0.0157, + "step": 1139 + }, + { + "epoch": 2.02, + "grad_norm": 0.09530965983867645, + "learning_rate": 7.339117691609453e-05, + "loss": 0.0105, + "step": 1140 + }, + { + "epoch": 2.02, + "grad_norm": 0.07174500077962875, + "learning_rate": 7.315086822460303e-05, + "loss": 0.0063, + "step": 1141 + }, + { + "epoch": 2.02, + "grad_norm": 0.023437095806002617, + "learning_rate": 7.291082667302095e-05, + "loss": 0.0038, + "step": 1142 + }, + { + "epoch": 2.02, + "grad_norm": 0.28587350249290466, + "learning_rate": 7.267105309577132e-05, + "loss": 0.0669, + "step": 1143 + }, + { + "epoch": 2.02, + "grad_norm": 0.011934146285057068, + "learning_rate": 7.243154832634547e-05, + "loss": 0.0021, + "step": 1144 + }, + { + "epoch": 2.03, + "grad_norm": 0.05439841374754906, + "learning_rate": 7.21923131973005e-05, + "loss": 0.0055, + "step": 1145 + }, + { + "epoch": 2.03, + "grad_norm": 0.24451234936714172, + "learning_rate": 7.195334854025606e-05, + "loss": 0.0181, + "step": 1146 + }, + { + "epoch": 2.03, + "grad_norm": 0.04562409222126007, + "learning_rate": 7.171465518589153e-05, + "loss": 0.0049, + "step": 1147 + }, + { + "epoch": 2.03, + "grad_norm": 0.020290831103920937, + "learning_rate": 7.147623396394345e-05, + "loss": 0.0034, + "step": 1148 + }, + { + "epoch": 2.03, + "grad_norm": 0.01824161596596241, + "learning_rate": 7.123808570320211e-05, + "loss": 0.0027, + "step": 1149 + }, + { + "epoch": 2.03, + "grad_norm": 0.25729525089263916, + "learning_rate": 7.100021123150916e-05, + "loss": 0.0142, + "step": 1150 + }, + { + "epoch": 2.04, + "grad_norm": 0.23073968291282654, + "learning_rate": 7.07626113757544e-05, + "loss": 0.0529, + "step": 1151 + }, + { + "epoch": 2.04, + "grad_norm": 0.029780661687254906, + "learning_rate": 7.052528696187294e-05, + "loss": 0.004, + "step": 1152 + }, + { + "epoch": 2.04, + "grad_norm": 0.03356673941016197, + "learning_rate": 7.028823881484269e-05, + "loss": 0.0036, + "step": 1153 + }, + { + "epoch": 2.04, + "grad_norm": 0.0346224419772625, + "learning_rate": 7.005146775868085e-05, + "loss": 0.0038, + "step": 1154 + }, + { + "epoch": 2.04, + "grad_norm": 0.09049813449382782, + "learning_rate": 6.981497461644176e-05, + "loss": 0.005, + "step": 1155 + }, + { + "epoch": 2.05, + "grad_norm": 0.056721314787864685, + "learning_rate": 6.957876021021343e-05, + "loss": 0.0037, + "step": 1156 + }, + { + "epoch": 2.05, + "grad_norm": 0.04788971692323685, + "learning_rate": 6.934282536111497e-05, + "loss": 0.004, + "step": 1157 + }, + { + "epoch": 2.05, + "grad_norm": 0.01849561557173729, + "learning_rate": 6.910717088929383e-05, + "loss": 0.0023, + "step": 1158 + }, + { + "epoch": 2.05, + "grad_norm": 0.012392883189022541, + "learning_rate": 6.887179761392282e-05, + "loss": 0.0015, + "step": 1159 + }, + { + "epoch": 2.05, + "grad_norm": 0.04246802628040314, + "learning_rate": 6.863670635319714e-05, + "loss": 0.003, + "step": 1160 + }, + { + "epoch": 2.05, + "grad_norm": 0.13071779906749725, + "learning_rate": 6.84018979243317e-05, + "loss": 0.0075, + "step": 1161 + }, + { + "epoch": 2.06, + "grad_norm": 0.016270257532596588, + "learning_rate": 6.816737314355825e-05, + "loss": 0.0018, + "step": 1162 + }, + { + "epoch": 2.06, + "grad_norm": 0.029110154137015343, + "learning_rate": 6.793313282612257e-05, + "loss": 0.0028, + "step": 1163 + }, + { + "epoch": 2.06, + "grad_norm": 0.06283704191446304, + "learning_rate": 6.769917778628164e-05, + "loss": 0.0025, + "step": 1164 + }, + { + "epoch": 2.06, + "grad_norm": 0.011395593173801899, + "learning_rate": 6.746550883730067e-05, + "loss": 0.0013, + "step": 1165 + }, + { + "epoch": 2.06, + "grad_norm": 0.6366950869560242, + "learning_rate": 6.723212679145038e-05, + "loss": 0.035, + "step": 1166 + }, + { + "epoch": 2.06, + "grad_norm": 0.17910248041152954, + "learning_rate": 6.699903246000417e-05, + "loss": 0.0074, + "step": 1167 + }, + { + "epoch": 2.07, + "grad_norm": 0.011851671151816845, + "learning_rate": 6.676622665323537e-05, + "loss": 0.0015, + "step": 1168 + }, + { + "epoch": 2.07, + "grad_norm": 0.2713810205459595, + "learning_rate": 6.653371018041438e-05, + "loss": 0.0427, + "step": 1169 + }, + { + "epoch": 2.07, + "grad_norm": 0.015064065344631672, + "learning_rate": 6.630148384980567e-05, + "loss": 0.0016, + "step": 1170 + }, + { + "epoch": 2.07, + "grad_norm": 0.040614135563373566, + "learning_rate": 6.606954846866519e-05, + "loss": 0.0028, + "step": 1171 + }, + { + "epoch": 2.07, + "grad_norm": 0.013327802531421185, + "learning_rate": 6.583790484323762e-05, + "loss": 0.0016, + "step": 1172 + }, + { + "epoch": 2.08, + "grad_norm": 0.028794527053833008, + "learning_rate": 6.560655377875328e-05, + "loss": 0.0027, + "step": 1173 + }, + { + "epoch": 2.08, + "grad_norm": 0.01775909960269928, + "learning_rate": 6.537549607942571e-05, + "loss": 0.0013, + "step": 1174 + }, + { + "epoch": 2.08, + "grad_norm": 0.03265360742807388, + "learning_rate": 6.514473254844833e-05, + "loss": 0.0021, + "step": 1175 + }, + { + "epoch": 2.08, + "grad_norm": 0.14050878584384918, + "learning_rate": 6.491426398799231e-05, + "loss": 0.0074, + "step": 1176 + }, + { + "epoch": 2.08, + "grad_norm": 0.6901656985282898, + "learning_rate": 6.468409119920337e-05, + "loss": 0.0547, + "step": 1177 + }, + { + "epoch": 2.08, + "grad_norm": 0.22920645773410797, + "learning_rate": 6.445421498219904e-05, + "loss": 0.0175, + "step": 1178 + }, + { + "epoch": 2.09, + "grad_norm": 0.008751977235078812, + "learning_rate": 6.422463613606586e-05, + "loss": 0.0011, + "step": 1179 + }, + { + "epoch": 2.09, + "grad_norm": 0.02107217162847519, + "learning_rate": 6.399535545885673e-05, + "loss": 0.002, + "step": 1180 + }, + { + "epoch": 2.09, + "grad_norm": 0.05465129017829895, + "learning_rate": 6.376637374758807e-05, + "loss": 0.003, + "step": 1181 + }, + { + "epoch": 2.09, + "grad_norm": 0.05943121016025543, + "learning_rate": 6.353769179823712e-05, + "loss": 0.0033, + "step": 1182 + }, + { + "epoch": 2.09, + "grad_norm": 0.025792196393013, + "learning_rate": 6.330931040573895e-05, + "loss": 0.0024, + "step": 1183 + }, + { + "epoch": 2.09, + "grad_norm": 0.009321498684585094, + "learning_rate": 6.308123036398388e-05, + "loss": 0.001, + "step": 1184 + }, + { + "epoch": 2.1, + "grad_norm": 0.039804425090551376, + "learning_rate": 6.285345246581482e-05, + "loss": 0.0019, + "step": 1185 + }, + { + "epoch": 2.1, + "grad_norm": 0.032443080097436905, + "learning_rate": 6.262597750302417e-05, + "loss": 0.0025, + "step": 1186 + }, + { + "epoch": 2.1, + "grad_norm": 0.01482646819204092, + "learning_rate": 6.23988062663515e-05, + "loss": 0.0014, + "step": 1187 + }, + { + "epoch": 2.1, + "grad_norm": 0.004279375541955233, + "learning_rate": 6.217193954548041e-05, + "loss": 0.0005, + "step": 1188 + }, + { + "epoch": 2.1, + "grad_norm": 0.04546025022864342, + "learning_rate": 6.194537812903598e-05, + "loss": 0.0021, + "step": 1189 + }, + { + "epoch": 2.11, + "grad_norm": 0.09013216942548752, + "learning_rate": 6.171912280458213e-05, + "loss": 0.0042, + "step": 1190 + }, + { + "epoch": 2.11, + "grad_norm": 0.018135396763682365, + "learning_rate": 6.149317435861854e-05, + "loss": 0.0017, + "step": 1191 + }, + { + "epoch": 2.11, + "grad_norm": 0.009064175188541412, + "learning_rate": 6.12675335765784e-05, + "loss": 0.0009, + "step": 1192 + }, + { + "epoch": 2.11, + "grad_norm": 0.07935329526662827, + "learning_rate": 6.104220124282517e-05, + "loss": 0.0054, + "step": 1193 + }, + { + "epoch": 2.11, + "grad_norm": 0.017721056938171387, + "learning_rate": 6.081717814065014e-05, + "loss": 0.0014, + "step": 1194 + }, + { + "epoch": 2.11, + "grad_norm": 0.004983251914381981, + "learning_rate": 6.059246505226984e-05, + "loss": 0.0005, + "step": 1195 + }, + { + "epoch": 2.12, + "grad_norm": 0.013217182829976082, + "learning_rate": 6.036806275882288e-05, + "loss": 0.0011, + "step": 1196 + }, + { + "epoch": 2.12, + "grad_norm": 0.49577173590660095, + "learning_rate": 6.014397204036775e-05, + "loss": 0.0247, + "step": 1197 + }, + { + "epoch": 2.12, + "grad_norm": 0.03079071268439293, + "learning_rate": 5.992019367587967e-05, + "loss": 0.0013, + "step": 1198 + }, + { + "epoch": 2.12, + "grad_norm": 0.024660365656018257, + "learning_rate": 5.969672844324809e-05, + "loss": 0.002, + "step": 1199 + }, + { + "epoch": 2.12, + "grad_norm": 0.006923030596226454, + "learning_rate": 5.9473577119274094e-05, + "loss": 0.0007, + "step": 1200 + }, + { + "epoch": 2.12, + "grad_norm": 0.43394070863723755, + "learning_rate": 5.9250740479667393e-05, + "loss": 0.0121, + "step": 1201 + }, + { + "epoch": 2.13, + "grad_norm": 0.0026637266855686903, + "learning_rate": 5.9028219299043996e-05, + "loss": 0.0003, + "step": 1202 + }, + { + "epoch": 2.13, + "grad_norm": 0.07496347278356552, + "learning_rate": 5.880601435092315e-05, + "loss": 0.0023, + "step": 1203 + }, + { + "epoch": 2.13, + "grad_norm": 0.21698249876499176, + "learning_rate": 5.858412640772487e-05, + "loss": 0.0044, + "step": 1204 + }, + { + "epoch": 2.13, + "grad_norm": 0.02845817059278488, + "learning_rate": 5.836255624076732e-05, + "loss": 0.0012, + "step": 1205 + }, + { + "epoch": 2.13, + "grad_norm": 0.006147551350295544, + "learning_rate": 5.814130462026387e-05, + "loss": 0.0007, + "step": 1206 + }, + { + "epoch": 2.14, + "grad_norm": 0.1702822744846344, + "learning_rate": 5.792037231532072e-05, + "loss": 0.0046, + "step": 1207 + }, + { + "epoch": 2.14, + "grad_norm": 0.023039784282445908, + "learning_rate": 5.769976009393396e-05, + "loss": 0.0012, + "step": 1208 + }, + { + "epoch": 2.14, + "grad_norm": 0.1928468644618988, + "learning_rate": 5.7479468722986995e-05, + "loss": 0.009, + "step": 1209 + }, + { + "epoch": 2.14, + "grad_norm": 0.008321943692862988, + "learning_rate": 5.725949896824805e-05, + "loss": 0.0005, + "step": 1210 + }, + { + "epoch": 2.14, + "grad_norm": 0.0016513692680746317, + "learning_rate": 5.703985159436731e-05, + "loss": 0.0002, + "step": 1211 + }, + { + "epoch": 2.14, + "grad_norm": 0.019418880343437195, + "learning_rate": 5.682052736487422e-05, + "loss": 0.0014, + "step": 1212 + }, + { + "epoch": 2.15, + "grad_norm": 0.0035696502309292555, + "learning_rate": 5.6601527042175e-05, + "loss": 0.0004, + "step": 1213 + }, + { + "epoch": 2.15, + "grad_norm": 0.007776456885039806, + "learning_rate": 5.638285138754984e-05, + "loss": 0.0005, + "step": 1214 + }, + { + "epoch": 2.15, + "grad_norm": 0.005890173837542534, + "learning_rate": 5.616450116115044e-05, + "loss": 0.0004, + "step": 1215 + }, + { + "epoch": 2.15, + "grad_norm": 0.0022528611589223146, + "learning_rate": 5.5946477121997296e-05, + "loss": 0.0003, + "step": 1216 + }, + { + "epoch": 2.15, + "grad_norm": 0.041470710188150406, + "learning_rate": 5.572878002797685e-05, + "loss": 0.0015, + "step": 1217 + }, + { + "epoch": 2.15, + "grad_norm": 0.030972883105278015, + "learning_rate": 5.5511410635839145e-05, + "loss": 0.001, + "step": 1218 + }, + { + "epoch": 2.16, + "grad_norm": 0.24281874299049377, + "learning_rate": 5.5294369701195e-05, + "loss": 0.0108, + "step": 1219 + }, + { + "epoch": 2.16, + "grad_norm": 0.005365898832678795, + "learning_rate": 5.507765797851356e-05, + "loss": 0.0004, + "step": 1220 + }, + { + "epoch": 2.16, + "grad_norm": 0.024286767467856407, + "learning_rate": 5.486127622111966e-05, + "loss": 0.0008, + "step": 1221 + }, + { + "epoch": 2.16, + "grad_norm": 0.003390002530068159, + "learning_rate": 5.464522518119078e-05, + "loss": 0.0003, + "step": 1222 + }, + { + "epoch": 2.16, + "grad_norm": 0.007139614783227444, + "learning_rate": 5.442950560975508e-05, + "loss": 0.0005, + "step": 1223 + }, + { + "epoch": 2.17, + "grad_norm": 0.0038057060446590185, + "learning_rate": 5.4214118256688485e-05, + "loss": 0.0003, + "step": 1224 + }, + { + "epoch": 2.17, + "grad_norm": 0.006339641287922859, + "learning_rate": 5.3999063870711856e-05, + "loss": 0.0004, + "step": 1225 + }, + { + "epoch": 2.17, + "grad_norm": 0.02324921451508999, + "learning_rate": 5.378434319938888e-05, + "loss": 0.0008, + "step": 1226 + }, + { + "epoch": 2.17, + "grad_norm": 0.007727119605988264, + "learning_rate": 5.356995698912286e-05, + "loss": 0.0006, + "step": 1227 + }, + { + "epoch": 2.17, + "grad_norm": 0.7749217748641968, + "learning_rate": 5.335590598515476e-05, + "loss": 0.0567, + "step": 1228 + }, + { + "epoch": 2.17, + "grad_norm": 0.022492818534374237, + "learning_rate": 5.314219093156026e-05, + "loss": 0.0009, + "step": 1229 + }, + { + "epoch": 2.18, + "grad_norm": 0.08937911689281464, + "learning_rate": 5.2928812571247104e-05, + "loss": 0.0017, + "step": 1230 + }, + { + "epoch": 2.18, + "grad_norm": 0.08381486684083939, + "learning_rate": 5.27157716459527e-05, + "loss": 0.0018, + "step": 1231 + }, + { + "epoch": 2.18, + "grad_norm": 0.18147748708724976, + "learning_rate": 5.2503068896241456e-05, + "loss": 0.0032, + "step": 1232 + }, + { + "epoch": 2.18, + "grad_norm": 0.33674654364585876, + "learning_rate": 5.229070506150227e-05, + "loss": 0.0417, + "step": 1233 + }, + { + "epoch": 2.18, + "grad_norm": 0.0017751978011801839, + "learning_rate": 5.207868087994595e-05, + "loss": 0.0002, + "step": 1234 + }, + { + "epoch": 2.18, + "grad_norm": 0.023002570495009422, + "learning_rate": 5.186699708860252e-05, + "loss": 0.001, + "step": 1235 + }, + { + "epoch": 2.19, + "grad_norm": 0.0026608568150550127, + "learning_rate": 5.165565442331876e-05, + "loss": 0.0003, + "step": 1236 + }, + { + "epoch": 2.19, + "grad_norm": 0.004896901082247496, + "learning_rate": 5.1444653618755735e-05, + "loss": 0.0004, + "step": 1237 + }, + { + "epoch": 2.19, + "grad_norm": 0.009385212324559689, + "learning_rate": 5.1233995408386016e-05, + "loss": 0.0007, + "step": 1238 + }, + { + "epoch": 2.19, + "grad_norm": 0.009065024554729462, + "learning_rate": 5.102368052449145e-05, + "loss": 0.0005, + "step": 1239 + }, + { + "epoch": 2.19, + "grad_norm": 0.02201078087091446, + "learning_rate": 5.081370969816022e-05, + "loss": 0.0011, + "step": 1240 + }, + { + "epoch": 2.2, + "grad_norm": 0.5560052990913391, + "learning_rate": 5.060408365928458e-05, + "loss": 0.0068, + "step": 1241 + }, + { + "epoch": 2.2, + "grad_norm": 0.05455511063337326, + "learning_rate": 5.039480313655836e-05, + "loss": 0.0017, + "step": 1242 + }, + { + "epoch": 2.2, + "grad_norm": 0.3444525897502899, + "learning_rate": 5.0185868857474135e-05, + "loss": 0.0197, + "step": 1243 + }, + { + "epoch": 2.2, + "grad_norm": 0.14291395246982574, + "learning_rate": 4.997728154832106e-05, + "loss": 0.0023, + "step": 1244 + }, + { + "epoch": 2.2, + "grad_norm": 0.005757753737270832, + "learning_rate": 4.976904193418203e-05, + "loss": 0.0005, + "step": 1245 + }, + { + "epoch": 2.2, + "grad_norm": 0.002776543376967311, + "learning_rate": 4.9561150738931295e-05, + "loss": 0.0003, + "step": 1246 + }, + { + "epoch": 2.21, + "grad_norm": 0.007652276195585728, + "learning_rate": 4.935360868523213e-05, + "loss": 0.0007, + "step": 1247 + }, + { + "epoch": 2.21, + "grad_norm": 0.5766460299491882, + "learning_rate": 4.914641649453386e-05, + "loss": 0.0299, + "step": 1248 + }, + { + "epoch": 2.21, + "grad_norm": 0.0027644610963761806, + "learning_rate": 4.893957488706993e-05, + "loss": 0.0003, + "step": 1249 + }, + { + "epoch": 2.21, + "grad_norm": 0.014254375360906124, + "learning_rate": 4.873308458185485e-05, + "loss": 0.0011, + "step": 1250 + }, + { + "epoch": 2.21, + "grad_norm": 0.7376680374145508, + "learning_rate": 4.8526946296682e-05, + "loss": 0.0184, + "step": 1251 + }, + { + "epoch": 2.21, + "grad_norm": 0.06830843538045883, + "learning_rate": 4.832116074812124e-05, + "loss": 0.0028, + "step": 1252 + }, + { + "epoch": 2.22, + "grad_norm": 0.7363386750221252, + "learning_rate": 4.811572865151605e-05, + "loss": 0.0267, + "step": 1253 + }, + { + "epoch": 2.22, + "grad_norm": 0.014233013615012169, + "learning_rate": 4.7910650720981446e-05, + "loss": 0.0009, + "step": 1254 + }, + { + "epoch": 2.22, + "grad_norm": 0.013635286130011082, + "learning_rate": 4.7705927669401154e-05, + "loss": 0.0008, + "step": 1255 + }, + { + "epoch": 2.22, + "grad_norm": 0.02411198988556862, + "learning_rate": 4.75015602084253e-05, + "loss": 0.0018, + "step": 1256 + }, + { + "epoch": 2.22, + "grad_norm": 0.02610674500465393, + "learning_rate": 4.729754904846805e-05, + "loss": 0.0013, + "step": 1257 + }, + { + "epoch": 2.23, + "grad_norm": 0.0034811110235750675, + "learning_rate": 4.7093894898704834e-05, + "loss": 0.0003, + "step": 1258 + }, + { + "epoch": 2.23, + "grad_norm": 0.538255512714386, + "learning_rate": 4.6890598467070246e-05, + "loss": 0.0167, + "step": 1259 + }, + { + "epoch": 2.23, + "grad_norm": 0.036283351480960846, + "learning_rate": 4.6687660460255214e-05, + "loss": 0.0014, + "step": 1260 + }, + { + "epoch": 2.23, + "grad_norm": 0.016377754509449005, + "learning_rate": 4.64850815837048e-05, + "loss": 0.0012, + "step": 1261 + }, + { + "epoch": 2.23, + "grad_norm": 0.011219850741326809, + "learning_rate": 4.628286254161571e-05, + "loss": 0.0008, + "step": 1262 + }, + { + "epoch": 2.23, + "grad_norm": 0.0374603345990181, + "learning_rate": 4.608100403693379e-05, + "loss": 0.0018, + "step": 1263 + }, + { + "epoch": 2.24, + "grad_norm": 0.008386081084609032, + "learning_rate": 4.5879506771351566e-05, + "loss": 0.0007, + "step": 1264 + }, + { + "epoch": 2.24, + "grad_norm": 0.029542380943894386, + "learning_rate": 4.567837144530585e-05, + "loss": 0.0012, + "step": 1265 + }, + { + "epoch": 2.24, + "grad_norm": 0.012669527903199196, + "learning_rate": 4.547759875797523e-05, + "loss": 0.0007, + "step": 1266 + }, + { + "epoch": 2.24, + "grad_norm": 0.06358956545591354, + "learning_rate": 4.5277189407277846e-05, + "loss": 0.0026, + "step": 1267 + }, + { + "epoch": 2.24, + "grad_norm": 0.005824268329888582, + "learning_rate": 4.5077144089868796e-05, + "loss": 0.0003, + "step": 1268 + }, + { + "epoch": 2.25, + "grad_norm": 0.8795007467269897, + "learning_rate": 4.4877463501137635e-05, + "loss": 0.0181, + "step": 1269 + }, + { + "epoch": 2.25, + "grad_norm": 0.023934688419103622, + "learning_rate": 4.4678148335206126e-05, + "loss": 0.001, + "step": 1270 + }, + { + "epoch": 2.25, + "grad_norm": 0.05723075196146965, + "learning_rate": 4.447919928492571e-05, + "loss": 0.0024, + "step": 1271 + }, + { + "epoch": 2.25, + "grad_norm": 0.21149350702762604, + "learning_rate": 4.4280617041875246e-05, + "loss": 0.0923, + "step": 1272 + }, + { + "epoch": 2.25, + "grad_norm": 0.017069244757294655, + "learning_rate": 4.408240229635854e-05, + "loss": 0.001, + "step": 1273 + }, + { + "epoch": 2.25, + "grad_norm": 0.13155558705329895, + "learning_rate": 4.38845557374017e-05, + "loss": 0.1008, + "step": 1274 + }, + { + "epoch": 2.26, + "grad_norm": 0.005120429676026106, + "learning_rate": 4.3687078052751156e-05, + "loss": 0.0003, + "step": 1275 + }, + { + "epoch": 2.26, + "grad_norm": 0.003916988614946604, + "learning_rate": 4.348996992887107e-05, + "loss": 0.0004, + "step": 1276 + }, + { + "epoch": 2.26, + "grad_norm": 0.047271549701690674, + "learning_rate": 4.329323205094082e-05, + "loss": 0.002, + "step": 1277 + }, + { + "epoch": 2.26, + "grad_norm": 0.07258959859609604, + "learning_rate": 4.3096865102852956e-05, + "loss": 0.0021, + "step": 1278 + }, + { + "epoch": 2.26, + "eval_loss": 0.11266309022903442, + "eval_runtime": 14.6862, + "eval_samples_per_second": 32.479, + "eval_steps_per_second": 8.171, + "step": 1278 + }, + { + "epoch": 2.26, + "grad_norm": 0.10085738450288773, + "learning_rate": 4.290086976721032e-05, + "loss": 0.0038, + "step": 1279 + }, + { + "epoch": 2.26, + "grad_norm": 0.009946956299245358, + "learning_rate": 4.270524672532421e-05, + "loss": 0.0007, + "step": 1280 + }, + { + "epoch": 2.27, + "grad_norm": 0.05360298231244087, + "learning_rate": 4.2509996657211766e-05, + "loss": 0.0024, + "step": 1281 + }, + { + "epoch": 2.27, + "grad_norm": 0.022336110472679138, + "learning_rate": 4.231512024159348e-05, + "loss": 0.0014, + "step": 1282 + }, + { + "epoch": 2.27, + "grad_norm": 0.006835728883743286, + "learning_rate": 4.212061815589103e-05, + "loss": 0.0005, + "step": 1283 + }, + { + "epoch": 2.27, + "grad_norm": 0.0074784718453884125, + "learning_rate": 4.192649107622485e-05, + "loss": 0.0007, + "step": 1284 + }, + { + "epoch": 2.27, + "grad_norm": 0.023102423176169395, + "learning_rate": 4.173273967741183e-05, + "loss": 0.0016, + "step": 1285 + }, + { + "epoch": 2.28, + "grad_norm": 0.05313856899738312, + "learning_rate": 4.153936463296295e-05, + "loss": 0.0016, + "step": 1286 + }, + { + "epoch": 2.28, + "grad_norm": 0.004305421840399504, + "learning_rate": 4.134636661508087e-05, + "loss": 0.0005, + "step": 1287 + }, + { + "epoch": 2.28, + "grad_norm": 0.1759803593158722, + "learning_rate": 4.1153746294657586e-05, + "loss": 0.0041, + "step": 1288 + }, + { + "epoch": 2.28, + "grad_norm": 0.009421673603355885, + "learning_rate": 4.0961504341272334e-05, + "loss": 0.0008, + "step": 1289 + }, + { + "epoch": 2.28, + "grad_norm": 0.6674251556396484, + "learning_rate": 4.076964142318889e-05, + "loss": 0.0086, + "step": 1290 + }, + { + "epoch": 2.28, + "grad_norm": 0.01362668164074421, + "learning_rate": 4.0578158207353614e-05, + "loss": 0.0008, + "step": 1291 + }, + { + "epoch": 2.29, + "grad_norm": 0.2785921096801758, + "learning_rate": 4.038705535939284e-05, + "loss": 0.0093, + "step": 1292 + }, + { + "epoch": 2.29, + "grad_norm": 0.0885733962059021, + "learning_rate": 4.0196333543610706e-05, + "loss": 0.0018, + "step": 1293 + }, + { + "epoch": 2.29, + "grad_norm": 0.018255922943353653, + "learning_rate": 4.000599342298688e-05, + "loss": 0.0007, + "step": 1294 + }, + { + "epoch": 2.29, + "grad_norm": 0.3033410906791687, + "learning_rate": 3.981603565917409e-05, + "loss": 0.0375, + "step": 1295 + }, + { + "epoch": 2.29, + "grad_norm": 0.004288067575544119, + "learning_rate": 3.9626460912496087e-05, + "loss": 0.0004, + "step": 1296 + }, + { + "epoch": 2.29, + "grad_norm": 0.3900052607059479, + "learning_rate": 3.9437269841945035e-05, + "loss": 0.0115, + "step": 1297 + }, + { + "epoch": 2.3, + "grad_norm": 0.16617615520954132, + "learning_rate": 3.924846310517943e-05, + "loss": 0.0036, + "step": 1298 + }, + { + "epoch": 2.3, + "grad_norm": 0.004422987345606089, + "learning_rate": 3.9060041358521834e-05, + "loss": 0.0005, + "step": 1299 + }, + { + "epoch": 2.3, + "grad_norm": 0.002911049872636795, + "learning_rate": 3.887200525695638e-05, + "loss": 0.0004, + "step": 1300 + }, + { + "epoch": 2.3, + "grad_norm": 0.020865343511104584, + "learning_rate": 3.8684355454126823e-05, + "loss": 0.001, + "step": 1301 + }, + { + "epoch": 2.3, + "grad_norm": 0.04416746646165848, + "learning_rate": 3.849709260233394e-05, + "loss": 0.0033, + "step": 1302 + }, + { + "epoch": 2.31, + "grad_norm": 0.03306471183896065, + "learning_rate": 3.8310217352533376e-05, + "loss": 0.0025, + "step": 1303 + }, + { + "epoch": 2.31, + "grad_norm": 0.03033634088933468, + "learning_rate": 3.812373035433358e-05, + "loss": 0.0009, + "step": 1304 + }, + { + "epoch": 2.31, + "grad_norm": 0.0477125309407711, + "learning_rate": 3.793763225599317e-05, + "loss": 0.0021, + "step": 1305 + }, + { + "epoch": 2.31, + "grad_norm": 0.006260554771870375, + "learning_rate": 3.77519237044191e-05, + "loss": 0.0004, + "step": 1306 + }, + { + "epoch": 2.31, + "grad_norm": 0.014828209765255451, + "learning_rate": 3.756660534516401e-05, + "loss": 0.0009, + "step": 1307 + }, + { + "epoch": 2.31, + "grad_norm": 0.01470520906150341, + "learning_rate": 3.738167782242422e-05, + "loss": 0.0011, + "step": 1308 + }, + { + "epoch": 2.32, + "grad_norm": 0.19026656448841095, + "learning_rate": 3.7197141779037535e-05, + "loss": 0.0044, + "step": 1309 + }, + { + "epoch": 2.32, + "grad_norm": 0.45979931950569153, + "learning_rate": 3.701299785648079e-05, + "loss": 0.0623, + "step": 1310 + }, + { + "epoch": 2.32, + "grad_norm": 0.011812691576778889, + "learning_rate": 3.6829246694867844e-05, + "loss": 0.0009, + "step": 1311 + }, + { + "epoch": 2.32, + "grad_norm": 0.11467989534139633, + "learning_rate": 3.664588893294721e-05, + "loss": 0.0041, + "step": 1312 + }, + { + "epoch": 2.32, + "grad_norm": 0.008386659435927868, + "learning_rate": 3.6462925208099794e-05, + "loss": 0.0004, + "step": 1313 + }, + { + "epoch": 2.32, + "grad_norm": 0.0743747353553772, + "learning_rate": 3.628035615633692e-05, + "loss": 0.0018, + "step": 1314 + }, + { + "epoch": 2.33, + "grad_norm": 0.1064383015036583, + "learning_rate": 3.609818241229794e-05, + "loss": 0.0014, + "step": 1315 + }, + { + "epoch": 2.33, + "grad_norm": 0.0030197727028280497, + "learning_rate": 3.591640460924797e-05, + "loss": 0.0004, + "step": 1316 + }, + { + "epoch": 2.33, + "grad_norm": 0.08606261014938354, + "learning_rate": 3.5735023379075784e-05, + "loss": 0.0023, + "step": 1317 + }, + { + "epoch": 2.33, + "grad_norm": 0.005634244531393051, + "learning_rate": 3.555403935229163e-05, + "loss": 0.0006, + "step": 1318 + }, + { + "epoch": 2.33, + "grad_norm": 0.0060554565861821175, + "learning_rate": 3.5373453158025056e-05, + "loss": 0.0005, + "step": 1319 + }, + { + "epoch": 2.34, + "grad_norm": 0.2659899890422821, + "learning_rate": 3.51932654240227e-05, + "loss": 0.0054, + "step": 1320 + }, + { + "epoch": 2.34, + "grad_norm": 0.14752821624279022, + "learning_rate": 3.501347677664587e-05, + "loss": 0.0028, + "step": 1321 + }, + { + "epoch": 2.34, + "grad_norm": 0.006984313018620014, + "learning_rate": 3.483408784086888e-05, + "loss": 0.0006, + "step": 1322 + }, + { + "epoch": 2.34, + "grad_norm": 0.014710571616888046, + "learning_rate": 3.4655099240276356e-05, + "loss": 0.001, + "step": 1323 + }, + { + "epoch": 2.34, + "grad_norm": 0.1552634835243225, + "learning_rate": 3.447651159706143e-05, + "loss": 0.0088, + "step": 1324 + }, + { + "epoch": 2.34, + "grad_norm": 0.19928738474845886, + "learning_rate": 3.429832553202349e-05, + "loss": 0.0079, + "step": 1325 + }, + { + "epoch": 2.35, + "grad_norm": 0.07880314439535141, + "learning_rate": 3.4120541664565684e-05, + "loss": 0.0015, + "step": 1326 + }, + { + "epoch": 2.35, + "grad_norm": 0.5979902744293213, + "learning_rate": 3.394316061269337e-05, + "loss": 0.0297, + "step": 1327 + }, + { + "epoch": 2.35, + "grad_norm": 0.9888120293617249, + "learning_rate": 3.376618299301156e-05, + "loss": 0.0232, + "step": 1328 + }, + { + "epoch": 2.35, + "grad_norm": 0.018368270248174667, + "learning_rate": 3.358960942072282e-05, + "loss": 0.0011, + "step": 1329 + }, + { + "epoch": 2.35, + "grad_norm": 0.033457737416028976, + "learning_rate": 3.3413440509625196e-05, + "loss": 0.0013, + "step": 1330 + }, + { + "epoch": 2.35, + "grad_norm": 0.023191537708044052, + "learning_rate": 3.323767687211006e-05, + "loss": 0.0011, + "step": 1331 + }, + { + "epoch": 2.36, + "grad_norm": 0.007936120964586735, + "learning_rate": 3.306231911916005e-05, + "loss": 0.0007, + "step": 1332 + }, + { + "epoch": 2.36, + "grad_norm": 0.003727326402440667, + "learning_rate": 3.288736786034685e-05, + "loss": 0.0004, + "step": 1333 + }, + { + "epoch": 2.36, + "grad_norm": 0.31322526931762695, + "learning_rate": 3.271282370382912e-05, + "loss": 0.0342, + "step": 1334 + }, + { + "epoch": 2.36, + "grad_norm": 0.5764428973197937, + "learning_rate": 3.253868725635029e-05, + "loss": 0.0078, + "step": 1335 + }, + { + "epoch": 2.36, + "grad_norm": 0.012030922807753086, + "learning_rate": 3.236495912323657e-05, + "loss": 0.0009, + "step": 1336 + }, + { + "epoch": 2.37, + "grad_norm": 0.009075929410755634, + "learning_rate": 3.219163990839486e-05, + "loss": 0.0006, + "step": 1337 + }, + { + "epoch": 2.37, + "grad_norm": 0.10224148631095886, + "learning_rate": 3.201873021431059e-05, + "loss": 0.0022, + "step": 1338 + }, + { + "epoch": 2.37, + "grad_norm": 0.033005669713020325, + "learning_rate": 3.1846230642045555e-05, + "loss": 0.0011, + "step": 1339 + }, + { + "epoch": 2.37, + "grad_norm": 0.005630008410662413, + "learning_rate": 3.1674141791235886e-05, + "loss": 0.0005, + "step": 1340 + }, + { + "epoch": 2.37, + "grad_norm": 0.583130955696106, + "learning_rate": 3.150246426009014e-05, + "loss": 0.0057, + "step": 1341 + }, + { + "epoch": 2.37, + "grad_norm": 0.0040301731787621975, + "learning_rate": 3.133119864538684e-05, + "loss": 0.0003, + "step": 1342 + }, + { + "epoch": 2.38, + "grad_norm": 0.012081477791070938, + "learning_rate": 3.116034554247285e-05, + "loss": 0.001, + "step": 1343 + }, + { + "epoch": 2.38, + "grad_norm": 0.027355549857020378, + "learning_rate": 3.09899055452609e-05, + "loss": 0.0014, + "step": 1344 + }, + { + "epoch": 2.38, + "grad_norm": 0.02681993693113327, + "learning_rate": 3.0819879246227736e-05, + "loss": 0.0013, + "step": 1345 + }, + { + "epoch": 2.38, + "grad_norm": 0.09049945324659348, + "learning_rate": 3.06502672364121e-05, + "loss": 0.0046, + "step": 1346 + }, + { + "epoch": 2.38, + "grad_norm": 0.036559611558914185, + "learning_rate": 3.04810701054125e-05, + "loss": 0.0015, + "step": 1347 + }, + { + "epoch": 2.38, + "grad_norm": 0.010112352669239044, + "learning_rate": 3.0312288441385364e-05, + "loss": 0.0007, + "step": 1348 + }, + { + "epoch": 2.39, + "grad_norm": 0.08881451189517975, + "learning_rate": 3.0143922831042793e-05, + "loss": 0.0016, + "step": 1349 + }, + { + "epoch": 2.39, + "grad_norm": 0.18841829895973206, + "learning_rate": 2.9975973859650616e-05, + "loss": 0.0077, + "step": 1350 + }, + { + "epoch": 2.39, + "grad_norm": 0.011637813411653042, + "learning_rate": 2.9808442111026486e-05, + "loss": 0.0008, + "step": 1351 + }, + { + "epoch": 2.39, + "grad_norm": 0.007817978039383888, + "learning_rate": 2.9641328167537547e-05, + "loss": 0.0006, + "step": 1352 + }, + { + "epoch": 2.39, + "grad_norm": 0.12936629354953766, + "learning_rate": 2.9474632610098748e-05, + "loss": 0.003, + "step": 1353 + }, + { + "epoch": 2.4, + "grad_norm": 0.14426392316818237, + "learning_rate": 2.9308356018170566e-05, + "loss": 0.0079, + "step": 1354 + }, + { + "epoch": 2.4, + "grad_norm": 0.19471372663974762, + "learning_rate": 2.9142498969757046e-05, + "loss": 0.012, + "step": 1355 + }, + { + "epoch": 2.4, + "grad_norm": 0.08268406242132187, + "learning_rate": 2.8977062041403997e-05, + "loss": 0.0037, + "step": 1356 + }, + { + "epoch": 2.4, + "grad_norm": 0.006442641373723745, + "learning_rate": 2.881204580819662e-05, + "loss": 0.0005, + "step": 1357 + }, + { + "epoch": 2.4, + "grad_norm": 0.021419553086161613, + "learning_rate": 2.8647450843757897e-05, + "loss": 0.0009, + "step": 1358 + }, + { + "epoch": 2.4, + "grad_norm": 0.008514653891324997, + "learning_rate": 2.8483277720246277e-05, + "loss": 0.0005, + "step": 1359 + }, + { + "epoch": 2.41, + "grad_norm": 0.007075514644384384, + "learning_rate": 2.831952700835386e-05, + "loss": 0.0005, + "step": 1360 + }, + { + "epoch": 2.41, + "grad_norm": 0.005311162211000919, + "learning_rate": 2.8156199277304442e-05, + "loss": 0.0006, + "step": 1361 + }, + { + "epoch": 2.41, + "grad_norm": 0.008779114112257957, + "learning_rate": 2.7993295094851354e-05, + "loss": 0.0005, + "step": 1362 + }, + { + "epoch": 2.41, + "grad_norm": 0.011863750405609608, + "learning_rate": 2.7830815027275734e-05, + "loss": 0.0009, + "step": 1363 + }, + { + "epoch": 2.41, + "grad_norm": 0.04168037325143814, + "learning_rate": 2.7668759639384314e-05, + "loss": 0.0012, + "step": 1364 + }, + { + "epoch": 2.41, + "grad_norm": 0.38817283511161804, + "learning_rate": 2.7507129494507595e-05, + "loss": 0.0308, + "step": 1365 + }, + { + "epoch": 2.42, + "grad_norm": 0.003068127203732729, + "learning_rate": 2.7345925154497912e-05, + "loss": 0.0003, + "step": 1366 + }, + { + "epoch": 2.42, + "grad_norm": 0.0036126230843365192, + "learning_rate": 2.7185147179727473e-05, + "loss": 0.0003, + "step": 1367 + }, + { + "epoch": 2.42, + "grad_norm": 0.06432247161865234, + "learning_rate": 2.7024796129086234e-05, + "loss": 0.0015, + "step": 1368 + }, + { + "epoch": 2.42, + "grad_norm": 0.0021110824309289455, + "learning_rate": 2.68648725599802e-05, + "loss": 0.0002, + "step": 1369 + }, + { + "epoch": 2.42, + "grad_norm": 0.03788725286722183, + "learning_rate": 2.6705377028329312e-05, + "loss": 0.0017, + "step": 1370 + }, + { + "epoch": 2.43, + "grad_norm": 0.3461664915084839, + "learning_rate": 2.654631008856562e-05, + "loss": 0.0131, + "step": 1371 + }, + { + "epoch": 2.43, + "grad_norm": 0.01737704686820507, + "learning_rate": 2.638767229363145e-05, + "loss": 0.0006, + "step": 1372 + }, + { + "epoch": 2.43, + "grad_norm": 0.45186784863471985, + "learning_rate": 2.6229464194977056e-05, + "loss": 0.0329, + "step": 1373 + }, + { + "epoch": 2.43, + "grad_norm": 0.04621649533510208, + "learning_rate": 2.607168634255927e-05, + "loss": 0.001, + "step": 1374 + }, + { + "epoch": 2.43, + "grad_norm": 0.0025596965570002794, + "learning_rate": 2.5914339284839142e-05, + "loss": 0.0002, + "step": 1375 + }, + { + "epoch": 2.43, + "grad_norm": 0.013345273211598396, + "learning_rate": 2.575742356878032e-05, + "loss": 0.0006, + "step": 1376 + }, + { + "epoch": 2.44, + "grad_norm": 0.5172482132911682, + "learning_rate": 2.56009397398471e-05, + "loss": 0.0471, + "step": 1377 + }, + { + "epoch": 2.44, + "grad_norm": 0.06014901399612427, + "learning_rate": 2.5444888342002195e-05, + "loss": 0.0026, + "step": 1378 + }, + { + "epoch": 2.44, + "grad_norm": 0.029812335968017578, + "learning_rate": 2.5289269917705406e-05, + "loss": 0.0012, + "step": 1379 + }, + { + "epoch": 2.44, + "grad_norm": 0.03772888705134392, + "learning_rate": 2.5134085007911348e-05, + "loss": 0.0009, + "step": 1380 + }, + { + "epoch": 2.44, + "grad_norm": 0.011846140958368778, + "learning_rate": 2.497933415206762e-05, + "loss": 0.0007, + "step": 1381 + }, + { + "epoch": 2.44, + "grad_norm": 0.0023455750197172165, + "learning_rate": 2.4825017888113036e-05, + "loss": 0.0003, + "step": 1382 + }, + { + "epoch": 2.45, + "grad_norm": 0.015617525205016136, + "learning_rate": 2.4671136752475644e-05, + "loss": 0.0008, + "step": 1383 + }, + { + "epoch": 2.45, + "grad_norm": 0.04346747323870659, + "learning_rate": 2.4517691280070996e-05, + "loss": 0.0012, + "step": 1384 + }, + { + "epoch": 2.45, + "grad_norm": 0.2637970447540283, + "learning_rate": 2.436468200430019e-05, + "loss": 0.0052, + "step": 1385 + }, + { + "epoch": 2.45, + "grad_norm": 0.13508789241313934, + "learning_rate": 2.4212109457048e-05, + "loss": 0.0074, + "step": 1386 + }, + { + "epoch": 2.45, + "grad_norm": 0.026154443621635437, + "learning_rate": 2.4059974168681047e-05, + "loss": 0.0008, + "step": 1387 + }, + { + "epoch": 2.46, + "grad_norm": 0.021336521953344345, + "learning_rate": 2.3908276668045972e-05, + "loss": 0.0011, + "step": 1388 + }, + { + "epoch": 2.46, + "grad_norm": 0.005644786171615124, + "learning_rate": 2.3757017482467683e-05, + "loss": 0.0005, + "step": 1389 + }, + { + "epoch": 2.46, + "grad_norm": 1.1730983257293701, + "learning_rate": 2.3606197137747364e-05, + "loss": 0.0273, + "step": 1390 + }, + { + "epoch": 2.46, + "grad_norm": 0.11611776053905487, + "learning_rate": 2.3455816158160716e-05, + "loss": 0.0035, + "step": 1391 + }, + { + "epoch": 2.46, + "grad_norm": 0.010708926245570183, + "learning_rate": 2.330587506645611e-05, + "loss": 0.0004, + "step": 1392 + }, + { + "epoch": 2.46, + "grad_norm": 0.02782527729868889, + "learning_rate": 2.3156374383852905e-05, + "loss": 0.0015, + "step": 1393 + }, + { + "epoch": 2.47, + "grad_norm": 0.015624145045876503, + "learning_rate": 2.300731463003938e-05, + "loss": 0.0008, + "step": 1394 + }, + { + "epoch": 2.47, + "grad_norm": 0.03852810710668564, + "learning_rate": 2.2858696323171222e-05, + "loss": 0.0015, + "step": 1395 + }, + { + "epoch": 2.47, + "grad_norm": 0.2508693039417267, + "learning_rate": 2.2710519979869456e-05, + "loss": 0.0085, + "step": 1396 + }, + { + "epoch": 2.47, + "grad_norm": 0.10691177099943161, + "learning_rate": 2.256278611521881e-05, + "loss": 0.0051, + "step": 1397 + }, + { + "epoch": 2.47, + "grad_norm": 0.023507850244641304, + "learning_rate": 2.2415495242765957e-05, + "loss": 0.0012, + "step": 1398 + }, + { + "epoch": 2.48, + "grad_norm": 0.015225178562104702, + "learning_rate": 2.226864787451753e-05, + "loss": 0.0007, + "step": 1399 + }, + { + "epoch": 2.48, + "grad_norm": 0.0027131009846925735, + "learning_rate": 2.2122244520938588e-05, + "loss": 0.0002, + "step": 1400 + }, + { + "epoch": 2.48, + "grad_norm": 0.08041994273662567, + "learning_rate": 2.197628569095066e-05, + "loss": 0.0026, + "step": 1401 + }, + { + "epoch": 2.48, + "grad_norm": 0.43206658959388733, + "learning_rate": 2.1830771891929998e-05, + "loss": 0.0066, + "step": 1402 + }, + { + "epoch": 2.48, + "grad_norm": 0.8441733121871948, + "learning_rate": 2.168570362970599e-05, + "loss": 0.0244, + "step": 1403 + }, + { + "epoch": 2.48, + "grad_norm": 0.00975915789604187, + "learning_rate": 2.154108140855914e-05, + "loss": 0.0004, + "step": 1404 + }, + { + "epoch": 2.49, + "grad_norm": 0.008404737338423729, + "learning_rate": 2.1396905731219503e-05, + "loss": 0.0006, + "step": 1405 + }, + { + "epoch": 2.49, + "grad_norm": 0.4085506498813629, + "learning_rate": 2.125317709886488e-05, + "loss": 0.022, + "step": 1406 + }, + { + "epoch": 2.49, + "grad_norm": 0.01728154346346855, + "learning_rate": 2.1109896011118983e-05, + "loss": 0.0006, + "step": 1407 + }, + { + "epoch": 2.49, + "grad_norm": 0.22465384006500244, + "learning_rate": 2.096706296604994e-05, + "loss": 0.0054, + "step": 1408 + }, + { + "epoch": 2.49, + "grad_norm": 0.04408938065171242, + "learning_rate": 2.0824678460168244e-05, + "loss": 0.0014, + "step": 1409 + }, + { + "epoch": 2.49, + "grad_norm": 0.004848984070122242, + "learning_rate": 2.0682742988425365e-05, + "loss": 0.0004, + "step": 1410 + }, + { + "epoch": 2.5, + "grad_norm": 0.01854972168803215, + "learning_rate": 2.054125704421174e-05, + "loss": 0.0009, + "step": 1411 + }, + { + "epoch": 2.5, + "grad_norm": 0.0032519930973649025, + "learning_rate": 2.0400221119355158e-05, + "loss": 0.0003, + "step": 1412 + }, + { + "epoch": 2.5, + "grad_norm": 0.003512805327773094, + "learning_rate": 2.0259635704119188e-05, + "loss": 0.0003, + "step": 1413 + }, + { + "epoch": 2.5, + "grad_norm": 0.2469976246356964, + "learning_rate": 2.0119501287201224e-05, + "loss": 0.0053, + "step": 1414 + }, + { + "epoch": 2.5, + "grad_norm": 0.0765346884727478, + "learning_rate": 1.997981835573102e-05, + "loss": 0.0017, + "step": 1415 + }, + { + "epoch": 2.51, + "grad_norm": 0.01032665278762579, + "learning_rate": 1.9840587395268852e-05, + "loss": 0.0005, + "step": 1416 + }, + { + "epoch": 2.51, + "grad_norm": 0.027239112183451653, + "learning_rate": 1.970180888980384e-05, + "loss": 0.0016, + "step": 1417 + }, + { + "epoch": 2.51, + "grad_norm": 0.0050926231779158115, + "learning_rate": 1.9563483321752333e-05, + "loss": 0.0004, + "step": 1418 + }, + { + "epoch": 2.51, + "grad_norm": 0.6338268518447876, + "learning_rate": 1.9425611171956253e-05, + "loss": 0.0206, + "step": 1419 + }, + { + "epoch": 2.51, + "grad_norm": 0.5161135792732239, + "learning_rate": 1.9288192919681273e-05, + "loss": 0.0258, + "step": 1420 + }, + { + "epoch": 2.51, + "eval_loss": 0.11881372332572937, + "eval_runtime": 14.7018, + "eval_samples_per_second": 32.445, + "eval_steps_per_second": 8.162, + "step": 1420 + }, + { + "epoch": 2.51, + "grad_norm": 0.0028544398956000805, + "learning_rate": 1.9151229042615268e-05, + "loss": 0.0003, + "step": 1421 + }, + { + "epoch": 2.52, + "grad_norm": 0.12843045592308044, + "learning_rate": 1.9014720016866626e-05, + "loss": 0.0016, + "step": 1422 + }, + { + "epoch": 2.52, + "grad_norm": 0.01778980903327465, + "learning_rate": 1.8878666316962642e-05, + "loss": 0.0007, + "step": 1423 + }, + { + "epoch": 2.52, + "grad_norm": 0.005622228141874075, + "learning_rate": 1.8743068415847874e-05, + "loss": 0.0005, + "step": 1424 + }, + { + "epoch": 2.52, + "grad_norm": 0.03248097002506256, + "learning_rate": 1.8607926784882233e-05, + "loss": 0.002, + "step": 1425 + }, + { + "epoch": 2.52, + "grad_norm": 0.003068873193114996, + "learning_rate": 1.8473241893839825e-05, + "loss": 0.0003, + "step": 1426 + }, + { + "epoch": 2.52, + "grad_norm": 0.00804146844893694, + "learning_rate": 1.8339014210906854e-05, + "loss": 0.0004, + "step": 1427 + }, + { + "epoch": 2.53, + "grad_norm": 0.0017168130725622177, + "learning_rate": 1.820524420268032e-05, + "loss": 0.0002, + "step": 1428 + }, + { + "epoch": 2.53, + "grad_norm": 0.027117310091853142, + "learning_rate": 1.8071932334166285e-05, + "loss": 0.0006, + "step": 1429 + }, + { + "epoch": 2.53, + "grad_norm": 0.006395867094397545, + "learning_rate": 1.7939079068778074e-05, + "loss": 0.0003, + "step": 1430 + }, + { + "epoch": 2.53, + "grad_norm": 0.43112286925315857, + "learning_rate": 1.7806684868335002e-05, + "loss": 0.0326, + "step": 1431 + }, + { + "epoch": 2.53, + "grad_norm": 0.008108658716082573, + "learning_rate": 1.7674750193060583e-05, + "loss": 0.0006, + "step": 1432 + }, + { + "epoch": 2.54, + "grad_norm": 0.006907780654728413, + "learning_rate": 1.7543275501580867e-05, + "loss": 0.0004, + "step": 1433 + }, + { + "epoch": 2.54, + "grad_norm": 0.0025743867736309767, + "learning_rate": 1.7412261250923003e-05, + "loss": 0.0002, + "step": 1434 + }, + { + "epoch": 2.54, + "grad_norm": 0.25082695484161377, + "learning_rate": 1.7281707896513475e-05, + "loss": 0.0091, + "step": 1435 + }, + { + "epoch": 2.54, + "grad_norm": 0.0044022053480148315, + "learning_rate": 1.715161589217674e-05, + "loss": 0.0003, + "step": 1436 + }, + { + "epoch": 2.54, + "grad_norm": 0.08191661536693573, + "learning_rate": 1.7021985690133493e-05, + "loss": 0.0036, + "step": 1437 + }, + { + "epoch": 2.54, + "grad_norm": 0.006498047150671482, + "learning_rate": 1.689281774099908e-05, + "loss": 0.0004, + "step": 1438 + }, + { + "epoch": 2.55, + "grad_norm": 0.017313893884420395, + "learning_rate": 1.6764112493782018e-05, + "loss": 0.001, + "step": 1439 + }, + { + "epoch": 2.55, + "grad_norm": 0.0027831539046019316, + "learning_rate": 1.663587039588237e-05, + "loss": 0.0003, + "step": 1440 + }, + { + "epoch": 2.55, + "grad_norm": 0.002531023696064949, + "learning_rate": 1.6508091893090236e-05, + "loss": 0.0003, + "step": 1441 + }, + { + "epoch": 2.55, + "grad_norm": 0.002510587451979518, + "learning_rate": 1.6380777429584253e-05, + "loss": 0.0003, + "step": 1442 + }, + { + "epoch": 2.55, + "grad_norm": 0.00382997072301805, + "learning_rate": 1.6253927447929882e-05, + "loss": 0.0003, + "step": 1443 + }, + { + "epoch": 2.55, + "grad_norm": 0.0067413197830319405, + "learning_rate": 1.612754238907797e-05, + "loss": 0.0005, + "step": 1444 + }, + { + "epoch": 2.56, + "grad_norm": 0.001188279245980084, + "learning_rate": 1.6001622692363314e-05, + "loss": 0.0001, + "step": 1445 + }, + { + "epoch": 2.56, + "grad_norm": 0.040217697620391846, + "learning_rate": 1.587616879550293e-05, + "loss": 0.0009, + "step": 1446 + }, + { + "epoch": 2.56, + "grad_norm": 0.016694562509655952, + "learning_rate": 1.5751181134594742e-05, + "loss": 0.0007, + "step": 1447 + }, + { + "epoch": 2.56, + "grad_norm": 0.04169061779975891, + "learning_rate": 1.562666014411587e-05, + "loss": 0.0024, + "step": 1448 + }, + { + "epoch": 2.56, + "grad_norm": 0.0032720069866627455, + "learning_rate": 1.550260625692122e-05, + "loss": 0.0004, + "step": 1449 + }, + { + "epoch": 2.57, + "grad_norm": 0.3045656383037567, + "learning_rate": 1.5379019904242086e-05, + "loss": 0.0132, + "step": 1450 + }, + { + "epoch": 2.57, + "grad_norm": 0.04755120724439621, + "learning_rate": 1.5255901515684372e-05, + "loss": 0.0011, + "step": 1451 + }, + { + "epoch": 2.57, + "grad_norm": 0.007945549674332142, + "learning_rate": 1.5133251519227413e-05, + "loss": 0.0006, + "step": 1452 + }, + { + "epoch": 2.57, + "grad_norm": 0.015786288306117058, + "learning_rate": 1.5011070341222248e-05, + "loss": 0.0009, + "step": 1453 + }, + { + "epoch": 2.57, + "grad_norm": 0.22754225134849548, + "learning_rate": 1.4889358406390216e-05, + "loss": 0.0255, + "step": 1454 + }, + { + "epoch": 2.57, + "grad_norm": 0.0039327009581029415, + "learning_rate": 1.4768116137821584e-05, + "loss": 0.0002, + "step": 1455 + }, + { + "epoch": 2.58, + "grad_norm": 0.009196938015520573, + "learning_rate": 1.4647343956973867e-05, + "loss": 0.0006, + "step": 1456 + }, + { + "epoch": 2.58, + "grad_norm": 0.008127289824187756, + "learning_rate": 1.4527042283670592e-05, + "loss": 0.0004, + "step": 1457 + }, + { + "epoch": 2.58, + "grad_norm": 0.010122274979948997, + "learning_rate": 1.4407211536099661e-05, + "loss": 0.0005, + "step": 1458 + }, + { + "epoch": 2.58, + "grad_norm": 0.25765278935432434, + "learning_rate": 1.4287852130811933e-05, + "loss": 0.0081, + "step": 1459 + }, + { + "epoch": 2.58, + "grad_norm": 0.14113549888134003, + "learning_rate": 1.4168964482719914e-05, + "loss": 0.0081, + "step": 1460 + }, + { + "epoch": 2.58, + "grad_norm": 0.0034420734737068415, + "learning_rate": 1.4050549005096051e-05, + "loss": 0.0002, + "step": 1461 + }, + { + "epoch": 2.59, + "grad_norm": 0.06208482384681702, + "learning_rate": 1.3932606109571642e-05, + "loss": 0.0023, + "step": 1462 + }, + { + "epoch": 2.59, + "grad_norm": 0.05258166790008545, + "learning_rate": 1.3815136206135064e-05, + "loss": 0.0023, + "step": 1463 + }, + { + "epoch": 2.59, + "grad_norm": 0.002335899043828249, + "learning_rate": 1.3698139703130507e-05, + "loss": 0.0001, + "step": 1464 + }, + { + "epoch": 2.59, + "grad_norm": 0.004368484020233154, + "learning_rate": 1.3581617007256645e-05, + "loss": 0.0004, + "step": 1465 + }, + { + "epoch": 2.59, + "grad_norm": 0.04118827357888222, + "learning_rate": 1.3465568523565006e-05, + "loss": 0.0013, + "step": 1466 + }, + { + "epoch": 2.6, + "grad_norm": 0.6105291247367859, + "learning_rate": 1.3349994655458785e-05, + "loss": 0.0568, + "step": 1467 + }, + { + "epoch": 2.6, + "grad_norm": 0.060310959815979004, + "learning_rate": 1.3234895804691275e-05, + "loss": 0.0022, + "step": 1468 + }, + { + "epoch": 2.6, + "grad_norm": 0.008944911882281303, + "learning_rate": 1.3120272371364498e-05, + "loss": 0.0006, + "step": 1469 + }, + { + "epoch": 2.6, + "grad_norm": 0.01793264038860798, + "learning_rate": 1.3006124753927943e-05, + "loss": 0.0007, + "step": 1470 + }, + { + "epoch": 2.6, + "grad_norm": 0.11373735219240189, + "learning_rate": 1.2892453349177073e-05, + "loss": 0.0024, + "step": 1471 + }, + { + "epoch": 2.6, + "grad_norm": 0.0023288100492209196, + "learning_rate": 1.2779258552251826e-05, + "loss": 0.0002, + "step": 1472 + }, + { + "epoch": 2.61, + "grad_norm": 0.004119677934795618, + "learning_rate": 1.2666540756635546e-05, + "loss": 0.0003, + "step": 1473 + }, + { + "epoch": 2.61, + "grad_norm": 0.07424701005220413, + "learning_rate": 1.2554300354153324e-05, + "loss": 0.0025, + "step": 1474 + }, + { + "epoch": 2.61, + "grad_norm": 0.002191951498389244, + "learning_rate": 1.2442537734970842e-05, + "loss": 0.0002, + "step": 1475 + }, + { + "epoch": 2.61, + "grad_norm": 0.007462161593139172, + "learning_rate": 1.2331253287592929e-05, + "loss": 0.0005, + "step": 1476 + }, + { + "epoch": 2.61, + "grad_norm": 0.004635229241102934, + "learning_rate": 1.2220447398862076e-05, + "loss": 0.0003, + "step": 1477 + }, + { + "epoch": 2.61, + "grad_norm": 0.006031651049852371, + "learning_rate": 1.2110120453957405e-05, + "loss": 0.0003, + "step": 1478 + }, + { + "epoch": 2.62, + "grad_norm": 0.0031433457043021917, + "learning_rate": 1.2000272836393022e-05, + "loss": 0.0002, + "step": 1479 + }, + { + "epoch": 2.62, + "grad_norm": 0.03857436031103134, + "learning_rate": 1.1890904928016926e-05, + "loss": 0.0017, + "step": 1480 + }, + { + "epoch": 2.62, + "grad_norm": 0.04181879758834839, + "learning_rate": 1.1782017109009456e-05, + "loss": 0.0004, + "step": 1481 + }, + { + "epoch": 2.62, + "grad_norm": 0.0021477253176271915, + "learning_rate": 1.1673609757882153e-05, + "loss": 0.0002, + "step": 1482 + }, + { + "epoch": 2.62, + "grad_norm": 0.1069151759147644, + "learning_rate": 1.1565683251476354e-05, + "loss": 0.0028, + "step": 1483 + }, + { + "epoch": 2.63, + "grad_norm": 0.025872178375720978, + "learning_rate": 1.1458237964961942e-05, + "loss": 0.0007, + "step": 1484 + }, + { + "epoch": 2.63, + "grad_norm": 0.008154606446623802, + "learning_rate": 1.1351274271835947e-05, + "loss": 0.0004, + "step": 1485 + }, + { + "epoch": 2.63, + "grad_norm": 0.6758263111114502, + "learning_rate": 1.1244792543921327e-05, + "loss": 0.0177, + "step": 1486 + }, + { + "epoch": 2.63, + "grad_norm": 0.0019666326697915792, + "learning_rate": 1.1138793151365611e-05, + "loss": 0.0002, + "step": 1487 + }, + { + "epoch": 2.63, + "grad_norm": 0.01791139505803585, + "learning_rate": 1.1033276462639734e-05, + "loss": 0.0005, + "step": 1488 + }, + { + "epoch": 2.63, + "grad_norm": 0.337736040353775, + "learning_rate": 1.0928242844536644e-05, + "loss": 0.0069, + "step": 1489 + }, + { + "epoch": 2.64, + "grad_norm": 0.04357455298304558, + "learning_rate": 1.0823692662170013e-05, + "loss": 0.0019, + "step": 1490 + }, + { + "epoch": 2.64, + "grad_norm": 0.39360976219177246, + "learning_rate": 1.0719626278973058e-05, + "loss": 0.0109, + "step": 1491 + }, + { + "epoch": 2.64, + "grad_norm": 0.010024788789451122, + "learning_rate": 1.0616044056697209e-05, + "loss": 0.0006, + "step": 1492 + }, + { + "epoch": 2.64, + "grad_norm": 0.006321369204670191, + "learning_rate": 1.051294635541089e-05, + "loss": 0.0004, + "step": 1493 + }, + { + "epoch": 2.64, + "grad_norm": 0.014464390464127064, + "learning_rate": 1.0410333533498272e-05, + "loss": 0.0005, + "step": 1494 + }, + { + "epoch": 2.64, + "grad_norm": 0.0068323672749102116, + "learning_rate": 1.0308205947657977e-05, + "loss": 0.0005, + "step": 1495 + }, + { + "epoch": 2.65, + "grad_norm": 0.09922464936971664, + "learning_rate": 1.0206563952901842e-05, + "loss": 0.0019, + "step": 1496 + }, + { + "epoch": 2.65, + "grad_norm": 0.004133671522140503, + "learning_rate": 1.0105407902553786e-05, + "loss": 0.0002, + "step": 1497 + }, + { + "epoch": 2.65, + "grad_norm": 0.0314289852976799, + "learning_rate": 1.0004738148248431e-05, + "loss": 0.0012, + "step": 1498 + }, + { + "epoch": 2.65, + "grad_norm": 0.023681310936808586, + "learning_rate": 9.904555039930018e-06, + "loss": 0.0006, + "step": 1499 + }, + { + "epoch": 2.65, + "grad_norm": 0.4027155041694641, + "learning_rate": 9.804858925851121e-06, + "loss": 0.0931, + "step": 1500 + }, + { + "epoch": 2.66, + "grad_norm": 0.00320827285759151, + "learning_rate": 9.70565015257136e-06, + "loss": 0.0004, + "step": 1501 + }, + { + "epoch": 2.66, + "grad_norm": 0.0075167641043663025, + "learning_rate": 9.606929064956397e-06, + "loss": 0.0005, + "step": 1502 + }, + { + "epoch": 2.66, + "grad_norm": 0.005038086324930191, + "learning_rate": 9.508696006176547e-06, + "loss": 0.0003, + "step": 1503 + }, + { + "epoch": 2.66, + "grad_norm": 0.0022592307068407536, + "learning_rate": 9.410951317705707e-06, + "loss": 0.0002, + "step": 1504 + }, + { + "epoch": 2.66, + "grad_norm": 0.02426138147711754, + "learning_rate": 9.313695339320065e-06, + "loss": 0.0006, + "step": 1505 + }, + { + "epoch": 2.66, + "grad_norm": 0.3398880660533905, + "learning_rate": 9.216928409097008e-06, + "loss": 0.0086, + "step": 1506 + }, + { + "epoch": 2.67, + "grad_norm": 0.004356835037469864, + "learning_rate": 9.120650863413925e-06, + "loss": 0.0003, + "step": 1507 + }, + { + "epoch": 2.67, + "grad_norm": 0.007244028151035309, + "learning_rate": 9.024863036946972e-06, + "loss": 0.0005, + "step": 1508 + }, + { + "epoch": 2.67, + "grad_norm": 0.0058324565179646015, + "learning_rate": 8.92956526267003e-06, + "loss": 0.0004, + "step": 1509 + }, + { + "epoch": 2.67, + "grad_norm": 0.002389086177572608, + "learning_rate": 8.83475787185346e-06, + "loss": 0.0002, + "step": 1510 + }, + { + "epoch": 2.67, + "grad_norm": 0.002029439900070429, + "learning_rate": 8.74044119406292e-06, + "loss": 0.0002, + "step": 1511 + }, + { + "epoch": 2.67, + "grad_norm": 0.012978079728782177, + "learning_rate": 8.646615557158348e-06, + "loss": 0.0006, + "step": 1512 + }, + { + "epoch": 2.68, + "grad_norm": 0.010945575311779976, + "learning_rate": 8.553281287292684e-06, + "loss": 0.0005, + "step": 1513 + }, + { + "epoch": 2.68, + "grad_norm": 0.052716005593538284, + "learning_rate": 8.460438708910832e-06, + "loss": 0.0017, + "step": 1514 + }, + { + "epoch": 2.68, + "grad_norm": 0.0019225055584684014, + "learning_rate": 8.368088144748514e-06, + "loss": 0.0002, + "step": 1515 + }, + { + "epoch": 2.68, + "grad_norm": 0.007164741866290569, + "learning_rate": 8.276229915831035e-06, + "loss": 0.0005, + "step": 1516 + }, + { + "epoch": 2.68, + "grad_norm": 0.09314575791358948, + "learning_rate": 8.184864341472419e-06, + "loss": 0.0019, + "step": 1517 + }, + { + "epoch": 2.69, + "grad_norm": 0.004978674463927746, + "learning_rate": 8.093991739273981e-06, + "loss": 0.0004, + "step": 1518 + }, + { + "epoch": 2.69, + "grad_norm": 0.028860457241535187, + "learning_rate": 8.003612425123517e-06, + "loss": 0.0011, + "step": 1519 + }, + { + "epoch": 2.69, + "grad_norm": 0.4438951313495636, + "learning_rate": 7.913726713194018e-06, + "loss": 0.0235, + "step": 1520 + }, + { + "epoch": 2.69, + "grad_norm": 0.02038520760834217, + "learning_rate": 7.824334915942593e-06, + "loss": 0.0008, + "step": 1521 + }, + { + "epoch": 2.69, + "grad_norm": 0.04750734195113182, + "learning_rate": 7.735437344109491e-06, + "loss": 0.0011, + "step": 1522 + }, + { + "epoch": 2.69, + "grad_norm": 0.0024837073870003223, + "learning_rate": 7.647034306716987e-06, + "loss": 0.0002, + "step": 1523 + }, + { + "epoch": 2.7, + "grad_norm": 0.017344102263450623, + "learning_rate": 7.559126111068131e-06, + "loss": 0.0007, + "step": 1524 + }, + { + "epoch": 2.7, + "grad_norm": 0.578118622303009, + "learning_rate": 7.471713062745965e-06, + "loss": 0.0605, + "step": 1525 + }, + { + "epoch": 2.7, + "grad_norm": 0.002272555371746421, + "learning_rate": 7.384795465612242e-06, + "loss": 0.0002, + "step": 1526 + }, + { + "epoch": 2.7, + "grad_norm": 0.001334111439064145, + "learning_rate": 7.298373621806492e-06, + "loss": 0.0001, + "step": 1527 + }, + { + "epoch": 2.7, + "grad_norm": 0.04619232565164566, + "learning_rate": 7.2124478317449385e-06, + "loss": 0.0017, + "step": 1528 + }, + { + "epoch": 2.7, + "grad_norm": 0.019857389852404594, + "learning_rate": 7.127018394119322e-06, + "loss": 0.0009, + "step": 1529 + }, + { + "epoch": 2.71, + "grad_norm": 0.007324309553951025, + "learning_rate": 7.0420856058961415e-06, + "loss": 0.0004, + "step": 1530 + }, + { + "epoch": 2.71, + "grad_norm": 0.0024603945203125477, + "learning_rate": 6.957649762315365e-06, + "loss": 0.0003, + "step": 1531 + }, + { + "epoch": 2.71, + "grad_norm": 0.1148630753159523, + "learning_rate": 6.873711156889539e-06, + "loss": 0.0035, + "step": 1532 + }, + { + "epoch": 2.71, + "grad_norm": 0.01363435760140419, + "learning_rate": 6.790270081402743e-06, + "loss": 0.0007, + "step": 1533 + }, + { + "epoch": 2.71, + "grad_norm": 0.0023427484557032585, + "learning_rate": 6.707326825909476e-06, + "loss": 0.0002, + "step": 1534 + }, + { + "epoch": 2.72, + "grad_norm": 0.004616041202098131, + "learning_rate": 6.624881678733851e-06, + "loss": 0.0003, + "step": 1535 + }, + { + "epoch": 2.72, + "grad_norm": 0.1254875212907791, + "learning_rate": 6.542934926468435e-06, + "loss": 0.0061, + "step": 1536 + }, + { + "epoch": 2.72, + "grad_norm": 0.002315373858436942, + "learning_rate": 6.461486853973263e-06, + "loss": 0.0002, + "step": 1537 + }, + { + "epoch": 2.72, + "grad_norm": 0.14256764948368073, + "learning_rate": 6.3805377443748915e-06, + "loss": 0.0041, + "step": 1538 + }, + { + "epoch": 2.72, + "grad_norm": 0.002877446124330163, + "learning_rate": 6.300087879065397e-06, + "loss": 0.0002, + "step": 1539 + }, + { + "epoch": 2.72, + "grad_norm": 0.02434990368783474, + "learning_rate": 6.2201375377014585e-06, + "loss": 0.0009, + "step": 1540 + }, + { + "epoch": 2.73, + "grad_norm": 0.46708935499191284, + "learning_rate": 6.140686998203265e-06, + "loss": 0.0055, + "step": 1541 + }, + { + "epoch": 2.73, + "grad_norm": 0.003949303645640612, + "learning_rate": 6.061736536753642e-06, + "loss": 0.0003, + "step": 1542 + }, + { + "epoch": 2.73, + "grad_norm": 0.00705227255821228, + "learning_rate": 5.983286427797057e-06, + "loss": 0.0003, + "step": 1543 + }, + { + "epoch": 2.73, + "grad_norm": 0.5218189358711243, + "learning_rate": 5.905336944038652e-06, + "loss": 0.0326, + "step": 1544 + }, + { + "epoch": 2.73, + "grad_norm": 0.01854199543595314, + "learning_rate": 5.827888356443361e-06, + "loss": 0.001, + "step": 1545 + }, + { + "epoch": 2.74, + "grad_norm": 0.00732912914827466, + "learning_rate": 5.750940934234893e-06, + "loss": 0.0003, + "step": 1546 + }, + { + "epoch": 2.74, + "grad_norm": 0.0005807331181131303, + "learning_rate": 5.674494944894853e-06, + "loss": 0.0001, + "step": 1547 + }, + { + "epoch": 2.74, + "grad_norm": 0.011191635392606258, + "learning_rate": 5.5985506541617045e-06, + "loss": 0.0004, + "step": 1548 + }, + { + "epoch": 2.74, + "grad_norm": 0.0050989980809390545, + "learning_rate": 5.5231083260300405e-06, + "loss": 0.0003, + "step": 1549 + }, + { + "epoch": 2.74, + "grad_norm": 0.022310905158519745, + "learning_rate": 5.448168222749466e-06, + "loss": 0.0008, + "step": 1550 + }, + { + "epoch": 2.74, + "grad_norm": 0.19058680534362793, + "learning_rate": 5.373730604823817e-06, + "loss": 0.0079, + "step": 1551 + }, + { + "epoch": 2.75, + "grad_norm": 0.00598218385130167, + "learning_rate": 5.299795731010193e-06, + "loss": 0.0004, + "step": 1552 + }, + { + "epoch": 2.75, + "grad_norm": 0.0020094155333936214, + "learning_rate": 5.226363858318039e-06, + "loss": 0.0002, + "step": 1553 + }, + { + "epoch": 2.75, + "grad_norm": 0.0019954463932663202, + "learning_rate": 5.153435242008386e-06, + "loss": 0.0002, + "step": 1554 + }, + { + "epoch": 2.75, + "grad_norm": 0.014590064063668251, + "learning_rate": 5.0810101355927445e-06, + "loss": 0.0004, + "step": 1555 + }, + { + "epoch": 2.75, + "grad_norm": 0.5252607464790344, + "learning_rate": 5.009088790832444e-06, + "loss": 0.0116, + "step": 1556 + }, + { + "epoch": 2.75, + "grad_norm": 0.09568741917610168, + "learning_rate": 4.937671457737597e-06, + "loss": 0.0015, + "step": 1557 + }, + { + "epoch": 2.76, + "grad_norm": 0.18557792901992798, + "learning_rate": 4.8667583845662995e-06, + "loss": 0.0052, + "step": 1558 + }, + { + "epoch": 2.76, + "grad_norm": 0.010230082087218761, + "learning_rate": 4.796349817823802e-06, + "loss": 0.0004, + "step": 1559 + }, + { + "epoch": 2.76, + "grad_norm": 0.0064284843392670155, + "learning_rate": 4.726446002261542e-06, + "loss": 0.0004, + "step": 1560 + }, + { + "epoch": 2.76, + "grad_norm": 0.005189824849367142, + "learning_rate": 4.657047180876406e-06, + "loss": 0.0003, + "step": 1561 + }, + { + "epoch": 2.76, + "grad_norm": 0.551846981048584, + "learning_rate": 4.588153594909854e-06, + "loss": 0.0429, + "step": 1562 + }, + { + "epoch": 2.76, + "eval_loss": 0.12607495486736298, + "eval_runtime": 14.6966, + "eval_samples_per_second": 32.457, + "eval_steps_per_second": 8.165, + "step": 1562 + }, + { + "epoch": 2.77, + "grad_norm": 0.0018496522679924965, + "learning_rate": 4.519765483847005e-06, + "loss": 0.0001, + "step": 1563 + }, + { + "epoch": 2.77, + "grad_norm": 0.002661915495991707, + "learning_rate": 4.4518830854159246e-06, + "loss": 0.0002, + "step": 1564 + }, + { + "epoch": 2.77, + "grad_norm": 0.08861052989959717, + "learning_rate": 4.38450663558671e-06, + "loss": 0.0026, + "step": 1565 + }, + { + "epoch": 2.77, + "grad_norm": 0.021314358338713646, + "learning_rate": 4.317636368570726e-06, + "loss": 0.0007, + "step": 1566 + }, + { + "epoch": 2.77, + "grad_norm": 0.00789622776210308, + "learning_rate": 4.2512725168197315e-06, + "loss": 0.0004, + "step": 1567 + }, + { + "epoch": 2.77, + "grad_norm": 0.017269369214773178, + "learning_rate": 4.1854153110251085e-06, + "loss": 0.0005, + "step": 1568 + }, + { + "epoch": 2.78, + "grad_norm": 0.051121633499860764, + "learning_rate": 4.120064980117099e-06, + "loss": 0.002, + "step": 1569 + }, + { + "epoch": 2.78, + "grad_norm": 0.007395217195153236, + "learning_rate": 4.055221751263921e-06, + "loss": 0.0004, + "step": 1570 + }, + { + "epoch": 2.78, + "grad_norm": 0.015933679416775703, + "learning_rate": 3.9908858498710175e-06, + "loss": 0.0008, + "step": 1571 + }, + { + "epoch": 2.78, + "grad_norm": 0.015494209714233875, + "learning_rate": 3.927057499580344e-06, + "loss": 0.0004, + "step": 1572 + }, + { + "epoch": 2.78, + "grad_norm": 0.3765923082828522, + "learning_rate": 3.86373692226945e-06, + "loss": 0.0679, + "step": 1573 + }, + { + "epoch": 2.78, + "grad_norm": 0.05609598383307457, + "learning_rate": 3.800924338050848e-06, + "loss": 0.0024, + "step": 1574 + }, + { + "epoch": 2.79, + "grad_norm": 0.019249215722084045, + "learning_rate": 3.7386199652711445e-06, + "loss": 0.0009, + "step": 1575 + }, + { + "epoch": 2.79, + "grad_norm": 0.007718250621110201, + "learning_rate": 3.676824020510327e-06, + "loss": 0.0004, + "step": 1576 + }, + { + "epoch": 2.79, + "grad_norm": 0.08048631250858307, + "learning_rate": 3.6155367185810137e-06, + "loss": 0.0014, + "step": 1577 + }, + { + "epoch": 2.79, + "grad_norm": 0.4660739302635193, + "learning_rate": 3.5547582725277025e-06, + "loss": 0.0387, + "step": 1578 + }, + { + "epoch": 2.79, + "grad_norm": 0.03236926719546318, + "learning_rate": 3.494488893626024e-06, + "loss": 0.0009, + "step": 1579 + }, + { + "epoch": 2.8, + "grad_norm": 0.011793696321547031, + "learning_rate": 3.434728791381991e-06, + "loss": 0.0007, + "step": 1580 + }, + { + "epoch": 2.8, + "grad_norm": 0.34605443477630615, + "learning_rate": 3.3754781735313154e-06, + "loss": 0.0449, + "step": 1581 + }, + { + "epoch": 2.8, + "grad_norm": 0.020334916189312935, + "learning_rate": 3.3167372460386753e-06, + "loss": 0.0012, + "step": 1582 + }, + { + "epoch": 2.8, + "grad_norm": 0.02422250621020794, + "learning_rate": 3.2585062130969008e-06, + "loss": 0.0006, + "step": 1583 + }, + { + "epoch": 2.8, + "grad_norm": 0.5089511275291443, + "learning_rate": 3.2007852771264554e-06, + "loss": 0.0125, + "step": 1584 + }, + { + "epoch": 2.8, + "grad_norm": 0.00923207402229309, + "learning_rate": 3.143574638774554e-06, + "loss": 0.0007, + "step": 1585 + }, + { + "epoch": 2.81, + "grad_norm": 1.072019100189209, + "learning_rate": 3.086874496914532e-06, + "loss": 0.0429, + "step": 1586 + }, + { + "epoch": 2.81, + "grad_norm": 0.00296564563177526, + "learning_rate": 3.0306850486451926e-06, + "loss": 0.0003, + "step": 1587 + }, + { + "epoch": 2.81, + "grad_norm": 0.02711191587150097, + "learning_rate": 2.9750064892900936e-06, + "loss": 0.0011, + "step": 1588 + }, + { + "epoch": 2.81, + "grad_norm": 0.009598941542208195, + "learning_rate": 2.919839012396813e-06, + "loss": 0.0006, + "step": 1589 + }, + { + "epoch": 2.81, + "grad_norm": 0.05129231885075569, + "learning_rate": 2.865182809736366e-06, + "loss": 0.0016, + "step": 1590 + }, + { + "epoch": 2.81, + "grad_norm": 0.011009477078914642, + "learning_rate": 2.8110380713024407e-06, + "loss": 0.0007, + "step": 1591 + }, + { + "epoch": 2.82, + "grad_norm": 0.0025108640547841787, + "learning_rate": 2.757404985310879e-06, + "loss": 0.0002, + "step": 1592 + }, + { + "epoch": 2.82, + "grad_norm": 0.012325168587267399, + "learning_rate": 2.7042837381988814e-06, + "loss": 0.0006, + "step": 1593 + }, + { + "epoch": 2.82, + "grad_norm": 0.000850474345497787, + "learning_rate": 2.6516745146244188e-06, + "loss": 0.0001, + "step": 1594 + }, + { + "epoch": 2.82, + "grad_norm": 0.013229799456894398, + "learning_rate": 2.5995774974656048e-06, + "loss": 0.001, + "step": 1595 + }, + { + "epoch": 2.82, + "grad_norm": 0.013208562508225441, + "learning_rate": 2.5479928678199923e-06, + "loss": 0.0006, + "step": 1596 + }, + { + "epoch": 2.83, + "grad_norm": 0.010120869614183903, + "learning_rate": 2.4969208050040435e-06, + "loss": 0.0005, + "step": 1597 + }, + { + "epoch": 2.83, + "grad_norm": 0.012023484334349632, + "learning_rate": 2.446361486552462e-06, + "loss": 0.0005, + "step": 1598 + }, + { + "epoch": 2.83, + "grad_norm": 0.0028177513740956783, + "learning_rate": 2.396315088217543e-06, + "loss": 0.0002, + "step": 1599 + }, + { + "epoch": 2.83, + "grad_norm": 0.006563682109117508, + "learning_rate": 2.3467817839685765e-06, + "loss": 0.0005, + "step": 1600 + }, + { + "epoch": 2.83, + "grad_norm": 0.0017960217082872987, + "learning_rate": 2.297761745991278e-06, + "loss": 0.0002, + "step": 1601 + }, + { + "epoch": 2.83, + "grad_norm": 0.002174670109525323, + "learning_rate": 2.249255144687173e-06, + "loss": 0.0002, + "step": 1602 + }, + { + "epoch": 2.84, + "grad_norm": 0.003578716889023781, + "learning_rate": 2.2012621486729998e-06, + "loss": 0.0002, + "step": 1603 + }, + { + "epoch": 2.84, + "grad_norm": 0.004278930369764566, + "learning_rate": 2.1537829247800897e-06, + "loss": 0.0003, + "step": 1604 + }, + { + "epoch": 2.84, + "grad_norm": 0.0018558504525572062, + "learning_rate": 2.106817638053837e-06, + "loss": 0.0002, + "step": 1605 + }, + { + "epoch": 2.84, + "grad_norm": 0.013900038786232471, + "learning_rate": 2.0603664517531315e-06, + "loss": 0.001, + "step": 1606 + }, + { + "epoch": 2.84, + "grad_norm": 0.007856069132685661, + "learning_rate": 2.0144295273497422e-06, + "loss": 0.0003, + "step": 1607 + }, + { + "epoch": 2.84, + "grad_norm": 0.040434129536151886, + "learning_rate": 1.9690070245277856e-06, + "loss": 0.0011, + "step": 1608 + }, + { + "epoch": 2.85, + "grad_norm": 0.0025377573911100626, + "learning_rate": 1.9240991011831573e-06, + "loss": 0.0002, + "step": 1609 + }, + { + "epoch": 2.85, + "grad_norm": 0.19315004348754883, + "learning_rate": 1.8797059134230186e-06, + "loss": 0.0075, + "step": 1610 + }, + { + "epoch": 2.85, + "grad_norm": 0.006814354099333286, + "learning_rate": 1.8358276155651941e-06, + "loss": 0.0003, + "step": 1611 + }, + { + "epoch": 2.85, + "grad_norm": 0.004595267120748758, + "learning_rate": 1.7924643601377075e-06, + "loss": 0.0003, + "step": 1612 + }, + { + "epoch": 2.85, + "grad_norm": 0.07894070446491241, + "learning_rate": 1.7496162978781814e-06, + "loss": 0.0024, + "step": 1613 + }, + { + "epoch": 2.86, + "grad_norm": 0.006976725999265909, + "learning_rate": 1.7072835777333537e-06, + "loss": 0.0003, + "step": 1614 + }, + { + "epoch": 2.86, + "grad_norm": 1.0698426961898804, + "learning_rate": 1.6654663468585294e-06, + "loss": 0.0241, + "step": 1615 + }, + { + "epoch": 2.86, + "grad_norm": 1.146973967552185, + "learning_rate": 1.6241647506171462e-06, + "loss": 0.0595, + "step": 1616 + }, + { + "epoch": 2.86, + "grad_norm": 0.008781477808952332, + "learning_rate": 1.5833789325801428e-06, + "loss": 0.0005, + "step": 1617 + }, + { + "epoch": 2.86, + "grad_norm": 0.13376377522945404, + "learning_rate": 1.5431090345255914e-06, + "loss": 0.0098, + "step": 1618 + }, + { + "epoch": 2.86, + "grad_norm": 0.004795625805854797, + "learning_rate": 1.5033551964381163e-06, + "loss": 0.0004, + "step": 1619 + }, + { + "epoch": 2.87, + "grad_norm": 0.029682036489248276, + "learning_rate": 1.4641175565084263e-06, + "loss": 0.0014, + "step": 1620 + }, + { + "epoch": 2.87, + "grad_norm": 0.7801033854484558, + "learning_rate": 1.4253962511328654e-06, + "loss": 0.0228, + "step": 1621 + }, + { + "epoch": 2.87, + "grad_norm": 0.04092922434210777, + "learning_rate": 1.3871914149128805e-06, + "loss": 0.002, + "step": 1622 + }, + { + "epoch": 2.87, + "grad_norm": 0.01378623116761446, + "learning_rate": 1.3495031806546043e-06, + "loss": 0.0007, + "step": 1623 + }, + { + "epoch": 2.87, + "grad_norm": 0.5313521027565002, + "learning_rate": 1.3123316793684224e-06, + "loss": 0.029, + "step": 1624 + }, + { + "epoch": 2.87, + "grad_norm": 0.009036810137331486, + "learning_rate": 1.275677040268408e-06, + "loss": 0.0007, + "step": 1625 + }, + { + "epoch": 2.88, + "grad_norm": 0.012370087206363678, + "learning_rate": 1.239539390771971e-06, + "loss": 0.0007, + "step": 1626 + }, + { + "epoch": 2.88, + "grad_norm": 0.005388198886066675, + "learning_rate": 1.2039188564994085e-06, + "loss": 0.0004, + "step": 1627 + }, + { + "epoch": 2.88, + "grad_norm": 0.0035593092907220125, + "learning_rate": 1.1688155612733563e-06, + "loss": 0.0003, + "step": 1628 + }, + { + "epoch": 2.88, + "grad_norm": 0.005360861774533987, + "learning_rate": 1.1342296271185381e-06, + "loss": 0.0004, + "step": 1629 + }, + { + "epoch": 2.88, + "grad_norm": 0.11837761104106903, + "learning_rate": 1.1001611742611827e-06, + "loss": 0.0039, + "step": 1630 + }, + { + "epoch": 2.89, + "grad_norm": 0.005718675442039967, + "learning_rate": 1.0666103211287246e-06, + "loss": 0.0003, + "step": 1631 + }, + { + "epoch": 2.89, + "grad_norm": 0.029535168781876564, + "learning_rate": 1.0335771843493045e-06, + "loss": 0.0011, + "step": 1632 + }, + { + "epoch": 2.89, + "grad_norm": 0.013675598427653313, + "learning_rate": 1.0010618787513858e-06, + "loss": 0.0009, + "step": 1633 + }, + { + "epoch": 2.89, + "grad_norm": 0.0008247630903497338, + "learning_rate": 9.690645173634048e-07, + "loss": 0.0001, + "step": 1634 + }, + { + "epoch": 2.89, + "grad_norm": 0.0013318447163328528, + "learning_rate": 9.37585211413322e-07, + "loss": 0.0001, + "step": 1635 + }, + { + "epoch": 2.89, + "grad_norm": 0.0068739596754312515, + "learning_rate": 9.066240703282546e-07, + "loss": 0.0004, + "step": 1636 + }, + { + "epoch": 2.9, + "grad_norm": 0.003807327477261424, + "learning_rate": 8.761812017340775e-07, + "loss": 0.0003, + "step": 1637 + }, + { + "epoch": 2.9, + "grad_norm": 0.01904761791229248, + "learning_rate": 8.462567114551066e-07, + "loss": 0.0004, + "step": 1638 + }, + { + "epoch": 2.9, + "grad_norm": 0.24286866188049316, + "learning_rate": 8.168507035136663e-07, + "loss": 0.009, + "step": 1639 + }, + { + "epoch": 2.9, + "grad_norm": 0.0069158561527729034, + "learning_rate": 7.879632801297387e-07, + "loss": 0.0004, + "step": 1640 + }, + { + "epoch": 2.9, + "grad_norm": 0.03142062947154045, + "learning_rate": 7.595945417206817e-07, + "loss": 0.0016, + "step": 1641 + }, + { + "epoch": 2.9, + "grad_norm": 0.00610806280747056, + "learning_rate": 7.317445869007288e-07, + "loss": 0.0005, + "step": 1642 + }, + { + "epoch": 2.91, + "grad_norm": 0.0015223894733935595, + "learning_rate": 7.044135124808226e-07, + "loss": 0.0002, + "step": 1643 + }, + { + "epoch": 2.91, + "grad_norm": 0.02586345002055168, + "learning_rate": 6.776014134681651e-07, + "loss": 0.0011, + "step": 1644 + }, + { + "epoch": 2.91, + "grad_norm": 0.01392897218465805, + "learning_rate": 6.513083830659016e-07, + "loss": 0.0009, + "step": 1645 + }, + { + "epoch": 2.91, + "grad_norm": 0.09576506912708282, + "learning_rate": 6.255345126728373e-07, + "loss": 0.0024, + "step": 1646 + }, + { + "epoch": 2.91, + "grad_norm": 0.1048772782087326, + "learning_rate": 6.002798918830543e-07, + "loss": 0.0021, + "step": 1647 + }, + { + "epoch": 2.92, + "grad_norm": 0.47677847743034363, + "learning_rate": 5.755446084856785e-07, + "loss": 0.0243, + "step": 1648 + }, + { + "epoch": 2.92, + "grad_norm": 0.008908413350582123, + "learning_rate": 5.513287484645301e-07, + "loss": 0.0005, + "step": 1649 + }, + { + "epoch": 2.92, + "grad_norm": 0.015466040931642056, + "learning_rate": 5.276323959978235e-07, + "loss": 0.0009, + "step": 1650 + }, + { + "epoch": 2.92, + "grad_norm": 0.007094045635312796, + "learning_rate": 5.044556334578676e-07, + "loss": 0.0006, + "step": 1651 + }, + { + "epoch": 2.92, + "grad_norm": 1.6165049076080322, + "learning_rate": 4.817985414107994e-07, + "loss": 0.021, + "step": 1652 + }, + { + "epoch": 2.92, + "grad_norm": 0.0037352838553488255, + "learning_rate": 4.596611986163512e-07, + "loss": 0.0003, + "step": 1653 + }, + { + "epoch": 2.93, + "grad_norm": 0.18181180953979492, + "learning_rate": 4.380436820274169e-07, + "loss": 0.0047, + "step": 1654 + }, + { + "epoch": 2.93, + "grad_norm": 0.4960530400276184, + "learning_rate": 4.1694606679000265e-07, + "loss": 0.03, + "step": 1655 + }, + { + "epoch": 2.93, + "grad_norm": 0.008095409721136093, + "learning_rate": 3.9636842624277685e-07, + "loss": 0.0004, + "step": 1656 + }, + { + "epoch": 2.93, + "grad_norm": 0.003887269413098693, + "learning_rate": 3.763108319169206e-07, + "loss": 0.0003, + "step": 1657 + }, + { + "epoch": 2.93, + "grad_norm": 0.0033724040258675814, + "learning_rate": 3.567733535358608e-07, + "loss": 0.0002, + "step": 1658 + }, + { + "epoch": 2.93, + "grad_norm": 0.018379656597971916, + "learning_rate": 3.377560590149875e-07, + "loss": 0.0009, + "step": 1659 + }, + { + "epoch": 2.94, + "grad_norm": 0.007410319987684488, + "learning_rate": 3.1925901446148704e-07, + "loss": 0.0004, + "step": 1660 + }, + { + "epoch": 2.94, + "grad_norm": 0.22414793074131012, + "learning_rate": 3.0128228417404254e-07, + "loss": 0.0068, + "step": 1661 + }, + { + "epoch": 2.94, + "grad_norm": 0.014489777386188507, + "learning_rate": 2.838259306426505e-07, + "loss": 0.0008, + "step": 1662 + }, + { + "epoch": 2.94, + "grad_norm": 0.00448219757527113, + "learning_rate": 2.668900145484043e-07, + "loss": 0.0003, + "step": 1663 + }, + { + "epoch": 2.94, + "grad_norm": 0.005045249592512846, + "learning_rate": 2.504745947632447e-07, + "loss": 0.0004, + "step": 1664 + }, + { + "epoch": 2.95, + "grad_norm": 0.032420847564935684, + "learning_rate": 2.3457972834984294e-07, + "loss": 0.0017, + "step": 1665 + }, + { + "epoch": 2.95, + "grad_norm": 0.16054433584213257, + "learning_rate": 2.192054705612678e-07, + "loss": 0.0049, + "step": 1666 + }, + { + "epoch": 2.95, + "grad_norm": 0.0027328289579600096, + "learning_rate": 2.0435187484091898e-07, + "loss": 0.0002, + "step": 1667 + }, + { + "epoch": 2.95, + "grad_norm": 0.060554660856723785, + "learning_rate": 1.9001899282227727e-07, + "loss": 0.002, + "step": 1668 + }, + { + "epoch": 2.95, + "grad_norm": 0.0209331214427948, + "learning_rate": 1.7620687432873815e-07, + "loss": 0.0011, + "step": 1669 + }, + { + "epoch": 2.95, + "grad_norm": 0.047592274844646454, + "learning_rate": 1.62915567373445e-07, + "loss": 0.0014, + "step": 1670 + }, + { + "epoch": 2.96, + "grad_norm": 0.009739769622683525, + "learning_rate": 1.501451181591229e-07, + "loss": 0.0007, + "step": 1671 + }, + { + "epoch": 2.96, + "grad_norm": 0.004680559504777193, + "learning_rate": 1.3789557107787842e-07, + "loss": 0.0003, + "step": 1672 + }, + { + "epoch": 2.96, + "grad_norm": 0.006286246236413717, + "learning_rate": 1.2616696871111665e-07, + "loss": 0.0003, + "step": 1673 + }, + { + "epoch": 2.96, + "grad_norm": 0.011701489798724651, + "learning_rate": 1.1495935182932459e-07, + "loss": 0.0008, + "step": 1674 + }, + { + "epoch": 2.96, + "grad_norm": 0.0025694698560982943, + "learning_rate": 1.0427275939200453e-07, + "loss": 0.0002, + "step": 1675 + }, + { + "epoch": 2.97, + "grad_norm": 0.003664742223918438, + "learning_rate": 9.41072285474409e-08, + "loss": 0.0003, + "step": 1676 + }, + { + "epoch": 2.97, + "grad_norm": 0.007470333017408848, + "learning_rate": 8.446279463266703e-08, + "loss": 0.0004, + "step": 1677 + }, + { + "epoch": 2.97, + "grad_norm": 0.004365910310298204, + "learning_rate": 7.533949117328186e-08, + "loss": 0.0004, + "step": 1678 + }, + { + "epoch": 2.97, + "grad_norm": 0.004565135575830936, + "learning_rate": 6.673734988333346e-08, + "loss": 0.0005, + "step": 1679 + }, + { + "epoch": 2.97, + "grad_norm": 0.0038688648492097855, + "learning_rate": 5.8656400665252346e-08, + "loss": 0.0003, + "step": 1680 + }, + { + "epoch": 2.97, + "grad_norm": 0.04518837854266167, + "learning_rate": 5.109667160973496e-08, + "loss": 0.0009, + "step": 1681 + }, + { + "epoch": 2.98, + "grad_norm": 0.003927054814994335, + "learning_rate": 4.405818899557711e-08, + "loss": 0.0003, + "step": 1682 + }, + { + "epoch": 2.98, + "grad_norm": 0.003371802158653736, + "learning_rate": 3.7540977289707285e-08, + "loss": 0.0002, + "step": 1683 + }, + { + "epoch": 2.98, + "grad_norm": 0.0015175462467595935, + "learning_rate": 3.154505914697014e-08, + "loss": 0.0001, + "step": 1684 + }, + { + "epoch": 2.98, + "grad_norm": 0.6906746029853821, + "learning_rate": 2.607045541015984e-08, + "loss": 0.0241, + "step": 1685 + }, + { + "epoch": 2.98, + "grad_norm": 0.34494316577911377, + "learning_rate": 2.1117185109853495e-08, + "loss": 0.0273, + "step": 1686 + }, + { + "epoch": 2.98, + "grad_norm": 0.007323576603084803, + "learning_rate": 1.6685265464444486e-08, + "loss": 0.0004, + "step": 1687 + }, + { + "epoch": 2.99, + "grad_norm": 0.5140528082847595, + "learning_rate": 1.2774711879975919e-08, + "loss": 0.0153, + "step": 1688 + }, + { + "epoch": 2.99, + "grad_norm": 0.0021088668145239353, + "learning_rate": 9.385537950173937e-09, + "loss": 0.0001, + "step": 1689 + }, + { + "epoch": 2.99, + "grad_norm": 0.03296704962849617, + "learning_rate": 6.5177554563311526e-09, + "loss": 0.0011, + "step": 1690 + }, + { + "epoch": 2.99, + "grad_norm": 0.010219641029834747, + "learning_rate": 4.171374367323288e-09, + "loss": 0.0007, + "step": 1691 + }, + { + "epoch": 2.99, + "grad_norm": 0.008996028453111649, + "learning_rate": 2.3464028395592247e-09, + "loss": 0.0005, + "step": 1692 + }, + { + "epoch": 3.0, + "grad_norm": 0.10714947432279587, + "learning_rate": 1.042847216931042e-09, + "loss": 0.0035, + "step": 1693 + }, + { + "epoch": 3.0, + "grad_norm": 0.21872705221176147, + "learning_rate": 2.607120307973609e-10, + "loss": 0.0048, + "step": 1694 + }, + { + "epoch": 3.0, + "grad_norm": 0.007370665203779936, + "learning_rate": 0.0, + "loss": 0.0003, + "step": 1695 + } + ], + "logging_steps": 1, + "max_steps": 1695, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 565, + "total_flos": 1.549439947809751e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}