diff --git "a/checkpoint-18096/trainer_state.json" "b/checkpoint-18096/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-18096/trainer_state.json" @@ -0,0 +1,6482 @@ +{ + "best_global_step": 18096, + "best_metric": 4.172011852264404, + "best_model_checkpoint": "./checkpoints/kani_tts_vi/checkpoint-18096", + "epoch": 2.9985086992543497, + "eval_steps": 1508, + "global_step": 18096, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_loss": 5.765849590301514, + "eval_runtime": 183.6622, + "eval_samples_per_second": 116.829, + "eval_steps_per_second": 14.608, + "step": 0 + }, + { + "epoch": 0.00016570008285004143, + "grad_norm": 1.1235119104385376, + "learning_rate": 0.0, + "loss": 5.7579, + "step": 1 + }, + { + "epoch": 0.0033140016570008283, + "grad_norm": 1.0993030071258545, + "learning_rate": 2.098288238542242e-07, + "loss": 5.7648, + "step": 20 + }, + { + "epoch": 0.006628003314001657, + "grad_norm": 1.1002686023712158, + "learning_rate": 4.3070127001656544e-07, + "loss": 5.7493, + "step": 40 + }, + { + "epoch": 0.009942004971002486, + "grad_norm": 1.1715701818466187, + "learning_rate": 6.515737161789068e-07, + "loss": 5.7552, + "step": 60 + }, + { + "epoch": 0.013256006628003313, + "grad_norm": 1.0982500314712524, + "learning_rate": 8.724461623412479e-07, + "loss": 5.7295, + "step": 80 + }, + { + "epoch": 0.016570008285004142, + "grad_norm": 1.0379549264907837, + "learning_rate": 1.0933186085035892e-06, + "loss": 5.7161, + "step": 100 + }, + { + "epoch": 0.01988400994200497, + "grad_norm": 0.9815719127655029, + "learning_rate": 1.3141910546659305e-06, + "loss": 5.6992, + "step": 120 + }, + { + "epoch": 0.0231980115990058, + "grad_norm": 0.9279279112815857, + "learning_rate": 1.5350635008282719e-06, + "loss": 5.6663, + "step": 140 + }, + { + "epoch": 0.026512013256006627, + "grad_norm": 0.7874447703361511, + "learning_rate": 1.7559359469906132e-06, + "loss": 5.6036, + "step": 160 + }, + { + "epoch": 0.029826014913007456, + "grad_norm": 0.641957700252533, + "learning_rate": 1.9768083931529544e-06, + "loss": 5.5488, + "step": 180 + }, + { + "epoch": 0.033140016570008285, + "grad_norm": 0.583203911781311, + "learning_rate": 2.1976808393152955e-06, + "loss": 5.5007, + "step": 200 + }, + { + "epoch": 0.03645401822700911, + "grad_norm": 0.6346896290779114, + "learning_rate": 2.418553285477637e-06, + "loss": 5.4716, + "step": 220 + }, + { + "epoch": 0.03976801988400994, + "grad_norm": 0.4862409830093384, + "learning_rate": 2.639425731639978e-06, + "loss": 5.4267, + "step": 240 + }, + { + "epoch": 0.04308202154101077, + "grad_norm": 0.42509493231773376, + "learning_rate": 2.8602981778023194e-06, + "loss": 5.369, + "step": 260 + }, + { + "epoch": 0.0463960231980116, + "grad_norm": 0.4597702920436859, + "learning_rate": 3.0811706239646605e-06, + "loss": 5.3477, + "step": 280 + }, + { + "epoch": 0.04971002485501243, + "grad_norm": 0.36589357256889343, + "learning_rate": 3.3020430701270016e-06, + "loss": 5.2935, + "step": 300 + }, + { + "epoch": 0.05302402651201325, + "grad_norm": 0.36844995617866516, + "learning_rate": 3.522915516289343e-06, + "loss": 5.2919, + "step": 320 + }, + { + "epoch": 0.056338028169014086, + "grad_norm": 0.3322344124317169, + "learning_rate": 3.7437879624516843e-06, + "loss": 5.2649, + "step": 340 + }, + { + "epoch": 0.05965202982601491, + "grad_norm": 0.3425517678260803, + "learning_rate": 3.964660408614026e-06, + "loss": 5.2381, + "step": 360 + }, + { + 
"epoch": 0.06296603148301574, + "grad_norm": 0.31897497177124023, + "learning_rate": 4.185532854776367e-06, + "loss": 5.2144, + "step": 380 + }, + { + "epoch": 0.06628003314001657, + "grad_norm": 0.32102635502815247, + "learning_rate": 4.406405300938708e-06, + "loss": 5.1851, + "step": 400 + }, + { + "epoch": 0.0695940347970174, + "grad_norm": 0.3096570372581482, + "learning_rate": 4.62727774710105e-06, + "loss": 5.158, + "step": 420 + }, + { + "epoch": 0.07290803645401822, + "grad_norm": 0.3279052972793579, + "learning_rate": 4.8481501932633905e-06, + "loss": 5.1294, + "step": 440 + }, + { + "epoch": 0.07622203811101906, + "grad_norm": 0.3023601174354553, + "learning_rate": 5.069022639425733e-06, + "loss": 5.1182, + "step": 460 + }, + { + "epoch": 0.07953603976801989, + "grad_norm": 0.31305599212646484, + "learning_rate": 5.289895085588074e-06, + "loss": 5.0938, + "step": 480 + }, + { + "epoch": 0.08285004142502071, + "grad_norm": 0.3819890022277832, + "learning_rate": 5.510767531750415e-06, + "loss": 5.0821, + "step": 500 + }, + { + "epoch": 0.08616404308202154, + "grad_norm": 0.36871927976608276, + "learning_rate": 5.731639977912756e-06, + "loss": 5.0603, + "step": 520 + }, + { + "epoch": 0.08947804473902236, + "grad_norm": 0.339070588350296, + "learning_rate": 5.9525124240750975e-06, + "loss": 5.0473, + "step": 540 + }, + { + "epoch": 0.0927920463960232, + "grad_norm": 0.3293665051460266, + "learning_rate": 6.173384870237439e-06, + "loss": 5.0157, + "step": 560 + }, + { + "epoch": 0.09610604805302403, + "grad_norm": 0.3422246277332306, + "learning_rate": 6.39425731639978e-06, + "loss": 5.0175, + "step": 580 + }, + { + "epoch": 0.09942004971002485, + "grad_norm": 0.4414791166782379, + "learning_rate": 6.615129762562121e-06, + "loss": 5.0043, + "step": 600 + }, + { + "epoch": 0.10273405136702568, + "grad_norm": 0.38322046399116516, + "learning_rate": 6.836002208724463e-06, + "loss": 4.9613, + "step": 620 + }, + { + "epoch": 0.1060480530240265, + "grad_norm": 0.46550989151000977, + "learning_rate": 7.056874654886804e-06, + "loss": 4.9527, + "step": 640 + }, + { + "epoch": 0.10936205468102735, + "grad_norm": 0.35673046112060547, + "learning_rate": 7.277747101049145e-06, + "loss": 4.9505, + "step": 660 + }, + { + "epoch": 0.11267605633802817, + "grad_norm": 0.4334908425807953, + "learning_rate": 7.498619547211487e-06, + "loss": 4.919, + "step": 680 + }, + { + "epoch": 0.115990057995029, + "grad_norm": 0.43396326899528503, + "learning_rate": 7.719491993373828e-06, + "loss": 4.8998, + "step": 700 + }, + { + "epoch": 0.11930405965202982, + "grad_norm": 0.4822360575199127, + "learning_rate": 7.940364439536168e-06, + "loss": 4.868, + "step": 720 + }, + { + "epoch": 0.12261806130903065, + "grad_norm": 0.4604535698890686, + "learning_rate": 8.16123688569851e-06, + "loss": 4.8607, + "step": 740 + }, + { + "epoch": 0.1259320629660315, + "grad_norm": 0.44878089427948, + "learning_rate": 8.382109331860851e-06, + "loss": 4.849, + "step": 760 + }, + { + "epoch": 0.12924606462303231, + "grad_norm": 0.40941646695137024, + "learning_rate": 8.602981778023193e-06, + "loss": 4.8311, + "step": 780 + }, + { + "epoch": 0.13256006628003314, + "grad_norm": 0.4913872182369232, + "learning_rate": 8.823854224185534e-06, + "loss": 4.8188, + "step": 800 + }, + { + "epoch": 0.13587406793703397, + "grad_norm": 0.4528390169143677, + "learning_rate": 9.044726670347876e-06, + "loss": 4.7939, + "step": 820 + }, + { + "epoch": 0.1391880695940348, + "grad_norm": 0.5730827450752258, + "learning_rate": 9.265599116510216e-06, + 
"loss": 4.7842, + "step": 840 + }, + { + "epoch": 0.14250207125103562, + "grad_norm": 0.4298979341983795, + "learning_rate": 9.486471562672557e-06, + "loss": 4.7936, + "step": 860 + }, + { + "epoch": 0.14581607290803644, + "grad_norm": 0.5327754020690918, + "learning_rate": 9.707344008834899e-06, + "loss": 4.7684, + "step": 880 + }, + { + "epoch": 0.1491300745650373, + "grad_norm": 0.5210330486297607, + "learning_rate": 9.92821645499724e-06, + "loss": 4.7625, + "step": 900 + }, + { + "epoch": 0.15244407622203812, + "grad_norm": 0.46335604786872864, + "learning_rate": 1.014908890115958e-05, + "loss": 4.7557, + "step": 920 + }, + { + "epoch": 0.15575807787903895, + "grad_norm": 0.5017201900482178, + "learning_rate": 1.0369961347321922e-05, + "loss": 4.7161, + "step": 940 + }, + { + "epoch": 0.15907207953603977, + "grad_norm": 0.5733606815338135, + "learning_rate": 1.0590833793484264e-05, + "loss": 4.7297, + "step": 960 + }, + { + "epoch": 0.1623860811930406, + "grad_norm": 0.5582395792007446, + "learning_rate": 1.0811706239646605e-05, + "loss": 4.7073, + "step": 980 + }, + { + "epoch": 0.16570008285004142, + "grad_norm": 0.5947230458259583, + "learning_rate": 1.1032578685808945e-05, + "loss": 4.6942, + "step": 1000 + }, + { + "epoch": 0.16901408450704225, + "grad_norm": 0.575781524181366, + "learning_rate": 1.1253451131971287e-05, + "loss": 4.6821, + "step": 1020 + }, + { + "epoch": 0.17232808616404308, + "grad_norm": 0.6180970668792725, + "learning_rate": 1.1474323578133628e-05, + "loss": 4.682, + "step": 1040 + }, + { + "epoch": 0.1756420878210439, + "grad_norm": 0.7229332327842712, + "learning_rate": 1.169519602429597e-05, + "loss": 4.667, + "step": 1060 + }, + { + "epoch": 0.17895608947804473, + "grad_norm": 0.6373065114021301, + "learning_rate": 1.1916068470458311e-05, + "loss": 4.647, + "step": 1080 + }, + { + "epoch": 0.18227009113504558, + "grad_norm": 0.5774452090263367, + "learning_rate": 1.2136940916620653e-05, + "loss": 4.6472, + "step": 1100 + }, + { + "epoch": 0.1855840927920464, + "grad_norm": 0.6182013750076294, + "learning_rate": 1.2357813362782993e-05, + "loss": 4.6357, + "step": 1120 + }, + { + "epoch": 0.18889809444904723, + "grad_norm": 0.5788661241531372, + "learning_rate": 1.2578685808945334e-05, + "loss": 4.6124, + "step": 1140 + }, + { + "epoch": 0.19221209610604806, + "grad_norm": 0.575722336769104, + "learning_rate": 1.2799558255107676e-05, + "loss": 4.6073, + "step": 1160 + }, + { + "epoch": 0.19552609776304888, + "grad_norm": 0.5865082740783691, + "learning_rate": 1.3020430701270017e-05, + "loss": 4.6138, + "step": 1180 + }, + { + "epoch": 0.1988400994200497, + "grad_norm": 0.6683821678161621, + "learning_rate": 1.3241303147432359e-05, + "loss": 4.6021, + "step": 1200 + }, + { + "epoch": 0.20215410107705054, + "grad_norm": 0.631481945514679, + "learning_rate": 1.3462175593594699e-05, + "loss": 4.6043, + "step": 1220 + }, + { + "epoch": 0.20546810273405136, + "grad_norm": 0.622015118598938, + "learning_rate": 1.368304803975704e-05, + "loss": 4.5875, + "step": 1240 + }, + { + "epoch": 0.2087821043910522, + "grad_norm": 0.6793993711471558, + "learning_rate": 1.3903920485919382e-05, + "loss": 4.5888, + "step": 1260 + }, + { + "epoch": 0.212096106048053, + "grad_norm": 0.7085295915603638, + "learning_rate": 1.4124792932081723e-05, + "loss": 4.5804, + "step": 1280 + }, + { + "epoch": 0.21541010770505387, + "grad_norm": 0.6480498909950256, + "learning_rate": 1.4345665378244065e-05, + "loss": 4.5743, + "step": 1300 + }, + { + "epoch": 0.2187241093620547, + "grad_norm": 
0.6320337057113647, + "learning_rate": 1.4566537824406407e-05, + "loss": 4.563, + "step": 1320 + }, + { + "epoch": 0.22203811101905552, + "grad_norm": 0.6786954402923584, + "learning_rate": 1.4787410270568746e-05, + "loss": 4.5345, + "step": 1340 + }, + { + "epoch": 0.22535211267605634, + "grad_norm": 0.6301207542419434, + "learning_rate": 1.5008282716731088e-05, + "loss": 4.5504, + "step": 1360 + }, + { + "epoch": 0.22866611433305717, + "grad_norm": 0.6827840805053711, + "learning_rate": 1.522915516289343e-05, + "loss": 4.5564, + "step": 1380 + }, + { + "epoch": 0.231980115990058, + "grad_norm": 0.6710033416748047, + "learning_rate": 1.545002760905577e-05, + "loss": 4.5316, + "step": 1400 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.7792025804519653, + "learning_rate": 1.567090005521811e-05, + "loss": 4.513, + "step": 1420 + }, + { + "epoch": 0.23860811930405965, + "grad_norm": 0.7562044858932495, + "learning_rate": 1.5891772501380454e-05, + "loss": 4.5154, + "step": 1440 + }, + { + "epoch": 0.24192212096106047, + "grad_norm": 0.778243362903595, + "learning_rate": 1.6112644947542794e-05, + "loss": 4.5014, + "step": 1460 + }, + { + "epoch": 0.2452361226180613, + "grad_norm": 0.7965789437294006, + "learning_rate": 1.6333517393705137e-05, + "loss": 4.5302, + "step": 1480 + }, + { + "epoch": 0.24855012427506215, + "grad_norm": 0.7689153552055359, + "learning_rate": 1.6554389839867477e-05, + "loss": 4.5054, + "step": 1500 + }, + { + "epoch": 0.24987572493786248, + "eval_loss": 4.517783164978027, + "eval_runtime": 182.5131, + "eval_samples_per_second": 117.564, + "eval_steps_per_second": 14.7, + "step": 1508 + }, + { + "epoch": 0.251864125932063, + "grad_norm": 0.7492868304252625, + "learning_rate": 1.6775262286029817e-05, + "loss": 4.5075, + "step": 1520 + }, + { + "epoch": 0.2551781275890638, + "grad_norm": 0.7577246427536011, + "learning_rate": 1.699613473219216e-05, + "loss": 4.5119, + "step": 1540 + }, + { + "epoch": 0.25849212924606463, + "grad_norm": 0.8268622756004333, + "learning_rate": 1.72170071783545e-05, + "loss": 4.5144, + "step": 1560 + }, + { + "epoch": 0.2618061309030654, + "grad_norm": 0.7629988789558411, + "learning_rate": 1.7437879624516844e-05, + "loss": 4.487, + "step": 1580 + }, + { + "epoch": 0.2651201325600663, + "grad_norm": 0.7350299954414368, + "learning_rate": 1.7658752070679183e-05, + "loss": 4.486, + "step": 1600 + }, + { + "epoch": 0.26843413421706713, + "grad_norm": 0.7801204323768616, + "learning_rate": 1.7879624516841527e-05, + "loss": 4.4934, + "step": 1620 + }, + { + "epoch": 0.27174813587406793, + "grad_norm": 0.7653331756591797, + "learning_rate": 1.8100496963003867e-05, + "loss": 4.4947, + "step": 1640 + }, + { + "epoch": 0.2750621375310688, + "grad_norm": 0.8405510187149048, + "learning_rate": 1.8321369409166206e-05, + "loss": 4.4733, + "step": 1660 + }, + { + "epoch": 0.2783761391880696, + "grad_norm": 0.8701684474945068, + "learning_rate": 1.854224185532855e-05, + "loss": 4.4698, + "step": 1680 + }, + { + "epoch": 0.28169014084507044, + "grad_norm": 0.8502447605133057, + "learning_rate": 1.876311430149089e-05, + "loss": 4.4662, + "step": 1700 + }, + { + "epoch": 0.28500414250207123, + "grad_norm": 0.9050076603889465, + "learning_rate": 1.8983986747653233e-05, + "loss": 4.4789, + "step": 1720 + }, + { + "epoch": 0.2883181441590721, + "grad_norm": 0.8459646105766296, + "learning_rate": 1.9204859193815573e-05, + "loss": 4.4739, + "step": 1740 + }, + { + "epoch": 0.2916321458160729, + "grad_norm": 0.81717449426651, + "learning_rate": 
1.9425731639977913e-05, + "loss": 4.4621, + "step": 1760 + }, + { + "epoch": 0.29494614747307374, + "grad_norm": 0.9974759221076965, + "learning_rate": 1.9646604086140256e-05, + "loss": 4.4599, + "step": 1780 + }, + { + "epoch": 0.2982601491300746, + "grad_norm": 0.895898699760437, + "learning_rate": 1.9867476532302596e-05, + "loss": 4.4681, + "step": 1800 + }, + { + "epoch": 0.3015741507870754, + "grad_norm": 0.8755784034729004, + "learning_rate": 1.999998810418467e-05, + "loss": 4.4528, + "step": 1820 + }, + { + "epoch": 0.30488815244407624, + "grad_norm": 0.8729479312896729, + "learning_rate": 1.9999854276587224e-05, + "loss": 4.4485, + "step": 1840 + }, + { + "epoch": 0.30820215410107704, + "grad_norm": 0.9529006481170654, + "learning_rate": 1.9999571753619784e-05, + "loss": 4.4633, + "step": 1860 + }, + { + "epoch": 0.3115161557580779, + "grad_norm": 0.8693030476570129, + "learning_rate": 1.999914053948339e-05, + "loss": 4.4502, + "step": 1880 + }, + { + "epoch": 0.3148301574150787, + "grad_norm": 0.7164018750190735, + "learning_rate": 1.9998560640590102e-05, + "loss": 4.4249, + "step": 1900 + }, + { + "epoch": 0.31814415907207955, + "grad_norm": 0.9510648846626282, + "learning_rate": 1.999783206556287e-05, + "loss": 4.4349, + "step": 1920 + }, + { + "epoch": 0.32145816072908034, + "grad_norm": 0.8796148300170898, + "learning_rate": 1.9996954825235418e-05, + "loss": 4.4453, + "step": 1940 + }, + { + "epoch": 0.3247721623860812, + "grad_norm": 0.9448794722557068, + "learning_rate": 1.99959289326521e-05, + "loss": 4.4147, + "step": 1960 + }, + { + "epoch": 0.328086164043082, + "grad_norm": 0.901636004447937, + "learning_rate": 1.999475440306769e-05, + "loss": 4.445, + "step": 1980 + }, + { + "epoch": 0.33140016570008285, + "grad_norm": 0.9294936060905457, + "learning_rate": 1.999343125394714e-05, + "loss": 4.4192, + "step": 2000 + }, + { + "epoch": 0.3347141673570837, + "grad_norm": 0.8538640141487122, + "learning_rate": 1.9991959504965365e-05, + "loss": 4.415, + "step": 2020 + }, + { + "epoch": 0.3380281690140845, + "grad_norm": 0.9281598329544067, + "learning_rate": 1.9990339178006905e-05, + "loss": 4.42, + "step": 2040 + }, + { + "epoch": 0.34134217067108535, + "grad_norm": 0.8479010462760925, + "learning_rate": 1.998857029716562e-05, + "loss": 4.4252, + "step": 2060 + }, + { + "epoch": 0.34465617232808615, + "grad_norm": 0.8258492350578308, + "learning_rate": 1.9986652888744338e-05, + "loss": 4.4225, + "step": 2080 + }, + { + "epoch": 0.347970173985087, + "grad_norm": 0.8847241997718811, + "learning_rate": 1.9984586981254445e-05, + "loss": 4.4044, + "step": 2100 + }, + { + "epoch": 0.3512841756420878, + "grad_norm": 0.9128835797309875, + "learning_rate": 1.998237260541548e-05, + "loss": 4.4257, + "step": 2120 + }, + { + "epoch": 0.35459817729908866, + "grad_norm": 0.8789246082305908, + "learning_rate": 1.9980009794154666e-05, + "loss": 4.4122, + "step": 2140 + }, + { + "epoch": 0.35791217895608946, + "grad_norm": 0.8495468497276306, + "learning_rate": 1.997749858260642e-05, + "loss": 4.4236, + "step": 2160 + }, + { + "epoch": 0.3612261806130903, + "grad_norm": 1.0621867179870605, + "learning_rate": 1.9974839008111845e-05, + "loss": 4.4052, + "step": 2180 + }, + { + "epoch": 0.36454018227009116, + "grad_norm": 0.9372245669364929, + "learning_rate": 1.9972031110218152e-05, + "loss": 4.4094, + "step": 2200 + }, + { + "epoch": 0.36785418392709196, + "grad_norm": 0.8533006906509399, + "learning_rate": 1.9969074930678095e-05, + "loss": 4.4054, + "step": 2220 + }, + { + "epoch": 
0.3711681855840928, + "grad_norm": 0.8973360657691956, + "learning_rate": 1.9965970513449343e-05, + "loss": 4.4105, + "step": 2240 + }, + { + "epoch": 0.3744821872410936, + "grad_norm": 0.9037894010543823, + "learning_rate": 1.99627179046938e-05, + "loss": 4.4053, + "step": 2260 + }, + { + "epoch": 0.37779618889809446, + "grad_norm": 0.9603806138038635, + "learning_rate": 1.9959317152776968e-05, + "loss": 4.3804, + "step": 2280 + }, + { + "epoch": 0.38111019055509526, + "grad_norm": 0.9186010360717773, + "learning_rate": 1.995576830826719e-05, + "loss": 4.3957, + "step": 2300 + }, + { + "epoch": 0.3844241922120961, + "grad_norm": 0.8926252126693726, + "learning_rate": 1.9952071423934907e-05, + "loss": 4.3982, + "step": 2320 + }, + { + "epoch": 0.3877381938690969, + "grad_norm": 0.9808516502380371, + "learning_rate": 1.9948226554751877e-05, + "loss": 4.3909, + "step": 2340 + }, + { + "epoch": 0.39105219552609777, + "grad_norm": 0.9596909284591675, + "learning_rate": 1.9944233757890358e-05, + "loss": 4.3946, + "step": 2360 + }, + { + "epoch": 0.39436619718309857, + "grad_norm": 0.9925335645675659, + "learning_rate": 1.994009309272226e-05, + "loss": 4.3979, + "step": 2380 + }, + { + "epoch": 0.3976801988400994, + "grad_norm": 1.0596026182174683, + "learning_rate": 1.993580462081825e-05, + "loss": 4.4022, + "step": 2400 + }, + { + "epoch": 0.4009942004971003, + "grad_norm": 0.92696213722229, + "learning_rate": 1.9931368405946855e-05, + "loss": 4.376, + "step": 2420 + }, + { + "epoch": 0.40430820215410107, + "grad_norm": 0.8824082016944885, + "learning_rate": 1.9926784514073488e-05, + "loss": 4.3839, + "step": 2440 + }, + { + "epoch": 0.4076222038111019, + "grad_norm": 0.923729419708252, + "learning_rate": 1.9922053013359493e-05, + "loss": 4.3766, + "step": 2460 + }, + { + "epoch": 0.4109362054681027, + "grad_norm": 0.8875575661659241, + "learning_rate": 1.991717397416113e-05, + "loss": 4.3816, + "step": 2480 + }, + { + "epoch": 0.4142502071251036, + "grad_norm": 0.8997732996940613, + "learning_rate": 1.9912147469028503e-05, + "loss": 4.362, + "step": 2500 + }, + { + "epoch": 0.4175642087821044, + "grad_norm": 1.0456558465957642, + "learning_rate": 1.990697357270451e-05, + "loss": 4.3761, + "step": 2520 + }, + { + "epoch": 0.4208782104391052, + "grad_norm": 0.9833074808120728, + "learning_rate": 1.9901652362123707e-05, + "loss": 4.3751, + "step": 2540 + }, + { + "epoch": 0.424192212096106, + "grad_norm": 0.9908930659294128, + "learning_rate": 1.9896183916411188e-05, + "loss": 4.3667, + "step": 2560 + }, + { + "epoch": 0.4275062137531069, + "grad_norm": 0.9966866970062256, + "learning_rate": 1.9890568316881397e-05, + "loss": 4.3612, + "step": 2580 + }, + { + "epoch": 0.43082021541010773, + "grad_norm": 1.0175843238830566, + "learning_rate": 1.9884805647036915e-05, + "loss": 4.3734, + "step": 2600 + }, + { + "epoch": 0.43413421706710853, + "grad_norm": 0.9913970232009888, + "learning_rate": 1.987889599256722e-05, + "loss": 4.3816, + "step": 2620 + }, + { + "epoch": 0.4374482187241094, + "grad_norm": 0.9239439368247986, + "learning_rate": 1.987283944134742e-05, + "loss": 4.3615, + "step": 2640 + }, + { + "epoch": 0.4407622203811102, + "grad_norm": 1.023244023323059, + "learning_rate": 1.986663608343694e-05, + "loss": 4.363, + "step": 2660 + }, + { + "epoch": 0.44407622203811103, + "grad_norm": 0.9417888522148132, + "learning_rate": 1.9860286011078183e-05, + "loss": 4.3894, + "step": 2680 + }, + { + "epoch": 0.44739022369511183, + "grad_norm": 0.9343833327293396, + "learning_rate": 
1.9853789318695163e-05, + "loss": 4.3501, + "step": 2700 + }, + { + "epoch": 0.4507042253521127, + "grad_norm": 0.9146170020103455, + "learning_rate": 1.9847146102892095e-05, + "loss": 4.3693, + "step": 2720 + }, + { + "epoch": 0.4540182270091135, + "grad_norm": 0.9748229384422302, + "learning_rate": 1.9840356462451963e-05, + "loss": 4.3481, + "step": 2740 + }, + { + "epoch": 0.45733222866611434, + "grad_norm": 0.8722891807556152, + "learning_rate": 1.9833420498335043e-05, + "loss": 4.3684, + "step": 2760 + }, + { + "epoch": 0.46064623032311514, + "grad_norm": 1.0745731592178345, + "learning_rate": 1.982633831367742e-05, + "loss": 4.3512, + "step": 2780 + }, + { + "epoch": 0.463960231980116, + "grad_norm": 0.8993934988975525, + "learning_rate": 1.9819110013789427e-05, + "loss": 4.363, + "step": 2800 + }, + { + "epoch": 0.46727423363711684, + "grad_norm": 0.9059973359107971, + "learning_rate": 1.9811735706154107e-05, + "loss": 4.3481, + "step": 2820 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.9444851279258728, + "learning_rate": 1.9804215500425602e-05, + "loss": 4.3643, + "step": 2840 + }, + { + "epoch": 0.4739022369511185, + "grad_norm": 0.9410237073898315, + "learning_rate": 1.9796549508427515e-05, + "loss": 4.3573, + "step": 2860 + }, + { + "epoch": 0.4772162386081193, + "grad_norm": 1.0426051616668701, + "learning_rate": 1.9788737844151257e-05, + "loss": 4.3644, + "step": 2880 + }, + { + "epoch": 0.48053024026512015, + "grad_norm": 1.070942997932434, + "learning_rate": 1.9780780623754355e-05, + "loss": 4.3498, + "step": 2900 + }, + { + "epoch": 0.48384424192212094, + "grad_norm": 1.0217543840408325, + "learning_rate": 1.9772677965558717e-05, + "loss": 4.3601, + "step": 2920 + }, + { + "epoch": 0.4871582435791218, + "grad_norm": 0.9259801506996155, + "learning_rate": 1.976442999004887e-05, + "loss": 4.356, + "step": 2940 + }, + { + "epoch": 0.4904722452361226, + "grad_norm": 0.9750253558158875, + "learning_rate": 1.9756036819870183e-05, + "loss": 4.3424, + "step": 2960 + }, + { + "epoch": 0.49378624689312345, + "grad_norm": 0.8607650399208069, + "learning_rate": 1.9747498579827027e-05, + "loss": 4.3516, + "step": 2980 + }, + { + "epoch": 0.4971002485501243, + "grad_norm": 1.0398263931274414, + "learning_rate": 1.973881539688093e-05, + "loss": 4.3279, + "step": 3000 + }, + { + "epoch": 0.49975144987572495, + "eval_loss": 4.357917785644531, + "eval_runtime": 183.0818, + "eval_samples_per_second": 117.199, + "eval_steps_per_second": 14.655, + "step": 3016 + }, + { + "epoch": 0.5004142502071252, + "grad_norm": 0.9603216648101807, + "learning_rate": 1.9729987400148678e-05, + "loss": 4.3376, + "step": 3020 + }, + { + "epoch": 0.503728251864126, + "grad_norm": 1.1100867986679077, + "learning_rate": 1.9721014720900415e-05, + "loss": 4.3478, + "step": 3040 + }, + { + "epoch": 0.5070422535211268, + "grad_norm": 0.9792053699493408, + "learning_rate": 1.9711897492557667e-05, + "loss": 4.3337, + "step": 3060 + }, + { + "epoch": 0.5103562551781275, + "grad_norm": 0.9616000056266785, + "learning_rate": 1.970263585069138e-05, + "loss": 4.355, + "step": 3080 + }, + { + "epoch": 0.5136702568351285, + "grad_norm": 1.013371467590332, + "learning_rate": 1.969322993301988e-05, + "loss": 4.3538, + "step": 3100 + }, + { + "epoch": 0.5169842584921293, + "grad_norm": 0.9979972243309021, + "learning_rate": 1.9683679879406847e-05, + "loss": 4.3362, + "step": 3120 + }, + { + "epoch": 0.52029826014913, + "grad_norm": 0.9573869705200195, + "learning_rate": 1.9673985831859227e-05, + "loss": 4.3393, + 
"step": 3140 + }, + { + "epoch": 0.5236122618061309, + "grad_norm": 0.9969048500061035, + "learning_rate": 1.966414793452512e-05, + "loss": 4.3344, + "step": 3160 + }, + { + "epoch": 0.5269262634631318, + "grad_norm": 0.9501482844352722, + "learning_rate": 1.9654166333691638e-05, + "loss": 4.3455, + "step": 3180 + }, + { + "epoch": 0.5302402651201326, + "grad_norm": 1.1361968517303467, + "learning_rate": 1.964404117778272e-05, + "loss": 4.3376, + "step": 3200 + }, + { + "epoch": 0.5335542667771334, + "grad_norm": 1.069237232208252, + "learning_rate": 1.963377261735695e-05, + "loss": 4.3341, + "step": 3220 + }, + { + "epoch": 0.5368682684341343, + "grad_norm": 1.0862236022949219, + "learning_rate": 1.9623360805105287e-05, + "loss": 4.3593, + "step": 3240 + }, + { + "epoch": 0.5401822700911351, + "grad_norm": 0.9376192688941956, + "learning_rate": 1.9612805895848815e-05, + "loss": 4.3356, + "step": 3260 + }, + { + "epoch": 0.5434962717481359, + "grad_norm": 0.9688388705253601, + "learning_rate": 1.9602108046536432e-05, + "loss": 4.3291, + "step": 3280 + }, + { + "epoch": 0.5468102734051367, + "grad_norm": 0.9039995670318604, + "learning_rate": 1.9591267416242527e-05, + "loss": 4.3391, + "step": 3300 + }, + { + "epoch": 0.5501242750621376, + "grad_norm": 0.9087806344032288, + "learning_rate": 1.9580284166164597e-05, + "loss": 4.3338, + "step": 3320 + }, + { + "epoch": 0.5534382767191384, + "grad_norm": 0.9528907537460327, + "learning_rate": 1.9569158459620874e-05, + "loss": 4.3439, + "step": 3340 + }, + { + "epoch": 0.5567522783761392, + "grad_norm": 0.9101842641830444, + "learning_rate": 1.9557890462047867e-05, + "loss": 4.3311, + "step": 3360 + }, + { + "epoch": 0.56006628003314, + "grad_norm": 0.9301391243934631, + "learning_rate": 1.9546480340997923e-05, + "loss": 4.339, + "step": 3380 + }, + { + "epoch": 0.5633802816901409, + "grad_norm": 0.9672527313232422, + "learning_rate": 1.9534928266136737e-05, + "loss": 4.3244, + "step": 3400 + }, + { + "epoch": 0.5666942833471417, + "grad_norm": 1.080894112586975, + "learning_rate": 1.9523234409240813e-05, + "loss": 4.3371, + "step": 3420 + }, + { + "epoch": 0.5700082850041425, + "grad_norm": 0.9596937894821167, + "learning_rate": 1.9511398944194926e-05, + "loss": 4.3321, + "step": 3440 + }, + { + "epoch": 0.5733222866611434, + "grad_norm": 1.0157365798950195, + "learning_rate": 1.9499422046989524e-05, + "loss": 4.3469, + "step": 3460 + }, + { + "epoch": 0.5766362883181442, + "grad_norm": 0.967965841293335, + "learning_rate": 1.9487303895718115e-05, + "loss": 4.3191, + "step": 3480 + }, + { + "epoch": 0.579950289975145, + "grad_norm": 0.9909558296203613, + "learning_rate": 1.947504467057463e-05, + "loss": 4.3309, + "step": 3500 + }, + { + "epoch": 0.5832642916321458, + "grad_norm": 0.9317603707313538, + "learning_rate": 1.9462644553850723e-05, + "loss": 4.3195, + "step": 3520 + }, + { + "epoch": 0.5865782932891467, + "grad_norm": 1.025415301322937, + "learning_rate": 1.9450103729933075e-05, + "loss": 4.3271, + "step": 3540 + }, + { + "epoch": 0.5898922949461475, + "grad_norm": 0.8723011016845703, + "learning_rate": 1.9437422385300652e-05, + "loss": 4.328, + "step": 3560 + }, + { + "epoch": 0.5932062966031483, + "grad_norm": 0.9427792429924011, + "learning_rate": 1.9424600708521922e-05, + "loss": 4.3242, + "step": 3580 + }, + { + "epoch": 0.5965202982601492, + "grad_norm": 1.0956817865371704, + "learning_rate": 1.9411638890252064e-05, + "loss": 4.3018, + "step": 3600 + }, + { + "epoch": 0.59983429991715, + "grad_norm": 1.0019570589065552, + 
"learning_rate": 1.9398537123230123e-05, + "loss": 4.3306, + "step": 3620 + }, + { + "epoch": 0.6031483015741508, + "grad_norm": 0.9429236054420471, + "learning_rate": 1.938529560227614e-05, + "loss": 4.3175, + "step": 3640 + }, + { + "epoch": 0.6064623032311516, + "grad_norm": 0.9683657884597778, + "learning_rate": 1.9371914524288276e-05, + "loss": 4.3166, + "step": 3660 + }, + { + "epoch": 0.6097763048881525, + "grad_norm": 0.9779403805732727, + "learning_rate": 1.9358394088239865e-05, + "loss": 4.3154, + "step": 3680 + }, + { + "epoch": 0.6130903065451533, + "grad_norm": 0.9354090690612793, + "learning_rate": 1.934473449517646e-05, + "loss": 4.3094, + "step": 3700 + }, + { + "epoch": 0.6164043082021541, + "grad_norm": 0.9338115453720093, + "learning_rate": 1.933093594821285e-05, + "loss": 4.31, + "step": 3720 + }, + { + "epoch": 0.6197183098591549, + "grad_norm": 0.9287164211273193, + "learning_rate": 1.9316998652530018e-05, + "loss": 4.3184, + "step": 3740 + }, + { + "epoch": 0.6230323115161558, + "grad_norm": 1.1104332208633423, + "learning_rate": 1.9302922815372124e-05, + "loss": 4.3088, + "step": 3760 + }, + { + "epoch": 0.6263463131731566, + "grad_norm": 0.9717329740524292, + "learning_rate": 1.9288708646043405e-05, + "loss": 4.3218, + "step": 3780 + }, + { + "epoch": 0.6296603148301574, + "grad_norm": 0.9376388788223267, + "learning_rate": 1.9274356355905053e-05, + "loss": 4.317, + "step": 3800 + }, + { + "epoch": 0.6329743164871583, + "grad_norm": 0.9961258769035339, + "learning_rate": 1.9259866158372083e-05, + "loss": 4.3082, + "step": 3820 + }, + { + "epoch": 0.6362883181441591, + "grad_norm": 0.9656636118888855, + "learning_rate": 1.924523826891017e-05, + "loss": 4.3019, + "step": 3840 + }, + { + "epoch": 0.6396023198011599, + "grad_norm": 0.9857081174850464, + "learning_rate": 1.9230472905032425e-05, + "loss": 4.3042, + "step": 3860 + }, + { + "epoch": 0.6429163214581607, + "grad_norm": 0.8999795913696289, + "learning_rate": 1.921557028629617e-05, + "loss": 4.3074, + "step": 3880 + }, + { + "epoch": 0.6462303231151616, + "grad_norm": 0.9694218039512634, + "learning_rate": 1.9200530634299673e-05, + "loss": 4.296, + "step": 3900 + }, + { + "epoch": 0.6495443247721624, + "grad_norm": 1.013126015663147, + "learning_rate": 1.9185354172678866e-05, + "loss": 4.3199, + "step": 3920 + }, + { + "epoch": 0.6528583264291632, + "grad_norm": 0.9119258522987366, + "learning_rate": 1.917004112710398e-05, + "loss": 4.3009, + "step": 3940 + }, + { + "epoch": 0.656172328086164, + "grad_norm": 0.9842638969421387, + "learning_rate": 1.915459172527624e-05, + "loss": 4.3134, + "step": 3960 + }, + { + "epoch": 0.6594863297431649, + "grad_norm": 1.049775242805481, + "learning_rate": 1.9139006196924444e-05, + "loss": 4.2959, + "step": 3980 + }, + { + "epoch": 0.6628003314001657, + "grad_norm": 1.0449132919311523, + "learning_rate": 1.9123284773801562e-05, + "loss": 4.2983, + "step": 4000 + }, + { + "epoch": 0.6661143330571665, + "grad_norm": 0.9151386618614197, + "learning_rate": 1.910742768968128e-05, + "loss": 4.3012, + "step": 4020 + }, + { + "epoch": 0.6694283347141674, + "grad_norm": 1.0705188512802124, + "learning_rate": 1.909143518035453e-05, + "loss": 4.3045, + "step": 4040 + }, + { + "epoch": 0.6727423363711682, + "grad_norm": 0.9801315665245056, + "learning_rate": 1.9075307483625997e-05, + "loss": 4.3054, + "step": 4060 + }, + { + "epoch": 0.676056338028169, + "grad_norm": 1.0176928043365479, + "learning_rate": 1.905904483931055e-05, + "loss": 4.2863, + "step": 4080 + }, + { + "epoch": 
0.6793703396851698, + "grad_norm": 0.9228439927101135, + "learning_rate": 1.9042647489229705e-05, + "loss": 4.3091, + "step": 4100 + }, + { + "epoch": 0.6826843413421707, + "grad_norm": 0.9789882302284241, + "learning_rate": 1.9026115677208024e-05, + "loss": 4.3015, + "step": 4120 + }, + { + "epoch": 0.6859983429991715, + "grad_norm": 0.9386183023452759, + "learning_rate": 1.900944964906947e-05, + "loss": 4.2977, + "step": 4140 + }, + { + "epoch": 0.6893123446561723, + "grad_norm": 0.980752170085907, + "learning_rate": 1.899264965263379e-05, + "loss": 4.2968, + "step": 4160 + }, + { + "epoch": 0.6926263463131731, + "grad_norm": 0.982489824295044, + "learning_rate": 1.897571593771278e-05, + "loss": 4.3367, + "step": 4180 + }, + { + "epoch": 0.695940347970174, + "grad_norm": 0.9551573395729065, + "learning_rate": 1.895864875610662e-05, + "loss": 4.3031, + "step": 4200 + }, + { + "epoch": 0.6992543496271748, + "grad_norm": 0.8884013295173645, + "learning_rate": 1.894144836160009e-05, + "loss": 4.2998, + "step": 4220 + }, + { + "epoch": 0.7025683512841756, + "grad_norm": 0.9692120552062988, + "learning_rate": 1.892411500995882e-05, + "loss": 4.2899, + "step": 4240 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.919230580329895, + "learning_rate": 1.890664895892548e-05, + "loss": 4.2929, + "step": 4260 + }, + { + "epoch": 0.7091963545981773, + "grad_norm": 1.0192986726760864, + "learning_rate": 1.888905046821595e-05, + "loss": 4.2914, + "step": 4280 + }, + { + "epoch": 0.7125103562551781, + "grad_norm": 1.0561175346374512, + "learning_rate": 1.887131979951545e-05, + "loss": 4.294, + "step": 4300 + }, + { + "epoch": 0.7158243579121789, + "grad_norm": 0.941726803779602, + "learning_rate": 1.885345721647466e-05, + "loss": 4.2879, + "step": 4320 + }, + { + "epoch": 0.7191383595691798, + "grad_norm": 0.9086028933525085, + "learning_rate": 1.883546298470578e-05, + "loss": 4.2945, + "step": 4340 + }, + { + "epoch": 0.7224523612261806, + "grad_norm": 0.893491804599762, + "learning_rate": 1.8817337371778615e-05, + "loss": 4.3018, + "step": 4360 + }, + { + "epoch": 0.7257663628831814, + "grad_norm": 1.0064398050308228, + "learning_rate": 1.8799080647216557e-05, + "loss": 4.3059, + "step": 4380 + }, + { + "epoch": 0.7290803645401823, + "grad_norm": 1.003888487815857, + "learning_rate": 1.878069308249261e-05, + "loss": 4.285, + "step": 4400 + }, + { + "epoch": 0.7323943661971831, + "grad_norm": 1.0065879821777344, + "learning_rate": 1.8762174951025324e-05, + "loss": 4.2913, + "step": 4420 + }, + { + "epoch": 0.7357083678541839, + "grad_norm": 1.0527596473693848, + "learning_rate": 1.8743526528174762e-05, + "loss": 4.278, + "step": 4440 + }, + { + "epoch": 0.7390223695111847, + "grad_norm": 1.0759984254837036, + "learning_rate": 1.8724748091238373e-05, + "loss": 4.2954, + "step": 4460 + }, + { + "epoch": 0.7423363711681856, + "grad_norm": 0.9671703577041626, + "learning_rate": 1.8705839919446888e-05, + "loss": 4.2995, + "step": 4480 + }, + { + "epoch": 0.7456503728251864, + "grad_norm": 0.9959936738014221, + "learning_rate": 1.8686802293960166e-05, + "loss": 4.2792, + "step": 4500 + }, + { + "epoch": 0.7489643744821872, + "grad_norm": 1.0670673847198486, + "learning_rate": 1.8667635497863008e-05, + "loss": 4.2989, + "step": 4520 + }, + { + "epoch": 0.7496271748135874, + "eval_loss": 4.297667980194092, + "eval_runtime": 183.8703, + "eval_samples_per_second": 116.696, + "eval_steps_per_second": 14.592, + "step": 4524 + }, + { + "epoch": 0.752278376139188, + "grad_norm": 1.0218068361282349, + 
"learning_rate": 1.8648339816160953e-05, + "loss": 4.2841, + "step": 4540 + }, + { + "epoch": 0.7555923777961889, + "grad_norm": 0.9335347414016724, + "learning_rate": 1.8628915535776035e-05, + "loss": 4.2919, + "step": 4560 + }, + { + "epoch": 0.7589063794531897, + "grad_norm": 0.9821425080299377, + "learning_rate": 1.8609362945542518e-05, + "loss": 4.2914, + "step": 4580 + }, + { + "epoch": 0.7622203811101905, + "grad_norm": 0.9833670854568481, + "learning_rate": 1.85896823362026e-05, + "loss": 4.3044, + "step": 4600 + }, + { + "epoch": 0.7655343827671914, + "grad_norm": 0.9810864925384521, + "learning_rate": 1.85698740004021e-05, + "loss": 4.2797, + "step": 4620 + }, + { + "epoch": 0.7688483844241922, + "grad_norm": 1.0434892177581787, + "learning_rate": 1.8549938232686084e-05, + "loss": 4.2741, + "step": 4640 + }, + { + "epoch": 0.772162386081193, + "grad_norm": 0.9879078269004822, + "learning_rate": 1.8529875329494514e-05, + "loss": 4.2854, + "step": 4660 + }, + { + "epoch": 0.7754763877381938, + "grad_norm": 1.0669186115264893, + "learning_rate": 1.850968558915782e-05, + "loss": 4.296, + "step": 4680 + }, + { + "epoch": 0.7787903893951947, + "grad_norm": 1.0384941101074219, + "learning_rate": 1.848936931189246e-05, + "loss": 4.2854, + "step": 4700 + }, + { + "epoch": 0.7821043910521955, + "grad_norm": 1.0485628843307495, + "learning_rate": 1.8468926799796484e-05, + "loss": 4.2611, + "step": 4720 + }, + { + "epoch": 0.7854183927091963, + "grad_norm": 0.9525585174560547, + "learning_rate": 1.8448358356845002e-05, + "loss": 4.2868, + "step": 4740 + }, + { + "epoch": 0.7887323943661971, + "grad_norm": 0.8990052938461304, + "learning_rate": 1.8427664288885697e-05, + "loss": 4.2811, + "step": 4760 + }, + { + "epoch": 0.792046396023198, + "grad_norm": 1.1199525594711304, + "learning_rate": 1.8406844903634266e-05, + "loss": 4.2844, + "step": 4780 + }, + { + "epoch": 0.7953603976801988, + "grad_norm": 1.0135475397109985, + "learning_rate": 1.838590051066983e-05, + "loss": 4.2791, + "step": 4800 + }, + { + "epoch": 0.7986743993371996, + "grad_norm": 0.9762597680091858, + "learning_rate": 1.8364831421430366e-05, + "loss": 4.2762, + "step": 4820 + }, + { + "epoch": 0.8019884009942005, + "grad_norm": 0.9348472356796265, + "learning_rate": 1.8343637949208034e-05, + "loss": 4.2846, + "step": 4840 + }, + { + "epoch": 0.8053024026512013, + "grad_norm": 1.14839506149292, + "learning_rate": 1.832232040914455e-05, + "loss": 4.29, + "step": 4860 + }, + { + "epoch": 0.8086164043082021, + "grad_norm": 1.0135358572006226, + "learning_rate": 1.8300879118226476e-05, + "loss": 4.2769, + "step": 4880 + }, + { + "epoch": 0.8119304059652029, + "grad_norm": 0.9548913240432739, + "learning_rate": 1.8279314395280537e-05, + "loss": 4.263, + "step": 4900 + }, + { + "epoch": 0.8152444076222038, + "grad_norm": 1.088307499885559, + "learning_rate": 1.825762656096884e-05, + "loss": 4.293, + "step": 4920 + }, + { + "epoch": 0.8185584092792046, + "grad_norm": 1.019329309463501, + "learning_rate": 1.823581593778415e-05, + "loss": 4.2726, + "step": 4940 + }, + { + "epoch": 0.8218724109362054, + "grad_norm": 1.0125458240509033, + "learning_rate": 1.8213882850045057e-05, + "loss": 4.2755, + "step": 4960 + }, + { + "epoch": 0.8251864125932062, + "grad_norm": 1.0081562995910645, + "learning_rate": 1.8191827623891175e-05, + "loss": 4.2598, + "step": 4980 + }, + { + "epoch": 0.8285004142502072, + "grad_norm": 0.9964544773101807, + "learning_rate": 1.816965058727828e-05, + "loss": 4.2522, + "step": 5000 + }, + { + "epoch": 
0.831814415907208, + "grad_norm": 1.1516706943511963, + "learning_rate": 1.8147352069973446e-05, + "loss": 4.2673, + "step": 5020 + }, + { + "epoch": 0.8351284175642087, + "grad_norm": 0.9268865585327148, + "learning_rate": 1.8124932403550132e-05, + "loss": 4.2708, + "step": 5040 + }, + { + "epoch": 0.8384424192212097, + "grad_norm": 1.0599693059921265, + "learning_rate": 1.8102391921383263e-05, + "loss": 4.2805, + "step": 5060 + }, + { + "epoch": 0.8417564208782105, + "grad_norm": 1.2651729583740234, + "learning_rate": 1.8079730958644247e-05, + "loss": 4.2737, + "step": 5080 + }, + { + "epoch": 0.8450704225352113, + "grad_norm": 0.9597588181495667, + "learning_rate": 1.805694985229602e-05, + "loss": 4.2756, + "step": 5100 + }, + { + "epoch": 0.848384424192212, + "grad_norm": 1.1017850637435913, + "learning_rate": 1.8034048941088026e-05, + "loss": 4.2731, + "step": 5120 + }, + { + "epoch": 0.851698425849213, + "grad_norm": 0.9653656482696533, + "learning_rate": 1.8011028565551174e-05, + "loss": 4.2792, + "step": 5140 + }, + { + "epoch": 0.8550124275062138, + "grad_norm": 1.0836886167526245, + "learning_rate": 1.7987889067992776e-05, + "loss": 4.2765, + "step": 5160 + }, + { + "epoch": 0.8583264291632146, + "grad_norm": 1.1189730167388916, + "learning_rate": 1.7964630792491462e-05, + "loss": 4.2771, + "step": 5180 + }, + { + "epoch": 0.8616404308202155, + "grad_norm": 1.004889965057373, + "learning_rate": 1.7941254084892056e-05, + "loss": 4.2713, + "step": 5200 + }, + { + "epoch": 0.8649544324772163, + "grad_norm": 0.9272435307502747, + "learning_rate": 1.7917759292800446e-05, + "loss": 4.2617, + "step": 5220 + }, + { + "epoch": 0.8682684341342171, + "grad_norm": 0.9324814677238464, + "learning_rate": 1.7894146765578405e-05, + "loss": 4.2717, + "step": 5240 + }, + { + "epoch": 0.8715824357912179, + "grad_norm": 0.929558277130127, + "learning_rate": 1.787041685433839e-05, + "loss": 4.277, + "step": 5260 + }, + { + "epoch": 0.8748964374482188, + "grad_norm": 0.975135862827301, + "learning_rate": 1.784656991193834e-05, + "loss": 4.2807, + "step": 5280 + }, + { + "epoch": 0.8782104391052196, + "grad_norm": 0.8951101899147034, + "learning_rate": 1.7822606292976415e-05, + "loss": 4.2547, + "step": 5300 + }, + { + "epoch": 0.8815244407622204, + "grad_norm": 1.0090558528900146, + "learning_rate": 1.779852635378573e-05, + "loss": 4.2542, + "step": 5320 + }, + { + "epoch": 0.8848384424192212, + "grad_norm": 0.9711282849311829, + "learning_rate": 1.7774330452429044e-05, + "loss": 4.2825, + "step": 5340 + }, + { + "epoch": 0.8881524440762221, + "grad_norm": 0.956322431564331, + "learning_rate": 1.7750018948693452e-05, + "loss": 4.2844, + "step": 5360 + }, + { + "epoch": 0.8914664457332229, + "grad_norm": 1.0528624057769775, + "learning_rate": 1.772559220408503e-05, + "loss": 4.264, + "step": 5380 + }, + { + "epoch": 0.8947804473902237, + "grad_norm": 0.9689446091651917, + "learning_rate": 1.7701050581823444e-05, + "loss": 4.2799, + "step": 5400 + }, + { + "epoch": 0.8980944490472246, + "grad_norm": 0.9626989960670471, + "learning_rate": 1.767639444683658e-05, + "loss": 4.2534, + "step": 5420 + }, + { + "epoch": 0.9014084507042254, + "grad_norm": 0.9567079544067383, + "learning_rate": 1.7651624165755096e-05, + "loss": 4.2571, + "step": 5440 + }, + { + "epoch": 0.9047224523612262, + "grad_norm": 1.0106322765350342, + "learning_rate": 1.762674010690696e-05, + "loss": 4.2729, + "step": 5460 + }, + { + "epoch": 0.908036454018227, + "grad_norm": 0.9494588971138, + "learning_rate": 1.7601742640312e-05, + 
"loss": 4.2544, + "step": 5480 + }, + { + "epoch": 0.9113504556752279, + "grad_norm": 0.8869370222091675, + "learning_rate": 1.7576632137676394e-05, + "loss": 4.2644, + "step": 5500 + }, + { + "epoch": 0.9146644573322287, + "grad_norm": 1.0021344423294067, + "learning_rate": 1.7551408972387126e-05, + "loss": 4.2554, + "step": 5520 + }, + { + "epoch": 0.9179784589892295, + "grad_norm": 0.9648563861846924, + "learning_rate": 1.7526073519506453e-05, + "loss": 4.2553, + "step": 5540 + }, + { + "epoch": 0.9212924606462303, + "grad_norm": 1.0345970392227173, + "learning_rate": 1.7500626155766317e-05, + "loss": 4.2687, + "step": 5560 + }, + { + "epoch": 0.9246064623032312, + "grad_norm": 1.0000388622283936, + "learning_rate": 1.7475067259562748e-05, + "loss": 4.2545, + "step": 5580 + }, + { + "epoch": 0.927920463960232, + "grad_norm": 1.0084024667739868, + "learning_rate": 1.744939721095024e-05, + "loss": 4.2379, + "step": 5600 + }, + { + "epoch": 0.9312344656172328, + "grad_norm": 1.0267122983932495, + "learning_rate": 1.7423616391636092e-05, + "loss": 4.2488, + "step": 5620 + }, + { + "epoch": 0.9345484672742337, + "grad_norm": 0.9455956220626831, + "learning_rate": 1.7397725184974736e-05, + "loss": 4.2498, + "step": 5640 + }, + { + "epoch": 0.9378624689312345, + "grad_norm": 1.036926507949829, + "learning_rate": 1.737172397596204e-05, + "loss": 4.2601, + "step": 5660 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.905185341835022, + "learning_rate": 1.7345613151229575e-05, + "loss": 4.2274, + "step": 5680 + }, + { + "epoch": 0.9444904722452361, + "grad_norm": 0.9288860559463501, + "learning_rate": 1.7319393099038873e-05, + "loss": 4.265, + "step": 5700 + }, + { + "epoch": 0.947804473902237, + "grad_norm": 0.9936960339546204, + "learning_rate": 1.729306420927565e-05, + "loss": 4.2579, + "step": 5720 + }, + { + "epoch": 0.9511184755592378, + "grad_norm": 0.944254994392395, + "learning_rate": 1.726662687344402e-05, + "loss": 4.2587, + "step": 5740 + }, + { + "epoch": 0.9544324772162386, + "grad_norm": 0.9940594434738159, + "learning_rate": 1.724008148466065e-05, + "loss": 4.2483, + "step": 5760 + }, + { + "epoch": 0.9577464788732394, + "grad_norm": 0.9212367534637451, + "learning_rate": 1.721342843764893e-05, + "loss": 4.2546, + "step": 5780 + }, + { + "epoch": 0.9610604805302403, + "grad_norm": 0.9834422469139099, + "learning_rate": 1.7186668128733106e-05, + "loss": 4.2637, + "step": 5800 + }, + { + "epoch": 0.9643744821872411, + "grad_norm": 0.9573394656181335, + "learning_rate": 1.715980095583238e-05, + "loss": 4.2684, + "step": 5820 + }, + { + "epoch": 0.9676884838442419, + "grad_norm": 0.9837244749069214, + "learning_rate": 1.7132827318455e-05, + "loss": 4.2523, + "step": 5840 + }, + { + "epoch": 0.9710024855012428, + "grad_norm": 1.0186023712158203, + "learning_rate": 1.71057476176923e-05, + "loss": 4.262, + "step": 5860 + }, + { + "epoch": 0.9743164871582436, + "grad_norm": 0.9724565148353577, + "learning_rate": 1.707856225621277e-05, + "loss": 4.2617, + "step": 5880 + }, + { + "epoch": 0.9776304888152444, + "grad_norm": 1.0614138841629028, + "learning_rate": 1.705127163825603e-05, + "loss": 4.254, + "step": 5900 + }, + { + "epoch": 0.9809444904722452, + "grad_norm": 1.0305408239364624, + "learning_rate": 1.7023876169626858e-05, + "loss": 4.2446, + "step": 5920 + }, + { + "epoch": 0.9842584921292461, + "grad_norm": 1.1059362888336182, + "learning_rate": 1.6996376257689117e-05, + "loss": 4.2659, + "step": 5940 + }, + { + "epoch": 0.9875724937862469, + "grad_norm": 
1.0872952938079834, + "learning_rate": 1.6968772311359722e-05, + "loss": 4.2439, + "step": 5960 + }, + { + "epoch": 0.9908864954432477, + "grad_norm": 0.8474650382995605, + "learning_rate": 1.694106474110256e-05, + "loss": 4.258, + "step": 5980 + }, + { + "epoch": 0.9942004971002486, + "grad_norm": 0.9521352052688599, + "learning_rate": 1.691325395892238e-05, + "loss": 4.268, + "step": 6000 + }, + { + "epoch": 0.9975144987572494, + "grad_norm": 0.9597455859184265, + "learning_rate": 1.6885340378358652e-05, + "loss": 4.2604, + "step": 6020 + }, + { + "epoch": 0.9995028997514499, + "eval_loss": 4.258742809295654, + "eval_runtime": 183.2566, + "eval_samples_per_second": 117.087, + "eval_steps_per_second": 14.641, + "step": 6032 + }, + { + "epoch": 1.0008285004142503, + "grad_norm": 1.0202631950378418, + "learning_rate": 1.6857324414479453e-05, + "loss": 4.2604, + "step": 6040 + }, + { + "epoch": 1.004142502071251, + "grad_norm": 1.0613888502120972, + "learning_rate": 1.682920648387526e-05, + "loss": 4.2454, + "step": 6060 + }, + { + "epoch": 1.007456503728252, + "grad_norm": 1.03452730178833, + "learning_rate": 1.6800987004652777e-05, + "loss": 4.2425, + "step": 6080 + }, + { + "epoch": 1.0107705053852527, + "grad_norm": 0.9524587392807007, + "learning_rate": 1.6772666396428712e-05, + "loss": 4.2349, + "step": 6100 + }, + { + "epoch": 1.0140845070422535, + "grad_norm": 0.9935766458511353, + "learning_rate": 1.6744245080323526e-05, + "loss": 4.2485, + "step": 6120 + }, + { + "epoch": 1.0173985086992543, + "grad_norm": 1.034247636795044, + "learning_rate": 1.6715723478955196e-05, + "loss": 4.2627, + "step": 6140 + }, + { + "epoch": 1.020712510356255, + "grad_norm": 0.9751043915748596, + "learning_rate": 1.6687102016432907e-05, + "loss": 4.2419, + "step": 6160 + }, + { + "epoch": 1.024026512013256, + "grad_norm": 0.9609410166740417, + "learning_rate": 1.6658381118350758e-05, + "loss": 4.2421, + "step": 6180 + }, + { + "epoch": 1.027340513670257, + "grad_norm": 0.9059689044952393, + "learning_rate": 1.6629561211781426e-05, + "loss": 4.2403, + "step": 6200 + }, + { + "epoch": 1.0306545153272577, + "grad_norm": 0.8922605514526367, + "learning_rate": 1.6600642725269823e-05, + "loss": 4.237, + "step": 6220 + }, + { + "epoch": 1.0339685169842585, + "grad_norm": 0.9706618785858154, + "learning_rate": 1.6571626088826726e-05, + "loss": 4.2432, + "step": 6240 + }, + { + "epoch": 1.0372825186412593, + "grad_norm": 0.9916077852249146, + "learning_rate": 1.6542511733922363e-05, + "loss": 4.2409, + "step": 6260 + }, + { + "epoch": 1.04059652029826, + "grad_norm": 0.9546970129013062, + "learning_rate": 1.651330009348003e-05, + "loss": 4.2345, + "step": 6280 + }, + { + "epoch": 1.043910521955261, + "grad_norm": 0.9932180643081665, + "learning_rate": 1.6483991601869617e-05, + "loss": 4.2325, + "step": 6300 + }, + { + "epoch": 1.0472245236122617, + "grad_norm": 0.9520214796066284, + "learning_rate": 1.6454586694901176e-05, + "loss": 4.2501, + "step": 6320 + }, + { + "epoch": 1.0505385252692627, + "grad_norm": 0.9115946888923645, + "learning_rate": 1.6425085809818438e-05, + "loss": 4.2363, + "step": 6340 + }, + { + "epoch": 1.0538525269262635, + "grad_norm": 0.999303936958313, + "learning_rate": 1.639548938529229e-05, + "loss": 4.2494, + "step": 6360 + }, + { + "epoch": 1.0571665285832643, + "grad_norm": 1.1122279167175293, + "learning_rate": 1.636579786141428e-05, + "loss": 4.2508, + "step": 6380 + }, + { + "epoch": 1.0604805302402651, + "grad_norm": 0.9042637348175049, + "learning_rate": 1.6336011679690053e-05, 
+ "loss": 4.2501, + "step": 6400 + }, + { + "epoch": 1.063794531897266, + "grad_norm": 1.0097508430480957, + "learning_rate": 1.630613128303279e-05, + "loss": 4.2268, + "step": 6420 + }, + { + "epoch": 1.0671085335542667, + "grad_norm": 0.9422595500946045, + "learning_rate": 1.6276157115756635e-05, + "loss": 4.2454, + "step": 6440 + }, + { + "epoch": 1.0704225352112675, + "grad_norm": 0.9440616369247437, + "learning_rate": 1.624608962357007e-05, + "loss": 4.2297, + "step": 6460 + }, + { + "epoch": 1.0737365368682685, + "grad_norm": 0.9379023909568787, + "learning_rate": 1.6215929253569295e-05, + "loss": 4.241, + "step": 6480 + }, + { + "epoch": 1.0770505385252693, + "grad_norm": 1.0485597848892212, + "learning_rate": 1.618567645423159e-05, + "loss": 4.2409, + "step": 6500 + }, + { + "epoch": 1.0803645401822701, + "grad_norm": 1.0009465217590332, + "learning_rate": 1.6155331675408625e-05, + "loss": 4.2511, + "step": 6520 + }, + { + "epoch": 1.083678541839271, + "grad_norm": 1.0552219152450562, + "learning_rate": 1.6124895368319787e-05, + "loss": 4.2259, + "step": 6540 + }, + { + "epoch": 1.0869925434962717, + "grad_norm": 0.9327201843261719, + "learning_rate": 1.6094367985545466e-05, + "loss": 4.252, + "step": 6560 + }, + { + "epoch": 1.0903065451532725, + "grad_norm": 1.0329855680465698, + "learning_rate": 1.6063749981020326e-05, + "loss": 4.2249, + "step": 6580 + }, + { + "epoch": 1.0936205468102733, + "grad_norm": 1.0211057662963867, + "learning_rate": 1.6033041810026548e-05, + "loss": 4.2206, + "step": 6600 + }, + { + "epoch": 1.0969345484672743, + "grad_norm": 1.0131185054779053, + "learning_rate": 1.6002243929187078e-05, + "loss": 4.2331, + "step": 6620 + }, + { + "epoch": 1.1002485501242751, + "grad_norm": 1.0400645732879639, + "learning_rate": 1.5971356796458813e-05, + "loss": 4.2282, + "step": 6640 + }, + { + "epoch": 1.103562551781276, + "grad_norm": 1.0232553482055664, + "learning_rate": 1.5940380871125806e-05, + "loss": 4.2398, + "step": 6660 + }, + { + "epoch": 1.1068765534382767, + "grad_norm": 1.0014575719833374, + "learning_rate": 1.5909316613792445e-05, + "loss": 4.2449, + "step": 6680 + }, + { + "epoch": 1.1101905550952775, + "grad_norm": 0.9087008833885193, + "learning_rate": 1.587816448637658e-05, + "loss": 4.2335, + "step": 6700 + }, + { + "epoch": 1.1135045567522783, + "grad_norm": 1.007001280784607, + "learning_rate": 1.5846924952102673e-05, + "loss": 4.2418, + "step": 6720 + }, + { + "epoch": 1.1168185584092791, + "grad_norm": 0.8542742729187012, + "learning_rate": 1.58155984754949e-05, + "loss": 4.2174, + "step": 6740 + }, + { + "epoch": 1.12013256006628, + "grad_norm": 0.8994142413139343, + "learning_rate": 1.5784185522370266e-05, + "loss": 4.2337, + "step": 6760 + }, + { + "epoch": 1.123446561723281, + "grad_norm": 1.0020296573638916, + "learning_rate": 1.5752686559831628e-05, + "loss": 4.2242, + "step": 6780 + }, + { + "epoch": 1.1267605633802817, + "grad_norm": 0.9319907426834106, + "learning_rate": 1.5721102056260812e-05, + "loss": 4.2236, + "step": 6800 + }, + { + "epoch": 1.1300745650372825, + "grad_norm": 0.9891326427459717, + "learning_rate": 1.5689432481311605e-05, + "loss": 4.2334, + "step": 6820 + }, + { + "epoch": 1.1333885666942833, + "grad_norm": 0.9461179971694946, + "learning_rate": 1.5657678305902785e-05, + "loss": 4.2357, + "step": 6840 + }, + { + "epoch": 1.1367025683512841, + "grad_norm": 0.9898168444633484, + "learning_rate": 1.5625840002211117e-05, + "loss": 4.2212, + "step": 6860 + }, + { + "epoch": 1.140016570008285, + "grad_norm": 
0.9957179427146912, + "learning_rate": 1.5593918043664334e-05, + "loss": 4.1962, + "step": 6880 + }, + { + "epoch": 1.143330571665286, + "grad_norm": 0.9826033711433411, + "learning_rate": 1.55619129049341e-05, + "loss": 4.2264, + "step": 6900 + }, + { + "epoch": 1.1466445733222868, + "grad_norm": 0.9359017610549927, + "learning_rate": 1.5529825061928944e-05, + "loss": 4.2508, + "step": 6920 + }, + { + "epoch": 1.1499585749792876, + "grad_norm": 0.9601158499717712, + "learning_rate": 1.5497654991787188e-05, + "loss": 4.2173, + "step": 6940 + }, + { + "epoch": 1.1532725766362883, + "grad_norm": 0.9853656888008118, + "learning_rate": 1.546540317286985e-05, + "loss": 4.2477, + "step": 6960 + }, + { + "epoch": 1.1565865782932891, + "grad_norm": 0.9066295623779297, + "learning_rate": 1.5433070084753535e-05, + "loss": 4.198, + "step": 6980 + }, + { + "epoch": 1.15990057995029, + "grad_norm": 1.0091142654418945, + "learning_rate": 1.5400656208223293e-05, + "loss": 4.2253, + "step": 7000 + }, + { + "epoch": 1.1632145816072907, + "grad_norm": 0.9510090947151184, + "learning_rate": 1.5368162025265494e-05, + "loss": 4.2284, + "step": 7020 + }, + { + "epoch": 1.1665285832642915, + "grad_norm": 0.9539738893508911, + "learning_rate": 1.5335588019060626e-05, + "loss": 4.2219, + "step": 7040 + }, + { + "epoch": 1.1698425849212923, + "grad_norm": 0.9674385786056519, + "learning_rate": 1.5302934673976146e-05, + "loss": 4.2292, + "step": 7060 + }, + { + "epoch": 1.1731565865782934, + "grad_norm": 0.9703903198242188, + "learning_rate": 1.5270202475559242e-05, + "loss": 4.226, + "step": 7080 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 1.0406928062438965, + "learning_rate": 1.523739191052965e-05, + "loss": 4.2229, + "step": 7100 + }, + { + "epoch": 1.179784589892295, + "grad_norm": 0.999985933303833, + "learning_rate": 1.520450346677239e-05, + "loss": 4.2422, + "step": 7120 + }, + { + "epoch": 1.1830985915492958, + "grad_norm": 1.0058645009994507, + "learning_rate": 1.5171537633330516e-05, + "loss": 4.2156, + "step": 7140 + }, + { + "epoch": 1.1864125932062966, + "grad_norm": 0.9950315356254578, + "learning_rate": 1.5138494900397854e-05, + "loss": 4.2257, + "step": 7160 + }, + { + "epoch": 1.1897265948632973, + "grad_norm": 1.1084192991256714, + "learning_rate": 1.5105375759311702e-05, + "loss": 4.2165, + "step": 7180 + }, + { + "epoch": 1.1930405965202984, + "grad_norm": 0.9400287866592407, + "learning_rate": 1.5072180702545533e-05, + "loss": 4.2251, + "step": 7200 + }, + { + "epoch": 1.1963545981772992, + "grad_norm": 0.9689525365829468, + "learning_rate": 1.5038910223701666e-05, + "loss": 4.2216, + "step": 7220 + }, + { + "epoch": 1.1996685998343, + "grad_norm": 0.9514891505241394, + "learning_rate": 1.500556481750392e-05, + "loss": 4.2222, + "step": 7240 + }, + { + "epoch": 1.2029826014913008, + "grad_norm": 0.9617292284965515, + "learning_rate": 1.4972144979790277e-05, + "loss": 4.2153, + "step": 7260 + }, + { + "epoch": 1.2062966031483016, + "grad_norm": 0.9278431534767151, + "learning_rate": 1.493865120750549e-05, + "loss": 4.2281, + "step": 7280 + }, + { + "epoch": 1.2096106048053024, + "grad_norm": 0.9646384716033936, + "learning_rate": 1.4905083998693704e-05, + "loss": 4.2172, + "step": 7300 + }, + { + "epoch": 1.2129246064623032, + "grad_norm": 0.9889400601387024, + "learning_rate": 1.4871443852491045e-05, + "loss": 4.2332, + "step": 7320 + }, + { + "epoch": 1.216238608119304, + "grad_norm": 0.9791849255561829, + "learning_rate": 1.4837731269118204e-05, + "loss": 4.2112, + "step": 7340 + 
}, + { + "epoch": 1.219552609776305, + "grad_norm": 0.9904794096946716, + "learning_rate": 1.4803946749872984e-05, + "loss": 4.2139, + "step": 7360 + }, + { + "epoch": 1.2228666114333058, + "grad_norm": 0.9656121730804443, + "learning_rate": 1.4770090797122873e-05, + "loss": 4.2141, + "step": 7380 + }, + { + "epoch": 1.2261806130903066, + "grad_norm": 0.9957573413848877, + "learning_rate": 1.4736163914297545e-05, + "loss": 4.202, + "step": 7400 + }, + { + "epoch": 1.2294946147473074, + "grad_norm": 0.9430182576179504, + "learning_rate": 1.4702166605881393e-05, + "loss": 4.2262, + "step": 7420 + }, + { + "epoch": 1.2328086164043082, + "grad_norm": 0.9636789560317993, + "learning_rate": 1.466809937740602e-05, + "loss": 4.2185, + "step": 7440 + }, + { + "epoch": 1.236122618061309, + "grad_norm": 0.9161622524261475, + "learning_rate": 1.4633962735442721e-05, + "loss": 4.2168, + "step": 7460 + }, + { + "epoch": 1.2394366197183098, + "grad_norm": 0.9367121458053589, + "learning_rate": 1.4599757187594948e-05, + "loss": 4.2111, + "step": 7480 + }, + { + "epoch": 1.2427506213753108, + "grad_norm": 1.0097243785858154, + "learning_rate": 1.456548324249078e-05, + "loss": 4.2276, + "step": 7500 + }, + { + "epoch": 1.2460646230323116, + "grad_norm": 0.9057493209838867, + "learning_rate": 1.4531141409775339e-05, + "loss": 4.2117, + "step": 7520 + }, + { + "epoch": 1.2493786246893124, + "grad_norm": 1.0492953062057495, + "learning_rate": 1.4496732200103211e-05, + "loss": 4.215, + "step": 7540 + }, + { + "epoch": 1.2493786246893124, + "eval_loss": 4.231438159942627, + "eval_runtime": 183.1978, + "eval_samples_per_second": 117.125, + "eval_steps_per_second": 14.645, + "step": 7540 + }, + { + "epoch": 1.2526926263463132, + "grad_norm": 0.9622147083282471, + "learning_rate": 1.4462256125130876e-05, + "loss": 4.2298, + "step": 7560 + }, + { + "epoch": 1.256006628003314, + "grad_norm": 0.8801897168159485, + "learning_rate": 1.4427713697509072e-05, + "loss": 4.2392, + "step": 7580 + }, + { + "epoch": 1.2593206296603148, + "grad_norm": 0.9456971883773804, + "learning_rate": 1.4393105430875189e-05, + "loss": 4.2013, + "step": 7600 + }, + { + "epoch": 1.2626346313173156, + "grad_norm": 0.966806948184967, + "learning_rate": 1.4358431839845632e-05, + "loss": 4.219, + "step": 7620 + }, + { + "epoch": 1.2659486329743164, + "grad_norm": 0.8955146074295044, + "learning_rate": 1.432369344000816e-05, + "loss": 4.2261, + "step": 7640 + }, + { + "epoch": 1.2692626346313174, + "grad_norm": 0.9939355254173279, + "learning_rate": 1.4288890747914215e-05, + "loss": 4.2202, + "step": 7660 + }, + { + "epoch": 1.2725766362883182, + "grad_norm": 0.9999364018440247, + "learning_rate": 1.4254024281071263e-05, + "loss": 4.2253, + "step": 7680 + }, + { + "epoch": 1.275890637945319, + "grad_norm": 0.952719509601593, + "learning_rate": 1.4219094557935069e-05, + "loss": 4.211, + "step": 7700 + }, + { + "epoch": 1.2792046396023198, + "grad_norm": 0.9277374744415283, + "learning_rate": 1.4184102097902024e-05, + "loss": 4.2375, + "step": 7720 + }, + { + "epoch": 1.2825186412593206, + "grad_norm": 1.01985764503479, + "learning_rate": 1.414904742130138e-05, + "loss": 4.2089, + "step": 7740 + }, + { + "epoch": 1.2858326429163214, + "grad_norm": 0.968488335609436, + "learning_rate": 1.411393104938755e-05, + "loss": 4.2226, + "step": 7760 + }, + { + "epoch": 1.2891466445733224, + "grad_norm": 0.8811097741127014, + "learning_rate": 1.407875350433233e-05, + "loss": 4.2193, + "step": 7780 + }, + { + "epoch": 1.2924606462303232, + "grad_norm": 
0.914741039276123, + "learning_rate": 1.4043515309217153e-05, + "loss": 4.2061, + "step": 7800 + }, + { + "epoch": 1.295774647887324, + "grad_norm": 0.976520299911499, + "learning_rate": 1.4008216988025298e-05, + "loss": 4.2178, + "step": 7820 + }, + { + "epoch": 1.2990886495443248, + "grad_norm": 0.8726244568824768, + "learning_rate": 1.397285906563411e-05, + "loss": 4.2229, + "step": 7840 + }, + { + "epoch": 1.3024026512013256, + "grad_norm": 0.9613338112831116, + "learning_rate": 1.3937442067807186e-05, + "loss": 4.2005, + "step": 7860 + }, + { + "epoch": 1.3057166528583264, + "grad_norm": 0.9856804609298706, + "learning_rate": 1.3901966521186556e-05, + "loss": 4.2092, + "step": 7880 + }, + { + "epoch": 1.3090306545153272, + "grad_norm": 0.9347455501556396, + "learning_rate": 1.3866432953284868e-05, + "loss": 4.2169, + "step": 7900 + }, + { + "epoch": 1.312344656172328, + "grad_norm": 0.9626860618591309, + "learning_rate": 1.3830841892477514e-05, + "loss": 4.2091, + "step": 7920 + }, + { + "epoch": 1.3156586578293288, + "grad_norm": 0.8912602066993713, + "learning_rate": 1.3795193867994808e-05, + "loss": 4.1994, + "step": 7940 + }, + { + "epoch": 1.3189726594863298, + "grad_norm": 0.9122039675712585, + "learning_rate": 1.3759489409914091e-05, + "loss": 4.2183, + "step": 7960 + }, + { + "epoch": 1.3222866611433306, + "grad_norm": 0.9546826481819153, + "learning_rate": 1.3723729049151861e-05, + "loss": 4.2165, + "step": 7980 + }, + { + "epoch": 1.3256006628003314, + "grad_norm": 0.9048042893409729, + "learning_rate": 1.3687913317455877e-05, + "loss": 4.2119, + "step": 8000 + }, + { + "epoch": 1.3289146644573322, + "grad_norm": 0.928930401802063, + "learning_rate": 1.3652042747397243e-05, + "loss": 4.2097, + "step": 8020 + }, + { + "epoch": 1.332228666114333, + "grad_norm": 1.0380319356918335, + "learning_rate": 1.36161178723625e-05, + "loss": 4.2324, + "step": 8040 + }, + { + "epoch": 1.335542667771334, + "grad_norm": 0.9912282824516296, + "learning_rate": 1.3580139226545698e-05, + "loss": 4.2084, + "step": 8060 + }, + { + "epoch": 1.3388566694283348, + "grad_norm": 1.1464933156967163, + "learning_rate": 1.3544107344940431e-05, + "loss": 4.2162, + "step": 8080 + }, + { + "epoch": 1.3421706710853356, + "grad_norm": 0.9147726893424988, + "learning_rate": 1.3508022763331908e-05, + "loss": 4.2015, + "step": 8100 + }, + { + "epoch": 1.3454846727423364, + "grad_norm": 0.9918350577354431, + "learning_rate": 1.3471886018288967e-05, + "loss": 4.2183, + "step": 8120 + }, + { + "epoch": 1.3487986743993372, + "grad_norm": 0.9817278981208801, + "learning_rate": 1.3435697647156105e-05, + "loss": 4.2142, + "step": 8140 + }, + { + "epoch": 1.352112676056338, + "grad_norm": 1.0856775045394897, + "learning_rate": 1.3399458188045486e-05, + "loss": 4.2162, + "step": 8160 + }, + { + "epoch": 1.3554266777133388, + "grad_norm": 0.9583995342254639, + "learning_rate": 1.3363168179828947e-05, + "loss": 4.2098, + "step": 8180 + }, + { + "epoch": 1.3587406793703396, + "grad_norm": 0.9794642925262451, + "learning_rate": 1.3326828162129966e-05, + "loss": 4.2084, + "step": 8200 + }, + { + "epoch": 1.3620546810273404, + "grad_norm": 1.0175050497055054, + "learning_rate": 1.3290438675315655e-05, + "loss": 4.2043, + "step": 8220 + }, + { + "epoch": 1.3653686826843414, + "grad_norm": 0.9913382530212402, + "learning_rate": 1.3254000260488722e-05, + "loss": 4.2258, + "step": 8240 + }, + { + "epoch": 1.3686826843413422, + "grad_norm": 1.0796618461608887, + "learning_rate": 1.3217513459479418e-05, + "loss": 4.2015, + "step": 
8260 + }, + { + "epoch": 1.371996685998343, + "grad_norm": 0.9818981885910034, + "learning_rate": 1.318097881483749e-05, + "loss": 4.2194, + "step": 8280 + }, + { + "epoch": 1.3753106876553438, + "grad_norm": 1.053113579750061, + "learning_rate": 1.3144396869824103e-05, + "loss": 4.2009, + "step": 8300 + }, + { + "epoch": 1.3786246893123446, + "grad_norm": 0.9535871744155884, + "learning_rate": 1.310776816840377e-05, + "loss": 4.213, + "step": 8320 + }, + { + "epoch": 1.3819386909693454, + "grad_norm": 0.9760191440582275, + "learning_rate": 1.3071093255236259e-05, + "loss": 4.2167, + "step": 8340 + }, + { + "epoch": 1.3852526926263464, + "grad_norm": 1.0344904661178589, + "learning_rate": 1.3034372675668492e-05, + "loss": 4.2131, + "step": 8360 + }, + { + "epoch": 1.3885666942833472, + "grad_norm": 1.0134178400039673, + "learning_rate": 1.2997606975726443e-05, + "loss": 4.2055, + "step": 8380 + }, + { + "epoch": 1.391880695940348, + "grad_norm": 0.9983969330787659, + "learning_rate": 1.2960796702107016e-05, + "loss": 4.2217, + "step": 8400 + }, + { + "epoch": 1.3951946975973488, + "grad_norm": 0.9546549916267395, + "learning_rate": 1.2923942402169915e-05, + "loss": 4.1993, + "step": 8420 + }, + { + "epoch": 1.3985086992543496, + "grad_norm": 0.9388076663017273, + "learning_rate": 1.28870446239295e-05, + "loss": 4.1999, + "step": 8440 + }, + { + "epoch": 1.4018227009113504, + "grad_norm": 0.9410669803619385, + "learning_rate": 1.2850103916046643e-05, + "loss": 4.2023, + "step": 8460 + }, + { + "epoch": 1.4051367025683512, + "grad_norm": 1.000666618347168, + "learning_rate": 1.2813120827820575e-05, + "loss": 4.2113, + "step": 8480 + }, + { + "epoch": 1.408450704225352, + "grad_norm": 0.9235167503356934, + "learning_rate": 1.2776095909180704e-05, + "loss": 4.1981, + "step": 8500 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.9727587103843689, + "learning_rate": 1.2739029710678455e-05, + "loss": 4.2116, + "step": 8520 + }, + { + "epoch": 1.4150787075393538, + "grad_norm": 0.9622088074684143, + "learning_rate": 1.2701922783479069e-05, + "loss": 4.2064, + "step": 8540 + }, + { + "epoch": 1.4183927091963546, + "grad_norm": 0.937461793422699, + "learning_rate": 1.266477567935341e-05, + "loss": 4.2135, + "step": 8560 + }, + { + "epoch": 1.4217067108533554, + "grad_norm": 0.9551966190338135, + "learning_rate": 1.2627588950669771e-05, + "loss": 4.2222, + "step": 8580 + }, + { + "epoch": 1.4250207125103562, + "grad_norm": 0.866661787033081, + "learning_rate": 1.2590363150385642e-05, + "loss": 4.2006, + "step": 8600 + }, + { + "epoch": 1.428334714167357, + "grad_norm": 0.9046264290809631, + "learning_rate": 1.2553098832039513e-05, + "loss": 4.1837, + "step": 8620 + }, + { + "epoch": 1.431648715824358, + "grad_norm": 0.9259505271911621, + "learning_rate": 1.2515796549742611e-05, + "loss": 4.1949, + "step": 8640 + }, + { + "epoch": 1.4349627174813588, + "grad_norm": 1.0003501176834106, + "learning_rate": 1.247845685817069e-05, + "loss": 4.213, + "step": 8660 + }, + { + "epoch": 1.4382767191383596, + "grad_norm": 0.910469651222229, + "learning_rate": 1.2441080312555766e-05, + "loss": 4.205, + "step": 8680 + }, + { + "epoch": 1.4415907207953604, + "grad_norm": 0.901976466178894, + "learning_rate": 1.2403667468677867e-05, + "loss": 4.2, + "step": 8700 + }, + { + "epoch": 1.4449047224523612, + "grad_norm": 0.9283767342567444, + "learning_rate": 1.2366218882856757e-05, + "loss": 4.2017, + "step": 8720 + }, + { + "epoch": 1.448218724109362, + "grad_norm": 0.9129683375358582, + "learning_rate": 
1.2328735111943697e-05, + "loss": 4.2152, + "step": 8740 + }, + { + "epoch": 1.4515327257663628, + "grad_norm": 0.9534892439842224, + "learning_rate": 1.2291216713313119e-05, + "loss": 4.1924, + "step": 8760 + }, + { + "epoch": 1.4548467274233636, + "grad_norm": 0.9333394169807434, + "learning_rate": 1.2253664244854371e-05, + "loss": 4.1926, + "step": 8780 + }, + { + "epoch": 1.4581607290803644, + "grad_norm": 1.0264989137649536, + "learning_rate": 1.2216078264963412e-05, + "loss": 4.1898, + "step": 8800 + }, + { + "epoch": 1.4614747307373654, + "grad_norm": 0.9208402037620544, + "learning_rate": 1.21784593325345e-05, + "loss": 4.2064, + "step": 8820 + }, + { + "epoch": 1.4647887323943662, + "grad_norm": 1.0441491603851318, + "learning_rate": 1.2140808006951898e-05, + "loss": 4.1906, + "step": 8840 + }, + { + "epoch": 1.468102734051367, + "grad_norm": 0.9142622947692871, + "learning_rate": 1.2103124848081542e-05, + "loss": 4.2125, + "step": 8860 + }, + { + "epoch": 1.4714167357083678, + "grad_norm": 1.0053941011428833, + "learning_rate": 1.2065410416262724e-05, + "loss": 4.2072, + "step": 8880 + }, + { + "epoch": 1.4747307373653686, + "grad_norm": 0.9855654835700989, + "learning_rate": 1.2027665272299753e-05, + "loss": 4.2007, + "step": 8900 + }, + { + "epoch": 1.4780447390223694, + "grad_norm": 1.0185344219207764, + "learning_rate": 1.1989889977453626e-05, + "loss": 4.1957, + "step": 8920 + }, + { + "epoch": 1.4813587406793705, + "grad_norm": 0.9754390716552734, + "learning_rate": 1.1952085093433668e-05, + "loss": 4.2235, + "step": 8940 + }, + { + "epoch": 1.4846727423363713, + "grad_norm": 0.9458717703819275, + "learning_rate": 1.1914251182389195e-05, + "loss": 4.1901, + "step": 8960 + }, + { + "epoch": 1.487986743993372, + "grad_norm": 1.0231667757034302, + "learning_rate": 1.1876388806901149e-05, + "loss": 4.2011, + "step": 8980 + }, + { + "epoch": 1.4913007456503728, + "grad_norm": 0.926291823387146, + "learning_rate": 1.1838498529973724e-05, + "loss": 4.2104, + "step": 9000 + }, + { + "epoch": 1.4946147473073736, + "grad_norm": 1.0425750017166138, + "learning_rate": 1.1800580915026004e-05, + "loss": 4.2027, + "step": 9020 + }, + { + "epoch": 1.4979287489643744, + "grad_norm": 1.0140166282653809, + "learning_rate": 1.1762636525883587e-05, + "loss": 4.1969, + "step": 9040 + }, + { + "epoch": 1.4992543496271749, + "eval_loss": 4.210264205932617, + "eval_runtime": 185.247, + "eval_samples_per_second": 115.829, + "eval_steps_per_second": 14.483, + "step": 9048 + }, + { + "epoch": 1.5012427506213752, + "grad_norm": 0.9570029377937317, + "learning_rate": 1.1724665926770187e-05, + "loss": 4.207, + "step": 9060 + }, + { + "epoch": 1.504556752278376, + "grad_norm": 0.9974936246871948, + "learning_rate": 1.1686669682299265e-05, + "loss": 4.1934, + "step": 9080 + }, + { + "epoch": 1.5078707539353768, + "grad_norm": 0.9549845457077026, + "learning_rate": 1.1648648357465617e-05, + "loss": 4.2108, + "step": 9100 + }, + { + "epoch": 1.5111847555923776, + "grad_norm": 1.033993124961853, + "learning_rate": 1.161060251763698e-05, + "loss": 4.2105, + "step": 9120 + }, + { + "epoch": 1.5144987572493787, + "grad_norm": 0.977915346622467, + "learning_rate": 1.1572532728545615e-05, + "loss": 4.1883, + "step": 9140 + }, + { + "epoch": 1.5178127589063795, + "grad_norm": 0.9645556807518005, + "learning_rate": 1.1534439556279916e-05, + "loss": 4.2035, + "step": 9160 + }, + { + "epoch": 1.5211267605633803, + "grad_norm": 1.0582506656646729, + "learning_rate": 1.1496323567275964e-05, + "loss": 4.199, + "step": 
9180 + }, + { + "epoch": 1.5244407622203813, + "grad_norm": 0.9036635160446167, + "learning_rate": 1.1458185328309136e-05, + "loss": 4.1853, + "step": 9200 + }, + { + "epoch": 1.527754763877382, + "grad_norm": 1.0219875574111938, + "learning_rate": 1.1420025406485644e-05, + "loss": 4.1896, + "step": 9220 + }, + { + "epoch": 1.5310687655343829, + "grad_norm": 0.998590350151062, + "learning_rate": 1.138184436923413e-05, + "loss": 4.2042, + "step": 9240 + }, + { + "epoch": 1.5343827671913837, + "grad_norm": 0.9597691297531128, + "learning_rate": 1.1343642784297212e-05, + "loss": 4.1928, + "step": 9260 + }, + { + "epoch": 1.5376967688483845, + "grad_norm": 0.9390765428543091, + "learning_rate": 1.1305421219723048e-05, + "loss": 4.1932, + "step": 9280 + }, + { + "epoch": 1.5410107705053853, + "grad_norm": 0.9515460729598999, + "learning_rate": 1.126718024385689e-05, + "loss": 4.204, + "step": 9300 + }, + { + "epoch": 1.544324772162386, + "grad_norm": 0.9674809575080872, + "learning_rate": 1.122892042533263e-05, + "loss": 4.17, + "step": 9320 + }, + { + "epoch": 1.5476387738193869, + "grad_norm": 0.9792401790618896, + "learning_rate": 1.1190642333064343e-05, + "loss": 4.1987, + "step": 9340 + }, + { + "epoch": 1.5509527754763877, + "grad_norm": 0.9243532419204712, + "learning_rate": 1.1152346536237837e-05, + "loss": 4.1897, + "step": 9360 + }, + { + "epoch": 1.5542667771333885, + "grad_norm": 1.041626214981079, + "learning_rate": 1.111403360430217e-05, + "loss": 4.1983, + "step": 9380 + }, + { + "epoch": 1.5575807787903893, + "grad_norm": 0.9920730590820312, + "learning_rate": 1.1075704106961214e-05, + "loss": 4.1936, + "step": 9400 + }, + { + "epoch": 1.5608947804473903, + "grad_norm": 0.886774480342865, + "learning_rate": 1.1037358614165146e-05, + "loss": 4.1978, + "step": 9420 + }, + { + "epoch": 1.564208782104391, + "grad_norm": 0.9583536982536316, + "learning_rate": 1.0998997696102002e-05, + "loss": 4.1979, + "step": 9440 + }, + { + "epoch": 1.5675227837613919, + "grad_norm": 1.0704070329666138, + "learning_rate": 1.0960621923189184e-05, + "loss": 4.2022, + "step": 9460 + }, + { + "epoch": 1.5708367854183927, + "grad_norm": 1.0425106287002563, + "learning_rate": 1.0922231866064981e-05, + "loss": 4.1956, + "step": 9480 + }, + { + "epoch": 1.5741507870753937, + "grad_norm": 1.0345118045806885, + "learning_rate": 1.0883828095580086e-05, + "loss": 4.1976, + "step": 9500 + }, + { + "epoch": 1.5774647887323945, + "grad_norm": 0.9319311380386353, + "learning_rate": 1.084541118278911e-05, + "loss": 4.2084, + "step": 9520 + }, + { + "epoch": 1.5807787903893953, + "grad_norm": 0.9795758128166199, + "learning_rate": 1.0806981698942082e-05, + "loss": 4.1814, + "step": 9540 + }, + { + "epoch": 1.584092792046396, + "grad_norm": 1.0910810232162476, + "learning_rate": 1.0768540215475962e-05, + "loss": 4.2026, + "step": 9560 + }, + { + "epoch": 1.5874067937033969, + "grad_norm": 0.9699711799621582, + "learning_rate": 1.073008730400614e-05, + "loss": 4.1913, + "step": 9580 + }, + { + "epoch": 1.5907207953603977, + "grad_norm": 0.9747998118400574, + "learning_rate": 1.0691623536317937e-05, + "loss": 4.1926, + "step": 9600 + }, + { + "epoch": 1.5940347970173985, + "grad_norm": 0.9838584065437317, + "learning_rate": 1.06531494843581e-05, + "loss": 4.2161, + "step": 9620 + }, + { + "epoch": 1.5973487986743993, + "grad_norm": 0.9865525364875793, + "learning_rate": 1.0614665720226316e-05, + "loss": 4.1927, + "step": 9640 + }, + { + "epoch": 1.6006628003314, + "grad_norm": 0.9045295715332031, + "learning_rate": 
1.0576172816166672e-05, + "loss": 4.1949, + "step": 9660 + }, + { + "epoch": 1.6039768019884009, + "grad_norm": 0.9054475426673889, + "learning_rate": 1.053767134455917e-05, + "loss": 4.201, + "step": 9680 + }, + { + "epoch": 1.6072908036454017, + "grad_norm": 0.9848572611808777, + "learning_rate": 1.049916187791121e-05, + "loss": 4.1954, + "step": 9700 + }, + { + "epoch": 1.6106048053024027, + "grad_norm": 0.9712388515472412, + "learning_rate": 1.0460644988849076e-05, + "loss": 4.1875, + "step": 9720 + }, + { + "epoch": 1.6139188069594035, + "grad_norm": 0.9746183156967163, + "learning_rate": 1.0422121250109423e-05, + "loss": 4.1951, + "step": 9740 + }, + { + "epoch": 1.6172328086164043, + "grad_norm": 0.9594894647598267, + "learning_rate": 1.0383591234530752e-05, + "loss": 4.2002, + "step": 9760 + }, + { + "epoch": 1.6205468102734053, + "grad_norm": 0.9843876957893372, + "learning_rate": 1.0345055515044908e-05, + "loss": 4.1876, + "step": 9780 + }, + { + "epoch": 1.623860811930406, + "grad_norm": 0.8735640645027161, + "learning_rate": 1.0306514664668548e-05, + "loss": 4.1953, + "step": 9800 + }, + { + "epoch": 1.627174813587407, + "grad_norm": 0.922606885433197, + "learning_rate": 1.0267969256494622e-05, + "loss": 4.204, + "step": 9820 + }, + { + "epoch": 1.6304888152444077, + "grad_norm": 1.0555611848831177, + "learning_rate": 1.0229419863683854e-05, + "loss": 4.1843, + "step": 9840 + }, + { + "epoch": 1.6338028169014085, + "grad_norm": 1.0076203346252441, + "learning_rate": 1.0190867059456225e-05, + "loss": 4.1911, + "step": 9860 + }, + { + "epoch": 1.6371168185584093, + "grad_norm": 0.9469670057296753, + "learning_rate": 1.0152311417082435e-05, + "loss": 4.1859, + "step": 9880 + }, + { + "epoch": 1.64043082021541, + "grad_norm": 1.0290344953536987, + "learning_rate": 1.0113753509875389e-05, + "loss": 4.189, + "step": 9900 + }, + { + "epoch": 1.6437448218724109, + "grad_norm": 0.9199296832084656, + "learning_rate": 1.0075193911181668e-05, + "loss": 4.1967, + "step": 9920 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 1.2012604475021362, + "learning_rate": 1.0036633194373016e-05, + "loss": 4.2006, + "step": 9940 + }, + { + "epoch": 1.6503728251864125, + "grad_norm": 0.984897255897522, + "learning_rate": 9.99807193283778e-06, + "loss": 4.176, + "step": 9960 + }, + { + "epoch": 1.6536868268434133, + "grad_norm": 0.9443606734275818, + "learning_rate": 9.95951069997243e-06, + "loss": 4.1968, + "step": 9980 + }, + { + "epoch": 1.6570008285004143, + "grad_norm": 1.0236579179763794, + "learning_rate": 9.920950069172995e-06, + "loss": 4.186, + "step": 10000 + }, + { + "epoch": 1.660314830157415, + "grad_norm": 0.9176040291786194, + "learning_rate": 9.882390613826565e-06, + "loss": 4.179, + "step": 10020 + }, + { + "epoch": 1.663628831814416, + "grad_norm": 0.9962909817695618, + "learning_rate": 9.843832907302736e-06, + "loss": 4.18, + "step": 10040 + }, + { + "epoch": 1.6669428334714167, + "grad_norm": 0.9841374158859253, + "learning_rate": 9.805277522945112e-06, + "loss": 4.1857, + "step": 10060 + }, + { + "epoch": 1.6702568351284177, + "grad_norm": 0.9344466924667358, + "learning_rate": 9.766725034062757e-06, + "loss": 4.1999, + "step": 10080 + }, + { + "epoch": 1.6735708367854185, + "grad_norm": 0.9379906058311462, + "learning_rate": 9.728176013921688e-06, + "loss": 4.1945, + "step": 10100 + }, + { + "epoch": 1.6768848384424193, + "grad_norm": 0.938956618309021, + "learning_rate": 9.689631035736336e-06, + "loss": 4.2072, + "step": 10120 + }, + { + "epoch": 1.68019884009942, + 
"grad_norm": 0.905906617641449, + "learning_rate": 9.651090672661036e-06, + "loss": 4.1878, + "step": 10140 + }, + { + "epoch": 1.683512841756421, + "grad_norm": 1.03682279586792, + "learning_rate": 9.612555497781492e-06, + "loss": 4.1991, + "step": 10160 + }, + { + "epoch": 1.6868268434134217, + "grad_norm": 0.9360038042068481, + "learning_rate": 9.574026084106265e-06, + "loss": 4.2123, + "step": 10180 + }, + { + "epoch": 1.6901408450704225, + "grad_norm": 1.1169397830963135, + "learning_rate": 9.535503004558245e-06, + "loss": 4.2039, + "step": 10200 + }, + { + "epoch": 1.6934548467274233, + "grad_norm": 1.166301965713501, + "learning_rate": 9.496986831966133e-06, + "loss": 4.2037, + "step": 10220 + }, + { + "epoch": 1.696768848384424, + "grad_norm": 0.9521096348762512, + "learning_rate": 9.458478139055928e-06, + "loss": 4.2001, + "step": 10240 + }, + { + "epoch": 1.700082850041425, + "grad_norm": 0.9201565384864807, + "learning_rate": 9.419977498442416e-06, + "loss": 4.1789, + "step": 10260 + }, + { + "epoch": 1.7033968516984257, + "grad_norm": 1.0877536535263062, + "learning_rate": 9.38148548262063e-06, + "loss": 4.1893, + "step": 10280 + }, + { + "epoch": 1.7067108533554267, + "grad_norm": 0.9479674696922302, + "learning_rate": 9.343002663957376e-06, + "loss": 4.1993, + "step": 10300 + }, + { + "epoch": 1.7100248550124275, + "grad_norm": 1.0255833864212036, + "learning_rate": 9.304529614682675e-06, + "loss": 4.1699, + "step": 10320 + }, + { + "epoch": 1.7133388566694283, + "grad_norm": 0.9878126978874207, + "learning_rate": 9.266066906881303e-06, + "loss": 4.1793, + "step": 10340 + }, + { + "epoch": 1.716652858326429, + "grad_norm": 1.0169334411621094, + "learning_rate": 9.227615112484248e-06, + "loss": 4.1888, + "step": 10360 + }, + { + "epoch": 1.7199668599834301, + "grad_norm": 0.9481993913650513, + "learning_rate": 9.189174803260221e-06, + "loss": 4.1964, + "step": 10380 + }, + { + "epoch": 1.723280861640431, + "grad_norm": 0.976496696472168, + "learning_rate": 9.150746550807156e-06, + "loss": 4.1953, + "step": 10400 + }, + { + "epoch": 1.7265948632974317, + "grad_norm": 0.8949211835861206, + "learning_rate": 9.112330926543692e-06, + "loss": 4.1837, + "step": 10420 + }, + { + "epoch": 1.7299088649544325, + "grad_norm": 0.9897208213806152, + "learning_rate": 9.073928501700711e-06, + "loss": 4.1889, + "step": 10440 + }, + { + "epoch": 1.7332228666114333, + "grad_norm": 0.934702455997467, + "learning_rate": 9.0355398473128e-06, + "loss": 4.1869, + "step": 10460 + }, + { + "epoch": 1.7365368682684341, + "grad_norm": 0.962416410446167, + "learning_rate": 8.997165534209802e-06, + "loss": 4.2035, + "step": 10480 + }, + { + "epoch": 1.739850869925435, + "grad_norm": 0.9377831220626831, + "learning_rate": 8.958806133008302e-06, + "loss": 4.1863, + "step": 10500 + }, + { + "epoch": 1.7431648715824357, + "grad_norm": 0.9816327095031738, + "learning_rate": 8.92046221410314e-06, + "loss": 4.1951, + "step": 10520 + }, + { + "epoch": 1.7464788732394365, + "grad_norm": 0.8874932527542114, + "learning_rate": 8.882134347658955e-06, + "loss": 4.1984, + "step": 10540 + }, + { + "epoch": 1.7491300745650373, + "eval_loss": 4.1958441734313965, + "eval_runtime": 184.5629, + "eval_samples_per_second": 116.258, + "eval_steps_per_second": 14.537, + "step": 10556 + }, + { + "epoch": 1.7497928748964373, + "grad_norm": 0.9585704207420349, + "learning_rate": 8.84382310360167e-06, + "loss": 4.195, + "step": 10560 + }, + { + "epoch": 1.7531068765534383, + "grad_norm": 0.9877380132675171, + "learning_rate": 
8.805529051610053e-06, + "loss": 4.1858, + "step": 10580 + }, + { + "epoch": 1.7564208782104391, + "grad_norm": 0.9174914360046387, + "learning_rate": 8.767252761107227e-06, + "loss": 4.1802, + "step": 10600 + }, + { + "epoch": 1.75973487986744, + "grad_norm": 0.9251859188079834, + "learning_rate": 8.7289948012522e-06, + "loss": 4.1871, + "step": 10620 + }, + { + "epoch": 1.7630488815244407, + "grad_norm": 0.8957480192184448, + "learning_rate": 8.690755740931415e-06, + "loss": 4.1763, + "step": 10640 + }, + { + "epoch": 1.7663628831814417, + "grad_norm": 0.9659335017204285, + "learning_rate": 8.652536148750272e-06, + "loss": 4.1713, + "step": 10660 + }, + { + "epoch": 1.7696768848384425, + "grad_norm": 0.9275119304656982, + "learning_rate": 8.614336593024702e-06, + "loss": 4.1841, + "step": 10680 + }, + { + "epoch": 1.7729908864954433, + "grad_norm": 0.9698359370231628, + "learning_rate": 8.576157641772688e-06, + "loss": 4.1738, + "step": 10700 + }, + { + "epoch": 1.7763048881524441, + "grad_norm": 0.9098818898200989, + "learning_rate": 8.537999862705826e-06, + "loss": 4.1794, + "step": 10720 + }, + { + "epoch": 1.779618889809445, + "grad_norm": 1.0119200944900513, + "learning_rate": 8.499863823220898e-06, + "loss": 4.1845, + "step": 10740 + }, + { + "epoch": 1.7829328914664457, + "grad_norm": 0.9860332012176514, + "learning_rate": 8.461750090391413e-06, + "loss": 4.1983, + "step": 10760 + }, + { + "epoch": 1.7862468931234465, + "grad_norm": 1.0655232667922974, + "learning_rate": 8.423659230959196e-06, + "loss": 4.1816, + "step": 10780 + }, + { + "epoch": 1.7895608947804473, + "grad_norm": 1.0304181575775146, + "learning_rate": 8.385591811325935e-06, + "loss": 4.1728, + "step": 10800 + }, + { + "epoch": 1.7928748964374481, + "grad_norm": 0.9768161177635193, + "learning_rate": 8.347548397544794e-06, + "loss": 4.1644, + "step": 10820 + }, + { + "epoch": 1.796188898094449, + "grad_norm": 0.9339047074317932, + "learning_rate": 8.309529555311958e-06, + "loss": 4.191, + "step": 10840 + }, + { + "epoch": 1.7995028997514497, + "grad_norm": 0.976172924041748, + "learning_rate": 8.27153584995825e-06, + "loss": 4.1711, + "step": 10860 + }, + { + "epoch": 1.8028169014084507, + "grad_norm": 0.9775459170341492, + "learning_rate": 8.23356784644071e-06, + "loss": 4.1698, + "step": 10880 + }, + { + "epoch": 1.8061309030654515, + "grad_norm": 0.9672629237174988, + "learning_rate": 8.195626109334196e-06, + "loss": 4.1854, + "step": 10900 + }, + { + "epoch": 1.8094449047224523, + "grad_norm": 0.9547395706176758, + "learning_rate": 8.157711202822993e-06, + "loss": 4.194, + "step": 10920 + }, + { + "epoch": 1.8127589063794531, + "grad_norm": 1.073926568031311, + "learning_rate": 8.119823690692428e-06, + "loss": 4.1604, + "step": 10940 + }, + { + "epoch": 1.8160729080364542, + "grad_norm": 1.1207832098007202, + "learning_rate": 8.081964136320466e-06, + "loss": 4.1897, + "step": 10960 + }, + { + "epoch": 1.819386909693455, + "grad_norm": 0.8986170291900635, + "learning_rate": 8.044133102669363e-06, + "loss": 4.1608, + "step": 10980 + }, + { + "epoch": 1.8227009113504558, + "grad_norm": 0.9992567300796509, + "learning_rate": 8.006331152277262e-06, + "loss": 4.1733, + "step": 11000 + }, + { + "epoch": 1.8260149130074566, + "grad_norm": 0.9626819491386414, + "learning_rate": 7.968558847249863e-06, + "loss": 4.163, + "step": 11020 + }, + { + "epoch": 1.8293289146644574, + "grad_norm": 0.9611823558807373, + "learning_rate": 7.930816749252032e-06, + "loss": 4.1674, + "step": 11040 + }, + { + "epoch": 
1.8326429163214581, + "grad_norm": 0.9247469902038574, + "learning_rate": 7.893105419499473e-06, + "loss": 4.1947, + "step": 11060 + }, + { + "epoch": 1.835956917978459, + "grad_norm": 0.969030499458313, + "learning_rate": 7.855425418750373e-06, + "loss": 4.1882, + "step": 11080 + }, + { + "epoch": 1.8392709196354597, + "grad_norm": 1.0117552280426025, + "learning_rate": 7.817777307297053e-06, + "loss": 4.1727, + "step": 11100 + }, + { + "epoch": 1.8425849212924605, + "grad_norm": 0.9557654857635498, + "learning_rate": 7.780161644957666e-06, + "loss": 4.1819, + "step": 11120 + }, + { + "epoch": 1.8458989229494613, + "grad_norm": 0.9352748990058899, + "learning_rate": 7.742578991067841e-06, + "loss": 4.1841, + "step": 11140 + }, + { + "epoch": 1.8492129246064621, + "grad_norm": 0.9859512448310852, + "learning_rate": 7.705029904472383e-06, + "loss": 4.1806, + "step": 11160 + }, + { + "epoch": 1.8525269262634632, + "grad_norm": 0.9357641339302063, + "learning_rate": 7.66751494351697e-06, + "loss": 4.1946, + "step": 11180 + }, + { + "epoch": 1.855840927920464, + "grad_norm": 0.8775931596755981, + "learning_rate": 7.63003466603982e-06, + "loss": 4.1855, + "step": 11200 + }, + { + "epoch": 1.8591549295774648, + "grad_norm": 0.9871083498001099, + "learning_rate": 7.592589629363436e-06, + "loss": 4.1833, + "step": 11220 + }, + { + "epoch": 1.8624689312344658, + "grad_norm": 0.9627810716629028, + "learning_rate": 7.5551803902862805e-06, + "loss": 4.1932, + "step": 11240 + }, + { + "epoch": 1.8657829328914666, + "grad_norm": 0.968146562576294, + "learning_rate": 7.517807505074533e-06, + "loss": 4.1786, + "step": 11260 + }, + { + "epoch": 1.8690969345484674, + "grad_norm": 0.9244858026504517, + "learning_rate": 7.480471529453788e-06, + "loss": 4.1698, + "step": 11280 + }, + { + "epoch": 1.8724109362054682, + "grad_norm": 0.995374321937561, + "learning_rate": 7.443173018600804e-06, + "loss": 4.1974, + "step": 11300 + }, + { + "epoch": 1.875724937862469, + "grad_norm": 1.0751982927322388, + "learning_rate": 7.4059125271352575e-06, + "loss": 4.1533, + "step": 11320 + }, + { + "epoch": 1.8790389395194698, + "grad_norm": 0.8667049407958984, + "learning_rate": 7.368690609111468e-06, + "loss": 4.1835, + "step": 11340 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.9692635536193848, + "learning_rate": 7.331507818010195e-06, + "loss": 4.1699, + "step": 11360 + }, + { + "epoch": 1.8856669428334714, + "grad_norm": 0.9785922169685364, + "learning_rate": 7.294364706730386e-06, + "loss": 4.155, + "step": 11380 + }, + { + "epoch": 1.8889809444904722, + "grad_norm": 1.056333303451538, + "learning_rate": 7.25726182758095e-06, + "loss": 4.1721, + "step": 11400 + }, + { + "epoch": 1.892294946147473, + "grad_norm": 1.010582685470581, + "learning_rate": 7.2201997322725695e-06, + "loss": 4.1878, + "step": 11420 + }, + { + "epoch": 1.8956089478044738, + "grad_norm": 1.0259019136428833, + "learning_rate": 7.183178971909464e-06, + "loss": 4.1821, + "step": 11440 + }, + { + "epoch": 1.8989229494614748, + "grad_norm": 1.0169850587844849, + "learning_rate": 7.146200096981228e-06, + "loss": 4.1578, + "step": 11460 + }, + { + "epoch": 1.9022369511184756, + "grad_norm": 0.916630744934082, + "learning_rate": 7.109263657354617e-06, + "loss": 4.1748, + "step": 11480 + }, + { + "epoch": 1.9055509527754764, + "grad_norm": 1.0078721046447754, + "learning_rate": 7.072370202265397e-06, + "loss": 4.1708, + "step": 11500 + }, + { + "epoch": 1.9088649544324772, + "grad_norm": 0.9932781457901001, + "learning_rate": 
7.035520280310156e-06, + "loss": 4.184, + "step": 11520 + }, + { + "epoch": 1.9121789560894782, + "grad_norm": 0.9127968549728394, + "learning_rate": 6.998714439438152e-06, + "loss": 4.1741, + "step": 11540 + }, + { + "epoch": 1.915492957746479, + "grad_norm": 0.9821435213088989, + "learning_rate": 6.961953226943181e-06, + "loss": 4.1698, + "step": 11560 + }, + { + "epoch": 1.9188069594034798, + "grad_norm": 0.9207146763801575, + "learning_rate": 6.925237189455409e-06, + "loss": 4.1828, + "step": 11580 + }, + { + "epoch": 1.9221209610604806, + "grad_norm": 0.9620000123977661, + "learning_rate": 6.888566872933276e-06, + "loss": 4.1689, + "step": 11600 + }, + { + "epoch": 1.9254349627174814, + "grad_norm": 0.9162537455558777, + "learning_rate": 6.851942822655357e-06, + "loss": 4.1753, + "step": 11620 + }, + { + "epoch": 1.9287489643744822, + "grad_norm": 0.980129599571228, + "learning_rate": 6.815365583212252e-06, + "loss": 4.1742, + "step": 11640 + }, + { + "epoch": 1.932062966031483, + "grad_norm": 0.9730511903762817, + "learning_rate": 6.778835698498513e-06, + "loss": 4.1714, + "step": 11660 + }, + { + "epoch": 1.9353769676884838, + "grad_norm": 1.029762625694275, + "learning_rate": 6.742353711704515e-06, + "loss": 4.1762, + "step": 11680 + }, + { + "epoch": 1.9386909693454846, + "grad_norm": 0.9680106043815613, + "learning_rate": 6.705920165308425e-06, + "loss": 4.1796, + "step": 11700 + }, + { + "epoch": 1.9420049710024854, + "grad_norm": 1.1075645685195923, + "learning_rate": 6.6695356010681e-06, + "loss": 4.1802, + "step": 11720 + }, + { + "epoch": 1.9453189726594862, + "grad_norm": 1.0098459720611572, + "learning_rate": 6.633200560013051e-06, + "loss": 4.1653, + "step": 11740 + }, + { + "epoch": 1.9486329743164872, + "grad_norm": 0.9555035829544067, + "learning_rate": 6.5969155824363874e-06, + "loss": 4.1844, + "step": 11760 + }, + { + "epoch": 1.951946975973488, + "grad_norm": 1.0087344646453857, + "learning_rate": 6.560681207886783e-06, + "loss": 4.1737, + "step": 11780 + }, + { + "epoch": 1.9552609776304888, + "grad_norm": 0.9341910481452942, + "learning_rate": 6.524497975160468e-06, + "loss": 4.1707, + "step": 11800 + }, + { + "epoch": 1.9585749792874898, + "grad_norm": 0.9860461354255676, + "learning_rate": 6.488366422293203e-06, + "loss": 4.1759, + "step": 11820 + }, + { + "epoch": 1.9618889809444906, + "grad_norm": 0.9648852944374084, + "learning_rate": 6.452287086552271e-06, + "loss": 4.1735, + "step": 11840 + }, + { + "epoch": 1.9652029826014914, + "grad_norm": 1.0442854166030884, + "learning_rate": 6.41626050442852e-06, + "loss": 4.1969, + "step": 11860 + }, + { + "epoch": 1.9685169842584922, + "grad_norm": 0.9219726920127869, + "learning_rate": 6.3802872116283375e-06, + "loss": 4.1855, + "step": 11880 + }, + { + "epoch": 1.971830985915493, + "grad_norm": 0.9622851014137268, + "learning_rate": 6.34436774306574e-06, + "loss": 4.158, + "step": 11900 + }, + { + "epoch": 1.9751449875724938, + "grad_norm": 1.1578805446624756, + "learning_rate": 6.3085026328543675e-06, + "loss": 4.1736, + "step": 11920 + }, + { + "epoch": 1.9784589892294946, + "grad_norm": 0.9219034910202026, + "learning_rate": 6.272692414299582e-06, + "loss": 4.1874, + "step": 11940 + }, + { + "epoch": 1.9817729908864954, + "grad_norm": 1.0071330070495605, + "learning_rate": 6.236937619890508e-06, + "loss": 4.1664, + "step": 11960 + }, + { + "epoch": 1.9850869925434962, + "grad_norm": 1.0246719121932983, + "learning_rate": 6.201238781292136e-06, + "loss": 4.1865, + "step": 11980 + }, + { + "epoch": 
1.988400994200497, + "grad_norm": 0.9987154603004456, + "learning_rate": 6.1655964293374016e-06, + "loss": 4.1794, + "step": 12000 + }, + { + "epoch": 1.9917149958574978, + "grad_norm": 0.9923566579818726, + "learning_rate": 6.130011094019292e-06, + "loss": 4.1793, + "step": 12020 + }, + { + "epoch": 1.9950289975144988, + "grad_norm": 0.9495494365692139, + "learning_rate": 6.094483304482985e-06, + "loss": 4.1703, + "step": 12040 + }, + { + "epoch": 1.9983429991714996, + "grad_norm": 0.9649271965026855, + "learning_rate": 6.0590135890179656e-06, + "loss": 4.1898, + "step": 12060 + }, + { + "epoch": 1.9990057995028998, + "eval_loss": 4.185189247131348, + "eval_runtime": 183.8947, + "eval_samples_per_second": 116.681, + "eval_steps_per_second": 14.59, + "step": 12064 + }, + { + "epoch": 2.0016570008285006, + "grad_norm": 1.043952465057373, + "learning_rate": 6.023602475050153e-06, + "loss": 4.1625, + "step": 12080 + }, + { + "epoch": 2.0049710024855014, + "grad_norm": 1.0042601823806763, + "learning_rate": 5.988250489134102e-06, + "loss": 4.1763, + "step": 12100 + }, + { + "epoch": 2.008285004142502, + "grad_norm": 0.9356751441955566, + "learning_rate": 5.952958156945124e-06, + "loss": 4.1701, + "step": 12120 + }, + { + "epoch": 2.011599005799503, + "grad_norm": 0.9411850571632385, + "learning_rate": 5.917726003271515e-06, + "loss": 4.1793, + "step": 12140 + }, + { + "epoch": 2.014913007456504, + "grad_norm": 0.8957574367523193, + "learning_rate": 5.882554552006718e-06, + "loss": 4.1606, + "step": 12160 + }, + { + "epoch": 2.0182270091135046, + "grad_norm": 0.9589739441871643, + "learning_rate": 5.847444326141551e-06, + "loss": 4.1652, + "step": 12180 + }, + { + "epoch": 2.0215410107705054, + "grad_norm": 0.9536027312278748, + "learning_rate": 5.812395847756426e-06, + "loss": 4.1643, + "step": 12200 + }, + { + "epoch": 2.024855012427506, + "grad_norm": 1.0450109243392944, + "learning_rate": 5.777409638013578e-06, + "loss": 4.1618, + "step": 12220 + }, + { + "epoch": 2.028169014084507, + "grad_norm": 0.953590452671051, + "learning_rate": 5.742486217149334e-06, + "loss": 4.1784, + "step": 12240 + }, + { + "epoch": 2.031483015741508, + "grad_norm": 0.9736341834068298, + "learning_rate": 5.707626104466357e-06, + "loss": 4.165, + "step": 12260 + }, + { + "epoch": 2.0347970173985086, + "grad_norm": 0.9720235466957092, + "learning_rate": 5.672829818325937e-06, + "loss": 4.185, + "step": 12280 + }, + { + "epoch": 2.0381110190555094, + "grad_norm": 0.985802412033081, + "learning_rate": 5.63809787614028e-06, + "loss": 4.1725, + "step": 12300 + }, + { + "epoch": 2.04142502071251, + "grad_norm": 0.9624573588371277, + "learning_rate": 5.603430794364808e-06, + "loss": 4.1851, + "step": 12320 + }, + { + "epoch": 2.044739022369511, + "grad_norm": 1.0084534883499146, + "learning_rate": 5.5688290884904935e-06, + "loss": 4.1794, + "step": 12340 + }, + { + "epoch": 2.048053024026512, + "grad_norm": 0.9534232020378113, + "learning_rate": 5.5342932730361735e-06, + "loss": 4.1734, + "step": 12360 + }, + { + "epoch": 2.051367025683513, + "grad_norm": 0.9948390126228333, + "learning_rate": 5.499823861540918e-06, + "loss": 4.1734, + "step": 12380 + }, + { + "epoch": 2.054681027340514, + "grad_norm": 0.9592251777648926, + "learning_rate": 5.465421366556391e-06, + "loss": 4.1957, + "step": 12400 + }, + { + "epoch": 2.0579950289975146, + "grad_norm": 0.9579429030418396, + "learning_rate": 5.431086299639214e-06, + "loss": 4.1602, + "step": 12420 + }, + { + "epoch": 2.0613090306545154, + "grad_norm": 0.9736006855964661, + 
"learning_rate": 5.3968191713433795e-06, + "loss": 4.1579, + "step": 12440 + }, + { + "epoch": 2.0646230323115162, + "grad_norm": 0.9202005863189697, + "learning_rate": 5.3626204912126375e-06, + "loss": 4.1829, + "step": 12460 + }, + { + "epoch": 2.067937033968517, + "grad_norm": 1.0262824296951294, + "learning_rate": 5.328490767772948e-06, + "loss": 4.161, + "step": 12480 + }, + { + "epoch": 2.071251035625518, + "grad_norm": 0.9637208580970764, + "learning_rate": 5.2944305085248815e-06, + "loss": 4.1809, + "step": 12500 + }, + { + "epoch": 2.0745650372825186, + "grad_norm": 0.9922279715538025, + "learning_rate": 5.260440219936111e-06, + "loss": 4.1838, + "step": 12520 + }, + { + "epoch": 2.0778790389395194, + "grad_norm": 0.9777281284332275, + "learning_rate": 5.226520407433855e-06, + "loss": 4.1658, + "step": 12540 + }, + { + "epoch": 2.08119304059652, + "grad_norm": 0.981768786907196, + "learning_rate": 5.192671575397364e-06, + "loss": 4.1665, + "step": 12560 + }, + { + "epoch": 2.084507042253521, + "grad_norm": 0.9591443538665771, + "learning_rate": 5.158894227150441e-06, + "loss": 4.1692, + "step": 12580 + }, + { + "epoch": 2.087821043910522, + "grad_norm": 0.9779587984085083, + "learning_rate": 5.1251888649539276e-06, + "loss": 4.1737, + "step": 12600 + }, + { + "epoch": 2.0911350455675226, + "grad_norm": 0.9546878933906555, + "learning_rate": 5.091555989998259e-06, + "loss": 4.1715, + "step": 12620 + }, + { + "epoch": 2.0944490472245234, + "grad_norm": 1.0533303022384644, + "learning_rate": 5.057996102396009e-06, + "loss": 4.1786, + "step": 12640 + }, + { + "epoch": 2.0977630488815247, + "grad_norm": 0.9684029817581177, + "learning_rate": 5.024509701174431e-06, + "loss": 4.1732, + "step": 12660 + }, + { + "epoch": 2.1010770505385254, + "grad_norm": 1.078629732131958, + "learning_rate": 4.991097284268069e-06, + "loss": 4.167, + "step": 12680 + }, + { + "epoch": 2.1043910521955262, + "grad_norm": 0.987321674823761, + "learning_rate": 4.957759348511326e-06, + "loss": 4.1701, + "step": 12700 + }, + { + "epoch": 2.107705053852527, + "grad_norm": 0.942710280418396, + "learning_rate": 4.9244963896311e-06, + "loss": 4.1599, + "step": 12720 + }, + { + "epoch": 2.111019055509528, + "grad_norm": 0.9669007658958435, + "learning_rate": 4.8913089022394e-06, + "loss": 4.165, + "step": 12740 + }, + { + "epoch": 2.1143330571665286, + "grad_norm": 0.9463768601417542, + "learning_rate": 4.858197379825981e-06, + "loss": 4.143, + "step": 12760 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 1.0507632493972778, + "learning_rate": 4.825162314751032e-06, + "loss": 4.1591, + "step": 12780 + }, + { + "epoch": 2.1209610604805302, + "grad_norm": 0.992092490196228, + "learning_rate": 4.792204198237826e-06, + "loss": 4.168, + "step": 12800 + }, + { + "epoch": 2.124275062137531, + "grad_norm": 1.009311318397522, + "learning_rate": 4.759323520365443e-06, + "loss": 4.1623, + "step": 12820 + }, + { + "epoch": 2.127589063794532, + "grad_norm": 0.9661020636558533, + "learning_rate": 4.726520770061468e-06, + "loss": 4.1572, + "step": 12840 + }, + { + "epoch": 2.1309030654515326, + "grad_norm": 0.9420536756515503, + "learning_rate": 4.693796435094708e-06, + "loss": 4.1755, + "step": 12860 + }, + { + "epoch": 2.1342170671085334, + "grad_norm": 1.0199921131134033, + "learning_rate": 4.661151002067974e-06, + "loss": 4.158, + "step": 12880 + }, + { + "epoch": 2.1375310687655342, + "grad_norm": 1.1151738166809082, + "learning_rate": 4.628584956410805e-06, + "loss": 4.1682, + "step": 12900 + }, + { + "epoch": 
2.140845070422535, + "grad_norm": 0.9603314995765686, + "learning_rate": 4.596098782372287e-06, + "loss": 4.1752, + "step": 12920 + }, + { + "epoch": 2.1441590720795363, + "grad_norm": 0.940518856048584, + "learning_rate": 4.563692963013817e-06, + "loss": 4.157, + "step": 12940 + }, + { + "epoch": 2.147473073736537, + "grad_norm": 1.019020438194275, + "learning_rate": 4.531367980201956e-06, + "loss": 4.181, + "step": 12960 + }, + { + "epoch": 2.150787075393538, + "grad_norm": 0.9523174166679382, + "learning_rate": 4.499124314601229e-06, + "loss": 4.1685, + "step": 12980 + }, + { + "epoch": 2.1541010770505387, + "grad_norm": 0.9682093262672424, + "learning_rate": 4.466962445667007e-06, + "loss": 4.1677, + "step": 13000 + }, + { + "epoch": 2.1574150787075395, + "grad_norm": 0.9744412302970886, + "learning_rate": 4.4348828516383565e-06, + "loss": 4.1608, + "step": 13020 + }, + { + "epoch": 2.1607290803645403, + "grad_norm": 1.0344312191009521, + "learning_rate": 4.402886009530936e-06, + "loss": 4.1708, + "step": 13040 + }, + { + "epoch": 2.164043082021541, + "grad_norm": 1.0090404748916626, + "learning_rate": 4.370972395129909e-06, + "loss": 4.188, + "step": 13060 + }, + { + "epoch": 2.167357083678542, + "grad_norm": 0.9304625391960144, + "learning_rate": 4.339142482982865e-06, + "loss": 4.1743, + "step": 13080 + }, + { + "epoch": 2.1706710853355426, + "grad_norm": 0.9409207701683044, + "learning_rate": 4.307396746392752e-06, + "loss": 4.1665, + "step": 13100 + }, + { + "epoch": 2.1739850869925434, + "grad_norm": 0.9290302991867065, + "learning_rate": 4.275735657410856e-06, + "loss": 4.1622, + "step": 13120 + }, + { + "epoch": 2.1772990886495442, + "grad_norm": 1.0090850591659546, + "learning_rate": 4.24415968682977e-06, + "loss": 4.1593, + "step": 13140 + }, + { + "epoch": 2.180613090306545, + "grad_norm": 0.9124213457107544, + "learning_rate": 4.2126693041764e-06, + "loss": 4.1831, + "step": 13160 + }, + { + "epoch": 2.183927091963546, + "grad_norm": 0.9703252911567688, + "learning_rate": 4.1812649777049826e-06, + "loss": 4.1767, + "step": 13180 + }, + { + "epoch": 2.1872410936205466, + "grad_norm": 0.9261910319328308, + "learning_rate": 4.149947174390111e-06, + "loss": 4.1645, + "step": 13200 + }, + { + "epoch": 2.1905550952775474, + "grad_norm": 0.9648107886314392, + "learning_rate": 4.118716359919813e-06, + "loss": 4.1703, + "step": 13220 + }, + { + "epoch": 2.1938690969345487, + "grad_norm": 0.9281497597694397, + "learning_rate": 4.0875729986886015e-06, + "loss": 4.1616, + "step": 13240 + }, + { + "epoch": 2.1971830985915495, + "grad_norm": 1.1000781059265137, + "learning_rate": 4.056517553790588e-06, + "loss": 4.1651, + "step": 13260 + }, + { + "epoch": 2.2004971002485503, + "grad_norm": 0.9868336319923401, + "learning_rate": 4.025550487012594e-06, + "loss": 4.1486, + "step": 13280 + }, + { + "epoch": 2.203811101905551, + "grad_norm": 0.9502912163734436, + "learning_rate": 3.994672258827268e-06, + "loss": 4.1693, + "step": 13300 + }, + { + "epoch": 2.207125103562552, + "grad_norm": 1.009382724761963, + "learning_rate": 3.963883328386264e-06, + "loss": 4.1859, + "step": 13320 + }, + { + "epoch": 2.2104391052195527, + "grad_norm": 1.0071057081222534, + "learning_rate": 3.933184153513386e-06, + "loss": 4.1553, + "step": 13340 + }, + { + "epoch": 2.2137531068765535, + "grad_norm": 0.9749252796173096, + "learning_rate": 3.9025751906978125e-06, + "loss": 4.1565, + "step": 13360 + }, + { + "epoch": 2.2170671085335543, + "grad_norm": 1.0308611392974854, + "learning_rate": 3.87205689508727e-06, 
+ "loss": 4.1576, + "step": 13380 + }, + { + "epoch": 2.220381110190555, + "grad_norm": 1.0869494676589966, + "learning_rate": 3.841629720481308e-06, + "loss": 4.1825, + "step": 13400 + }, + { + "epoch": 2.223695111847556, + "grad_norm": 1.158950686454773, + "learning_rate": 3.81129411932451e-06, + "loss": 4.1695, + "step": 13420 + }, + { + "epoch": 2.2270091135045567, + "grad_norm": 0.9217210412025452, + "learning_rate": 3.781050542699799e-06, + "loss": 4.1559, + "step": 13440 + }, + { + "epoch": 2.2303231151615575, + "grad_norm": 1.0294437408447266, + "learning_rate": 3.7508994403217082e-06, + "loss": 4.1726, + "step": 13460 + }, + { + "epoch": 2.2336371168185583, + "grad_norm": 0.9723818898200989, + "learning_rate": 3.7208412605296986e-06, + "loss": 4.1521, + "step": 13480 + }, + { + "epoch": 2.236951118475559, + "grad_norm": 0.9427555203437805, + "learning_rate": 3.690876450281503e-06, + "loss": 4.1743, + "step": 13500 + }, + { + "epoch": 2.24026512013256, + "grad_norm": 1.0316344499588013, + "learning_rate": 3.6610054551464746e-06, + "loss": 4.178, + "step": 13520 + }, + { + "epoch": 2.243579121789561, + "grad_norm": 0.9229859113693237, + "learning_rate": 3.6312287192989448e-06, + "loss": 4.1806, + "step": 13540 + }, + { + "epoch": 2.246893123446562, + "grad_norm": 0.9554008841514587, + "learning_rate": 3.6015466855116486e-06, + "loss": 4.1758, + "step": 13560 + }, + { + "epoch": 2.2488815244407623, + "eval_loss": 4.178183555603027, + "eval_runtime": 182.9233, + "eval_samples_per_second": 117.301, + "eval_steps_per_second": 14.667, + "step": 13572 + }, + { + "epoch": 2.2502071251035627, + "grad_norm": 0.9155376553535461, + "learning_rate": 3.5719597951491115e-06, + "loss": 4.1816, + "step": 13580 + }, + { + "epoch": 2.2535211267605635, + "grad_norm": 0.9061245918273926, + "learning_rate": 3.542468488161107e-06, + "loss": 4.1736, + "step": 13600 + }, + { + "epoch": 2.2568351284175643, + "grad_norm": 1.023432731628418, + "learning_rate": 3.5130732030761108e-06, + "loss": 4.1791, + "step": 13620 + }, + { + "epoch": 2.260149130074565, + "grad_norm": 1.0252330303192139, + "learning_rate": 3.483774376994764e-06, + "loss": 4.1632, + "step": 13640 + }, + { + "epoch": 2.263463131731566, + "grad_norm": 1.006892204284668, + "learning_rate": 3.4545724455834005e-06, + "loss": 4.1626, + "step": 13660 + }, + { + "epoch": 2.2667771333885667, + "grad_norm": 0.9699810743331909, + "learning_rate": 3.4254678430675416e-06, + "loss": 4.1737, + "step": 13680 + }, + { + "epoch": 2.2700911350455675, + "grad_norm": 1.0339150428771973, + "learning_rate": 3.396461002225465e-06, + "loss": 4.1569, + "step": 13700 + }, + { + "epoch": 2.2734051367025683, + "grad_norm": 0.9977650046348572, + "learning_rate": 3.367552354381741e-06, + "loss": 4.1894, + "step": 13720 + }, + { + "epoch": 2.276719138359569, + "grad_norm": 0.9500966668128967, + "learning_rate": 3.3387423294008457e-06, + "loss": 4.1601, + "step": 13740 + }, + { + "epoch": 2.28003314001657, + "grad_norm": 0.9600996375083923, + "learning_rate": 3.3100313556807595e-06, + "loss": 4.1848, + "step": 13760 + }, + { + "epoch": 2.2833471416735707, + "grad_norm": 0.9719670414924622, + "learning_rate": 3.2814198601465817e-06, + "loss": 4.1503, + "step": 13780 + }, + { + "epoch": 2.286661143330572, + "grad_norm": 1.088294506072998, + "learning_rate": 3.2529082682442116e-06, + "loss": 4.1591, + "step": 13800 + }, + { + "epoch": 2.2899751449875723, + "grad_norm": 1.0588421821594238, + "learning_rate": 3.2244970039339885e-06, + "loss": 4.1693, + "step": 13820 + }, + { + 
"epoch": 2.2932891466445735, + "grad_norm": 1.0016313791275024, + "learning_rate": 3.1961864896844242e-06, + "loss": 4.1613, + "step": 13840 + }, + { + "epoch": 2.2966031483015743, + "grad_norm": 1.0516911745071411, + "learning_rate": 3.1679771464658837e-06, + "loss": 4.166, + "step": 13860 + }, + { + "epoch": 2.299917149958575, + "grad_norm": 0.9770933389663696, + "learning_rate": 3.139869393744359e-06, + "loss": 4.1515, + "step": 13880 + }, + { + "epoch": 2.303231151615576, + "grad_norm": 0.9561361074447632, + "learning_rate": 3.1118636494752087e-06, + "loss": 4.1662, + "step": 13900 + }, + { + "epoch": 2.3065451532725767, + "grad_norm": 0.9181007146835327, + "learning_rate": 3.083960330096946e-06, + "loss": 4.1554, + "step": 13920 + }, + { + "epoch": 2.3098591549295775, + "grad_norm": 0.96213698387146, + "learning_rate": 3.056159850525062e-06, + "loss": 4.1615, + "step": 13940 + }, + { + "epoch": 2.3131731565865783, + "grad_norm": 1.0713417530059814, + "learning_rate": 3.0284626241458424e-06, + "loss": 4.1535, + "step": 13960 + }, + { + "epoch": 2.316487158243579, + "grad_norm": 0.9928842186927795, + "learning_rate": 3.0008690628102155e-06, + "loss": 4.1554, + "step": 13980 + }, + { + "epoch": 2.31980115990058, + "grad_norm": 0.9541782140731812, + "learning_rate": 2.9733795768276484e-06, + "loss": 4.1578, + "step": 14000 + }, + { + "epoch": 2.3231151615575807, + "grad_norm": 0.9561997056007385, + "learning_rate": 2.9459945749600194e-06, + "loss": 4.183, + "step": 14020 + }, + { + "epoch": 2.3264291632145815, + "grad_norm": 0.941018283367157, + "learning_rate": 2.9187144644155684e-06, + "loss": 4.154, + "step": 14040 + }, + { + "epoch": 2.3297431648715823, + "grad_norm": 0.9637402296066284, + "learning_rate": 2.8915396508428116e-06, + "loss": 4.171, + "step": 14060 + }, + { + "epoch": 2.333057166528583, + "grad_norm": 0.9399302005767822, + "learning_rate": 2.8644705383245385e-06, + "loss": 4.1542, + "step": 14080 + }, + { + "epoch": 2.3363711681855843, + "grad_norm": 1.004807472229004, + "learning_rate": 2.837507529371787e-06, + "loss": 4.1617, + "step": 14100 + }, + { + "epoch": 2.3396851698425847, + "grad_norm": 1.010384202003479, + "learning_rate": 2.810651024917852e-06, + "loss": 4.1528, + "step": 14120 + }, + { + "epoch": 2.342999171499586, + "grad_norm": 0.9864283204078674, + "learning_rate": 2.7839014243123474e-06, + "loss": 4.1691, + "step": 14140 + }, + { + "epoch": 2.3463131731565867, + "grad_norm": 1.0043703317642212, + "learning_rate": 2.7572591253152383e-06, + "loss": 4.1644, + "step": 14160 + }, + { + "epoch": 2.3496271748135875, + "grad_norm": 1.037273645401001, + "learning_rate": 2.730724524090951e-06, + "loss": 4.1576, + "step": 14180 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 1.0367027521133423, + "learning_rate": 2.7042980152024733e-06, + "loss": 4.1463, + "step": 14200 + }, + { + "epoch": 2.356255178127589, + "grad_norm": 0.9750577807426453, + "learning_rate": 2.677979991605478e-06, + "loss": 4.1594, + "step": 14220 + }, + { + "epoch": 2.35956917978459, + "grad_norm": 0.9956997036933899, + "learning_rate": 2.6517708446424985e-06, + "loss": 4.1731, + "step": 14240 + }, + { + "epoch": 2.3628831814415907, + "grad_norm": 1.0000735521316528, + "learning_rate": 2.625670964037088e-06, + "loss": 4.1611, + "step": 14260 + }, + { + "epoch": 2.3661971830985915, + "grad_norm": 1.0631577968597412, + "learning_rate": 2.5996807378880494e-06, + "loss": 4.1505, + "step": 14280 + }, + { + "epoch": 2.3695111847555923, + "grad_norm": 1.018710970878601, + "learning_rate": 
2.573800552663639e-06, + "loss": 4.1659, + "step": 14300 + }, + { + "epoch": 2.372825186412593, + "grad_norm": 1.0739405155181885, + "learning_rate": 2.5480307931958393e-06, + "loss": 4.1675, + "step": 14320 + }, + { + "epoch": 2.376139188069594, + "grad_norm": 0.9721205234527588, + "learning_rate": 2.5223718426746223e-06, + "loss": 4.1585, + "step": 14340 + }, + { + "epoch": 2.3794531897265947, + "grad_norm": 0.9352970719337463, + "learning_rate": 2.4968240826422673e-06, + "loss": 4.1609, + "step": 14360 + }, + { + "epoch": 2.3827671913835955, + "grad_norm": 0.9602931141853333, + "learning_rate": 2.471387892987671e-06, + "loss": 4.1374, + "step": 14380 + }, + { + "epoch": 2.3860811930405967, + "grad_norm": 0.9990806579589844, + "learning_rate": 2.4460636519407043e-06, + "loss": 4.1637, + "step": 14400 + }, + { + "epoch": 2.3893951946975975, + "grad_norm": 1.027967095375061, + "learning_rate": 2.420851736066598e-06, + "loss": 4.1732, + "step": 14420 + }, + { + "epoch": 2.3927091963545983, + "grad_norm": 0.934998095035553, + "learning_rate": 2.3957525202603336e-06, + "loss": 4.1577, + "step": 14440 + }, + { + "epoch": 2.396023198011599, + "grad_norm": 0.9207898378372192, + "learning_rate": 2.3707663777410616e-06, + "loss": 4.1586, + "step": 14460 + }, + { + "epoch": 2.3993371996686, + "grad_norm": 0.948034405708313, + "learning_rate": 2.3458936800465726e-06, + "loss": 4.1717, + "step": 14480 + }, + { + "epoch": 2.4026512013256007, + "grad_norm": 0.9645612239837646, + "learning_rate": 2.3211347970277497e-06, + "loss": 4.1702, + "step": 14500 + }, + { + "epoch": 2.4059652029826015, + "grad_norm": 0.9512291550636292, + "learning_rate": 2.296490096843087e-06, + "loss": 4.155, + "step": 14520 + }, + { + "epoch": 2.4092792046396023, + "grad_norm": 0.9387156963348389, + "learning_rate": 2.2719599459532105e-06, + "loss": 4.1818, + "step": 14540 + }, + { + "epoch": 2.412593206296603, + "grad_norm": 1.0570317506790161, + "learning_rate": 2.2475447091154133e-06, + "loss": 4.1668, + "step": 14560 + }, + { + "epoch": 2.415907207953604, + "grad_norm": 0.9484657049179077, + "learning_rate": 2.223244749378262e-06, + "loss": 4.1575, + "step": 14580 + }, + { + "epoch": 2.4192212096106047, + "grad_norm": 0.9645959138870239, + "learning_rate": 2.199060428076165e-06, + "loss": 4.1698, + "step": 14600 + }, + { + "epoch": 2.4225352112676055, + "grad_norm": 0.9635546207427979, + "learning_rate": 2.1749921048240263e-06, + "loss": 4.1686, + "step": 14620 + }, + { + "epoch": 2.4258492129246063, + "grad_norm": 0.9637837409973145, + "learning_rate": 2.151040137511887e-06, + "loss": 4.1477, + "step": 14640 + }, + { + "epoch": 2.429163214581607, + "grad_norm": 0.9572291374206543, + "learning_rate": 2.1272048822995963e-06, + "loss": 4.1661, + "step": 14660 + }, + { + "epoch": 2.432477216238608, + "grad_norm": 1.01437246799469, + "learning_rate": 2.103486693611534e-06, + "loss": 4.1554, + "step": 14680 + }, + { + "epoch": 2.435791217895609, + "grad_norm": 0.9094828963279724, + "learning_rate": 2.0798859241313176e-06, + "loss": 4.1627, + "step": 14700 + }, + { + "epoch": 2.43910521955261, + "grad_norm": 1.051871418952942, + "learning_rate": 2.056402924796581e-06, + "loss": 4.1421, + "step": 14720 + }, + { + "epoch": 2.4424192212096107, + "grad_norm": 1.0210424661636353, + "learning_rate": 2.0330380447937357e-06, + "loss": 4.1588, + "step": 14740 + }, + { + "epoch": 2.4457332228666115, + "grad_norm": 0.9816873669624329, + "learning_rate": 2.0097916315527987e-06, + "loss": 4.1648, + "step": 14760 + }, + { + "epoch": 
2.4490472245236123, + "grad_norm": 0.980525553226471, + "learning_rate": 1.9866640307422013e-06, + "loss": 4.1532, + "step": 14780 + }, + { + "epoch": 2.452361226180613, + "grad_norm": 0.9599906802177429, + "learning_rate": 1.9636555862636775e-06, + "loss": 4.163, + "step": 14800 + }, + { + "epoch": 2.455675227837614, + "grad_norm": 0.9619313478469849, + "learning_rate": 1.940766640247126e-06, + "loss": 4.1492, + "step": 14820 + }, + { + "epoch": 2.4589892294946147, + "grad_norm": 0.9351578950881958, + "learning_rate": 1.9179975330455347e-06, + "loss": 4.1585, + "step": 14840 + }, + { + "epoch": 2.4623032311516155, + "grad_norm": 0.9171659350395203, + "learning_rate": 1.8953486032299206e-06, + "loss": 4.1666, + "step": 14860 + }, + { + "epoch": 2.4656172328086163, + "grad_norm": 0.9781689643859863, + "learning_rate": 1.8728201875842956e-06, + "loss": 4.1848, + "step": 14880 + }, + { + "epoch": 2.468931234465617, + "grad_norm": 1.0406521558761597, + "learning_rate": 1.8504126211006445e-06, + "loss": 4.1692, + "step": 14900 + }, + { + "epoch": 2.472245236122618, + "grad_norm": 1.0125136375427246, + "learning_rate": 1.8281262369739682e-06, + "loss": 4.1586, + "step": 14920 + }, + { + "epoch": 2.4755592377796187, + "grad_norm": 0.9620126485824585, + "learning_rate": 1.805961366597303e-06, + "loss": 4.1729, + "step": 14940 + }, + { + "epoch": 2.4788732394366195, + "grad_norm": 1.0717898607254028, + "learning_rate": 1.783918339556816e-06, + "loss": 4.1698, + "step": 14960 + }, + { + "epoch": 2.4821872410936203, + "grad_norm": 0.9398671984672546, + "learning_rate": 1.7619974836268894e-06, + "loss": 4.1491, + "step": 14980 + }, + { + "epoch": 2.4855012427506216, + "grad_norm": 0.8983393311500549, + "learning_rate": 1.7401991247652495e-06, + "loss": 4.1605, + "step": 15000 + }, + { + "epoch": 2.4888152444076224, + "grad_norm": 1.0154719352722168, + "learning_rate": 1.7185235871081264e-06, + "loss": 4.1357, + "step": 15020 + }, + { + "epoch": 2.492129246064623, + "grad_norm": 1.0226125717163086, + "learning_rate": 1.696971192965422e-06, + "loss": 4.1403, + "step": 15040 + }, + { + "epoch": 2.495443247721624, + "grad_norm": 0.92442387342453, + "learning_rate": 1.6755422628159313e-06, + "loss": 4.1718, + "step": 15060 + }, + { + "epoch": 2.4987572493786248, + "grad_norm": 0.9485278725624084, + "learning_rate": 1.6542371153025693e-06, + "loss": 4.1709, + "step": 15080 + }, + { + "epoch": 2.4987572493786248, + "eval_loss": 4.174120903015137, + "eval_runtime": 185.4173, + "eval_samples_per_second": 115.723, + "eval_steps_per_second": 14.47, + "step": 15080 + }, + { + "epoch": 2.5020712510356256, + "grad_norm": 1.0055153369903564, + "learning_rate": 1.6330560672276264e-06, + "loss": 4.1465, + "step": 15100 + }, + { + "epoch": 2.5053852526926264, + "grad_norm": 0.9800291657447815, + "learning_rate": 1.611999433548076e-06, + "loss": 4.1563, + "step": 15120 + }, + { + "epoch": 2.508699254349627, + "grad_norm": 0.9124789237976074, + "learning_rate": 1.5910675273708697e-06, + "loss": 4.166, + "step": 15140 + }, + { + "epoch": 2.512013256006628, + "grad_norm": 1.0099622011184692, + "learning_rate": 1.5702606599483005e-06, + "loss": 4.1514, + "step": 15160 + }, + { + "epoch": 2.5153272576636287, + "grad_norm": 0.9255741834640503, + "learning_rate": 1.5495791406733574e-06, + "loss": 4.1703, + "step": 15180 + }, + { + "epoch": 2.5186412593206295, + "grad_norm": 0.9527518153190613, + "learning_rate": 1.5290232770751434e-06, + "loss": 4.1473, + "step": 15200 + }, + { + "epoch": 2.5219552609776303, + "grad_norm": 
0.9795065522193909, + "learning_rate": 1.508593374814281e-06, + "loss": 4.1791, + "step": 15220 + }, + { + "epoch": 2.525269262634631, + "grad_norm": 1.0523587465286255, + "learning_rate": 1.4882897376783912e-06, + "loss": 4.1596, + "step": 15240 + }, + { + "epoch": 2.5285832642916324, + "grad_norm": 0.979905903339386, + "learning_rate": 1.4681126675775525e-06, + "loss": 4.1614, + "step": 15260 + }, + { + "epoch": 2.5318972659486327, + "grad_norm": 0.9790669083595276, + "learning_rate": 1.4480624645398268e-06, + "loss": 4.1659, + "step": 15280 + }, + { + "epoch": 2.535211267605634, + "grad_norm": 0.9634921550750732, + "learning_rate": 1.428139426706796e-06, + "loss": 4.1758, + "step": 15300 + }, + { + "epoch": 2.5385252692626348, + "grad_norm": 0.9196787476539612, + "learning_rate": 1.4083438503291292e-06, + "loss": 4.1548, + "step": 15320 + }, + { + "epoch": 2.5418392709196356, + "grad_norm": 1.0133346319198608, + "learning_rate": 1.3886760297621648e-06, + "loss": 4.1569, + "step": 15340 + }, + { + "epoch": 2.5451532725766364, + "grad_norm": 0.9064164757728577, + "learning_rate": 1.3691362574615564e-06, + "loss": 4.1491, + "step": 15360 + }, + { + "epoch": 2.548467274233637, + "grad_norm": 0.9749716520309448, + "learning_rate": 1.3497248239788985e-06, + "loss": 4.1436, + "step": 15380 + }, + { + "epoch": 2.551781275890638, + "grad_norm": 0.9947595000267029, + "learning_rate": 1.3304420179574296e-06, + "loss": 4.167, + "step": 15400 + }, + { + "epoch": 2.5550952775476388, + "grad_norm": 0.9736104607582092, + "learning_rate": 1.3112881261277277e-06, + "loss": 4.1529, + "step": 15420 + }, + { + "epoch": 2.5584092792046396, + "grad_norm": 0.9650049209594727, + "learning_rate": 1.2922634333034423e-06, + "loss": 4.159, + "step": 15440 + }, + { + "epoch": 2.5617232808616404, + "grad_norm": 0.964367687702179, + "learning_rate": 1.2733682223770727e-06, + "loss": 4.1613, + "step": 15460 + }, + { + "epoch": 2.565037282518641, + "grad_norm": 0.9139639735221863, + "learning_rate": 1.2546027743157486e-06, + "loss": 4.1662, + "step": 15480 + }, + { + "epoch": 2.568351284175642, + "grad_norm": 0.9967067837715149, + "learning_rate": 1.2359673681570627e-06, + "loss": 4.1609, + "step": 15500 + }, + { + "epoch": 2.5716652858326428, + "grad_norm": 0.9958235025405884, + "learning_rate": 1.217462281004913e-06, + "loss": 4.1565, + "step": 15520 + }, + { + "epoch": 2.5749792874896436, + "grad_norm": 1.0585681200027466, + "learning_rate": 1.1990877880253871e-06, + "loss": 4.1578, + "step": 15540 + }, + { + "epoch": 2.578293289146645, + "grad_norm": 0.9713040590286255, + "learning_rate": 1.1808441624426724e-06, + "loss": 4.1695, + "step": 15560 + }, + { + "epoch": 2.581607290803645, + "grad_norm": 0.9452921152114868, + "learning_rate": 1.1627316755349838e-06, + "loss": 4.1577, + "step": 15580 + }, + { + "epoch": 2.5849212924606464, + "grad_norm": 1.0168826580047607, + "learning_rate": 1.1447505966305438e-06, + "loss": 4.1552, + "step": 15600 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.9631409049034119, + "learning_rate": 1.1269011931035645e-06, + "loss": 4.1622, + "step": 15620 + }, + { + "epoch": 2.591549295774648, + "grad_norm": 0.9362034201622009, + "learning_rate": 1.1091837303702813e-06, + "loss": 4.1576, + "step": 15640 + }, + { + "epoch": 2.594863297431649, + "grad_norm": 0.9658742547035217, + "learning_rate": 1.0915984718849992e-06, + "loss": 4.1484, + "step": 15660 + }, + { + "epoch": 2.5981772990886496, + "grad_norm": 1.0002686977386475, + "learning_rate": 1.0741456791361826e-06, + "loss": 
4.1455, + "step": 15680 + }, + { + "epoch": 2.6014913007456504, + "grad_norm": 0.9866043925285339, + "learning_rate": 1.056825611642558e-06, + "loss": 4.1572, + "step": 15700 + }, + { + "epoch": 2.604805302402651, + "grad_norm": 0.9977617263793945, + "learning_rate": 1.039638526949266e-06, + "loss": 4.1573, + "step": 15720 + }, + { + "epoch": 2.608119304059652, + "grad_norm": 0.9258089065551758, + "learning_rate": 1.0225846806240192e-06, + "loss": 4.1697, + "step": 15740 + }, + { + "epoch": 2.6114333057166528, + "grad_norm": 0.9411051273345947, + "learning_rate": 1.0056643262533162e-06, + "loss": 4.1746, + "step": 15760 + }, + { + "epoch": 2.6147473073736536, + "grad_norm": 0.9454408288002014, + "learning_rate": 9.888777154386541e-07, + "loss": 4.1658, + "step": 15780 + }, + { + "epoch": 2.6180613090306544, + "grad_norm": 1.0077019929885864, + "learning_rate": 9.72225097792805e-07, + "loss": 4.1683, + "step": 15800 + }, + { + "epoch": 2.6213753106876556, + "grad_norm": 0.9899587035179138, + "learning_rate": 9.55706720936087e-07, + "loss": 4.167, + "step": 15820 + }, + { + "epoch": 2.624689312344656, + "grad_norm": 0.9944363832473755, + "learning_rate": 9.393228304927005e-07, + "loss": 4.1632, + "step": 15840 + }, + { + "epoch": 2.628003314001657, + "grad_norm": 0.9317917227745056, + "learning_rate": 9.230736700870569e-07, + "loss": 4.1642, + "step": 15860 + }, + { + "epoch": 2.6313173156586576, + "grad_norm": 1.0653148889541626, + "learning_rate": 9.069594813401694e-07, + "loss": 4.1474, + "step": 15880 + }, + { + "epoch": 2.634631317315659, + "grad_norm": 0.9596170783042908, + "learning_rate": 8.90980503866059e-07, + "loss": 4.1773, + "step": 15900 + }, + { + "epoch": 2.6379453189726596, + "grad_norm": 0.9840303063392639, + "learning_rate": 8.751369752681804e-07, + "loss": 4.1579, + "step": 15920 + }, + { + "epoch": 2.6412593206296604, + "grad_norm": 0.956961989402771, + "learning_rate": 8.594291311359048e-07, + "loss": 4.1562, + "step": 15940 + }, + { + "epoch": 2.644573322286661, + "grad_norm": 0.965333104133606, + "learning_rate": 8.438572050410032e-07, + "loss": 4.1327, + "step": 15960 + }, + { + "epoch": 2.647887323943662, + "grad_norm": 0.9944215416908264, + "learning_rate": 8.284214285341819e-07, + "loss": 4.1629, + "step": 15980 + }, + { + "epoch": 2.651201325600663, + "grad_norm": 0.9452559947967529, + "learning_rate": 8.131220311416366e-07, + "loss": 4.1668, + "step": 16000 + }, + { + "epoch": 2.6545153272576636, + "grad_norm": 1.1162261962890625, + "learning_rate": 7.979592403616343e-07, + "loss": 4.1526, + "step": 16020 + }, + { + "epoch": 2.6578293289146644, + "grad_norm": 0.9779930710792542, + "learning_rate": 7.829332816611446e-07, + "loss": 4.1613, + "step": 16040 + }, + { + "epoch": 2.661143330571665, + "grad_norm": 0.9355151653289795, + "learning_rate": 7.680443784724678e-07, + "loss": 4.1309, + "step": 16060 + }, + { + "epoch": 2.664457332228666, + "grad_norm": 1.0204670429229736, + "learning_rate": 7.532927521899302e-07, + "loss": 4.1616, + "step": 16080 + }, + { + "epoch": 2.667771333885667, + "grad_norm": 0.9264137148857117, + "learning_rate": 7.386786221665776e-07, + "loss": 4.1571, + "step": 16100 + }, + { + "epoch": 2.671085335542668, + "grad_norm": 0.910361647605896, + "learning_rate": 7.242022057109277e-07, + "loss": 4.1536, + "step": 16120 + }, + { + "epoch": 2.6743993371996684, + "grad_norm": 0.9735772609710693, + "learning_rate": 7.098637180837231e-07, + "loss": 4.1309, + "step": 16140 + }, + { + "epoch": 2.6777133388566696, + "grad_norm": 1.0591309070587158, 
+ "learning_rate": 6.956633724947481e-07, + "loss": 4.1808, + "step": 16160 + }, + { + "epoch": 2.6810273405136704, + "grad_norm": 1.0182360410690308, + "learning_rate": 6.816013800996402e-07, + "loss": 4.1627, + "step": 16180 + }, + { + "epoch": 2.684341342170671, + "grad_norm": 1.0391919612884521, + "learning_rate": 6.676779499967634e-07, + "loss": 4.1484, + "step": 16200 + }, + { + "epoch": 2.687655343827672, + "grad_norm": 0.9495192766189575, + "learning_rate": 6.538932892240923e-07, + "loss": 4.1547, + "step": 16220 + }, + { + "epoch": 2.690969345484673, + "grad_norm": 1.013307809829712, + "learning_rate": 6.402476027561422e-07, + "loss": 4.1481, + "step": 16240 + }, + { + "epoch": 2.6942833471416736, + "grad_norm": 0.9629780650138855, + "learning_rate": 6.267410935009033e-07, + "loss": 4.1626, + "step": 16260 + }, + { + "epoch": 2.6975973487986744, + "grad_norm": 0.9699434638023376, + "learning_rate": 6.133739622968471e-07, + "loss": 4.1719, + "step": 16280 + }, + { + "epoch": 2.700911350455675, + "grad_norm": 0.9978121519088745, + "learning_rate": 6.001464079099184e-07, + "loss": 4.1731, + "step": 16300 + }, + { + "epoch": 2.704225352112676, + "grad_norm": 0.9384540319442749, + "learning_rate": 5.870586270305934e-07, + "loss": 4.1496, + "step": 16320 + }, + { + "epoch": 2.707539353769677, + "grad_norm": 0.9746360182762146, + "learning_rate": 5.741108142709528e-07, + "loss": 4.172, + "step": 16340 + }, + { + "epoch": 2.7108533554266776, + "grad_norm": 0.991398811340332, + "learning_rate": 5.613031621617792e-07, + "loss": 4.1649, + "step": 16360 + }, + { + "epoch": 2.7141673570836784, + "grad_norm": 0.9522440433502197, + "learning_rate": 5.486358611497089e-07, + "loss": 4.1632, + "step": 16380 + }, + { + "epoch": 2.717481358740679, + "grad_norm": 0.9380577802658081, + "learning_rate": 5.361090995943829e-07, + "loss": 4.1479, + "step": 16400 + }, + { + "epoch": 2.7207953603976804, + "grad_norm": 0.9882134795188904, + "learning_rate": 5.237230637656631e-07, + "loss": 4.165, + "step": 16420 + }, + { + "epoch": 2.724109362054681, + "grad_norm": 1.0366357564926147, + "learning_rate": 5.11477937840853e-07, + "loss": 4.1667, + "step": 16440 + }, + { + "epoch": 2.727423363711682, + "grad_norm": 0.9156529307365417, + "learning_rate": 4.993739039019551e-07, + "loss": 4.1702, + "step": 16460 + }, + { + "epoch": 2.730737365368683, + "grad_norm": 1.0622930526733398, + "learning_rate": 4.874111419329752e-07, + "loss": 4.1602, + "step": 16480 + }, + { + "epoch": 2.7340513670256836, + "grad_norm": 0.9343971610069275, + "learning_rate": 4.7558982981723565e-07, + "loss": 4.1486, + "step": 16500 + }, + { + "epoch": 2.7373653686826844, + "grad_norm": 0.9685918092727661, + "learning_rate": 4.639101433347393e-07, + "loss": 4.1671, + "step": 16520 + }, + { + "epoch": 2.7406793703396852, + "grad_norm": 0.9789609909057617, + "learning_rate": 4.5237225615954225e-07, + "loss": 4.1603, + "step": 16540 + }, + { + "epoch": 2.743993371996686, + "grad_norm": 0.9678868055343628, + "learning_rate": 4.409763398571887e-07, + "loss": 4.1484, + "step": 16560 + }, + { + "epoch": 2.747307373653687, + "grad_norm": 0.9723621606826782, + "learning_rate": 4.297225638821445e-07, + "loss": 4.1596, + "step": 16580 + }, + { + "epoch": 2.7486329743164872, + "eval_loss": 4.172341346740723, + "eval_runtime": 181.7458, + "eval_samples_per_second": 118.06, + "eval_steps_per_second": 14.762, + "step": 16588 + }, + { + "epoch": 2.7506213753106876, + "grad_norm": 0.9724571108818054, + "learning_rate": 4.1861109557528865e-07, + "loss": 
4.1671, + "step": 16600 + }, + { + "epoch": 2.7539353769676884, + "grad_norm": 0.9629175662994385, + "learning_rate": 4.076421001614173e-07, + "loss": 4.1473, + "step": 16620 + }, + { + "epoch": 2.757249378624689, + "grad_norm": 0.9648428559303284, + "learning_rate": 3.9681574074678875e-07, + "loss": 4.148, + "step": 16640 + }, + { + "epoch": 2.76056338028169, + "grad_norm": 1.0365052223205566, + "learning_rate": 3.861321783167027e-07, + "loss": 4.151, + "step": 16660 + }, + { + "epoch": 2.763877381938691, + "grad_norm": 1.0133048295974731, + "learning_rate": 3.755915717331005e-07, + "loss": 4.1612, + "step": 16680 + }, + { + "epoch": 2.7671913835956916, + "grad_norm": 1.0168633460998535, + "learning_rate": 3.6519407773220606e-07, + "loss": 4.1646, + "step": 16700 + }, + { + "epoch": 2.770505385252693, + "grad_norm": 1.030612826347351, + "learning_rate": 3.5493985092219353e-07, + "loss": 4.1478, + "step": 16720 + }, + { + "epoch": 2.773819386909693, + "grad_norm": 0.9936444759368896, + "learning_rate": 3.4482904378088986e-07, + "loss": 4.1726, + "step": 16740 + }, + { + "epoch": 2.7771333885666944, + "grad_norm": 1.0011498928070068, + "learning_rate": 3.34861806653507e-07, + "loss": 4.1548, + "step": 16760 + }, + { + "epoch": 2.7804473902236952, + "grad_norm": 0.960295557975769, + "learning_rate": 3.250382877504066e-07, + "loss": 4.1425, + "step": 16780 + }, + { + "epoch": 2.783761391880696, + "grad_norm": 0.953752338886261, + "learning_rate": 3.1535863314489436e-07, + "loss": 4.156, + "step": 16800 + }, + { + "epoch": 2.787075393537697, + "grad_norm": 1.051132082939148, + "learning_rate": 3.058229867710505e-07, + "loss": 4.1779, + "step": 16820 + }, + { + "epoch": 2.7903893951946976, + "grad_norm": 0.9826200604438782, + "learning_rate": 2.96431490421587e-07, + "loss": 4.1518, + "step": 16840 + }, + { + "epoch": 2.7937033968516984, + "grad_norm": 1.0669680833816528, + "learning_rate": 2.871842837457406e-07, + "loss": 4.1584, + "step": 16860 + }, + { + "epoch": 2.7970173985086992, + "grad_norm": 0.9622154235839844, + "learning_rate": 2.780815042472007e-07, + "loss": 4.1563, + "step": 16880 + }, + { + "epoch": 2.8003314001657, + "grad_norm": 1.0448014736175537, + "learning_rate": 2.691232872820515e-07, + "loss": 4.1762, + "step": 16900 + }, + { + "epoch": 2.803645401822701, + "grad_norm": 0.98758465051651, + "learning_rate": 2.603097660567766e-07, + "loss": 4.1722, + "step": 16920 + }, + { + "epoch": 2.8069594034797016, + "grad_norm": 0.945831298828125, + "learning_rate": 2.5164107162626293e-07, + "loss": 4.1614, + "step": 16940 + }, + { + "epoch": 2.8102734051367024, + "grad_norm": 0.9767056703567505, + "learning_rate": 2.431173328918646e-07, + "loss": 4.1498, + "step": 16960 + }, + { + "epoch": 2.8135874067937037, + "grad_norm": 0.9321964979171753, + "learning_rate": 2.3473867659947302e-07, + "loss": 4.16, + "step": 16980 + }, + { + "epoch": 2.816901408450704, + "grad_norm": 0.8986331224441528, + "learning_rate": 2.265052273376467e-07, + "loss": 4.1427, + "step": 17000 + }, + { + "epoch": 2.8202154101077053, + "grad_norm": 0.9428537487983704, + "learning_rate": 2.1841710753574551e-07, + "loss": 4.171, + "step": 17020 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.9248889684677124, + "learning_rate": 2.104744374621215e-07, + "loss": 4.1439, + "step": 17040 + }, + { + "epoch": 2.826843413421707, + "grad_norm": 1.0635182857513428, + "learning_rate": 2.0267733522232104e-07, + "loss": 4.1442, + "step": 17060 + }, + { + "epoch": 2.8301574150787077, + "grad_norm": 0.9721777439117432, + 
"learning_rate": 1.9502591675733317e-07, + "loss": 4.16, + "step": 17080 + }, + { + "epoch": 2.8334714167357085, + "grad_norm": 0.9717017412185669, + "learning_rate": 1.8752029584186648e-07, + "loss": 4.1581, + "step": 17100 + }, + { + "epoch": 2.8367854183927093, + "grad_norm": 0.9264596700668335, + "learning_rate": 1.801605840826559e-07, + "loss": 4.1769, + "step": 17120 + }, + { + "epoch": 2.84009942004971, + "grad_norm": 1.1058419942855835, + "learning_rate": 1.7294689091680196e-07, + "loss": 4.1702, + "step": 17140 + }, + { + "epoch": 2.843413421706711, + "grad_norm": 0.9733535051345825, + "learning_rate": 1.6587932361014415e-07, + "loss": 4.1623, + "step": 17160 + }, + { + "epoch": 2.8467274233637117, + "grad_norm": 1.0296661853790283, + "learning_rate": 1.5895798725566903e-07, + "loss": 4.1508, + "step": 17180 + }, + { + "epoch": 2.8500414250207124, + "grad_norm": 0.9222320318222046, + "learning_rate": 1.521829847719425e-07, + "loss": 4.1591, + "step": 17200 + }, + { + "epoch": 2.8533554266777132, + "grad_norm": 1.0537161827087402, + "learning_rate": 1.4555441690158323e-07, + "loss": 4.1668, + "step": 17220 + }, + { + "epoch": 2.856669428334714, + "grad_norm": 0.9625121355056763, + "learning_rate": 1.3907238220976173e-07, + "loss": 4.1469, + "step": 17240 + }, + { + "epoch": 2.859983429991715, + "grad_norm": 1.0898228883743286, + "learning_rate": 1.3273697708273913e-07, + "loss": 4.1446, + "step": 17260 + }, + { + "epoch": 2.863297431648716, + "grad_norm": 1.1046921014785767, + "learning_rate": 1.2654829572642746e-07, + "loss": 4.1508, + "step": 17280 + }, + { + "epoch": 2.8666114333057164, + "grad_norm": 0.9655560255050659, + "learning_rate": 1.2050643016499498e-07, + "loss": 4.1622, + "step": 17300 + }, + { + "epoch": 2.8699254349627177, + "grad_norm": 0.9041799902915955, + "learning_rate": 1.1461147023949404e-07, + "loss": 4.152, + "step": 17320 + }, + { + "epoch": 2.873239436619718, + "grad_norm": 0.9666980504989624, + "learning_rate": 1.0886350360652665e-07, + "loss": 4.1568, + "step": 17340 + }, + { + "epoch": 2.8765534382767193, + "grad_norm": 1.0637248754501343, + "learning_rate": 1.0326261573694207e-07, + "loss": 4.1492, + "step": 17360 + }, + { + "epoch": 2.87986743993372, + "grad_norm": 1.0064494609832764, + "learning_rate": 9.780888991456128e-08, + "loss": 4.1813, + "step": 17380 + }, + { + "epoch": 2.883181441590721, + "grad_norm": 0.9964413642883301, + "learning_rate": 9.250240723494563e-08, + "loss": 4.157, + "step": 17400 + }, + { + "epoch": 2.8864954432477217, + "grad_norm": 1.020708680152893, + "learning_rate": 8.734324660418348e-08, + "loss": 4.1523, + "step": 17420 + }, + { + "epoch": 2.8898094449047225, + "grad_norm": 1.0335192680358887, + "learning_rate": 8.233148473772546e-08, + "loss": 4.1514, + "step": 17440 + }, + { + "epoch": 2.8931234465617233, + "grad_norm": 0.9418586492538452, + "learning_rate": 7.746719615923437e-08, + "loss": 4.1658, + "step": 17460 + }, + { + "epoch": 2.896437448218724, + "grad_norm": 0.9596615433692932, + "learning_rate": 7.27504531994827e-08, + "loss": 4.1624, + "step": 17480 + }, + { + "epoch": 2.899751449875725, + "grad_norm": 0.9299134612083435, + "learning_rate": 6.818132599527794e-08, + "loss": 4.1626, + "step": 17500 + }, + { + "epoch": 2.9030654515327257, + "grad_norm": 0.9338573813438416, + "learning_rate": 6.375988248841559e-08, + "loss": 4.1689, + "step": 17520 + }, + { + "epoch": 2.9063794531897265, + "grad_norm": 0.9326637387275696, + "learning_rate": 5.948618842467113e-08, + "loss": 4.1634, + "step": 17540 + }, + { + 
"epoch": 2.9096934548467273, + "grad_norm": 0.9195525646209717, + "learning_rate": 5.5360307352823054e-08, + "loss": 4.1468, + "step": 17560 + }, + { + "epoch": 2.9130074565037285, + "grad_norm": 1.1046154499053955, + "learning_rate": 5.138230062370575e-08, + "loss": 4.1366, + "step": 17580 + }, + { + "epoch": 2.916321458160729, + "grad_norm": 0.9665520191192627, + "learning_rate": 4.755222738930143e-08, + "loss": 4.1677, + "step": 17600 + }, + { + "epoch": 2.91963545981773, + "grad_norm": 1.0512491464614868, + "learning_rate": 4.3870144601853015e-08, + "loss": 4.1527, + "step": 17620 + }, + { + "epoch": 2.922949461474731, + "grad_norm": 1.0874909162521362, + "learning_rate": 4.033610701302704e-08, + "loss": 4.1477, + "step": 17640 + }, + { + "epoch": 2.9262634631317317, + "grad_norm": 0.9638181328773499, + "learning_rate": 3.6950167173090965e-08, + "loss": 4.1594, + "step": 17660 + }, + { + "epoch": 2.9295774647887325, + "grad_norm": 1.0048037767410278, + "learning_rate": 3.3712375430134946e-08, + "loss": 4.1518, + "step": 17680 + }, + { + "epoch": 2.9328914664457333, + "grad_norm": 0.9321887493133545, + "learning_rate": 3.062277992932794e-08, + "loss": 4.1571, + "step": 17700 + }, + { + "epoch": 2.936205468102734, + "grad_norm": 0.9977238774299622, + "learning_rate": 2.7681426612190533e-08, + "loss": 4.1572, + "step": 17720 + }, + { + "epoch": 2.939519469759735, + "grad_norm": 1.0277584791183472, + "learning_rate": 2.488835921592436e-08, + "loss": 4.1736, + "step": 17740 + }, + { + "epoch": 2.9428334714167357, + "grad_norm": 1.0330194234848022, + "learning_rate": 2.2243619272752647e-08, + "loss": 4.1573, + "step": 17760 + }, + { + "epoch": 2.9461474730737365, + "grad_norm": 0.9719997048377991, + "learning_rate": 1.974724610930734e-08, + "loss": 4.1627, + "step": 17780 + }, + { + "epoch": 2.9494614747307373, + "grad_norm": 0.969202995300293, + "learning_rate": 1.7399276846041814e-08, + "loss": 4.1432, + "step": 17800 + }, + { + "epoch": 2.952775476387738, + "grad_norm": 0.9278626441955566, + "learning_rate": 1.5199746396681314e-08, + "loss": 4.1599, + "step": 17820 + }, + { + "epoch": 2.956089478044739, + "grad_norm": 1.0520209074020386, + "learning_rate": 1.3148687467701149e-08, + "loss": 4.1602, + "step": 17840 + }, + { + "epoch": 2.9594034797017397, + "grad_norm": 0.9267175793647766, + "learning_rate": 1.1246130557843738e-08, + "loss": 4.1622, + "step": 17860 + }, + { + "epoch": 2.962717481358741, + "grad_norm": 0.9517655968666077, + "learning_rate": 9.492103957660092e-09, + "loss": 4.1752, + "step": 17880 + }, + { + "epoch": 2.9660314830157413, + "grad_norm": 1.0329607725143433, + "learning_rate": 7.886633749095708e-09, + "loss": 4.1464, + "step": 17900 + }, + { + "epoch": 2.9693454846727425, + "grad_norm": 0.9467251896858215, + "learning_rate": 6.429743805095312e-09, + "loss": 4.1644, + "step": 17920 + }, + { + "epoch": 2.9726594863297433, + "grad_norm": 0.9774031043052673, + "learning_rate": 5.12145578925316e-09, + "loss": 4.1556, + "step": 17940 + }, + { + "epoch": 2.975973487986744, + "grad_norm": 0.9558754563331604, + "learning_rate": 3.961789155492168e-09, + "loss": 4.1565, + "step": 17960 + }, + { + "epoch": 2.979287489643745, + "grad_norm": 0.989288330078125, + "learning_rate": 2.950761147767489e-09, + "loss": 4.1564, + "step": 17980 + }, + { + "epoch": 2.9826014913007457, + "grad_norm": 0.9828252196311951, + "learning_rate": 2.0883867998178208e-09, + "loss": 4.154, + "step": 18000 + }, + { + "epoch": 2.9859154929577465, + "grad_norm": 0.9588239789009094, + "learning_rate": 
1.3746789349355915e-09, + "loss": 4.1514, + "step": 18020 + }, + { + "epoch": 2.9892294946147473, + "grad_norm": 0.9303632974624634, + "learning_rate": 8.096481657826616e-10, + "loss": 4.1412, + "step": 18040 + }, + { + "epoch": 2.992543496271748, + "grad_norm": 0.9522117972373962, + "learning_rate": 3.933028942271211e-10, + "loss": 4.1562, + "step": 18060 + }, + { + "epoch": 2.995857497928749, + "grad_norm": 0.9780867099761963, + "learning_rate": 1.256493112200552e-10, + "loss": 4.1653, + "step": 18080 + }, + { + "epoch": 2.9985086992543497, + "eval_loss": 4.172011852264404, + "eval_runtime": 185.3666, + "eval_samples_per_second": 115.754, + "eval_steps_per_second": 14.474, + "step": 18096 + } + ], + "logging_steps": 20, + "max_steps": 18105, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1508, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.001 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.762790195133645e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}