| { | |
| "best_metric": 3.8681728839874268, | |
| "best_model_checkpoint": "checkpoints/test_1M_1-2025-02-12-12-32/checkpoint-10000", | |
| "epoch": 0.7914523149980214, | |
| "eval_steps": 10000, | |
| "global_step": 10000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0019786307874950534, | |
| "grad_norm": 254.82342529296875, | |
| "learning_rate": 2.499208537244918e-07, | |
| "loss": 5.7705, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.003957261574990107, | |
| "grad_norm": 153.19989013671875, | |
| "learning_rate": 2.498384096875041e-07, | |
| "loss": 5.6747, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.00593589236248516, | |
| "grad_norm": 224.5292510986328, | |
| "learning_rate": 2.4975596565051644e-07, | |
| "loss": 5.6201, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.007914523149980214, | |
| "grad_norm": 175.854248046875, | |
| "learning_rate": 2.4967352161352873e-07, | |
| "loss": 5.6974, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.009893153937475268, | |
| "grad_norm": 163.52769470214844, | |
| "learning_rate": 2.49591077576541e-07, | |
| "loss": 5.5417, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.01187178472497032, | |
| "grad_norm": 254.2264862060547, | |
| "learning_rate": 2.4950863353955335e-07, | |
| "loss": 5.8201, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.013850415512465374, | |
| "grad_norm": 175.30279541015625, | |
| "learning_rate": 2.4942618950256564e-07, | |
| "loss": 5.5302, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.015829046299960427, | |
| "grad_norm": 300.1286315917969, | |
| "learning_rate": 2.4934374546557797e-07, | |
| "loss": 5.6572, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.01780767708745548, | |
| "grad_norm": 201.56961059570312, | |
| "learning_rate": 2.4926130142859026e-07, | |
| "loss": 5.2914, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.019786307874950535, | |
| "grad_norm": 245.64854431152344, | |
| "learning_rate": 2.491788573916026e-07, | |
| "loss": 5.4478, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.02176493866244559, | |
| "grad_norm": 239.78257751464844, | |
| "learning_rate": 2.490964133546149e-07, | |
| "loss": 5.4161, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.02374356944994064, | |
| "grad_norm": 150.18310546875, | |
| "learning_rate": 2.4901396931762717e-07, | |
| "loss": 5.4978, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.025722200237435693, | |
| "grad_norm": 172.03607177734375, | |
| "learning_rate": 2.489315252806395e-07, | |
| "loss": 5.4105, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.027700831024930747, | |
| "grad_norm": 343.2570495605469, | |
| "learning_rate": 2.488490812436518e-07, | |
| "loss": 5.5195, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0296794618124258, | |
| "grad_norm": 329.7228698730469, | |
| "learning_rate": 2.4876663720666413e-07, | |
| "loss": 5.4494, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.031658092599920855, | |
| "grad_norm": 174.63136291503906, | |
| "learning_rate": 2.486841931696764e-07, | |
| "loss": 5.2864, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.033636723387415905, | |
| "grad_norm": 356.6216125488281, | |
| "learning_rate": 2.486017491326887e-07, | |
| "loss": 5.4376, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.03561535417491096, | |
| "grad_norm": 166.16783142089844, | |
| "learning_rate": 2.4851930509570104e-07, | |
| "loss": 5.3141, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.03759398496240601, | |
| "grad_norm": 220.06170654296875, | |
| "learning_rate": 2.484368610587133e-07, | |
| "loss": 5.5457, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.03957261574990107, | |
| "grad_norm": 154.55517578125, | |
| "learning_rate": 2.483544170217256e-07, | |
| "loss": 5.1264, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.04155124653739612, | |
| "grad_norm": 184.18443298339844, | |
| "learning_rate": 2.4827197298473794e-07, | |
| "loss": 5.3702, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.04352987732489118, | |
| "grad_norm": 128.84693908691406, | |
| "learning_rate": 2.4818952894775023e-07, | |
| "loss": 5.0207, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.04550850811238623, | |
| "grad_norm": 196.2894287109375, | |
| "learning_rate": 2.4810708491076257e-07, | |
| "loss": 5.315, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.04748713889988128, | |
| "grad_norm": 200.00257873535156, | |
| "learning_rate": 2.4802464087377485e-07, | |
| "loss": 5.2215, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.049465769687376336, | |
| "grad_norm": 271.8963928222656, | |
| "learning_rate": 2.479421968367872e-07, | |
| "loss": 5.3286, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.051444400474871387, | |
| "grad_norm": 181.56686401367188, | |
| "learning_rate": 2.478597527997995e-07, | |
| "loss": 4.9967, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.053423031262366444, | |
| "grad_norm": 242.8925323486328, | |
| "learning_rate": 2.477773087628118e-07, | |
| "loss": 5.1984, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.055401662049861494, | |
| "grad_norm": 210.05746459960938, | |
| "learning_rate": 2.476948647258241e-07, | |
| "loss": 5.0975, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.05738029283735655, | |
| "grad_norm": 181.1220245361328, | |
| "learning_rate": 2.476124206888364e-07, | |
| "loss": 5.0036, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.0593589236248516, | |
| "grad_norm": 166.00709533691406, | |
| "learning_rate": 2.475299766518487e-07, | |
| "loss": 5.3082, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.06133755441234666, | |
| "grad_norm": 151.4649200439453, | |
| "learning_rate": 2.47447532614861e-07, | |
| "loss": 5.1391, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.06331618519984171, | |
| "grad_norm": 149.88165283203125, | |
| "learning_rate": 2.4736508857787335e-07, | |
| "loss": 5.0783, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.06529481598733676, | |
| "grad_norm": 172.47061157226562, | |
| "learning_rate": 2.4728264454088563e-07, | |
| "loss": 4.9624, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.06727344677483181, | |
| "grad_norm": 298.1490478515625, | |
| "learning_rate": 2.4720020050389797e-07, | |
| "loss": 5.1937, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.06925207756232687, | |
| "grad_norm": 164.37867736816406, | |
| "learning_rate": 2.4711775646691025e-07, | |
| "loss": 5.1792, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.07123070834982193, | |
| "grad_norm": 216.8033905029297, | |
| "learning_rate": 2.4703531242992254e-07, | |
| "loss": 5.152, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.07320933913731698, | |
| "grad_norm": 211.95762634277344, | |
| "learning_rate": 2.469528683929349e-07, | |
| "loss": 4.9146, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.07518796992481203, | |
| "grad_norm": 257.61968994140625, | |
| "learning_rate": 2.4687042435594716e-07, | |
| "loss": 5.095, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.07716660071230709, | |
| "grad_norm": 179.43719482421875, | |
| "learning_rate": 2.467879803189595e-07, | |
| "loss": 5.0316, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.07914523149980214, | |
| "grad_norm": 180.3157958984375, | |
| "learning_rate": 2.467055362819718e-07, | |
| "loss": 4.9441, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.08112386228729719, | |
| "grad_norm": 162.77447509765625, | |
| "learning_rate": 2.4662309224498407e-07, | |
| "loss": 4.9724, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.08310249307479224, | |
| "grad_norm": 123.65939331054688, | |
| "learning_rate": 2.465406482079964e-07, | |
| "loss": 5.2271, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.08508112386228729, | |
| "grad_norm": 163.114990234375, | |
| "learning_rate": 2.464582041710087e-07, | |
| "loss": 4.9724, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.08705975464978236, | |
| "grad_norm": 204.76400756835938, | |
| "learning_rate": 2.46375760134021e-07, | |
| "loss": 4.8724, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.0890383854372774, | |
| "grad_norm": 307.8963623046875, | |
| "learning_rate": 2.462933160970333e-07, | |
| "loss": 4.9256, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.09101701622477246, | |
| "grad_norm": 133.03707885742188, | |
| "learning_rate": 2.462108720600456e-07, | |
| "loss": 4.8792, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.09299564701226751, | |
| "grad_norm": 161.41697692871094, | |
| "learning_rate": 2.4612842802305794e-07, | |
| "loss": 5.054, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.09497427779976256, | |
| "grad_norm": 135.36228942871094, | |
| "learning_rate": 2.460459839860702e-07, | |
| "loss": 4.8655, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.09695290858725762, | |
| "grad_norm": 179.60646057128906, | |
| "learning_rate": 2.4596353994908256e-07, | |
| "loss": 4.7832, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.09893153937475267, | |
| "grad_norm": 335.71380615234375, | |
| "learning_rate": 2.4588109591209485e-07, | |
| "loss": 4.9979, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.10091017016224772, | |
| "grad_norm": 149.5147247314453, | |
| "learning_rate": 2.457986518751072e-07, | |
| "loss": 4.714, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.10288880094974277, | |
| "grad_norm": 154.0236358642578, | |
| "learning_rate": 2.4571620783811947e-07, | |
| "loss": 4.8015, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.10486743173723784, | |
| "grad_norm": 450.5319519042969, | |
| "learning_rate": 2.456337638011318e-07, | |
| "loss": 4.6914, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.10684606252473289, | |
| "grad_norm": 195.87863159179688, | |
| "learning_rate": 2.455513197641441e-07, | |
| "loss": 5.0124, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.10882469331222794, | |
| "grad_norm": 198.12225341796875, | |
| "learning_rate": 2.454688757271564e-07, | |
| "loss": 4.5305, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.11080332409972299, | |
| "grad_norm": 161.57623291015625, | |
| "learning_rate": 2.453864316901687e-07, | |
| "loss": 4.7806, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.11278195488721804, | |
| "grad_norm": 187.8081817626953, | |
| "learning_rate": 2.45303987653181e-07, | |
| "loss": 4.9401, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.1147605856747131, | |
| "grad_norm": 160.1893768310547, | |
| "learning_rate": 2.4522154361619334e-07, | |
| "loss": 4.8119, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.11673921646220815, | |
| "grad_norm": 181.8563995361328, | |
| "learning_rate": 2.4513909957920563e-07, | |
| "loss": 4.7979, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.1187178472497032, | |
| "grad_norm": 184.80641174316406, | |
| "learning_rate": 2.4505665554221796e-07, | |
| "loss": 4.8448, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.12069647803719825, | |
| "grad_norm": 151.4502410888672, | |
| "learning_rate": 2.4497421150523025e-07, | |
| "loss": 4.7101, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.12267510882469332, | |
| "grad_norm": 163.2119598388672, | |
| "learning_rate": 2.4489176746824253e-07, | |
| "loss": 4.8802, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.12465373961218837, | |
| "grad_norm": 147.33741760253906, | |
| "learning_rate": 2.4480932343125487e-07, | |
| "loss": 4.6433, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.12663237039968342, | |
| "grad_norm": 145.84716796875, | |
| "learning_rate": 2.4472687939426716e-07, | |
| "loss": 4.4118, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.12861100118717847, | |
| "grad_norm": 111.55641174316406, | |
| "learning_rate": 2.4464443535727944e-07, | |
| "loss": 4.819, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.13058963197467352, | |
| "grad_norm": 145.68092346191406, | |
| "learning_rate": 2.445619913202918e-07, | |
| "loss": 4.7752, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.13256826276216857, | |
| "grad_norm": 274.0830078125, | |
| "learning_rate": 2.4447954728330407e-07, | |
| "loss": 4.8566, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.13454689354966362, | |
| "grad_norm": 141.83982849121094, | |
| "learning_rate": 2.4439710324631635e-07, | |
| "loss": 4.6643, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.1365255243371587, | |
| "grad_norm": 182.46160888671875, | |
| "learning_rate": 2.443146592093287e-07, | |
| "loss": 4.731, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.13850415512465375, | |
| "grad_norm": 200.28773498535156, | |
| "learning_rate": 2.44232215172341e-07, | |
| "loss": 4.5525, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.1404827859121488, | |
| "grad_norm": 163.7792510986328, | |
| "learning_rate": 2.441497711353533e-07, | |
| "loss": 4.8076, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.14246141669964385, | |
| "grad_norm": 422.9642639160156, | |
| "learning_rate": 2.440673270983656e-07, | |
| "loss": 4.7045, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.1444400474871389, | |
| "grad_norm": 187.99957275390625, | |
| "learning_rate": 2.4398488306137794e-07, | |
| "loss": 4.6615, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.14641867827463395, | |
| "grad_norm": 144.52732849121094, | |
| "learning_rate": 2.439024390243902e-07, | |
| "loss": 4.7912, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.148397309062129, | |
| "grad_norm": 192.0771026611328, | |
| "learning_rate": 2.4381999498740256e-07, | |
| "loss": 4.7916, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.15037593984962405, | |
| "grad_norm": 148.06878662109375, | |
| "learning_rate": 2.4373755095041484e-07, | |
| "loss": 4.7782, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.1523545706371191, | |
| "grad_norm": 131.4456329345703, | |
| "learning_rate": 2.436551069134272e-07, | |
| "loss": 4.579, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.15433320142461418, | |
| "grad_norm": 141.84681701660156, | |
| "learning_rate": 2.4357266287643947e-07, | |
| "loss": 4.5776, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.15631183221210923, | |
| "grad_norm": 122.31990051269531, | |
| "learning_rate": 2.4349021883945175e-07, | |
| "loss": 4.5185, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.15829046299960428, | |
| "grad_norm": 229.08372497558594, | |
| "learning_rate": 2.434077748024641e-07, | |
| "loss": 4.6352, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.16026909378709933, | |
| "grad_norm": 136.54153442382812, | |
| "learning_rate": 2.433253307654764e-07, | |
| "loss": 4.5512, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.16224772457459438, | |
| "grad_norm": 237.05514526367188, | |
| "learning_rate": 2.432428867284887e-07, | |
| "loss": 4.7146, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.16422635536208943, | |
| "grad_norm": 149.2750244140625, | |
| "learning_rate": 2.43160442691501e-07, | |
| "loss": 4.6935, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.16620498614958448, | |
| "grad_norm": 149.77297973632812, | |
| "learning_rate": 2.4307799865451334e-07, | |
| "loss": 4.8223, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.16818361693707953, | |
| "grad_norm": 235.3883056640625, | |
| "learning_rate": 2.429955546175256e-07, | |
| "loss": 4.6266, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.17016224772457458, | |
| "grad_norm": 137.77316284179688, | |
| "learning_rate": 2.429131105805379e-07, | |
| "loss": 4.8543, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.17214087851206966, | |
| "grad_norm": 143.8935089111328, | |
| "learning_rate": 2.4283066654355025e-07, | |
| "loss": 4.651, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.1741195092995647, | |
| "grad_norm": 191.43856811523438, | |
| "learning_rate": 2.4274822250656253e-07, | |
| "loss": 4.4166, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.17609814008705976, | |
| "grad_norm": 135.82838439941406, | |
| "learning_rate": 2.426657784695748e-07, | |
| "loss": 4.7078, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.1780767708745548, | |
| "grad_norm": 114.28646087646484, | |
| "learning_rate": 2.4258333443258715e-07, | |
| "loss": 4.5316, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.18005540166204986, | |
| "grad_norm": 237.41001892089844, | |
| "learning_rate": 2.4250089039559944e-07, | |
| "loss": 4.4699, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.1820340324495449, | |
| "grad_norm": 124.57892608642578, | |
| "learning_rate": 2.424184463586117e-07, | |
| "loss": 4.5101, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.18401266323703996, | |
| "grad_norm": 147.15554809570312, | |
| "learning_rate": 2.4233600232162406e-07, | |
| "loss": 4.5974, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.18599129402453501, | |
| "grad_norm": 166.0609588623047, | |
| "learning_rate": 2.4225355828463635e-07, | |
| "loss": 4.5105, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.18796992481203006, | |
| "grad_norm": 188.97705078125, | |
| "learning_rate": 2.421711142476487e-07, | |
| "loss": 4.587, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.18994855559952512, | |
| "grad_norm": 243.09271240234375, | |
| "learning_rate": 2.4208867021066097e-07, | |
| "loss": 4.7686, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.1919271863870202, | |
| "grad_norm": 127.40078735351562, | |
| "learning_rate": 2.420062261736733e-07, | |
| "loss": 4.4476, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.19390581717451524, | |
| "grad_norm": 253.8776092529297, | |
| "learning_rate": 2.419237821366856e-07, | |
| "loss": 4.5478, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.1958844479620103, | |
| "grad_norm": 123.27115631103516, | |
| "learning_rate": 2.4184133809969793e-07, | |
| "loss": 4.3502, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.19786307874950534, | |
| "grad_norm": 138.00375366210938, | |
| "learning_rate": 2.417588940627102e-07, | |
| "loss": 4.3534, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.1998417095370004, | |
| "grad_norm": 115.53954315185547, | |
| "learning_rate": 2.4167645002572256e-07, | |
| "loss": 4.7066, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.20182034032449545, | |
| "grad_norm": 180.38809204101562, | |
| "learning_rate": 2.4159400598873484e-07, | |
| "loss": 4.6605, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.2037989711119905, | |
| "grad_norm": 129.8457489013672, | |
| "learning_rate": 2.415115619517472e-07, | |
| "loss": 4.3849, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.20577760189948555, | |
| "grad_norm": 156.64404296875, | |
| "learning_rate": 2.4142911791475946e-07, | |
| "loss": 4.3434, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.2077562326869806, | |
| "grad_norm": 162.81320190429688, | |
| "learning_rate": 2.4134667387777175e-07, | |
| "loss": 4.5466, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.20973486347447567, | |
| "grad_norm": 128.7244873046875, | |
| "learning_rate": 2.412642298407841e-07, | |
| "loss": 4.5358, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.21171349426197072, | |
| "grad_norm": 217.59042358398438, | |
| "learning_rate": 2.4118178580379637e-07, | |
| "loss": 4.5235, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.21369212504946578, | |
| "grad_norm": 144.84365844726562, | |
| "learning_rate": 2.410993417668087e-07, | |
| "loss": 4.3811, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.21567075583696083, | |
| "grad_norm": 146.22451782226562, | |
| "learning_rate": 2.41016897729821e-07, | |
| "loss": 4.3797, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.21764938662445588, | |
| "grad_norm": 198.39772033691406, | |
| "learning_rate": 2.409344536928333e-07, | |
| "loss": 4.4303, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.21962801741195093, | |
| "grad_norm": 158.10592651367188, | |
| "learning_rate": 2.408520096558456e-07, | |
| "loss": 4.3633, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.22160664819944598, | |
| "grad_norm": 166.79954528808594, | |
| "learning_rate": 2.407695656188579e-07, | |
| "loss": 4.5392, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.22358527898694103, | |
| "grad_norm": 207.30593872070312, | |
| "learning_rate": 2.406871215818702e-07, | |
| "loss": 4.5003, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.22556390977443608, | |
| "grad_norm": 128.81883239746094, | |
| "learning_rate": 2.4060467754488253e-07, | |
| "loss": 4.5416, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.22754254056193116, | |
| "grad_norm": 181.48960876464844, | |
| "learning_rate": 2.405222335078948e-07, | |
| "loss": 4.1725, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.2295211713494262, | |
| "grad_norm": 179.47384643554688, | |
| "learning_rate": 2.4043978947090715e-07, | |
| "loss": 4.5229, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.23149980213692126, | |
| "grad_norm": 144.242919921875, | |
| "learning_rate": 2.4035734543391943e-07, | |
| "loss": 4.3295, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.2334784329244163, | |
| "grad_norm": 177.61968994140625, | |
| "learning_rate": 2.402749013969317e-07, | |
| "loss": 4.4266, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.23545706371191136, | |
| "grad_norm": 143.8682861328125, | |
| "learning_rate": 2.4019245735994406e-07, | |
| "loss": 4.2341, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.2374356944994064, | |
| "grad_norm": 128.8461151123047, | |
| "learning_rate": 2.4011001332295634e-07, | |
| "loss": 4.3676, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.23941432528690146, | |
| "grad_norm": 160.70687866210938, | |
| "learning_rate": 2.400275692859687e-07, | |
| "loss": 4.3945, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.2413929560743965, | |
| "grad_norm": 157.65855407714844, | |
| "learning_rate": 2.3994512524898097e-07, | |
| "loss": 4.4967, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.24337158686189156, | |
| "grad_norm": 125.79988861083984, | |
| "learning_rate": 2.398626812119933e-07, | |
| "loss": 4.279, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.24535021764938664, | |
| "grad_norm": 168.8534698486328, | |
| "learning_rate": 2.397802371750056e-07, | |
| "loss": 4.4813, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.2473288484368817, | |
| "grad_norm": 120.4126968383789, | |
| "learning_rate": 2.3969779313801793e-07, | |
| "loss": 4.1997, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.24930747922437674, | |
| "grad_norm": 115.56365203857422, | |
| "learning_rate": 2.396153491010302e-07, | |
| "loss": 4.4076, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.2512861100118718, | |
| "grad_norm": 152.89859008789062, | |
| "learning_rate": 2.3953290506404255e-07, | |
| "loss": 4.2893, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.25326474079936684, | |
| "grad_norm": 177.6272735595703, | |
| "learning_rate": 2.3945046102705484e-07, | |
| "loss": 4.4892, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.2552433715868619, | |
| "grad_norm": 131.46661376953125, | |
| "learning_rate": 2.393680169900671e-07, | |
| "loss": 4.2702, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.25722200237435694, | |
| "grad_norm": 101.60210418701172, | |
| "learning_rate": 2.3928557295307946e-07, | |
| "loss": 4.2209, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.259200633161852, | |
| "grad_norm": 199.7799835205078, | |
| "learning_rate": 2.3920312891609174e-07, | |
| "loss": 4.1502, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.26117926394934704, | |
| "grad_norm": 163.44424438476562, | |
| "learning_rate": 2.391206848791041e-07, | |
| "loss": 4.3423, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "grad_norm": 148.59519958496094, | |
| "learning_rate": 2.3903824084211637e-07, | |
| "loss": 4.4833, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.26513652552433714, | |
| "grad_norm": 129.75927734375, | |
| "learning_rate": 2.3895579680512865e-07, | |
| "loss": 4.4745, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.2671151563118322, | |
| "grad_norm": 126.6795654296875, | |
| "learning_rate": 2.38873352768141e-07, | |
| "loss": 4.3964, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.26909378709932724, | |
| "grad_norm": 157.1032257080078, | |
| "learning_rate": 2.387909087311533e-07, | |
| "loss": 4.3419, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.2710724178868223, | |
| "grad_norm": 142.79139709472656, | |
| "learning_rate": 2.3870846469416556e-07, | |
| "loss": 4.2243, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.2730510486743174, | |
| "grad_norm": 137.3797607421875, | |
| "learning_rate": 2.386260206571779e-07, | |
| "loss": 4.1661, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.27502967946181245, | |
| "grad_norm": 148.77401733398438, | |
| "learning_rate": 2.385435766201902e-07, | |
| "loss": 4.483, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.2770083102493075, | |
| "grad_norm": 124.54267120361328, | |
| "learning_rate": 2.384611325832025e-07, | |
| "loss": 4.369, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.27898694103680255, | |
| "grad_norm": 113.43370056152344, | |
| "learning_rate": 2.383786885462148e-07, | |
| "loss": 4.1491, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.2809655718242976, | |
| "grad_norm": 155.67677307128906, | |
| "learning_rate": 2.3829624450922712e-07, | |
| "loss": 4.3403, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.28294420261179265, | |
| "grad_norm": 201.27784729003906, | |
| "learning_rate": 2.3821380047223943e-07, | |
| "loss": 4.3563, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.2849228333992877, | |
| "grad_norm": 104.74275970458984, | |
| "learning_rate": 2.3813135643525174e-07, | |
| "loss": 4.2706, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.28690146418678275, | |
| "grad_norm": 133.6251678466797, | |
| "learning_rate": 2.3804891239826405e-07, | |
| "loss": 4.3637, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.2888800949742778, | |
| "grad_norm": 102.35352325439453, | |
| "learning_rate": 2.3796646836127634e-07, | |
| "loss": 4.2585, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.29085872576177285, | |
| "grad_norm": 156.72654724121094, | |
| "learning_rate": 2.3788402432428868e-07, | |
| "loss": 4.3448, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.2928373565492679, | |
| "grad_norm": 121.19142150878906, | |
| "learning_rate": 2.3780158028730096e-07, | |
| "loss": 4.1475, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.29481598733676295, | |
| "grad_norm": 138.72952270507812, | |
| "learning_rate": 2.3771913625031327e-07, | |
| "loss": 4.2475, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.296794618124258, | |
| "grad_norm": 314.35113525390625, | |
| "learning_rate": 2.3763669221332559e-07, | |
| "loss": 4.2643, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.29877324891175305, | |
| "grad_norm": 131.71240234375, | |
| "learning_rate": 2.375542481763379e-07, | |
| "loss": 4.2741, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.3007518796992481, | |
| "grad_norm": 193.2744598388672, | |
| "learning_rate": 2.374718041393502e-07, | |
| "loss": 4.2314, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.30273051048674315, | |
| "grad_norm": 146.98760986328125, | |
| "learning_rate": 2.3738936010236252e-07, | |
| "loss": 4.5421, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.3047091412742382, | |
| "grad_norm": 106.49159240722656, | |
| "learning_rate": 2.373069160653748e-07, | |
| "loss": 4.0922, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.30668777206173325, | |
| "grad_norm": 128.12686157226562, | |
| "learning_rate": 2.3722447202838712e-07, | |
| "loss": 4.3171, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.30866640284922836, | |
| "grad_norm": 165.8458251953125, | |
| "learning_rate": 2.3714202799139943e-07, | |
| "loss": 4.1937, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.3106450336367234, | |
| "grad_norm": 129.49652099609375, | |
| "learning_rate": 2.3705958395441171e-07, | |
| "loss": 4.2486, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.31262366442421846, | |
| "grad_norm": 113.08882141113281, | |
| "learning_rate": 2.3697713991742405e-07, | |
| "loss": 3.9533, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.3146022952117135, | |
| "grad_norm": 116.51021575927734, | |
| "learning_rate": 2.3689469588043634e-07, | |
| "loss": 4.2525, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.31658092599920856, | |
| "grad_norm": 95.54279327392578, | |
| "learning_rate": 2.3681225184344867e-07, | |
| "loss": 4.1355, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.3185595567867036, | |
| "grad_norm": 123.10621643066406, | |
| "learning_rate": 2.3672980780646096e-07, | |
| "loss": 4.3705, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.32053818757419866, | |
| "grad_norm": 142.11273193359375, | |
| "learning_rate": 2.3664736376947327e-07, | |
| "loss": 4.2712, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.3225168183616937, | |
| "grad_norm": 162.17141723632812, | |
| "learning_rate": 2.3656491973248558e-07, | |
| "loss": 4.1127, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.32449544914918876, | |
| "grad_norm": 160.26893615722656, | |
| "learning_rate": 2.364824756954979e-07, | |
| "loss": 4.2687, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.3264740799366838, | |
| "grad_norm": 134.65093994140625, | |
| "learning_rate": 2.3640003165851018e-07, | |
| "loss": 4.3567, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.32845271072417886, | |
| "grad_norm": 178.23516845703125, | |
| "learning_rate": 2.3631758762152252e-07, | |
| "loss": 4.097, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.3304313415116739, | |
| "grad_norm": 151.1556396484375, | |
| "learning_rate": 2.362351435845348e-07, | |
| "loss": 4.1602, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.33240997229916897, | |
| "grad_norm": 154.64442443847656, | |
| "learning_rate": 2.3615269954754711e-07, | |
| "loss": 4.2365, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.334388603086664, | |
| "grad_norm": 226.6827850341797, | |
| "learning_rate": 2.3607025551055943e-07, | |
| "loss": 4.3196, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.33636723387415907, | |
| "grad_norm": 172.67916870117188, | |
| "learning_rate": 2.359878114735717e-07, | |
| "loss": 4.4476, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.3383458646616541, | |
| "grad_norm": 124.78984069824219, | |
| "learning_rate": 2.3590536743658405e-07, | |
| "loss": 4.4006, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.34032449544914917, | |
| "grad_norm": 156.81365966796875, | |
| "learning_rate": 2.3582292339959633e-07, | |
| "loss": 4.3914, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.3423031262366442, | |
| "grad_norm": 116.53181457519531, | |
| "learning_rate": 2.3574047936260865e-07, | |
| "loss": 4.2846, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.3442817570241393, | |
| "grad_norm": 146.16543579101562, | |
| "learning_rate": 2.3565803532562096e-07, | |
| "loss": 4.1371, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.3462603878116344, | |
| "grad_norm": 213.07974243164062, | |
| "learning_rate": 2.3557559128863327e-07, | |
| "loss": 4.2294, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 0.3482390185991294, | |
| "grad_norm": 99.38206481933594, | |
| "learning_rate": 2.3549314725164558e-07, | |
| "loss": 4.1726, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.3502176493866245, | |
| "grad_norm": 162.97059631347656, | |
| "learning_rate": 2.354107032146579e-07, | |
| "loss": 4.0507, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 0.3521962801741195, | |
| "grad_norm": 132.77474975585938, | |
| "learning_rate": 2.3532825917767018e-07, | |
| "loss": 4.0016, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.3541749109616146, | |
| "grad_norm": 126.9658203125, | |
| "learning_rate": 2.3524581514068252e-07, | |
| "loss": 4.2731, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 0.3561535417491096, | |
| "grad_norm": 194.47755432128906, | |
| "learning_rate": 2.351633711036948e-07, | |
| "loss": 4.1119, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.3581321725366047, | |
| "grad_norm": 153.6606903076172, | |
| "learning_rate": 2.3508092706670709e-07, | |
| "loss": 4.4556, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 0.3601108033240997, | |
| "grad_norm": 146.66709899902344, | |
| "learning_rate": 2.3499848302971942e-07, | |
| "loss": 4.3314, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.3620894341115948, | |
| "grad_norm": 111.01129913330078, | |
| "learning_rate": 2.349160389927317e-07, | |
| "loss": 4.2929, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 0.3640680648990898, | |
| "grad_norm": 137.40582275390625, | |
| "learning_rate": 2.3483359495574405e-07, | |
| "loss": 4.2198, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.3660466956865849, | |
| "grad_norm": 142.0623779296875, | |
| "learning_rate": 2.3475115091875633e-07, | |
| "loss": 4.2013, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 0.36802532647407993, | |
| "grad_norm": 135.2795867919922, | |
| "learning_rate": 2.3466870688176864e-07, | |
| "loss": 4.231, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.370003957261575, | |
| "grad_norm": 127.59281158447266, | |
| "learning_rate": 2.3458626284478096e-07, | |
| "loss": 3.9613, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 0.37198258804907003, | |
| "grad_norm": 132.48663330078125, | |
| "learning_rate": 2.3450381880779327e-07, | |
| "loss": 4.1925, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.3739612188365651, | |
| "grad_norm": 135.35409545898438, | |
| "learning_rate": 2.3442137477080555e-07, | |
| "loss": 4.1828, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 0.37593984962406013, | |
| "grad_norm": 107.55503845214844, | |
| "learning_rate": 2.343389307338179e-07, | |
| "loss": 4.2578, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.3779184804115552, | |
| "grad_norm": 132.79620361328125, | |
| "learning_rate": 2.3425648669683018e-07, | |
| "loss": 4.0254, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 0.37989711119905023, | |
| "grad_norm": 123.6044692993164, | |
| "learning_rate": 2.341740426598425e-07, | |
| "loss": 3.9981, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.38187574198654534, | |
| "grad_norm": 149.656005859375, | |
| "learning_rate": 2.340915986228548e-07, | |
| "loss": 4.1067, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 0.3838543727740404, | |
| "grad_norm": 122.97380065917969, | |
| "learning_rate": 2.3400915458586708e-07, | |
| "loss": 4.2396, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.38583300356153544, | |
| "grad_norm": 140.10183715820312, | |
| "learning_rate": 2.3392671054887942e-07, | |
| "loss": 4.1309, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 0.3878116343490305, | |
| "grad_norm": 137.91583251953125, | |
| "learning_rate": 2.338442665118917e-07, | |
| "loss": 4.0575, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.38979026513652554, | |
| "grad_norm": 137.72152709960938, | |
| "learning_rate": 2.3376182247490402e-07, | |
| "loss": 4.159, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 0.3917688959240206, | |
| "grad_norm": 84.3819808959961, | |
| "learning_rate": 2.3367937843791633e-07, | |
| "loss": 4.2611, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.39374752671151564, | |
| "grad_norm": 200.3111114501953, | |
| "learning_rate": 2.3359693440092864e-07, | |
| "loss": 4.167, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 0.3957261574990107, | |
| "grad_norm": 123.27460479736328, | |
| "learning_rate": 2.3351449036394095e-07, | |
| "loss": 4.2918, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.39770478828650574, | |
| "grad_norm": 111.70620727539062, | |
| "learning_rate": 2.3343204632695327e-07, | |
| "loss": 4.2242, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 0.3996834190740008, | |
| "grad_norm": 107.74165344238281, | |
| "learning_rate": 2.3334960228996555e-07, | |
| "loss": 4.3572, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.40166204986149584, | |
| "grad_norm": 138.31423950195312, | |
| "learning_rate": 2.332671582529779e-07, | |
| "loss": 4.1759, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 0.4036406806489909, | |
| "grad_norm": 104.73587799072266, | |
| "learning_rate": 2.3318471421599017e-07, | |
| "loss": 4.1695, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.40561931143648594, | |
| "grad_norm": 138.1061553955078, | |
| "learning_rate": 2.3310227017900246e-07, | |
| "loss": 4.0986, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 0.407597942223981, | |
| "grad_norm": 148.92279052734375, | |
| "learning_rate": 2.330198261420148e-07, | |
| "loss": 4.3455, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.40957657301147604, | |
| "grad_norm": 321.29852294921875, | |
| "learning_rate": 2.3293738210502708e-07, | |
| "loss": 4.1285, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 0.4115552037989711, | |
| "grad_norm": 114.85989379882812, | |
| "learning_rate": 2.3285493806803942e-07, | |
| "loss": 3.9628, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.41353383458646614, | |
| "grad_norm": 137.27610778808594, | |
| "learning_rate": 2.327724940310517e-07, | |
| "loss": 4.1521, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 0.4155124653739612, | |
| "grad_norm": 96.02686309814453, | |
| "learning_rate": 2.3269004999406402e-07, | |
| "loss": 4.027, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.4174910961614563, | |
| "grad_norm": 213.81649780273438, | |
| "learning_rate": 2.3260760595707633e-07, | |
| "loss": 4.0522, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 0.41946972694895135, | |
| "grad_norm": 160.4125518798828, | |
| "learning_rate": 2.3252516192008864e-07, | |
| "loss": 4.09, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.4214483577364464, | |
| "grad_norm": 167.58741760253906, | |
| "learning_rate": 2.3244271788310093e-07, | |
| "loss": 4.1128, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 0.42342698852394145, | |
| "grad_norm": 159.55303955078125, | |
| "learning_rate": 2.3236027384611326e-07, | |
| "loss": 4.0867, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.4254056193114365, | |
| "grad_norm": 122.51324462890625, | |
| "learning_rate": 2.3227782980912555e-07, | |
| "loss": 4.2261, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 0.42738425009893155, | |
| "grad_norm": 185.9108428955078, | |
| "learning_rate": 2.3219538577213789e-07, | |
| "loss": 3.9684, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.4293628808864266, | |
| "grad_norm": 195.37579345703125, | |
| "learning_rate": 2.3211294173515017e-07, | |
| "loss": 4.0779, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 0.43134151167392165, | |
| "grad_norm": 157.84371948242188, | |
| "learning_rate": 2.3203049769816246e-07, | |
| "loss": 4.1991, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.4333201424614167, | |
| "grad_norm": 111.01512908935547, | |
| "learning_rate": 2.319480536611748e-07, | |
| "loss": 3.9962, | |
| "step": 5475 | |
| }, | |
| { | |
| "epoch": 0.43529877324891175, | |
| "grad_norm": 114.49053955078125, | |
| "learning_rate": 2.3186560962418708e-07, | |
| "loss": 3.8972, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.4372774040364068, | |
| "grad_norm": 168.17874145507812, | |
| "learning_rate": 2.317831655871994e-07, | |
| "loss": 4.1913, | |
| "step": 5525 | |
| }, | |
| { | |
| "epoch": 0.43925603482390185, | |
| "grad_norm": 140.61912536621094, | |
| "learning_rate": 2.317007215502117e-07, | |
| "loss": 4.1396, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.4412346656113969, | |
| "grad_norm": 138.01805114746094, | |
| "learning_rate": 2.3161827751322401e-07, | |
| "loss": 4.1399, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 0.44321329639889195, | |
| "grad_norm": 188.0181427001953, | |
| "learning_rate": 2.3153583347623633e-07, | |
| "loss": 4.0329, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.445191927186387, | |
| "grad_norm": 170.8402099609375, | |
| "learning_rate": 2.3145338943924864e-07, | |
| "loss": 4.3414, | |
| "step": 5625 | |
| }, | |
| { | |
| "epoch": 0.44717055797388205, | |
| "grad_norm": 200.65077209472656, | |
| "learning_rate": 2.3137094540226092e-07, | |
| "loss": 4.2154, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.4491491887613771, | |
| "grad_norm": 120.18091583251953, | |
| "learning_rate": 2.3128850136527326e-07, | |
| "loss": 4.0372, | |
| "step": 5675 | |
| }, | |
| { | |
| "epoch": 0.45112781954887216, | |
| "grad_norm": 89.9730224609375, | |
| "learning_rate": 2.3120605732828555e-07, | |
| "loss": 4.1059, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.45310645033636726, | |
| "grad_norm": 133.7999267578125, | |
| "learning_rate": 2.3112361329129786e-07, | |
| "loss": 4.2035, | |
| "step": 5725 | |
| }, | |
| { | |
| "epoch": 0.4550850811238623, | |
| "grad_norm": 88.3386459350586, | |
| "learning_rate": 2.3104116925431017e-07, | |
| "loss": 4.0566, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.45706371191135736, | |
| "grad_norm": 130.95127868652344, | |
| "learning_rate": 2.3095872521732245e-07, | |
| "loss": 4.3084, | |
| "step": 5775 | |
| }, | |
| { | |
| "epoch": 0.4590423426988524, | |
| "grad_norm": 162.55679321289062, | |
| "learning_rate": 2.308762811803348e-07, | |
| "loss": 4.0288, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.46102097348634746, | |
| "grad_norm": 104.4178695678711, | |
| "learning_rate": 2.3079383714334708e-07, | |
| "loss": 3.9244, | |
| "step": 5825 | |
| }, | |
| { | |
| "epoch": 0.4629996042738425, | |
| "grad_norm": 235.28123474121094, | |
| "learning_rate": 2.307113931063594e-07, | |
| "loss": 4.1106, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.46497823506133756, | |
| "grad_norm": 289.6645812988281, | |
| "learning_rate": 2.306289490693717e-07, | |
| "loss": 4.0457, | |
| "step": 5875 | |
| }, | |
| { | |
| "epoch": 0.4669568658488326, | |
| "grad_norm": 99.97111511230469, | |
| "learning_rate": 2.30546505032384e-07, | |
| "loss": 4.2542, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.46893549663632766, | |
| "grad_norm": 260.0950622558594, | |
| "learning_rate": 2.304640609953963e-07, | |
| "loss": 4.1564, | |
| "step": 5925 | |
| }, | |
| { | |
| "epoch": 0.4709141274238227, | |
| "grad_norm": 113.74392700195312, | |
| "learning_rate": 2.3038161695840864e-07, | |
| "loss": 4.0403, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.47289275821131777, | |
| "grad_norm": 79.32340240478516, | |
| "learning_rate": 2.3029917292142092e-07, | |
| "loss": 4.0408, | |
| "step": 5975 | |
| }, | |
| { | |
| "epoch": 0.4748713889988128, | |
| "grad_norm": 95.92308807373047, | |
| "learning_rate": 2.3021672888443326e-07, | |
| "loss": 3.9811, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.47685001978630787, | |
| "grad_norm": 94.5758285522461, | |
| "learning_rate": 2.3013428484744554e-07, | |
| "loss": 4.1102, | |
| "step": 6025 | |
| }, | |
| { | |
| "epoch": 0.4788286505738029, | |
| "grad_norm": 142.32131958007812, | |
| "learning_rate": 2.3005184081045786e-07, | |
| "loss": 3.989, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.48080728136129797, | |
| "grad_norm": 97.84469604492188, | |
| "learning_rate": 2.2996939677347017e-07, | |
| "loss": 3.9512, | |
| "step": 6075 | |
| }, | |
| { | |
| "epoch": 0.482785912148793, | |
| "grad_norm": 94.38491821289062, | |
| "learning_rate": 2.2988695273648245e-07, | |
| "loss": 3.9475, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.48476454293628807, | |
| "grad_norm": 124.32872772216797, | |
| "learning_rate": 2.2980450869949476e-07, | |
| "loss": 4.1352, | |
| "step": 6125 | |
| }, | |
| { | |
| "epoch": 0.4867431737237831, | |
| "grad_norm": 196.1511993408203, | |
| "learning_rate": 2.2972206466250708e-07, | |
| "loss": 4.2956, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.48872180451127817, | |
| "grad_norm": 144.1227264404297, | |
| "learning_rate": 2.296396206255194e-07, | |
| "loss": 3.9718, | |
| "step": 6175 | |
| }, | |
| { | |
| "epoch": 0.4907004352987733, | |
| "grad_norm": 115.52275085449219, | |
| "learning_rate": 2.295571765885317e-07, | |
| "loss": 3.9135, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.4926790660862683, | |
| "grad_norm": 117.71548461914062, | |
| "learning_rate": 2.29474732551544e-07, | |
| "loss": 3.9026, | |
| "step": 6225 | |
| }, | |
| { | |
| "epoch": 0.4946576968737634, | |
| "grad_norm": 135.42698669433594, | |
| "learning_rate": 2.293922885145563e-07, | |
| "loss": 4.0369, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.4966363276612584, | |
| "grad_norm": 142.4741973876953, | |
| "learning_rate": 2.2930984447756863e-07, | |
| "loss": 4.3588, | |
| "step": 6275 | |
| }, | |
| { | |
| "epoch": 0.4986149584487535, | |
| "grad_norm": 128.56195068359375, | |
| "learning_rate": 2.2922740044058092e-07, | |
| "loss": 3.9089, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.5005935892362485, | |
| "grad_norm": 96.84894561767578, | |
| "learning_rate": 2.2914495640359323e-07, | |
| "loss": 4.1722, | |
| "step": 6325 | |
| }, | |
| { | |
| "epoch": 0.5025722200237436, | |
| "grad_norm": 236.92965698242188, | |
| "learning_rate": 2.2906251236660554e-07, | |
| "loss": 3.9729, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.5045508508112386, | |
| "grad_norm": 135.83609008789062, | |
| "learning_rate": 2.2898006832961783e-07, | |
| "loss": 4.0322, | |
| "step": 6375 | |
| }, | |
| { | |
| "epoch": 0.5065294815987337, | |
| "grad_norm": 123.36375427246094, | |
| "learning_rate": 2.2889762429263017e-07, | |
| "loss": 4.0042, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.5085081123862287, | |
| "grad_norm": 118.30574035644531, | |
| "learning_rate": 2.2881518025564245e-07, | |
| "loss": 4.1079, | |
| "step": 6425 | |
| }, | |
| { | |
| "epoch": 0.5104867431737238, | |
| "grad_norm": 107.81358337402344, | |
| "learning_rate": 2.2873273621865476e-07, | |
| "loss": 4.1198, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.5124653739612188, | |
| "grad_norm": 146.2493438720703, | |
| "learning_rate": 2.2865029218166707e-07, | |
| "loss": 4.0814, | |
| "step": 6475 | |
| }, | |
| { | |
| "epoch": 0.5144440047487139, | |
| "grad_norm": 136.8212890625, | |
| "learning_rate": 2.2856784814467939e-07, | |
| "loss": 4.0562, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.5164226355362089, | |
| "grad_norm": 139.30670166015625, | |
| "learning_rate": 2.2848540410769167e-07, | |
| "loss": 4.1199, | |
| "step": 6525 | |
| }, | |
| { | |
| "epoch": 0.518401266323704, | |
| "grad_norm": 194.90414428710938, | |
| "learning_rate": 2.28402960070704e-07, | |
| "loss": 4.0562, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.520379897111199, | |
| "grad_norm": 103.54257202148438, | |
| "learning_rate": 2.283205160337163e-07, | |
| "loss": 4.0797, | |
| "step": 6575 | |
| }, | |
| { | |
| "epoch": 0.5223585278986941, | |
| "grad_norm": 101.63102722167969, | |
| "learning_rate": 2.2823807199672863e-07, | |
| "loss": 4.0591, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.5243371586861891, | |
| "grad_norm": 104.28479766845703, | |
| "learning_rate": 2.2815562795974092e-07, | |
| "loss": 3.8991, | |
| "step": 6625 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 166.01107788085938, | |
| "learning_rate": 2.2807318392275323e-07, | |
| "loss": 4.1011, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.5282944202611792, | |
| "grad_norm": 154.64959716796875, | |
| "learning_rate": 2.2799073988576554e-07, | |
| "loss": 3.9283, | |
| "step": 6675 | |
| }, | |
| { | |
| "epoch": 0.5302730510486743, | |
| "grad_norm": 96.0099868774414, | |
| "learning_rate": 2.2790829584877782e-07, | |
| "loss": 3.8247, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.5322516818361693, | |
| "grad_norm": 120.90514373779297, | |
| "learning_rate": 2.2782585181179014e-07, | |
| "loss": 4.0629, | |
| "step": 6725 | |
| }, | |
| { | |
| "epoch": 0.5342303126236644, | |
| "grad_norm": 106.48863983154297, | |
| "learning_rate": 2.2774340777480245e-07, | |
| "loss": 4.0127, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.5362089434111594, | |
| "grad_norm": 113.17047882080078, | |
| "learning_rate": 2.2766096373781476e-07, | |
| "loss": 4.03, | |
| "step": 6775 | |
| }, | |
| { | |
| "epoch": 0.5381875741986545, | |
| "grad_norm": 130.6500701904297, | |
| "learning_rate": 2.2757851970082707e-07, | |
| "loss": 4.0192, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.5401662049861495, | |
| "grad_norm": 142.3747100830078, | |
| "learning_rate": 2.2749607566383938e-07, | |
| "loss": 4.1507, | |
| "step": 6825 | |
| }, | |
| { | |
| "epoch": 0.5421448357736446, | |
| "grad_norm": 125.88548278808594, | |
| "learning_rate": 2.2741363162685167e-07, | |
| "loss": 4.2026, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.5441234665611397, | |
| "grad_norm": 156.44570922851562, | |
| "learning_rate": 2.27331187589864e-07, | |
| "loss": 4.1063, | |
| "step": 6875 | |
| }, | |
| { | |
| "epoch": 0.5461020973486348, | |
| "grad_norm": 150.82635498046875, | |
| "learning_rate": 2.272487435528763e-07, | |
| "loss": 4.0477, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.5480807281361298, | |
| "grad_norm": 170.67994689941406, | |
| "learning_rate": 2.271662995158886e-07, | |
| "loss": 4.1398, | |
| "step": 6925 | |
| }, | |
| { | |
| "epoch": 0.5500593589236249, | |
| "grad_norm": 114.224609375, | |
| "learning_rate": 2.2708385547890091e-07, | |
| "loss": 4.0906, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.55203798971112, | |
| "grad_norm": 135.5966033935547, | |
| "learning_rate": 2.2700141144191323e-07, | |
| "loss": 3.872, | |
| "step": 6975 | |
| }, | |
| { | |
| "epoch": 0.554016620498615, | |
| "grad_norm": 120.73974609375, | |
| "learning_rate": 2.2691896740492554e-07, | |
| "loss": 3.9762, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.55599525128611, | |
| "grad_norm": 107.66891479492188, | |
| "learning_rate": 2.2683652336793782e-07, | |
| "loss": 4.0551, | |
| "step": 7025 | |
| }, | |
| { | |
| "epoch": 0.5579738820736051, | |
| "grad_norm": 107.60162353515625, | |
| "learning_rate": 2.2675407933095013e-07, | |
| "loss": 3.973, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.5599525128611001, | |
| "grad_norm": 118.88258361816406, | |
| "learning_rate": 2.2667163529396245e-07, | |
| "loss": 3.9864, | |
| "step": 7075 | |
| }, | |
| { | |
| "epoch": 0.5619311436485952, | |
| "grad_norm": 148.85667419433594, | |
| "learning_rate": 2.2658919125697476e-07, | |
| "loss": 3.9409, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.5639097744360902, | |
| "grad_norm": 148.57321166992188, | |
| "learning_rate": 2.2650674721998704e-07, | |
| "loss": 3.9611, | |
| "step": 7125 | |
| }, | |
| { | |
| "epoch": 0.5658884052235853, | |
| "grad_norm": 172.39999389648438, | |
| "learning_rate": 2.2642430318299938e-07, | |
| "loss": 3.97, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.5678670360110804, | |
| "grad_norm": 120.57051086425781, | |
| "learning_rate": 2.2634185914601167e-07, | |
| "loss": 3.9352, | |
| "step": 7175 | |
| }, | |
| { | |
| "epoch": 0.5698456667985754, | |
| "grad_norm": 143.2531280517578, | |
| "learning_rate": 2.26259415109024e-07, | |
| "loss": 3.9686, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.5718242975860705, | |
| "grad_norm": 123.57396697998047, | |
| "learning_rate": 2.261769710720363e-07, | |
| "loss": 4.0855, | |
| "step": 7225 | |
| }, | |
| { | |
| "epoch": 0.5738029283735655, | |
| "grad_norm": 115.12631225585938, | |
| "learning_rate": 2.260945270350486e-07, | |
| "loss": 3.9754, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.5757815591610606, | |
| "grad_norm": 114.95091247558594, | |
| "learning_rate": 2.260120829980609e-07, | |
| "loss": 3.8981, | |
| "step": 7275 | |
| }, | |
| { | |
| "epoch": 0.5777601899485556, | |
| "grad_norm": 105.46833038330078, | |
| "learning_rate": 2.2592963896107322e-07, | |
| "loss": 3.9452, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.5797388207360507, | |
| "grad_norm": 132.89012145996094, | |
| "learning_rate": 2.258471949240855e-07, | |
| "loss": 4.0808, | |
| "step": 7325 | |
| }, | |
| { | |
| "epoch": 0.5817174515235457, | |
| "grad_norm": 143.6460418701172, | |
| "learning_rate": 2.2576475088709782e-07, | |
| "loss": 4.0108, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.5836960823110408, | |
| "grad_norm": 130.83352661132812, | |
| "learning_rate": 2.2568230685011013e-07, | |
| "loss": 3.9701, | |
| "step": 7375 | |
| }, | |
| { | |
| "epoch": 0.5856747130985358, | |
| "grad_norm": 111.10405731201172, | |
| "learning_rate": 2.2559986281312244e-07, | |
| "loss": 4.3162, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.5876533438860309, | |
| "grad_norm": 163.31959533691406, | |
| "learning_rate": 2.2551741877613476e-07, | |
| "loss": 3.9096, | |
| "step": 7425 | |
| }, | |
| { | |
| "epoch": 0.5896319746735259, | |
| "grad_norm": 134.72927856445312, | |
| "learning_rate": 2.2543497473914704e-07, | |
| "loss": 3.8888, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.591610605461021, | |
| "grad_norm": 124.26619720458984, | |
| "learning_rate": 2.2535253070215938e-07, | |
| "loss": 4.0683, | |
| "step": 7475 | |
| }, | |
| { | |
| "epoch": 0.593589236248516, | |
| "grad_norm": 106.8174057006836, | |
| "learning_rate": 2.2527008666517166e-07, | |
| "loss": 4.0547, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5955678670360111, | |
| "grad_norm": 108.11019897460938, | |
| "learning_rate": 2.2518764262818398e-07, | |
| "loss": 3.9728, | |
| "step": 7525 | |
| }, | |
| { | |
| "epoch": 0.5975464978235061, | |
| "grad_norm": 117.44151306152344, | |
| "learning_rate": 2.251051985911963e-07, | |
| "loss": 4.0569, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.5995251286110012, | |
| "grad_norm": 106.18008422851562, | |
| "learning_rate": 2.250227545542086e-07, | |
| "loss": 3.9042, | |
| "step": 7575 | |
| }, | |
| { | |
| "epoch": 0.6015037593984962, | |
| "grad_norm": 88.67406463623047, | |
| "learning_rate": 2.249403105172209e-07, | |
| "loss": 4.0719, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.6034823901859913, | |
| "grad_norm": 111.12770080566406, | |
| "learning_rate": 2.248578664802332e-07, | |
| "loss": 3.9749, | |
| "step": 7625 | |
| }, | |
| { | |
| "epoch": 0.6054610209734863, | |
| "grad_norm": 119.26530456542969, | |
| "learning_rate": 2.247754224432455e-07, | |
| "loss": 3.9832, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.6074396517609814, | |
| "grad_norm": 157.9289093017578, | |
| "learning_rate": 2.2469297840625782e-07, | |
| "loss": 3.9538, | |
| "step": 7675 | |
| }, | |
| { | |
| "epoch": 0.6094182825484764, | |
| "grad_norm": 122.70995330810547, | |
| "learning_rate": 2.2461053436927013e-07, | |
| "loss": 3.8497, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.6113969133359715, | |
| "grad_norm": 142.41835021972656, | |
| "learning_rate": 2.2452809033228242e-07, | |
| "loss": 3.9172, | |
| "step": 7725 | |
| }, | |
| { | |
| "epoch": 0.6133755441234665, | |
| "grad_norm": 128.31825256347656, | |
| "learning_rate": 2.2444564629529475e-07, | |
| "loss": 3.7326, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.6153541749109616, | |
| "grad_norm": 142.67408752441406, | |
| "learning_rate": 2.2436320225830704e-07, | |
| "loss": 3.8782, | |
| "step": 7775 | |
| }, | |
| { | |
| "epoch": 0.6173328056984567, | |
| "grad_norm": 145.0731658935547, | |
| "learning_rate": 2.2428075822131938e-07, | |
| "loss": 4.024, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.6193114364859518, | |
| "grad_norm": 187.09068298339844, | |
| "learning_rate": 2.2419831418433166e-07, | |
| "loss": 3.8939, | |
| "step": 7825 | |
| }, | |
| { | |
| "epoch": 0.6212900672734468, | |
| "grad_norm": 122.93965148925781, | |
| "learning_rate": 2.2411587014734397e-07, | |
| "loss": 4.0373, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.6232686980609419, | |
| "grad_norm": 152.1845245361328, | |
| "learning_rate": 2.2403342611035628e-07, | |
| "loss": 4.1168, | |
| "step": 7875 | |
| }, | |
| { | |
| "epoch": 0.6252473288484369, | |
| "grad_norm": 100.07666778564453, | |
| "learning_rate": 2.239509820733686e-07, | |
| "loss": 3.9718, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.627225959635932, | |
| "grad_norm": 130.85479736328125, | |
| "learning_rate": 2.2386853803638088e-07, | |
| "loss": 4.0301, | |
| "step": 7925 | |
| }, | |
| { | |
| "epoch": 0.629204590423427, | |
| "grad_norm": 123.073974609375, | |
| "learning_rate": 2.237860939993932e-07, | |
| "loss": 4.0104, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.6311832212109221, | |
| "grad_norm": 168.19808959960938, | |
| "learning_rate": 2.237036499624055e-07, | |
| "loss": 3.9421, | |
| "step": 7975 | |
| }, | |
| { | |
| "epoch": 0.6331618519984171, | |
| "grad_norm": 118.69593811035156, | |
| "learning_rate": 2.2362120592541782e-07, | |
| "loss": 3.8238, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.6351404827859122, | |
| "grad_norm": 192.9334259033203, | |
| "learning_rate": 2.2353876188843013e-07, | |
| "loss": 3.8227, | |
| "step": 8025 | |
| }, | |
| { | |
| "epoch": 0.6371191135734072, | |
| "grad_norm": 103.11824035644531, | |
| "learning_rate": 2.2345631785144241e-07, | |
| "loss": 3.9359, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.6390977443609023, | |
| "grad_norm": 129.3599090576172, | |
| "learning_rate": 2.2337387381445475e-07, | |
| "loss": 4.0562, | |
| "step": 8075 | |
| }, | |
| { | |
| "epoch": 0.6410763751483973, | |
| "grad_norm": 124.06795501708984, | |
| "learning_rate": 2.2329142977746704e-07, | |
| "loss": 4.1502, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.6430550059358924, | |
| "grad_norm": 113.18289184570312, | |
| "learning_rate": 2.2320898574047935e-07, | |
| "loss": 4.0059, | |
| "step": 8125 | |
| }, | |
| { | |
| "epoch": 0.6450336367233874, | |
| "grad_norm": 117.89970397949219, | |
| "learning_rate": 2.2312654170349166e-07, | |
| "loss": 4.0162, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.6470122675108825, | |
| "grad_norm": 109.6517105102539, | |
| "learning_rate": 2.2304409766650397e-07, | |
| "loss": 3.9979, | |
| "step": 8175 | |
| }, | |
| { | |
| "epoch": 0.6489908982983775, | |
| "grad_norm": 123.35499572753906, | |
| "learning_rate": 2.2296165362951628e-07, | |
| "loss": 3.9273, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.6509695290858726, | |
| "grad_norm": 141.97459411621094, | |
| "learning_rate": 2.228792095925286e-07, | |
| "loss": 4.0558, | |
| "step": 8225 | |
| }, | |
| { | |
| "epoch": 0.6529481598733676, | |
| "grad_norm": 159.06973266601562, | |
| "learning_rate": 2.2279676555554088e-07, | |
| "loss": 3.8374, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.6549267906608627, | |
| "grad_norm": 120.933837890625, | |
| "learning_rate": 2.227143215185532e-07, | |
| "loss": 3.8412, | |
| "step": 8275 | |
| }, | |
| { | |
| "epoch": 0.6569054214483577, | |
| "grad_norm": 106.266357421875, | |
| "learning_rate": 2.226318774815655e-07, | |
| "loss": 3.8757, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.6588840522358528, | |
| "grad_norm": 138.7765655517578, | |
| "learning_rate": 2.225494334445778e-07, | |
| "loss": 4.2284, | |
| "step": 8325 | |
| }, | |
| { | |
| "epoch": 0.6608626830233478, | |
| "grad_norm": 120.76045989990234, | |
| "learning_rate": 2.2246698940759013e-07, | |
| "loss": 3.9127, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.6628413138108429, | |
| "grad_norm": 117.31808471679688, | |
| "learning_rate": 2.223845453706024e-07, | |
| "loss": 3.7577, | |
| "step": 8375 | |
| }, | |
| { | |
| "epoch": 0.6648199445983379, | |
| "grad_norm": 108.21405029296875, | |
| "learning_rate": 2.2230210133361475e-07, | |
| "loss": 4.1095, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.666798575385833, | |
| "grad_norm": 126.65251159667969, | |
| "learning_rate": 2.2221965729662703e-07, | |
| "loss": 3.9047, | |
| "step": 8425 | |
| }, | |
| { | |
| "epoch": 0.668777206173328, | |
| "grad_norm": 135.06512451171875, | |
| "learning_rate": 2.2213721325963935e-07, | |
| "loss": 3.9267, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.6707558369608231, | |
| "grad_norm": 150.37025451660156, | |
| "learning_rate": 2.2205476922265166e-07, | |
| "loss": 3.9662, | |
| "step": 8475 | |
| }, | |
| { | |
| "epoch": 0.6727344677483181, | |
| "grad_norm": 138.01531982421875, | |
| "learning_rate": 2.2197232518566397e-07, | |
| "loss": 3.8107, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.6747130985358132, | |
| "grad_norm": 130.35153198242188, | |
| "learning_rate": 2.2188988114867625e-07, | |
| "loss": 3.8068, | |
| "step": 8525 | |
| }, | |
| { | |
| "epoch": 0.6766917293233082, | |
| "grad_norm": 161.9180145263672, | |
| "learning_rate": 2.218074371116886e-07, | |
| "loss": 4.0893, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.6786703601108033, | |
| "grad_norm": 165.08409118652344, | |
| "learning_rate": 2.2172499307470088e-07, | |
| "loss": 3.8419, | |
| "step": 8575 | |
| }, | |
| { | |
| "epoch": 0.6806489908982983, | |
| "grad_norm": 153.2915496826172, | |
| "learning_rate": 2.216425490377132e-07, | |
| "loss": 3.9302, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.6826276216857934, | |
| "grad_norm": 153.20138549804688, | |
| "learning_rate": 2.215601050007255e-07, | |
| "loss": 3.9947, | |
| "step": 8625 | |
| }, | |
| { | |
| "epoch": 0.6846062524732884, | |
| "grad_norm": 124.32341003417969, | |
| "learning_rate": 2.2147766096373779e-07, | |
| "loss": 3.8241, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.6865848832607835, | |
| "grad_norm": 209.813232421875, | |
| "learning_rate": 2.2139521692675012e-07, | |
| "loss": 3.917, | |
| "step": 8675 | |
| }, | |
| { | |
| "epoch": 0.6885635140482786, | |
| "grad_norm": 116.88125610351562, | |
| "learning_rate": 2.213127728897624e-07, | |
| "loss": 3.9474, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.6905421448357737, | |
| "grad_norm": 178.58721923828125, | |
| "learning_rate": 2.2123032885277472e-07, | |
| "loss": 3.9247, | |
| "step": 8725 | |
| }, | |
| { | |
| "epoch": 0.6925207756232687, | |
| "grad_norm": 123.67437744140625, | |
| "learning_rate": 2.2114788481578703e-07, | |
| "loss": 3.9548, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.6944994064107638, | |
| "grad_norm": 154.60626220703125, | |
| "learning_rate": 2.2106544077879934e-07, | |
| "loss": 3.8239, | |
| "step": 8775 | |
| }, | |
| { | |
| "epoch": 0.6964780371982588, | |
| "grad_norm": 141.65699768066406, | |
| "learning_rate": 2.2098299674181166e-07, | |
| "loss": 3.8458, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.6984566679857539, | |
| "grad_norm": 112.9280776977539, | |
| "learning_rate": 2.2090055270482397e-07, | |
| "loss": 3.9648, | |
| "step": 8825 | |
| }, | |
| { | |
| "epoch": 0.700435298773249, | |
| "grad_norm": 154.57643127441406, | |
| "learning_rate": 2.2081810866783625e-07, | |
| "loss": 4.0078, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.702413929560744, | |
| "grad_norm": 129.07418823242188, | |
| "learning_rate": 2.207356646308486e-07, | |
| "loss": 4.0276, | |
| "step": 8875 | |
| }, | |
| { | |
| "epoch": 0.704392560348239, | |
| "grad_norm": 113.59859466552734, | |
| "learning_rate": 2.2065322059386088e-07, | |
| "loss": 4.1596, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.7063711911357341, | |
| "grad_norm": 136.26283264160156, | |
| "learning_rate": 2.2057077655687316e-07, | |
| "loss": 3.948, | |
| "step": 8925 | |
| }, | |
| { | |
| "epoch": 0.7083498219232292, | |
| "grad_norm": 118.27870178222656, | |
| "learning_rate": 2.204883325198855e-07, | |
| "loss": 3.7611, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.7103284527107242, | |
| "grad_norm": 159.56643676757812, | |
| "learning_rate": 2.2040588848289778e-07, | |
| "loss": 4.0215, | |
| "step": 8975 | |
| }, | |
| { | |
| "epoch": 0.7123070834982193, | |
| "grad_norm": 125.84573364257812, | |
| "learning_rate": 2.2032344444591012e-07, | |
| "loss": 4.0046, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 148.7548065185547, | |
| "learning_rate": 2.202410004089224e-07, | |
| "loss": 4.0033, | |
| "step": 9025 | |
| }, | |
| { | |
| "epoch": 0.7162643450732094, | |
| "grad_norm": 109.41517639160156, | |
| "learning_rate": 2.2015855637193472e-07, | |
| "loss": 3.9298, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.7182429758607044, | |
| "grad_norm": 112.52848815917969, | |
| "learning_rate": 2.2007611233494703e-07, | |
| "loss": 4.1003, | |
| "step": 9075 | |
| }, | |
| { | |
| "epoch": 0.7202216066481995, | |
| "grad_norm": 114.72808074951172, | |
| "learning_rate": 2.1999366829795934e-07, | |
| "loss": 4.1112, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.7222002374356945, | |
| "grad_norm": 144.11619567871094, | |
| "learning_rate": 2.1991122426097163e-07, | |
| "loss": 3.764, | |
| "step": 9125 | |
| }, | |
| { | |
| "epoch": 0.7241788682231896, | |
| "grad_norm": 118.64055633544922, | |
| "learning_rate": 2.1982878022398396e-07, | |
| "loss": 3.9262, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.7261574990106846, | |
| "grad_norm": 166.79525756835938, | |
| "learning_rate": 2.1974633618699625e-07, | |
| "loss": 4.0518, | |
| "step": 9175 | |
| }, | |
| { | |
| "epoch": 0.7281361297981797, | |
| "grad_norm": 128.2512969970703, | |
| "learning_rate": 2.1966389215000856e-07, | |
| "loss": 3.952, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.7301147605856747, | |
| "grad_norm": 143.56414794921875, | |
| "learning_rate": 2.1958144811302087e-07, | |
| "loss": 3.7034, | |
| "step": 9225 | |
| }, | |
| { | |
| "epoch": 0.7320933913731698, | |
| "grad_norm": 120.13394165039062, | |
| "learning_rate": 2.1949900407603316e-07, | |
| "loss": 3.7839, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.7340720221606648, | |
| "grad_norm": 148.74070739746094, | |
| "learning_rate": 2.194165600390455e-07, | |
| "loss": 3.8871, | |
| "step": 9275 | |
| }, | |
| { | |
| "epoch": 0.7360506529481599, | |
| "grad_norm": 148.17022705078125, | |
| "learning_rate": 2.1933411600205778e-07, | |
| "loss": 3.7486, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.7380292837356549, | |
| "grad_norm": 112.7260513305664, | |
| "learning_rate": 2.192516719650701e-07, | |
| "loss": 3.9436, | |
| "step": 9325 | |
| }, | |
| { | |
| "epoch": 0.74000791452315, | |
| "grad_norm": 131.4718780517578, | |
| "learning_rate": 2.191692279280824e-07, | |
| "loss": 4.1101, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.741986545310645, | |
| "grad_norm": 106.73101043701172, | |
| "learning_rate": 2.1908678389109472e-07, | |
| "loss": 3.9285, | |
| "step": 9375 | |
| }, | |
| { | |
| "epoch": 0.7439651760981401, | |
| "grad_norm": 120.58040618896484, | |
| "learning_rate": 2.1900433985410703e-07, | |
| "loss": 3.8471, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.7459438068856351, | |
| "grad_norm": 135.69512939453125, | |
| "learning_rate": 2.1892189581711934e-07, | |
| "loss": 3.7629, | |
| "step": 9425 | |
| }, | |
| { | |
| "epoch": 0.7479224376731302, | |
| "grad_norm": 125.78627014160156, | |
| "learning_rate": 2.1883945178013162e-07, | |
| "loss": 4.0646, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.7499010684606252, | |
| "grad_norm": 150.2305145263672, | |
| "learning_rate": 2.1875700774314396e-07, | |
| "loss": 3.9361, | |
| "step": 9475 | |
| }, | |
| { | |
| "epoch": 0.7518796992481203, | |
| "grad_norm": 95.4436264038086, | |
| "learning_rate": 2.1867456370615625e-07, | |
| "loss": 3.7688, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.7538583300356153, | |
| "grad_norm": 141.27809143066406, | |
| "learning_rate": 2.1859211966916853e-07, | |
| "loss": 4.0217, | |
| "step": 9525 | |
| }, | |
| { | |
| "epoch": 0.7558369608231104, | |
| "grad_norm": 133.8254852294922, | |
| "learning_rate": 2.1850967563218087e-07, | |
| "loss": 4.0683, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.7578155916106054, | |
| "grad_norm": 139.919189453125, | |
| "learning_rate": 2.1842723159519316e-07, | |
| "loss": 3.9958, | |
| "step": 9575 | |
| }, | |
| { | |
| "epoch": 0.7597942223981005, | |
| "grad_norm": 173.58946228027344, | |
| "learning_rate": 2.183447875582055e-07, | |
| "loss": 3.9474, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.7617728531855956, | |
| "grad_norm": 107.07398223876953, | |
| "learning_rate": 2.1826234352121778e-07, | |
| "loss": 3.8308, | |
| "step": 9625 | |
| }, | |
| { | |
| "epoch": 0.7637514839730907, | |
| "grad_norm": 124.00753784179688, | |
| "learning_rate": 2.181798994842301e-07, | |
| "loss": 3.8218, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.7657301147605857, | |
| "grad_norm": 138.23736572265625, | |
| "learning_rate": 2.180974554472424e-07, | |
| "loss": 3.7296, | |
| "step": 9675 | |
| }, | |
| { | |
| "epoch": 0.7677087455480808, | |
| "grad_norm": 128.9496612548828, | |
| "learning_rate": 2.1801501141025471e-07, | |
| "loss": 3.9163, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.7696873763355758, | |
| "grad_norm": 108.07875061035156, | |
| "learning_rate": 2.17932567373267e-07, | |
| "loss": 3.9408, | |
| "step": 9725 | |
| }, | |
| { | |
| "epoch": 0.7716660071230709, | |
| "grad_norm": 126.18501281738281, | |
| "learning_rate": 2.1785012333627934e-07, | |
| "loss": 4.1993, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.7736446379105659, | |
| "grad_norm": 144.8102264404297, | |
| "learning_rate": 2.1776767929929162e-07, | |
| "loss": 3.877, | |
| "step": 9775 | |
| }, | |
| { | |
| "epoch": 0.775623268698061, | |
| "grad_norm": 118.8504638671875, | |
| "learning_rate": 2.1768523526230396e-07, | |
| "loss": 3.9788, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.777601899485556, | |
| "grad_norm": 127.45133209228516, | |
| "learning_rate": 2.1760279122531625e-07, | |
| "loss": 3.8987, | |
| "step": 9825 | |
| }, | |
| { | |
| "epoch": 0.7795805302730511, | |
| "grad_norm": 134.95892333984375, | |
| "learning_rate": 2.1752034718832853e-07, | |
| "loss": 3.8251, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.7815591610605461, | |
| "grad_norm": 124.00614929199219, | |
| "learning_rate": 2.1743790315134087e-07, | |
| "loss": 3.6875, | |
| "step": 9875 | |
| }, | |
| { | |
| "epoch": 0.7835377918480412, | |
| "grad_norm": 126.81105041503906, | |
| "learning_rate": 2.1735545911435315e-07, | |
| "loss": 3.7447, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.7855164226355362, | |
| "grad_norm": 106.54443359375, | |
| "learning_rate": 2.172730150773655e-07, | |
| "loss": 3.9051, | |
| "step": 9925 | |
| }, | |
| { | |
| "epoch": 0.7874950534230313, | |
| "grad_norm": 156.5098876953125, | |
| "learning_rate": 2.1719057104037778e-07, | |
| "loss": 3.9587, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "grad_norm": 128.83648681640625, | |
| "learning_rate": 2.171081270033901e-07, | |
| "loss": 4.0755, | |
| "step": 9975 | |
| }, | |
| { | |
| "epoch": 0.7914523149980214, | |
| "grad_norm": 131.54664611816406, | |
| "learning_rate": 2.170256829664024e-07, | |
| "loss": 4.0072, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.7914523149980214, | |
| "eval_loss": 3.8681728839874268, | |
| "eval_runtime": 9.5698, | |
| "eval_samples_per_second": 264.165, | |
| "eval_steps_per_second": 33.021, | |
| "step": 10000 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 75810, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 35767296000000.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |