{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1498,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0033377837116154874,
      "grad_norm": 10.97334621037521,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 1.0976,
      "step": 5
    },
    {
      "epoch": 0.006675567423230975,
      "grad_norm": 8.500612119645204,
      "learning_rate": 3e-06,
      "loss": 1.0533,
      "step": 10
    },
    {
      "epoch": 0.010013351134846462,
      "grad_norm": 4.4978733433398705,
      "learning_rate": 4.666666666666667e-06,
      "loss": 0.9037,
      "step": 15
    },
    {
      "epoch": 0.01335113484646195,
      "grad_norm": 3.8887423585372596,
      "learning_rate": 6.333333333333334e-06,
      "loss": 0.7853,
      "step": 20
    },
    {
      "epoch": 0.016688918558077435,
      "grad_norm": 3.1870823314914474,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.774,
      "step": 25
    },
    {
      "epoch": 0.020026702269692925,
      "grad_norm": 3.6334777762827684,
      "learning_rate": 9.666666666666667e-06,
      "loss": 0.7773,
      "step": 30
    },
    {
      "epoch": 0.02336448598130841,
      "grad_norm": 3.683322604774294,
      "learning_rate": 1.1333333333333334e-05,
      "loss": 0.7457,
      "step": 35
    },
    {
      "epoch": 0.0267022696929239,
      "grad_norm": 3.6697978407247605,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 0.7103,
      "step": 40
    },
    {
      "epoch": 0.030040053404539385,
      "grad_norm": 3.651275266228758,
      "learning_rate": 1.4666666666666668e-05,
      "loss": 0.7227,
      "step": 45
    },
    {
      "epoch": 0.03337783711615487,
      "grad_norm": 3.8445471248297265,
      "learning_rate": 1.6333333333333335e-05,
      "loss": 0.6611,
      "step": 50
    },
    {
      "epoch": 0.036715620827770364,
      "grad_norm": 3.5025746500731545,
      "learning_rate": 1.8e-05,
      "loss": 0.6182,
      "step": 55
    },
    {
      "epoch": 0.04005340453938585,
      "grad_norm": 3.8698798018604577,
      "learning_rate": 1.9666666666666666e-05,
      "loss": 0.6551,
      "step": 60
    },
    {
      "epoch": 0.043391188251001335,
      "grad_norm": 2.990673727539358,
      "learning_rate": 2.1333333333333335e-05,
      "loss": 0.6866,
      "step": 65
    },
    {
      "epoch": 0.04672897196261682,
      "grad_norm": 3.2866308072265213,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 0.6887,
      "step": 70
    },
    {
      "epoch": 0.050066755674232306,
      "grad_norm": 3.271721003686755,
      "learning_rate": 2.466666666666667e-05,
      "loss": 0.7327,
      "step": 75
    },
    {
      "epoch": 0.0534045393858478,
      "grad_norm": 3.0753677314727743,
      "learning_rate": 2.633333333333333e-05,
      "loss": 0.7003,
      "step": 80
    },
    {
      "epoch": 0.056742323097463285,
      "grad_norm": 2.6258441723789607,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 0.7144,
      "step": 85
    },
    {
      "epoch": 0.06008010680907877,
      "grad_norm": 2.434906123559183,
      "learning_rate": 2.9666666666666672e-05,
      "loss": 0.704,
      "step": 90
    },
    {
      "epoch": 0.06341789052069426,
      "grad_norm": 2.760855540008962,
      "learning_rate": 3.1333333333333334e-05,
      "loss": 0.7329,
      "step": 95
    },
    {
      "epoch": 0.06675567423230974,
      "grad_norm": 2.215504245536271,
      "learning_rate": 3.3e-05,
      "loss": 0.7367,
      "step": 100
    },
    {
      "epoch": 0.07009345794392523,
      "grad_norm": 2.6569341544736713,
      "learning_rate": 3.466666666666667e-05,
      "loss": 0.7713,
      "step": 105
    },
    {
      "epoch": 0.07343124165554073,
      "grad_norm": 2.6507368382475973,
      "learning_rate": 3.633333333333333e-05,
      "loss": 0.7772,
      "step": 110
    },
    {
      "epoch": 0.07676902536715621,
      "grad_norm": 2.1348618645661483,
      "learning_rate": 3.8e-05,
      "loss": 0.7712,
      "step": 115
    },
    {
      "epoch": 0.0801068090787717,
      "grad_norm": 2.264768791536783,
      "learning_rate": 3.966666666666667e-05,
      "loss": 0.7594,
      "step": 120
    },
    {
      "epoch": 0.08344459279038718,
      "grad_norm": 1.9771816174965278,
      "learning_rate": 4.133333333333333e-05,
      "loss": 0.7945,
      "step": 125
    },
    {
      "epoch": 0.08678237650200267,
      "grad_norm": 1.8584709556275458,
      "learning_rate": 4.3e-05,
      "loss": 0.7918,
      "step": 130
    },
    {
      "epoch": 0.09012016021361816,
      "grad_norm": 1.6443791716346257,
      "learning_rate": 4.466666666666667e-05,
      "loss": 0.7961,
      "step": 135
    },
    {
      "epoch": 0.09345794392523364,
      "grad_norm": 1.6647164088912758,
      "learning_rate": 4.633333333333333e-05,
      "loss": 0.8114,
      "step": 140
    },
    {
      "epoch": 0.09679572763684913,
      "grad_norm": 1.9000184800356008,
      "learning_rate": 4.8e-05,
      "loss": 0.8218,
      "step": 145
    },
    {
      "epoch": 0.10013351134846461,
      "grad_norm": 1.5342772082491383,
      "learning_rate": 4.966666666666667e-05,
      "loss": 0.8185,
      "step": 150
    },
    {
      "epoch": 0.10347129506008011,
      "grad_norm": 1.4750796923574772,
      "learning_rate": 4.9851632047477745e-05,
      "loss": 0.8223,
      "step": 155
    },
    {
      "epoch": 0.1068090787716956,
      "grad_norm": 1.6148204398582195,
      "learning_rate": 4.966617210682493e-05,
      "loss": 0.8303,
      "step": 160
    },
    {
      "epoch": 0.11014686248331108,
      "grad_norm": 1.4390159182334255,
      "learning_rate": 4.948071216617211e-05,
      "loss": 0.7991,
      "step": 165
    },
    {
      "epoch": 0.11348464619492657,
      "grad_norm": 1.2663492608255318,
      "learning_rate": 4.929525222551929e-05,
      "loss": 0.8523,
      "step": 170
    },
    {
      "epoch": 0.11682242990654206,
      "grad_norm": 1.5497168860322745,
      "learning_rate": 4.910979228486647e-05,
      "loss": 0.8384,
      "step": 175
    },
    {
      "epoch": 0.12016021361815754,
      "grad_norm": 1.3218119424151125,
      "learning_rate": 4.8924332344213654e-05,
      "loss": 0.8334,
      "step": 180
    },
    {
      "epoch": 0.12349799732977303,
      "grad_norm": 1.3314085264665803,
      "learning_rate": 4.873887240356083e-05,
      "loss": 0.8775,
      "step": 185
    },
    {
      "epoch": 0.1268357810413885,
      "grad_norm": 1.417575113114604,
      "learning_rate": 4.855341246290801e-05,
      "loss": 0.8604,
      "step": 190
    },
    {
      "epoch": 0.130173564753004,
      "grad_norm": 1.366705643461715,
      "learning_rate": 4.8367952522255196e-05,
      "loss": 0.8578,
      "step": 195
    },
    {
      "epoch": 0.13351134846461948,
      "grad_norm": 1.3085988961094133,
      "learning_rate": 4.818249258160238e-05,
      "loss": 0.8493,
      "step": 200
    },
    {
      "epoch": 0.13684913217623498,
      "grad_norm": 1.3664654247475687,
      "learning_rate": 4.7997032640949556e-05,
      "loss": 0.8739,
      "step": 205
    },
    {
      "epoch": 0.14018691588785046,
      "grad_norm": 1.2448228793197444,
      "learning_rate": 4.781157270029674e-05,
      "loss": 0.8758,
      "step": 210
    },
    {
      "epoch": 0.14352469959946595,
      "grad_norm": 1.2166707854427619,
      "learning_rate": 4.762611275964392e-05,
      "loss": 0.8621,
      "step": 215
    },
    {
      "epoch": 0.14686248331108145,
      "grad_norm": 1.2528234662317728,
      "learning_rate": 4.74406528189911e-05,
      "loss": 0.8358,
      "step": 220
    },
    {
      "epoch": 0.15020026702269693,
      "grad_norm": 1.2078820039150766,
      "learning_rate": 4.725519287833828e-05,
      "loss": 0.8527,
      "step": 225
    },
    {
      "epoch": 0.15353805073431243,
      "grad_norm": 1.1953586214155212,
      "learning_rate": 4.7069732937685464e-05,
      "loss": 0.86,
      "step": 230
    },
    {
      "epoch": 0.1568758344459279,
      "grad_norm": 1.239697699729331,
      "learning_rate": 4.688427299703264e-05,
      "loss": 0.8512,
      "step": 235
    },
    {
      "epoch": 0.1602136181575434,
      "grad_norm": 1.0943965020451794,
      "learning_rate": 4.6698813056379824e-05,
      "loss": 0.8485,
      "step": 240
    },
    {
      "epoch": 0.16355140186915887,
      "grad_norm": 1.1753697981159985,
      "learning_rate": 4.651335311572701e-05,
      "loss": 0.8574,
      "step": 245
    },
    {
      "epoch": 0.16688918558077437,
      "grad_norm": 1.2273163170418067,
      "learning_rate": 4.632789317507419e-05,
      "loss": 0.8755,
      "step": 250
    },
    {
      "epoch": 0.17022696929238984,
      "grad_norm": 1.1408298234921244,
      "learning_rate": 4.6142433234421366e-05,
      "loss": 0.8656,
      "step": 255
    },
    {
      "epoch": 0.17356475300400534,
      "grad_norm": 1.099138875783624,
      "learning_rate": 4.595697329376854e-05,
      "loss": 0.8641,
      "step": 260
    },
    {
      "epoch": 0.17690253671562084,
      "grad_norm": 1.1726455447900384,
      "learning_rate": 4.577151335311573e-05,
      "loss": 0.8428,
      "step": 265
    },
    {
      "epoch": 0.1802403204272363,
      "grad_norm": 1.1238710518885906,
      "learning_rate": 4.558605341246291e-05,
      "loss": 0.8564,
      "step": 270
    },
    {
      "epoch": 0.1835781041388518,
      "grad_norm": 1.2209709343561501,
      "learning_rate": 4.540059347181009e-05,
      "loss": 0.8841,
      "step": 275
    },
    {
      "epoch": 0.18691588785046728,
      "grad_norm": 1.0719606532900603,
      "learning_rate": 4.5215133531157275e-05,
      "loss": 0.8564,
      "step": 280
    },
    {
      "epoch": 0.19025367156208278,
      "grad_norm": 1.1632077631864237,
      "learning_rate": 4.502967359050445e-05,
      "loss": 0.8336,
      "step": 285
    },
    {
      "epoch": 0.19359145527369825,
      "grad_norm": 1.0912246571194697,
      "learning_rate": 4.4844213649851635e-05,
      "loss": 0.8386,
      "step": 290
    },
    {
      "epoch": 0.19692923898531375,
      "grad_norm": 1.182192263363281,
      "learning_rate": 4.465875370919881e-05,
      "loss": 0.8725,
      "step": 295
    },
    {
      "epoch": 0.20026702269692923,
      "grad_norm": 1.1652841802413654,
      "learning_rate": 4.4473293768546e-05,
      "loss": 0.8673,
      "step": 300
    },
    {
      "epoch": 0.20360480640854473,
      "grad_norm": 1.006954327768831,
      "learning_rate": 4.428783382789318e-05,
      "loss": 0.8428,
      "step": 305
    },
    {
      "epoch": 0.20694259012016022,
      "grad_norm": 1.043615772924013,
      "learning_rate": 4.4102373887240354e-05,
      "loss": 0.8826,
      "step": 310
    },
    {
      "epoch": 0.2102803738317757,
      "grad_norm": 1.0218351388644316,
      "learning_rate": 4.391691394658754e-05,
      "loss": 0.8604,
      "step": 315
    },
    {
      "epoch": 0.2136181575433912,
      "grad_norm": 1.093123947434233,
      "learning_rate": 4.373145400593472e-05,
      "loss": 0.8471,
      "step": 320
    },
    {
      "epoch": 0.21695594125500667,
      "grad_norm": 1.0437185361786059,
      "learning_rate": 4.35459940652819e-05,
      "loss": 0.8365,
      "step": 325
    },
    {
      "epoch": 0.22029372496662217,
      "grad_norm": 1.1728538235708217,
      "learning_rate": 4.336053412462908e-05,
      "loss": 0.8456,
      "step": 330
    },
    {
      "epoch": 0.22363150867823764,
      "grad_norm": 1.072793065927316,
      "learning_rate": 4.317507418397626e-05,
      "loss": 0.8574,
      "step": 335
    },
    {
      "epoch": 0.22696929238985314,
      "grad_norm": 0.9434393974539138,
      "learning_rate": 4.2989614243323446e-05,
      "loss": 0.8342,
      "step": 340
    },
    {
      "epoch": 0.23030707610146864,
      "grad_norm": 1.0832629194902035,
      "learning_rate": 4.280415430267062e-05,
      "loss": 0.8509,
      "step": 345
    },
    {
      "epoch": 0.2336448598130841,
      "grad_norm": 0.9763109475541231,
      "learning_rate": 4.2618694362017805e-05,
      "loss": 0.8515,
      "step": 350
    },
    {
      "epoch": 0.2369826435246996,
      "grad_norm": 0.9622690418923676,
      "learning_rate": 4.243323442136499e-05,
      "loss": 0.8451,
      "step": 355
    },
    {
      "epoch": 0.24032042723631508,
      "grad_norm": 1.0311536354503212,
      "learning_rate": 4.2247774480712165e-05,
      "loss": 0.8793,
      "step": 360
    },
    {
      "epoch": 0.24365821094793058,
      "grad_norm": 1.0616057184675154,
      "learning_rate": 4.206231454005935e-05,
      "loss": 0.8496,
      "step": 365
    },
    {
      "epoch": 0.24699599465954605,
      "grad_norm": 0.9694776849343971,
      "learning_rate": 4.187685459940653e-05,
      "loss": 0.8276,
      "step": 370
    },
    {
      "epoch": 0.25033377837116155,
      "grad_norm": 1.0669771488195197,
      "learning_rate": 4.1691394658753714e-05,
      "loss": 0.8608,
      "step": 375
    },
    {
      "epoch": 0.253671562082777,
      "grad_norm": 1.0330982860266757,
      "learning_rate": 4.150593471810089e-05,
      "loss": 0.8626,
      "step": 380
    },
    {
      "epoch": 0.2570093457943925,
      "grad_norm": 1.0894982304893939,
      "learning_rate": 4.132047477744807e-05,
      "loss": 0.8456,
      "step": 385
    },
    {
      "epoch": 0.260347129506008,
      "grad_norm": 1.0233378411857879,
      "learning_rate": 4.1135014836795256e-05,
      "loss": 0.858,
      "step": 390
    },
    {
      "epoch": 0.2636849132176235,
      "grad_norm": 0.9532755270759551,
      "learning_rate": 4.094955489614243e-05,
      "loss": 0.8325,
      "step": 395
    },
    {
      "epoch": 0.26702269692923897,
      "grad_norm": 1.012705355808147,
      "learning_rate": 4.0764094955489616e-05,
      "loss": 0.8603,
      "step": 400
    },
    {
      "epoch": 0.2703604806408545,
      "grad_norm": 1.0304787118053764,
      "learning_rate": 4.05786350148368e-05,
      "loss": 0.8481,
      "step": 405
    },
    {
      "epoch": 0.27369826435246997,
      "grad_norm": 1.0003080869140883,
      "learning_rate": 4.039317507418398e-05,
      "loss": 0.841,
      "step": 410
    },
    {
      "epoch": 0.27703604806408544,
      "grad_norm": 1.0009426776633654,
      "learning_rate": 4.020771513353116e-05,
      "loss": 0.8504,
      "step": 415
    },
    {
      "epoch": 0.2803738317757009,
      "grad_norm": 0.9644210719992499,
      "learning_rate": 4.002225519287834e-05,
      "loss": 0.8481,
      "step": 420
    },
    {
      "epoch": 0.28371161548731644,
      "grad_norm": 1.0494436578225004,
      "learning_rate": 3.9836795252225525e-05,
      "loss": 0.8354,
      "step": 425
    },
    {
      "epoch": 0.2870493991989319,
      "grad_norm": 0.8887585557590956,
      "learning_rate": 3.96513353115727e-05,
      "loss": 0.83,
      "step": 430
    },
    {
      "epoch": 0.2903871829105474,
      "grad_norm": 0.9747027542446707,
      "learning_rate": 3.9465875370919884e-05,
      "loss": 0.8307,
      "step": 435
    },
    {
      "epoch": 0.2937249666221629,
      "grad_norm": 0.9383108633240661,
      "learning_rate": 3.928041543026707e-05,
      "loss": 0.8135,
      "step": 440
    },
    {
      "epoch": 0.2970627503337784,
      "grad_norm": 1.0410990959669617,
      "learning_rate": 3.9094955489614244e-05,
      "loss": 0.8485,
      "step": 445
    },
    {
      "epoch": 0.30040053404539385,
      "grad_norm": 0.9780902917654535,
      "learning_rate": 3.890949554896143e-05,
      "loss": 0.8196,
      "step": 450
    },
    {
      "epoch": 0.3037383177570093,
      "grad_norm": 0.9442982752168075,
      "learning_rate": 3.87240356083086e-05,
      "loss": 0.808,
      "step": 455
    },
    {
      "epoch": 0.30707610146862485,
      "grad_norm": 1.0040732428090156,
      "learning_rate": 3.853857566765579e-05,
      "loss": 0.8478,
      "step": 460
    },
    {
      "epoch": 0.3104138851802403,
      "grad_norm": 0.9957954242378592,
      "learning_rate": 3.835311572700297e-05,
      "loss": 0.8243,
      "step": 465
    },
    {
      "epoch": 0.3137516688918558,
      "grad_norm": 0.9906243125042739,
      "learning_rate": 3.8167655786350146e-05,
      "loss": 0.8198,
      "step": 470
    },
    {
      "epoch": 0.3170894526034713,
      "grad_norm": 0.9245770524956366,
      "learning_rate": 3.7982195845697336e-05,
      "loss": 0.8225,
      "step": 475
    },
    {
      "epoch": 0.3204272363150868,
      "grad_norm": 0.9689880972401121,
      "learning_rate": 3.779673590504451e-05,
      "loss": 0.8081,
      "step": 480
    },
    {
      "epoch": 0.32376502002670227,
      "grad_norm": 0.9050315548245393,
      "learning_rate": 3.7611275964391695e-05,
      "loss": 0.8257,
      "step": 485
    },
    {
      "epoch": 0.32710280373831774,
      "grad_norm": 0.965523047639,
      "learning_rate": 3.742581602373887e-05,
      "loss": 0.8351,
      "step": 490
    },
    {
      "epoch": 0.33044058744993327,
      "grad_norm": 1.0138619406917988,
      "learning_rate": 3.7240356083086054e-05,
      "loss": 0.8297,
      "step": 495
    },
    {
      "epoch": 0.33377837116154874,
      "grad_norm": 0.9226993749632075,
      "learning_rate": 3.705489614243324e-05,
      "loss": 0.822,
      "step": 500
    },
    {
      "epoch": 0.3371161548731642,
      "grad_norm": 0.932421956217793,
      "learning_rate": 3.6869436201780414e-05,
      "loss": 0.8131,
      "step": 505
    },
    {
      "epoch": 0.3404539385847797,
      "grad_norm": 0.8929781381117078,
      "learning_rate": 3.6683976261127604e-05,
      "loss": 0.8144,
      "step": 510
    },
    {
      "epoch": 0.3437917222963952,
      "grad_norm": 0.922998838382151,
      "learning_rate": 3.649851632047478e-05,
      "loss": 0.8624,
      "step": 515
    },
    {
      "epoch": 0.3471295060080107,
      "grad_norm": 0.93469331823402,
      "learning_rate": 3.6313056379821956e-05,
      "loss": 0.8259,
      "step": 520
    },
    {
      "epoch": 0.35046728971962615,
      "grad_norm": 1.0259483980065804,
      "learning_rate": 3.612759643916914e-05,
      "loss": 0.85,
      "step": 525
    },
    {
      "epoch": 0.3538050734312417,
      "grad_norm": 0.9757464996869175,
      "learning_rate": 3.594213649851632e-05,
      "loss": 0.8322,
      "step": 530
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 1.0326434851152306,
      "learning_rate": 3.5756676557863506e-05,
      "loss": 0.8152,
      "step": 535
    },
    {
      "epoch": 0.3604806408544726,
      "grad_norm": 0.8969653625459288,
      "learning_rate": 3.557121661721068e-05,
      "loss": 0.8342,
      "step": 540
    },
    {
      "epoch": 0.3638184245660881,
      "grad_norm": 0.9685090624506036,
      "learning_rate": 3.5385756676557865e-05,
      "loss": 0.8454,
      "step": 545
    },
    {
      "epoch": 0.3671562082777036,
      "grad_norm": 0.9464787707831517,
      "learning_rate": 3.520029673590505e-05,
      "loss": 0.8144,
      "step": 550
    },
    {
      "epoch": 0.3704939919893191,
      "grad_norm": 0.9715120332083621,
      "learning_rate": 3.5014836795252225e-05,
      "loss": 0.8075,
      "step": 555
    },
    {
      "epoch": 0.37383177570093457,
      "grad_norm": 1.0181655310980833,
      "learning_rate": 3.482937685459941e-05,
      "loss": 0.8433,
      "step": 560
    },
    {
      "epoch": 0.3771695594125501,
      "grad_norm": 1.0013772700433445,
      "learning_rate": 3.464391691394659e-05,
      "loss": 0.8253,
      "step": 565
    },
    {
      "epoch": 0.38050734312416556,
      "grad_norm": 0.9482215787610626,
      "learning_rate": 3.445845697329377e-05,
      "loss": 0.8195,
      "step": 570
    },
    {
      "epoch": 0.38384512683578104,
      "grad_norm": 1.0461820886125337,
      "learning_rate": 3.427299703264095e-05,
      "loss": 0.831,
      "step": 575
    },
    {
      "epoch": 0.3871829105473965,
      "grad_norm": 0.9198239920283778,
      "learning_rate": 3.4087537091988134e-05,
      "loss": 0.8152,
      "step": 580
    },
    {
      "epoch": 0.39052069425901204,
      "grad_norm": 0.9782331163351092,
      "learning_rate": 3.390207715133532e-05,
      "loss": 0.8154,
      "step": 585
    },
    {
      "epoch": 0.3938584779706275,
      "grad_norm": 0.9186397229393198,
      "learning_rate": 3.371661721068249e-05,
      "loss": 0.8153,
      "step": 590
    },
    {
      "epoch": 0.397196261682243,
      "grad_norm": 0.9337443617921134,
      "learning_rate": 3.3531157270029676e-05,
      "loss": 0.8233,
      "step": 595
    },
    {
      "epoch": 0.40053404539385845,
      "grad_norm": 0.9434322651580768,
      "learning_rate": 3.334569732937686e-05,
      "loss": 0.8345,
      "step": 600
    },
    {
      "epoch": 0.403871829105474,
      "grad_norm": 1.0512846063850414,
      "learning_rate": 3.3160237388724036e-05,
      "loss": 0.8173,
      "step": 605
    },
    {
      "epoch": 0.40720961281708945,
      "grad_norm": 0.9350959223867034,
      "learning_rate": 3.297477744807122e-05,
      "loss": 0.8372,
      "step": 610
    },
    {
      "epoch": 0.4105473965287049,
      "grad_norm": 0.9197353611822743,
      "learning_rate": 3.27893175074184e-05,
      "loss": 0.8215,
      "step": 615
    },
    {
      "epoch": 0.41388518024032045,
      "grad_norm": 0.8518070420704498,
      "learning_rate": 3.260385756676558e-05,
      "loss": 0.7951,
      "step": 620
    },
    {
      "epoch": 0.4172229639519359,
      "grad_norm": 0.9858909592901012,
      "learning_rate": 3.241839762611276e-05,
      "loss": 0.8035,
      "step": 625
    },
    {
      "epoch": 0.4205607476635514,
      "grad_norm": 1.071276470614738,
      "learning_rate": 3.223293768545994e-05,
      "loss": 0.8121,
      "step": 630
    },
    {
      "epoch": 0.42389853137516686,
      "grad_norm": 0.9226349965551451,
      "learning_rate": 3.204747774480713e-05,
      "loss": 0.8024,
      "step": 635
    },
    {
      "epoch": 0.4272363150867824,
      "grad_norm": 0.9911335494782234,
      "learning_rate": 3.1862017804154304e-05,
      "loss": 0.7998,
      "step": 640
    },
    {
      "epoch": 0.43057409879839786,
      "grad_norm": 0.857226373613729,
      "learning_rate": 3.167655786350148e-05,
      "loss": 0.7985,
      "step": 645
    },
    {
      "epoch": 0.43391188251001334,
      "grad_norm": 0.9012240464805917,
      "learning_rate": 3.149109792284867e-05,
      "loss": 0.8109,
      "step": 650
    },
    {
      "epoch": 0.43724966622162886,
      "grad_norm": 0.9124837740946565,
      "learning_rate": 3.1305637982195846e-05,
      "loss": 0.8015,
      "step": 655
    },
    {
      "epoch": 0.44058744993324434,
      "grad_norm": 0.9907278141668688,
      "learning_rate": 3.112017804154303e-05,
      "loss": 0.8102,
      "step": 660
    },
    {
      "epoch": 0.4439252336448598,
      "grad_norm": 0.9447867252541866,
      "learning_rate": 3.0934718100890206e-05,
      "loss": 0.8308,
      "step": 665
    },
    {
      "epoch": 0.4472630173564753,
      "grad_norm": 0.9514834392779774,
      "learning_rate": 3.074925816023739e-05,
      "loss": 0.8108,
      "step": 670
    },
    {
      "epoch": 0.4506008010680908,
      "grad_norm": 0.8898801356986638,
      "learning_rate": 3.056379821958457e-05,
      "loss": 0.7952,
      "step": 675
    },
    {
      "epoch": 0.4539385847797063,
      "grad_norm": 0.9121421167479317,
      "learning_rate": 3.0378338278931752e-05,
      "loss": 0.7766,
      "step": 680
    },
    {
      "epoch": 0.45727636849132175,
      "grad_norm": 0.9206580766916015,
      "learning_rate": 3.0192878338278935e-05,
      "loss": 0.7976,
      "step": 685
    },
    {
      "epoch": 0.4606141522029373,
      "grad_norm": 0.8875003615985043,
      "learning_rate": 3.0007418397626115e-05,
      "loss": 0.7965,
      "step": 690
    },
    {
      "epoch": 0.46395193591455275,
      "grad_norm": 1.0057518919464419,
      "learning_rate": 2.9821958456973298e-05,
      "loss": 0.7745,
      "step": 695
    },
    {
      "epoch": 0.4672897196261682,
      "grad_norm": 0.9890615318492613,
      "learning_rate": 2.9636498516320477e-05,
      "loss": 0.8021,
      "step": 700
    },
    {
      "epoch": 0.4706275033377837,
      "grad_norm": 0.9958806046763854,
      "learning_rate": 2.9451038575667654e-05,
      "loss": 0.7948,
      "step": 705
    },
    {
      "epoch": 0.4739652870493992,
      "grad_norm": 0.9427315789994021,
      "learning_rate": 2.926557863501484e-05,
      "loss": 0.7995,
      "step": 710
    },
    {
      "epoch": 0.4773030707610147,
      "grad_norm": 0.9054405934645217,
      "learning_rate": 2.908011869436202e-05,
      "loss": 0.7941,
      "step": 715
    },
    {
      "epoch": 0.48064085447263016,
      "grad_norm": 0.9373946350999136,
      "learning_rate": 2.8894658753709203e-05,
      "loss": 0.8178,
      "step": 720
    },
    {
      "epoch": 0.48397863818424564,
      "grad_norm": 0.9693149989780067,
      "learning_rate": 2.8709198813056383e-05,
      "loss": 0.8044,
      "step": 725
    },
    {
      "epoch": 0.48731642189586116,
      "grad_norm": 0.8575646876326481,
      "learning_rate": 2.852373887240356e-05,
      "loss": 0.789,
      "step": 730
    },
    {
      "epoch": 0.49065420560747663,
      "grad_norm": 0.986208895766066,
      "learning_rate": 2.8338278931750746e-05,
      "loss": 0.8155,
      "step": 735
    },
    {
      "epoch": 0.4939919893190921,
      "grad_norm": 0.9731059895113887,
      "learning_rate": 2.8152818991097922e-05,
      "loss": 0.8092,
      "step": 740
    },
    {
      "epoch": 0.49732977303070763,
      "grad_norm": 1.0244020266746006,
      "learning_rate": 2.796735905044511e-05,
      "loss": 0.8086,
      "step": 745
    },
    {
      "epoch": 0.5006675567423231,
      "grad_norm": 0.8755711588079153,
      "learning_rate": 2.7781899109792285e-05,
      "loss": 0.7978,
      "step": 750
    },
    {
      "epoch": 0.5040053404539386,
      "grad_norm": 0.8857975744161016,
      "learning_rate": 2.7596439169139465e-05,
      "loss": 0.7945,
      "step": 755
    },
    {
      "epoch": 0.507343124165554,
      "grad_norm": 1.0437729771562312,
      "learning_rate": 2.741097922848665e-05,
      "loss": 0.8,
      "step": 760
    },
    {
      "epoch": 0.5106809078771696,
      "grad_norm": 0.8938452774095116,
      "learning_rate": 2.7225519287833828e-05,
      "loss": 0.7919,
      "step": 765
    },
    {
      "epoch": 0.514018691588785,
      "grad_norm": 0.9507910351515566,
      "learning_rate": 2.7040059347181014e-05,
      "loss": 0.811,
      "step": 770
    },
    {
      "epoch": 0.5173564753004005,
      "grad_norm": 0.9446476391640979,
      "learning_rate": 2.685459940652819e-05,
      "loss": 0.812,
      "step": 775
    },
    {
      "epoch": 0.520694259012016,
      "grad_norm": 0.8939815937066375,
      "learning_rate": 2.666913946587537e-05,
      "loss": 0.7968,
      "step": 780
    },
    {
      "epoch": 0.5240320427236315,
      "grad_norm": 0.9078803372635648,
      "learning_rate": 2.6483679525222553e-05,
      "loss": 0.816,
      "step": 785
    },
    {
      "epoch": 0.527369826435247,
      "grad_norm": 0.8194365993553059,
      "learning_rate": 2.6298219584569733e-05,
      "loss": 0.7861,
      "step": 790
    },
    {
      "epoch": 0.5307076101468625,
      "grad_norm": 0.8408501195751673,
      "learning_rate": 2.6112759643916916e-05,
      "loss": 0.7743,
      "step": 795
    },
    {
      "epoch": 0.5340453938584779,
      "grad_norm": 0.8531884984260966,
      "learning_rate": 2.5927299703264096e-05,
      "loss": 0.8047,
      "step": 800
    },
    {
      "epoch": 0.5373831775700935,
      "grad_norm": 0.8791442187279712,
      "learning_rate": 2.5741839762611276e-05,
      "loss": 0.7854,
      "step": 805
    },
    {
      "epoch": 0.540720961281709,
      "grad_norm": 0.9753162027608423,
      "learning_rate": 2.555637982195846e-05,
      "loss": 0.8098,
      "step": 810
    },
    {
      "epoch": 0.5440587449933244,
      "grad_norm": 0.8625073065074,
      "learning_rate": 2.537091988130564e-05,
      "loss": 0.7967,
      "step": 815
    },
    {
      "epoch": 0.5473965287049399,
      "grad_norm": 0.9742653975215142,
      "learning_rate": 2.518545994065282e-05,
      "loss": 0.7768,
      "step": 820
    },
    {
      "epoch": 0.5507343124165555,
      "grad_norm": 0.8892913090889087,
      "learning_rate": 2.5e-05,
      "loss": 0.7603,
      "step": 825
    },
    {
      "epoch": 0.5540720961281709,
      "grad_norm": 0.8685382218468735,
      "learning_rate": 2.4814540059347184e-05,
      "loss": 0.8061,
      "step": 830
    },
    {
      "epoch": 0.5574098798397864,
      "grad_norm": 0.8609646194613518,
      "learning_rate": 2.4629080118694364e-05,
      "loss": 0.8238,
      "step": 835
    },
    {
      "epoch": 0.5607476635514018,
      "grad_norm": 0.9013458211054559,
      "learning_rate": 2.4443620178041544e-05,
      "loss": 0.7874,
      "step": 840
    },
    {
      "epoch": 0.5640854472630173,
      "grad_norm": 0.9448632241585405,
      "learning_rate": 2.4258160237388723e-05,
      "loss": 0.777,
      "step": 845
    },
    {
      "epoch": 0.5674232309746329,
      "grad_norm": 0.8652639715383189,
      "learning_rate": 2.4072700296735907e-05,
      "loss": 0.7781,
      "step": 850
    },
    {
      "epoch": 0.5707610146862483,
      "grad_norm": 0.885349438416903,
      "learning_rate": 2.3887240356083086e-05,
      "loss": 0.7852,
      "step": 855
    },
    {
      "epoch": 0.5740987983978638,
      "grad_norm": 0.9226549883190552,
      "learning_rate": 2.370178041543027e-05,
      "loss": 0.7937,
      "step": 860
    },
    {
      "epoch": 0.5774365821094793,
      "grad_norm": 0.8894377701419424,
      "learning_rate": 2.351632047477745e-05,
      "loss": 0.7858,
      "step": 865
    },
    {
      "epoch": 0.5807743658210948,
      "grad_norm": 0.9417967618419559,
      "learning_rate": 2.333086053412463e-05,
      "loss": 0.7721,
      "step": 870
    },
    {
      "epoch": 0.5841121495327103,
      "grad_norm": 0.9222488011231172,
      "learning_rate": 2.3145400593471812e-05,
      "loss": 0.7511,
      "step": 875
    },
    {
      "epoch": 0.5874499332443258,
      "grad_norm": 0.8993397519730585,
      "learning_rate": 2.2959940652818992e-05,
      "loss": 0.7827,
      "step": 880
    },
    {
      "epoch": 0.5907877169559412,
      "grad_norm": 0.9136313639945539,
      "learning_rate": 2.2774480712166175e-05,
      "loss": 0.7714,
      "step": 885
    },
    {
      "epoch": 0.5941255006675568,
      "grad_norm": 0.9199729414745823,
      "learning_rate": 2.258902077151335e-05,
      "loss": 0.7761,
      "step": 890
    },
    {
      "epoch": 0.5974632843791722,
      "grad_norm": 0.8409864114208272,
      "learning_rate": 2.2403560830860534e-05,
      "loss": 0.7758,
      "step": 895
    },
    {
      "epoch": 0.6008010680907877,
      "grad_norm": 0.8927435513620092,
      "learning_rate": 2.2218100890207717e-05,
      "loss": 0.8088,
      "step": 900
    },
    {
      "epoch": 0.6041388518024032,
      "grad_norm": 1.0111242127600466,
      "learning_rate": 2.2032640949554897e-05,
      "loss": 0.787,
      "step": 905
    },
    {
      "epoch": 0.6074766355140186,
      "grad_norm": 0.9750007023233266,
      "learning_rate": 2.184718100890208e-05,
      "loss": 0.7951,
      "step": 910
    },
    {
      "epoch": 0.6108144192256342,
      "grad_norm": 0.8947817876635858,
      "learning_rate": 2.166172106824926e-05,
      "loss": 0.7718,
      "step": 915
    },
    {
      "epoch": 0.6141522029372497,
      "grad_norm": 0.8937079235831037,
      "learning_rate": 2.147626112759644e-05,
      "loss": 0.7653,
      "step": 920
    },
    {
      "epoch": 0.6174899866488651,
      "grad_norm": 0.9633412553738314,
      "learning_rate": 2.129080118694362e-05,
      "loss": 0.7588,
      "step": 925
    },
    {
      "epoch": 0.6208277703604806,
      "grad_norm": 0.9697623965265878,
      "learning_rate": 2.1105341246290803e-05,
      "loss": 0.7743,
      "step": 930
    },
    {
      "epoch": 0.6241655540720962,
      "grad_norm": 0.9170590153248661,
      "learning_rate": 2.0919881305637982e-05,
      "loss": 0.7939,
      "step": 935
    },
    {
      "epoch": 0.6275033377837116,
      "grad_norm": 0.8884986167305851,
      "learning_rate": 2.0734421364985165e-05,
      "loss": 0.7609,
      "step": 940
    },
    {
      "epoch": 0.6308411214953271,
      "grad_norm": 0.9117598511296207,
      "learning_rate": 2.0548961424332345e-05,
      "loss": 0.7816,
      "step": 945
    },
    {
      "epoch": 0.6341789052069426,
      "grad_norm": 0.9143844046049939,
      "learning_rate": 2.0363501483679525e-05,
      "loss": 0.7682,
      "step": 950
    },
    {
      "epoch": 0.6375166889185581,
      "grad_norm": 1.001882223179808,
      "learning_rate": 2.0178041543026708e-05,
      "loss": 0.7859,
      "step": 955
    },
    {
      "epoch": 0.6408544726301736,
      "grad_norm": 0.9515527243629021,
      "learning_rate": 1.9992581602373888e-05,
      "loss": 0.771,
      "step": 960
    },
    {
      "epoch": 0.644192256341789,
      "grad_norm": 0.9987405390627165,
      "learning_rate": 1.980712166172107e-05,
      "loss": 0.7882,
      "step": 965
    },
    {
      "epoch": 0.6475300400534045,
      "grad_norm": 0.902559098376266,
      "learning_rate": 1.962166172106825e-05,
      "loss": 0.7865,
      "step": 970
    },
    {
      "epoch": 0.6508678237650201,
      "grad_norm": 0.9530475037552353,
      "learning_rate": 1.943620178041543e-05,
      "loss": 0.7651,
      "step": 975
    },
    {
      "epoch": 0.6542056074766355,
      "grad_norm": 0.9044360545717226,
      "learning_rate": 1.9250741839762613e-05,
      "loss": 0.7611,
      "step": 980
    },
    {
      "epoch": 0.657543391188251,
      "grad_norm": 0.8949422084811579,
      "learning_rate": 1.9065281899109793e-05,
      "loss": 0.7535,
      "step": 985
    },
    {
      "epoch": 0.6608811748998665,
      "grad_norm": 0.9212385076203463,
      "learning_rate": 1.8879821958456976e-05,
      "loss": 0.7828,
      "step": 990
    },
    {
      "epoch": 0.664218958611482,
      "grad_norm": 0.9685512993064703,
      "learning_rate": 1.8694362017804153e-05,
      "loss": 0.7598,
      "step": 995
    },
    {
      "epoch": 0.6675567423230975,
      "grad_norm": 0.8371458023739065,
      "learning_rate": 1.8508902077151336e-05,
      "loss": 0.7593,
      "step": 1000
    },
    {
      "epoch": 0.670894526034713,
      "grad_norm": 0.9561174634421302,
      "learning_rate": 1.8323442136498515e-05,
      "loss": 0.7918,
      "step": 1005
    },
    {
      "epoch": 0.6742323097463284,
      "grad_norm": 0.988014946142732,
      "learning_rate": 1.81379821958457e-05,
      "loss": 0.7752,
      "step": 1010
    },
    {
      "epoch": 0.677570093457944,
      "grad_norm": 0.9594260489502082,
      "learning_rate": 1.795252225519288e-05,
      "loss": 0.7555,
      "step": 1015
    },
    {
      "epoch": 0.6809078771695594,
      "grad_norm": 0.9047734227550646,
      "learning_rate": 1.7767062314540058e-05,
      "loss": 0.7616,
      "step": 1020
    },
    {
      "epoch": 0.6842456608811749,
      "grad_norm": 0.8663553522012484,
      "learning_rate": 1.758160237388724e-05,
      "loss": 0.7796,
      "step": 1025
    },
    {
      "epoch": 0.6875834445927904,
      "grad_norm": 0.921325208555024,
      "learning_rate": 1.739614243323442e-05,
      "loss": 0.7641,
      "step": 1030
    },
    {
      "epoch": 0.6909212283044058,
      "grad_norm": 0.8876679055429972,
      "learning_rate": 1.7210682492581604e-05,
      "loss": 0.7676,
      "step": 1035
    },
    {
      "epoch": 0.6942590120160214,
      "grad_norm": 0.8958203531086095,
      "learning_rate": 1.7025222551928784e-05,
      "loss": 0.7444,
      "step": 1040
    },
    {
      "epoch": 0.6975967957276369,
      "grad_norm": 0.8697101652022063,
      "learning_rate": 1.6839762611275967e-05,
      "loss": 0.7994,
      "step": 1045
    },
    {
      "epoch": 0.7009345794392523,
      "grad_norm": 1.088915969712606,
      "learning_rate": 1.6654302670623147e-05,
      "loss": 0.7664,
      "step": 1050
    },
    {
      "epoch": 0.7042723631508678,
      "grad_norm": 0.9166954419190961,
      "learning_rate": 1.6468842729970326e-05,
      "loss": 0.7576,
      "step": 1055
    },
    {
      "epoch": 0.7076101468624834,
      "grad_norm": 1.0111723788145828,
      "learning_rate": 1.628338278931751e-05,
      "loss": 0.7437,
      "step": 1060
    },
    {
      "epoch": 0.7109479305740988,
      "grad_norm": 0.8935454904692272,
      "learning_rate": 1.609792284866469e-05,
      "loss": 0.7651,
      "step": 1065
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.9796810043507851,
      "learning_rate": 1.5912462908011872e-05,
      "loss": 0.7453,
      "step": 1070
    },
    {
      "epoch": 0.7176234979973297,
      "grad_norm": 0.9497605242721375,
      "learning_rate": 1.572700296735905e-05,
      "loss": 0.7518,
      "step": 1075
    },
    {
      "epoch": 0.7209612817089452,
      "grad_norm": 0.9261889971154708,
      "learning_rate": 1.5541543026706232e-05,
      "loss": 0.7509,
      "step": 1080
    },
    {
      "epoch": 0.7242990654205608,
      "grad_norm": 0.9221518501402585,
      "learning_rate": 1.5356083086053415e-05,
      "loss": 0.7383,
      "step": 1085
    },
    {
      "epoch": 0.7276368491321762,
      "grad_norm": 0.9905384423101319,
      "learning_rate": 1.5170623145400595e-05,
      "loss": 0.7651,
      "step": 1090
    },
    {
      "epoch": 0.7309746328437917,
      "grad_norm": 0.9591889790233724,
      "learning_rate": 1.4985163204747776e-05,
      "loss": 0.7631,
      "step": 1095
    },
    {
      "epoch": 0.7343124165554072,
      "grad_norm": 0.9087058012685387,
      "learning_rate": 1.4799703264094956e-05,
      "loss": 0.7535,
      "step": 1100
    },
    {
      "epoch": 0.7376502002670227,
      "grad_norm": 0.8692391342259392,
      "learning_rate": 1.4614243323442137e-05,
      "loss": 0.7374,
      "step": 1105
    },
    {
      "epoch": 0.7409879839786382,
      "grad_norm": 0.9298588940493733,
      "learning_rate": 1.4428783382789319e-05,
      "loss": 0.7423,
      "step": 1110
    },
    {
      "epoch": 0.7443257676902537,
      "grad_norm": 0.9381934382110304,
      "learning_rate": 1.42433234421365e-05,
      "loss": 0.7556,
      "step": 1115
    },
    {
      "epoch": 0.7476635514018691,
      "grad_norm": 0.9063091001621449,
      "learning_rate": 1.4057863501483681e-05,
      "loss": 0.742,
      "step": 1120
    },
    {
      "epoch": 0.7510013351134847,
      "grad_norm": 0.9344982528376456,
      "learning_rate": 1.387240356083086e-05,
      "loss": 0.7554,
      "step": 1125
    },
    {
      "epoch": 0.7543391188251002,
      "grad_norm": 0.9193075340785176,
      "learning_rate": 1.3686943620178041e-05,
      "loss": 0.7673,
      "step": 1130
    },
    {
      "epoch": 0.7576769025367156,
      "grad_norm": 0.9336576279968588,
      "learning_rate": 1.3501483679525222e-05,
      "loss": 0.7023,
      "step": 1135
    },
    {
      "epoch": 0.7610146862483311,
      "grad_norm": 0.9768530487657828,
      "learning_rate": 1.3316023738872405e-05,
      "loss": 0.7782,
      "step": 1140
    },
    {
      "epoch": 0.7643524699599465,
      "grad_norm": 0.9679956607339216,
      "learning_rate": 1.3130563798219587e-05,
      "loss": 0.763,
      "step": 1145
    },
    {
      "epoch": 0.7676902536715621,
      "grad_norm": 0.869544797745516,
      "learning_rate": 1.2945103857566765e-05,
      "loss": 0.7654,
      "step": 1150
    },
    {
      "epoch": 0.7710280373831776,
      "grad_norm": 0.985241850392532,
      "learning_rate": 1.2759643916913946e-05,
      "loss": 0.7699,
      "step": 1155
    },
    {
      "epoch": 0.774365821094793,
      "grad_norm": 0.9549632229431664,
      "learning_rate": 1.2574183976261128e-05,
      "loss": 0.7446,
      "step": 1160
    },
    {
      "epoch": 0.7777036048064085,
      "grad_norm": 0.9575556607619793,
      "learning_rate": 1.2388724035608309e-05,
      "loss": 0.761,
      "step": 1165
    },
    {
      "epoch": 0.7810413885180241,
      "grad_norm": 0.9994928229391189,
      "learning_rate": 1.2203264094955489e-05,
      "loss": 0.7493,
      "step": 1170
    },
    {
      "epoch": 0.7843791722296395,
      "grad_norm": 0.9965021892630818,
      "learning_rate": 1.2017804154302672e-05,
      "loss": 0.7602,
      "step": 1175
    },
    {
      "epoch": 0.787716955941255,
      "grad_norm": 0.9615733998987331,
      "learning_rate": 1.1832344213649853e-05,
      "loss": 0.7575,
      "step": 1180
    },
    {
      "epoch": 0.7910547396528705,
      "grad_norm": 0.995773743198864,
      "learning_rate": 1.1646884272997033e-05,
      "loss": 0.7411,
      "step": 1185
    },
    {
      "epoch": 0.794392523364486,
      "grad_norm": 0.8979251525490485,
      "learning_rate": 1.1461424332344215e-05,
      "loss": 0.7396,
      "step": 1190
    },
    {
      "epoch": 0.7977303070761015,
      "grad_norm": 0.8587947464461109,
      "learning_rate": 1.1275964391691394e-05,
      "loss": 0.7534,
      "step": 1195
    },
    {
      "epoch": 0.8010680907877169,
      "grad_norm": 0.9219949726663529,
      "learning_rate": 1.1090504451038576e-05,
      "loss": 0.7472,
      "step": 1200
    },
    {
      "epoch": 0.8044058744993324,
      "grad_norm": 0.9232814244897973,
      "learning_rate": 1.0905044510385757e-05,
      "loss": 0.7361,
      "step": 1205
    },
    {
      "epoch": 0.807743658210948,
      "grad_norm": 0.9036693012652329,
      "learning_rate": 1.0719584569732939e-05,
      "loss": 0.7714,
      "step": 1210
    },
    {
      "epoch": 0.8110814419225634,
      "grad_norm": 0.8986092027119317,
      "learning_rate": 1.053412462908012e-05,
      "loss": 0.7291,
      "step": 1215
    },
    {
      "epoch": 0.8144192256341789,
      "grad_norm": 0.9600751192343961,
      "learning_rate": 1.0348664688427301e-05,
      "loss": 0.7406,
      "step": 1220
    },
    {
      "epoch": 0.8177570093457944,
      "grad_norm": 0.9765858612379583,
      "learning_rate": 1.0163204747774481e-05,
      "loss": 0.744,
      "step": 1225
    },
    {
      "epoch": 0.8210947930574098,
      "grad_norm": 0.9436664489477504,
      "learning_rate": 9.977744807121663e-06,
      "loss": 0.7537,
      "step": 1230
    },
    {
      "epoch": 0.8244325767690254,
      "grad_norm": 0.9348268117808438,
      "learning_rate": 9.792284866468842e-06,
      "loss": 0.7466,
      "step": 1235
    },
    {
      "epoch": 0.8277703604806409,
      "grad_norm": 0.8839373635727904,
      "learning_rate": 9.606824925816024e-06,
      "loss": 0.7225,
      "step": 1240
    },
    {
      "epoch": 0.8311081441922563,
      "grad_norm": 0.9350825333412003,
      "learning_rate": 9.421364985163205e-06,
      "loss": 0.7535,
      "step": 1245
    },
    {
      "epoch": 0.8344459279038718,
      "grad_norm": 0.9170815856659084,
      "learning_rate": 9.235905044510387e-06,
      "loss": 0.7129,
      "step": 1250
    },
    {
      "epoch": 0.8377837116154874,
      "grad_norm": 0.8931728840330895,
      "learning_rate": 9.050445103857568e-06,
      "loss": 0.7312,
      "step": 1255
    },
    {
      "epoch": 0.8411214953271028,
      "grad_norm": 0.947606427769052,
      "learning_rate": 8.864985163204748e-06,
      "loss": 0.7198,
      "step": 1260
    },
    {
      "epoch": 0.8444592790387183,
      "grad_norm": 0.9867265496764112,
      "learning_rate": 8.679525222551929e-06,
      "loss": 0.7627,
      "step": 1265
    },
    {
      "epoch": 0.8477970627503337,
      "grad_norm": 1.0256443772674286,
      "learning_rate": 8.49406528189911e-06,
      "loss": 0.7466,
      "step": 1270
    },
    {
      "epoch": 0.8511348464619493,
      "grad_norm": 1.0226735020250939,
      "learning_rate": 8.30860534124629e-06,
      "loss": 0.7524,
      "step": 1275
    },
    {
      "epoch": 0.8544726301735648,
      "grad_norm": 0.918655948279863,
      "learning_rate": 8.123145400593472e-06,
      "loss": 0.7395,
      "step": 1280
    },
    {
      "epoch": 0.8578104138851802,
      "grad_norm": 0.9337021354955276,
      "learning_rate": 7.937685459940653e-06,
      "loss": 0.7311,
      "step": 1285
    },
    {
      "epoch": 0.8611481975967957,
      "grad_norm": 1.0290658689780736,
      "learning_rate": 7.752225519287835e-06,
      "loss": 0.7533,
      "step": 1290
    },
    {
      "epoch": 0.8644859813084113,
      "grad_norm": 0.9244023648001366,
      "learning_rate": 7.566765578635016e-06,
      "loss": 0.7366,
      "step": 1295
    },
    {
      "epoch": 0.8678237650200267,
      "grad_norm": 0.9537206897694197,
      "learning_rate": 7.381305637982196e-06,
      "loss": 0.7399,
      "step": 1300
    },
    {
      "epoch": 0.8711615487316422,
      "grad_norm": 0.8743472670723075,
      "learning_rate": 7.195845697329377e-06,
      "loss": 0.7296,
      "step": 1305
    },
    {
      "epoch": 0.8744993324432577,
      "grad_norm": 0.8934119128550586,
      "learning_rate": 7.0103857566765585e-06,
      "loss": 0.7416,
      "step": 1310
    },
    {
      "epoch": 0.8778371161548731,
      "grad_norm": 0.9618800479933417,
      "learning_rate": 6.824925816023739e-06,
      "loss": 0.7322,
      "step": 1315
    },
    {
      "epoch": 0.8811748998664887,
      "grad_norm": 0.830157637044439,
      "learning_rate": 6.6394658753709205e-06,
      "loss": 0.7292,
      "step": 1320
    },
    {
      "epoch": 0.8845126835781041,
      "grad_norm": 0.9363830426057013,
      "learning_rate": 6.4540059347181e-06,
      "loss": 0.734,
      "step": 1325
    },
    {
      "epoch": 0.8878504672897196,
      "grad_norm": 0.9442444285959836,
      "learning_rate": 6.2685459940652825e-06,
      "loss": 0.7297,
      "step": 1330
    },
    {
      "epoch": 0.8911882510013351,
      "grad_norm": 0.9348111437754156,
      "learning_rate": 6.083086053412463e-06,
      "loss": 0.7382,
      "step": 1335
    },
    {
      "epoch": 0.8945260347129506,
      "grad_norm": 0.894299426623852,
      "learning_rate": 5.8976261127596445e-06,
      "loss": 0.7125,
      "step": 1340
    },
    {
      "epoch": 0.8978638184245661,
      "grad_norm": 0.9736360020611752,
      "learning_rate": 5.712166172106825e-06,
      "loss": 0.7277,
      "step": 1345
    },
    {
      "epoch": 0.9012016021361816,
      "grad_norm": 0.9591002259783217,
      "learning_rate": 5.5267062314540065e-06,
      "loss": 0.7258,
      "step": 1350
    },
    {
      "epoch": 0.904539385847797,
      "grad_norm": 0.9710736921612175,
      "learning_rate": 5.341246290801187e-06,
      "loss": 0.7381,
      "step": 1355
    },
    {
      "epoch": 0.9078771695594126,
      "grad_norm": 0.8730608837945383,
      "learning_rate": 5.155786350148368e-06,
      "loss": 0.7251,
      "step": 1360
    },
    {
      "epoch": 0.9112149532710281,
      "grad_norm": 0.903052568907302,
      "learning_rate": 4.970326409495549e-06,
      "loss": 0.7094,
      "step": 1365
    },
    {
      "epoch": 0.9145527369826435,
      "grad_norm": 0.9621382332532343,
      "learning_rate": 4.7848664688427305e-06,
      "loss": 0.7227,
      "step": 1370
    },
    {
      "epoch": 0.917890520694259,
      "grad_norm": 0.9252627526168606,
      "learning_rate": 4.599406528189911e-06,
      "loss": 0.7271,
      "step": 1375
    },
    {
      "epoch": 0.9212283044058746,
      "grad_norm": 0.9417940189754115,
      "learning_rate": 4.413946587537092e-06,
      "loss": 0.7421,
      "step": 1380
    },
    {
      "epoch": 0.92456608811749,
      "grad_norm": 0.9682630131774531,
      "learning_rate": 4.228486646884274e-06,
      "loss": 0.7421,
      "step": 1385
    },
    {
      "epoch": 0.9279038718291055,
      "grad_norm": 1.0307114272788542,
      "learning_rate": 4.0430267062314545e-06,
      "loss": 0.7521,
      "step": 1390
    },
    {
      "epoch": 0.9312416555407209,
      "grad_norm": 0.8832770378598581,
      "learning_rate": 3.857566765578635e-06,
      "loss": 0.7124,
      "step": 1395
    },
    {
      "epoch": 0.9345794392523364,
      "grad_norm": 0.9746407857253272,
      "learning_rate": 3.672106824925816e-06,
      "loss": 0.7302,
      "step": 1400
    },
    {
      "epoch": 0.937917222963952,
      "grad_norm": 0.9499407254108757,
      "learning_rate": 3.4866468842729975e-06,
      "loss": 0.7128,
      "step": 1405
    },
    {
      "epoch": 0.9412550066755674,
      "grad_norm": 0.9145822943128387,
      "learning_rate": 3.3011869436201785e-06,
      "loss": 0.7322,
      "step": 1410
    },
    {
      "epoch": 0.9445927903871829,
      "grad_norm": 0.9757309463266676,
      "learning_rate": 3.115727002967359e-06,
      "loss": 0.7336,
      "step": 1415
    },
    {
      "epoch": 0.9479305740987984,
      "grad_norm": 0.9534792908684466,
      "learning_rate": 2.93026706231454e-06,
      "loss": 0.7312,
      "step": 1420
    },
    {
      "epoch": 0.9512683578104139,
      "grad_norm": 0.9679148570410533,
      "learning_rate": 2.744807121661721e-06,
      "loss": 0.7128,
      "step": 1425
    },
    {
      "epoch": 0.9546061415220294,
      "grad_norm": 0.9493106907067503,
      "learning_rate": 2.559347181008902e-06,
      "loss": 0.7306,
      "step": 1430
    },
    {
      "epoch": 0.9579439252336449,
      "grad_norm": 0.9679382523098454,
      "learning_rate": 2.3738872403560835e-06,
      "loss": 0.7328,
      "step": 1435
    },
    {
      "epoch": 0.9612817089452603,
      "grad_norm": 0.9317606747634454,
      "learning_rate": 2.188427299703264e-06,
      "loss": 0.7238,
      "step": 1440
    },
    {
      "epoch": 0.9646194926568759,
      "grad_norm": 0.9275615394707037,
      "learning_rate": 2.0029673590504455e-06,
      "loss": 0.7171,
      "step": 1445
    },
    {
      "epoch": 0.9679572763684913,
      "grad_norm": 0.9633454475236382,
      "learning_rate": 1.8175074183976263e-06,
      "loss": 0.708,
      "step": 1450
    },
    {
      "epoch": 0.9712950600801068,
      "grad_norm": 0.9399727708328366,
      "learning_rate": 1.6320474777448073e-06,
      "loss": 0.7291,
      "step": 1455
    },
    {
      "epoch": 0.9746328437917223,
      "grad_norm": 1.020554951634919,
      "learning_rate": 1.4465875370919883e-06,
      "loss": 0.7461,
      "step": 1460
    },
    {
      "epoch": 0.9779706275033377,
      "grad_norm": 0.985081620856249,
      "learning_rate": 1.2611275964391693e-06,
      "loss": 0.7228,
      "step": 1465
    },
    {
      "epoch": 0.9813084112149533,
      "grad_norm": 0.9385893286244219,
      "learning_rate": 1.0756676557863502e-06,
      "loss": 0.72,
      "step": 1470
    },
    {
      "epoch": 0.9846461949265688,
      "grad_norm": 1.0049513545701543,
      "learning_rate": 8.902077151335312e-07,
      "loss": 0.7147,
      "step": 1475
    },
    {
      "epoch": 0.9879839786381842,
      "grad_norm": 1.0099532146134622,
      "learning_rate": 7.047477744807121e-07,
      "loss": 0.7282,
      "step": 1480
    },
    {
      "epoch": 0.9913217623497997,
      "grad_norm": 0.8888912225564214,
      "learning_rate": 5.192878338278931e-07,
      "loss": 0.7332,
      "step": 1485
    },
    {
      "epoch": 0.9946595460614153,
      "grad_norm": 0.9997610443207388,
      "learning_rate": 3.338278931750742e-07,
      "loss": 0.7225,
      "step": 1490
    },
    {
      "epoch": 0.9979973297730307,
      "grad_norm": 0.9412531346267364,
      "learning_rate": 1.4836795252225522e-07,
      "loss": 0.7318,
      "step": 1495
    },
    {
      "epoch": 1.0,
      "step": 1498,
      "total_flos": 92324696948736.0,
      "train_loss": 0.788116905017593,
      "train_runtime": 3845.5406,
      "train_samples_per_second": 49.839,
      "train_steps_per_second": 0.39
    }
  ],
  "logging_steps": 5,
  "max_steps": 1498,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 180000000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 92324696948736.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}