{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0025188916876575, "eval_steps": 500, "global_step": 398, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025188916876574307, "grad_norm": 29.875, "learning_rate": 2.3076923076923078e-07, "loss": 1.1914, "memory/device_mem_reserved(gib)": 80.65, "memory/max_mem_active(gib)": 67.28, "memory/max_mem_allocated(gib)": 67.28, "step": 1 }, { "epoch": 0.005037783375314861, "grad_norm": 26.875, "learning_rate": 4.6153846153846156e-07, "loss": 1.1719, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 2 }, { "epoch": 0.007556675062972292, "grad_norm": 28.0, "learning_rate": 6.923076923076923e-07, "loss": 1.166, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 3 }, { "epoch": 0.010075566750629723, "grad_norm": 61.25, "learning_rate": 9.230769230769231e-07, "loss": 1.6523, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 4 }, { "epoch": 0.012594458438287154, "grad_norm": 7.1875, "learning_rate": 1.1538461538461538e-06, "loss": 1.1035, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 5 }, { "epoch": 0.015113350125944584, "grad_norm": 6.0625, "learning_rate": 1.3846153846153846e-06, "loss": 1.1689, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 6 }, { "epoch": 0.017632241813602016, "grad_norm": 5.03125, "learning_rate": 1.6153846153846154e-06, "loss": 1.0928, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 7 }, { "epoch": 0.020151133501259445, "grad_norm": 9.3125, "learning_rate": 1.8461538461538462e-06, "loss": 1.0908, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 8 }, { "epoch": 0.022670025188916875, "grad_norm": 4.625, "learning_rate": 2.076923076923077e-06, "loss": 1.0566, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 9 }, { "epoch": 0.02518891687657431, "grad_norm": 3.59375, "learning_rate": 2.3076923076923077e-06, "loss": 1.0049, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 10 }, { "epoch": 0.027707808564231738, "grad_norm": 2.40625, "learning_rate": 2.5384615384615385e-06, "loss": 1.0156, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 11 }, { "epoch": 0.030226700251889168, "grad_norm": 2.34375, "learning_rate": 2.7692307692307693e-06, "loss": 0.9209, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 12 }, { "epoch": 0.0327455919395466, "grad_norm": 4.71875, "learning_rate": 3e-06, "loss": 1.1035, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 13 }, { "epoch": 0.03526448362720403, "grad_norm": 3.09375, "learning_rate": 3.230769230769231e-06, "loss": 1.0547, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 14 }, { "epoch": 0.037783375314861464, "grad_norm": 1.765625, "learning_rate": 3.4615384615384617e-06, "loss": 0.9268, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 15 }, { "epoch": 0.04030226700251889, "grad_norm": 2.171875, "learning_rate": 3.6923076923076925e-06, "loss": 1.0215, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 16 }, { "epoch": 0.042821158690176324, "grad_norm": 5.1875, "learning_rate": 3.923076923076923e-06, "loss": 0.9707, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 17 }, { "epoch": 0.04534005037783375, "grad_norm": 4.09375, "learning_rate": 4.153846153846154e-06, "loss": 0.959, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 18 }, { "epoch": 0.04785894206549118, "grad_norm": 3.421875, "learning_rate": 4.384615384615385e-06, "loss": 1.0137, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 19 }, { "epoch": 0.05037783375314862, "grad_norm": 2.71875, "learning_rate": 4.615384615384615e-06, "loss": 1.0137, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 20 }, { "epoch": 0.05289672544080604, "grad_norm": 1.6796875, "learning_rate": 4.8461538461538465e-06, "loss": 0.9062, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 21 }, { "epoch": 0.055415617128463476, "grad_norm": 2.109375, "learning_rate": 5.076923076923077e-06, "loss": 0.875, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 22 }, { "epoch": 0.05793450881612091, "grad_norm": 2.609375, "learning_rate": 5.307692307692308e-06, "loss": 0.9893, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 23 }, { "epoch": 0.060453400503778336, "grad_norm": 5.71875, "learning_rate": 5.5384615384615385e-06, "loss": 1.0166, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 24 }, { "epoch": 0.06297229219143577, "grad_norm": 1.640625, "learning_rate": 5.769230769230769e-06, "loss": 0.9834, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 25 }, { "epoch": 0.0654911838790932, "grad_norm": 1.390625, "learning_rate": 6e-06, "loss": 0.8379, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 26 }, { "epoch": 0.06801007556675064, "grad_norm": 1.4765625, "learning_rate": 6.2307692307692305e-06, "loss": 0.9717, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 27 }, { "epoch": 0.07052896725440806, "grad_norm": 1.453125, "learning_rate": 6.461538461538462e-06, "loss": 0.873, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 28 }, { "epoch": 0.07304785894206549, "grad_norm": 1.40625, "learning_rate": 6.692307692307692e-06, "loss": 0.9424, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 29 }, { "epoch": 0.07556675062972293, "grad_norm": 2.21875, "learning_rate": 6.923076923076923e-06, "loss": 0.9629, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 30 }, { "epoch": 0.07808564231738035, "grad_norm": 1.6875, "learning_rate": 7.153846153846154e-06, "loss": 0.8477, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 31 }, { "epoch": 0.08060453400503778, "grad_norm": 1.390625, "learning_rate": 7.384615384615385e-06, "loss": 0.9854, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 32 }, { "epoch": 0.08312342569269521, "grad_norm": 1.609375, "learning_rate": 7.615384615384615e-06, "loss": 0.9248, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 33 }, { "epoch": 0.08564231738035265, "grad_norm": 1.3828125, "learning_rate": 7.846153846153847e-06, "loss": 0.9873, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 34 }, { "epoch": 0.08816120906801007, "grad_norm": 1.2890625, "learning_rate": 8.076923076923077e-06, "loss": 0.8662, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 35 }, { "epoch": 0.0906801007556675, "grad_norm": 1.984375, "learning_rate": 8.307692307692307e-06, "loss": 0.835, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 36 }, { "epoch": 0.09319899244332494, "grad_norm": 1.546875, "learning_rate": 8.53846153846154e-06, "loss": 0.9141, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 37 }, { "epoch": 0.09571788413098237, "grad_norm": 1.453125, "learning_rate": 8.76923076923077e-06, "loss": 0.9336, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 38 }, { "epoch": 0.0982367758186398, "grad_norm": 1.3515625, "learning_rate": 9e-06, "loss": 0.959, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 39 }, { "epoch": 0.10075566750629723, "grad_norm": 1.3828125, "learning_rate": 8.998925871900279e-06, "loss": 0.8184, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 40 }, { "epoch": 0.10327455919395466, "grad_norm": 1.4765625, "learning_rate": 8.997849176845461e-06, "loss": 0.9004, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 41 }, { "epoch": 0.10579345088161209, "grad_norm": 2.1875, "learning_rate": 8.996769905622757e-06, "loss": 0.9365, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 42 }, { "epoch": 0.10831234256926953, "grad_norm": 1.6953125, "learning_rate": 8.995688048975247e-06, "loss": 0.9414, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 43 }, { "epoch": 0.11083123425692695, "grad_norm": 2.234375, "learning_rate": 8.994603597601599e-06, "loss": 1.0234, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 44 }, { "epoch": 0.11335012594458438, "grad_norm": 1.390625, "learning_rate": 8.993516542155818e-06, "loss": 0.8896, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 45 }, { "epoch": 0.11586901763224182, "grad_norm": 1.1328125, "learning_rate": 8.992426873246962e-06, "loss": 0.8965, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 46 }, { "epoch": 0.11838790931989925, "grad_norm": 1.703125, "learning_rate": 8.991334581438888e-06, "loss": 0.8301, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 47 }, { "epoch": 0.12090680100755667, "grad_norm": 1.875, "learning_rate": 8.990239657249966e-06, "loss": 0.8691, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 48 }, { "epoch": 0.12342569269521411, "grad_norm": 1.4375, "learning_rate": 8.989142091152815e-06, "loss": 0.9229, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 49 }, { "epoch": 0.12594458438287154, "grad_norm": 1.390625, "learning_rate": 8.988041873574018e-06, "loss": 0.9941, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 50 }, { "epoch": 0.12846347607052896, "grad_norm": 1.5625, "learning_rate": 8.986938994893847e-06, "loss": 0.8838, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 51 }, { "epoch": 0.1309823677581864, "grad_norm": 1.234375, "learning_rate": 8.985833445445984e-06, "loss": 0.8643, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 52 }, { "epoch": 0.13350125944584382, "grad_norm": 1.515625, "learning_rate": 8.984725215517241e-06, "loss": 0.9756, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 53 }, { "epoch": 0.13602015113350127, "grad_norm": 1.453125, "learning_rate": 8.98361429534727e-06, "loss": 0.8311, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 54 }, { "epoch": 0.1385390428211587, "grad_norm": 1.46875, "learning_rate": 8.982500675128276e-06, "loss": 0.8545, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 55 }, { "epoch": 0.14105793450881612, "grad_norm": 1.328125, "learning_rate": 8.981384345004732e-06, "loss": 0.9189, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 56 }, { "epoch": 0.14357682619647355, "grad_norm": 1.5078125, "learning_rate": 8.980265295073092e-06, "loss": 0.8623, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 57 }, { "epoch": 0.14609571788413098, "grad_norm": 1.203125, "learning_rate": 8.979143515381489e-06, "loss": 0.8184, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 58 }, { "epoch": 0.1486146095717884, "grad_norm": 1.453125, "learning_rate": 8.978018995929444e-06, "loss": 0.8418, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 59 }, { "epoch": 0.15113350125944586, "grad_norm": 1.1953125, "learning_rate": 8.976891726667572e-06, "loss": 0.9297, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 60 }, { "epoch": 0.15365239294710328, "grad_norm": 1.1953125, "learning_rate": 8.97576169749728e-06, "loss": 0.8945, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 61 }, { "epoch": 0.1561712846347607, "grad_norm": 1.578125, "learning_rate": 8.974628898270462e-06, "loss": 0.9258, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 62 }, { "epoch": 0.15869017632241814, "grad_norm": 1.234375, "learning_rate": 8.9734933187892e-06, "loss": 0.8389, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 63 }, { "epoch": 0.16120906801007556, "grad_norm": 1.1640625, "learning_rate": 8.97235494880546e-06, "loss": 0.8242, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 64 }, { "epoch": 0.163727959697733, "grad_norm": 2.734375, "learning_rate": 8.971213778020776e-06, "loss": 0.876, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 65 }, { "epoch": 0.16624685138539042, "grad_norm": 1.453125, "learning_rate": 8.970069796085946e-06, "loss": 0.9004, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 66 }, { "epoch": 0.16876574307304787, "grad_norm": 1.7421875, "learning_rate": 8.968922992600714e-06, "loss": 0.8809, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 67 }, { "epoch": 0.1712846347607053, "grad_norm": 1.390625, "learning_rate": 8.96777335711346e-06, "loss": 0.8721, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 68 }, { "epoch": 0.17380352644836272, "grad_norm": 1.5078125, "learning_rate": 8.966620879120879e-06, "loss": 0.8545, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 69 }, { "epoch": 0.17632241813602015, "grad_norm": 1.5859375, "learning_rate": 8.965465548067666e-06, "loss": 0.9199, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 70 }, { "epoch": 0.17884130982367757, "grad_norm": 1.234375, "learning_rate": 8.964307353346186e-06, "loss": 0.8467, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 71 }, { "epoch": 0.181360201511335, "grad_norm": 1.234375, "learning_rate": 8.963146284296154e-06, "loss": 0.9258, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 72 }, { "epoch": 0.18387909319899245, "grad_norm": 1.3125, "learning_rate": 8.961982330204308e-06, "loss": 0.8691, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 73 }, { "epoch": 0.18639798488664988, "grad_norm": 1.3046875, "learning_rate": 8.960815480304078e-06, "loss": 0.7891, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 74 }, { "epoch": 0.1889168765743073, "grad_norm": 1.3828125, "learning_rate": 8.959645723775257e-06, "loss": 0.8916, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 75 }, { "epoch": 0.19143576826196473, "grad_norm": 1.3515625, "learning_rate": 8.958473049743662e-06, "loss": 0.9062, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 76 }, { "epoch": 0.19395465994962216, "grad_norm": 1.21875, "learning_rate": 8.9572974472808e-06, "loss": 0.8076, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 77 }, { "epoch": 0.1964735516372796, "grad_norm": 1.2421875, "learning_rate": 8.956118905403529e-06, "loss": 0.874, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 78 }, { "epoch": 0.19899244332493704, "grad_norm": 1.3828125, "learning_rate": 8.954937413073714e-06, "loss": 0.9102, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 79 }, { "epoch": 0.20151133501259447, "grad_norm": 1.25, "learning_rate": 8.953752959197885e-06, "loss": 0.8408, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 80 }, { "epoch": 0.2040302267002519, "grad_norm": 1.3828125, "learning_rate": 8.952565532626881e-06, "loss": 0.958, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 81 }, { "epoch": 0.20654911838790932, "grad_norm": 1.421875, "learning_rate": 8.951375122155523e-06, "loss": 0.9668, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 82 }, { "epoch": 0.20906801007556675, "grad_norm": 1.2890625, "learning_rate": 8.950181716522227e-06, "loss": 0.8779, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 83 }, { "epoch": 0.21158690176322417, "grad_norm": 1.1796875, "learning_rate": 8.948985304408678e-06, "loss": 0.8057, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 84 }, { "epoch": 0.2141057934508816, "grad_norm": 1.203125, "learning_rate": 8.947785874439462e-06, "loss": 0.8672, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 85 }, { "epoch": 0.21662468513853905, "grad_norm": 1.5078125, "learning_rate": 8.946583415181705e-06, "loss": 0.8369, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 86 }, { "epoch": 0.21914357682619648, "grad_norm": 1.2265625, "learning_rate": 8.945377915144704e-06, "loss": 0.8311, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 87 }, { "epoch": 0.2216624685138539, "grad_norm": 1.1484375, "learning_rate": 8.944169362779576e-06, "loss": 0.8066, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 88 }, { "epoch": 0.22418136020151133, "grad_norm": 1.2109375, "learning_rate": 8.942957746478874e-06, "loss": 0.8584, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 89 }, { "epoch": 0.22670025188916876, "grad_norm": 1.203125, "learning_rate": 8.941743054576224e-06, "loss": 0.8223, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 90 }, { "epoch": 0.22921914357682618, "grad_norm": 1.1015625, "learning_rate": 8.940525275345949e-06, "loss": 0.8828, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 91 }, { "epoch": 0.23173803526448364, "grad_norm": 2.3125, "learning_rate": 8.939304397002686e-06, "loss": 0.9102, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 92 }, { "epoch": 0.23425692695214106, "grad_norm": 1.453125, "learning_rate": 8.938080407701019e-06, "loss": 0.8379, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 93 }, { "epoch": 0.2367758186397985, "grad_norm": 1.2421875, "learning_rate": 8.936853295535081e-06, "loss": 0.7842, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 94 }, { "epoch": 0.23929471032745592, "grad_norm": 1.2578125, "learning_rate": 8.935623048538179e-06, "loss": 0.7998, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 95 }, { "epoch": 0.24181360201511334, "grad_norm": 1.1796875, "learning_rate": 8.934389654682394e-06, "loss": 0.8838, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 96 }, { "epoch": 0.24433249370277077, "grad_norm": 1.1796875, "learning_rate": 8.933153101878202e-06, "loss": 0.8379, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 97 }, { "epoch": 0.24685138539042822, "grad_norm": 1.3828125, "learning_rate": 8.93191337797407e-06, "loss": 0.8896, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 98 }, { "epoch": 0.24937027707808565, "grad_norm": 1.1640625, "learning_rate": 8.930670470756064e-06, "loss": 0.8779, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 99 }, { "epoch": 0.2518891687657431, "grad_norm": 2.21875, "learning_rate": 8.929424367947436e-06, "loss": 0.8799, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 100 }, { "epoch": 0.25440806045340053, "grad_norm": 1.3671875, "learning_rate": 8.928175057208238e-06, "loss": 0.7812, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 101 }, { "epoch": 0.25692695214105793, "grad_norm": 1.5546875, "learning_rate": 8.926922526134899e-06, "loss": 0.7754, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 102 }, { "epoch": 0.2594458438287154, "grad_norm": 1.3359375, "learning_rate": 8.925666762259823e-06, "loss": 0.832, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 103 }, { "epoch": 0.2619647355163728, "grad_norm": 1.359375, "learning_rate": 8.924407753050969e-06, "loss": 0.8779, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 104 }, { "epoch": 0.26448362720403024, "grad_norm": 1.515625, "learning_rate": 8.923145485911444e-06, "loss": 0.7803, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 105 }, { "epoch": 0.26700251889168763, "grad_norm": 1.328125, "learning_rate": 8.92187994817907e-06, "loss": 0.9385, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 106 }, { "epoch": 0.2695214105793451, "grad_norm": 1.2421875, "learning_rate": 8.920611127125973e-06, "loss": 0.9365, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 107 }, { "epoch": 0.27204030226700254, "grad_norm": 1.234375, "learning_rate": 8.919339009958147e-06, "loss": 0.8379, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 108 }, { "epoch": 0.27455919395465994, "grad_norm": 1.1796875, "learning_rate": 8.918063583815029e-06, "loss": 0.8555, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 109 }, { "epoch": 0.2770780856423174, "grad_norm": 1.34375, "learning_rate": 8.916784835769064e-06, "loss": 0.9082, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 110 }, { "epoch": 0.2795969773299748, "grad_norm": 1.3046875, "learning_rate": 8.915502752825269e-06, "loss": 0.8721, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 111 }, { "epoch": 0.28211586901763225, "grad_norm": 1.1953125, "learning_rate": 8.914217321920789e-06, "loss": 0.8408, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 112 }, { "epoch": 0.28463476070528965, "grad_norm": 1.2421875, "learning_rate": 8.912928529924463e-06, "loss": 0.8203, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 113 }, { "epoch": 0.2871536523929471, "grad_norm": 1.734375, "learning_rate": 8.911636363636363e-06, "loss": 0.8457, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 114 }, { "epoch": 0.28967254408060455, "grad_norm": 1.3359375, "learning_rate": 8.910340809787358e-06, "loss": 0.8301, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 115 }, { "epoch": 0.29219143576826195, "grad_norm": 1.3203125, "learning_rate": 8.909041855038646e-06, "loss": 0.8018, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 116 }, { "epoch": 0.2947103274559194, "grad_norm": 1.2265625, "learning_rate": 8.907739485981309e-06, "loss": 0.7891, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 117 }, { "epoch": 0.2972292191435768, "grad_norm": 1.1015625, "learning_rate": 8.906433689135838e-06, "loss": 0.8418, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 118 }, { "epoch": 0.29974811083123426, "grad_norm": 1.453125, "learning_rate": 8.905124450951684e-06, "loss": 0.8604, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 119 }, { "epoch": 0.3022670025188917, "grad_norm": 2.21875, "learning_rate": 8.903811757806774e-06, "loss": 0.8984, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 120 }, { "epoch": 0.3047858942065491, "grad_norm": 1.5234375, "learning_rate": 8.902495596007047e-06, "loss": 0.9189, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 121 }, { "epoch": 0.30730478589420657, "grad_norm": 1.3515625, "learning_rate": 8.901175951785976e-06, "loss": 0.8018, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 122 }, { "epoch": 0.30982367758186397, "grad_norm": 1.265625, "learning_rate": 8.899852811304093e-06, "loss": 0.8066, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 123 }, { "epoch": 0.3123425692695214, "grad_norm": 1.171875, "learning_rate": 8.89852616064849e-06, "loss": 0.8506, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 124 }, { "epoch": 0.3148614609571788, "grad_norm": 1.0625, "learning_rate": 8.89719598583235e-06, "loss": 0.7969, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 125 }, { "epoch": 0.31738035264483627, "grad_norm": 1.421875, "learning_rate": 8.895862272794443e-06, "loss": 0.792, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 126 }, { "epoch": 0.3198992443324937, "grad_norm": 1.1171875, "learning_rate": 8.894525007398639e-06, "loss": 0.8096, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 127 }, { "epoch": 0.3224181360201511, "grad_norm": 1.2265625, "learning_rate": 8.893184175433398e-06, "loss": 0.7803, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 128 }, { "epoch": 0.3249370277078086, "grad_norm": 1.4453125, "learning_rate": 8.891839762611276e-06, "loss": 0.8662, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 129 }, { "epoch": 0.327455919395466, "grad_norm": 1.46875, "learning_rate": 8.890491754568414e-06, "loss": 0.7393, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 130 }, { "epoch": 0.32997481108312343, "grad_norm": 1.1875, "learning_rate": 8.889140136864028e-06, "loss": 0.7891, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 131 }, { "epoch": 0.33249370277078083, "grad_norm": 1.15625, "learning_rate": 8.88778489497989e-06, "loss": 0.9453, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 132 }, { "epoch": 0.3350125944584383, "grad_norm": 1.21875, "learning_rate": 8.88642601431981e-06, "loss": 0.7617, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 133 }, { "epoch": 0.33753148614609574, "grad_norm": 1.15625, "learning_rate": 8.88506348020911e-06, "loss": 0.8564, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 134 }, { "epoch": 0.34005037783375314, "grad_norm": 1.296875, "learning_rate": 8.883697277894106e-06, "loss": 0.8047, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 135 }, { "epoch": 0.3425692695214106, "grad_norm": 1.3046875, "learning_rate": 8.882327392541561e-06, "loss": 0.8721, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 136 }, { "epoch": 0.345088161209068, "grad_norm": 1.3046875, "learning_rate": 8.880953809238152e-06, "loss": 0.8652, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 137 }, { "epoch": 0.34760705289672544, "grad_norm": 1.3515625, "learning_rate": 8.879576512989938e-06, "loss": 0.8438, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 138 }, { "epoch": 0.3501259445843829, "grad_norm": 1.1015625, "learning_rate": 8.878195488721804e-06, "loss": 0.8008, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 139 }, { "epoch": 0.3526448362720403, "grad_norm": 1.53125, "learning_rate": 8.876810721276917e-06, "loss": 0.9053, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 140 }, { "epoch": 0.35516372795969775, "grad_norm": 1.3359375, "learning_rate": 8.875422195416164e-06, "loss": 0.8672, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 141 }, { "epoch": 0.35768261964735515, "grad_norm": 1.2265625, "learning_rate": 8.874029895817606e-06, "loss": 0.8008, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 142 }, { "epoch": 0.3602015113350126, "grad_norm": 1.1953125, "learning_rate": 8.8726338070759e-06, "loss": 0.7822, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 143 }, { "epoch": 0.36272040302267, "grad_norm": 1.140625, "learning_rate": 8.871233913701741e-06, "loss": 0.7842, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 144 }, { "epoch": 0.36523929471032746, "grad_norm": 1.1953125, "learning_rate": 8.869830200121285e-06, "loss": 0.7891, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 145 }, { "epoch": 0.3677581863979849, "grad_norm": 1.125, "learning_rate": 8.868422650675573e-06, "loss": 0.8408, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 146 }, { "epoch": 0.3702770780856423, "grad_norm": 1.2578125, "learning_rate": 8.867011249619946e-06, "loss": 0.8447, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 147 }, { "epoch": 0.37279596977329976, "grad_norm": 1.2421875, "learning_rate": 8.865595981123458e-06, "loss": 0.8613, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 148 }, { "epoch": 0.37531486146095716, "grad_norm": 1.4375, "learning_rate": 8.864176829268293e-06, "loss": 0.79, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 149 }, { "epoch": 0.3778337531486146, "grad_norm": 1.2578125, "learning_rate": 8.862753778049153e-06, "loss": 0.7529, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 150 }, { "epoch": 0.380352644836272, "grad_norm": 1.328125, "learning_rate": 8.86132681137267e-06, "loss": 0.8779, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 151 }, { "epoch": 0.38287153652392947, "grad_norm": 1.2890625, "learning_rate": 8.859895913056788e-06, "loss": 0.8652, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 152 }, { "epoch": 0.3853904282115869, "grad_norm": 1.265625, "learning_rate": 8.858461066830166e-06, "loss": 0.7529, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 153 }, { "epoch": 0.3879093198992443, "grad_norm": 1.359375, "learning_rate": 8.857022256331542e-06, "loss": 0.8027, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 154 }, { "epoch": 0.3904282115869018, "grad_norm": 1.40625, "learning_rate": 8.85557946510913e-06, "loss": 0.833, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 155 }, { "epoch": 0.3929471032745592, "grad_norm": 1.1328125, "learning_rate": 8.854132676619979e-06, "loss": 0.8008, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 156 }, { "epoch": 0.3954659949622166, "grad_norm": 1.1796875, "learning_rate": 8.852681874229348e-06, "loss": 0.8594, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 157 }, { "epoch": 0.3979848866498741, "grad_norm": 1.140625, "learning_rate": 8.851227041210064e-06, "loss": 0.7725, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 158 }, { "epoch": 0.4005037783375315, "grad_norm": 1.3828125, "learning_rate": 8.849768160741887e-06, "loss": 0.9307, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 159 }, { "epoch": 0.40302267002518893, "grad_norm": 1.296875, "learning_rate": 8.84830521591085e-06, "loss": 0.8184, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 160 }, { "epoch": 0.40554156171284633, "grad_norm": 1.0625, "learning_rate": 8.846838189708618e-06, "loss": 0.8447, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 161 }, { "epoch": 0.4080604534005038, "grad_norm": 1.1484375, "learning_rate": 8.845367065031817e-06, "loss": 0.8145, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 162 }, { "epoch": 0.4105793450881612, "grad_norm": 1.3671875, "learning_rate": 8.843891824681381e-06, "loss": 0.8037, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 163 }, { "epoch": 0.41309823677581864, "grad_norm": 1.125, "learning_rate": 8.842412451361868e-06, "loss": 0.832, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 164 }, { "epoch": 0.4156171284634761, "grad_norm": 1.1484375, "learning_rate": 8.840928927680798e-06, "loss": 0.8184, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 165 }, { "epoch": 0.4181360201511335, "grad_norm": 1.171875, "learning_rate": 8.839441236147964e-06, "loss": 0.8506, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 166 }, { "epoch": 0.42065491183879095, "grad_norm": 1.140625, "learning_rate": 8.837949359174743e-06, "loss": 0.8154, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 167 }, { "epoch": 0.42317380352644834, "grad_norm": 1.1015625, "learning_rate": 8.836453279073408e-06, "loss": 0.7705, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 168 }, { "epoch": 0.4256926952141058, "grad_norm": 1.34375, "learning_rate": 8.834952978056426e-06, "loss": 0.8115, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 169 }, { "epoch": 0.4282115869017632, "grad_norm": 1.421875, "learning_rate": 8.833448438235755e-06, "loss": 0.8203, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 170 }, { "epoch": 0.43073047858942065, "grad_norm": 1.25, "learning_rate": 8.831939641622132e-06, "loss": 0.7988, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 171 }, { "epoch": 0.4332493702770781, "grad_norm": 1.203125, "learning_rate": 8.83042657012435e-06, "loss": 0.7988, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 172 }, { "epoch": 0.4357682619647355, "grad_norm": 1.15625, "learning_rate": 8.82890920554855e-06, "loss": 0.8359, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 173 }, { "epoch": 0.43828715365239296, "grad_norm": 1.3515625, "learning_rate": 8.827387529597475e-06, "loss": 0.8623, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 174 }, { "epoch": 0.44080604534005036, "grad_norm": 1.4453125, "learning_rate": 8.825861523869745e-06, "loss": 0.8232, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 175 }, { "epoch": 0.4433249370277078, "grad_norm": 1.2734375, "learning_rate": 8.824331169859111e-06, "loss": 0.7861, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 176 }, { "epoch": 0.44584382871536526, "grad_norm": 1.21875, "learning_rate": 8.82279644895371e-06, "loss": 0.7861, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 177 }, { "epoch": 0.44836272040302266, "grad_norm": 1.3671875, "learning_rate": 8.821257342435307e-06, "loss": 0.7646, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 178 }, { "epoch": 0.4508816120906801, "grad_norm": 1.2734375, "learning_rate": 8.819713831478538e-06, "loss": 0.7959, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 179 }, { "epoch": 0.4534005037783375, "grad_norm": 1.4140625, "learning_rate": 8.818165897150136e-06, "loss": 0.8311, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 180 }, { "epoch": 0.45591939546599497, "grad_norm": 1.0859375, "learning_rate": 8.816613520408164e-06, "loss": 0.748, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 181 }, { "epoch": 0.45843828715365237, "grad_norm": 1.5625, "learning_rate": 8.815056682101229e-06, "loss": 0.8076, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 182 }, { "epoch": 0.4609571788413098, "grad_norm": 1.28125, "learning_rate": 8.8134953629677e-06, "loss": 0.7988, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 183 }, { "epoch": 0.4634760705289673, "grad_norm": 1.3671875, "learning_rate": 8.81192954363491e-06, "loss": 0.8828, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 184 }, { "epoch": 0.4659949622166247, "grad_norm": 1.234375, "learning_rate": 8.810359204618346e-06, "loss": 0.8242, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 185 }, { "epoch": 0.46851385390428213, "grad_norm": 1.640625, "learning_rate": 8.808784326320862e-06, "loss": 0.7812, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 186 }, { "epoch": 0.47103274559193953, "grad_norm": 1.0859375, "learning_rate": 8.807204889031843e-06, "loss": 0.7666, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 187 }, { "epoch": 0.473551637279597, "grad_norm": 1.296875, "learning_rate": 8.805620872926398e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 188 }, { "epoch": 0.4760705289672544, "grad_norm": 1.046875, "learning_rate": 8.804032258064517e-06, "loss": 0.7295, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 189 }, { "epoch": 0.47858942065491183, "grad_norm": 1.3125, "learning_rate": 8.802439024390243e-06, "loss": 0.8799, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 190 }, { "epoch": 0.4811083123425693, "grad_norm": 1.140625, "learning_rate": 8.80084115173083e-06, "loss": 0.7549, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 191 }, { "epoch": 0.4836272040302267, "grad_norm": 1.265625, "learning_rate": 8.799238619795886e-06, "loss": 0.8633, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 192 }, { "epoch": 0.48614609571788414, "grad_norm": 1.109375, "learning_rate": 8.79763140817651e-06, "loss": 0.8408, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 193 }, { "epoch": 0.48866498740554154, "grad_norm": 1.2109375, "learning_rate": 8.796019496344436e-06, "loss": 0.8496, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 194 }, { "epoch": 0.491183879093199, "grad_norm": 1.671875, "learning_rate": 8.794402863651155e-06, "loss": 0.833, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 195 }, { "epoch": 0.49370277078085645, "grad_norm": 1.421875, "learning_rate": 8.792781489327033e-06, "loss": 0.8457, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 196 }, { "epoch": 0.49622166246851385, "grad_norm": 1.109375, "learning_rate": 8.791155352480418e-06, "loss": 0.7988, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 197 }, { "epoch": 0.4987405541561713, "grad_norm": 1.2265625, "learning_rate": 8.789524432096748e-06, "loss": 0.8125, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 198 }, { "epoch": 0.5012594458438288, "grad_norm": 1.1953125, "learning_rate": 8.787888707037644e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 199 }, { "epoch": 0.5037783375314862, "grad_norm": 1.2109375, "learning_rate": 8.786248156039995e-06, "loss": 0.8047, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 200 }, { "epoch": 0.5062972292191436, "grad_norm": 1.140625, "learning_rate": 8.784602757715037e-06, "loss": 0.8418, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 201 }, { "epoch": 0.5088161209068011, "grad_norm": 1.3125, "learning_rate": 8.782952490547428e-06, "loss": 0.8369, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 202 }, { "epoch": 0.5113350125944585, "grad_norm": 2.515625, "learning_rate": 8.781297332894304e-06, "loss": 0.79, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 203 }, { "epoch": 0.5138539042821159, "grad_norm": 1.4453125, "learning_rate": 8.779637262984337e-06, "loss": 0.8691, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 204 }, { "epoch": 0.5163727959697733, "grad_norm": 1.4765625, "learning_rate": 8.777972258916777e-06, "loss": 0.8809, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 205 }, { "epoch": 0.5188916876574308, "grad_norm": 1.2421875, "learning_rate": 8.776302298660493e-06, "loss": 0.8086, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 206 }, { "epoch": 0.5214105793450882, "grad_norm": 1.1484375, "learning_rate": 8.774627360052997e-06, "loss": 0.7715, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 207 }, { "epoch": 0.5239294710327456, "grad_norm": 1.2734375, "learning_rate": 8.77294742079947e-06, "loss": 0.8145, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 208 }, { "epoch": 0.5264483627204031, "grad_norm": 1.34375, "learning_rate": 8.771262458471761e-06, "loss": 0.7949, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 209 }, { "epoch": 0.5289672544080605, "grad_norm": 1.3671875, "learning_rate": 8.769572450507402e-06, "loss": 0.75, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 210 }, { "epoch": 0.5314861460957179, "grad_norm": 1.203125, "learning_rate": 8.767877374208597e-06, "loss": 0.8477, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 211 }, { "epoch": 0.5340050377833753, "grad_norm": 1.171875, "learning_rate": 8.7661772067412e-06, "loss": 0.8154, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 212 }, { "epoch": 0.5365239294710328, "grad_norm": 1.125, "learning_rate": 8.76447192513369e-06, "loss": 0.7559, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 213 }, { "epoch": 0.5390428211586902, "grad_norm": 1.2265625, "learning_rate": 8.76276150627615e-06, "loss": 0.8047, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 214 }, { "epoch": 0.5415617128463476, "grad_norm": 1.1015625, "learning_rate": 8.761045926919207e-06, "loss": 0.792, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 215 }, { "epoch": 0.5440806045340051, "grad_norm": 1.1640625, "learning_rate": 8.75932516367299e-06, "loss": 0.7939, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 216 }, { "epoch": 0.5465994962216625, "grad_norm": 1.21875, "learning_rate": 8.757599193006053e-06, "loss": 0.7637, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 217 }, { "epoch": 0.5491183879093199, "grad_norm": 1.2421875, "learning_rate": 8.755867991244318e-06, "loss": 0.7939, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 218 }, { "epoch": 0.5516372795969773, "grad_norm": 1.0546875, "learning_rate": 8.754131534569984e-06, "loss": 0.7812, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 219 }, { "epoch": 0.5541561712846348, "grad_norm": 1.203125, "learning_rate": 8.752389799020436e-06, "loss": 0.7695, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 220 }, { "epoch": 0.5566750629722922, "grad_norm": 1.2265625, "learning_rate": 8.750642760487145e-06, "loss": 0.8857, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 221 }, { "epoch": 0.5591939546599496, "grad_norm": 1.1953125, "learning_rate": 8.748890394714553e-06, "loss": 0.8574, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 222 }, { "epoch": 0.5617128463476071, "grad_norm": 1.125, "learning_rate": 8.747132677298948e-06, "loss": 0.8193, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 223 }, { "epoch": 0.5642317380352645, "grad_norm": 1.0234375, "learning_rate": 8.74536958368734e-06, "loss": 0.7441, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 224 }, { "epoch": 0.5667506297229219, "grad_norm": 1.203125, "learning_rate": 8.74360108917631e-06, "loss": 0.792, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 225 }, { "epoch": 0.5692695214105793, "grad_norm": 1.28125, "learning_rate": 8.741827168910857e-06, "loss": 0.8105, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 226 }, { "epoch": 0.5717884130982368, "grad_norm": 1.0625, "learning_rate": 8.740047797883237e-06, "loss": 0.8115, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 227 }, { "epoch": 0.5743073047858942, "grad_norm": 1.140625, "learning_rate": 8.738262950931784e-06, "loss": 0.7998, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 228 }, { "epoch": 0.5768261964735516, "grad_norm": 1.234375, "learning_rate": 8.736472602739726e-06, "loss": 0.7686, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 229 }, { "epoch": 0.5793450881612091, "grad_norm": 1.203125, "learning_rate": 8.73467672783399e-06, "loss": 0.7959, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 230 }, { "epoch": 0.5818639798488665, "grad_norm": 1.1796875, "learning_rate": 8.73287530058399e-06, "loss": 0.6982, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 231 }, { "epoch": 0.5843828715365239, "grad_norm": 1.21875, "learning_rate": 8.731068295200414e-06, "loss": 0.7695, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 232 }, { "epoch": 0.5869017632241813, "grad_norm": 1.1953125, "learning_rate": 8.729255685733977e-06, "loss": 0.7637, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 233 }, { "epoch": 0.5894206549118388, "grad_norm": 1.15625, "learning_rate": 8.727437446074203e-06, "loss": 0.7646, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 234 }, { "epoch": 0.5919395465994962, "grad_norm": 1.125, "learning_rate": 8.72561354994815e-06, "loss": 0.7607, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 235 }, { "epoch": 0.5944584382871536, "grad_norm": 10.6875, "learning_rate": 8.723783970919162e-06, "loss": 0.8115, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 236 }, { "epoch": 0.5969773299748111, "grad_norm": 1.171875, "learning_rate": 8.721948682385575e-06, "loss": 0.7295, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 237 }, { "epoch": 0.5994962216624685, "grad_norm": 1.1953125, "learning_rate": 8.720107657579442e-06, "loss": 0.7764, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 238 }, { "epoch": 0.6020151133501259, "grad_norm": 1.09375, "learning_rate": 8.718260869565218e-06, "loss": 0.8428, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 239 }, { "epoch": 0.6045340050377834, "grad_norm": 1.1796875, "learning_rate": 8.716408291238462e-06, "loss": 0.7832, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 240 }, { "epoch": 0.6070528967254408, "grad_norm": 1.171875, "learning_rate": 8.714549895324495e-06, "loss": 0.7861, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 241 }, { "epoch": 0.6095717884130982, "grad_norm": 1.2109375, "learning_rate": 8.712685654377075e-06, "loss": 0.7676, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 242 }, { "epoch": 0.6120906801007556, "grad_norm": 1.15625, "learning_rate": 8.710815540777039e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 243 }, { "epoch": 0.6146095717884131, "grad_norm": 1.140625, "learning_rate": 8.708939526730937e-06, "loss": 0.7979, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 244 }, { "epoch": 0.6171284634760705, "grad_norm": 1.2421875, "learning_rate": 8.707057584269663e-06, "loss": 0.7959, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 245 }, { "epoch": 0.6196473551637279, "grad_norm": 1.125, "learning_rate": 8.705169685247055e-06, "loss": 0.8115, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 246 }, { "epoch": 0.6221662468513854, "grad_norm": 1.0546875, "learning_rate": 8.7032758013385e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 247 }, { "epoch": 0.6246851385390428, "grad_norm": 1.1953125, "learning_rate": 8.701375904039512e-06, "loss": 0.8301, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 248 }, { "epoch": 0.6272040302267002, "grad_norm": 1.609375, "learning_rate": 8.699469964664311e-06, "loss": 0.7529, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 249 }, { "epoch": 0.6297229219143576, "grad_norm": 1.203125, "learning_rate": 8.697557954344365e-06, "loss": 0.7852, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 250 }, { "epoch": 0.6322418136020151, "grad_norm": 1.1953125, "learning_rate": 8.695639844026941e-06, "loss": 0.8721, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 251 }, { "epoch": 0.6347607052896725, "grad_norm": 1.2421875, "learning_rate": 8.693715604473637e-06, "loss": 0.915, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 252 }, { "epoch": 0.6372795969773299, "grad_norm": 1.1640625, "learning_rate": 8.691785206258892e-06, "loss": 0.752, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 253 }, { "epoch": 0.6397984886649875, "grad_norm": 1.546875, "learning_rate": 8.689848619768477e-06, "loss": 0.8145, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 254 }, { "epoch": 0.6423173803526449, "grad_norm": 1.296875, "learning_rate": 8.687905815198002e-06, "loss": 0.8467, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 255 }, { "epoch": 0.6448362720403022, "grad_norm": 1.328125, "learning_rate": 8.685956762551366e-06, "loss": 0.8008, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 256 }, { "epoch": 0.6473551637279596, "grad_norm": 1.140625, "learning_rate": 8.684001431639228e-06, "loss": 0.7744, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 257 }, { "epoch": 0.6498740554156172, "grad_norm": 1.03125, "learning_rate": 8.682039792077434e-06, "loss": 0.7354, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 258 }, { "epoch": 0.6523929471032746, "grad_norm": 1.1171875, "learning_rate": 8.680071813285458e-06, "loss": 0.833, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 259 }, { "epoch": 0.654911838790932, "grad_norm": 1.1640625, "learning_rate": 8.678097464484805e-06, "loss": 0.7764, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 260 }, { "epoch": 0.6574307304785895, "grad_norm": 1.46875, "learning_rate": 8.676116714697407e-06, "loss": 0.8633, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 261 }, { "epoch": 0.6599496221662469, "grad_norm": 1.09375, "learning_rate": 8.674129532744001e-06, "loss": 0.8398, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 262 }, { "epoch": 0.6624685138539043, "grad_norm": 1.3671875, "learning_rate": 8.672135887242501e-06, "loss": 0.8594, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 263 }, { "epoch": 0.6649874055415617, "grad_norm": 1.234375, "learning_rate": 8.670135746606335e-06, "loss": 0.7314, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 264 }, { "epoch": 0.6675062972292192, "grad_norm": 1.0546875, "learning_rate": 8.668129079042786e-06, "loss": 0.7266, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 265 }, { "epoch": 0.6700251889168766, "grad_norm": 1.2734375, "learning_rate": 8.666115852551298e-06, "loss": 0.9062, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 266 }, { "epoch": 0.672544080604534, "grad_norm": 1.1796875, "learning_rate": 8.66409603492179e-06, "loss": 0.707, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 267 }, { "epoch": 0.6750629722921915, "grad_norm": 1.046875, "learning_rate": 8.66206959373292e-06, "loss": 0.7881, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 268 }, { "epoch": 0.6775818639798489, "grad_norm": 1.1328125, "learning_rate": 8.660036496350365e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 269 }, { "epoch": 0.6801007556675063, "grad_norm": 1.3125, "learning_rate": 8.657996709925059e-06, "loss": 0.8418, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 270 }, { "epoch": 0.6826196473551638, "grad_norm": 1.2421875, "learning_rate": 8.655950201391432e-06, "loss": 0.7588, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 271 }, { "epoch": 0.6851385390428212, "grad_norm": 1.0625, "learning_rate": 8.653896937465615e-06, "loss": 0.7383, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 272 }, { "epoch": 0.6876574307304786, "grad_norm": 1.28125, "learning_rate": 8.651836884643645e-06, "loss": 0.7686, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 273 }, { "epoch": 0.690176322418136, "grad_norm": 1.328125, "learning_rate": 8.649770009199633e-06, "loss": 0.8057, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 274 }, { "epoch": 0.6926952141057935, "grad_norm": 1.28125, "learning_rate": 8.647696277183928e-06, "loss": 0.79, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 275 }, { "epoch": 0.6952141057934509, "grad_norm": 1.1171875, "learning_rate": 8.645615654421265e-06, "loss": 0.8838, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 276 }, { "epoch": 0.6977329974811083, "grad_norm": 1.5703125, "learning_rate": 8.643528106508877e-06, "loss": 0.7314, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 277 }, { "epoch": 0.7002518891687658, "grad_norm": 1.3359375, "learning_rate": 8.641433598814596e-06, "loss": 0.7676, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 278 }, { "epoch": 0.7027707808564232, "grad_norm": 1.2890625, "learning_rate": 8.639332096474953e-06, "loss": 0.8281, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 279 }, { "epoch": 0.7052896725440806, "grad_norm": 1.15625, "learning_rate": 8.637223564393235e-06, "loss": 0.7646, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 280 }, { "epoch": 0.707808564231738, "grad_norm": 1.2265625, "learning_rate": 8.635107967237528e-06, "loss": 0.8105, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 281 }, { "epoch": 0.7103274559193955, "grad_norm": 1.2265625, "learning_rate": 8.632985269438747e-06, "loss": 0.8438, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 282 }, { "epoch": 0.7128463476070529, "grad_norm": 1.109375, "learning_rate": 8.630855435188644e-06, "loss": 0.7891, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 283 }, { "epoch": 0.7153652392947103, "grad_norm": 1.0859375, "learning_rate": 8.628718428437793e-06, "loss": 0.9287, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 284 }, { "epoch": 0.7178841309823678, "grad_norm": 1.125, "learning_rate": 8.626574212893554e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 285 }, { "epoch": 0.7204030226700252, "grad_norm": 1.140625, "learning_rate": 8.624422752018023e-06, "loss": 0.8291, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 286 }, { "epoch": 0.7229219143576826, "grad_norm": 1.296875, "learning_rate": 8.62226400902595e-06, "loss": 0.7832, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 287 }, { "epoch": 0.72544080604534, "grad_norm": 1.6328125, "learning_rate": 8.62009794688265e-06, "loss": 0.8164, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 288 }, { "epoch": 0.7279596977329975, "grad_norm": 1.234375, "learning_rate": 8.617924528301888e-06, "loss": 0.7773, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 289 }, { "epoch": 0.7304785894206549, "grad_norm": 1.171875, "learning_rate": 8.615743715743717e-06, "loss": 0.8291, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 290 }, { "epoch": 0.7329974811083123, "grad_norm": 1.078125, "learning_rate": 8.613555471412343e-06, "loss": 0.7959, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 291 }, { "epoch": 0.7355163727959698, "grad_norm": 1.40625, "learning_rate": 8.611359757253934e-06, "loss": 0.7549, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 292 }, { "epoch": 0.7380352644836272, "grad_norm": 1.0703125, "learning_rate": 8.609156534954409e-06, "loss": 0.7715, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 293 }, { "epoch": 0.7405541561712846, "grad_norm": 1.1015625, "learning_rate": 8.606945765937203e-06, "loss": 0.7861, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 294 }, { "epoch": 0.743073047858942, "grad_norm": 1.3046875, "learning_rate": 8.604727411361037e-06, "loss": 0.8574, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 295 }, { "epoch": 0.7455919395465995, "grad_norm": 1.125, "learning_rate": 8.602501432117624e-06, "loss": 0.75, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 296 }, { "epoch": 0.7481108312342569, "grad_norm": 1.0546875, "learning_rate": 8.600267788829381e-06, "loss": 0.8682, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 297 }, { "epoch": 0.7506297229219143, "grad_norm": 1.078125, "learning_rate": 8.598026441847098e-06, "loss": 0.8125, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 298 }, { "epoch": 0.7531486146095718, "grad_norm": 1.09375, "learning_rate": 8.595777351247601e-06, "loss": 0.8555, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 299 }, { "epoch": 0.7556675062972292, "grad_norm": 1.15625, "learning_rate": 8.593520476831378e-06, "loss": 0.7256, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 300 }, { "epoch": 0.7581863979848866, "grad_norm": 1.0546875, "learning_rate": 8.591255778120185e-06, "loss": 0.8066, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 301 }, { "epoch": 0.760705289672544, "grad_norm": 1.1015625, "learning_rate": 8.58898321435462e-06, "loss": 0.8301, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 302 }, { "epoch": 0.7632241813602015, "grad_norm": 1.1171875, "learning_rate": 8.58670274449169e-06, "loss": 0.7686, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 303 }, { "epoch": 0.7657430730478589, "grad_norm": 1.234375, "learning_rate": 8.584414327202324e-06, "loss": 0.7969, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 304 }, { "epoch": 0.7682619647355163, "grad_norm": 1.171875, "learning_rate": 8.582117920868892e-06, "loss": 0.8574, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 305 }, { "epoch": 0.7707808564231738, "grad_norm": 1.0703125, "learning_rate": 8.57981348358267e-06, "loss": 0.7412, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 306 }, { "epoch": 0.7732997481108312, "grad_norm": 1.265625, "learning_rate": 8.5775009731413e-06, "loss": 0.9023, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 307 }, { "epoch": 0.7758186397984886, "grad_norm": 1.21875, "learning_rate": 8.575180347046208e-06, "loss": 0.8477, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 308 }, { "epoch": 0.7783375314861462, "grad_norm": 1.1015625, "learning_rate": 8.5728515625e-06, "loss": 0.8184, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 309 }, { "epoch": 0.7808564231738035, "grad_norm": 1.0078125, "learning_rate": 8.570514576403836e-06, "loss": 0.7373, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 310 }, { "epoch": 0.783375314861461, "grad_norm": 1.1875, "learning_rate": 8.568169345354762e-06, "loss": 0.7637, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 311 }, { "epoch": 0.7858942065491183, "grad_norm": 1.1328125, "learning_rate": 8.565815825643039e-06, "loss": 0.7979, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 312 }, { "epoch": 0.7884130982367759, "grad_norm": 1.3203125, "learning_rate": 8.56345397324941e-06, "loss": 0.8418, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 313 }, { "epoch": 0.7909319899244333, "grad_norm": 1.1796875, "learning_rate": 8.561083743842366e-06, "loss": 0.8135, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 314 }, { "epoch": 0.7934508816120907, "grad_norm": 1.1953125, "learning_rate": 8.558705092775365e-06, "loss": 0.8242, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 315 }, { "epoch": 0.7959697732997482, "grad_norm": 1.34375, "learning_rate": 8.556317975084042e-06, "loss": 0.6963, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 316 }, { "epoch": 0.7984886649874056, "grad_norm": 1.453125, "learning_rate": 8.55392234548336e-06, "loss": 0.8691, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 317 }, { "epoch": 0.801007556675063, "grad_norm": 1.078125, "learning_rate": 8.551518158364756e-06, "loss": 0.7109, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 318 }, { "epoch": 0.8035264483627204, "grad_norm": 1.15625, "learning_rate": 8.54910536779324e-06, "loss": 0.7812, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 319 }, { "epoch": 0.8060453400503779, "grad_norm": 1.1171875, "learning_rate": 8.546683927504481e-06, "loss": 0.7041, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 320 }, { "epoch": 0.8085642317380353, "grad_norm": 1.2109375, "learning_rate": 8.544253790901836e-06, "loss": 0.8018, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 321 }, { "epoch": 0.8110831234256927, "grad_norm": 1.0703125, "learning_rate": 8.54181491105337e-06, "loss": 0.7627, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 322 }, { "epoch": 0.8136020151133502, "grad_norm": 1.171875, "learning_rate": 8.539367240688826e-06, "loss": 0.75, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 323 }, { "epoch": 0.8161209068010076, "grad_norm": 1.15625, "learning_rate": 8.536910732196591e-06, "loss": 0.7305, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 324 }, { "epoch": 0.818639798488665, "grad_norm": 1.0703125, "learning_rate": 8.534445337620579e-06, "loss": 0.7949, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 325 }, { "epoch": 0.8211586901763224, "grad_norm": 1.1171875, "learning_rate": 8.531971008657139e-06, "loss": 0.79, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 326 }, { "epoch": 0.8236775818639799, "grad_norm": 1.0703125, "learning_rate": 8.529487696651876e-06, "loss": 0.7666, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 327 }, { "epoch": 0.8261964735516373, "grad_norm": 1.0078125, "learning_rate": 8.526995352596485e-06, "loss": 0.6963, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 328 }, { "epoch": 0.8287153652392947, "grad_norm": 1.0390625, "learning_rate": 8.524493927125505e-06, "loss": 0.7451, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 329 }, { "epoch": 0.8312342569269522, "grad_norm": 1.078125, "learning_rate": 8.521983370513081e-06, "loss": 0.7168, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 330 }, { "epoch": 0.8337531486146096, "grad_norm": 1.5390625, "learning_rate": 8.519463632669648e-06, "loss": 0.7754, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 331 }, { "epoch": 0.836272040302267, "grad_norm": 1.265625, "learning_rate": 8.51693466313861e-06, "loss": 0.8301, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 332 }, { "epoch": 0.8387909319899244, "grad_norm": 1.3046875, "learning_rate": 8.514396411092986e-06, "loss": 0.7988, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 333 }, { "epoch": 0.8413098236775819, "grad_norm": 3.15625, "learning_rate": 8.511848825331972e-06, "loss": 0.7119, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 334 }, { "epoch": 0.8438287153652393, "grad_norm": 1.1171875, "learning_rate": 8.509291854277527e-06, "loss": 0.7715, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 335 }, { "epoch": 0.8463476070528967, "grad_norm": 2.578125, "learning_rate": 8.506725445970883e-06, "loss": 0.7773, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 336 }, { "epoch": 0.8488664987405542, "grad_norm": 1.21875, "learning_rate": 8.504149548069022e-06, "loss": 0.7607, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 337 }, { "epoch": 0.8513853904282116, "grad_norm": 1.2109375, "learning_rate": 8.501564107841121e-06, "loss": 0.752, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 338 }, { "epoch": 0.853904282115869, "grad_norm": 1.1328125, "learning_rate": 8.498969072164949e-06, "loss": 0.7812, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 339 }, { "epoch": 0.8564231738035264, "grad_norm": 1.1171875, "learning_rate": 8.496364387523239e-06, "loss": 0.8232, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 340 }, { "epoch": 0.8589420654911839, "grad_norm": 1.234375, "learning_rate": 8.49375e-06, "loss": 0.8281, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 341 }, { "epoch": 0.8614609571788413, "grad_norm": 1.2734375, "learning_rate": 8.4911258552768e-06, "loss": 0.9336, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 342 }, { "epoch": 0.8639798488664987, "grad_norm": 1.4921875, "learning_rate": 8.488491898629e-06, "loss": 0.7705, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 343 }, { "epoch": 0.8664987405541562, "grad_norm": 1.359375, "learning_rate": 8.485848074921957e-06, "loss": 0.8467, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 344 }, { "epoch": 0.8690176322418136, "grad_norm": 1.1171875, "learning_rate": 8.483194328607173e-06, "loss": 0.7803, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 345 }, { "epoch": 0.871536523929471, "grad_norm": 1.1953125, "learning_rate": 8.480530603718405e-06, "loss": 0.792, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 346 }, { "epoch": 0.8740554156171285, "grad_norm": 1.1328125, "learning_rate": 8.477856843867728e-06, "loss": 0.832, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 347 }, { "epoch": 0.8765743073047859, "grad_norm": 1.0078125, "learning_rate": 8.47517299224156e-06, "loss": 0.7949, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 348 }, { "epoch": 0.8790931989924433, "grad_norm": 1.078125, "learning_rate": 8.472478991596638e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 349 }, { "epoch": 0.8816120906801007, "grad_norm": 1.09375, "learning_rate": 8.469774784255946e-06, "loss": 0.7158, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 350 }, { "epoch": 0.8841309823677582, "grad_norm": 1.3125, "learning_rate": 8.467060312104597e-06, "loss": 0.7334, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 351 }, { "epoch": 0.8866498740554156, "grad_norm": 1.1484375, "learning_rate": 8.464335516585674e-06, "loss": 0.8545, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 352 }, { "epoch": 0.889168765743073, "grad_norm": 1.21875, "learning_rate": 8.46160033869602e-06, "loss": 0.748, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 353 }, { "epoch": 0.8916876574307305, "grad_norm": 1.265625, "learning_rate": 8.458854718981973e-06, "loss": 0.7686, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 354 }, { "epoch": 0.8942065491183879, "grad_norm": 1.25, "learning_rate": 8.456098597535062e-06, "loss": 0.8613, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 355 }, { "epoch": 0.8967254408060453, "grad_norm": 1.1484375, "learning_rate": 8.453331913987651e-06, "loss": 0.7715, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 356 }, { "epoch": 0.8992443324937027, "grad_norm": 1.1328125, "learning_rate": 8.450554607508531e-06, "loss": 0.7793, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 357 }, { "epoch": 0.9017632241813602, "grad_norm": 1.15625, "learning_rate": 8.447766616798463e-06, "loss": 0.8506, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 358 }, { "epoch": 0.9042821158690176, "grad_norm": 0.984375, "learning_rate": 8.444967880085653e-06, "loss": 0.7002, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 359 }, { "epoch": 0.906801007556675, "grad_norm": 1.0625, "learning_rate": 8.442158335121218e-06, "loss": 0.8105, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 360 }, { "epoch": 0.9093198992443325, "grad_norm": 1.1953125, "learning_rate": 8.439337919174549e-06, "loss": 0.751, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 361 }, { "epoch": 0.9118387909319899, "grad_norm": 1.171875, "learning_rate": 8.436506569028646e-06, "loss": 0.8604, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 362 }, { "epoch": 0.9143576826196473, "grad_norm": 1.3828125, "learning_rate": 8.4336642209754e-06, "loss": 0.8877, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 363 }, { "epoch": 0.9168765743073047, "grad_norm": 1.046875, "learning_rate": 8.430810810810811e-06, "loss": 0.79, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 364 }, { "epoch": 0.9193954659949622, "grad_norm": 1.0703125, "learning_rate": 8.427946273830157e-06, "loss": 0.7539, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 365 }, { "epoch": 0.9219143576826196, "grad_norm": 1.21875, "learning_rate": 8.425070544823096e-06, "loss": 0.7451, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 366 }, { "epoch": 0.924433249370277, "grad_norm": 1.0625, "learning_rate": 8.422183558068725e-06, "loss": 0.8232, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 367 }, { "epoch": 0.9269521410579346, "grad_norm": 1.234375, "learning_rate": 8.419285247330573e-06, "loss": 0.7607, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 368 }, { "epoch": 0.929471032745592, "grad_norm": 1.1484375, "learning_rate": 8.416375545851529e-06, "loss": 0.8428, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 369 }, { "epoch": 0.9319899244332494, "grad_norm": 1.0546875, "learning_rate": 8.413454386348721e-06, "loss": 0.7803, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 370 }, { "epoch": 0.9345088161209067, "grad_norm": 1.0234375, "learning_rate": 8.41052170100833e-06, "loss": 0.7412, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 371 }, { "epoch": 0.9370277078085643, "grad_norm": 1.046875, "learning_rate": 8.407577421480343e-06, "loss": 0.9385, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 372 }, { "epoch": 0.9395465994962217, "grad_norm": 1.1328125, "learning_rate": 8.40462147887324e-06, "loss": 0.7285, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 373 }, { "epoch": 0.9420654911838791, "grad_norm": 1.1015625, "learning_rate": 8.401653803748621e-06, "loss": 0.7832, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 374 }, { "epoch": 0.9445843828715366, "grad_norm": 1.0703125, "learning_rate": 8.398674326115776e-06, "loss": 0.7686, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 375 }, { "epoch": 0.947103274559194, "grad_norm": 1.21875, "learning_rate": 8.395682975426168e-06, "loss": 0.7559, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 376 }, { "epoch": 0.9496221662468514, "grad_norm": 1.1953125, "learning_rate": 8.392679680567879e-06, "loss": 0.793, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 377 }, { "epoch": 0.9521410579345088, "grad_norm": 1.1328125, "learning_rate": 8.389664369859969e-06, "loss": 0.7383, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 378 }, { "epoch": 0.9546599496221663, "grad_norm": 5.4375, "learning_rate": 8.38663697104677e-06, "loss": 0.7324, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 379 }, { "epoch": 0.9571788413098237, "grad_norm": 1.140625, "learning_rate": 8.383597411292124e-06, "loss": 0.7949, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 380 }, { "epoch": 0.9596977329974811, "grad_norm": 1.109375, "learning_rate": 8.380545617173523e-06, "loss": 0.7471, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 381 }, { "epoch": 0.9622166246851386, "grad_norm": 1.15625, "learning_rate": 8.377481514676227e-06, "loss": 0.8252, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 382 }, { "epoch": 0.964735516372796, "grad_norm": 1.0859375, "learning_rate": 8.374405029187249e-06, "loss": 0.7275, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 383 }, { "epoch": 0.9672544080604534, "grad_norm": 1.3515625, "learning_rate": 8.371316085489314e-06, "loss": 0.8535, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 384 }, { "epoch": 0.9697732997481109, "grad_norm": 1.0078125, "learning_rate": 8.368214607754734e-06, "loss": 0.7021, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 385 }, { "epoch": 0.9722921914357683, "grad_norm": 1.046875, "learning_rate": 8.365100519539192e-06, "loss": 0.7461, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 386 }, { "epoch": 0.9748110831234257, "grad_norm": 1.15625, "learning_rate": 8.361973743775462e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 387 }, { "epoch": 0.9773299748110831, "grad_norm": 1.4609375, "learning_rate": 8.358834202767068e-06, "loss": 0.7588, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 388 }, { "epoch": 0.9798488664987406, "grad_norm": 1.0703125, "learning_rate": 8.355681818181818e-06, "loss": 0.7598, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 389 }, { "epoch": 0.982367758186398, "grad_norm": 1.203125, "learning_rate": 8.35251651104532e-06, "loss": 0.7852, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 390 }, { "epoch": 0.9848866498740554, "grad_norm": 1.296875, "learning_rate": 8.349338201734368e-06, "loss": 0.7285, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 391 }, { "epoch": 0.9874055415617129, "grad_norm": 1.0234375, "learning_rate": 8.346146809970272e-06, "loss": 0.8027, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 392 }, { "epoch": 0.9899244332493703, "grad_norm": 1.125, "learning_rate": 8.342942254812099e-06, "loss": 0.749, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 393 }, { "epoch": 0.9924433249370277, "grad_norm": 1.109375, "learning_rate": 8.339724454649827e-06, "loss": 0.7881, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 394 }, { "epoch": 0.9949622166246851, "grad_norm": 1.71875, "learning_rate": 8.336493327197423e-06, "loss": 0.7803, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 395 }, { "epoch": 0.9974811083123426, "grad_norm": 1.1953125, "learning_rate": 8.33324878948582e-06, "loss": 0.8389, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 396 }, { "epoch": 1.0, "grad_norm": 1.203125, "learning_rate": 8.329990757855822e-06, "loss": 0.8027, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 397 }, { "epoch": 1.0025188916876575, "grad_norm": 1.21875, "learning_rate": 8.326719147950914e-06, "loss": 0.7734, "memory/device_mem_reserved(gib)": 101.22, "memory/max_mem_active(gib)": 87.86, "memory/max_mem_allocated(gib)": 87.86, "step": 398 } ], "logging_steps": 1, "max_steps": 794, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 199, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.695614396091073e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }