{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.045, "eval_steps": 500, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.5e-05, "grad_norm": 1.4140625, "learning_rate": 0.0, "loss": 11.8522, "memory/device_reserved (GiB)": 96.47, "memory/max_active (GiB)": 96.28, "memory/max_allocated (GiB)": 96.28, "step": 1, "tokens_per_second_per_gpu": 10113.27 }, { "epoch": 5e-05, "grad_norm": 1.9609375, "learning_rate": 1.5e-06, "loss": 11.8565, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 2, "tokens_per_second_per_gpu": 14367.33 }, { "epoch": 7.5e-05, "grad_norm": 2.078125, "learning_rate": 3e-06, "loss": 11.8575, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 3, "tokens_per_second_per_gpu": 12864.73 }, { "epoch": 0.0001, "grad_norm": 2.5, "learning_rate": 4.5e-06, "loss": 11.8552, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 4, "tokens_per_second_per_gpu": 15288.11 }, { "epoch": 0.000125, "grad_norm": 2.109375, "learning_rate": 6e-06, "loss": 11.8529, "memory/device_reserved (GiB)": 96.05, "memory/max_active (GiB)": 95.88, "memory/max_allocated (GiB)": 95.88, "step": 5, "tokens_per_second_per_gpu": 14712.91 }, { "epoch": 0.00015, "grad_norm": 2.4375, "learning_rate": 7.5e-06, "loss": 11.8555, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 6, "tokens_per_second_per_gpu": 16409.32 }, { "epoch": 0.000175, "grad_norm": 2.15625, "learning_rate": 9e-06, "loss": 11.8506, "memory/device_reserved (GiB)": 65.47, "memory/max_active (GiB)": 65.28, "memory/max_allocated (GiB)": 65.28, "step": 7, "tokens_per_second_per_gpu": 19830.93 }, { "epoch": 0.0002, "grad_norm": 1.4921875, "learning_rate": 1.0500000000000001e-05, "loss": 11.8421, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 8, "tokens_per_second_per_gpu": 13488.19 }, { "epoch": 0.000225, "grad_norm": 2.65625, "learning_rate": 1.2e-05, "loss": 11.8375, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 9, "tokens_per_second_per_gpu": 25286.91 }, { "epoch": 0.00025, "grad_norm": 2.40625, "learning_rate": 1.35e-05, "loss": 11.8272, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 10, "tokens_per_second_per_gpu": 11478.41 }, { "epoch": 0.000275, "grad_norm": 2.21875, "learning_rate": 1.5e-05, "loss": 11.8243, "memory/device_reserved (GiB)": 65.8, "memory/max_active (GiB)": 65.62, "memory/max_allocated (GiB)": 65.62, "step": 11, "tokens_per_second_per_gpu": 20833.69 }, { "epoch": 0.0003, "grad_norm": 2.578125, "learning_rate": 1.6499999999999998e-05, "loss": 11.8145, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 12, "tokens_per_second_per_gpu": 16467.72 }, { "epoch": 0.000325, "grad_norm": 2.640625, "learning_rate": 1.8e-05, "loss": 11.7886, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 13, "tokens_per_second_per_gpu": 20761.06 }, { "epoch": 0.00035, "grad_norm": 2.578125, "learning_rate": 1.95e-05, "loss": 11.7645, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 14, "tokens_per_second_per_gpu": 20960.88 }, { "epoch": 0.000375, "grad_norm": 2.53125, "learning_rate": 2.1000000000000002e-05, "loss": 11.7409, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 15, "tokens_per_second_per_gpu": 20311.95 }, { "epoch": 0.0004, "grad_norm": 2.546875, "learning_rate": 2.2499999999999998e-05, "loss": 11.7151, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 16, "tokens_per_second_per_gpu": 29147.7 }, { "epoch": 0.000425, "grad_norm": 2.5, "learning_rate": 2.4e-05, "loss": 11.6894, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 17, "tokens_per_second_per_gpu": 20343.18 }, { "epoch": 0.00045, "grad_norm": 2.453125, "learning_rate": 2.5500000000000003e-05, "loss": 11.6645, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 18, "tokens_per_second_per_gpu": 16488.41 }, { "epoch": 0.000475, "grad_norm": 2.40625, "learning_rate": 2.7e-05, "loss": 11.6393, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 19, "tokens_per_second_per_gpu": 13259.28 }, { "epoch": 0.0005, "grad_norm": 2.46875, "learning_rate": 2.85e-05, "loss": 11.6023, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 20, "tokens_per_second_per_gpu": 24581.09 }, { "epoch": 0.000525, "grad_norm": 2.40625, "learning_rate": 3e-05, "loss": 11.5807, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 21, "tokens_per_second_per_gpu": 14633.44 }, { "epoch": 0.00055, "grad_norm": 1.828125, "learning_rate": 3.15e-05, "loss": 11.5649, "memory/device_reserved (GiB)": 116.46, "memory/max_active (GiB)": 116.28, "memory/max_allocated (GiB)": 116.28, "step": 22, "tokens_per_second_per_gpu": 11695.69 }, { "epoch": 0.000575, "grad_norm": 2.359375, "learning_rate": 3.2999999999999996e-05, "loss": 11.5076, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 23, "tokens_per_second_per_gpu": 11924.69 }, { "epoch": 0.0006, "grad_norm": 2.421875, "learning_rate": 3.45e-05, "loss": 11.4497, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 24, "tokens_per_second_per_gpu": 13825.88 }, { "epoch": 0.000625, "grad_norm": 2.078125, "learning_rate": 3.6e-05, "loss": 11.3925, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 25, "tokens_per_second_per_gpu": 19013.33 }, { "epoch": 0.00065, "grad_norm": 2.0, "learning_rate": 3.7500000000000003e-05, "loss": 11.3541, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 26, "tokens_per_second_per_gpu": 14746.24 }, { "epoch": 0.000675, "grad_norm": 2.015625, "learning_rate": 3.9e-05, "loss": 11.2926, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 27, "tokens_per_second_per_gpu": 27625.65 }, { "epoch": 0.0007, "grad_norm": 1.8125, "learning_rate": 4.05e-05, "loss": 11.2507, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.22, "memory/max_allocated (GiB)": 97.22, "step": 28, "tokens_per_second_per_gpu": 13825.79 }, { "epoch": 0.000725, "grad_norm": 2.03125, "learning_rate": 4.2000000000000004e-05, "loss": 11.1991, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 29, "tokens_per_second_per_gpu": 15703.89 }, { "epoch": 0.00075, "grad_norm": 2.15625, "learning_rate": 4.35e-05, "loss": 11.1543, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 30, "tokens_per_second_per_gpu": 13682.35 }, { "epoch": 0.000775, "grad_norm": 2.09375, "learning_rate": 4.4999999999999996e-05, "loss": 11.1203, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 31, "tokens_per_second_per_gpu": 11194.86 }, { "epoch": 0.0008, "grad_norm": 1.984375, "learning_rate": 4.65e-05, "loss": 11.084, "memory/device_reserved (GiB)": 55.41, "memory/max_active (GiB)": 55.39, "memory/max_allocated (GiB)": 55.39, "step": 32, "tokens_per_second_per_gpu": 23639.16 }, { "epoch": 0.000825, "grad_norm": 2.0625, "learning_rate": 4.8e-05, "loss": 11.0413, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 33, "tokens_per_second_per_gpu": 17374.55 }, { "epoch": 0.00085, "grad_norm": 2.03125, "learning_rate": 4.9500000000000004e-05, "loss": 11.0086, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 34, "tokens_per_second_per_gpu": 18050.07 }, { "epoch": 0.000875, "grad_norm": 2.046875, "learning_rate": 5.1000000000000006e-05, "loss": 10.9648, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 35, "tokens_per_second_per_gpu": 17830.67 }, { "epoch": 0.0009, "grad_norm": 2.0, "learning_rate": 5.250000000000001e-05, "loss": 10.935, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 36, "tokens_per_second_per_gpu": 18055.54 }, { "epoch": 0.000925, "grad_norm": 1.953125, "learning_rate": 5.4e-05, "loss": 10.9138, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 37, "tokens_per_second_per_gpu": 12863.56 }, { "epoch": 0.00095, "grad_norm": 1.9375, "learning_rate": 5.55e-05, "loss": 10.8853, "memory/device_reserved (GiB)": 86.21, "memory/max_active (GiB)": 86.02, "memory/max_allocated (GiB)": 86.02, "step": 38, "tokens_per_second_per_gpu": 16561.42 }, { "epoch": 0.000975, "grad_norm": 1.984375, "learning_rate": 5.7e-05, "loss": 10.8452, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 39, "tokens_per_second_per_gpu": 27814.44 }, { "epoch": 0.001, "grad_norm": 1.953125, "learning_rate": 5.85e-05, "loss": 10.8298, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 40, "tokens_per_second_per_gpu": 13328.96 }, { "epoch": 0.001025, "grad_norm": 1.9296875, "learning_rate": 6e-05, "loss": 10.8025, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 41, "tokens_per_second_per_gpu": 14174.17 }, { "epoch": 0.00105, "grad_norm": 1.9375, "learning_rate": 6.15e-05, "loss": 10.7586, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 42, "tokens_per_second_per_gpu": 17081.9 }, { "epoch": 0.001075, "grad_norm": 2.03125, "learning_rate": 6.3e-05, "loss": 10.7286, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 43, "tokens_per_second_per_gpu": 11448.62 }, { "epoch": 0.0011, "grad_norm": 1.9375, "learning_rate": 6.45e-05, "loss": 10.6858, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 44, "tokens_per_second_per_gpu": 15992.23 }, { "epoch": 0.001125, "grad_norm": 1.953125, "learning_rate": 6.599999999999999e-05, "loss": 10.6427, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 45, "tokens_per_second_per_gpu": 25205.97 }, { "epoch": 0.00115, "grad_norm": 1.9609375, "learning_rate": 6.75e-05, "loss": 10.6118, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 46, "tokens_per_second_per_gpu": 29758.87 }, { "epoch": 0.001175, "grad_norm": 1.9296875, "learning_rate": 6.9e-05, "loss": 10.5835, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 47, "tokens_per_second_per_gpu": 18263.99 }, { "epoch": 0.0012, "grad_norm": 1.921875, "learning_rate": 7.05e-05, "loss": 10.5462, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 48, "tokens_per_second_per_gpu": 13281.22 }, { "epoch": 0.001225, "grad_norm": 1.9296875, "learning_rate": 7.2e-05, "loss": 10.5096, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 49, "tokens_per_second_per_gpu": 18020.26 }, { "epoch": 0.00125, "grad_norm": 2.0, "learning_rate": 7.35e-05, "loss": 10.4851, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 50, "tokens_per_second_per_gpu": 10632.26 }, { "epoch": 0.001275, "grad_norm": 1.9609375, "learning_rate": 7.500000000000001e-05, "loss": 10.4375, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 51, "tokens_per_second_per_gpu": 12961.08 }, { "epoch": 0.0013, "grad_norm": 1.953125, "learning_rate": 7.65e-05, "loss": 10.3899, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 52, "tokens_per_second_per_gpu": 18205.62 }, { "epoch": 0.001325, "grad_norm": 1.96875, "learning_rate": 7.8e-05, "loss": 10.3562, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 53, "tokens_per_second_per_gpu": 14096.89 }, { "epoch": 0.00135, "grad_norm": 1.9453125, "learning_rate": 7.95e-05, "loss": 10.3243, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 54, "tokens_per_second_per_gpu": 16412.86 }, { "epoch": 0.001375, "grad_norm": 1.953125, "learning_rate": 8.1e-05, "loss": 10.2912, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 55, "tokens_per_second_per_gpu": 19145.67 }, { "epoch": 0.0014, "grad_norm": 1.953125, "learning_rate": 8.25e-05, "loss": 10.2505, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 56, "tokens_per_second_per_gpu": 21133.31 }, { "epoch": 0.001425, "grad_norm": 1.9609375, "learning_rate": 8.400000000000001e-05, "loss": 10.212, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 57, "tokens_per_second_per_gpu": 20491.2 }, { "epoch": 0.00145, "grad_norm": 1.9453125, "learning_rate": 8.55e-05, "loss": 10.1893, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 58, "tokens_per_second_per_gpu": 29421.88 }, { "epoch": 0.001475, "grad_norm": 1.9296875, "learning_rate": 8.7e-05, "loss": 10.1563, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 59, "tokens_per_second_per_gpu": 16188.45 }, { "epoch": 0.0015, "grad_norm": 1.9296875, "learning_rate": 8.85e-05, "loss": 10.1225, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 60, "tokens_per_second_per_gpu": 17302.53 }, { "epoch": 0.001525, "grad_norm": 1.9453125, "learning_rate": 8.999999999999999e-05, "loss": 10.075, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 61, "tokens_per_second_per_gpu": 28146.34 }, { "epoch": 0.00155, "grad_norm": 1.9453125, "learning_rate": 9.15e-05, "loss": 10.0451, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 62, "tokens_per_second_per_gpu": 12682.79 }, { "epoch": 0.001575, "grad_norm": 1.9453125, "learning_rate": 9.3e-05, "loss": 10.0105, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 63, "tokens_per_second_per_gpu": 18692.27 }, { "epoch": 0.0016, "grad_norm": 1.9375, "learning_rate": 9.45e-05, "loss": 9.9809, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 64, "tokens_per_second_per_gpu": 17462.65 }, { "epoch": 0.001625, "grad_norm": 1.9375, "learning_rate": 9.6e-05, "loss": 9.9433, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 65, "tokens_per_second_per_gpu": 18468.93 }, { "epoch": 0.00165, "grad_norm": 1.9375, "learning_rate": 9.750000000000001e-05, "loss": 9.912, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 66, "tokens_per_second_per_gpu": 24963.62 }, { "epoch": 0.001675, "grad_norm": 1.9296875, "learning_rate": 9.900000000000001e-05, "loss": 9.8826, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 67, "tokens_per_second_per_gpu": 18187.91 }, { "epoch": 0.0017, "grad_norm": 1.9296875, "learning_rate": 0.0001005, "loss": 9.8485, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 68, "tokens_per_second_per_gpu": 20826.71 }, { "epoch": 0.001725, "grad_norm": 1.9296875, "learning_rate": 0.00010200000000000001, "loss": 9.8242, "memory/device_reserved (GiB)": 85.82, "memory/max_active (GiB)": 85.68, "memory/max_allocated (GiB)": 85.68, "step": 69, "tokens_per_second_per_gpu": 15292.29 }, { "epoch": 0.00175, "grad_norm": 1.90625, "learning_rate": 0.00010350000000000001, "loss": 9.8045, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 70, "tokens_per_second_per_gpu": 17578.42 }, { "epoch": 0.001775, "grad_norm": 1.9296875, "learning_rate": 0.00010500000000000002, "loss": 9.7595, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 71, "tokens_per_second_per_gpu": 14253.84 }, { "epoch": 0.0018, "grad_norm": 1.90625, "learning_rate": 0.00010649999999999999, "loss": 9.7357, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 72, "tokens_per_second_per_gpu": 12218.86 }, { "epoch": 0.001825, "grad_norm": 1.90625, "learning_rate": 0.000108, "loss": 9.71, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 73, "tokens_per_second_per_gpu": 12161.82 }, { "epoch": 0.00185, "grad_norm": 1.921875, "learning_rate": 0.00010949999999999999, "loss": 9.6728, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 74, "tokens_per_second_per_gpu": 13479.64 }, { "epoch": 0.001875, "grad_norm": 1.921875, "learning_rate": 0.000111, "loss": 9.6381, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 75, "tokens_per_second_per_gpu": 17570.05 }, { "epoch": 0.0019, "grad_norm": 1.921875, "learning_rate": 0.0001125, "loss": 9.6079, "memory/device_reserved (GiB)": 55.57, "memory/max_active (GiB)": 55.42, "memory/max_allocated (GiB)": 55.42, "step": 76, "tokens_per_second_per_gpu": 23474.03 }, { "epoch": 0.001925, "grad_norm": 1.90625, "learning_rate": 0.000114, "loss": 9.5737, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 77, "tokens_per_second_per_gpu": 18397.47 }, { "epoch": 0.00195, "grad_norm": 1.90625, "learning_rate": 0.0001155, "loss": 9.5544, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 78, "tokens_per_second_per_gpu": 12954.17 }, { "epoch": 0.001975, "grad_norm": 1.90625, "learning_rate": 0.000117, "loss": 9.5119, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 79, "tokens_per_second_per_gpu": 26147.35 }, { "epoch": 0.002, "grad_norm": 1.8984375, "learning_rate": 0.00011850000000000001, "loss": 9.4847, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 80, "tokens_per_second_per_gpu": 23232.84 }, { "epoch": 0.002025, "grad_norm": 1.890625, "learning_rate": 0.00012, "loss": 9.4505, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 81, "tokens_per_second_per_gpu": 21169.24 }, { "epoch": 0.00205, "grad_norm": 1.890625, "learning_rate": 0.00012150000000000001, "loss": 9.4277, "memory/device_reserved (GiB)": 54.53, "memory/max_active (GiB)": 54.51, "memory/max_allocated (GiB)": 54.51, "step": 82, "tokens_per_second_per_gpu": 23963.73 }, { "epoch": 0.002075, "grad_norm": 1.90625, "learning_rate": 0.000123, "loss": 9.3781, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 83, "tokens_per_second_per_gpu": 14312.0 }, { "epoch": 0.0021, "grad_norm": 1.8984375, "learning_rate": 0.00012450000000000002, "loss": 9.3415, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 84, "tokens_per_second_per_gpu": 16280.78 }, { "epoch": 0.002125, "grad_norm": 2.046875, "learning_rate": 0.000126, "loss": 9.318, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 85, "tokens_per_second_per_gpu": 19801.36 }, { "epoch": 0.00215, "grad_norm": 1.9296875, "learning_rate": 0.0001275, "loss": 9.278, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 86, "tokens_per_second_per_gpu": 13346.9 }, { "epoch": 0.002175, "grad_norm": 1.8828125, "learning_rate": 0.000129, "loss": 9.239, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 87, "tokens_per_second_per_gpu": 23574.41 }, { "epoch": 0.0022, "grad_norm": 1.8828125, "learning_rate": 0.0001305, "loss": 9.2153, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 88, "tokens_per_second_per_gpu": 18081.15 }, { "epoch": 0.002225, "grad_norm": 1.875, "learning_rate": 0.00013199999999999998, "loss": 9.1842, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 89, "tokens_per_second_per_gpu": 11315.12 }, { "epoch": 0.00225, "grad_norm": 1.90625, "learning_rate": 0.0001335, "loss": 9.1474, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 90, "tokens_per_second_per_gpu": 18805.53 }, { "epoch": 0.002275, "grad_norm": 1.8984375, "learning_rate": 0.000135, "loss": 9.1243, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 91, "tokens_per_second_per_gpu": 14134.35 }, { "epoch": 0.0023, "grad_norm": 1.84375, "learning_rate": 0.0001365, "loss": 9.073, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 92, "tokens_per_second_per_gpu": 23165.95 }, { "epoch": 0.002325, "grad_norm": 1.8984375, "learning_rate": 0.000138, "loss": 9.0363, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 93, "tokens_per_second_per_gpu": 20973.75 }, { "epoch": 0.00235, "grad_norm": 1.921875, "learning_rate": 0.0001395, "loss": 9.0101, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 94, "tokens_per_second_per_gpu": 15576.18 }, { "epoch": 0.002375, "grad_norm": 1.8046875, "learning_rate": 0.000141, "loss": 8.9853, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 95, "tokens_per_second_per_gpu": 14553.59 }, { "epoch": 0.0024, "grad_norm": 1.921875, "learning_rate": 0.0001425, "loss": 8.9455, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 96, "tokens_per_second_per_gpu": 22003.01 }, { "epoch": 0.002425, "grad_norm": 1.9375, "learning_rate": 0.000144, "loss": 8.9151, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 97, "tokens_per_second_per_gpu": 24774.45 }, { "epoch": 0.00245, "grad_norm": 1.890625, "learning_rate": 0.0001455, "loss": 8.9188, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 98, "tokens_per_second_per_gpu": 17545.23 }, { "epoch": 0.002475, "grad_norm": 1.8671875, "learning_rate": 0.000147, "loss": 8.8533, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 99, "tokens_per_second_per_gpu": 28303.37 }, { "epoch": 0.0025, "grad_norm": 1.96875, "learning_rate": 0.0001485, "loss": 8.842, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 100, "tokens_per_second_per_gpu": 12277.99 }, { "epoch": 0.002525, "grad_norm": 2.03125, "learning_rate": 0.00015000000000000001, "loss": 8.8054, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 101, "tokens_per_second_per_gpu": 16013.77 }, { "epoch": 0.00255, "grad_norm": 2.015625, "learning_rate": 0.00015150000000000002, "loss": 8.772, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 102, "tokens_per_second_per_gpu": 15915.63 }, { "epoch": 0.002575, "grad_norm": 1.953125, "learning_rate": 0.000153, "loss": 8.7379, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 103, "tokens_per_second_per_gpu": 16339.16 }, { "epoch": 0.0026, "grad_norm": 1.9296875, "learning_rate": 0.0001545, "loss": 8.7368, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 104, "tokens_per_second_per_gpu": 17360.52 }, { "epoch": 0.002625, "grad_norm": 1.796875, "learning_rate": 0.000156, "loss": 8.7116, "memory/device_reserved (GiB)": 116.82, "memory/max_active (GiB)": 116.63, "memory/max_allocated (GiB)": 116.63, "step": 105, "tokens_per_second_per_gpu": 11628.86 }, { "epoch": 0.00265, "grad_norm": 1.8046875, "learning_rate": 0.0001575, "loss": 8.658, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 106, "tokens_per_second_per_gpu": 14518.02 }, { "epoch": 0.002675, "grad_norm": 1.8359375, "learning_rate": 0.000159, "loss": 8.6233, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 107, "tokens_per_second_per_gpu": 15223.45 }, { "epoch": 0.0027, "grad_norm": 1.828125, "learning_rate": 0.0001605, "loss": 8.6067, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 108, "tokens_per_second_per_gpu": 20769.23 }, { "epoch": 0.002725, "grad_norm": 1.7421875, "learning_rate": 0.000162, "loss": 8.5856, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 109, "tokens_per_second_per_gpu": 15339.85 }, { "epoch": 0.00275, "grad_norm": 1.84375, "learning_rate": 0.0001635, "loss": 8.5574, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 110, "tokens_per_second_per_gpu": 15311.5 }, { "epoch": 0.002775, "grad_norm": 1.859375, "learning_rate": 0.000165, "loss": 8.5285, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 111, "tokens_per_second_per_gpu": 24004.34 }, { "epoch": 0.0028, "grad_norm": 1.859375, "learning_rate": 0.0001665, "loss": 8.5042, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 112, "tokens_per_second_per_gpu": 16264.38 }, { "epoch": 0.002825, "grad_norm": 1.7734375, "learning_rate": 0.00016800000000000002, "loss": 8.4878, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 113, "tokens_per_second_per_gpu": 16499.25 }, { "epoch": 0.00285, "grad_norm": 1.71875, "learning_rate": 0.0001695, "loss": 8.4758, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 114, "tokens_per_second_per_gpu": 18766.82 }, { "epoch": 0.002875, "grad_norm": 1.953125, "learning_rate": 0.000171, "loss": 8.436, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 115, "tokens_per_second_per_gpu": 24251.24 }, { "epoch": 0.0029, "grad_norm": 1.734375, "learning_rate": 0.00017250000000000002, "loss": 8.3989, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 116, "tokens_per_second_per_gpu": 26008.36 }, { "epoch": 0.002925, "grad_norm": 1.71875, "learning_rate": 0.000174, "loss": 8.3964, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 117, "tokens_per_second_per_gpu": 28300.31 }, { "epoch": 0.00295, "grad_norm": 1.78125, "learning_rate": 0.0001755, "loss": 8.3671, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 118, "tokens_per_second_per_gpu": 22267.71 }, { "epoch": 0.002975, "grad_norm": 1.71875, "learning_rate": 0.000177, "loss": 8.3798, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 119, "tokens_per_second_per_gpu": 14792.14 }, { "epoch": 0.003, "grad_norm": 1.7109375, "learning_rate": 0.0001785, "loss": 8.3586, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 120, "tokens_per_second_per_gpu": 10568.44 }, { "epoch": 0.003025, "grad_norm": 1.71875, "learning_rate": 0.00017999999999999998, "loss": 8.3195, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 121, "tokens_per_second_per_gpu": 14450.29 }, { "epoch": 0.00305, "grad_norm": 1.7109375, "learning_rate": 0.0001815, "loss": 8.3072, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 122, "tokens_per_second_per_gpu": 13337.37 }, { "epoch": 0.003075, "grad_norm": 1.6953125, "learning_rate": 0.000183, "loss": 8.2467, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 123, "tokens_per_second_per_gpu": 22983.6 }, { "epoch": 0.0031, "grad_norm": 1.6796875, "learning_rate": 0.0001845, "loss": 8.2711, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 124, "tokens_per_second_per_gpu": 11193.17 }, { "epoch": 0.003125, "grad_norm": 1.703125, "learning_rate": 0.000186, "loss": 8.1947, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 125, "tokens_per_second_per_gpu": 29001.17 }, { "epoch": 0.00315, "grad_norm": 1.6796875, "learning_rate": 0.0001875, "loss": 8.1856, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 126, "tokens_per_second_per_gpu": 12841.52 }, { "epoch": 0.003175, "grad_norm": 1.640625, "learning_rate": 0.000189, "loss": 8.1477, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 127, "tokens_per_second_per_gpu": 28509.42 }, { "epoch": 0.0032, "grad_norm": 1.65625, "learning_rate": 0.0001905, "loss": 8.1589, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 128, "tokens_per_second_per_gpu": 10589.91 }, { "epoch": 0.003225, "grad_norm": 1.59375, "learning_rate": 0.000192, "loss": 8.1094, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 129, "tokens_per_second_per_gpu": 24798.28 }, { "epoch": 0.00325, "grad_norm": 1.625, "learning_rate": 0.00019350000000000001, "loss": 8.1358, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 130, "tokens_per_second_per_gpu": 12263.69 }, { "epoch": 0.003275, "grad_norm": 1.5703125, "learning_rate": 0.00019500000000000002, "loss": 8.0674, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 131, "tokens_per_second_per_gpu": 28079.35 }, { "epoch": 0.0033, "grad_norm": 1.5859375, "learning_rate": 0.0001965, "loss": 8.087, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 132, "tokens_per_second_per_gpu": 10688.8 }, { "epoch": 0.003325, "grad_norm": 1.59375, "learning_rate": 0.00019800000000000002, "loss": 8.0269, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 133, "tokens_per_second_per_gpu": 15863.07 }, { "epoch": 0.00335, "grad_norm": 1.7578125, "learning_rate": 0.00019950000000000002, "loss": 8.0064, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 134, "tokens_per_second_per_gpu": 11967.49 }, { "epoch": 0.003375, "grad_norm": 1.5703125, "learning_rate": 0.000201, "loss": 7.9791, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 135, "tokens_per_second_per_gpu": 26791.45 }, { "epoch": 0.0034, "grad_norm": 1.546875, "learning_rate": 0.00020250000000000002, "loss": 7.968, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 136, "tokens_per_second_per_gpu": 12979.66 }, { "epoch": 0.003425, "grad_norm": 1.546875, "learning_rate": 0.00020400000000000003, "loss": 7.9574, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 137, "tokens_per_second_per_gpu": 13414.42 }, { "epoch": 0.00345, "grad_norm": 1.515625, "learning_rate": 0.0002055, "loss": 7.9439, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 138, "tokens_per_second_per_gpu": 12722.04 }, { "epoch": 0.003475, "grad_norm": 1.546875, "learning_rate": 0.00020700000000000002, "loss": 7.9011, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 139, "tokens_per_second_per_gpu": 11606.73 }, { "epoch": 0.0035, "grad_norm": 1.46875, "learning_rate": 0.00020850000000000003, "loss": 7.8625, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 140, "tokens_per_second_per_gpu": 24157.78 }, { "epoch": 0.003525, "grad_norm": 1.515625, "learning_rate": 0.00021000000000000004, "loss": 7.8862, "memory/device_reserved (GiB)": 56.59, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 141, "tokens_per_second_per_gpu": 21971.14 }, { "epoch": 0.00355, "grad_norm": 1.46875, "learning_rate": 0.0002115, "loss": 7.8545, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 142, "tokens_per_second_per_gpu": 17642.49 }, { "epoch": 0.003575, "grad_norm": 1.5859375, "learning_rate": 0.00021299999999999997, "loss": 7.8455, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 143, "tokens_per_second_per_gpu": 17403.7 }, { "epoch": 0.0036, "grad_norm": 1.5859375, "learning_rate": 0.00021449999999999998, "loss": 7.7895, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 144, "tokens_per_second_per_gpu": 15428.52 }, { "epoch": 0.003625, "grad_norm": 1.4765625, "learning_rate": 0.000216, "loss": 7.783, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 145, "tokens_per_second_per_gpu": 13122.72 }, { "epoch": 0.00365, "grad_norm": 1.5390625, "learning_rate": 0.0002175, "loss": 7.7573, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 146, "tokens_per_second_per_gpu": 16315.45 }, { "epoch": 0.003675, "grad_norm": 1.4609375, "learning_rate": 0.00021899999999999998, "loss": 7.7107, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 147, "tokens_per_second_per_gpu": 21124.56 }, { "epoch": 0.0037, "grad_norm": 1.3671875, "learning_rate": 0.0002205, "loss": 7.6997, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 148, "tokens_per_second_per_gpu": 17408.26 }, { "epoch": 0.003725, "grad_norm": 1.3046875, "learning_rate": 0.000222, "loss": 7.69, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 149, "tokens_per_second_per_gpu": 19775.99 }, { "epoch": 0.00375, "grad_norm": 1.3515625, "learning_rate": 0.00022349999999999998, "loss": 7.6797, "memory/device_reserved (GiB)": 46.4, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 150, "tokens_per_second_per_gpu": 27338.02 }, { "epoch": 0.003775, "grad_norm": 1.328125, "learning_rate": 0.000225, "loss": 7.6377, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 151, "tokens_per_second_per_gpu": 19781.04 }, { "epoch": 0.0038, "grad_norm": 1.34375, "learning_rate": 0.0002265, "loss": 7.6052, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 152, "tokens_per_second_per_gpu": 29420.29 }, { "epoch": 0.003825, "grad_norm": 1.265625, "learning_rate": 0.000228, "loss": 7.5885, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 153, "tokens_per_second_per_gpu": 23792.23 }, { "epoch": 0.00385, "grad_norm": 1.1953125, "learning_rate": 0.0002295, "loss": 7.5674, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 154, "tokens_per_second_per_gpu": 13774.57 }, { "epoch": 0.003875, "grad_norm": 1.09375, "learning_rate": 0.000231, "loss": 7.5224, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 155, "tokens_per_second_per_gpu": 11775.92 }, { "epoch": 0.0039, "grad_norm": 1.0390625, "learning_rate": 0.0002325, "loss": 7.4967, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 156, "tokens_per_second_per_gpu": 17476.95 }, { "epoch": 0.003925, "grad_norm": 1.0625, "learning_rate": 0.000234, "loss": 7.4889, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 157, "tokens_per_second_per_gpu": 22535.63 }, { "epoch": 0.00395, "grad_norm": 1.234375, "learning_rate": 0.0002355, "loss": 7.4489, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 158, "tokens_per_second_per_gpu": 22947.52 }, { "epoch": 0.003975, "grad_norm": 1.265625, "learning_rate": 0.00023700000000000001, "loss": 7.4398, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 159, "tokens_per_second_per_gpu": 19761.71 }, { "epoch": 0.004, "grad_norm": 1.2265625, "learning_rate": 0.0002385, "loss": 7.4449, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 160, "tokens_per_second_per_gpu": 11184.69 }, { "epoch": 0.004025, "grad_norm": 1.1015625, "learning_rate": 0.00024, "loss": 7.4107, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 161, "tokens_per_second_per_gpu": 10845.35 }, { "epoch": 0.00405, "grad_norm": 0.97265625, "learning_rate": 0.00024150000000000002, "loss": 7.3702, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 162, "tokens_per_second_per_gpu": 14283.05 }, { "epoch": 0.004075, "grad_norm": 0.8125, "learning_rate": 0.00024300000000000002, "loss": 7.3678, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 163, "tokens_per_second_per_gpu": 13074.86 }, { "epoch": 0.0041, "grad_norm": 0.88671875, "learning_rate": 0.00024450000000000003, "loss": 7.3401, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 164, "tokens_per_second_per_gpu": 17268.37 }, { "epoch": 0.004125, "grad_norm": 1.1875, "learning_rate": 0.000246, "loss": 7.322, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 165, "tokens_per_second_per_gpu": 23181.25 }, { "epoch": 0.00415, "grad_norm": 1.4375, "learning_rate": 0.0002475, "loss": 7.314, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 166, "tokens_per_second_per_gpu": 21147.95 }, { "epoch": 0.004175, "grad_norm": 1.1015625, "learning_rate": 0.00024900000000000004, "loss": 7.2783, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 167, "tokens_per_second_per_gpu": 13163.05 }, { "epoch": 0.0042, "grad_norm": 2.765625, "learning_rate": 0.0002505, "loss": 7.2582, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 168, "tokens_per_second_per_gpu": 20884.13 }, { "epoch": 0.004225, "grad_norm": 1.4609375, "learning_rate": 0.000252, "loss": 7.2605, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 169, "tokens_per_second_per_gpu": 21101.67 }, { "epoch": 0.00425, "grad_norm": 1.3671875, "learning_rate": 0.00025350000000000004, "loss": 7.2529, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 170, "tokens_per_second_per_gpu": 10651.87 }, { "epoch": 0.004275, "grad_norm": 1.5, "learning_rate": 0.000255, "loss": 7.2333, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 171, "tokens_per_second_per_gpu": 12389.56 }, { "epoch": 0.0043, "grad_norm": 1.3828125, "learning_rate": 0.0002565, "loss": 7.1841, "memory/device_reserved (GiB)": 35.75, "memory/max_active (GiB)": 35.55, "memory/max_allocated (GiB)": 35.55, "step": 172, "tokens_per_second_per_gpu": 32568.39 }, { "epoch": 0.004325, "grad_norm": 1.2421875, "learning_rate": 0.000258, "loss": 7.1907, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 173, "tokens_per_second_per_gpu": 14351.23 }, { "epoch": 0.00435, "grad_norm": 0.91796875, "learning_rate": 0.00025949999999999997, "loss": 7.1502, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 174, "tokens_per_second_per_gpu": 13354.11 }, { "epoch": 0.004375, "grad_norm": 1.046875, "learning_rate": 0.000261, "loss": 7.1554, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 175, "tokens_per_second_per_gpu": 13196.72 }, { "epoch": 0.0044, "grad_norm": 1.96875, "learning_rate": 0.0002625, "loss": 7.1339, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 176, "tokens_per_second_per_gpu": 13604.63 }, { "epoch": 0.004425, "grad_norm": 0.7265625, "learning_rate": 0.00026399999999999997, "loss": 7.1441, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 177, "tokens_per_second_per_gpu": 13785.94 }, { "epoch": 0.00445, "grad_norm": 1.328125, "learning_rate": 0.0002655, "loss": 7.0853, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 178, "tokens_per_second_per_gpu": 11579.12 }, { "epoch": 0.004475, "grad_norm": 1.2109375, "learning_rate": 0.000267, "loss": 7.0699, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 179, "tokens_per_second_per_gpu": 10948.45 }, { "epoch": 0.0045, "grad_norm": 1.453125, "learning_rate": 0.0002685, "loss": 7.0561, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 180, "tokens_per_second_per_gpu": 14519.8 }, { "epoch": 0.004525, "grad_norm": 1.0234375, "learning_rate": 0.00027, "loss": 7.0393, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 181, "tokens_per_second_per_gpu": 23359.86 }, { "epoch": 0.00455, "grad_norm": 2.40625, "learning_rate": 0.0002715, "loss": 7.0337, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 182, "tokens_per_second_per_gpu": 12234.05 }, { "epoch": 0.004575, "grad_norm": 1.1484375, "learning_rate": 0.000273, "loss": 7.0279, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 183, "tokens_per_second_per_gpu": 15878.0 }, { "epoch": 0.0046, "grad_norm": 3.90625, "learning_rate": 0.0002745, "loss": 7.0404, "memory/device_reserved (GiB)": 54.53, "memory/max_active (GiB)": 54.51, "memory/max_allocated (GiB)": 54.51, "step": 184, "tokens_per_second_per_gpu": 25079.26 }, { "epoch": 0.004625, "grad_norm": 3.390625, "learning_rate": 0.000276, "loss": 7.0054, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 185, "tokens_per_second_per_gpu": 17835.82 }, { "epoch": 0.00465, "grad_norm": 2.140625, "learning_rate": 0.0002775, "loss": 7.0173, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 186, "tokens_per_second_per_gpu": 21128.58 }, { "epoch": 0.004675, "grad_norm": 1.2421875, "learning_rate": 0.000279, "loss": 6.9916, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 187, "tokens_per_second_per_gpu": 16074.21 }, { "epoch": 0.0047, "grad_norm": 1.71875, "learning_rate": 0.0002805, "loss": 6.9546, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 188, "tokens_per_second_per_gpu": 16776.22 }, { "epoch": 0.004725, "grad_norm": 1.7421875, "learning_rate": 0.000282, "loss": 6.9615, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 189, "tokens_per_second_per_gpu": 14952.36 }, { "epoch": 0.00475, "grad_norm": 1.390625, "learning_rate": 0.0002835, "loss": 6.9156, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 190, "tokens_per_second_per_gpu": 18444.9 }, { "epoch": 0.004775, "grad_norm": 3.5, "learning_rate": 0.000285, "loss": 6.9623, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 191, "tokens_per_second_per_gpu": 16633.68 }, { "epoch": 0.0048, "grad_norm": 2.4375, "learning_rate": 0.0002865, "loss": 6.95, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 192, "tokens_per_second_per_gpu": 15988.24 }, { "epoch": 0.004825, "grad_norm": 2.046875, "learning_rate": 0.000288, "loss": 6.9216, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 193, "tokens_per_second_per_gpu": 23228.95 }, { "epoch": 0.00485, "grad_norm": 1.8828125, "learning_rate": 0.0002895, "loss": 6.9107, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 194, "tokens_per_second_per_gpu": 23455.36 }, { "epoch": 0.004875, "grad_norm": 2.046875, "learning_rate": 0.000291, "loss": 6.8728, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 195, "tokens_per_second_per_gpu": 23002.12 }, { "epoch": 0.0049, "grad_norm": 1.3125, "learning_rate": 0.0002925, "loss": 6.8685, "memory/device_reserved (GiB)": 96.05, "memory/max_active (GiB)": 95.88, "memory/max_allocated (GiB)": 95.88, "step": 196, "tokens_per_second_per_gpu": 14133.7 }, { "epoch": 0.004925, "grad_norm": 1.4765625, "learning_rate": 0.000294, "loss": 6.8648, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 197, "tokens_per_second_per_gpu": 17224.48 }, { "epoch": 0.00495, "grad_norm": 1.6015625, "learning_rate": 0.0002955, "loss": 6.8384, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 198, "tokens_per_second_per_gpu": 23150.63 }, { "epoch": 0.004975, "grad_norm": 1.109375, "learning_rate": 0.000297, "loss": 6.8076, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 199, "tokens_per_second_per_gpu": 11875.26 }, { "epoch": 0.005, "grad_norm": 2.484375, "learning_rate": 0.00029850000000000005, "loss": 6.823, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 200, "tokens_per_second_per_gpu": 23406.28 }, { "epoch": 0.005025, "grad_norm": 1.6484375, "learning_rate": 0.00030000000000000003, "loss": 6.8048, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 201, "tokens_per_second_per_gpu": 10617.78 }, { "epoch": 0.00505, "grad_norm": 1.5078125, "learning_rate": 0.0003015, "loss": 6.7684, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 202, "tokens_per_second_per_gpu": 13684.91 }, { "epoch": 0.005075, "grad_norm": 1.53125, "learning_rate": 0.00030300000000000005, "loss": 6.772, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 203, "tokens_per_second_per_gpu": 18313.35 }, { "epoch": 0.0051, "grad_norm": 1.6796875, "learning_rate": 0.00030450000000000003, "loss": 6.7853, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 204, "tokens_per_second_per_gpu": 13789.29 }, { "epoch": 0.005125, "grad_norm": 0.78515625, "learning_rate": 0.000306, "loss": 6.7004, "memory/device_reserved (GiB)": 65.6, "memory/max_active (GiB)": 65.59, "memory/max_allocated (GiB)": 65.59, "step": 205, "tokens_per_second_per_gpu": 19648.21 }, { "epoch": 0.00515, "grad_norm": 1.234375, "learning_rate": 0.0003075, "loss": 6.7114, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 206, "tokens_per_second_per_gpu": 22801.55 }, { "epoch": 0.005175, "grad_norm": 0.86328125, "learning_rate": 0.000309, "loss": 6.6741, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 207, "tokens_per_second_per_gpu": 19189.05 }, { "epoch": 0.0052, "grad_norm": 2.28125, "learning_rate": 0.0003105, "loss": 6.6876, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 208, "tokens_per_second_per_gpu": 10729.76 }, { "epoch": 0.005225, "grad_norm": 1.8046875, "learning_rate": 0.000312, "loss": 6.6986, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 209, "tokens_per_second_per_gpu": 15902.49 }, { "epoch": 0.00525, "grad_norm": 0.76953125, "learning_rate": 0.0003135, "loss": 6.6709, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 210, "tokens_per_second_per_gpu": 19629.94 }, { "epoch": 0.005275, "grad_norm": 1.265625, "learning_rate": 0.000315, "loss": 6.6525, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 211, "tokens_per_second_per_gpu": 13647.69 }, { "epoch": 0.0053, "grad_norm": 2.28125, "learning_rate": 0.0003165, "loss": 6.6315, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 212, "tokens_per_second_per_gpu": 15828.77 }, { "epoch": 0.005325, "grad_norm": 1.421875, "learning_rate": 0.000318, "loss": 6.6843, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 213, "tokens_per_second_per_gpu": 25225.91 }, { "epoch": 0.00535, "grad_norm": 1.2265625, "learning_rate": 0.0003195, "loss": 6.5544, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 214, "tokens_per_second_per_gpu": 27476.32 }, { "epoch": 0.005375, "grad_norm": 2.640625, "learning_rate": 0.000321, "loss": 6.6028, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 215, "tokens_per_second_per_gpu": 24398.02 }, { "epoch": 0.0054, "grad_norm": 1.78125, "learning_rate": 0.0003225, "loss": 6.5805, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 216, "tokens_per_second_per_gpu": 12542.35 }, { "epoch": 0.005425, "grad_norm": 2.421875, "learning_rate": 0.000324, "loss": 6.5806, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 217, "tokens_per_second_per_gpu": 20289.66 }, { "epoch": 0.00545, "grad_norm": 1.7734375, "learning_rate": 0.0003255, "loss": 6.5347, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 218, "tokens_per_second_per_gpu": 13309.35 }, { "epoch": 0.005475, "grad_norm": 1.5859375, "learning_rate": 0.000327, "loss": 6.5371, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 219, "tokens_per_second_per_gpu": 16431.29 }, { "epoch": 0.0055, "grad_norm": 1.3828125, "learning_rate": 0.0003285, "loss": 6.4963, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 220, "tokens_per_second_per_gpu": 23608.17 }, { "epoch": 0.005525, "grad_norm": 4.09375, "learning_rate": 0.00033, "loss": 6.5532, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 221, "tokens_per_second_per_gpu": 10974.99 }, { "epoch": 0.00555, "grad_norm": 1.5625, "learning_rate": 0.00033150000000000003, "loss": 6.4981, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 222, "tokens_per_second_per_gpu": 14579.44 }, { "epoch": 0.005575, "grad_norm": 6.125, "learning_rate": 0.000333, "loss": 6.5222, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 223, "tokens_per_second_per_gpu": 19605.68 }, { "epoch": 0.0056, "grad_norm": 2.46875, "learning_rate": 0.0003345, "loss": 6.5364, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 224, "tokens_per_second_per_gpu": 13806.16 }, { "epoch": 0.005625, "grad_norm": 3.890625, "learning_rate": 0.00033600000000000004, "loss": 6.5207, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 225, "tokens_per_second_per_gpu": 26269.6 }, { "epoch": 0.00565, "grad_norm": 3.734375, "learning_rate": 0.0003375, "loss": 6.4858, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 226, "tokens_per_second_per_gpu": 20579.08 }, { "epoch": 0.005675, "grad_norm": 3.5625, "learning_rate": 0.000339, "loss": 6.4789, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 227, "tokens_per_second_per_gpu": 27515.58 }, { "epoch": 0.0057, "grad_norm": 2.59375, "learning_rate": 0.00034050000000000004, "loss": 6.47, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 228, "tokens_per_second_per_gpu": 10753.35 }, { "epoch": 0.005725, "grad_norm": 2.171875, "learning_rate": 0.000342, "loss": 6.4482, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 229, "tokens_per_second_per_gpu": 14153.77 }, { "epoch": 0.00575, "grad_norm": 3.828125, "learning_rate": 0.0003435, "loss": 6.4549, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 230, "tokens_per_second_per_gpu": 19510.53 }, { "epoch": 0.005775, "grad_norm": 1.9921875, "learning_rate": 0.00034500000000000004, "loss": 6.4509, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 231, "tokens_per_second_per_gpu": 10704.27 }, { "epoch": 0.0058, "grad_norm": 2.015625, "learning_rate": 0.0003465, "loss": 6.4231, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 232, "tokens_per_second_per_gpu": 21824.89 }, { "epoch": 0.005825, "grad_norm": 0.8203125, "learning_rate": 0.000348, "loss": 6.4051, "memory/device_reserved (GiB)": 75.81, "memory/max_active (GiB)": 75.79, "memory/max_allocated (GiB)": 75.79, "step": 233, "tokens_per_second_per_gpu": 16925.67 }, { "epoch": 0.00585, "grad_norm": 2.1875, "learning_rate": 0.00034950000000000004, "loss": 6.3973, "memory/device_reserved (GiB)": 66.36, "memory/max_active (GiB)": 66.15, "memory/max_allocated (GiB)": 66.15, "step": 234, "tokens_per_second_per_gpu": 20236.96 }, { "epoch": 0.005875, "grad_norm": 1.6640625, "learning_rate": 0.000351, "loss": 6.3834, "memory/device_reserved (GiB)": 75.99, "memory/max_active (GiB)": 75.82, "memory/max_allocated (GiB)": 75.82, "step": 235, "tokens_per_second_per_gpu": 17484.29 }, { "epoch": 0.0059, "grad_norm": 0.99609375, "learning_rate": 0.0003525, "loss": 6.3811, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 236, "tokens_per_second_per_gpu": 11144.8 }, { "epoch": 0.005925, "grad_norm": 4.84375, "learning_rate": 0.000354, "loss": 6.3751, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 237, "tokens_per_second_per_gpu": 10733.47 }, { "epoch": 0.00595, "grad_norm": 2.65625, "learning_rate": 0.00035549999999999997, "loss": 6.361, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 238, "tokens_per_second_per_gpu": 17334.73 }, { "epoch": 0.005975, "grad_norm": 4.46875, "learning_rate": 0.000357, "loss": 6.3772, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 239, "tokens_per_second_per_gpu": 10780.3 }, { "epoch": 0.006, "grad_norm": 2.140625, "learning_rate": 0.0003585, "loss": 6.3772, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 240, "tokens_per_second_per_gpu": 15482.53 }, { "epoch": 0.006025, "grad_norm": 2.015625, "learning_rate": 0.00035999999999999997, "loss": 6.3289, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 241, "tokens_per_second_per_gpu": 13489.74 }, { "epoch": 0.00605, "grad_norm": 1.46875, "learning_rate": 0.0003615, "loss": 6.3249, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 242, "tokens_per_second_per_gpu": 21995.04 }, { "epoch": 0.006075, "grad_norm": 2.546875, "learning_rate": 0.000363, "loss": 6.3271, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 243, "tokens_per_second_per_gpu": 13945.52 }, { "epoch": 0.0061, "grad_norm": 2.15625, "learning_rate": 0.0003645, "loss": 6.3476, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 244, "tokens_per_second_per_gpu": 10482.63 }, { "epoch": 0.006125, "grad_norm": 1.5546875, "learning_rate": 0.000366, "loss": 6.3106, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 245, "tokens_per_second_per_gpu": 27008.24 }, { "epoch": 0.00615, "grad_norm": 2.328125, "learning_rate": 0.0003675, "loss": 6.271, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 246, "tokens_per_second_per_gpu": 12620.12 }, { "epoch": 0.006175, "grad_norm": 1.34375, "learning_rate": 0.000369, "loss": 6.2885, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 247, "tokens_per_second_per_gpu": 15969.03 }, { "epoch": 0.0062, "grad_norm": 2.078125, "learning_rate": 0.0003705, "loss": 6.2552, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 248, "tokens_per_second_per_gpu": 10710.12 }, { "epoch": 0.006225, "grad_norm": 1.65625, "learning_rate": 0.000372, "loss": 6.2701, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 249, "tokens_per_second_per_gpu": 19292.23 }, { "epoch": 0.00625, "grad_norm": 2.0625, "learning_rate": 0.0003735, "loss": 6.2288, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 250, "tokens_per_second_per_gpu": 23566.66 }, { "epoch": 0.006275, "grad_norm": 0.81640625, "learning_rate": 0.000375, "loss": 6.2518, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 251, "tokens_per_second_per_gpu": 13278.48 }, { "epoch": 0.0063, "grad_norm": 1.4765625, "learning_rate": 0.0003765, "loss": 6.2485, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 252, "tokens_per_second_per_gpu": 14448.19 }, { "epoch": 0.006325, "grad_norm": 1.8125, "learning_rate": 0.000378, "loss": 6.2229, "memory/device_reserved (GiB)": 75.81, "memory/max_active (GiB)": 75.79, "memory/max_allocated (GiB)": 75.79, "step": 253, "tokens_per_second_per_gpu": 17067.56 }, { "epoch": 0.00635, "grad_norm": 1.4140625, "learning_rate": 0.0003795, "loss": 6.1781, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 254, "tokens_per_second_per_gpu": 14953.78 }, { "epoch": 0.006375, "grad_norm": 1.15625, "learning_rate": 0.000381, "loss": 6.1929, "memory/device_reserved (GiB)": 105.95, "memory/max_active (GiB)": 105.92, "memory/max_allocated (GiB)": 105.92, "step": 255, "tokens_per_second_per_gpu": 13283.01 }, { "epoch": 0.0064, "grad_norm": 2.375, "learning_rate": 0.00038250000000000003, "loss": 6.2004, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 256, "tokens_per_second_per_gpu": 16219.15 }, { "epoch": 0.006425, "grad_norm": 1.3671875, "learning_rate": 0.000384, "loss": 6.1678, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 257, "tokens_per_second_per_gpu": 22812.31 }, { "epoch": 0.00645, "grad_norm": 0.6640625, "learning_rate": 0.0003855, "loss": 6.1649, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 258, "tokens_per_second_per_gpu": 11776.11 }, { "epoch": 0.006475, "grad_norm": 1.4375, "learning_rate": 0.00038700000000000003, "loss": 6.1555, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 259, "tokens_per_second_per_gpu": 24243.64 }, { "epoch": 0.0065, "grad_norm": 3.03125, "learning_rate": 0.0003885, "loss": 6.1646, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 260, "tokens_per_second_per_gpu": 11462.77 }, { "epoch": 0.006525, "grad_norm": 1.1875, "learning_rate": 0.00039000000000000005, "loss": 6.1184, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 261, "tokens_per_second_per_gpu": 21325.28 }, { "epoch": 0.00655, "grad_norm": 1.8671875, "learning_rate": 0.00039150000000000003, "loss": 6.1366, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 262, "tokens_per_second_per_gpu": 16384.71 }, { "epoch": 0.006575, "grad_norm": 2.578125, "learning_rate": 0.000393, "loss": 6.1101, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 263, "tokens_per_second_per_gpu": 20899.4 }, { "epoch": 0.0066, "grad_norm": 1.1640625, "learning_rate": 0.00039450000000000005, "loss": 6.1165, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 264, "tokens_per_second_per_gpu": 20700.35 }, { "epoch": 0.006625, "grad_norm": 3.6875, "learning_rate": 0.00039600000000000003, "loss": 6.1216, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 265, "tokens_per_second_per_gpu": 20662.3 }, { "epoch": 0.00665, "grad_norm": 2.0625, "learning_rate": 0.0003975, "loss": 6.1067, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 266, "tokens_per_second_per_gpu": 28170.42 }, { "epoch": 0.006675, "grad_norm": 3.21875, "learning_rate": 0.00039900000000000005, "loss": 6.1037, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 267, "tokens_per_second_per_gpu": 20687.04 }, { "epoch": 0.0067, "grad_norm": 2.828125, "learning_rate": 0.00040050000000000003, "loss": 6.0967, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 268, "tokens_per_second_per_gpu": 15553.67 }, { "epoch": 0.006725, "grad_norm": 1.9375, "learning_rate": 0.000402, "loss": 6.1002, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 269, "tokens_per_second_per_gpu": 12872.5 }, { "epoch": 0.00675, "grad_norm": 2.484375, "learning_rate": 0.00040350000000000005, "loss": 6.0722, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 270, "tokens_per_second_per_gpu": 23894.25 }, { "epoch": 0.006775, "grad_norm": 1.71875, "learning_rate": 0.00040500000000000003, "loss": 6.0557, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 271, "tokens_per_second_per_gpu": 14989.36 }, { "epoch": 0.0068, "grad_norm": 1.75, "learning_rate": 0.0004065, "loss": 6.0969, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.42, "memory/max_allocated (GiB)": 107.42, "step": 272, "tokens_per_second_per_gpu": 12358.27 }, { "epoch": 0.006825, "grad_norm": 3.390625, "learning_rate": 0.00040800000000000005, "loss": 6.0441, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 273, "tokens_per_second_per_gpu": 11998.63 }, { "epoch": 0.00685, "grad_norm": 3.03125, "learning_rate": 0.00040950000000000003, "loss": 6.064, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 274, "tokens_per_second_per_gpu": 13516.06 }, { "epoch": 0.006875, "grad_norm": 0.921875, "learning_rate": 0.000411, "loss": 6.0358, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 275, "tokens_per_second_per_gpu": 18393.16 }, { "epoch": 0.0069, "grad_norm": 1.78125, "learning_rate": 0.00041250000000000005, "loss": 6.0143, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 276, "tokens_per_second_per_gpu": 13892.95 }, { "epoch": 0.006925, "grad_norm": 1.3828125, "learning_rate": 0.00041400000000000003, "loss": 6.027, "memory/device_reserved (GiB)": 55.57, "memory/max_active (GiB)": 55.42, "memory/max_allocated (GiB)": 55.42, "step": 277, "tokens_per_second_per_gpu": 23646.87 }, { "epoch": 0.00695, "grad_norm": 0.56640625, "learning_rate": 0.00041550000000000007, "loss": 6.0531, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.22, "memory/max_allocated (GiB)": 97.22, "step": 278, "tokens_per_second_per_gpu": 13443.08 }, { "epoch": 0.006975, "grad_norm": 1.6953125, "learning_rate": 0.00041700000000000005, "loss": 6.0202, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 279, "tokens_per_second_per_gpu": 14926.19 }, { "epoch": 0.007, "grad_norm": 1.8828125, "learning_rate": 0.00041850000000000004, "loss": 6.0194, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 280, "tokens_per_second_per_gpu": 13202.63 }, { "epoch": 0.007025, "grad_norm": 3.09375, "learning_rate": 0.00042000000000000007, "loss": 6.0073, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 281, "tokens_per_second_per_gpu": 10819.47 }, { "epoch": 0.00705, "grad_norm": 1.4453125, "learning_rate": 0.00042150000000000005, "loss": 5.9679, "memory/device_reserved (GiB)": 65.6, "memory/max_active (GiB)": 65.59, "memory/max_allocated (GiB)": 65.59, "step": 282, "tokens_per_second_per_gpu": 20483.99 }, { "epoch": 0.007075, "grad_norm": 2.59375, "learning_rate": 0.000423, "loss": 5.9946, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 283, "tokens_per_second_per_gpu": 16783.38 }, { "epoch": 0.0071, "grad_norm": 1.4921875, "learning_rate": 0.00042449999999999996, "loss": 5.9906, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 284, "tokens_per_second_per_gpu": 17912.03 }, { "epoch": 0.007125, "grad_norm": 2.3125, "learning_rate": 0.00042599999999999995, "loss": 5.937, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 285, "tokens_per_second_per_gpu": 17377.43 }, { "epoch": 0.00715, "grad_norm": 1.359375, "learning_rate": 0.0004275, "loss": 5.9561, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 286, "tokens_per_second_per_gpu": 17684.12 }, { "epoch": 0.007175, "grad_norm": 1.7578125, "learning_rate": 0.00042899999999999997, "loss": 5.9556, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 287, "tokens_per_second_per_gpu": 12247.94 }, { "epoch": 0.0072, "grad_norm": 1.5546875, "learning_rate": 0.0004305, "loss": 5.9822, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 288, "tokens_per_second_per_gpu": 22691.03 }, { "epoch": 0.007225, "grad_norm": 2.453125, "learning_rate": 0.000432, "loss": 5.9453, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 289, "tokens_per_second_per_gpu": 27821.06 }, { "epoch": 0.00725, "grad_norm": 1.5390625, "learning_rate": 0.00043349999999999997, "loss": 5.9294, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 290, "tokens_per_second_per_gpu": 13001.78 }, { "epoch": 0.007275, "grad_norm": 1.6796875, "learning_rate": 0.000435, "loss": 5.918, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 291, "tokens_per_second_per_gpu": 13674.18 }, { "epoch": 0.0073, "grad_norm": 2.1875, "learning_rate": 0.0004365, "loss": 5.8812, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 292, "tokens_per_second_per_gpu": 16468.28 }, { "epoch": 0.007325, "grad_norm": 2.234375, "learning_rate": 0.00043799999999999997, "loss": 5.923, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 293, "tokens_per_second_per_gpu": 11320.62 }, { "epoch": 0.00735, "grad_norm": 3.40625, "learning_rate": 0.0004395, "loss": 5.9529, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 294, "tokens_per_second_per_gpu": 15543.82 }, { "epoch": 0.007375, "grad_norm": 1.609375, "learning_rate": 0.000441, "loss": 5.8957, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 295, "tokens_per_second_per_gpu": 24001.72 }, { "epoch": 0.0074, "grad_norm": 4.53125, "learning_rate": 0.00044249999999999997, "loss": 5.9546, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 296, "tokens_per_second_per_gpu": 29039.42 }, { "epoch": 0.007425, "grad_norm": 2.8125, "learning_rate": 0.000444, "loss": 5.9206, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 297, "tokens_per_second_per_gpu": 17603.25 }, { "epoch": 0.00745, "grad_norm": 1.671875, "learning_rate": 0.0004455, "loss": 5.9082, "memory/device_reserved (GiB)": 96.22, "memory/max_active (GiB)": 96.19, "memory/max_allocated (GiB)": 96.19, "step": 298, "tokens_per_second_per_gpu": 14518.78 }, { "epoch": 0.007475, "grad_norm": 2.15625, "learning_rate": 0.00044699999999999997, "loss": 5.8941, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 299, "tokens_per_second_per_gpu": 17418.19 }, { "epoch": 0.0075, "grad_norm": 3.359375, "learning_rate": 0.0004485, "loss": 5.9427, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 300, "tokens_per_second_per_gpu": 10391.59 }, { "epoch": 0.007525, "grad_norm": 1.5703125, "learning_rate": 0.00045, "loss": 5.8692, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 301, "tokens_per_second_per_gpu": 12700.79 }, { "epoch": 0.00755, "grad_norm": 5.375, "learning_rate": 0.00045149999999999997, "loss": 5.9102, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 302, "tokens_per_second_per_gpu": 17700.47 }, { "epoch": 0.007575, "grad_norm": 1.7734375, "learning_rate": 0.000453, "loss": 5.8898, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 303, "tokens_per_second_per_gpu": 13903.07 }, { "epoch": 0.0076, "grad_norm": 3.734375, "learning_rate": 0.0004545, "loss": 5.8984, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 304, "tokens_per_second_per_gpu": 15661.73 }, { "epoch": 0.007625, "grad_norm": 1.5625, "learning_rate": 0.000456, "loss": 5.8876, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 305, "tokens_per_second_per_gpu": 18629.37 }, { "epoch": 0.00765, "grad_norm": 3.109375, "learning_rate": 0.0004575, "loss": 5.8696, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 306, "tokens_per_second_per_gpu": 21176.8 }, { "epoch": 0.007675, "grad_norm": 1.90625, "learning_rate": 0.000459, "loss": 5.8622, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 307, "tokens_per_second_per_gpu": 20872.03 }, { "epoch": 0.0077, "grad_norm": 1.8828125, "learning_rate": 0.0004605, "loss": 5.8359, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 308, "tokens_per_second_per_gpu": 28337.42 }, { "epoch": 0.007725, "grad_norm": 3.09375, "learning_rate": 0.000462, "loss": 5.8549, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 309, "tokens_per_second_per_gpu": 16396.84 }, { "epoch": 0.00775, "grad_norm": 1.3203125, "learning_rate": 0.0004635, "loss": 5.8754, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 310, "tokens_per_second_per_gpu": 17129.48 }, { "epoch": 0.007775, "grad_norm": 2.609375, "learning_rate": 0.000465, "loss": 5.8323, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 311, "tokens_per_second_per_gpu": 26610.75 }, { "epoch": 0.0078, "grad_norm": 2.453125, "learning_rate": 0.0004665, "loss": 5.8602, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 312, "tokens_per_second_per_gpu": 12657.25 }, { "epoch": 0.007825, "grad_norm": 1.9609375, "learning_rate": 0.000468, "loss": 5.7925, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 313, "tokens_per_second_per_gpu": 17932.7 }, { "epoch": 0.00785, "grad_norm": 2.109375, "learning_rate": 0.00046950000000000003, "loss": 5.8132, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 314, "tokens_per_second_per_gpu": 16799.33 }, { "epoch": 0.007875, "grad_norm": 1.421875, "learning_rate": 0.000471, "loss": 5.7892, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 315, "tokens_per_second_per_gpu": 18141.5 }, { "epoch": 0.0079, "grad_norm": 1.7109375, "learning_rate": 0.0004725, "loss": 5.786, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 316, "tokens_per_second_per_gpu": 23874.72 }, { "epoch": 0.007925, "grad_norm": 2.09375, "learning_rate": 0.00047400000000000003, "loss": 5.8001, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 317, "tokens_per_second_per_gpu": 17655.22 }, { "epoch": 0.00795, "grad_norm": 1.359375, "learning_rate": 0.0004755, "loss": 5.7977, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 318, "tokens_per_second_per_gpu": 20234.84 }, { "epoch": 0.007975, "grad_norm": 1.2734375, "learning_rate": 0.000477, "loss": 5.7867, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.62, "memory/max_allocated (GiB)": 66.62, "step": 319, "tokens_per_second_per_gpu": 19123.55 }, { "epoch": 0.008, "grad_norm": 1.5859375, "learning_rate": 0.00047850000000000003, "loss": 5.7642, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 320, "tokens_per_second_per_gpu": 17129.43 }, { "epoch": 0.008025, "grad_norm": 0.9609375, "learning_rate": 0.00048, "loss": 5.77, "memory/device_reserved (GiB)": 116.82, "memory/max_active (GiB)": 116.63, "memory/max_allocated (GiB)": 116.63, "step": 321, "tokens_per_second_per_gpu": 11982.9 }, { "epoch": 0.00805, "grad_norm": 2.5625, "learning_rate": 0.0004815, "loss": 5.7734, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 322, "tokens_per_second_per_gpu": 11706.1 }, { "epoch": 0.008075, "grad_norm": 2.265625, "learning_rate": 0.00048300000000000003, "loss": 5.7813, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 323, "tokens_per_second_per_gpu": 11609.12 }, { "epoch": 0.0081, "grad_norm": 1.2421875, "learning_rate": 0.0004845, "loss": 5.7423, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 324, "tokens_per_second_per_gpu": 12947.85 }, { "epoch": 0.008125, "grad_norm": 2.734375, "learning_rate": 0.00048600000000000005, "loss": 5.7299, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 325, "tokens_per_second_per_gpu": 17295.23 }, { "epoch": 0.00815, "grad_norm": 1.09375, "learning_rate": 0.00048750000000000003, "loss": 5.715, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 326, "tokens_per_second_per_gpu": 12821.18 }, { "epoch": 0.008175, "grad_norm": 1.34375, "learning_rate": 0.0004890000000000001, "loss": 5.6944, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 327, "tokens_per_second_per_gpu": 18206.65 }, { "epoch": 0.0082, "grad_norm": 1.96875, "learning_rate": 0.0004905, "loss": 5.6995, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 328, "tokens_per_second_per_gpu": 12674.44 }, { "epoch": 0.008225, "grad_norm": 1.2890625, "learning_rate": 0.000492, "loss": 5.6914, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 329, "tokens_per_second_per_gpu": 24306.41 }, { "epoch": 0.00825, "grad_norm": 2.1875, "learning_rate": 0.0004935, "loss": 5.6915, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 330, "tokens_per_second_per_gpu": 22007.27 }, { "epoch": 0.008275, "grad_norm": 1.8828125, "learning_rate": 0.000495, "loss": 5.6937, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 331, "tokens_per_second_per_gpu": 20489.9 }, { "epoch": 0.0083, "grad_norm": 0.921875, "learning_rate": 0.0004965000000000001, "loss": 5.6585, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 332, "tokens_per_second_per_gpu": 25772.73 }, { "epoch": 0.008325, "grad_norm": 2.484375, "learning_rate": 0.0004980000000000001, "loss": 5.6552, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 333, "tokens_per_second_per_gpu": 13862.42 }, { "epoch": 0.00835, "grad_norm": 1.546875, "learning_rate": 0.0004995, "loss": 5.6913, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 334, "tokens_per_second_per_gpu": 15762.32 }, { "epoch": 0.008375, "grad_norm": 1.5546875, "learning_rate": 0.000501, "loss": 5.6442, "memory/device_reserved (GiB)": 95.34, "memory/max_active (GiB)": 95.31, "memory/max_allocated (GiB)": 95.31, "step": 335, "tokens_per_second_per_gpu": 14983.88 }, { "epoch": 0.0084, "grad_norm": 1.671875, "learning_rate": 0.0005025, "loss": 5.6312, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 336, "tokens_per_second_per_gpu": 13351.84 }, { "epoch": 0.008425, "grad_norm": 1.40625, "learning_rate": 0.000504, "loss": 5.6469, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 337, "tokens_per_second_per_gpu": 22869.77 }, { "epoch": 0.00845, "grad_norm": 2.171875, "learning_rate": 0.0005055000000000001, "loss": 5.6419, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 338, "tokens_per_second_per_gpu": 17643.47 }, { "epoch": 0.008475, "grad_norm": 2.203125, "learning_rate": 0.0005070000000000001, "loss": 5.6344, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 339, "tokens_per_second_per_gpu": 11028.47 }, { "epoch": 0.0085, "grad_norm": 1.40625, "learning_rate": 0.0005085000000000001, "loss": 5.6111, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 340, "tokens_per_second_per_gpu": 18176.67 }, { "epoch": 0.008525, "grad_norm": 2.203125, "learning_rate": 0.00051, "loss": 5.6515, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 341, "tokens_per_second_per_gpu": 13632.46 }, { "epoch": 0.00855, "grad_norm": 1.5703125, "learning_rate": 0.0005115, "loss": 5.5968, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 342, "tokens_per_second_per_gpu": 23112.23 }, { "epoch": 0.008575, "grad_norm": 1.4453125, "learning_rate": 0.000513, "loss": 5.5629, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 343, "tokens_per_second_per_gpu": 20206.81 }, { "epoch": 0.0086, "grad_norm": 3.296875, "learning_rate": 0.0005145000000000001, "loss": 5.5977, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 344, "tokens_per_second_per_gpu": 14975.98 }, { "epoch": 0.008625, "grad_norm": 1.109375, "learning_rate": 0.000516, "loss": 5.5893, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 345, "tokens_per_second_per_gpu": 14113.65 }, { "epoch": 0.00865, "grad_norm": 3.28125, "learning_rate": 0.0005175, "loss": 5.6231, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 346, "tokens_per_second_per_gpu": 20693.41 }, { "epoch": 0.008675, "grad_norm": 1.8125, "learning_rate": 0.0005189999999999999, "loss": 5.6164, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 347, "tokens_per_second_per_gpu": 23825.52 }, { "epoch": 0.0087, "grad_norm": 1.5390625, "learning_rate": 0.0005205, "loss": 5.5675, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 348, "tokens_per_second_per_gpu": 16857.89 }, { "epoch": 0.008725, "grad_norm": 2.75, "learning_rate": 0.000522, "loss": 5.5807, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 349, "tokens_per_second_per_gpu": 28326.39 }, { "epoch": 0.00875, "grad_norm": 1.9921875, "learning_rate": 0.0005235, "loss": 5.6311, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 350, "tokens_per_second_per_gpu": 12109.1 }, { "epoch": 0.008775, "grad_norm": 3.203125, "learning_rate": 0.000525, "loss": 5.6523, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 351, "tokens_per_second_per_gpu": 16000.25 }, { "epoch": 0.0088, "grad_norm": 1.6640625, "learning_rate": 0.0005265, "loss": 5.5537, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 352, "tokens_per_second_per_gpu": 14934.51 }, { "epoch": 0.008825, "grad_norm": 3.15625, "learning_rate": 0.0005279999999999999, "loss": 5.6061, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 353, "tokens_per_second_per_gpu": 16549.75 }, { "epoch": 0.00885, "grad_norm": 1.5234375, "learning_rate": 0.0005295, "loss": 5.6041, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 354, "tokens_per_second_per_gpu": 16705.98 }, { "epoch": 0.008875, "grad_norm": 1.109375, "learning_rate": 0.000531, "loss": 5.5725, "memory/device_reserved (GiB)": 116.82, "memory/max_active (GiB)": 116.63, "memory/max_allocated (GiB)": 116.63, "step": 355, "tokens_per_second_per_gpu": 11265.66 }, { "epoch": 0.0089, "grad_norm": 3.453125, "learning_rate": 0.0005325, "loss": 5.5823, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 356, "tokens_per_second_per_gpu": 14535.57 }, { "epoch": 0.008925, "grad_norm": 1.703125, "learning_rate": 0.000534, "loss": 5.5609, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 357, "tokens_per_second_per_gpu": 15244.71 }, { "epoch": 0.00895, "grad_norm": 2.03125, "learning_rate": 0.0005355, "loss": 5.566, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 358, "tokens_per_second_per_gpu": 21337.02 }, { "epoch": 0.008975, "grad_norm": 2.78125, "learning_rate": 0.000537, "loss": 5.5426, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 359, "tokens_per_second_per_gpu": 15241.54 }, { "epoch": 0.009, "grad_norm": 1.90625, "learning_rate": 0.0005385, "loss": 5.5765, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 360, "tokens_per_second_per_gpu": 15529.99 }, { "epoch": 0.009025, "grad_norm": 1.3046875, "learning_rate": 0.00054, "loss": 5.54, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 361, "tokens_per_second_per_gpu": 24004.0 }, { "epoch": 0.00905, "grad_norm": 2.078125, "learning_rate": 0.0005415, "loss": 5.5417, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 362, "tokens_per_second_per_gpu": 15780.65 }, { "epoch": 0.009075, "grad_norm": 1.7890625, "learning_rate": 0.000543, "loss": 5.5005, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 363, "tokens_per_second_per_gpu": 16430.9 }, { "epoch": 0.0091, "grad_norm": 1.1796875, "learning_rate": 0.0005445, "loss": 5.5036, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 364, "tokens_per_second_per_gpu": 18590.79 }, { "epoch": 0.009125, "grad_norm": 2.546875, "learning_rate": 0.000546, "loss": 5.5181, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 365, "tokens_per_second_per_gpu": 23522.39 }, { "epoch": 0.00915, "grad_norm": 1.4453125, "learning_rate": 0.0005475, "loss": 5.4785, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 366, "tokens_per_second_per_gpu": 24475.69 }, { "epoch": 0.009175, "grad_norm": 1.78125, "learning_rate": 0.000549, "loss": 5.5404, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 367, "tokens_per_second_per_gpu": 26815.12 }, { "epoch": 0.0092, "grad_norm": 1.375, "learning_rate": 0.0005505, "loss": 5.504, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 368, "tokens_per_second_per_gpu": 20778.02 }, { "epoch": 0.009225, "grad_norm": 1.4296875, "learning_rate": 0.000552, "loss": 5.4814, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 369, "tokens_per_second_per_gpu": 14316.08 }, { "epoch": 0.00925, "grad_norm": 2.609375, "learning_rate": 0.0005535, "loss": 5.511, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 370, "tokens_per_second_per_gpu": 10701.49 }, { "epoch": 0.009275, "grad_norm": 1.7421875, "learning_rate": 0.000555, "loss": 5.4941, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 371, "tokens_per_second_per_gpu": 14128.74 }, { "epoch": 0.0093, "grad_norm": 1.5546875, "learning_rate": 0.0005565, "loss": 5.5033, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 372, "tokens_per_second_per_gpu": 14965.08 }, { "epoch": 0.009325, "grad_norm": 1.6875, "learning_rate": 0.000558, "loss": 5.4621, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 373, "tokens_per_second_per_gpu": 22609.52 }, { "epoch": 0.00935, "grad_norm": 1.9375, "learning_rate": 0.0005595, "loss": 5.4817, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 374, "tokens_per_second_per_gpu": 11197.35 }, { "epoch": 0.009375, "grad_norm": 1.0, "learning_rate": 0.000561, "loss": 5.4396, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 375, "tokens_per_second_per_gpu": 28581.16 }, { "epoch": 0.0094, "grad_norm": 2.71875, "learning_rate": 0.0005625000000000001, "loss": 5.4588, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 376, "tokens_per_second_per_gpu": 12821.04 }, { "epoch": 0.009425, "grad_norm": 1.6796875, "learning_rate": 0.000564, "loss": 5.4783, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 377, "tokens_per_second_per_gpu": 28157.5 }, { "epoch": 0.00945, "grad_norm": 3.578125, "learning_rate": 0.0005655, "loss": 5.4974, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 378, "tokens_per_second_per_gpu": 10621.37 }, { "epoch": 0.009475, "grad_norm": 2.1875, "learning_rate": 0.000567, "loss": 5.4746, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 379, "tokens_per_second_per_gpu": 24821.19 }, { "epoch": 0.0095, "grad_norm": 1.3671875, "learning_rate": 0.0005685, "loss": 5.4594, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 380, "tokens_per_second_per_gpu": 12329.99 }, { "epoch": 0.009525, "grad_norm": 2.984375, "learning_rate": 0.00057, "loss": 5.4521, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 381, "tokens_per_second_per_gpu": 28262.59 }, { "epoch": 0.00955, "grad_norm": 1.65625, "learning_rate": 0.0005715000000000001, "loss": 5.4569, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 382, "tokens_per_second_per_gpu": 10720.31 }, { "epoch": 0.009575, "grad_norm": 2.890625, "learning_rate": 0.000573, "loss": 5.4517, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 383, "tokens_per_second_per_gpu": 16377.78 }, { "epoch": 0.0096, "grad_norm": 1.375, "learning_rate": 0.0005745, "loss": 5.4428, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 384, "tokens_per_second_per_gpu": 11858.18 }, { "epoch": 0.009625, "grad_norm": 1.7578125, "learning_rate": 0.000576, "loss": 5.4415, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 385, "tokens_per_second_per_gpu": 27428.29 }, { "epoch": 0.00965, "grad_norm": 1.1484375, "learning_rate": 0.0005775, "loss": 5.4172, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 386, "tokens_per_second_per_gpu": 12872.46 }, { "epoch": 0.009675, "grad_norm": 1.8984375, "learning_rate": 0.000579, "loss": 5.4366, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 387, "tokens_per_second_per_gpu": 13022.41 }, { "epoch": 0.0097, "grad_norm": 0.99609375, "learning_rate": 0.0005805000000000001, "loss": 5.3766, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 388, "tokens_per_second_per_gpu": 12718.47 }, { "epoch": 0.009725, "grad_norm": 1.5859375, "learning_rate": 0.000582, "loss": 5.4005, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 389, "tokens_per_second_per_gpu": 11694.71 }, { "epoch": 0.00975, "grad_norm": 1.28125, "learning_rate": 0.0005835, "loss": 5.3742, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 390, "tokens_per_second_per_gpu": 24283.15 }, { "epoch": 0.009775, "grad_norm": 1.015625, "learning_rate": 0.000585, "loss": 5.375, "memory/device_reserved (GiB)": 64.28, "memory/max_active (GiB)": 64.24, "memory/max_allocated (GiB)": 64.24, "step": 391, "tokens_per_second_per_gpu": 19818.7 }, { "epoch": 0.0098, "grad_norm": 0.98046875, "learning_rate": 0.0005865, "loss": 5.3742, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 392, "tokens_per_second_per_gpu": 18020.46 }, { "epoch": 0.009825, "grad_norm": 1.765625, "learning_rate": 0.000588, "loss": 5.3619, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 393, "tokens_per_second_per_gpu": 17465.06 }, { "epoch": 0.00985, "grad_norm": 1.53125, "learning_rate": 0.0005895000000000001, "loss": 5.3528, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 394, "tokens_per_second_per_gpu": 15511.58 }, { "epoch": 0.009875, "grad_norm": 1.265625, "learning_rate": 0.000591, "loss": 5.3576, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 395, "tokens_per_second_per_gpu": 13178.37 }, { "epoch": 0.0099, "grad_norm": 1.703125, "learning_rate": 0.0005925, "loss": 5.3456, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 396, "tokens_per_second_per_gpu": 16060.13 }, { "epoch": 0.009925, "grad_norm": 1.4921875, "learning_rate": 0.000594, "loss": 5.3503, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 397, "tokens_per_second_per_gpu": 20178.48 }, { "epoch": 0.00995, "grad_norm": 1.7734375, "learning_rate": 0.0005955, "loss": 5.3191, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 398, "tokens_per_second_per_gpu": 18053.95 }, { "epoch": 0.009975, "grad_norm": 1.484375, "learning_rate": 0.0005970000000000001, "loss": 5.3211, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 399, "tokens_per_second_per_gpu": 19582.96 }, { "epoch": 0.01, "grad_norm": 1.1171875, "learning_rate": 0.0005985000000000001, "loss": 5.2943, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 400, "tokens_per_second_per_gpu": 27247.94 }, { "epoch": 0.010025, "grad_norm": 1.5234375, "learning_rate": 0.0006000000000000001, "loss": 5.288, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 401, "tokens_per_second_per_gpu": 19826.97 }, { "epoch": 0.01005, "grad_norm": 0.9296875, "learning_rate": 0.0006015, "loss": 5.3066, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 402, "tokens_per_second_per_gpu": 29227.09 }, { "epoch": 0.010075, "grad_norm": 1.609375, "learning_rate": 0.000603, "loss": 5.2984, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 403, "tokens_per_second_per_gpu": 22390.36 }, { "epoch": 0.0101, "grad_norm": 1.578125, "learning_rate": 0.0006045, "loss": 5.3217, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 404, "tokens_per_second_per_gpu": 13644.97 }, { "epoch": 0.010125, "grad_norm": 1.4765625, "learning_rate": 0.0006060000000000001, "loss": 5.2995, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 405, "tokens_per_second_per_gpu": 12044.35 }, { "epoch": 0.01015, "grad_norm": 0.90625, "learning_rate": 0.0006075000000000001, "loss": 5.256, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 406, "tokens_per_second_per_gpu": 17645.88 }, { "epoch": 0.010175, "grad_norm": 1.203125, "learning_rate": 0.0006090000000000001, "loss": 5.2678, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 407, "tokens_per_second_per_gpu": 22583.62 }, { "epoch": 0.0102, "grad_norm": 1.4140625, "learning_rate": 0.0006104999999999999, "loss": 5.2801, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 408, "tokens_per_second_per_gpu": 23278.88 }, { "epoch": 0.010225, "grad_norm": 1.5, "learning_rate": 0.000612, "loss": 5.2774, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 409, "tokens_per_second_per_gpu": 19473.34 }, { "epoch": 0.01025, "grad_norm": 1.5859375, "learning_rate": 0.0006135, "loss": 5.2565, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 410, "tokens_per_second_per_gpu": 11045.47 }, { "epoch": 0.010275, "grad_norm": 1.703125, "learning_rate": 0.000615, "loss": 5.2468, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 411, "tokens_per_second_per_gpu": 10683.23 }, { "epoch": 0.0103, "grad_norm": 1.84375, "learning_rate": 0.0006165, "loss": 5.2826, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 412, "tokens_per_second_per_gpu": 14529.7 }, { "epoch": 0.010325, "grad_norm": 0.734375, "learning_rate": 0.000618, "loss": 5.2481, "memory/device_reserved (GiB)": 75.99, "memory/max_active (GiB)": 75.82, "memory/max_allocated (GiB)": 75.82, "step": 413, "tokens_per_second_per_gpu": 17593.25 }, { "epoch": 0.01035, "grad_norm": 1.7109375, "learning_rate": 0.0006194999999999999, "loss": 5.2232, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 414, "tokens_per_second_per_gpu": 17093.39 }, { "epoch": 0.010375, "grad_norm": 1.578125, "learning_rate": 0.000621, "loss": 5.2343, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 415, "tokens_per_second_per_gpu": 23955.92 }, { "epoch": 0.0104, "grad_norm": 1.34375, "learning_rate": 0.0006225, "loss": 5.2171, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 416, "tokens_per_second_per_gpu": 20826.03 }, { "epoch": 0.010425, "grad_norm": 2.140625, "learning_rate": 0.000624, "loss": 5.2358, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 417, "tokens_per_second_per_gpu": 13142.25 }, { "epoch": 0.01045, "grad_norm": 0.9609375, "learning_rate": 0.0006255, "loss": 5.2269, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 418, "tokens_per_second_per_gpu": 21967.82 }, { "epoch": 0.010475, "grad_norm": 1.765625, "learning_rate": 0.000627, "loss": 5.2265, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 419, "tokens_per_second_per_gpu": 21095.14 }, { "epoch": 0.0105, "grad_norm": 1.4921875, "learning_rate": 0.0006284999999999999, "loss": 5.2096, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 420, "tokens_per_second_per_gpu": 10749.11 }, { "epoch": 0.010525, "grad_norm": 1.8671875, "learning_rate": 0.00063, "loss": 5.2376, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 421, "tokens_per_second_per_gpu": 12669.53 }, { "epoch": 0.01055, "grad_norm": 1.109375, "learning_rate": 0.0006315, "loss": 5.2109, "memory/device_reserved (GiB)": 35.74, "memory/max_active (GiB)": 35.55, "memory/max_allocated (GiB)": 35.55, "step": 422, "tokens_per_second_per_gpu": 32527.74 }, { "epoch": 0.010575, "grad_norm": 1.90625, "learning_rate": 0.000633, "loss": 5.254, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 423, "tokens_per_second_per_gpu": 13882.72 }, { "epoch": 0.0106, "grad_norm": 1.2421875, "learning_rate": 0.0006345, "loss": 5.2145, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 424, "tokens_per_second_per_gpu": 13425.9 }, { "epoch": 0.010625, "grad_norm": 1.84375, "learning_rate": 0.000636, "loss": 5.2325, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 425, "tokens_per_second_per_gpu": 13528.43 }, { "epoch": 0.01065, "grad_norm": 1.9765625, "learning_rate": 0.0006375, "loss": 5.2319, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 426, "tokens_per_second_per_gpu": 13537.98 }, { "epoch": 0.010675, "grad_norm": 0.94921875, "learning_rate": 0.000639, "loss": 5.2119, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 427, "tokens_per_second_per_gpu": 13629.3 }, { "epoch": 0.0107, "grad_norm": 1.71875, "learning_rate": 0.0006405, "loss": 5.1756, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 428, "tokens_per_second_per_gpu": 12033.44 }, { "epoch": 0.010725, "grad_norm": 1.71875, "learning_rate": 0.000642, "loss": 5.1711, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 429, "tokens_per_second_per_gpu": 11424.62 }, { "epoch": 0.01075, "grad_norm": 1.1328125, "learning_rate": 0.0006435, "loss": 5.1615, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 430, "tokens_per_second_per_gpu": 14307.07 }, { "epoch": 0.010775, "grad_norm": 1.7890625, "learning_rate": 0.000645, "loss": 5.1884, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 431, "tokens_per_second_per_gpu": 23292.92 }, { "epoch": 0.0108, "grad_norm": 0.99609375, "learning_rate": 0.0006465, "loss": 5.1263, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 432, "tokens_per_second_per_gpu": 12055.68 }, { "epoch": 0.010825, "grad_norm": 2.03125, "learning_rate": 0.000648, "loss": 5.1667, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 433, "tokens_per_second_per_gpu": 15814.35 }, { "epoch": 0.01085, "grad_norm": 1.0, "learning_rate": 0.0006495, "loss": 5.1792, "memory/device_reserved (GiB)": 35.75, "memory/max_active (GiB)": 35.55, "memory/max_allocated (GiB)": 35.55, "step": 434, "tokens_per_second_per_gpu": 31869.07 }, { "epoch": 0.010875, "grad_norm": 1.7890625, "learning_rate": 0.000651, "loss": 5.1774, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 435, "tokens_per_second_per_gpu": 17780.22 }, { "epoch": 0.0109, "grad_norm": 0.96484375, "learning_rate": 0.0006525, "loss": 5.1609, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 436, "tokens_per_second_per_gpu": 20990.56 }, { "epoch": 0.010925, "grad_norm": 1.21875, "learning_rate": 0.000654, "loss": 5.155, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 437, "tokens_per_second_per_gpu": 15721.11 }, { "epoch": 0.01095, "grad_norm": 1.8125, "learning_rate": 0.0006555, "loss": 5.1546, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 438, "tokens_per_second_per_gpu": 16032.3 }, { "epoch": 0.010975, "grad_norm": 1.09375, "learning_rate": 0.000657, "loss": 5.1691, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 439, "tokens_per_second_per_gpu": 14897.76 }, { "epoch": 0.011, "grad_norm": 1.5625, "learning_rate": 0.0006585, "loss": 5.1653, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 440, "tokens_per_second_per_gpu": 17592.32 }, { "epoch": 0.011025, "grad_norm": 1.6171875, "learning_rate": 0.00066, "loss": 5.1145, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 441, "tokens_per_second_per_gpu": 15542.88 }, { "epoch": 0.01105, "grad_norm": 1.5546875, "learning_rate": 0.0006615, "loss": 5.1483, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 442, "tokens_per_second_per_gpu": 15714.0 }, { "epoch": 0.011075, "grad_norm": 0.85546875, "learning_rate": 0.0006630000000000001, "loss": 5.123, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 443, "tokens_per_second_per_gpu": 22852.44 }, { "epoch": 0.0111, "grad_norm": 1.25, "learning_rate": 0.0006645, "loss": 5.101, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 444, "tokens_per_second_per_gpu": 23724.21 }, { "epoch": 0.011125, "grad_norm": 1.3203125, "learning_rate": 0.000666, "loss": 5.1005, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 445, "tokens_per_second_per_gpu": 23283.26 }, { "epoch": 0.01115, "grad_norm": 1.171875, "learning_rate": 0.0006675, "loss": 5.0967, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 446, "tokens_per_second_per_gpu": 16798.21 }, { "epoch": 0.011175, "grad_norm": 1.3359375, "learning_rate": 0.000669, "loss": 5.0599, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 447, "tokens_per_second_per_gpu": 16730.83 }, { "epoch": 0.0112, "grad_norm": 1.09375, "learning_rate": 0.0006705, "loss": 5.1002, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 448, "tokens_per_second_per_gpu": 23268.45 }, { "epoch": 0.011225, "grad_norm": 1.375, "learning_rate": 0.0006720000000000001, "loss": 5.0991, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 449, "tokens_per_second_per_gpu": 12080.1 }, { "epoch": 0.01125, "grad_norm": 1.296875, "learning_rate": 0.0006735, "loss": 5.0855, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 450, "tokens_per_second_per_gpu": 23444.1 }, { "epoch": 0.011275, "grad_norm": 1.875, "learning_rate": 0.000675, "loss": 5.0845, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 451, "tokens_per_second_per_gpu": 10597.13 }, { "epoch": 0.0113, "grad_norm": 1.46875, "learning_rate": 0.0006765, "loss": 5.0765, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 452, "tokens_per_second_per_gpu": 13661.84 }, { "epoch": 0.011325, "grad_norm": 1.421875, "learning_rate": 0.000678, "loss": 5.0778, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 453, "tokens_per_second_per_gpu": 17742.6 }, { "epoch": 0.01135, "grad_norm": 1.3125, "learning_rate": 0.0006795, "loss": 5.0534, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 454, "tokens_per_second_per_gpu": 13824.71 }, { "epoch": 0.011375, "grad_norm": 0.7890625, "learning_rate": 0.0006810000000000001, "loss": 5.0384, "memory/device_reserved (GiB)": 86.01, "memory/max_active (GiB)": 85.99, "memory/max_allocated (GiB)": 85.99, "step": 455, "tokens_per_second_per_gpu": 15958.68 }, { "epoch": 0.0114, "grad_norm": 1.1953125, "learning_rate": 0.0006825000000000001, "loss": 5.0726, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 456, "tokens_per_second_per_gpu": 22549.37 }, { "epoch": 0.011425, "grad_norm": 1.421875, "learning_rate": 0.000684, "loss": 5.0717, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 457, "tokens_per_second_per_gpu": 17454.27 }, { "epoch": 0.01145, "grad_norm": 1.2890625, "learning_rate": 0.0006855, "loss": 5.0392, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 458, "tokens_per_second_per_gpu": 10555.64 }, { "epoch": 0.011475, "grad_norm": 1.3671875, "learning_rate": 0.000687, "loss": 5.0364, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 459, "tokens_per_second_per_gpu": 15970.9 }, { "epoch": 0.0115, "grad_norm": 1.421875, "learning_rate": 0.0006885000000000001, "loss": 5.0505, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 460, "tokens_per_second_per_gpu": 19690.19 }, { "epoch": 0.011525, "grad_norm": 0.87890625, "learning_rate": 0.0006900000000000001, "loss": 5.0343, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 461, "tokens_per_second_per_gpu": 13411.87 }, { "epoch": 0.01155, "grad_norm": 1.21875, "learning_rate": 0.0006915000000000001, "loss": 5.0325, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 462, "tokens_per_second_per_gpu": 15477.74 }, { "epoch": 0.011575, "grad_norm": 1.1875, "learning_rate": 0.000693, "loss": 5.0414, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 463, "tokens_per_second_per_gpu": 24239.41 }, { "epoch": 0.0116, "grad_norm": 1.3515625, "learning_rate": 0.0006945, "loss": 5.0252, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 464, "tokens_per_second_per_gpu": 27795.12 }, { "epoch": 0.011625, "grad_norm": 1.3515625, "learning_rate": 0.000696, "loss": 5.0269, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 465, "tokens_per_second_per_gpu": 23769.84 }, { "epoch": 0.01165, "grad_norm": 1.1796875, "learning_rate": 0.0006975000000000001, "loss": 5.0095, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 466, "tokens_per_second_per_gpu": 12951.47 }, { "epoch": 0.011675, "grad_norm": 1.953125, "learning_rate": 0.0006990000000000001, "loss": 5.0224, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 467, "tokens_per_second_per_gpu": 19644.99 }, { "epoch": 0.0117, "grad_norm": 1.109375, "learning_rate": 0.0007005000000000001, "loss": 5.0523, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 468, "tokens_per_second_per_gpu": 13321.55 }, { "epoch": 0.011725, "grad_norm": 1.6328125, "learning_rate": 0.000702, "loss": 4.9968, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 469, "tokens_per_second_per_gpu": 16132.44 }, { "epoch": 0.01175, "grad_norm": 1.046875, "learning_rate": 0.0007035, "loss": 4.9927, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 470, "tokens_per_second_per_gpu": 23835.93 }, { "epoch": 0.011775, "grad_norm": 1.3203125, "learning_rate": 0.000705, "loss": 4.9889, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 471, "tokens_per_second_per_gpu": 11064.47 }, { "epoch": 0.0118, "grad_norm": 1.2734375, "learning_rate": 0.0007065, "loss": 5.0127, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 472, "tokens_per_second_per_gpu": 14467.37 }, { "epoch": 0.011825, "grad_norm": 1.265625, "learning_rate": 0.000708, "loss": 4.9693, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 473, "tokens_per_second_per_gpu": 19450.16 }, { "epoch": 0.01185, "grad_norm": 0.97265625, "learning_rate": 0.0007095, "loss": 4.9993, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 474, "tokens_per_second_per_gpu": 13284.67 }, { "epoch": 0.011875, "grad_norm": 0.953125, "learning_rate": 0.0007109999999999999, "loss": 4.996, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 475, "tokens_per_second_per_gpu": 26285.64 }, { "epoch": 0.0119, "grad_norm": 1.0078125, "learning_rate": 0.0007125, "loss": 4.9634, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 476, "tokens_per_second_per_gpu": 20365.05 }, { "epoch": 0.011925, "grad_norm": 1.265625, "learning_rate": 0.000714, "loss": 4.9965, "memory/device_reserved (GiB)": 86.01, "memory/max_active (GiB)": 85.99, "memory/max_allocated (GiB)": 85.99, "step": 477, "tokens_per_second_per_gpu": 16429.08 }, { "epoch": 0.01195, "grad_norm": 1.0703125, "learning_rate": 0.0007155, "loss": 4.9372, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 478, "tokens_per_second_per_gpu": 10905.38 }, { "epoch": 0.011975, "grad_norm": 2.34375, "learning_rate": 0.000717, "loss": 5.0128, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 479, "tokens_per_second_per_gpu": 14370.8 }, { "epoch": 0.012, "grad_norm": 1.0078125, "learning_rate": 0.0007185, "loss": 4.9353, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 480, "tokens_per_second_per_gpu": 20155.51 }, { "epoch": 0.012025, "grad_norm": 1.984375, "learning_rate": 0.0007199999999999999, "loss": 5.0033, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 481, "tokens_per_second_per_gpu": 10597.67 }, { "epoch": 0.01205, "grad_norm": 1.234375, "learning_rate": 0.0007215, "loss": 4.9886, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 482, "tokens_per_second_per_gpu": 21737.28 }, { "epoch": 0.012075, "grad_norm": 0.59765625, "learning_rate": 0.000723, "loss": 5.0065, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 483, "tokens_per_second_per_gpu": 12182.64 }, { "epoch": 0.0121, "grad_norm": 0.9765625, "learning_rate": 0.0007245, "loss": 4.9613, "memory/device_reserved (GiB)": 66.36, "memory/max_active (GiB)": 66.15, "memory/max_allocated (GiB)": 66.15, "step": 484, "tokens_per_second_per_gpu": 20178.81 }, { "epoch": 0.012125, "grad_norm": 0.80859375, "learning_rate": 0.000726, "loss": 4.9621, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 485, "tokens_per_second_per_gpu": 27862.89 }, { "epoch": 0.01215, "grad_norm": 1.046875, "learning_rate": 0.0007275, "loss": 5.008, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 486, "tokens_per_second_per_gpu": 11152.89 }, { "epoch": 0.012175, "grad_norm": 1.828125, "learning_rate": 0.000729, "loss": 4.9628, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 487, "tokens_per_second_per_gpu": 11021.42 }, { "epoch": 0.0122, "grad_norm": 1.15625, "learning_rate": 0.0007305, "loss": 4.91, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 488, "tokens_per_second_per_gpu": 17074.03 }, { "epoch": 0.012225, "grad_norm": 2.234375, "learning_rate": 0.000732, "loss": 4.9738, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 489, "tokens_per_second_per_gpu": 10616.35 }, { "epoch": 0.01225, "grad_norm": 1.0625, "learning_rate": 0.0007335, "loss": 4.9433, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 490, "tokens_per_second_per_gpu": 15533.06 }, { "epoch": 0.012275, "grad_norm": 1.65625, "learning_rate": 0.000735, "loss": 4.9302, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 491, "tokens_per_second_per_gpu": 13896.02 }, { "epoch": 0.0123, "grad_norm": 1.2265625, "learning_rate": 0.0007365, "loss": 4.954, "memory/device_reserved (GiB)": 65.6, "memory/max_active (GiB)": 65.59, "memory/max_allocated (GiB)": 65.59, "step": 492, "tokens_per_second_per_gpu": 19729.35 }, { "epoch": 0.012325, "grad_norm": 1.046875, "learning_rate": 0.000738, "loss": 4.9099, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 493, "tokens_per_second_per_gpu": 13845.8 }, { "epoch": 0.01235, "grad_norm": 1.3984375, "learning_rate": 0.0007395, "loss": 4.9399, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 494, "tokens_per_second_per_gpu": 10618.75 }, { "epoch": 0.012375, "grad_norm": 1.4609375, "learning_rate": 0.000741, "loss": 4.9179, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 495, "tokens_per_second_per_gpu": 27084.48 }, { "epoch": 0.0124, "grad_norm": 1.453125, "learning_rate": 0.0007425, "loss": 4.9334, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 496, "tokens_per_second_per_gpu": 12816.96 }, { "epoch": 0.012425, "grad_norm": 1.421875, "learning_rate": 0.000744, "loss": 4.9212, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 497, "tokens_per_second_per_gpu": 15751.8 }, { "epoch": 0.01245, "grad_norm": 1.765625, "learning_rate": 0.0007455, "loss": 4.9219, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 498, "tokens_per_second_per_gpu": 10833.31 }, { "epoch": 0.012475, "grad_norm": 1.4765625, "learning_rate": 0.000747, "loss": 4.8889, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 499, "tokens_per_second_per_gpu": 19113.86 }, { "epoch": 0.0125, "grad_norm": 1.2578125, "learning_rate": 0.0007485, "loss": 4.9049, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 500, "tokens_per_second_per_gpu": 23587.36 }, { "epoch": 0.012525, "grad_norm": 0.7265625, "learning_rate": 0.00075, "loss": 4.9078, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 501, "tokens_per_second_per_gpu": 13657.16 }, { "epoch": 0.01255, "grad_norm": 0.91796875, "learning_rate": 0.0007515, "loss": 4.886, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 502, "tokens_per_second_per_gpu": 14260.79 }, { "epoch": 0.012575, "grad_norm": 0.9453125, "learning_rate": 0.000753, "loss": 4.892, "memory/device_reserved (GiB)": 116.82, "memory/max_active (GiB)": 116.63, "memory/max_allocated (GiB)": 116.63, "step": 503, "tokens_per_second_per_gpu": 11839.29 }, { "epoch": 0.0126, "grad_norm": 1.609375, "learning_rate": 0.0007545000000000001, "loss": 4.8983, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 504, "tokens_per_second_per_gpu": 15066.39 }, { "epoch": 0.012625, "grad_norm": 0.94140625, "learning_rate": 0.000756, "loss": 4.8666, "memory/device_reserved (GiB)": 85.82, "memory/max_active (GiB)": 85.68, "memory/max_allocated (GiB)": 85.68, "step": 505, "tokens_per_second_per_gpu": 15483.14 }, { "epoch": 0.01265, "grad_norm": 1.1015625, "learning_rate": 0.0007575, "loss": 4.8921, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 506, "tokens_per_second_per_gpu": 15845.94 }, { "epoch": 0.012675, "grad_norm": 1.203125, "learning_rate": 0.000759, "loss": 4.9045, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 507, "tokens_per_second_per_gpu": 22449.83 }, { "epoch": 0.0127, "grad_norm": 0.65234375, "learning_rate": 0.0007605, "loss": 4.8885, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 508, "tokens_per_second_per_gpu": 11891.1 }, { "epoch": 0.012725, "grad_norm": 0.9375, "learning_rate": 0.000762, "loss": 4.8704, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 509, "tokens_per_second_per_gpu": 23913.13 }, { "epoch": 0.01275, "grad_norm": 1.2734375, "learning_rate": 0.0007635000000000001, "loss": 4.8923, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 510, "tokens_per_second_per_gpu": 11536.04 }, { "epoch": 0.012775, "grad_norm": 1.0625, "learning_rate": 0.0007650000000000001, "loss": 4.8894, "memory/device_reserved (GiB)": 55.41, "memory/max_active (GiB)": 55.39, "memory/max_allocated (GiB)": 55.39, "step": 511, "tokens_per_second_per_gpu": 23906.98 }, { "epoch": 0.0128, "grad_norm": 1.046875, "learning_rate": 0.0007665, "loss": 4.8705, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 512, "tokens_per_second_per_gpu": 16694.03 }, { "epoch": 0.012825, "grad_norm": 1.9609375, "learning_rate": 0.000768, "loss": 4.8346, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 513, "tokens_per_second_per_gpu": 20378.31 }, { "epoch": 0.01285, "grad_norm": 1.046875, "learning_rate": 0.0007695, "loss": 4.8485, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 514, "tokens_per_second_per_gpu": 20427.7 }, { "epoch": 0.012875, "grad_norm": 1.1328125, "learning_rate": 0.000771, "loss": 4.8488, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 515, "tokens_per_second_per_gpu": 19873.12 }, { "epoch": 0.0129, "grad_norm": 1.15625, "learning_rate": 0.0007725000000000001, "loss": 4.865, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 516, "tokens_per_second_per_gpu": 29061.12 }, { "epoch": 0.012925, "grad_norm": 1.40625, "learning_rate": 0.0007740000000000001, "loss": 4.8664, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 517, "tokens_per_second_per_gpu": 19782.62 }, { "epoch": 0.01295, "grad_norm": 1.375, "learning_rate": 0.0007755, "loss": 4.8138, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 518, "tokens_per_second_per_gpu": 15907.75 }, { "epoch": 0.012975, "grad_norm": 1.2734375, "learning_rate": 0.000777, "loss": 4.8448, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 519, "tokens_per_second_per_gpu": 13069.18 }, { "epoch": 0.013, "grad_norm": 1.109375, "learning_rate": 0.0007785, "loss": 4.833, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 520, "tokens_per_second_per_gpu": 24571.55 }, { "epoch": 0.013025, "grad_norm": 1.1953125, "learning_rate": 0.0007800000000000001, "loss": 4.8258, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 521, "tokens_per_second_per_gpu": 14113.28 }, { "epoch": 0.01305, "grad_norm": 0.63671875, "learning_rate": 0.0007815000000000001, "loss": 4.8495, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.42, "memory/max_allocated (GiB)": 107.42, "step": 522, "tokens_per_second_per_gpu": 12246.09 }, { "epoch": 0.013075, "grad_norm": 1.1328125, "learning_rate": 0.0007830000000000001, "loss": 4.7972, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 523, "tokens_per_second_per_gpu": 11794.21 }, { "epoch": 0.0131, "grad_norm": 1.296875, "learning_rate": 0.0007845, "loss": 4.8171, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 524, "tokens_per_second_per_gpu": 13708.18 }, { "epoch": 0.013125, "grad_norm": 1.1015625, "learning_rate": 0.000786, "loss": 4.8277, "memory/device_reserved (GiB)": 76.0, "memory/max_active (GiB)": 75.82, "memory/max_allocated (GiB)": 75.82, "step": 525, "tokens_per_second_per_gpu": 16280.46 }, { "epoch": 0.01315, "grad_norm": 0.85546875, "learning_rate": 0.0007875, "loss": 4.7908, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 526, "tokens_per_second_per_gpu": 14095.54 }, { "epoch": 0.013175, "grad_norm": 0.875, "learning_rate": 0.0007890000000000001, "loss": 4.8103, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 527, "tokens_per_second_per_gpu": 27187.7 }, { "epoch": 0.0132, "grad_norm": 0.427734375, "learning_rate": 0.0007905000000000001, "loss": 4.8443, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.22, "memory/max_allocated (GiB)": 97.22, "step": 528, "tokens_per_second_per_gpu": 13439.64 }, { "epoch": 0.013225, "grad_norm": 0.828125, "learning_rate": 0.0007920000000000001, "loss": 4.823, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 529, "tokens_per_second_per_gpu": 15758.54 }, { "epoch": 0.01325, "grad_norm": 1.390625, "learning_rate": 0.0007935, "loss": 4.7866, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 530, "tokens_per_second_per_gpu": 13305.83 }, { "epoch": 0.013275, "grad_norm": 1.3671875, "learning_rate": 0.000795, "loss": 4.7801, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 531, "tokens_per_second_per_gpu": 10844.99 }, { "epoch": 0.0133, "grad_norm": 0.68359375, "learning_rate": 0.0007965, "loss": 4.7682, "memory/device_reserved (GiB)": 55.41, "memory/max_active (GiB)": 55.39, "memory/max_allocated (GiB)": 55.39, "step": 532, "tokens_per_second_per_gpu": 23295.2 }, { "epoch": 0.013325, "grad_norm": 0.78515625, "learning_rate": 0.0007980000000000001, "loss": 4.785, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 533, "tokens_per_second_per_gpu": 16707.26 }, { "epoch": 0.01335, "grad_norm": 1.1796875, "learning_rate": 0.0007995000000000001, "loss": 4.7577, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 534, "tokens_per_second_per_gpu": 17687.21 }, { "epoch": 0.013375, "grad_norm": 1.8828125, "learning_rate": 0.0008010000000000001, "loss": 4.76, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 535, "tokens_per_second_per_gpu": 17166.43 }, { "epoch": 0.0134, "grad_norm": 1.0625, "learning_rate": 0.0008025, "loss": 4.7775, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 536, "tokens_per_second_per_gpu": 17498.13 }, { "epoch": 0.013425, "grad_norm": 1.609375, "learning_rate": 0.000804, "loss": 4.8569, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 537, "tokens_per_second_per_gpu": 12425.24 }, { "epoch": 0.01345, "grad_norm": 1.0, "learning_rate": 0.0008055000000000001, "loss": 4.8056, "memory/device_reserved (GiB)": 75.81, "memory/max_active (GiB)": 75.79, "memory/max_allocated (GiB)": 75.79, "step": 538, "tokens_per_second_per_gpu": 18788.27 }, { "epoch": 0.013475, "grad_norm": 1.125, "learning_rate": 0.0008070000000000001, "loss": 4.7491, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 539, "tokens_per_second_per_gpu": 28487.08 }, { "epoch": 0.0135, "grad_norm": 1.2578125, "learning_rate": 0.0008085000000000001, "loss": 4.7733, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 540, "tokens_per_second_per_gpu": 13076.5 }, { "epoch": 0.013525, "grad_norm": 0.97265625, "learning_rate": 0.0008100000000000001, "loss": 4.7893, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 541, "tokens_per_second_per_gpu": 13678.33 }, { "epoch": 0.01355, "grad_norm": 1.125, "learning_rate": 0.0008115, "loss": 4.7722, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 542, "tokens_per_second_per_gpu": 16023.57 }, { "epoch": 0.013575, "grad_norm": 1.640625, "learning_rate": 0.000813, "loss": 4.7831, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 543, "tokens_per_second_per_gpu": 11301.26 }, { "epoch": 0.0136, "grad_norm": 1.484375, "learning_rate": 0.0008145000000000001, "loss": 4.7541, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 544, "tokens_per_second_per_gpu": 15508.59 }, { "epoch": 0.013625, "grad_norm": 1.6875, "learning_rate": 0.0008160000000000001, "loss": 4.7578, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 545, "tokens_per_second_per_gpu": 23714.86 }, { "epoch": 0.01365, "grad_norm": 1.2109375, "learning_rate": 0.0008175000000000001, "loss": 4.7204, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 546, "tokens_per_second_per_gpu": 28743.13 }, { "epoch": 0.013675, "grad_norm": 1.2734375, "learning_rate": 0.0008190000000000001, "loss": 4.7364, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 547, "tokens_per_second_per_gpu": 17636.2 }, { "epoch": 0.0137, "grad_norm": 1.0625, "learning_rate": 0.0008205, "loss": 4.7432, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 548, "tokens_per_second_per_gpu": 12183.48 }, { "epoch": 0.013725, "grad_norm": 1.453125, "learning_rate": 0.000822, "loss": 4.7323, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 549, "tokens_per_second_per_gpu": 17752.09 }, { "epoch": 0.01375, "grad_norm": 1.2265625, "learning_rate": 0.0008235000000000001, "loss": 4.7479, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 550, "tokens_per_second_per_gpu": 10274.31 }, { "epoch": 0.013775, "grad_norm": 1.5234375, "learning_rate": 0.0008250000000000001, "loss": 4.7619, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 551, "tokens_per_second_per_gpu": 12826.91 }, { "epoch": 0.0138, "grad_norm": 1.1015625, "learning_rate": 0.0008265000000000001, "loss": 4.7235, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 552, "tokens_per_second_per_gpu": 17776.55 }, { "epoch": 0.013825, "grad_norm": 1.484375, "learning_rate": 0.0008280000000000001, "loss": 4.7444, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 553, "tokens_per_second_per_gpu": 13770.33 }, { "epoch": 0.01385, "grad_norm": 1.1484375, "learning_rate": 0.0008295, "loss": 4.7188, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 554, "tokens_per_second_per_gpu": 15891.18 }, { "epoch": 0.013875, "grad_norm": 1.46875, "learning_rate": 0.0008310000000000001, "loss": 4.7704, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 555, "tokens_per_second_per_gpu": 18349.08 }, { "epoch": 0.0139, "grad_norm": 1.0390625, "learning_rate": 0.0008325000000000001, "loss": 4.7279, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 556, "tokens_per_second_per_gpu": 20154.87 }, { "epoch": 0.013925, "grad_norm": 1.46875, "learning_rate": 0.0008340000000000001, "loss": 4.757, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 557, "tokens_per_second_per_gpu": 20461.44 }, { "epoch": 0.01395, "grad_norm": 0.9609375, "learning_rate": 0.0008355000000000001, "loss": 4.7024, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 558, "tokens_per_second_per_gpu": 27992.79 }, { "epoch": 0.013975, "grad_norm": 1.2578125, "learning_rate": 0.0008370000000000001, "loss": 4.7113, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 559, "tokens_per_second_per_gpu": 16143.29 }, { "epoch": 0.014, "grad_norm": 1.578125, "learning_rate": 0.0008385, "loss": 4.768, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 560, "tokens_per_second_per_gpu": 16928.19 }, { "epoch": 0.014025, "grad_norm": 0.9921875, "learning_rate": 0.0008400000000000001, "loss": 4.7299, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 561, "tokens_per_second_per_gpu": 27506.14 }, { "epoch": 0.01405, "grad_norm": 1.1640625, "learning_rate": 0.0008415000000000001, "loss": 4.7365, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 562, "tokens_per_second_per_gpu": 12513.73 }, { "epoch": 0.014075, "grad_norm": 0.92578125, "learning_rate": 0.0008430000000000001, "loss": 4.71, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 563, "tokens_per_second_per_gpu": 18273.11 }, { "epoch": 0.0141, "grad_norm": 1.140625, "learning_rate": 0.0008445, "loss": 4.6957, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 564, "tokens_per_second_per_gpu": 17327.13 }, { "epoch": 0.014125, "grad_norm": 1.1875, "learning_rate": 0.000846, "loss": 4.7155, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 565, "tokens_per_second_per_gpu": 17953.85 }, { "epoch": 0.01415, "grad_norm": 0.96875, "learning_rate": 0.0008475, "loss": 4.7078, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 566, "tokens_per_second_per_gpu": 23940.78 }, { "epoch": 0.014175, "grad_norm": 1.4921875, "learning_rate": 0.0008489999999999999, "loss": 4.6803, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 567, "tokens_per_second_per_gpu": 18301.8 }, { "epoch": 0.0142, "grad_norm": 0.74609375, "learning_rate": 0.0008504999999999999, "loss": 4.6783, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 568, "tokens_per_second_per_gpu": 20963.25 }, { "epoch": 0.014225, "grad_norm": 0.90234375, "learning_rate": 0.0008519999999999999, "loss": 4.6884, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.62, "memory/max_allocated (GiB)": 66.62, "step": 569, "tokens_per_second_per_gpu": 19053.4 }, { "epoch": 0.01425, "grad_norm": 1.046875, "learning_rate": 0.0008535, "loss": 4.7415, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 570, "tokens_per_second_per_gpu": 17056.4 }, { "epoch": 0.014275, "grad_norm": 0.9140625, "learning_rate": 0.000855, "loss": 4.6862, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 571, "tokens_per_second_per_gpu": 14023.55 }, { "epoch": 0.0143, "grad_norm": 1.140625, "learning_rate": 0.0008565, "loss": 4.6836, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 572, "tokens_per_second_per_gpu": 11860.39 }, { "epoch": 0.014325, "grad_norm": 1.6640625, "learning_rate": 0.0008579999999999999, "loss": 4.6776, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 573, "tokens_per_second_per_gpu": 11641.51 }, { "epoch": 0.01435, "grad_norm": 0.95703125, "learning_rate": 0.0008594999999999999, "loss": 4.6735, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 574, "tokens_per_second_per_gpu": 12834.61 }, { "epoch": 0.014375, "grad_norm": 1.1875, "learning_rate": 0.000861, "loss": 4.6719, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 575, "tokens_per_second_per_gpu": 17493.27 }, { "epoch": 0.0144, "grad_norm": 1.046875, "learning_rate": 0.0008625, "loss": 4.6856, "memory/device_reserved (GiB)": 105.55, "memory/max_active (GiB)": 105.52, "memory/max_allocated (GiB)": 105.52, "step": 576, "tokens_per_second_per_gpu": 13904.87 }, { "epoch": 0.014425, "grad_norm": 0.87109375, "learning_rate": 0.000864, "loss": 4.622, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 577, "tokens_per_second_per_gpu": 18065.02 }, { "epoch": 0.01445, "grad_norm": 1.265625, "learning_rate": 0.0008655, "loss": 4.6533, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 578, "tokens_per_second_per_gpu": 12873.58 }, { "epoch": 0.014475, "grad_norm": 1.2890625, "learning_rate": 0.0008669999999999999, "loss": 4.6492, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 579, "tokens_per_second_per_gpu": 24328.52 }, { "epoch": 0.0145, "grad_norm": 1.265625, "learning_rate": 0.0008684999999999999, "loss": 4.6707, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 580, "tokens_per_second_per_gpu": 22248.03 }, { "epoch": 0.014525, "grad_norm": 1.0078125, "learning_rate": 0.00087, "loss": 4.6156, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 581, "tokens_per_second_per_gpu": 21044.33 }, { "epoch": 0.01455, "grad_norm": 0.82421875, "learning_rate": 0.0008715, "loss": 4.6616, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 582, "tokens_per_second_per_gpu": 25938.24 }, { "epoch": 0.014575, "grad_norm": 1.03125, "learning_rate": 0.000873, "loss": 4.631, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 583, "tokens_per_second_per_gpu": 14007.37 }, { "epoch": 0.0146, "grad_norm": 1.3671875, "learning_rate": 0.0008745, "loss": 4.6407, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 584, "tokens_per_second_per_gpu": 15860.89 }, { "epoch": 0.014625, "grad_norm": 0.5625, "learning_rate": 0.0008759999999999999, "loss": 4.6244, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 585, "tokens_per_second_per_gpu": 19936.75 }, { "epoch": 0.01465, "grad_norm": 0.74609375, "learning_rate": 0.0008774999999999999, "loss": 4.643, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 586, "tokens_per_second_per_gpu": 13009.24 }, { "epoch": 0.014675, "grad_norm": 0.8359375, "learning_rate": 0.000879, "loss": 4.5823, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 587, "tokens_per_second_per_gpu": 22949.52 }, { "epoch": 0.0147, "grad_norm": 1.1875, "learning_rate": 0.0008805, "loss": 4.5917, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 588, "tokens_per_second_per_gpu": 17626.26 }, { "epoch": 0.014725, "grad_norm": 1.203125, "learning_rate": 0.000882, "loss": 4.6276, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 589, "tokens_per_second_per_gpu": 11276.91 }, { "epoch": 0.01475, "grad_norm": 1.3046875, "learning_rate": 0.0008835, "loss": 4.6088, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 590, "tokens_per_second_per_gpu": 18475.03 }, { "epoch": 0.014775, "grad_norm": 1.25, "learning_rate": 0.0008849999999999999, "loss": 4.5822, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 591, "tokens_per_second_per_gpu": 13935.39 }, { "epoch": 0.0148, "grad_norm": 1.234375, "learning_rate": 0.0008865, "loss": 4.6124, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 592, "tokens_per_second_per_gpu": 22533.62 }, { "epoch": 0.014825, "grad_norm": 1.1796875, "learning_rate": 0.000888, "loss": 4.5935, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 593, "tokens_per_second_per_gpu": 20001.52 }, { "epoch": 0.01485, "grad_norm": 1.4296875, "learning_rate": 0.0008895, "loss": 4.6317, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 594, "tokens_per_second_per_gpu": 15354.66 }, { "epoch": 0.014875, "grad_norm": 1.0078125, "learning_rate": 0.000891, "loss": 4.6145, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 595, "tokens_per_second_per_gpu": 14318.26 }, { "epoch": 0.0149, "grad_norm": 1.109375, "learning_rate": 0.0008925, "loss": 4.6151, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 596, "tokens_per_second_per_gpu": 20742.98 }, { "epoch": 0.014925, "grad_norm": 1.375, "learning_rate": 0.0008939999999999999, "loss": 4.5799, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 597, "tokens_per_second_per_gpu": 23984.28 }, { "epoch": 0.01495, "grad_norm": 0.69140625, "learning_rate": 0.0008955, "loss": 4.5981, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 598, "tokens_per_second_per_gpu": 16670.83 }, { "epoch": 0.014975, "grad_norm": 0.80078125, "learning_rate": 0.000897, "loss": 4.6112, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 599, "tokens_per_second_per_gpu": 27816.85 }, { "epoch": 0.015, "grad_norm": 1.1015625, "learning_rate": 0.0008985, "loss": 4.5933, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 600, "tokens_per_second_per_gpu": 11445.03 }, { "epoch": 0.015025, "grad_norm": 1.65625, "learning_rate": 0.0009, "loss": 4.6027, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 601, "tokens_per_second_per_gpu": 15770.32 }, { "epoch": 0.01505, "grad_norm": 1.28125, "learning_rate": 0.0009015, "loss": 4.6001, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 602, "tokens_per_second_per_gpu": 15726.6 }, { "epoch": 0.015075, "grad_norm": 1.0234375, "learning_rate": 0.0009029999999999999, "loss": 4.5985, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 603, "tokens_per_second_per_gpu": 15772.31 }, { "epoch": 0.0151, "grad_norm": 1.0, "learning_rate": 0.0009045, "loss": 4.6085, "memory/device_reserved (GiB)": 86.21, "memory/max_active (GiB)": 86.02, "memory/max_allocated (GiB)": 86.02, "step": 604, "tokens_per_second_per_gpu": 15054.44 }, { "epoch": 0.015125, "grad_norm": 0.78515625, "learning_rate": 0.000906, "loss": 4.5859, "memory/device_reserved (GiB)": 116.82, "memory/max_active (GiB)": 116.63, "memory/max_allocated (GiB)": 116.63, "step": 605, "tokens_per_second_per_gpu": 11351.8 }, { "epoch": 0.01515, "grad_norm": 1.015625, "learning_rate": 0.0009075, "loss": 4.5783, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 606, "tokens_per_second_per_gpu": 13960.19 }, { "epoch": 0.015175, "grad_norm": 1.375, "learning_rate": 0.000909, "loss": 4.5985, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 607, "tokens_per_second_per_gpu": 14514.56 }, { "epoch": 0.0152, "grad_norm": 1.078125, "learning_rate": 0.0009105, "loss": 4.5933, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 608, "tokens_per_second_per_gpu": 20340.13 }, { "epoch": 0.015225, "grad_norm": 1.125, "learning_rate": 0.000912, "loss": 4.5708, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 609, "tokens_per_second_per_gpu": 15241.93 }, { "epoch": 0.01525, "grad_norm": 1.4140625, "learning_rate": 0.0009135, "loss": 4.581, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 610, "tokens_per_second_per_gpu": 14670.75 }, { "epoch": 0.015275, "grad_norm": 1.109375, "learning_rate": 0.000915, "loss": 4.5345, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 611, "tokens_per_second_per_gpu": 24002.92 }, { "epoch": 0.0153, "grad_norm": 0.9296875, "learning_rate": 0.0009165, "loss": 4.5609, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 612, "tokens_per_second_per_gpu": 15243.53 }, { "epoch": 0.015325, "grad_norm": 1.0703125, "learning_rate": 0.000918, "loss": 4.5549, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 613, "tokens_per_second_per_gpu": 16900.31 }, { "epoch": 0.01535, "grad_norm": 0.80859375, "learning_rate": 0.0009195, "loss": 4.596, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 614, "tokens_per_second_per_gpu": 17962.43 }, { "epoch": 0.015375, "grad_norm": 1.15625, "learning_rate": 0.000921, "loss": 4.5736, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 615, "tokens_per_second_per_gpu": 23533.87 }, { "epoch": 0.0154, "grad_norm": 1.046875, "learning_rate": 0.0009225, "loss": 4.528, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 616, "tokens_per_second_per_gpu": 23612.63 }, { "epoch": 0.015425, "grad_norm": 0.8203125, "learning_rate": 0.000924, "loss": 4.5654, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 617, "tokens_per_second_per_gpu": 27140.18 }, { "epoch": 0.01545, "grad_norm": 0.74609375, "learning_rate": 0.0009255, "loss": 4.5183, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 618, "tokens_per_second_per_gpu": 21112.2 }, { "epoch": 0.015475, "grad_norm": 0.74609375, "learning_rate": 0.000927, "loss": 4.5115, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 619, "tokens_per_second_per_gpu": 14023.39 }, { "epoch": 0.0155, "grad_norm": 0.83984375, "learning_rate": 0.0009285, "loss": 4.571, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 620, "tokens_per_second_per_gpu": 10660.72 }, { "epoch": 0.015525, "grad_norm": 1.59375, "learning_rate": 0.00093, "loss": 4.5378, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 621, "tokens_per_second_per_gpu": 13887.27 }, { "epoch": 0.01555, "grad_norm": 0.81640625, "learning_rate": 0.0009315, "loss": 4.5182, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 622, "tokens_per_second_per_gpu": 12599.55 }, { "epoch": 0.015575, "grad_norm": 1.109375, "learning_rate": 0.000933, "loss": 4.5178, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 623, "tokens_per_second_per_gpu": 22381.71 }, { "epoch": 0.0156, "grad_norm": 1.3515625, "learning_rate": 0.0009345, "loss": 4.5791, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 624, "tokens_per_second_per_gpu": 11257.84 }, { "epoch": 0.015625, "grad_norm": 0.98046875, "learning_rate": 0.000936, "loss": 4.4941, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 625, "tokens_per_second_per_gpu": 28125.29 }, { "epoch": 0.01565, "grad_norm": 1.1484375, "learning_rate": 0.0009375, "loss": 4.4869, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 626, "tokens_per_second_per_gpu": 12723.07 }, { "epoch": 0.015675, "grad_norm": 1.0078125, "learning_rate": 0.0009390000000000001, "loss": 4.5184, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 627, "tokens_per_second_per_gpu": 28370.82 }, { "epoch": 0.0157, "grad_norm": 1.5625, "learning_rate": 0.0009405, "loss": 4.5224, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 628, "tokens_per_second_per_gpu": 10796.89 }, { "epoch": 0.015725, "grad_norm": 1.375, "learning_rate": 0.000942, "loss": 4.5194, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 629, "tokens_per_second_per_gpu": 24411.89 }, { "epoch": 0.01575, "grad_norm": 1.109375, "learning_rate": 0.0009435, "loss": 4.5586, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 630, "tokens_per_second_per_gpu": 12466.13 }, { "epoch": 0.015775, "grad_norm": 1.296875, "learning_rate": 0.000945, "loss": 4.5237, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 631, "tokens_per_second_per_gpu": 27738.58 }, { "epoch": 0.0158, "grad_norm": 1.390625, "learning_rate": 0.0009465000000000001, "loss": 4.5444, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 632, "tokens_per_second_per_gpu": 11013.64 }, { "epoch": 0.015825, "grad_norm": 1.4140625, "learning_rate": 0.0009480000000000001, "loss": 4.5787, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 633, "tokens_per_second_per_gpu": 16052.35 }, { "epoch": 0.01585, "grad_norm": 0.90625, "learning_rate": 0.0009495, "loss": 4.4986, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 634, "tokens_per_second_per_gpu": 11947.37 }, { "epoch": 0.015875, "grad_norm": 1.3203125, "learning_rate": 0.000951, "loss": 4.5096, "memory/device_reserved (GiB)": 46.4, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 635, "tokens_per_second_per_gpu": 26348.38 }, { "epoch": 0.0159, "grad_norm": 0.71875, "learning_rate": 0.0009525, "loss": 4.5136, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 636, "tokens_per_second_per_gpu": 13212.18 }, { "epoch": 0.015925, "grad_norm": 0.98046875, "learning_rate": 0.000954, "loss": 4.5138, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 637, "tokens_per_second_per_gpu": 13454.01 }, { "epoch": 0.01595, "grad_norm": 1.0546875, "learning_rate": 0.0009555000000000001, "loss": 4.4973, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 638, "tokens_per_second_per_gpu": 12811.8 }, { "epoch": 0.015975, "grad_norm": 1.3515625, "learning_rate": 0.0009570000000000001, "loss": 4.5068, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 639, "tokens_per_second_per_gpu": 11600.39 }, { "epoch": 0.016, "grad_norm": 0.79296875, "learning_rate": 0.0009585, "loss": 4.4857, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 640, "tokens_per_second_per_gpu": 24842.12 }, { "epoch": 0.016025, "grad_norm": 0.890625, "learning_rate": 0.00096, "loss": 4.4842, "memory/device_reserved (GiB)": 56.67, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 641, "tokens_per_second_per_gpu": 21456.84 }, { "epoch": 0.01605, "grad_norm": 0.828125, "learning_rate": 0.0009615, "loss": 4.466, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 642, "tokens_per_second_per_gpu": 17990.16 }, { "epoch": 0.016075, "grad_norm": 1.2265625, "learning_rate": 0.000963, "loss": 4.4888, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 643, "tokens_per_second_per_gpu": 17654.52 }, { "epoch": 0.0161, "grad_norm": 1.0703125, "learning_rate": 0.0009645000000000001, "loss": 4.4954, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 644, "tokens_per_second_per_gpu": 15106.46 }, { "epoch": 0.016125, "grad_norm": 1.125, "learning_rate": 0.0009660000000000001, "loss": 4.5317, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 645, "tokens_per_second_per_gpu": 13487.92 }, { "epoch": 0.01615, "grad_norm": 1.1328125, "learning_rate": 0.0009675, "loss": 4.4801, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 646, "tokens_per_second_per_gpu": 16118.7 }, { "epoch": 0.016175, "grad_norm": 0.7890625, "learning_rate": 0.000969, "loss": 4.4335, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 647, "tokens_per_second_per_gpu": 20566.71 }, { "epoch": 0.0162, "grad_norm": 0.7734375, "learning_rate": 0.0009705, "loss": 4.4668, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 648, "tokens_per_second_per_gpu": 17841.6 }, { "epoch": 0.016225, "grad_norm": 0.734375, "learning_rate": 0.0009720000000000001, "loss": 4.46, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 649, "tokens_per_second_per_gpu": 19779.97 }, { "epoch": 0.01625, "grad_norm": 0.7578125, "learning_rate": 0.0009735000000000001, "loss": 4.4493, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 650, "tokens_per_second_per_gpu": 27025.52 }, { "epoch": 0.016275, "grad_norm": 0.96875, "learning_rate": 0.0009750000000000001, "loss": 4.4655, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 651, "tokens_per_second_per_gpu": 20443.12 }, { "epoch": 0.0163, "grad_norm": 0.9609375, "learning_rate": 0.0009765, "loss": 4.4355, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 652, "tokens_per_second_per_gpu": 29540.34 }, { "epoch": 0.016325, "grad_norm": 1.140625, "learning_rate": 0.0009780000000000001, "loss": 4.4297, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 653, "tokens_per_second_per_gpu": 23302.96 }, { "epoch": 0.01635, "grad_norm": 1.046875, "learning_rate": 0.0009795000000000001, "loss": 4.4429, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 654, "tokens_per_second_per_gpu": 13697.56 }, { "epoch": 0.016375, "grad_norm": 1.296875, "learning_rate": 0.000981, "loss": 4.4571, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 655, "tokens_per_second_per_gpu": 11881.42 }, { "epoch": 0.0164, "grad_norm": 1.046875, "learning_rate": 0.0009825, "loss": 4.4358, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 656, "tokens_per_second_per_gpu": 17330.62 }, { "epoch": 0.016425, "grad_norm": 0.89453125, "learning_rate": 0.000984, "loss": 4.4538, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 657, "tokens_per_second_per_gpu": 22314.63 }, { "epoch": 0.01645, "grad_norm": 1.1328125, "learning_rate": 0.0009855, "loss": 4.4755, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 658, "tokens_per_second_per_gpu": 22477.39 }, { "epoch": 0.016475, "grad_norm": 1.2734375, "learning_rate": 0.000987, "loss": 4.4233, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 659, "tokens_per_second_per_gpu": 19217.13 }, { "epoch": 0.0165, "grad_norm": 1.390625, "learning_rate": 0.0009885, "loss": 4.4375, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 660, "tokens_per_second_per_gpu": 10906.99 }, { "epoch": 0.016525, "grad_norm": 1.1875, "learning_rate": 0.00099, "loss": 4.4373, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 661, "tokens_per_second_per_gpu": 10677.88 }, { "epoch": 0.01655, "grad_norm": 1.3046875, "learning_rate": 0.0009915, "loss": 4.4593, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 662, "tokens_per_second_per_gpu": 14418.37 }, { "epoch": 0.016575, "grad_norm": 0.72265625, "learning_rate": 0.0009930000000000002, "loss": 4.458, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 663, "tokens_per_second_per_gpu": 20997.85 }, { "epoch": 0.0166, "grad_norm": 0.77734375, "learning_rate": 0.0009945000000000002, "loss": 4.4455, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 664, "tokens_per_second_per_gpu": 17202.15 }, { "epoch": 0.016625, "grad_norm": 0.9609375, "learning_rate": 0.0009960000000000001, "loss": 4.4507, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 665, "tokens_per_second_per_gpu": 23234.58 }, { "epoch": 0.01665, "grad_norm": 1.09375, "learning_rate": 0.0009975000000000001, "loss": 4.4299, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 666, "tokens_per_second_per_gpu": 20131.84 }, { "epoch": 0.016675, "grad_norm": 1.0703125, "learning_rate": 0.000999, "loss": 4.4141, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 667, "tokens_per_second_per_gpu": 12876.93 }, { "epoch": 0.0167, "grad_norm": 1.1171875, "learning_rate": 0.0010005, "loss": 4.4459, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 668, "tokens_per_second_per_gpu": 20721.5 }, { "epoch": 0.016725, "grad_norm": 1.25, "learning_rate": 0.001002, "loss": 4.4295, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 669, "tokens_per_second_per_gpu": 20655.19 }, { "epoch": 0.01675, "grad_norm": 1.421875, "learning_rate": 0.0010035, "loss": 4.4412, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 670, "tokens_per_second_per_gpu": 10668.21 }, { "epoch": 0.016775, "grad_norm": 1.4140625, "learning_rate": 0.001005, "loss": 4.4278, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 671, "tokens_per_second_per_gpu": 12234.8 }, { "epoch": 0.0168, "grad_norm": 0.75, "learning_rate": 0.0010065, "loss": 4.4231, "memory/device_reserved (GiB)": 54.53, "memory/max_active (GiB)": 54.51, "memory/max_allocated (GiB)": 54.51, "step": 672, "tokens_per_second_per_gpu": 25113.3 }, { "epoch": 0.016825, "grad_norm": 0.8671875, "learning_rate": 0.001008, "loss": 4.451, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 673, "tokens_per_second_per_gpu": 13385.5 }, { "epoch": 0.01685, "grad_norm": 1.0859375, "learning_rate": 0.0010095, "loss": 4.4236, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 674, "tokens_per_second_per_gpu": 13327.94 }, { "epoch": 0.016875, "grad_norm": 0.97265625, "learning_rate": 0.0010110000000000002, "loss": 4.4301, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 675, "tokens_per_second_per_gpu": 13494.21 }, { "epoch": 0.0169, "grad_norm": 1.125, "learning_rate": 0.0010125000000000002, "loss": 4.3718, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 676, "tokens_per_second_per_gpu": 13602.25 }, { "epoch": 0.016925, "grad_norm": 0.90625, "learning_rate": 0.0010140000000000001, "loss": 4.4297, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 677, "tokens_per_second_per_gpu": 13944.26 }, { "epoch": 0.01695, "grad_norm": 1.15625, "learning_rate": 0.0010155000000000001, "loss": 4.4109, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 678, "tokens_per_second_per_gpu": 11617.26 }, { "epoch": 0.016975, "grad_norm": 1.4140625, "learning_rate": 0.0010170000000000001, "loss": 4.3999, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 679, "tokens_per_second_per_gpu": 11180.92 }, { "epoch": 0.017, "grad_norm": 0.84765625, "learning_rate": 0.0010185, "loss": 4.4157, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 680, "tokens_per_second_per_gpu": 14825.02 }, { "epoch": 0.017025, "grad_norm": 1.0390625, "learning_rate": 0.00102, "loss": 4.3597, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 681, "tokens_per_second_per_gpu": 22971.95 }, { "epoch": 0.01705, "grad_norm": 1.125, "learning_rate": 0.0010215, "loss": 4.4062, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 682, "tokens_per_second_per_gpu": 12607.26 }, { "epoch": 0.017075, "grad_norm": 1.078125, "learning_rate": 0.001023, "loss": 4.3799, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 683, "tokens_per_second_per_gpu": 15431.77 }, { "epoch": 0.0171, "grad_norm": 0.78515625, "learning_rate": 0.0010245, "loss": 4.42, "memory/device_reserved (GiB)": 54.53, "memory/max_active (GiB)": 54.51, "memory/max_allocated (GiB)": 54.51, "step": 684, "tokens_per_second_per_gpu": 24116.64 }, { "epoch": 0.017125, "grad_norm": 0.67578125, "learning_rate": 0.001026, "loss": 4.384, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 685, "tokens_per_second_per_gpu": 18716.14 }, { "epoch": 0.01715, "grad_norm": 0.56640625, "learning_rate": 0.0010275000000000002, "loss": 4.3776, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 686, "tokens_per_second_per_gpu": 21279.86 }, { "epoch": 0.017175, "grad_norm": 0.80859375, "learning_rate": 0.0010290000000000002, "loss": 4.3523, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 687, "tokens_per_second_per_gpu": 16089.91 }, { "epoch": 0.0172, "grad_norm": 1.2890625, "learning_rate": 0.0010305000000000002, "loss": 4.4312, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 688, "tokens_per_second_per_gpu": 16023.5 }, { "epoch": 0.017225, "grad_norm": 0.91796875, "learning_rate": 0.001032, "loss": 4.4171, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 689, "tokens_per_second_per_gpu": 12676.43 }, { "epoch": 0.01725, "grad_norm": 0.796875, "learning_rate": 0.0010335, "loss": 4.3996, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 690, "tokens_per_second_per_gpu": 17629.84 }, { "epoch": 0.017275, "grad_norm": 0.81640625, "learning_rate": 0.001035, "loss": 4.3428, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 691, "tokens_per_second_per_gpu": 15948.26 }, { "epoch": 0.0173, "grad_norm": 0.9921875, "learning_rate": 0.0010364999999999999, "loss": 4.3735, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 692, "tokens_per_second_per_gpu": 15735.58 }, { "epoch": 0.017325, "grad_norm": 0.9765625, "learning_rate": 0.0010379999999999999, "loss": 4.3679, "memory/device_reserved (GiB)": 64.28, "memory/max_active (GiB)": 64.24, "memory/max_allocated (GiB)": 64.24, "step": 693, "tokens_per_second_per_gpu": 20176.03 }, { "epoch": 0.01735, "grad_norm": 0.9453125, "learning_rate": 0.0010394999999999998, "loss": 4.3436, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 694, "tokens_per_second_per_gpu": 23715.93 }, { "epoch": 0.017375, "grad_norm": 0.98828125, "learning_rate": 0.001041, "loss": 4.3495, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 695, "tokens_per_second_per_gpu": 22791.24 }, { "epoch": 0.0174, "grad_norm": 0.91796875, "learning_rate": 0.0010425, "loss": 4.3918, "memory/device_reserved (GiB)": 95.73, "memory/max_active (GiB)": 95.72, "memory/max_allocated (GiB)": 95.72, "step": 696, "tokens_per_second_per_gpu": 13962.27 }, { "epoch": 0.017425, "grad_norm": 0.9296875, "learning_rate": 0.001044, "loss": 4.3458, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 697, "tokens_per_second_per_gpu": 16712.35 }, { "epoch": 0.01745, "grad_norm": 1.3046875, "learning_rate": 0.0010455, "loss": 4.3761, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 698, "tokens_per_second_per_gpu": 23364.66 }, { "epoch": 0.017475, "grad_norm": 0.78125, "learning_rate": 0.001047, "loss": 4.3282, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 699, "tokens_per_second_per_gpu": 11980.92 }, { "epoch": 0.0175, "grad_norm": 0.71484375, "learning_rate": 0.0010485, "loss": 4.3502, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 700, "tokens_per_second_per_gpu": 23283.07 }, { "epoch": 0.017525, "grad_norm": 0.90234375, "learning_rate": 0.00105, "loss": 4.3564, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 701, "tokens_per_second_per_gpu": 10571.76 }, { "epoch": 0.01755, "grad_norm": 1.5859375, "learning_rate": 0.0010515, "loss": 4.3846, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 702, "tokens_per_second_per_gpu": 13571.64 }, { "epoch": 0.017575, "grad_norm": 0.9609375, "learning_rate": 0.001053, "loss": 4.3363, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 703, "tokens_per_second_per_gpu": 18223.75 }, { "epoch": 0.0176, "grad_norm": 0.66015625, "learning_rate": 0.0010544999999999999, "loss": 4.3362, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 704, "tokens_per_second_per_gpu": 13502.17 }, { "epoch": 0.017625, "grad_norm": 0.58203125, "learning_rate": 0.0010559999999999999, "loss": 4.3416, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 705, "tokens_per_second_per_gpu": 20950.1 }, { "epoch": 0.01765, "grad_norm": 0.57421875, "learning_rate": 0.0010575, "loss": 4.3056, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 706, "tokens_per_second_per_gpu": 22995.23 }, { "epoch": 0.017675, "grad_norm": 0.5234375, "learning_rate": 0.001059, "loss": 4.376, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 707, "tokens_per_second_per_gpu": 18736.46 }, { "epoch": 0.0177, "grad_norm": 0.5859375, "learning_rate": 0.0010605, "loss": 4.2943, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 708, "tokens_per_second_per_gpu": 9817.85 }, { "epoch": 0.017725, "grad_norm": 0.875, "learning_rate": 0.001062, "loss": 4.3191, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 709, "tokens_per_second_per_gpu": 15326.64 }, { "epoch": 0.01775, "grad_norm": 1.0703125, "learning_rate": 0.0010635, "loss": 4.3173, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 710, "tokens_per_second_per_gpu": 20147.5 }, { "epoch": 0.017775, "grad_norm": 0.8828125, "learning_rate": 0.001065, "loss": 4.3517, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 711, "tokens_per_second_per_gpu": 13137.37 }, { "epoch": 0.0178, "grad_norm": 0.9921875, "learning_rate": 0.0010665, "loss": 4.3551, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 712, "tokens_per_second_per_gpu": 15408.06 }, { "epoch": 0.017825, "grad_norm": 0.90625, "learning_rate": 0.001068, "loss": 4.3224, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 713, "tokens_per_second_per_gpu": 24006.17 }, { "epoch": 0.01785, "grad_norm": 0.78125, "learning_rate": 0.0010695, "loss": 4.302, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 714, "tokens_per_second_per_gpu": 27794.43 }, { "epoch": 0.017875, "grad_norm": 0.8984375, "learning_rate": 0.001071, "loss": 4.328, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 715, "tokens_per_second_per_gpu": 23610.18 }, { "epoch": 0.0179, "grad_norm": 1.3671875, "learning_rate": 0.0010724999999999999, "loss": 4.3467, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 716, "tokens_per_second_per_gpu": 12535.87 }, { "epoch": 0.017925, "grad_norm": 1.0390625, "learning_rate": 0.001074, "loss": 4.3243, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 717, "tokens_per_second_per_gpu": 19645.37 }, { "epoch": 0.01795, "grad_norm": 1.3046875, "learning_rate": 0.0010755, "loss": 4.3234, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 718, "tokens_per_second_per_gpu": 12719.06 }, { "epoch": 0.017975, "grad_norm": 0.89453125, "learning_rate": 0.001077, "loss": 4.2764, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 719, "tokens_per_second_per_gpu": 15718.35 }, { "epoch": 0.018, "grad_norm": 1.0390625, "learning_rate": 0.0010785, "loss": 4.3154, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 720, "tokens_per_second_per_gpu": 23219.81 }, { "epoch": 0.018025, "grad_norm": 0.9453125, "learning_rate": 0.00108, "loss": 4.2961, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 721, "tokens_per_second_per_gpu": 10771.17 }, { "epoch": 0.01805, "grad_norm": 1.046875, "learning_rate": 0.0010815, "loss": 4.2995, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 722, "tokens_per_second_per_gpu": 14607.62 }, { "epoch": 0.018075, "grad_norm": 0.9296875, "learning_rate": 0.001083, "loss": 4.2775, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 723, "tokens_per_second_per_gpu": 19713.39 }, { "epoch": 0.0181, "grad_norm": 0.9296875, "learning_rate": 0.0010845, "loss": 4.323, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 724, "tokens_per_second_per_gpu": 13590.73 }, { "epoch": 0.018125, "grad_norm": 0.609375, "learning_rate": 0.001086, "loss": 4.2901, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 725, "tokens_per_second_per_gpu": 25867.16 }, { "epoch": 0.01815, "grad_norm": 0.6953125, "learning_rate": 0.0010875, "loss": 4.2734, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 726, "tokens_per_second_per_gpu": 19972.61 }, { "epoch": 0.018175, "grad_norm": 0.5625, "learning_rate": 0.001089, "loss": 4.2972, "memory/device_reserved (GiB)": 55.41, "memory/max_active (GiB)": 55.39, "memory/max_allocated (GiB)": 55.39, "step": 727, "tokens_per_second_per_gpu": 23501.73 }, { "epoch": 0.0182, "grad_norm": 0.6875, "learning_rate": 0.0010904999999999999, "loss": 4.2483, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 728, "tokens_per_second_per_gpu": 10835.25 }, { "epoch": 0.018225, "grad_norm": 0.91796875, "learning_rate": 0.001092, "loss": 4.2755, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 729, "tokens_per_second_per_gpu": 14033.33 }, { "epoch": 0.01825, "grad_norm": 1.0, "learning_rate": 0.0010935, "loss": 4.2731, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 730, "tokens_per_second_per_gpu": 20008.37 }, { "epoch": 0.018275, "grad_norm": 1.1953125, "learning_rate": 0.001095, "loss": 4.3156, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 731, "tokens_per_second_per_gpu": 10642.88 }, { "epoch": 0.0183, "grad_norm": 0.765625, "learning_rate": 0.0010965, "loss": 4.3125, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 732, "tokens_per_second_per_gpu": 19666.02 }, { "epoch": 0.018325, "grad_norm": 0.3984375, "learning_rate": 0.001098, "loss": 4.2876, "memory/device_reserved (GiB)": 75.81, "memory/max_active (GiB)": 75.79, "memory/max_allocated (GiB)": 75.79, "step": 733, "tokens_per_second_per_gpu": 16491.26 }, { "epoch": 0.01835, "grad_norm": 0.69921875, "learning_rate": 0.0010995, "loss": 4.284, "memory/device_reserved (GiB)": 66.36, "memory/max_active (GiB)": 66.15, "memory/max_allocated (GiB)": 66.15, "step": 734, "tokens_per_second_per_gpu": 20252.7 }, { "epoch": 0.018375, "grad_norm": 0.59765625, "learning_rate": 0.001101, "loss": 4.2738, "memory/device_reserved (GiB)": 65.6, "memory/max_active (GiB)": 65.59, "memory/max_allocated (GiB)": 65.59, "step": 735, "tokens_per_second_per_gpu": 20697.42 }, { "epoch": 0.0184, "grad_norm": 0.51171875, "learning_rate": 0.0011025, "loss": 4.2875, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 736, "tokens_per_second_per_gpu": 11373.74 }, { "epoch": 0.018425, "grad_norm": 0.8125, "learning_rate": 0.001104, "loss": 4.2635, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 737, "tokens_per_second_per_gpu": 10693.57 }, { "epoch": 0.01845, "grad_norm": 1.3984375, "learning_rate": 0.0011055, "loss": 4.2615, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 738, "tokens_per_second_per_gpu": 17354.53 }, { "epoch": 0.018475, "grad_norm": 1.109375, "learning_rate": 0.001107, "loss": 4.3006, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 739, "tokens_per_second_per_gpu": 10483.33 }, { "epoch": 0.0185, "grad_norm": 0.9375, "learning_rate": 0.0011085000000000001, "loss": 4.3083, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 740, "tokens_per_second_per_gpu": 15289.44 }, { "epoch": 0.018525, "grad_norm": 1.015625, "learning_rate": 0.00111, "loss": 4.2625, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 741, "tokens_per_second_per_gpu": 13523.74 }, { "epoch": 0.01855, "grad_norm": 0.87890625, "learning_rate": 0.0011115, "loss": 4.2739, "memory/device_reserved (GiB)": 75.99, "memory/max_active (GiB)": 75.82, "memory/max_allocated (GiB)": 75.82, "step": 742, "tokens_per_second_per_gpu": 18011.11 }, { "epoch": 0.018575, "grad_norm": 1.15625, "learning_rate": 0.001113, "loss": 4.2581, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 743, "tokens_per_second_per_gpu": 13689.08 }, { "epoch": 0.0186, "grad_norm": 1.0703125, "learning_rate": 0.0011145, "loss": 4.3402, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 744, "tokens_per_second_per_gpu": 10500.49 }, { "epoch": 0.018625, "grad_norm": 1.2109375, "learning_rate": 0.001116, "loss": 4.2791, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 745, "tokens_per_second_per_gpu": 26982.78 }, { "epoch": 0.01865, "grad_norm": 1.3828125, "learning_rate": 0.0011175, "loss": 4.2859, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 746, "tokens_per_second_per_gpu": 13026.25 }, { "epoch": 0.018675, "grad_norm": 1.140625, "learning_rate": 0.001119, "loss": 4.2499, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 747, "tokens_per_second_per_gpu": 16189.63 }, { "epoch": 0.0187, "grad_norm": 1.171875, "learning_rate": 0.0011205, "loss": 4.2731, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 748, "tokens_per_second_per_gpu": 10579.96 }, { "epoch": 0.018725, "grad_norm": 1.03125, "learning_rate": 0.001122, "loss": 4.2359, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 749, "tokens_per_second_per_gpu": 19407.28 }, { "epoch": 0.01875, "grad_norm": 0.95703125, "learning_rate": 0.0011235, "loss": 4.2581, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 750, "tokens_per_second_per_gpu": 23617.55 }, { "epoch": 0.018775, "grad_norm": 0.6328125, "learning_rate": 0.0011250000000000001, "loss": 4.3045, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 751, "tokens_per_second_per_gpu": 13068.33 }, { "epoch": 0.0188, "grad_norm": 0.640625, "learning_rate": 0.0011265000000000001, "loss": 4.2676, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 752, "tokens_per_second_per_gpu": 13771.68 }, { "epoch": 0.018825, "grad_norm": 0.60546875, "learning_rate": 0.001128, "loss": 4.2648, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 753, "tokens_per_second_per_gpu": 13303.3 }, { "epoch": 0.01885, "grad_norm": 0.85546875, "learning_rate": 0.0011295, "loss": 4.2346, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 754, "tokens_per_second_per_gpu": 15023.83 }, { "epoch": 0.018875, "grad_norm": 0.703125, "learning_rate": 0.001131, "loss": 4.2744, "memory/device_reserved (GiB)": 95.73, "memory/max_active (GiB)": 95.72, "memory/max_allocated (GiB)": 95.72, "step": 755, "tokens_per_second_per_gpu": 14251.88 }, { "epoch": 0.0189, "grad_norm": 0.7265625, "learning_rate": 0.0011325, "loss": 4.248, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 756, "tokens_per_second_per_gpu": 15816.61 }, { "epoch": 0.018925, "grad_norm": 0.58203125, "learning_rate": 0.001134, "loss": 4.2405, "memory/device_reserved (GiB)": 64.28, "memory/max_active (GiB)": 64.24, "memory/max_allocated (GiB)": 64.24, "step": 757, "tokens_per_second_per_gpu": 20549.28 }, { "epoch": 0.01895, "grad_norm": 0.578125, "learning_rate": 0.0011355, "loss": 4.2793, "memory/device_reserved (GiB)": 116.76, "memory/max_active (GiB)": 116.62, "memory/max_allocated (GiB)": 116.62, "step": 758, "tokens_per_second_per_gpu": 11312.75 }, { "epoch": 0.018975, "grad_norm": 0.6953125, "learning_rate": 0.001137, "loss": 4.2081, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 759, "tokens_per_second_per_gpu": 23944.88 }, { "epoch": 0.019, "grad_norm": 0.87890625, "learning_rate": 0.0011385, "loss": 4.2243, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 760, "tokens_per_second_per_gpu": 11275.72 }, { "epoch": 0.019025, "grad_norm": 0.83203125, "learning_rate": 0.00114, "loss": 4.2931, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 761, "tokens_per_second_per_gpu": 21357.98 }, { "epoch": 0.01905, "grad_norm": 1.0, "learning_rate": 0.0011415, "loss": 4.2489, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 762, "tokens_per_second_per_gpu": 16497.69 }, { "epoch": 0.019075, "grad_norm": 1.203125, "learning_rate": 0.0011430000000000001, "loss": 4.2337, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 763, "tokens_per_second_per_gpu": 20636.06 }, { "epoch": 0.0191, "grad_norm": 0.99609375, "learning_rate": 0.0011445000000000001, "loss": 4.2254, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 764, "tokens_per_second_per_gpu": 20259.33 }, { "epoch": 0.019125, "grad_norm": 0.9296875, "learning_rate": 0.001146, "loss": 4.2183, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 765, "tokens_per_second_per_gpu": 19732.0 }, { "epoch": 0.01915, "grad_norm": 1.1796875, "learning_rate": 0.0011475, "loss": 4.2311, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 766, "tokens_per_second_per_gpu": 27902.94 }, { "epoch": 0.019175, "grad_norm": 0.91015625, "learning_rate": 0.001149, "loss": 4.2232, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 767, "tokens_per_second_per_gpu": 20052.66 }, { "epoch": 0.0192, "grad_norm": 0.91796875, "learning_rate": 0.0011505, "loss": 4.2346, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 768, "tokens_per_second_per_gpu": 15697.03 }, { "epoch": 0.019225, "grad_norm": 0.99609375, "learning_rate": 0.001152, "loss": 4.2464, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 769, "tokens_per_second_per_gpu": 12845.51 }, { "epoch": 0.01925, "grad_norm": 1.078125, "learning_rate": 0.0011535, "loss": 4.2225, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 770, "tokens_per_second_per_gpu": 23653.26 }, { "epoch": 0.019275, "grad_norm": 0.96484375, "learning_rate": 0.001155, "loss": 4.2396, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 771, "tokens_per_second_per_gpu": 14273.68 }, { "epoch": 0.0193, "grad_norm": 0.70703125, "learning_rate": 0.0011565, "loss": 4.2397, "memory/device_reserved (GiB)": 107.65, "memory/max_active (GiB)": 107.42, "memory/max_allocated (GiB)": 107.42, "step": 772, "tokens_per_second_per_gpu": 12059.06 }, { "epoch": 0.019325, "grad_norm": 0.69921875, "learning_rate": 0.001158, "loss": 4.2116, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 773, "tokens_per_second_per_gpu": 11660.1 }, { "epoch": 0.01935, "grad_norm": 0.7421875, "learning_rate": 0.0011595000000000002, "loss": 4.2021, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 774, "tokens_per_second_per_gpu": 13582.27 }, { "epoch": 0.019375, "grad_norm": 0.703125, "learning_rate": 0.0011610000000000001, "loss": 4.1982, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 775, "tokens_per_second_per_gpu": 18758.41 }, { "epoch": 0.0194, "grad_norm": 0.55859375, "learning_rate": 0.0011625000000000001, "loss": 4.2039, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 776, "tokens_per_second_per_gpu": 14417.85 }, { "epoch": 0.019425, "grad_norm": 0.412109375, "learning_rate": 0.001164, "loss": 4.2137, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 777, "tokens_per_second_per_gpu": 26982.62 }, { "epoch": 0.01945, "grad_norm": 0.228515625, "learning_rate": 0.0011655, "loss": 4.2103, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.22, "memory/max_allocated (GiB)": 97.22, "step": 778, "tokens_per_second_per_gpu": 13358.12 }, { "epoch": 0.019475, "grad_norm": 0.51953125, "learning_rate": 0.001167, "loss": 4.1924, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 779, "tokens_per_second_per_gpu": 15792.71 }, { "epoch": 0.0195, "grad_norm": 0.6640625, "learning_rate": 0.0011685, "loss": 4.1817, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 780, "tokens_per_second_per_gpu": 13207.4 }, { "epoch": 0.019525, "grad_norm": 0.87109375, "learning_rate": 0.00117, "loss": 4.1663, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 781, "tokens_per_second_per_gpu": 10813.46 }, { "epoch": 0.01955, "grad_norm": 0.6953125, "learning_rate": 0.0011715, "loss": 4.2043, "memory/device_reserved (GiB)": 55.41, "memory/max_active (GiB)": 55.39, "memory/max_allocated (GiB)": 55.39, "step": 782, "tokens_per_second_per_gpu": 23506.66 }, { "epoch": 0.019575, "grad_norm": 0.828125, "learning_rate": 0.001173, "loss": 4.1703, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 783, "tokens_per_second_per_gpu": 16915.22 }, { "epoch": 0.0196, "grad_norm": 1.1015625, "learning_rate": 0.0011745, "loss": 4.1881, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 784, "tokens_per_second_per_gpu": 17696.0 }, { "epoch": 0.019625, "grad_norm": 0.81640625, "learning_rate": 0.001176, "loss": 4.195, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 785, "tokens_per_second_per_gpu": 17502.46 }, { "epoch": 0.01965, "grad_norm": 0.73046875, "learning_rate": 0.0011775000000000002, "loss": 4.1859, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 786, "tokens_per_second_per_gpu": 17551.28 }, { "epoch": 0.019675, "grad_norm": 0.77734375, "learning_rate": 0.0011790000000000001, "loss": 4.1696, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 787, "tokens_per_second_per_gpu": 12517.52 }, { "epoch": 0.0197, "grad_norm": 0.6640625, "learning_rate": 0.0011805000000000001, "loss": 4.1752, "memory/device_reserved (GiB)": 74.93, "memory/max_active (GiB)": 74.91, "memory/max_allocated (GiB)": 74.91, "step": 788, "tokens_per_second_per_gpu": 18716.26 }, { "epoch": 0.019725, "grad_norm": 0.640625, "learning_rate": 0.001182, "loss": 4.153, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 789, "tokens_per_second_per_gpu": 27835.37 }, { "epoch": 0.01975, "grad_norm": 0.76953125, "learning_rate": 0.0011835, "loss": 4.1705, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 790, "tokens_per_second_per_gpu": 13647.17 }, { "epoch": 0.019775, "grad_norm": 0.68359375, "learning_rate": 0.001185, "loss": 4.1683, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 791, "tokens_per_second_per_gpu": 13913.91 }, { "epoch": 0.0198, "grad_norm": 0.81640625, "learning_rate": 0.0011865, "loss": 4.1613, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 792, "tokens_per_second_per_gpu": 16453.75 }, { "epoch": 0.019825, "grad_norm": 1.109375, "learning_rate": 0.001188, "loss": 4.1648, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 793, "tokens_per_second_per_gpu": 11092.52 }, { "epoch": 0.01985, "grad_norm": 1.2578125, "learning_rate": 0.0011895, "loss": 4.166, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 794, "tokens_per_second_per_gpu": 15958.37 }, { "epoch": 0.019875, "grad_norm": 0.984375, "learning_rate": 0.001191, "loss": 4.1632, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 795, "tokens_per_second_per_gpu": 23828.9 }, { "epoch": 0.0199, "grad_norm": 1.0234375, "learning_rate": 0.0011925, "loss": 4.1899, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 796, "tokens_per_second_per_gpu": 29158.8 }, { "epoch": 0.019925, "grad_norm": 0.97265625, "learning_rate": 0.0011940000000000002, "loss": 4.1711, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 797, "tokens_per_second_per_gpu": 17977.68 }, { "epoch": 0.01995, "grad_norm": 1.125, "learning_rate": 0.0011955000000000002, "loss": 4.1735, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 798, "tokens_per_second_per_gpu": 12618.18 }, { "epoch": 0.019975, "grad_norm": 0.70703125, "learning_rate": 0.0011970000000000001, "loss": 4.1553, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 799, "tokens_per_second_per_gpu": 18004.03 }, { "epoch": 0.02, "grad_norm": 0.68359375, "learning_rate": 0.0011985000000000001, "loss": 4.1309, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 800, "tokens_per_second_per_gpu": 10526.07 }, { "epoch": 0.020025, "grad_norm": 0.82421875, "learning_rate": 0.0012000000000000001, "loss": 4.1664, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 801, "tokens_per_second_per_gpu": 12681.69 }, { "epoch": 0.02005, "grad_norm": 0.9375, "learning_rate": 0.0012015, "loss": 4.1584, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 802, "tokens_per_second_per_gpu": 17749.85 }, { "epoch": 0.020075, "grad_norm": 0.9921875, "learning_rate": 0.001203, "loss": 4.1191, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 803, "tokens_per_second_per_gpu": 13825.33 }, { "epoch": 0.0201, "grad_norm": 1.0625, "learning_rate": 0.0012045, "loss": 4.1437, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 804, "tokens_per_second_per_gpu": 15694.31 }, { "epoch": 0.020125, "grad_norm": 0.8046875, "learning_rate": 0.001206, "loss": 4.1752, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 805, "tokens_per_second_per_gpu": 18735.96 }, { "epoch": 0.02015, "grad_norm": 0.8671875, "learning_rate": 0.0012075, "loss": 4.1254, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 806, "tokens_per_second_per_gpu": 20133.41 }, { "epoch": 0.020175, "grad_norm": 0.98828125, "learning_rate": 0.001209, "loss": 4.1115, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 807, "tokens_per_second_per_gpu": 19605.39 }, { "epoch": 0.0202, "grad_norm": 0.8828125, "learning_rate": 0.0012105000000000002, "loss": 4.1363, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 808, "tokens_per_second_per_gpu": 29057.94 }, { "epoch": 0.020225, "grad_norm": 0.9296875, "learning_rate": 0.0012120000000000002, "loss": 4.1334, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 809, "tokens_per_second_per_gpu": 15890.7 }, { "epoch": 0.02025, "grad_norm": 0.734375, "learning_rate": 0.0012135000000000002, "loss": 4.1781, "memory/device_reserved (GiB)": 95.73, "memory/max_active (GiB)": 95.72, "memory/max_allocated (GiB)": 95.72, "step": 810, "tokens_per_second_per_gpu": 14070.79 }, { "epoch": 0.020275, "grad_norm": 0.75390625, "learning_rate": 0.0012150000000000002, "loss": 4.1368, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 811, "tokens_per_second_per_gpu": 26670.88 }, { "epoch": 0.0203, "grad_norm": 0.80859375, "learning_rate": 0.0012165000000000001, "loss": 4.1362, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 812, "tokens_per_second_per_gpu": 12493.24 }, { "epoch": 0.020325, "grad_norm": 0.8203125, "learning_rate": 0.0012180000000000001, "loss": 4.1255, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 813, "tokens_per_second_per_gpu": 17896.59 }, { "epoch": 0.02035, "grad_norm": 0.96484375, "learning_rate": 0.0012194999999999999, "loss": 4.1387, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 814, "tokens_per_second_per_gpu": 16803.38 }, { "epoch": 0.020375, "grad_norm": 1.1875, "learning_rate": 0.0012209999999999999, "loss": 4.139, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 815, "tokens_per_second_per_gpu": 18073.49 }, { "epoch": 0.0204, "grad_norm": 0.9296875, "learning_rate": 0.0012224999999999998, "loss": 4.1086, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 816, "tokens_per_second_per_gpu": 24296.81 }, { "epoch": 0.020425, "grad_norm": 1.0546875, "learning_rate": 0.001224, "loss": 4.1126, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 817, "tokens_per_second_per_gpu": 18404.91 }, { "epoch": 0.02045, "grad_norm": 0.88671875, "learning_rate": 0.0012255, "loss": 4.1222, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 818, "tokens_per_second_per_gpu": 20588.47 }, { "epoch": 0.020475, "grad_norm": 0.65625, "learning_rate": 0.001227, "loss": 4.1203, "memory/device_reserved (GiB)": 74.49, "memory/max_active (GiB)": 74.44, "memory/max_allocated (GiB)": 74.44, "step": 819, "tokens_per_second_per_gpu": 17896.45 }, { "epoch": 0.0205, "grad_norm": 0.49609375, "learning_rate": 0.0012285, "loss": 4.1341, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 820, "tokens_per_second_per_gpu": 17234.43 }, { "epoch": 0.020525, "grad_norm": 0.53125, "learning_rate": 0.00123, "loss": 4.0829, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 821, "tokens_per_second_per_gpu": 13545.33 }, { "epoch": 0.02055, "grad_norm": 0.69921875, "learning_rate": 0.0012315, "loss": 4.1075, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 822, "tokens_per_second_per_gpu": 11710.27 }, { "epoch": 0.020575, "grad_norm": 0.9140625, "learning_rate": 0.001233, "loss": 4.1009, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 823, "tokens_per_second_per_gpu": 11693.3 }, { "epoch": 0.0206, "grad_norm": 0.8515625, "learning_rate": 0.0012345, "loss": 4.1233, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 824, "tokens_per_second_per_gpu": 12702.46 }, { "epoch": 0.020625, "grad_norm": 0.90625, "learning_rate": 0.001236, "loss": 4.0797, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 825, "tokens_per_second_per_gpu": 17425.61 }, { "epoch": 0.02065, "grad_norm": 0.734375, "learning_rate": 0.0012374999999999999, "loss": 4.1017, "memory/device_reserved (GiB)": 74.93, "memory/max_active (GiB)": 74.91, "memory/max_allocated (GiB)": 74.91, "step": 826, "tokens_per_second_per_gpu": 18310.54 }, { "epoch": 0.020675, "grad_norm": 0.62890625, "learning_rate": 0.0012389999999999999, "loss": 4.0692, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 827, "tokens_per_second_per_gpu": 17838.16 }, { "epoch": 0.0207, "grad_norm": 0.6953125, "learning_rate": 0.0012405, "loss": 4.087, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 828, "tokens_per_second_per_gpu": 12574.21 }, { "epoch": 0.020725, "grad_norm": 0.640625, "learning_rate": 0.001242, "loss": 4.0547, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 829, "tokens_per_second_per_gpu": 24561.99 }, { "epoch": 0.02075, "grad_norm": 0.6640625, "learning_rate": 0.0012435, "loss": 4.0676, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 830, "tokens_per_second_per_gpu": 22283.75 }, { "epoch": 0.020775, "grad_norm": 0.6328125, "learning_rate": 0.001245, "loss": 4.0627, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 831, "tokens_per_second_per_gpu": 20084.02 }, { "epoch": 0.0208, "grad_norm": 0.458984375, "learning_rate": 0.0012465, "loss": 4.0875, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 832, "tokens_per_second_per_gpu": 25119.83 }, { "epoch": 0.020825, "grad_norm": 0.51171875, "learning_rate": 0.001248, "loss": 4.0833, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 833, "tokens_per_second_per_gpu": 13840.33 }, { "epoch": 0.02085, "grad_norm": 0.61328125, "learning_rate": 0.0012495, "loss": 4.064, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 834, "tokens_per_second_per_gpu": 15856.27 }, { "epoch": 0.020875, "grad_norm": 0.63671875, "learning_rate": 0.001251, "loss": 4.0644, "memory/device_reserved (GiB)": 85.15, "memory/max_active (GiB)": 85.11, "memory/max_allocated (GiB)": 85.11, "step": 835, "tokens_per_second_per_gpu": 16656.51 }, { "epoch": 0.0209, "grad_norm": 0.578125, "learning_rate": 0.0012525, "loss": 4.0609, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 836, "tokens_per_second_per_gpu": 13007.35 }, { "epoch": 0.020925, "grad_norm": 0.81640625, "learning_rate": 0.001254, "loss": 4.0317, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 837, "tokens_per_second_per_gpu": 23741.71 }, { "epoch": 0.02095, "grad_norm": 1.15625, "learning_rate": 0.0012554999999999999, "loss": 4.0322, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 838, "tokens_per_second_per_gpu": 17505.88 }, { "epoch": 0.020975, "grad_norm": 1.0078125, "learning_rate": 0.0012569999999999999, "loss": 4.0589, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 839, "tokens_per_second_per_gpu": 11060.36 }, { "epoch": 0.021, "grad_norm": 1.2578125, "learning_rate": 0.0012585, "loss": 4.0863, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 840, "tokens_per_second_per_gpu": 18467.42 }, { "epoch": 0.021025, "grad_norm": 0.984375, "learning_rate": 0.00126, "loss": 4.0743, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 841, "tokens_per_second_per_gpu": 13551.18 }, { "epoch": 0.02105, "grad_norm": 1.140625, "learning_rate": 0.0012615, "loss": 4.0542, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 842, "tokens_per_second_per_gpu": 22526.22 }, { "epoch": 0.021075, "grad_norm": 1.1171875, "learning_rate": 0.001263, "loss": 4.0585, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 843, "tokens_per_second_per_gpu": 19916.44 }, { "epoch": 0.0211, "grad_norm": 1.078125, "learning_rate": 0.0012645, "loss": 4.0913, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 844, "tokens_per_second_per_gpu": 15062.1 }, { "epoch": 0.021125, "grad_norm": 1.2734375, "learning_rate": 0.001266, "loss": 4.0481, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 845, "tokens_per_second_per_gpu": 14215.92 }, { "epoch": 0.02115, "grad_norm": 0.9765625, "learning_rate": 0.0012675, "loss": 4.0593, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 846, "tokens_per_second_per_gpu": 20619.9 }, { "epoch": 0.021175, "grad_norm": 0.87890625, "learning_rate": 0.001269, "loss": 4.0432, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 847, "tokens_per_second_per_gpu": 23827.3 }, { "epoch": 0.0212, "grad_norm": 0.65625, "learning_rate": 0.0012705, "loss": 4.0844, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 848, "tokens_per_second_per_gpu": 16480.89 }, { "epoch": 0.021225, "grad_norm": 0.7265625, "learning_rate": 0.001272, "loss": 4.0421, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 849, "tokens_per_second_per_gpu": 27513.54 }, { "epoch": 0.02125, "grad_norm": 0.80078125, "learning_rate": 0.0012735, "loss": 4.0831, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 850, "tokens_per_second_per_gpu": 11834.36 }, { "epoch": 0.021275, "grad_norm": 1.0703125, "learning_rate": 0.001275, "loss": 4.0584, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 851, "tokens_per_second_per_gpu": 15753.5 }, { "epoch": 0.0213, "grad_norm": 1.0625, "learning_rate": 0.0012765, "loss": 4.0788, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 852, "tokens_per_second_per_gpu": 15526.51 }, { "epoch": 0.021325, "grad_norm": 0.90625, "learning_rate": 0.001278, "loss": 4.0155, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 853, "tokens_per_second_per_gpu": 15959.97 }, { "epoch": 0.02135, "grad_norm": 0.64453125, "learning_rate": 0.0012795, "loss": 4.0331, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 854, "tokens_per_second_per_gpu": 17057.1 }, { "epoch": 0.021375, "grad_norm": 0.36328125, "learning_rate": 0.001281, "loss": 4.0342, "memory/device_reserved (GiB)": 96.42, "memory/max_active (GiB)": 96.23, "memory/max_allocated (GiB)": 96.23, "step": 855, "tokens_per_second_per_gpu": 13528.85 }, { "epoch": 0.0214, "grad_norm": 0.5390625, "learning_rate": 0.0012825, "loss": 4.0503, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 856, "tokens_per_second_per_gpu": 14013.84 }, { "epoch": 0.021425, "grad_norm": 0.6171875, "learning_rate": 0.001284, "loss": 4.0602, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 857, "tokens_per_second_per_gpu": 14428.79 }, { "epoch": 0.02145, "grad_norm": 0.640625, "learning_rate": 0.0012855, "loss": 4.0421, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 858, "tokens_per_second_per_gpu": 20360.04 }, { "epoch": 0.021475, "grad_norm": 0.71875, "learning_rate": 0.001287, "loss": 3.9854, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 859, "tokens_per_second_per_gpu": 15084.76 }, { "epoch": 0.0215, "grad_norm": 1.1171875, "learning_rate": 0.0012885, "loss": 4.0317, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 860, "tokens_per_second_per_gpu": 14725.39 }, { "epoch": 0.021525, "grad_norm": 0.9921875, "learning_rate": 0.00129, "loss": 4.0075, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 861, "tokens_per_second_per_gpu": 23473.6 }, { "epoch": 0.02155, "grad_norm": 0.875, "learning_rate": 0.0012915000000000001, "loss": 4.0167, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 862, "tokens_per_second_per_gpu": 15717.81 }, { "epoch": 0.021575, "grad_norm": 0.7890625, "learning_rate": 0.001293, "loss": 4.0077, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 863, "tokens_per_second_per_gpu": 16030.15 }, { "epoch": 0.0216, "grad_norm": 0.64453125, "learning_rate": 0.0012945, "loss": 4.0281, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 864, "tokens_per_second_per_gpu": 18075.21 }, { "epoch": 0.021625, "grad_norm": 0.66015625, "learning_rate": 0.001296, "loss": 4.0188, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 865, "tokens_per_second_per_gpu": 23855.71 }, { "epoch": 0.02165, "grad_norm": 0.703125, "learning_rate": 0.0012975, "loss": 4.0069, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 866, "tokens_per_second_per_gpu": 23498.03 }, { "epoch": 0.021675, "grad_norm": 0.5234375, "learning_rate": 0.001299, "loss": 3.9963, "memory/device_reserved (GiB)": 45.24, "memory/max_active (GiB)": 45.19, "memory/max_allocated (GiB)": 45.19, "step": 867, "tokens_per_second_per_gpu": 26653.4 }, { "epoch": 0.0217, "grad_norm": 0.498046875, "learning_rate": 0.0013005, "loss": 4.0078, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 868, "tokens_per_second_per_gpu": 20940.0 }, { "epoch": 0.021725, "grad_norm": 0.53515625, "learning_rate": 0.001302, "loss": 3.9819, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 869, "tokens_per_second_per_gpu": 14051.47 }, { "epoch": 0.02175, "grad_norm": 1.3359375, "learning_rate": 0.0013035, "loss": 4.0201, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 870, "tokens_per_second_per_gpu": 10359.79 }, { "epoch": 0.021775, "grad_norm": 1.3125, "learning_rate": 0.001305, "loss": 4.0266, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 871, "tokens_per_second_per_gpu": 13973.85 }, { "epoch": 0.0218, "grad_norm": 0.6953125, "learning_rate": 0.0013065, "loss": 3.9934, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 872, "tokens_per_second_per_gpu": 15643.53 }, { "epoch": 0.021825, "grad_norm": 0.74609375, "learning_rate": 0.001308, "loss": 3.9843, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 873, "tokens_per_second_per_gpu": 22670.57 }, { "epoch": 0.02185, "grad_norm": 0.875, "learning_rate": 0.0013095000000000001, "loss": 4.0389, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 874, "tokens_per_second_per_gpu": 11062.69 }, { "epoch": 0.021875, "grad_norm": 1.125, "learning_rate": 0.001311, "loss": 3.9729, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 875, "tokens_per_second_per_gpu": 28095.62 }, { "epoch": 0.0219, "grad_norm": 1.4609375, "learning_rate": 0.0013125, "loss": 4.0226, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 876, "tokens_per_second_per_gpu": 12658.97 }, { "epoch": 0.021925, "grad_norm": 0.89453125, "learning_rate": 0.001314, "loss": 4.0382, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 877, "tokens_per_second_per_gpu": 28458.01 }, { "epoch": 0.02195, "grad_norm": 1.2734375, "learning_rate": 0.0013155, "loss": 4.0552, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 878, "tokens_per_second_per_gpu": 10437.82 }, { "epoch": 0.021975, "grad_norm": 1.28125, "learning_rate": 0.001317, "loss": 4.081, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 879, "tokens_per_second_per_gpu": 25314.42 }, { "epoch": 0.022, "grad_norm": 0.88671875, "learning_rate": 0.0013185, "loss": 4.0772, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 880, "tokens_per_second_per_gpu": 12326.21 }, { "epoch": 0.022025, "grad_norm": 1.09375, "learning_rate": 0.00132, "loss": 3.9906, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 881, "tokens_per_second_per_gpu": 27927.19 }, { "epoch": 0.02205, "grad_norm": 1.515625, "learning_rate": 0.0013215, "loss": 4.1023, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 882, "tokens_per_second_per_gpu": 10899.65 }, { "epoch": 0.022075, "grad_norm": 1.1328125, "learning_rate": 0.001323, "loss": 4.0912, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 883, "tokens_per_second_per_gpu": 16038.55 }, { "epoch": 0.0221, "grad_norm": 1.2265625, "learning_rate": 0.0013245, "loss": 4.0446, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 884, "tokens_per_second_per_gpu": 11679.09 }, { "epoch": 0.022125, "grad_norm": 0.88671875, "learning_rate": 0.0013260000000000001, "loss": 4.0132, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 885, "tokens_per_second_per_gpu": 26491.89 }, { "epoch": 0.02215, "grad_norm": 0.671875, "learning_rate": 0.0013275000000000001, "loss": 4.0343, "memory/device_reserved (GiB)": 96.42, "memory/max_active (GiB)": 96.23, "memory/max_allocated (GiB)": 96.23, "step": 886, "tokens_per_second_per_gpu": 13979.76 }, { "epoch": 0.022175, "grad_norm": 0.8046875, "learning_rate": 0.001329, "loss": 4.0093, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 887, "tokens_per_second_per_gpu": 13096.25 }, { "epoch": 0.0222, "grad_norm": 0.80078125, "learning_rate": 0.0013305, "loss": 3.9901, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 888, "tokens_per_second_per_gpu": 12817.32 }, { "epoch": 0.022225, "grad_norm": 1.03125, "learning_rate": 0.001332, "loss": 4.0151, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 889, "tokens_per_second_per_gpu": 11470.04 }, { "epoch": 0.02225, "grad_norm": 0.890625, "learning_rate": 0.0013335, "loss": 4.0131, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 890, "tokens_per_second_per_gpu": 23526.4 }, { "epoch": 0.022275, "grad_norm": 0.53515625, "learning_rate": 0.001335, "loss": 3.9628, "memory/device_reserved (GiB)": 56.59, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 891, "tokens_per_second_per_gpu": 21310.55 }, { "epoch": 0.0223, "grad_norm": 0.6171875, "learning_rate": 0.0013365, "loss": 3.9936, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 892, "tokens_per_second_per_gpu": 17812.97 }, { "epoch": 0.022325, "grad_norm": 0.59375, "learning_rate": 0.001338, "loss": 3.9475, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 893, "tokens_per_second_per_gpu": 17065.57 }, { "epoch": 0.02235, "grad_norm": 0.76953125, "learning_rate": 0.0013395, "loss": 3.9663, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 894, "tokens_per_second_per_gpu": 15336.8 }, { "epoch": 0.022375, "grad_norm": 0.82421875, "learning_rate": 0.001341, "loss": 3.9316, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 895, "tokens_per_second_per_gpu": 13242.21 }, { "epoch": 0.0224, "grad_norm": 0.5859375, "learning_rate": 0.0013425000000000002, "loss": 3.9531, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 896, "tokens_per_second_per_gpu": 16756.75 }, { "epoch": 0.022425, "grad_norm": 0.68359375, "learning_rate": 0.0013440000000000001, "loss": 3.9345, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 897, "tokens_per_second_per_gpu": 20982.85 }, { "epoch": 0.02245, "grad_norm": 0.67578125, "learning_rate": 0.0013455000000000001, "loss": 3.9533, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 898, "tokens_per_second_per_gpu": 17612.85 }, { "epoch": 0.022475, "grad_norm": 0.65234375, "learning_rate": 0.001347, "loss": 3.9224, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 899, "tokens_per_second_per_gpu": 19911.52 }, { "epoch": 0.0225, "grad_norm": 0.56640625, "learning_rate": 0.0013485, "loss": 3.9889, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 900, "tokens_per_second_per_gpu": 26670.29 }, { "epoch": 0.022525, "grad_norm": 0.56640625, "learning_rate": 0.00135, "loss": 3.9216, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 901, "tokens_per_second_per_gpu": 20466.37 }, { "epoch": 0.02255, "grad_norm": 0.53125, "learning_rate": 0.0013515, "loss": 3.9014, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 902, "tokens_per_second_per_gpu": 28610.62 }, { "epoch": 0.022575, "grad_norm": 0.392578125, "learning_rate": 0.001353, "loss": 3.9174, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 903, "tokens_per_second_per_gpu": 24479.02 }, { "epoch": 0.0226, "grad_norm": 0.380859375, "learning_rate": 0.0013545, "loss": 3.9153, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 904, "tokens_per_second_per_gpu": 13620.18 }, { "epoch": 0.022625, "grad_norm": 0.546875, "learning_rate": 0.001356, "loss": 3.9454, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 905, "tokens_per_second_per_gpu": 12376.1 }, { "epoch": 0.02265, "grad_norm": 0.6484375, "learning_rate": 0.0013575, "loss": 3.9138, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 906, "tokens_per_second_per_gpu": 17704.89 }, { "epoch": 0.022675, "grad_norm": 0.5234375, "learning_rate": 0.001359, "loss": 3.9255, "memory/device_reserved (GiB)": 85.14, "memory/max_active (GiB)": 85.11, "memory/max_allocated (GiB)": 85.11, "step": 907, "tokens_per_second_per_gpu": 17289.73 }, { "epoch": 0.0227, "grad_norm": 0.427734375, "learning_rate": 0.0013605000000000002, "loss": 3.9385, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 908, "tokens_per_second_per_gpu": 23258.4 }, { "epoch": 0.022725, "grad_norm": 0.470703125, "learning_rate": 0.0013620000000000001, "loss": 3.8963, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 909, "tokens_per_second_per_gpu": 19382.08 }, { "epoch": 0.02275, "grad_norm": 0.6328125, "learning_rate": 0.0013635000000000001, "loss": 3.8996, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 910, "tokens_per_second_per_gpu": 10973.71 }, { "epoch": 0.022775, "grad_norm": 1.046875, "learning_rate": 0.0013650000000000001, "loss": 3.9245, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 911, "tokens_per_second_per_gpu": 10841.36 }, { "epoch": 0.0228, "grad_norm": 0.921875, "learning_rate": 0.0013665, "loss": 3.917, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 912, "tokens_per_second_per_gpu": 15074.37 }, { "epoch": 0.022825, "grad_norm": 0.75390625, "learning_rate": 0.001368, "loss": 3.9317, "memory/device_reserved (GiB)": 75.81, "memory/max_active (GiB)": 75.79, "memory/max_allocated (GiB)": 75.79, "step": 913, "tokens_per_second_per_gpu": 18003.57 }, { "epoch": 0.02285, "grad_norm": 0.82421875, "learning_rate": 0.0013695, "loss": 3.899, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 914, "tokens_per_second_per_gpu": 17039.14 }, { "epoch": 0.022875, "grad_norm": 1.0234375, "learning_rate": 0.001371, "loss": 3.9422, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 915, "tokens_per_second_per_gpu": 23021.98 }, { "epoch": 0.0229, "grad_norm": 0.96875, "learning_rate": 0.0013725, "loss": 3.973, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 916, "tokens_per_second_per_gpu": 20274.11 }, { "epoch": 0.022925, "grad_norm": 0.9765625, "learning_rate": 0.001374, "loss": 3.9461, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 917, "tokens_per_second_per_gpu": 13334.83 }, { "epoch": 0.02295, "grad_norm": 0.81640625, "learning_rate": 0.0013755, "loss": 3.9683, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 918, "tokens_per_second_per_gpu": 20671.17 }, { "epoch": 0.022975, "grad_norm": 0.77734375, "learning_rate": 0.0013770000000000002, "loss": 3.9324, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 919, "tokens_per_second_per_gpu": 21422.43 }, { "epoch": 0.023, "grad_norm": 0.984375, "learning_rate": 0.0013785000000000002, "loss": 3.9645, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 920, "tokens_per_second_per_gpu": 10873.25 }, { "epoch": 0.023025, "grad_norm": 1.046875, "learning_rate": 0.0013800000000000002, "loss": 3.978, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 921, "tokens_per_second_per_gpu": 12252.57 }, { "epoch": 0.02305, "grad_norm": 0.73046875, "learning_rate": 0.0013815000000000001, "loss": 3.9689, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 922, "tokens_per_second_per_gpu": 21037.4 }, { "epoch": 0.023075, "grad_norm": 0.76171875, "learning_rate": 0.0013830000000000001, "loss": 4.0102, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 923, "tokens_per_second_per_gpu": 13943.05 }, { "epoch": 0.0231, "grad_norm": 0.7734375, "learning_rate": 0.0013845, "loss": 3.9346, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 924, "tokens_per_second_per_gpu": 13585.59 }, { "epoch": 0.023125, "grad_norm": 0.890625, "learning_rate": 0.001386, "loss": 3.9224, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 925, "tokens_per_second_per_gpu": 13274.82 }, { "epoch": 0.02315, "grad_norm": 1.03125, "learning_rate": 0.0013875, "loss": 3.9848, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 926, "tokens_per_second_per_gpu": 13176.35 }, { "epoch": 0.023175, "grad_norm": 0.68359375, "learning_rate": 0.001389, "loss": 3.9347, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 927, "tokens_per_second_per_gpu": 13490.58 }, { "epoch": 0.0232, "grad_norm": 0.84765625, "learning_rate": 0.0013905, "loss": 3.9535, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 928, "tokens_per_second_per_gpu": 11528.23 }, { "epoch": 0.023225, "grad_norm": 0.9375, "learning_rate": 0.001392, "loss": 3.9248, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 929, "tokens_per_second_per_gpu": 11223.5 }, { "epoch": 0.02325, "grad_norm": 0.83203125, "learning_rate": 0.0013935000000000002, "loss": 3.9478, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 930, "tokens_per_second_per_gpu": 14848.62 }, { "epoch": 0.023275, "grad_norm": 0.6796875, "learning_rate": 0.0013950000000000002, "loss": 3.9028, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 931, "tokens_per_second_per_gpu": 23873.11 }, { "epoch": 0.0233, "grad_norm": 0.63671875, "learning_rate": 0.0013965000000000002, "loss": 3.9268, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 932, "tokens_per_second_per_gpu": 12305.03 }, { "epoch": 0.023325, "grad_norm": 0.65625, "learning_rate": 0.0013980000000000002, "loss": 3.8898, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 933, "tokens_per_second_per_gpu": 15448.51 }, { "epoch": 0.02335, "grad_norm": 0.41796875, "learning_rate": 0.0013995000000000001, "loss": 3.9331, "memory/device_reserved (GiB)": 55.41, "memory/max_active (GiB)": 55.39, "memory/max_allocated (GiB)": 55.39, "step": 934, "tokens_per_second_per_gpu": 23941.59 }, { "epoch": 0.023375, "grad_norm": 0.478515625, "learning_rate": 0.0014010000000000001, "loss": 3.8942, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 935, "tokens_per_second_per_gpu": 18095.23 }, { "epoch": 0.0234, "grad_norm": 0.359375, "learning_rate": 0.0014025, "loss": 3.8632, "memory/device_reserved (GiB)": 55.57, "memory/max_active (GiB)": 55.42, "memory/max_allocated (GiB)": 55.42, "step": 936, "tokens_per_second_per_gpu": 23330.72 }, { "epoch": 0.023425, "grad_norm": 0.4140625, "learning_rate": 0.001404, "loss": 3.8707, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 937, "tokens_per_second_per_gpu": 15416.07 }, { "epoch": 0.02345, "grad_norm": 0.498046875, "learning_rate": 0.0014055, "loss": 3.8626, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 938, "tokens_per_second_per_gpu": 16111.36 }, { "epoch": 0.023475, "grad_norm": 0.52734375, "learning_rate": 0.001407, "loss": 3.8574, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 939, "tokens_per_second_per_gpu": 14963.65 }, { "epoch": 0.0235, "grad_norm": 0.5703125, "learning_rate": 0.0014085, "loss": 3.8668, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 940, "tokens_per_second_per_gpu": 17291.5 }, { "epoch": 0.023525, "grad_norm": 0.76953125, "learning_rate": 0.00141, "loss": 3.8567, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 941, "tokens_per_second_per_gpu": 15573.26 }, { "epoch": 0.02355, "grad_norm": 0.83984375, "learning_rate": 0.0014115, "loss": 3.8794, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 942, "tokens_per_second_per_gpu": 15776.05 }, { "epoch": 0.023575, "grad_norm": 0.72265625, "learning_rate": 0.001413, "loss": 3.89, "memory/device_reserved (GiB)": 84.66, "memory/max_active (GiB)": 84.64, "memory/max_allocated (GiB)": 84.64, "step": 943, "tokens_per_second_per_gpu": 15874.98 }, { "epoch": 0.0236, "grad_norm": 0.58203125, "learning_rate": 0.0014145, "loss": 3.8764, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 944, "tokens_per_second_per_gpu": 23229.53 }, { "epoch": 0.023625, "grad_norm": 0.61328125, "learning_rate": 0.001416, "loss": 3.8395, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 945, "tokens_per_second_per_gpu": 22653.43 }, { "epoch": 0.02365, "grad_norm": 0.5703125, "learning_rate": 0.0014175, "loss": 3.8758, "memory/device_reserved (GiB)": 95.73, "memory/max_active (GiB)": 95.72, "memory/max_allocated (GiB)": 95.72, "step": 946, "tokens_per_second_per_gpu": 13900.52 }, { "epoch": 0.023675, "grad_norm": 0.498046875, "learning_rate": 0.001419, "loss": 3.8207, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 947, "tokens_per_second_per_gpu": 16593.47 }, { "epoch": 0.0237, "grad_norm": 0.546875, "learning_rate": 0.0014204999999999999, "loss": 3.8448, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 948, "tokens_per_second_per_gpu": 23267.99 }, { "epoch": 0.023725, "grad_norm": 0.67578125, "learning_rate": 0.0014219999999999999, "loss": 3.8187, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 949, "tokens_per_second_per_gpu": 11958.63 }, { "epoch": 0.02375, "grad_norm": 0.8203125, "learning_rate": 0.0014235, "loss": 3.8568, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 950, "tokens_per_second_per_gpu": 23694.63 }, { "epoch": 0.023775, "grad_norm": 0.78515625, "learning_rate": 0.001425, "loss": 3.8024, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 951, "tokens_per_second_per_gpu": 10526.04 }, { "epoch": 0.0238, "grad_norm": 0.9375, "learning_rate": 0.0014265, "loss": 3.8821, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 952, "tokens_per_second_per_gpu": 13670.56 }, { "epoch": 0.023825, "grad_norm": 1.0, "learning_rate": 0.001428, "loss": 3.8379, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 953, "tokens_per_second_per_gpu": 18058.93 }, { "epoch": 0.02385, "grad_norm": 0.85546875, "learning_rate": 0.0014295, "loss": 3.9334, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 954, "tokens_per_second_per_gpu": 13455.05 }, { "epoch": 0.023875, "grad_norm": 0.6953125, "learning_rate": 0.001431, "loss": 3.8673, "memory/device_reserved (GiB)": 65.8, "memory/max_active (GiB)": 65.62, "memory/max_allocated (GiB)": 65.62, "step": 955, "tokens_per_second_per_gpu": 19364.48 }, { "epoch": 0.0239, "grad_norm": 0.671875, "learning_rate": 0.0014325, "loss": 3.8755, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 956, "tokens_per_second_per_gpu": 22980.75 }, { "epoch": 0.023925, "grad_norm": 0.6171875, "learning_rate": 0.001434, "loss": 3.8684, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 957, "tokens_per_second_per_gpu": 18867.0 }, { "epoch": 0.02395, "grad_norm": 0.59375, "learning_rate": 0.0014355, "loss": 3.8433, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 958, "tokens_per_second_per_gpu": 10580.39 }, { "epoch": 0.023975, "grad_norm": 0.84765625, "learning_rate": 0.001437, "loss": 3.9009, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 959, "tokens_per_second_per_gpu": 15335.09 }, { "epoch": 0.024, "grad_norm": 1.1484375, "learning_rate": 0.0014385, "loss": 3.8835, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 960, "tokens_per_second_per_gpu": 19743.25 }, { "epoch": 0.024025, "grad_norm": 0.8125, "learning_rate": 0.0014399999999999999, "loss": 3.9088, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 961, "tokens_per_second_per_gpu": 13366.57 }, { "epoch": 0.02405, "grad_norm": 0.68359375, "learning_rate": 0.0014415, "loss": 3.8392, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 962, "tokens_per_second_per_gpu": 15137.11 }, { "epoch": 0.024075, "grad_norm": 0.75390625, "learning_rate": 0.001443, "loss": 3.8867, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 963, "tokens_per_second_per_gpu": 24137.73 }, { "epoch": 0.0241, "grad_norm": 0.58203125, "learning_rate": 0.0014445, "loss": 3.8526, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 964, "tokens_per_second_per_gpu": 27848.67 }, { "epoch": 0.024125, "grad_norm": 0.4921875, "learning_rate": 0.001446, "loss": 3.83, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 965, "tokens_per_second_per_gpu": 23903.04 }, { "epoch": 0.02415, "grad_norm": 0.4765625, "learning_rate": 0.0014475, "loss": 3.8426, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 966, "tokens_per_second_per_gpu": 12486.59 }, { "epoch": 0.024175, "grad_norm": 0.56640625, "learning_rate": 0.001449, "loss": 3.8456, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 967, "tokens_per_second_per_gpu": 19515.44 }, { "epoch": 0.0242, "grad_norm": 0.640625, "learning_rate": 0.0014505, "loss": 3.8537, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 968, "tokens_per_second_per_gpu": 13157.83 }, { "epoch": 0.024225, "grad_norm": 0.84765625, "learning_rate": 0.001452, "loss": 3.8317, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 969, "tokens_per_second_per_gpu": 15750.14 }, { "epoch": 0.02425, "grad_norm": 0.890625, "learning_rate": 0.0014535, "loss": 3.8548, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 970, "tokens_per_second_per_gpu": 23653.19 }, { "epoch": 0.024275, "grad_norm": 0.8359375, "learning_rate": 0.001455, "loss": 3.8772, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 971, "tokens_per_second_per_gpu": 10718.48 }, { "epoch": 0.0243, "grad_norm": 0.7109375, "learning_rate": 0.0014565, "loss": 3.8565, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 972, "tokens_per_second_per_gpu": 14449.35 }, { "epoch": 0.024325, "grad_norm": 0.60546875, "learning_rate": 0.001458, "loss": 3.8454, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 973, "tokens_per_second_per_gpu": 19491.48 }, { "epoch": 0.02435, "grad_norm": 0.466796875, "learning_rate": 0.0014595, "loss": 3.8504, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 974, "tokens_per_second_per_gpu": 13619.52 }, { "epoch": 0.024375, "grad_norm": 0.4296875, "learning_rate": 0.001461, "loss": 3.8155, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 975, "tokens_per_second_per_gpu": 26050.49 }, { "epoch": 0.0244, "grad_norm": 0.40625, "learning_rate": 0.0014625, "loss": 3.803, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 976, "tokens_per_second_per_gpu": 20954.15 }, { "epoch": 0.024425, "grad_norm": 0.314453125, "learning_rate": 0.001464, "loss": 3.8207, "memory/device_reserved (GiB)": 55.57, "memory/max_active (GiB)": 55.42, "memory/max_allocated (GiB)": 55.42, "step": 977, "tokens_per_second_per_gpu": 23006.27 }, { "epoch": 0.02445, "grad_norm": 0.369140625, "learning_rate": 0.0014655, "loss": 3.8411, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 978, "tokens_per_second_per_gpu": 10754.76 }, { "epoch": 0.024475, "grad_norm": 0.50390625, "learning_rate": 0.001467, "loss": 3.8222, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 979, "tokens_per_second_per_gpu": 13761.76 }, { "epoch": 0.0245, "grad_norm": 0.546875, "learning_rate": 0.0014685, "loss": 3.8074, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 980, "tokens_per_second_per_gpu": 19754.58 }, { "epoch": 0.024525, "grad_norm": 0.83203125, "learning_rate": 0.00147, "loss": 3.8476, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 981, "tokens_per_second_per_gpu": 10552.12 }, { "epoch": 0.02455, "grad_norm": 0.703125, "learning_rate": 0.0014715, "loss": 3.8749, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 982, "tokens_per_second_per_gpu": 21501.88 }, { "epoch": 0.024575, "grad_norm": 0.45703125, "learning_rate": 0.001473, "loss": 3.8403, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 983, "tokens_per_second_per_gpu": 12625.18 }, { "epoch": 0.0246, "grad_norm": 0.51953125, "learning_rate": 0.0014745000000000001, "loss": 3.7824, "memory/device_reserved (GiB)": 66.36, "memory/max_active (GiB)": 66.15, "memory/max_allocated (GiB)": 66.15, "step": 984, "tokens_per_second_per_gpu": 20039.15 }, { "epoch": 0.024625, "grad_norm": 0.45703125, "learning_rate": 0.001476, "loss": 3.8108, "memory/device_reserved (GiB)": 85.15, "memory/max_active (GiB)": 85.11, "memory/max_allocated (GiB)": 85.11, "step": 985, "tokens_per_second_per_gpu": 16059.3 }, { "epoch": 0.02465, "grad_norm": 0.40234375, "learning_rate": 0.0014775, "loss": 3.8031, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 986, "tokens_per_second_per_gpu": 11513.81 }, { "epoch": 0.024675, "grad_norm": 0.62109375, "learning_rate": 0.001479, "loss": 3.814, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 987, "tokens_per_second_per_gpu": 10702.7 }, { "epoch": 0.0247, "grad_norm": 1.0, "learning_rate": 0.0014805, "loss": 3.8101, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 988, "tokens_per_second_per_gpu": 17397.73 }, { "epoch": 0.024725, "grad_norm": 1.1796875, "learning_rate": 0.001482, "loss": 3.8563, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 989, "tokens_per_second_per_gpu": 10834.02 }, { "epoch": 0.02475, "grad_norm": 0.921875, "learning_rate": 0.0014835, "loss": 3.9141, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 990, "tokens_per_second_per_gpu": 15193.85 }, { "epoch": 0.024775, "grad_norm": 0.9765625, "learning_rate": 0.001485, "loss": 3.8371, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 991, "tokens_per_second_per_gpu": 13587.94 }, { "epoch": 0.0248, "grad_norm": 0.75, "learning_rate": 0.0014865, "loss": 3.8317, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 992, "tokens_per_second_per_gpu": 20300.88 }, { "epoch": 0.024825, "grad_norm": 0.80859375, "learning_rate": 0.001488, "loss": 3.8562, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 993, "tokens_per_second_per_gpu": 13914.44 }, { "epoch": 0.02485, "grad_norm": 0.8125, "learning_rate": 0.0014895, "loss": 3.8611, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 994, "tokens_per_second_per_gpu": 10567.36 }, { "epoch": 0.024875, "grad_norm": 0.9609375, "learning_rate": 0.001491, "loss": 3.8366, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 995, "tokens_per_second_per_gpu": 26595.76 }, { "epoch": 0.0249, "grad_norm": 0.7734375, "learning_rate": 0.0014925000000000001, "loss": 3.8521, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 996, "tokens_per_second_per_gpu": 12717.51 }, { "epoch": 0.024925, "grad_norm": 0.8984375, "learning_rate": 0.001494, "loss": 3.8545, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 997, "tokens_per_second_per_gpu": 15684.64 }, { "epoch": 0.02495, "grad_norm": 1.015625, "learning_rate": 0.0014955, "loss": 3.8618, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 998, "tokens_per_second_per_gpu": 10815.57 }, { "epoch": 0.024975, "grad_norm": 1.234375, "learning_rate": 0.001497, "loss": 3.8557, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 999, "tokens_per_second_per_gpu": 19350.86 }, { "epoch": 0.025, "grad_norm": 0.87890625, "learning_rate": 0.0014985, "loss": 3.8257, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1000, "tokens_per_second_per_gpu": 23345.81 }, { "epoch": 0.025025, "grad_norm": 0.6484375, "learning_rate": 0.0015, "loss": 3.8855, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1001, "tokens_per_second_per_gpu": 13101.28 }, { "epoch": 0.02505, "grad_norm": 0.59765625, "learning_rate": 0.0015014999999999998, "loss": 3.8587, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1002, "tokens_per_second_per_gpu": 13662.68 }, { "epoch": 0.025075, "grad_norm": 0.42578125, "learning_rate": 0.001503, "loss": 3.8366, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 1003, "tokens_per_second_per_gpu": 13209.41 }, { "epoch": 0.0251, "grad_norm": 0.46875, "learning_rate": 0.0015044999999999998, "loss": 3.8588, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1004, "tokens_per_second_per_gpu": 15037.85 }, { "epoch": 0.025125, "grad_norm": 0.43359375, "learning_rate": 0.001506, "loss": 3.8347, "memory/device_reserved (GiB)": 95.73, "memory/max_active (GiB)": 95.72, "memory/max_allocated (GiB)": 95.72, "step": 1005, "tokens_per_second_per_gpu": 14234.08 }, { "epoch": 0.02515, "grad_norm": 0.5, "learning_rate": 0.0015075, "loss": 3.8213, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1006, "tokens_per_second_per_gpu": 15800.49 }, { "epoch": 0.025175, "grad_norm": 0.45703125, "learning_rate": 0.0015090000000000001, "loss": 3.8009, "memory/device_reserved (GiB)": 74.49, "memory/max_active (GiB)": 74.44, "memory/max_allocated (GiB)": 74.44, "step": 1007, "tokens_per_second_per_gpu": 17730.89 }, { "epoch": 0.0252, "grad_norm": 0.40625, "learning_rate": 0.0015105, "loss": 3.7949, "memory/device_reserved (GiB)": 106.26, "memory/max_active (GiB)": 106.08, "memory/max_allocated (GiB)": 106.08, "step": 1008, "tokens_per_second_per_gpu": 12304.59 }, { "epoch": 0.025225, "grad_norm": 0.5, "learning_rate": 0.001512, "loss": 3.7784, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1009, "tokens_per_second_per_gpu": 24007.98 }, { "epoch": 0.02525, "grad_norm": 0.6875, "learning_rate": 0.0015134999999999999, "loss": 3.8071, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1010, "tokens_per_second_per_gpu": 11431.54 }, { "epoch": 0.025275, "grad_norm": 0.63671875, "learning_rate": 0.001515, "loss": 3.8283, "memory/device_reserved (GiB)": 65.8, "memory/max_active (GiB)": 65.62, "memory/max_allocated (GiB)": 65.62, "step": 1011, "tokens_per_second_per_gpu": 20643.48 }, { "epoch": 0.0253, "grad_norm": 0.66796875, "learning_rate": 0.0015164999999999998, "loss": 3.8164, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1012, "tokens_per_second_per_gpu": 16936.06 }, { "epoch": 0.025325, "grad_norm": 0.70703125, "learning_rate": 0.001518, "loss": 3.7739, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1013, "tokens_per_second_per_gpu": 14889.44 }, { "epoch": 0.02535, "grad_norm": 0.74609375, "learning_rate": 0.0015194999999999998, "loss": 3.8269, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1014, "tokens_per_second_per_gpu": 20438.35 }, { "epoch": 0.025375, "grad_norm": 0.80859375, "learning_rate": 0.001521, "loss": 3.8121, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1015, "tokens_per_second_per_gpu": 19443.93 }, { "epoch": 0.0254, "grad_norm": 0.76953125, "learning_rate": 0.0015225, "loss": 3.7926, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1016, "tokens_per_second_per_gpu": 27679.27 }, { "epoch": 0.025425, "grad_norm": 0.80078125, "learning_rate": 0.001524, "loss": 3.8151, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1017, "tokens_per_second_per_gpu": 3419.37 }, { "epoch": 0.02545, "grad_norm": 0.68359375, "learning_rate": 0.0015255, "loss": 3.7762, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1018, "tokens_per_second_per_gpu": 15560.66 }, { "epoch": 0.025475, "grad_norm": 0.60546875, "learning_rate": 0.0015270000000000001, "loss": 3.8004, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1019, "tokens_per_second_per_gpu": 12807.42 }, { "epoch": 0.0255, "grad_norm": 0.5703125, "learning_rate": 0.0015285, "loss": 3.7846, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1020, "tokens_per_second_per_gpu": 23418.89 }, { "epoch": 0.025525, "grad_norm": 0.50390625, "learning_rate": 0.0015300000000000001, "loss": 3.7378, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1021, "tokens_per_second_per_gpu": 14583.14 }, { "epoch": 0.02555, "grad_norm": 0.2890625, "learning_rate": 0.0015314999999999999, "loss": 3.821, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.42, "memory/max_allocated (GiB)": 107.42, "step": 1022, "tokens_per_second_per_gpu": 12486.07 }, { "epoch": 0.025575, "grad_norm": 0.453125, "learning_rate": 0.001533, "loss": 3.7984, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1023, "tokens_per_second_per_gpu": 11696.62 }, { "epoch": 0.0256, "grad_norm": 0.5078125, "learning_rate": 0.0015344999999999998, "loss": 3.7579, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1024, "tokens_per_second_per_gpu": 13398.28 }, { "epoch": 0.025625, "grad_norm": 0.42578125, "learning_rate": 0.001536, "loss": 3.753, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1025, "tokens_per_second_per_gpu": 19230.89 }, { "epoch": 0.02565, "grad_norm": 0.341796875, "learning_rate": 0.0015374999999999998, "loss": 3.8058, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1026, "tokens_per_second_per_gpu": 14408.12 }, { "epoch": 0.025675, "grad_norm": 0.279296875, "learning_rate": 0.001539, "loss": 3.8046, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1027, "tokens_per_second_per_gpu": 4899.21 }, { "epoch": 0.0257, "grad_norm": 0.185546875, "learning_rate": 0.0015405, "loss": 3.7835, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.22, "memory/max_allocated (GiB)": 97.22, "step": 1028, "tokens_per_second_per_gpu": 14029.0 }, { "epoch": 0.025725, "grad_norm": 0.33984375, "learning_rate": 0.001542, "loss": 3.7593, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1029, "tokens_per_second_per_gpu": 15568.33 }, { "epoch": 0.02575, "grad_norm": 0.53515625, "learning_rate": 0.0015435, "loss": 3.7599, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1030, "tokens_per_second_per_gpu": 13305.08 }, { "epoch": 0.025775, "grad_norm": 0.85546875, "learning_rate": 0.0015450000000000001, "loss": 3.7875, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1031, "tokens_per_second_per_gpu": 10718.58 }, { "epoch": 0.0258, "grad_norm": 0.5390625, "learning_rate": 0.0015465, "loss": 3.7951, "memory/device_reserved (GiB)": 65.8, "memory/max_active (GiB)": 65.62, "memory/max_allocated (GiB)": 65.62, "step": 1032, "tokens_per_second_per_gpu": 19981.2 }, { "epoch": 0.025825, "grad_norm": 0.53515625, "learning_rate": 0.0015480000000000001, "loss": 3.7594, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1033, "tokens_per_second_per_gpu": 16585.04 }, { "epoch": 0.02585, "grad_norm": 0.64453125, "learning_rate": 0.0015494999999999999, "loss": 3.7911, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1034, "tokens_per_second_per_gpu": 17926.76 }, { "epoch": 0.025875, "grad_norm": 0.7109375, "learning_rate": 0.001551, "loss": 3.7632, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1035, "tokens_per_second_per_gpu": 17434.04 }, { "epoch": 0.0259, "grad_norm": 0.76953125, "learning_rate": 0.0015524999999999998, "loss": 3.7834, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1036, "tokens_per_second_per_gpu": 17650.11 }, { "epoch": 0.025925, "grad_norm": 0.74609375, "learning_rate": 0.001554, "loss": 3.7991, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1037, "tokens_per_second_per_gpu": 12114.91 }, { "epoch": 0.02595, "grad_norm": 0.5390625, "learning_rate": 0.0015554999999999998, "loss": 3.8005, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1038, "tokens_per_second_per_gpu": 22807.43 }, { "epoch": 0.025975, "grad_norm": 0.515625, "learning_rate": 0.001557, "loss": 3.7441, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1039, "tokens_per_second_per_gpu": 28028.09 }, { "epoch": 0.026, "grad_norm": 0.49609375, "learning_rate": 0.0015585, "loss": 3.7617, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1040, "tokens_per_second_per_gpu": 13238.88 }, { "epoch": 0.026025, "grad_norm": 0.53515625, "learning_rate": 0.0015600000000000002, "loss": 3.7561, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1041, "tokens_per_second_per_gpu": 14037.52 }, { "epoch": 0.02605, "grad_norm": 0.57421875, "learning_rate": 0.0015615, "loss": 3.7456, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1042, "tokens_per_second_per_gpu": 16320.97 }, { "epoch": 0.026075, "grad_norm": 0.73828125, "learning_rate": 0.0015630000000000002, "loss": 3.7871, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1043, "tokens_per_second_per_gpu": 11215.62 }, { "epoch": 0.0261, "grad_norm": 1.140625, "learning_rate": 0.0015645, "loss": 3.7982, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1044, "tokens_per_second_per_gpu": 15441.98 }, { "epoch": 0.026125, "grad_norm": 1.0234375, "learning_rate": 0.0015660000000000001, "loss": 3.7988, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1045, "tokens_per_second_per_gpu": 23855.85 }, { "epoch": 0.02615, "grad_norm": 0.85546875, "learning_rate": 0.0015674999999999999, "loss": 3.8019, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1046, "tokens_per_second_per_gpu": 28122.72 }, { "epoch": 0.026175, "grad_norm": 0.7890625, "learning_rate": 0.001569, "loss": 3.7816, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1047, "tokens_per_second_per_gpu": 18416.42 }, { "epoch": 0.0262, "grad_norm": 0.59375, "learning_rate": 0.0015704999999999998, "loss": 3.77, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 1048, "tokens_per_second_per_gpu": 3102.1 }, { "epoch": 0.026225, "grad_norm": 0.62109375, "learning_rate": 0.001572, "loss": 3.7663, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1049, "tokens_per_second_per_gpu": 18207.25 }, { "epoch": 0.02625, "grad_norm": 0.5390625, "learning_rate": 0.0015735, "loss": 3.7739, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1050, "tokens_per_second_per_gpu": 10525.97 }, { "epoch": 0.026275, "grad_norm": 0.6796875, "learning_rate": 0.001575, "loss": 3.7992, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1051, "tokens_per_second_per_gpu": 12692.57 }, { "epoch": 0.0263, "grad_norm": 0.69140625, "learning_rate": 0.0015765, "loss": 3.8097, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1052, "tokens_per_second_per_gpu": 17469.97 }, { "epoch": 0.026325, "grad_norm": 0.59765625, "learning_rate": 0.0015780000000000002, "loss": 3.7431, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1053, "tokens_per_second_per_gpu": 4700.88 }, { "epoch": 0.02635, "grad_norm": 0.69140625, "learning_rate": 0.0015795, "loss": 3.762, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1054, "tokens_per_second_per_gpu": 15762.73 }, { "epoch": 0.026375, "grad_norm": 0.61328125, "learning_rate": 0.0015810000000000002, "loss": 3.7843, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1055, "tokens_per_second_per_gpu": 18695.94 }, { "epoch": 0.0264, "grad_norm": 0.578125, "learning_rate": 0.0015825, "loss": 3.775, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1056, "tokens_per_second_per_gpu": 6997.0 }, { "epoch": 0.026425, "grad_norm": 0.52734375, "learning_rate": 0.0015840000000000001, "loss": 3.7206, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1057, "tokens_per_second_per_gpu": 20091.79 }, { "epoch": 0.02645, "grad_norm": 0.57421875, "learning_rate": 0.0015854999999999999, "loss": 3.7514, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1058, "tokens_per_second_per_gpu": 28574.03 }, { "epoch": 0.026475, "grad_norm": 0.60546875, "learning_rate": 0.001587, "loss": 3.7333, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1059, "tokens_per_second_per_gpu": 16406.26 }, { "epoch": 0.0265, "grad_norm": 0.5625, "learning_rate": 0.0015884999999999999, "loss": 3.7451, "memory/device_reserved (GiB)": 116.14, "memory/max_active (GiB)": 116.12, "memory/max_allocated (GiB)": 116.12, "step": 1060, "tokens_per_second_per_gpu": 11861.85 }, { "epoch": 0.026525, "grad_norm": 0.462890625, "learning_rate": 0.00159, "loss": 3.7236, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1061, "tokens_per_second_per_gpu": 27019.17 }, { "epoch": 0.02655, "grad_norm": 0.5625, "learning_rate": 0.0015915, "loss": 3.7464, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1062, "tokens_per_second_per_gpu": 3224.55 }, { "epoch": 0.026575, "grad_norm": 0.69921875, "learning_rate": 0.001593, "loss": 3.7289, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1063, "tokens_per_second_per_gpu": 18666.09 }, { "epoch": 0.0266, "grad_norm": 0.74609375, "learning_rate": 0.0015945, "loss": 3.7298, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1064, "tokens_per_second_per_gpu": 17376.4 }, { "epoch": 0.026625, "grad_norm": 0.63671875, "learning_rate": 0.0015960000000000002, "loss": 3.7279, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1065, "tokens_per_second_per_gpu": 18241.08 }, { "epoch": 0.02665, "grad_norm": 0.55078125, "learning_rate": 0.0015975, "loss": 3.7267, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1066, "tokens_per_second_per_gpu": 24232.87 }, { "epoch": 0.026675, "grad_norm": 0.59765625, "learning_rate": 0.0015990000000000002, "loss": 3.7639, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1067, "tokens_per_second_per_gpu": 17865.88 }, { "epoch": 0.0267, "grad_norm": 0.87890625, "learning_rate": 0.0016005, "loss": 3.7656, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1068, "tokens_per_second_per_gpu": 20846.45 }, { "epoch": 0.026725, "grad_norm": 0.76171875, "learning_rate": 0.0016020000000000001, "loss": 3.7727, "memory/device_reserved (GiB)": 74.49, "memory/max_active (GiB)": 74.44, "memory/max_allocated (GiB)": 74.44, "step": 1069, "tokens_per_second_per_gpu": 17665.41 }, { "epoch": 0.02675, "grad_norm": 0.466796875, "learning_rate": 0.0016034999999999999, "loss": 3.7921, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1070, "tokens_per_second_per_gpu": 17011.55 }, { "epoch": 0.026775, "grad_norm": 0.474609375, "learning_rate": 0.001605, "loss": 3.7313, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1071, "tokens_per_second_per_gpu": 14263.11 }, { "epoch": 0.0268, "grad_norm": 0.5390625, "learning_rate": 0.0016064999999999999, "loss": 3.7216, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1072, "tokens_per_second_per_gpu": 12346.69 }, { "epoch": 0.026825, "grad_norm": 0.7265625, "learning_rate": 0.001608, "loss": 3.7305, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1073, "tokens_per_second_per_gpu": 11809.43 }, { "epoch": 0.02685, "grad_norm": 0.515625, "learning_rate": 0.0016095, "loss": 3.7155, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1074, "tokens_per_second_per_gpu": 4006.26 }, { "epoch": 0.026875, "grad_norm": 0.4921875, "learning_rate": 0.0016110000000000002, "loss": 3.7487, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1075, "tokens_per_second_per_gpu": 17633.93 }, { "epoch": 0.0269, "grad_norm": 0.41015625, "learning_rate": 0.0016125, "loss": 3.7358, "memory/device_reserved (GiB)": 54.53, "memory/max_active (GiB)": 54.51, "memory/max_allocated (GiB)": 54.51, "step": 1076, "tokens_per_second_per_gpu": 24026.06 }, { "epoch": 0.026925, "grad_norm": 0.46484375, "learning_rate": 0.0016140000000000002, "loss": 3.7104, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1077, "tokens_per_second_per_gpu": 18390.34 }, { "epoch": 0.02695, "grad_norm": 0.5234375, "learning_rate": 0.0016155, "loss": 3.7244, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1078, "tokens_per_second_per_gpu": 12745.43 }, { "epoch": 0.026975, "grad_norm": 0.5234375, "learning_rate": 0.0016170000000000002, "loss": 3.685, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1079, "tokens_per_second_per_gpu": 25442.44 }, { "epoch": 0.027, "grad_norm": 0.6015625, "learning_rate": 0.0016185, "loss": 3.7236, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1080, "tokens_per_second_per_gpu": 22191.52 }, { "epoch": 0.027025, "grad_norm": 0.7578125, "learning_rate": 0.0016200000000000001, "loss": 3.7204, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1081, "tokens_per_second_per_gpu": 20407.79 }, { "epoch": 0.02705, "grad_norm": 0.47265625, "learning_rate": 0.0016215, "loss": 3.7599, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1082, "tokens_per_second_per_gpu": 25825.96 }, { "epoch": 0.027075, "grad_norm": 0.51171875, "learning_rate": 0.001623, "loss": 3.7071, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1083, "tokens_per_second_per_gpu": 14544.32 }, { "epoch": 0.0271, "grad_norm": 0.6015625, "learning_rate": 0.0016245, "loss": 3.7367, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1084, "tokens_per_second_per_gpu": 16084.69 }, { "epoch": 0.027125, "grad_norm": 0.51953125, "learning_rate": 0.001626, "loss": 3.7227, "memory/device_reserved (GiB)": 85.15, "memory/max_active (GiB)": 85.11, "memory/max_allocated (GiB)": 85.11, "step": 1085, "tokens_per_second_per_gpu": 16371.68 }, { "epoch": 0.02715, "grad_norm": 0.482421875, "learning_rate": 0.0016275, "loss": 3.735, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1086, "tokens_per_second_per_gpu": 13064.78 }, { "epoch": 0.027175, "grad_norm": 0.71484375, "learning_rate": 0.0016290000000000002, "loss": 3.7046, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1087, "tokens_per_second_per_gpu": 23501.81 }, { "epoch": 0.0272, "grad_norm": 0.796875, "learning_rate": 0.0016305, "loss": 3.7273, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1088, "tokens_per_second_per_gpu": 17305.87 }, { "epoch": 0.027225, "grad_norm": 0.6875, "learning_rate": 0.0016320000000000002, "loss": 3.7246, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1089, "tokens_per_second_per_gpu": 10785.63 }, { "epoch": 0.02725, "grad_norm": 0.64453125, "learning_rate": 0.0016335, "loss": 3.7428, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1090, "tokens_per_second_per_gpu": 4189.37 }, { "epoch": 0.027275, "grad_norm": 0.8671875, "learning_rate": 0.0016350000000000002, "loss": 3.7072, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1091, "tokens_per_second_per_gpu": 13433.57 }, { "epoch": 0.0273, "grad_norm": 0.7734375, "learning_rate": 0.0016365, "loss": 3.7282, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1092, "tokens_per_second_per_gpu": 22716.7 }, { "epoch": 0.027325, "grad_norm": 0.86328125, "learning_rate": 0.0016380000000000001, "loss": 3.7486, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1093, "tokens_per_second_per_gpu": 18490.9 }, { "epoch": 0.02735, "grad_norm": 0.91796875, "learning_rate": 0.0016395, "loss": 3.7319, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1094, "tokens_per_second_per_gpu": 14868.46 }, { "epoch": 0.027375, "grad_norm": 0.8203125, "learning_rate": 0.001641, "loss": 3.7467, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1095, "tokens_per_second_per_gpu": 11510.51 }, { "epoch": 0.0274, "grad_norm": 0.80078125, "learning_rate": 0.0016425, "loss": 3.7614, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1096, "tokens_per_second_per_gpu": 3645.5 }, { "epoch": 0.027425, "grad_norm": 0.76953125, "learning_rate": 0.001644, "loss": 3.7684, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1097, "tokens_per_second_per_gpu": 22800.86 }, { "epoch": 0.02745, "grad_norm": 0.56640625, "learning_rate": 0.0016455, "loss": 3.7356, "memory/device_reserved (GiB)": 77.03, "memory/max_active (GiB)": 76.82, "memory/max_allocated (GiB)": 76.82, "step": 1098, "tokens_per_second_per_gpu": 12435.96 }, { "epoch": 0.027475, "grad_norm": 0.6015625, "learning_rate": 0.0016470000000000002, "loss": 3.7314, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1099, "tokens_per_second_per_gpu": 21104.82 }, { "epoch": 0.0275, "grad_norm": 0.78125, "learning_rate": 0.0016485, "loss": 3.758, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1100, "tokens_per_second_per_gpu": 11813.22 }, { "epoch": 0.027525, "grad_norm": 0.91796875, "learning_rate": 0.0016500000000000002, "loss": 3.7555, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1101, "tokens_per_second_per_gpu": 15839.97 }, { "epoch": 0.02755, "grad_norm": 0.87109375, "learning_rate": 0.0016515, "loss": 3.7592, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1102, "tokens_per_second_per_gpu": 15788.81 }, { "epoch": 0.027575, "grad_norm": 0.79296875, "learning_rate": 0.0016530000000000002, "loss": 3.7525, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1103, "tokens_per_second_per_gpu": 15853.59 }, { "epoch": 0.0276, "grad_norm": 0.546875, "learning_rate": 0.0016545, "loss": 3.7399, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1104, "tokens_per_second_per_gpu": 17195.32 }, { "epoch": 0.027625, "grad_norm": 0.265625, "learning_rate": 0.0016560000000000001, "loss": 3.7227, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 1105, "tokens_per_second_per_gpu": 9545.12 }, { "epoch": 0.02765, "grad_norm": 0.431640625, "learning_rate": 0.0016575, "loss": 3.7599, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1106, "tokens_per_second_per_gpu": 13598.22 }, { "epoch": 0.027675, "grad_norm": 0.439453125, "learning_rate": 0.001659, "loss": 3.7213, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1107, "tokens_per_second_per_gpu": 14710.64 }, { "epoch": 0.0277, "grad_norm": 0.416015625, "learning_rate": 0.0016605, "loss": 3.7022, "memory/device_reserved (GiB)": 46.44, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1108, "tokens_per_second_per_gpu": 4210.26 }, { "epoch": 0.027725, "grad_norm": 0.384765625, "learning_rate": 0.0016620000000000003, "loss": 3.7068, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1109, "tokens_per_second_per_gpu": 13796.26 }, { "epoch": 0.02775, "grad_norm": 0.4296875, "learning_rate": 0.0016635, "loss": 3.7109, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1110, "tokens_per_second_per_gpu": 15295.03 }, { "epoch": 0.027775, "grad_norm": 0.478515625, "learning_rate": 0.0016650000000000002, "loss": 3.7022, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1111, "tokens_per_second_per_gpu": 23190.05 }, { "epoch": 0.0278, "grad_norm": 0.62890625, "learning_rate": 0.0016665, "loss": 3.6741, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1112, "tokens_per_second_per_gpu": 15132.44 }, { "epoch": 0.027825, "grad_norm": 0.60546875, "learning_rate": 0.0016680000000000002, "loss": 3.6812, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1113, "tokens_per_second_per_gpu": 15717.28 }, { "epoch": 0.02785, "grad_norm": 0.447265625, "learning_rate": 0.0016695, "loss": 3.6954, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1114, "tokens_per_second_per_gpu": 17581.49 }, { "epoch": 0.027875, "grad_norm": 0.56640625, "learning_rate": 0.0016710000000000002, "loss": 3.6989, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1115, "tokens_per_second_per_gpu": 22840.54 }, { "epoch": 0.0279, "grad_norm": 0.54296875, "learning_rate": 0.0016725, "loss": 3.6925, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1116, "tokens_per_second_per_gpu": 24047.79 }, { "epoch": 0.027925, "grad_norm": 0.51953125, "learning_rate": 0.0016740000000000001, "loss": 3.7336, "memory/device_reserved (GiB)": 36.22, "memory/max_active (GiB)": 36.02, "memory/max_allocated (GiB)": 36.02, "step": 1117, "tokens_per_second_per_gpu": 30797.23 }, { "epoch": 0.02795, "grad_norm": 0.490234375, "learning_rate": 0.0016755000000000001, "loss": 3.6958, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1118, "tokens_per_second_per_gpu": 20434.29 }, { "epoch": 0.027975, "grad_norm": 0.5078125, "learning_rate": 0.001677, "loss": 3.6864, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1119, "tokens_per_second_per_gpu": 11145.08 }, { "epoch": 0.028, "grad_norm": 0.6328125, "learning_rate": 0.0016785, "loss": 3.696, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1120, "tokens_per_second_per_gpu": 10478.78 }, { "epoch": 0.028025, "grad_norm": 0.6953125, "learning_rate": 0.0016800000000000003, "loss": 3.671, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1121, "tokens_per_second_per_gpu": 13845.73 }, { "epoch": 0.02805, "grad_norm": 0.453125, "learning_rate": 0.0016815, "loss": 3.6925, "memory/device_reserved (GiB)": 96.42, "memory/max_active (GiB)": 96.23, "memory/max_allocated (GiB)": 96.23, "step": 1122, "tokens_per_second_per_gpu": 13330.71 }, { "epoch": 0.028075, "grad_norm": 0.51171875, "learning_rate": 0.0016830000000000003, "loss": 3.6737, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1123, "tokens_per_second_per_gpu": 22330.6 }, { "epoch": 0.0281, "grad_norm": 0.7890625, "learning_rate": 0.0016845, "loss": 3.7372, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1124, "tokens_per_second_per_gpu": 11117.45 }, { "epoch": 0.028125, "grad_norm": 0.875, "learning_rate": 0.0016860000000000002, "loss": 3.7102, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1125, "tokens_per_second_per_gpu": 28203.12 }, { "epoch": 0.02815, "grad_norm": 0.94921875, "learning_rate": 0.0016875, "loss": 3.6966, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1126, "tokens_per_second_per_gpu": 12587.38 }, { "epoch": 0.028175, "grad_norm": 1.046875, "learning_rate": 0.001689, "loss": 3.7257, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1127, "tokens_per_second_per_gpu": 28751.53 }, { "epoch": 0.0282, "grad_norm": 1.5546875, "learning_rate": 0.0016905, "loss": 3.8247, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1128, "tokens_per_second_per_gpu": 10430.28 }, { "epoch": 0.028225, "grad_norm": 0.82421875, "learning_rate": 0.001692, "loss": 3.7255, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1129, "tokens_per_second_per_gpu": 3755.84 }, { "epoch": 0.02825, "grad_norm": 0.78515625, "learning_rate": 0.0016935000000000001, "loss": 3.7512, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1130, "tokens_per_second_per_gpu": 12056.34 }, { "epoch": 0.028275, "grad_norm": 1.0390625, "learning_rate": 0.001695, "loss": 3.762, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1131, "tokens_per_second_per_gpu": 29078.41 }, { "epoch": 0.0283, "grad_norm": 1.6484375, "learning_rate": 0.0016965, "loss": 3.8281, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1132, "tokens_per_second_per_gpu": 10990.07 }, { "epoch": 0.028325, "grad_norm": 0.80859375, "learning_rate": 0.0016979999999999999, "loss": 3.8016, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1133, "tokens_per_second_per_gpu": 16429.29 }, { "epoch": 0.02835, "grad_norm": 0.6484375, "learning_rate": 0.0016995, "loss": 3.7272, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1134, "tokens_per_second_per_gpu": 3824.31 }, { "epoch": 0.028375, "grad_norm": 0.7421875, "learning_rate": 0.0017009999999999998, "loss": 3.727, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1135, "tokens_per_second_per_gpu": 26714.57 }, { "epoch": 0.0284, "grad_norm": 0.439453125, "learning_rate": 0.0017025, "loss": 3.7488, "memory/device_reserved (GiB)": 86.01, "memory/max_active (GiB)": 85.99, "memory/max_allocated (GiB)": 85.99, "step": 1136, "tokens_per_second_per_gpu": 15546.24 }, { "epoch": 0.028425, "grad_norm": 0.423828125, "learning_rate": 0.0017039999999999998, "loss": 3.7422, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1137, "tokens_per_second_per_gpu": 13321.99 }, { "epoch": 0.02845, "grad_norm": 0.447265625, "learning_rate": 0.0017055, "loss": 3.7043, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1138, "tokens_per_second_per_gpu": 12365.45 }, { "epoch": 0.028475, "grad_norm": 0.515625, "learning_rate": 0.001707, "loss": 3.6924, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1139, "tokens_per_second_per_gpu": 11663.23 }, { "epoch": 0.0285, "grad_norm": 0.609375, "learning_rate": 0.0017085, "loss": 3.7123, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1140, "tokens_per_second_per_gpu": 4333.83 }, { "epoch": 0.028525, "grad_norm": 0.462890625, "learning_rate": 0.00171, "loss": 3.726, "memory/device_reserved (GiB)": 56.59, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1141, "tokens_per_second_per_gpu": 21545.83 }, { "epoch": 0.02855, "grad_norm": 0.4765625, "learning_rate": 0.0017115000000000001, "loss": 3.6746, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1142, "tokens_per_second_per_gpu": 17668.3 }, { "epoch": 0.028575, "grad_norm": 0.62109375, "learning_rate": 0.001713, "loss": 3.6839, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1143, "tokens_per_second_per_gpu": 16676.4 }, { "epoch": 0.0286, "grad_norm": 0.71484375, "learning_rate": 0.0017145, "loss": 3.707, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1144, "tokens_per_second_per_gpu": 15136.65 }, { "epoch": 0.028625, "grad_norm": 0.56640625, "learning_rate": 0.0017159999999999999, "loss": 3.6626, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1145, "tokens_per_second_per_gpu": 13312.81 }, { "epoch": 0.02865, "grad_norm": 0.59375, "learning_rate": 0.0017175, "loss": 3.708, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1146, "tokens_per_second_per_gpu": 16185.8 }, { "epoch": 0.028675, "grad_norm": 0.5390625, "learning_rate": 0.0017189999999999998, "loss": 3.6891, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1147, "tokens_per_second_per_gpu": 3731.04 }, { "epoch": 0.0287, "grad_norm": 0.625, "learning_rate": 0.0017205, "loss": 3.6875, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1148, "tokens_per_second_per_gpu": 17665.02 }, { "epoch": 0.028725, "grad_norm": 0.458984375, "learning_rate": 0.001722, "loss": 3.6435, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1149, "tokens_per_second_per_gpu": 13088.51 }, { "epoch": 0.02875, "grad_norm": 0.423828125, "learning_rate": 0.0017235, "loss": 3.6527, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1150, "tokens_per_second_per_gpu": 26779.79 }, { "epoch": 0.028775, "grad_norm": 0.4453125, "learning_rate": 0.001725, "loss": 3.6632, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1151, "tokens_per_second_per_gpu": 19782.47 }, { "epoch": 0.0288, "grad_norm": 0.44921875, "learning_rate": 0.0017265000000000002, "loss": 3.6374, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1152, "tokens_per_second_per_gpu": 29316.28 }, { "epoch": 0.028825, "grad_norm": 0.49609375, "learning_rate": 0.001728, "loss": 3.6517, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1153, "tokens_per_second_per_gpu": 3046.65 }, { "epoch": 0.02885, "grad_norm": 0.56640625, "learning_rate": 0.0017295000000000001, "loss": 3.6478, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1154, "tokens_per_second_per_gpu": 9837.46 }, { "epoch": 0.028875, "grad_norm": 0.55859375, "learning_rate": 0.001731, "loss": 3.6422, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1155, "tokens_per_second_per_gpu": 12189.91 }, { "epoch": 0.0289, "grad_norm": 0.53125, "learning_rate": 0.0017325, "loss": 3.6814, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1156, "tokens_per_second_per_gpu": 17812.55 }, { "epoch": 0.028925, "grad_norm": 0.390625, "learning_rate": 0.0017339999999999999, "loss": 3.6399, "memory/device_reserved (GiB)": 65.6, "memory/max_active (GiB)": 65.59, "memory/max_allocated (GiB)": 65.59, "step": 1157, "tokens_per_second_per_gpu": 20827.33 }, { "epoch": 0.02895, "grad_norm": 0.42578125, "learning_rate": 0.0017355, "loss": 3.6579, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1158, "tokens_per_second_per_gpu": 22684.16 }, { "epoch": 0.028975, "grad_norm": 0.4375, "learning_rate": 0.0017369999999999998, "loss": 3.6409, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1159, "tokens_per_second_per_gpu": 19189.43 }, { "epoch": 0.029, "grad_norm": 0.50390625, "learning_rate": 0.0017385, "loss": 3.6829, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1160, "tokens_per_second_per_gpu": 11323.5 }, { "epoch": 0.029025, "grad_norm": 0.66015625, "learning_rate": 0.00174, "loss": 3.6647, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1161, "tokens_per_second_per_gpu": 10541.96 }, { "epoch": 0.02905, "grad_norm": 0.5546875, "learning_rate": 0.0017415, "loss": 3.6218, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1162, "tokens_per_second_per_gpu": 14492.27 }, { "epoch": 0.029075, "grad_norm": 0.4296875, "learning_rate": 0.001743, "loss": 3.646, "memory/device_reserved (GiB)": 116.82, "memory/max_active (GiB)": 116.63, "memory/max_allocated (GiB)": 116.63, "step": 1163, "tokens_per_second_per_gpu": 11785.83 }, { "epoch": 0.0291, "grad_norm": 0.546875, "learning_rate": 0.0017445000000000002, "loss": 3.6405, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1164, "tokens_per_second_per_gpu": 4697.2 }, { "epoch": 0.029125, "grad_norm": 0.6015625, "learning_rate": 0.001746, "loss": 3.6812, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1165, "tokens_per_second_per_gpu": 23443.93 }, { "epoch": 0.02915, "grad_norm": 0.578125, "learning_rate": 0.0017475000000000001, "loss": 3.6563, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1166, "tokens_per_second_per_gpu": 19894.28 }, { "epoch": 0.029175, "grad_norm": 0.57421875, "learning_rate": 0.001749, "loss": 3.6614, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1167, "tokens_per_second_per_gpu": 13037.74 }, { "epoch": 0.0292, "grad_norm": 0.5390625, "learning_rate": 0.0017505, "loss": 3.6703, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1168, "tokens_per_second_per_gpu": 20795.79 }, { "epoch": 0.029225, "grad_norm": 0.59375, "learning_rate": 0.0017519999999999999, "loss": 3.6452, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1169, "tokens_per_second_per_gpu": 20928.83 }, { "epoch": 0.02925, "grad_norm": 0.61328125, "learning_rate": 0.0017535, "loss": 3.6741, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1170, "tokens_per_second_per_gpu": 10778.88 }, { "epoch": 0.029275, "grad_norm": 0.625, "learning_rate": 0.0017549999999999998, "loss": 3.6383, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1171, "tokens_per_second_per_gpu": 12300.12 }, { "epoch": 0.0293, "grad_norm": 0.451171875, "learning_rate": 0.0017565, "loss": 3.6914, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 1172, "tokens_per_second_per_gpu": 27949.1 }, { "epoch": 0.029325, "grad_norm": 0.482421875, "learning_rate": 0.001758, "loss": 3.6745, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1173, "tokens_per_second_per_gpu": 13582.07 }, { "epoch": 0.02935, "grad_norm": 0.55859375, "learning_rate": 0.0017595, "loss": 3.6703, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1174, "tokens_per_second_per_gpu": 13412.64 }, { "epoch": 0.029375, "grad_norm": 0.60546875, "learning_rate": 0.001761, "loss": 3.6391, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1175, "tokens_per_second_per_gpu": 12763.37 }, { "epoch": 0.0294, "grad_norm": 0.68359375, "learning_rate": 0.0017625000000000002, "loss": 3.6758, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1176, "tokens_per_second_per_gpu": 3387.29 }, { "epoch": 0.029425, "grad_norm": 0.6796875, "learning_rate": 0.001764, "loss": 3.6863, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1177, "tokens_per_second_per_gpu": 13772.6 }, { "epoch": 0.02945, "grad_norm": 0.546875, "learning_rate": 0.0017655000000000001, "loss": 3.6939, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1178, "tokens_per_second_per_gpu": 11368.55 }, { "epoch": 0.029475, "grad_norm": 0.5703125, "learning_rate": 0.001767, "loss": 3.6576, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1179, "tokens_per_second_per_gpu": 11272.0 }, { "epoch": 0.0295, "grad_norm": 0.5234375, "learning_rate": 0.0017685, "loss": 3.6446, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1180, "tokens_per_second_per_gpu": 14588.75 }, { "epoch": 0.029525, "grad_norm": 0.5546875, "learning_rate": 0.0017699999999999999, "loss": 3.6608, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1181, "tokens_per_second_per_gpu": 24284.24 }, { "epoch": 0.02955, "grad_norm": 0.5546875, "learning_rate": 0.0017715, "loss": 3.657, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1182, "tokens_per_second_per_gpu": 12423.99 }, { "epoch": 0.029575, "grad_norm": 0.53125, "learning_rate": 0.001773, "loss": 3.6612, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1183, "tokens_per_second_per_gpu": 14939.29 }, { "epoch": 0.0296, "grad_norm": 0.388671875, "learning_rate": 0.0017745, "loss": 3.6373, "memory/device_reserved (GiB)": 35.74, "memory/max_active (GiB)": 35.55, "memory/max_allocated (GiB)": 35.55, "step": 1184, "tokens_per_second_per_gpu": 30899.06 }, { "epoch": 0.029625, "grad_norm": 0.431640625, "learning_rate": 0.001776, "loss": 3.6467, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1185, "tokens_per_second_per_gpu": 17806.31 }, { "epoch": 0.02965, "grad_norm": 0.396484375, "learning_rate": 0.0017775000000000002, "loss": 3.6327, "memory/device_reserved (GiB)": 54.53, "memory/max_active (GiB)": 54.51, "memory/max_allocated (GiB)": 54.51, "step": 1186, "tokens_per_second_per_gpu": 23386.93 }, { "epoch": 0.029675, "grad_norm": 0.3515625, "learning_rate": 0.001779, "loss": 3.6402, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1187, "tokens_per_second_per_gpu": 2667.17 }, { "epoch": 0.0297, "grad_norm": 0.375, "learning_rate": 0.0017805000000000002, "loss": 3.6554, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1188, "tokens_per_second_per_gpu": 15967.12 }, { "epoch": 0.029725, "grad_norm": 0.3828125, "learning_rate": 0.001782, "loss": 3.6372, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1189, "tokens_per_second_per_gpu": 14718.01 }, { "epoch": 0.02975, "grad_norm": 0.349609375, "learning_rate": 0.0017835000000000001, "loss": 3.6171, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1190, "tokens_per_second_per_gpu": 17984.1 }, { "epoch": 0.029775, "grad_norm": 0.40234375, "learning_rate": 0.001785, "loss": 3.6182, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1191, "tokens_per_second_per_gpu": 15715.81 }, { "epoch": 0.0298, "grad_norm": 0.58984375, "learning_rate": 0.0017865000000000001, "loss": 3.5943, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1192, "tokens_per_second_per_gpu": 15915.04 }, { "epoch": 0.029825, "grad_norm": 0.55078125, "learning_rate": 0.0017879999999999999, "loss": 3.6438, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1193, "tokens_per_second_per_gpu": 3203.88 }, { "epoch": 0.02985, "grad_norm": 0.392578125, "learning_rate": 0.0017895, "loss": 3.6354, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1194, "tokens_per_second_per_gpu": 23864.57 }, { "epoch": 0.029875, "grad_norm": 0.45703125, "learning_rate": 0.001791, "loss": 3.6263, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1195, "tokens_per_second_per_gpu": 23881.15 }, { "epoch": 0.0299, "grad_norm": 0.462890625, "learning_rate": 0.0017925, "loss": 3.6107, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1196, "tokens_per_second_per_gpu": 9843.21 }, { "epoch": 0.029925, "grad_norm": 0.61328125, "learning_rate": 0.001794, "loss": 3.6471, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1197, "tokens_per_second_per_gpu": 16649.91 }, { "epoch": 0.02995, "grad_norm": 0.765625, "learning_rate": 0.0017955000000000002, "loss": 3.6212, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1198, "tokens_per_second_per_gpu": 22413.54 }, { "epoch": 0.029975, "grad_norm": 0.84375, "learning_rate": 0.001797, "loss": 3.6825, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1199, "tokens_per_second_per_gpu": 11799.03 }, { "epoch": 0.03, "grad_norm": 0.671875, "learning_rate": 0.0017985000000000002, "loss": 3.6415, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1200, "tokens_per_second_per_gpu": 23127.27 }, { "epoch": 0.030025, "grad_norm": 0.859375, "learning_rate": 0.0018, "loss": 3.6355, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1201, "tokens_per_second_per_gpu": 2994.89 }, { "epoch": 0.03005, "grad_norm": 0.97265625, "learning_rate": 0.0018015000000000001, "loss": 3.6576, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1202, "tokens_per_second_per_gpu": 13671.96 }, { "epoch": 0.030075, "grad_norm": 0.8984375, "learning_rate": 0.001803, "loss": 3.6474, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1203, "tokens_per_second_per_gpu": 18822.2 }, { "epoch": 0.0301, "grad_norm": 0.73828125, "learning_rate": 0.0018045000000000001, "loss": 3.7088, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1204, "tokens_per_second_per_gpu": 13176.17 }, { "epoch": 0.030125, "grad_norm": 0.55078125, "learning_rate": 0.0018059999999999999, "loss": 3.6915, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1205, "tokens_per_second_per_gpu": 23951.02 }, { "epoch": 0.03015, "grad_norm": 0.4609375, "learning_rate": 0.0018075, "loss": 3.6807, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1206, "tokens_per_second_per_gpu": 3365.16 }, { "epoch": 0.030175, "grad_norm": 0.392578125, "learning_rate": 0.001809, "loss": 3.6583, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1207, "tokens_per_second_per_gpu": 12546.81 }, { "epoch": 0.0302, "grad_norm": 0.55859375, "learning_rate": 0.0018105, "loss": 3.6569, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1208, "tokens_per_second_per_gpu": 10662.42 }, { "epoch": 0.030225, "grad_norm": 0.76171875, "learning_rate": 0.001812, "loss": 3.6602, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1209, "tokens_per_second_per_gpu": 15952.7 }, { "epoch": 0.03025, "grad_norm": 0.625, "learning_rate": 0.0018135000000000002, "loss": 3.6446, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1210, "tokens_per_second_per_gpu": 20433.83 }, { "epoch": 0.030275, "grad_norm": 0.50390625, "learning_rate": 0.001815, "loss": 3.6577, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1211, "tokens_per_second_per_gpu": 13758.44 }, { "epoch": 0.0303, "grad_norm": 0.5, "learning_rate": 0.0018165000000000002, "loss": 3.6531, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1212, "tokens_per_second_per_gpu": 16301.09 }, { "epoch": 0.030325, "grad_norm": 0.57421875, "learning_rate": 0.001818, "loss": 3.6641, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1213, "tokens_per_second_per_gpu": 24207.54 }, { "epoch": 0.03035, "grad_norm": 0.49609375, "learning_rate": 0.0018195000000000002, "loss": 3.6523, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1214, "tokens_per_second_per_gpu": 27524.04 }, { "epoch": 0.030375, "grad_norm": 0.5390625, "learning_rate": 0.001821, "loss": 3.5869, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1215, "tokens_per_second_per_gpu": 23692.95 }, { "epoch": 0.0304, "grad_norm": 0.53515625, "learning_rate": 0.0018225000000000001, "loss": 3.6381, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1216, "tokens_per_second_per_gpu": 12364.51 }, { "epoch": 0.030425, "grad_norm": 0.546875, "learning_rate": 0.001824, "loss": 3.6311, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1217, "tokens_per_second_per_gpu": 19907.81 }, { "epoch": 0.03045, "grad_norm": 0.421875, "learning_rate": 0.0018255, "loss": 3.6201, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1218, "tokens_per_second_per_gpu": 13123.61 }, { "epoch": 0.030475, "grad_norm": 0.419921875, "learning_rate": 0.001827, "loss": 3.6316, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1219, "tokens_per_second_per_gpu": 15812.81 }, { "epoch": 0.0305, "grad_norm": 0.44140625, "learning_rate": 0.0018285000000000003, "loss": 3.5882, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1220, "tokens_per_second_per_gpu": 3512.5 }, { "epoch": 0.030525, "grad_norm": 0.474609375, "learning_rate": 0.00183, "loss": 3.5826, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1221, "tokens_per_second_per_gpu": 10644.15 }, { "epoch": 0.03055, "grad_norm": 0.5234375, "learning_rate": 0.0018315000000000002, "loss": 3.613, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1222, "tokens_per_second_per_gpu": 13818.66 }, { "epoch": 0.030575, "grad_norm": 0.4375, "learning_rate": 0.001833, "loss": 3.5881, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1223, "tokens_per_second_per_gpu": 18987.9 }, { "epoch": 0.0306, "grad_norm": 0.357421875, "learning_rate": 0.0018345000000000002, "loss": 3.6218, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1224, "tokens_per_second_per_gpu": 13921.75 }, { "epoch": 0.030625, "grad_norm": 0.275390625, "learning_rate": 0.001836, "loss": 3.6421, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1225, "tokens_per_second_per_gpu": 3943.83 }, { "epoch": 0.03065, "grad_norm": 0.271484375, "learning_rate": 0.0018375000000000002, "loss": 3.5917, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1226, "tokens_per_second_per_gpu": 10002.16 }, { "epoch": 0.030675, "grad_norm": 0.2353515625, "learning_rate": 0.001839, "loss": 3.5901, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 1227, "tokens_per_second_per_gpu": 28284.82 }, { "epoch": 0.0307, "grad_norm": 0.337890625, "learning_rate": 0.0018405000000000001, "loss": 3.6061, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1228, "tokens_per_second_per_gpu": 10773.41 }, { "epoch": 0.030725, "grad_norm": 0.52734375, "learning_rate": 0.001842, "loss": 3.6106, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1229, "tokens_per_second_per_gpu": 14002.94 }, { "epoch": 0.03075, "grad_norm": 0.6875, "learning_rate": 0.0018435, "loss": 3.5916, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1230, "tokens_per_second_per_gpu": 3792.9 }, { "epoch": 0.030775, "grad_norm": 0.66015625, "learning_rate": 0.001845, "loss": 3.5993, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1231, "tokens_per_second_per_gpu": 10268.64 }, { "epoch": 0.0308, "grad_norm": 0.58203125, "learning_rate": 0.0018465000000000003, "loss": 3.5854, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1232, "tokens_per_second_per_gpu": 21865.83 }, { "epoch": 0.030825, "grad_norm": 0.345703125, "learning_rate": 0.001848, "loss": 3.6193, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 1233, "tokens_per_second_per_gpu": 12292.09 }, { "epoch": 0.03085, "grad_norm": 0.51171875, "learning_rate": 0.0018495000000000002, "loss": 3.6122, "memory/device_reserved (GiB)": 66.36, "memory/max_active (GiB)": 66.15, "memory/max_allocated (GiB)": 66.15, "step": 1234, "tokens_per_second_per_gpu": 20207.84 }, { "epoch": 0.030875, "grad_norm": 0.447265625, "learning_rate": 0.001851, "loss": 3.6376, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 1235, "tokens_per_second_per_gpu": 3492.27 }, { "epoch": 0.0309, "grad_norm": 0.498046875, "learning_rate": 0.0018525000000000002, "loss": 3.6262, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1236, "tokens_per_second_per_gpu": 11443.51 }, { "epoch": 0.030925, "grad_norm": 0.53515625, "learning_rate": 0.001854, "loss": 3.5946, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1237, "tokens_per_second_per_gpu": 10600.08 }, { "epoch": 0.03095, "grad_norm": 0.5859375, "learning_rate": 0.0018555000000000002, "loss": 3.583, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1238, "tokens_per_second_per_gpu": 17142.18 }, { "epoch": 0.030975, "grad_norm": 0.65625, "learning_rate": 0.001857, "loss": 3.6324, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1239, "tokens_per_second_per_gpu": 10477.73 }, { "epoch": 0.031, "grad_norm": 0.6328125, "learning_rate": 0.0018585000000000001, "loss": 3.6941, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1240, "tokens_per_second_per_gpu": 15066.53 }, { "epoch": 0.031025, "grad_norm": 0.55859375, "learning_rate": 0.00186, "loss": 3.6313, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1241, "tokens_per_second_per_gpu": 13421.62 }, { "epoch": 0.03105, "grad_norm": 0.44140625, "learning_rate": 0.0018615, "loss": 3.5993, "memory/device_reserved (GiB)": 74.93, "memory/max_active (GiB)": 74.91, "memory/max_allocated (GiB)": 74.91, "step": 1242, "tokens_per_second_per_gpu": 3566.39 }, { "epoch": 0.031075, "grad_norm": 0.431640625, "learning_rate": 0.001863, "loss": 3.5994, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1243, "tokens_per_second_per_gpu": 13932.42 }, { "epoch": 0.0311, "grad_norm": 0.369140625, "learning_rate": 0.0018645000000000003, "loss": 3.6099, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1244, "tokens_per_second_per_gpu": 10570.76 }, { "epoch": 0.031125, "grad_norm": 0.45703125, "learning_rate": 0.001866, "loss": 3.5818, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1245, "tokens_per_second_per_gpu": 27040.51 }, { "epoch": 0.03115, "grad_norm": 0.61328125, "learning_rate": 0.0018675000000000002, "loss": 3.608, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1246, "tokens_per_second_per_gpu": 12729.69 }, { "epoch": 0.031175, "grad_norm": 0.703125, "learning_rate": 0.001869, "loss": 3.6149, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1247, "tokens_per_second_per_gpu": 16237.5 }, { "epoch": 0.0312, "grad_norm": 0.60546875, "learning_rate": 0.0018705000000000002, "loss": 3.601, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1248, "tokens_per_second_per_gpu": 10658.9 }, { "epoch": 0.031225, "grad_norm": 0.69140625, "learning_rate": 0.001872, "loss": 3.6316, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1249, "tokens_per_second_per_gpu": 19787.72 }, { "epoch": 0.03125, "grad_norm": 0.625, "learning_rate": 0.0018735000000000002, "loss": 3.5986, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1250, "tokens_per_second_per_gpu": 24034.98 }, { "epoch": 0.031275, "grad_norm": 0.412109375, "learning_rate": 0.001875, "loss": 3.6473, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1251, "tokens_per_second_per_gpu": 13420.26 }, { "epoch": 0.0313, "grad_norm": 0.474609375, "learning_rate": 0.0018765, "loss": 3.6138, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1252, "tokens_per_second_per_gpu": 14332.8 }, { "epoch": 0.031325, "grad_norm": 0.470703125, "learning_rate": 0.0018780000000000001, "loss": 3.6129, "memory/device_reserved (GiB)": 96.22, "memory/max_active (GiB)": 96.19, "memory/max_allocated (GiB)": 96.19, "step": 1253, "tokens_per_second_per_gpu": 14523.03 }, { "epoch": 0.03135, "grad_norm": 0.52734375, "learning_rate": 0.0018794999999999999, "loss": 3.5966, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1254, "tokens_per_second_per_gpu": 15458.57 }, { "epoch": 0.031375, "grad_norm": 0.376953125, "learning_rate": 0.001881, "loss": 3.6099, "memory/device_reserved (GiB)": 85.52, "memory/max_active (GiB)": 85.52, "memory/max_allocated (GiB)": 85.52, "step": 1255, "tokens_per_second_per_gpu": 15456.62 }, { "epoch": 0.0314, "grad_norm": 0.421875, "learning_rate": 0.0018824999999999998, "loss": 3.6529, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1256, "tokens_per_second_per_gpu": 3217.03 }, { "epoch": 0.031425, "grad_norm": 0.349609375, "learning_rate": 0.001884, "loss": 3.6081, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1257, "tokens_per_second_per_gpu": 22586.72 }, { "epoch": 0.03145, "grad_norm": 0.2421875, "learning_rate": 0.0018854999999999998, "loss": 3.604, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 1258, "tokens_per_second_per_gpu": 11819.77 }, { "epoch": 0.031475, "grad_norm": 0.345703125, "learning_rate": 0.001887, "loss": 3.6016, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1259, "tokens_per_second_per_gpu": 23664.05 }, { "epoch": 0.0315, "grad_norm": 0.435546875, "learning_rate": 0.0018885, "loss": 3.6089, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1260, "tokens_per_second_per_gpu": 11356.48 }, { "epoch": 0.031525, "grad_norm": 0.453125, "learning_rate": 0.00189, "loss": 3.6155, "memory/device_reserved (GiB)": 35.0, "memory/max_active (GiB)": 34.99, "memory/max_allocated (GiB)": 34.99, "step": 1261, "tokens_per_second_per_gpu": 34160.3 }, { "epoch": 0.03155, "grad_norm": 0.46875, "learning_rate": 0.0018915, "loss": 3.5646, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1262, "tokens_per_second_per_gpu": 16077.16 }, { "epoch": 0.031575, "grad_norm": 0.5, "learning_rate": 0.0018930000000000002, "loss": 3.6169, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1263, "tokens_per_second_per_gpu": 20605.18 }, { "epoch": 0.0316, "grad_norm": 0.515625, "learning_rate": 0.0018945, "loss": 3.5997, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1264, "tokens_per_second_per_gpu": 20796.29 }, { "epoch": 0.031625, "grad_norm": 0.4609375, "learning_rate": 0.0018960000000000001, "loss": 3.5478, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1265, "tokens_per_second_per_gpu": 19545.81 }, { "epoch": 0.03165, "grad_norm": 0.4921875, "learning_rate": 0.0018974999999999999, "loss": 3.5682, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1266, "tokens_per_second_per_gpu": 29158.65 }, { "epoch": 0.031675, "grad_norm": 0.53515625, "learning_rate": 0.001899, "loss": 3.5908, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1267, "tokens_per_second_per_gpu": 19961.9 }, { "epoch": 0.0317, "grad_norm": 0.6875, "learning_rate": 0.0019004999999999998, "loss": 3.602, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1268, "tokens_per_second_per_gpu": 15596.74 }, { "epoch": 0.031725, "grad_norm": 0.7265625, "learning_rate": 0.001902, "loss": 3.6153, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1269, "tokens_per_second_per_gpu": 12923.79 }, { "epoch": 0.03175, "grad_norm": 0.71875, "learning_rate": 0.0019034999999999998, "loss": 3.5925, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1270, "tokens_per_second_per_gpu": 23656.28 }, { "epoch": 0.031775, "grad_norm": 0.734375, "learning_rate": 0.001905, "loss": 3.6148, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1271, "tokens_per_second_per_gpu": 14320.41 }, { "epoch": 0.0318, "grad_norm": 0.390625, "learning_rate": 0.0019065, "loss": 3.6057, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.42, "memory/max_allocated (GiB)": 107.42, "step": 1272, "tokens_per_second_per_gpu": 12473.1 }, { "epoch": 0.031825, "grad_norm": 0.482421875, "learning_rate": 0.001908, "loss": 3.5804, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1273, "tokens_per_second_per_gpu": 11567.53 }, { "epoch": 0.03185, "grad_norm": 0.54296875, "learning_rate": 0.0019095, "loss": 3.615, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1274, "tokens_per_second_per_gpu": 13483.92 }, { "epoch": 0.031875, "grad_norm": 0.51953125, "learning_rate": 0.0019110000000000002, "loss": 3.6257, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1275, "tokens_per_second_per_gpu": 19470.8 }, { "epoch": 0.0319, "grad_norm": 0.337890625, "learning_rate": 0.0019125, "loss": 3.621, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1276, "tokens_per_second_per_gpu": 14459.03 }, { "epoch": 0.031925, "grad_norm": 0.291015625, "learning_rate": 0.0019140000000000001, "loss": 3.6039, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1277, "tokens_per_second_per_gpu": 27734.03 }, { "epoch": 0.03195, "grad_norm": 0.177734375, "learning_rate": 0.0019154999999999999, "loss": 3.6376, "memory/device_reserved (GiB)": 116.76, "memory/max_active (GiB)": 116.62, "memory/max_allocated (GiB)": 116.62, "step": 1278, "tokens_per_second_per_gpu": 11352.44 }, { "epoch": 0.031975, "grad_norm": 0.30859375, "learning_rate": 0.001917, "loss": 3.5869, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1279, "tokens_per_second_per_gpu": 15992.78 }, { "epoch": 0.032, "grad_norm": 0.65625, "learning_rate": 0.0019184999999999998, "loss": 3.6038, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1280, "tokens_per_second_per_gpu": 13750.22 }, { "epoch": 0.032025, "grad_norm": 0.98828125, "learning_rate": 0.00192, "loss": 3.6454, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1281, "tokens_per_second_per_gpu": 10784.48 }, { "epoch": 0.03205, "grad_norm": 0.56640625, "learning_rate": 0.0019214999999999998, "loss": 3.634, "memory/device_reserved (GiB)": 65.8, "memory/max_active (GiB)": 65.62, "memory/max_allocated (GiB)": 65.62, "step": 1282, "tokens_per_second_per_gpu": 20058.99 }, { "epoch": 0.032075, "grad_norm": 0.60546875, "learning_rate": 0.001923, "loss": 3.5798, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1283, "tokens_per_second_per_gpu": 16710.6 }, { "epoch": 0.0321, "grad_norm": 0.671875, "learning_rate": 0.0019245, "loss": 3.6167, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1284, "tokens_per_second_per_gpu": 18169.77 }, { "epoch": 0.032125, "grad_norm": 0.8359375, "learning_rate": 0.001926, "loss": 3.6544, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1285, "tokens_per_second_per_gpu": 17544.65 }, { "epoch": 0.03215, "grad_norm": 0.79296875, "learning_rate": 0.0019275, "loss": 3.6437, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1286, "tokens_per_second_per_gpu": 18431.87 }, { "epoch": 0.032175, "grad_norm": 0.8125, "learning_rate": 0.0019290000000000002, "loss": 3.678, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1287, "tokens_per_second_per_gpu": 12710.38 }, { "epoch": 0.0322, "grad_norm": 0.53125, "learning_rate": 0.0019305, "loss": 3.6314, "memory/device_reserved (GiB)": 95.34, "memory/max_active (GiB)": 95.31, "memory/max_allocated (GiB)": 95.31, "step": 1288, "tokens_per_second_per_gpu": 15594.57 }, { "epoch": 0.032225, "grad_norm": 0.5625, "learning_rate": 0.0019320000000000001, "loss": 3.6063, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1289, "tokens_per_second_per_gpu": 28373.23 }, { "epoch": 0.03225, "grad_norm": 0.5703125, "learning_rate": 0.0019334999999999999, "loss": 3.6325, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1290, "tokens_per_second_per_gpu": 13252.77 }, { "epoch": 0.032275, "grad_norm": 0.4375, "learning_rate": 0.001935, "loss": 3.6115, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1291, "tokens_per_second_per_gpu": 13854.58 }, { "epoch": 0.0323, "grad_norm": 0.478515625, "learning_rate": 0.0019364999999999999, "loss": 3.6134, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1292, "tokens_per_second_per_gpu": 16514.56 }, { "epoch": 0.032325, "grad_norm": 0.458984375, "learning_rate": 0.001938, "loss": 3.5966, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1293, "tokens_per_second_per_gpu": 11234.63 }, { "epoch": 0.03235, "grad_norm": 0.53125, "learning_rate": 0.0019395, "loss": 3.5817, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1294, "tokens_per_second_per_gpu": 15707.64 }, { "epoch": 0.032375, "grad_norm": 0.58203125, "learning_rate": 0.001941, "loss": 3.5557, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1295, "tokens_per_second_per_gpu": 24123.17 }, { "epoch": 0.0324, "grad_norm": 0.62109375, "learning_rate": 0.0019425, "loss": 3.585, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1296, "tokens_per_second_per_gpu": 28776.72 }, { "epoch": 0.032425, "grad_norm": 0.69921875, "learning_rate": 0.0019440000000000002, "loss": 3.5781, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1297, "tokens_per_second_per_gpu": 17817.21 }, { "epoch": 0.03245, "grad_norm": 0.478515625, "learning_rate": 0.0019455, "loss": 3.6014, "memory/device_reserved (GiB)": 75.81, "memory/max_active (GiB)": 75.79, "memory/max_allocated (GiB)": 75.79, "step": 1298, "tokens_per_second_per_gpu": 18518.39 }, { "epoch": 0.032475, "grad_norm": 0.47265625, "learning_rate": 0.0019470000000000002, "loss": 3.5638, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1299, "tokens_per_second_per_gpu": 17746.96 }, { "epoch": 0.0325, "grad_norm": 0.388671875, "learning_rate": 0.0019485, "loss": 3.6056, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1300, "tokens_per_second_per_gpu": 10351.42 }, { "epoch": 0.032525, "grad_norm": 0.474609375, "learning_rate": 0.0019500000000000001, "loss": 3.5863, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1301, "tokens_per_second_per_gpu": 12730.77 }, { "epoch": 0.03255, "grad_norm": 0.388671875, "learning_rate": 0.0019515, "loss": 3.5789, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1302, "tokens_per_second_per_gpu": 17689.69 }, { "epoch": 0.032575, "grad_norm": 0.48046875, "learning_rate": 0.001953, "loss": 3.5812, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1303, "tokens_per_second_per_gpu": 13973.44 }, { "epoch": 0.0326, "grad_norm": 0.62890625, "learning_rate": 0.0019545, "loss": 3.5892, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1304, "tokens_per_second_per_gpu": 15463.23 }, { "epoch": 0.032625, "grad_norm": 0.515625, "learning_rate": 0.0019560000000000003, "loss": 3.6065, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1305, "tokens_per_second_per_gpu": 19399.02 }, { "epoch": 0.03265, "grad_norm": 0.498046875, "learning_rate": 0.0019575, "loss": 3.5957, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1306, "tokens_per_second_per_gpu": 20553.4 }, { "epoch": 0.032675, "grad_norm": 0.494140625, "learning_rate": 0.0019590000000000002, "loss": 3.5782, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1307, "tokens_per_second_per_gpu": 19930.59 }, { "epoch": 0.0327, "grad_norm": 0.55859375, "learning_rate": 0.0019605, "loss": 3.562, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1308, "tokens_per_second_per_gpu": 28630.65 }, { "epoch": 0.032725, "grad_norm": 0.55078125, "learning_rate": 0.001962, "loss": 3.549, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1309, "tokens_per_second_per_gpu": 16095.22 }, { "epoch": 0.03275, "grad_norm": 0.37109375, "learning_rate": 0.0019635, "loss": 3.588, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1310, "tokens_per_second_per_gpu": 17029.87 }, { "epoch": 0.032775, "grad_norm": 0.390625, "learning_rate": 0.001965, "loss": 3.5593, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1311, "tokens_per_second_per_gpu": 26819.22 }, { "epoch": 0.0328, "grad_norm": 0.49609375, "learning_rate": 0.0019665, "loss": 3.5773, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1312, "tokens_per_second_per_gpu": 12497.87 }, { "epoch": 0.032825, "grad_norm": 0.47265625, "learning_rate": 0.001968, "loss": 3.5741, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1313, "tokens_per_second_per_gpu": 17635.46 }, { "epoch": 0.03285, "grad_norm": 0.44921875, "learning_rate": 0.0019695, "loss": 3.5573, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1314, "tokens_per_second_per_gpu": 16921.93 }, { "epoch": 0.032875, "grad_norm": 0.51171875, "learning_rate": 0.001971, "loss": 3.5294, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1315, "tokens_per_second_per_gpu": 18187.2 }, { "epoch": 0.0329, "grad_norm": 0.447265625, "learning_rate": 0.0019725, "loss": 3.5736, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1316, "tokens_per_second_per_gpu": 24429.46 }, { "epoch": 0.032925, "grad_norm": 0.458984375, "learning_rate": 0.001974, "loss": 3.5583, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1317, "tokens_per_second_per_gpu": 18095.02 }, { "epoch": 0.03295, "grad_norm": 0.59375, "learning_rate": 0.0019755, "loss": 3.552, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1318, "tokens_per_second_per_gpu": 20437.85 }, { "epoch": 0.032975, "grad_norm": 0.478515625, "learning_rate": 0.001977, "loss": 3.5937, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.62, "memory/max_allocated (GiB)": 66.62, "step": 1319, "tokens_per_second_per_gpu": 18923.13 }, { "epoch": 0.033, "grad_norm": 0.306640625, "learning_rate": 0.0019785, "loss": 3.5789, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1320, "tokens_per_second_per_gpu": 17531.73 }, { "epoch": 0.033025, "grad_norm": 0.265625, "learning_rate": 0.00198, "loss": 3.5499, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1321, "tokens_per_second_per_gpu": 13831.09 }, { "epoch": 0.03305, "grad_norm": 0.341796875, "learning_rate": 0.0019814999999999998, "loss": 3.5388, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1322, "tokens_per_second_per_gpu": 11559.14 }, { "epoch": 0.033075, "grad_norm": 0.41015625, "learning_rate": 0.001983, "loss": 3.5391, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1323, "tokens_per_second_per_gpu": 11907.54 }, { "epoch": 0.0331, "grad_norm": 0.388671875, "learning_rate": 0.0019845, "loss": 3.5345, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1324, "tokens_per_second_per_gpu": 12837.16 }, { "epoch": 0.033125, "grad_norm": 0.365234375, "learning_rate": 0.0019860000000000004, "loss": 3.576, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1325, "tokens_per_second_per_gpu": 16988.54 }, { "epoch": 0.03315, "grad_norm": 0.3125, "learning_rate": 0.0019875, "loss": 3.5103, "memory/device_reserved (GiB)": 76.0, "memory/max_active (GiB)": 75.82, "memory/max_allocated (GiB)": 75.82, "step": 1326, "tokens_per_second_per_gpu": 18092.82 }, { "epoch": 0.033175, "grad_norm": 0.333984375, "learning_rate": 0.0019890000000000003, "loss": 3.5294, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1327, "tokens_per_second_per_gpu": 18134.49 }, { "epoch": 0.0332, "grad_norm": 0.44921875, "learning_rate": 0.0019905, "loss": 3.5345, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1328, "tokens_per_second_per_gpu": 12603.19 }, { "epoch": 0.033225, "grad_norm": 0.51953125, "learning_rate": 0.0019920000000000003, "loss": 3.5554, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1329, "tokens_per_second_per_gpu": 24227.61 }, { "epoch": 0.03325, "grad_norm": 0.59375, "learning_rate": 0.0019935, "loss": 3.5567, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1330, "tokens_per_second_per_gpu": 22022.83 }, { "epoch": 0.033275, "grad_norm": 0.73828125, "learning_rate": 0.0019950000000000002, "loss": 3.5914, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1331, "tokens_per_second_per_gpu": 20337.96 }, { "epoch": 0.0333, "grad_norm": 0.478515625, "learning_rate": 0.0019965, "loss": 3.5818, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1332, "tokens_per_second_per_gpu": 25901.2 }, { "epoch": 0.033325, "grad_norm": 0.53515625, "learning_rate": 0.001998, "loss": 3.5513, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1333, "tokens_per_second_per_gpu": 13981.54 }, { "epoch": 0.03335, "grad_norm": 0.71875, "learning_rate": 0.0019995, "loss": 3.5792, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1334, "tokens_per_second_per_gpu": 16008.62 }, { "epoch": 0.033375, "grad_norm": 0.4921875, "learning_rate": 0.002001, "loss": 3.5646, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1335, "tokens_per_second_per_gpu": 20220.74 }, { "epoch": 0.0334, "grad_norm": 0.421875, "learning_rate": 0.0020025, "loss": 3.5782, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1336, "tokens_per_second_per_gpu": 13215.59 }, { "epoch": 0.033425, "grad_norm": 0.57421875, "learning_rate": 0.002004, "loss": 3.5563, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1337, "tokens_per_second_per_gpu": 22717.51 }, { "epoch": 0.03345, "grad_norm": 0.77734375, "learning_rate": 0.0020055, "loss": 3.5566, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1338, "tokens_per_second_per_gpu": 17516.96 }, { "epoch": 0.033475, "grad_norm": 0.6171875, "learning_rate": 0.002007, "loss": 3.5478, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1339, "tokens_per_second_per_gpu": 11011.01 }, { "epoch": 0.0335, "grad_norm": 0.703125, "learning_rate": 0.0020085, "loss": 3.5599, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1340, "tokens_per_second_per_gpu": 18454.03 }, { "epoch": 0.033525, "grad_norm": 0.73828125, "learning_rate": 0.00201, "loss": 3.575, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1341, "tokens_per_second_per_gpu": 13481.05 }, { "epoch": 0.03355, "grad_norm": 0.63671875, "learning_rate": 0.0020115, "loss": 3.5601, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1342, "tokens_per_second_per_gpu": 22342.25 }, { "epoch": 0.033575, "grad_norm": 0.7265625, "learning_rate": 0.002013, "loss": 3.5966, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1343, "tokens_per_second_per_gpu": 20644.62 }, { "epoch": 0.0336, "grad_norm": 0.73828125, "learning_rate": 0.0020145, "loss": 3.6091, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1344, "tokens_per_second_per_gpu": 15142.08 }, { "epoch": 0.033625, "grad_norm": 0.859375, "learning_rate": 0.002016, "loss": 3.5924, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1345, "tokens_per_second_per_gpu": 14099.78 }, { "epoch": 0.03365, "grad_norm": 0.71484375, "learning_rate": 0.0020175, "loss": 3.6332, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1346, "tokens_per_second_per_gpu": 21532.35 }, { "epoch": 0.033675, "grad_norm": 0.5234375, "learning_rate": 0.002019, "loss": 3.5825, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1347, "tokens_per_second_per_gpu": 24330.63 }, { "epoch": 0.0337, "grad_norm": 0.36328125, "learning_rate": 0.0020205, "loss": 3.5743, "memory/device_reserved (GiB)": 96.05, "memory/max_active (GiB)": 95.88, "memory/max_allocated (GiB)": 95.88, "step": 1348, "tokens_per_second_per_gpu": 13692.95 }, { "epoch": 0.033725, "grad_norm": 0.388671875, "learning_rate": 0.0020220000000000004, "loss": 3.5457, "memory/device_reserved (GiB)": 46.4, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1349, "tokens_per_second_per_gpu": 28301.62 }, { "epoch": 0.03375, "grad_norm": 0.369140625, "learning_rate": 0.0020235, "loss": 3.5925, "memory/device_reserved (GiB)": 116.62, "memory/max_active (GiB)": 116.59, "memory/max_allocated (GiB)": 116.59, "step": 1350, "tokens_per_second_per_gpu": 11464.18 }, { "epoch": 0.033775, "grad_norm": 0.4296875, "learning_rate": 0.0020250000000000003, "loss": 3.5638, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1351, "tokens_per_second_per_gpu": 15768.04 }, { "epoch": 0.0338, "grad_norm": 0.474609375, "learning_rate": 0.0020265, "loss": 3.5687, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1352, "tokens_per_second_per_gpu": 15947.14 }, { "epoch": 0.033825, "grad_norm": 0.470703125, "learning_rate": 0.0020280000000000003, "loss": 3.5567, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1353, "tokens_per_second_per_gpu": 15874.51 }, { "epoch": 0.03385, "grad_norm": 0.38671875, "learning_rate": 0.0020295, "loss": 3.5474, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1354, "tokens_per_second_per_gpu": 17436.61 }, { "epoch": 0.033875, "grad_norm": 0.23046875, "learning_rate": 0.0020310000000000003, "loss": 3.5245, "memory/device_reserved (GiB)": 96.42, "memory/max_active (GiB)": 96.23, "memory/max_allocated (GiB)": 96.23, "step": 1355, "tokens_per_second_per_gpu": 13675.95 }, { "epoch": 0.0339, "grad_norm": 0.31640625, "learning_rate": 0.0020325, "loss": 3.5818, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1356, "tokens_per_second_per_gpu": 13835.52 }, { "epoch": 0.033925, "grad_norm": 0.3828125, "learning_rate": 0.0020340000000000002, "loss": 3.5646, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1357, "tokens_per_second_per_gpu": 15244.96 }, { "epoch": 0.03395, "grad_norm": 0.3828125, "learning_rate": 0.0020355, "loss": 3.555, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1358, "tokens_per_second_per_gpu": 20809.76 }, { "epoch": 0.033975, "grad_norm": 0.431640625, "learning_rate": 0.002037, "loss": 3.5179, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1359, "tokens_per_second_per_gpu": 15535.67 }, { "epoch": 0.034, "grad_norm": 0.419921875, "learning_rate": 0.0020385, "loss": 3.5609, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1360, "tokens_per_second_per_gpu": 14703.62 }, { "epoch": 0.034025, "grad_norm": 0.388671875, "learning_rate": 0.00204, "loss": 3.5407, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1361, "tokens_per_second_per_gpu": 23370.71 }, { "epoch": 0.03405, "grad_norm": 0.462890625, "learning_rate": 0.0020415, "loss": 3.525, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1362, "tokens_per_second_per_gpu": 15286.27 }, { "epoch": 0.034075, "grad_norm": 0.54296875, "learning_rate": 0.002043, "loss": 3.5753, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1363, "tokens_per_second_per_gpu": 16333.88 }, { "epoch": 0.0341, "grad_norm": 0.384765625, "learning_rate": 0.0020445, "loss": 3.5606, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1364, "tokens_per_second_per_gpu": 18300.33 }, { "epoch": 0.034125, "grad_norm": 0.46484375, "learning_rate": 0.002046, "loss": 3.555, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1365, "tokens_per_second_per_gpu": 23605.46 }, { "epoch": 0.03415, "grad_norm": 0.46484375, "learning_rate": 0.0020475, "loss": 3.5199, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1366, "tokens_per_second_per_gpu": 24457.63 }, { "epoch": 0.034175, "grad_norm": 0.404296875, "learning_rate": 0.002049, "loss": 3.5133, "memory/device_reserved (GiB)": 65.6, "memory/max_active (GiB)": 65.59, "memory/max_allocated (GiB)": 65.59, "step": 1367, "tokens_per_second_per_gpu": 19712.83 }, { "epoch": 0.0342, "grad_norm": 0.375, "learning_rate": 0.0020505000000000002, "loss": 3.5994, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1368, "tokens_per_second_per_gpu": 20905.45 }, { "epoch": 0.034225, "grad_norm": 0.30859375, "learning_rate": 0.002052, "loss": 3.547, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1369, "tokens_per_second_per_gpu": 13946.41 }, { "epoch": 0.03425, "grad_norm": 0.421875, "learning_rate": 0.0020535, "loss": 3.4979, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1370, "tokens_per_second_per_gpu": 10291.55 }, { "epoch": 0.034275, "grad_norm": 0.47265625, "learning_rate": 0.0020550000000000004, "loss": 3.5488, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1371, "tokens_per_second_per_gpu": 14330.76 }, { "epoch": 0.0343, "grad_norm": 0.412109375, "learning_rate": 0.0020565, "loss": 3.5514, "memory/device_reserved (GiB)": 116.82, "memory/max_active (GiB)": 116.63, "memory/max_allocated (GiB)": 116.63, "step": 1372, "tokens_per_second_per_gpu": 11593.23 }, { "epoch": 0.034325, "grad_norm": 0.349609375, "learning_rate": 0.0020580000000000004, "loss": 3.5162, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1373, "tokens_per_second_per_gpu": 22695.51 }, { "epoch": 0.03435, "grad_norm": 0.5703125, "learning_rate": 0.0020595, "loss": 3.5434, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1374, "tokens_per_second_per_gpu": 11155.17 }, { "epoch": 0.034375, "grad_norm": 0.466796875, "learning_rate": 0.0020610000000000003, "loss": 3.5315, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1375, "tokens_per_second_per_gpu": 28298.2 }, { "epoch": 0.0344, "grad_norm": 0.416015625, "learning_rate": 0.0020625, "loss": 3.4958, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1376, "tokens_per_second_per_gpu": 12498.72 }, { "epoch": 0.034425, "grad_norm": 0.56640625, "learning_rate": 0.002064, "loss": 3.5425, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1377, "tokens_per_second_per_gpu": 28557.14 }, { "epoch": 0.03445, "grad_norm": 0.8984375, "learning_rate": 0.0020655, "loss": 3.549, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1378, "tokens_per_second_per_gpu": 10497.04 }, { "epoch": 0.034475, "grad_norm": 0.84765625, "learning_rate": 0.002067, "loss": 3.5783, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1379, "tokens_per_second_per_gpu": 24554.95 }, { "epoch": 0.0345, "grad_norm": 0.640625, "learning_rate": 0.0020685, "loss": 3.5727, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1380, "tokens_per_second_per_gpu": 12171.71 }, { "epoch": 0.034525, "grad_norm": 0.78125, "learning_rate": 0.00207, "loss": 3.5953, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1381, "tokens_per_second_per_gpu": 27716.01 }, { "epoch": 0.03455, "grad_norm": 1.1328125, "learning_rate": 0.0020715, "loss": 3.647, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1382, "tokens_per_second_per_gpu": 10762.01 }, { "epoch": 0.034575, "grad_norm": 0.9921875, "learning_rate": 0.0020729999999999998, "loss": 3.6397, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1383, "tokens_per_second_per_gpu": 15999.61 }, { "epoch": 0.0346, "grad_norm": 0.90234375, "learning_rate": 0.0020745, "loss": 3.6361, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1384, "tokens_per_second_per_gpu": 11675.73 }, { "epoch": 0.034625, "grad_norm": 1.0546875, "learning_rate": 0.0020759999999999997, "loss": 3.5832, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1385, "tokens_per_second_per_gpu": 26645.8 }, { "epoch": 0.03465, "grad_norm": 0.625, "learning_rate": 0.0020775, "loss": 3.6312, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 1386, "tokens_per_second_per_gpu": 13190.67 }, { "epoch": 0.034675, "grad_norm": 0.5703125, "learning_rate": 0.0020789999999999997, "loss": 3.6039, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1387, "tokens_per_second_per_gpu": 13193.56 }, { "epoch": 0.0347, "grad_norm": 0.62109375, "learning_rate": 0.0020805, "loss": 3.6006, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1388, "tokens_per_second_per_gpu": 12725.45 }, { "epoch": 0.034725, "grad_norm": 0.6015625, "learning_rate": 0.002082, "loss": 3.5991, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1389, "tokens_per_second_per_gpu": 11439.07 }, { "epoch": 0.03475, "grad_norm": 0.609375, "learning_rate": 0.0020835, "loss": 3.5909, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1390, "tokens_per_second_per_gpu": 24168.89 }, { "epoch": 0.034775, "grad_norm": 0.40234375, "learning_rate": 0.002085, "loss": 3.5883, "memory/device_reserved (GiB)": 56.59, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1391, "tokens_per_second_per_gpu": 21950.71 }, { "epoch": 0.0348, "grad_norm": 0.365234375, "learning_rate": 0.0020865000000000002, "loss": 3.5228, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1392, "tokens_per_second_per_gpu": 17421.23 }, { "epoch": 0.034825, "grad_norm": 0.330078125, "learning_rate": 0.002088, "loss": 3.5342, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1393, "tokens_per_second_per_gpu": 17480.62 }, { "epoch": 0.03485, "grad_norm": 0.31640625, "learning_rate": 0.0020895, "loss": 3.5694, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1394, "tokens_per_second_per_gpu": 15047.88 }, { "epoch": 0.034875, "grad_norm": 0.3828125, "learning_rate": 0.002091, "loss": 3.5293, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1395, "tokens_per_second_per_gpu": 13187.63 }, { "epoch": 0.0349, "grad_norm": 0.380859375, "learning_rate": 0.0020925, "loss": 3.542, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1396, "tokens_per_second_per_gpu": 16280.68 }, { "epoch": 0.034925, "grad_norm": 0.36328125, "learning_rate": 0.002094, "loss": 3.5522, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1397, "tokens_per_second_per_gpu": 20943.41 }, { "epoch": 0.03495, "grad_norm": 0.3984375, "learning_rate": 0.0020955, "loss": 3.5235, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1398, "tokens_per_second_per_gpu": 17425.71 }, { "epoch": 0.034975, "grad_norm": 0.380859375, "learning_rate": 0.002097, "loss": 3.4941, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1399, "tokens_per_second_per_gpu": 19351.85 }, { "epoch": 0.035, "grad_norm": 0.35546875, "learning_rate": 0.0020985, "loss": 3.5263, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1400, "tokens_per_second_per_gpu": 26588.47 }, { "epoch": 0.035025, "grad_norm": 0.310546875, "learning_rate": 0.0021, "loss": 3.5466, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1401, "tokens_per_second_per_gpu": 20551.96 }, { "epoch": 0.03505, "grad_norm": 0.302734375, "learning_rate": 0.0021015, "loss": 3.4977, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1402, "tokens_per_second_per_gpu": 30549.58 }, { "epoch": 0.035075, "grad_norm": 0.28515625, "learning_rate": 0.002103, "loss": 3.5247, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1403, "tokens_per_second_per_gpu": 23523.47 }, { "epoch": 0.0351, "grad_norm": 0.357421875, "learning_rate": 0.0021045, "loss": 3.526, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1404, "tokens_per_second_per_gpu": 13688.17 }, { "epoch": 0.035125, "grad_norm": 0.46484375, "learning_rate": 0.002106, "loss": 3.5002, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1405, "tokens_per_second_per_gpu": 11878.87 }, { "epoch": 0.03515, "grad_norm": 0.416015625, "learning_rate": 0.0021075, "loss": 3.4682, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1406, "tokens_per_second_per_gpu": 17446.47 }, { "epoch": 0.035175, "grad_norm": 0.3203125, "learning_rate": 0.0021089999999999998, "loss": 3.5235, "memory/device_reserved (GiB)": 65.8, "memory/max_active (GiB)": 65.62, "memory/max_allocated (GiB)": 65.62, "step": 1407, "tokens_per_second_per_gpu": 20037.3 }, { "epoch": 0.0352, "grad_norm": 0.287109375, "learning_rate": 0.0021105, "loss": 3.4988, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1408, "tokens_per_second_per_gpu": 22510.15 }, { "epoch": 0.035225, "grad_norm": 0.296875, "learning_rate": 0.0021119999999999997, "loss": 3.4795, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1409, "tokens_per_second_per_gpu": 19057.12 }, { "epoch": 0.03525, "grad_norm": 0.291015625, "learning_rate": 0.0021135, "loss": 3.4718, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1410, "tokens_per_second_per_gpu": 11227.44 }, { "epoch": 0.035275, "grad_norm": 0.41796875, "learning_rate": 0.002115, "loss": 3.5271, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1411, "tokens_per_second_per_gpu": 10657.57 }, { "epoch": 0.0353, "grad_norm": 0.53515625, "learning_rate": 0.0021165, "loss": 3.5092, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1412, "tokens_per_second_per_gpu": 14258.56 }, { "epoch": 0.035325, "grad_norm": 0.546875, "learning_rate": 0.002118, "loss": 3.5218, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1413, "tokens_per_second_per_gpu": 19906.81 }, { "epoch": 0.03535, "grad_norm": 0.47265625, "learning_rate": 0.0021195000000000003, "loss": 3.4981, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1414, "tokens_per_second_per_gpu": 17177.78 }, { "epoch": 0.035375, "grad_norm": 0.5859375, "learning_rate": 0.002121, "loss": 3.5232, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1415, "tokens_per_second_per_gpu": 23675.94 }, { "epoch": 0.0354, "grad_norm": 0.7734375, "learning_rate": 0.0021225000000000003, "loss": 3.5297, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1416, "tokens_per_second_per_gpu": 20016.97 }, { "epoch": 0.035425, "grad_norm": 0.59765625, "learning_rate": 0.002124, "loss": 3.5541, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1417, "tokens_per_second_per_gpu": 13035.39 }, { "epoch": 0.03545, "grad_norm": 0.53515625, "learning_rate": 0.0021255000000000002, "loss": 3.5719, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1418, "tokens_per_second_per_gpu": 20740.09 }, { "epoch": 0.035475, "grad_norm": 0.48046875, "learning_rate": 0.002127, "loss": 3.529, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1419, "tokens_per_second_per_gpu": 20790.48 }, { "epoch": 0.0355, "grad_norm": 0.58203125, "learning_rate": 0.0021285, "loss": 3.5301, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1420, "tokens_per_second_per_gpu": 10670.73 }, { "epoch": 0.035525, "grad_norm": 0.625, "learning_rate": 0.00213, "loss": 3.5534, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1421, "tokens_per_second_per_gpu": 12918.99 }, { "epoch": 0.03555, "grad_norm": 0.5234375, "learning_rate": 0.0021315, "loss": 3.5325, "memory/device_reserved (GiB)": 45.24, "memory/max_active (GiB)": 45.19, "memory/max_allocated (GiB)": 45.19, "step": 1422, "tokens_per_second_per_gpu": 27492.87 }, { "epoch": 0.035575, "grad_norm": 0.58984375, "learning_rate": 0.002133, "loss": 3.5919, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1423, "tokens_per_second_per_gpu": 13690.46 }, { "epoch": 0.0356, "grad_norm": 0.57421875, "learning_rate": 0.0021345, "loss": 3.5656, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1424, "tokens_per_second_per_gpu": 13261.84 }, { "epoch": 0.035625, "grad_norm": 0.5859375, "learning_rate": 0.002136, "loss": 3.548, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1425, "tokens_per_second_per_gpu": 13164.8 }, { "epoch": 0.03565, "grad_norm": 0.57421875, "learning_rate": 0.0021375, "loss": 3.566, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1426, "tokens_per_second_per_gpu": 13455.01 }, { "epoch": 0.035675, "grad_norm": 0.416015625, "learning_rate": 0.002139, "loss": 3.5342, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1427, "tokens_per_second_per_gpu": 13353.75 }, { "epoch": 0.0357, "grad_norm": 0.3984375, "learning_rate": 0.0021405, "loss": 3.5566, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1428, "tokens_per_second_per_gpu": 11870.1 }, { "epoch": 0.035725, "grad_norm": 0.419921875, "learning_rate": 0.002142, "loss": 3.4812, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1429, "tokens_per_second_per_gpu": 11149.52 }, { "epoch": 0.03575, "grad_norm": 0.44921875, "learning_rate": 0.0021435, "loss": 3.527, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1430, "tokens_per_second_per_gpu": 14591.52 }, { "epoch": 0.035775, "grad_norm": 0.462890625, "learning_rate": 0.0021449999999999998, "loss": 3.4909, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1431, "tokens_per_second_per_gpu": 24118.15 }, { "epoch": 0.0358, "grad_norm": 0.45703125, "learning_rate": 0.0021465, "loss": 3.5108, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1432, "tokens_per_second_per_gpu": 11920.89 }, { "epoch": 0.035825, "grad_norm": 0.34375, "learning_rate": 0.002148, "loss": 3.5076, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1433, "tokens_per_second_per_gpu": 15384.44 }, { "epoch": 0.03585, "grad_norm": 0.369140625, "learning_rate": 0.0021495, "loss": 3.579, "memory/device_reserved (GiB)": 54.53, "memory/max_active (GiB)": 54.51, "memory/max_allocated (GiB)": 54.51, "step": 1434, "tokens_per_second_per_gpu": 24061.12 }, { "epoch": 0.035875, "grad_norm": 0.37890625, "learning_rate": 0.002151, "loss": 3.4768, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1435, "tokens_per_second_per_gpu": 17650.12 }, { "epoch": 0.0359, "grad_norm": 0.330078125, "learning_rate": 0.0021525000000000003, "loss": 3.4884, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1436, "tokens_per_second_per_gpu": 27428.39 }, { "epoch": 0.035925, "grad_norm": 0.396484375, "learning_rate": 0.002154, "loss": 3.5213, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1437, "tokens_per_second_per_gpu": 15538.43 }, { "epoch": 0.03595, "grad_norm": 0.369140625, "learning_rate": 0.0021555000000000003, "loss": 3.5184, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1438, "tokens_per_second_per_gpu": 16080.2 }, { "epoch": 0.035975, "grad_norm": 0.27734375, "learning_rate": 0.002157, "loss": 3.5298, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1439, "tokens_per_second_per_gpu": 15102.05 }, { "epoch": 0.036, "grad_norm": 0.337890625, "learning_rate": 0.0021585000000000003, "loss": 3.4614, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1440, "tokens_per_second_per_gpu": 17539.91 }, { "epoch": 0.036025, "grad_norm": 0.41796875, "learning_rate": 0.00216, "loss": 3.4718, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1441, "tokens_per_second_per_gpu": 16138.5 }, { "epoch": 0.03605, "grad_norm": 0.53125, "learning_rate": 0.0021615000000000002, "loss": 3.4779, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1442, "tokens_per_second_per_gpu": 15440.51 }, { "epoch": 0.036075, "grad_norm": 0.396484375, "learning_rate": 0.002163, "loss": 3.5047, "memory/device_reserved (GiB)": 56.67, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1443, "tokens_per_second_per_gpu": 21451.84 }, { "epoch": 0.0361, "grad_norm": 0.3203125, "learning_rate": 0.0021645, "loss": 3.5128, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1444, "tokens_per_second_per_gpu": 24338.7 }, { "epoch": 0.036125, "grad_norm": 0.302734375, "learning_rate": 0.002166, "loss": 3.4718, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1445, "tokens_per_second_per_gpu": 22579.23 }, { "epoch": 0.03615, "grad_norm": 0.259765625, "learning_rate": 0.0021675, "loss": 3.523, "memory/device_reserved (GiB)": 116.14, "memory/max_active (GiB)": 116.12, "memory/max_allocated (GiB)": 116.12, "step": 1446, "tokens_per_second_per_gpu": 11822.3 }, { "epoch": 0.036175, "grad_norm": 0.314453125, "learning_rate": 0.002169, "loss": 3.4993, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1447, "tokens_per_second_per_gpu": 16759.04 }, { "epoch": 0.0362, "grad_norm": 0.486328125, "learning_rate": 0.0021705, "loss": 3.462, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1448, "tokens_per_second_per_gpu": 23686.97 }, { "epoch": 0.036225, "grad_norm": 0.61328125, "learning_rate": 0.002172, "loss": 3.4776, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1449, "tokens_per_second_per_gpu": 11890.98 }, { "epoch": 0.03625, "grad_norm": 0.6796875, "learning_rate": 0.0021735, "loss": 3.5156, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1450, "tokens_per_second_per_gpu": 24142.04 }, { "epoch": 0.036275, "grad_norm": 0.82421875, "learning_rate": 0.002175, "loss": 3.5444, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1451, "tokens_per_second_per_gpu": 10488.81 }, { "epoch": 0.0363, "grad_norm": 1.078125, "learning_rate": 0.0021765, "loss": 3.6363, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1452, "tokens_per_second_per_gpu": 13433.9 }, { "epoch": 0.036325, "grad_norm": 0.74609375, "learning_rate": 0.002178, "loss": 3.5659, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1453, "tokens_per_second_per_gpu": 18038.74 }, { "epoch": 0.03635, "grad_norm": 0.6328125, "learning_rate": 0.0021795, "loss": 3.5738, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1454, "tokens_per_second_per_gpu": 14132.51 }, { "epoch": 0.036375, "grad_norm": 0.41015625, "learning_rate": 0.0021809999999999998, "loss": 3.5117, "memory/device_reserved (GiB)": 65.8, "memory/max_active (GiB)": 65.62, "memory/max_allocated (GiB)": 65.62, "step": 1455, "tokens_per_second_per_gpu": 20393.19 }, { "epoch": 0.0364, "grad_norm": 0.416015625, "learning_rate": 0.0021825, "loss": 3.5307, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1456, "tokens_per_second_per_gpu": 22380.83 }, { "epoch": 0.036425, "grad_norm": 0.384765625, "learning_rate": 0.002184, "loss": 3.552, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1457, "tokens_per_second_per_gpu": 18496.35 }, { "epoch": 0.03645, "grad_norm": 0.5078125, "learning_rate": 0.0021855, "loss": 3.5276, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1458, "tokens_per_second_per_gpu": 10493.7 }, { "epoch": 0.036475, "grad_norm": 0.80859375, "learning_rate": 0.002187, "loss": 3.5468, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1459, "tokens_per_second_per_gpu": 15167.74 }, { "epoch": 0.0365, "grad_norm": 0.765625, "learning_rate": 0.0021885000000000003, "loss": 3.5642, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1460, "tokens_per_second_per_gpu": 20287.19 }, { "epoch": 0.036525, "grad_norm": 0.5234375, "learning_rate": 0.00219, "loss": 3.5549, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1461, "tokens_per_second_per_gpu": 13475.22 }, { "epoch": 0.03655, "grad_norm": 0.439453125, "learning_rate": 0.0021915000000000003, "loss": 3.5461, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1462, "tokens_per_second_per_gpu": 15890.89 }, { "epoch": 0.036575, "grad_norm": 0.470703125, "learning_rate": 0.002193, "loss": 3.5702, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1463, "tokens_per_second_per_gpu": 24331.12 }, { "epoch": 0.0366, "grad_norm": 0.4765625, "learning_rate": 0.0021945000000000003, "loss": 3.5552, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1464, "tokens_per_second_per_gpu": 27476.11 }, { "epoch": 0.036625, "grad_norm": 0.546875, "learning_rate": 0.002196, "loss": 3.5601, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1465, "tokens_per_second_per_gpu": 23522.2 }, { "epoch": 0.03665, "grad_norm": 0.55859375, "learning_rate": 0.0021975000000000002, "loss": 3.5386, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1466, "tokens_per_second_per_gpu": 12399.74 }, { "epoch": 0.036675, "grad_norm": 0.490234375, "learning_rate": 0.002199, "loss": 3.4949, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1467, "tokens_per_second_per_gpu": 19902.42 }, { "epoch": 0.0367, "grad_norm": 0.5625, "learning_rate": 0.0022005, "loss": 3.557, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1468, "tokens_per_second_per_gpu": 13009.93 }, { "epoch": 0.036725, "grad_norm": 0.50390625, "learning_rate": 0.002202, "loss": 3.5157, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1469, "tokens_per_second_per_gpu": 16063.85 }, { "epoch": 0.03675, "grad_norm": 0.341796875, "learning_rate": 0.0022035, "loss": 3.5018, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1470, "tokens_per_second_per_gpu": 24367.52 }, { "epoch": 0.036775, "grad_norm": 0.31640625, "learning_rate": 0.002205, "loss": 3.5188, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1471, "tokens_per_second_per_gpu": 10821.55 }, { "epoch": 0.0368, "grad_norm": 0.306640625, "learning_rate": 0.0022065, "loss": 3.4915, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1472, "tokens_per_second_per_gpu": 14219.04 }, { "epoch": 0.036825, "grad_norm": 0.310546875, "learning_rate": 0.002208, "loss": 3.5026, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1473, "tokens_per_second_per_gpu": 19471.35 }, { "epoch": 0.03685, "grad_norm": 0.28515625, "learning_rate": 0.0022095, "loss": 3.5174, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1474, "tokens_per_second_per_gpu": 13866.66 }, { "epoch": 0.036875, "grad_norm": 0.22265625, "learning_rate": 0.002211, "loss": 3.489, "memory/device_reserved (GiB)": 75.81, "memory/max_active (GiB)": 75.79, "memory/max_allocated (GiB)": 75.79, "step": 1475, "tokens_per_second_per_gpu": 17618.2 }, { "epoch": 0.0369, "grad_norm": 0.255859375, "learning_rate": 0.0022125, "loss": 3.4751, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1476, "tokens_per_second_per_gpu": 20376.26 }, { "epoch": 0.036925, "grad_norm": 0.24609375, "learning_rate": 0.002214, "loss": 3.4753, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 1477, "tokens_per_second_per_gpu": 28410.94 }, { "epoch": 0.03695, "grad_norm": 0.2265625, "learning_rate": 0.0022155, "loss": 3.5029, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1478, "tokens_per_second_per_gpu": 10931.53 }, { "epoch": 0.036975, "grad_norm": 0.3125, "learning_rate": 0.0022170000000000002, "loss": 3.5123, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1479, "tokens_per_second_per_gpu": 13955.96 }, { "epoch": 0.037, "grad_norm": 0.392578125, "learning_rate": 0.0022185, "loss": 3.4522, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1480, "tokens_per_second_per_gpu": 19766.28 }, { "epoch": 0.037025, "grad_norm": 0.474609375, "learning_rate": 0.00222, "loss": 3.4999, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1481, "tokens_per_second_per_gpu": 10943.87 }, { "epoch": 0.03705, "grad_norm": 0.392578125, "learning_rate": 0.0022215000000000004, "loss": 3.5066, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1482, "tokens_per_second_per_gpu": 21519.1 }, { "epoch": 0.037075, "grad_norm": 0.2373046875, "learning_rate": 0.002223, "loss": 3.4769, "memory/device_reserved (GiB)": 96.42, "memory/max_active (GiB)": 96.23, "memory/max_allocated (GiB)": 96.23, "step": 1483, "tokens_per_second_per_gpu": 13557.26 }, { "epoch": 0.0371, "grad_norm": 0.31640625, "learning_rate": 0.0022245000000000003, "loss": 3.4897, "memory/device_reserved (GiB)": 66.36, "memory/max_active (GiB)": 66.15, "memory/max_allocated (GiB)": 66.15, "step": 1484, "tokens_per_second_per_gpu": 20272.93 }, { "epoch": 0.037125, "grad_norm": 0.294921875, "learning_rate": 0.002226, "loss": 3.4987, "memory/device_reserved (GiB)": 96.42, "memory/max_active (GiB)": 96.23, "memory/max_allocated (GiB)": 96.23, "step": 1485, "tokens_per_second_per_gpu": 14568.97 }, { "epoch": 0.03715, "grad_norm": 0.322265625, "learning_rate": 0.0022275000000000003, "loss": 3.4905, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1486, "tokens_per_second_per_gpu": 11235.3 }, { "epoch": 0.037175, "grad_norm": 0.453125, "learning_rate": 0.002229, "loss": 3.4815, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1487, "tokens_per_second_per_gpu": 11168.71 }, { "epoch": 0.0372, "grad_norm": 0.6328125, "learning_rate": 0.0022305000000000003, "loss": 3.5113, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1488, "tokens_per_second_per_gpu": 17303.83 }, { "epoch": 0.037225, "grad_norm": 0.5234375, "learning_rate": 0.002232, "loss": 3.5167, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1489, "tokens_per_second_per_gpu": 10587.0 }, { "epoch": 0.03725, "grad_norm": 0.45703125, "learning_rate": 0.0022335000000000002, "loss": 3.5153, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1490, "tokens_per_second_per_gpu": 15143.57 }, { "epoch": 0.037275, "grad_norm": 0.478515625, "learning_rate": 0.002235, "loss": 3.4821, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1491, "tokens_per_second_per_gpu": 13655.85 }, { "epoch": 0.0373, "grad_norm": 0.45703125, "learning_rate": 0.0022365, "loss": 3.5213, "memory/device_reserved (GiB)": 65.6, "memory/max_active (GiB)": 65.59, "memory/max_allocated (GiB)": 65.59, "step": 1492, "tokens_per_second_per_gpu": 20190.82 }, { "epoch": 0.037325, "grad_norm": 0.486328125, "learning_rate": 0.002238, "loss": 3.475, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1493, "tokens_per_second_per_gpu": 13884.92 }, { "epoch": 0.03735, "grad_norm": 0.55859375, "learning_rate": 0.0022395, "loss": 3.548, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1494, "tokens_per_second_per_gpu": 10741.99 }, { "epoch": 0.037375, "grad_norm": 0.5234375, "learning_rate": 0.002241, "loss": 3.4785, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1495, "tokens_per_second_per_gpu": 27023.57 }, { "epoch": 0.0374, "grad_norm": 0.53125, "learning_rate": 0.0022425, "loss": 3.4931, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1496, "tokens_per_second_per_gpu": 12545.51 }, { "epoch": 0.037425, "grad_norm": 0.59765625, "learning_rate": 0.002244, "loss": 3.5184, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1497, "tokens_per_second_per_gpu": 15634.69 }, { "epoch": 0.03745, "grad_norm": 0.5625, "learning_rate": 0.0022455, "loss": 3.5149, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1498, "tokens_per_second_per_gpu": 10696.78 }, { "epoch": 0.037475, "grad_norm": 0.52734375, "learning_rate": 0.002247, "loss": 3.5024, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1499, "tokens_per_second_per_gpu": 19647.33 }, { "epoch": 0.0375, "grad_norm": 0.62109375, "learning_rate": 0.0022485, "loss": 3.5234, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1500, "tokens_per_second_per_gpu": 23342.05 }, { "epoch": 0.037525, "grad_norm": 0.365234375, "learning_rate": 0.0022500000000000003, "loss": 3.5368, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1501, "tokens_per_second_per_gpu": 13618.12 }, { "epoch": 0.03755, "grad_norm": 0.3046875, "learning_rate": 0.0022515, "loss": 3.5398, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1502, "tokens_per_second_per_gpu": 14295.47 }, { "epoch": 0.037575, "grad_norm": 0.26953125, "learning_rate": 0.0022530000000000002, "loss": 3.528, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 1503, "tokens_per_second_per_gpu": 13285.01 }, { "epoch": 0.0376, "grad_norm": 0.330078125, "learning_rate": 0.0022545, "loss": 3.4937, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1504, "tokens_per_second_per_gpu": 15054.63 }, { "epoch": 0.037625, "grad_norm": 0.314453125, "learning_rate": 0.002256, "loss": 3.4654, "memory/device_reserved (GiB)": 75.36, "memory/max_active (GiB)": 75.32, "memory/max_allocated (GiB)": 75.32, "step": 1505, "tokens_per_second_per_gpu": 17392.79 }, { "epoch": 0.03765, "grad_norm": 0.3828125, "learning_rate": 0.0022575, "loss": 3.5465, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1506, "tokens_per_second_per_gpu": 15822.8 }, { "epoch": 0.037675, "grad_norm": 0.337890625, "learning_rate": 0.002259, "loss": 3.531, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1507, "tokens_per_second_per_gpu": 22267.3 }, { "epoch": 0.0377, "grad_norm": 0.1875, "learning_rate": 0.0022605, "loss": 3.4965, "memory/device_reserved (GiB)": 116.14, "memory/max_active (GiB)": 116.12, "memory/max_allocated (GiB)": 116.12, "step": 1508, "tokens_per_second_per_gpu": 11553.88 }, { "epoch": 0.037725, "grad_norm": 0.361328125, "learning_rate": 0.002262, "loss": 3.4964, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1509, "tokens_per_second_per_gpu": 24043.88 }, { "epoch": 0.03775, "grad_norm": 0.578125, "learning_rate": 0.0022635, "loss": 3.5378, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1510, "tokens_per_second_per_gpu": 11437.17 }, { "epoch": 0.037775, "grad_norm": 0.50390625, "learning_rate": 0.002265, "loss": 3.5173, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 1511, "tokens_per_second_per_gpu": 29604.23 }, { "epoch": 0.0378, "grad_norm": 0.427734375, "learning_rate": 0.0022665, "loss": 3.4895, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1512, "tokens_per_second_per_gpu": 16179.81 }, { "epoch": 0.037825, "grad_norm": 0.419921875, "learning_rate": 0.002268, "loss": 3.455, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1513, "tokens_per_second_per_gpu": 20321.2 }, { "epoch": 0.03785, "grad_norm": 0.50390625, "learning_rate": 0.0022695, "loss": 3.4768, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1514, "tokens_per_second_per_gpu": 21485.34 }, { "epoch": 0.037875, "grad_norm": 0.609375, "learning_rate": 0.002271, "loss": 3.5066, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1515, "tokens_per_second_per_gpu": 20119.36 }, { "epoch": 0.0379, "grad_norm": 0.56640625, "learning_rate": 0.0022724999999999998, "loss": 3.5052, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1516, "tokens_per_second_per_gpu": 27884.72 }, { "epoch": 0.037925, "grad_norm": 0.640625, "learning_rate": 0.002274, "loss": 3.4795, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1517, "tokens_per_second_per_gpu": 20002.65 }, { "epoch": 0.03795, "grad_norm": 0.703125, "learning_rate": 0.0022754999999999997, "loss": 3.5288, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1518, "tokens_per_second_per_gpu": 15602.11 }, { "epoch": 0.037975, "grad_norm": 0.58984375, "learning_rate": 0.002277, "loss": 3.4784, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1519, "tokens_per_second_per_gpu": 12988.07 }, { "epoch": 0.038, "grad_norm": 0.478515625, "learning_rate": 0.0022784999999999997, "loss": 3.4887, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1520, "tokens_per_second_per_gpu": 23893.27 }, { "epoch": 0.038025, "grad_norm": 0.466796875, "learning_rate": 0.00228, "loss": 3.51, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1521, "tokens_per_second_per_gpu": 14266.55 }, { "epoch": 0.03805, "grad_norm": 0.3125, "learning_rate": 0.0022815, "loss": 3.5037, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.42, "memory/max_allocated (GiB)": 107.42, "step": 1522, "tokens_per_second_per_gpu": 12307.71 }, { "epoch": 0.038075, "grad_norm": 0.451171875, "learning_rate": 0.002283, "loss": 3.5118, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1523, "tokens_per_second_per_gpu": 11727.41 }, { "epoch": 0.0381, "grad_norm": 0.4765625, "learning_rate": 0.0022845, "loss": 3.4919, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1524, "tokens_per_second_per_gpu": 14010.68 }, { "epoch": 0.038125, "grad_norm": 0.421875, "learning_rate": 0.0022860000000000003, "loss": 3.4917, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1525, "tokens_per_second_per_gpu": 19382.64 }, { "epoch": 0.03815, "grad_norm": 0.35546875, "learning_rate": 0.0022875, "loss": 3.4864, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1526, "tokens_per_second_per_gpu": 14105.43 }, { "epoch": 0.038175, "grad_norm": 0.2578125, "learning_rate": 0.0022890000000000002, "loss": 3.5013, "memory/device_reserved (GiB)": 55.41, "memory/max_active (GiB)": 55.39, "memory/max_allocated (GiB)": 55.39, "step": 1527, "tokens_per_second_per_gpu": 22851.31 }, { "epoch": 0.0382, "grad_norm": 0.15625, "learning_rate": 0.0022905, "loss": 3.5255, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.22, "memory/max_allocated (GiB)": 97.22, "step": 1528, "tokens_per_second_per_gpu": 13186.49 }, { "epoch": 0.038225, "grad_norm": 0.287109375, "learning_rate": 0.002292, "loss": 3.4983, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1529, "tokens_per_second_per_gpu": 15401.2 }, { "epoch": 0.03825, "grad_norm": 0.41015625, "learning_rate": 0.0022935, "loss": 3.4915, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1530, "tokens_per_second_per_gpu": 13223.1 }, { "epoch": 0.038275, "grad_norm": 0.6171875, "learning_rate": 0.002295, "loss": 3.4955, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1531, "tokens_per_second_per_gpu": 10838.37 }, { "epoch": 0.0383, "grad_norm": 0.50390625, "learning_rate": 0.0022965, "loss": 3.5547, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1532, "tokens_per_second_per_gpu": 27014.86 }, { "epoch": 0.038325, "grad_norm": 0.498046875, "learning_rate": 0.002298, "loss": 3.507, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1533, "tokens_per_second_per_gpu": 16746.63 }, { "epoch": 0.03835, "grad_norm": 0.5234375, "learning_rate": 0.0022995, "loss": 3.4619, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1534, "tokens_per_second_per_gpu": 18139.98 }, { "epoch": 0.038375, "grad_norm": 0.5390625, "learning_rate": 0.002301, "loss": 3.4771, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1535, "tokens_per_second_per_gpu": 17688.98 }, { "epoch": 0.0384, "grad_norm": 0.5546875, "learning_rate": 0.0023025, "loss": 3.5069, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1536, "tokens_per_second_per_gpu": 17688.09 }, { "epoch": 0.038425, "grad_norm": 0.419921875, "learning_rate": 0.002304, "loss": 3.4916, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1537, "tokens_per_second_per_gpu": 12736.29 }, { "epoch": 0.03845, "grad_norm": 0.357421875, "learning_rate": 0.0023055, "loss": 3.4604, "memory/device_reserved (GiB)": 85.15, "memory/max_active (GiB)": 85.11, "memory/max_allocated (GiB)": 85.11, "step": 1538, "tokens_per_second_per_gpu": 16729.02 }, { "epoch": 0.038475, "grad_norm": 0.349609375, "learning_rate": 0.002307, "loss": 3.4748, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1539, "tokens_per_second_per_gpu": 28252.34 }, { "epoch": 0.0385, "grad_norm": 0.392578125, "learning_rate": 0.0023085, "loss": 3.4447, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1540, "tokens_per_second_per_gpu": 13002.43 }, { "epoch": 0.038525, "grad_norm": 0.330078125, "learning_rate": 0.00231, "loss": 3.5096, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1541, "tokens_per_second_per_gpu": 13630.19 }, { "epoch": 0.03855, "grad_norm": 0.32421875, "learning_rate": 0.0023114999999999998, "loss": 3.4626, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1542, "tokens_per_second_per_gpu": 16266.5 }, { "epoch": 0.038575, "grad_norm": 0.4765625, "learning_rate": 0.002313, "loss": 3.4722, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1543, "tokens_per_second_per_gpu": 11235.13 }, { "epoch": 0.0386, "grad_norm": 0.59375, "learning_rate": 0.0023145, "loss": 3.4756, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1544, "tokens_per_second_per_gpu": 15456.24 }, { "epoch": 0.038625, "grad_norm": 0.6328125, "learning_rate": 0.002316, "loss": 3.5092, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1545, "tokens_per_second_per_gpu": 24419.36 }, { "epoch": 0.03865, "grad_norm": 0.59765625, "learning_rate": 0.0023175, "loss": 3.5375, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1546, "tokens_per_second_per_gpu": 28461.41 }, { "epoch": 0.038675, "grad_norm": 0.5703125, "learning_rate": 0.0023190000000000003, "loss": 3.4943, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1547, "tokens_per_second_per_gpu": 17797.76 }, { "epoch": 0.0387, "grad_norm": 0.390625, "learning_rate": 0.0023205, "loss": 3.5037, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1548, "tokens_per_second_per_gpu": 19486.02 }, { "epoch": 0.038725, "grad_norm": 0.41796875, "learning_rate": 0.0023220000000000003, "loss": 3.4721, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1549, "tokens_per_second_per_gpu": 17649.97 }, { "epoch": 0.03875, "grad_norm": 0.4140625, "learning_rate": 0.0023235, "loss": 3.5126, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1550, "tokens_per_second_per_gpu": 10254.37 }, { "epoch": 0.038775, "grad_norm": 0.52734375, "learning_rate": 0.0023250000000000002, "loss": 3.4441, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1551, "tokens_per_second_per_gpu": 12625.55 }, { "epoch": 0.0388, "grad_norm": 0.59375, "learning_rate": 0.0023265, "loss": 3.5053, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1552, "tokens_per_second_per_gpu": 17603.14 }, { "epoch": 0.038825, "grad_norm": 0.62109375, "learning_rate": 0.002328, "loss": 3.4928, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1553, "tokens_per_second_per_gpu": 13551.34 }, { "epoch": 0.03885, "grad_norm": 0.58984375, "learning_rate": 0.0023295, "loss": 3.5124, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1554, "tokens_per_second_per_gpu": 15479.75 }, { "epoch": 0.038875, "grad_norm": 0.48046875, "learning_rate": 0.002331, "loss": 3.5124, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1555, "tokens_per_second_per_gpu": 18888.81 }, { "epoch": 0.0389, "grad_norm": 0.400390625, "learning_rate": 0.0023325, "loss": 3.4879, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1556, "tokens_per_second_per_gpu": 21046.06 }, { "epoch": 0.038925, "grad_norm": 0.388671875, "learning_rate": 0.002334, "loss": 3.4582, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1557, "tokens_per_second_per_gpu": 19942.92 }, { "epoch": 0.03895, "grad_norm": 0.419921875, "learning_rate": 0.0023355, "loss": 3.5412, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1558, "tokens_per_second_per_gpu": 28217.0 }, { "epoch": 0.038975, "grad_norm": 0.427734375, "learning_rate": 0.002337, "loss": 3.4588, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1559, "tokens_per_second_per_gpu": 15820.3 }, { "epoch": 0.039, "grad_norm": 0.349609375, "learning_rate": 0.0023385, "loss": 3.4924, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1560, "tokens_per_second_per_gpu": 16934.2 }, { "epoch": 0.039025, "grad_norm": 0.376953125, "learning_rate": 0.00234, "loss": 3.47, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1561, "tokens_per_second_per_gpu": 27068.53 }, { "epoch": 0.03905, "grad_norm": 0.341796875, "learning_rate": 0.0023415, "loss": 3.4837, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1562, "tokens_per_second_per_gpu": 12560.33 }, { "epoch": 0.039075, "grad_norm": 0.39453125, "learning_rate": 0.002343, "loss": 3.4459, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1563, "tokens_per_second_per_gpu": 17863.81 }, { "epoch": 0.0391, "grad_norm": 0.5546875, "learning_rate": 0.0023445, "loss": 3.4623, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1564, "tokens_per_second_per_gpu": 16997.3 }, { "epoch": 0.039125, "grad_norm": 0.5234375, "learning_rate": 0.002346, "loss": 3.4047, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1565, "tokens_per_second_per_gpu": 18001.77 }, { "epoch": 0.03915, "grad_norm": 0.4296875, "learning_rate": 0.0023474999999999998, "loss": 3.4568, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1566, "tokens_per_second_per_gpu": 23991.34 }, { "epoch": 0.039175, "grad_norm": 0.421875, "learning_rate": 0.002349, "loss": 3.4402, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1567, "tokens_per_second_per_gpu": 17945.49 }, { "epoch": 0.0392, "grad_norm": 0.56640625, "learning_rate": 0.0023505, "loss": 3.5191, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1568, "tokens_per_second_per_gpu": 20598.78 }, { "epoch": 0.039225, "grad_norm": 0.546875, "learning_rate": 0.002352, "loss": 3.4753, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.62, "memory/max_allocated (GiB)": 66.62, "step": 1569, "tokens_per_second_per_gpu": 19871.0 }, { "epoch": 0.03925, "grad_norm": 0.40234375, "learning_rate": 0.0023535, "loss": 3.5314, "memory/device_reserved (GiB)": 76.5, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1570, "tokens_per_second_per_gpu": 17011.15 }, { "epoch": 0.039275, "grad_norm": 0.31640625, "learning_rate": 0.0023550000000000003, "loss": 3.473, "memory/device_reserved (GiB)": 106.43, "memory/max_active (GiB)": 106.39, "memory/max_allocated (GiB)": 106.39, "step": 1571, "tokens_per_second_per_gpu": 12909.41 }, { "epoch": 0.0393, "grad_norm": 0.3515625, "learning_rate": 0.0023565, "loss": 3.4762, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1572, "tokens_per_second_per_gpu": 11845.91 }, { "epoch": 0.039325, "grad_norm": 0.44140625, "learning_rate": 0.0023580000000000003, "loss": 3.4611, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1573, "tokens_per_second_per_gpu": 11744.2 }, { "epoch": 0.03935, "grad_norm": 0.365234375, "learning_rate": 0.0023595, "loss": 3.4996, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1574, "tokens_per_second_per_gpu": 12723.32 }, { "epoch": 0.039375, "grad_norm": 0.31640625, "learning_rate": 0.0023610000000000003, "loss": 3.4751, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1575, "tokens_per_second_per_gpu": 17002.24 }, { "epoch": 0.0394, "grad_norm": 0.263671875, "learning_rate": 0.0023625, "loss": 3.4561, "memory/device_reserved (GiB)": 65.6, "memory/max_active (GiB)": 65.59, "memory/max_allocated (GiB)": 65.59, "step": 1576, "tokens_per_second_per_gpu": 20240.17 }, { "epoch": 0.039425, "grad_norm": 0.279296875, "learning_rate": 0.002364, "loss": 3.4415, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1577, "tokens_per_second_per_gpu": 17916.55 }, { "epoch": 0.03945, "grad_norm": 0.30078125, "learning_rate": 0.0023655, "loss": 3.445, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1578, "tokens_per_second_per_gpu": 12687.97 }, { "epoch": 0.039475, "grad_norm": 0.38671875, "learning_rate": 0.002367, "loss": 3.4384, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1579, "tokens_per_second_per_gpu": 25421.46 }, { "epoch": 0.0395, "grad_norm": 0.484375, "learning_rate": 0.0023685, "loss": 3.459, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1580, "tokens_per_second_per_gpu": 22546.57 }, { "epoch": 0.039525, "grad_norm": 0.546875, "learning_rate": 0.00237, "loss": 3.4654, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1581, "tokens_per_second_per_gpu": 20141.36 }, { "epoch": 0.03955, "grad_norm": 0.466796875, "learning_rate": 0.0023715, "loss": 3.4915, "memory/device_reserved (GiB)": 55.57, "memory/max_active (GiB)": 55.42, "memory/max_allocated (GiB)": 55.42, "step": 1582, "tokens_per_second_per_gpu": 22420.42 }, { "epoch": 0.039575, "grad_norm": 0.328125, "learning_rate": 0.002373, "loss": 3.4765, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1583, "tokens_per_second_per_gpu": 14095.85 }, { "epoch": 0.0396, "grad_norm": 0.3984375, "learning_rate": 0.0023745, "loss": 3.4737, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1584, "tokens_per_second_per_gpu": 15514.73 }, { "epoch": 0.039625, "grad_norm": 0.404296875, "learning_rate": 0.002376, "loss": 3.4759, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 1585, "tokens_per_second_per_gpu": 12849.41 }, { "epoch": 0.03965, "grad_norm": 0.37109375, "learning_rate": 0.0023775, "loss": 3.4362, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1586, "tokens_per_second_per_gpu": 13089.47 }, { "epoch": 0.039675, "grad_norm": 0.4296875, "learning_rate": 0.002379, "loss": 3.4331, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1587, "tokens_per_second_per_gpu": 22946.66 }, { "epoch": 0.0397, "grad_norm": 0.44921875, "learning_rate": 0.0023805, "loss": 3.4679, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1588, "tokens_per_second_per_gpu": 17487.01 }, { "epoch": 0.039725, "grad_norm": 0.482421875, "learning_rate": 0.002382, "loss": 3.4699, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1589, "tokens_per_second_per_gpu": 10994.62 }, { "epoch": 0.03975, "grad_norm": 0.50390625, "learning_rate": 0.0023835, "loss": 3.4734, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1590, "tokens_per_second_per_gpu": 18289.69 }, { "epoch": 0.039775, "grad_norm": 0.4296875, "learning_rate": 0.002385, "loss": 3.4553, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1591, "tokens_per_second_per_gpu": 13825.49 }, { "epoch": 0.0398, "grad_norm": 0.498046875, "learning_rate": 0.0023865, "loss": 3.4363, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1592, "tokens_per_second_per_gpu": 22341.43 }, { "epoch": 0.039825, "grad_norm": 0.498046875, "learning_rate": 0.0023880000000000004, "loss": 3.474, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1593, "tokens_per_second_per_gpu": 20192.44 }, { "epoch": 0.03985, "grad_norm": 0.5234375, "learning_rate": 0.0023895, "loss": 3.4676, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1594, "tokens_per_second_per_gpu": 15125.67 }, { "epoch": 0.039875, "grad_norm": 0.48046875, "learning_rate": 0.0023910000000000003, "loss": 3.4492, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1595, "tokens_per_second_per_gpu": 13952.21 }, { "epoch": 0.0399, "grad_norm": 0.474609375, "learning_rate": 0.0023925, "loss": 3.513, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1596, "tokens_per_second_per_gpu": 20800.77 }, { "epoch": 0.039925, "grad_norm": 0.498046875, "learning_rate": 0.0023940000000000003, "loss": 3.4598, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1597, "tokens_per_second_per_gpu": 23814.2 }, { "epoch": 0.03995, "grad_norm": 0.41015625, "learning_rate": 0.0023955, "loss": 3.4759, "memory/device_reserved (GiB)": 85.82, "memory/max_active (GiB)": 85.68, "memory/max_allocated (GiB)": 85.68, "step": 1598, "tokens_per_second_per_gpu": 15088.56 }, { "epoch": 0.039975, "grad_norm": 0.462890625, "learning_rate": 0.0023970000000000003, "loss": 3.4686, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1599, "tokens_per_second_per_gpu": 27082.92 }, { "epoch": 0.04, "grad_norm": 0.53515625, "learning_rate": 0.0023985, "loss": 3.4804, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1600, "tokens_per_second_per_gpu": 12031.87 }, { "epoch": 0.040025, "grad_norm": 0.431640625, "learning_rate": 0.0024000000000000002, "loss": 3.4425, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1601, "tokens_per_second_per_gpu": 15837.83 }, { "epoch": 0.04005, "grad_norm": 0.49609375, "learning_rate": 0.0024015, "loss": 3.4657, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1602, "tokens_per_second_per_gpu": 16144.09 }, { "epoch": 0.040075, "grad_norm": 0.5546875, "learning_rate": 0.002403, "loss": 3.4979, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1603, "tokens_per_second_per_gpu": 15848.46 }, { "epoch": 0.0401, "grad_norm": 0.478515625, "learning_rate": 0.0024045, "loss": 3.4807, "memory/device_reserved (GiB)": 96.22, "memory/max_active (GiB)": 96.19, "memory/max_allocated (GiB)": 96.19, "step": 1604, "tokens_per_second_per_gpu": 14136.87 }, { "epoch": 0.040125, "grad_norm": 0.2158203125, "learning_rate": 0.002406, "loss": 3.4584, "memory/device_reserved (GiB)": 106.61, "memory/max_active (GiB)": 106.43, "memory/max_allocated (GiB)": 106.43, "step": 1605, "tokens_per_second_per_gpu": 12098.17 }, { "epoch": 0.04015, "grad_norm": 0.3828125, "learning_rate": 0.0024075, "loss": 3.4939, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1606, "tokens_per_second_per_gpu": 13997.28 }, { "epoch": 0.040175, "grad_norm": 0.376953125, "learning_rate": 0.002409, "loss": 3.4807, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1607, "tokens_per_second_per_gpu": 15028.75 }, { "epoch": 0.0402, "grad_norm": 0.48046875, "learning_rate": 0.0024105, "loss": 3.4622, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1608, "tokens_per_second_per_gpu": 20265.75 }, { "epoch": 0.040225, "grad_norm": 0.4765625, "learning_rate": 0.002412, "loss": 3.4482, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1609, "tokens_per_second_per_gpu": 15102.78 }, { "epoch": 0.04025, "grad_norm": 0.61328125, "learning_rate": 0.0024135, "loss": 3.4781, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1610, "tokens_per_second_per_gpu": 14823.11 }, { "epoch": 0.040275, "grad_norm": 0.625, "learning_rate": 0.002415, "loss": 3.4686, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1611, "tokens_per_second_per_gpu": 23148.24 }, { "epoch": 0.0403, "grad_norm": 0.5625, "learning_rate": 0.0024165000000000002, "loss": 3.4665, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1612, "tokens_per_second_per_gpu": 15439.18 }, { "epoch": 0.040325, "grad_norm": 0.56640625, "learning_rate": 0.002418, "loss": 3.4718, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1613, "tokens_per_second_per_gpu": 16189.41 }, { "epoch": 0.04035, "grad_norm": 0.46875, "learning_rate": 0.0024195, "loss": 3.5027, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1614, "tokens_per_second_per_gpu": 18394.53 }, { "epoch": 0.040375, "grad_norm": 0.388671875, "learning_rate": 0.0024210000000000004, "loss": 3.4931, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1615, "tokens_per_second_per_gpu": 23895.62 }, { "epoch": 0.0404, "grad_norm": 0.412109375, "learning_rate": 0.0024225, "loss": 3.4385, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1616, "tokens_per_second_per_gpu": 24598.08 }, { "epoch": 0.040425, "grad_norm": 0.390625, "learning_rate": 0.0024240000000000004, "loss": 3.4905, "memory/device_reserved (GiB)": 55.57, "memory/max_active (GiB)": 55.42, "memory/max_allocated (GiB)": 55.42, "step": 1617, "tokens_per_second_per_gpu": 24013.93 }, { "epoch": 0.04045, "grad_norm": 0.44140625, "learning_rate": 0.0024255, "loss": 3.4878, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1618, "tokens_per_second_per_gpu": 21628.74 }, { "epoch": 0.040475, "grad_norm": 0.375, "learning_rate": 0.0024270000000000003, "loss": 3.4842, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1619, "tokens_per_second_per_gpu": 13740.61 }, { "epoch": 0.0405, "grad_norm": 0.396484375, "learning_rate": 0.0024285, "loss": 3.4546, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1620, "tokens_per_second_per_gpu": 10505.39 }, { "epoch": 0.040525, "grad_norm": 0.54296875, "learning_rate": 0.0024300000000000003, "loss": 3.4532, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1621, "tokens_per_second_per_gpu": 14010.82 }, { "epoch": 0.04055, "grad_norm": 0.44140625, "learning_rate": 0.0024315, "loss": 3.4622, "memory/device_reserved (GiB)": 86.71, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1622, "tokens_per_second_per_gpu": 15600.27 }, { "epoch": 0.040575, "grad_norm": 0.328125, "learning_rate": 0.0024330000000000003, "loss": 3.4374, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1623, "tokens_per_second_per_gpu": 23962.38 }, { "epoch": 0.0406, "grad_norm": 0.486328125, "learning_rate": 0.0024345, "loss": 3.4678, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1624, "tokens_per_second_per_gpu": 11110.47 }, { "epoch": 0.040625, "grad_norm": 0.41796875, "learning_rate": 0.0024360000000000002, "loss": 3.433, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1625, "tokens_per_second_per_gpu": 28676.3 }, { "epoch": 0.04065, "grad_norm": 0.5, "learning_rate": 0.0024375, "loss": 3.4638, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1626, "tokens_per_second_per_gpu": 12804.39 }, { "epoch": 0.040675, "grad_norm": 0.59375, "learning_rate": 0.0024389999999999998, "loss": 3.4699, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1627, "tokens_per_second_per_gpu": 28573.53 }, { "epoch": 0.0407, "grad_norm": 0.72265625, "learning_rate": 0.0024405, "loss": 3.4996, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1628, "tokens_per_second_per_gpu": 10458.89 }, { "epoch": 0.040725, "grad_norm": 0.71484375, "learning_rate": 0.0024419999999999997, "loss": 3.4523, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1629, "tokens_per_second_per_gpu": 24753.83 }, { "epoch": 0.04075, "grad_norm": 0.486328125, "learning_rate": 0.0024435, "loss": 3.502, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1630, "tokens_per_second_per_gpu": 12038.92 }, { "epoch": 0.040775, "grad_norm": 0.45703125, "learning_rate": 0.0024449999999999997, "loss": 3.4451, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1631, "tokens_per_second_per_gpu": 28333.75 }, { "epoch": 0.0408, "grad_norm": 0.6640625, "learning_rate": 0.0024465, "loss": 3.4622, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1632, "tokens_per_second_per_gpu": 10859.27 }, { "epoch": 0.040825, "grad_norm": 0.9609375, "learning_rate": 0.002448, "loss": 3.5624, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1633, "tokens_per_second_per_gpu": 16038.7 }, { "epoch": 0.04085, "grad_norm": 0.92578125, "learning_rate": 0.0024495, "loss": 3.5544, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1634, "tokens_per_second_per_gpu": 11856.2 }, { "epoch": 0.040875, "grad_norm": 0.83203125, "learning_rate": 0.002451, "loss": 3.5095, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1635, "tokens_per_second_per_gpu": 26486.63 }, { "epoch": 0.0409, "grad_norm": 0.53125, "learning_rate": 0.0024525000000000003, "loss": 3.5103, "memory/device_reserved (GiB)": 86.21, "memory/max_active (GiB)": 86.02, "memory/max_allocated (GiB)": 86.02, "step": 1636, "tokens_per_second_per_gpu": 15558.25 }, { "epoch": 0.040925, "grad_norm": 0.6796875, "learning_rate": 0.002454, "loss": 3.5592, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1637, "tokens_per_second_per_gpu": 13368.45 }, { "epoch": 0.04095, "grad_norm": 0.65234375, "learning_rate": 0.0024555000000000002, "loss": 3.5235, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1638, "tokens_per_second_per_gpu": 12681.15 }, { "epoch": 0.040975, "grad_norm": 0.6328125, "learning_rate": 0.002457, "loss": 3.518, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1639, "tokens_per_second_per_gpu": 11755.22 }, { "epoch": 0.041, "grad_norm": 0.78515625, "learning_rate": 0.0024585, "loss": 3.5068, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1640, "tokens_per_second_per_gpu": 24181.71 }, { "epoch": 0.041025, "grad_norm": 0.7578125, "learning_rate": 0.00246, "loss": 3.5537, "memory/device_reserved (GiB)": 64.28, "memory/max_active (GiB)": 64.24, "memory/max_allocated (GiB)": 64.24, "step": 1641, "tokens_per_second_per_gpu": 19813.17 }, { "epoch": 0.04105, "grad_norm": 0.50390625, "learning_rate": 0.0024615, "loss": 3.4945, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1642, "tokens_per_second_per_gpu": 17963.63 }, { "epoch": 0.041075, "grad_norm": 0.5390625, "learning_rate": 0.002463, "loss": 3.49, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1643, "tokens_per_second_per_gpu": 17191.81 }, { "epoch": 0.0411, "grad_norm": 0.5234375, "learning_rate": 0.0024645, "loss": 3.4985, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1644, "tokens_per_second_per_gpu": 15627.01 }, { "epoch": 0.041125, "grad_norm": 0.458984375, "learning_rate": 0.002466, "loss": 3.4606, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1645, "tokens_per_second_per_gpu": 13447.72 }, { "epoch": 0.04115, "grad_norm": 0.384765625, "learning_rate": 0.0024675, "loss": 3.4865, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1646, "tokens_per_second_per_gpu": 16562.44 }, { "epoch": 0.041175, "grad_norm": 0.30078125, "learning_rate": 0.002469, "loss": 3.452, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1647, "tokens_per_second_per_gpu": 19882.36 }, { "epoch": 0.0412, "grad_norm": 0.3125, "learning_rate": 0.0024705, "loss": 3.482, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1648, "tokens_per_second_per_gpu": 17699.48 }, { "epoch": 0.041225, "grad_norm": 0.275390625, "learning_rate": 0.002472, "loss": 3.4162, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1649, "tokens_per_second_per_gpu": 20101.43 }, { "epoch": 0.04125, "grad_norm": 0.26953125, "learning_rate": 0.0024735, "loss": 3.4138, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1650, "tokens_per_second_per_gpu": 27146.19 }, { "epoch": 0.041275, "grad_norm": 0.234375, "learning_rate": 0.0024749999999999998, "loss": 3.4321, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1651, "tokens_per_second_per_gpu": 19709.27 }, { "epoch": 0.0413, "grad_norm": 0.259765625, "learning_rate": 0.0024765, "loss": 3.477, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1652, "tokens_per_second_per_gpu": 31305.36 }, { "epoch": 0.041325, "grad_norm": 0.2333984375, "learning_rate": 0.0024779999999999997, "loss": 3.41, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1653, "tokens_per_second_per_gpu": 23790.99 }, { "epoch": 0.04135, "grad_norm": 0.294921875, "learning_rate": 0.0024795, "loss": 3.438, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1654, "tokens_per_second_per_gpu": 13756.67 }, { "epoch": 0.041375, "grad_norm": 0.361328125, "learning_rate": 0.002481, "loss": 3.4131, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1655, "tokens_per_second_per_gpu": 12021.3 }, { "epoch": 0.0414, "grad_norm": 0.314453125, "learning_rate": 0.0024825, "loss": 3.452, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1656, "tokens_per_second_per_gpu": 17313.17 }, { "epoch": 0.041425, "grad_norm": 0.2734375, "learning_rate": 0.002484, "loss": 3.4499, "memory/device_reserved (GiB)": 75.99, "memory/max_active (GiB)": 75.82, "memory/max_allocated (GiB)": 75.82, "step": 1657, "tokens_per_second_per_gpu": 18143.79 }, { "epoch": 0.04145, "grad_norm": 0.3203125, "learning_rate": 0.0024855000000000003, "loss": 3.4232, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1658, "tokens_per_second_per_gpu": 22650.06 }, { "epoch": 0.041475, "grad_norm": 0.294921875, "learning_rate": 0.002487, "loss": 3.4367, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1659, "tokens_per_second_per_gpu": 19148.84 }, { "epoch": 0.0415, "grad_norm": 0.337890625, "learning_rate": 0.0024885000000000003, "loss": 3.4237, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1660, "tokens_per_second_per_gpu": 11299.52 }, { "epoch": 0.041525, "grad_norm": 0.376953125, "learning_rate": 0.00249, "loss": 3.4072, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1661, "tokens_per_second_per_gpu": 10692.7 }, { "epoch": 0.04155, "grad_norm": 0.392578125, "learning_rate": 0.0024915000000000002, "loss": 3.4118, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1662, "tokens_per_second_per_gpu": 14572.68 }, { "epoch": 0.041575, "grad_norm": 0.32421875, "learning_rate": 0.002493, "loss": 3.3948, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1663, "tokens_per_second_per_gpu": 20532.19 }, { "epoch": 0.0416, "grad_norm": 0.28125, "learning_rate": 0.0024945, "loss": 3.373, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1664, "tokens_per_second_per_gpu": 16522.28 }, { "epoch": 0.041625, "grad_norm": 0.306640625, "learning_rate": 0.002496, "loss": 3.4468, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1665, "tokens_per_second_per_gpu": 23946.77 }, { "epoch": 0.04165, "grad_norm": 0.296875, "learning_rate": 0.0024975, "loss": 3.4283, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1666, "tokens_per_second_per_gpu": 19907.08 }, { "epoch": 0.041675, "grad_norm": 0.28125, "learning_rate": 0.002499, "loss": 3.4222, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1667, "tokens_per_second_per_gpu": 12874.36 }, { "epoch": 0.0417, "grad_norm": 0.3671875, "learning_rate": 0.0025005, "loss": 3.4831, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1668, "tokens_per_second_per_gpu": 20630.17 }, { "epoch": 0.041725, "grad_norm": 0.44140625, "learning_rate": 0.002502, "loss": 3.4484, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1669, "tokens_per_second_per_gpu": 20443.85 }, { "epoch": 0.04175, "grad_norm": 0.4921875, "learning_rate": 0.0025035, "loss": 3.411, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1670, "tokens_per_second_per_gpu": 10552.13 }, { "epoch": 0.041775, "grad_norm": 0.4609375, "learning_rate": 0.002505, "loss": 3.4083, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1671, "tokens_per_second_per_gpu": 12234.78 }, { "epoch": 0.0418, "grad_norm": 0.40234375, "learning_rate": 0.0025065, "loss": 3.4449, "memory/device_reserved (GiB)": 55.57, "memory/max_active (GiB)": 55.42, "memory/max_allocated (GiB)": 55.42, "step": 1672, "tokens_per_second_per_gpu": 23493.66 }, { "epoch": 0.041825, "grad_norm": 0.326171875, "learning_rate": 0.002508, "loss": 3.422, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1673, "tokens_per_second_per_gpu": 13667.7 }, { "epoch": 0.04185, "grad_norm": 0.474609375, "learning_rate": 0.0025095, "loss": 3.4849, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1674, "tokens_per_second_per_gpu": 13409.66 }, { "epoch": 0.041875, "grad_norm": 0.51953125, "learning_rate": 0.0025109999999999998, "loss": 3.4117, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1675, "tokens_per_second_per_gpu": 12861.87 }, { "epoch": 0.0419, "grad_norm": 0.5859375, "learning_rate": 0.0025125, "loss": 3.4487, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1676, "tokens_per_second_per_gpu": 13391.9 }, { "epoch": 0.041925, "grad_norm": 0.45703125, "learning_rate": 0.0025139999999999997, "loss": 3.4691, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1677, "tokens_per_second_per_gpu": 13697.51 }, { "epoch": 0.04195, "grad_norm": 0.458984375, "learning_rate": 0.0025155, "loss": 3.4354, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1678, "tokens_per_second_per_gpu": 11489.94 }, { "epoch": 0.041975, "grad_norm": 0.41796875, "learning_rate": 0.002517, "loss": 3.4734, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1679, "tokens_per_second_per_gpu": 10921.96 }, { "epoch": 0.042, "grad_norm": 0.376953125, "learning_rate": 0.0025185000000000003, "loss": 3.4138, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1680, "tokens_per_second_per_gpu": 14813.73 }, { "epoch": 0.042025, "grad_norm": 0.357421875, "learning_rate": 0.00252, "loss": 3.395, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1681, "tokens_per_second_per_gpu": 22947.66 }, { "epoch": 0.04205, "grad_norm": 0.375, "learning_rate": 0.0025215000000000003, "loss": 3.4281, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1682, "tokens_per_second_per_gpu": 11994.21 }, { "epoch": 0.042075, "grad_norm": 0.37109375, "learning_rate": 0.002523, "loss": 3.4589, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1683, "tokens_per_second_per_gpu": 15257.0 }, { "epoch": 0.0421, "grad_norm": 0.34375, "learning_rate": 0.0025245000000000003, "loss": 3.4429, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 1684, "tokens_per_second_per_gpu": 27548.71 }, { "epoch": 0.042125, "grad_norm": 0.4140625, "learning_rate": 0.002526, "loss": 3.4058, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1685, "tokens_per_second_per_gpu": 17574.15 }, { "epoch": 0.04215, "grad_norm": 0.3984375, "learning_rate": 0.0025275000000000002, "loss": 3.4312, "memory/device_reserved (GiB)": 74.93, "memory/max_active (GiB)": 74.91, "memory/max_allocated (GiB)": 74.91, "step": 1686, "tokens_per_second_per_gpu": 18549.39 }, { "epoch": 0.042175, "grad_norm": 0.294921875, "learning_rate": 0.002529, "loss": 3.4163, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1687, "tokens_per_second_per_gpu": 15868.69 }, { "epoch": 0.0422, "grad_norm": 0.416015625, "learning_rate": 0.0025305, "loss": 3.4385, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1688, "tokens_per_second_per_gpu": 16224.91 }, { "epoch": 0.042225, "grad_norm": 0.41796875, "learning_rate": 0.002532, "loss": 3.4211, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1689, "tokens_per_second_per_gpu": 14969.11 }, { "epoch": 0.04225, "grad_norm": 0.3203125, "learning_rate": 0.0025335, "loss": 3.3951, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1690, "tokens_per_second_per_gpu": 17850.22 }, { "epoch": 0.042275, "grad_norm": 0.41015625, "learning_rate": 0.002535, "loss": 3.4191, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1691, "tokens_per_second_per_gpu": 15729.04 }, { "epoch": 0.0423, "grad_norm": 0.439453125, "learning_rate": 0.0025365, "loss": 3.4192, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1692, "tokens_per_second_per_gpu": 15578.61 }, { "epoch": 0.042325, "grad_norm": 0.326171875, "learning_rate": 0.002538, "loss": 3.4293, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1693, "tokens_per_second_per_gpu": 21717.81 }, { "epoch": 0.04235, "grad_norm": 0.294921875, "learning_rate": 0.0025395, "loss": 3.4162, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1694, "tokens_per_second_per_gpu": 23759.26 }, { "epoch": 0.042375, "grad_norm": 0.376953125, "learning_rate": 0.002541, "loss": 3.4058, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1695, "tokens_per_second_per_gpu": 22957.98 }, { "epoch": 0.0424, "grad_norm": 0.361328125, "learning_rate": 0.0025425, "loss": 3.4236, "memory/device_reserved (GiB)": 85.82, "memory/max_active (GiB)": 85.68, "memory/max_allocated (GiB)": 85.68, "step": 1696, "tokens_per_second_per_gpu": 15174.02 }, { "epoch": 0.042425, "grad_norm": 0.353515625, "learning_rate": 0.002544, "loss": 3.3949, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1697, "tokens_per_second_per_gpu": 17562.99 }, { "epoch": 0.04245, "grad_norm": 0.3671875, "learning_rate": 0.0025455, "loss": 3.4254, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1698, "tokens_per_second_per_gpu": 22963.27 }, { "epoch": 0.042475, "grad_norm": 0.3671875, "learning_rate": 0.002547, "loss": 3.4004, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1699, "tokens_per_second_per_gpu": 11758.64 }, { "epoch": 0.0425, "grad_norm": 0.37109375, "learning_rate": 0.0025485, "loss": 3.4032, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1700, "tokens_per_second_per_gpu": 23581.93 }, { "epoch": 0.042525, "grad_norm": 0.44921875, "learning_rate": 0.00255, "loss": 3.4364, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1701, "tokens_per_second_per_gpu": 10589.65 }, { "epoch": 0.04255, "grad_norm": 0.4921875, "learning_rate": 0.0025515, "loss": 3.4531, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1702, "tokens_per_second_per_gpu": 13398.45 }, { "epoch": 0.042575, "grad_norm": 0.5546875, "learning_rate": 0.002553, "loss": 3.4716, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1703, "tokens_per_second_per_gpu": 17913.65 }, { "epoch": 0.0426, "grad_norm": 0.486328125, "learning_rate": 0.0025545000000000003, "loss": 3.4539, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1704, "tokens_per_second_per_gpu": 13482.41 }, { "epoch": 0.042625, "grad_norm": 0.458984375, "learning_rate": 0.002556, "loss": 3.4376, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 1705, "tokens_per_second_per_gpu": 20009.22 }, { "epoch": 0.04265, "grad_norm": 0.458984375, "learning_rate": 0.0025575000000000003, "loss": 3.4079, "memory/device_reserved (GiB)": 56.63, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1706, "tokens_per_second_per_gpu": 22731.28 }, { "epoch": 0.042675, "grad_norm": 0.38671875, "learning_rate": 0.002559, "loss": 3.4195, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1707, "tokens_per_second_per_gpu": 18695.79 }, { "epoch": 0.0427, "grad_norm": 0.39453125, "learning_rate": 0.0025605000000000003, "loss": 3.3978, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1708, "tokens_per_second_per_gpu": 10495.74 }, { "epoch": 0.042725, "grad_norm": 0.416015625, "learning_rate": 0.002562, "loss": 3.3919, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1709, "tokens_per_second_per_gpu": 15384.14 }, { "epoch": 0.04275, "grad_norm": 0.5078125, "learning_rate": 0.0025635000000000002, "loss": 3.4465, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1710, "tokens_per_second_per_gpu": 21356.58 }, { "epoch": 0.042775, "grad_norm": 0.423828125, "learning_rate": 0.002565, "loss": 3.408, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1711, "tokens_per_second_per_gpu": 13459.02 }, { "epoch": 0.0428, "grad_norm": 0.421875, "learning_rate": 0.0025665, "loss": 3.4107, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1712, "tokens_per_second_per_gpu": 15576.53 }, { "epoch": 0.042825, "grad_norm": 0.466796875, "learning_rate": 0.002568, "loss": 3.4465, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1713, "tokens_per_second_per_gpu": 24181.26 }, { "epoch": 0.04285, "grad_norm": 0.55859375, "learning_rate": 0.0025695, "loss": 3.4621, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1714, "tokens_per_second_per_gpu": 27803.98 }, { "epoch": 0.042875, "grad_norm": 0.57421875, "learning_rate": 0.002571, "loss": 3.4206, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1715, "tokens_per_second_per_gpu": 24012.74 }, { "epoch": 0.0429, "grad_norm": 0.490234375, "learning_rate": 0.0025725, "loss": 3.4463, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1716, "tokens_per_second_per_gpu": 12538.54 }, { "epoch": 0.042925, "grad_norm": 0.412109375, "learning_rate": 0.002574, "loss": 3.4565, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1717, "tokens_per_second_per_gpu": 19965.96 }, { "epoch": 0.04295, "grad_norm": 0.392578125, "learning_rate": 0.0025755, "loss": 3.4361, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1718, "tokens_per_second_per_gpu": 12689.62 }, { "epoch": 0.042975, "grad_norm": 0.38671875, "learning_rate": 0.002577, "loss": 3.4358, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1719, "tokens_per_second_per_gpu": 15987.55 }, { "epoch": 0.043, "grad_norm": 0.431640625, "learning_rate": 0.0025785, "loss": 3.4481, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1720, "tokens_per_second_per_gpu": 23525.6 }, { "epoch": 0.043025, "grad_norm": 0.4453125, "learning_rate": 0.00258, "loss": 3.4401, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1721, "tokens_per_second_per_gpu": 10768.58 }, { "epoch": 0.04305, "grad_norm": 0.44921875, "learning_rate": 0.0025815, "loss": 3.4029, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1722, "tokens_per_second_per_gpu": 14518.37 }, { "epoch": 0.043075, "grad_norm": 0.384765625, "learning_rate": 0.0025830000000000002, "loss": 3.3824, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1723, "tokens_per_second_per_gpu": 19566.23 }, { "epoch": 0.0431, "grad_norm": 0.27734375, "learning_rate": 0.0025845, "loss": 3.4756, "memory/device_reserved (GiB)": 96.93, "memory/max_active (GiB)": 96.75, "memory/max_allocated (GiB)": 96.75, "step": 1724, "tokens_per_second_per_gpu": 13699.55 }, { "epoch": 0.043125, "grad_norm": 0.255859375, "learning_rate": 0.002586, "loss": 3.4392, "memory/device_reserved (GiB)": 55.41, "memory/max_active (GiB)": 55.39, "memory/max_allocated (GiB)": 55.39, "step": 1725, "tokens_per_second_per_gpu": 23270.79 }, { "epoch": 0.04315, "grad_norm": 0.2421875, "learning_rate": 0.0025875000000000004, "loss": 3.4289, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1726, "tokens_per_second_per_gpu": 20476.82 }, { "epoch": 0.043175, "grad_norm": 0.244140625, "learning_rate": 0.002589, "loss": 3.4447, "memory/device_reserved (GiB)": 75.99, "memory/max_active (GiB)": 75.82, "memory/max_allocated (GiB)": 75.82, "step": 1727, "tokens_per_second_per_gpu": 17731.33 }, { "epoch": 0.0432, "grad_norm": 0.24609375, "learning_rate": 0.0025905000000000004, "loss": 3.4028, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1728, "tokens_per_second_per_gpu": 10811.24 }, { "epoch": 0.043225, "grad_norm": 0.306640625, "learning_rate": 0.002592, "loss": 3.4164, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1729, "tokens_per_second_per_gpu": 13945.48 }, { "epoch": 0.04325, "grad_norm": 0.369140625, "learning_rate": 0.0025935000000000003, "loss": 3.3548, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1730, "tokens_per_second_per_gpu": 19830.86 }, { "epoch": 0.043275, "grad_norm": 0.37890625, "learning_rate": 0.002595, "loss": 3.421, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1731, "tokens_per_second_per_gpu": 10542.1 }, { "epoch": 0.0433, "grad_norm": 0.38671875, "learning_rate": 0.0025965000000000003, "loss": 3.4064, "memory/device_reserved (GiB)": 56.14, "memory/max_active (GiB)": 55.95, "memory/max_allocated (GiB)": 55.95, "step": 1732, "tokens_per_second_per_gpu": 21749.89 }, { "epoch": 0.043325, "grad_norm": 0.2255859375, "learning_rate": 0.002598, "loss": 3.4061, "memory/device_reserved (GiB)": 85.15, "memory/max_active (GiB)": 85.11, "memory/max_allocated (GiB)": 85.11, "step": 1733, "tokens_per_second_per_gpu": 15894.06 }, { "epoch": 0.04335, "grad_norm": 0.30859375, "learning_rate": 0.0025995000000000002, "loss": 3.3912, "memory/device_reserved (GiB)": 66.35, "memory/max_active (GiB)": 66.15, "memory/max_allocated (GiB)": 66.15, "step": 1734, "tokens_per_second_per_gpu": 20235.34 }, { "epoch": 0.043375, "grad_norm": 0.28125, "learning_rate": 0.002601, "loss": 3.3853, "memory/device_reserved (GiB)": 44.36, "memory/max_active (GiB)": 44.31, "memory/max_allocated (GiB)": 44.31, "step": 1735, "tokens_per_second_per_gpu": 27550.53 }, { "epoch": 0.0434, "grad_norm": 0.251953125, "learning_rate": 0.0026025, "loss": 3.4352, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1736, "tokens_per_second_per_gpu": 11236.12 }, { "epoch": 0.043425, "grad_norm": 0.404296875, "learning_rate": 0.002604, "loss": 3.4176, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1737, "tokens_per_second_per_gpu": 10840.44 }, { "epoch": 0.04345, "grad_norm": 0.546875, "learning_rate": 0.0026055, "loss": 3.4038, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1738, "tokens_per_second_per_gpu": 17339.64 }, { "epoch": 0.043475, "grad_norm": 0.578125, "learning_rate": 0.002607, "loss": 3.4502, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1739, "tokens_per_second_per_gpu": 10609.71 }, { "epoch": 0.0435, "grad_norm": 0.53125, "learning_rate": 0.0026085, "loss": 3.4514, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1740, "tokens_per_second_per_gpu": 14979.82 }, { "epoch": 0.043525, "grad_norm": 0.46484375, "learning_rate": 0.00261, "loss": 3.461, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1741, "tokens_per_second_per_gpu": 13526.38 }, { "epoch": 0.04355, "grad_norm": 0.44921875, "learning_rate": 0.0026115, "loss": 3.4299, "memory/device_reserved (GiB)": 64.72, "memory/max_active (GiB)": 64.71, "memory/max_allocated (GiB)": 64.71, "step": 1742, "tokens_per_second_per_gpu": 20033.73 }, { "epoch": 0.043575, "grad_norm": 0.40234375, "learning_rate": 0.002613, "loss": 3.4074, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1743, "tokens_per_second_per_gpu": 13919.14 }, { "epoch": 0.0436, "grad_norm": 0.337890625, "learning_rate": 0.0026145, "loss": 3.4226, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1744, "tokens_per_second_per_gpu": 10914.18 }, { "epoch": 0.043625, "grad_norm": 0.34375, "learning_rate": 0.002616, "loss": 3.4013, "memory/device_reserved (GiB)": 46.39, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1745, "tokens_per_second_per_gpu": 26513.04 }, { "epoch": 0.04365, "grad_norm": 0.400390625, "learning_rate": 0.0026175, "loss": 3.4318, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1746, "tokens_per_second_per_gpu": 12843.87 }, { "epoch": 0.043675, "grad_norm": 0.4140625, "learning_rate": 0.0026190000000000002, "loss": 3.4096, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1747, "tokens_per_second_per_gpu": 15880.59 }, { "epoch": 0.0437, "grad_norm": 0.50390625, "learning_rate": 0.0026205000000000004, "loss": 3.4245, "memory/device_reserved (GiB)": 127.55, "memory/max_active (GiB)": 127.35, "memory/max_allocated (GiB)": 127.35, "step": 1748, "tokens_per_second_per_gpu": 10692.67 }, { "epoch": 0.043725, "grad_norm": 0.474609375, "learning_rate": 0.002622, "loss": 3.4249, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1749, "tokens_per_second_per_gpu": 19357.34 }, { "epoch": 0.04375, "grad_norm": 0.423828125, "learning_rate": 0.0026235000000000004, "loss": 3.391, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1750, "tokens_per_second_per_gpu": 23852.2 }, { "epoch": 0.043775, "grad_norm": 0.30859375, "learning_rate": 0.002625, "loss": 3.4351, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1751, "tokens_per_second_per_gpu": 13356.15 }, { "epoch": 0.0438, "grad_norm": 0.25390625, "learning_rate": 0.0026265, "loss": 3.4141, "memory/device_reserved (GiB)": 97.44, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1752, "tokens_per_second_per_gpu": 13330.25 }, { "epoch": 0.043825, "grad_norm": 0.2314453125, "learning_rate": 0.002628, "loss": 3.3987, "memory/device_reserved (GiB)": 96.42, "memory/max_active (GiB)": 96.23, "memory/max_allocated (GiB)": 96.23, "step": 1753, "tokens_per_second_per_gpu": 14470.45 }, { "epoch": 0.04385, "grad_norm": 0.30078125, "learning_rate": 0.0026295, "loss": 3.4109, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1754, "tokens_per_second_per_gpu": 14973.02 }, { "epoch": 0.043875, "grad_norm": 0.314453125, "learning_rate": 0.002631, "loss": 3.4064, "memory/device_reserved (GiB)": 106.26, "memory/max_active (GiB)": 106.08, "memory/max_allocated (GiB)": 106.08, "step": 1755, "tokens_per_second_per_gpu": 12857.47 }, { "epoch": 0.0439, "grad_norm": 0.333984375, "learning_rate": 0.0026325, "loss": 3.4352, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1756, "tokens_per_second_per_gpu": 15689.65 }, { "epoch": 0.043925, "grad_norm": 0.2333984375, "learning_rate": 0.002634, "loss": 3.4027, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1757, "tokens_per_second_per_gpu": 21923.57 }, { "epoch": 0.04395, "grad_norm": 0.197265625, "learning_rate": 0.0026355, "loss": 3.426, "memory/device_reserved (GiB)": 96.42, "memory/max_active (GiB)": 96.22, "memory/max_allocated (GiB)": 96.22, "step": 1758, "tokens_per_second_per_gpu": 13418.51 }, { "epoch": 0.043975, "grad_norm": 0.2578125, "learning_rate": 0.002637, "loss": 3.3894, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1759, "tokens_per_second_per_gpu": 24491.74 }, { "epoch": 0.044, "grad_norm": 0.423828125, "learning_rate": 0.0026385, "loss": 3.4275, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1760, "tokens_per_second_per_gpu": 11247.5 }, { "epoch": 0.044025, "grad_norm": 0.61328125, "learning_rate": 0.00264, "loss": 3.4444, "memory/device_reserved (GiB)": 35.22, "memory/max_active (GiB)": 35.02, "memory/max_allocated (GiB)": 35.02, "step": 1761, "tokens_per_second_per_gpu": 33971.64 }, { "epoch": 0.04405, "grad_norm": 0.55859375, "learning_rate": 0.0026414999999999998, "loss": 3.4196, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1762, "tokens_per_second_per_gpu": 16207.95 }, { "epoch": 0.044075, "grad_norm": 0.474609375, "learning_rate": 0.002643, "loss": 3.4154, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1763, "tokens_per_second_per_gpu": 20758.11 }, { "epoch": 0.0441, "grad_norm": 0.57421875, "learning_rate": 0.0026444999999999997, "loss": 3.4227, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1764, "tokens_per_second_per_gpu": 20933.38 }, { "epoch": 0.044125, "grad_norm": 0.49609375, "learning_rate": 0.002646, "loss": 3.4113, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1765, "tokens_per_second_per_gpu": 20328.8 }, { "epoch": 0.04415, "grad_norm": 0.56640625, "learning_rate": 0.0026475, "loss": 3.424, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1766, "tokens_per_second_per_gpu": 28885.39 }, { "epoch": 0.044175, "grad_norm": 0.60546875, "learning_rate": 0.002649, "loss": 3.4592, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1767, "tokens_per_second_per_gpu": 20234.49 }, { "epoch": 0.0442, "grad_norm": 0.55078125, "learning_rate": 0.0026505, "loss": 3.4609, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1768, "tokens_per_second_per_gpu": 15912.34 }, { "epoch": 0.044225, "grad_norm": 0.466796875, "learning_rate": 0.0026520000000000003, "loss": 3.4629, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1769, "tokens_per_second_per_gpu": 12854.59 }, { "epoch": 0.04425, "grad_norm": 0.49609375, "learning_rate": 0.0026535, "loss": 3.4224, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1770, "tokens_per_second_per_gpu": 23600.25 }, { "epoch": 0.044275, "grad_norm": 0.421875, "learning_rate": 0.0026550000000000002, "loss": 3.4245, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1771, "tokens_per_second_per_gpu": 14240.32 }, { "epoch": 0.0443, "grad_norm": 0.2197265625, "learning_rate": 0.0026565, "loss": 3.4746, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.42, "memory/max_allocated (GiB)": 107.42, "step": 1772, "tokens_per_second_per_gpu": 12415.51 }, { "epoch": 0.044325, "grad_norm": 0.3046875, "learning_rate": 0.002658, "loss": 3.4487, "memory/device_reserved (GiB)": 117.82, "memory/max_active (GiB)": 117.63, "memory/max_allocated (GiB)": 117.63, "step": 1773, "tokens_per_second_per_gpu": 11728.76 }, { "epoch": 0.04435, "grad_norm": 0.380859375, "learning_rate": 0.0026595, "loss": 3.428, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1774, "tokens_per_second_per_gpu": 13483.5 }, { "epoch": 0.044375, "grad_norm": 0.32421875, "learning_rate": 0.002661, "loss": 3.4394, "memory/device_reserved (GiB)": 66.8, "memory/max_active (GiB)": 66.63, "memory/max_allocated (GiB)": 66.63, "step": 1775, "tokens_per_second_per_gpu": 18870.34 }, { "epoch": 0.0444, "grad_norm": 0.255859375, "learning_rate": 0.0026625, "loss": 3.4378, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1776, "tokens_per_second_per_gpu": 13952.93 }, { "epoch": 0.044425, "grad_norm": 0.2275390625, "learning_rate": 0.002664, "loss": 3.3772, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1777, "tokens_per_second_per_gpu": 27786.47 }, { "epoch": 0.04445, "grad_norm": 0.1669921875, "learning_rate": 0.0026655, "loss": 3.4545, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.22, "memory/max_allocated (GiB)": 97.22, "step": 1778, "tokens_per_second_per_gpu": 13063.98 }, { "epoch": 0.044475, "grad_norm": 0.271484375, "learning_rate": 0.002667, "loss": 3.4308, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1779, "tokens_per_second_per_gpu": 15197.1 }, { "epoch": 0.0445, "grad_norm": 0.333984375, "learning_rate": 0.0026685, "loss": 3.3732, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1780, "tokens_per_second_per_gpu": 13364.89 }, { "epoch": 0.044525, "grad_norm": 0.482421875, "learning_rate": 0.00267, "loss": 3.3969, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1781, "tokens_per_second_per_gpu": 10848.07 }, { "epoch": 0.04455, "grad_norm": 0.46484375, "learning_rate": 0.0026715, "loss": 3.4136, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1782, "tokens_per_second_per_gpu": 26226.1 }, { "epoch": 0.044575, "grad_norm": 0.40234375, "learning_rate": 0.002673, "loss": 3.4063, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1783, "tokens_per_second_per_gpu": 16623.15 }, { "epoch": 0.0446, "grad_norm": 0.4453125, "learning_rate": 0.0026745, "loss": 3.4083, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1784, "tokens_per_second_per_gpu": 17625.86 }, { "epoch": 0.044625, "grad_norm": 0.51953125, "learning_rate": 0.002676, "loss": 3.4065, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1785, "tokens_per_second_per_gpu": 17692.48 }, { "epoch": 0.04465, "grad_norm": 0.45703125, "learning_rate": 0.0026774999999999998, "loss": 3.3926, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1786, "tokens_per_second_per_gpu": 17734.6 }, { "epoch": 0.044675, "grad_norm": 0.4140625, "learning_rate": 0.002679, "loss": 3.4451, "memory/device_reserved (GiB)": 107.61, "memory/max_active (GiB)": 107.43, "memory/max_allocated (GiB)": 107.43, "step": 1787, "tokens_per_second_per_gpu": 12253.75 }, { "epoch": 0.0447, "grad_norm": 0.37109375, "learning_rate": 0.0026804999999999997, "loss": 3.4092, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1788, "tokens_per_second_per_gpu": 22035.77 }, { "epoch": 0.044725, "grad_norm": 0.365234375, "learning_rate": 0.002682, "loss": 3.4072, "memory/device_reserved (GiB)": 46.36, "memory/max_active (GiB)": 46.22, "memory/max_allocated (GiB)": 46.22, "step": 1789, "tokens_per_second_per_gpu": 28799.13 }, { "epoch": 0.04475, "grad_norm": 0.486328125, "learning_rate": 0.0026835, "loss": 3.4273, "memory/device_reserved (GiB)": 107.12, "memory/max_active (GiB)": 106.95, "memory/max_allocated (GiB)": 106.95, "step": 1790, "tokens_per_second_per_gpu": 13507.67 }, { "epoch": 0.044775, "grad_norm": 0.41796875, "learning_rate": 0.0026850000000000003, "loss": 3.4105, "memory/device_reserved (GiB)": 97.42, "memory/max_active (GiB)": 97.23, "memory/max_allocated (GiB)": 97.23, "step": 1791, "tokens_per_second_per_gpu": 14054.94 }, { "epoch": 0.0448, "grad_norm": 0.39453125, "learning_rate": 0.0026865, "loss": 3.4125, "memory/device_reserved (GiB)": 87.21, "memory/max_active (GiB)": 87.03, "memory/max_allocated (GiB)": 87.03, "step": 1792, "tokens_per_second_per_gpu": 15951.81 }, { "epoch": 0.044825, "grad_norm": 0.455078125, "learning_rate": 0.0026880000000000003, "loss": 3.4148, "memory/device_reserved (GiB)": 117.34, "memory/max_active (GiB)": 117.15, "memory/max_allocated (GiB)": 117.15, "step": 1793, "tokens_per_second_per_gpu": 11286.11 }, { "epoch": 0.04485, "grad_norm": 0.47265625, "learning_rate": 0.0026895, "loss": 3.4305, "memory/device_reserved (GiB)": 86.7, "memory/max_active (GiB)": 86.55, "memory/max_allocated (GiB)": 86.55, "step": 1794, "tokens_per_second_per_gpu": 15414.34 }, { "epoch": 0.044875, "grad_norm": 0.49609375, "learning_rate": 0.0026910000000000002, "loss": 3.4523, "memory/device_reserved (GiB)": 56.57, "memory/max_active (GiB)": 56.42, "memory/max_allocated (GiB)": 56.42, "step": 1795, "tokens_per_second_per_gpu": 23738.61 }, { "epoch": 0.0449, "grad_norm": 0.416015625, "learning_rate": 0.0026925, "loss": 3.4288, "memory/device_reserved (GiB)": 45.93, "memory/max_active (GiB)": 45.75, "memory/max_allocated (GiB)": 45.75, "step": 1796, "tokens_per_second_per_gpu": 29115.65 }, { "epoch": 0.044925, "grad_norm": 0.55859375, "learning_rate": 0.002694, "loss": 3.4467, "memory/device_reserved (GiB)": 76.49, "memory/max_active (GiB)": 76.35, "memory/max_allocated (GiB)": 76.35, "step": 1797, "tokens_per_second_per_gpu": 18356.21 }, { "epoch": 0.04495, "grad_norm": 0.42578125, "learning_rate": 0.0026955, "loss": 3.4082, "memory/device_reserved (GiB)": 86.21, "memory/max_active (GiB)": 86.02, "memory/max_allocated (GiB)": 86.02, "step": 1798, "tokens_per_second_per_gpu": 15680.13 }, { "epoch": 0.044975, "grad_norm": 0.376953125, "learning_rate": 0.002697, "loss": 3.3861, "memory/device_reserved (GiB)": 77.01, "memory/max_active (GiB)": 76.83, "memory/max_allocated (GiB)": 76.83, "step": 1799, "tokens_per_second_per_gpu": 17597.13 }, { "epoch": 0.045, "grad_norm": 0.4140625, "learning_rate": 0.0026985, "loss": 3.4615, "memory/device_reserved (GiB)": 127.96, "memory/max_active (GiB)": 127.83, "memory/max_allocated (GiB)": 127.83, "step": 1800, "tokens_per_second_per_gpu": 10181.44 } ], "logging_steps": 1, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1172492665891062e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }