Upload folder using huggingface_hub
Browse files- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state_0.pth +1 -1
- rng_state_1.pth +1 -1
- rng_state_2.pth +1 -1
- rng_state_3.pth +1 -1
- rng_state_4.pth +1 -1
- rng_state_5.pth +1 -1
- rng_state_6.pth +1 -1
- rng_state_7.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +2203 -3
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 505408136
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5ad63b79c6f5a0ce02ab80d656cc75c3f40065ca1444e1eeb8a989304b610fa
|
| 3 |
size 505408136
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1010874315
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b686e15846fe98d3e289254d43ec627262be81d3ccfd84694780cdc2d857d26d
|
| 3 |
size 1010874315
|
rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16389
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9daa68d1a810813b1abdd9f201531e73b7d4e041dcc3fad23284f8fcf4b91d24
|
| 3 |
size 16389
|
rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16389
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d4cf6d3addd42425edfaae3634d0333d62c162775e97e30e6dbbb03fd74dd6f
|
| 3 |
size 16389
|
rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16389
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b56ba980c9ffbdf2bb60baefdb68afe18fcb829a6a07f61c928a35c3737bd1d4
|
| 3 |
size 16389
|
rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16389
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7dc66401aa5093553b2517ab0a1356729053e4b424c30bd8d4bf1f904c790981
|
| 3 |
size 16389
|
rng_state_4.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16389
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71bf5ab9e423d6e1f39e134e259bf1988b5193894d9efa9209c9ed124671b440
|
| 3 |
size 16389
|
rng_state_5.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16389
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b3bbe1c81dcd5d4dad45c273f16941ad3a079fae1884c8c592be9c19cf695f6
|
| 3 |
size 16389
|
rng_state_6.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16389
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3979078d5678a98b0350d1d45af82cb4e5ee17b169ebd5301f8daf30c0a3debe
|
| 3 |
size 16389
|
rng_state_7.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16389
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2672dccf4e80588003012e60bca5c978b07507e61cd0e44e8d1bce97be4a1ebb
|
| 3 |
size 16389
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:709037943547f0e738a8d6f42c84d14097097b4e323fa2f1aad3fb147909ac3e
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -17608,6 +17608,2206 @@
|
|
| 17608 |
"memory/max_allocated (GiB)": 107.43,
|
| 17609 |
"step": 1600,
|
| 17610 |
"tokens_per_second_per_gpu": 12031.87
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17611 |
}
|
| 17612 |
],
|
| 17613 |
"logging_steps": 1,
|
|
@@ -17627,7 +19827,7 @@
|
|
| 17627 |
"attributes": {}
|
| 17628 |
}
|
| 17629 |
},
|
| 17630 |
-
"total_flos":
|
| 17631 |
"train_batch_size": 1,
|
| 17632 |
"trial_name": null,
|
| 17633 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.045,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 1800,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 17608 |
"memory/max_allocated (GiB)": 107.43,
|
| 17609 |
"step": 1600,
|
| 17610 |
"tokens_per_second_per_gpu": 12031.87
|
| 17611 |
+
},
|
| 17612 |
+
{
|
| 17613 |
+
"epoch": 0.040025,
|
| 17614 |
+
"grad_norm": 0.431640625,
|
| 17615 |
+
"learning_rate": 0.0024000000000000002,
|
| 17616 |
+
"loss": 3.4425,
|
| 17617 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 17618 |
+
"memory/max_active (GiB)": 87.03,
|
| 17619 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 17620 |
+
"step": 1601,
|
| 17621 |
+
"tokens_per_second_per_gpu": 15837.83
|
| 17622 |
+
},
|
| 17623 |
+
{
|
| 17624 |
+
"epoch": 0.04005,
|
| 17625 |
+
"grad_norm": 0.49609375,
|
| 17626 |
+
"learning_rate": 0.0024015,
|
| 17627 |
+
"loss": 3.4657,
|
| 17628 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 17629 |
+
"memory/max_active (GiB)": 87.03,
|
| 17630 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 17631 |
+
"step": 1602,
|
| 17632 |
+
"tokens_per_second_per_gpu": 16144.09
|
| 17633 |
+
},
|
| 17634 |
+
{
|
| 17635 |
+
"epoch": 0.040075,
|
| 17636 |
+
"grad_norm": 0.5546875,
|
| 17637 |
+
"learning_rate": 0.002403,
|
| 17638 |
+
"loss": 3.4979,
|
| 17639 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 17640 |
+
"memory/max_active (GiB)": 87.03,
|
| 17641 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 17642 |
+
"step": 1603,
|
| 17643 |
+
"tokens_per_second_per_gpu": 15848.46
|
| 17644 |
+
},
|
| 17645 |
+
{
|
| 17646 |
+
"epoch": 0.0401,
|
| 17647 |
+
"grad_norm": 0.478515625,
|
| 17648 |
+
"learning_rate": 0.0024045,
|
| 17649 |
+
"loss": 3.4807,
|
| 17650 |
+
"memory/device_reserved (GiB)": 96.22,
|
| 17651 |
+
"memory/max_active (GiB)": 96.19,
|
| 17652 |
+
"memory/max_allocated (GiB)": 96.19,
|
| 17653 |
+
"step": 1604,
|
| 17654 |
+
"tokens_per_second_per_gpu": 14136.87
|
| 17655 |
+
},
|
| 17656 |
+
{
|
| 17657 |
+
"epoch": 0.040125,
|
| 17658 |
+
"grad_norm": 0.2158203125,
|
| 17659 |
+
"learning_rate": 0.002406,
|
| 17660 |
+
"loss": 3.4584,
|
| 17661 |
+
"memory/device_reserved (GiB)": 106.61,
|
| 17662 |
+
"memory/max_active (GiB)": 106.43,
|
| 17663 |
+
"memory/max_allocated (GiB)": 106.43,
|
| 17664 |
+
"step": 1605,
|
| 17665 |
+
"tokens_per_second_per_gpu": 12098.17
|
| 17666 |
+
},
|
| 17667 |
+
{
|
| 17668 |
+
"epoch": 0.04015,
|
| 17669 |
+
"grad_norm": 0.3828125,
|
| 17670 |
+
"learning_rate": 0.0024075,
|
| 17671 |
+
"loss": 3.4939,
|
| 17672 |
+
"memory/device_reserved (GiB)": 96.93,
|
| 17673 |
+
"memory/max_active (GiB)": 96.75,
|
| 17674 |
+
"memory/max_allocated (GiB)": 96.75,
|
| 17675 |
+
"step": 1606,
|
| 17676 |
+
"tokens_per_second_per_gpu": 13997.28
|
| 17677 |
+
},
|
| 17678 |
+
{
|
| 17679 |
+
"epoch": 0.040175,
|
| 17680 |
+
"grad_norm": 0.376953125,
|
| 17681 |
+
"learning_rate": 0.002409,
|
| 17682 |
+
"loss": 3.4807,
|
| 17683 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 17684 |
+
"memory/max_active (GiB)": 97.23,
|
| 17685 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 17686 |
+
"step": 1607,
|
| 17687 |
+
"tokens_per_second_per_gpu": 15028.75
|
| 17688 |
+
},
|
| 17689 |
+
{
|
| 17690 |
+
"epoch": 0.0402,
|
| 17691 |
+
"grad_norm": 0.48046875,
|
| 17692 |
+
"learning_rate": 0.0024105,
|
| 17693 |
+
"loss": 3.4622,
|
| 17694 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 17695 |
+
"memory/max_active (GiB)": 66.63,
|
| 17696 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 17697 |
+
"step": 1608,
|
| 17698 |
+
"tokens_per_second_per_gpu": 20265.75
|
| 17699 |
+
},
|
| 17700 |
+
{
|
| 17701 |
+
"epoch": 0.040225,
|
| 17702 |
+
"grad_norm": 0.4765625,
|
| 17703 |
+
"learning_rate": 0.002412,
|
| 17704 |
+
"loss": 3.4482,
|
| 17705 |
+
"memory/device_reserved (GiB)": 86.7,
|
| 17706 |
+
"memory/max_active (GiB)": 86.55,
|
| 17707 |
+
"memory/max_allocated (GiB)": 86.55,
|
| 17708 |
+
"step": 1609,
|
| 17709 |
+
"tokens_per_second_per_gpu": 15102.78
|
| 17710 |
+
},
|
| 17711 |
+
{
|
| 17712 |
+
"epoch": 0.04025,
|
| 17713 |
+
"grad_norm": 0.61328125,
|
| 17714 |
+
"learning_rate": 0.0024135,
|
| 17715 |
+
"loss": 3.4781,
|
| 17716 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 17717 |
+
"memory/max_active (GiB)": 87.03,
|
| 17718 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 17719 |
+
"step": 1610,
|
| 17720 |
+
"tokens_per_second_per_gpu": 14823.11
|
| 17721 |
+
},
|
| 17722 |
+
{
|
| 17723 |
+
"epoch": 0.040275,
|
| 17724 |
+
"grad_norm": 0.625,
|
| 17725 |
+
"learning_rate": 0.002415,
|
| 17726 |
+
"loss": 3.4686,
|
| 17727 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 17728 |
+
"memory/max_active (GiB)": 56.42,
|
| 17729 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 17730 |
+
"step": 1611,
|
| 17731 |
+
"tokens_per_second_per_gpu": 23148.24
|
| 17732 |
+
},
|
| 17733 |
+
{
|
| 17734 |
+
"epoch": 0.0403,
|
| 17735 |
+
"grad_norm": 0.5625,
|
| 17736 |
+
"learning_rate": 0.0024165000000000002,
|
| 17737 |
+
"loss": 3.4665,
|
| 17738 |
+
"memory/device_reserved (GiB)": 86.7,
|
| 17739 |
+
"memory/max_active (GiB)": 86.55,
|
| 17740 |
+
"memory/max_allocated (GiB)": 86.55,
|
| 17741 |
+
"step": 1612,
|
| 17742 |
+
"tokens_per_second_per_gpu": 15439.18
|
| 17743 |
+
},
|
| 17744 |
+
{
|
| 17745 |
+
"epoch": 0.040325,
|
| 17746 |
+
"grad_norm": 0.56640625,
|
| 17747 |
+
"learning_rate": 0.002418,
|
| 17748 |
+
"loss": 3.4718,
|
| 17749 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 17750 |
+
"memory/max_active (GiB)": 87.03,
|
| 17751 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 17752 |
+
"step": 1613,
|
| 17753 |
+
"tokens_per_second_per_gpu": 16189.41
|
| 17754 |
+
},
|
| 17755 |
+
{
|
| 17756 |
+
"epoch": 0.04035,
|
| 17757 |
+
"grad_norm": 0.46875,
|
| 17758 |
+
"learning_rate": 0.0024195,
|
| 17759 |
+
"loss": 3.5027,
|
| 17760 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 17761 |
+
"memory/max_active (GiB)": 66.63,
|
| 17762 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 17763 |
+
"step": 1614,
|
| 17764 |
+
"tokens_per_second_per_gpu": 18394.53
|
| 17765 |
+
},
|
| 17766 |
+
{
|
| 17767 |
+
"epoch": 0.040375,
|
| 17768 |
+
"grad_norm": 0.388671875,
|
| 17769 |
+
"learning_rate": 0.0024210000000000004,
|
| 17770 |
+
"loss": 3.4931,
|
| 17771 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 17772 |
+
"memory/max_active (GiB)": 56.42,
|
| 17773 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 17774 |
+
"step": 1615,
|
| 17775 |
+
"tokens_per_second_per_gpu": 23895.62
|
| 17776 |
+
},
|
| 17777 |
+
{
|
| 17778 |
+
"epoch": 0.0404,
|
| 17779 |
+
"grad_norm": 0.412109375,
|
| 17780 |
+
"learning_rate": 0.0024225,
|
| 17781 |
+
"loss": 3.4385,
|
| 17782 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 17783 |
+
"memory/max_active (GiB)": 56.42,
|
| 17784 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 17785 |
+
"step": 1616,
|
| 17786 |
+
"tokens_per_second_per_gpu": 24598.08
|
| 17787 |
+
},
|
| 17788 |
+
{
|
| 17789 |
+
"epoch": 0.040425,
|
| 17790 |
+
"grad_norm": 0.390625,
|
| 17791 |
+
"learning_rate": 0.0024240000000000004,
|
| 17792 |
+
"loss": 3.4905,
|
| 17793 |
+
"memory/device_reserved (GiB)": 55.57,
|
| 17794 |
+
"memory/max_active (GiB)": 55.42,
|
| 17795 |
+
"memory/max_allocated (GiB)": 55.42,
|
| 17796 |
+
"step": 1617,
|
| 17797 |
+
"tokens_per_second_per_gpu": 24013.93
|
| 17798 |
+
},
|
| 17799 |
+
{
|
| 17800 |
+
"epoch": 0.04045,
|
| 17801 |
+
"grad_norm": 0.44140625,
|
| 17802 |
+
"learning_rate": 0.0024255,
|
| 17803 |
+
"loss": 3.4878,
|
| 17804 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 17805 |
+
"memory/max_active (GiB)": 66.63,
|
| 17806 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 17807 |
+
"step": 1618,
|
| 17808 |
+
"tokens_per_second_per_gpu": 21628.74
|
| 17809 |
+
},
|
| 17810 |
+
{
|
| 17811 |
+
"epoch": 0.040475,
|
| 17812 |
+
"grad_norm": 0.375,
|
| 17813 |
+
"learning_rate": 0.0024270000000000003,
|
| 17814 |
+
"loss": 3.4842,
|
| 17815 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 17816 |
+
"memory/max_active (GiB)": 97.23,
|
| 17817 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 17818 |
+
"step": 1619,
|
| 17819 |
+
"tokens_per_second_per_gpu": 13740.61
|
| 17820 |
+
},
|
| 17821 |
+
{
|
| 17822 |
+
"epoch": 0.0405,
|
| 17823 |
+
"grad_norm": 0.396484375,
|
| 17824 |
+
"learning_rate": 0.0024285,
|
| 17825 |
+
"loss": 3.4546,
|
| 17826 |
+
"memory/device_reserved (GiB)": 127.55,
|
| 17827 |
+
"memory/max_active (GiB)": 127.35,
|
| 17828 |
+
"memory/max_allocated (GiB)": 127.35,
|
| 17829 |
+
"step": 1620,
|
| 17830 |
+
"tokens_per_second_per_gpu": 10505.39
|
| 17831 |
+
},
|
| 17832 |
+
{
|
| 17833 |
+
"epoch": 0.040525,
|
| 17834 |
+
"grad_norm": 0.54296875,
|
| 17835 |
+
"learning_rate": 0.0024300000000000003,
|
| 17836 |
+
"loss": 3.4532,
|
| 17837 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 17838 |
+
"memory/max_active (GiB)": 97.23,
|
| 17839 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 17840 |
+
"step": 1621,
|
| 17841 |
+
"tokens_per_second_per_gpu": 14010.82
|
| 17842 |
+
},
|
| 17843 |
+
{
|
| 17844 |
+
"epoch": 0.04055,
|
| 17845 |
+
"grad_norm": 0.44140625,
|
| 17846 |
+
"learning_rate": 0.0024315,
|
| 17847 |
+
"loss": 3.4622,
|
| 17848 |
+
"memory/device_reserved (GiB)": 86.71,
|
| 17849 |
+
"memory/max_active (GiB)": 86.55,
|
| 17850 |
+
"memory/max_allocated (GiB)": 86.55,
|
| 17851 |
+
"step": 1622,
|
| 17852 |
+
"tokens_per_second_per_gpu": 15600.27
|
| 17853 |
+
},
|
| 17854 |
+
{
|
| 17855 |
+
"epoch": 0.040575,
|
| 17856 |
+
"grad_norm": 0.328125,
|
| 17857 |
+
"learning_rate": 0.0024330000000000003,
|
| 17858 |
+
"loss": 3.4374,
|
| 17859 |
+
"memory/device_reserved (GiB)": 56.63,
|
| 17860 |
+
"memory/max_active (GiB)": 56.42,
|
| 17861 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 17862 |
+
"step": 1623,
|
| 17863 |
+
"tokens_per_second_per_gpu": 23962.38
|
| 17864 |
+
},
|
| 17865 |
+
{
|
| 17866 |
+
"epoch": 0.0406,
|
| 17867 |
+
"grad_norm": 0.486328125,
|
| 17868 |
+
"learning_rate": 0.0024345,
|
| 17869 |
+
"loss": 3.4678,
|
| 17870 |
+
"memory/device_reserved (GiB)": 117.82,
|
| 17871 |
+
"memory/max_active (GiB)": 117.63,
|
| 17872 |
+
"memory/max_allocated (GiB)": 117.63,
|
| 17873 |
+
"step": 1624,
|
| 17874 |
+
"tokens_per_second_per_gpu": 11110.47
|
| 17875 |
+
},
|
| 17876 |
+
{
|
| 17877 |
+
"epoch": 0.040625,
|
| 17878 |
+
"grad_norm": 0.41796875,
|
| 17879 |
+
"learning_rate": 0.0024360000000000002,
|
| 17880 |
+
"loss": 3.433,
|
| 17881 |
+
"memory/device_reserved (GiB)": 46.36,
|
| 17882 |
+
"memory/max_active (GiB)": 46.22,
|
| 17883 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 17884 |
+
"step": 1625,
|
| 17885 |
+
"tokens_per_second_per_gpu": 28676.3
|
| 17886 |
+
},
|
| 17887 |
+
{
|
| 17888 |
+
"epoch": 0.04065,
|
| 17889 |
+
"grad_norm": 0.5,
|
| 17890 |
+
"learning_rate": 0.0024375,
|
| 17891 |
+
"loss": 3.4638,
|
| 17892 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 17893 |
+
"memory/max_active (GiB)": 107.43,
|
| 17894 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 17895 |
+
"step": 1626,
|
| 17896 |
+
"tokens_per_second_per_gpu": 12804.39
|
| 17897 |
+
},
|
| 17898 |
+
{
|
| 17899 |
+
"epoch": 0.040675,
|
| 17900 |
+
"grad_norm": 0.59375,
|
| 17901 |
+
"learning_rate": 0.0024389999999999998,
|
| 17902 |
+
"loss": 3.4699,
|
| 17903 |
+
"memory/device_reserved (GiB)": 45.93,
|
| 17904 |
+
"memory/max_active (GiB)": 45.75,
|
| 17905 |
+
"memory/max_allocated (GiB)": 45.75,
|
| 17906 |
+
"step": 1627,
|
| 17907 |
+
"tokens_per_second_per_gpu": 28573.53
|
| 17908 |
+
},
|
| 17909 |
+
{
|
| 17910 |
+
"epoch": 0.0407,
|
| 17911 |
+
"grad_norm": 0.72265625,
|
| 17912 |
+
"learning_rate": 0.0024405,
|
| 17913 |
+
"loss": 3.4996,
|
| 17914 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 17915 |
+
"memory/max_active (GiB)": 127.83,
|
| 17916 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 17917 |
+
"step": 1628,
|
| 17918 |
+
"tokens_per_second_per_gpu": 10458.89
|
| 17919 |
+
},
|
| 17920 |
+
{
|
| 17921 |
+
"epoch": 0.040725,
|
| 17922 |
+
"grad_norm": 0.71484375,
|
| 17923 |
+
"learning_rate": 0.0024419999999999997,
|
| 17924 |
+
"loss": 3.4523,
|
| 17925 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 17926 |
+
"memory/max_active (GiB)": 56.42,
|
| 17927 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 17928 |
+
"step": 1629,
|
| 17929 |
+
"tokens_per_second_per_gpu": 24753.83
|
| 17930 |
+
},
|
| 17931 |
+
{
|
| 17932 |
+
"epoch": 0.04075,
|
| 17933 |
+
"grad_norm": 0.486328125,
|
| 17934 |
+
"learning_rate": 0.0024435,
|
| 17935 |
+
"loss": 3.502,
|
| 17936 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 17937 |
+
"memory/max_active (GiB)": 107.43,
|
| 17938 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 17939 |
+
"step": 1630,
|
| 17940 |
+
"tokens_per_second_per_gpu": 12038.92
|
| 17941 |
+
},
|
| 17942 |
+
{
|
| 17943 |
+
"epoch": 0.040775,
|
| 17944 |
+
"grad_norm": 0.45703125,
|
| 17945 |
+
"learning_rate": 0.0024449999999999997,
|
| 17946 |
+
"loss": 3.4451,
|
| 17947 |
+
"memory/device_reserved (GiB)": 45.93,
|
| 17948 |
+
"memory/max_active (GiB)": 45.75,
|
| 17949 |
+
"memory/max_allocated (GiB)": 45.75,
|
| 17950 |
+
"step": 1631,
|
| 17951 |
+
"tokens_per_second_per_gpu": 28333.75
|
| 17952 |
+
},
|
| 17953 |
+
{
|
| 17954 |
+
"epoch": 0.0408,
|
| 17955 |
+
"grad_norm": 0.6640625,
|
| 17956 |
+
"learning_rate": 0.0024465,
|
| 17957 |
+
"loss": 3.4622,
|
| 17958 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 17959 |
+
"memory/max_active (GiB)": 127.83,
|
| 17960 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 17961 |
+
"step": 1632,
|
| 17962 |
+
"tokens_per_second_per_gpu": 10859.27
|
| 17963 |
+
},
|
| 17964 |
+
{
|
| 17965 |
+
"epoch": 0.040825,
|
| 17966 |
+
"grad_norm": 0.9609375,
|
| 17967 |
+
"learning_rate": 0.002448,
|
| 17968 |
+
"loss": 3.5624,
|
| 17969 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 17970 |
+
"memory/max_active (GiB)": 87.03,
|
| 17971 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 17972 |
+
"step": 1633,
|
| 17973 |
+
"tokens_per_second_per_gpu": 16038.7
|
| 17974 |
+
},
|
| 17975 |
+
{
|
| 17976 |
+
"epoch": 0.04085,
|
| 17977 |
+
"grad_norm": 0.92578125,
|
| 17978 |
+
"learning_rate": 0.0024495,
|
| 17979 |
+
"loss": 3.5544,
|
| 17980 |
+
"memory/device_reserved (GiB)": 117.34,
|
| 17981 |
+
"memory/max_active (GiB)": 117.15,
|
| 17982 |
+
"memory/max_allocated (GiB)": 117.15,
|
| 17983 |
+
"step": 1634,
|
| 17984 |
+
"tokens_per_second_per_gpu": 11856.2
|
| 17985 |
+
},
|
| 17986 |
+
{
|
| 17987 |
+
"epoch": 0.040875,
|
| 17988 |
+
"grad_norm": 0.83203125,
|
| 17989 |
+
"learning_rate": 0.002451,
|
| 17990 |
+
"loss": 3.5095,
|
| 17991 |
+
"memory/device_reserved (GiB)": 46.39,
|
| 17992 |
+
"memory/max_active (GiB)": 46.22,
|
| 17993 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 17994 |
+
"step": 1635,
|
| 17995 |
+
"tokens_per_second_per_gpu": 26486.63
|
| 17996 |
+
},
|
| 17997 |
+
{
|
| 17998 |
+
"epoch": 0.0409,
|
| 17999 |
+
"grad_norm": 0.53125,
|
| 18000 |
+
"learning_rate": 0.0024525000000000003,
|
| 18001 |
+
"loss": 3.5103,
|
| 18002 |
+
"memory/device_reserved (GiB)": 86.21,
|
| 18003 |
+
"memory/max_active (GiB)": 86.02,
|
| 18004 |
+
"memory/max_allocated (GiB)": 86.02,
|
| 18005 |
+
"step": 1636,
|
| 18006 |
+
"tokens_per_second_per_gpu": 15558.25
|
| 18007 |
+
},
|
| 18008 |
+
{
|
| 18009 |
+
"epoch": 0.040925,
|
| 18010 |
+
"grad_norm": 0.6796875,
|
| 18011 |
+
"learning_rate": 0.002454,
|
| 18012 |
+
"loss": 3.5592,
|
| 18013 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 18014 |
+
"memory/max_active (GiB)": 107.43,
|
| 18015 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 18016 |
+
"step": 1637,
|
| 18017 |
+
"tokens_per_second_per_gpu": 13368.45
|
| 18018 |
+
},
|
| 18019 |
+
{
|
| 18020 |
+
"epoch": 0.04095,
|
| 18021 |
+
"grad_norm": 0.65234375,
|
| 18022 |
+
"learning_rate": 0.0024555000000000002,
|
| 18023 |
+
"loss": 3.5235,
|
| 18024 |
+
"memory/device_reserved (GiB)": 107.12,
|
| 18025 |
+
"memory/max_active (GiB)": 106.95,
|
| 18026 |
+
"memory/max_allocated (GiB)": 106.95,
|
| 18027 |
+
"step": 1638,
|
| 18028 |
+
"tokens_per_second_per_gpu": 12681.15
|
| 18029 |
+
},
|
| 18030 |
+
{
|
| 18031 |
+
"epoch": 0.040975,
|
| 18032 |
+
"grad_norm": 0.6328125,
|
| 18033 |
+
"learning_rate": 0.002457,
|
| 18034 |
+
"loss": 3.518,
|
| 18035 |
+
"memory/device_reserved (GiB)": 117.82,
|
| 18036 |
+
"memory/max_active (GiB)": 117.63,
|
| 18037 |
+
"memory/max_allocated (GiB)": 117.63,
|
| 18038 |
+
"step": 1639,
|
| 18039 |
+
"tokens_per_second_per_gpu": 11755.22
|
| 18040 |
+
},
|
| 18041 |
+
{
|
| 18042 |
+
"epoch": 0.041,
|
| 18043 |
+
"grad_norm": 0.78515625,
|
| 18044 |
+
"learning_rate": 0.0024585,
|
| 18045 |
+
"loss": 3.5068,
|
| 18046 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 18047 |
+
"memory/max_active (GiB)": 56.42,
|
| 18048 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18049 |
+
"step": 1640,
|
| 18050 |
+
"tokens_per_second_per_gpu": 24181.71
|
| 18051 |
+
},
|
| 18052 |
+
{
|
| 18053 |
+
"epoch": 0.041025,
|
| 18054 |
+
"grad_norm": 0.7578125,
|
| 18055 |
+
"learning_rate": 0.00246,
|
| 18056 |
+
"loss": 3.5537,
|
| 18057 |
+
"memory/device_reserved (GiB)": 64.28,
|
| 18058 |
+
"memory/max_active (GiB)": 64.24,
|
| 18059 |
+
"memory/max_allocated (GiB)": 64.24,
|
| 18060 |
+
"step": 1641,
|
| 18061 |
+
"tokens_per_second_per_gpu": 19813.17
|
| 18062 |
+
},
|
| 18063 |
+
{
|
| 18064 |
+
"epoch": 0.04105,
|
| 18065 |
+
"grad_norm": 0.50390625,
|
| 18066 |
+
"learning_rate": 0.0024615,
|
| 18067 |
+
"loss": 3.4945,
|
| 18068 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 18069 |
+
"memory/max_active (GiB)": 76.83,
|
| 18070 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 18071 |
+
"step": 1642,
|
| 18072 |
+
"tokens_per_second_per_gpu": 17963.63
|
| 18073 |
+
},
|
| 18074 |
+
{
|
| 18075 |
+
"epoch": 0.041075,
|
| 18076 |
+
"grad_norm": 0.5390625,
|
| 18077 |
+
"learning_rate": 0.002463,
|
| 18078 |
+
"loss": 3.49,
|
| 18079 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 18080 |
+
"memory/max_active (GiB)": 76.83,
|
| 18081 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 18082 |
+
"step": 1643,
|
| 18083 |
+
"tokens_per_second_per_gpu": 17191.81
|
| 18084 |
+
},
|
| 18085 |
+
{
|
| 18086 |
+
"epoch": 0.0411,
|
| 18087 |
+
"grad_norm": 0.5234375,
|
| 18088 |
+
"learning_rate": 0.0024645,
|
| 18089 |
+
"loss": 3.4985,
|
| 18090 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18091 |
+
"memory/max_active (GiB)": 87.03,
|
| 18092 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18093 |
+
"step": 1644,
|
| 18094 |
+
"tokens_per_second_per_gpu": 15627.01
|
| 18095 |
+
},
|
| 18096 |
+
{
|
| 18097 |
+
"epoch": 0.041125,
|
| 18098 |
+
"grad_norm": 0.458984375,
|
| 18099 |
+
"learning_rate": 0.002466,
|
| 18100 |
+
"loss": 3.4606,
|
| 18101 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 18102 |
+
"memory/max_active (GiB)": 107.43,
|
| 18103 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 18104 |
+
"step": 1645,
|
| 18105 |
+
"tokens_per_second_per_gpu": 13447.72
|
| 18106 |
+
},
|
| 18107 |
+
{
|
| 18108 |
+
"epoch": 0.04115,
|
| 18109 |
+
"grad_norm": 0.384765625,
|
| 18110 |
+
"learning_rate": 0.0024675,
|
| 18111 |
+
"loss": 3.4865,
|
| 18112 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18113 |
+
"memory/max_active (GiB)": 87.03,
|
| 18114 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18115 |
+
"step": 1646,
|
| 18116 |
+
"tokens_per_second_per_gpu": 16562.44
|
| 18117 |
+
},
|
| 18118 |
+
{
|
| 18119 |
+
"epoch": 0.041175,
|
| 18120 |
+
"grad_norm": 0.30078125,
|
| 18121 |
+
"learning_rate": 0.002469,
|
| 18122 |
+
"loss": 3.452,
|
| 18123 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18124 |
+
"memory/max_active (GiB)": 66.63,
|
| 18125 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18126 |
+
"step": 1647,
|
| 18127 |
+
"tokens_per_second_per_gpu": 19882.36
|
| 18128 |
+
},
|
| 18129 |
+
{
|
| 18130 |
+
"epoch": 0.0412,
|
| 18131 |
+
"grad_norm": 0.3125,
|
| 18132 |
+
"learning_rate": 0.0024705,
|
| 18133 |
+
"loss": 3.482,
|
| 18134 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 18135 |
+
"memory/max_active (GiB)": 76.83,
|
| 18136 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 18137 |
+
"step": 1648,
|
| 18138 |
+
"tokens_per_second_per_gpu": 17699.48
|
| 18139 |
+
},
|
| 18140 |
+
{
|
| 18141 |
+
"epoch": 0.041225,
|
| 18142 |
+
"grad_norm": 0.275390625,
|
| 18143 |
+
"learning_rate": 0.002472,
|
| 18144 |
+
"loss": 3.4162,
|
| 18145 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18146 |
+
"memory/max_active (GiB)": 66.63,
|
| 18147 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18148 |
+
"step": 1649,
|
| 18149 |
+
"tokens_per_second_per_gpu": 20101.43
|
| 18150 |
+
},
|
| 18151 |
+
{
|
| 18152 |
+
"epoch": 0.04125,
|
| 18153 |
+
"grad_norm": 0.26953125,
|
| 18154 |
+
"learning_rate": 0.0024735,
|
| 18155 |
+
"loss": 3.4138,
|
| 18156 |
+
"memory/device_reserved (GiB)": 46.39,
|
| 18157 |
+
"memory/max_active (GiB)": 46.22,
|
| 18158 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 18159 |
+
"step": 1650,
|
| 18160 |
+
"tokens_per_second_per_gpu": 27146.19
|
| 18161 |
+
},
|
| 18162 |
+
{
|
| 18163 |
+
"epoch": 0.041275,
|
| 18164 |
+
"grad_norm": 0.234375,
|
| 18165 |
+
"learning_rate": 0.0024749999999999998,
|
| 18166 |
+
"loss": 3.4321,
|
| 18167 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18168 |
+
"memory/max_active (GiB)": 66.63,
|
| 18169 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18170 |
+
"step": 1651,
|
| 18171 |
+
"tokens_per_second_per_gpu": 19709.27
|
| 18172 |
+
},
|
| 18173 |
+
{
|
| 18174 |
+
"epoch": 0.0413,
|
| 18175 |
+
"grad_norm": 0.259765625,
|
| 18176 |
+
"learning_rate": 0.0024765,
|
| 18177 |
+
"loss": 3.477,
|
| 18178 |
+
"memory/device_reserved (GiB)": 46.36,
|
| 18179 |
+
"memory/max_active (GiB)": 46.22,
|
| 18180 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 18181 |
+
"step": 1652,
|
| 18182 |
+
"tokens_per_second_per_gpu": 31305.36
|
| 18183 |
+
},
|
| 18184 |
+
{
|
| 18185 |
+
"epoch": 0.041325,
|
| 18186 |
+
"grad_norm": 0.2333984375,
|
| 18187 |
+
"learning_rate": 0.0024779999999999997,
|
| 18188 |
+
"loss": 3.41,
|
| 18189 |
+
"memory/device_reserved (GiB)": 56.14,
|
| 18190 |
+
"memory/max_active (GiB)": 55.95,
|
| 18191 |
+
"memory/max_allocated (GiB)": 55.95,
|
| 18192 |
+
"step": 1653,
|
| 18193 |
+
"tokens_per_second_per_gpu": 23790.99
|
| 18194 |
+
},
|
| 18195 |
+
{
|
| 18196 |
+
"epoch": 0.04135,
|
| 18197 |
+
"grad_norm": 0.294921875,
|
| 18198 |
+
"learning_rate": 0.0024795,
|
| 18199 |
+
"loss": 3.438,
|
| 18200 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 18201 |
+
"memory/max_active (GiB)": 97.23,
|
| 18202 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 18203 |
+
"step": 1654,
|
| 18204 |
+
"tokens_per_second_per_gpu": 13756.67
|
| 18205 |
+
},
|
| 18206 |
+
{
|
| 18207 |
+
"epoch": 0.041375,
|
| 18208 |
+
"grad_norm": 0.361328125,
|
| 18209 |
+
"learning_rate": 0.002481,
|
| 18210 |
+
"loss": 3.4131,
|
| 18211 |
+
"memory/device_reserved (GiB)": 117.82,
|
| 18212 |
+
"memory/max_active (GiB)": 117.63,
|
| 18213 |
+
"memory/max_allocated (GiB)": 117.63,
|
| 18214 |
+
"step": 1655,
|
| 18215 |
+
"tokens_per_second_per_gpu": 12021.3
|
| 18216 |
+
},
|
| 18217 |
+
{
|
| 18218 |
+
"epoch": 0.0414,
|
| 18219 |
+
"grad_norm": 0.314453125,
|
| 18220 |
+
"learning_rate": 0.0024825,
|
| 18221 |
+
"loss": 3.452,
|
| 18222 |
+
"memory/device_reserved (GiB)": 76.49,
|
| 18223 |
+
"memory/max_active (GiB)": 76.35,
|
| 18224 |
+
"memory/max_allocated (GiB)": 76.35,
|
| 18225 |
+
"step": 1656,
|
| 18226 |
+
"tokens_per_second_per_gpu": 17313.17
|
| 18227 |
+
},
|
| 18228 |
+
{
|
| 18229 |
+
"epoch": 0.041425,
|
| 18230 |
+
"grad_norm": 0.2734375,
|
| 18231 |
+
"learning_rate": 0.002484,
|
| 18232 |
+
"loss": 3.4499,
|
| 18233 |
+
"memory/device_reserved (GiB)": 75.99,
|
| 18234 |
+
"memory/max_active (GiB)": 75.82,
|
| 18235 |
+
"memory/max_allocated (GiB)": 75.82,
|
| 18236 |
+
"step": 1657,
|
| 18237 |
+
"tokens_per_second_per_gpu": 18143.79
|
| 18238 |
+
},
|
| 18239 |
+
{
|
| 18240 |
+
"epoch": 0.04145,
|
| 18241 |
+
"grad_norm": 0.3203125,
|
| 18242 |
+
"learning_rate": 0.0024855000000000003,
|
| 18243 |
+
"loss": 3.4232,
|
| 18244 |
+
"memory/device_reserved (GiB)": 56.63,
|
| 18245 |
+
"memory/max_active (GiB)": 56.42,
|
| 18246 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18247 |
+
"step": 1658,
|
| 18248 |
+
"tokens_per_second_per_gpu": 22650.06
|
| 18249 |
+
},
|
| 18250 |
+
{
|
| 18251 |
+
"epoch": 0.041475,
|
| 18252 |
+
"grad_norm": 0.294921875,
|
| 18253 |
+
"learning_rate": 0.002487,
|
| 18254 |
+
"loss": 3.4367,
|
| 18255 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18256 |
+
"memory/max_active (GiB)": 66.63,
|
| 18257 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18258 |
+
"step": 1659,
|
| 18259 |
+
"tokens_per_second_per_gpu": 19148.84
|
| 18260 |
+
},
|
| 18261 |
+
{
|
| 18262 |
+
"epoch": 0.0415,
|
| 18263 |
+
"grad_norm": 0.337890625,
|
| 18264 |
+
"learning_rate": 0.0024885000000000003,
|
| 18265 |
+
"loss": 3.4237,
|
| 18266 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 18267 |
+
"memory/max_active (GiB)": 127.83,
|
| 18268 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 18269 |
+
"step": 1660,
|
| 18270 |
+
"tokens_per_second_per_gpu": 11299.52
|
| 18271 |
+
},
|
| 18272 |
+
{
|
| 18273 |
+
"epoch": 0.041525,
|
| 18274 |
+
"grad_norm": 0.376953125,
|
| 18275 |
+
"learning_rate": 0.00249,
|
| 18276 |
+
"loss": 3.4072,
|
| 18277 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 18278 |
+
"memory/max_active (GiB)": 127.83,
|
| 18279 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 18280 |
+
"step": 1661,
|
| 18281 |
+
"tokens_per_second_per_gpu": 10692.7
|
| 18282 |
+
},
|
| 18283 |
+
{
|
| 18284 |
+
"epoch": 0.04155,
|
| 18285 |
+
"grad_norm": 0.392578125,
|
| 18286 |
+
"learning_rate": 0.0024915000000000002,
|
| 18287 |
+
"loss": 3.4118,
|
| 18288 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 18289 |
+
"memory/max_active (GiB)": 97.23,
|
| 18290 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 18291 |
+
"step": 1662,
|
| 18292 |
+
"tokens_per_second_per_gpu": 14572.68
|
| 18293 |
+
},
|
| 18294 |
+
{
|
| 18295 |
+
"epoch": 0.041575,
|
| 18296 |
+
"grad_norm": 0.32421875,
|
| 18297 |
+
"learning_rate": 0.002493,
|
| 18298 |
+
"loss": 3.3948,
|
| 18299 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18300 |
+
"memory/max_active (GiB)": 66.63,
|
| 18301 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18302 |
+
"step": 1663,
|
| 18303 |
+
"tokens_per_second_per_gpu": 20532.19
|
| 18304 |
+
},
|
| 18305 |
+
{
|
| 18306 |
+
"epoch": 0.0416,
|
| 18307 |
+
"grad_norm": 0.28125,
|
| 18308 |
+
"learning_rate": 0.0024945,
|
| 18309 |
+
"loss": 3.373,
|
| 18310 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 18311 |
+
"memory/max_active (GiB)": 76.83,
|
| 18312 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 18313 |
+
"step": 1664,
|
| 18314 |
+
"tokens_per_second_per_gpu": 16522.28
|
| 18315 |
+
},
|
| 18316 |
+
{
|
| 18317 |
+
"epoch": 0.041625,
|
| 18318 |
+
"grad_norm": 0.306640625,
|
| 18319 |
+
"learning_rate": 0.002496,
|
| 18320 |
+
"loss": 3.4468,
|
| 18321 |
+
"memory/device_reserved (GiB)": 56.14,
|
| 18322 |
+
"memory/max_active (GiB)": 55.95,
|
| 18323 |
+
"memory/max_allocated (GiB)": 55.95,
|
| 18324 |
+
"step": 1665,
|
| 18325 |
+
"tokens_per_second_per_gpu": 23946.77
|
| 18326 |
+
},
|
| 18327 |
+
{
|
| 18328 |
+
"epoch": 0.04165,
|
| 18329 |
+
"grad_norm": 0.296875,
|
| 18330 |
+
"learning_rate": 0.0024975,
|
| 18331 |
+
"loss": 3.4283,
|
| 18332 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18333 |
+
"memory/max_active (GiB)": 66.63,
|
| 18334 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18335 |
+
"step": 1666,
|
| 18336 |
+
"tokens_per_second_per_gpu": 19907.08
|
| 18337 |
+
},
|
| 18338 |
+
{
|
| 18339 |
+
"epoch": 0.041675,
|
| 18340 |
+
"grad_norm": 0.28125,
|
| 18341 |
+
"learning_rate": 0.002499,
|
| 18342 |
+
"loss": 3.4222,
|
| 18343 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 18344 |
+
"memory/max_active (GiB)": 107.43,
|
| 18345 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 18346 |
+
"step": 1667,
|
| 18347 |
+
"tokens_per_second_per_gpu": 12874.36
|
| 18348 |
+
},
|
| 18349 |
+
{
|
| 18350 |
+
"epoch": 0.0417,
|
| 18351 |
+
"grad_norm": 0.3671875,
|
| 18352 |
+
"learning_rate": 0.0025005,
|
| 18353 |
+
"loss": 3.4831,
|
| 18354 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18355 |
+
"memory/max_active (GiB)": 66.63,
|
| 18356 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18357 |
+
"step": 1668,
|
| 18358 |
+
"tokens_per_second_per_gpu": 20630.17
|
| 18359 |
+
},
|
| 18360 |
+
{
|
| 18361 |
+
"epoch": 0.041725,
|
| 18362 |
+
"grad_norm": 0.44140625,
|
| 18363 |
+
"learning_rate": 0.002502,
|
| 18364 |
+
"loss": 3.4484,
|
| 18365 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18366 |
+
"memory/max_active (GiB)": 66.63,
|
| 18367 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18368 |
+
"step": 1669,
|
| 18369 |
+
"tokens_per_second_per_gpu": 20443.85
|
| 18370 |
+
},
|
| 18371 |
+
{
|
| 18372 |
+
"epoch": 0.04175,
|
| 18373 |
+
"grad_norm": 0.4921875,
|
| 18374 |
+
"learning_rate": 0.0025035,
|
| 18375 |
+
"loss": 3.411,
|
| 18376 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 18377 |
+
"memory/max_active (GiB)": 127.83,
|
| 18378 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 18379 |
+
"step": 1670,
|
| 18380 |
+
"tokens_per_second_per_gpu": 10552.13
|
| 18381 |
+
},
|
| 18382 |
+
{
|
| 18383 |
+
"epoch": 0.041775,
|
| 18384 |
+
"grad_norm": 0.4609375,
|
| 18385 |
+
"learning_rate": 0.002505,
|
| 18386 |
+
"loss": 3.4083,
|
| 18387 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 18388 |
+
"memory/max_active (GiB)": 107.43,
|
| 18389 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 18390 |
+
"step": 1671,
|
| 18391 |
+
"tokens_per_second_per_gpu": 12234.78
|
| 18392 |
+
},
|
| 18393 |
+
{
|
| 18394 |
+
"epoch": 0.0418,
|
| 18395 |
+
"grad_norm": 0.40234375,
|
| 18396 |
+
"learning_rate": 0.0025065,
|
| 18397 |
+
"loss": 3.4449,
|
| 18398 |
+
"memory/device_reserved (GiB)": 55.57,
|
| 18399 |
+
"memory/max_active (GiB)": 55.42,
|
| 18400 |
+
"memory/max_allocated (GiB)": 55.42,
|
| 18401 |
+
"step": 1672,
|
| 18402 |
+
"tokens_per_second_per_gpu": 23493.66
|
| 18403 |
+
},
|
| 18404 |
+
{
|
| 18405 |
+
"epoch": 0.041825,
|
| 18406 |
+
"grad_norm": 0.326171875,
|
| 18407 |
+
"learning_rate": 0.002508,
|
| 18408 |
+
"loss": 3.422,
|
| 18409 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 18410 |
+
"memory/max_active (GiB)": 97.23,
|
| 18411 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 18412 |
+
"step": 1673,
|
| 18413 |
+
"tokens_per_second_per_gpu": 13667.7
|
| 18414 |
+
},
|
| 18415 |
+
{
|
| 18416 |
+
"epoch": 0.04185,
|
| 18417 |
+
"grad_norm": 0.474609375,
|
| 18418 |
+
"learning_rate": 0.0025095,
|
| 18419 |
+
"loss": 3.4849,
|
| 18420 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 18421 |
+
"memory/max_active (GiB)": 107.43,
|
| 18422 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 18423 |
+
"step": 1674,
|
| 18424 |
+
"tokens_per_second_per_gpu": 13409.66
|
| 18425 |
+
},
|
| 18426 |
+
{
|
| 18427 |
+
"epoch": 0.041875,
|
| 18428 |
+
"grad_norm": 0.51953125,
|
| 18429 |
+
"learning_rate": 0.0025109999999999998,
|
| 18430 |
+
"loss": 3.4117,
|
| 18431 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 18432 |
+
"memory/max_active (GiB)": 107.43,
|
| 18433 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 18434 |
+
"step": 1675,
|
| 18435 |
+
"tokens_per_second_per_gpu": 12861.87
|
| 18436 |
+
},
|
| 18437 |
+
{
|
| 18438 |
+
"epoch": 0.0419,
|
| 18439 |
+
"grad_norm": 0.5859375,
|
| 18440 |
+
"learning_rate": 0.0025125,
|
| 18441 |
+
"loss": 3.4487,
|
| 18442 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 18443 |
+
"memory/max_active (GiB)": 107.43,
|
| 18444 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 18445 |
+
"step": 1676,
|
| 18446 |
+
"tokens_per_second_per_gpu": 13391.9
|
| 18447 |
+
},
|
| 18448 |
+
{
|
| 18449 |
+
"epoch": 0.041925,
|
| 18450 |
+
"grad_norm": 0.45703125,
|
| 18451 |
+
"learning_rate": 0.0025139999999999997,
|
| 18452 |
+
"loss": 3.4691,
|
| 18453 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 18454 |
+
"memory/max_active (GiB)": 97.23,
|
| 18455 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 18456 |
+
"step": 1677,
|
| 18457 |
+
"tokens_per_second_per_gpu": 13697.51
|
| 18458 |
+
},
|
| 18459 |
+
{
|
| 18460 |
+
"epoch": 0.04195,
|
| 18461 |
+
"grad_norm": 0.458984375,
|
| 18462 |
+
"learning_rate": 0.0025155,
|
| 18463 |
+
"loss": 3.4354,
|
| 18464 |
+
"memory/device_reserved (GiB)": 117.34,
|
| 18465 |
+
"memory/max_active (GiB)": 117.15,
|
| 18466 |
+
"memory/max_allocated (GiB)": 117.15,
|
| 18467 |
+
"step": 1678,
|
| 18468 |
+
"tokens_per_second_per_gpu": 11489.94
|
| 18469 |
+
},
|
| 18470 |
+
{
|
| 18471 |
+
"epoch": 0.041975,
|
| 18472 |
+
"grad_norm": 0.41796875,
|
| 18473 |
+
"learning_rate": 0.002517,
|
| 18474 |
+
"loss": 3.4734,
|
| 18475 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 18476 |
+
"memory/max_active (GiB)": 127.83,
|
| 18477 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 18478 |
+
"step": 1679,
|
| 18479 |
+
"tokens_per_second_per_gpu": 10921.96
|
| 18480 |
+
},
|
| 18481 |
+
{
|
| 18482 |
+
"epoch": 0.042,
|
| 18483 |
+
"grad_norm": 0.376953125,
|
| 18484 |
+
"learning_rate": 0.0025185000000000003,
|
| 18485 |
+
"loss": 3.4138,
|
| 18486 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 18487 |
+
"memory/max_active (GiB)": 97.23,
|
| 18488 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 18489 |
+
"step": 1680,
|
| 18490 |
+
"tokens_per_second_per_gpu": 14813.73
|
| 18491 |
+
},
|
| 18492 |
+
{
|
| 18493 |
+
"epoch": 0.042025,
|
| 18494 |
+
"grad_norm": 0.357421875,
|
| 18495 |
+
"learning_rate": 0.00252,
|
| 18496 |
+
"loss": 3.395,
|
| 18497 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 18498 |
+
"memory/max_active (GiB)": 56.42,
|
| 18499 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18500 |
+
"step": 1681,
|
| 18501 |
+
"tokens_per_second_per_gpu": 22947.66
|
| 18502 |
+
},
|
| 18503 |
+
{
|
| 18504 |
+
"epoch": 0.04205,
|
| 18505 |
+
"grad_norm": 0.375,
|
| 18506 |
+
"learning_rate": 0.0025215000000000003,
|
| 18507 |
+
"loss": 3.4281,
|
| 18508 |
+
"memory/device_reserved (GiB)": 117.82,
|
| 18509 |
+
"memory/max_active (GiB)": 117.63,
|
| 18510 |
+
"memory/max_allocated (GiB)": 117.63,
|
| 18511 |
+
"step": 1682,
|
| 18512 |
+
"tokens_per_second_per_gpu": 11994.21
|
| 18513 |
+
},
|
| 18514 |
+
{
|
| 18515 |
+
"epoch": 0.042075,
|
| 18516 |
+
"grad_norm": 0.37109375,
|
| 18517 |
+
"learning_rate": 0.002523,
|
| 18518 |
+
"loss": 3.4589,
|
| 18519 |
+
"memory/device_reserved (GiB)": 86.7,
|
| 18520 |
+
"memory/max_active (GiB)": 86.55,
|
| 18521 |
+
"memory/max_allocated (GiB)": 86.55,
|
| 18522 |
+
"step": 1683,
|
| 18523 |
+
"tokens_per_second_per_gpu": 15257.0
|
| 18524 |
+
},
|
| 18525 |
+
{
|
| 18526 |
+
"epoch": 0.0421,
|
| 18527 |
+
"grad_norm": 0.34375,
|
| 18528 |
+
"learning_rate": 0.0025245000000000003,
|
| 18529 |
+
"loss": 3.4429,
|
| 18530 |
+
"memory/device_reserved (GiB)": 44.36,
|
| 18531 |
+
"memory/max_active (GiB)": 44.31,
|
| 18532 |
+
"memory/max_allocated (GiB)": 44.31,
|
| 18533 |
+
"step": 1684,
|
| 18534 |
+
"tokens_per_second_per_gpu": 27548.71
|
| 18535 |
+
},
|
| 18536 |
+
{
|
| 18537 |
+
"epoch": 0.042125,
|
| 18538 |
+
"grad_norm": 0.4140625,
|
| 18539 |
+
"learning_rate": 0.002526,
|
| 18540 |
+
"loss": 3.4058,
|
| 18541 |
+
"memory/device_reserved (GiB)": 76.49,
|
| 18542 |
+
"memory/max_active (GiB)": 76.35,
|
| 18543 |
+
"memory/max_allocated (GiB)": 76.35,
|
| 18544 |
+
"step": 1685,
|
| 18545 |
+
"tokens_per_second_per_gpu": 17574.15
|
| 18546 |
+
},
|
| 18547 |
+
{
|
| 18548 |
+
"epoch": 0.04215,
|
| 18549 |
+
"grad_norm": 0.3984375,
|
| 18550 |
+
"learning_rate": 0.0025275000000000002,
|
| 18551 |
+
"loss": 3.4312,
|
| 18552 |
+
"memory/device_reserved (GiB)": 74.93,
|
| 18553 |
+
"memory/max_active (GiB)": 74.91,
|
| 18554 |
+
"memory/max_allocated (GiB)": 74.91,
|
| 18555 |
+
"step": 1686,
|
| 18556 |
+
"tokens_per_second_per_gpu": 18549.39
|
| 18557 |
+
},
|
| 18558 |
+
{
|
| 18559 |
+
"epoch": 0.042175,
|
| 18560 |
+
"grad_norm": 0.294921875,
|
| 18561 |
+
"learning_rate": 0.002529,
|
| 18562 |
+
"loss": 3.4163,
|
| 18563 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18564 |
+
"memory/max_active (GiB)": 87.03,
|
| 18565 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18566 |
+
"step": 1687,
|
| 18567 |
+
"tokens_per_second_per_gpu": 15868.69
|
| 18568 |
+
},
|
| 18569 |
+
{
|
| 18570 |
+
"epoch": 0.0422,
|
| 18571 |
+
"grad_norm": 0.416015625,
|
| 18572 |
+
"learning_rate": 0.0025305,
|
| 18573 |
+
"loss": 3.4385,
|
| 18574 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18575 |
+
"memory/max_active (GiB)": 87.03,
|
| 18576 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18577 |
+
"step": 1688,
|
| 18578 |
+
"tokens_per_second_per_gpu": 16224.91
|
| 18579 |
+
},
|
| 18580 |
+
{
|
| 18581 |
+
"epoch": 0.042225,
|
| 18582 |
+
"grad_norm": 0.41796875,
|
| 18583 |
+
"learning_rate": 0.002532,
|
| 18584 |
+
"loss": 3.4211,
|
| 18585 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18586 |
+
"memory/max_active (GiB)": 87.03,
|
| 18587 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18588 |
+
"step": 1689,
|
| 18589 |
+
"tokens_per_second_per_gpu": 14969.11
|
| 18590 |
+
},
|
| 18591 |
+
{
|
| 18592 |
+
"epoch": 0.04225,
|
| 18593 |
+
"grad_norm": 0.3203125,
|
| 18594 |
+
"learning_rate": 0.0025335,
|
| 18595 |
+
"loss": 3.3951,
|
| 18596 |
+
"memory/device_reserved (GiB)": 76.49,
|
| 18597 |
+
"memory/max_active (GiB)": 76.35,
|
| 18598 |
+
"memory/max_allocated (GiB)": 76.35,
|
| 18599 |
+
"step": 1690,
|
| 18600 |
+
"tokens_per_second_per_gpu": 17850.22
|
| 18601 |
+
},
|
| 18602 |
+
{
|
| 18603 |
+
"epoch": 0.042275,
|
| 18604 |
+
"grad_norm": 0.41015625,
|
| 18605 |
+
"learning_rate": 0.002535,
|
| 18606 |
+
"loss": 3.4191,
|
| 18607 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18608 |
+
"memory/max_active (GiB)": 87.03,
|
| 18609 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18610 |
+
"step": 1691,
|
| 18611 |
+
"tokens_per_second_per_gpu": 15729.04
|
| 18612 |
+
},
|
| 18613 |
+
{
|
| 18614 |
+
"epoch": 0.0423,
|
| 18615 |
+
"grad_norm": 0.439453125,
|
| 18616 |
+
"learning_rate": 0.0025365,
|
| 18617 |
+
"loss": 3.4192,
|
| 18618 |
+
"memory/device_reserved (GiB)": 86.7,
|
| 18619 |
+
"memory/max_active (GiB)": 86.55,
|
| 18620 |
+
"memory/max_allocated (GiB)": 86.55,
|
| 18621 |
+
"step": 1692,
|
| 18622 |
+
"tokens_per_second_per_gpu": 15578.61
|
| 18623 |
+
},
|
| 18624 |
+
{
|
| 18625 |
+
"epoch": 0.042325,
|
| 18626 |
+
"grad_norm": 0.326171875,
|
| 18627 |
+
"learning_rate": 0.002538,
|
| 18628 |
+
"loss": 3.4293,
|
| 18629 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 18630 |
+
"memory/max_active (GiB)": 56.42,
|
| 18631 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18632 |
+
"step": 1693,
|
| 18633 |
+
"tokens_per_second_per_gpu": 21717.81
|
| 18634 |
+
},
|
| 18635 |
+
{
|
| 18636 |
+
"epoch": 0.04235,
|
| 18637 |
+
"grad_norm": 0.294921875,
|
| 18638 |
+
"learning_rate": 0.0025395,
|
| 18639 |
+
"loss": 3.4162,
|
| 18640 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 18641 |
+
"memory/max_active (GiB)": 56.42,
|
| 18642 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18643 |
+
"step": 1694,
|
| 18644 |
+
"tokens_per_second_per_gpu": 23759.26
|
| 18645 |
+
},
|
| 18646 |
+
{
|
| 18647 |
+
"epoch": 0.042375,
|
| 18648 |
+
"grad_norm": 0.376953125,
|
| 18649 |
+
"learning_rate": 0.002541,
|
| 18650 |
+
"loss": 3.4058,
|
| 18651 |
+
"memory/device_reserved (GiB)": 56.63,
|
| 18652 |
+
"memory/max_active (GiB)": 56.42,
|
| 18653 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18654 |
+
"step": 1695,
|
| 18655 |
+
"tokens_per_second_per_gpu": 22957.98
|
| 18656 |
+
},
|
| 18657 |
+
{
|
| 18658 |
+
"epoch": 0.0424,
|
| 18659 |
+
"grad_norm": 0.361328125,
|
| 18660 |
+
"learning_rate": 0.0025425,
|
| 18661 |
+
"loss": 3.4236,
|
| 18662 |
+
"memory/device_reserved (GiB)": 85.82,
|
| 18663 |
+
"memory/max_active (GiB)": 85.68,
|
| 18664 |
+
"memory/max_allocated (GiB)": 85.68,
|
| 18665 |
+
"step": 1696,
|
| 18666 |
+
"tokens_per_second_per_gpu": 15174.02
|
| 18667 |
+
},
|
| 18668 |
+
{
|
| 18669 |
+
"epoch": 0.042425,
|
| 18670 |
+
"grad_norm": 0.353515625,
|
| 18671 |
+
"learning_rate": 0.002544,
|
| 18672 |
+
"loss": 3.3949,
|
| 18673 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 18674 |
+
"memory/max_active (GiB)": 76.83,
|
| 18675 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 18676 |
+
"step": 1697,
|
| 18677 |
+
"tokens_per_second_per_gpu": 17562.99
|
| 18678 |
+
},
|
| 18679 |
+
{
|
| 18680 |
+
"epoch": 0.04245,
|
| 18681 |
+
"grad_norm": 0.3671875,
|
| 18682 |
+
"learning_rate": 0.0025455,
|
| 18683 |
+
"loss": 3.4254,
|
| 18684 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 18685 |
+
"memory/max_active (GiB)": 56.42,
|
| 18686 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18687 |
+
"step": 1698,
|
| 18688 |
+
"tokens_per_second_per_gpu": 22963.27
|
| 18689 |
+
},
|
| 18690 |
+
{
|
| 18691 |
+
"epoch": 0.042475,
|
| 18692 |
+
"grad_norm": 0.3671875,
|
| 18693 |
+
"learning_rate": 0.002547,
|
| 18694 |
+
"loss": 3.4004,
|
| 18695 |
+
"memory/device_reserved (GiB)": 117.82,
|
| 18696 |
+
"memory/max_active (GiB)": 117.63,
|
| 18697 |
+
"memory/max_allocated (GiB)": 117.63,
|
| 18698 |
+
"step": 1699,
|
| 18699 |
+
"tokens_per_second_per_gpu": 11758.64
|
| 18700 |
+
},
|
| 18701 |
+
{
|
| 18702 |
+
"epoch": 0.0425,
|
| 18703 |
+
"grad_norm": 0.37109375,
|
| 18704 |
+
"learning_rate": 0.0025485,
|
| 18705 |
+
"loss": 3.4032,
|
| 18706 |
+
"memory/device_reserved (GiB)": 56.14,
|
| 18707 |
+
"memory/max_active (GiB)": 55.95,
|
| 18708 |
+
"memory/max_allocated (GiB)": 55.95,
|
| 18709 |
+
"step": 1700,
|
| 18710 |
+
"tokens_per_second_per_gpu": 23581.93
|
| 18711 |
+
},
|
| 18712 |
+
{
|
| 18713 |
+
"epoch": 0.042525,
|
| 18714 |
+
"grad_norm": 0.44921875,
|
| 18715 |
+
"learning_rate": 0.00255,
|
| 18716 |
+
"loss": 3.4364,
|
| 18717 |
+
"memory/device_reserved (GiB)": 127.55,
|
| 18718 |
+
"memory/max_active (GiB)": 127.35,
|
| 18719 |
+
"memory/max_allocated (GiB)": 127.35,
|
| 18720 |
+
"step": 1701,
|
| 18721 |
+
"tokens_per_second_per_gpu": 10589.65
|
| 18722 |
+
},
|
| 18723 |
+
{
|
| 18724 |
+
"epoch": 0.04255,
|
| 18725 |
+
"grad_norm": 0.4921875,
|
| 18726 |
+
"learning_rate": 0.0025515,
|
| 18727 |
+
"loss": 3.4531,
|
| 18728 |
+
"memory/device_reserved (GiB)": 96.93,
|
| 18729 |
+
"memory/max_active (GiB)": 96.75,
|
| 18730 |
+
"memory/max_allocated (GiB)": 96.75,
|
| 18731 |
+
"step": 1702,
|
| 18732 |
+
"tokens_per_second_per_gpu": 13398.45
|
| 18733 |
+
},
|
| 18734 |
+
{
|
| 18735 |
+
"epoch": 0.042575,
|
| 18736 |
+
"grad_norm": 0.5546875,
|
| 18737 |
+
"learning_rate": 0.002553,
|
| 18738 |
+
"loss": 3.4716,
|
| 18739 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 18740 |
+
"memory/max_active (GiB)": 76.83,
|
| 18741 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 18742 |
+
"step": 1703,
|
| 18743 |
+
"tokens_per_second_per_gpu": 17913.65
|
| 18744 |
+
},
|
| 18745 |
+
{
|
| 18746 |
+
"epoch": 0.0426,
|
| 18747 |
+
"grad_norm": 0.486328125,
|
| 18748 |
+
"learning_rate": 0.0025545000000000003,
|
| 18749 |
+
"loss": 3.4539,
|
| 18750 |
+
"memory/device_reserved (GiB)": 96.93,
|
| 18751 |
+
"memory/max_active (GiB)": 96.75,
|
| 18752 |
+
"memory/max_allocated (GiB)": 96.75,
|
| 18753 |
+
"step": 1704,
|
| 18754 |
+
"tokens_per_second_per_gpu": 13482.41
|
| 18755 |
+
},
|
| 18756 |
+
{
|
| 18757 |
+
"epoch": 0.042625,
|
| 18758 |
+
"grad_norm": 0.458984375,
|
| 18759 |
+
"learning_rate": 0.002556,
|
| 18760 |
+
"loss": 3.4376,
|
| 18761 |
+
"memory/device_reserved (GiB)": 64.72,
|
| 18762 |
+
"memory/max_active (GiB)": 64.71,
|
| 18763 |
+
"memory/max_allocated (GiB)": 64.71,
|
| 18764 |
+
"step": 1705,
|
| 18765 |
+
"tokens_per_second_per_gpu": 20009.22
|
| 18766 |
+
},
|
| 18767 |
+
{
|
| 18768 |
+
"epoch": 0.04265,
|
| 18769 |
+
"grad_norm": 0.458984375,
|
| 18770 |
+
"learning_rate": 0.0025575000000000003,
|
| 18771 |
+
"loss": 3.4079,
|
| 18772 |
+
"memory/device_reserved (GiB)": 56.63,
|
| 18773 |
+
"memory/max_active (GiB)": 56.42,
|
| 18774 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18775 |
+
"step": 1706,
|
| 18776 |
+
"tokens_per_second_per_gpu": 22731.28
|
| 18777 |
+
},
|
| 18778 |
+
{
|
| 18779 |
+
"epoch": 0.042675,
|
| 18780 |
+
"grad_norm": 0.38671875,
|
| 18781 |
+
"learning_rate": 0.002559,
|
| 18782 |
+
"loss": 3.4195,
|
| 18783 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18784 |
+
"memory/max_active (GiB)": 66.63,
|
| 18785 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18786 |
+
"step": 1707,
|
| 18787 |
+
"tokens_per_second_per_gpu": 18695.79
|
| 18788 |
+
},
|
| 18789 |
+
{
|
| 18790 |
+
"epoch": 0.0427,
|
| 18791 |
+
"grad_norm": 0.39453125,
|
| 18792 |
+
"learning_rate": 0.0025605000000000003,
|
| 18793 |
+
"loss": 3.3978,
|
| 18794 |
+
"memory/device_reserved (GiB)": 127.55,
|
| 18795 |
+
"memory/max_active (GiB)": 127.35,
|
| 18796 |
+
"memory/max_allocated (GiB)": 127.35,
|
| 18797 |
+
"step": 1708,
|
| 18798 |
+
"tokens_per_second_per_gpu": 10495.74
|
| 18799 |
+
},
|
| 18800 |
+
{
|
| 18801 |
+
"epoch": 0.042725,
|
| 18802 |
+
"grad_norm": 0.416015625,
|
| 18803 |
+
"learning_rate": 0.002562,
|
| 18804 |
+
"loss": 3.3919,
|
| 18805 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18806 |
+
"memory/max_active (GiB)": 87.03,
|
| 18807 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18808 |
+
"step": 1709,
|
| 18809 |
+
"tokens_per_second_per_gpu": 15384.14
|
| 18810 |
+
},
|
| 18811 |
+
{
|
| 18812 |
+
"epoch": 0.04275,
|
| 18813 |
+
"grad_norm": 0.5078125,
|
| 18814 |
+
"learning_rate": 0.0025635000000000002,
|
| 18815 |
+
"loss": 3.4465,
|
| 18816 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18817 |
+
"memory/max_active (GiB)": 66.63,
|
| 18818 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18819 |
+
"step": 1710,
|
| 18820 |
+
"tokens_per_second_per_gpu": 21356.58
|
| 18821 |
+
},
|
| 18822 |
+
{
|
| 18823 |
+
"epoch": 0.042775,
|
| 18824 |
+
"grad_norm": 0.423828125,
|
| 18825 |
+
"learning_rate": 0.002565,
|
| 18826 |
+
"loss": 3.408,
|
| 18827 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 18828 |
+
"memory/max_active (GiB)": 97.23,
|
| 18829 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 18830 |
+
"step": 1711,
|
| 18831 |
+
"tokens_per_second_per_gpu": 13459.02
|
| 18832 |
+
},
|
| 18833 |
+
{
|
| 18834 |
+
"epoch": 0.0428,
|
| 18835 |
+
"grad_norm": 0.421875,
|
| 18836 |
+
"learning_rate": 0.0025665,
|
| 18837 |
+
"loss": 3.4107,
|
| 18838 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18839 |
+
"memory/max_active (GiB)": 87.03,
|
| 18840 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18841 |
+
"step": 1712,
|
| 18842 |
+
"tokens_per_second_per_gpu": 15576.53
|
| 18843 |
+
},
|
| 18844 |
+
{
|
| 18845 |
+
"epoch": 0.042825,
|
| 18846 |
+
"grad_norm": 0.466796875,
|
| 18847 |
+
"learning_rate": 0.002568,
|
| 18848 |
+
"loss": 3.4465,
|
| 18849 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 18850 |
+
"memory/max_active (GiB)": 56.42,
|
| 18851 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18852 |
+
"step": 1713,
|
| 18853 |
+
"tokens_per_second_per_gpu": 24181.26
|
| 18854 |
+
},
|
| 18855 |
+
{
|
| 18856 |
+
"epoch": 0.04285,
|
| 18857 |
+
"grad_norm": 0.55859375,
|
| 18858 |
+
"learning_rate": 0.0025695,
|
| 18859 |
+
"loss": 3.4621,
|
| 18860 |
+
"memory/device_reserved (GiB)": 46.39,
|
| 18861 |
+
"memory/max_active (GiB)": 46.22,
|
| 18862 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 18863 |
+
"step": 1714,
|
| 18864 |
+
"tokens_per_second_per_gpu": 27803.98
|
| 18865 |
+
},
|
| 18866 |
+
{
|
| 18867 |
+
"epoch": 0.042875,
|
| 18868 |
+
"grad_norm": 0.57421875,
|
| 18869 |
+
"learning_rate": 0.002571,
|
| 18870 |
+
"loss": 3.4206,
|
| 18871 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 18872 |
+
"memory/max_active (GiB)": 56.42,
|
| 18873 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18874 |
+
"step": 1715,
|
| 18875 |
+
"tokens_per_second_per_gpu": 24012.74
|
| 18876 |
+
},
|
| 18877 |
+
{
|
| 18878 |
+
"epoch": 0.0429,
|
| 18879 |
+
"grad_norm": 0.490234375,
|
| 18880 |
+
"learning_rate": 0.0025725,
|
| 18881 |
+
"loss": 3.4463,
|
| 18882 |
+
"memory/device_reserved (GiB)": 107.12,
|
| 18883 |
+
"memory/max_active (GiB)": 106.95,
|
| 18884 |
+
"memory/max_allocated (GiB)": 106.95,
|
| 18885 |
+
"step": 1716,
|
| 18886 |
+
"tokens_per_second_per_gpu": 12538.54
|
| 18887 |
+
},
|
| 18888 |
+
{
|
| 18889 |
+
"epoch": 0.042925,
|
| 18890 |
+
"grad_norm": 0.412109375,
|
| 18891 |
+
"learning_rate": 0.002574,
|
| 18892 |
+
"loss": 3.4565,
|
| 18893 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18894 |
+
"memory/max_active (GiB)": 66.63,
|
| 18895 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18896 |
+
"step": 1717,
|
| 18897 |
+
"tokens_per_second_per_gpu": 19965.96
|
| 18898 |
+
},
|
| 18899 |
+
{
|
| 18900 |
+
"epoch": 0.04295,
|
| 18901 |
+
"grad_norm": 0.392578125,
|
| 18902 |
+
"learning_rate": 0.0025755,
|
| 18903 |
+
"loss": 3.4361,
|
| 18904 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 18905 |
+
"memory/max_active (GiB)": 107.43,
|
| 18906 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 18907 |
+
"step": 1718,
|
| 18908 |
+
"tokens_per_second_per_gpu": 12689.62
|
| 18909 |
+
},
|
| 18910 |
+
{
|
| 18911 |
+
"epoch": 0.042975,
|
| 18912 |
+
"grad_norm": 0.38671875,
|
| 18913 |
+
"learning_rate": 0.002577,
|
| 18914 |
+
"loss": 3.4358,
|
| 18915 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 18916 |
+
"memory/max_active (GiB)": 87.03,
|
| 18917 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 18918 |
+
"step": 1719,
|
| 18919 |
+
"tokens_per_second_per_gpu": 15987.55
|
| 18920 |
+
},
|
| 18921 |
+
{
|
| 18922 |
+
"epoch": 0.043,
|
| 18923 |
+
"grad_norm": 0.431640625,
|
| 18924 |
+
"learning_rate": 0.0025785,
|
| 18925 |
+
"loss": 3.4481,
|
| 18926 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 18927 |
+
"memory/max_active (GiB)": 56.42,
|
| 18928 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 18929 |
+
"step": 1720,
|
| 18930 |
+
"tokens_per_second_per_gpu": 23525.6
|
| 18931 |
+
},
|
| 18932 |
+
{
|
| 18933 |
+
"epoch": 0.043025,
|
| 18934 |
+
"grad_norm": 0.4453125,
|
| 18935 |
+
"learning_rate": 0.00258,
|
| 18936 |
+
"loss": 3.4401,
|
| 18937 |
+
"memory/device_reserved (GiB)": 127.55,
|
| 18938 |
+
"memory/max_active (GiB)": 127.35,
|
| 18939 |
+
"memory/max_allocated (GiB)": 127.35,
|
| 18940 |
+
"step": 1721,
|
| 18941 |
+
"tokens_per_second_per_gpu": 10768.58
|
| 18942 |
+
},
|
| 18943 |
+
{
|
| 18944 |
+
"epoch": 0.04305,
|
| 18945 |
+
"grad_norm": 0.44921875,
|
| 18946 |
+
"learning_rate": 0.0025815,
|
| 18947 |
+
"loss": 3.4029,
|
| 18948 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 18949 |
+
"memory/max_active (GiB)": 97.23,
|
| 18950 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 18951 |
+
"step": 1722,
|
| 18952 |
+
"tokens_per_second_per_gpu": 14518.37
|
| 18953 |
+
},
|
| 18954 |
+
{
|
| 18955 |
+
"epoch": 0.043075,
|
| 18956 |
+
"grad_norm": 0.384765625,
|
| 18957 |
+
"learning_rate": 0.0025830000000000002,
|
| 18958 |
+
"loss": 3.3824,
|
| 18959 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18960 |
+
"memory/max_active (GiB)": 66.63,
|
| 18961 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18962 |
+
"step": 1723,
|
| 18963 |
+
"tokens_per_second_per_gpu": 19566.23
|
| 18964 |
+
},
|
| 18965 |
+
{
|
| 18966 |
+
"epoch": 0.0431,
|
| 18967 |
+
"grad_norm": 0.27734375,
|
| 18968 |
+
"learning_rate": 0.0025845,
|
| 18969 |
+
"loss": 3.4756,
|
| 18970 |
+
"memory/device_reserved (GiB)": 96.93,
|
| 18971 |
+
"memory/max_active (GiB)": 96.75,
|
| 18972 |
+
"memory/max_allocated (GiB)": 96.75,
|
| 18973 |
+
"step": 1724,
|
| 18974 |
+
"tokens_per_second_per_gpu": 13699.55
|
| 18975 |
+
},
|
| 18976 |
+
{
|
| 18977 |
+
"epoch": 0.043125,
|
| 18978 |
+
"grad_norm": 0.255859375,
|
| 18979 |
+
"learning_rate": 0.002586,
|
| 18980 |
+
"loss": 3.4392,
|
| 18981 |
+
"memory/device_reserved (GiB)": 55.41,
|
| 18982 |
+
"memory/max_active (GiB)": 55.39,
|
| 18983 |
+
"memory/max_allocated (GiB)": 55.39,
|
| 18984 |
+
"step": 1725,
|
| 18985 |
+
"tokens_per_second_per_gpu": 23270.79
|
| 18986 |
+
},
|
| 18987 |
+
{
|
| 18988 |
+
"epoch": 0.04315,
|
| 18989 |
+
"grad_norm": 0.2421875,
|
| 18990 |
+
"learning_rate": 0.0025875000000000004,
|
| 18991 |
+
"loss": 3.4289,
|
| 18992 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 18993 |
+
"memory/max_active (GiB)": 66.63,
|
| 18994 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 18995 |
+
"step": 1726,
|
| 18996 |
+
"tokens_per_second_per_gpu": 20476.82
|
| 18997 |
+
},
|
| 18998 |
+
{
|
| 18999 |
+
"epoch": 0.043175,
|
| 19000 |
+
"grad_norm": 0.244140625,
|
| 19001 |
+
"learning_rate": 0.002589,
|
| 19002 |
+
"loss": 3.4447,
|
| 19003 |
+
"memory/device_reserved (GiB)": 75.99,
|
| 19004 |
+
"memory/max_active (GiB)": 75.82,
|
| 19005 |
+
"memory/max_allocated (GiB)": 75.82,
|
| 19006 |
+
"step": 1727,
|
| 19007 |
+
"tokens_per_second_per_gpu": 17731.33
|
| 19008 |
+
},
|
| 19009 |
+
{
|
| 19010 |
+
"epoch": 0.0432,
|
| 19011 |
+
"grad_norm": 0.24609375,
|
| 19012 |
+
"learning_rate": 0.0025905000000000004,
|
| 19013 |
+
"loss": 3.4028,
|
| 19014 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 19015 |
+
"memory/max_active (GiB)": 127.83,
|
| 19016 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 19017 |
+
"step": 1728,
|
| 19018 |
+
"tokens_per_second_per_gpu": 10811.24
|
| 19019 |
+
},
|
| 19020 |
+
{
|
| 19021 |
+
"epoch": 0.043225,
|
| 19022 |
+
"grad_norm": 0.306640625,
|
| 19023 |
+
"learning_rate": 0.002592,
|
| 19024 |
+
"loss": 3.4164,
|
| 19025 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19026 |
+
"memory/max_active (GiB)": 97.23,
|
| 19027 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19028 |
+
"step": 1729,
|
| 19029 |
+
"tokens_per_second_per_gpu": 13945.48
|
| 19030 |
+
},
|
| 19031 |
+
{
|
| 19032 |
+
"epoch": 0.04325,
|
| 19033 |
+
"grad_norm": 0.369140625,
|
| 19034 |
+
"learning_rate": 0.0025935000000000003,
|
| 19035 |
+
"loss": 3.3548,
|
| 19036 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 19037 |
+
"memory/max_active (GiB)": 66.63,
|
| 19038 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 19039 |
+
"step": 1730,
|
| 19040 |
+
"tokens_per_second_per_gpu": 19830.86
|
| 19041 |
+
},
|
| 19042 |
+
{
|
| 19043 |
+
"epoch": 0.043275,
|
| 19044 |
+
"grad_norm": 0.37890625,
|
| 19045 |
+
"learning_rate": 0.002595,
|
| 19046 |
+
"loss": 3.421,
|
| 19047 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 19048 |
+
"memory/max_active (GiB)": 127.83,
|
| 19049 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 19050 |
+
"step": 1731,
|
| 19051 |
+
"tokens_per_second_per_gpu": 10542.1
|
| 19052 |
+
},
|
| 19053 |
+
{
|
| 19054 |
+
"epoch": 0.0433,
|
| 19055 |
+
"grad_norm": 0.38671875,
|
| 19056 |
+
"learning_rate": 0.0025965000000000003,
|
| 19057 |
+
"loss": 3.4064,
|
| 19058 |
+
"memory/device_reserved (GiB)": 56.14,
|
| 19059 |
+
"memory/max_active (GiB)": 55.95,
|
| 19060 |
+
"memory/max_allocated (GiB)": 55.95,
|
| 19061 |
+
"step": 1732,
|
| 19062 |
+
"tokens_per_second_per_gpu": 21749.89
|
| 19063 |
+
},
|
| 19064 |
+
{
|
| 19065 |
+
"epoch": 0.043325,
|
| 19066 |
+
"grad_norm": 0.2255859375,
|
| 19067 |
+
"learning_rate": 0.002598,
|
| 19068 |
+
"loss": 3.4061,
|
| 19069 |
+
"memory/device_reserved (GiB)": 85.15,
|
| 19070 |
+
"memory/max_active (GiB)": 85.11,
|
| 19071 |
+
"memory/max_allocated (GiB)": 85.11,
|
| 19072 |
+
"step": 1733,
|
| 19073 |
+
"tokens_per_second_per_gpu": 15894.06
|
| 19074 |
+
},
|
| 19075 |
+
{
|
| 19076 |
+
"epoch": 0.04335,
|
| 19077 |
+
"grad_norm": 0.30859375,
|
| 19078 |
+
"learning_rate": 0.0025995000000000002,
|
| 19079 |
+
"loss": 3.3912,
|
| 19080 |
+
"memory/device_reserved (GiB)": 66.35,
|
| 19081 |
+
"memory/max_active (GiB)": 66.15,
|
| 19082 |
+
"memory/max_allocated (GiB)": 66.15,
|
| 19083 |
+
"step": 1734,
|
| 19084 |
+
"tokens_per_second_per_gpu": 20235.34
|
| 19085 |
+
},
|
| 19086 |
+
{
|
| 19087 |
+
"epoch": 0.043375,
|
| 19088 |
+
"grad_norm": 0.28125,
|
| 19089 |
+
"learning_rate": 0.002601,
|
| 19090 |
+
"loss": 3.3853,
|
| 19091 |
+
"memory/device_reserved (GiB)": 44.36,
|
| 19092 |
+
"memory/max_active (GiB)": 44.31,
|
| 19093 |
+
"memory/max_allocated (GiB)": 44.31,
|
| 19094 |
+
"step": 1735,
|
| 19095 |
+
"tokens_per_second_per_gpu": 27550.53
|
| 19096 |
+
},
|
| 19097 |
+
{
|
| 19098 |
+
"epoch": 0.0434,
|
| 19099 |
+
"grad_norm": 0.251953125,
|
| 19100 |
+
"learning_rate": 0.0026025,
|
| 19101 |
+
"loss": 3.4352,
|
| 19102 |
+
"memory/device_reserved (GiB)": 117.82,
|
| 19103 |
+
"memory/max_active (GiB)": 117.63,
|
| 19104 |
+
"memory/max_allocated (GiB)": 117.63,
|
| 19105 |
+
"step": 1736,
|
| 19106 |
+
"tokens_per_second_per_gpu": 11236.12
|
| 19107 |
+
},
|
| 19108 |
+
{
|
| 19109 |
+
"epoch": 0.043425,
|
| 19110 |
+
"grad_norm": 0.404296875,
|
| 19111 |
+
"learning_rate": 0.002604,
|
| 19112 |
+
"loss": 3.4176,
|
| 19113 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 19114 |
+
"memory/max_active (GiB)": 127.83,
|
| 19115 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 19116 |
+
"step": 1737,
|
| 19117 |
+
"tokens_per_second_per_gpu": 10840.44
|
| 19118 |
+
},
|
| 19119 |
+
{
|
| 19120 |
+
"epoch": 0.04345,
|
| 19121 |
+
"grad_norm": 0.546875,
|
| 19122 |
+
"learning_rate": 0.0026055,
|
| 19123 |
+
"loss": 3.4038,
|
| 19124 |
+
"memory/device_reserved (GiB)": 76.49,
|
| 19125 |
+
"memory/max_active (GiB)": 76.35,
|
| 19126 |
+
"memory/max_allocated (GiB)": 76.35,
|
| 19127 |
+
"step": 1738,
|
| 19128 |
+
"tokens_per_second_per_gpu": 17339.64
|
| 19129 |
+
},
|
| 19130 |
+
{
|
| 19131 |
+
"epoch": 0.043475,
|
| 19132 |
+
"grad_norm": 0.578125,
|
| 19133 |
+
"learning_rate": 0.002607,
|
| 19134 |
+
"loss": 3.4502,
|
| 19135 |
+
"memory/device_reserved (GiB)": 127.55,
|
| 19136 |
+
"memory/max_active (GiB)": 127.35,
|
| 19137 |
+
"memory/max_allocated (GiB)": 127.35,
|
| 19138 |
+
"step": 1739,
|
| 19139 |
+
"tokens_per_second_per_gpu": 10609.71
|
| 19140 |
+
},
|
| 19141 |
+
{
|
| 19142 |
+
"epoch": 0.0435,
|
| 19143 |
+
"grad_norm": 0.53125,
|
| 19144 |
+
"learning_rate": 0.0026085,
|
| 19145 |
+
"loss": 3.4514,
|
| 19146 |
+
"memory/device_reserved (GiB)": 86.7,
|
| 19147 |
+
"memory/max_active (GiB)": 86.55,
|
| 19148 |
+
"memory/max_allocated (GiB)": 86.55,
|
| 19149 |
+
"step": 1740,
|
| 19150 |
+
"tokens_per_second_per_gpu": 14979.82
|
| 19151 |
+
},
|
| 19152 |
+
{
|
| 19153 |
+
"epoch": 0.043525,
|
| 19154 |
+
"grad_norm": 0.46484375,
|
| 19155 |
+
"learning_rate": 0.00261,
|
| 19156 |
+
"loss": 3.461,
|
| 19157 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19158 |
+
"memory/max_active (GiB)": 97.23,
|
| 19159 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19160 |
+
"step": 1741,
|
| 19161 |
+
"tokens_per_second_per_gpu": 13526.38
|
| 19162 |
+
},
|
| 19163 |
+
{
|
| 19164 |
+
"epoch": 0.04355,
|
| 19165 |
+
"grad_norm": 0.44921875,
|
| 19166 |
+
"learning_rate": 0.0026115,
|
| 19167 |
+
"loss": 3.4299,
|
| 19168 |
+
"memory/device_reserved (GiB)": 64.72,
|
| 19169 |
+
"memory/max_active (GiB)": 64.71,
|
| 19170 |
+
"memory/max_allocated (GiB)": 64.71,
|
| 19171 |
+
"step": 1742,
|
| 19172 |
+
"tokens_per_second_per_gpu": 20033.73
|
| 19173 |
+
},
|
| 19174 |
+
{
|
| 19175 |
+
"epoch": 0.043575,
|
| 19176 |
+
"grad_norm": 0.40234375,
|
| 19177 |
+
"learning_rate": 0.002613,
|
| 19178 |
+
"loss": 3.4074,
|
| 19179 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19180 |
+
"memory/max_active (GiB)": 97.23,
|
| 19181 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19182 |
+
"step": 1743,
|
| 19183 |
+
"tokens_per_second_per_gpu": 13919.14
|
| 19184 |
+
},
|
| 19185 |
+
{
|
| 19186 |
+
"epoch": 0.0436,
|
| 19187 |
+
"grad_norm": 0.337890625,
|
| 19188 |
+
"learning_rate": 0.0026145,
|
| 19189 |
+
"loss": 3.4226,
|
| 19190 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 19191 |
+
"memory/max_active (GiB)": 127.83,
|
| 19192 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 19193 |
+
"step": 1744,
|
| 19194 |
+
"tokens_per_second_per_gpu": 10914.18
|
| 19195 |
+
},
|
| 19196 |
+
{
|
| 19197 |
+
"epoch": 0.043625,
|
| 19198 |
+
"grad_norm": 0.34375,
|
| 19199 |
+
"learning_rate": 0.002616,
|
| 19200 |
+
"loss": 3.4013,
|
| 19201 |
+
"memory/device_reserved (GiB)": 46.39,
|
| 19202 |
+
"memory/max_active (GiB)": 46.22,
|
| 19203 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 19204 |
+
"step": 1745,
|
| 19205 |
+
"tokens_per_second_per_gpu": 26513.04
|
| 19206 |
+
},
|
| 19207 |
+
{
|
| 19208 |
+
"epoch": 0.04365,
|
| 19209 |
+
"grad_norm": 0.400390625,
|
| 19210 |
+
"learning_rate": 0.0026175,
|
| 19211 |
+
"loss": 3.4318,
|
| 19212 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 19213 |
+
"memory/max_active (GiB)": 107.43,
|
| 19214 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 19215 |
+
"step": 1746,
|
| 19216 |
+
"tokens_per_second_per_gpu": 12843.87
|
| 19217 |
+
},
|
| 19218 |
+
{
|
| 19219 |
+
"epoch": 0.043675,
|
| 19220 |
+
"grad_norm": 0.4140625,
|
| 19221 |
+
"learning_rate": 0.0026190000000000002,
|
| 19222 |
+
"loss": 3.4096,
|
| 19223 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 19224 |
+
"memory/max_active (GiB)": 87.03,
|
| 19225 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 19226 |
+
"step": 1747,
|
| 19227 |
+
"tokens_per_second_per_gpu": 15880.59
|
| 19228 |
+
},
|
| 19229 |
+
{
|
| 19230 |
+
"epoch": 0.0437,
|
| 19231 |
+
"grad_norm": 0.50390625,
|
| 19232 |
+
"learning_rate": 0.0026205000000000004,
|
| 19233 |
+
"loss": 3.4245,
|
| 19234 |
+
"memory/device_reserved (GiB)": 127.55,
|
| 19235 |
+
"memory/max_active (GiB)": 127.35,
|
| 19236 |
+
"memory/max_allocated (GiB)": 127.35,
|
| 19237 |
+
"step": 1748,
|
| 19238 |
+
"tokens_per_second_per_gpu": 10692.67
|
| 19239 |
+
},
|
| 19240 |
+
{
|
| 19241 |
+
"epoch": 0.043725,
|
| 19242 |
+
"grad_norm": 0.474609375,
|
| 19243 |
+
"learning_rate": 0.002622,
|
| 19244 |
+
"loss": 3.4249,
|
| 19245 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 19246 |
+
"memory/max_active (GiB)": 66.63,
|
| 19247 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 19248 |
+
"step": 1749,
|
| 19249 |
+
"tokens_per_second_per_gpu": 19357.34
|
| 19250 |
+
},
|
| 19251 |
+
{
|
| 19252 |
+
"epoch": 0.04375,
|
| 19253 |
+
"grad_norm": 0.423828125,
|
| 19254 |
+
"learning_rate": 0.0026235000000000004,
|
| 19255 |
+
"loss": 3.391,
|
| 19256 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 19257 |
+
"memory/max_active (GiB)": 56.42,
|
| 19258 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 19259 |
+
"step": 1750,
|
| 19260 |
+
"tokens_per_second_per_gpu": 23852.2
|
| 19261 |
+
},
|
| 19262 |
+
{
|
| 19263 |
+
"epoch": 0.043775,
|
| 19264 |
+
"grad_norm": 0.30859375,
|
| 19265 |
+
"learning_rate": 0.002625,
|
| 19266 |
+
"loss": 3.4351,
|
| 19267 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19268 |
+
"memory/max_active (GiB)": 97.23,
|
| 19269 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19270 |
+
"step": 1751,
|
| 19271 |
+
"tokens_per_second_per_gpu": 13356.15
|
| 19272 |
+
},
|
| 19273 |
+
{
|
| 19274 |
+
"epoch": 0.0438,
|
| 19275 |
+
"grad_norm": 0.25390625,
|
| 19276 |
+
"learning_rate": 0.0026265,
|
| 19277 |
+
"loss": 3.4141,
|
| 19278 |
+
"memory/device_reserved (GiB)": 97.44,
|
| 19279 |
+
"memory/max_active (GiB)": 97.23,
|
| 19280 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19281 |
+
"step": 1752,
|
| 19282 |
+
"tokens_per_second_per_gpu": 13330.25
|
| 19283 |
+
},
|
| 19284 |
+
{
|
| 19285 |
+
"epoch": 0.043825,
|
| 19286 |
+
"grad_norm": 0.2314453125,
|
| 19287 |
+
"learning_rate": 0.002628,
|
| 19288 |
+
"loss": 3.3987,
|
| 19289 |
+
"memory/device_reserved (GiB)": 96.42,
|
| 19290 |
+
"memory/max_active (GiB)": 96.23,
|
| 19291 |
+
"memory/max_allocated (GiB)": 96.23,
|
| 19292 |
+
"step": 1753,
|
| 19293 |
+
"tokens_per_second_per_gpu": 14470.45
|
| 19294 |
+
},
|
| 19295 |
+
{
|
| 19296 |
+
"epoch": 0.04385,
|
| 19297 |
+
"grad_norm": 0.30078125,
|
| 19298 |
+
"learning_rate": 0.0026295,
|
| 19299 |
+
"loss": 3.4109,
|
| 19300 |
+
"memory/device_reserved (GiB)": 86.7,
|
| 19301 |
+
"memory/max_active (GiB)": 86.55,
|
| 19302 |
+
"memory/max_allocated (GiB)": 86.55,
|
| 19303 |
+
"step": 1754,
|
| 19304 |
+
"tokens_per_second_per_gpu": 14973.02
|
| 19305 |
+
},
|
| 19306 |
+
{
|
| 19307 |
+
"epoch": 0.043875,
|
| 19308 |
+
"grad_norm": 0.314453125,
|
| 19309 |
+
"learning_rate": 0.002631,
|
| 19310 |
+
"loss": 3.4064,
|
| 19311 |
+
"memory/device_reserved (GiB)": 106.26,
|
| 19312 |
+
"memory/max_active (GiB)": 106.08,
|
| 19313 |
+
"memory/max_allocated (GiB)": 106.08,
|
| 19314 |
+
"step": 1755,
|
| 19315 |
+
"tokens_per_second_per_gpu": 12857.47
|
| 19316 |
+
},
|
| 19317 |
+
{
|
| 19318 |
+
"epoch": 0.0439,
|
| 19319 |
+
"grad_norm": 0.333984375,
|
| 19320 |
+
"learning_rate": 0.0026325,
|
| 19321 |
+
"loss": 3.4352,
|
| 19322 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 19323 |
+
"memory/max_active (GiB)": 87.03,
|
| 19324 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 19325 |
+
"step": 1756,
|
| 19326 |
+
"tokens_per_second_per_gpu": 15689.65
|
| 19327 |
+
},
|
| 19328 |
+
{
|
| 19329 |
+
"epoch": 0.043925,
|
| 19330 |
+
"grad_norm": 0.2333984375,
|
| 19331 |
+
"learning_rate": 0.002634,
|
| 19332 |
+
"loss": 3.4027,
|
| 19333 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 19334 |
+
"memory/max_active (GiB)": 56.42,
|
| 19335 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 19336 |
+
"step": 1757,
|
| 19337 |
+
"tokens_per_second_per_gpu": 21923.57
|
| 19338 |
+
},
|
| 19339 |
+
{
|
| 19340 |
+
"epoch": 0.04395,
|
| 19341 |
+
"grad_norm": 0.197265625,
|
| 19342 |
+
"learning_rate": 0.0026355,
|
| 19343 |
+
"loss": 3.426,
|
| 19344 |
+
"memory/device_reserved (GiB)": 96.42,
|
| 19345 |
+
"memory/max_active (GiB)": 96.22,
|
| 19346 |
+
"memory/max_allocated (GiB)": 96.22,
|
| 19347 |
+
"step": 1758,
|
| 19348 |
+
"tokens_per_second_per_gpu": 13418.51
|
| 19349 |
+
},
|
| 19350 |
+
{
|
| 19351 |
+
"epoch": 0.043975,
|
| 19352 |
+
"grad_norm": 0.2578125,
|
| 19353 |
+
"learning_rate": 0.002637,
|
| 19354 |
+
"loss": 3.3894,
|
| 19355 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 19356 |
+
"memory/max_active (GiB)": 56.42,
|
| 19357 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 19358 |
+
"step": 1759,
|
| 19359 |
+
"tokens_per_second_per_gpu": 24491.74
|
| 19360 |
+
},
|
| 19361 |
+
{
|
| 19362 |
+
"epoch": 0.044,
|
| 19363 |
+
"grad_norm": 0.423828125,
|
| 19364 |
+
"learning_rate": 0.0026385,
|
| 19365 |
+
"loss": 3.4275,
|
| 19366 |
+
"memory/device_reserved (GiB)": 117.82,
|
| 19367 |
+
"memory/max_active (GiB)": 117.63,
|
| 19368 |
+
"memory/max_allocated (GiB)": 117.63,
|
| 19369 |
+
"step": 1760,
|
| 19370 |
+
"tokens_per_second_per_gpu": 11247.5
|
| 19371 |
+
},
|
| 19372 |
+
{
|
| 19373 |
+
"epoch": 0.044025,
|
| 19374 |
+
"grad_norm": 0.61328125,
|
| 19375 |
+
"learning_rate": 0.00264,
|
| 19376 |
+
"loss": 3.4444,
|
| 19377 |
+
"memory/device_reserved (GiB)": 35.22,
|
| 19378 |
+
"memory/max_active (GiB)": 35.02,
|
| 19379 |
+
"memory/max_allocated (GiB)": 35.02,
|
| 19380 |
+
"step": 1761,
|
| 19381 |
+
"tokens_per_second_per_gpu": 33971.64
|
| 19382 |
+
},
|
| 19383 |
+
{
|
| 19384 |
+
"epoch": 0.04405,
|
| 19385 |
+
"grad_norm": 0.55859375,
|
| 19386 |
+
"learning_rate": 0.0026414999999999998,
|
| 19387 |
+
"loss": 3.4196,
|
| 19388 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 19389 |
+
"memory/max_active (GiB)": 87.03,
|
| 19390 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 19391 |
+
"step": 1762,
|
| 19392 |
+
"tokens_per_second_per_gpu": 16207.95
|
| 19393 |
+
},
|
| 19394 |
+
{
|
| 19395 |
+
"epoch": 0.044075,
|
| 19396 |
+
"grad_norm": 0.474609375,
|
| 19397 |
+
"learning_rate": 0.002643,
|
| 19398 |
+
"loss": 3.4154,
|
| 19399 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 19400 |
+
"memory/max_active (GiB)": 66.63,
|
| 19401 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 19402 |
+
"step": 1763,
|
| 19403 |
+
"tokens_per_second_per_gpu": 20758.11
|
| 19404 |
+
},
|
| 19405 |
+
{
|
| 19406 |
+
"epoch": 0.0441,
|
| 19407 |
+
"grad_norm": 0.57421875,
|
| 19408 |
+
"learning_rate": 0.0026444999999999997,
|
| 19409 |
+
"loss": 3.4227,
|
| 19410 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 19411 |
+
"memory/max_active (GiB)": 66.63,
|
| 19412 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 19413 |
+
"step": 1764,
|
| 19414 |
+
"tokens_per_second_per_gpu": 20933.38
|
| 19415 |
+
},
|
| 19416 |
+
{
|
| 19417 |
+
"epoch": 0.044125,
|
| 19418 |
+
"grad_norm": 0.49609375,
|
| 19419 |
+
"learning_rate": 0.002646,
|
| 19420 |
+
"loss": 3.4113,
|
| 19421 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 19422 |
+
"memory/max_active (GiB)": 66.63,
|
| 19423 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 19424 |
+
"step": 1765,
|
| 19425 |
+
"tokens_per_second_per_gpu": 20328.8
|
| 19426 |
+
},
|
| 19427 |
+
{
|
| 19428 |
+
"epoch": 0.04415,
|
| 19429 |
+
"grad_norm": 0.56640625,
|
| 19430 |
+
"learning_rate": 0.0026475,
|
| 19431 |
+
"loss": 3.424,
|
| 19432 |
+
"memory/device_reserved (GiB)": 46.36,
|
| 19433 |
+
"memory/max_active (GiB)": 46.22,
|
| 19434 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 19435 |
+
"step": 1766,
|
| 19436 |
+
"tokens_per_second_per_gpu": 28885.39
|
| 19437 |
+
},
|
| 19438 |
+
{
|
| 19439 |
+
"epoch": 0.044175,
|
| 19440 |
+
"grad_norm": 0.60546875,
|
| 19441 |
+
"learning_rate": 0.002649,
|
| 19442 |
+
"loss": 3.4592,
|
| 19443 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 19444 |
+
"memory/max_active (GiB)": 66.63,
|
| 19445 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 19446 |
+
"step": 1767,
|
| 19447 |
+
"tokens_per_second_per_gpu": 20234.49
|
| 19448 |
+
},
|
| 19449 |
+
{
|
| 19450 |
+
"epoch": 0.0442,
|
| 19451 |
+
"grad_norm": 0.55078125,
|
| 19452 |
+
"learning_rate": 0.0026505,
|
| 19453 |
+
"loss": 3.4609,
|
| 19454 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 19455 |
+
"memory/max_active (GiB)": 87.03,
|
| 19456 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 19457 |
+
"step": 1768,
|
| 19458 |
+
"tokens_per_second_per_gpu": 15912.34
|
| 19459 |
+
},
|
| 19460 |
+
{
|
| 19461 |
+
"epoch": 0.044225,
|
| 19462 |
+
"grad_norm": 0.466796875,
|
| 19463 |
+
"learning_rate": 0.0026520000000000003,
|
| 19464 |
+
"loss": 3.4629,
|
| 19465 |
+
"memory/device_reserved (GiB)": 107.12,
|
| 19466 |
+
"memory/max_active (GiB)": 106.95,
|
| 19467 |
+
"memory/max_allocated (GiB)": 106.95,
|
| 19468 |
+
"step": 1769,
|
| 19469 |
+
"tokens_per_second_per_gpu": 12854.59
|
| 19470 |
+
},
|
| 19471 |
+
{
|
| 19472 |
+
"epoch": 0.04425,
|
| 19473 |
+
"grad_norm": 0.49609375,
|
| 19474 |
+
"learning_rate": 0.0026535,
|
| 19475 |
+
"loss": 3.4224,
|
| 19476 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 19477 |
+
"memory/max_active (GiB)": 56.42,
|
| 19478 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 19479 |
+
"step": 1770,
|
| 19480 |
+
"tokens_per_second_per_gpu": 23600.25
|
| 19481 |
+
},
|
| 19482 |
+
{
|
| 19483 |
+
"epoch": 0.044275,
|
| 19484 |
+
"grad_norm": 0.421875,
|
| 19485 |
+
"learning_rate": 0.0026550000000000002,
|
| 19486 |
+
"loss": 3.4245,
|
| 19487 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19488 |
+
"memory/max_active (GiB)": 97.23,
|
| 19489 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19490 |
+
"step": 1771,
|
| 19491 |
+
"tokens_per_second_per_gpu": 14240.32
|
| 19492 |
+
},
|
| 19493 |
+
{
|
| 19494 |
+
"epoch": 0.0443,
|
| 19495 |
+
"grad_norm": 0.2197265625,
|
| 19496 |
+
"learning_rate": 0.0026565,
|
| 19497 |
+
"loss": 3.4746,
|
| 19498 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 19499 |
+
"memory/max_active (GiB)": 107.42,
|
| 19500 |
+
"memory/max_allocated (GiB)": 107.42,
|
| 19501 |
+
"step": 1772,
|
| 19502 |
+
"tokens_per_second_per_gpu": 12415.51
|
| 19503 |
+
},
|
| 19504 |
+
{
|
| 19505 |
+
"epoch": 0.044325,
|
| 19506 |
+
"grad_norm": 0.3046875,
|
| 19507 |
+
"learning_rate": 0.002658,
|
| 19508 |
+
"loss": 3.4487,
|
| 19509 |
+
"memory/device_reserved (GiB)": 117.82,
|
| 19510 |
+
"memory/max_active (GiB)": 117.63,
|
| 19511 |
+
"memory/max_allocated (GiB)": 117.63,
|
| 19512 |
+
"step": 1773,
|
| 19513 |
+
"tokens_per_second_per_gpu": 11728.76
|
| 19514 |
+
},
|
| 19515 |
+
{
|
| 19516 |
+
"epoch": 0.04435,
|
| 19517 |
+
"grad_norm": 0.380859375,
|
| 19518 |
+
"learning_rate": 0.0026595,
|
| 19519 |
+
"loss": 3.428,
|
| 19520 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19521 |
+
"memory/max_active (GiB)": 97.23,
|
| 19522 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19523 |
+
"step": 1774,
|
| 19524 |
+
"tokens_per_second_per_gpu": 13483.5
|
| 19525 |
+
},
|
| 19526 |
+
{
|
| 19527 |
+
"epoch": 0.044375,
|
| 19528 |
+
"grad_norm": 0.32421875,
|
| 19529 |
+
"learning_rate": 0.002661,
|
| 19530 |
+
"loss": 3.4394,
|
| 19531 |
+
"memory/device_reserved (GiB)": 66.8,
|
| 19532 |
+
"memory/max_active (GiB)": 66.63,
|
| 19533 |
+
"memory/max_allocated (GiB)": 66.63,
|
| 19534 |
+
"step": 1775,
|
| 19535 |
+
"tokens_per_second_per_gpu": 18870.34
|
| 19536 |
+
},
|
| 19537 |
+
{
|
| 19538 |
+
"epoch": 0.0444,
|
| 19539 |
+
"grad_norm": 0.255859375,
|
| 19540 |
+
"learning_rate": 0.0026625,
|
| 19541 |
+
"loss": 3.4378,
|
| 19542 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19543 |
+
"memory/max_active (GiB)": 97.23,
|
| 19544 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19545 |
+
"step": 1776,
|
| 19546 |
+
"tokens_per_second_per_gpu": 13952.93
|
| 19547 |
+
},
|
| 19548 |
+
{
|
| 19549 |
+
"epoch": 0.044425,
|
| 19550 |
+
"grad_norm": 0.2275390625,
|
| 19551 |
+
"learning_rate": 0.002664,
|
| 19552 |
+
"loss": 3.3772,
|
| 19553 |
+
"memory/device_reserved (GiB)": 46.36,
|
| 19554 |
+
"memory/max_active (GiB)": 46.22,
|
| 19555 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 19556 |
+
"step": 1777,
|
| 19557 |
+
"tokens_per_second_per_gpu": 27786.47
|
| 19558 |
+
},
|
| 19559 |
+
{
|
| 19560 |
+
"epoch": 0.04445,
|
| 19561 |
+
"grad_norm": 0.1669921875,
|
| 19562 |
+
"learning_rate": 0.0026655,
|
| 19563 |
+
"loss": 3.4545,
|
| 19564 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19565 |
+
"memory/max_active (GiB)": 97.22,
|
| 19566 |
+
"memory/max_allocated (GiB)": 97.22,
|
| 19567 |
+
"step": 1778,
|
| 19568 |
+
"tokens_per_second_per_gpu": 13063.98
|
| 19569 |
+
},
|
| 19570 |
+
{
|
| 19571 |
+
"epoch": 0.044475,
|
| 19572 |
+
"grad_norm": 0.271484375,
|
| 19573 |
+
"learning_rate": 0.002667,
|
| 19574 |
+
"loss": 3.4308,
|
| 19575 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 19576 |
+
"memory/max_active (GiB)": 87.03,
|
| 19577 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 19578 |
+
"step": 1779,
|
| 19579 |
+
"tokens_per_second_per_gpu": 15197.1
|
| 19580 |
+
},
|
| 19581 |
+
{
|
| 19582 |
+
"epoch": 0.0445,
|
| 19583 |
+
"grad_norm": 0.333984375,
|
| 19584 |
+
"learning_rate": 0.0026685,
|
| 19585 |
+
"loss": 3.3732,
|
| 19586 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19587 |
+
"memory/max_active (GiB)": 97.23,
|
| 19588 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19589 |
+
"step": 1780,
|
| 19590 |
+
"tokens_per_second_per_gpu": 13364.89
|
| 19591 |
+
},
|
| 19592 |
+
{
|
| 19593 |
+
"epoch": 0.044525,
|
| 19594 |
+
"grad_norm": 0.482421875,
|
| 19595 |
+
"learning_rate": 0.00267,
|
| 19596 |
+
"loss": 3.3969,
|
| 19597 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 19598 |
+
"memory/max_active (GiB)": 127.83,
|
| 19599 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 19600 |
+
"step": 1781,
|
| 19601 |
+
"tokens_per_second_per_gpu": 10848.07
|
| 19602 |
+
},
|
| 19603 |
+
{
|
| 19604 |
+
"epoch": 0.04455,
|
| 19605 |
+
"grad_norm": 0.46484375,
|
| 19606 |
+
"learning_rate": 0.0026715,
|
| 19607 |
+
"loss": 3.4136,
|
| 19608 |
+
"memory/device_reserved (GiB)": 46.36,
|
| 19609 |
+
"memory/max_active (GiB)": 46.22,
|
| 19610 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 19611 |
+
"step": 1782,
|
| 19612 |
+
"tokens_per_second_per_gpu": 26226.1
|
| 19613 |
+
},
|
| 19614 |
+
{
|
| 19615 |
+
"epoch": 0.044575,
|
| 19616 |
+
"grad_norm": 0.40234375,
|
| 19617 |
+
"learning_rate": 0.002673,
|
| 19618 |
+
"loss": 3.4063,
|
| 19619 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 19620 |
+
"memory/max_active (GiB)": 76.83,
|
| 19621 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 19622 |
+
"step": 1783,
|
| 19623 |
+
"tokens_per_second_per_gpu": 16623.15
|
| 19624 |
+
},
|
| 19625 |
+
{
|
| 19626 |
+
"epoch": 0.0446,
|
| 19627 |
+
"grad_norm": 0.4453125,
|
| 19628 |
+
"learning_rate": 0.0026745,
|
| 19629 |
+
"loss": 3.4083,
|
| 19630 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 19631 |
+
"memory/max_active (GiB)": 76.83,
|
| 19632 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 19633 |
+
"step": 1784,
|
| 19634 |
+
"tokens_per_second_per_gpu": 17625.86
|
| 19635 |
+
},
|
| 19636 |
+
{
|
| 19637 |
+
"epoch": 0.044625,
|
| 19638 |
+
"grad_norm": 0.51953125,
|
| 19639 |
+
"learning_rate": 0.002676,
|
| 19640 |
+
"loss": 3.4065,
|
| 19641 |
+
"memory/device_reserved (GiB)": 76.49,
|
| 19642 |
+
"memory/max_active (GiB)": 76.35,
|
| 19643 |
+
"memory/max_allocated (GiB)": 76.35,
|
| 19644 |
+
"step": 1785,
|
| 19645 |
+
"tokens_per_second_per_gpu": 17692.48
|
| 19646 |
+
},
|
| 19647 |
+
{
|
| 19648 |
+
"epoch": 0.04465,
|
| 19649 |
+
"grad_norm": 0.45703125,
|
| 19650 |
+
"learning_rate": 0.0026774999999999998,
|
| 19651 |
+
"loss": 3.3926,
|
| 19652 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 19653 |
+
"memory/max_active (GiB)": 76.83,
|
| 19654 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 19655 |
+
"step": 1786,
|
| 19656 |
+
"tokens_per_second_per_gpu": 17734.6
|
| 19657 |
+
},
|
| 19658 |
+
{
|
| 19659 |
+
"epoch": 0.044675,
|
| 19660 |
+
"grad_norm": 0.4140625,
|
| 19661 |
+
"learning_rate": 0.002679,
|
| 19662 |
+
"loss": 3.4451,
|
| 19663 |
+
"memory/device_reserved (GiB)": 107.61,
|
| 19664 |
+
"memory/max_active (GiB)": 107.43,
|
| 19665 |
+
"memory/max_allocated (GiB)": 107.43,
|
| 19666 |
+
"step": 1787,
|
| 19667 |
+
"tokens_per_second_per_gpu": 12253.75
|
| 19668 |
+
},
|
| 19669 |
+
{
|
| 19670 |
+
"epoch": 0.0447,
|
| 19671 |
+
"grad_norm": 0.37109375,
|
| 19672 |
+
"learning_rate": 0.0026804999999999997,
|
| 19673 |
+
"loss": 3.4092,
|
| 19674 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 19675 |
+
"memory/max_active (GiB)": 56.42,
|
| 19676 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 19677 |
+
"step": 1788,
|
| 19678 |
+
"tokens_per_second_per_gpu": 22035.77
|
| 19679 |
+
},
|
| 19680 |
+
{
|
| 19681 |
+
"epoch": 0.044725,
|
| 19682 |
+
"grad_norm": 0.365234375,
|
| 19683 |
+
"learning_rate": 0.002682,
|
| 19684 |
+
"loss": 3.4072,
|
| 19685 |
+
"memory/device_reserved (GiB)": 46.36,
|
| 19686 |
+
"memory/max_active (GiB)": 46.22,
|
| 19687 |
+
"memory/max_allocated (GiB)": 46.22,
|
| 19688 |
+
"step": 1789,
|
| 19689 |
+
"tokens_per_second_per_gpu": 28799.13
|
| 19690 |
+
},
|
| 19691 |
+
{
|
| 19692 |
+
"epoch": 0.04475,
|
| 19693 |
+
"grad_norm": 0.486328125,
|
| 19694 |
+
"learning_rate": 0.0026835,
|
| 19695 |
+
"loss": 3.4273,
|
| 19696 |
+
"memory/device_reserved (GiB)": 107.12,
|
| 19697 |
+
"memory/max_active (GiB)": 106.95,
|
| 19698 |
+
"memory/max_allocated (GiB)": 106.95,
|
| 19699 |
+
"step": 1790,
|
| 19700 |
+
"tokens_per_second_per_gpu": 13507.67
|
| 19701 |
+
},
|
| 19702 |
+
{
|
| 19703 |
+
"epoch": 0.044775,
|
| 19704 |
+
"grad_norm": 0.41796875,
|
| 19705 |
+
"learning_rate": 0.0026850000000000003,
|
| 19706 |
+
"loss": 3.4105,
|
| 19707 |
+
"memory/device_reserved (GiB)": 97.42,
|
| 19708 |
+
"memory/max_active (GiB)": 97.23,
|
| 19709 |
+
"memory/max_allocated (GiB)": 97.23,
|
| 19710 |
+
"step": 1791,
|
| 19711 |
+
"tokens_per_second_per_gpu": 14054.94
|
| 19712 |
+
},
|
| 19713 |
+
{
|
| 19714 |
+
"epoch": 0.0448,
|
| 19715 |
+
"grad_norm": 0.39453125,
|
| 19716 |
+
"learning_rate": 0.0026865,
|
| 19717 |
+
"loss": 3.4125,
|
| 19718 |
+
"memory/device_reserved (GiB)": 87.21,
|
| 19719 |
+
"memory/max_active (GiB)": 87.03,
|
| 19720 |
+
"memory/max_allocated (GiB)": 87.03,
|
| 19721 |
+
"step": 1792,
|
| 19722 |
+
"tokens_per_second_per_gpu": 15951.81
|
| 19723 |
+
},
|
| 19724 |
+
{
|
| 19725 |
+
"epoch": 0.044825,
|
| 19726 |
+
"grad_norm": 0.455078125,
|
| 19727 |
+
"learning_rate": 0.0026880000000000003,
|
| 19728 |
+
"loss": 3.4148,
|
| 19729 |
+
"memory/device_reserved (GiB)": 117.34,
|
| 19730 |
+
"memory/max_active (GiB)": 117.15,
|
| 19731 |
+
"memory/max_allocated (GiB)": 117.15,
|
| 19732 |
+
"step": 1793,
|
| 19733 |
+
"tokens_per_second_per_gpu": 11286.11
|
| 19734 |
+
},
|
| 19735 |
+
{
|
| 19736 |
+
"epoch": 0.04485,
|
| 19737 |
+
"grad_norm": 0.47265625,
|
| 19738 |
+
"learning_rate": 0.0026895,
|
| 19739 |
+
"loss": 3.4305,
|
| 19740 |
+
"memory/device_reserved (GiB)": 86.7,
|
| 19741 |
+
"memory/max_active (GiB)": 86.55,
|
| 19742 |
+
"memory/max_allocated (GiB)": 86.55,
|
| 19743 |
+
"step": 1794,
|
| 19744 |
+
"tokens_per_second_per_gpu": 15414.34
|
| 19745 |
+
},
|
| 19746 |
+
{
|
| 19747 |
+
"epoch": 0.044875,
|
| 19748 |
+
"grad_norm": 0.49609375,
|
| 19749 |
+
"learning_rate": 0.0026910000000000002,
|
| 19750 |
+
"loss": 3.4523,
|
| 19751 |
+
"memory/device_reserved (GiB)": 56.57,
|
| 19752 |
+
"memory/max_active (GiB)": 56.42,
|
| 19753 |
+
"memory/max_allocated (GiB)": 56.42,
|
| 19754 |
+
"step": 1795,
|
| 19755 |
+
"tokens_per_second_per_gpu": 23738.61
|
| 19756 |
+
},
|
| 19757 |
+
{
|
| 19758 |
+
"epoch": 0.0449,
|
| 19759 |
+
"grad_norm": 0.416015625,
|
| 19760 |
+
"learning_rate": 0.0026925,
|
| 19761 |
+
"loss": 3.4288,
|
| 19762 |
+
"memory/device_reserved (GiB)": 45.93,
|
| 19763 |
+
"memory/max_active (GiB)": 45.75,
|
| 19764 |
+
"memory/max_allocated (GiB)": 45.75,
|
| 19765 |
+
"step": 1796,
|
| 19766 |
+
"tokens_per_second_per_gpu": 29115.65
|
| 19767 |
+
},
|
| 19768 |
+
{
|
| 19769 |
+
"epoch": 0.044925,
|
| 19770 |
+
"grad_norm": 0.55859375,
|
| 19771 |
+
"learning_rate": 0.002694,
|
| 19772 |
+
"loss": 3.4467,
|
| 19773 |
+
"memory/device_reserved (GiB)": 76.49,
|
| 19774 |
+
"memory/max_active (GiB)": 76.35,
|
| 19775 |
+
"memory/max_allocated (GiB)": 76.35,
|
| 19776 |
+
"step": 1797,
|
| 19777 |
+
"tokens_per_second_per_gpu": 18356.21
|
| 19778 |
+
},
|
| 19779 |
+
{
|
| 19780 |
+
"epoch": 0.04495,
|
| 19781 |
+
"grad_norm": 0.42578125,
|
| 19782 |
+
"learning_rate": 0.0026955,
|
| 19783 |
+
"loss": 3.4082,
|
| 19784 |
+
"memory/device_reserved (GiB)": 86.21,
|
| 19785 |
+
"memory/max_active (GiB)": 86.02,
|
| 19786 |
+
"memory/max_allocated (GiB)": 86.02,
|
| 19787 |
+
"step": 1798,
|
| 19788 |
+
"tokens_per_second_per_gpu": 15680.13
|
| 19789 |
+
},
|
| 19790 |
+
{
|
| 19791 |
+
"epoch": 0.044975,
|
| 19792 |
+
"grad_norm": 0.376953125,
|
| 19793 |
+
"learning_rate": 0.002697,
|
| 19794 |
+
"loss": 3.3861,
|
| 19795 |
+
"memory/device_reserved (GiB)": 77.01,
|
| 19796 |
+
"memory/max_active (GiB)": 76.83,
|
| 19797 |
+
"memory/max_allocated (GiB)": 76.83,
|
| 19798 |
+
"step": 1799,
|
| 19799 |
+
"tokens_per_second_per_gpu": 17597.13
|
| 19800 |
+
},
|
| 19801 |
+
{
|
| 19802 |
+
"epoch": 0.045,
|
| 19803 |
+
"grad_norm": 0.4140625,
|
| 19804 |
+
"learning_rate": 0.0026985,
|
| 19805 |
+
"loss": 3.4615,
|
| 19806 |
+
"memory/device_reserved (GiB)": 127.96,
|
| 19807 |
+
"memory/max_active (GiB)": 127.83,
|
| 19808 |
+
"memory/max_allocated (GiB)": 127.83,
|
| 19809 |
+
"step": 1800,
|
| 19810 |
+
"tokens_per_second_per_gpu": 10181.44
|
| 19811 |
}
|
| 19812 |
],
|
| 19813 |
"logging_steps": 1,
|
|
|
|
| 19827 |
"attributes": {}
|
| 19828 |
}
|
| 19829 |
},
|
| 19830 |
+
"total_flos": 2.1172492665891062e+18,
|
| 19831 |
"train_batch_size": 1,
|
| 19832 |
"trial_name": null,
|
| 19833 |
"trial_params": null
|