winglian commited on
Commit
4c7ded7
·
verified ·
1 Parent(s): ac0b54b

Upload folder using huggingface_hub

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf1e49bafee5e23d59bc89f4ed8a0094873d703923e2d80654edd0e0789fddb9
3
  size 505408136
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ad63b79c6f5a0ce02ab80d656cc75c3f40065ca1444e1eeb8a989304b610fa
3
  size 505408136
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01e7e30d843734b4ab3ea4838b6d299eaf917be50ce40b81d78b3b0bf9ba0b18
3
  size 1010874315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b686e15846fe98d3e289254d43ec627262be81d3ccfd84694780cdc2d857d26d
3
  size 1010874315
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:108dcd93929ea6e655c2ec96575cda913142bdd3522936dab01875d3147121da
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9daa68d1a810813b1abdd9f201531e73b7d4e041dcc3fad23284f8fcf4b91d24
3
  size 16389
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad148fbc9f8715ec96b91b740a012d3d056c09e40bce69864cb715320811705a
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d4cf6d3addd42425edfaae3634d0333d62c162775e97e30e6dbbb03fd74dd6f
3
  size 16389
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bff46447846444bff4a32f38b4eededb4e43b40aa19fd44a1739de2373a4dfa0
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b56ba980c9ffbdf2bb60baefdb68afe18fcb829a6a07f61c928a35c3737bd1d4
3
  size 16389
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f10b07bcc5fafad4caf841092938cad70c523ed3bd84e9d6a5ebea3a4f89996b
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dc66401aa5093553b2517ab0a1356729053e4b424c30bd8d4bf1f904c790981
3
  size 16389
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:daa93c0f1b2ff29f1c6eb05b38bd2a7e7b3885209653ad9ac0b180ab26fe9d6c
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71bf5ab9e423d6e1f39e134e259bf1988b5193894d9efa9209c9ed124671b440
3
  size 16389
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1315452500a3da6d7c1ad424cda0948c9ffbe4e6d97f951009cadbf6940e8d5d
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b3bbe1c81dcd5d4dad45c273f16941ad3a079fae1884c8c592be9c19cf695f6
3
  size 16389
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2168cb46319b56c369191f7bd315e73e6b9d3325a75334249ecc44d81e8949e
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3979078d5678a98b0350d1d45af82cb4e5ee17b169ebd5301f8daf30c0a3debe
3
  size 16389
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a0273acd0392329a4383016813c5d2303685e01e1642270a57c078c815dc23d
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2672dccf4e80588003012e60bca5c978b07507e61cd0e44e8d1bce97be4a1ebb
3
  size 16389
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12ca48ca705b26744cb5a0abb45acbd477d0725271238b5b2bced00b4e073c3c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:709037943547f0e738a8d6f42c84d14097097b4e323fa2f1aad3fb147909ac3e
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.04,
6
  "eval_steps": 500,
7
- "global_step": 1600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -17608,6 +17608,2206 @@
17608
  "memory/max_allocated (GiB)": 107.43,
17609
  "step": 1600,
17610
  "tokens_per_second_per_gpu": 12031.87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17611
  }
17612
  ],
17613
  "logging_steps": 1,
@@ -17627,7 +19827,7 @@
17627
  "attributes": {}
17628
  }
17629
  },
17630
- "total_flos": 1.8819084394965238e+18,
17631
  "train_batch_size": 1,
17632
  "trial_name": null,
17633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.045,
6
  "eval_steps": 500,
7
+ "global_step": 1800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
17608
  "memory/max_allocated (GiB)": 107.43,
17609
  "step": 1600,
17610
  "tokens_per_second_per_gpu": 12031.87
17611
+ },
17612
+ {
17613
+ "epoch": 0.040025,
17614
+ "grad_norm": 0.431640625,
17615
+ "learning_rate": 0.0024000000000000002,
17616
+ "loss": 3.4425,
17617
+ "memory/device_reserved (GiB)": 87.21,
17618
+ "memory/max_active (GiB)": 87.03,
17619
+ "memory/max_allocated (GiB)": 87.03,
17620
+ "step": 1601,
17621
+ "tokens_per_second_per_gpu": 15837.83
17622
+ },
17623
+ {
17624
+ "epoch": 0.04005,
17625
+ "grad_norm": 0.49609375,
17626
+ "learning_rate": 0.0024015,
17627
+ "loss": 3.4657,
17628
+ "memory/device_reserved (GiB)": 87.21,
17629
+ "memory/max_active (GiB)": 87.03,
17630
+ "memory/max_allocated (GiB)": 87.03,
17631
+ "step": 1602,
17632
+ "tokens_per_second_per_gpu": 16144.09
17633
+ },
17634
+ {
17635
+ "epoch": 0.040075,
17636
+ "grad_norm": 0.5546875,
17637
+ "learning_rate": 0.002403,
17638
+ "loss": 3.4979,
17639
+ "memory/device_reserved (GiB)": 87.21,
17640
+ "memory/max_active (GiB)": 87.03,
17641
+ "memory/max_allocated (GiB)": 87.03,
17642
+ "step": 1603,
17643
+ "tokens_per_second_per_gpu": 15848.46
17644
+ },
17645
+ {
17646
+ "epoch": 0.0401,
17647
+ "grad_norm": 0.478515625,
17648
+ "learning_rate": 0.0024045,
17649
+ "loss": 3.4807,
17650
+ "memory/device_reserved (GiB)": 96.22,
17651
+ "memory/max_active (GiB)": 96.19,
17652
+ "memory/max_allocated (GiB)": 96.19,
17653
+ "step": 1604,
17654
+ "tokens_per_second_per_gpu": 14136.87
17655
+ },
17656
+ {
17657
+ "epoch": 0.040125,
17658
+ "grad_norm": 0.2158203125,
17659
+ "learning_rate": 0.002406,
17660
+ "loss": 3.4584,
17661
+ "memory/device_reserved (GiB)": 106.61,
17662
+ "memory/max_active (GiB)": 106.43,
17663
+ "memory/max_allocated (GiB)": 106.43,
17664
+ "step": 1605,
17665
+ "tokens_per_second_per_gpu": 12098.17
17666
+ },
17667
+ {
17668
+ "epoch": 0.04015,
17669
+ "grad_norm": 0.3828125,
17670
+ "learning_rate": 0.0024075,
17671
+ "loss": 3.4939,
17672
+ "memory/device_reserved (GiB)": 96.93,
17673
+ "memory/max_active (GiB)": 96.75,
17674
+ "memory/max_allocated (GiB)": 96.75,
17675
+ "step": 1606,
17676
+ "tokens_per_second_per_gpu": 13997.28
17677
+ },
17678
+ {
17679
+ "epoch": 0.040175,
17680
+ "grad_norm": 0.376953125,
17681
+ "learning_rate": 0.002409,
17682
+ "loss": 3.4807,
17683
+ "memory/device_reserved (GiB)": 97.42,
17684
+ "memory/max_active (GiB)": 97.23,
17685
+ "memory/max_allocated (GiB)": 97.23,
17686
+ "step": 1607,
17687
+ "tokens_per_second_per_gpu": 15028.75
17688
+ },
17689
+ {
17690
+ "epoch": 0.0402,
17691
+ "grad_norm": 0.48046875,
17692
+ "learning_rate": 0.0024105,
17693
+ "loss": 3.4622,
17694
+ "memory/device_reserved (GiB)": 66.8,
17695
+ "memory/max_active (GiB)": 66.63,
17696
+ "memory/max_allocated (GiB)": 66.63,
17697
+ "step": 1608,
17698
+ "tokens_per_second_per_gpu": 20265.75
17699
+ },
17700
+ {
17701
+ "epoch": 0.040225,
17702
+ "grad_norm": 0.4765625,
17703
+ "learning_rate": 0.002412,
17704
+ "loss": 3.4482,
17705
+ "memory/device_reserved (GiB)": 86.7,
17706
+ "memory/max_active (GiB)": 86.55,
17707
+ "memory/max_allocated (GiB)": 86.55,
17708
+ "step": 1609,
17709
+ "tokens_per_second_per_gpu": 15102.78
17710
+ },
17711
+ {
17712
+ "epoch": 0.04025,
17713
+ "grad_norm": 0.61328125,
17714
+ "learning_rate": 0.0024135,
17715
+ "loss": 3.4781,
17716
+ "memory/device_reserved (GiB)": 87.21,
17717
+ "memory/max_active (GiB)": 87.03,
17718
+ "memory/max_allocated (GiB)": 87.03,
17719
+ "step": 1610,
17720
+ "tokens_per_second_per_gpu": 14823.11
17721
+ },
17722
+ {
17723
+ "epoch": 0.040275,
17724
+ "grad_norm": 0.625,
17725
+ "learning_rate": 0.002415,
17726
+ "loss": 3.4686,
17727
+ "memory/device_reserved (GiB)": 56.57,
17728
+ "memory/max_active (GiB)": 56.42,
17729
+ "memory/max_allocated (GiB)": 56.42,
17730
+ "step": 1611,
17731
+ "tokens_per_second_per_gpu": 23148.24
17732
+ },
17733
+ {
17734
+ "epoch": 0.0403,
17735
+ "grad_norm": 0.5625,
17736
+ "learning_rate": 0.0024165000000000002,
17737
+ "loss": 3.4665,
17738
+ "memory/device_reserved (GiB)": 86.7,
17739
+ "memory/max_active (GiB)": 86.55,
17740
+ "memory/max_allocated (GiB)": 86.55,
17741
+ "step": 1612,
17742
+ "tokens_per_second_per_gpu": 15439.18
17743
+ },
17744
+ {
17745
+ "epoch": 0.040325,
17746
+ "grad_norm": 0.56640625,
17747
+ "learning_rate": 0.002418,
17748
+ "loss": 3.4718,
17749
+ "memory/device_reserved (GiB)": 87.21,
17750
+ "memory/max_active (GiB)": 87.03,
17751
+ "memory/max_allocated (GiB)": 87.03,
17752
+ "step": 1613,
17753
+ "tokens_per_second_per_gpu": 16189.41
17754
+ },
17755
+ {
17756
+ "epoch": 0.04035,
17757
+ "grad_norm": 0.46875,
17758
+ "learning_rate": 0.0024195,
17759
+ "loss": 3.5027,
17760
+ "memory/device_reserved (GiB)": 66.8,
17761
+ "memory/max_active (GiB)": 66.63,
17762
+ "memory/max_allocated (GiB)": 66.63,
17763
+ "step": 1614,
17764
+ "tokens_per_second_per_gpu": 18394.53
17765
+ },
17766
+ {
17767
+ "epoch": 0.040375,
17768
+ "grad_norm": 0.388671875,
17769
+ "learning_rate": 0.0024210000000000004,
17770
+ "loss": 3.4931,
17771
+ "memory/device_reserved (GiB)": 56.57,
17772
+ "memory/max_active (GiB)": 56.42,
17773
+ "memory/max_allocated (GiB)": 56.42,
17774
+ "step": 1615,
17775
+ "tokens_per_second_per_gpu": 23895.62
17776
+ },
17777
+ {
17778
+ "epoch": 0.0404,
17779
+ "grad_norm": 0.412109375,
17780
+ "learning_rate": 0.0024225,
17781
+ "loss": 3.4385,
17782
+ "memory/device_reserved (GiB)": 56.57,
17783
+ "memory/max_active (GiB)": 56.42,
17784
+ "memory/max_allocated (GiB)": 56.42,
17785
+ "step": 1616,
17786
+ "tokens_per_second_per_gpu": 24598.08
17787
+ },
17788
+ {
17789
+ "epoch": 0.040425,
17790
+ "grad_norm": 0.390625,
17791
+ "learning_rate": 0.0024240000000000004,
17792
+ "loss": 3.4905,
17793
+ "memory/device_reserved (GiB)": 55.57,
17794
+ "memory/max_active (GiB)": 55.42,
17795
+ "memory/max_allocated (GiB)": 55.42,
17796
+ "step": 1617,
17797
+ "tokens_per_second_per_gpu": 24013.93
17798
+ },
17799
+ {
17800
+ "epoch": 0.04045,
17801
+ "grad_norm": 0.44140625,
17802
+ "learning_rate": 0.0024255,
17803
+ "loss": 3.4878,
17804
+ "memory/device_reserved (GiB)": 66.8,
17805
+ "memory/max_active (GiB)": 66.63,
17806
+ "memory/max_allocated (GiB)": 66.63,
17807
+ "step": 1618,
17808
+ "tokens_per_second_per_gpu": 21628.74
17809
+ },
17810
+ {
17811
+ "epoch": 0.040475,
17812
+ "grad_norm": 0.375,
17813
+ "learning_rate": 0.0024270000000000003,
17814
+ "loss": 3.4842,
17815
+ "memory/device_reserved (GiB)": 97.42,
17816
+ "memory/max_active (GiB)": 97.23,
17817
+ "memory/max_allocated (GiB)": 97.23,
17818
+ "step": 1619,
17819
+ "tokens_per_second_per_gpu": 13740.61
17820
+ },
17821
+ {
17822
+ "epoch": 0.0405,
17823
+ "grad_norm": 0.396484375,
17824
+ "learning_rate": 0.0024285,
17825
+ "loss": 3.4546,
17826
+ "memory/device_reserved (GiB)": 127.55,
17827
+ "memory/max_active (GiB)": 127.35,
17828
+ "memory/max_allocated (GiB)": 127.35,
17829
+ "step": 1620,
17830
+ "tokens_per_second_per_gpu": 10505.39
17831
+ },
17832
+ {
17833
+ "epoch": 0.040525,
17834
+ "grad_norm": 0.54296875,
17835
+ "learning_rate": 0.0024300000000000003,
17836
+ "loss": 3.4532,
17837
+ "memory/device_reserved (GiB)": 97.42,
17838
+ "memory/max_active (GiB)": 97.23,
17839
+ "memory/max_allocated (GiB)": 97.23,
17840
+ "step": 1621,
17841
+ "tokens_per_second_per_gpu": 14010.82
17842
+ },
17843
+ {
17844
+ "epoch": 0.04055,
17845
+ "grad_norm": 0.44140625,
17846
+ "learning_rate": 0.0024315,
17847
+ "loss": 3.4622,
17848
+ "memory/device_reserved (GiB)": 86.71,
17849
+ "memory/max_active (GiB)": 86.55,
17850
+ "memory/max_allocated (GiB)": 86.55,
17851
+ "step": 1622,
17852
+ "tokens_per_second_per_gpu": 15600.27
17853
+ },
17854
+ {
17855
+ "epoch": 0.040575,
17856
+ "grad_norm": 0.328125,
17857
+ "learning_rate": 0.0024330000000000003,
17858
+ "loss": 3.4374,
17859
+ "memory/device_reserved (GiB)": 56.63,
17860
+ "memory/max_active (GiB)": 56.42,
17861
+ "memory/max_allocated (GiB)": 56.42,
17862
+ "step": 1623,
17863
+ "tokens_per_second_per_gpu": 23962.38
17864
+ },
17865
+ {
17866
+ "epoch": 0.0406,
17867
+ "grad_norm": 0.486328125,
17868
+ "learning_rate": 0.0024345,
17869
+ "loss": 3.4678,
17870
+ "memory/device_reserved (GiB)": 117.82,
17871
+ "memory/max_active (GiB)": 117.63,
17872
+ "memory/max_allocated (GiB)": 117.63,
17873
+ "step": 1624,
17874
+ "tokens_per_second_per_gpu": 11110.47
17875
+ },
17876
+ {
17877
+ "epoch": 0.040625,
17878
+ "grad_norm": 0.41796875,
17879
+ "learning_rate": 0.0024360000000000002,
17880
+ "loss": 3.433,
17881
+ "memory/device_reserved (GiB)": 46.36,
17882
+ "memory/max_active (GiB)": 46.22,
17883
+ "memory/max_allocated (GiB)": 46.22,
17884
+ "step": 1625,
17885
+ "tokens_per_second_per_gpu": 28676.3
17886
+ },
17887
+ {
17888
+ "epoch": 0.04065,
17889
+ "grad_norm": 0.5,
17890
+ "learning_rate": 0.0024375,
17891
+ "loss": 3.4638,
17892
+ "memory/device_reserved (GiB)": 107.61,
17893
+ "memory/max_active (GiB)": 107.43,
17894
+ "memory/max_allocated (GiB)": 107.43,
17895
+ "step": 1626,
17896
+ "tokens_per_second_per_gpu": 12804.39
17897
+ },
17898
+ {
17899
+ "epoch": 0.040675,
17900
+ "grad_norm": 0.59375,
17901
+ "learning_rate": 0.0024389999999999998,
17902
+ "loss": 3.4699,
17903
+ "memory/device_reserved (GiB)": 45.93,
17904
+ "memory/max_active (GiB)": 45.75,
17905
+ "memory/max_allocated (GiB)": 45.75,
17906
+ "step": 1627,
17907
+ "tokens_per_second_per_gpu": 28573.53
17908
+ },
17909
+ {
17910
+ "epoch": 0.0407,
17911
+ "grad_norm": 0.72265625,
17912
+ "learning_rate": 0.0024405,
17913
+ "loss": 3.4996,
17914
+ "memory/device_reserved (GiB)": 127.96,
17915
+ "memory/max_active (GiB)": 127.83,
17916
+ "memory/max_allocated (GiB)": 127.83,
17917
+ "step": 1628,
17918
+ "tokens_per_second_per_gpu": 10458.89
17919
+ },
17920
+ {
17921
+ "epoch": 0.040725,
17922
+ "grad_norm": 0.71484375,
17923
+ "learning_rate": 0.0024419999999999997,
17924
+ "loss": 3.4523,
17925
+ "memory/device_reserved (GiB)": 56.57,
17926
+ "memory/max_active (GiB)": 56.42,
17927
+ "memory/max_allocated (GiB)": 56.42,
17928
+ "step": 1629,
17929
+ "tokens_per_second_per_gpu": 24753.83
17930
+ },
17931
+ {
17932
+ "epoch": 0.04075,
17933
+ "grad_norm": 0.486328125,
17934
+ "learning_rate": 0.0024435,
17935
+ "loss": 3.502,
17936
+ "memory/device_reserved (GiB)": 107.61,
17937
+ "memory/max_active (GiB)": 107.43,
17938
+ "memory/max_allocated (GiB)": 107.43,
17939
+ "step": 1630,
17940
+ "tokens_per_second_per_gpu": 12038.92
17941
+ },
17942
+ {
17943
+ "epoch": 0.040775,
17944
+ "grad_norm": 0.45703125,
17945
+ "learning_rate": 0.0024449999999999997,
17946
+ "loss": 3.4451,
17947
+ "memory/device_reserved (GiB)": 45.93,
17948
+ "memory/max_active (GiB)": 45.75,
17949
+ "memory/max_allocated (GiB)": 45.75,
17950
+ "step": 1631,
17951
+ "tokens_per_second_per_gpu": 28333.75
17952
+ },
17953
+ {
17954
+ "epoch": 0.0408,
17955
+ "grad_norm": 0.6640625,
17956
+ "learning_rate": 0.0024465,
17957
+ "loss": 3.4622,
17958
+ "memory/device_reserved (GiB)": 127.96,
17959
+ "memory/max_active (GiB)": 127.83,
17960
+ "memory/max_allocated (GiB)": 127.83,
17961
+ "step": 1632,
17962
+ "tokens_per_second_per_gpu": 10859.27
17963
+ },
17964
+ {
17965
+ "epoch": 0.040825,
17966
+ "grad_norm": 0.9609375,
17967
+ "learning_rate": 0.002448,
17968
+ "loss": 3.5624,
17969
+ "memory/device_reserved (GiB)": 87.21,
17970
+ "memory/max_active (GiB)": 87.03,
17971
+ "memory/max_allocated (GiB)": 87.03,
17972
+ "step": 1633,
17973
+ "tokens_per_second_per_gpu": 16038.7
17974
+ },
17975
+ {
17976
+ "epoch": 0.04085,
17977
+ "grad_norm": 0.92578125,
17978
+ "learning_rate": 0.0024495,
17979
+ "loss": 3.5544,
17980
+ "memory/device_reserved (GiB)": 117.34,
17981
+ "memory/max_active (GiB)": 117.15,
17982
+ "memory/max_allocated (GiB)": 117.15,
17983
+ "step": 1634,
17984
+ "tokens_per_second_per_gpu": 11856.2
17985
+ },
17986
+ {
17987
+ "epoch": 0.040875,
17988
+ "grad_norm": 0.83203125,
17989
+ "learning_rate": 0.002451,
17990
+ "loss": 3.5095,
17991
+ "memory/device_reserved (GiB)": 46.39,
17992
+ "memory/max_active (GiB)": 46.22,
17993
+ "memory/max_allocated (GiB)": 46.22,
17994
+ "step": 1635,
17995
+ "tokens_per_second_per_gpu": 26486.63
17996
+ },
17997
+ {
17998
+ "epoch": 0.0409,
17999
+ "grad_norm": 0.53125,
18000
+ "learning_rate": 0.0024525000000000003,
18001
+ "loss": 3.5103,
18002
+ "memory/device_reserved (GiB)": 86.21,
18003
+ "memory/max_active (GiB)": 86.02,
18004
+ "memory/max_allocated (GiB)": 86.02,
18005
+ "step": 1636,
18006
+ "tokens_per_second_per_gpu": 15558.25
18007
+ },
18008
+ {
18009
+ "epoch": 0.040925,
18010
+ "grad_norm": 0.6796875,
18011
+ "learning_rate": 0.002454,
18012
+ "loss": 3.5592,
18013
+ "memory/device_reserved (GiB)": 107.61,
18014
+ "memory/max_active (GiB)": 107.43,
18015
+ "memory/max_allocated (GiB)": 107.43,
18016
+ "step": 1637,
18017
+ "tokens_per_second_per_gpu": 13368.45
18018
+ },
18019
+ {
18020
+ "epoch": 0.04095,
18021
+ "grad_norm": 0.65234375,
18022
+ "learning_rate": 0.0024555000000000002,
18023
+ "loss": 3.5235,
18024
+ "memory/device_reserved (GiB)": 107.12,
18025
+ "memory/max_active (GiB)": 106.95,
18026
+ "memory/max_allocated (GiB)": 106.95,
18027
+ "step": 1638,
18028
+ "tokens_per_second_per_gpu": 12681.15
18029
+ },
18030
+ {
18031
+ "epoch": 0.040975,
18032
+ "grad_norm": 0.6328125,
18033
+ "learning_rate": 0.002457,
18034
+ "loss": 3.518,
18035
+ "memory/device_reserved (GiB)": 117.82,
18036
+ "memory/max_active (GiB)": 117.63,
18037
+ "memory/max_allocated (GiB)": 117.63,
18038
+ "step": 1639,
18039
+ "tokens_per_second_per_gpu": 11755.22
18040
+ },
18041
+ {
18042
+ "epoch": 0.041,
18043
+ "grad_norm": 0.78515625,
18044
+ "learning_rate": 0.0024585,
18045
+ "loss": 3.5068,
18046
+ "memory/device_reserved (GiB)": 56.57,
18047
+ "memory/max_active (GiB)": 56.42,
18048
+ "memory/max_allocated (GiB)": 56.42,
18049
+ "step": 1640,
18050
+ "tokens_per_second_per_gpu": 24181.71
18051
+ },
18052
+ {
18053
+ "epoch": 0.041025,
18054
+ "grad_norm": 0.7578125,
18055
+ "learning_rate": 0.00246,
18056
+ "loss": 3.5537,
18057
+ "memory/device_reserved (GiB)": 64.28,
18058
+ "memory/max_active (GiB)": 64.24,
18059
+ "memory/max_allocated (GiB)": 64.24,
18060
+ "step": 1641,
18061
+ "tokens_per_second_per_gpu": 19813.17
18062
+ },
18063
+ {
18064
+ "epoch": 0.04105,
18065
+ "grad_norm": 0.50390625,
18066
+ "learning_rate": 0.0024615,
18067
+ "loss": 3.4945,
18068
+ "memory/device_reserved (GiB)": 77.01,
18069
+ "memory/max_active (GiB)": 76.83,
18070
+ "memory/max_allocated (GiB)": 76.83,
18071
+ "step": 1642,
18072
+ "tokens_per_second_per_gpu": 17963.63
18073
+ },
18074
+ {
18075
+ "epoch": 0.041075,
18076
+ "grad_norm": 0.5390625,
18077
+ "learning_rate": 0.002463,
18078
+ "loss": 3.49,
18079
+ "memory/device_reserved (GiB)": 77.01,
18080
+ "memory/max_active (GiB)": 76.83,
18081
+ "memory/max_allocated (GiB)": 76.83,
18082
+ "step": 1643,
18083
+ "tokens_per_second_per_gpu": 17191.81
18084
+ },
18085
+ {
18086
+ "epoch": 0.0411,
18087
+ "grad_norm": 0.5234375,
18088
+ "learning_rate": 0.0024645,
18089
+ "loss": 3.4985,
18090
+ "memory/device_reserved (GiB)": 87.21,
18091
+ "memory/max_active (GiB)": 87.03,
18092
+ "memory/max_allocated (GiB)": 87.03,
18093
+ "step": 1644,
18094
+ "tokens_per_second_per_gpu": 15627.01
18095
+ },
18096
+ {
18097
+ "epoch": 0.041125,
18098
+ "grad_norm": 0.458984375,
18099
+ "learning_rate": 0.002466,
18100
+ "loss": 3.4606,
18101
+ "memory/device_reserved (GiB)": 107.61,
18102
+ "memory/max_active (GiB)": 107.43,
18103
+ "memory/max_allocated (GiB)": 107.43,
18104
+ "step": 1645,
18105
+ "tokens_per_second_per_gpu": 13447.72
18106
+ },
18107
+ {
18108
+ "epoch": 0.04115,
18109
+ "grad_norm": 0.384765625,
18110
+ "learning_rate": 0.0024675,
18111
+ "loss": 3.4865,
18112
+ "memory/device_reserved (GiB)": 87.21,
18113
+ "memory/max_active (GiB)": 87.03,
18114
+ "memory/max_allocated (GiB)": 87.03,
18115
+ "step": 1646,
18116
+ "tokens_per_second_per_gpu": 16562.44
18117
+ },
18118
+ {
18119
+ "epoch": 0.041175,
18120
+ "grad_norm": 0.30078125,
18121
+ "learning_rate": 0.002469,
18122
+ "loss": 3.452,
18123
+ "memory/device_reserved (GiB)": 66.8,
18124
+ "memory/max_active (GiB)": 66.63,
18125
+ "memory/max_allocated (GiB)": 66.63,
18126
+ "step": 1647,
18127
+ "tokens_per_second_per_gpu": 19882.36
18128
+ },
18129
+ {
18130
+ "epoch": 0.0412,
18131
+ "grad_norm": 0.3125,
18132
+ "learning_rate": 0.0024705,
18133
+ "loss": 3.482,
18134
+ "memory/device_reserved (GiB)": 77.01,
18135
+ "memory/max_active (GiB)": 76.83,
18136
+ "memory/max_allocated (GiB)": 76.83,
18137
+ "step": 1648,
18138
+ "tokens_per_second_per_gpu": 17699.48
18139
+ },
18140
+ {
18141
+ "epoch": 0.041225,
18142
+ "grad_norm": 0.275390625,
18143
+ "learning_rate": 0.002472,
18144
+ "loss": 3.4162,
18145
+ "memory/device_reserved (GiB)": 66.8,
18146
+ "memory/max_active (GiB)": 66.63,
18147
+ "memory/max_allocated (GiB)": 66.63,
18148
+ "step": 1649,
18149
+ "tokens_per_second_per_gpu": 20101.43
18150
+ },
18151
+ {
18152
+ "epoch": 0.04125,
18153
+ "grad_norm": 0.26953125,
18154
+ "learning_rate": 0.0024735,
18155
+ "loss": 3.4138,
18156
+ "memory/device_reserved (GiB)": 46.39,
18157
+ "memory/max_active (GiB)": 46.22,
18158
+ "memory/max_allocated (GiB)": 46.22,
18159
+ "step": 1650,
18160
+ "tokens_per_second_per_gpu": 27146.19
18161
+ },
18162
+ {
18163
+ "epoch": 0.041275,
18164
+ "grad_norm": 0.234375,
18165
+ "learning_rate": 0.0024749999999999998,
18166
+ "loss": 3.4321,
18167
+ "memory/device_reserved (GiB)": 66.8,
18168
+ "memory/max_active (GiB)": 66.63,
18169
+ "memory/max_allocated (GiB)": 66.63,
18170
+ "step": 1651,
18171
+ "tokens_per_second_per_gpu": 19709.27
18172
+ },
18173
+ {
18174
+ "epoch": 0.0413,
18175
+ "grad_norm": 0.259765625,
18176
+ "learning_rate": 0.0024765,
18177
+ "loss": 3.477,
18178
+ "memory/device_reserved (GiB)": 46.36,
18179
+ "memory/max_active (GiB)": 46.22,
18180
+ "memory/max_allocated (GiB)": 46.22,
18181
+ "step": 1652,
18182
+ "tokens_per_second_per_gpu": 31305.36
18183
+ },
18184
+ {
18185
+ "epoch": 0.041325,
18186
+ "grad_norm": 0.2333984375,
18187
+ "learning_rate": 0.0024779999999999997,
18188
+ "loss": 3.41,
18189
+ "memory/device_reserved (GiB)": 56.14,
18190
+ "memory/max_active (GiB)": 55.95,
18191
+ "memory/max_allocated (GiB)": 55.95,
18192
+ "step": 1653,
18193
+ "tokens_per_second_per_gpu": 23790.99
18194
+ },
18195
+ {
18196
+ "epoch": 0.04135,
18197
+ "grad_norm": 0.294921875,
18198
+ "learning_rate": 0.0024795,
18199
+ "loss": 3.438,
18200
+ "memory/device_reserved (GiB)": 97.42,
18201
+ "memory/max_active (GiB)": 97.23,
18202
+ "memory/max_allocated (GiB)": 97.23,
18203
+ "step": 1654,
18204
+ "tokens_per_second_per_gpu": 13756.67
18205
+ },
18206
+ {
18207
+ "epoch": 0.041375,
18208
+ "grad_norm": 0.361328125,
18209
+ "learning_rate": 0.002481,
18210
+ "loss": 3.4131,
18211
+ "memory/device_reserved (GiB)": 117.82,
18212
+ "memory/max_active (GiB)": 117.63,
18213
+ "memory/max_allocated (GiB)": 117.63,
18214
+ "step": 1655,
18215
+ "tokens_per_second_per_gpu": 12021.3
18216
+ },
18217
+ {
18218
+ "epoch": 0.0414,
18219
+ "grad_norm": 0.314453125,
18220
+ "learning_rate": 0.0024825,
18221
+ "loss": 3.452,
18222
+ "memory/device_reserved (GiB)": 76.49,
18223
+ "memory/max_active (GiB)": 76.35,
18224
+ "memory/max_allocated (GiB)": 76.35,
18225
+ "step": 1656,
18226
+ "tokens_per_second_per_gpu": 17313.17
18227
+ },
18228
+ {
18229
+ "epoch": 0.041425,
18230
+ "grad_norm": 0.2734375,
18231
+ "learning_rate": 0.002484,
18232
+ "loss": 3.4499,
18233
+ "memory/device_reserved (GiB)": 75.99,
18234
+ "memory/max_active (GiB)": 75.82,
18235
+ "memory/max_allocated (GiB)": 75.82,
18236
+ "step": 1657,
18237
+ "tokens_per_second_per_gpu": 18143.79
18238
+ },
18239
+ {
18240
+ "epoch": 0.04145,
18241
+ "grad_norm": 0.3203125,
18242
+ "learning_rate": 0.0024855000000000003,
18243
+ "loss": 3.4232,
18244
+ "memory/device_reserved (GiB)": 56.63,
18245
+ "memory/max_active (GiB)": 56.42,
18246
+ "memory/max_allocated (GiB)": 56.42,
18247
+ "step": 1658,
18248
+ "tokens_per_second_per_gpu": 22650.06
18249
+ },
18250
+ {
18251
+ "epoch": 0.041475,
18252
+ "grad_norm": 0.294921875,
18253
+ "learning_rate": 0.002487,
18254
+ "loss": 3.4367,
18255
+ "memory/device_reserved (GiB)": 66.8,
18256
+ "memory/max_active (GiB)": 66.63,
18257
+ "memory/max_allocated (GiB)": 66.63,
18258
+ "step": 1659,
18259
+ "tokens_per_second_per_gpu": 19148.84
18260
+ },
18261
+ {
18262
+ "epoch": 0.0415,
18263
+ "grad_norm": 0.337890625,
18264
+ "learning_rate": 0.0024885000000000003,
18265
+ "loss": 3.4237,
18266
+ "memory/device_reserved (GiB)": 127.96,
18267
+ "memory/max_active (GiB)": 127.83,
18268
+ "memory/max_allocated (GiB)": 127.83,
18269
+ "step": 1660,
18270
+ "tokens_per_second_per_gpu": 11299.52
18271
+ },
18272
+ {
18273
+ "epoch": 0.041525,
18274
+ "grad_norm": 0.376953125,
18275
+ "learning_rate": 0.00249,
18276
+ "loss": 3.4072,
18277
+ "memory/device_reserved (GiB)": 127.96,
18278
+ "memory/max_active (GiB)": 127.83,
18279
+ "memory/max_allocated (GiB)": 127.83,
18280
+ "step": 1661,
18281
+ "tokens_per_second_per_gpu": 10692.7
18282
+ },
18283
+ {
18284
+ "epoch": 0.04155,
18285
+ "grad_norm": 0.392578125,
18286
+ "learning_rate": 0.0024915000000000002,
18287
+ "loss": 3.4118,
18288
+ "memory/device_reserved (GiB)": 97.42,
18289
+ "memory/max_active (GiB)": 97.23,
18290
+ "memory/max_allocated (GiB)": 97.23,
18291
+ "step": 1662,
18292
+ "tokens_per_second_per_gpu": 14572.68
18293
+ },
18294
+ {
18295
+ "epoch": 0.041575,
18296
+ "grad_norm": 0.32421875,
18297
+ "learning_rate": 0.002493,
18298
+ "loss": 3.3948,
18299
+ "memory/device_reserved (GiB)": 66.8,
18300
+ "memory/max_active (GiB)": 66.63,
18301
+ "memory/max_allocated (GiB)": 66.63,
18302
+ "step": 1663,
18303
+ "tokens_per_second_per_gpu": 20532.19
18304
+ },
18305
+ {
18306
+ "epoch": 0.0416,
18307
+ "grad_norm": 0.28125,
18308
+ "learning_rate": 0.0024945,
18309
+ "loss": 3.373,
18310
+ "memory/device_reserved (GiB)": 77.01,
18311
+ "memory/max_active (GiB)": 76.83,
18312
+ "memory/max_allocated (GiB)": 76.83,
18313
+ "step": 1664,
18314
+ "tokens_per_second_per_gpu": 16522.28
18315
+ },
18316
+ {
18317
+ "epoch": 0.041625,
18318
+ "grad_norm": 0.306640625,
18319
+ "learning_rate": 0.002496,
18320
+ "loss": 3.4468,
18321
+ "memory/device_reserved (GiB)": 56.14,
18322
+ "memory/max_active (GiB)": 55.95,
18323
+ "memory/max_allocated (GiB)": 55.95,
18324
+ "step": 1665,
18325
+ "tokens_per_second_per_gpu": 23946.77
18326
+ },
18327
+ {
18328
+ "epoch": 0.04165,
18329
+ "grad_norm": 0.296875,
18330
+ "learning_rate": 0.0024975,
18331
+ "loss": 3.4283,
18332
+ "memory/device_reserved (GiB)": 66.8,
18333
+ "memory/max_active (GiB)": 66.63,
18334
+ "memory/max_allocated (GiB)": 66.63,
18335
+ "step": 1666,
18336
+ "tokens_per_second_per_gpu": 19907.08
18337
+ },
18338
+ {
18339
+ "epoch": 0.041675,
18340
+ "grad_norm": 0.28125,
18341
+ "learning_rate": 0.002499,
18342
+ "loss": 3.4222,
18343
+ "memory/device_reserved (GiB)": 107.61,
18344
+ "memory/max_active (GiB)": 107.43,
18345
+ "memory/max_allocated (GiB)": 107.43,
18346
+ "step": 1667,
18347
+ "tokens_per_second_per_gpu": 12874.36
18348
+ },
18349
+ {
18350
+ "epoch": 0.0417,
18351
+ "grad_norm": 0.3671875,
18352
+ "learning_rate": 0.0025005,
18353
+ "loss": 3.4831,
18354
+ "memory/device_reserved (GiB)": 66.8,
18355
+ "memory/max_active (GiB)": 66.63,
18356
+ "memory/max_allocated (GiB)": 66.63,
18357
+ "step": 1668,
18358
+ "tokens_per_second_per_gpu": 20630.17
18359
+ },
18360
+ {
18361
+ "epoch": 0.041725,
18362
+ "grad_norm": 0.44140625,
18363
+ "learning_rate": 0.002502,
18364
+ "loss": 3.4484,
18365
+ "memory/device_reserved (GiB)": 66.8,
18366
+ "memory/max_active (GiB)": 66.63,
18367
+ "memory/max_allocated (GiB)": 66.63,
18368
+ "step": 1669,
18369
+ "tokens_per_second_per_gpu": 20443.85
18370
+ },
18371
+ {
18372
+ "epoch": 0.04175,
18373
+ "grad_norm": 0.4921875,
18374
+ "learning_rate": 0.0025035,
18375
+ "loss": 3.411,
18376
+ "memory/device_reserved (GiB)": 127.96,
18377
+ "memory/max_active (GiB)": 127.83,
18378
+ "memory/max_allocated (GiB)": 127.83,
18379
+ "step": 1670,
18380
+ "tokens_per_second_per_gpu": 10552.13
18381
+ },
18382
+ {
18383
+ "epoch": 0.041775,
18384
+ "grad_norm": 0.4609375,
18385
+ "learning_rate": 0.002505,
18386
+ "loss": 3.4083,
18387
+ "memory/device_reserved (GiB)": 107.61,
18388
+ "memory/max_active (GiB)": 107.43,
18389
+ "memory/max_allocated (GiB)": 107.43,
18390
+ "step": 1671,
18391
+ "tokens_per_second_per_gpu": 12234.78
18392
+ },
18393
+ {
18394
+ "epoch": 0.0418,
18395
+ "grad_norm": 0.40234375,
18396
+ "learning_rate": 0.0025065,
18397
+ "loss": 3.4449,
18398
+ "memory/device_reserved (GiB)": 55.57,
18399
+ "memory/max_active (GiB)": 55.42,
18400
+ "memory/max_allocated (GiB)": 55.42,
18401
+ "step": 1672,
18402
+ "tokens_per_second_per_gpu": 23493.66
18403
+ },
18404
+ {
18405
+ "epoch": 0.041825,
18406
+ "grad_norm": 0.326171875,
18407
+ "learning_rate": 0.002508,
18408
+ "loss": 3.422,
18409
+ "memory/device_reserved (GiB)": 97.42,
18410
+ "memory/max_active (GiB)": 97.23,
18411
+ "memory/max_allocated (GiB)": 97.23,
18412
+ "step": 1673,
18413
+ "tokens_per_second_per_gpu": 13667.7
18414
+ },
18415
+ {
18416
+ "epoch": 0.04185,
18417
+ "grad_norm": 0.474609375,
18418
+ "learning_rate": 0.0025095,
18419
+ "loss": 3.4849,
18420
+ "memory/device_reserved (GiB)": 107.61,
18421
+ "memory/max_active (GiB)": 107.43,
18422
+ "memory/max_allocated (GiB)": 107.43,
18423
+ "step": 1674,
18424
+ "tokens_per_second_per_gpu": 13409.66
18425
+ },
18426
+ {
18427
+ "epoch": 0.041875,
18428
+ "grad_norm": 0.51953125,
18429
+ "learning_rate": 0.0025109999999999998,
18430
+ "loss": 3.4117,
18431
+ "memory/device_reserved (GiB)": 107.61,
18432
+ "memory/max_active (GiB)": 107.43,
18433
+ "memory/max_allocated (GiB)": 107.43,
18434
+ "step": 1675,
18435
+ "tokens_per_second_per_gpu": 12861.87
18436
+ },
18437
+ {
18438
+ "epoch": 0.0419,
18439
+ "grad_norm": 0.5859375,
18440
+ "learning_rate": 0.0025125,
18441
+ "loss": 3.4487,
18442
+ "memory/device_reserved (GiB)": 107.61,
18443
+ "memory/max_active (GiB)": 107.43,
18444
+ "memory/max_allocated (GiB)": 107.43,
18445
+ "step": 1676,
18446
+ "tokens_per_second_per_gpu": 13391.9
18447
+ },
18448
+ {
18449
+ "epoch": 0.041925,
18450
+ "grad_norm": 0.45703125,
18451
+ "learning_rate": 0.0025139999999999997,
18452
+ "loss": 3.4691,
18453
+ "memory/device_reserved (GiB)": 97.42,
18454
+ "memory/max_active (GiB)": 97.23,
18455
+ "memory/max_allocated (GiB)": 97.23,
18456
+ "step": 1677,
18457
+ "tokens_per_second_per_gpu": 13697.51
18458
+ },
18459
+ {
18460
+ "epoch": 0.04195,
18461
+ "grad_norm": 0.458984375,
18462
+ "learning_rate": 0.0025155,
18463
+ "loss": 3.4354,
18464
+ "memory/device_reserved (GiB)": 117.34,
18465
+ "memory/max_active (GiB)": 117.15,
18466
+ "memory/max_allocated (GiB)": 117.15,
18467
+ "step": 1678,
18468
+ "tokens_per_second_per_gpu": 11489.94
18469
+ },
18470
+ {
18471
+ "epoch": 0.041975,
18472
+ "grad_norm": 0.41796875,
18473
+ "learning_rate": 0.002517,
18474
+ "loss": 3.4734,
18475
+ "memory/device_reserved (GiB)": 127.96,
18476
+ "memory/max_active (GiB)": 127.83,
18477
+ "memory/max_allocated (GiB)": 127.83,
18478
+ "step": 1679,
18479
+ "tokens_per_second_per_gpu": 10921.96
18480
+ },
18481
+ {
18482
+ "epoch": 0.042,
18483
+ "grad_norm": 0.376953125,
18484
+ "learning_rate": 0.0025185000000000003,
18485
+ "loss": 3.4138,
18486
+ "memory/device_reserved (GiB)": 97.42,
18487
+ "memory/max_active (GiB)": 97.23,
18488
+ "memory/max_allocated (GiB)": 97.23,
18489
+ "step": 1680,
18490
+ "tokens_per_second_per_gpu": 14813.73
18491
+ },
18492
+ {
18493
+ "epoch": 0.042025,
18494
+ "grad_norm": 0.357421875,
18495
+ "learning_rate": 0.00252,
18496
+ "loss": 3.395,
18497
+ "memory/device_reserved (GiB)": 56.57,
18498
+ "memory/max_active (GiB)": 56.42,
18499
+ "memory/max_allocated (GiB)": 56.42,
18500
+ "step": 1681,
18501
+ "tokens_per_second_per_gpu": 22947.66
18502
+ },
18503
+ {
18504
+ "epoch": 0.04205,
18505
+ "grad_norm": 0.375,
18506
+ "learning_rate": 0.0025215000000000003,
18507
+ "loss": 3.4281,
18508
+ "memory/device_reserved (GiB)": 117.82,
18509
+ "memory/max_active (GiB)": 117.63,
18510
+ "memory/max_allocated (GiB)": 117.63,
18511
+ "step": 1682,
18512
+ "tokens_per_second_per_gpu": 11994.21
18513
+ },
18514
+ {
18515
+ "epoch": 0.042075,
18516
+ "grad_norm": 0.37109375,
18517
+ "learning_rate": 0.002523,
18518
+ "loss": 3.4589,
18519
+ "memory/device_reserved (GiB)": 86.7,
18520
+ "memory/max_active (GiB)": 86.55,
18521
+ "memory/max_allocated (GiB)": 86.55,
18522
+ "step": 1683,
18523
+ "tokens_per_second_per_gpu": 15257.0
18524
+ },
18525
+ {
18526
+ "epoch": 0.0421,
18527
+ "grad_norm": 0.34375,
18528
+ "learning_rate": 0.0025245000000000003,
18529
+ "loss": 3.4429,
18530
+ "memory/device_reserved (GiB)": 44.36,
18531
+ "memory/max_active (GiB)": 44.31,
18532
+ "memory/max_allocated (GiB)": 44.31,
18533
+ "step": 1684,
18534
+ "tokens_per_second_per_gpu": 27548.71
18535
+ },
18536
+ {
18537
+ "epoch": 0.042125,
18538
+ "grad_norm": 0.4140625,
18539
+ "learning_rate": 0.002526,
18540
+ "loss": 3.4058,
18541
+ "memory/device_reserved (GiB)": 76.49,
18542
+ "memory/max_active (GiB)": 76.35,
18543
+ "memory/max_allocated (GiB)": 76.35,
18544
+ "step": 1685,
18545
+ "tokens_per_second_per_gpu": 17574.15
18546
+ },
18547
+ {
18548
+ "epoch": 0.04215,
18549
+ "grad_norm": 0.3984375,
18550
+ "learning_rate": 0.0025275000000000002,
18551
+ "loss": 3.4312,
18552
+ "memory/device_reserved (GiB)": 74.93,
18553
+ "memory/max_active (GiB)": 74.91,
18554
+ "memory/max_allocated (GiB)": 74.91,
18555
+ "step": 1686,
18556
+ "tokens_per_second_per_gpu": 18549.39
18557
+ },
18558
+ {
18559
+ "epoch": 0.042175,
18560
+ "grad_norm": 0.294921875,
18561
+ "learning_rate": 0.002529,
18562
+ "loss": 3.4163,
18563
+ "memory/device_reserved (GiB)": 87.21,
18564
+ "memory/max_active (GiB)": 87.03,
18565
+ "memory/max_allocated (GiB)": 87.03,
18566
+ "step": 1687,
18567
+ "tokens_per_second_per_gpu": 15868.69
18568
+ },
18569
+ {
18570
+ "epoch": 0.0422,
18571
+ "grad_norm": 0.416015625,
18572
+ "learning_rate": 0.0025305,
18573
+ "loss": 3.4385,
18574
+ "memory/device_reserved (GiB)": 87.21,
18575
+ "memory/max_active (GiB)": 87.03,
18576
+ "memory/max_allocated (GiB)": 87.03,
18577
+ "step": 1688,
18578
+ "tokens_per_second_per_gpu": 16224.91
18579
+ },
18580
+ {
18581
+ "epoch": 0.042225,
18582
+ "grad_norm": 0.41796875,
18583
+ "learning_rate": 0.002532,
18584
+ "loss": 3.4211,
18585
+ "memory/device_reserved (GiB)": 87.21,
18586
+ "memory/max_active (GiB)": 87.03,
18587
+ "memory/max_allocated (GiB)": 87.03,
18588
+ "step": 1689,
18589
+ "tokens_per_second_per_gpu": 14969.11
18590
+ },
18591
+ {
18592
+ "epoch": 0.04225,
18593
+ "grad_norm": 0.3203125,
18594
+ "learning_rate": 0.0025335,
18595
+ "loss": 3.3951,
18596
+ "memory/device_reserved (GiB)": 76.49,
18597
+ "memory/max_active (GiB)": 76.35,
18598
+ "memory/max_allocated (GiB)": 76.35,
18599
+ "step": 1690,
18600
+ "tokens_per_second_per_gpu": 17850.22
18601
+ },
18602
+ {
18603
+ "epoch": 0.042275,
18604
+ "grad_norm": 0.41015625,
18605
+ "learning_rate": 0.002535,
18606
+ "loss": 3.4191,
18607
+ "memory/device_reserved (GiB)": 87.21,
18608
+ "memory/max_active (GiB)": 87.03,
18609
+ "memory/max_allocated (GiB)": 87.03,
18610
+ "step": 1691,
18611
+ "tokens_per_second_per_gpu": 15729.04
18612
+ },
18613
+ {
18614
+ "epoch": 0.0423,
18615
+ "grad_norm": 0.439453125,
18616
+ "learning_rate": 0.0025365,
18617
+ "loss": 3.4192,
18618
+ "memory/device_reserved (GiB)": 86.7,
18619
+ "memory/max_active (GiB)": 86.55,
18620
+ "memory/max_allocated (GiB)": 86.55,
18621
+ "step": 1692,
18622
+ "tokens_per_second_per_gpu": 15578.61
18623
+ },
18624
+ {
18625
+ "epoch": 0.042325,
18626
+ "grad_norm": 0.326171875,
18627
+ "learning_rate": 0.002538,
18628
+ "loss": 3.4293,
18629
+ "memory/device_reserved (GiB)": 56.57,
18630
+ "memory/max_active (GiB)": 56.42,
18631
+ "memory/max_allocated (GiB)": 56.42,
18632
+ "step": 1693,
18633
+ "tokens_per_second_per_gpu": 21717.81
18634
+ },
18635
+ {
18636
+ "epoch": 0.04235,
18637
+ "grad_norm": 0.294921875,
18638
+ "learning_rate": 0.0025395,
18639
+ "loss": 3.4162,
18640
+ "memory/device_reserved (GiB)": 56.57,
18641
+ "memory/max_active (GiB)": 56.42,
18642
+ "memory/max_allocated (GiB)": 56.42,
18643
+ "step": 1694,
18644
+ "tokens_per_second_per_gpu": 23759.26
18645
+ },
18646
+ {
18647
+ "epoch": 0.042375,
18648
+ "grad_norm": 0.376953125,
18649
+ "learning_rate": 0.002541,
18650
+ "loss": 3.4058,
18651
+ "memory/device_reserved (GiB)": 56.63,
18652
+ "memory/max_active (GiB)": 56.42,
18653
+ "memory/max_allocated (GiB)": 56.42,
18654
+ "step": 1695,
18655
+ "tokens_per_second_per_gpu": 22957.98
18656
+ },
18657
+ {
18658
+ "epoch": 0.0424,
18659
+ "grad_norm": 0.361328125,
18660
+ "learning_rate": 0.0025425,
18661
+ "loss": 3.4236,
18662
+ "memory/device_reserved (GiB)": 85.82,
18663
+ "memory/max_active (GiB)": 85.68,
18664
+ "memory/max_allocated (GiB)": 85.68,
18665
+ "step": 1696,
18666
+ "tokens_per_second_per_gpu": 15174.02
18667
+ },
18668
+ {
18669
+ "epoch": 0.042425,
18670
+ "grad_norm": 0.353515625,
18671
+ "learning_rate": 0.002544,
18672
+ "loss": 3.3949,
18673
+ "memory/device_reserved (GiB)": 77.01,
18674
+ "memory/max_active (GiB)": 76.83,
18675
+ "memory/max_allocated (GiB)": 76.83,
18676
+ "step": 1697,
18677
+ "tokens_per_second_per_gpu": 17562.99
18678
+ },
18679
+ {
18680
+ "epoch": 0.04245,
18681
+ "grad_norm": 0.3671875,
18682
+ "learning_rate": 0.0025455,
18683
+ "loss": 3.4254,
18684
+ "memory/device_reserved (GiB)": 56.57,
18685
+ "memory/max_active (GiB)": 56.42,
18686
+ "memory/max_allocated (GiB)": 56.42,
18687
+ "step": 1698,
18688
+ "tokens_per_second_per_gpu": 22963.27
18689
+ },
18690
+ {
18691
+ "epoch": 0.042475,
18692
+ "grad_norm": 0.3671875,
18693
+ "learning_rate": 0.002547,
18694
+ "loss": 3.4004,
18695
+ "memory/device_reserved (GiB)": 117.82,
18696
+ "memory/max_active (GiB)": 117.63,
18697
+ "memory/max_allocated (GiB)": 117.63,
18698
+ "step": 1699,
18699
+ "tokens_per_second_per_gpu": 11758.64
18700
+ },
18701
+ {
18702
+ "epoch": 0.0425,
18703
+ "grad_norm": 0.37109375,
18704
+ "learning_rate": 0.0025485,
18705
+ "loss": 3.4032,
18706
+ "memory/device_reserved (GiB)": 56.14,
18707
+ "memory/max_active (GiB)": 55.95,
18708
+ "memory/max_allocated (GiB)": 55.95,
18709
+ "step": 1700,
18710
+ "tokens_per_second_per_gpu": 23581.93
18711
+ },
18712
+ {
18713
+ "epoch": 0.042525,
18714
+ "grad_norm": 0.44921875,
18715
+ "learning_rate": 0.00255,
18716
+ "loss": 3.4364,
18717
+ "memory/device_reserved (GiB)": 127.55,
18718
+ "memory/max_active (GiB)": 127.35,
18719
+ "memory/max_allocated (GiB)": 127.35,
18720
+ "step": 1701,
18721
+ "tokens_per_second_per_gpu": 10589.65
18722
+ },
18723
+ {
18724
+ "epoch": 0.04255,
18725
+ "grad_norm": 0.4921875,
18726
+ "learning_rate": 0.0025515,
18727
+ "loss": 3.4531,
18728
+ "memory/device_reserved (GiB)": 96.93,
18729
+ "memory/max_active (GiB)": 96.75,
18730
+ "memory/max_allocated (GiB)": 96.75,
18731
+ "step": 1702,
18732
+ "tokens_per_second_per_gpu": 13398.45
18733
+ },
18734
+ {
18735
+ "epoch": 0.042575,
18736
+ "grad_norm": 0.5546875,
18737
+ "learning_rate": 0.002553,
18738
+ "loss": 3.4716,
18739
+ "memory/device_reserved (GiB)": 77.01,
18740
+ "memory/max_active (GiB)": 76.83,
18741
+ "memory/max_allocated (GiB)": 76.83,
18742
+ "step": 1703,
18743
+ "tokens_per_second_per_gpu": 17913.65
18744
+ },
18745
+ {
18746
+ "epoch": 0.0426,
18747
+ "grad_norm": 0.486328125,
18748
+ "learning_rate": 0.0025545000000000003,
18749
+ "loss": 3.4539,
18750
+ "memory/device_reserved (GiB)": 96.93,
18751
+ "memory/max_active (GiB)": 96.75,
18752
+ "memory/max_allocated (GiB)": 96.75,
18753
+ "step": 1704,
18754
+ "tokens_per_second_per_gpu": 13482.41
18755
+ },
18756
+ {
18757
+ "epoch": 0.042625,
18758
+ "grad_norm": 0.458984375,
18759
+ "learning_rate": 0.002556,
18760
+ "loss": 3.4376,
18761
+ "memory/device_reserved (GiB)": 64.72,
18762
+ "memory/max_active (GiB)": 64.71,
18763
+ "memory/max_allocated (GiB)": 64.71,
18764
+ "step": 1705,
18765
+ "tokens_per_second_per_gpu": 20009.22
18766
+ },
18767
+ {
18768
+ "epoch": 0.04265,
18769
+ "grad_norm": 0.458984375,
18770
+ "learning_rate": 0.0025575000000000003,
18771
+ "loss": 3.4079,
18772
+ "memory/device_reserved (GiB)": 56.63,
18773
+ "memory/max_active (GiB)": 56.42,
18774
+ "memory/max_allocated (GiB)": 56.42,
18775
+ "step": 1706,
18776
+ "tokens_per_second_per_gpu": 22731.28
18777
+ },
18778
+ {
18779
+ "epoch": 0.042675,
18780
+ "grad_norm": 0.38671875,
18781
+ "learning_rate": 0.002559,
18782
+ "loss": 3.4195,
18783
+ "memory/device_reserved (GiB)": 66.8,
18784
+ "memory/max_active (GiB)": 66.63,
18785
+ "memory/max_allocated (GiB)": 66.63,
18786
+ "step": 1707,
18787
+ "tokens_per_second_per_gpu": 18695.79
18788
+ },
18789
+ {
18790
+ "epoch": 0.0427,
18791
+ "grad_norm": 0.39453125,
18792
+ "learning_rate": 0.0025605000000000003,
18793
+ "loss": 3.3978,
18794
+ "memory/device_reserved (GiB)": 127.55,
18795
+ "memory/max_active (GiB)": 127.35,
18796
+ "memory/max_allocated (GiB)": 127.35,
18797
+ "step": 1708,
18798
+ "tokens_per_second_per_gpu": 10495.74
18799
+ },
18800
+ {
18801
+ "epoch": 0.042725,
18802
+ "grad_norm": 0.416015625,
18803
+ "learning_rate": 0.002562,
18804
+ "loss": 3.3919,
18805
+ "memory/device_reserved (GiB)": 87.21,
18806
+ "memory/max_active (GiB)": 87.03,
18807
+ "memory/max_allocated (GiB)": 87.03,
18808
+ "step": 1709,
18809
+ "tokens_per_second_per_gpu": 15384.14
18810
+ },
18811
+ {
18812
+ "epoch": 0.04275,
18813
+ "grad_norm": 0.5078125,
18814
+ "learning_rate": 0.0025635000000000002,
18815
+ "loss": 3.4465,
18816
+ "memory/device_reserved (GiB)": 66.8,
18817
+ "memory/max_active (GiB)": 66.63,
18818
+ "memory/max_allocated (GiB)": 66.63,
18819
+ "step": 1710,
18820
+ "tokens_per_second_per_gpu": 21356.58
18821
+ },
18822
+ {
18823
+ "epoch": 0.042775,
18824
+ "grad_norm": 0.423828125,
18825
+ "learning_rate": 0.002565,
18826
+ "loss": 3.408,
18827
+ "memory/device_reserved (GiB)": 97.42,
18828
+ "memory/max_active (GiB)": 97.23,
18829
+ "memory/max_allocated (GiB)": 97.23,
18830
+ "step": 1711,
18831
+ "tokens_per_second_per_gpu": 13459.02
18832
+ },
18833
+ {
18834
+ "epoch": 0.0428,
18835
+ "grad_norm": 0.421875,
18836
+ "learning_rate": 0.0025665,
18837
+ "loss": 3.4107,
18838
+ "memory/device_reserved (GiB)": 87.21,
18839
+ "memory/max_active (GiB)": 87.03,
18840
+ "memory/max_allocated (GiB)": 87.03,
18841
+ "step": 1712,
18842
+ "tokens_per_second_per_gpu": 15576.53
18843
+ },
18844
+ {
18845
+ "epoch": 0.042825,
18846
+ "grad_norm": 0.466796875,
18847
+ "learning_rate": 0.002568,
18848
+ "loss": 3.4465,
18849
+ "memory/device_reserved (GiB)": 56.57,
18850
+ "memory/max_active (GiB)": 56.42,
18851
+ "memory/max_allocated (GiB)": 56.42,
18852
+ "step": 1713,
18853
+ "tokens_per_second_per_gpu": 24181.26
18854
+ },
18855
+ {
18856
+ "epoch": 0.04285,
18857
+ "grad_norm": 0.55859375,
18858
+ "learning_rate": 0.0025695,
18859
+ "loss": 3.4621,
18860
+ "memory/device_reserved (GiB)": 46.39,
18861
+ "memory/max_active (GiB)": 46.22,
18862
+ "memory/max_allocated (GiB)": 46.22,
18863
+ "step": 1714,
18864
+ "tokens_per_second_per_gpu": 27803.98
18865
+ },
18866
+ {
18867
+ "epoch": 0.042875,
18868
+ "grad_norm": 0.57421875,
18869
+ "learning_rate": 0.002571,
18870
+ "loss": 3.4206,
18871
+ "memory/device_reserved (GiB)": 56.57,
18872
+ "memory/max_active (GiB)": 56.42,
18873
+ "memory/max_allocated (GiB)": 56.42,
18874
+ "step": 1715,
18875
+ "tokens_per_second_per_gpu": 24012.74
18876
+ },
18877
+ {
18878
+ "epoch": 0.0429,
18879
+ "grad_norm": 0.490234375,
18880
+ "learning_rate": 0.0025725,
18881
+ "loss": 3.4463,
18882
+ "memory/device_reserved (GiB)": 107.12,
18883
+ "memory/max_active (GiB)": 106.95,
18884
+ "memory/max_allocated (GiB)": 106.95,
18885
+ "step": 1716,
18886
+ "tokens_per_second_per_gpu": 12538.54
18887
+ },
18888
+ {
18889
+ "epoch": 0.042925,
18890
+ "grad_norm": 0.412109375,
18891
+ "learning_rate": 0.002574,
18892
+ "loss": 3.4565,
18893
+ "memory/device_reserved (GiB)": 66.8,
18894
+ "memory/max_active (GiB)": 66.63,
18895
+ "memory/max_allocated (GiB)": 66.63,
18896
+ "step": 1717,
18897
+ "tokens_per_second_per_gpu": 19965.96
18898
+ },
18899
+ {
18900
+ "epoch": 0.04295,
18901
+ "grad_norm": 0.392578125,
18902
+ "learning_rate": 0.0025755,
18903
+ "loss": 3.4361,
18904
+ "memory/device_reserved (GiB)": 107.61,
18905
+ "memory/max_active (GiB)": 107.43,
18906
+ "memory/max_allocated (GiB)": 107.43,
18907
+ "step": 1718,
18908
+ "tokens_per_second_per_gpu": 12689.62
18909
+ },
18910
+ {
18911
+ "epoch": 0.042975,
18912
+ "grad_norm": 0.38671875,
18913
+ "learning_rate": 0.002577,
18914
+ "loss": 3.4358,
18915
+ "memory/device_reserved (GiB)": 87.21,
18916
+ "memory/max_active (GiB)": 87.03,
18917
+ "memory/max_allocated (GiB)": 87.03,
18918
+ "step": 1719,
18919
+ "tokens_per_second_per_gpu": 15987.55
18920
+ },
18921
+ {
18922
+ "epoch": 0.043,
18923
+ "grad_norm": 0.431640625,
18924
+ "learning_rate": 0.0025785,
18925
+ "loss": 3.4481,
18926
+ "memory/device_reserved (GiB)": 56.57,
18927
+ "memory/max_active (GiB)": 56.42,
18928
+ "memory/max_allocated (GiB)": 56.42,
18929
+ "step": 1720,
18930
+ "tokens_per_second_per_gpu": 23525.6
18931
+ },
18932
+ {
18933
+ "epoch": 0.043025,
18934
+ "grad_norm": 0.4453125,
18935
+ "learning_rate": 0.00258,
18936
+ "loss": 3.4401,
18937
+ "memory/device_reserved (GiB)": 127.55,
18938
+ "memory/max_active (GiB)": 127.35,
18939
+ "memory/max_allocated (GiB)": 127.35,
18940
+ "step": 1721,
18941
+ "tokens_per_second_per_gpu": 10768.58
18942
+ },
18943
+ {
18944
+ "epoch": 0.04305,
18945
+ "grad_norm": 0.44921875,
18946
+ "learning_rate": 0.0025815,
18947
+ "loss": 3.4029,
18948
+ "memory/device_reserved (GiB)": 97.42,
18949
+ "memory/max_active (GiB)": 97.23,
18950
+ "memory/max_allocated (GiB)": 97.23,
18951
+ "step": 1722,
18952
+ "tokens_per_second_per_gpu": 14518.37
18953
+ },
18954
+ {
18955
+ "epoch": 0.043075,
18956
+ "grad_norm": 0.384765625,
18957
+ "learning_rate": 0.0025830000000000002,
18958
+ "loss": 3.3824,
18959
+ "memory/device_reserved (GiB)": 66.8,
18960
+ "memory/max_active (GiB)": 66.63,
18961
+ "memory/max_allocated (GiB)": 66.63,
18962
+ "step": 1723,
18963
+ "tokens_per_second_per_gpu": 19566.23
18964
+ },
18965
+ {
18966
+ "epoch": 0.0431,
18967
+ "grad_norm": 0.27734375,
18968
+ "learning_rate": 0.0025845,
18969
+ "loss": 3.4756,
18970
+ "memory/device_reserved (GiB)": 96.93,
18971
+ "memory/max_active (GiB)": 96.75,
18972
+ "memory/max_allocated (GiB)": 96.75,
18973
+ "step": 1724,
18974
+ "tokens_per_second_per_gpu": 13699.55
18975
+ },
18976
+ {
18977
+ "epoch": 0.043125,
18978
+ "grad_norm": 0.255859375,
18979
+ "learning_rate": 0.002586,
18980
+ "loss": 3.4392,
18981
+ "memory/device_reserved (GiB)": 55.41,
18982
+ "memory/max_active (GiB)": 55.39,
18983
+ "memory/max_allocated (GiB)": 55.39,
18984
+ "step": 1725,
18985
+ "tokens_per_second_per_gpu": 23270.79
18986
+ },
18987
+ {
18988
+ "epoch": 0.04315,
18989
+ "grad_norm": 0.2421875,
18990
+ "learning_rate": 0.0025875000000000004,
18991
+ "loss": 3.4289,
18992
+ "memory/device_reserved (GiB)": 66.8,
18993
+ "memory/max_active (GiB)": 66.63,
18994
+ "memory/max_allocated (GiB)": 66.63,
18995
+ "step": 1726,
18996
+ "tokens_per_second_per_gpu": 20476.82
18997
+ },
18998
+ {
18999
+ "epoch": 0.043175,
19000
+ "grad_norm": 0.244140625,
19001
+ "learning_rate": 0.002589,
19002
+ "loss": 3.4447,
19003
+ "memory/device_reserved (GiB)": 75.99,
19004
+ "memory/max_active (GiB)": 75.82,
19005
+ "memory/max_allocated (GiB)": 75.82,
19006
+ "step": 1727,
19007
+ "tokens_per_second_per_gpu": 17731.33
19008
+ },
19009
+ {
19010
+ "epoch": 0.0432,
19011
+ "grad_norm": 0.24609375,
19012
+ "learning_rate": 0.0025905000000000004,
19013
+ "loss": 3.4028,
19014
+ "memory/device_reserved (GiB)": 127.96,
19015
+ "memory/max_active (GiB)": 127.83,
19016
+ "memory/max_allocated (GiB)": 127.83,
19017
+ "step": 1728,
19018
+ "tokens_per_second_per_gpu": 10811.24
19019
+ },
19020
+ {
19021
+ "epoch": 0.043225,
19022
+ "grad_norm": 0.306640625,
19023
+ "learning_rate": 0.002592,
19024
+ "loss": 3.4164,
19025
+ "memory/device_reserved (GiB)": 97.42,
19026
+ "memory/max_active (GiB)": 97.23,
19027
+ "memory/max_allocated (GiB)": 97.23,
19028
+ "step": 1729,
19029
+ "tokens_per_second_per_gpu": 13945.48
19030
+ },
19031
+ {
19032
+ "epoch": 0.04325,
19033
+ "grad_norm": 0.369140625,
19034
+ "learning_rate": 0.0025935000000000003,
19035
+ "loss": 3.3548,
19036
+ "memory/device_reserved (GiB)": 66.8,
19037
+ "memory/max_active (GiB)": 66.63,
19038
+ "memory/max_allocated (GiB)": 66.63,
19039
+ "step": 1730,
19040
+ "tokens_per_second_per_gpu": 19830.86
19041
+ },
19042
+ {
19043
+ "epoch": 0.043275,
19044
+ "grad_norm": 0.37890625,
19045
+ "learning_rate": 0.002595,
19046
+ "loss": 3.421,
19047
+ "memory/device_reserved (GiB)": 127.96,
19048
+ "memory/max_active (GiB)": 127.83,
19049
+ "memory/max_allocated (GiB)": 127.83,
19050
+ "step": 1731,
19051
+ "tokens_per_second_per_gpu": 10542.1
19052
+ },
19053
+ {
19054
+ "epoch": 0.0433,
19055
+ "grad_norm": 0.38671875,
19056
+ "learning_rate": 0.0025965000000000003,
19057
+ "loss": 3.4064,
19058
+ "memory/device_reserved (GiB)": 56.14,
19059
+ "memory/max_active (GiB)": 55.95,
19060
+ "memory/max_allocated (GiB)": 55.95,
19061
+ "step": 1732,
19062
+ "tokens_per_second_per_gpu": 21749.89
19063
+ },
19064
+ {
19065
+ "epoch": 0.043325,
19066
+ "grad_norm": 0.2255859375,
19067
+ "learning_rate": 0.002598,
19068
+ "loss": 3.4061,
19069
+ "memory/device_reserved (GiB)": 85.15,
19070
+ "memory/max_active (GiB)": 85.11,
19071
+ "memory/max_allocated (GiB)": 85.11,
19072
+ "step": 1733,
19073
+ "tokens_per_second_per_gpu": 15894.06
19074
+ },
19075
+ {
19076
+ "epoch": 0.04335,
19077
+ "grad_norm": 0.30859375,
19078
+ "learning_rate": 0.0025995000000000002,
19079
+ "loss": 3.3912,
19080
+ "memory/device_reserved (GiB)": 66.35,
19081
+ "memory/max_active (GiB)": 66.15,
19082
+ "memory/max_allocated (GiB)": 66.15,
19083
+ "step": 1734,
19084
+ "tokens_per_second_per_gpu": 20235.34
19085
+ },
19086
+ {
19087
+ "epoch": 0.043375,
19088
+ "grad_norm": 0.28125,
19089
+ "learning_rate": 0.002601,
19090
+ "loss": 3.3853,
19091
+ "memory/device_reserved (GiB)": 44.36,
19092
+ "memory/max_active (GiB)": 44.31,
19093
+ "memory/max_allocated (GiB)": 44.31,
19094
+ "step": 1735,
19095
+ "tokens_per_second_per_gpu": 27550.53
19096
+ },
19097
+ {
19098
+ "epoch": 0.0434,
19099
+ "grad_norm": 0.251953125,
19100
+ "learning_rate": 0.0026025,
19101
+ "loss": 3.4352,
19102
+ "memory/device_reserved (GiB)": 117.82,
19103
+ "memory/max_active (GiB)": 117.63,
19104
+ "memory/max_allocated (GiB)": 117.63,
19105
+ "step": 1736,
19106
+ "tokens_per_second_per_gpu": 11236.12
19107
+ },
19108
+ {
19109
+ "epoch": 0.043425,
19110
+ "grad_norm": 0.404296875,
19111
+ "learning_rate": 0.002604,
19112
+ "loss": 3.4176,
19113
+ "memory/device_reserved (GiB)": 127.96,
19114
+ "memory/max_active (GiB)": 127.83,
19115
+ "memory/max_allocated (GiB)": 127.83,
19116
+ "step": 1737,
19117
+ "tokens_per_second_per_gpu": 10840.44
19118
+ },
19119
+ {
19120
+ "epoch": 0.04345,
19121
+ "grad_norm": 0.546875,
19122
+ "learning_rate": 0.0026055,
19123
+ "loss": 3.4038,
19124
+ "memory/device_reserved (GiB)": 76.49,
19125
+ "memory/max_active (GiB)": 76.35,
19126
+ "memory/max_allocated (GiB)": 76.35,
19127
+ "step": 1738,
19128
+ "tokens_per_second_per_gpu": 17339.64
19129
+ },
19130
+ {
19131
+ "epoch": 0.043475,
19132
+ "grad_norm": 0.578125,
19133
+ "learning_rate": 0.002607,
19134
+ "loss": 3.4502,
19135
+ "memory/device_reserved (GiB)": 127.55,
19136
+ "memory/max_active (GiB)": 127.35,
19137
+ "memory/max_allocated (GiB)": 127.35,
19138
+ "step": 1739,
19139
+ "tokens_per_second_per_gpu": 10609.71
19140
+ },
19141
+ {
19142
+ "epoch": 0.0435,
19143
+ "grad_norm": 0.53125,
19144
+ "learning_rate": 0.0026085,
19145
+ "loss": 3.4514,
19146
+ "memory/device_reserved (GiB)": 86.7,
19147
+ "memory/max_active (GiB)": 86.55,
19148
+ "memory/max_allocated (GiB)": 86.55,
19149
+ "step": 1740,
19150
+ "tokens_per_second_per_gpu": 14979.82
19151
+ },
19152
+ {
19153
+ "epoch": 0.043525,
19154
+ "grad_norm": 0.46484375,
19155
+ "learning_rate": 0.00261,
19156
+ "loss": 3.461,
19157
+ "memory/device_reserved (GiB)": 97.42,
19158
+ "memory/max_active (GiB)": 97.23,
19159
+ "memory/max_allocated (GiB)": 97.23,
19160
+ "step": 1741,
19161
+ "tokens_per_second_per_gpu": 13526.38
19162
+ },
19163
+ {
19164
+ "epoch": 0.04355,
19165
+ "grad_norm": 0.44921875,
19166
+ "learning_rate": 0.0026115,
19167
+ "loss": 3.4299,
19168
+ "memory/device_reserved (GiB)": 64.72,
19169
+ "memory/max_active (GiB)": 64.71,
19170
+ "memory/max_allocated (GiB)": 64.71,
19171
+ "step": 1742,
19172
+ "tokens_per_second_per_gpu": 20033.73
19173
+ },
19174
+ {
19175
+ "epoch": 0.043575,
19176
+ "grad_norm": 0.40234375,
19177
+ "learning_rate": 0.002613,
19178
+ "loss": 3.4074,
19179
+ "memory/device_reserved (GiB)": 97.42,
19180
+ "memory/max_active (GiB)": 97.23,
19181
+ "memory/max_allocated (GiB)": 97.23,
19182
+ "step": 1743,
19183
+ "tokens_per_second_per_gpu": 13919.14
19184
+ },
19185
+ {
19186
+ "epoch": 0.0436,
19187
+ "grad_norm": 0.337890625,
19188
+ "learning_rate": 0.0026145,
19189
+ "loss": 3.4226,
19190
+ "memory/device_reserved (GiB)": 127.96,
19191
+ "memory/max_active (GiB)": 127.83,
19192
+ "memory/max_allocated (GiB)": 127.83,
19193
+ "step": 1744,
19194
+ "tokens_per_second_per_gpu": 10914.18
19195
+ },
19196
+ {
19197
+ "epoch": 0.043625,
19198
+ "grad_norm": 0.34375,
19199
+ "learning_rate": 0.002616,
19200
+ "loss": 3.4013,
19201
+ "memory/device_reserved (GiB)": 46.39,
19202
+ "memory/max_active (GiB)": 46.22,
19203
+ "memory/max_allocated (GiB)": 46.22,
19204
+ "step": 1745,
19205
+ "tokens_per_second_per_gpu": 26513.04
19206
+ },
19207
+ {
19208
+ "epoch": 0.04365,
19209
+ "grad_norm": 0.400390625,
19210
+ "learning_rate": 0.0026175,
19211
+ "loss": 3.4318,
19212
+ "memory/device_reserved (GiB)": 107.61,
19213
+ "memory/max_active (GiB)": 107.43,
19214
+ "memory/max_allocated (GiB)": 107.43,
19215
+ "step": 1746,
19216
+ "tokens_per_second_per_gpu": 12843.87
19217
+ },
19218
+ {
19219
+ "epoch": 0.043675,
19220
+ "grad_norm": 0.4140625,
19221
+ "learning_rate": 0.0026190000000000002,
19222
+ "loss": 3.4096,
19223
+ "memory/device_reserved (GiB)": 87.21,
19224
+ "memory/max_active (GiB)": 87.03,
19225
+ "memory/max_allocated (GiB)": 87.03,
19226
+ "step": 1747,
19227
+ "tokens_per_second_per_gpu": 15880.59
19228
+ },
19229
+ {
19230
+ "epoch": 0.0437,
19231
+ "grad_norm": 0.50390625,
19232
+ "learning_rate": 0.0026205000000000004,
19233
+ "loss": 3.4245,
19234
+ "memory/device_reserved (GiB)": 127.55,
19235
+ "memory/max_active (GiB)": 127.35,
19236
+ "memory/max_allocated (GiB)": 127.35,
19237
+ "step": 1748,
19238
+ "tokens_per_second_per_gpu": 10692.67
19239
+ },
19240
+ {
19241
+ "epoch": 0.043725,
19242
+ "grad_norm": 0.474609375,
19243
+ "learning_rate": 0.002622,
19244
+ "loss": 3.4249,
19245
+ "memory/device_reserved (GiB)": 66.8,
19246
+ "memory/max_active (GiB)": 66.63,
19247
+ "memory/max_allocated (GiB)": 66.63,
19248
+ "step": 1749,
19249
+ "tokens_per_second_per_gpu": 19357.34
19250
+ },
19251
+ {
19252
+ "epoch": 0.04375,
19253
+ "grad_norm": 0.423828125,
19254
+ "learning_rate": 0.0026235000000000004,
19255
+ "loss": 3.391,
19256
+ "memory/device_reserved (GiB)": 56.57,
19257
+ "memory/max_active (GiB)": 56.42,
19258
+ "memory/max_allocated (GiB)": 56.42,
19259
+ "step": 1750,
19260
+ "tokens_per_second_per_gpu": 23852.2
19261
+ },
19262
+ {
19263
+ "epoch": 0.043775,
19264
+ "grad_norm": 0.30859375,
19265
+ "learning_rate": 0.002625,
19266
+ "loss": 3.4351,
19267
+ "memory/device_reserved (GiB)": 97.42,
19268
+ "memory/max_active (GiB)": 97.23,
19269
+ "memory/max_allocated (GiB)": 97.23,
19270
+ "step": 1751,
19271
+ "tokens_per_second_per_gpu": 13356.15
19272
+ },
19273
+ {
19274
+ "epoch": 0.0438,
19275
+ "grad_norm": 0.25390625,
19276
+ "learning_rate": 0.0026265,
19277
+ "loss": 3.4141,
19278
+ "memory/device_reserved (GiB)": 97.44,
19279
+ "memory/max_active (GiB)": 97.23,
19280
+ "memory/max_allocated (GiB)": 97.23,
19281
+ "step": 1752,
19282
+ "tokens_per_second_per_gpu": 13330.25
19283
+ },
19284
+ {
19285
+ "epoch": 0.043825,
19286
+ "grad_norm": 0.2314453125,
19287
+ "learning_rate": 0.002628,
19288
+ "loss": 3.3987,
19289
+ "memory/device_reserved (GiB)": 96.42,
19290
+ "memory/max_active (GiB)": 96.23,
19291
+ "memory/max_allocated (GiB)": 96.23,
19292
+ "step": 1753,
19293
+ "tokens_per_second_per_gpu": 14470.45
19294
+ },
19295
+ {
19296
+ "epoch": 0.04385,
19297
+ "grad_norm": 0.30078125,
19298
+ "learning_rate": 0.0026295,
19299
+ "loss": 3.4109,
19300
+ "memory/device_reserved (GiB)": 86.7,
19301
+ "memory/max_active (GiB)": 86.55,
19302
+ "memory/max_allocated (GiB)": 86.55,
19303
+ "step": 1754,
19304
+ "tokens_per_second_per_gpu": 14973.02
19305
+ },
19306
+ {
19307
+ "epoch": 0.043875,
19308
+ "grad_norm": 0.314453125,
19309
+ "learning_rate": 0.002631,
19310
+ "loss": 3.4064,
19311
+ "memory/device_reserved (GiB)": 106.26,
19312
+ "memory/max_active (GiB)": 106.08,
19313
+ "memory/max_allocated (GiB)": 106.08,
19314
+ "step": 1755,
19315
+ "tokens_per_second_per_gpu": 12857.47
19316
+ },
19317
+ {
19318
+ "epoch": 0.0439,
19319
+ "grad_norm": 0.333984375,
19320
+ "learning_rate": 0.0026325,
19321
+ "loss": 3.4352,
19322
+ "memory/device_reserved (GiB)": 87.21,
19323
+ "memory/max_active (GiB)": 87.03,
19324
+ "memory/max_allocated (GiB)": 87.03,
19325
+ "step": 1756,
19326
+ "tokens_per_second_per_gpu": 15689.65
19327
+ },
19328
+ {
19329
+ "epoch": 0.043925,
19330
+ "grad_norm": 0.2333984375,
19331
+ "learning_rate": 0.002634,
19332
+ "loss": 3.4027,
19333
+ "memory/device_reserved (GiB)": 56.57,
19334
+ "memory/max_active (GiB)": 56.42,
19335
+ "memory/max_allocated (GiB)": 56.42,
19336
+ "step": 1757,
19337
+ "tokens_per_second_per_gpu": 21923.57
19338
+ },
19339
+ {
19340
+ "epoch": 0.04395,
19341
+ "grad_norm": 0.197265625,
19342
+ "learning_rate": 0.0026355,
19343
+ "loss": 3.426,
19344
+ "memory/device_reserved (GiB)": 96.42,
19345
+ "memory/max_active (GiB)": 96.22,
19346
+ "memory/max_allocated (GiB)": 96.22,
19347
+ "step": 1758,
19348
+ "tokens_per_second_per_gpu": 13418.51
19349
+ },
19350
+ {
19351
+ "epoch": 0.043975,
19352
+ "grad_norm": 0.2578125,
19353
+ "learning_rate": 0.002637,
19354
+ "loss": 3.3894,
19355
+ "memory/device_reserved (GiB)": 56.57,
19356
+ "memory/max_active (GiB)": 56.42,
19357
+ "memory/max_allocated (GiB)": 56.42,
19358
+ "step": 1759,
19359
+ "tokens_per_second_per_gpu": 24491.74
19360
+ },
19361
+ {
19362
+ "epoch": 0.044,
19363
+ "grad_norm": 0.423828125,
19364
+ "learning_rate": 0.0026385,
19365
+ "loss": 3.4275,
19366
+ "memory/device_reserved (GiB)": 117.82,
19367
+ "memory/max_active (GiB)": 117.63,
19368
+ "memory/max_allocated (GiB)": 117.63,
19369
+ "step": 1760,
19370
+ "tokens_per_second_per_gpu": 11247.5
19371
+ },
19372
+ {
19373
+ "epoch": 0.044025,
19374
+ "grad_norm": 0.61328125,
19375
+ "learning_rate": 0.00264,
19376
+ "loss": 3.4444,
19377
+ "memory/device_reserved (GiB)": 35.22,
19378
+ "memory/max_active (GiB)": 35.02,
19379
+ "memory/max_allocated (GiB)": 35.02,
19380
+ "step": 1761,
19381
+ "tokens_per_second_per_gpu": 33971.64
19382
+ },
19383
+ {
19384
+ "epoch": 0.04405,
19385
+ "grad_norm": 0.55859375,
19386
+ "learning_rate": 0.0026414999999999998,
19387
+ "loss": 3.4196,
19388
+ "memory/device_reserved (GiB)": 87.21,
19389
+ "memory/max_active (GiB)": 87.03,
19390
+ "memory/max_allocated (GiB)": 87.03,
19391
+ "step": 1762,
19392
+ "tokens_per_second_per_gpu": 16207.95
19393
+ },
19394
+ {
19395
+ "epoch": 0.044075,
19396
+ "grad_norm": 0.474609375,
19397
+ "learning_rate": 0.002643,
19398
+ "loss": 3.4154,
19399
+ "memory/device_reserved (GiB)": 66.8,
19400
+ "memory/max_active (GiB)": 66.63,
19401
+ "memory/max_allocated (GiB)": 66.63,
19402
+ "step": 1763,
19403
+ "tokens_per_second_per_gpu": 20758.11
19404
+ },
19405
+ {
19406
+ "epoch": 0.0441,
19407
+ "grad_norm": 0.57421875,
19408
+ "learning_rate": 0.0026444999999999997,
19409
+ "loss": 3.4227,
19410
+ "memory/device_reserved (GiB)": 66.8,
19411
+ "memory/max_active (GiB)": 66.63,
19412
+ "memory/max_allocated (GiB)": 66.63,
19413
+ "step": 1764,
19414
+ "tokens_per_second_per_gpu": 20933.38
19415
+ },
19416
+ {
19417
+ "epoch": 0.044125,
19418
+ "grad_norm": 0.49609375,
19419
+ "learning_rate": 0.002646,
19420
+ "loss": 3.4113,
19421
+ "memory/device_reserved (GiB)": 66.8,
19422
+ "memory/max_active (GiB)": 66.63,
19423
+ "memory/max_allocated (GiB)": 66.63,
19424
+ "step": 1765,
19425
+ "tokens_per_second_per_gpu": 20328.8
19426
+ },
19427
+ {
19428
+ "epoch": 0.04415,
19429
+ "grad_norm": 0.56640625,
19430
+ "learning_rate": 0.0026475,
19431
+ "loss": 3.424,
19432
+ "memory/device_reserved (GiB)": 46.36,
19433
+ "memory/max_active (GiB)": 46.22,
19434
+ "memory/max_allocated (GiB)": 46.22,
19435
+ "step": 1766,
19436
+ "tokens_per_second_per_gpu": 28885.39
19437
+ },
19438
+ {
19439
+ "epoch": 0.044175,
19440
+ "grad_norm": 0.60546875,
19441
+ "learning_rate": 0.002649,
19442
+ "loss": 3.4592,
19443
+ "memory/device_reserved (GiB)": 66.8,
19444
+ "memory/max_active (GiB)": 66.63,
19445
+ "memory/max_allocated (GiB)": 66.63,
19446
+ "step": 1767,
19447
+ "tokens_per_second_per_gpu": 20234.49
19448
+ },
19449
+ {
19450
+ "epoch": 0.0442,
19451
+ "grad_norm": 0.55078125,
19452
+ "learning_rate": 0.0026505,
19453
+ "loss": 3.4609,
19454
+ "memory/device_reserved (GiB)": 87.21,
19455
+ "memory/max_active (GiB)": 87.03,
19456
+ "memory/max_allocated (GiB)": 87.03,
19457
+ "step": 1768,
19458
+ "tokens_per_second_per_gpu": 15912.34
19459
+ },
19460
+ {
19461
+ "epoch": 0.044225,
19462
+ "grad_norm": 0.466796875,
19463
+ "learning_rate": 0.0026520000000000003,
19464
+ "loss": 3.4629,
19465
+ "memory/device_reserved (GiB)": 107.12,
19466
+ "memory/max_active (GiB)": 106.95,
19467
+ "memory/max_allocated (GiB)": 106.95,
19468
+ "step": 1769,
19469
+ "tokens_per_second_per_gpu": 12854.59
19470
+ },
19471
+ {
19472
+ "epoch": 0.04425,
19473
+ "grad_norm": 0.49609375,
19474
+ "learning_rate": 0.0026535,
19475
+ "loss": 3.4224,
19476
+ "memory/device_reserved (GiB)": 56.57,
19477
+ "memory/max_active (GiB)": 56.42,
19478
+ "memory/max_allocated (GiB)": 56.42,
19479
+ "step": 1770,
19480
+ "tokens_per_second_per_gpu": 23600.25
19481
+ },
19482
+ {
19483
+ "epoch": 0.044275,
19484
+ "grad_norm": 0.421875,
19485
+ "learning_rate": 0.0026550000000000002,
19486
+ "loss": 3.4245,
19487
+ "memory/device_reserved (GiB)": 97.42,
19488
+ "memory/max_active (GiB)": 97.23,
19489
+ "memory/max_allocated (GiB)": 97.23,
19490
+ "step": 1771,
19491
+ "tokens_per_second_per_gpu": 14240.32
19492
+ },
19493
+ {
19494
+ "epoch": 0.0443,
19495
+ "grad_norm": 0.2197265625,
19496
+ "learning_rate": 0.0026565,
19497
+ "loss": 3.4746,
19498
+ "memory/device_reserved (GiB)": 107.61,
19499
+ "memory/max_active (GiB)": 107.42,
19500
+ "memory/max_allocated (GiB)": 107.42,
19501
+ "step": 1772,
19502
+ "tokens_per_second_per_gpu": 12415.51
19503
+ },
19504
+ {
19505
+ "epoch": 0.044325,
19506
+ "grad_norm": 0.3046875,
19507
+ "learning_rate": 0.002658,
19508
+ "loss": 3.4487,
19509
+ "memory/device_reserved (GiB)": 117.82,
19510
+ "memory/max_active (GiB)": 117.63,
19511
+ "memory/max_allocated (GiB)": 117.63,
19512
+ "step": 1773,
19513
+ "tokens_per_second_per_gpu": 11728.76
19514
+ },
19515
+ {
19516
+ "epoch": 0.04435,
19517
+ "grad_norm": 0.380859375,
19518
+ "learning_rate": 0.0026595,
19519
+ "loss": 3.428,
19520
+ "memory/device_reserved (GiB)": 97.42,
19521
+ "memory/max_active (GiB)": 97.23,
19522
+ "memory/max_allocated (GiB)": 97.23,
19523
+ "step": 1774,
19524
+ "tokens_per_second_per_gpu": 13483.5
19525
+ },
19526
+ {
19527
+ "epoch": 0.044375,
19528
+ "grad_norm": 0.32421875,
19529
+ "learning_rate": 0.002661,
19530
+ "loss": 3.4394,
19531
+ "memory/device_reserved (GiB)": 66.8,
19532
+ "memory/max_active (GiB)": 66.63,
19533
+ "memory/max_allocated (GiB)": 66.63,
19534
+ "step": 1775,
19535
+ "tokens_per_second_per_gpu": 18870.34
19536
+ },
19537
+ {
19538
+ "epoch": 0.0444,
19539
+ "grad_norm": 0.255859375,
19540
+ "learning_rate": 0.0026625,
19541
+ "loss": 3.4378,
19542
+ "memory/device_reserved (GiB)": 97.42,
19543
+ "memory/max_active (GiB)": 97.23,
19544
+ "memory/max_allocated (GiB)": 97.23,
19545
+ "step": 1776,
19546
+ "tokens_per_second_per_gpu": 13952.93
19547
+ },
19548
+ {
19549
+ "epoch": 0.044425,
19550
+ "grad_norm": 0.2275390625,
19551
+ "learning_rate": 0.002664,
19552
+ "loss": 3.3772,
19553
+ "memory/device_reserved (GiB)": 46.36,
19554
+ "memory/max_active (GiB)": 46.22,
19555
+ "memory/max_allocated (GiB)": 46.22,
19556
+ "step": 1777,
19557
+ "tokens_per_second_per_gpu": 27786.47
19558
+ },
19559
+ {
19560
+ "epoch": 0.04445,
19561
+ "grad_norm": 0.1669921875,
19562
+ "learning_rate": 0.0026655,
19563
+ "loss": 3.4545,
19564
+ "memory/device_reserved (GiB)": 97.42,
19565
+ "memory/max_active (GiB)": 97.22,
19566
+ "memory/max_allocated (GiB)": 97.22,
19567
+ "step": 1778,
19568
+ "tokens_per_second_per_gpu": 13063.98
19569
+ },
19570
+ {
19571
+ "epoch": 0.044475,
19572
+ "grad_norm": 0.271484375,
19573
+ "learning_rate": 0.002667,
19574
+ "loss": 3.4308,
19575
+ "memory/device_reserved (GiB)": 87.21,
19576
+ "memory/max_active (GiB)": 87.03,
19577
+ "memory/max_allocated (GiB)": 87.03,
19578
+ "step": 1779,
19579
+ "tokens_per_second_per_gpu": 15197.1
19580
+ },
19581
+ {
19582
+ "epoch": 0.0445,
19583
+ "grad_norm": 0.333984375,
19584
+ "learning_rate": 0.0026685,
19585
+ "loss": 3.3732,
19586
+ "memory/device_reserved (GiB)": 97.42,
19587
+ "memory/max_active (GiB)": 97.23,
19588
+ "memory/max_allocated (GiB)": 97.23,
19589
+ "step": 1780,
19590
+ "tokens_per_second_per_gpu": 13364.89
19591
+ },
19592
+ {
19593
+ "epoch": 0.044525,
19594
+ "grad_norm": 0.482421875,
19595
+ "learning_rate": 0.00267,
19596
+ "loss": 3.3969,
19597
+ "memory/device_reserved (GiB)": 127.96,
19598
+ "memory/max_active (GiB)": 127.83,
19599
+ "memory/max_allocated (GiB)": 127.83,
19600
+ "step": 1781,
19601
+ "tokens_per_second_per_gpu": 10848.07
19602
+ },
19603
+ {
19604
+ "epoch": 0.04455,
19605
+ "grad_norm": 0.46484375,
19606
+ "learning_rate": 0.0026715,
19607
+ "loss": 3.4136,
19608
+ "memory/device_reserved (GiB)": 46.36,
19609
+ "memory/max_active (GiB)": 46.22,
19610
+ "memory/max_allocated (GiB)": 46.22,
19611
+ "step": 1782,
19612
+ "tokens_per_second_per_gpu": 26226.1
19613
+ },
19614
+ {
19615
+ "epoch": 0.044575,
19616
+ "grad_norm": 0.40234375,
19617
+ "learning_rate": 0.002673,
19618
+ "loss": 3.4063,
19619
+ "memory/device_reserved (GiB)": 77.01,
19620
+ "memory/max_active (GiB)": 76.83,
19621
+ "memory/max_allocated (GiB)": 76.83,
19622
+ "step": 1783,
19623
+ "tokens_per_second_per_gpu": 16623.15
19624
+ },
19625
+ {
19626
+ "epoch": 0.0446,
19627
+ "grad_norm": 0.4453125,
19628
+ "learning_rate": 0.0026745,
19629
+ "loss": 3.4083,
19630
+ "memory/device_reserved (GiB)": 77.01,
19631
+ "memory/max_active (GiB)": 76.83,
19632
+ "memory/max_allocated (GiB)": 76.83,
19633
+ "step": 1784,
19634
+ "tokens_per_second_per_gpu": 17625.86
19635
+ },
19636
+ {
19637
+ "epoch": 0.044625,
19638
+ "grad_norm": 0.51953125,
19639
+ "learning_rate": 0.002676,
19640
+ "loss": 3.4065,
19641
+ "memory/device_reserved (GiB)": 76.49,
19642
+ "memory/max_active (GiB)": 76.35,
19643
+ "memory/max_allocated (GiB)": 76.35,
19644
+ "step": 1785,
19645
+ "tokens_per_second_per_gpu": 17692.48
19646
+ },
19647
+ {
19648
+ "epoch": 0.04465,
19649
+ "grad_norm": 0.45703125,
19650
+ "learning_rate": 0.0026774999999999998,
19651
+ "loss": 3.3926,
19652
+ "memory/device_reserved (GiB)": 77.01,
19653
+ "memory/max_active (GiB)": 76.83,
19654
+ "memory/max_allocated (GiB)": 76.83,
19655
+ "step": 1786,
19656
+ "tokens_per_second_per_gpu": 17734.6
19657
+ },
19658
+ {
19659
+ "epoch": 0.044675,
19660
+ "grad_norm": 0.4140625,
19661
+ "learning_rate": 0.002679,
19662
+ "loss": 3.4451,
19663
+ "memory/device_reserved (GiB)": 107.61,
19664
+ "memory/max_active (GiB)": 107.43,
19665
+ "memory/max_allocated (GiB)": 107.43,
19666
+ "step": 1787,
19667
+ "tokens_per_second_per_gpu": 12253.75
19668
+ },
19669
+ {
19670
+ "epoch": 0.0447,
19671
+ "grad_norm": 0.37109375,
19672
+ "learning_rate": 0.0026804999999999997,
19673
+ "loss": 3.4092,
19674
+ "memory/device_reserved (GiB)": 56.57,
19675
+ "memory/max_active (GiB)": 56.42,
19676
+ "memory/max_allocated (GiB)": 56.42,
19677
+ "step": 1788,
19678
+ "tokens_per_second_per_gpu": 22035.77
19679
+ },
19680
+ {
19681
+ "epoch": 0.044725,
19682
+ "grad_norm": 0.365234375,
19683
+ "learning_rate": 0.002682,
19684
+ "loss": 3.4072,
19685
+ "memory/device_reserved (GiB)": 46.36,
19686
+ "memory/max_active (GiB)": 46.22,
19687
+ "memory/max_allocated (GiB)": 46.22,
19688
+ "step": 1789,
19689
+ "tokens_per_second_per_gpu": 28799.13
19690
+ },
19691
+ {
19692
+ "epoch": 0.04475,
19693
+ "grad_norm": 0.486328125,
19694
+ "learning_rate": 0.0026835,
19695
+ "loss": 3.4273,
19696
+ "memory/device_reserved (GiB)": 107.12,
19697
+ "memory/max_active (GiB)": 106.95,
19698
+ "memory/max_allocated (GiB)": 106.95,
19699
+ "step": 1790,
19700
+ "tokens_per_second_per_gpu": 13507.67
19701
+ },
19702
+ {
19703
+ "epoch": 0.044775,
19704
+ "grad_norm": 0.41796875,
19705
+ "learning_rate": 0.0026850000000000003,
19706
+ "loss": 3.4105,
19707
+ "memory/device_reserved (GiB)": 97.42,
19708
+ "memory/max_active (GiB)": 97.23,
19709
+ "memory/max_allocated (GiB)": 97.23,
19710
+ "step": 1791,
19711
+ "tokens_per_second_per_gpu": 14054.94
19712
+ },
19713
+ {
19714
+ "epoch": 0.0448,
19715
+ "grad_norm": 0.39453125,
19716
+ "learning_rate": 0.0026865,
19717
+ "loss": 3.4125,
19718
+ "memory/device_reserved (GiB)": 87.21,
19719
+ "memory/max_active (GiB)": 87.03,
19720
+ "memory/max_allocated (GiB)": 87.03,
19721
+ "step": 1792,
19722
+ "tokens_per_second_per_gpu": 15951.81
19723
+ },
19724
+ {
19725
+ "epoch": 0.044825,
19726
+ "grad_norm": 0.455078125,
19727
+ "learning_rate": 0.0026880000000000003,
19728
+ "loss": 3.4148,
19729
+ "memory/device_reserved (GiB)": 117.34,
19730
+ "memory/max_active (GiB)": 117.15,
19731
+ "memory/max_allocated (GiB)": 117.15,
19732
+ "step": 1793,
19733
+ "tokens_per_second_per_gpu": 11286.11
19734
+ },
19735
+ {
19736
+ "epoch": 0.04485,
19737
+ "grad_norm": 0.47265625,
19738
+ "learning_rate": 0.0026895,
19739
+ "loss": 3.4305,
19740
+ "memory/device_reserved (GiB)": 86.7,
19741
+ "memory/max_active (GiB)": 86.55,
19742
+ "memory/max_allocated (GiB)": 86.55,
19743
+ "step": 1794,
19744
+ "tokens_per_second_per_gpu": 15414.34
19745
+ },
19746
+ {
19747
+ "epoch": 0.044875,
19748
+ "grad_norm": 0.49609375,
19749
+ "learning_rate": 0.0026910000000000002,
19750
+ "loss": 3.4523,
19751
+ "memory/device_reserved (GiB)": 56.57,
19752
+ "memory/max_active (GiB)": 56.42,
19753
+ "memory/max_allocated (GiB)": 56.42,
19754
+ "step": 1795,
19755
+ "tokens_per_second_per_gpu": 23738.61
19756
+ },
19757
+ {
19758
+ "epoch": 0.0449,
19759
+ "grad_norm": 0.416015625,
19760
+ "learning_rate": 0.0026925,
19761
+ "loss": 3.4288,
19762
+ "memory/device_reserved (GiB)": 45.93,
19763
+ "memory/max_active (GiB)": 45.75,
19764
+ "memory/max_allocated (GiB)": 45.75,
19765
+ "step": 1796,
19766
+ "tokens_per_second_per_gpu": 29115.65
19767
+ },
19768
+ {
19769
+ "epoch": 0.044925,
19770
+ "grad_norm": 0.55859375,
19771
+ "learning_rate": 0.002694,
19772
+ "loss": 3.4467,
19773
+ "memory/device_reserved (GiB)": 76.49,
19774
+ "memory/max_active (GiB)": 76.35,
19775
+ "memory/max_allocated (GiB)": 76.35,
19776
+ "step": 1797,
19777
+ "tokens_per_second_per_gpu": 18356.21
19778
+ },
19779
+ {
19780
+ "epoch": 0.04495,
19781
+ "grad_norm": 0.42578125,
19782
+ "learning_rate": 0.0026955,
19783
+ "loss": 3.4082,
19784
+ "memory/device_reserved (GiB)": 86.21,
19785
+ "memory/max_active (GiB)": 86.02,
19786
+ "memory/max_allocated (GiB)": 86.02,
19787
+ "step": 1798,
19788
+ "tokens_per_second_per_gpu": 15680.13
19789
+ },
19790
+ {
19791
+ "epoch": 0.044975,
19792
+ "grad_norm": 0.376953125,
19793
+ "learning_rate": 0.002697,
19794
+ "loss": 3.3861,
19795
+ "memory/device_reserved (GiB)": 77.01,
19796
+ "memory/max_active (GiB)": 76.83,
19797
+ "memory/max_allocated (GiB)": 76.83,
19798
+ "step": 1799,
19799
+ "tokens_per_second_per_gpu": 17597.13
19800
+ },
19801
+ {
19802
+ "epoch": 0.045,
19803
+ "grad_norm": 0.4140625,
19804
+ "learning_rate": 0.0026985,
19805
+ "loss": 3.4615,
19806
+ "memory/device_reserved (GiB)": 127.96,
19807
+ "memory/max_active (GiB)": 127.83,
19808
+ "memory/max_allocated (GiB)": 127.83,
19809
+ "step": 1800,
19810
+ "tokens_per_second_per_gpu": 10181.44
19811
  }
19812
  ],
19813
  "logging_steps": 1,
 
19827
  "attributes": {}
19828
  }
19829
  },
19830
+ "total_flos": 2.1172492665891062e+18,
19831
  "train_batch_size": 1,
19832
  "trial_name": null,
19833
  "trial_params": null