Upload folder using huggingface_hub
Browse files
- README.md +99 -0
- base/meta_000002.json +39 -0
- base/meta_008000.json +39 -0
- base/model_000002.pt +3 -0
- base/model_008000.pt +3 -0
- base/optim_000002.pt +3 -0
- base/optim_008000.pt +3 -0
- data/identity_conversations.jsonl +0 -0
- data/words_alpha.txt +0 -0
- mid_train/meta_000813.json +30 -0
- mid_train/model_000813.pt +3 -0
- mid_train/optim_000813.pt +3 -0
- report/chat-sft.md +26 -0
- report/header.md +36 -0
- report/midtraining.md +23 -0
- report/report.md +99 -0
- sft/meta_000700.json +14 -0
- sft/model_000700.pt +3 -0
- sft/optim_000700.pt +3 -0
- tokenizer/token_bytes.pt +3 -0
- tokenizer/tokenizer.pkl +3 -0
README.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# nanochat training report
|
| 2 |
+
|
| 3 |
+
Generated: 2025-11-02 23:26:37
|
| 4 |
+
|
| 5 |
+
## Environment
|
| 6 |
+
|
| 7 |
+
### Git Information
|
| 8 |
+
- Branch: main
|
| 9 |
+
- Commit: 45f875f (dirty)
|
| 10 |
+
- Message: update with checkpoint save(test)
|
| 11 |
+
|
| 12 |
+
### Hardware
|
| 13 |
+
- Platform: Linux
|
| 14 |
+
- CPUs: 128 cores (256 logical)
|
| 15 |
+
- Memory: 160.0 GB
|
| 16 |
+
- GPUs: 1x NVIDIA A100-SXM4-40GB
|
| 17 |
+
- GPU Memory: 39.5 GB total
|
| 18 |
+
- CUDA Version: 12.8
|
| 19 |
+
- Hourly Rate: $1.79/hour
|
| 20 |
+
|
| 21 |
+
### Software
|
| 22 |
+
- Python: 3.10.19
|
| 23 |
+
- PyTorch: 2.8.0+cu128
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
### Bloat
|
| 27 |
+
- Characters: 419,982
|
| 28 |
+
- Lines: 10,409
|
| 29 |
+
- Files: 52
|
| 30 |
+
- Tokens (approx): 104,995
|
| 31 |
+
- Dependencies (uv.lock lines): 2,220
|
| 32 |
+
|
| 33 |
+
Run started: 2025-11-02 23:26:38
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Midtraining
|
| 38 |
+
timestamp: 2025-11-03 12:34:20
|
| 39 |
+
|
| 40 |
+
- run: dummy
|
| 41 |
+
- device_type:
|
| 42 |
+
- dtype: bfloat16
|
| 43 |
+
- num_iterations: -1
|
| 44 |
+
- max_seq_len: 2048
|
| 45 |
+
- device_batch_size: 1
|
| 46 |
+
- unembedding_lr: 0.0040
|
| 47 |
+
- embedding_lr: 0.2000
|
| 48 |
+
- matrix_lr: 0.0200
|
| 49 |
+
- init_lr_frac: 1.0000
|
| 50 |
+
- weight_decay: 0.0000
|
| 51 |
+
- eval_every: 150
|
| 52 |
+
- eval_tokens: 10,485,760
|
| 53 |
+
- total_batch_size: 524,288
|
| 54 |
+
- checkpoint_every: 100
|
| 55 |
+
- dry_run: 0
|
| 56 |
+
- Number of iterations: 813
|
| 57 |
+
- DDP world size: 1
|
| 58 |
+
- Minimum validation bpb: 0.3972
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
## Chat SFT
|
| 62 |
+
timestamp: 2025-11-04 02:59:21
|
| 63 |
+
|
| 64 |
+
- run: dummy
|
| 65 |
+
- source: mid
|
| 66 |
+
- device_type:
|
| 67 |
+
- dtype: bfloat16
|
| 68 |
+
- device_batch_size: 1
|
| 69 |
+
- num_epochs: 1
|
| 70 |
+
- num_iterations: -1
|
| 71 |
+
- target_examples_per_step: 32
|
| 72 |
+
- unembedding_lr: 0.0040
|
| 73 |
+
- embedding_lr: 0.2000
|
| 74 |
+
- matrix_lr: 0.0200
|
| 75 |
+
- weight_decay: 0.0000
|
| 76 |
+
- init_lr_frac: 0.0200
|
| 77 |
+
- eval_every: 100
|
| 78 |
+
- eval_steps: 100
|
| 79 |
+
- eval_metrics_every: 200
|
| 80 |
+
- eval_metrics_max_problems: 1024
|
| 81 |
+
- checkpoint_every: 50
|
| 82 |
+
- Training rows: 22,439
|
| 83 |
+
- Number of iterations: 701
|
| 84 |
+
- Training loss: 1.3129
|
| 85 |
+
- Validation loss: nan
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
## Summary
|
| 89 |
+
|
| 90 |
+
- Characters: 419,982
|
| 91 |
+
- Lines: 10,409
|
| 92 |
+
- Files: 52
|
| 93 |
+
- Tokens (approx): 104,995
|
| 94 |
+
- Dependencies (uv.lock lines): 2,220
|
| 95 |
+
|
| 96 |
+
| Metric | BASE | MID | SFT | RL |
|
| 97 |
+
|-----------------|----------|----------|----------|----------|
|
| 98 |
+
|
| 99 |
+
Total wall clock time: 27h32m
|
base/meta_000002.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 2,
|
| 3 |
+
"val_bpb": 2.6584760219879953,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"sequence_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 32,
|
| 8 |
+
"n_head": 16,
|
| 9 |
+
"n_kv_head": 16,
|
| 10 |
+
"n_embd": 2048
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "dummy",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 32,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 2,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 1,
|
| 21 |
+
"total_batch_size": 32768,
|
| 22 |
+
"embedding_lr": 0.2,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.0,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"eval_every": 999999,
|
| 31 |
+
"eval_tokens": 2048,
|
| 32 |
+
"core_metric_every": 999999,
|
| 33 |
+
"core_metric_max_per_task": 500,
|
| 34 |
+
"sample_every": 999999,
|
| 35 |
+
"model_tag": ""
|
| 36 |
+
},
|
| 37 |
+
"device_batch_size": 1,
|
| 38 |
+
"max_seq_len": 2048
|
| 39 |
+
}
|
base/meta_008000.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 8000,
|
| 3 |
+
"val_bpb": 0.7965255783929484,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"sequence_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 32,
|
| 8 |
+
"n_head": 16,
|
| 9 |
+
"n_kv_head": 16,
|
| 10 |
+
"n_embd": 2048
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "dummy",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"depth": 32,
|
| 16 |
+
"max_seq_len": 2048,
|
| 17 |
+
"num_iterations": 8000,
|
| 18 |
+
"target_flops": -1.0,
|
| 19 |
+
"target_param_data_ratio": 20,
|
| 20 |
+
"device_batch_size": 1,
|
| 21 |
+
"total_batch_size": 262144,
|
| 22 |
+
"embedding_lr": 0.2,
|
| 23 |
+
"unembedding_lr": 0.004,
|
| 24 |
+
"weight_decay": 0.0,
|
| 25 |
+
"matrix_lr": 0.02,
|
| 26 |
+
"grad_clip": 1.0,
|
| 27 |
+
"warmup_ratio": 0.0,
|
| 28 |
+
"warmdown_ratio": 0.2,
|
| 29 |
+
"final_lr_frac": 0.0,
|
| 30 |
+
"eval_every": 250,
|
| 31 |
+
"eval_tokens": 4096,
|
| 32 |
+
"core_metric_every": 250,
|
| 33 |
+
"core_metric_max_per_task": 12,
|
| 34 |
+
"sample_every": 250,
|
| 35 |
+
"model_tag": ""
|
| 36 |
+
},
|
| 37 |
+
"device_batch_size": 1,
|
| 38 |
+
"max_seq_len": 2048
|
| 39 |
+
}
|
base/model_000002.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:938a4e3a951b3fb3d315b2b129c5f2c8d45f76f52a9ce760ad1366e3bdcb1dda
|
| 3 |
+
size 7247837167
|
base/model_008000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd8aa4ec0454f1bdecc63bc6b67a2bc193b872950fee2639357b2120f9d37d79
|
| 3 |
+
size 7247837167
|
base/optim_000002.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:486c914c8ba0746661f298eac6284023c4d21fc719fca590a272928699de7680
|
| 3 |
+
size 8204121436
|
base/optim_008000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96ec9cfbf26b54b70b9893041d6379bdc30a58bc09949801de9567e883eabba4
|
| 3 |
+
size 8204121436
|
data/identity_conversations.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/words_alpha.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
mid_train/meta_000813.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 813,
|
| 3 |
+
"val_bpb": 0.39722378266856306,
|
| 4 |
+
"model_config": {
|
| 5 |
+
"sequence_len": 2048,
|
| 6 |
+
"vocab_size": 65536,
|
| 7 |
+
"n_layer": 32,
|
| 8 |
+
"n_head": 16,
|
| 9 |
+
"n_kv_head": 16,
|
| 10 |
+
"n_embd": 2048
|
| 11 |
+
},
|
| 12 |
+
"user_config": {
|
| 13 |
+
"run": "dummy",
|
| 14 |
+
"device_type": "",
|
| 15 |
+
"dtype": "bfloat16",
|
| 16 |
+
"num_iterations": -1,
|
| 17 |
+
"max_seq_len": 2048,
|
| 18 |
+
"device_batch_size": 1,
|
| 19 |
+
"unembedding_lr": 0.004,
|
| 20 |
+
"embedding_lr": 0.2,
|
| 21 |
+
"matrix_lr": 0.02,
|
| 22 |
+
"init_lr_frac": 1.0,
|
| 23 |
+
"weight_decay": 0.0,
|
| 24 |
+
"eval_every": 150,
|
| 25 |
+
"eval_tokens": 10485760,
|
| 26 |
+
"total_batch_size": 524288,
|
| 27 |
+
"checkpoint_every": 100,
|
| 28 |
+
"dry_run": 0
|
| 29 |
+
}
|
| 30 |
+
}
|
mid_train/model_000813.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18e475a962f44163d56a40818d98db57407ffbc38f8224a0ad1e1b9e80efcb38
|
| 3 |
+
size 7247837167
|
mid_train/optim_000813.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:828fa45b4318459d7202323953559362d4eefebf519728d2f60b4ead7e5426c2
|
| 3 |
+
size 8204121436
|
report/chat-sft.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Chat SFT
|
| 2 |
+
timestamp: 2025-11-04 02:59:21
|
| 3 |
+
|
| 4 |
+
- run: dummy
|
| 5 |
+
- source: mid
|
| 6 |
+
- device_type:
|
| 7 |
+
- dtype: bfloat16
|
| 8 |
+
- device_batch_size: 1
|
| 9 |
+
- num_epochs: 1
|
| 10 |
+
- num_iterations: -1
|
| 11 |
+
- target_examples_per_step: 32
|
| 12 |
+
- unembedding_lr: 0.0040
|
| 13 |
+
- embedding_lr: 0.2000
|
| 14 |
+
- matrix_lr: 0.0200
|
| 15 |
+
- weight_decay: 0.0000
|
| 16 |
+
- init_lr_frac: 0.0200
|
| 17 |
+
- eval_every: 100
|
| 18 |
+
- eval_steps: 100
|
| 19 |
+
- eval_metrics_every: 200
|
| 20 |
+
- eval_metrics_max_problems: 1024
|
| 21 |
+
- checkpoint_every: 50
|
| 22 |
+
- Training rows: 22,439
|
| 23 |
+
- Number of iterations: 701
|
| 24 |
+
- Training loss: 1.3129
|
| 25 |
+
- Validation loss: nan
|
| 26 |
+
|
report/header.md
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# nanochat training report
|
| 2 |
+
|
| 3 |
+
Generated: 2025-11-02 23:26:37
|
| 4 |
+
|
| 5 |
+
## Environment
|
| 6 |
+
|
| 7 |
+
### Git Information
|
| 8 |
+
- Branch: main
|
| 9 |
+
- Commit: 45f875f (dirty)
|
| 10 |
+
- Message: update with checkpoint save(test)
|
| 11 |
+
|
| 12 |
+
### Hardware
|
| 13 |
+
- Platform: Linux
|
| 14 |
+
- CPUs: 128 cores (256 logical)
|
| 15 |
+
- Memory: 160.0 GB
|
| 16 |
+
- GPUs: 1x NVIDIA A100-SXM4-40GB
|
| 17 |
+
- GPU Memory: 39.5 GB total
|
| 18 |
+
- CUDA Version: 12.8
|
| 19 |
+
- Hourly Rate: $1.79/hour
|
| 20 |
+
|
| 21 |
+
### Software
|
| 22 |
+
- Python: 3.10.19
|
| 23 |
+
- PyTorch: 2.8.0+cu128
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
### Bloat
|
| 27 |
+
- Characters: 419,982
|
| 28 |
+
- Lines: 10,409
|
| 29 |
+
- Files: 52
|
| 30 |
+
- Tokens (approx): 104,995
|
| 31 |
+
- Dependencies (uv.lock lines): 2,220
|
| 32 |
+
|
| 33 |
+
Run started: 2025-11-02 23:26:38
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
report/midtraining.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Midtraining
|
| 2 |
+
timestamp: 2025-11-03 12:34:20
|
| 3 |
+
|
| 4 |
+
- run: dummy
|
| 5 |
+
- device_type:
|
| 6 |
+
- dtype: bfloat16
|
| 7 |
+
- num_iterations: -1
|
| 8 |
+
- max_seq_len: 2048
|
| 9 |
+
- device_batch_size: 1
|
| 10 |
+
- unembedding_lr: 0.0040
|
| 11 |
+
- embedding_lr: 0.2000
|
| 12 |
+
- matrix_lr: 0.0200
|
| 13 |
+
- init_lr_frac: 1.0000
|
| 14 |
+
- weight_decay: 0.0000
|
| 15 |
+
- eval_every: 150
|
| 16 |
+
- eval_tokens: 10,485,760
|
| 17 |
+
- total_batch_size: 524,288
|
| 18 |
+
- checkpoint_every: 100
|
| 19 |
+
- dry_run: 0
|
| 20 |
+
- Number of iterations: 813
|
| 21 |
+
- DDP world size: 1
|
| 22 |
+
- Minimum validation bpb: 0.3972
|
| 23 |
+
|
report/report.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# nanochat training report
|
| 2 |
+
|
| 3 |
+
Generated: 2025-11-02 23:26:37
|
| 4 |
+
|
| 5 |
+
## Environment
|
| 6 |
+
|
| 7 |
+
### Git Information
|
| 8 |
+
- Branch: main
|
| 9 |
+
- Commit: 45f875f (dirty)
|
| 10 |
+
- Message: update with checkpoint save(test)
|
| 11 |
+
|
| 12 |
+
### Hardware
|
| 13 |
+
- Platform: Linux
|
| 14 |
+
- CPUs: 128 cores (256 logical)
|
| 15 |
+
- Memory: 160.0 GB
|
| 16 |
+
- GPUs: 1x NVIDIA A100-SXM4-40GB
|
| 17 |
+
- GPU Memory: 39.5 GB total
|
| 18 |
+
- CUDA Version: 12.8
|
| 19 |
+
- Hourly Rate: $1.79/hour
|
| 20 |
+
|
| 21 |
+
### Software
|
| 22 |
+
- Python: 3.10.19
|
| 23 |
+
- PyTorch: 2.8.0+cu128
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
### Bloat
|
| 27 |
+
- Characters: 419,982
|
| 28 |
+
- Lines: 10,409
|
| 29 |
+
- Files: 52
|
| 30 |
+
- Tokens (approx): 104,995
|
| 31 |
+
- Dependencies (uv.lock lines): 2,220
|
| 32 |
+
|
| 33 |
+
Run started: 2025-11-02 23:26:38
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Midtraining
|
| 38 |
+
timestamp: 2025-11-03 12:34:20
|
| 39 |
+
|
| 40 |
+
- run: dummy
|
| 41 |
+
- device_type:
|
| 42 |
+
- dtype: bfloat16
|
| 43 |
+
- num_iterations: -1
|
| 44 |
+
- max_seq_len: 2048
|
| 45 |
+
- device_batch_size: 1
|
| 46 |
+
- unembedding_lr: 0.0040
|
| 47 |
+
- embedding_lr: 0.2000
|
| 48 |
+
- matrix_lr: 0.0200
|
| 49 |
+
- init_lr_frac: 1.0000
|
| 50 |
+
- weight_decay: 0.0000
|
| 51 |
+
- eval_every: 150
|
| 52 |
+
- eval_tokens: 10,485,760
|
| 53 |
+
- total_batch_size: 524,288
|
| 54 |
+
- checkpoint_every: 100
|
| 55 |
+
- dry_run: 0
|
| 56 |
+
- Number of iterations: 813
|
| 57 |
+
- DDP world size: 1
|
| 58 |
+
- Minimum validation bpb: 0.3972
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
## Chat SFT
|
| 62 |
+
timestamp: 2025-11-04 02:59:21
|
| 63 |
+
|
| 64 |
+
- run: dummy
|
| 65 |
+
- source: mid
|
| 66 |
+
- device_type:
|
| 67 |
+
- dtype: bfloat16
|
| 68 |
+
- device_batch_size: 1
|
| 69 |
+
- num_epochs: 1
|
| 70 |
+
- num_iterations: -1
|
| 71 |
+
- target_examples_per_step: 32
|
| 72 |
+
- unembedding_lr: 0.0040
|
| 73 |
+
- embedding_lr: 0.2000
|
| 74 |
+
- matrix_lr: 0.0200
|
| 75 |
+
- weight_decay: 0.0000
|
| 76 |
+
- init_lr_frac: 0.0200
|
| 77 |
+
- eval_every: 100
|
| 78 |
+
- eval_steps: 100
|
| 79 |
+
- eval_metrics_every: 200
|
| 80 |
+
- eval_metrics_max_problems: 1024
|
| 81 |
+
- checkpoint_every: 50
|
| 82 |
+
- Training rows: 22,439
|
| 83 |
+
- Number of iterations: 701
|
| 84 |
+
- Training loss: 1.3129
|
| 85 |
+
- Validation loss: nan
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
## Summary
|
| 89 |
+
|
| 90 |
+
- Characters: 419,982
|
| 91 |
+
- Lines: 10,409
|
| 92 |
+
- Files: 52
|
| 93 |
+
- Tokens (approx): 104,995
|
| 94 |
+
- Dependencies (uv.lock lines): 2,220
|
| 95 |
+
|
| 96 |
+
| Metric | BASE | MID | SFT | RL |
|
| 97 |
+
|-----------------|----------|----------|----------|----------|
|
| 98 |
+
|
| 99 |
+
Total wall clock time: 27h32m
|
sft/meta_000700.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 700,
|
| 3 |
+
"val_loss": NaN,
|
| 4 |
+
"mmlu_acc": 0.34375,
|
| 5 |
+
"arc_easy_acc": 0.4091796875,
|
| 6 |
+
"model_config": {
|
| 7 |
+
"sequence_len": 2048,
|
| 8 |
+
"vocab_size": 65536,
|
| 9 |
+
"n_layer": 32,
|
| 10 |
+
"n_head": 16,
|
| 11 |
+
"n_kv_head": 16,
|
| 12 |
+
"n_embd": 2048
|
| 13 |
+
}
|
| 14 |
+
}
|
sft/model_000700.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e927a960386a949b702669bcdb66bf7398b0ca9852617a3a3ca8ae431c70a18
|
| 3 |
+
size 7247837167
|
sft/optim_000700.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4360b4112f587ab59d637843634c8f3335c6fa3ae1c94e6e9b55fd0470bbabed
|
| 3 |
+
size 8204121436
|
tokenizer/token_bytes.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae39c27aae519d14071efc95f9a558ba0b7ede47e7d83ad4f198422b44c5f70e
|
| 3 |
+
size 263721
|
tokenizer/tokenizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c060565a46fe83b49d99005acba796f2a630daa7970eb49f7513b89f9fb40e0
|
| 3 |
+
size 846208
|