Upload folder using huggingface_hub
Browse files- README.md +2 -2
- checkpoint-128/model-00001-of-00004.safetensors +1 -1
- checkpoint-128/model-00002-of-00004.safetensors +1 -1
- checkpoint-128/model-00003-of-00004.safetensors +1 -1
- checkpoint-128/model-00004-of-00004.safetensors +1 -1
- checkpoint-128/trainer_state.json +37 -37
- checkpoint-128/training_args.bin +1 -1
- checkpoint-32/model-00001-of-00004.safetensors +1 -1
- checkpoint-32/model-00002-of-00004.safetensors +1 -1
- checkpoint-32/model-00003-of-00004.safetensors +1 -1
- checkpoint-32/model-00004-of-00004.safetensors +1 -1
- checkpoint-32/trainer_state.json +10 -10
- checkpoint-32/training_args.bin +1 -1
- checkpoint-64/model-00001-of-00004.safetensors +1 -1
- checkpoint-64/model-00002-of-00004.safetensors +1 -1
- checkpoint-64/model-00003-of-00004.safetensors +1 -1
- checkpoint-64/model-00004-of-00004.safetensors +1 -1
- checkpoint-64/trainer_state.json +19 -19
- checkpoint-64/training_args.bin +1 -1
- checkpoint-96/model-00001-of-00004.safetensors +1 -1
- checkpoint-96/model-00002-of-00004.safetensors +1 -1
- checkpoint-96/model-00003-of-00004.safetensors +1 -1
- checkpoint-96/model-00004-of-00004.safetensors +1 -1
- checkpoint-96/trainer_state.json +28 -28
- checkpoint-96/training_args.bin +1 -1
README.md
CHANGED
|
@@ -4,8 +4,8 @@ library_name: transformers
|
|
| 4 |
model_name: d1_train_1024_no_reasoning_llama3_8B
|
| 5 |
tags:
|
| 6 |
- generated_from_trainer
|
| 7 |
-
- trl
|
| 8 |
- sft
|
|
|
|
| 9 |
licence: license
|
| 10 |
---
|
| 11 |
|
|
@@ -27,7 +27,7 @@ print(output["generated_text"])
|
|
| 27 |
|
| 28 |
## Training procedure
|
| 29 |
|
| 30 |
-
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenwu/huggingface/runs/
|
| 31 |
|
| 32 |
|
| 33 |
This model was trained with SFT.
|
|
|
|
| 4 |
model_name: d1_train_1024_no_reasoning_llama3_8B
|
| 5 |
tags:
|
| 6 |
- generated_from_trainer
|
|
|
|
| 7 |
- sft
|
| 8 |
+
- trl
|
| 9 |
licence: license
|
| 10 |
---
|
| 11 |
|
|
|
|
| 27 |
|
| 28 |
## Training procedure
|
| 29 |
|
| 30 |
+
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenwu/huggingface/runs/l01140ez)
|
| 31 |
|
| 32 |
|
| 33 |
This model was trained with SFT.
|
checkpoint-128/model-00001-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4976698672
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40fa941dc7eda6dd2d9d5ec937b0f9a23e8642f4622f7f1dd49a2fcd475960a5
|
| 3 |
size 4976698672
|
checkpoint-128/model-00002-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4999802720
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13fe6aadd034f0361cb93df64a5a9ee6539c37ac3bf09a10984a1b9f27ec96c1
|
| 3 |
size 4999802720
|
checkpoint-128/model-00003-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4915916176
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d5094186aa8995114e74046358974b9cced60317442e262f194885388756bd5a
|
| 3 |
size 4915916176
|
checkpoint-128/model-00004-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1168138808
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:677385a5dc6b6fb78ebfef1f574dbb84568cfe5cc534db5377d92146be10f265
|
| 3 |
size 1168138808
|
checkpoint-128/trainer_state.json
CHANGED
|
@@ -11,7 +11,7 @@
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.0625,
|
| 14 |
-
"grad_norm": 5.
|
| 15 |
"learning_rate": 1e-05,
|
| 16 |
"loss": 0.553,
|
| 17 |
"mean_token_accuracy": 0.8401249051094055,
|
|
@@ -20,109 +20,109 @@
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"epoch": 0.625,
|
| 23 |
-
"grad_norm": 1.
|
| 24 |
"learning_rate": 1e-05,
|
| 25 |
-
"loss": 0.
|
| 26 |
-
"mean_token_accuracy": 0.
|
| 27 |
"num_tokens": 401401.0,
|
| 28 |
"step": 10
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"epoch": 1.25,
|
| 32 |
-
"grad_norm": 1.
|
| 33 |
"learning_rate": 1e-05,
|
| 34 |
-
"loss": 0.
|
| 35 |
-
"mean_token_accuracy": 0.
|
| 36 |
"num_tokens": 799732.0,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.875,
|
| 41 |
-
"grad_norm": 1.
|
| 42 |
"learning_rate": 1e-05,
|
| 43 |
-
"loss": 0.
|
| 44 |
-
"mean_token_accuracy": 0.
|
| 45 |
"num_tokens": 1199159.0,
|
| 46 |
"step": 30
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"epoch": 2.5,
|
| 50 |
-
"grad_norm": 2.
|
| 51 |
"learning_rate": 1e-05,
|
| 52 |
-
"loss": 0.
|
| 53 |
-
"mean_token_accuracy": 0.
|
| 54 |
"num_tokens": 1592321.0,
|
| 55 |
"step": 40
|
| 56 |
},
|
| 57 |
{
|
| 58 |
"epoch": 3.125,
|
| 59 |
-
"grad_norm": 0.
|
| 60 |
"learning_rate": 1e-05,
|
| 61 |
-
"loss": 0.
|
| 62 |
-
"mean_token_accuracy": 0.
|
| 63 |
"num_tokens": 1998955.0,
|
| 64 |
"step": 50
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"epoch": 3.75,
|
| 68 |
-
"grad_norm": 1.
|
| 69 |
"learning_rate": 1e-05,
|
| 70 |
-
"loss": 0.
|
| 71 |
-
"mean_token_accuracy": 0.
|
| 72 |
"num_tokens": 2397408.0,
|
| 73 |
"step": 60
|
| 74 |
},
|
| 75 |
{
|
| 76 |
"epoch": 4.375,
|
| 77 |
-
"grad_norm": 0.
|
| 78 |
"learning_rate": 1e-05,
|
| 79 |
-
"loss": 0.
|
| 80 |
-
"mean_token_accuracy": 0.
|
| 81 |
"num_tokens": 2798080.0,
|
| 82 |
"step": 70
|
| 83 |
},
|
| 84 |
{
|
| 85 |
"epoch": 5.0,
|
| 86 |
-
"grad_norm": 1.
|
| 87 |
"learning_rate": 1e-05,
|
| 88 |
-
"loss": 0.
|
| 89 |
-
"mean_token_accuracy": 0.
|
| 90 |
"num_tokens": 3199940.0,
|
| 91 |
"step": 80
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"epoch": 5.625,
|
| 95 |
-
"grad_norm": 0.
|
| 96 |
"learning_rate": 1e-05,
|
| 97 |
-
"loss": 0.
|
| 98 |
-
"mean_token_accuracy": 0.
|
| 99 |
"num_tokens": 3601545.0,
|
| 100 |
"step": 90
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"epoch": 6.25,
|
| 104 |
-
"grad_norm": 0.
|
| 105 |
"learning_rate": 1e-05,
|
| 106 |
-
"loss": 0.
|
| 107 |
-
"mean_token_accuracy": 0.
|
| 108 |
"num_tokens": 3998166.0,
|
| 109 |
"step": 100
|
| 110 |
},
|
| 111 |
{
|
| 112 |
"epoch": 6.875,
|
| 113 |
-
"grad_norm": 0.
|
| 114 |
"learning_rate": 1e-05,
|
| 115 |
-
"loss": 0.
|
| 116 |
-
"mean_token_accuracy": 0.
|
| 117 |
"num_tokens": 4400384.0,
|
| 118 |
"step": 110
|
| 119 |
},
|
| 120 |
{
|
| 121 |
"epoch": 7.5,
|
| 122 |
-
"grad_norm": 0.
|
| 123 |
"learning_rate": 1e-05,
|
| 124 |
-
"loss": 0.
|
| 125 |
-
"mean_token_accuracy": 0.
|
| 126 |
"num_tokens": 4799604.0,
|
| 127 |
"step": 120
|
| 128 |
}
|
|
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.0625,
|
| 14 |
+
"grad_norm": 5.229332748167166,
|
| 15 |
"learning_rate": 1e-05,
|
| 16 |
"loss": 0.553,
|
| 17 |
"mean_token_accuracy": 0.8401249051094055,
|
|
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"epoch": 0.625,
|
| 23 |
+
"grad_norm": 1.878449792522089,
|
| 24 |
"learning_rate": 1e-05,
|
| 25 |
+
"loss": 0.4425,
|
| 26 |
+
"mean_token_accuracy": 0.8662860658433702,
|
| 27 |
"num_tokens": 401401.0,
|
| 28 |
"step": 10
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"epoch": 1.25,
|
| 32 |
+
"grad_norm": 1.4215514241135978,
|
| 33 |
"learning_rate": 1e-05,
|
| 34 |
+
"loss": 0.3341,
|
| 35 |
+
"mean_token_accuracy": 0.8969074487686157,
|
| 36 |
"num_tokens": 799732.0,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.875,
|
| 41 |
+
"grad_norm": 1.4575405337602632,
|
| 42 |
"learning_rate": 1e-05,
|
| 43 |
+
"loss": 0.2066,
|
| 44 |
+
"mean_token_accuracy": 0.9323108971118927,
|
| 45 |
"num_tokens": 1199159.0,
|
| 46 |
"step": 30
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"epoch": 2.5,
|
| 50 |
+
"grad_norm": 2.103711519590209,
|
| 51 |
"learning_rate": 1e-05,
|
| 52 |
+
"loss": 0.1032,
|
| 53 |
+
"mean_token_accuracy": 0.9672375440597534,
|
| 54 |
"num_tokens": 1592321.0,
|
| 55 |
"step": 40
|
| 56 |
},
|
| 57 |
{
|
| 58 |
"epoch": 3.125,
|
| 59 |
+
"grad_norm": 0.8849927277663293,
|
| 60 |
"learning_rate": 1e-05,
|
| 61 |
+
"loss": 0.0611,
|
| 62 |
+
"mean_token_accuracy": 0.9806811392307282,
|
| 63 |
"num_tokens": 1998955.0,
|
| 64 |
"step": 50
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"epoch": 3.75,
|
| 68 |
+
"grad_norm": 1.0988924501749624,
|
| 69 |
"learning_rate": 1e-05,
|
| 70 |
+
"loss": 0.0292,
|
| 71 |
+
"mean_token_accuracy": 0.9918049573898315,
|
| 72 |
"num_tokens": 2397408.0,
|
| 73 |
"step": 60
|
| 74 |
},
|
| 75 |
{
|
| 76 |
"epoch": 4.375,
|
| 77 |
+
"grad_norm": 0.9209838605387056,
|
| 78 |
"learning_rate": 1e-05,
|
| 79 |
+
"loss": 0.0197,
|
| 80 |
+
"mean_token_accuracy": 0.9946328461170196,
|
| 81 |
"num_tokens": 2798080.0,
|
| 82 |
"step": 70
|
| 83 |
},
|
| 84 |
{
|
| 85 |
"epoch": 5.0,
|
| 86 |
+
"grad_norm": 1.145881672413454,
|
| 87 |
"learning_rate": 1e-05,
|
| 88 |
+
"loss": 0.0184,
|
| 89 |
+
"mean_token_accuracy": 0.9954161286354065,
|
| 90 |
"num_tokens": 3199940.0,
|
| 91 |
"step": 80
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"epoch": 5.625,
|
| 95 |
+
"grad_norm": 0.8760509498406688,
|
| 96 |
"learning_rate": 1e-05,
|
| 97 |
+
"loss": 0.0094,
|
| 98 |
+
"mean_token_accuracy": 0.9976460933685303,
|
| 99 |
"num_tokens": 3601545.0,
|
| 100 |
"step": 90
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"epoch": 6.25,
|
| 104 |
+
"grad_norm": 0.6303656980191445,
|
| 105 |
"learning_rate": 1e-05,
|
| 106 |
+
"loss": 0.0089,
|
| 107 |
+
"mean_token_accuracy": 0.9977393686771393,
|
| 108 |
"num_tokens": 3998166.0,
|
| 109 |
"step": 100
|
| 110 |
},
|
| 111 |
{
|
| 112 |
"epoch": 6.875,
|
| 113 |
+
"grad_norm": 0.737583281857238,
|
| 114 |
"learning_rate": 1e-05,
|
| 115 |
+
"loss": 0.008,
|
| 116 |
+
"mean_token_accuracy": 0.9981118857860565,
|
| 117 |
"num_tokens": 4400384.0,
|
| 118 |
"step": 110
|
| 119 |
},
|
| 120 |
{
|
| 121 |
"epoch": 7.5,
|
| 122 |
+
"grad_norm": 0.5676177953651935,
|
| 123 |
"learning_rate": 1e-05,
|
| 124 |
+
"loss": 0.0053,
|
| 125 |
+
"mean_token_accuracy": 0.9985986590385437,
|
| 126 |
"num_tokens": 4799604.0,
|
| 127 |
"step": 120
|
| 128 |
}
|
checkpoint-128/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 8081
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c29f8fe80083b5b7c0f17189143d52e9e222441f308cf2fc22d25021e0096e3
|
| 3 |
size 8081
|
checkpoint-32/model-00001-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4976698672
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a28a9e5797fde7e49088949ab39301b942d08c68a323e48d48f880a2dc2f31df
|
| 3 |
size 4976698672
|
checkpoint-32/model-00002-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4999802720
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c0d8c607046cd885ce539e1e07e480893946a0e3b8013882c90b4b84135c1d5
|
| 3 |
size 4999802720
|
checkpoint-32/model-00003-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4915916176
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6a91279de971d2da10439673c5f4c919c9a1c2084b7d56cddd19f1833e3026d
|
| 3 |
size 4915916176
|
checkpoint-32/model-00004-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1168138808
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d090159b203d889219d031869d0b7100b700e438446783e79a2fa4001ffe5635
|
| 3 |
size 1168138808
|
checkpoint-32/trainer_state.json
CHANGED
|
@@ -11,7 +11,7 @@
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.0625,
|
| 14 |
-
"grad_norm": 5.
|
| 15 |
"learning_rate": 1e-05,
|
| 16 |
"loss": 0.553,
|
| 17 |
"mean_token_accuracy": 0.8401249051094055,
|
|
@@ -20,28 +20,28 @@
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"epoch": 0.625,
|
| 23 |
-
"grad_norm": 1.
|
| 24 |
"learning_rate": 1e-05,
|
| 25 |
-
"loss": 0.
|
| 26 |
-
"mean_token_accuracy": 0.
|
| 27 |
"num_tokens": 401401.0,
|
| 28 |
"step": 10
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"epoch": 1.25,
|
| 32 |
-
"grad_norm": 1.
|
| 33 |
"learning_rate": 1e-05,
|
| 34 |
-
"loss": 0.
|
| 35 |
-
"mean_token_accuracy": 0.
|
| 36 |
"num_tokens": 799732.0,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.875,
|
| 41 |
-
"grad_norm": 1.
|
| 42 |
"learning_rate": 1e-05,
|
| 43 |
-
"loss": 0.
|
| 44 |
-
"mean_token_accuracy": 0.
|
| 45 |
"num_tokens": 1199159.0,
|
| 46 |
"step": 30
|
| 47 |
}
|
|
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.0625,
|
| 14 |
+
"grad_norm": 5.229332748167166,
|
| 15 |
"learning_rate": 1e-05,
|
| 16 |
"loss": 0.553,
|
| 17 |
"mean_token_accuracy": 0.8401249051094055,
|
|
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"epoch": 0.625,
|
| 23 |
+
"grad_norm": 1.878449792522089,
|
| 24 |
"learning_rate": 1e-05,
|
| 25 |
+
"loss": 0.4425,
|
| 26 |
+
"mean_token_accuracy": 0.8662860658433702,
|
| 27 |
"num_tokens": 401401.0,
|
| 28 |
"step": 10
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"epoch": 1.25,
|
| 32 |
+
"grad_norm": 1.4215514241135978,
|
| 33 |
"learning_rate": 1e-05,
|
| 34 |
+
"loss": 0.3341,
|
| 35 |
+
"mean_token_accuracy": 0.8969074487686157,
|
| 36 |
"num_tokens": 799732.0,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.875,
|
| 41 |
+
"grad_norm": 1.4575405337602632,
|
| 42 |
"learning_rate": 1e-05,
|
| 43 |
+
"loss": 0.2066,
|
| 44 |
+
"mean_token_accuracy": 0.9323108971118927,
|
| 45 |
"num_tokens": 1199159.0,
|
| 46 |
"step": 30
|
| 47 |
}
|
checkpoint-32/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 8081
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c29f8fe80083b5b7c0f17189143d52e9e222441f308cf2fc22d25021e0096e3
|
| 3 |
size 8081
|
checkpoint-64/model-00001-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4976698672
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00fbe653cb7ebbf34086241b234f117c83033ad1a430ee2a8293c94345910a52
|
| 3 |
size 4976698672
|
checkpoint-64/model-00002-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4999802720
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e8c8c6db9f5a0f64b053d5ed412d770f46e1b21fb22921ad3cbccc9eefc9c483
|
| 3 |
size 4999802720
|
checkpoint-64/model-00003-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4915916176
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3e632cf3a6e8508a9d0a2ebb86f758925169f16f34c0e1a9a9f1a0b01fd45a79
|
| 3 |
size 4915916176
|
checkpoint-64/model-00004-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1168138808
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:29a6f46c1fa988045aac05f3c9c3c04c706b1314a062ea50f31f7a662a19c98a
|
| 3 |
size 1168138808
|
checkpoint-64/trainer_state.json
CHANGED
|
@@ -11,7 +11,7 @@
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.0625,
|
| 14 |
-
"grad_norm": 5.
|
| 15 |
"learning_rate": 1e-05,
|
| 16 |
"loss": 0.553,
|
| 17 |
"mean_token_accuracy": 0.8401249051094055,
|
|
@@ -20,55 +20,55 @@
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"epoch": 0.625,
|
| 23 |
-
"grad_norm": 1.
|
| 24 |
"learning_rate": 1e-05,
|
| 25 |
-
"loss": 0.
|
| 26 |
-
"mean_token_accuracy": 0.
|
| 27 |
"num_tokens": 401401.0,
|
| 28 |
"step": 10
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"epoch": 1.25,
|
| 32 |
-
"grad_norm": 1.
|
| 33 |
"learning_rate": 1e-05,
|
| 34 |
-
"loss": 0.
|
| 35 |
-
"mean_token_accuracy": 0.
|
| 36 |
"num_tokens": 799732.0,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.875,
|
| 41 |
-
"grad_norm": 1.
|
| 42 |
"learning_rate": 1e-05,
|
| 43 |
-
"loss": 0.
|
| 44 |
-
"mean_token_accuracy": 0.
|
| 45 |
"num_tokens": 1199159.0,
|
| 46 |
"step": 30
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"epoch": 2.5,
|
| 50 |
-
"grad_norm": 2.
|
| 51 |
"learning_rate": 1e-05,
|
| 52 |
-
"loss": 0.
|
| 53 |
-
"mean_token_accuracy": 0.
|
| 54 |
"num_tokens": 1592321.0,
|
| 55 |
"step": 40
|
| 56 |
},
|
| 57 |
{
|
| 58 |
"epoch": 3.125,
|
| 59 |
-
"grad_norm": 0.
|
| 60 |
"learning_rate": 1e-05,
|
| 61 |
-
"loss": 0.
|
| 62 |
-
"mean_token_accuracy": 0.
|
| 63 |
"num_tokens": 1998955.0,
|
| 64 |
"step": 50
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"epoch": 3.75,
|
| 68 |
-
"grad_norm": 1.
|
| 69 |
"learning_rate": 1e-05,
|
| 70 |
-
"loss": 0.
|
| 71 |
-
"mean_token_accuracy": 0.
|
| 72 |
"num_tokens": 2397408.0,
|
| 73 |
"step": 60
|
| 74 |
}
|
|
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.0625,
|
| 14 |
+
"grad_norm": 5.229332748167166,
|
| 15 |
"learning_rate": 1e-05,
|
| 16 |
"loss": 0.553,
|
| 17 |
"mean_token_accuracy": 0.8401249051094055,
|
|
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"epoch": 0.625,
|
| 23 |
+
"grad_norm": 1.878449792522089,
|
| 24 |
"learning_rate": 1e-05,
|
| 25 |
+
"loss": 0.4425,
|
| 26 |
+
"mean_token_accuracy": 0.8662860658433702,
|
| 27 |
"num_tokens": 401401.0,
|
| 28 |
"step": 10
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"epoch": 1.25,
|
| 32 |
+
"grad_norm": 1.4215514241135978,
|
| 33 |
"learning_rate": 1e-05,
|
| 34 |
+
"loss": 0.3341,
|
| 35 |
+
"mean_token_accuracy": 0.8969074487686157,
|
| 36 |
"num_tokens": 799732.0,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.875,
|
| 41 |
+
"grad_norm": 1.4575405337602632,
|
| 42 |
"learning_rate": 1e-05,
|
| 43 |
+
"loss": 0.2066,
|
| 44 |
+
"mean_token_accuracy": 0.9323108971118927,
|
| 45 |
"num_tokens": 1199159.0,
|
| 46 |
"step": 30
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"epoch": 2.5,
|
| 50 |
+
"grad_norm": 2.103711519590209,
|
| 51 |
"learning_rate": 1e-05,
|
| 52 |
+
"loss": 0.1032,
|
| 53 |
+
"mean_token_accuracy": 0.9672375440597534,
|
| 54 |
"num_tokens": 1592321.0,
|
| 55 |
"step": 40
|
| 56 |
},
|
| 57 |
{
|
| 58 |
"epoch": 3.125,
|
| 59 |
+
"grad_norm": 0.8849927277663293,
|
| 60 |
"learning_rate": 1e-05,
|
| 61 |
+
"loss": 0.0611,
|
| 62 |
+
"mean_token_accuracy": 0.9806811392307282,
|
| 63 |
"num_tokens": 1998955.0,
|
| 64 |
"step": 50
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"epoch": 3.75,
|
| 68 |
+
"grad_norm": 1.0988924501749624,
|
| 69 |
"learning_rate": 1e-05,
|
| 70 |
+
"loss": 0.0292,
|
| 71 |
+
"mean_token_accuracy": 0.9918049573898315,
|
| 72 |
"num_tokens": 2397408.0,
|
| 73 |
"step": 60
|
| 74 |
}
|
checkpoint-64/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 8081
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c29f8fe80083b5b7c0f17189143d52e9e222441f308cf2fc22d25021e0096e3
|
| 3 |
size 8081
|
checkpoint-96/model-00001-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4976698672
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6914a1438136e282d6bcf40bef64b766ed70117fe040d52f0ec393d760378169
|
| 3 |
size 4976698672
|
checkpoint-96/model-00002-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4999802720
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e58cf5422d1aa176b52eb65b5c6b33963574f79372067c6312a83c597116796d
|
| 3 |
size 4999802720
|
checkpoint-96/model-00003-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4915916176
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a74864c894aeca1872c8398d5a779c786f09ff6a99d00ed1b05366e63dd943b
|
| 3 |
size 4915916176
|
checkpoint-96/model-00004-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1168138808
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b5469fe62b9764063be846645c926a92b9fb00883c6b5ec1f63c697e62e5ca6
|
| 3 |
size 1168138808
|
checkpoint-96/trainer_state.json
CHANGED
|
@@ -11,7 +11,7 @@
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.0625,
|
| 14 |
-
"grad_norm": 5.
|
| 15 |
"learning_rate": 1e-05,
|
| 16 |
"loss": 0.553,
|
| 17 |
"mean_token_accuracy": 0.8401249051094055,
|
|
@@ -20,82 +20,82 @@
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"epoch": 0.625,
|
| 23 |
-
"grad_norm": 1.
|
| 24 |
"learning_rate": 1e-05,
|
| 25 |
-
"loss": 0.
|
| 26 |
-
"mean_token_accuracy": 0.
|
| 27 |
"num_tokens": 401401.0,
|
| 28 |
"step": 10
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"epoch": 1.25,
|
| 32 |
-
"grad_norm": 1.
|
| 33 |
"learning_rate": 1e-05,
|
| 34 |
-
"loss": 0.
|
| 35 |
-
"mean_token_accuracy": 0.
|
| 36 |
"num_tokens": 799732.0,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.875,
|
| 41 |
-
"grad_norm": 1.
|
| 42 |
"learning_rate": 1e-05,
|
| 43 |
-
"loss": 0.
|
| 44 |
-
"mean_token_accuracy": 0.
|
| 45 |
"num_tokens": 1199159.0,
|
| 46 |
"step": 30
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"epoch": 2.5,
|
| 50 |
-
"grad_norm": 2.
|
| 51 |
"learning_rate": 1e-05,
|
| 52 |
-
"loss": 0.
|
| 53 |
-
"mean_token_accuracy": 0.
|
| 54 |
"num_tokens": 1592321.0,
|
| 55 |
"step": 40
|
| 56 |
},
|
| 57 |
{
|
| 58 |
"epoch": 3.125,
|
| 59 |
-
"grad_norm": 0.
|
| 60 |
"learning_rate": 1e-05,
|
| 61 |
-
"loss": 0.
|
| 62 |
-
"mean_token_accuracy": 0.
|
| 63 |
"num_tokens": 1998955.0,
|
| 64 |
"step": 50
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"epoch": 3.75,
|
| 68 |
-
"grad_norm": 1.
|
| 69 |
"learning_rate": 1e-05,
|
| 70 |
-
"loss": 0.
|
| 71 |
-
"mean_token_accuracy": 0.
|
| 72 |
"num_tokens": 2397408.0,
|
| 73 |
"step": 60
|
| 74 |
},
|
| 75 |
{
|
| 76 |
"epoch": 4.375,
|
| 77 |
-
"grad_norm": 0.
|
| 78 |
"learning_rate": 1e-05,
|
| 79 |
-
"loss": 0.
|
| 80 |
-
"mean_token_accuracy": 0.
|
| 81 |
"num_tokens": 2798080.0,
|
| 82 |
"step": 70
|
| 83 |
},
|
| 84 |
{
|
| 85 |
"epoch": 5.0,
|
| 86 |
-
"grad_norm": 1.
|
| 87 |
"learning_rate": 1e-05,
|
| 88 |
-
"loss": 0.
|
| 89 |
-
"mean_token_accuracy": 0.
|
| 90 |
"num_tokens": 3199940.0,
|
| 91 |
"step": 80
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"epoch": 5.625,
|
| 95 |
-
"grad_norm": 0.
|
| 96 |
"learning_rate": 1e-05,
|
| 97 |
-
"loss": 0.
|
| 98 |
-
"mean_token_accuracy": 0.
|
| 99 |
"num_tokens": 3601545.0,
|
| 100 |
"step": 90
|
| 101 |
}
|
|
|
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
"epoch": 0.0625,
|
| 14 |
+
"grad_norm": 5.229332748167166,
|
| 15 |
"learning_rate": 1e-05,
|
| 16 |
"loss": 0.553,
|
| 17 |
"mean_token_accuracy": 0.8401249051094055,
|
|
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"epoch": 0.625,
|
| 23 |
+
"grad_norm": 1.878449792522089,
|
| 24 |
"learning_rate": 1e-05,
|
| 25 |
+
"loss": 0.4425,
|
| 26 |
+
"mean_token_accuracy": 0.8662860658433702,
|
| 27 |
"num_tokens": 401401.0,
|
| 28 |
"step": 10
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"epoch": 1.25,
|
| 32 |
+
"grad_norm": 1.4215514241135978,
|
| 33 |
"learning_rate": 1e-05,
|
| 34 |
+
"loss": 0.3341,
|
| 35 |
+
"mean_token_accuracy": 0.8969074487686157,
|
| 36 |
"num_tokens": 799732.0,
|
| 37 |
"step": 20
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 1.875,
|
| 41 |
+
"grad_norm": 1.4575405337602632,
|
| 42 |
"learning_rate": 1e-05,
|
| 43 |
+
"loss": 0.2066,
|
| 44 |
+
"mean_token_accuracy": 0.9323108971118927,
|
| 45 |
"num_tokens": 1199159.0,
|
| 46 |
"step": 30
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"epoch": 2.5,
|
| 50 |
+
"grad_norm": 2.103711519590209,
|
| 51 |
"learning_rate": 1e-05,
|
| 52 |
+
"loss": 0.1032,
|
| 53 |
+
"mean_token_accuracy": 0.9672375440597534,
|
| 54 |
"num_tokens": 1592321.0,
|
| 55 |
"step": 40
|
| 56 |
},
|
| 57 |
{
|
| 58 |
"epoch": 3.125,
|
| 59 |
+
"grad_norm": 0.8849927277663293,
|
| 60 |
"learning_rate": 1e-05,
|
| 61 |
+
"loss": 0.0611,
|
| 62 |
+
"mean_token_accuracy": 0.9806811392307282,
|
| 63 |
"num_tokens": 1998955.0,
|
| 64 |
"step": 50
|
| 65 |
},
|
| 66 |
{
|
| 67 |
"epoch": 3.75,
|
| 68 |
+
"grad_norm": 1.0988924501749624,
|
| 69 |
"learning_rate": 1e-05,
|
| 70 |
+
"loss": 0.0292,
|
| 71 |
+
"mean_token_accuracy": 0.9918049573898315,
|
| 72 |
"num_tokens": 2397408.0,
|
| 73 |
"step": 60
|
| 74 |
},
|
| 75 |
{
|
| 76 |
"epoch": 4.375,
|
| 77 |
+
"grad_norm": 0.9209838605387056,
|
| 78 |
"learning_rate": 1e-05,
|
| 79 |
+
"loss": 0.0197,
|
| 80 |
+
"mean_token_accuracy": 0.9946328461170196,
|
| 81 |
"num_tokens": 2798080.0,
|
| 82 |
"step": 70
|
| 83 |
},
|
| 84 |
{
|
| 85 |
"epoch": 5.0,
|
| 86 |
+
"grad_norm": 1.145881672413454,
|
| 87 |
"learning_rate": 1e-05,
|
| 88 |
+
"loss": 0.0184,
|
| 89 |
+
"mean_token_accuracy": 0.9954161286354065,
|
| 90 |
"num_tokens": 3199940.0,
|
| 91 |
"step": 80
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"epoch": 5.625,
|
| 95 |
+
"grad_norm": 0.8760509498406688,
|
| 96 |
"learning_rate": 1e-05,
|
| 97 |
+
"loss": 0.0094,
|
| 98 |
+
"mean_token_accuracy": 0.9976460933685303,
|
| 99 |
"num_tokens": 3601545.0,
|
| 100 |
"step": 90
|
| 101 |
}
|
checkpoint-96/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 8081
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c29f8fe80083b5b7c0f17189143d52e9e222441f308cf2fc22d25021e0096e3
|
| 3 |
size 8081
|