ChenWu98 committed on
Commit
ba16a81
·
verified ·
1 Parent(s): e043aad

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -4,8 +4,8 @@ library_name: transformers
4
  model_name: d1_train_1024_no_reasoning_llama3_8B
5
  tags:
6
  - generated_from_trainer
7
- - trl
8
  - sft
 
9
  licence: license
10
  ---
11
 
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenwu/huggingface/runs/bomjniwi)
31
 
32
 
33
  This model was trained with SFT.
 
4
  model_name: d1_train_1024_no_reasoning_llama3_8B
5
  tags:
6
  - generated_from_trainer
 
7
  - sft
8
+ - trl
9
  licence: license
10
  ---
11
 
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenwu/huggingface/runs/l01140ez)
31
 
32
 
33
  This model was trained with SFT.
checkpoint-128/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8f0ef39e33ae7eb14ee7cc666dfa9b94278355748bf86d44ad981d087111580
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40fa941dc7eda6dd2d9d5ec937b0f9a23e8642f4622f7f1dd49a2fcd475960a5
3
  size 4976698672
checkpoint-128/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9acf7eb8792a82212fdd297a6fd0a954ac9f43ddf9e98022cf450f1f12c8a6ec
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13fe6aadd034f0361cb93df64a5a9ee6539c37ac3bf09a10984a1b9f27ec96c1
3
  size 4999802720
checkpoint-128/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10a687f40a20e1c92dfb329ff45456bc4e50c130752ce770f39ed82311fbf8a8
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5094186aa8995114e74046358974b9cced60317442e262f194885388756bd5a
3
  size 4915916176
checkpoint-128/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e22ed6f07f17fa0258c60559e6844d93b48b58cfe068a4dc4c95f7b4849b42a
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:677385a5dc6b6fb78ebfef1f574dbb84568cfe5cc534db5377d92146be10f265
3
  size 1168138808
checkpoint-128/trainer_state.json CHANGED
@@ -11,7 +11,7 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
- "grad_norm": 5.229090624867237,
15
  "learning_rate": 1e-05,
16
  "loss": 0.553,
17
  "mean_token_accuracy": 0.8401249051094055,
@@ -20,109 +20,109 @@
20
  },
21
  {
22
  "epoch": 0.625,
23
- "grad_norm": 1.8775022993488002,
24
  "learning_rate": 1e-05,
25
- "loss": 0.443,
26
- "mean_token_accuracy": 0.8661629954973856,
27
  "num_tokens": 401401.0,
28
  "step": 10
29
  },
30
  {
31
  "epoch": 1.25,
32
- "grad_norm": 1.4054008068425676,
33
  "learning_rate": 1e-05,
34
- "loss": 0.3346,
35
- "mean_token_accuracy": 0.8969329178333283,
36
  "num_tokens": 799732.0,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 1.875,
41
- "grad_norm": 1.4215176512700638,
42
  "learning_rate": 1e-05,
43
- "loss": 0.2071,
44
- "mean_token_accuracy": 0.9320787608623504,
45
  "num_tokens": 1199159.0,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 2.5,
50
- "grad_norm": 2.0853381701848304,
51
  "learning_rate": 1e-05,
52
- "loss": 0.1034,
53
- "mean_token_accuracy": 0.9672571659088135,
54
  "num_tokens": 1592321.0,
55
  "step": 40
56
  },
57
  {
58
  "epoch": 3.125,
59
- "grad_norm": 0.9298876909455216,
60
  "learning_rate": 1e-05,
61
- "loss": 0.0614,
62
- "mean_token_accuracy": 0.9805558681488037,
63
  "num_tokens": 1998955.0,
64
  "step": 50
65
  },
66
  {
67
  "epoch": 3.75,
68
- "grad_norm": 1.184064034490825,
69
  "learning_rate": 1e-05,
70
- "loss": 0.0298,
71
- "mean_token_accuracy": 0.9916574656963348,
72
  "num_tokens": 2397408.0,
73
  "step": 60
74
  },
75
  {
76
  "epoch": 4.375,
77
- "grad_norm": 0.8214791537068664,
78
  "learning_rate": 1e-05,
79
- "loss": 0.019,
80
- "mean_token_accuracy": 0.9946628630161285,
81
  "num_tokens": 2798080.0,
82
  "step": 70
83
  },
84
  {
85
  "epoch": 5.0,
86
- "grad_norm": 1.0482853792795606,
87
  "learning_rate": 1e-05,
88
- "loss": 0.0168,
89
- "mean_token_accuracy": 0.9956359148025513,
90
  "num_tokens": 3199940.0,
91
  "step": 80
92
  },
93
  {
94
  "epoch": 5.625,
95
- "grad_norm": 0.8533738215069255,
96
  "learning_rate": 1e-05,
97
- "loss": 0.0088,
98
- "mean_token_accuracy": 0.9977155506610871,
99
  "num_tokens": 3601545.0,
100
  "step": 90
101
  },
102
  {
103
  "epoch": 6.25,
104
- "grad_norm": 0.6832402024714068,
105
  "learning_rate": 1e-05,
106
- "loss": 0.0086,
107
- "mean_token_accuracy": 0.9978549599647522,
108
  "num_tokens": 3998166.0,
109
  "step": 100
110
  },
111
  {
112
  "epoch": 6.875,
113
- "grad_norm": 0.7633873323361593,
114
  "learning_rate": 1e-05,
115
- "loss": 0.0079,
116
- "mean_token_accuracy": 0.9980584681034088,
117
  "num_tokens": 4400384.0,
118
  "step": 110
119
  },
120
  {
121
  "epoch": 7.5,
122
- "grad_norm": 0.7715988435304115,
123
  "learning_rate": 1e-05,
124
- "loss": 0.0052,
125
- "mean_token_accuracy": 0.9987416326999664,
126
  "num_tokens": 4799604.0,
127
  "step": 120
128
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
+ "grad_norm": 5.229332748167166,
15
  "learning_rate": 1e-05,
16
  "loss": 0.553,
17
  "mean_token_accuracy": 0.8401249051094055,
 
20
  },
21
  {
22
  "epoch": 0.625,
23
+ "grad_norm": 1.878449792522089,
24
  "learning_rate": 1e-05,
25
+ "loss": 0.4425,
26
+ "mean_token_accuracy": 0.8662860658433702,
27
  "num_tokens": 401401.0,
28
  "step": 10
29
  },
30
  {
31
  "epoch": 1.25,
32
+ "grad_norm": 1.4215514241135978,
33
  "learning_rate": 1e-05,
34
+ "loss": 0.3341,
35
+ "mean_token_accuracy": 0.8969074487686157,
36
  "num_tokens": 799732.0,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 1.875,
41
+ "grad_norm": 1.4575405337602632,
42
  "learning_rate": 1e-05,
43
+ "loss": 0.2066,
44
+ "mean_token_accuracy": 0.9323108971118927,
45
  "num_tokens": 1199159.0,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 2.5,
50
+ "grad_norm": 2.103711519590209,
51
  "learning_rate": 1e-05,
52
+ "loss": 0.1032,
53
+ "mean_token_accuracy": 0.9672375440597534,
54
  "num_tokens": 1592321.0,
55
  "step": 40
56
  },
57
  {
58
  "epoch": 3.125,
59
+ "grad_norm": 0.8849927277663293,
60
  "learning_rate": 1e-05,
61
+ "loss": 0.0611,
62
+ "mean_token_accuracy": 0.9806811392307282,
63
  "num_tokens": 1998955.0,
64
  "step": 50
65
  },
66
  {
67
  "epoch": 3.75,
68
+ "grad_norm": 1.0988924501749624,
69
  "learning_rate": 1e-05,
70
+ "loss": 0.0292,
71
+ "mean_token_accuracy": 0.9918049573898315,
72
  "num_tokens": 2397408.0,
73
  "step": 60
74
  },
75
  {
76
  "epoch": 4.375,
77
+ "grad_norm": 0.9209838605387056,
78
  "learning_rate": 1e-05,
79
+ "loss": 0.0197,
80
+ "mean_token_accuracy": 0.9946328461170196,
81
  "num_tokens": 2798080.0,
82
  "step": 70
83
  },
84
  {
85
  "epoch": 5.0,
86
+ "grad_norm": 1.145881672413454,
87
  "learning_rate": 1e-05,
88
+ "loss": 0.0184,
89
+ "mean_token_accuracy": 0.9954161286354065,
90
  "num_tokens": 3199940.0,
91
  "step": 80
92
  },
93
  {
94
  "epoch": 5.625,
95
+ "grad_norm": 0.8760509498406688,
96
  "learning_rate": 1e-05,
97
+ "loss": 0.0094,
98
+ "mean_token_accuracy": 0.9976460933685303,
99
  "num_tokens": 3601545.0,
100
  "step": 90
101
  },
102
  {
103
  "epoch": 6.25,
104
+ "grad_norm": 0.6303656980191445,
105
  "learning_rate": 1e-05,
106
+ "loss": 0.0089,
107
+ "mean_token_accuracy": 0.9977393686771393,
108
  "num_tokens": 3998166.0,
109
  "step": 100
110
  },
111
  {
112
  "epoch": 6.875,
113
+ "grad_norm": 0.737583281857238,
114
  "learning_rate": 1e-05,
115
+ "loss": 0.008,
116
+ "mean_token_accuracy": 0.9981118857860565,
117
  "num_tokens": 4400384.0,
118
  "step": 110
119
  },
120
  {
121
  "epoch": 7.5,
122
+ "grad_norm": 0.5676177953651935,
123
  "learning_rate": 1e-05,
124
+ "loss": 0.0053,
125
+ "mean_token_accuracy": 0.9985986590385437,
126
  "num_tokens": 4799604.0,
127
  "step": 120
128
  }
checkpoint-128/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf9d951d05098026f9d17656169173d7d8769eb9865665888a43420e263ac25c
3
  size 8081
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c29f8fe80083b5b7c0f17189143d52e9e222441f308cf2fc22d25021e0096e3
3
  size 8081
checkpoint-32/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60d0dbb4a13be74ad990aed12e1389840238e4a5050917310a1fe798fc9069b9
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a28a9e5797fde7e49088949ab39301b942d08c68a323e48d48f880a2dc2f31df
3
  size 4976698672
checkpoint-32/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ede8bc15400697d2489a46a82f6103cf15d1ec4a968dab8e8633e89da4ffcec
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c0d8c607046cd885ce539e1e07e480893946a0e3b8013882c90b4b84135c1d5
3
  size 4999802720
checkpoint-32/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efb727f76ceef896d0eb3e9121ee41c77b14c07e68558e959d9cf14e01caaf4a
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6a91279de971d2da10439673c5f4c919c9a1c2084b7d56cddd19f1833e3026d
3
  size 4915916176
checkpoint-32/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc5ae0f36b0762d53c699f698f9ad86c981e406cc1e623141ba101e8491dae65
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d090159b203d889219d031869d0b7100b700e438446783e79a2fa4001ffe5635
3
  size 1168138808
checkpoint-32/trainer_state.json CHANGED
@@ -11,7 +11,7 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
- "grad_norm": 5.229090624867237,
15
  "learning_rate": 1e-05,
16
  "loss": 0.553,
17
  "mean_token_accuracy": 0.8401249051094055,
@@ -20,28 +20,28 @@
20
  },
21
  {
22
  "epoch": 0.625,
23
- "grad_norm": 1.8775022993488002,
24
  "learning_rate": 1e-05,
25
- "loss": 0.443,
26
- "mean_token_accuracy": 0.8661629954973856,
27
  "num_tokens": 401401.0,
28
  "step": 10
29
  },
30
  {
31
  "epoch": 1.25,
32
- "grad_norm": 1.4054008068425676,
33
  "learning_rate": 1e-05,
34
- "loss": 0.3346,
35
- "mean_token_accuracy": 0.8969329178333283,
36
  "num_tokens": 799732.0,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 1.875,
41
- "grad_norm": 1.4215176512700638,
42
  "learning_rate": 1e-05,
43
- "loss": 0.2071,
44
- "mean_token_accuracy": 0.9320787608623504,
45
  "num_tokens": 1199159.0,
46
  "step": 30
47
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
+ "grad_norm": 5.229332748167166,
15
  "learning_rate": 1e-05,
16
  "loss": 0.553,
17
  "mean_token_accuracy": 0.8401249051094055,
 
20
  },
21
  {
22
  "epoch": 0.625,
23
+ "grad_norm": 1.878449792522089,
24
  "learning_rate": 1e-05,
25
+ "loss": 0.4425,
26
+ "mean_token_accuracy": 0.8662860658433702,
27
  "num_tokens": 401401.0,
28
  "step": 10
29
  },
30
  {
31
  "epoch": 1.25,
32
+ "grad_norm": 1.4215514241135978,
33
  "learning_rate": 1e-05,
34
+ "loss": 0.3341,
35
+ "mean_token_accuracy": 0.8969074487686157,
36
  "num_tokens": 799732.0,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 1.875,
41
+ "grad_norm": 1.4575405337602632,
42
  "learning_rate": 1e-05,
43
+ "loss": 0.2066,
44
+ "mean_token_accuracy": 0.9323108971118927,
45
  "num_tokens": 1199159.0,
46
  "step": 30
47
  }
checkpoint-32/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf9d951d05098026f9d17656169173d7d8769eb9865665888a43420e263ac25c
3
  size 8081
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c29f8fe80083b5b7c0f17189143d52e9e222441f308cf2fc22d25021e0096e3
3
  size 8081
checkpoint-64/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db3c8ded99a1c7f831a6abde4f05102c98e68e4862834c2445f4b11ec23ef0fc
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00fbe653cb7ebbf34086241b234f117c83033ad1a430ee2a8293c94345910a52
3
  size 4976698672
checkpoint-64/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6a8406f4665a8362c2c29ba57f3e7f67776f8009b7451cec5dbf27d5d2ad8b7
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8c8c6db9f5a0f64b053d5ed412d770f46e1b21fb22921ad3cbccc9eefc9c483
3
  size 4999802720
checkpoint-64/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef15c198f1a91994edb8360f3067e52d6ab09a588208105729b7ae42551a5767
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e632cf3a6e8508a9d0a2ebb86f758925169f16f34c0e1a9a9f1a0b01fd45a79
3
  size 4915916176
checkpoint-64/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef2307b648f0f62172249f08b465e0de1e3ead3b3bd37538e2238759039134c3
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29a6f46c1fa988045aac05f3c9c3c04c706b1314a062ea50f31f7a662a19c98a
3
  size 1168138808
checkpoint-64/trainer_state.json CHANGED
@@ -11,7 +11,7 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
- "grad_norm": 5.229090624867237,
15
  "learning_rate": 1e-05,
16
  "loss": 0.553,
17
  "mean_token_accuracy": 0.8401249051094055,
@@ -20,55 +20,55 @@
20
  },
21
  {
22
  "epoch": 0.625,
23
- "grad_norm": 1.8775022993488002,
24
  "learning_rate": 1e-05,
25
- "loss": 0.443,
26
- "mean_token_accuracy": 0.8661629954973856,
27
  "num_tokens": 401401.0,
28
  "step": 10
29
  },
30
  {
31
  "epoch": 1.25,
32
- "grad_norm": 1.4054008068425676,
33
  "learning_rate": 1e-05,
34
- "loss": 0.3346,
35
- "mean_token_accuracy": 0.8969329178333283,
36
  "num_tokens": 799732.0,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 1.875,
41
- "grad_norm": 1.4215176512700638,
42
  "learning_rate": 1e-05,
43
- "loss": 0.2071,
44
- "mean_token_accuracy": 0.9320787608623504,
45
  "num_tokens": 1199159.0,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 2.5,
50
- "grad_norm": 2.0853381701848304,
51
  "learning_rate": 1e-05,
52
- "loss": 0.1034,
53
- "mean_token_accuracy": 0.9672571659088135,
54
  "num_tokens": 1592321.0,
55
  "step": 40
56
  },
57
  {
58
  "epoch": 3.125,
59
- "grad_norm": 0.9298876909455216,
60
  "learning_rate": 1e-05,
61
- "loss": 0.0614,
62
- "mean_token_accuracy": 0.9805558681488037,
63
  "num_tokens": 1998955.0,
64
  "step": 50
65
  },
66
  {
67
  "epoch": 3.75,
68
- "grad_norm": 1.184064034490825,
69
  "learning_rate": 1e-05,
70
- "loss": 0.0298,
71
- "mean_token_accuracy": 0.9916574656963348,
72
  "num_tokens": 2397408.0,
73
  "step": 60
74
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
+ "grad_norm": 5.229332748167166,
15
  "learning_rate": 1e-05,
16
  "loss": 0.553,
17
  "mean_token_accuracy": 0.8401249051094055,
 
20
  },
21
  {
22
  "epoch": 0.625,
23
+ "grad_norm": 1.878449792522089,
24
  "learning_rate": 1e-05,
25
+ "loss": 0.4425,
26
+ "mean_token_accuracy": 0.8662860658433702,
27
  "num_tokens": 401401.0,
28
  "step": 10
29
  },
30
  {
31
  "epoch": 1.25,
32
+ "grad_norm": 1.4215514241135978,
33
  "learning_rate": 1e-05,
34
+ "loss": 0.3341,
35
+ "mean_token_accuracy": 0.8969074487686157,
36
  "num_tokens": 799732.0,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 1.875,
41
+ "grad_norm": 1.4575405337602632,
42
  "learning_rate": 1e-05,
43
+ "loss": 0.2066,
44
+ "mean_token_accuracy": 0.9323108971118927,
45
  "num_tokens": 1199159.0,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 2.5,
50
+ "grad_norm": 2.103711519590209,
51
  "learning_rate": 1e-05,
52
+ "loss": 0.1032,
53
+ "mean_token_accuracy": 0.9672375440597534,
54
  "num_tokens": 1592321.0,
55
  "step": 40
56
  },
57
  {
58
  "epoch": 3.125,
59
+ "grad_norm": 0.8849927277663293,
60
  "learning_rate": 1e-05,
61
+ "loss": 0.0611,
62
+ "mean_token_accuracy": 0.9806811392307282,
63
  "num_tokens": 1998955.0,
64
  "step": 50
65
  },
66
  {
67
  "epoch": 3.75,
68
+ "grad_norm": 1.0988924501749624,
69
  "learning_rate": 1e-05,
70
+ "loss": 0.0292,
71
+ "mean_token_accuracy": 0.9918049573898315,
72
  "num_tokens": 2397408.0,
73
  "step": 60
74
  }
checkpoint-64/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf9d951d05098026f9d17656169173d7d8769eb9865665888a43420e263ac25c
3
  size 8081
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c29f8fe80083b5b7c0f17189143d52e9e222441f308cf2fc22d25021e0096e3
3
  size 8081
checkpoint-96/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a0492a2f49410326e0d75a2d1989a6f07f77dad77dfa79b670dbbe3602e41df
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6914a1438136e282d6bcf40bef64b766ed70117fe040d52f0ec393d760378169
3
  size 4976698672
checkpoint-96/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc758699380eb76d476da0a770a22d75f171ae64b0e9555b21dedf207b666c91
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e58cf5422d1aa176b52eb65b5c6b33963574f79372067c6312a83c597116796d
3
  size 4999802720
checkpoint-96/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b414eacb245cb1340a124e23ff980846bf4c7ea14ec96cadb6647809944bc18
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a74864c894aeca1872c8398d5a779c786f09ff6a99d00ed1b05366e63dd943b
3
  size 4915916176
checkpoint-96/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76bbea69cf63298ec2e6cd765ac41f6a1fbd54d699676a9b0638abe4b2c71eac
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b5469fe62b9764063be846645c926a92b9fb00883c6b5ec1f63c697e62e5ca6
3
  size 1168138808
checkpoint-96/trainer_state.json CHANGED
@@ -11,7 +11,7 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
- "grad_norm": 5.229090624867237,
15
  "learning_rate": 1e-05,
16
  "loss": 0.553,
17
  "mean_token_accuracy": 0.8401249051094055,
@@ -20,82 +20,82 @@
20
  },
21
  {
22
  "epoch": 0.625,
23
- "grad_norm": 1.8775022993488002,
24
  "learning_rate": 1e-05,
25
- "loss": 0.443,
26
- "mean_token_accuracy": 0.8661629954973856,
27
  "num_tokens": 401401.0,
28
  "step": 10
29
  },
30
  {
31
  "epoch": 1.25,
32
- "grad_norm": 1.4054008068425676,
33
  "learning_rate": 1e-05,
34
- "loss": 0.3346,
35
- "mean_token_accuracy": 0.8969329178333283,
36
  "num_tokens": 799732.0,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 1.875,
41
- "grad_norm": 1.4215176512700638,
42
  "learning_rate": 1e-05,
43
- "loss": 0.2071,
44
- "mean_token_accuracy": 0.9320787608623504,
45
  "num_tokens": 1199159.0,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 2.5,
50
- "grad_norm": 2.0853381701848304,
51
  "learning_rate": 1e-05,
52
- "loss": 0.1034,
53
- "mean_token_accuracy": 0.9672571659088135,
54
  "num_tokens": 1592321.0,
55
  "step": 40
56
  },
57
  {
58
  "epoch": 3.125,
59
- "grad_norm": 0.9298876909455216,
60
  "learning_rate": 1e-05,
61
- "loss": 0.0614,
62
- "mean_token_accuracy": 0.9805558681488037,
63
  "num_tokens": 1998955.0,
64
  "step": 50
65
  },
66
  {
67
  "epoch": 3.75,
68
- "grad_norm": 1.184064034490825,
69
  "learning_rate": 1e-05,
70
- "loss": 0.0298,
71
- "mean_token_accuracy": 0.9916574656963348,
72
  "num_tokens": 2397408.0,
73
  "step": 60
74
  },
75
  {
76
  "epoch": 4.375,
77
- "grad_norm": 0.8214791537068664,
78
  "learning_rate": 1e-05,
79
- "loss": 0.019,
80
- "mean_token_accuracy": 0.9946628630161285,
81
  "num_tokens": 2798080.0,
82
  "step": 70
83
  },
84
  {
85
  "epoch": 5.0,
86
- "grad_norm": 1.0482853792795606,
87
  "learning_rate": 1e-05,
88
- "loss": 0.0168,
89
- "mean_token_accuracy": 0.9956359148025513,
90
  "num_tokens": 3199940.0,
91
  "step": 80
92
  },
93
  {
94
  "epoch": 5.625,
95
- "grad_norm": 0.8533738215069255,
96
  "learning_rate": 1e-05,
97
- "loss": 0.0088,
98
- "mean_token_accuracy": 0.9977155506610871,
99
  "num_tokens": 3601545.0,
100
  "step": 90
101
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0625,
14
+ "grad_norm": 5.229332748167166,
15
  "learning_rate": 1e-05,
16
  "loss": 0.553,
17
  "mean_token_accuracy": 0.8401249051094055,
 
20
  },
21
  {
22
  "epoch": 0.625,
23
+ "grad_norm": 1.878449792522089,
24
  "learning_rate": 1e-05,
25
+ "loss": 0.4425,
26
+ "mean_token_accuracy": 0.8662860658433702,
27
  "num_tokens": 401401.0,
28
  "step": 10
29
  },
30
  {
31
  "epoch": 1.25,
32
+ "grad_norm": 1.4215514241135978,
33
  "learning_rate": 1e-05,
34
+ "loss": 0.3341,
35
+ "mean_token_accuracy": 0.8969074487686157,
36
  "num_tokens": 799732.0,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 1.875,
41
+ "grad_norm": 1.4575405337602632,
42
  "learning_rate": 1e-05,
43
+ "loss": 0.2066,
44
+ "mean_token_accuracy": 0.9323108971118927,
45
  "num_tokens": 1199159.0,
46
  "step": 30
47
  },
48
  {
49
  "epoch": 2.5,
50
+ "grad_norm": 2.103711519590209,
51
  "learning_rate": 1e-05,
52
+ "loss": 0.1032,
53
+ "mean_token_accuracy": 0.9672375440597534,
54
  "num_tokens": 1592321.0,
55
  "step": 40
56
  },
57
  {
58
  "epoch": 3.125,
59
+ "grad_norm": 0.8849927277663293,
60
  "learning_rate": 1e-05,
61
+ "loss": 0.0611,
62
+ "mean_token_accuracy": 0.9806811392307282,
63
  "num_tokens": 1998955.0,
64
  "step": 50
65
  },
66
  {
67
  "epoch": 3.75,
68
+ "grad_norm": 1.0988924501749624,
69
  "learning_rate": 1e-05,
70
+ "loss": 0.0292,
71
+ "mean_token_accuracy": 0.9918049573898315,
72
  "num_tokens": 2397408.0,
73
  "step": 60
74
  },
75
  {
76
  "epoch": 4.375,
77
+ "grad_norm": 0.9209838605387056,
78
  "learning_rate": 1e-05,
79
+ "loss": 0.0197,
80
+ "mean_token_accuracy": 0.9946328461170196,
81
  "num_tokens": 2798080.0,
82
  "step": 70
83
  },
84
  {
85
  "epoch": 5.0,
86
+ "grad_norm": 1.145881672413454,
87
  "learning_rate": 1e-05,
88
+ "loss": 0.0184,
89
+ "mean_token_accuracy": 0.9954161286354065,
90
  "num_tokens": 3199940.0,
91
  "step": 80
92
  },
93
  {
94
  "epoch": 5.625,
95
+ "grad_norm": 0.8760509498406688,
96
  "learning_rate": 1e-05,
97
+ "loss": 0.0094,
98
+ "mean_token_accuracy": 0.9976460933685303,
99
  "num_tokens": 3601545.0,
100
  "step": 90
101
  }
checkpoint-96/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf9d951d05098026f9d17656169173d7d8769eb9865665888a43420e263ac25c
3
  size 8081
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c29f8fe80083b5b7c0f17189143d52e9e222441f308cf2fc22d25021e0096e3
3
  size 8081