duzx16 committed
Commit 3946e1b · Parent(s): 758ba9f

Update README.md

Files changed:
- README.md (+2 -0)
- tokenization_chatglm.py (+12 -12)
- tokenizer_config.json (+2 -2)
README.md
CHANGED
@@ -8,6 +8,8 @@ tags:
 - thudm
 ---
 # ChatGLM-6B
+**This repository is no longer maintained. Please use [ChatGLM-6B-INT4](https://huggingface.co/THUDM/chatglm-6b-int4) instead.**
+
 ## Introduction
 ChatGLM-6B is an open-source dialogue language model that supports bilingual Chinese-English question answering, built on the [General Language Model (GLM)](https://github.com/THUDM/GLM) architecture with 6.2 billion parameters. Combined with model quantization, it can be deployed locally on consumer-grade graphics cards (as little as 6 GB of VRAM at the INT4 quantization level). ChatGLM-6B uses the same technology as [ChatGLM](https://chatglm.cn) and is optimized for Chinese question answering and dialogue. Trained on roughly 1T tokens of Chinese and English text, and further refined with supervised fine-tuning, feedback bootstrapping, and reinforcement learning from human feedback, the 6.2-billion-parameter ChatGLM-6B can already generate responses that align well with human preferences.

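For reference, local deployment as described above typically looks like the minimal sketch below; the `quantize(4)` and `chat` helpers are assumed to come from the repository's custom modeling code (loaded via `trust_remote_code=True`), as documented in the full README, and are not part of this diff.

    # Minimal local-deployment sketch (illustrative; assumes the repo's custom
    # modeling code provides `quantize` and `chat` as documented in its README).
    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half()
    model = model.quantize(4).cuda().eval()  # INT4 quantization: roughly 6 GB of VRAM

    response, history = model.chat(tokenizer, "Hello", history=[])
    print(response)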
tokenization_chatglm.py
CHANGED
@@ -171,8 +171,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         do_lower_case=False,
         remove_space=False,
         bos_token='<sop>',
-        eos_token='</s>',
-        eop_token='<eop>',
+        eos_token='<eop>',
+        end_token='</s>',
         mask_token='[MASK]',
         gmask_token='[gMASK]',
         padding_side="left",
@@ -185,7 +185,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             padding_side=padding_side,
             bos_token=bos_token,
             eos_token=eos_token,
-            eop_token=eop_token,
+            end_token=end_token,
             mask_token=mask_token,
             gmask_token=gmask_token,
             num_image_tokens=num_image_tokens,
@@ -198,7 +198,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
 
         self.bos_token = bos_token
         self.eos_token = eos_token
-        self.eop_token = eop_token
+        self.end_token = end_token
         self.mask_token = mask_token
         self.gmask_token = gmask_token
 
@@ -213,14 +213,14 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         return self.convert_tokens_to_ids(self.gmask_token)
 
     @property
-    def eop_token_id(self) -> Optional[int]:
+    def end_token_id(self) -> Optional[int]:
         """
-        `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
+        `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
         set.
         """
-        if self.eop_token is None:
+        if self.end_token is None:
             return None
-        return self.convert_tokens_to_ids(self.eop_token)
+        return self.convert_tokens_to_ids(self.end_token)
 
     @property
     def vocab_size(self):
@@ -324,18 +324,18 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         """
         mask_ids = self.sp_tokenizer[self.mask_token]
         gmask_ids = self.sp_tokenizer[self.gmask_token]
-        eop_id = self.sp_tokenizer[self.eop_token]
+        eos_id = self.sp_tokenizer[self.eos_token]
         if mask_ids not in token_ids_0 and gmask_ids not in token_ids_0:
             token_ids_0 += [gmask_ids]
 
         if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
-            token_ids_0 += [self.sp_tokenizer[self.eos_token]]
+            token_ids_0 += [self.sp_tokenizer[self.end_token]]
 
         token_ids_0 += [self.sp_tokenizer[self.bos_token]]
 
         if token_ids_1 is not None:
-            if not token_ids_1 or token_ids_1[-1] != eop_id:
-                token_ids_1 += [eop_id]
+            if not token_ids_1 or token_ids_1[-1] != eos_id:
+                token_ids_1 += [eos_id]
             token_ids_0 += token_ids_1
 
         return token_ids_0
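The net effect on prompt construction: `end_token` (`</s>`) now closes the context segment before `bos_token` (`<sop>`), and `eos_token` (now `<eop>`) closes the second segment. Below is a small standalone sketch of the updated `build_inputs_with_special_tokens` logic, using a hypothetical `sp` dict in place of the SentencePiece-backed `sp_tokenizer` lookup (the real ids come from the vocabulary).

    # Standalone sketch of the updated build_inputs_with_special_tokens logic.
    # `sp` is a hypothetical stand-in for self.sp_tokenizer's token -> id lookup.
    sp = {"[MASK]": 3, "[gMASK]": 4, "</s>": 5, "<sop>": 6, "<eop>": 7}

    def build_inputs(token_ids_0, token_ids_1=None):
        mask_id, gmask_id = sp["[MASK]"], sp["[gMASK]"]
        eos_id = sp["<eop>"]                      # eos_token is now '<eop>'
        if mask_id not in token_ids_0 and gmask_id not in token_ids_0:
            token_ids_0 += [gmask_id]
        if token_ids_0[-1] != mask_id and token_ids_0[-1] != gmask_id:
            token_ids_0 += [sp["</s>"]]           # end_token closes the context
        token_ids_0 += [sp["<sop>"]]              # bos_token opens the response
        if token_ids_1 is not None:
            if not token_ids_1 or token_ids_1[-1] != eos_id:
                token_ids_1 += [eos_id]           # eos_token closes the response
            token_ids_0 += token_ids_1
        return token_ids_0

    print(build_inputs([10, 11]))            # [10, 11, 4, 6]  (gMASK appended, then <sop>)
    print(build_inputs([10, 4, 11], [20]))   # [10, 4, 11, 5, 6, 20, 7]  (</s>, <sop>, answer, <eop>)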
tokenizer_config.json
CHANGED
@@ -1,8 +1,8 @@
 {
   "name_or_path": "THUDM/chatglm-6b",
   "bos_token": "<sop>",
-  "eos_token": "</s>",
-  "eop_token": "<eop>",
+  "eos_token": "<eop>",
+  "end_token": "</s>",
   "gmask_token": "[gMASK]",
   "mask_token": "[MASK]",
   "pad_token": "<pad>",
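Assuming the custom tokenizer class is loaded with `trust_remote_code=True`, the renamed entries can be sanity-checked as follows (an illustrative check, not part of the commit).

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
    print(tokenizer.eos_token, tokenizer.eos_token_id)  # expected: '<eop>' and its id
    print(tokenizer.end_token, tokenizer.end_token_id)  # expected: '</s>' and its id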