Spaces:
Sleeping
Sleeping
fix unicode error: 'unicodeescape' codec can't decode bytes in position 602-608: unknown Unicode character name
Browse files
tokenizer/tiktoken_patch.py
CHANGED
|
@@ -14,7 +14,7 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
|
|
| 14 |
"replace" Replace with replacement character
|
| 15 |
"backslashreplace" Replace with backslashed escape sequence
|
| 16 |
"xmlcharrefreplace" Replace with XML character reference
|
| 17 |
-
"namereplace"
|
| 18 |
"""
|
| 19 |
try:
|
| 20 |
decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
|
|
|
|
| 14 |
"replace" Replace with replacement character
|
| 15 |
"backslashreplace" Replace with backslashed escape sequence
|
| 16 |
"xmlcharrefreplace" Replace with XML character reference
|
| 17 |
+
"namereplace"
|
| 18 |
"""
|
| 19 |
try:
|
| 20 |
decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
|
vocab/fastchat_t5_3b/__init__.py
CHANGED
|
@@ -1,23 +1,9 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
## 默认 use_fast=True 报错
|
| 4 |
-
|
| 5 |
-
print(iter_vocab(tokenizer, name=name))
|
| 6 |
-
File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 144, in iter_vocab
|
| 7 |
-
dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
|
| 8 |
-
File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 34, in get_coding_length
|
| 9 |
-
tokens = tokenizer.encode(word)
|
| 10 |
-
File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 2600, in encode
|
| 11 |
-
encoded_inputs = self.encode_plus(
|
| 12 |
-
File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 3008, in encode_plus
|
| 13 |
-
return self._encode_plus(
|
| 14 |
-
File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 576, in _encode_plus
|
| 15 |
-
batched_output = self._batch_encode_plus(
|
| 16 |
-
File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 504, in _batch_encode_plus
|
| 17 |
encodings = self._tokenizer.encode_batch(
|
| 18 |
pyo3_runtime.PanicException: AddedVocabulary bad split
|
| 19 |
"""
|
| 20 |
-
|
| 21 |
from transformers import AutoTokenizer
|
| 22 |
|
| 23 |
tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True, use_fast=False)
|
|
|
|
| 1 |
"""
|
|
|
|
| 2 |
## 默认 use_fast=True 报错
|
| 3 |
+
lib\site-packages\transformers\tokenization_utils_fast.py", line 504, in _batch_encode_plus
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
encodings = self._tokenizer.encode_batch(
|
| 5 |
pyo3_runtime.PanicException: AddedVocabulary bad split
|
| 6 |
"""
|
|
|
|
| 7 |
from transformers import AutoTokenizer
|
| 8 |
|
| 9 |
tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True, use_fast=False)
|