add more tokenizer
- vocab/__init__.py +10 -1
- vocab/falcon_7b/__init__.py +4 -0
- vocab/fastchat_t5_3b/__init__.py +3 -0
- vocab/flan_t5_base/__init__.py +3 -0
- vocab/pko_t5_large/__init__.py +3 -0
- vocab/t5/__init__.py +0 -7
- vocab/t5_base/__init__.py +8 -0
- vocab/t5_large/__init__.py +8 -0
- vocab/t5_small/__init__.py +8 -0
vocab/__init__.py
CHANGED
@@ -96,8 +96,8 @@ all_tokenizers = [
     # "alpaca_7b",
     "baichuan",
     "baichuan2",
-    "qwen",
     "internlm_chat_7b",
+    "falcon_7b",
     "falcon_180b",
     # "goat",
 
@@ -109,9 +109,18 @@ all_tokenizers = [
     "skywork_13b_base",
     "skywork_13b_math",
     "mistral",
+    "t5_small",
+    "t5_base",
+    "t5_large",
+    "flan_t5_base",
+    "fastchat_t5_3b",
+    "pko_t5_large",
+
 
 ]
 
+all_tokenizers = sorted(all_tokenizers)
+
 class TokenizerType(Enum):
     """
     - https://huggingface.co/docs/transformers/tokenizer_summary
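The registry change above only adds names; the per-model packages added below are what actually construct a tokenizer. A minimal sketch (not part of this commit) of how an entry in all_tokenizers could be resolved to its tokenizer object, assuming each vocab/<name>/__init__.py exposes a module-level tokenizer, as the files added in this commit do:

import importlib

def load_tokenizer(name: str):
    # Import vocab.<name> and return the module-level `tokenizer` it defines.
    module = importlib.import_module(f"vocab.{name}")
    return module.tokenizer

# Example usage with one of the entries added in this commit.
tok = load_tokenizer("t5_base")
print(len(tok))                            # vocabulary size, including added tokens
print(tok.tokenize("add more tokenizer"))  # pieces produced for a sample string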
vocab/falcon_7b/__init__.py
ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
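Every package added below follows this same one-file pattern: import AutoTokenizer and instantiate the tokenizer at module import time. A quick interactive check of the Falcon entry (not part of the commit; the model ID is taken from the file above) could look like:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)

ids = tokenizer.encode("add more tokenizer")
print(ids)                    # token ids produced by Falcon's BPE tokenizer
print(tokenizer.decode(ids))  # decodes back to the original string
print(len(tokenizer))         # total vocabulary size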
vocab/fastchat_t5_3b/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True)
vocab/flan_t5_base/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", trust_remote_code=True)
vocab/pko_t5_large/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("paust/pko-t5-large", trust_remote_code=True)
vocab/t5/__init__.py
DELETED
@@ -1,7 +0,0 @@
-"""
-
-
-SentencePiece
-"""
-
-
vocab/t5_base/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-base
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-base", trust_remote_code=True)
vocab/t5_large/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-large
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-large", trust_remote_code=True)
vocab/t5_small/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-large
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-small", trust_remote_code=True)
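The six T5-family entries added in this commit can be compared side by side. A small script (not part of the commit; the model IDs are taken verbatim from the files above) that prints each tokenizer's vocabulary size and the pieces it produces for a sample string:

from transformers import AutoTokenizer

MODELS = [
    "t5-small",
    "t5-base",
    "t5-large",
    "google/flan-t5-base",
    "lmsys/fastchat-t5-3b-v1.0",
    "paust/pko-t5-large",
]

text = "add more tokenizer"
for model_id in MODELS:
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    print(f"{model_id:28s} vocab={len(tok):6d} tokens={tok.tokenize(text)}")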