Upload tokenizer_nanogpt.py with huggingface_hub
Browse files- tokenizer_nanogpt.py +70 -0
tokenizer_nanogpt.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pickle
|
| 3 |
+
from huggingface_hub import hf_hub_download
|
| 4 |
+
from huggingface_hub.utils import HfHubHTTPError
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class NanoGPTTokenizer:
    """Lightweight wrapper over a tiktoken Encoding stored in tokenizer.pkl.

    Provides minimal encode/decode needed for inference and a from_pretrained
    constructor so it can be loaded via AutoTokenizer with trust_remote_code.
    """

    def __init__(self, enc):
        """Wrap an already-loaded encoding object.

        Args:
            enc: A tiktoken-style Encoding exposing ``encode_ordinary``,
                ``encode_single_token``, and ``decode``.
        """
        self.enc = enc
        # Resolve the BOS id once up front; this raises if "<|bos|>" is not a
        # registered special token, surfacing a bad tokenizer.pkl early.
        self.bos_token_id = enc.encode_single_token("<|bos|>")

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
        """No-op hook so transformers' AutoTokenizer registration can call it."""
        pass

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Load the tokenizer from a local directory or a Hugging Face Hub repo ID.

        Looks for ``tokenizer.pkl`` in a local directory first; otherwise
        downloads it from the Hub (caching is handled by hf_hub_download).
        Download-relevant options passed via **kwargs (``revision``,
        ``cache_dir``, ``token``, ``subfolder``, ``local_files_only``) are
        forwarded to hf_hub_download; previously all kwargs were silently
        ignored, so e.g. a requested revision had no effect.

        Raises:
            ValueError: if tokenizer.pkl cannot be found locally or fetched
                from the Hub.
        """
        # First, try to load from a local path.
        local_tok_path = os.path.join(pretrained_model_name_or_path, "tokenizer.pkl")

        if os.path.isfile(local_tok_path):
            tok_path = local_tok_path
        else:
            # Forward only the download-relevant options so unrelated
            # AutoTokenizer kwargs don't break hf_hub_download's signature.
            dl_kwargs = {
                k: kwargs[k]
                for k in ("revision", "cache_dir", "token", "subfolder", "local_files_only")
                if k in kwargs
            }
            try:
                # Handles the cache automatically and returns the cached file path.
                tok_path = hf_hub_download(
                    repo_id=pretrained_model_name_or_path,
                    filename="tokenizer.pkl",
                    **dl_kwargs,
                )
            except (HfHubHTTPError, OSError) as e:
                raise ValueError(
                    f"Could not load tokenizer.pkl from {pretrained_model_name_or_path}. "
                    f"Make sure the path exists or the repo is accessible on the Hub."
                ) from e

        # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        # file. Only load tokenizer.pkl from repos/paths you trust.
        with open(tok_path, "rb") as f:
            enc = pickle.load(f)

        return cls(enc)

    def encode(self, text, prepend=None):
        """Encode text to a list of token ids, optionally prepending one token.

        Args:
            text: Input string; encode_ordinary treats special-token text as
                plain text rather than special tokens.
            prepend: Either a token id (int) or a special-token string
                (e.g. "<|bos|>") to insert at position 0. Default: nothing.

        Returns:
            list[int] of token ids.
        """
        ids = self.enc.encode_ordinary(text)
        if prepend is not None:
            prepend_id = prepend if isinstance(prepend, int) else self.enc.encode_single_token(prepend)
            ids.insert(0, prepend_id)
        return ids

    def decode(self, ids):
        """Decode a sequence of token ids back into a string."""
        return self.enc.decode(ids)

    def get_bos_token_id(self):
        """Return the id of the "<|bos|>" token resolved at construction."""
        return self.bos_token_id
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
|