# Georgian-Translation / encoder_decoder_tokenizer.py
"""
Encoder-Decoder Tokenizer Implementations
Provides tokenizer implementations for encoder-decoder models.
"""
import os
import numpy as np
import torch
from pathlib import Path
from overrides import overrides
from typing import Dict, Any, Tuple, Union, List, Optional, overload
from transformers.tokenization_utils_base import (
AddedToken, # type: ignore
BatchEncoding,
EncodedInput,
EncodedInputPair,
PreTokenizedInput,
PreTokenizedInputPair,
TextInput,
TextInputPair,
TruncationStrategy,
)
from transformers.utils import logging
from transformers import AutoTokenizer
from transformers.utils.generic import PaddingStrategy, TensorType
from transformers.tokenization_utils import PreTrainedTokenizer
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
class EncoderDecoderTokenizer(PreTrainedTokenizer):
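    """
    Pairs an encoder (source-side) tokenizer with a decoder (target-side) tokenizer, both
    loaded via `AutoTokenizer`. Most methods delegate to `self.current_tokenizer`, which
    `_switch_to_input_mode` / `_switch_to_target_mode` point at the encoder or the decoder;
    decoding always uses the decoder tokenizer.
    """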
def __init__(self, encoder_tokenizer_path, decoder_tokenizer_path, **kwargs):
self.encoder: PreTrainedTokenizer = AutoTokenizer.from_pretrained(encoder_tokenizer_path)
self.decoder: PreTrainedTokenizer = AutoTokenizer.from_pretrained(decoder_tokenizer_path)
self.current_tokenizer = self.encoder
self._decode_use_source_tokenizer = False
if self.decoder.eos_token is None:
self.decoder.eos_token = self.decoder.sep_token
if self.encoder.eos_token is None:
self.encoder.eos_token = self.encoder.sep_token
if self.encoder.pad_token is None:
self.encoder.pad_token = self.encoder.eos_token
if self.decoder.pad_token is None:
self.decoder.pad_token = self.decoder.eos_token
if self.encoder.bos_token is None:
self.encoder.bos_token = self.encoder.cls_token
if self.decoder.bos_token is None:
self.decoder.bos_token = self.decoder.cls_token
self._pad_token = self.encoder.pad_token
self._unk_token = self.encoder.unk_token
self._bos_token = self.encoder.bos_token
self._eos_token = self.encoder.eos_token
self._sep_token = self.encoder.sep_token
self._cls_token = self.encoder.cls_token
self._mask_token = self.encoder.mask_token
self.decoder_pad_token = self.decoder.pad_token
self.decoder_unk_token = self.decoder.unk_token
self.decoder_bos_token = self.decoder.bos_token
self.decoder_eos_token = self.decoder.eos_token
self.decoder_sep_token = self.decoder.sep_token
self.decoder_cls_token = self.decoder.cls_token
        self.decoder_mask_token = self.decoder.mask_token
self.decoder_pad_token_id = self.decoder.pad_token_id
self.decoder_unk_token_id = self.decoder.unk_token_id
self.decoder_bos_token_id = self.decoder.bos_token_id
self.decoder_eos_token_id = self.decoder.eos_token_id
self.decoder_sep_token_id = self.decoder.sep_token_id
self.decoder_cls_token_id = self.decoder.cls_token_id
        self.decoder_mask_token_id = self.decoder.mask_token_id
self._additional_special_tokens = []
@property
def is_fast(self) -> bool:
return self.current_tokenizer.is_fast
@property
def vocab_size(self) -> int:
"""
`int`: Size of the base vocabulary (without the added tokens).
"""
return self.current_tokenizer.vocab_size
@property
def added_tokens_encoder(self) -> Dict[str, int]:
"""
Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
optimisation in `self._added_tokens_encoder` for the slow tokenizers.
"""
return self.current_tokenizer.added_tokens_encoder
@property
def added_tokens_decoder(self) -> Dict[int, AddedToken]:
"""
Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.
Returns:
            `Dict[int, AddedToken]`: The added tokens, mapping index to `AddedToken`.
"""
return self.current_tokenizer.added_tokens_decoder
@added_tokens_decoder.setter
def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> None:
self.current_tokenizer.added_tokens_decoder = value
def get_added_vocab(self) -> Dict[str, int]:
"""
        Returns the added tokens in the vocabulary as a dictionary of token to index.
Returns:
`Dict[str, int]`: The added tokens.
"""
        return self.current_tokenizer.get_added_vocab()
def __len__(self):
"""
Size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because otherwise if
        there is a hole in the vocab, we will add tokens at a wrong index.
"""
return len(set(self.get_vocab().keys()))
def num_special_tokens_to_add(self, pair: bool = False) -> int:
"""
Returns the number of added tokens when encoding a sequence with special tokens.
<Tip>
This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
this inside your training loop.
</Tip>
Args:
pair (`bool`, *optional*, defaults to `False`):
Whether the number of added tokens should be computed in the case of a sequence pair or a single
sequence.
Returns:
`int`: Number of special tokens added to sequences.
"""
return self.current_tokenizer.num_special_tokens_to_add(pair)
def tokenize(self, text: TextInput, **kwargs):
"""
        Converts a string into a sequence of tokens, using the decoder (target-side) tokenizer.
        Splits into words for word-based vocabularies or into sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece). Takes care of added tokens.
Args:
text (`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
Returns:
`List[str]`: The list of tokens.
"""
return self.decoder.tokenize(text, **kwargs)
    def _tokenize(self, text, **kwargs):
        """
        Converts a string into a sequence of tokens (strings), using the decoder tokenizer.
        Splits into words for word-based vocabularies or into sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece). Does NOT take care of added tokens.
        """
        return self.decoder._tokenize(text, **kwargs)
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
"""
        Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
vocabulary.
Args:
tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
Returns:
`int` or `List[int]`: The token id or list of token ids.
"""
return self.current_tokenizer.convert_tokens_to_ids(tokens)
def _convert_token_to_id_with_added_voc(self, token):
return self.current_tokenizer._convert_token_to_id_with_added_voc(token)
def _convert_token_to_id(self, token):
return self.current_tokenizer._convert_token_to_id(token)
def encode(self, *args, **kwargs):
return self.current_tokenizer.encode(*args, **kwargs)
def _batch_encode_plus(
self,
batch_text_or_text_pairs: Union[
List[TextInput],
List[TextInputPair],
List[PreTokenizedInput],
List[PreTokenizedInputPair],
List[EncodedInput],
List[EncodedInputPair],
],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
return self.current_tokenizer._batch_encode_plus(batch_text_or_text_pairs=batch_text_or_text_pairs,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def prepare_for_tokenization(
self, text: str, is_split_into_words: bool = False, **kwargs
) -> Tuple[str, Dict[str, Any]]:
"""
Performs any necessary transformations before tokenization.
This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
`kwargs` at the end of the encoding process to be sure all the arguments have been used.
Args:
text (`str`):
The text to prepare.
is_split_into_words (`bool`, *optional*, defaults to `False`):
Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
which it will tokenize. This is useful for NER or token classification.
kwargs (`Dict[str, Any]`, *optional*):
Keyword arguments to use for the tokenization.
Returns:
`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
"""
return self.current_tokenizer.prepare_for_tokenization(text, is_split_into_words, **kwargs)
def get_special_tokens_mask(
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args:
token_ids_0 (`List[int]`):
List of ids of the first sequence.
token_ids_1 (`List[int]`, *optional*):
List of ids of the second sequence.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
return self.current_tokenizer.get_special_tokens_mask(token_ids_0, token_ids_1, already_has_special_tokens)
    @overload
    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
    @overload
    def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]: ...
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
"""
        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary and
added tokens.
Args:
ids (`int` or `List[int]`):
The token id (or token ids) to convert to tokens.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
Returns:
`str` or `List[str]`: The decoded token(s).
"""
return self.current_tokenizer.convert_ids_to_tokens(ids, skip_special_tokens)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
return self.current_tokenizer.convert_tokens_to_string(tokens)
def decode(
self,
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: Optional[bool] = None,
**kwargs,
) -> str:
        return self.decoder.decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
@overrides
    def __call__(self, text, text_target=None, *args, **kwargs):
        # Append an explicit end-of-sequence marker to the source text before encoding.
        if isinstance(text, str):
            text = text + self.eos_token
        else:
            text = [i + self.eos_token for i in text]
        results = self.encoder(text, *args, **kwargs)
        if text_target:
            # Tokenize the targets with the decoder tokenizer and expose them as labels.
            tmp = self.decoder(text_target, *args, **kwargs)
            results['labels'] = tmp['input_ids']
            # Replace padding ids with -100 so they are ignored by the cross-entropy loss.
            # Note: this masked assignment assumes tensor output (e.g. return_tensors="pt").
            results['labels'][results['labels'] == self.decoder.pad_token_id] = -100
            results['decoder_attention_mask'] = tmp['attention_mask']
        return results
def _decode(
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: Optional[bool] = None,
spaces_between_special_tokens: bool = True,
**kwargs,
) -> str:
        return self.decoder._decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
push_to_hub: bool = False,
**kwargs,
) -> None:
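        """
        Saves the encoder and decoder tokenizers into `encoder/` and `decoder/`
        sub-directories of `save_directory`; `from_pretrained` expects the same layout.
        """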
encoder_path = Path(save_directory) / Path("encoder")
decoder_path = Path(save_directory) / Path("decoder")
self.encoder.save_pretrained(encoder_path, legacy_format, filename_prefix, push_to_hub, **kwargs)
self.decoder.save_pretrained(decoder_path, legacy_format, filename_prefix, push_to_hub, **kwargs)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: Union[str, os.PathLike],
*init_inputs,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = "main",
**kwargs,
):
        encoder_path = Path(pretrained_model_name_or_path) / "encoder"
        decoder_path = Path(pretrained_model_name_or_path) / "decoder"
        return cls(encoder_path, decoder_path)
def _switch_to_target_mode(self):
        self.current_tokenizer = self.decoder
def _switch_to_input_mode(self):
self.current_tokenizer = self.encoder
@property
def pad_token_id(self) -> Any:
"""Return pad token ID from current tokenizer."""
return self.current_tokenizer.pad_token_id
@property
def unk_token_id(self) -> Any:
"""Return unk token ID from current tokenizer."""
return self.current_tokenizer.unk_token_id
@property
def bos_token_id(self) -> Any:
"""Return bos token ID from current tokenizer."""
return self.current_tokenizer.bos_token_id
@property
def eos_token_id(self) -> Any:
"""Return eos token ID from current tokenizer."""
return self.current_tokenizer.eos_token_id
@property
def sep_token_id(self) -> Any:
"""Return sep token ID from current tokenizer."""
return self.current_tokenizer.sep_token_id
@property
def cls_token_id(self) -> Any:
"""Return cls token ID from current tokenizer."""
return self.current_tokenizer.cls_token_id
@property
def mask_token_id(self) -> Any:
"""Return mask token ID from current tokenizer."""
return self.current_tokenizer.mask_token_id
def get_vocab(self) -> Dict[str, int]:
"""
Returns the vocabulary as a dictionary of token to indices.
"""
return self.current_tokenizer.get_vocab()
@property
def pad_token(self) -> Any:
"""Return pad token from current tokenizer."""
return self.current_tokenizer.pad_token
@property
def unk_token(self) -> Any:
"""Return unk token from current tokenizer."""
return self.current_tokenizer.unk_token
@property
def bos_token(self) -> Any:
"""Return bos token from current tokenizer."""
return self.current_tokenizer.bos_token
@property
def eos_token(self) -> Any:
"""Return eos token from current tokenizer."""
return self.current_tokenizer.eos_token
@property
def sep_token(self) -> Any:
"""Return sep token from current tokenizer."""
return self.current_tokenizer.sep_token
@property
def cls_token(self) -> Any:
"""Return cls token from current tokenizer."""
return self.current_tokenizer.cls_token
@property
def mask_token(self) -> Any:
"""Return mask token from current tokenizer."""
return self.current_tokenizer.mask_token
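# A minimal usage sketch (not part of the original class): it shows how the combined tokenizer
# is intended to be driven for translation fine-tuning. The two checkpoint names below are
# placeholders, not the tokenizers actually used by this repository; substitute the real
# encoder/decoder tokenizer paths.
if __name__ == "__main__":
    tokenizer = EncoderDecoderTokenizer(
        "bert-base-multilingual-cased",  # placeholder source-side (encoder) tokenizer
        "gpt2",                          # placeholder target-side (decoder) tokenizer
    )
    batch = tokenizer(
        ["Hello, how are you?"],
        text_target=["გამარჯობა, როგორ ხარ?"],
        return_tensors="pt",  # tensor output is required by the -100 label masking in __call__
        padding=True,
    )
    # The batch holds the encoder inputs plus `labels` and `decoder_attention_mask`.
    print({key: value.shape for key, value in batch.items()})
    # Decoding the non-masked label ids with the decoder tokenizer recovers the target text.
    labels = batch["labels"][0]
    print(tokenizer.decode(labels[labels != -100]))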