|
|
""" |
|
|
Encoder-Decoder Tokenizer Implementations |
|
|
|
|
|
Provides a tokenizer wrapper that pairs a source (encoder) tokenizer with a
target (decoder) tokenizer for encoder-decoder (seq2seq) models.
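
Example (a minimal sketch; the checkpoint names below are illustrative
assumptions, not requirements of this module):

    tokenizer = EncoderDecoderTokenizer("bert-base-uncased", "gpt2")
    batch = tokenizer(
        ["An example source sentence."],
        text_target=["An example target sentence."],
        return_tensors="pt",
        padding=True,
    )
    # `batch` holds input_ids/attention_mask from the encoder tokenizer plus
    # labels/decoder_attention_mask from the decoder tokenizer.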
|
|
""" |
|
|
import os |
|
|
import numpy as np |
|
|
import torch |
|
|
from pathlib import Path |
|
|
from overrides import overrides |
|
|
from typing import Dict, Any, Tuple, Union, List, Optional, overload |
|
|
|
|
from transformers.tokenization_utils_base import ( |
|
|
AddedToken, |
|
|
BatchEncoding, |
|
|
EncodedInput, |
|
|
EncodedInputPair, |
|
|
PreTokenizedInput, |
|
|
PreTokenizedInputPair, |
|
|
TextInput, |
|
|
TextInputPair, |
|
|
TruncationStrategy, |
|
|
) |
|
|
from transformers.utils import logging |
|
|
from transformers import AutoTokenizer |
|
|
from transformers.utils.generic import PaddingStrategy, TensorType |
|
|
from transformers.tokenization_utils import PreTrainedTokenizer |
|
|
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
SPIECE_UNDERLINE = "▁" |
|
|
|
|
|
|
|
|
class EncoderDecoderTokenizer(PreTrainedTokenizer): |
|
|
def __init__(self, encoder_tokenizer_path, decoder_tokenizer_path, **kwargs): |
|
|
self.encoder: PreTrainedTokenizer = AutoTokenizer.from_pretrained(encoder_tokenizer_path) |
|
|
self.decoder: PreTrainedTokenizer = AutoTokenizer.from_pretrained(decoder_tokenizer_path) |
|
|
self.current_tokenizer = self.encoder |
|
|
self._decode_use_source_tokenizer = False |
|
|
|
|
|
        # Fall back to related special tokens when either tokenizer leaves one undefined.
        if self.decoder.eos_token is None:
|
|
self.decoder.eos_token = self.decoder.sep_token |
|
|
|
|
|
if self.encoder.eos_token is None: |
|
|
self.encoder.eos_token = self.encoder.sep_token |
|
|
|
|
|
if self.encoder.pad_token is None: |
|
|
self.encoder.pad_token = self.encoder.eos_token |
|
|
if self.decoder.pad_token is None: |
|
|
self.decoder.pad_token = self.decoder.eos_token |
|
|
|
|
|
if self.encoder.bos_token is None: |
|
|
self.encoder.bos_token = self.encoder.cls_token |
|
|
if self.decoder.bos_token is None: |
|
|
self.decoder.bos_token = self.decoder.cls_token |
|
|
|
|
|
        # Mirror the encoder's special tokens on the wrapper itself; the
        # decoder's are exposed under decoder_* attributes below.
        self._pad_token = self.encoder.pad_token
|
|
self._unk_token = self.encoder.unk_token |
|
|
self._bos_token = self.encoder.bos_token |
|
|
self._eos_token = self.encoder.eos_token |
|
|
self._sep_token = self.encoder.sep_token |
|
|
self._cls_token = self.encoder.cls_token |
|
|
self._mask_token = self.encoder.mask_token |
|
|
self.decoder_pad_token = self.decoder.pad_token |
|
|
self.decoder_unk_token = self.decoder.unk_token |
|
|
self.decoder_bos_token = self.decoder.bos_token |
|
|
self.decoder_eos_token = self.decoder.eos_token |
|
|
self.decoder_sep_token = self.decoder.sep_token |
|
|
self.decoder_cls_token = self.decoder.cls_token |
|
|
        self.decoder_mask_token = self.decoder.mask_token
|
|
|
|
|
self.decoder_pad_token_id = self.decoder.pad_token_id |
|
|
self.decoder_unk_token_id = self.decoder.unk_token_id |
|
|
self.decoder_bos_token_id = self.decoder.bos_token_id |
|
|
self.decoder_eos_token_id = self.decoder.eos_token_id |
|
|
self.decoder_sep_token_id = self.decoder.sep_token_id |
|
|
self.decoder_cls_token_id = self.decoder.cls_token_id |
|
|
        self.decoder_mask_token_id = self.decoder.mask_token_id
|
|
self._additional_special_tokens = [] |
|
|
|
|
|
@property |
|
|
def is_fast(self) -> bool: |
|
|
return self.current_tokenizer.is_fast |
|
|
|
|
|
@property |
|
|
def vocab_size(self) -> int: |
|
|
""" |
|
|
`int`: Size of the base vocabulary (without the added tokens). |
|
|
""" |
|
|
return self.current_tokenizer.vocab_size |
|
|
|
|
|
@property |
|
|
def added_tokens_encoder(self) -> Dict[str, int]: |
|
|
""" |
|
|
Returns the sorted mapping from string to index. The added tokens encoder is cached for performance |
|
|
optimisation in `self._added_tokens_encoder` for the slow tokenizers. |
|
|
""" |
|
|
return self.current_tokenizer.added_tokens_encoder |
|
|
|
|
|
@property |
|
|
def added_tokens_decoder(self) -> Dict[int, AddedToken]: |
|
|
""" |
|
|
Returns the added tokens in the vocabulary as a dictionary of index to AddedToken. |
|
|
|
|
|
Returns: |
|
|
            `Dict[int, AddedToken]`: The added tokens.
|
|
""" |
|
|
return self.current_tokenizer.added_tokens_decoder |
|
|
|
|
|
@added_tokens_decoder.setter |
|
|
def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> None: |
|
|
self.current_tokenizer.added_tokens_decoder = value |
|
|
|
|
|
def get_added_vocab(self) -> Dict[str, int]: |
|
|
""" |
|
|
Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from |
|
|
the fast call because for now we always add the tokens even if they are already in the vocabulary. This is |
|
|
something we should change. |
|
|
|
|
|
Returns: |
|
|
`Dict[str, int]`: The added tokens. |
|
|
""" |
|
|
return self._added_tokens_encoder |
|
|
|
|
|
def __len__(self): |
|
|
""" |
|
|
Size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because otherwise if |
|
|
        there is a hole in the vocab, we will add tokens at a wrong index.
|
|
""" |
|
|
return len(set(self.get_vocab().keys())) |
|
|
|
|
|
def num_special_tokens_to_add(self, pair: bool = False) -> int: |
|
|
""" |
|
|
Returns the number of added tokens when encoding a sequence with special tokens. |
|
|
|
|
|
<Tip> |
|
|
|
|
|
This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put |
|
|
this inside your training loop. |
|
|
|
|
|
</Tip> |
|
|
|
|
|
Args: |
|
|
pair (`bool`, *optional*, defaults to `False`): |
|
|
Whether the number of added tokens should be computed in the case of a sequence pair or a single |
|
|
sequence. |
|
|
|
|
|
Returns: |
|
|
`int`: Number of special tokens added to sequences. |
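
        Example (illustrative; the exact count depends on the wrapped tokenizer,
        e.g. a BERT-style encoder adds `[CLS]` and `[SEP]`):

        ```python
        tokenizer.num_special_tokens_to_add()           # e.g. 2 for one sequence
        tokenizer.num_special_tokens_to_add(pair=True)  # e.g. 3 for a pair
        ```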
|
|
""" |
|
|
return self.current_tokenizer.num_special_tokens_to_add(pair) |
|
|
|
|
|
def tokenize(self, text: TextInput, **kwargs): |
|
|
""" |
|
|
        Converts a string into a sequence of tokens, using the tokenizer.
|
|
|
|
|
        Splits into words for word-based vocabularies or sub-words for sub-word-based vocabularies
|
|
(BPE/SentencePieces/WordPieces). Takes care of added tokens. |
|
|
|
|
|
Args: |
|
|
text (`str`): |
|
|
The sequence to be encoded. |
|
|
**kwargs (additional keyword arguments): |
|
|
Passed along to the model-specific `prepare_for_tokenization` preprocessing method. |
|
|
|
|
|
Returns: |
|
|
`List[str]`: The list of tokens. |
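
        Example (illustrative; the pieces depend on the decoder's vocabulary,
        here assumed to be SentencePiece-based):

        ```python
        tokenizer.tokenize("Hello world")  # e.g. ['▁Hello', '▁world']
        ```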
|
|
""" |
|
|
        # Note: delegates to the decoder tokenizer (as `_tokenize` and `decode` do).
        return self.decoder.tokenize(text, **kwargs)
|
|
|
|
|
def _tokenize(self, text, **kwargs): |
|
|
""" |
|
|
        Converts a string into a sequence of tokens (strings), using the tokenizer. Splits into words for word-based
|
|
vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). |
|
|
|
|
|
Do NOT take care of added tokens. |
|
|
""" |
|
|
        return self.decoder._tokenize(text, **kwargs)
|
|
|
|
|
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: |
|
|
""" |
|
|
        Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
|
|
vocabulary. |
|
|
|
|
|
Args: |
|
|
tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). |
|
|
|
|
|
Returns: |
|
|
`int` or `List[int]`: The token id or list of token ids. |
|
|
""" |
|
|
return self.current_tokenizer.convert_tokens_to_ids(tokens) |
|
|
|
|
|
def _convert_token_to_id_with_added_voc(self, token): |
|
|
return self.current_tokenizer._convert_token_to_id_with_added_voc(token) |
|
|
|
|
|
def _convert_token_to_id(self, token): |
|
|
return self.current_tokenizer._convert_token_to_id(token) |
|
|
|
|
|
def encode(self, *args, **kwargs): |
|
|
return self.current_tokenizer.encode(*args, **kwargs) |
|
|
|
|
|
def _batch_encode_plus( |
|
|
self, |
|
|
batch_text_or_text_pairs: Union[ |
|
|
List[TextInput], |
|
|
List[TextInputPair], |
|
|
List[PreTokenizedInput], |
|
|
List[PreTokenizedInputPair], |
|
|
List[EncodedInput], |
|
|
List[EncodedInputPair], |
|
|
], |
|
|
add_special_tokens: bool = True, |
|
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
|
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, |
|
|
max_length: Optional[int] = None, |
|
|
stride: int = 0, |
|
|
is_split_into_words: bool = False, |
|
|
pad_to_multiple_of: Optional[int] = None, |
|
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
|
return_token_type_ids: Optional[bool] = None, |
|
|
return_attention_mask: Optional[bool] = None, |
|
|
return_overflowing_tokens: bool = False, |
|
|
return_special_tokens_mask: bool = False, |
|
|
return_offsets_mapping: bool = False, |
|
|
return_length: bool = False, |
|
|
verbose: bool = True, |
|
|
**kwargs, |
|
|
) -> BatchEncoding: |
|
|
return self.current_tokenizer._batch_encode_plus(batch_text_or_text_pairs=batch_text_or_text_pairs, |
|
|
add_special_tokens=add_special_tokens, |
|
|
padding_strategy=padding_strategy, |
|
|
truncation_strategy=truncation_strategy, |
|
|
max_length=max_length, |
|
|
stride=stride, |
|
|
is_split_into_words=is_split_into_words, |
|
|
pad_to_multiple_of=pad_to_multiple_of, |
|
|
return_tensors=return_tensors, |
|
|
return_token_type_ids=return_token_type_ids, |
|
|
return_attention_mask=return_attention_mask, |
|
|
return_overflowing_tokens=return_overflowing_tokens, |
|
|
return_special_tokens_mask=return_special_tokens_mask, |
|
|
return_offsets_mapping=return_offsets_mapping, |
|
|
return_length=return_length, |
|
|
verbose=verbose, |
|
|
**kwargs, |
|
|
) |
|
|
|
|
|
def prepare_for_tokenization( |
|
|
self, text: str, is_split_into_words: bool = False, **kwargs |
|
|
) -> Tuple[str, Dict[str, Any]]: |
|
|
""" |
|
|
Performs any necessary transformations before tokenization. |
|
|
|
|
|
This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the |
|
|
`kwargs` at the end of the encoding process to be sure all the arguments have been used. |
|
|
|
|
|
Args: |
|
|
text (`str`): |
|
|
The text to prepare. |
|
|
is_split_into_words (`bool`, *optional*, defaults to `False`): |
|
|
Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the |
|
|
tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) |
|
|
which it will tokenize. This is useful for NER or token classification. |
|
|
kwargs (`Dict[str, Any]`, *optional*): |
|
|
Keyword arguments to use for the tokenization. |
|
|
|
|
|
Returns: |
|
|
`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. |
|
|
""" |
|
|
return self.current_tokenizer.prepare_for_tokenization(text, is_split_into_words, **kwargs) |
|
|
|
|
|
def get_special_tokens_mask( |
|
|
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False |
|
|
) -> List[int]: |
|
|
""" |
|
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding |
|
|
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. |
|
|
|
|
|
Args: |
|
|
token_ids_0 (`List[int]`): |
|
|
List of ids of the first sequence. |
|
|
token_ids_1 (`List[int]`, *optional*): |
|
|
List of ids of the second sequence. |
|
|
already_has_special_tokens (`bool`, *optional*, defaults to `False`): |
|
|
Whether or not the token list is already formatted with special tokens for the model. |
|
|
|
|
|
Returns: |
|
|
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. |
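
        Example (illustrative; assumes ids produced by a BERT-style tokenizer
        that wraps a sequence in `[CLS]` ... `[SEP]`):

        ```python
        tokenizer.get_special_tokens_mask([101, 7592, 102], already_has_special_tokens=True)
        # e.g. [1, 0, 1]
        ```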
|
|
""" |
|
|
|
|
|
return self.current_tokenizer.get_special_tokens_mask(token_ids_0, token_ids_1, already_has_special_tokens) |
|
|
|
|
|
    @overload
    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
        ...
|
|
|
|
|
    @overload
    def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
        ...
|
|
|
|
|
def convert_ids_to_tokens( |
|
|
self, ids: Union[int, List[int]], skip_special_tokens: bool = False |
|
|
) -> Union[str, List[str]]: |
|
|
""" |
|
|
        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary and
|
|
added tokens. |
|
|
|
|
|
Args: |
|
|
ids (`int` or `List[int]`): |
|
|
The token id (or token ids) to convert to tokens. |
|
|
skip_special_tokens (`bool`, *optional*, defaults to `False`): |
|
|
Whether or not to remove special tokens in the decoding. |
|
|
|
|
|
Returns: |
|
|
`str` or `List[str]`: The decoded token(s). |
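
        Example (illustrative; ids and tokens depend on the current tokenizer's
        vocabulary):

        ```python
        tokenizer.convert_ids_to_tokens([7592, 2088])  # e.g. ['hello', 'world']
        ```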
|
|
""" |
|
|
return self.current_tokenizer.convert_ids_to_tokens(ids, skip_special_tokens) |
|
|
|
|
|
def convert_tokens_to_string(self, tokens: List[str]) -> str: |
|
|
return self.current_tokenizer.convert_tokens_to_string(tokens) |
|
|
|
|
|
def decode( |
|
|
self, |
|
|
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor"], |
|
|
skip_special_tokens: bool = False, |
|
|
clean_up_tokenization_spaces: Optional[bool] = None, |
|
|
**kwargs, |
|
|
    ) -> str:
        """Decode token ids into a string, always using the decoder (target) tokenizer."""
        return self.decoder.decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
|
|
|
|
|
    @overrides
    def __call__(self, text, text_target=None, *args, **kwargs):
        """
        Encode source text with the encoder tokenizer and, if `text_target` is given,
        encode targets with the decoder tokenizer. Target `input_ids` are returned as
        `labels` with pad positions set to -100 so they are ignored by the loss.
        """
        # Append the encoder's EOS token so the model sees an explicit end of input.
        if isinstance(text, str):
            text = text + self.eos_token
        else:
            text = [i + self.eos_token for i in text]
        results = self.encoder(text, *args, **kwargs)
        if text_target is not None:
            tmp = self.decoder(text_target, *args, **kwargs)
            results['labels'] = tmp['input_ids']
            # Boolean-mask assignment assumes tensor outputs (e.g. return_tensors="pt").
            results['labels'][results['labels'] == self.decoder.pad_token_id] = -100
            results['decoder_attention_mask'] = tmp['attention_mask']
        return results
|
|
|
|
|
def _decode( |
|
|
self, |
|
|
token_ids: List[int], |
|
|
skip_special_tokens: bool = False, |
|
|
clean_up_tokenization_spaces: Optional[bool] = None, |
|
|
spaces_between_special_tokens: bool = True, |
|
|
**kwargs, |
|
|
    ) -> str:
        return self.decoder._decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )
|
|
|
|
|
def save_pretrained( |
|
|
self, |
|
|
save_directory: Union[str, os.PathLike], |
|
|
legacy_format: Optional[bool] = None, |
|
|
filename_prefix: Optional[str] = None, |
|
|
push_to_hub: bool = False, |
|
|
**kwargs, |
|
|
    ) -> None:
        """Save both tokenizers, to `encoder/` and `decoder/` subdirectories of `save_directory`."""
        encoder_path = Path(save_directory) / "encoder"
        decoder_path = Path(save_directory) / "decoder"
        self.encoder.save_pretrained(encoder_path, legacy_format, filename_prefix, push_to_hub, **kwargs)
        self.decoder.save_pretrained(decoder_path, legacy_format, filename_prefix, push_to_hub, **kwargs)
|
|
|
|
|
@classmethod |
|
|
def from_pretrained( |
|
|
cls, |
|
|
pretrained_model_name_or_path: Union[str, os.PathLike], |
|
|
*init_inputs, |
|
|
cache_dir: Optional[Union[str, os.PathLike]] = None, |
|
|
force_download: bool = False, |
|
|
local_files_only: bool = False, |
|
|
token: Optional[Union[str, bool]] = None, |
|
|
revision: str = "main", |
|
|
**kwargs, |
|
|
    ):
        """Load the paired tokenizers from the `encoder/`/`decoder/` layout written by `save_pretrained`."""
        # NOTE: the download-related arguments (cache_dir, revision, ...) are
        # currently ignored; the paths are resolved locally.
        encoder_path = Path(pretrained_model_name_or_path) / "encoder"
        decoder_path = Path(pretrained_model_name_or_path) / "decoder"
        return cls(encoder_path, decoder_path)
|
|
|
|
|
    def _switch_to_target_mode(self):
        # Route subsequent calls to the decoder (target) tokenizer.
        self.current_tokenizer = self.decoder

    def _switch_to_input_mode(self):
        # Route subsequent calls back to the encoder (source) tokenizer.
        self.current_tokenizer = self.encoder
|
|
|
|
|
@property |
|
|
def pad_token_id(self) -> Any: |
|
|
"""Return pad token ID from current tokenizer.""" |
|
|
return self.current_tokenizer.pad_token_id |
|
|
|
|
|
@property |
|
|
def unk_token_id(self) -> Any: |
|
|
"""Return unk token ID from current tokenizer.""" |
|
|
return self.current_tokenizer.unk_token_id |
|
|
|
|
|
@property |
|
|
def bos_token_id(self) -> Any: |
|
|
"""Return bos token ID from current tokenizer.""" |
|
|
return self.current_tokenizer.bos_token_id |
|
|
|
|
|
@property |
|
|
def eos_token_id(self) -> Any: |
|
|
"""Return eos token ID from current tokenizer.""" |
|
|
return self.current_tokenizer.eos_token_id |
|
|
|
|
|
@property |
|
|
def sep_token_id(self) -> Any: |
|
|
"""Return sep token ID from current tokenizer.""" |
|
|
return self.current_tokenizer.sep_token_id |
|
|
|
|
|
@property |
|
|
def cls_token_id(self) -> Any: |
|
|
"""Return cls token ID from current tokenizer.""" |
|
|
return self.current_tokenizer.cls_token_id |
|
|
|
|
|
@property |
|
|
def mask_token_id(self) -> Any: |
|
|
"""Return mask token ID from current tokenizer.""" |
|
|
return self.current_tokenizer.mask_token_id |
|
|
|
|
|
def get_vocab(self) -> Dict[str, int]: |
|
|
""" |
|
|
Returns the vocabulary as a dictionary of token to indices. |
|
|
""" |
|
|
return self.current_tokenizer.get_vocab() |
|
|
|
|
|
@property |
|
|
def pad_token(self) -> Any: |
|
|
"""Return pad token from current tokenizer.""" |
|
|
return self.current_tokenizer.pad_token |
|
|
|
|
|
@property |
|
|
def unk_token(self) -> Any: |
|
|
"""Return unk token from current tokenizer.""" |
|
|
return self.current_tokenizer.unk_token |
|
|
|
|
|
@property |
|
|
def bos_token(self) -> Any: |
|
|
"""Return bos token from current tokenizer.""" |
|
|
return self.current_tokenizer.bos_token |
|
|
|
|
|
@property |
|
|
def eos_token(self) -> Any: |
|
|
"""Return eos token from current tokenizer.""" |
|
|
return self.current_tokenizer.eos_token |
|
|
|
|
|
@property |
|
|
def sep_token(self) -> Any: |
|
|
"""Return sep token from current tokenizer.""" |
|
|
return self.current_tokenizer.sep_token |
|
|
|
|
|
@property |
|
|
def cls_token(self) -> Any: |
|
|
"""Return cls token from current tokenizer.""" |
|
|
return self.current_tokenizer.cls_token |
|
|
|
|
|
@property |
|
|
def mask_token(self) -> Any: |
|
|
"""Return mask token from current tokenizer.""" |
|
|
return self.current_tokenizer.mask_token |
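

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the public API): the checkpoint
    # names are assumptions; substitute any encoder/decoder tokenizer pair.
    tokenizer = EncoderDecoderTokenizer("bert-base-uncased", "gpt2")

    batch = tokenizer(
        ["An example source sentence."],
        text_target=["An example target sentence."],
        return_tensors="pt",  # tensors are required for the -100 label masking in __call__
        padding=True,
    )
    print(sorted(batch.keys()))

    # decode() always routes through the decoder (target) tokenizer.
    labels = batch["labels"][0]
    print(tokenizer.decode(labels[labels != -100]))

    # Round-trip: save_pretrained writes encoder/ and decoder/ subdirectories.
    tokenizer.save_pretrained("ed-tokenizer")
    reloaded = EncoderDecoderTokenizer.from_pretrained("ed-tokenizer")
    assert reloaded.encoder.vocab_size == tokenizer.encoder.vocab_size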
|
|
|