from typing import Union

from transformers import CLIPProcessor, CLIPTokenizer, CLIPTokenizerFast


class LongCLIPProcessor(CLIPProcessor):
    """CLIP processor that can additionally tokenize a ``short_text`` input.

    Behaves like ``CLIPProcessor`` for ``text`` and ``images``. When
    ``short_text`` is supplied, it is tokenized with the same tokenizer and
    the result is attached to the returned encoding under the keys
    ``"short_input_ids"`` and ``"short_attention_mask"``.
    """

    tokenizer: Union[CLIPTokenizer, CLIPTokenizerFast]

    def __call__(
        self, text=None, short_text=None, images=None, return_tensors=None, **kwargs
    ):
        """Encode ``text``/``images`` and, optionally, ``short_text``.

        Args:
            text: Text input forwarded to ``CLIPProcessor.__call__``.
            short_text: Optional second text input. When not ``None`` it is
                passed to ``self.tokenizer`` and its ``input_ids`` and
                ``attention_mask`` are added to the result.
            images: Image input forwarded to ``CLIPProcessor.__call__``.
            return_tensors: Forwarded both to the parent processor and to the
                tokenizer call made for ``short_text``.
            **kwargs: Extra keyword arguments forwarded unchanged to the
                parent processor and to the ``short_text`` tokenizer call.

        Returns:
            The encoding produced by ``CLIPProcessor.__call__``, extended with
            ``"short_input_ids"`` and ``"short_attention_mask"`` when
            ``short_text`` is given.
        """
        # Forward by keyword: the positional order of the parent's parameters
        # is not guaranteed to be (text, images, return_tensors) in every
        # transformers release, so positional forwarding can silently bind
        # values to the wrong parameter.
        encoding = super().__call__(
            text=text, images=images, return_tensors=return_tensors, **kwargs
        )

        if short_text is not None:
            short_text_encoding = self.tokenizer(
                short_text, return_tensors=return_tensors, **kwargs
            )
            encoding["short_input_ids"] = short_text_encoding.input_ids
            encoding["short_attention_mask"] = short_text_encoding.attention_mask

        return encoding