"""DiffusionVL Processor - combines an image processor and a tokenizer."""

from typing import List, Optional, Union

import torch

from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.video_utils import VideoInput
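
# LLaVA-style sentinel: -200 lies outside the real vocabulary, so image
# placeholder positions in input_ids can be located unambiguously and later
# replaced with the actual image features by the model.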
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"


class DiffusionVL_Qwen2_5_VL_ProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword arguments for DiffusionVL_Qwen2_5_VL_Processor."""

    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
    }
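
# These defaults are merged with any call-time kwargs by
# ProcessorMixin._merge_kwargs in __call__. Note that __call__ below pads
# manually, so of the text_kwargs only "return_tensors" currently takes effect.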


def tokenizer_image_token(
    prompt: str,
    tokenizer,
    image_token_index: int = IMAGE_TOKEN_INDEX,
    return_tensors: Optional[str] = None,
) -> Union[List[int], torch.Tensor]:
    """
    Tokenize text with image placeholders, replacing each <image> with IMAGE_TOKEN_INDEX.

    Args:
        prompt: Input text containing <image> placeholders.
        tokenizer: The tokenizer to use for encoding text.
        image_token_index: The token index to use for image placeholders.
        return_tensors: If "pt", return a PyTorch tensor.

    Returns:
        List of token IDs or a PyTorch tensor.
    """
    prompt_chunks = prompt.split(DEFAULT_IMAGE_TOKEN)

    # Tokenize the text before the first <image> (if any); copy the list so the
    # tokenizer's own output is not mutated below.
    input_ids: List[int] = []
    if prompt_chunks[0]:
        input_ids = list(tokenizer(prompt_chunks[0], add_special_tokens=False).input_ids)

    # Every chunk after the first was preceded by exactly one <image> in the
    # prompt, so emit exactly one image token per remaining chunk. (Prompts that
    # start with <image> therefore yield a single leading image token.)
    for chunk in prompt_chunks[1:]:
        input_ids.append(image_token_index)
        if chunk:
            input_ids.extend(tokenizer(chunk, add_special_tokens=False).input_ids)

    if return_tensors == "pt":
        return torch.tensor(input_ids, dtype=torch.long)
    return input_ids
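
# A minimal usage sketch (the checkpoint name is illustrative only; any
# Qwen2-style tokenizer behaves the same way):
#
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
#   ids = tokenizer_image_token("Describe <image> briefly.", tok)
#   # ids == tok("Describe ", add_special_tokens=False).input_ids
#   #        + [IMAGE_TOKEN_INDEX]
#   #        + tok(" briefly.", add_special_tokens=False).input_ids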


class DiffusionVL_Qwen2_5_VL_Processor(ProcessorMixin):
    r"""
    Constructs a DiffusionVL processor which wraps an image processor and a tokenizer into a single processor.

    [`DiffusionVL_Qwen2_5_VL_Processor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and
    [`Qwen2TokenizerFast`]. See [`~DiffusionVL_Qwen2_5_VL_Processor.__call__`] and
    [`~DiffusionVL_Qwen2_5_VL_Processor.decode`] for more information.

    This processor uses LLaVA-style image token handling:

    - `<image>` in text is replaced with `IMAGE_TOKEN_INDEX` (-200) in input_ids
    - The model's `prepare_inputs_labels_for_multimodal` replaces -200 with the actual image features

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*):
            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.

    Example:

    ```python
    >>> from transformers import AutoProcessor
    >>> from PIL import Image

    >>> processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)

    >>> # Prepare text with an image placeholder
    >>> messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
    >>> text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    >>> # Process image and text
    >>> image = Image.open("image.jpg")
    >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
    ```
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "Qwen2VLImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template: Optional[str] = None,
        **kwargs,
    ):
        self.image_token = DEFAULT_IMAGE_TOKEN
        self.image_token_index = IMAGE_TOKEN_INDEX
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        videos: Optional[VideoInput] = None,
        **kwargs: Unpack[DiffusionVL_Qwen2_5_VL_ProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare one or several sequences and image(s) for the model.

        To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `images` is not `None`. Text is
        encoded with [`tokenizer_image_token`] rather than the plain tokenizer, so each `<image>`
        placeholder is replaced with `IMAGE_TOKEN_INDEX` (-200) in the output input_ids.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, *optional*):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array,
                or PyTorch tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence should be a string
                containing `<image>` placeholders where images will be inserted.
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, *optional*):
                The video or batch of videos to be prepared. Currently accepted but ignored by this
                processor.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **image_grid_thw** -- List of image 3D grid dimensions. Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            DiffusionVL_Qwen2_5_VL_ProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        image_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs.get("images_kwargs", {}))

        if text is None:
            return BatchFeature(data=image_inputs)

        if not isinstance(text, list):
            text = [text]

        # Defer tensor conversion to the final BatchFeature so the manually
        # padded lists below are converted in one place.
        return_tensors = output_kwargs.get("text_kwargs", {}).pop("return_tensors", None)

        all_input_ids = []
        for t in text:
            input_ids = tokenizer_image_token(t, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors=None)
            all_input_ids.append(input_ids)

        # Right-pad every sequence to the batch maximum and build the matching
        # attention masks (1 for real tokens, 0 for padding).
        max_len = max(len(ids) for ids in all_input_ids)
        padded_input_ids = []
        attention_masks = []

        pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0

        for ids in all_input_ids:
            padding_length = max_len - len(ids)
            padded_input_ids.append(ids + [pad_token_id] * padding_length)
            attention_masks.append([1] * len(ids) + [0] * padding_length)

        text_inputs = {
            "input_ids": padded_input_ids,
            "attention_mask": attention_masks,
        }

        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
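
    # Sketch of the padding behavior above, with made-up token ids and
    # pad_token_id = 0:
    #   texts = ["Hi <image>", "A longer prompt <image> here"]
    #   -> input_ids      = [[5, -200, 0, 0, 0],
    #                        [9, 12, 7, -200, 4]]
    #   -> attention_mask = [[1, 1, 0, 0, 0],
    #                        [1, 1, 1, 1, 1]]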

    def build_conversation_input_ids(
        self,
        messages: List[dict],
        images: Optional[List] = None,
        add_generation_prompt: bool = True,
    ) -> dict:
        """
        Build the prompt text for a conversation in LLaVA format.

        This method converts a list of messages into a ChatML-style prompt string
        (`<|im_start|>` / `<|im_end|>`) containing `<image>` placeholders, matching the
        format the model was trained on.

        Args:
            messages: List of message dicts with 'role' and 'content' keys.
                Content can be a string or a list of dicts with a 'type' key ('text' or 'image').
            images: Optional list of images. Currently unused; reserved for validation.
            add_generation_prompt: Whether to append an assistant header at the end.

        Returns:
            dict with a 'text' key containing the prompt string with `<image>` placeholders.
        """
        text_parts = []

        for message in messages:
            role = message.get("role", "user")
            content = message.get("content", "")

            text_parts.append(f"<|im_start|>{role}\n")

            # Content may be a plain string or a list of typed items.
            if isinstance(content, str):
                text_parts.append(content)
            elif isinstance(content, list):
                for item in content:
                    if isinstance(item, dict):
                        if item.get("type") == "image":
                            text_parts.append(DEFAULT_IMAGE_TOKEN)
                        elif item.get("type") == "text":
                            text_parts.append(item.get("text", ""))
                    else:
                        text_parts.append(str(item))

            text_parts.append("<|im_end|>\n")

        if add_generation_prompt:
            text_parts.append("<|im_start|>assistant\n")

        text = "".join(text_parts)
        return {"text": text}
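
    # Example of the produced prompt for a single user turn with one image:
    #   messages = [{"role": "user",
    #                "content": [{"type": "image"},
    #                            {"type": "text", "text": "Describe this image."}]}]
    #   build_conversation_input_ids(messages)["text"] ==
    #       "<|im_start|>user\n<image>Describe this image.<|im_end|>\n<|im_start|>assistant\n"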

    def batch_decode(self, *args, **kwargs):
        """
        Decode a batch of token IDs to text.

        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Decode token IDs to text.

        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self) -> List[str]:
        """Return the list of model input names."""
        tokenizer_names = self.tokenizer.model_input_names
        image_processor_names = self.image_processor.model_input_names
        # Deduplicate while preserving order.
        return list(dict.fromkeys(tokenizer_names + image_processor_names))


__all__ = ["DiffusionVL_Qwen2_5_VL_Processor", "tokenizer_image_token"]