from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import torch


class TextHandler:
    """Reads a text message typed by the user on stdin."""

    def process(self):
        return input("⌨️ Type your message: ").strip()


class AudioHandler:
    """Stores the client and model used for audio processing."""

    def __init__(self, client, audio_model):
        self.client = client
        self.audio_model = audio_model


class ImageHandler:
    """Generates image captions with a Florence-2 model."""

    def __init__(self, model_name):
        # Florence-2 ships custom modeling code, so trust_remote_code is required.
        self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
        # Run on GPU when available; fall back to CPU otherwise.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

    def process_pil_image(self, pil_image: Image.Image):
        """Processes a PIL.Image object coming directly from Gradio."""
        if not isinstance(pil_image, Image.Image):
            raise TypeError("Input must be a PIL.Image object.")
        return self._generate_caption(pil_image.convert("RGB"))

    def _generate_caption(self, img):
        """Reusable caption-generation logic using Florence-2."""
        # Florence-2 task token requesting a long, detailed caption.
        prompt = "<MORE_DETAILED_CAPTION>"

        with torch.no_grad():
            inputs = self.processor(text=prompt, images=img, return_tensors="pt").to(self.device)
            generated_ids = self.model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=1024,
                do_sample=False,
                num_beams=3,
            )

        # Keep special tokens: post_process_generation needs them to parse the task output.
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed_answer = self.processor.post_process_generation(
            generated_text,
            task=prompt,
            image_size=(img.width, img.height),
        )
        return parsed_answer.get(prompt, "")
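

# Usage sketch (illustrative): a minimal way to wire ImageHandler to caption a
# local file. The checkpoint "microsoft/Florence-2-base" and the path
# "photo.jpg" are assumptions for this demo, not part of the handlers above;
# any Florence-2 checkpoint and RGB-convertible image should work.
if __name__ == "__main__":
    handler = ImageHandler("microsoft/Florence-2-base")  # assumed checkpoint
    image = Image.open("photo.jpg")  # assumed local test image
    print(handler.process_pil_image(image))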