Upload 10 files
Browse files
- jade/config.json +1 -1
- jade/handlers.py +29 -8
- requirements.txt +3 -1
jade/config.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"groq_model": "moonshotai/kimi-k2-instruct-0905",
|
| 3 |
"audio_model": "whisper-large-v3",
|
| 4 |
-
"caption_model": "
|
| 5 |
"max_context": 12,
|
| 6 |
"language": "pt",
|
| 7 |
"local_mode": false
|
|
|
|
| 1 |
{
|
| 2 |
"groq_model": "moonshotai/kimi-k2-instruct-0905",
|
| 3 |
"audio_model": "whisper-large-v3",
|
| 4 |
+
"caption_model": "microsoft/Florence-2-base-ft",
|
| 5 |
"max_context": 12,
|
| 6 |
"language": "pt",
|
| 7 |
"local_mode": false
|
jade/handlers.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from transformers import
|
| 2 |
from PIL import Image
|
| 3 |
import torch
|
| 4 |
|
|
@@ -13,11 +13,11 @@ class AudioHandler:
|
|
| 13 |
|
| 14 |
class ImageHandler:
|
| 15 |
def __init__(self, model_name):
|
| 16 |
-
self.processor =
|
| 17 |
-
self.model =
|
| 18 |
-
self.model.eval()
|
| 19 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 20 |
self.model.to(self.device)
|
|
|
|
| 21 |
|
| 22 |
def process_pil_image(self, pil_image: Image.Image):
|
| 23 |
"""Processa um objeto PIL.Image vindo diretamente do Gradio."""
|
|
@@ -26,8 +26,29 @@ class ImageHandler:
|
|
| 26 |
return self._generate_caption(pil_image.convert("RGB"))
|
| 27 |
|
| 28 |
def _generate_caption(self, img):
|
| 29 |
-
"""Lógica de geração de legenda reutilizável."""
|
|
|
|
|
|
|
|
|
|
| 30 |
with torch.no_grad():
|
| 31 |
-
inputs = self.processor(
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoProcessor, AutoModelForCausalLM
|
| 2 |
from PIL import Image
|
| 3 |
import torch
|
| 4 |
|
|
|
|
| 13 |
|
| 14 |
class ImageHandler:
|
| 15 |
def __init__(self, model_name):
|
| 16 |
+
self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
| 17 |
+
self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
|
|
|
|
| 18 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 19 |
self.model.to(self.device)
|
| 20 |
+
self.model.eval()
|
| 21 |
|
| 22 |
def process_pil_image(self, pil_image: Image.Image):
|
| 23 |
"""Processa um objeto PIL.Image vindo diretamente do Gradio."""
|
|
|
|
| 26 |
return self._generate_caption(pil_image.convert("RGB"))
|
| 27 |
|
| 28 |
def _generate_caption(self, img):
|
| 29 |
+
"""Lógica de geração de legenda reutilizável usando Florence-2."""
|
| 30 |
+
# Prompt para descrição detalhada
|
| 31 |
+
prompt = "<MORE_DETAILED_CAPTION>"
|
| 32 |
+
|
| 33 |
with torch.no_grad():
|
| 34 |
+
inputs = self.processor(text=prompt, images=img, return_tensors="pt").to(self.device)
|
| 35 |
+
|
| 36 |
+
generated_ids = self.model.generate(
|
| 37 |
+
input_ids=inputs["input_ids"],
|
| 38 |
+
pixel_values=inputs["pixel_values"],
|
| 39 |
+
max_new_tokens=1024,
|
| 40 |
+
do_sample=False,
|
| 41 |
+
num_beams=3,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
|
| 45 |
+
|
| 46 |
+
# O Florence-2 requer pós-processamento para extrair a resposta limpa
|
| 47 |
+
parsed_answer = self.processor.post_process_generation(
|
| 48 |
+
generated_text,
|
| 49 |
+
task=prompt,
|
| 50 |
+
image_size=(img.width, img.height)
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
# parsed_answer retorna um dict, ex: {'<MORE_DETAILED_CAPTION>': 'texto da legenda'}
|
| 54 |
+
return parsed_answer.get(prompt, "")
|
requirements.txt
CHANGED
|
@@ -12,4 +12,6 @@ fastapi
|
|
| 12 |
uvicorn[standard]
|
| 13 |
joblib
|
| 14 |
scikit-learn
|
| 15 |
-
numpy
|
|
|
|
|
|
|
|
|
| 12 |
uvicorn[standard]
|
| 13 |
joblib
|
| 14 |
scikit-learn
|
| 15 |
+
numpy
|
| 16 |
+
einops
|
| 17 |
+
timm
|