Madras1 committed
Commit e451153 (verified)
Parent: b716c3b

Upload 10 files
Files changed (3):
  1. jade/config.json +1 -1
  2. jade/handlers.py +29 -8
  3. requirements.txt +3 -1
jade/config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "groq_model": "moonshotai/kimi-k2-instruct-0905",
   "audio_model": "whisper-large-v3",
-  "caption_model": "Salesforce/blip-image-captioning-large",
+  "caption_model": "microsoft/Florence-2-base-ft",
   "max_context": 12,
   "language": "pt",
   "local_mode": false
jade/handlers.py CHANGED
@@ -1,4 +1,4 @@
-from transformers import BlipProcessor, BlipForConditionalGeneration
+from transformers import AutoProcessor, AutoModelForCausalLM
 from PIL import Image
 import torch
 
@@ -13,11 +13,11 @@ class AudioHandler:
 
 class ImageHandler:
     def __init__(self, model_name):
-        self.processor = BlipProcessor.from_pretrained(model_name, use_fast=True)
-        self.model = BlipForConditionalGeneration.from_pretrained(model_name)
-        self.model.eval()
+        self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
+        self.model.eval()
 
     def process_pil_image(self, pil_image: Image.Image):
         """Processes a PIL.Image object coming directly from Gradio."""
@@ -26,8 +26,29 @@ class ImageHandler:
         return self._generate_caption(pil_image.convert("RGB"))
 
     def _generate_caption(self, img):
-        """Reusable caption-generation logic."""
+        """Reusable caption-generation logic using Florence-2."""
+        # Prompt for a detailed description
+        prompt = "<MORE_DETAILED_CAPTION>"
+
         with torch.no_grad():
-            inputs = self.processor(img, "a photo of", return_tensors="pt").to(self.device)
-            out = self.model.generate(**inputs, max_new_tokens=60)
-            return self.processor.decode(out[0], skip_special_tokens=True).strip()
+            inputs = self.processor(text=prompt, images=img, return_tensors="pt").to(self.device)
+
+            generated_ids = self.model.generate(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                max_new_tokens=1024,
+                do_sample=False,
+                num_beams=3,
+            )
+
+            generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+
+            # Florence-2 requires post-processing to extract the clean answer
+            parsed_answer = self.processor.post_process_generation(
+                generated_text,
+                task=prompt,
+                image_size=(img.width, img.height)
+            )
+
+            # parsed_answer returns a dict, e.g. {'<MORE_DETAILED_CAPTION>': 'the caption text'}
+            return parsed_answer.get(prompt, "")
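
A short end-to-end sketch of the new handler. ImageHandler, process_pil_image, and the model name all come from this diff; the image file name is illustrative:

from PIL import Image
from jade.handlers import ImageHandler

handler = ImageHandler("microsoft/Florence-2-base-ft")

# process_pil_image converts to RGB and runs the <MORE_DETAILED_CAPTION> task.
caption = handler.process_pil_image(Image.open("example.jpg"))
print(caption)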
requirements.txt CHANGED
@@ -12,4 +12,6 @@ fastapi
 uvicorn[standard]
 joblib
 scikit-learn
-numpy
+numpy
+einops
+timm
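
The two new packages track the model swap: Florence-2's trust_remote_code implementation imports einops and timm when the weights are loaded, which is presumably why they were added here, keeping ImageHandler.__init__ from failing at from_pretrained time:

pip install einops timm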