import torch
import gradio as gr
from PIL import Image
from transformers import (
    AutoProcessor,
    BlipForConditionalGeneration,
    BlipProcessor,
    LlavaForConditionalGeneration,
)

# Requires: pip install gradio transformers torch pillow accelerate
# (accelerate is needed for device_map="auto")

# --- Available models ---
# Note: the original liuhaotian/llava-v1.5-7b checkpoint is not in
# transformers format and cannot be loaded with AutoModelForCausalLM;
# llava-hf/llava-1.5-7b-hf is the converted checkpoint that
# AutoProcessor / LlavaForConditionalGeneration can load directly.
MODELOS = {
    "BLIP (Salesforce/blip-image-captioning-base)": (
        "Salesforce/blip-image-captioning-base",
        "blip",
    ),
    "LLaVA 1.5 (llava-hf/llava-1.5-7b-hf)": (
        "llava-hf/llava-1.5-7b-hf",
        "llava",
    ),
}

# Cache processor/model pairs so each checkpoint is loaded once,
# not on every Gradio request.
_cache = {}


def _cargar(modelo_id, tipo):
    if modelo_id not in _cache:
        if tipo == "blip":
            processor = BlipProcessor.from_pretrained(modelo_id)
            model = BlipForConditionalGeneration.from_pretrained(modelo_id)
        else:  # llava
            processor = AutoProcessor.from_pretrained(modelo_id)
            model = LlavaForConditionalGeneration.from_pretrained(
                modelo_id, torch_dtype=torch.float16, device_map="auto"
            )
        _cache[modelo_id] = (processor, model)
    return _cache[modelo_id]


# --- Main function ---
def generar_descripcion(imagen, modelo_nombre):
    modelo_id, tipo = MODELOS[modelo_nombre]
    processor, model = _cargar(modelo_id, tipo)

    if tipo == "blip":
        inputs = processor(images=imagen, return_tensors="pt")
        out = model.generate(**inputs, max_new_tokens=60)
        return processor.decode(out[0], skip_special_tokens=True)

    # LLaVA 1.5: the prompt must contain an explicit <image> placeholder,
    # and float inputs must match the model's fp16 dtype.
    prompt = "USER: <image>\nDescribe the medical image in clinical language. ASSISTANT:"
    inputs = processor(text=prompt, images=imagen, return_tensors="pt").to(
        model.device, torch.float16
    )
    out = model.generate(**inputs, max_new_tokens=80)
    return processor.decode(out[0], skip_special_tokens=True)


# --- Gradio interface ---
iface = gr.Interface(
    fn=generar_descripcion,
    inputs=[
        gr.Image(type="pil", label="Upload mpMRI image (PI-CAI)"),
        gr.Radio(list(MODELOS.keys()), label="Select model"),
    ],
    outputs="text",
    title="Benchmark of VLMs on the PI-CAI Challenge",
    description=(
        "Comparison of vision-language models (BLIP, LLaVA, etc.) on "
        "prostate mpMRI images from dataset 6624726."
    ),
)

if __name__ == "__main__":
    iface.launch()
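
# --- Optional: offline benchmark sketch ---
# A minimal sketch, not part of the original app: caption every slice in a
# folder with each model and write the results to CSV. Because iface.launch()
# sits behind the __main__ guard above, importing this file as a module skips
# the UI, so benchmark_carpeta(...) can be called directly. The folder path,
# the *.png glob (assuming PI-CAI mpMRI volumes were pre-exported as 2D
# slices), and the output filename are all hypothetical placeholders.
import csv
from pathlib import Path


def benchmark_carpeta(carpeta="picai_slices", salida="captions.csv"):
    rutas = sorted(Path(carpeta).glob("*.png"))
    with open(salida, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["file", "model", "caption"])
        for ruta in rutas:
            imagen = Image.open(ruta).convert("RGB")  # both models expect RGB input
            for modelo_nombre in MODELOS:
                writer.writerow(
                    [ruta.name, modelo_nombre, generar_descripcion(imagen, modelo_nombre)]
                )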