"""Answer a prompt about an image with a vision-language model.

Tries a VL model first; if it cannot be loaded or run, falls back to
captioning the image with BLIP and answering from the caption with a
small text LM. Requires pillow, transformers, and a torch backend.
"""

import argparse

from PIL import Image
from transformers import pipeline
|
# Primary model: a vision-language model that answers the prompt directly
# from the image.
DEFAULT_VL_MODEL = "Qwen/Qwen3-VL-4B-Instruct"

# Fallback pair: BLIP captions the image, then a small text-to-text LM
# answers the prompt from the caption alone.
FALLBACK_CAPTIONER = "Salesforce/blip-image-captioning-base"
FALLBACK_LM = "google/flan-t5-small"
|
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--image", required=True, help="Path to the input image")
    parser.add_argument("--prompt", required=True, help="Question or instruction about the image")
    args = parser.parse_args()
|
    try:
        # Qwen3-VL is a chat-style image-text-to-text model, so it needs the
        # "image-text-to-text" pipeline task and chat-formatted messages,
        # not the plain "image-to-text" captioning task.
        vl_pipe = pipeline("image-text-to-text", model=DEFAULT_VL_MODEL)
        messages = [{"role": "user", "content": [
            {"type": "image", "image": args.image},
            {"type": "text", "text": args.prompt},
        ]}]
        result = vl_pipe(text=messages, max_new_tokens=256)[0]["generated_text"]
        # With chat input the pipeline returns the conversation; the final
        # message is the model's reply.
        print(result[-1]["content"] if isinstance(result, list) else result)
    except Exception as exc:
        # Any failure (download, unsupported model, out-of-memory, bad input)
        # lands here and triggers the two-stage fallback.
        print(f"VL model unavailable ({exc}); falling back to caption + LM...")
        captioner = pipeline("image-to-text", model=FALLBACK_CAPTIONER)
        lm = pipeline("text2text-generation", model=FALLBACK_LM)
        # Caption the image first (BLIP expects RGB input)...
        caption = captioner(Image.open(args.image).convert("RGB"))[0]["generated_text"]
        # ...then let the text LM answer the prompt from the caption alone.
        prompt = f"Image: {caption}\nInstruction: {args.prompt}"
        print(lm(prompt)[0]["generated_text"])
|
if __name__ == "__main__":
    main()
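
# Example invocation (script and file names are illustrative):
#   python describe_image.py --image cat.jpg --prompt "What is the animal doing?"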
|
|
|