# inference.py
import argparse

from PIL import Image
from transformers import pipeline

DEFAULT_VL_MODEL = "Qwen/Qwen3-VL-4B-Instruct"
FALLBACK_CAPTIONER = "Salesforce/blip-image-captioning-base"
FALLBACK_LM = "google/flan-t5-small"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--image", required=True, help="Path to the input image")
    parser.add_argument("--prompt", required=True, help="Instruction for the model")
    args = parser.parse_args()

    image = Image.open(args.image)

    try:
        # Qwen VL instruct models are served by the "image-text-to-text" task,
        # which takes chat-style messages; the "image-to-text" task cannot
        # load them, so using it here would always trigger the fallback.
        vl_pipe = pipeline("image-text-to-text", model=DEFAULT_VL_MODEL)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": args.prompt},
                ],
            }
        ]
        output = vl_pipe(text=messages, max_new_tokens=256, return_full_text=False)
        print(output[0]["generated_text"])
    except Exception as exc:
        # Fallback: a two-stage pipeline that captions the image with BLIP,
        # then feeds caption + instruction to a small text-to-text LM.
        print(f"Could not run {DEFAULT_VL_MODEL} ({exc}); using fallback...")
        captioner = pipeline("image-to-text", model=FALLBACK_CAPTIONER)
        lm = pipeline("text2text-generation", model=FALLBACK_LM)
        caption = captioner(image)[0]["generated_text"]
        prompt = f"Image: {caption}\nInstruction: {args.prompt}"
        response = lm(prompt, max_new_tokens=128)[0]["generated_text"]
        print(response)


if __name__ == "__main__":
    main()
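
# Example invocation (hypothetical image path and prompt, shown for illustration):
#   python inference.py --image photo.jpg --prompt "What is in this picture?"
# If the Qwen model cannot be loaded (e.g. not downloaded, unsupported, or out
# of memory), the script prints the fallback's answer instead.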