# inference.py
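"""Answer a prompt about an image with a vision-language model.

Tries a Qwen VL chat model first and, if it cannot be loaded, falls back to a
BLIP captioner chained into a small FLAN-T5 instruction model.

Example invocation (file name and prompt are illustrative):
    python inference.py --image photo.jpg --prompt "What is in this picture?"
"""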
import argparse

from PIL import Image
from transformers import pipeline

# Primary vision-language model; replaced by the fallback pair below if loading fails.
DEFAULT_VL_MODEL = "Qwen/Qwen3-VL-4B-Instruct"
FALLBACK_CAPTIONER = "Salesforce/blip-image-captioning-base"
FALLBACK_LM = "google/flan-t5-small"


def main():
    parser = argparse.ArgumentParser(description="Answer a prompt about a single image.")
    parser.add_argument("--image", required=True, help="Path to the input image")
    parser.add_argument("--prompt", required=True, help="Question or instruction about the image")
    args = parser.parse_args()

    try:
        # Qwen VL chat models load under the image-text-to-text pipeline task
        # (not plain image-to-text); the image and the prompt travel together
        # as a single chat message.
        vl_pipe = pipeline("image-text-to-text", model=DEFAULT_VL_MODEL)
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": args.image},  # local path or URL
                {"type": "text", "text": args.prompt},
            ],
        }]
        output = vl_pipe(text=messages, max_new_tokens=256, return_full_text=False)
        print(output[0]["generated_text"])
    except Exception as exc:
        # Fall back to a two-stage pipeline: caption the image, then hand the
        # caption plus the instruction to a small text-to-text model.
        print(f"VL model unavailable ({exc}); using fallback...")
        captioner = pipeline("image-to-text", model=FALLBACK_CAPTIONER)
        lm = pipeline("text2text-generation", model=FALLBACK_LM)
        caption = captioner(Image.open(args.image).convert("RGB"))[0]["generated_text"]
        prompt = f"Image: {caption}\nInstruction: {args.prompt}"
        response = lm(prompt)[0]["generated_text"]
        print(response)


if __name__ == "__main__":
    main()