Spaces:

reverseforward
/

newtestspace

Sleeping

newtestspace / app.py

reveseforward

test1

742955b 2 months ago

1.99 kB

	import torch
	from transformers import AutoProcessor, AutoModelForVision2Seq
	import gradio as gr

	# ----------------------------
	# CONFIG
	# ----------------------------
	MODEL_NAME = "reverseforward/qwenmeasurement" # change this to your repo name
	DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
	DTYPE = torch.float16 # use float16 on A10G

	# ----------------------------
	# LOAD MODEL
	# ----------------------------
	print("Loading model...")
	model = AutoModelForVision2Seq.from_pretrained(
	MODEL_NAME,
	torch_dtype=DTYPE,
	device_map="auto",
	)
	processor = AutoProcessor.from_pretrained(MODEL_NAME)
	print("Model loaded successfully.")

	# ----------------------------
	# INFERENCE FUNCTION
	# ----------------------------
	def chat_with_image(image, text):
	if image is None or text.strip() == "":
	return "Please provide both an image and text input."

	# Prepare inputs for Qwen3-VL
	inputs = processor(text=[text], images=[image], return_tensors="pt").to(DEVICE, DTYPE)

	# Generate output
	with torch.inference_mode():
	generated_ids = model.generate(
	**inputs,
	max_new_tokens=256,
	temperature=0.7,
	)

	output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
	return output.strip()


	# ----------------------------
	# GRADIO UI
	# ----------------------------
	title = "🧠 Qwen3-VL-8B Fine-tuned (Image + Text)"
	description = """
	Upload an image and enter a text prompt.
	The model will reason visually and respond.
	"""

	demo = gr.Interface(
	fn=chat_with_image,
	inputs=[
	gr.Image(type="pil", label="Upload Image"),
	gr.Textbox(label="Enter Instruction or Question"),
	],
	outputs=gr.Textbox(label="Model Output"),
	title=title,
	description=description,
	examples=[
	["examples/cat.jpg", "Describe this image."],
	["examples/room.jpg", "How many chairs are visible?"],
	],
	)

	if __name__ == "__main__":
	demo.launch()