"""Gradio demo comparing Qwen3-VL and Moondream 3 on object-understanding prompts."""

import json
import time

import gradio as gr
import numpy as np
from gradio.themes.ocean import Ocean
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    Qwen3VLForConditionalGeneration,
)
from spaces import GPU
import supervision as sv

model_qwen_id = "Qwen/Qwen3-VL-4B-Instruct"
model_moondream_id = "moondream/moondream3-preview"

model_qwen = Qwen3VLForConditionalGeneration.from_pretrained(
    model_qwen_id,
    torch_dtype="auto",
    device_map="auto",
)

model_moondream = AutoModelForCausalLM.from_pretrained(
    model_moondream_id,
    trust_remote_code=True,
    device_map={"": "cuda"},
)


def extract_model_short_name(model_id):
    return model_id.split("/")[-1].replace("-", " ").replace("_", " ")


model_qwen_name = extract_model_short_name(model_qwen_id)
model_moondream_name = extract_model_short_name(model_moondream_id)

processor_qwen = AutoProcessor.from_pretrained(model_qwen_id)


def create_annotated_image(image, json_data, height, width):
    """Annotate `image` with boxes and points parsed from a fenced ```json block."""
    try:
        parsed_json_data = json_data.split("```json")[1].split("```")[0]
        bbox_data = json.loads(parsed_json_data)
    except Exception:
        return image

    original_width, original_height = image.size
    x_scale = original_width / width
    y_scale = original_height / height

    points = []
    point_labels = []
    for item in bbox_data:
        label = item.get("label", "")
        if "point_2d" in item:
            x, y = item["point_2d"]
            scaled_x = int(x * x_scale)
            scaled_y = int(y * y_scale)
            points.append([scaled_x, scaled_y])
            point_labels.append(label)

    annotated_image = np.array(image.convert("RGB"))

    detections = sv.Detections.from_vlm(
        vlm=sv.VLM.QWEN_2_5_VL,
        result=json_data,
        input_wh=(original_width, original_height),
        resolution_wh=(original_width, original_height),
    )

    bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
    annotated_image = bounding_box_annotator.annotate(
        scene=annotated_image, detections=detections
    )
    annotated_image = label_annotator.annotate(
        scene=annotated_image, detections=detections
    )

    if points:
        points_array = np.array(points).reshape(1, -1, 2)
        key_points = sv.KeyPoints(xy=points_array)
        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE)
        # vertex_label_annotator = sv.VertexLabelAnnotator(text_scale=0.5, border_radius=2)
        annotated_image = vertex_annotator.annotate(
            scene=annotated_image, key_points=key_points
        )
        # annotated_image = vertex_label_annotator.annotate(
        #     scene=annotated_image,
        #     key_points=key_points,
        #     labels=point_labels
        # )

    return Image.fromarray(annotated_image)


def create_annotated_image_normalized(image, json_data, label="object"):
    """Annotate `image` from Moondream output that uses normalized (0-1) coordinates."""
    if not isinstance(json_data, dict):
        return image

    original_width, original_height = image.size
    annotated_image = np.array(image.convert("RGB"))

    points = []
    if "points" in json_data:
        # `point` output: a list of {"x": ..., "y": ...} dicts with normalized coordinates.
        for point in json_data.get("points", []):
            x = int(point["x"] * original_width)
            y = int(point["y"] * original_height)
            points.append([x, y])

    if "reasoning" in json_data:
        # `query` output with reasoning: grounding points embedded in the reasoning trace.
        for grounding in json_data["reasoning"].get("grounding", []):
            for x_norm, y_norm in grounding.get("points", []):
                x = int(x_norm * original_width)
                y = int(y_norm * original_height)
                points.append([x, y])

    if points:
        points_array = np.array(points).reshape(1, -1, 2)
        key_points = sv.KeyPoints(xy=points_array)
        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
        annotated_image = vertex_annotator.annotate(
            scene=annotated_image, key_points=key_points
        )

    if "objects" in json_data:
        # `detect` output: normalized bounding boxes under the "objects" key.
        detections = sv.Detections.from_vlm(
            sv.VLM.MOONDREAM,
            json_data,
            resolution_wh=(original_width, original_height),
        )
        bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
        labels = [label for _ in detections.xyxy]
        annotated_image = bounding_box_annotator.annotate(
            scene=annotated_image, detections=detections
        )
        annotated_image = label_annotator.annotate(
            scene=annotated_image, detections=detections, labels=labels
        )

    return Image.fromarray(annotated_image)


def parse_qwen3_json(json_output):
    """Extract the JSON list of boxes from a Qwen3-VL response, tolerating a truncated tail."""
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line == "```json":
            json_output = "\n".join(lines[i+1:])
            json_output = json_output.split("```")[0]
            break

    try:
        boxes = json.loads(json_output)
    except json.JSONDecodeError:
        # If generation was cut off mid-list, keep everything up to the last complete entry.
        end_idx = json_output.rfind('"}') + len('"}')
        truncated_text = json_output[:end_idx] + "]"
        boxes = json.loads(truncated_text)

    if not isinstance(boxes, list):
        boxes = [boxes]
    return boxes


def create_annotated_image_qwen3(image, json_output):
    try:
        boxes = parse_qwen3_json(json_output)
    except Exception as e:
        print(f"Error parsing JSON: {e}")
        return image

    if not boxes:
        return image

    original_width, original_height = image.size
    annotated_image = np.array(image.convert("RGB"))

    xyxy = []
    labels = []
    for box in boxes:
        if "bbox_2d" in box and "label" in box:
            x1, y1, x2, y2 = box["bbox_2d"]
            # Coordinates are treated as values on a 0-1000 grid and rescaled to pixel space.
            scale = 1000
            x1 = max(0, min(scale, x1)) / scale * original_width
            y1 = max(0, min(scale, y1)) / scale * original_height
            x2 = max(0, min(scale, x2)) / scale * original_width
            y2 = max(0, min(scale, y2)) / scale * original_height
            # Ensure x1 <= x2 and y1 <= y2
            if x1 > x2:
                x1, x2 = x2, x1
            if y1 > y2:
                y1, y2 = y2, y1
            xyxy.append([int(x1), int(y1), int(x2), int(y2)])
            labels.append(box["label"])

    if not xyxy:
        return image

    detections = sv.Detections(
        xyxy=np.array(xyxy),
        class_id=np.arange(len(xyxy)),
    )

    bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
    annotated_image = bounding_box_annotator.annotate(
        scene=annotated_image, detections=detections
    )
    annotated_image = label_annotator.annotate(
        scene=annotated_image, detections=detections, labels=labels
    )

    return Image.fromarray(annotated_image)


@GPU
def detect_qwen(image, prompt):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    t0 = time.perf_counter()
    inputs = processor_qwen.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model_qwen.device)

    generated_ids = model_qwen.generate(**inputs, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor_qwen.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    elapsed_ms = (time.perf_counter() - t0) * 1_000

    annotated_image = create_annotated_image_qwen3(image, output_text)

    time_taken = f"**Inference time ({model_qwen_name}):** {elapsed_ms:.0f} ms"
    return annotated_image, output_text, time_taken


@GPU
def detect_moondream(image, prompt, category_input):
    t0 = time.perf_counter()
    # Route the prompt to the Moondream skill that matches the selected category.
    if category_input in ["Object Detection", "Visual Grounding + Object Detection"]:
        output_text = model_moondream.detect(image=image, object=prompt)
    elif category_input == "Visual Grounding + Keypoint Detection":
        output_text = model_moondream.point(image=image, object=prompt)
    else:
        # Fall back to a free-form query with reasoning (e.g. object counting, general questions).
        output_text = model_moondream.query(
            image=image, question=prompt, reasoning=True
        )
    elapsed_ms = (time.perf_counter() - t0) * 1_000

    annotated_image = create_annotated_image_normalized(
        image=image, json_data=output_text, label="object"
    )

    time_taken = f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
    return annotated_image, output_text, time_taken


def detect(image, prompt_model_1, prompt_model_2, category_input):
    STANDARD_SIZE = (1024, 1024)
    image.thumbnail(STANDARD_SIZE)

    annotated_image_model_1, output_text_model_1, timing_1 = detect_qwen(
        image, prompt_model_1
    )
    annotated_image_model_2, output_text_model_2, timing_2 = detect_moondream(
        image, prompt_model_2, category_input
    )

    return (
        annotated_image_model_1,
        output_text_model_1,
        timing_1,
        annotated_image_model_2,
        output_text_model_2,
        timing_2,
    )


css_hide_share = """
button#gradio-share-link-button-0 {
    display: none !important;
}
"""

with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
    gr.Markdown("# 👓 Object Understanding with Vision Language Models")
    gr.Markdown(
        "### Explore object detection, visual grounding, keypoint detection, and/or object counting through natural language prompts."
    )
    gr.Markdown("""
*Powered by [Qwen3-VL 4B](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) and [Moondream 3 Preview](https://huggingface.co/moondream/moondream3-preview). Inspired by the tutorial [Object Detection and Visual Grounding with Qwen 2.5](https://pyimagesearch.com/2025/06/09/object-detection-and-visual-grounding-with-qwen-2-5/) on PyImageSearch.*

*Moondream 3 uses the [moondream-preview](https://huggingface.co/vikhyatk/moondream2/blob/main/moondream.py), selecting `detect` for categories with "Object Detection", `point` for the ones with "Keypoint Detection", and reasoning-based querying for all others.*
""")

    with gr.Row():
        with gr.Column(scale=2):
            image_input = gr.Image(label="Upload an image", type="pil", height=400)
            prompt_input_model_1 = gr.Textbox(
                label=f"Enter your prompt for {model_qwen_name}",
                placeholder="e.g., Detect all red cars in the image",
            )
            prompt_input_model_2 = gr.Textbox(
                label=f"Enter your prompt for {model_moondream_name}",
                placeholder="e.g., Detect all blue cars in the image",
            )
            categories = [
                "Object Detection",
                "Object Counting",
                "Visual Grounding + Keypoint Detection",
                "Visual Grounding + Object Detection",
                "General query",
            ]
            category_input = gr.Dropdown(
                choices=categories, label="Category", interactive=True
            )
            generate_btn = gr.Button(value="Generate")

        with gr.Column(scale=1):
            output_image_model_1 = gr.Image(
                type="pil", label=f"Annotated image for {model_qwen_name}", height=400
            )
            output_textbox_model_1 = gr.Textbox(
                label=f"Model response for {model_qwen_name}", lines=10
            )
            output_time_model_1 = gr.Markdown()

        with gr.Column(scale=1):
            output_image_model_2 = gr.Image(
                type="pil",
                label=f"Annotated image for {model_moondream_name}",
                height=400,
            )
            output_textbox_model_2 = gr.Textbox(
                label=f"Model response for {model_moondream_name}", lines=10
            )
            output_time_model_2 = gr.Markdown()

    gr.Markdown("### Examples")
    example_prompts = [
        [
            "examples/example_1.jpg",
            "locate every instance in the image. Report bbox coordinates in JSON format.",
            "objects",
            "Object Detection",
        ],
        [
            "examples/example_2.JPG",
            'locate every instance that belongs to the following categories: "candy, hand". Report bbox coordinates in JSON format.',
            "candies",
            "Object Detection",
        ],
        [
            "examples/example_1.jpg",
            "Count the number of red cars in the image.",
            "Count the number of red cars in the image.",
            "Object Counting",
        ],
        [
            "examples/example_2.JPG",
            "Count the number of blue candies in the image.",
            "Count the number of blue candies in the image.",
            "Object Counting",
        ],
        [
            "examples/example_1.jpg",
            'locate every instance that belongs to the following categories: "red car". Report bbox coordinates in JSON format.',
            "red cars",
            "Visual Grounding + Keypoint Detection",
        ],
        [
            "examples/example_2.JPG",
            "Identify the blue candies in this image, detect their key points and return their positions in the form of points.",
            "blue candies",
            "Visual Grounding + Keypoint Detection",
        ],
        [
            "examples/example_1.jpg",
            'locate every instance that belongs to the following categories: "leading red car". Report bbox coordinates in JSON format.',
            "leading red car",
            "Visual Grounding + Object Detection",
        ],
        [
            "examples/example_2.JPG",
            'locate every instance that belongs to the following categories: "blue candy located at the top of the group". Report bbox coordinates in JSON format.',
            "blue candy located at the top of the group",
            "Visual Grounding + Object Detection",
        ],
    ]

    gr.Examples(
        examples=example_prompts,
        inputs=[
            image_input,
            prompt_input_model_1,
            prompt_input_model_2,
            category_input,
        ],
        label="Click an example to populate the input",
    )

    generate_btn.click(
        fn=detect,
        inputs=[
            image_input,
            prompt_input_model_1,
            prompt_input_model_2,
            category_input,
        ],
        outputs=[
            output_image_model_1,
            output_textbox_model_1,
            output_time_model_1,
            output_image_model_2,
            output_textbox_model_2,
            output_time_model_2,
        ],
    )

if __name__ == "__main__":
    demo.launch()