# pip install git+https://github.com/huggingface/transformers.git
# transformers>=4.49
import os
import re
from typing import Literal

# Restrict inference to a single GPU; set this before importing swift so it
# takes effect when CUDA is initialized.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


def draw_bbox_qwen2_vl(image, response, norm_bbox: Literal['norm1000', 'none']):
    """Parse Qwen2-VL grounding tokens from the model response and draw the boxes."""
    # Each detected object is emitted as:
    # <|object_ref_start|>ref<|object_ref_end|><|box_start|>(x1,y1),(x2,y2)<|box_end|>
    matches = re.findall(
        r'<\|object_ref_start\|>(.*?)<\|object_ref_end\|><\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>',
        response)
    ref = []
    bbox = []
    for match_ in matches:
        ref.append(match_[0])  # object reference text
        bbox.append(list(match_[1:]))  # (x1, y1, x2, y2) coordinate strings
    draw_bbox(image, ref, bbox, norm_bbox=norm_bbox)


def infer_grounding():
    from swift.llm import PtEngine, RequestConfig, BaseArguments, InferRequest, safe_snapshot_download
    output_path = 'bbox.png'
    image = load_image('http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png')
    infer_request = InferRequest(messages=[{'role': 'user', 'content': 'Task: Object Detection'}], images=[image])
    request_config = RequestConfig(max_tokens=512, temperature=0)

    # Download the LoRA adapter and recover the training arguments saved with it.
    adapter_path = safe_snapshot_download('swift/test_grounding')
    args = BaseArguments.from_pretrained(adapter_path)

    # Load the base model with the LoRA adapter applied and run inference.
    engine = PtEngine(args.model, adapters=[adapter_path])
    resp_list = engine.infer([infer_request], request_config)
    response = resp_list[0].choices[0].message.content
    print(f'lora-response: {response}')

    draw_bbox_qwen2_vl(image, response, norm_bbox=args.norm_bbox)
    print(f'output_path: {output_path}')
    image.save(output_path)


if __name__ == '__main__':
    from swift.llm import draw_bbox, load_image
    infer_grounding()
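
# A minimal sketch of what draw_bbox_qwen2_vl extracts, assuming a hand-written
# response in the Qwen2-VL grounding format (the object name and coordinates
# below are illustrative, not real model output):
#
#   sample = ('<|object_ref_start|>dog<|object_ref_end|>'
#             '<|box_start|>(221,423),(569,886)<|box_end|>')
#   re.findall(
#       r'<\|object_ref_start\|>(.*?)<\|object_ref_end\|><\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>',
#       sample)
#   # -> [('dog', '221', '423', '569', '886')]
#
# With norm_bbox='norm1000', coordinates are interpreted as normalized to
# [0, 1000] and rescaled to the image size when drawing.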