# pip install git+https://github.com/huggingface/transformers.git
# transformers>=4.49
import os
import re
from typing import Literal

# Restrict inference to a single GPU; set this before importing swift so it
# takes effect when CUDA is initialized.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'


def draw_bbox_qwen2_vl(image, response, norm_bbox: Literal['norm1000', 'none']):
    """Parse Qwen2-VL grounding tokens from the model response and draw the boxes."""
    # Each detected object is emitted as:
    # <|object_ref_start|>ref<|object_ref_end|><|box_start|>(x1,y1),(x2,y2)<|box_end|>
    matches = re.findall(
        r'<\|object_ref_start\|>(.*?)<\|object_ref_end\|><\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>',
        response)
    ref = []
    bbox = []
    for match_ in matches:
        ref.append(match_[0])  # object reference text
        bbox.append(list(match_[1:]))  # (x1, y1, x2, y2) coordinate strings
    draw_bbox(image, ref, bbox, norm_bbox=norm_bbox)


def infer_grounding():
    from swift.llm import PtEngine, RequestConfig, BaseArguments, InferRequest, safe_snapshot_download
    output_path = 'bbox.png'
    image = load_image('http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png')
    infer_request = InferRequest(messages=[{'role': 'user', 'content': 'Task: Object Detection'}], images=[image])
    request_config = RequestConfig(max_tokens=512, temperature=0)

    # Download the LoRA adapter and recover the training arguments saved with it.
    adapter_path = safe_snapshot_download('swift/test_grounding')
    args = BaseArguments.from_pretrained(adapter_path)

    # Load the base model with the LoRA adapter applied and run inference.
    engine = PtEngine(args.model, adapters=[adapter_path])
    resp_list = engine.infer([infer_request], request_config)
    response = resp_list[0].choices[0].message.content
    print(f'lora-response: {response}')

    draw_bbox_qwen2_vl(image, response, norm_bbox=args.norm_bbox)
    print(f'output_path: {output_path}')
    image.save(output_path)


if __name__ == '__main__':
    from swift.llm import draw_bbox, load_image
    infer_grounding()
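
# A minimal sketch of what draw_bbox_qwen2_vl extracts, assuming a hand-written
# response in the Qwen2-VL grounding format (the object name and coordinates
# below are illustrative, not real model output):
#
#   sample = ('<|object_ref_start|>dog<|object_ref_end|>'
#             '<|box_start|>(221,423),(569,886)<|box_end|>')
#   re.findall(
#       r'<\|object_ref_start\|>(.*?)<\|object_ref_end\|><\|box_start\|>\((\d+),(\d+)\),\((\d+),(\d+)\)<\|box_end\|>',
#       sample)
#   # -> [('dog', '221', '423', '569', '886')]
#
# With norm_bbox='norm1000', coordinates are interpreted as normalized to
# [0, 1000] and rescaled to the image size when drawing.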