hbXNov
/

Qwen3-VL-2B-Instruct-HoneyBee

+import os
+import logging
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Union
+import torch
+from datasets import load_dataset
+import json
+from tqdm import tqdm
+from PIL import Image
+import requests
+from io import BytesIO
+import argparse
+from pathlib import Path
+from enum import Enum
+# Import custom modules
+from data import (
+    DatasetType,
+    DatasetConfig,
+    get_dataset_config,
+    get_formatted_instruction,
+    process_response,
+    save_descriptions,
+    load_image_dataset,
+    get_processed_response
+)
+from torch.utils.data import Dataset, DataLoader, DistributedSampler
+import torch.distributed as dist
+from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
+from vllm import LLM, SamplingParams
+import io
+import base64
+from PIL import Image
+# Logging goes both to 'evaluation.log' and to the console stream,
+# at INFO level, with a timestamp / level / message layout.
+logging.basicConfig(
+    handlers=[logging.FileHandler('evaluation.log'), logging.StreamHandler()],
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+# Suffix appended to every formatted instruction before it is sent to the model.
+INSTRUCTION = "\n\nYour final answer MUST BE put in \\boxed{}."
+def pil_to_base64(image_pil, format="PNG"):
+    """Encode an image as a base64 text string.
+
+    The image is written into an in-memory buffer through its ``save``
+    method using the requested ``format``; the buffer's bytes are then
+    base64-encoded and returned as a UTF-8 decoded ``str``.
+    """
+    with io.BytesIO() as stream:
+        image_pil.save(stream, format=format)
+        raw_bytes = stream.getvalue()
+    return base64.b64encode(raw_bytes).decode("utf-8")
+def base64_to_pil(base64_string):
+    """Decode a base64 string and open the resulting bytes with ``Image.open``.
+
+    This is the inverse direction of ``pil_to_base64``: the decoded bytes are
+    wrapped in an in-memory buffer and handed to PIL.
+    """
+    decoded_stream = io.BytesIO(base64.b64decode(base64_string))
+    return Image.open(decoded_stream)
+class InstanceDataset(Dataset):
+    """Dataset wrapper that makes each record safe to batch.
+
+    Each returned sample is ``{'index': index, 'item': item}`` where ``item``
+    is a copy of the underlying record with:
+
+    * ``'options'`` / ``'choices'`` converted to a string (``""`` when the
+      value is ``None``, ``str(value)`` otherwise);
+    * ``'image_url'`` (when present) replaced by the base64 text produced by
+      ``pil_to_base64``.
+    """
+
+    def __init__(self, data):
+        # ``data`` only needs to support ``len()`` and integer indexing.
+        self.data = data
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        # Work on a shallow copy: rewriting the stored record in place would
+        # make a second access to the same index (with list-backed data) feed
+        # an already-encoded string into ``pil_to_base64``.
+        item = dict(self.data[index])
+        for key in ('options', 'choices'):
+            if key in item:
+                value = item[key]
+                # Identity check: ``== None`` is ambiguous for array-like values.
+                item[key] = "" if value is None else str(value)
+        if 'image_url' in item:
+            item['image_url'] = pil_to_base64(item['image_url'])
+        return {'index': index, 'item': item}
+def _parse_args():
+    """Parse the command-line options of the evaluation script."""
+    parser = argparse.ArgumentParser(description='Evaluate model on various math datasets')
+    parser.add_argument('--dataset', type=str, choices=['mathvista', 'mathverse', 'mathvision', 'mathvision-mini', 'hallusionbench', 'mmmu-pro-vision', 'we-math', 'math500', 'gpqa', 'dynamath', 'logicvista'],
+                      default='mathvista', help='Dataset to evaluate on')
+    parser.add_argument('--model_path', type=str, help='Path to the model', default="Qwen/Qwen3-VL-2B-Instruct")
+    parser.add_argument('--name', type=str, help='model save name', default="plm")
+    parser.add_argument('--bsz', type=int, help='batch size', default=2)
+    return parser.parse_args()
+
+
+def _restore_item(batch_item, position):
+    """Rebuild the sample at ``position`` from a collated batch of items.
+
+    Undoes the conversions applied in ``InstanceDataset.__getitem__``: the
+    stringified ``'choices'`` / ``'options'`` are parsed back into Python
+    literals when possible, and the base64 ``'image_url'`` is decoded with
+    ``base64_to_pil``.
+    """
+    # Local import keeps this block self-contained; literal_eval replaces the
+    # previous eval() so dataset text is never executed as code.
+    import ast
+
+    item = {key: values[position] for key, values in batch_item.items()}
+    for key in ('choices', 'options'):
+        raw = item.get(key)
+        if isinstance(raw, str) and raw:
+            try:
+                item[key] = ast.literal_eval(raw)
+            except (ValueError, SyntaxError):
+                # Not a Python literal: keep the raw string, as before.
+                pass
+    if 'image_url' in item:
+        item['image_url'] = base64_to_pil(item['image_url'])
+    return item
+
+
+def _build_request(processor, item, formatted_instruction):
+    """Build the vLLM input dict (prompt text plus optional image) for one sample."""
+    has_image = 'image_url' in item
+    content = [{"type": "text", "text": formatted_instruction}]
+    if has_image:
+        # The image entry precedes the text, matching the chat message layout.
+        content.insert(0, {"type": "image", "image": ""})
+    message = [{"role": "user", "content": content}]
+    text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
+    request = {'prompt': text}
+    if has_image:
+        request['multi_modal_data'] = {'image': item['image_url']}
+    return request
+
+
+def main():
+    """Run the model over one dataset and write one JSON result per sample.
+
+    Results go to ``./outputs/<dataset>_<name>/<index>.json``. Samples whose
+    result file already exists are skipped, so an interrupted run can be
+    resumed. Each batch is generated with a single ``vlm.generate`` call and
+    every answer is written to the file of its own dataset index.
+    """
+    args = _parse_args()
+
+    # Configuration
+    dataset_type = DatasetType(args.dataset)
+    dataset_config = get_dataset_config(dataset_type)
+    output_folder = f"./outputs/{dataset_type.value}_{args.name}"
+    os.makedirs(output_folder, exist_ok=True)
+
+    # Load model
+    processor = AutoProcessor.from_pretrained(args.model_path)
+    vlm = LLM(args.model_path, limit_mm_per_prompt={"image": 1}, tensor_parallel_size=torch.cuda.device_count())
+    sampling_params = SamplingParams(max_tokens=2048, temperature=0.7, top_p=0.8, top_k=20, repetition_penalty=1.0, presence_penalty=1.5)
+
+    # Load dataset
+    logger.info("Loading dataset %s", dataset_config.name)
+    data = load_image_dataset(dataset_config)
+    dataloader = DataLoader(InstanceDataset(data), batch_size=args.bsz)
+
+    for batch in tqdm(dataloader):
+        vlm_inputs = []
+        pending = []
+        for position, index_tensor in enumerate(batch['index']):
+            index = index_tensor.item()
+            output_file = os.path.join(output_folder, f'{index}.json')
+            if os.path.exists(output_file):
+                # Already evaluated in an earlier run.
+                continue
+            item = _restore_item(batch['item'], position)
+            formatted_instruction = get_formatted_instruction(dataset_type, item) + INSTRUCTION
+            vlm_inputs.append(_build_request(processor, item, formatted_instruction))
+            pending.append({
+                'index': index,
+                'output_file': output_file,
+                'item': item,
+                'formatted_instruction': formatted_instruction,
+                'processed_response': get_processed_response(dataset_type, item),
+            })
+
+        if not pending:
+            # Every sample of this batch already has a result file.
+            continue
+
+        # One generation call per batch; outputs are matched to requests by
+        # position, exactly as the request list was built.
+        outputs = vlm.generate(vlm_inputs, sampling_params=sampling_params)
+        for record, output in zip(pending, outputs):
+            item = record['item']
+            # The decoded image is not JSON-serializable; drop it before dumping.
+            item.pop('image_url', None)
+            description = {
+                # Dataset index of the sample (not its position in the batch).
+                'index': record['index'],
+                'item': json.dumps(item),
+                'formatted_instruction': record['formatted_instruction'],
+                'processed_response': record['processed_response'],
+                'answer': output.outputs[0].text
+            }
+            with open(record['output_file'], 'w', encoding='utf-8') as f:
+                json.dump(description, f, indent=4)
+# Script entry point: run the evaluation only when this file is executed
+# directly, not when it is imported.
+if __name__ == "__main__":
+    main()
+#
+# Example launch command (presumably for this file saved as
+# eval_qwen_multi_vllm.py; the flags match the argparse options in main()):
+#  VLLM_WORKER_MULTIPROC_METHOD=spawn  VLLM_DISABLE_COMPILE_CACHE=1 CUDA_VISIBLE_DEVICES=3,4,5,6 python eval_qwen_multi_vllm.py --dataset mathvista --name qwen3_vl_2b_instruct_vllm
+#