Last active
January 31, 2026 12:32
-
-
Save MSiam/9d10ed8479f7b400e9b2af8a8bc64164 to your computer and use it in GitHub Desktop.
evaluate RefCOCO Qwen2.5/3-VL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import argparse | |
| import itertools | |
| import json | |
| import os | |
| import re | |
| from functools import partial | |
| import markdown | |
| from bs4 import BeautifulSoup | |
| import numpy as np | |
| import ast | |
| from PIL import Image, ImageDraw, ImageFont | |
| from PIL import ImageColor | |
| import torch | |
| from torchvision.ops.boxes import box_area | |
| from tqdm import tqdm | |
| #from transformers import AutoModelForCausalLM, AutoTokenizer | |
# Every named color PIL knows about, appended to the hand-picked palette in
# plot_bounding_boxes so there are enough distinct colors for many boxes.
additional_colors = [colorname for (colorname, colorcode) in ImageColor.colormap.items()]

# Map from the --dataset CLI value to the JSONL annotation file of that
# RefCOCO / RefCOCO+ / RefCOCOg split.
ds_collections = {
    'refcoco_val': 'data/refcoco/refcoco_val.jsonl',
    'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl',
    'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl',
    'refcoco+_val': 'data/refcoco+/refcoco+_val.jsonl',
    'refcoco+_testA': 'data/refcoco+/refcoco+_testA.jsonl',
    'refcoco+_testB': 'data/refcoco+/refcoco+_testB.jsonl',
    'refcocog_val': 'data/refcocog/refcocog_val.jsonl',
    'refcocog_test': 'data/refcocog/refcocog_test.jsonl',
}
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two sets of boxes in (x1, y1, x2, y2) form.

    Args:
        boxes1: [N, 4] tensor of boxes.
        boxes2: [M, 4] tensor of boxes.

    Returns:
        (iou, union): [N, M] tensors holding the IoU and union area of
        every (boxes1, boxes2) pair.
    """
    # Per-box areas, (x2 - x1) * (y2 - y1); inlines torchvision's box_area.
    areas_a = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    areas_b = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    # Intersection rectangle of every pair via [N, 1, 2] x [M, 2] broadcasting.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])       # [N, M, 2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])   # [N, M, 2]
    extent = (bottom_right - top_left).clamp(min=0)                # [N, M, 2]
    intersection = extent[..., 0] * extent[..., 1]                 # [N, M]

    union = areas_a[:, None] + areas_b - intersection
    return intersection / union, union
def parse_json(response):
    """Extract and decode the JSON payload from a model response.

    The model typically wraps its answer in a fenced markdown block
    (```json ... ```).  The previous implementation rendered the whole
    response with the third-party `markdown` + BeautifulSoup stack just to
    find the first code element, and died with an opaque AttributeError
    when none existed; a stdlib regex does the same job.

    Args:
        response: Raw model output text (markdown).

    Returns:
        The decoded JSON object (typically a list of bbox dicts).

    Raises:
        ValueError: If the response contains no code block.
        json.JSONDecodeError: If the code block is not valid JSON.
        (Callers catch any exception and retry json.loads on the raw text.)
    """
    # Fenced block first; allow an optional language tag after the fence.
    match = re.search(r"```[A-Za-z0-9_+-]*\s*\n(.*?)```", response, re.DOTALL)
    if match is None:
        # Fall back to an inline `...` code span.
        match = re.search(r"`([^`]+)`", response)
    if match is None:
        raise ValueError('no code block found in response')
    return json.loads(match.group(1))
def collate_fn(batches, tokenizer):
    """Collate dataset items into a padded text batch.

    Args:
        batches: List of dicts carrying 'text', 'bbox' and 'hw' entries.
        tokenizer: HF-style tokenizer; invoked with padding='longest'.

    Returns:
        (input_ids, attention_mask, bboxes, hws) — the tokenized texts plus
        the per-sample bbox/hw arrays passed through untouched.
    """
    texts, bboxes, hws = [], [], []
    for item in batches:
        texts.append(item['text'])
        bboxes.append(item['bbox'])
        hws.append(item['hw'])
    encoded = tokenizer(texts, return_tensors='pt', padding='longest')
    return encoded.input_ids, encoded.attention_mask, bboxes, hws
class RefCOCODataset(torch.utils.data.Dataset):
    """Dataset over a RefCOCO-style JSONL file (one JSON record per line)."""

    def __init__(self, test):
        # Hold the raw lines; each record is decoded lazily in __getitem__.
        with open(test) as f:
            self.datas = f.readlines()

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, idx):
        record = json.loads(self.datas[idx].strip())
        height, width = record['height'], record['width']
        return {
            'text': record['sent'],        # referring expression
            'image': record['image'],      # image path (opened later)
            'bbox': np.array(record['bbox']).reshape(1, 4),
            'hw': np.array([height, width]).reshape(1, 2),
        }
def inference(img_url, prompt, system_prompt="You are a helpful assistant", max_new_tokens=1024):
    """Run one grounding query through the module-level Qwen-VL model.

    Relies on the globals `processor` and `model` created in __main__.

    Args:
        img_url: Path to the image file.
        prompt: Fully formatted user prompt (referring expression included).
        system_prompt: System message for the chat template.
        max_new_tokens: Generation budget.  Bug fix: this was previously
            ignored — generate() hard-coded 1024 regardless of the argument.

    Returns:
        (output_text, input_height, input_width, (width, height)):
        the decoded response, the vision-encoder input resolution, and the
        original image size.
    """
    image = Image.open(img_url)
    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "image": img_url
                }
            ]
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to('cuda')
    # Honor the caller-supplied budget instead of the hard-coded 1024.
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # image_grid_thw is (t, h, w) in vision patches; 14 px is the patch size
    # used by these Qwen-VL processors — TODO confirm against the checkpoint.
    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14
    return output_text[0], input_height, input_width, image.size
class InferenceSampler(torch.utils.data.sampler.Sampler):
    """Sampler yielding this process's contiguous shard of [0, size).

    Distributed sharding is currently disabled: rank is pinned to 0 and
    world size to 1, so the full index range is produced in order.
    """

    def __init__(self, size):
        self._size = int(size)
        assert size > 0
        # torch.distributed rank/world-size lookups are commented out
        # upstream; run as a single process.
        self._rank = 0
        self._world_size = 1
        self._local_indices = self._get_local_indices(size, self._world_size, self._rank)

    @staticmethod
    def _get_local_indices(total_size, world_size, rank):
        # Spread the remainder over the first (total_size % world_size)
        # ranks so shard sizes differ by at most one.
        base, remainder = divmod(total_size, world_size)
        shard_sizes = [base + (r < remainder) for r in range(world_size)]
        begin = sum(shard_sizes[:rank])
        end = min(begin + shard_sizes[rank], total_size)
        return range(begin, end)

    def __iter__(self):
        return iter(self._local_indices)

    def __len__(self):
        return len(self._local_indices)
def plot_bounding_boxes(im, bounding_boxes, color):
    """Draw bounding boxes (and optional labels) on an image and display it.

    Fixes vs the previous version: the old docstring claimed normalized
    [y1 x1 y2 x2] input, but the code consumes absolute pixel coordinates in
    [x1, y1, x2, y2] order; the 23-entry local color palette was dead code
    (a single `color` is used for every box); and the hard-coded CJK font
    crashed the whole run when not installed — it now falls back to PIL's
    built-in font.

    Args:
        im: PIL.Image to draw on (modified in place, then shown).
        bounding_boxes: Iterable of [x1, y1, x2, y2] boxes in absolute pixel
            coordinates; a dict-style box with a "label" key also gets its
            label rendered next to the top-left corner.
        color: Outline/text color applied to every box.
    """
    img = im
    print(img.size)
    draw = ImageDraw.Draw(img)
    # Fall back to the default bitmap font instead of aborting evaluation
    # when NotoSansCJK is not installed on the machine.
    try:
        font = ImageFont.truetype("NotoSansCJK-Regular.ttc", size=14)
    except OSError:
        font = ImageFont.load_default()
    for bounding_box in bounding_boxes:
        abs_x1 = bounding_box[0]
        abs_y1 = bounding_box[1]
        abs_x2 = bounding_box[2]
        abs_y2 = bounding_box[3]
        # Normalize corner order so (x1, y1) is the top-left corner.
        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1
        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1
        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=4
        )
        # Only dict-style boxes can carry a label ("label" in a list is False).
        if "label" in bounding_box:
            draw.text((abs_x1 + 8, abs_y1 + 6), bounding_box["label"], fill=color, font=font)
    # Display the annotated image in the system viewer.
    img.show()
if __name__ == '__main__':
    # Single-process RefCOCO grounding evaluation: run the model on every
    # referring expression, decode the predicted box, score IoU >= 0.5.
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', type=str, default='')  # model path; must contain 'Qwen2.5' or 'Qwen3'
    parser.add_argument('--dataset', type=str, default='')  # key into ds_collections
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--num-workers', type=int, default=1)
    parser.add_argument('--vis', action='store_true')  # show predicted (red) vs ground-truth (blue) boxes
    parser.add_argument('--start_at', type=int, default=0)  # skip samples before this index (crude resume)
    args = parser.parse_args()

    # Load the model family selected by the checkpoint name; each family uses
    # a different grounding prompt and, below, a different coordinate space.
    # NOTE(review): a checkpoint matching neither branch leaves model /
    # processor / prompt undefined and the script fails later with NameError.
    if 'Qwen2.5' in args.checkpoint:
        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoTokenizer
        from qwen_vl_utils import process_vision_info
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(args.checkpoint, device_map='cuda', torch_dtype=torch.bfloat16)
        processor = AutoProcessor.from_pretrained(args.checkpoint)
        prompt = 'Locate the {}, output its bbox coordinates using JSON format.'
    elif 'Qwen3' in args.checkpoint:
        from transformers import AutoProcessor, AutoModelForVision2Seq
        from qwen_vl_utils import process_vision_info
        processor = AutoProcessor.from_pretrained(args.checkpoint, local_files_only=True)
        model, output_loading_info = AutoModelForVision2Seq.from_pretrained(args.checkpoint, torch_dtype="auto", device_map="auto",
                                                                            output_loading_info=True, local_files_only=True)
        prompt = 'Locate every object that matches the description {} in the image. Report coordinates in JSON format.'

    dataset = RefCOCODataset(test=ds_collections[args.dataset])
    # NOTE(review): drop_last=True discards the trailing partial batch, so a
    # few samples at the end of the split are never evaluated.
    dataloader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )

    iter_ = -1
    outputs = []  # one entry per evaluated sample: predicted box, GT box, (h, w)
    for didx, batch in enumerate(tqdm(dataloader)):
        messages = []
        for img, txt, bbox,hw in zip(batch['image'], batch['text'], batch['bbox'], batch['hw']):
            iter_ += 1
            if iter_ < args.start_at:
                continue
            try:
                # Ask the model where the referred object is, then decode the
                # JSON answer (markdown-fenced first, raw JSON as fallback).
                response, input_height, input_width, (width, height) = inference(img, prompt.format(txt))
                try:
                    response = parse_json(response)
                except:
                    response = json.loads(response)
            except:
                # NOTE(review): if inference() itself raised, `response` (and
                # width/height) are unbound here, so the print below raises
                # NameError; the bare except also hides the real error.
                print('Expression: ', txt, ' Image: ', img)
            print('=====', response)
            # Coordinate space of the predicted boxes per model family:
            # Qwen2.5 appears to report pixels at the vision-encoder input
            # resolution, Qwen3 coordinates on a 1000x1000 grid — assumption
            # based on the constants used here; verify against model docs.
            if 'Qwen2.5' in args.checkpoint:
                save_prefix = 'qwen2.5'
                normalized_by = [input_width, input_height]
            elif 'Qwen3' in args.checkpoint:
                save_prefix = 'qwen3'
                normalized_by = [1000, 1000]
            try:
                #Original Qwen-VL refCOCO code always assume one BB
                # Their for loop goes through multiple samples within the batch
                response = response[0]
                # Rescale the predicted box into original-image pixel space.
                abs_y1 = int(response["bbox_2d"][1]/normalized_by[1] * height)
                abs_x1 = int(response["bbox_2d"][0]/normalized_by[0] * width)
                abs_y2 = int(response["bbox_2d"][3]/normalized_by[1] * height)
                abs_x2 = int(response["bbox_2d"][2]/normalized_by[0] * width)
                # Normalize corner order so (x1, y1) is the top-left corner.
                if abs_x1 > abs_x2:
                    abs_x1, abs_x2 = abs_x2, abs_x1
                if abs_y1 > abs_y2:
                    abs_y1, abs_y2 = abs_y2, abs_y1
                if args.vis:
                    print("Expression: ", txt)
                    img = Image.open(img)
                    plot_bounding_boxes(img, [[abs_x1, abs_y1, abs_x2, abs_y2]], 'red')
                    plot_bounding_boxes(img, [[int(b) for b in bbox[0]]], 'blue')
                outputs.append({
                    'answer': [abs_x1, abs_y1, abs_x2, abs_y2],
                    'gt_bbox': bbox,
                    'hw': hw,
                })
            except:
                # Issues in parsing the response or found no objects
                # (scored as a guaranteed miss via a degenerate all-zero box).
                outputs.append({
                    'answer': [0.0, 0.0, 0.0, 0.0],
                    'gt_bbox': bbox,
                    'hw': hw,
                })

    # Accuracy: a prediction is correct when IoU with the single ground-truth
    # box is at least 0.5 (Precision@1 under the standard 0.5 threshold).
    correct = total_cnt = 0
    for i, output in enumerate(outputs):
        predict_bbox = output['answer']
        target_bbox = torch.tensor(output['gt_bbox'],
                                   dtype=torch.float32).view(-1, 4)
        predict_bbox = torch.tensor(predict_bbox,
                                    dtype=torch.float32).view(-1, 4)
        iou, _ = box_iou(predict_bbox, target_bbox)
        iou = iou.item()
        total_cnt += 1
        if iou >= 0.5:
            correct += 1

    print(f"Evaluating {args.dataset} ...")
    print(f'Precision @ 1: {correct / total_cnt} \n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment