Skip to content

Instantly share code, notes, and snippets.

@MSiam
Last active January 31, 2026 12:32
Show Gist options
  • Select an option

  • Save MSiam/9d10ed8479f7b400e9b2af8a8bc64164 to your computer and use it in GitHub Desktop.

Select an option

Save MSiam/9d10ed8479f7b400e9b2af8a8bc64164 to your computer and use it in GitHub Desktop.
evaluate RefCOCO Qwen2.5/3-VL
import argparse
import itertools
import json
import os
import re
from functools import partial
import markdown
from bs4 import BeautifulSoup
import numpy as np
import ast
from PIL import Image, ImageDraw, ImageFont
from PIL import ImageColor
import torch
from torchvision.ops.boxes import box_area
from tqdm import tqdm
#from transformers import AutoModelForCausalLM, AutoTokenizer
# Every named colour Pillow knows about; appended to the hand-picked palette
# when visualising bounding boxes.
additional_colors = list(ImageColor.colormap)

# Maps each RefCOCO evaluation split name to its jsonl annotation file.
ds_collections = {
    'refcoco_val': 'data/refcoco/refcoco_val.jsonl',
    'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl',
    'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl',
    'refcoco+_val': 'data/refcoco+/refcoco+_val.jsonl',
    'refcoco+_testA': 'data/refcoco+/refcoco+_testA.jsonl',
    'refcoco+_testB': 'data/refcoco+/refcoco+_testB.jsonl',
    'refcocog_val': 'data/refcocog/refcocog_val.jsonl',
    'refcocog_test': 'data/refcocog/refcocog_test.jsonl',
}
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two sets of boxes in (x1, y1, x2, y2) format.

    Args:
        boxes1: Tensor of shape [N, 4].
        boxes2: Tensor of shape [M, 4].

    Returns:
        Tuple (iou, union), both tensors of shape [N, M].
    """
    # Per-box areas straight from the corner coordinates.
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    # Intersection rectangle of every (i, j) pair via broadcasting.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])      # [N, M, 2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N, M, 2]
    extent = (bottom_right - top_left).clamp(min=0)               # [N, M, 2]
    inter = extent[..., 0] * extent[..., 1]                       # [N, M]

    union = area1[:, None] + area2 - inter
    return inter / union, union
def parse_json(response):
    """Extract and decode a JSON payload from a markdown-formatted response.

    The model is expected to wrap its JSON answer in a fenced code block;
    the markdown is rendered to HTML and the first <code> element's text is
    parsed as JSON.

    Args:
        response: Raw model output string containing a fenced ``json`` block.

    Returns:
        The decoded JSON object (per the caller, typically a list of
        detection dicts).

    Raises:
        ValueError: If the response contains no fenced code block.
        json.JSONDecodeError: If the code block is not valid JSON.
    """
    # NOTE: per the original comment, this path is used by the InternVL
    # variant rather than Qwen.
    html = markdown.markdown(response, extensions=['fenced_code'])
    soup = BeautifulSoup(html, 'html.parser')
    code = soup.find('code')
    if code is None:
        # Previously this crashed with an opaque AttributeError on None;
        # raise a descriptive error instead (callers catch broadly anyway).
        raise ValueError('no fenced code block found in response')
    return json.loads(code.text)
def collate_fn(batches, tokenizer):
    """Collate dataset samples into a tokenized batch.

    Args:
        batches: List of sample dicts with 'text', 'bbox' and 'hw' keys.
        tokenizer: Callable producing an encoding with ``input_ids`` and
            ``attention_mask`` attributes.

    Returns:
        Tuple of (input_ids, attention_mask, list of bboxes, list of hws).
    """
    texts, bboxes, hws = [], [], []
    for sample in batches:
        texts.append(sample['text'])
        bboxes.append(sample['bbox'])
        hws.append(sample['hw'])
    encoded = tokenizer(texts, return_tensors='pt', padding='longest')
    return encoded.input_ids, encoded.attention_mask, bboxes, hws
class RefCOCODataset(torch.utils.data.Dataset):
    """Dataset over a RefCOCO-style jsonl annotation file.

    Each line of the file is a JSON object with at least the keys
    'image', 'sent', 'bbox', 'width' and 'height'.
    """

    def __init__(self, test):
        """Load all annotation lines from the jsonl file at path `test`."""
        # Use a context manager so the file handle is closed deterministically
        # (the original `open(test).readlines()` leaked it).
        with open(test) as f:
            self.datas = f.readlines()

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, idx):
        """Return one sample: text, image path, GT bbox [1,4] and size [1,2].

        'hw' is ordered (height, width), matching the jsonl fields.
        """
        data = json.loads(self.datas[idx].strip())
        image = data['image']
        text = data['sent']
        bbox = data['bbox']
        w, h = data['width'], data['height']
        return {
            'text': text,
            'image': image,
            'bbox': np.array(bbox).reshape(1, 4),
            'hw': np.array([h, w]).reshape(1, 2),
        }
def inference(img_url, prompt, system_prompt="You are a helpful assistant", max_new_tokens=1024):
    """Run one grounding query through the module-level Qwen-VL model.

    Relies on the global `processor` and `model` created in ``__main__``.

    Args:
        img_url: Path to the input image file.
        prompt: Full user prompt (referring expression already formatted in).
        system_prompt: System message prepended to the chat.
        max_new_tokens: Generation budget. Bug fix: this argument was
            previously ignored — `generate` hard-coded 1024.

    Returns:
        Tuple of (decoded answer text, model input height, model input width,
        original image size as (width, height)).
    """
    image = Image.open(img_url)
    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "image": img_url
                }
            ]
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to('cuda')
    # Honour the caller-supplied budget instead of the hard-coded 1024.
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Strip the prompt tokens so only the newly generated tokens are decoded.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # image_grid_thw holds (t, h, w) in vision patches; *14 converts patches
    # to pixels — assumes a 14-px patch size, TODO confirm for the checkpoint.
    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14
    return output_text[0], input_height, input_width, image.size
class InferenceSampler(torch.utils.data.sampler.Sampler):
    """Sampler yielding a contiguous shard of dataset indices.

    Distributed sharding is stubbed out here: rank is fixed to 0 and world
    size to 1, so the single shard spans the whole dataset.
    """

    def __init__(self, size):
        self._size = int(size)
        assert size > 0
        # Single-process mode; the torch.distributed rank/world-size lookups
        # are disabled in this script.
        self._rank = 0
        self._world_size = 1
        self._local_indices = self._get_local_indices(
            size, self._world_size, self._rank)

    @staticmethod
    def _get_local_indices(total_size, world_size, rank):
        # Distribute the remainder over the first `total_size % world_size`
        # ranks so shard sizes differ by at most one.
        base, remainder = divmod(total_size, world_size)
        sizes = [base + (1 if r < remainder else 0) for r in range(world_size)]
        start = sum(sizes[:rank])
        stop = min(start + sizes[rank], total_size)
        return range(start, stop)

    def __iter__(self):
        return iter(self._local_indices)

    def __len__(self):
        return len(self._local_indices)
def plot_bounding_boxes(im, bounding_boxes, color):
    """Draw bounding boxes (and optional labels) on an image and display it.

    Note: the original docstring claimed normalized [y1 x1 y2 x2] input, but
    the code consumes absolute [x1, y1, x2, y2] pixel coordinates.

    Args:
        im: PIL.Image to draw on (modified in place, then shown).
        bounding_boxes: Iterable of boxes in absolute [x1, y1, x2, y2] pixel
            coordinates; a box may optionally carry a "label" entry which is
            rendered next to the box.
        color: Outline/text colour applied to every box.
    """
    img = im
    print(img.size)
    draw = ImageDraw.Draw(img)
    # Prefer the CJK-capable font, but fall back to PIL's built-in default so
    # the function still works on machines without the .ttc installed
    # (previously this raised OSError and aborted the visualisation).
    try:
        font = ImageFont.truetype("NotoSansCJK-Regular.ttc", size=14)
    except OSError:
        font = ImageFont.load_default()
    for bounding_box in bounding_boxes:
        abs_x1 = bounding_box[0]
        abs_y1 = bounding_box[1]
        abs_x2 = bounding_box[2]
        abs_y2 = bounding_box[3]
        # Normalise corner order so x1 <= x2 and y1 <= y2.
        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1
        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1
        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=4
        )
        # Optional label (only meaningful when the box is a mapping; for a
        # plain coordinate list this membership test is simply False).
        if "label" in bounding_box:
            draw.text((abs_x1 + 8, abs_y1 + 6), bounding_box["label"], fill=color, font=font)
    # Display the annotated image in the system viewer.
    img.show()
if __name__ == '__main__':
    # CLI: the checkpoint path selects the model family (Qwen2.5 vs Qwen3);
    # --dataset picks one of the RefCOCO jsonl splits in ds_collections.
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', type=str, default='')
    parser.add_argument('--dataset', type=str, default='')
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--num-workers', type=int, default=1)
    parser.add_argument('--vis', action='store_true')
    # Skip the first N samples (useful for resuming a partial run).
    parser.add_argument('--start_at', type=int, default=0)
    args = parser.parse_args()
    if 'Qwen2.5' in args.checkpoint:
        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoTokenizer
        from qwen_vl_utils import process_vision_info
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(args.checkpoint, device_map='cuda', torch_dtype=torch.bfloat16)
        processor = AutoProcessor.from_pretrained(args.checkpoint)
        prompt = 'Locate the {}, output its bbox coordinates using JSON format.'
    elif 'Qwen3' in args.checkpoint:
        from transformers import AutoProcessor, AutoModelForVision2Seq
        from qwen_vl_utils import process_vision_info
        processor = AutoProcessor.from_pretrained(args.checkpoint, local_files_only=True)
        model, output_loading_info = AutoModelForVision2Seq.from_pretrained(args.checkpoint, torch_dtype="auto", device_map="auto",
                                                                            output_loading_info=True, local_files_only=True)
        prompt = 'Locate every object that matches the description {} in the image. Report coordinates in JSON format.'
    # NOTE(review): if the checkpoint name matches neither family, `model`,
    # `processor`, `prompt` (and later `normalized_by`) are never bound and
    # the script fails with a NameError.
    dataset = RefCOCODataset(test=ds_collections[args.dataset])
    dataloader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
        # NOTE(review): drop_last=True silently drops the final partial
        # batch, so a few samples are excluded from the evaluation.
        drop_last=True,
    )
    iter_ = -1
    outputs = []
    for didx, batch in enumerate(tqdm(dataloader)):
        messages = []  # NOTE(review): never used below.
        for img, txt, bbox, hw in zip(batch['image'], batch['text'], batch['bbox'], batch['hw']):
            iter_ += 1
            if iter_ < args.start_at:
                continue
            try:
                # inference() returns raw model text plus the model-side
                # input resolution and the original image (width, height).
                response, input_height, input_width, (width, height) = inference(img, prompt.format(txt))
                try:
                    # Preferred path: JSON inside a fenced markdown block.
                    response = parse_json(response)
                except:
                    # Fall back to treating the whole response as JSON.
                    response = json.loads(response)
            except:
                # NOTE(review): if inference() itself raises, `response`
                # (and width/height) keep their values from the previous
                # iteration — or are unbound on the very first sample.
                print('Expression: ', txt, ' Image: ', img)
            print('=====', response)
            # Qwen2.5 emits pixel coordinates at the model input resolution;
            # Qwen3 emits coordinates normalized to a 0-1000 grid.
            if 'Qwen2.5' in args.checkpoint:
                save_prefix = 'qwen2.5'
                normalized_by = [input_width, input_height]
            elif 'Qwen3' in args.checkpoint:
                save_prefix = 'qwen3'
                normalized_by = [1000, 1000]
            try:
                # Original Qwen-VL RefCOCO code always assumes one bounding
                # box; its loop iterates over samples within the batch.
                response = response[0]
                # Rescale the predicted bbox_2d ([x1, y1, x2, y2]) from the
                # model's coordinate space to original image pixels.
                abs_y1 = int(response["bbox_2d"][1]/normalized_by[1] * height)
                abs_x1 = int(response["bbox_2d"][0]/normalized_by[0] * width)
                abs_y2 = int(response["bbox_2d"][3]/normalized_by[1] * height)
                abs_x2 = int(response["bbox_2d"][2]/normalized_by[0] * width)
                # Normalise corner order so x1 <= x2 and y1 <= y2.
                if abs_x1 > abs_x2:
                    abs_x1, abs_x2 = abs_x2, abs_x1
                if abs_y1 > abs_y2:
                    abs_y1, abs_y2 = abs_y2, abs_y1
                if args.vis:
                    print("Expression: ", txt)
                    img = Image.open(img)
                    # Prediction in red, ground truth in blue.
                    plot_bounding_boxes(img, [[abs_x1, abs_y1, abs_x2, abs_y2]], 'red')
                    plot_bounding_boxes(img, [[int(b) for b in bbox[0]]], 'blue')
                outputs.append({
                    'answer': [abs_x1, abs_y1, abs_x2, abs_y2],
                    'gt_bbox': bbox,
                    'hw': hw,
                })
            except:
                # Parsing failed or no object was found: record a degenerate
                # box, which scores a guaranteed IoU miss below.
                outputs.append({
                    'answer': [0.0, 0.0, 0.0, 0.0],
                    'gt_bbox': bbox,
                    'hw': hw,
                })
    # Accuracy = fraction of predictions with IoU >= 0.5 against the GT box.
    correct = total_cnt = 0
    for i, output in enumerate(outputs):
        predict_bbox = output['answer']
        target_bbox = torch.tensor(output['gt_bbox'],
                                   dtype=torch.float32).view(-1, 4)
        predict_bbox = torch.tensor(predict_bbox,
                                    dtype=torch.float32).view(-1, 4)
        iou, _ = box_iou(predict_bbox, target_bbox)
        iou = iou.item()
        total_cnt += 1
        if iou >= 0.5:
            correct += 1
    print(f"Evaluating {args.dataset} ...")
    print(f'Precision @ 1: {correct / total_cnt} \n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment