Skip to content

Instantly share code, notes, and snippets.

@MSiam
Last active January 31, 2026 12:32
Show Gist options
  • Select an option

  • Save MSiam/9d10ed8479f7b400e9b2af8a8bc64164 to your computer and use it in GitHub Desktop.

Select an option

Save MSiam/9d10ed8479f7b400e9b2af8a8bc64164 to your computer and use it in GitHub Desktop.
evaluate RefCOCO Qwen2.5/3-VL
import argparse
import itertools
import json
import os
import re
from functools import partial
import markdown
from bs4 import BeautifulSoup
import numpy as np
import ast
from PIL import Image, ImageDraw, ImageFont
from PIL import ImageColor
import torch
from torchvision.ops.boxes import box_area
from tqdm import tqdm
#from transformers import AutoModelForCausalLM, AutoTokenizer
# Every named colour Pillow knows about; appended to the hand-picked palette
# when visualising bounding boxes.
additional_colors = list(ImageColor.colormap)

# Maps each RefCOCO evaluation split name to its jsonl annotation file.
ds_collections = {
    'refcoco_val': 'data/refcoco/refcoco_val.jsonl',
    'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl',
    'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl',
    'refcoco+_val': 'data/refcoco+/refcoco+_val.jsonl',
    'refcoco+_testA': 'data/refcoco+/refcoco+_testA.jsonl',
    'refcoco+_testB': 'data/refcoco+/refcoco+_testB.jsonl',
    'refcocog_val': 'data/refcocog/refcocog_val.jsonl',
    'refcocog_test': 'data/refcocog/refcocog_test.jsonl',
}
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two sets of boxes in (x1, y1, x2, y2) format.

    Args:
        boxes1: Tensor of shape [N, 4].
        boxes2: Tensor of shape [M, 4].

    Returns:
        Tuple (iou, union), both tensors of shape [N, M].
    """
    # Per-box areas straight from the corner coordinates.
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    # Intersection rectangle of every (i, j) pair via broadcasting.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])      # [N, M, 2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N, M, 2]
    extent = (bottom_right - top_left).clamp(min=0)               # [N, M, 2]
    inter = extent[..., 0] * extent[..., 1]                       # [N, M]

    union = area1[:, None] + area2 - inter
    return inter / union, union
def parse_json(response):
    """Extract and decode a JSON payload from a markdown-formatted response.

    The model is expected to wrap its JSON answer in a fenced code block;
    the markdown is rendered to HTML and the first <code> element's text is
    parsed as JSON.

    Args:
        response: Raw model output string containing a fenced ``json`` block.

    Returns:
        The decoded JSON object (per the caller, typically a list of
        detection dicts).

    Raises:
        ValueError: If the response contains no fenced code block.
        json.JSONDecodeError: If the code block is not valid JSON.
    """
    # NOTE: per the original comment, this path is used by the InternVL
    # variant rather than Qwen.
    html = markdown.markdown(response, extensions=['fenced_code'])
    soup = BeautifulSoup(html, 'html.parser')
    code = soup.find('code')
    if code is None:
        # Previously this crashed with an opaque AttributeError on None;
        # raise a descriptive error instead (callers catch broadly anyway).
        raise ValueError('no fenced code block found in response')
    return json.loads(code.text)
def collate_fn(batches, tokenizer):
    """Collate dataset samples into a tokenized batch.

    Args:
        batches: List of sample dicts with 'text', 'bbox' and 'hw' keys.
        tokenizer: Callable producing an encoding with ``input_ids`` and
            ``attention_mask`` attributes.

    Returns:
        Tuple of (input_ids, attention_mask, list of bboxes, list of hws).
    """
    texts, bboxes, hws = [], [], []
    for sample in batches:
        texts.append(sample['text'])
        bboxes.append(sample['bbox'])
        hws.append(sample['hw'])
    encoded = tokenizer(texts, return_tensors='pt', padding='longest')
    return encoded.input_ids, encoded.attention_mask, bboxes, hws
class RefCOCODataset(torch.utils.data.Dataset):
    """Dataset over a RefCOCO-style jsonl annotation file.

    Each line of the file is a JSON object with at least the keys
    'image', 'sent', 'bbox', 'width' and 'height'.
    """

    def __init__(self, test):
        """Load all annotation lines from the jsonl file at path `test`."""
        # Use a context manager so the file handle is closed deterministically
        # (the original `open(test).readlines()` leaked it).
        with open(test) as f:
            self.datas = f.readlines()

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, idx):
        """Return one sample: text, image path, GT bbox [1,4] and size [1,2].

        'hw' is ordered (height, width), matching the jsonl fields.
        """
        data = json.loads(self.datas[idx].strip())
        image = data['image']
        text = data['sent']
        bbox = data['bbox']
        w, h = data['width'], data['height']
        return {
            'text': text,
            'image': image,
            'bbox': np.array(bbox).reshape(1, 4),
            'hw': np.array([h, w]).reshape(1, 2),
        }
def inference(img_url, prompt, system_prompt="You are a helpful assistant", max_new_tokens=1024):
    """Run one grounding query through the module-level Qwen-VL model.

    Relies on the global `processor` and `model` created in ``__main__``.

    Args:
        img_url: Path to the input image file.
        prompt: Full user prompt (referring expression already formatted in).
        system_prompt: System message prepended to the chat.
        max_new_tokens: Generation budget. Bug fix: this argument was
            previously ignored — `generate` hard-coded 1024.

    Returns:
        Tuple of (decoded answer text, model input height, model input width,
        original image size as (width, height)).
    """
    image = Image.open(img_url)
    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "image": img_url
                }
            ]
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to('cuda')
    # Honour the caller-supplied budget instead of the hard-coded 1024.
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Strip the prompt tokens so only the newly generated tokens are decoded.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # image_grid_thw holds (t, h, w) in vision patches; *14 converts patches
    # to pixels — assumes a 14-px patch size, TODO confirm for the checkpoint.
    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14
    return output_text[0], input_height, input_width, image.size
class InferenceSampler(torch.utils.data.sampler.Sampler):
    """Sampler yielding a contiguous shard of dataset indices.

    Distributed sharding is stubbed out here: rank is fixed to 0 and world
    size to 1, so the single shard spans the whole dataset.
    """

    def __init__(self, size):
        self._size = int(size)
        assert size > 0
        # Single-process mode; the torch.distributed rank/world-size lookups
        # are disabled in this script.
        self._rank = 0
        self._world_size = 1
        self._local_indices = self._get_local_indices(
            size, self._world_size, self._rank)

    @staticmethod
    def _get_local_indices(total_size, world_size, rank):
        # Distribute the remainder over the first `total_size % world_size`
        # ranks so shard sizes differ by at most one.
        base, remainder = divmod(total_size, world_size)
        sizes = [base + (1 if r < remainder else 0) for r in range(world_size)]
        start = sum(sizes[:rank])
        stop = min(start + sizes[rank], total_size)
        return range(start, stop)

    def __iter__(self):
        return iter(self._local_indices)

    def __len__(self):
        return len(self._local_indices)
def plot_bounding_boxes(im, bounding_boxes, color):
    """Draw bounding boxes (and optional labels) on an image and display it.

    Note: the original docstring claimed normalized [y1 x1 y2 x2] input, but
    the code consumes absolute [x1, y1, x2, y2] pixel coordinates.

    Args:
        im: PIL.Image to draw on (modified in place, then shown).
        bounding_boxes: Iterable of boxes in absolute [x1, y1, x2, y2] pixel
            coordinates; a box may optionally carry a "label" entry which is
            rendered next to the box.
        color: Outline/text colour applied to every box.
    """
    img = im
    print(img.size)
    draw = ImageDraw.Draw(img)
    # Prefer the CJK-capable font, but fall back to PIL's built-in default so
    # the function still works on machines without the .ttc installed
    # (previously this raised OSError and aborted the visualisation).
    try:
        font = ImageFont.truetype("NotoSansCJK-Regular.ttc", size=14)
    except OSError:
        font = ImageFont.load_default()
    for bounding_box in bounding_boxes:
        abs_x1 = bounding_box[0]
        abs_y1 = bounding_box[1]
        abs_x2 = bounding_box[2]
        abs_y2 = bounding_box[3]
        # Normalise corner order so x1 <= x2 and y1 <= y2.
        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1
        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1
        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=4
        )
        # Optional label (only meaningful when the box is a mapping; for a
        # plain coordinate list this membership test is simply False).
        if "label" in bounding_box:
            draw.text((abs_x1 + 8, abs_y1 + 6), bounding_box["label"], fill=color, font=font)
    # Display the annotated image in the system viewer.
    img.show()
if __name__ == '__main__':
    # CLI: the checkpoint path selects the model family (Qwen2.5 vs Qwen3);
    # --dataset picks one of the RefCOCO jsonl splits in ds_collections.
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', type=str, default='')
    parser.add_argument('--dataset', type=str, default='')
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--num-workers', type=int, default=1)
    parser.add_argument('--vis', action='store_true')
    # Skip the first N samples (useful for resuming a partial run).
    parser.add_argument('--start_at', type=int, default=0)
    args = parser.parse_args()
    if 'Qwen2.5' in args.checkpoint:
        from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoTokenizer
        from qwen_vl_utils import process_vision_info
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(args.checkpoint, device_map='cuda', torch_dtype=torch.bfloat16)
        processor = AutoProcessor.from_pretrained(args.checkpoint)
        prompt = 'Locate the {}, output its bbox coordinates using JSON format.'
    elif 'Qwen3' in args.checkpoint:
        from transformers import AutoProcessor, AutoModelForVision2Seq
        from qwen_vl_utils import process_vision_info
        processor = AutoProcessor.from_pretrained(args.checkpoint, local_files_only=True)
        model, output_loading_info = AutoModelForVision2Seq.from_pretrained(args.checkpoint, torch_dtype="auto", device_map="auto",
                                                                            output_loading_info=True, local_files_only=True)
        prompt = 'Locate every object that matches the description {} in the image. Report coordinates in JSON format.'
    # NOTE(review): if the checkpoint name matches neither family, `model`,
    # `processor`, `prompt` (and later `normalized_by`) are never bound and
    # the script fails with a NameError.
    dataset = RefCOCODataset(test=ds_collections[args.dataset])
    dataloader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
        # NOTE(review): drop_last=True silently drops the final partial
        # batch, so a few samples are excluded from the evaluation.
        drop_last=True,
    )
    iter_ = -1
    outputs = []
    for didx, batch in enumerate(tqdm(dataloader)):
        messages = []  # NOTE(review): never used below.
        for img, txt, bbox, hw in zip(batch['image'], batch['text'], batch['bbox'], batch['hw']):
            iter_ += 1
            if iter_ < args.start_at:
                continue
            try:
                # inference() returns raw model text plus the model-side
                # input resolution and the original image (width, height).
                response, input_height, input_width, (width, height) = inference(img, prompt.format(txt))
                try:
                    # Preferred path: JSON inside a fenced markdown block.
                    response = parse_json(response)
                except:
                    # Fall back to treating the whole response as JSON.
                    response = json.loads(response)
            except:
                # NOTE(review): if inference() itself raises, `response`
                # (and width/height) keep their values from the previous
                # iteration — or are unbound on the very first sample.
                print('Expression: ', txt, ' Image: ', img)
            print('=====', response)
            # Qwen2.5 emits pixel coordinates at the model input resolution;
            # Qwen3 emits coordinates normalized to a 0-1000 grid.
            if 'Qwen2.5' in args.checkpoint:
                save_prefix = 'qwen2.5'
                normalized_by = [input_width, input_height]
            elif 'Qwen3' in args.checkpoint:
                save_prefix = 'qwen3'
                normalized_by = [1000, 1000]
            try:
                # Original Qwen-VL RefCOCO code always assumes one bounding
                # box; its loop iterates over samples within the batch.
                response = response[0]
                # Rescale the predicted bbox_2d ([x1, y1, x2, y2]) from the
                # model's coordinate space to original image pixels.
                abs_y1 = int(response["bbox_2d"][1]/normalized_by[1] * height)
                abs_x1 = int(response["bbox_2d"][0]/normalized_by[0] * width)
                abs_y2 = int(response["bbox_2d"][3]/normalized_by[1] * height)
                abs_x2 = int(response["bbox_2d"][2]/normalized_by[0] * width)
                # Normalise corner order so x1 <= x2 and y1 <= y2.
                if abs_x1 > abs_x2:
                    abs_x1, abs_x2 = abs_x2, abs_x1
                if abs_y1 > abs_y2:
                    abs_y1, abs_y2 = abs_y2, abs_y1
                if args.vis:
                    print("Expression: ", txt)
                    img = Image.open(img)
                    # Prediction in red, ground truth in blue.
                    plot_bounding_boxes(img, [[abs_x1, abs_y1, abs_x2, abs_y2]], 'red')
                    plot_bounding_boxes(img, [[int(b) for b in bbox[0]]], 'blue')
                outputs.append({
                    'answer': [abs_x1, abs_y1, abs_x2, abs_y2],
                    'gt_bbox': bbox,
                    'hw': hw,
                })
            except:
                # Parsing failed or no object was found: record a degenerate
                # box, which scores a guaranteed IoU miss below.
                outputs.append({
                    'answer': [0.0, 0.0, 0.0, 0.0],
                    'gt_bbox': bbox,
                    'hw': hw,
                })
    # Accuracy = fraction of predictions with IoU >= 0.5 against the GT box.
    correct = total_cnt = 0
    for i, output in enumerate(outputs):
        predict_bbox = output['answer']
        target_bbox = torch.tensor(output['gt_bbox'],
                                   dtype=torch.float32).view(-1, 4)
        predict_bbox = torch.tensor(predict_bbox,
                                    dtype=torch.float32).view(-1, 4)
        iou, _ = box_iou(predict_bbox, target_bbox)
        iou = iou.item()
        total_cnt += 1
        if iou >= 0.5:
            correct += 1
    print(f"Evaluating {args.dataset} ...")
    print(f'Precision @ 1: {correct / total_cnt} \n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment