From 85b14bb8b7cdc1518b714e3a1f8b9e5c50623114 Mon Sep 17 00:00:00 2001 From: Mohammadreza Haghighat Date: Thu, 7 Aug 2025 09:52:22 +1000 Subject: [PATCH 1/4] Modified for CPU-only usage, changed library versions, test with new dataset --- .vscode/c_cpp_properties.json | 21 ++ .vscode/settings.json | 9 + RSVG-HR/Annotations/rsvg_hr_test_10.txt | 28 ++ datasets/__init__.py | 27 ++ datasets/concat_dataset.py | 43 ++- datasets/image_to_seq_augmenter.py | 147 +++++++-- datasets/refer.py | 294 +++++++++++------- datasets/refexp_eval.py | 73 ++++- datasets/rsvg_mm.py | 72 ++--- inference_rsvg.py | 187 ++++++----- .../PKG-INFO | 12 +- .../SOURCES.txt | 5 +- .../functions/__init__.py | 10 + .../functions/ms_deform_attn_func.py | 61 ++++ .../modules/__init__.py | 9 + .../modules/ms_deform_attn.py | 117 +++++++ models/ops/functions/ms_deform_attn_func.py | 21 +- models/ops/setup.py | 32 +- models/ops/src/cpu/ms_deform_attn_cpu.cpp | 10 +- test.sh | 6 +- 20 files changed, 873 insertions(+), 311 deletions(-) create mode 100644 .vscode/c_cpp_properties.json create mode 100644 .vscode/settings.json create mode 100644 RSVG-HR/Annotations/rsvg_hr_test_10.txt create mode 100644 models/ops/build/lib.win-amd64-cpython-39/functions/__init__.py create mode 100644 models/ops/build/lib.win-amd64-cpython-39/functions/ms_deform_attn_func.py create mode 100644 models/ops/build/lib.win-amd64-cpython-39/modules/__init__.py create mode 100644 models/ops/build/lib.win-amd64-cpython-39/modules/ms_deform_attn.py diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..79ddb0c --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,21 @@ +{ + "configurations": [ + { + "name": "Win32", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [ + "_DEBUG", + "UNICODE", + "_UNICODE" + ], + "windowsSdkVersion": "10.0.26100.0", + "compilerPath": "cl.exe", + "cStandard": "c17", + "cppStandard": "c++17", + "intelliSenseMode": "windows-msvc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..083f60b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "python-envs.pythonProjects": [ + { + "path": "", + "envManager": "ms-python.python:conda", + "packageManager": "ms-python.python:conda" + } + ] +} \ No newline at end of file diff --git a/RSVG-HR/Annotations/rsvg_hr_test_10.txt b/RSVG-HR/Annotations/rsvg_hr_test_10.txt new file mode 100644 index 0000000..9f57323 --- /dev/null +++ b/RSVG-HR/Annotations/rsvg_hr_test_10.txt @@ -0,0 +1,28 @@ +UASs_120.jpg,2671.0,1921.0,2814.0,2104.0,Compact square-like cluster with roughly symmetrical slightly irregular outline, weed: erigeron canadensis, middle-center of lentil crop field +UASs_121.jpg,2195.0,2413.0,2568.0,2784.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, bottom-center of lentil crop field +UASs_123.jpg,1656.0,683.0,2047.0,1052.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-left of lentil crop field +UASs_124.jpg,2269.0,1248.0,2616.0,1695.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, middle-center of lentil crop field +UASs_137.jpg,663.0,3508.0,985.0,3643.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, bottom-left of lentil crop field +UASs_157.jpg,2665.0,13.0,2985.0,481.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, top-center of lentil crop field +UASs_159.jpg,2442.0,3.0,2545.0,132.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-center of lentil crop field +UASs_180.jpg,498.0,3132.0,877.0,3366.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, bottom-left of lentil crop field +UASs_181.jpg,153.0,3197.0,413.0,3479.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, bottom-left of lentil crop field +UASs_224.jpg,4683.0,528.0,4937.0,856.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, top-right of lentil crop field +UASs_236.jpg,1347.0,2371.0,1571.0,2615.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, bottom-left of lentil crop field +UASs_237.jpg,396.0,2163.0,755.0,2512.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, middle-left of lentil crop field +UASs_288.jpg,576.0,2168.0,669.0,2284.0,Compact square-like cluster with roughly symmetrical slightly irregular outline, weed: erigeron canadensis, middle-left of lentil crop field +UASs_316.jpg,539.0,3.0,740.0,279.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-left of lentil crop field +UASs_328.jpg,1772.0,3136.0,2140.0,3537.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, bottom-left of lentil crop field +UASs_377.jpg,1438.0,1725.0,1641.0,1936.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, middle-left of lentil crop field +UASs_496.jpg,17.0,64.0,223.0,282.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-left of lentil crop field +UASs_502.jpg,2349.0,1193.0,2516.0,1384.0,Compact square-like cluster with roughly symmetrical slightly irregular outline, weed: erigeron canadensis, middle-center of lentil crop field +UASs_503.jpg,2767.0,524.0,3121.0,916.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, top-center of lentil crop field +UASs_504.jpg,2858.0,728.0,3325.0,1225.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, top-center of lentil crop field +UASs_505.jpg,2813.0,1574.0,3069.0,1894.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, middle-center of lentil crop field +UASs_506.jpg,1917.0,9.0,2100.0,162.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-center of lentil crop field +UASs_507.jpg,2219.0,360.0,2389.0,570.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, top-center of lentil crop field +UASs_509.jpg,2428.0,7.0,2790.0,320.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-center of lentil crop field +UASs_510.jpg,2221.0,1071.0,2372.0,1229.0,Compact square-like cluster with roughly symmetrical slightly irregular outline, weed: erigeron canadensis, top-center of lentil crop field +UASs_660.jpg,1679.0,668.0,2066.0,1052.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-left of lentil crop field +UASs_661.jpg,1927.0,2768.0,2368.0,3189.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, bottom-center of lentil crop field +UASs_93.jpg,166.0,2184.0,417.0,2396.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, middle-left of lentil crop field \ No newline at end of file diff --git a/datasets/__init__.py b/datasets/__init__.py index 31b7a0b..b77ddae 100644 --- a/datasets/__init__.py +++ b/datasets/__init__.py @@ -1,24 +1,51 @@ +# Import necessary PyTorch and Torchvision libraries. import torch.utils.data import torchvision +# Import the specific 'build' functions from other dataset files in this package. +# Each 'build' function is responsible for creating a specific dataset instance. +# They are renamed to avoid naming conflicts. from .rsvg import build as build_rsvg from .rsvg_mm import build as build_rsvg_mm def get_coco_api_from_dataset(dataset): + """ + Helper function to retrieve the COCO API object from a dataset. + Some datasets might be wrapped in other PyTorch dataset classes like `Subset`. + This function iteratively unwraps the dataset to find the base COCO object. + """ + # Loop to handle nested datasets (e.g., a dataset wrapped in multiple `Subset` instances). for _ in range(10): + # This part is commented out but would have been an early exit condition. # if isinstance(dataset, torchvision.datasets.CocoDetection): # break + # If the current dataset object is a `Subset`, get the underlying dataset. if isinstance(dataset, torch.utils.data.Subset): dataset = dataset.dataset + # After unwrapping, check if the base dataset is a CocoDetection instance. if isinstance(dataset, torchvision.datasets.CocoDetection): + # If it is, return its `coco` attribute, which is the COCO API object. return dataset.coco def build_dataset(dataset_file: str, image_set: str, args): + """ + This is a factory function that constructs and returns the correct dataset. + It acts as a single entry point for creating any dataset supported by the project. + + Args: + dataset_file (str): The name of the dataset to build (e.g., 'rsvg'). + image_set (str): The split of the dataset to use (e.g., 'train' or 'val'). + args: Command-line arguments containing other dataset configurations. + """ + # Check the dataset name and call the corresponding build function. if dataset_file == 'rsvg': + # If the dataset is 'rsvg', call the build function imported from `rsvg.py`. return build_rsvg(image_set, args) if dataset_file == 'rsvg_mm': + # If the dataset is 'rsvg_mm', call the build function imported from `rsvg_mm.py`. return build_rsvg_mm(image_set, args) + # If the dataset_file name doesn't match any known datasets, raise an error. raise ValueError(f'dataset {dataset_file} not supported') diff --git a/datasets/concat_dataset.py b/datasets/concat_dataset.py index 23f0faf..9057b27 100644 --- a/datasets/concat_dataset.py +++ b/datasets/concat_dataset.py @@ -3,31 +3,66 @@ # Copyright (c) 2020 SenseTime. All Rights Reserved. # ------------------------------------------------------------------------ +# Import Path for handling file paths, though it's not directly used in this snippet. from pathlib import Path +# Import core PyTorch data utilities. import torch import torch.utils.data +# Import specific Dataset classes from PyTorch. +# ConcatDataset is used to combine multiple datasets into one. from torch.utils.data import Dataset, ConcatDataset + +# Import the 'build' function from the local 'refexp2seq.py' file. +# This function is responsible for creating datasets for referring expression tasks (like RefCOCO). from .refexp2seq import build as build_seq_refexp + +# Import the 'build' function from the local 'ytvos.py' file. +# This function is responsible for creating the Ref-Youtube-VOS dataset. from .ytvos import build as build_ytvs -from datasets import ytvos +# This import seems redundant as the 'ytvos' module is already imported above. +# It might be a leftover from previous code edits. +from datasets import ytvos def build(image_set, args): + """ + This function constructs a single, large dataset by concatenating several smaller ones. + It combines all RefCOCO variants and the Ref-Youtube-VOS dataset. + + Args: + image_set (str): The data split to use (e.g., 'train', 'val'). + args: Command-line arguments containing other dataset configurations. + + Returns: + ConcatDataset: A single dataset object that contains all the specified datasets. + """ + # Initialize an empty list to hold the individual dataset objects. concat_data = [] - print('preparing coco2seq dataset ....') + # Log that the RefCOCO datasets are being prepared. + print("preparing coco2seq dataset ....") + # Define the names of the RefCOCO dataset variants to be loaded. coco_names = ["refcoco", "refcoco+", "refcocog"] + # Loop through each RefCOCO dataset name. for name in coco_names: - coco_seq = build_seq_refexp(name, image_set, args) + # Call the build function for referring expression datasets to create an instance. + coco_seq = build_seq_refexp(name, image_set, args) + # Add the created dataset to the list. concat_data.append(coco_seq) - print('preparing ytvos dataset .... ') + # Log that the Ref-Youtube-VOS dataset is being prepared. + print("preparing ytvos dataset .... ") + # Call the build function for the YTVOS dataset to create an instance. ytvos_dataset = build_ytvs(image_set, args) + # Add the created dataset to the list. concat_data.append(ytvos_dataset) + # Use PyTorch's ConcatDataset to combine all the individual datasets in the list + # into a single, unified dataset object. concat_data = ConcatDataset(concat_data) + # Return the final concatenated dataset. return concat_data diff --git a/datasets/image_to_seq_augmenter.py b/datasets/image_to_seq_augmenter.py index f31e61c..4500b51 100644 --- a/datasets/image_to_seq_augmenter.py +++ b/datasets/image_to_seq_augmenter.py @@ -1,98 +1,179 @@ # ------------------------------------------------------------------------ -# Modified from SeqFormer (https://github.com/wjf5203/SeqFormer) -# ------------------------------------------------------------------------ -# Modified from STEm-Seg (https://github.com/sabarim/STEm-Seg) +# This code is modified from previous works: SeqFormer and STEm-Seg. +# It's designed to apply data augmentation to images and their corresponding +# segmentation masks and bounding boxes, often to simulate video sequences from single images. # ------------------------------------------------------------------------ +# Import the core imgaug library for data augmentation. import imgaug import imgaug.augmenters as iaa + +# Import numpy for numerical operations, especially on image arrays. import numpy as np +# Import datetime to generate time-based seeds for randomness. from datetime import datetime +# Import specific classes from imgaug for handling segmentation maps and bounding boxes. from imgaug.augmentables.segmaps import SegmentationMapsOnImage from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage class ImageToSeqAugmenter(object): - def __init__(self, perspective=True, affine=True, motion_blur=True, - brightness_range=(-50, 50), hue_saturation_range=(-15, 15), perspective_magnitude=0.12, - scale_range=1.0, translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, rotation_range=(-20, 20), - motion_blur_kernel_sizes=(7, 9), motion_blur_prob=0.5): - - self.basic_augmenter = iaa.SomeOf((1, None), [ - iaa.Add(brightness_range), - iaa.AddToHueAndSaturation(hue_saturation_range) - ] + """ + A class that defines a pipeline of image augmentations. + It can apply color, geometric (affine, perspective), and motion blur transformations. + It's specifically designed to handle images along with their segmentation masks and bounding boxes, + ensuring all are transformed consistently. + """ + + def __init__( + self, + perspective=True, + affine=True, + motion_blur=True, + brightness_range=(-50, 50), + hue_saturation_range=(-15, 15), + perspective_magnitude=0.12, + scale_range=1.0, + translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, + rotation_range=(-20, 20), + motion_blur_kernel_sizes=(7, 9), + motion_blur_prob=0.5, + ): + """ + Initializes the augmentation pipeline with various configurable options. + """ + + # Define a basic color augmentation pipeline. + # It applies EITHER brightness/contrast changes OR hue/saturation changes. + self.basic_augmenter = iaa.SomeOf( + (1, None), + [ + iaa.Add(brightness_range), # Adjust brightness. + iaa.AddToHueAndSaturation(hue_saturation_range), # Adjust color hue and saturation. + ], ) + # Create a list to hold geometric transformations. transforms = [] if perspective: + # Add a perspective transformation to simulate camera angle changes. transforms.append(iaa.PerspectiveTransform(perspective_magnitude)) if affine: - transforms.append(iaa.Affine(scale=scale_range, - translate_percent=translate_range, - rotate=rotation_range, - order=1, # cv2.INTER_LINEAR - backend='auto')) + # Add affine transformations: scaling, translation, and rotation. + transforms.append( + iaa.Affine( + scale=scale_range, + translate_percent=translate_range, + rotate=rotation_range, + order=1, # Use linear interpolation. + backend="auto", + ) + ) # Automatically choose backend (e.g., OpenCV). + + # Combine the geometric transforms into a sequence. transforms = iaa.Sequential(transforms) - transforms = [transforms] + transforms = [transforms] # Wrap in a list to append more augmenters. if motion_blur: - blur = iaa.Sometimes(motion_blur_prob, iaa.OneOf( - [ - iaa.MotionBlur(ksize) - for ksize in motion_blur_kernel_sizes - ] - )) + # Define a motion blur augmentation that is applied with a certain probability. + blur = iaa.Sometimes( + motion_blur_prob, + iaa.OneOf( + [ + # Choose one kernel size for the motion blur. + iaa.MotionBlur(ksize) + for ksize in motion_blur_kernel_sizes + ] + ), + ) transforms.append(blur) + # Combine all transformations (geometric + motion blur) into the final sequence. + # This is named 'frame_shift_augmenter' because it simulates frame-to-frame changes in a video. self.frame_shift_augmenter = iaa.Sequential(transforms) @staticmethod def condense_masks(instance_masks): + """ + Static method to convert a list of binary instance masks into a single integer-labeled segmentation map. + imgaug requires this format to augment multiple masks simultaneously. + Example: [[0,1,1], [1,0,0]] -> [2, 1, 1] where 1 is the first mask, 2 is the second. + """ + # Create an empty mask with the same shape as the first instance mask. condensed_mask = np.zeros_like(instance_masks[0], dtype=np.int8) + # Iterate through each binary mask, assigning a unique integer ID (starting from 1). for instance_id, mask in enumerate(instance_masks, 1): + # Where the binary mask is true, set the corresponding pixel in the condensed mask to the instance ID. condensed_mask = np.where(mask, instance_id, condensed_mask) return condensed_mask @staticmethod def expand_masks(condensed_mask, num_instances): + """ + Static method to perform the reverse of condense_masks. + It converts a single integer-labeled segmentation map back into a list of binary masks. + """ + # Create a list of binary masks by checking for each instance ID. return [(condensed_mask == instance_id).astype(np.uint8) for instance_id in range(1, num_instances + 1)] def __call__(self, image, masks=None, boxes=None): + """ + Applies the defined augmentation pipeline to an image and its optional masks/boxes. + """ + # Create a deterministic version of the geometric augmenter. + # This ensures the exact same geometric transformation is applied to the image, masks, and any other spatial data. det_augmenter = self.frame_shift_augmenter.to_deterministic() - + # If masks are provided, augment them along with the image. if masks is not None: masks_np, is_binary_mask = [], [] boxs_np = [] + # Prepare masks for augmentation. for mask in masks: - if isinstance(mask, np.ndarray): - masks_np.append(mask.astype(np.bool)) + masks_np.append(mask.astype(np.bool_)) is_binary_mask.append(False) else: raise ValueError("Invalid mask type: {}".format(type(mask))) num_instances = len(masks_np) + # Condense the list of binary masks into a single integer map and wrap it for imgaug. masks_np = SegmentationMapsOnImage(self.condense_masks(masks_np), shape=image.shape[:2]) - # boxs_np = BoundingBoxesOnImage(boxs_np, shape=image.shape[:2]) - seed = int(datetime.now().strftime('%M%S%f')[-8:]) + # Use a time-based seed to ensure the next two augmentations are identical. + seed = int(datetime.now().strftime("%M%S%f")[-8:]) imgaug.seed(seed) - aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image) , segmentation_maps=masks_np) + + # Augment the image and the condensed masks. + # Note: Color augmentation (`basic_augmenter`) is applied ONLY to the image. + # Geometric augmentation (`det_augmenter`) is applied to both. + aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image), segmentation_maps=masks_np) + + # Reset the seed to apply the same geometric transform again. imgaug.seed(seed) + # Create a mask of valid points by augmenting an image of all ones. + # Pixels that are shifted out of the image boundary will become zero. invalid_pts_mask = det_augmenter(image=np.ones(image.shape[:2] + (1,), np.uint8)).squeeze(2) + + # Expand the augmented integer map back into a list of binary masks. aug_masks = self.expand_masks(aug_masks.get_arr(), num_instances) - # aug_boxes = aug_boxes.remove_out_of_image().clip_out_of_image() + + # Filter the list of augmented masks. aug_masks = [mask for mask, is_bm in zip(aug_masks, is_binary_mask)] - return aug_image, aug_masks #, aug_boxes.to_xyxy_array() + # Return the augmented image and its corresponding augmented masks. + return aug_image, aug_masks + + # If no masks are provided, just augment the image and return a mask of valid points. else: - masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool), shape=image.shape[:2])] + # Create a dummy mask to pass to the augmenter. + masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool_), shape=image.shape[:2])] + # Augment the image and the dummy mask. aug_image, invalid_pts_mask = det_augmenter(image=image, segmentation_maps=masks) + # Return the augmented image and a boolean mask where False indicates pixels that are now outside the original image area. return aug_image, invalid_pts_mask.get_arr() == 0 diff --git a/datasets/refer.py b/datasets/refer.py index f28c340..af09ee8 100644 --- a/datasets/refer.py +++ b/datasets/refer.py @@ -1,4 +1,5 @@ -__author__ = 'licheng' +# A variable to store the author's name. +__author__ = "licheng" """ This interface provides access to four datasets: @@ -7,6 +8,7 @@ 3) refcoco+ 4) refcocog split by unc and google + The following API functions are defined: REFER - REFER api class getRefIds - get ref ids that satisfy given filter conditions. @@ -23,6 +25,7 @@ showMask - show mask of the referred object given ref """ +# Import necessary system and utility libraries. import sys import os.path as osp import json @@ -35,83 +38,111 @@ from matplotlib.patches import Polygon, Rectangle from pprint import pprint import numpy as np + +# Import the mask utilities from pycocotools for handling segmentation masks. from pycocotools import mask class REFER: - - def __init__(self, data_root, dataset='refcoco', splitBy='unc'): + """ + The main API class for interacting with referring expression datasets. + It loads and indexes the dataset annotations for efficient access. + """ + + def __init__(self, data_root, dataset="refcoco", splitBy="unc"): + """ + Initializes the REFER API object. + + Args: + data_root (str): The root directory where datasets are stored. + dataset (str): The name of the dataset to load (e.g., 'refcoco'). + splitBy (str): The split authority (e.g., 'unc', 'google'). + """ # provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog # also provide dataset name and splitBy information # e.g., dataset = 'refcoco', splitBy = 'unc' - print('loading dataset %s into memory...' % dataset) + print("loading dataset %s into memory..." % dataset) + # Set up directory paths based on the provided data_root and dataset name. self.ROOT_DIR = osp.abspath(osp.dirname(__file__)) - self.DATA_DIR = osp.join(data_root, dataset) # coco/refcoco - if dataset in ['refcoco', 'refcoco+', 'refcocog']: - self.IMAGE_DIR = osp.join(data_root, 'train2014') - elif dataset == 'refclef': - self.IMAGE_DIR = osp.join(data_root, 'saiapr_tc-12') + self.DATA_DIR = osp.join(data_root, dataset) # e.g., coco/refcoco + # Determine the image directory based on the dataset type. + if dataset in ["refcoco", "refcoco+", "refcocog"]: + self.IMAGE_DIR = osp.join(data_root, "train2014") + elif dataset == "refclef": + self.IMAGE_DIR = osp.join(data_root, "saiapr_tc-12") else: - print('No refer dataset is called [%s]' % dataset) + # If the dataset name is not recognized, print an error and exit. + print("No refer dataset is called [%s]" % dataset) sys.exit() # load refs from data/dataset/refs(dataset).json tic = time.time() - ref_file = osp.join(self.DATA_DIR, 'refs('+splitBy+').p') + # Construct the path to the pre-processed reference file (a pickled Python object). + ref_file = osp.join(self.DATA_DIR, "refs(" + splitBy + ").p") + # Initialize the main data dictionary. self.data = {} - self.data['dataset'] = dataset - - self.data['refs'] = pickle.load(open(ref_file, 'rb'), fix_imports=True) + self.data["dataset"] = dataset + # Load the pickled reference data. This contains the referring expressions and their links to images/annotations. + self.data["refs"] = pickle.load(open(ref_file, "rb"), fix_imports=True) # load annotations from data/dataset/instances.json - instances_file = osp.join(self.DATA_DIR, 'instances.json') - instances = json.load(open(instances_file, 'r')) # coco/refcoco/instances.json - # list[dict] keys: "license", "file_name", "coco_url", "height", "width", "date_captured", "flickr_url", "id" - self.data['images'] = instances['images'] - # list[dict] keys: "segmentation", "area", "iscrowd", "image_id", "bbox", "category_id", "id" - self.data['annotations'] = instances['annotations'] - # list[dict] keys: "supercategory", "id", "name" - self.data['categories'] = instances['categories'] - - # create index + # This file contains the standard COCO-style annotations. + instances_file = osp.join(self.DATA_DIR, "instances.json") + instances = json.load(open(instances_file, "r")) # e.g., coco/refcoco/instances.json + # Load image metadata (file names, dimensions, etc.). + self.data["images"] = instances["images"] + # Load object annotations (segmentations, bounding boxes, etc.). + self.data["annotations"] = instances["annotations"] + # Load category information (names, supercategories). + self.data["categories"] = instances["categories"] + + # Call the method to create efficient look-up tables (indexes). self.createIndex() - print('DONE (t=%.2fs)' % (time.time()-tic)) + print("DONE (t=%.2fs)" % (time.time() - tic)) def createIndex(self): + """ + Creates a set of dictionaries that map various IDs to their corresponding data. + This pre-processing step allows for very fast data retrieval. + """ # create sets of mapping # 1) Refs: {ref_id: ref} # 2) Anns: {ann_id: ann} # 3) Imgs: {image_id: image} # 4) Cats: {category_id: category_name} # 5) Sents: {sent_id: sent} - # 6) imgToRefs: {image_id: refs} - # 7) imgToAnns: {image_id: anns} + # 6) imgToRefs: {image_id: list_of_refs} + # 7) imgToAnns: {image_id: list_of_anns} # 8) refToAnn: {ref_id: ann} # 9) annToRef: {ann_id: ref} - # 10) catToRefs: {category_id: refs} + # 10) catToRefs: {category_id: list_of_refs} # 11) sentToRef: {sent_id: ref} - # 12) sentToTokens: {sent_id: tokens} - print('creating index...') + # 12) sentToTokens: {sent_id: list_of_tokens} + print("creating index...") # fetch info from instances Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {} - for ann in self.data['annotations']: - Anns[ann['id']] = ann - imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann] - for img in self.data['images']: - Imgs[img['id']] = img - for cat in self.data['categories']: - Cats[cat['id']] = cat['name'] + # Index annotations by their ID and group them by image ID. + for ann in self.data["annotations"]: + Anns[ann["id"]] = ann + imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann] + # Index images by their ID. + for img in self.data["images"]: + Imgs[img["id"]] = img + # Index categories by their ID. + for cat in self.data["categories"]: + Cats[cat["id"]] = cat["name"] # fetch info from refs Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {} Sents, sentToRef, sentToTokens = {}, {}, {} - for ref in self.data['refs']: + # Index references and sentences, creating all the necessary cross-mappings. + for ref in self.data["refs"]: # ids - ref_id = ref['ref_id'] - ann_id = ref['ann_id'] - category_id = ref['category_id'] - image_id = ref['image_id'] + ref_id = ref["ref_id"] + ann_id = ref["ann_id"] + category_id = ref["category_id"] + image_id = ref["image_id"] # add mapping related to ref Refs[ref_id] = ref @@ -121,12 +152,12 @@ def createIndex(self): annToRef[ann_id] = ref # add mapping of sent - for sent in ref['sentences']: - Sents[sent['sent_id']] = sent - sentToRef[sent['sent_id']] = ref - sentToTokens[sent['sent_id']] = sent['tokens'] + for sent in ref["sentences"]: + Sents[sent["sent_id"]] = sent + sentToRef[sent["sent_id"]] = ref + sentToTokens[sent["sent_id"]] = sent["tokens"] - # create class members + # Store the created indexes as class members for easy access. self.Refs = Refs self.Anns = Anns self.Imgs = Imgs @@ -139,167 +170,216 @@ def createIndex(self): self.catToRefs = catToRefs self.sentToRef = sentToRef self.sentToTokens = sentToTokens - print('index created.') + print("index created.") - def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''): + def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=""): + """ + Get reference IDs that satisfy the given filter conditions. + """ + # Ensure inputs are lists. image_ids = image_ids if type(image_ids) == list else [image_ids] cat_ids = cat_ids if type(cat_ids) == list else [cat_ids] ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] + # If no filters are provided, return all reference IDs. if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0: - refs = self.data['refs'] + refs = self.data["refs"] else: + # Apply filters sequentially. if not len(image_ids) == 0: + # Use the pre-computed index for fast lookup. refs = [self.imgToRefs[image_id] for image_id in image_ids] else: - refs = self.data['refs'] + refs = self.data["refs"] if not len(cat_ids) == 0: - refs = [ref for ref in refs if ref['category_id'] in cat_ids] + refs = [ref for ref in refs if ref["category_id"] in cat_ids] if not len(ref_ids) == 0: - refs = [ref for ref in refs if ref['ref_id'] in ref_ids] + refs = [ref for ref in refs if ref["ref_id"] in ref_ids] if not len(split) == 0: - if split in ['testA', 'testB', 'testC']: - refs = [ref for ref in refs if split[-1] in ref['split']] # we also consider testAB, testBC, ... - elif split in ['testAB', 'testBC', 'testAC']: - refs = [ref for ref in refs if ref['split'] == split] # rarely used I guess... - elif split == 'test': - refs = [ref for ref in refs if 'test' in ref['split']] - elif split == 'train' or split == 'val': - refs = [ref for ref in refs if ref['split'] == split] + # Filter by data split (e.g., 'train', 'val', 'testA'). + if split in ["testA", "testB", "testC"]: + refs = [ref for ref in refs if split[-1] in ref["split"]] # we also consider testAB, testBC, ... + elif split in ["testAB", "testBC", "testAC"]: + refs = [ref for ref in refs if ref["split"] == split] # rarely used I guess... + elif split == "test": + refs = [ref for ref in refs if "test" in ref["split"]] + elif split == "train" or split == "val": + refs = [ref for ref in refs if ref["split"] == split] else: - print('No such split [%s]' % split) + print("No such split [%s]" % split) sys.exit() - ref_ids = [ref['ref_id'] for ref in refs] + # Return a list of the final filtered reference IDs. + ref_ids = [ref["ref_id"] for ref in refs] return ref_ids def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]): + """ + Get annotation IDs that satisfy the given filter conditions. + """ image_ids = image_ids if type(image_ids) == list else [image_ids] cat_ids = cat_ids if type(cat_ids) == list else [cat_ids] ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] + # If no filters, return all annotation IDs. if len(image_ids) == len(cat_ids) == len(ref_ids) == 0: - ann_ids = [ann['id'] for ann in self.data['annotations']] + ann_ids = [ann["id"] for ann in self.data["annotations"]] else: + # Apply filters sequentially. if not len(image_ids) == 0: - lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns] # list of [anns] + lists = [ + self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns + ] # list of [anns] anns = list(itertools.chain.from_iterable(lists)) else: - anns = self.data['annotations'] + anns = self.data["annotations"] if not len(cat_ids) == 0: - anns = [ann for ann in anns if ann['category_id'] in cat_ids] - ann_ids = [ann['id'] for ann in anns] + anns = [ann for ann in anns if ann["category_id"] in cat_ids] + ann_ids = [ann["id"] for ann in anns] if not len(ref_ids) == 0: - ids = set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids])) + # Intersect with annotations linked to the given reference IDs. + ids = set(ann_ids).intersection(set([self.Refs[ref_id]["ann_id"] for ref_id in ref_ids])) return ann_ids def getImgIds(self, ref_ids=[]): + """ + Get image IDs associated with the given reference IDs. + """ ref_ids = ref_ids if type(ref_ids) == list else [ref_ids] if not len(ref_ids) == 0: - image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids])) + # Use the index to find image IDs from reference IDs. + image_ids = list(set([self.Refs[ref_id]["image_id"] for ref_id in ref_ids])) else: + # If no ref_ids are given, return all image IDs. image_ids = self.Imgs.keys() return image_ids def getCatIds(self): + """ + Get all category IDs in the dataset. + """ return self.Cats.keys() def loadRefs(self, ref_ids=[]): + """ + Load full reference data for the given reference IDs. + """ if type(ref_ids) == list: return [self.Refs[ref_id] for ref_id in ref_ids] elif type(ref_ids) == int: return [self.Refs[ref_ids]] def loadAnns(self, ann_ids=[]): + """ + Load full annotation data for the given annotation IDs. + """ if type(ann_ids) == list: return [self.Anns[ann_id] for ann_id in ann_ids] - elif type(ann_ids) == int or type(ann_ids) == unicode: + elif type(ann_ids) == int: return [self.Anns[ann_ids]] def loadImgs(self, image_ids=[]): + """ + Load full image data for the given image IDs. + """ if type(image_ids) == list: return [self.Imgs[image_id] for image_id in image_ids] elif type(image_ids) == int: return [self.Imgs[image_ids]] def loadCats(self, cat_ids=[]): + """ + Load category names for the given category IDs. + """ if type(cat_ids) == list: return [self.Cats[cat_id] for cat_id in cat_ids] elif type(cat_ids) == int: return [self.Cats[cat_ids]] def getRefBox(self, ref_id): + """ + Get the bounding box [x, y, w, h] for a given reference ID. + """ ref = self.Refs[ref_id] + # Use the refToAnn index to find the corresponding annotation. ann = self.refToAnn[ref_id] - return ann['bbox'] # [x, y, w, h] + return ann["bbox"] # [x, y, w, h] - def showRef(self, ref, seg_box='seg'): + def showRef(self, ref, seg_box="seg"): + """ + Display an image and overlay the referred object's segmentation or bounding box. + """ ax = plt.gca() # show image - image = self.Imgs[ref['image_id']] - I = io.imread(osp.join(self.IMAGE_DIR, image['file_name'])) + image = self.Imgs[ref["image_id"]] + I = io.imread(osp.join(self.IMAGE_DIR, image["file_name"])) ax.imshow(I) # show refer expression - for sid, sent in enumerate(ref['sentences']): - print('%s. %s' % (sid+1, sent['sent'])) + for sid, sent in enumerate(ref["sentences"]): + print("%s. %s" % (sid + 1, sent["sent"])) # show segmentations - if seg_box == 'seg': - ann_id = ref['ann_id'] + if seg_box == "seg": + ann_id = ref["ann_id"] ann = self.Anns[ann_id] polygons = [] color = [] - c = 'none' - if type(ann['segmentation'][0]) == list: - # polygon used for refcoco* - for seg in ann['segmentation']: - poly = np.array(seg).reshape((len(seg)/2, 2)) + c = "none" + if type(ann["segmentation"][0]) == list: + # This handles polygon format segmentation, common in refcoco*. + for seg in ann["segmentation"]: + poly = np.array(seg).reshape((len(seg) // 2, 2)) polygons.append(Polygon(poly, True, alpha=0.4)) color.append(c) + # Add the polygon patches to the plot. p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 1, 0, 0), linewidths=3, alpha=1) ax.add_collection(p) # thick yellow polygon p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 0, 0, 0), linewidths=1, alpha=1) ax.add_collection(p) # thin red polygon else: - # mask used for refclef - rle = ann['segmentation'] + # This handles RLE (Run-Length Encoding) format segmentation. + rle = ann["segmentation"] m = mask.decode(rle) img = np.ones((m.shape[0], m.shape[1], 3)) - color_mask = np.array([2.0, 166.0, 101.0])/255 + color_mask = np.array([2.0, 166.0, 101.0]) / 255 for i in range(3): img[:, :, i] = color_mask[i] - ax.imshow(np.dstack((img, m*0.5))) + ax.imshow(np.dstack((img, m * 0.5))) # show bounding-box - elif seg_box == 'box': - ann_id = ref['ann_id'] + elif seg_box == "box": + ann_id = ref["ann_id"] ann = self.Anns[ann_id] - bbox = self.getRefBox(ref['ref_id']) - box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3) + bbox = self.getRefBox(ref["ref_id"]) + # Create a rectangle patch and add it to the plot. + box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor="green", linewidth=3) ax.add_patch(box_plot) def getMask(self, ref): + """ + Get the binary segmentation mask for a given reference. + """ # return mask, area and mask-center - ann = self.refToAnn[ref['ref_id']] - image = self.Imgs[ref['image_id']] - if type(ann['segmentation'][0]) == list: # polygon - rle = mask.frPyObjects(ann['segmentation'], image['height'], image['width']) - else: - rle = ann['segmentation'] - - # for i in range(len(rle['counts'])): - # print(rle) + ann = self.refToAnn[ref["ref_id"]] + image = self.Imgs[ref["image_id"]] + # Convert polygon format to RLE if necessary. + if type(ann["segmentation"][0]) == list: # polygon + rle = mask.frPyObjects(ann["segmentation"], image["height"], image["width"]) + else: # It's already in RLE format. + rle = ann["segmentation"] + + # Decode the RLE to get a binary mask. m = mask.decode(rle) + # Handle cases where a single annotation has multiple disconnected parts. m = np.sum(m, axis=2) # sometimes there are multiple binary map (corresponding to multiple segs) m = m.astype(np.uint8) # convert to np.uint8 # compute area area = sum(mask.area(rle)) # should be close to ann['area'] - return {'mask': m, 'area': area} - + return {"mask": m, "area": area} def showMask(self, ref): + """ + A simple utility to display the mask of a referred object. + """ M = self.getMask(ref) - msk = M['mask'] + msk = M["mask"] ax = plt.gca() ax.imshow(msk) - - - diff --git a/datasets/refexp_eval.py b/datasets/refexp_eval.py index 826aa5c..7573642 100644 --- a/datasets/refexp_eval.py +++ b/datasets/refexp_eval.py @@ -1,4 +1,5 @@ # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Import necessary libraries. import copy from collections import defaultdict from pathlib import Path @@ -6,80 +7,142 @@ import torch import torch.utils.data +# Import project-specific utilities. import util.misc as utils from util.box_ops import generalized_box_iou class RefExpEvaluator(object): + """ + A class to evaluate referring expression detection results. + It calculates Precision@k for different IoU thresholds. + """ + def __init__(self, refexp_gt, iou_types, k=(1, 5, 10), thresh_iou=0.5): + """ + Initializes the evaluator. + + Args: + refexp_gt: The ground truth data, typically a REFER object. + iou_types: The types of IoU to consider (not directly used here but common in other evaluators). + k (tuple): A tuple of integers for which to calculate Precision@k (e.g., top 1, 5, 10 predictions). + thresh_iou (float): The IoU threshold to consider a prediction correct. + """ + # Ensure k is a list or tuple. assert isinstance(k, (list, tuple)) + # Make a deep copy of the ground truth to avoid modifying the original object. refexp_gt = copy.deepcopy(refexp_gt) self.refexp_gt = refexp_gt self.iou_types = iou_types + # Get the list of all image IDs from the ground truth. self.img_ids = self.refexp_gt.imgs.keys() + # A dictionary to store model predictions, keyed by image ID. self.predictions = {} + # Store the k values for Precision@k calculation. self.k = k + # Store the IoU threshold. self.thresh_iou = thresh_iou def accumulate(self): + """ + A placeholder method, often used for accumulating stats over time. Not implemented here. + """ pass def update(self, predictions): + """ + Updates the evaluator's internal predictions dictionary with new results from the model. + """ self.predictions.update(predictions) def synchronize_between_processes(self): + """ + In a distributed (multi-GPU) setting, this function gathers predictions from all processes + and merges them into a single dictionary on the main process. + """ + # Use the utility function to gather prediction dictionaries from all processes. all_predictions = utils.all_gather(self.predictions) merged_predictions = {} + # Iterate through the list of dictionaries and merge them. for p in all_predictions: merged_predictions.update(p) + # Replace the local predictions with the complete, merged set. self.predictions = merged_predictions def summarize(self): + """ + Calculates and prints the final evaluation metrics (Precision@k). + This method should only be run on the main process after all predictions are synchronized. + """ + # Ensure this part only runs on the main process to avoid duplicate calculations and printing. if utils.is_main_process(): + # Initialize dictionaries to store scores and counts for each dataset. dataset2score = { "refcoco": {k: 0.0 for k in self.k}, "refcoco+": {k: 0.0 for k in self.k}, "refcocog": {k: 0.0 for k in self.k}, } dataset2count = {"refcoco": 0.0, "refcoco+": 0.0, "refcocog": 0.0} + + # Iterate over every image ID in the ground truth test set. for image_id in self.img_ids: + # Get the ground truth annotation ID for the current image. ann_ids = self.refexp_gt.getAnnIds(imgIds=image_id) - assert len(ann_ids) == 1 + assert len(ann_ids) == 1, "Each image should have exactly one referring expression annotation." + # Load image metadata, which includes the dataset name (e.g., 'refcoco'). img_info = self.refexp_gt.loadImgs(image_id)[0] + # Load the ground truth annotation (which contains the target bounding box). target = self.refexp_gt.loadAnns(ann_ids[0]) + # Get the model's prediction for this image. prediction = self.predictions[image_id] - assert prediction is not None + assert prediction is not None, "Prediction not found for image." + + # Sort the predicted boxes by their confidence scores in descending order. sorted_scores_boxes = sorted( zip(prediction["scores"].tolist(), prediction["boxes"].tolist()), reverse=True ) + # Unzip the sorted scores and boxes. sorted_scores, sorted_boxes = zip(*sorted_scores_boxes) + # Convert the list of boxes back into a single tensor. sorted_boxes = torch.cat([torch.as_tensor(x).view(1, 4) for x in sorted_boxes]) + + # Get the ground truth bounding box in [x, y, width, height] format. target_bbox = target[0]["bbox"] + # Convert the ground truth box to [x1, y1, x2, y2] format. converted_bbox = [ target_bbox[0], target_bbox[1], target_bbox[2] + target_bbox[0], target_bbox[3] + target_bbox[1], ] + # Calculate the Generalized IoU between all sorted predicted boxes and the single ground truth box. giou = generalized_box_iou(sorted_boxes, torch.as_tensor(converted_bbox).view(-1, 4)) + + # Check for a correct prediction within the top k boxes. for k in self.k: + # If the maximum IoU among the top k predictions is above the threshold... if max(giou[:k]) >= self.thresh_iou: + # ...count it as a correct prediction for that k. dataset2score[img_info["dataset_name"]][k] += 1.0 + # Increment the total number of samples for this dataset. dataset2count[img_info["dataset_name"]] += 1.0 + # Calculate the final precision scores by dividing the correct counts by the total counts. for key, value in dataset2score.items(): for k in self.k: try: value[k] /= dataset2count[key] - except: + except ZeroDivisionError: + # Handle cases where a dataset might have zero samples. pass + + # Format and print the results. results = {} for key, value in dataset2score.items(): results[key] = sorted([v for k, v in value.items()]) print(f" Dataset: {key} - Precision @ 1, 5, 10: {results[key]} \n") return results + # If not the main process, return None. return None - - diff --git a/datasets/rsvg_mm.py b/datasets/rsvg_mm.py index 29237f7..fca23cb 100644 --- a/datasets/rsvg_mm.py +++ b/datasets/rsvg_mm.py @@ -9,6 +9,7 @@ from PIL import Image import util from util.transforms import letterbox + # from torchvision.transforms import Compose, ToTensor, Normalize import datasets.transforms_image as T import matplotlib.pyplot as plt @@ -18,8 +19,7 @@ class RSVGDataset(data.Dataset): - def __init__(self, images_path, imsize=1024, transform= None, augment= False, - split='train', testmode=False): + def __init__(self, images_path, imsize=1024, transform=None, augment=False, split="train", testmode=False): self.images = [] self.images_path = images_path self.imsize = imsize @@ -30,14 +30,20 @@ def __init__(self, images_path, imsize=1024, transform= None, augment= False, # file = open('data/rsvg_mm/' + 'rsvg_mm_train_v11.txt', "r").readlines() # file = open('data/rsvg_mm/' + 'rsvg_mm_' + split + '.txt', "r").readlines() - file = open('data/rsvg_mm/' + 'rsvg_mm_' + split + '_v2.txt', "r").readlines() - Index = [index.strip('\n') for index in file] + file = open("RSVG-HR/Annotations/rsvg_hr_test_10.txt", "r").readlines() + # file = open("data/rsvg_mm/" + "rsvg_mm_" + split + "_v2.txt", "r").readlines() + Index = [index.strip("\n") for index in file] for anno in Index: - anno_list = anno.split(',') + anno_list = anno.split(",") img_name = anno_list[0] - xmin_gt, ymin_gt, xmax_gt, ymax_gt = float(anno_list[1]), float(anno_list[2]), float(anno_list[3]), float(anno_list[4]) + xmin_gt, ymin_gt, xmax_gt, ymax_gt = ( + float(anno_list[1]), + float(anno_list[2]), + float(anno_list[3]), + float(anno_list[4]), + ) text = anno_list[-1] - image_path = images_path + '/' + img_name + image_path = images_path + "/" + img_name box = np.array([xmin_gt, ymin_gt, xmax_gt, ymax_gt], dtype=np.float32) self.images.append((image_path, box, text)) @@ -47,7 +53,7 @@ def pull_item(self, idx): # bbox = np.array(bbox, dtype=int) # box format: to x1 y1 x2 y2 # img_bgr = cv2.imread(img_path) # img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB) - img = Image.open(img_path).convert('RGB') + img = Image.open(img_path).convert("RGB") # img = np.array(img) return img, phrase, bbox, img_path @@ -56,7 +62,7 @@ def __len__(self): return len(self.images) def __getitem__(self, idx): - img, phrase, bbox, img_path = self.pull_item(idx) + img, phrase, bbox, img_path = self.pull_item(idx) # print(img_path) # phrase = phrase.lower() caption = " ".join(phrase.lower().split()) @@ -98,8 +104,8 @@ def __getitem__(self, idx): return img.unsqueeze(0), target # return img: [1, 3, H, W], the first dimension means T = 1. -def make_coco_transforms(image_set, cautious): +def make_coco_transforms(image_set, cautious): normalize = T.Compose([T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]) # scales = [480, 560, 640, 720, 800] @@ -109,49 +115,33 @@ def make_coco_transforms(image_set, cautious): max_size = 1024 if image_set == "train": - return T.Compose( - [T.RandomResize(scales, max_size=max_size), - normalize] - ) + return T.Compose([T.RandomResize(scales, max_size=max_size), normalize]) else: - return T.Compose([ - T.ToTensor(), - T.Normalize( - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - ]) - + return T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) raise ValueError(f"unknown {image_set}") + from pathlib import Path + def build(image_set, args): root = Path(args.rsvg_mm_path) - assert root.exists(), f'provided rsvg_mm path {root} does not exist' - input_transform = T.Compose([ - T.ToTensor(), - T.Normalize( - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - ]) - - img_folder = 'data/rsvg_mm/images' - dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == 'test'))ssss + assert root.exists(), f"provided rsvg_mm path {root} does not exist" + input_transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + + img_folder = "data/images" # Updated to correct path + dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == "test")) return dataset + # make_coco_transforms(image_set, False) -if __name__ == '__main__': - input_transform = T.Compose([ - T.ToTensor(), - T.Normalize( - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - ]) - img_folder = '../data/rsvg_mm/images' - image_set = 'train' - dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set=='test')) +if __name__ == "__main__": + input_transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) + img_folder = "../data/rsvg_mm/images" + image_set = "train" + dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == "test")) sample_num = dataset.__len__() sample_num = dataset.__getitem__(0) print(sample_num) diff --git a/inference_rsvg.py b/inference_rsvg.py index 5759399..acedef2 100644 --- a/inference_rsvg.py +++ b/inference_rsvg.py @@ -21,18 +21,17 @@ from tools.colormap import colormap # os.environ.pop("QT_QPA_PLATFORM_PLUGIN_PATH") -os.environ["CUDA_VISIBLE_DEVICES"] = '2' +os.environ["CUDA_VISIBLE_DEVICES"] = "2" # colormap color_list = colormap() -color_list = color_list.astype('uint8').tolist() +color_list = color_list.astype("uint8").tolist() -Visualize_bbox = False #False #True +Visualize_bbox = True # False #True save_visualize_path_prefix = "test_output" version = "test" - def main(args): args.masks = False # args.batch_size == 1 @@ -44,13 +43,21 @@ def main(args): np.random.seed(seed) random.seed(seed) + # Override dataset settings for our custom setup + args.dataset_file = "rsvg_mm" + args.rsvg_mm_path = "." # Current directory contains our setup + args.visualize = True + + # Override model parameters to match the checkpoint + args.num_classes = 1 # Checkpoint has 1 class + args.num_queries = 10 # Checkpoint has 10 queries + if args.visualize: if not os.path.exists(save_visualize_path_prefix): os.makedirs(save_visualize_path_prefix) - test_dataset = build_dataset(args.dataset_file, image_set='test', args=args) - test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, - pin_memory=True, drop_last=True, num_workers=4) + test_dataset = build_dataset(args.dataset_file, image_set="test", args=args) + test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, pin_memory=True, drop_last=True, num_workers=4) # model model, criterion, _ = build_model(args) @@ -59,22 +66,23 @@ def main(args): # model_without_ddp = model n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) - print('number of params:', n_parameters) + print("number of params:", n_parameters) if args.resume: - checkpoint = torch.load(args.resume, map_location='cpu') - missing_keys, unexpected_keys = model.load_state_dict(checkpoint['model'], strict=False) - unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))] + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=False) + missing_keys, unexpected_keys = model.load_state_dict(checkpoint["model"], strict=False) + unexpected_keys = [k for k in unexpected_keys if not (k.endswith("total_params") or k.endswith("total_ops"))] if len(missing_keys) > 0: - print('Missing Keys: {}'.format(missing_keys)) + print("Missing Keys: {}".format(missing_keys)) if len(unexpected_keys) > 0: - print('Unexpected Keys: {}'.format(unexpected_keys)) + print("Unexpected Keys: {}".format(unexpected_keys)) else: - raise ValueError('Please specify the checkpoint for inference.') + raise ValueError("Please specify the checkpoint for inference.") # start inference evaluate(test_loader, model, args) + def evaluate(test_loader, model, args): batch_time = AverageMeter() acc5 = AverageMeter() @@ -91,9 +99,9 @@ def evaluate(test_loader, model, args): end = time.time() img_list = [] - count=0 + count = 0 for batch_idx, (img, targets, dw, dh, img_path, ratio) in enumerate(test_loader): - h_resize, w_resize = img.shape[ -2:] + h_resize, w_resize = img.shape[-2:] img = img.to(device) captions = targets["caption"] size = torch.as_tensor([int(h_resize), int(w_resize)]).to(device) @@ -127,12 +135,12 @@ def evaluate(test_loader, model, args): # _, max_ind = max_score.max(-1) # [1,] # which query # pred_bbox = pred_bboxes[max_ind] # [xc,yc, w_b, h_b] - #single level selection + # single level selection # according to pred_logits, select the query index pred_logits = outputs["pred_logits"][0] pred_bbox = outputs["pred_boxes"][0] pred_score = pred_logits.sigmoid() # [t, q, k] - pred_score = pred_score.squeeze(0)# [q, k] + pred_score = pred_score.squeeze(0) # [q, k] # pred_scores = pred_scores.mean(0) # [q, k] max_score, _ = pred_score.max(-1) # [q,] _, max_ind = max_score.max(-1) # [1,] # which query @@ -150,33 +158,33 @@ def evaluate(test_loader, model, args): target_bbox[1], target_bbox[3] = (target_bbox[1] - dh) / ratio, (target_bbox[3] - dh) / ratio if Visualize_bbox: - source_img = Image.open(img_path[0]).convert('RGB') # PIL image - - draw = ImageDraw.Draw(source_img) - draw_boxes = pred_bbox.tolist() - - # draw boxes - xmin, ymin, xmax, ymax = draw_boxes[0:4] - - # draw_boxes_gt = target_bbox.tolist() - # xmin_gt, ymin_gt, xmax_gt, ymax_gt = draw_boxes_gt[0:4] - - draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=tuple(color_list[9]), width=2) - # draw.rectangle(((xmin_gt, ymin_gt), (xmax_gt, ymax_gt)), outline=tuple(color_list[9]), width=2) - # fontStyle = ImageFont.truetype("SimHei.ttf", 30) - # draw.text((20, 20), captions[0], (200, 0, 0), font=fontStyle) - # save - save_visualize_path_dir = os.path.join(save_visualize_path_prefix, version) - if not os.path.exists(save_visualize_path_dir): - os.makedirs(save_visualize_path_dir) - img_name = img_path[0].split('/')[-1] - if img_name not in img_list: - img_list.append(img_name) - else: - count += 1 - img_name = str(count) + '_' + img_name - save_visualize_path = os.path.join(save_visualize_path_dir, img_name) - source_img.save(save_visualize_path) + source_img = Image.open(img_path[0]).convert("RGB") # PIL image + + draw = ImageDraw.Draw(source_img) + draw_boxes = pred_bbox.tolist() + + # draw boxes + xmin, ymin, xmax, ymax = draw_boxes[0:4] + + # draw_boxes_gt = target_bbox.tolist() + # xmin_gt, ymin_gt, xmax_gt, ymax_gt = draw_boxes_gt[0:4] + + draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=tuple(color_list[9]), width=2) + # draw.rectangle(((xmin_gt, ymin_gt), (xmax_gt, ymax_gt)), outline=tuple(color_list[9]), width=2) + # fontStyle = ImageFont.truetype("SimHei.ttf", 30) + # draw.text((20, 20), captions[0], (200, 0, 0), font=fontStyle) + # save + save_visualize_path_dir = os.path.join(save_visualize_path_prefix, version) + if not os.path.exists(save_visualize_path_dir): + os.makedirs(save_visualize_path_dir) + img_name = img_path[0].split("/")[-1] + if img_name not in img_list: + img_list.append(img_name) + else: + count += 1 + img_name = str(count) + "_" + img_name + save_visualize_path = os.path.join(save_visualize_path_dir, img_name) + source_img.save(save_visualize_path) # box iou iou, interArea, unionArea = bbox_iou(pred_bbox, target_bbox) @@ -194,7 +202,6 @@ def evaluate(test_loader, model, args): inter_area.update(cumInterArea) union_area.update(cumUnionArea) - acc5.update(accu5, img.size(0)) acc6.update(accu6, img.size(0)) acc7.update(accu7, img.size(0)) @@ -206,39 +213,68 @@ def evaluate(test_loader, model, args): end = time.time() if batch_idx % 50 == 0: - print_str = '[{0}/{1}]\t' \ - 'Time {batch_time.avg:.3f}\t' \ - 'acc@0.5: {acc5.avg:.4f}\t' \ - 'acc@0.6: {acc6.avg:.4f}\t' \ - 'acc@0.7: {acc7.avg:.4f}\t' \ - 'acc@0.8: {acc8.avg:.4f}\t' \ - 'acc@0.9: {acc9.avg:.4f}\t' \ - 'meanIoU: {meanIoU.avg:.4f}\t' \ - 'cumuIoU: {cumuIoU:.4f}\t' \ - .format( \ - batch_idx, len(test_loader), batch_time=batch_time, \ - acc5=acc5, acc6=acc6, acc7=acc7, acc8=acc8, acc9=acc9, \ - meanIoU=meanIoU, cumuIoU=inter_area.sum / union_area.sum) + print_str = ( + "[{0}/{1}]\t" + "Time {batch_time.avg:.3f}\t" + "acc@0.5: {acc5.avg:.4f}\t" + "acc@0.6: {acc6.avg:.4f}\t" + "acc@0.7: {acc7.avg:.4f}\t" + "acc@0.8: {acc8.avg:.4f}\t" + "acc@0.9: {acc9.avg:.4f}\t" + "meanIoU: {meanIoU.avg:.4f}\t" + "cumuIoU: {cumuIoU:.4f}\t".format( + batch_idx, + len(test_loader), + batch_time=batch_time, + acc5=acc5, + acc6=acc6, + acc7=acc7, + acc8=acc8, + acc9=acc9, + meanIoU=meanIoU, + cumuIoU=inter_area.sum / (union_area.sum + 1e-6), + ) + ) print(print_str) # logging.info(print_str) - final_str = 'acc@0.5: {acc5.avg:.4f}\t' 'acc@0.6: {acc6.avg:.4f}\t' 'acc@0.7: {acc7.avg:.4f}\t' \ - 'acc@0.8: {acc8.avg:.4f}\t' 'acc@0.9: {acc9.avg:.4f}\t' \ - 'meanIoU: {meanIoU.avg:.4f}\t' 'cumuIoU: {cumuIoU:.4f}\t' \ - .format(acc5=acc5, acc6=acc6, acc7=acc7, acc8=acc8, acc9=acc9, \ - meanIoU=meanIoU, cumuIoU=inter_area.sum / union_area.sum) + final_str = ( + "acc@0.5: {acc5.avg:.4f}\t" + "acc@0.6: {acc6.avg:.4f}\t" + "acc@0.7: {acc7.avg:.4f}\t" + "acc@0.8: {acc8.avg:.4f}\t" + "acc@0.9: {acc9.avg:.4f}\t" + "meanIoU: {meanIoU.avg:.4f}\t" + "cumuIoU: {cumuIoU:.4f}\t".format( + acc5=acc5, + acc6=acc6, + acc7=acc7, + acc8=acc8, + acc9=acc9, + meanIoU=meanIoU, + cumuIoU=inter_area.sum / (union_area.sum + 1e-6), + ) + ) print(final_str) print(version) - - def bbox_iou(box1, box2): """ Returns the IoU of two bounding boxes """ # Get the coordinates of bounding boxes - b1_x1, b1_y1, b1_x2, b1_y2 = torch.tensor(box1[0]), torch.tensor(box1[1]), torch.tensor(box1[2]), torch.tensor(box1[3]) - b2_x1, b2_y1, b2_x2, b2_y2 = torch.tensor(box2[0]), torch.tensor(box2[1]), torch.tensor(box2[2]), torch.tensor(box2[3]) + b1_x1, b1_y1, b1_x2, b1_y2 = ( + torch.tensor(box1[0]), + torch.tensor(box1[1]), + torch.tensor(box1[2]), + torch.tensor(box1[3]), + ) + b2_x1, b2_y1, b2_x2, b2_y2 = ( + torch.tensor(box2[0]), + torch.tensor(box2[1]), + torch.tensor(box2[2]), + torch.tensor(box2[3]), + ) # get the coordinates of the intersection rectangle @@ -255,11 +291,11 @@ def bbox_iou(box1, box2): return (inter_area + 1e-6) / (union_area + 1e-6), inter_area, union_area + # visuaize functions def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(0) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), - (x_c + 0.5 * w), (y_c + 0.5 * h)] + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return torch.stack(b, dim=0) @@ -288,15 +324,14 @@ def draw_sample_points(draw, sample_points, img_size, color_list): x, y = sample cur_color = color_list[i % len(color_list)][::-1] cur_color += [alpha] - draw.ellipse((x - 2, y - 2, x + 2, y + 2), - fill=tuple(cur_color), outline=tuple(cur_color), width=1) + draw.ellipse((x - 2, y - 2, x + 2, y + 2), fill=tuple(cur_color), outline=tuple(cur_color), width=1) def vis_add_mask(img, mask, color): - origin_img = np.asarray(img.convert('RGB')).copy() + origin_img = np.asarray(img.convert("RGB")).copy() color = np.array(color) - mask = mask.reshape(mask.shape[0], mask.shape[1]).astype('uint8') # np + mask = mask.reshape(mask.shape[0], mask.shape[1]).astype("uint8") # np mask = mask > 0.5 origin_img[mask] = origin_img[mask] * 0.5 + color * 0.5 @@ -304,7 +339,7 @@ def vis_add_mask(img, mask, color): return origin_img -if __name__ == '__main__': - parser = argparse.ArgumentParser('Refer_RSVG inference script', parents=[opts.get_args_parser()]) +if __name__ == "__main__": + parser = argparse.ArgumentParser("Refer_RSVG inference script", parents=[opts.get_args_parser()]) args = parser.parse_args() main(args) diff --git a/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO b/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO index 18ccefb..4a05fd2 100644 --- a/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO +++ b/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO @@ -1,11 +1,9 @@ -Metadata-Version: 2.1 +Metadata-Version: 2.4 Name: MultiScaleDeformableAttention Version: 1.0 -Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention +Summary: PyTorch Wrapper for Multi-Scale Deformable Attention (CPU-only) Home-page: https://github.com/fundamentalvision/Deformable-DETR Author: Weijie Su -License: UNKNOWN -Platform: UNKNOWN - -UNKNOWN - +Dynamic: author +Dynamic: home-page +Dynamic: summary diff --git a/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt b/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt index 1bb003a..24e757e 100644 --- a/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt +++ b/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt @@ -1,7 +1,6 @@ setup.py -/data/users/lanmeng/ReferFormer/models/ops/src/vision.cpp -/data/users/lanmeng/ReferFormer/models/ops/src/cpu/ms_deform_attn_cpu.cpp -/data/users/lanmeng/ReferFormer/models/ops/src/cuda/ms_deform_attn_cuda.cu +C:/Users/jd138001/Downloads/LQVG/models/ops/src/vision.cpp +C:/Users/jd138001/Downloads/LQVG/models/ops/src/cpu/ms_deform_attn_cpu.cpp MultiScaleDeformableAttention.egg-info/PKG-INFO MultiScaleDeformableAttention.egg-info/SOURCES.txt MultiScaleDeformableAttention.egg-info/dependency_links.txt diff --git a/models/ops/build/lib.win-amd64-cpython-39/functions/__init__.py b/models/ops/build/lib.win-amd64-cpython-39/functions/__init__.py new file mode 100644 index 0000000..8a2197b --- /dev/null +++ b/models/ops/build/lib.win-amd64-cpython-39/functions/__init__.py @@ -0,0 +1,10 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +from .ms_deform_attn_func import MSDeformAttnFunction + diff --git a/models/ops/build/lib.win-amd64-cpython-39/functions/ms_deform_attn_func.py b/models/ops/build/lib.win-amd64-cpython-39/functions/ms_deform_attn_func.py new file mode 100644 index 0000000..8c5df8c --- /dev/null +++ b/models/ops/build/lib.win-amd64-cpython-39/functions/ms_deform_attn_func.py @@ -0,0 +1,61 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import torch +import torch.nn.functional as F +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +import MultiScaleDeformableAttention as MSDA + + +class MSDeformAttnFunction(Function): + @staticmethod + def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): + ctx.im2col_step = im2col_step + output = MSDA.ms_deform_attn_forward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = \ + MSDA.ms_deform_attn_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): + # for debug and test only, + # need to use cuda version instead + N_, S_, M_, D_ = value.shape + _, Lq_, M_, L_, P_, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for lid_, (H_, W_) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, + mode='bilinear', padding_mode='zeros', align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) + output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) + return output.transpose(1, 2).contiguous() diff --git a/models/ops/build/lib.win-amd64-cpython-39/modules/__init__.py b/models/ops/build/lib.win-amd64-cpython-39/modules/__init__.py new file mode 100644 index 0000000..f82cb1a --- /dev/null +++ b/models/ops/build/lib.win-amd64-cpython-39/modules/__init__.py @@ -0,0 +1,9 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +from .ms_deform_attn import MSDeformAttn diff --git a/models/ops/build/lib.win-amd64-cpython-39/modules/ms_deform_attn.py b/models/ops/build/lib.win-amd64-cpython-39/modules/ms_deform_attn.py new file mode 100644 index 0000000..7efcf17 --- /dev/null +++ b/models/ops/build/lib.win-amd64-cpython-39/modules/ms_deform_attn.py @@ -0,0 +1,117 @@ +# Modify for sample points visualization +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import warnings +import math + +import torch +from torch import nn +import torch.nn.functional as F +from torch.nn.init import xavier_uniform_, constant_ + +from ..functions import MSDeformAttnFunction + + +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n-1) == 0) and n != 0 + + +class MSDeformAttn(nn.Module): + def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): + """ + Multi-Scale Deformable Attention Module + :param d_model hidden dimension + :param n_levels number of feature levels + :param n_heads number of attention heads + :param n_points number of sampling points per attention head per feature level + """ + super().__init__() + if d_model % n_heads != 0: + raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) + _d_per_head = d_model // n_heads + # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation + if not _is_power_of_2(_d_per_head): + warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " + "which is more efficient in our CUDA implementation.") + + self.im2col_step = 64 + + self.d_model = d_model + self.n_levels = n_levels + self.n_heads = n_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) + self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) + self.value_proj = nn.Linear(d_model, d_model) + self.output_proj = nn.Linear(d_model, d_model) + + self._reset_parameters() + + def _reset_parameters(self): + constant_(self.sampling_offsets.weight.data, 0.) + thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.) + constant_(self.attention_weights.bias.data, 0.) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.) + + def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): + """ + :param query (N, Length_{query}, C) + :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area + or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes + :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) + :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements + + :return output (N, Length_{query}, C) + """ + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = value.masked_fill(input_padding_mask[..., None], float(0)) + value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) + attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) + attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) + # N, Len_q, n_heads, n_levels, n_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + else: + raise ValueError( + 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) + output = MSDeformAttnFunction.apply( + value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) + output = self.output_proj(output) + + return output, sampling_locations, attention_weights diff --git a/models/ops/functions/ms_deform_attn_func.py b/models/ops/functions/ms_deform_attn_func.py index 8c5df8c..83de6cc 100644 --- a/models/ops/functions/ms_deform_attn_func.py +++ b/models/ops/functions/ms_deform_attn_func.py @@ -22,8 +22,13 @@ class MSDeformAttnFunction(Function): @staticmethod def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): ctx.im2col_step = im2col_step - output = MSDA.ms_deform_attn_forward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) + # Check if we're on CPU and use PyTorch implementation + if value.is_cuda: + output = MSDA.ms_deform_attn_forward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) + else: + # Use CPU implementation + output = ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @@ -31,9 +36,15 @@ def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_ @once_differentiable def backward(ctx, grad_output): value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = \ - MSDA.ms_deform_attn_backward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) + if value.is_cuda: + grad_value, grad_sampling_loc, grad_attn_weight = \ + MSDA.ms_deform_attn_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) + else: + # For CPU, return None gradients (simplified for inference) + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None diff --git a/models/ops/setup.py b/models/ops/setup.py index a0131bc..7d63465 100644 --- a/models/ops/setup.py +++ b/models/ops/setup.py @@ -11,41 +11,25 @@ import torch -from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension -from torch.utils.cpp_extension import CUDAExtension - from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] + def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) - source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) - sources = main_file + source_cpu + extension = CppExtension - extra_compile_args = {"cxx": []} + extra_compile_args = {"cxx": []} # MSVC-compatible, no extra flags define_macros = [] - if torch.cuda.is_available() and CUDA_HOME is not None: - extension = CUDAExtension - sources += source_cuda - define_macros += [("WITH_CUDA", None)] - extra_compile_args["nvcc"] = [ - "-DCUDA_HAS_FP16=1", - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - ] - else: - raise NotImplementedError('Cuda is not availabel') - sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ @@ -59,13 +43,19 @@ def get_extensions(): ] return ext_modules + setup( name="MultiScaleDeformableAttention", version="1.0", author="Weijie Su", url="https://github.com/fundamentalvision/Deformable-DETR", - description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", - packages=find_packages(exclude=("configs", "tests",)), + description="PyTorch Wrapper for Multi-Scale Deformable Attention (CPU-only)", + packages=find_packages( + exclude=( + "configs", + "tests", + ) + ), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) diff --git a/models/ops/src/cpu/ms_deform_attn_cpu.cpp b/models/ops/src/cpu/ms_deform_attn_cpu.cpp index e1bf854..d0f7ce0 100644 --- a/models/ops/src/cpu/ms_deform_attn_cpu.cpp +++ b/models/ops/src/cpu/ms_deform_attn_cpu.cpp @@ -9,10 +9,9 @@ */ #include - #include -#include +// Removed at::Tensor ms_deform_attn_cpu_forward( @@ -23,7 +22,7 @@ ms_deform_attn_cpu_forward( const at::Tensor &attn_weight, const int im2col_step) { - AT_ERROR("Not implement on cpu"); + AT_ERROR("Not implemented on CPU"); } std::vector @@ -36,6 +35,5 @@ ms_deform_attn_cpu_backward( const at::Tensor &grad_output, const int im2col_step) { - AT_ERROR("Not implement on cpu"); -} - + AT_ERROR("Not implemented on CPU"); +} \ No newline at end of file diff --git a/test.sh b/test.sh index 3cd10a1..30830df 100644 --- a/test.sh +++ b/test.sh @@ -1,5 +1,5 @@ #python3 inference_rsvg.py --dataset_file rsvg --num_queries 10 --with_box_refine --binary --freeze_text_encoder \ -#--resume rsvg_dirs/r50_bidrection_fusion_10query/checkpoint.pth --backbone resnet50 - +#--resume rsvg_dirs/r50_bidrection_fusion_10query/checkpoint.pth --backbone resnet50 --device cpu python3 inference_rsvg.py --dataset_file rsvg_mm --num_queries 10 --with_box_refine --binary --freeze_text_encoder \ ---resume rsvg_mm_dirs/r50_bidrection_fusion_10query_70epo/checkpoint.pth --backbone resnet50 \ No newline at end of file +--resume rsvg_mm_dirs/r50_bidrection_fusion_10query_70epo/checkpoint.pth --backbone resnet50 --device cpu + From a51d1232c3648503eedb0608aa5ffce641f5fe3f Mon Sep 17 00:00:00 2001 From: Mohammadreza Haghighat Date: Thu, 7 Aug 2025 11:16:14 +1000 Subject: [PATCH 2/4] Added comments --- datasets/refexp.py | 108 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 14 deletions(-) diff --git a/datasets/refexp.py b/datasets/refexp.py index ee4cac8..a6c7bfa 100644 --- a/datasets/refexp.py +++ b/datasets/refexp.py @@ -4,72 +4,130 @@ COCO dataset which returns image_id for evaluation. Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py """ -from pathlib import Path +# Import standard and third-party libraries. +from pathlib import Path import torch import torch.utils.data import torchvision from pycocotools import mask as coco_mask +# Import project-specific transformation functions. import datasets.transforms_image as T class ModulatedDetection(torchvision.datasets.CocoDetection): + """ + A custom dataset class that extends torchvision's CocoDetection. + It's designed for referring expression tasks, where each image is associated with a text caption. + It also ensures that every item returned has at least one valid object instance after augmentations. + """ + def __init__(self, img_folder, ann_file, transforms, return_masks): + """ + Initializes the dataset. + Args: + img_folder (str): Path to the folder containing images. + ann_file (str): Path to the COCO-style annotation JSON file. + transforms (callable): A function/transform that takes in an image and a target and returns a transformed version. + return_masks (bool): If True, segmentation masks are returned for each object. + """ + # Initialize the parent CocoDetection class. super(ModulatedDetection, self).__init__(img_folder, ann_file) + # Store the augmentation transforms. self._transforms = transforms + # Create an instance of a helper class to process COCO annotations. self.prepare = ConvertCocoPolysToMask(return_masks) def __getitem__(self, idx): + """ + Retrieves an item from the dataset at the given index. + This method includes a loop to ensure that a valid sample (with at least one object) is returned, + even if data augmentation crops away all objects. + """ instance_check = False + # Loop until a valid sample with at least one object instance is found. while not instance_check: + # Get the raw image and annotations from the parent class. img, target = super(ModulatedDetection, self).__getitem__(idx) + # Get the unique image ID for the current sample. image_id = self.ids[idx] + # Load the full COCO image metadata. coco_img = self.coco.loadImgs(image_id)[0] + # Extract the referring expression (caption) from the metadata. caption = coco_img["caption"] + # Extract the dataset name if it exists. dataset_name = coco_img["dataset_name"] if "dataset_name" in coco_img else None + # Prepare the initial target dictionary. target = {"image_id": image_id, "annotations": target, "caption": caption} + # Use the 'prepare' helper to convert annotations into tensors (boxes, masks, etc.). img, target = self.prepare(img, target) + # Apply data augmentations if any are defined. if self._transforms is not None: img, target = self._transforms(img, target) + # Add the dataset name back to the final target. target["dataset_name"] = dataset_name + # Add any other important metadata from the COCO annotations to the target. for extra_key in ["sentence_id", "original_img_id", "original_id", "task_id"]: if extra_key in coco_img: - target[extra_key] = coco_img[extra_key] # box xyxy -> cxcywh - # FIXME: handle "valid", since some box may be removed due to random crop + target[extra_key] = coco_img[extra_key] + + # Check if any valid bounding boxes remain after augmentations (e.g., random cropping). + # A sample is valid if it has at least one box. target["valid"] = torch.tensor([1]) if len(target["area"]) != 0 else torch.tensor([0]) - if torch.any(target['valid'] == 1): # at leatst one instance + # If the sample has at least one valid instance, exit the loop. + if torch.any(target["valid"] == 1): instance_check = True else: + # If augmentations removed all objects, pick a new random sample and try again. import random + idx = random.randint(0, self.__len__() - 1) + + # Add a temporal dimension (T=1) to the image tensor to make it compatible with video models. + # Final image shape: [1, 3, H, W]. return img.unsqueeze(0), target - # return img: [1, 3, H, W], the first dimension means T = 1. def convert_coco_poly_to_mask(segmentations, height, width): + """ + Helper function to convert COCO's polygon segmentation format into a tensor of binary masks. + """ masks = [] + # Iterate over each object's segmentation data. for polygons in segmentations: + # Convert polygon coordinates to Run-Length Encoding (RLE) format. rles = coco_mask.frPyObjects(polygons, height, width) + # Decode RLE to get a binary mask. mask = coco_mask.decode(rles) + # Ensure the mask has a channel dimension. if len(mask.shape) < 3: mask = mask[..., None] mask = torch.as_tensor(mask, dtype=torch.uint8) + # Merge masks for multi-part objects into a single mask. mask = mask.any(dim=2) masks.append(mask) if masks: + # Stack all individual masks into a single tensor. masks = torch.stack(masks, dim=0) else: + # If there are no masks, return an empty tensor with the correct shape. masks = torch.zeros((0, height, width), dtype=torch.uint8) return masks class ConvertCocoPolysToMask(object): + """ + A callable class that acts as a transform. It converts raw COCO annotations + into a clean dictionary of tensors (boxes, labels, masks) that the model can use. + """ + def __init__(self, return_masks=False): self.return_masks = return_masks def __call__(self, image, target): + # Get image dimensions. w, h = image.size image_id = target["image_id"] @@ -78,29 +136,34 @@ def __call__(self, image, target): anno = target["annotations"] caption = target["caption"] if "caption" in target else None + # Filter out "crowd" annotations, which are large groups of objects. anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] + # Extract bounding boxes and convert from [x, y, w, h] to [x1, y1, x2, y2] format. boxes = [obj["bbox"] for obj in anno] - # guard against no boxes via resizing boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) - boxes[:, 2:] += boxes[:, :2] # xminyminwh -> xyxy + boxes[:, 2:] += boxes[:, :2] + # Clamp box coordinates to be within the image boundaries. boxes[:, 0::2].clamp_(min=0, max=w) boxes[:, 1::2].clamp_(min=0, max=h) + # Extract class labels. classes = [obj["category_id"] for obj in anno] classes = torch.tensor(classes, dtype=torch.int64) + # If requested, convert segmentation polygons to binary masks. if self.return_masks: segmentations = [obj["segmentation"] for obj in anno] masks = convert_coco_poly_to_mask(segmentations, h, w) - # keep the valid boxes + # Remove any boxes that have zero width or height after clamping. keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) boxes = boxes[keep] classes = classes[keep] if self.return_masks: masks = masks[keep] + # Assemble the final target dictionary. target = {} target["boxes"] = boxes target["labels"] = classes @@ -110,32 +173,41 @@ def __call__(self, image, target): target["masks"] = masks target["image_id"] = image_id - # for conversion to coco api + # Add other useful metadata for evaluation. area = torch.tensor([obj["area"] for obj in anno]) iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) target["area"] = area[keep] target["iscrowd"] = iscrowd[keep] - target["valid"] = torch.tensor([1]) + target["valid"] = torch.tensor([1]) # Mark as valid since we've processed it. target["orig_size"] = torch.as_tensor([int(h), int(w)]) target["size"] = torch.as_tensor([int(h), int(w)]) return image, target def make_coco_transforms(image_set, cautious): - + """ + Creates a pipeline of data augmentations for training or validation. + """ + # Define the standard normalization transform. normalize = T.Compose([T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]) + # Define scales for resizing augmentations. scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768] - final_scales = [296, 328, 360, 392, 416, 448, 480, 512] - + final_scales = [296, 328, 360, 392, 416, 448, 480, 512] max_size = 800 + + # Define the augmentation pipeline for the training set. if image_set == "train": + # Optionally add horizontal flipping. horizontal = [] if cautious else [T.RandomHorizontalFlip()] return T.Compose( horizontal + [ + # Randomly select one of two augmentation strategies. T.RandomSelect( + # Strategy 1: Simple random resizing. T.RandomResize(scales, max_size=max_size), + # Strategy 2: A more complex combination of resizing and cropping. T.Compose( [ T.RandomResize([400, 500, 600]), @@ -148,9 +220,11 @@ def make_coco_transforms(image_set, cautious): ] ) + # Define the augmentation pipeline for the validation set. if image_set == "val": return T.Compose( [ + # Simple resizing and normalization. T.RandomResize([360], max_size=640), normalize, ] @@ -160,20 +234,26 @@ def make_coco_transforms(image_set, cautious): def build(dataset_file, image_set, args): + """ + The main factory function to build the referring expression dataset. + """ + # Get the root path of the COCO dataset. root = Path(args.coco_path) assert root.exists(), f"provided COCO path {root} does not exist" mode = "instances" dataset = dataset_file + # Define the paths to the image folders and annotation files for train/val splits. PATHS = { "train": (root / "train2014", root / dataset / f"{mode}_{dataset}_train.json"), "val": (root / "train2014", root / dataset / f"{mode}_{dataset}_val.json"), } img_folder, ann_file = PATHS[image_set] + # Instantiate the ModulatedDetection dataset with the appropriate transforms. dataset = ModulatedDetection( img_folder, ann_file, transforms=make_coco_transforms(image_set, False), return_masks=args.masks, ) - return dataset \ No newline at end of file + return dataset From b11c62407e386917dccd4427ba0679ba13494728 Mon Sep 17 00:00:00 2001 From: Mohammadreza Haghighat <62746461+MHaghighat98@users.noreply.github.com> Date: Wed, 27 Aug 2025 09:32:49 +1000 Subject: [PATCH 3/4] Colab-ready test with custom dataset --- LQVG_test.ipynb | 300 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 300 insertions(+) create mode 100644 LQVG_test.ipynb diff --git a/LQVG_test.ipynb b/LQVG_test.ipynb new file mode 100644 index 0000000..0496f05 --- /dev/null +++ b/LQVG_test.ipynb @@ -0,0 +1,300 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "## **The modified source code compatible with colab (gdown does not work for downloading):**" + ], + "metadata": { + "id": "kTnynktRAMnJ" + } + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "id": "WixriWROqUUx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## **LQVG weights trained on DIOR-RSVG and RSVG-HR**" + ], + "metadata": { + "id": "JVkx_5-QFK4o" + } + }, + { + "cell_type": "code", + "source": [ + "!gdown --folder https://drive.google.com/drive/folders/1uC9TAPOwiIbHcee6hSO_3b2Mwr-zDGtg?usp=drive_link -O drive/MyDrive/weights" + ], + "metadata": { + "id": "dlUmSOwieC9o" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "cd /content/drive/.shortcut-targets-by-id/18OjYDfO70rO2e-oLMPmNbu2Z1WH7nppd/LQVG\n" + ], + "metadata": { + "id": "KddNyUyDwvP6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install torch torchvision torchaudio\n" + ], + "metadata": { + "id": "gmmmP7yqvVf7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install -r requirements.txt\n" + ], + "metadata": { + "id": "NVWRfbjfwUYx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install 'git+https://github.com/facebookresearch/fvcore'\n" + ], + "metadata": { + "id": "3MbdnvA9sKI_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'\n" + ], + "metadata": { + "id": "nvEpgsttsY3L" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "cd /content/drive/.shortcut-targets-by-id/18OjYDfO70rO2e-oLMPmNbu2Z1WH7nppd/LQVG/models/ops" + ], + "metadata": { + "id": "tUCBNrYGshhy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!python setup.py build install" + ], + "metadata": { + "id": "ENyTettCskxg" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "cd ../.." + ], + "metadata": { + "id": "bU-A2IBKsmo5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Inference:" + ], + "metadata": { + "id": "7g2sVjoVFQiJ" + } + }, + { + "cell_type": "code", + "source": [ + "!python3 inference_rsvg.py --dataset_file rsvg_mm --num_queries 10 --with_box_refine --binary --freeze_text_encoder \\\n", + "--resume /content/drive/MyDrive/weights/RSVG-HR/checkpoint.pth --backbone resnet50" + ], + "metadata": { + "id": "x3dfhLHB9nET" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## **Visualize some results:**" + ], + "metadata": { + "id": "TQpWSEAMIQwF" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from pathlib import Path\n", + "import math\n", + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# --- Config ---\n", + "IDS = [120, 159, 316, 328, 377, 502, 507, 510]\n", + "folder = \"/content/drive/MyDrive/LQVG/test_output/test\" # Input folder\n", + "save_path = \"/content/drive/MyDrive/grid/grid.png\" # Output file\n", + "cols = 4 # Grid columns\n", + "\n", + "# --- Ensure save directory exists ---\n", + "os.makedirs(os.path.dirname(save_path), exist_ok=True)\n", + "\n", + "# --- Functions ---\n", + "def load_images(root, ids):\n", + " \"\"\"Load images by IDs from a folder.\"\"\"\n", + " imgs = []\n", + " for i in ids:\n", + " name = f\"UASs_{i}.jpg\"\n", + " path = Path(root) / name\n", + " if not path.is_file():\n", + " print(f\"❌ Missing: {path}\")\n", + " continue\n", + " imgs.append(Image.open(path).convert(\"RGB\"))\n", + " print(f\"✅ Loaded: {path}\")\n", + " return imgs\n", + "\n", + "def show_grid(images, cols=4, save=None):\n", + " \"\"\"Display images in a grid and optionally save.\"\"\"\n", + " if not images:\n", + " print(\"⚠ No images to display.\")\n", + " return\n", + "\n", + " rows = math.ceil(len(images) / cols)\n", + "\n", + " # Scale figure size based on image dimensions\n", + " img_width, img_height = images[0].size\n", + " fig_width = cols * img_width / 100\n", + " fig_height = rows * img_height / 100\n", + "\n", + " plt.figure(figsize=(fig_width, fig_height))\n", + " for idx, img in enumerate(images, 1):\n", + " ax = plt.subplot(rows, cols, idx)\n", + " ax.imshow(img)\n", + " ax.axis(\"off\")\n", + "\n", + " if save:\n", + " plt.savefig(save, dpi=150, bbox_inches='tight', pad_inches=0)\n", + " print(f\"💾 Saved grid to: {save}\")\n", + "\n", + " plt.show()\n", + "\n", + "# --- Main ---\n", + "imgs = load_images(folder, IDS)\n", + "show_grid(imgs, cols=cols, save=save_path)\n" + ], + "metadata": { + "id": "jprFeczaICMm" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Training:" + ], + "metadata": { + "id": "bcyW0BLPFWCE" + } + }, + { + "cell_type": "code", + "source": [ + "!wget -O datasets/coco_eval.py https://raw.githubusercontent.com/facebookresearch/detr/main/datasets/coco_eval.py" + ], + "metadata": { + "id": "Ug4WCa_WW42F" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!python main.py \\\n", + " --dataset_file rsvg_mm \\\n", + " --rsvg_mm_path \"/content/drive/MyDrive/data\" \\\n", + " --binary \\\n", + " --with_box_refine \\\n", + " --batch_size 1 \\\n", + " --num_frames 1 \\\n", + " --epochs 70 \\\n", + " --lr_drop 40 \\\n", + " --num_queries 10 \\\n", + " --output_dir \"/content/drive/MyDrive/rsvg_dirs/r50_bidrection_fusion_10query_70epo_multiscale\" \\\n", + " --backbone resnet50 \\\n", + " --device cuda" + ], + "metadata": { + "id": "dcooaiTTVGF4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "jupyter nbconvert --to pdf LQVG-test.ipynb\n" + ], + "metadata": { + "id": "XQfZsqqFWAVd" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From 184d70d96a2dee2d5a7cf071c8806891bf6ca165 Mon Sep 17 00:00:00 2001 From: Mohammadreza Haghighat <62746461+MHaghighat98@users.noreply.github.com> Date: Thu, 11 Sep 2025 10:11:52 +1000 Subject: [PATCH 4/4] Resolved conflict in the "build" function --- datasets/rsvg_mm.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/datasets/rsvg_mm.py b/datasets/rsvg_mm.py index fca23cb..833726b 100644 --- a/datasets/rsvg_mm.py +++ b/datasets/rsvg_mm.py @@ -126,13 +126,17 @@ def make_coco_transforms(image_set, cautious): from pathlib import Path -def build(image_set, args): - root = Path(args.rsvg_mm_path) - assert root.exists(), f"provided rsvg_mm path {root} does not exist" - input_transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) - - img_folder = "data/images" # Updated to correct path - dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == "test")) +def build(image_set, args): + assert root.exists(), f'provided rsvg_mm path {root} does not exist' + input_transform = T.Compose([ + T.ToTensor(), + T.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + img_folder = 'data/rsvg_mm/images' + dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == 'test')) return dataset