From 85b14bb8b7cdc1518b714e3a1f8b9e5c50623114 Mon Sep 17 00:00:00 2001
From: Mohammadreza Haghighat <mr.haghighat1998@gmail.com>
Date: Thu, 7 Aug 2025 09:52:22 +1000
Subject: [PATCH 1/4] Modified for CPU-only usage, changed library versions,
 test with new dataset

---
 .vscode/c_cpp_properties.json                 |  21 ++
 .vscode/settings.json                         |   9 +
 RSVG-HR/Annotations/rsvg_hr_test_10.txt       |  28 ++
 datasets/__init__.py                          |  27 ++
 datasets/concat_dataset.py                    |  43 ++-
 datasets/image_to_seq_augmenter.py            | 147 +++++++--
 datasets/refer.py                             | 294 +++++++++++-------
 datasets/refexp_eval.py                       |  73 ++++-
 datasets/rsvg_mm.py                           |  72 ++---
 inference_rsvg.py                             | 187 ++++++-----
 .../PKG-INFO                                  |  12 +-
 .../SOURCES.txt                               |   5 +-
 .../functions/__init__.py                     |  10 +
 .../functions/ms_deform_attn_func.py          |  61 ++++
 .../modules/__init__.py                       |   9 +
 .../modules/ms_deform_attn.py                 | 117 +++++++
 models/ops/functions/ms_deform_attn_func.py   |  21 +-
 models/ops/setup.py                           |  32 +-
 models/ops/src/cpu/ms_deform_attn_cpu.cpp     |  10 +-
 test.sh                                       |   6 +-
 20 files changed, 873 insertions(+), 311 deletions(-)
 create mode 100644 .vscode/c_cpp_properties.json
 create mode 100644 .vscode/settings.json
 create mode 100644 RSVG-HR/Annotations/rsvg_hr_test_10.txt
 create mode 100644 models/ops/build/lib.win-amd64-cpython-39/functions/__init__.py
 create mode 100644 models/ops/build/lib.win-amd64-cpython-39/functions/ms_deform_attn_func.py
 create mode 100644 models/ops/build/lib.win-amd64-cpython-39/modules/__init__.py
 create mode 100644 models/ops/build/lib.win-amd64-cpython-39/modules/ms_deform_attn.py

diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
new file mode 100644
index 0000000..79ddb0c
--- /dev/null
+++ b/.vscode/c_cpp_properties.json
@@ -0,0 +1,21 @@
+{
+    "configurations": [
+        {
+            "name": "Win32",
+            "includePath": [
+                "${workspaceFolder}/**"
+            ],
+            "defines": [
+                "_DEBUG",
+                "UNICODE",
+                "_UNICODE"
+            ],
+            "windowsSdkVersion": "10.0.26100.0",
+            "compilerPath": "cl.exe",
+            "cStandard": "c17",
+            "cppStandard": "c++17",
+            "intelliSenseMode": "windows-msvc-x64"
+        }
+    ],
+    "version": 4
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..083f60b
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,9 @@
+{
+    "python-envs.pythonProjects": [
+        {
+            "path": "",
+            "envManager": "ms-python.python:conda",
+            "packageManager": "ms-python.python:conda"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/RSVG-HR/Annotations/rsvg_hr_test_10.txt b/RSVG-HR/Annotations/rsvg_hr_test_10.txt
new file mode 100644
index 0000000..9f57323
--- /dev/null
+++ b/RSVG-HR/Annotations/rsvg_hr_test_10.txt
@@ -0,0 +1,28 @@
+UASs_120.jpg,2671.0,1921.0,2814.0,2104.0,Compact square-like cluster with roughly symmetrical slightly irregular outline, weed: erigeron canadensis, middle-center of lentil crop field
+UASs_121.jpg,2195.0,2413.0,2568.0,2784.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, bottom-center of lentil crop field
+UASs_123.jpg,1656.0,683.0,2047.0,1052.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-left of lentil crop field
+UASs_124.jpg,2269.0,1248.0,2616.0,1695.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, middle-center of lentil crop field
+UASs_137.jpg,663.0,3508.0,985.0,3643.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, bottom-left of lentil crop field
+UASs_157.jpg,2665.0,13.0,2985.0,481.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, top-center of lentil crop field
+UASs_159.jpg,2442.0,3.0,2545.0,132.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-center of lentil crop field
+UASs_180.jpg,498.0,3132.0,877.0,3366.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, bottom-left of lentil crop field
+UASs_181.jpg,153.0,3197.0,413.0,3479.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, bottom-left of lentil crop field
+UASs_224.jpg,4683.0,528.0,4937.0,856.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, top-right of lentil crop field
+UASs_236.jpg,1347.0,2371.0,1571.0,2615.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, bottom-left of lentil crop field
+UASs_237.jpg,396.0,2163.0,755.0,2512.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, middle-left of lentil crop field
+UASs_288.jpg,576.0,2168.0,669.0,2284.0,Compact square-like cluster with roughly symmetrical slightly irregular outline, weed: erigeron canadensis, middle-left of lentil crop field
+UASs_316.jpg,539.0,3.0,740.0,279.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-left of lentil crop field
+UASs_328.jpg,1772.0,3136.0,2140.0,3537.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, bottom-left of lentil crop field
+UASs_377.jpg,1438.0,1725.0,1641.0,1936.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, middle-left of lentil crop field
+UASs_496.jpg,17.0,64.0,223.0,282.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-left of lentil crop field
+UASs_502.jpg,2349.0,1193.0,2516.0,1384.0,Compact square-like cluster with roughly symmetrical slightly irregular outline, weed: erigeron canadensis, middle-center of lentil crop field
+UASs_503.jpg,2767.0,524.0,3121.0,916.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, top-center of lentil crop field
+UASs_504.jpg,2858.0,728.0,3325.0,1225.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, top-center of lentil crop field
+UASs_505.jpg,2813.0,1574.0,3069.0,1894.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, middle-center of lentil crop field
+UASs_506.jpg,1917.0,9.0,2100.0,162.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-center of lentil crop field
+UASs_507.jpg,2219.0,360.0,2389.0,570.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, top-center of lentil crop field
+UASs_509.jpg,2428.0,7.0,2790.0,320.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-center of lentil crop field
+UASs_510.jpg,2221.0,1071.0,2372.0,1229.0,Compact square-like cluster with roughly symmetrical slightly irregular outline, weed: erigeron canadensis, top-center of lentil crop field
+UASs_660.jpg,1679.0,668.0,2066.0,1052.0,Moderately sized square-like cluster with irregular sprawling outline, weed: bassia scoparia, top-left of lentil crop field
+UASs_661.jpg,1927.0,2768.0,2368.0,3189.0,Medium-sized square-like cluster with irregular somewhat elongated outline, weed: amaranthus tuberculatus, bottom-center of lentil crop field
+UASs_93.jpg,166.0,2184.0,417.0,2396.0,Medium-sized square-like cluster with broadly irregular potentially sprawling outline, weed: ambrosia artemisiifolia, middle-left of lentil crop field
\ No newline at end of file
diff --git a/datasets/__init__.py b/datasets/__init__.py
index 31b7a0b..b77ddae 100644
--- a/datasets/__init__.py
+++ b/datasets/__init__.py
@@ -1,24 +1,51 @@
+# Import necessary PyTorch and Torchvision libraries.
 import torch.utils.data
 import torchvision
 
+# Import the specific 'build' functions from other dataset files in this package.
+# Each 'build' function is responsible for creating a specific dataset instance.
+# They are renamed to avoid naming conflicts.
 from .rsvg import build as build_rsvg
 from .rsvg_mm import build as build_rsvg_mm
 
 
 def get_coco_api_from_dataset(dataset):
+    """
+    Helper function to retrieve the COCO API object from a dataset.
+    Some datasets might be wrapped in other PyTorch dataset classes like `Subset`.
+    This function iteratively unwraps the dataset to find the base COCO object.
+    """
+    # Loop to handle nested datasets (e.g., a dataset wrapped in multiple `Subset` instances).
     for _ in range(10):
+        # This part is commented out but would have been an early exit condition.
         # if isinstance(dataset, torchvision.datasets.CocoDetection):
         #     break
+        # If the current dataset object is a `Subset`, get the underlying dataset.
         if isinstance(dataset, torch.utils.data.Subset):
             dataset = dataset.dataset
+    # After unwrapping, check if the base dataset is a CocoDetection instance.
     if isinstance(dataset, torchvision.datasets.CocoDetection):
+        # If it is, return its `coco` attribute, which is the COCO API object.
         return dataset.coco
 
 
 def build_dataset(dataset_file: str, image_set: str, args):
+    """
+    This is a factory function that constructs and returns the correct dataset.
+    It acts as a single entry point for creating any dataset supported by the project.
+
+    Args:
+        dataset_file (str): The name of the dataset to build (e.g., 'rsvg').
+        image_set (str): The split of the dataset to use (e.g., 'train' or 'val').
+        args: Command-line arguments containing other dataset configurations.
+    """
+    # Check the dataset name and call the corresponding build function.
     if dataset_file == 'rsvg':
+        # If the dataset is 'rsvg', call the build function imported from `rsvg.py`.
         return build_rsvg(image_set, args)
     if dataset_file == 'rsvg_mm':
+        # If the dataset is 'rsvg_mm', call the build function imported from `rsvg_mm.py`.
         return build_rsvg_mm(image_set, args)
 
+    # If the dataset_file name doesn't match any known datasets, raise an error.
     raise ValueError(f'dataset {dataset_file} not supported')
diff --git a/datasets/concat_dataset.py b/datasets/concat_dataset.py
index 23f0faf..9057b27 100644
--- a/datasets/concat_dataset.py
+++ b/datasets/concat_dataset.py
@@ -3,31 +3,66 @@
 # Copyright (c) 2020 SenseTime. All Rights Reserved.
 # ------------------------------------------------------------------------
 
+# Import Path for handling file paths, though it's not directly used in this snippet.
 from pathlib import Path
 
+# Import core PyTorch data utilities.
 import torch
 import torch.utils.data
 
+# Import specific Dataset classes from PyTorch.
+# ConcatDataset is used to combine multiple datasets into one.
 from torch.utils.data import Dataset, ConcatDataset
+
+# Import the 'build' function from the local 'refexp2seq.py' file.
+# This function is responsible for creating datasets for referring expression tasks (like RefCOCO).
 from .refexp2seq import build as build_seq_refexp
+
+# Import the 'build' function from the local 'ytvos.py' file.
+# This function is responsible for creating the Ref-Youtube-VOS dataset.
 from .ytvos import build as build_ytvs
-from datasets import ytvos
 
+# This import appears to be unused in this module: only the 'build' function from '.ytvos'
+# is imported above (not the module itself), so it may be a leftover from previous code edits.
+from datasets import ytvos
 
 
 def build(image_set, args):
+    """
+    This function constructs a single, large dataset by concatenating several smaller ones.
+    It combines all RefCOCO variants and the Ref-Youtube-VOS dataset.
+
+    Args:
+        image_set (str): The data split to use (e.g., 'train', 'val').
+        args: Command-line arguments containing other dataset configurations.
+
+    Returns:
+        ConcatDataset: A single dataset object that contains all the specified datasets.
+    """
+    # Initialize an empty list to hold the individual dataset objects.
     concat_data = []
 
-    print('preparing coco2seq dataset ....')
+    # Log that the RefCOCO datasets are being prepared.
+    print("preparing coco2seq dataset ....")
+    # Define the names of the RefCOCO dataset variants to be loaded.
     coco_names = ["refcoco", "refcoco+", "refcocog"]
+    # Loop through each RefCOCO dataset name.
     for name in coco_names:
-        coco_seq =  build_seq_refexp(name, image_set, args)
+        # Call the build function for referring expression datasets to create an instance.
+        coco_seq = build_seq_refexp(name, image_set, args)
+        # Add the created dataset to the list.
         concat_data.append(coco_seq)
 
-    print('preparing ytvos dataset  .... ')
+    # Log that the Ref-Youtube-VOS dataset is being prepared.
+    print("preparing ytvos dataset  .... ")
+    # Call the build function for the YTVOS dataset to create an instance.
     ytvos_dataset = build_ytvs(image_set, args)
+    # Add the created dataset to the list.
     concat_data.append(ytvos_dataset)
 
+    # Use PyTorch's ConcatDataset to combine all the individual datasets in the list
+    # into a single, unified dataset object.
     concat_data = ConcatDataset(concat_data)
 
+    # Return the final concatenated dataset.
     return concat_data
diff --git a/datasets/image_to_seq_augmenter.py b/datasets/image_to_seq_augmenter.py
index f31e61c..4500b51 100644
--- a/datasets/image_to_seq_augmenter.py
+++ b/datasets/image_to_seq_augmenter.py
@@ -1,98 +1,179 @@
 # ------------------------------------------------------------------------
-# Modified from SeqFormer (https://github.com/wjf5203/SeqFormer)
-# ------------------------------------------------------------------------
-# Modified from STEm-Seg (https://github.com/sabarim/STEm-Seg)
+# Modified from SeqFormer (https://github.com/wjf5203/SeqFormer) and STEm-Seg (https://github.com/sabarim/STEm-Seg).
+# It applies data augmentation to images and their corresponding segmentation masks,
+# often to simulate video sequences from single images.
 # ------------------------------------------------------------------------
 
 
+# Import the core imgaug library for data augmentation.
 import imgaug
 import imgaug.augmenters as iaa
+
+# Import numpy for numerical operations, especially on image arrays.
 import numpy as np
 
+# Import datetime to generate time-based seeds for randomness.
 from datetime import datetime
 
+# Import specific classes from imgaug for handling segmentation maps and bounding boxes.
 from imgaug.augmentables.segmaps import SegmentationMapsOnImage
 from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
 
 
 class ImageToSeqAugmenter(object):
-    def __init__(self, perspective=True, affine=True, motion_blur=True,
-                 brightness_range=(-50, 50), hue_saturation_range=(-15, 15), perspective_magnitude=0.12,
-                 scale_range=1.0, translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)}, rotation_range=(-20, 20),
-                 motion_blur_kernel_sizes=(7, 9), motion_blur_prob=0.5):
-
-        self.basic_augmenter = iaa.SomeOf((1, None), [
-                iaa.Add(brightness_range),
-                iaa.AddToHueAndSaturation(hue_saturation_range)
-            ]
+    """
+    A class that defines a pipeline of image augmentations.
+    It can apply color, geometric (affine, perspective), and motion blur transformations.
+    It handles images together with their segmentation masks so that both are transformed
+    consistently; the `boxes` argument of `__call__` is accepted but is not augmented.
+    """
+
+    def __init__(
+        self,
+        perspective=True,
+        affine=True,
+        motion_blur=True,
+        brightness_range=(-50, 50),
+        hue_saturation_range=(-15, 15),
+        perspective_magnitude=0.12,
+        scale_range=1.0,
+        translate_range={"x": (-0.15, 0.15), "y": (-0.15, 0.15)},
+        rotation_range=(-20, 20),
+        motion_blur_kernel_sizes=(7, 9),
+        motion_blur_prob=0.5,
+    ):
+        """
+        Initializes the augmentation pipeline with various configurable options.
+        """
+
+        # Define a basic color augmentation pipeline.
+        # SomeOf((1, None), ...) applies at least one, and possibly both, of the augmenters below.
+        self.basic_augmenter = iaa.SomeOf(
+            (1, None),
+            [
+                iaa.Add(brightness_range),  # Adjust brightness.
+                iaa.AddToHueAndSaturation(hue_saturation_range),  # Adjust color hue and saturation.
+            ],
         )
 
+        # Create a list to hold geometric transformations.
         transforms = []
         if perspective:
+            # Add a perspective transformation to simulate camera angle changes.
             transforms.append(iaa.PerspectiveTransform(perspective_magnitude))
         if affine:
-            transforms.append(iaa.Affine(scale=scale_range,
-                                         translate_percent=translate_range,
-                                         rotate=rotation_range,
-                                         order=1,  # cv2.INTER_LINEAR
-                                         backend='auto'))
+            # Add affine transformations: scaling, translation, and rotation.
+            transforms.append(
+                iaa.Affine(
+                    scale=scale_range,
+                    translate_percent=translate_range,
+                    rotate=rotation_range,
+                    order=1,  # Use linear interpolation.
+                    backend="auto",
+                )
+            )  # Automatically choose backend (e.g., OpenCV).
+
+        # Combine the geometric transforms into a sequence.
         transforms = iaa.Sequential(transforms)
-        transforms = [transforms]
+        transforms = [transforms]  # Wrap in a list to append more augmenters.
 
         if motion_blur:
-            blur = iaa.Sometimes(motion_blur_prob, iaa.OneOf(
-                [
-                    iaa.MotionBlur(ksize)
-                    for ksize in motion_blur_kernel_sizes
-                ]
-            ))
+            # Define a motion blur augmentation that is applied with a certain probability.
+            blur = iaa.Sometimes(
+                motion_blur_prob,
+                iaa.OneOf(
+                    [
+                        # Choose one kernel size for the motion blur.
+                        iaa.MotionBlur(ksize)
+                        for ksize in motion_blur_kernel_sizes
+                    ]
+                ),
+            )
             transforms.append(blur)
 
+        # Combine all transformations (geometric + motion blur) into the final sequence.
+        # This is named 'frame_shift_augmenter' because it simulates frame-to-frame changes in a video.
         self.frame_shift_augmenter = iaa.Sequential(transforms)
 
     @staticmethod
     def condense_masks(instance_masks):
+        """
+        Static method to convert a list of binary instance masks into a single integer-labeled segmentation map.
+        imgaug requires this format to augment multiple masks simultaneously.
+        Example: [[0,1,1], [1,0,0]] -> [2, 1, 1] where 1 is the first mask, 2 is the second.
+        """
+        # Create an empty mask with the same shape as the first instance mask.
         condensed_mask = np.zeros_like(instance_masks[0], dtype=np.int8)
+        # Iterate through each binary mask, assigning a unique integer ID (starting from 1).
         for instance_id, mask in enumerate(instance_masks, 1):
+            # Where the binary mask is true, set the corresponding pixel in the condensed mask to the instance ID.
             condensed_mask = np.where(mask, instance_id, condensed_mask)
 
         return condensed_mask
 
     @staticmethod
     def expand_masks(condensed_mask, num_instances):
+        """
+        Static method to perform the reverse of condense_masks.
+        It converts a single integer-labeled segmentation map back into a list of binary masks.
+        """
+        # Create a list of binary masks by checking for each instance ID.
         return [(condensed_mask == instance_id).astype(np.uint8) for instance_id in range(1, num_instances + 1)]
 
     def __call__(self, image, masks=None, boxes=None):
+        """
+        Applies the defined augmentation pipeline to an image and its optional masks (`boxes` is currently ignored).
+        """
+        # Create a deterministic version of the geometric augmenter.
+        # This ensures the exact same geometric transformation is applied to the image, masks, and any other spatial data.
         det_augmenter = self.frame_shift_augmenter.to_deterministic()
 
-
+        # If masks are provided, augment them along with the image.
         if masks is not None:
             masks_np, is_binary_mask = [], []
             boxs_np = []
 
+            # Prepare masks for augmentation.
             for mask in masks:
-                
                 if isinstance(mask, np.ndarray):
-                    masks_np.append(mask.astype(np.bool))
+                    masks_np.append(mask.astype(np.bool_))
                     is_binary_mask.append(False)
                 else:
                     raise ValueError("Invalid mask type: {}".format(type(mask)))
 
             num_instances = len(masks_np)
+            # Condense the list of binary masks into a single integer map and wrap it for imgaug.
             masks_np = SegmentationMapsOnImage(self.condense_masks(masks_np), shape=image.shape[:2])
-            # boxs_np = BoundingBoxesOnImage(boxs_np, shape=image.shape[:2])
 
-            seed = int(datetime.now().strftime('%M%S%f')[-8:])
+            # Use a time-based seed to ensure the next two augmentations are identical.
+            seed = int(datetime.now().strftime("%M%S%f")[-8:])
             imgaug.seed(seed)
-            aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image) , segmentation_maps=masks_np)
+
+            # Augment the image and the condensed masks.
+            # Note: Color augmentation (`basic_augmenter`) is applied ONLY to the image.
+            # Geometric augmentation (`det_augmenter`) is applied to both.
+            aug_image, aug_masks = det_augmenter(image=self.basic_augmenter(image=image), segmentation_maps=masks_np)
+
+            # Reset the seed to apply the same geometric transform again.
             imgaug.seed(seed)
+            # Augment an all-ones image with the same transform; regions with no source pixel become zero.
+            # Note: `invalid_pts_mask` is computed here but is not used by the return statement of this branch.
             invalid_pts_mask = det_augmenter(image=np.ones(image.shape[:2] + (1,), np.uint8)).squeeze(2)
+
+            # Expand the augmented integer map back into a list of binary masks.
             aug_masks = self.expand_masks(aug_masks.get_arr(), num_instances)
-            # aug_boxes = aug_boxes.remove_out_of_image().clip_out_of_image()
+
+            # Rebuild the list of augmented masks; `is_bm` is never checked, so no mask is actually filtered out.
             aug_masks = [mask for mask, is_bm in zip(aug_masks, is_binary_mask)]
-            return aug_image, aug_masks #, aug_boxes.to_xyxy_array()
 
+            # Return the augmented image and its corresponding augmented masks.
+            return aug_image, aug_masks
+
+        # If no masks are provided, just augment the image and return a mask of valid points.
         else:
-            masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool), shape=image.shape[:2])]
+            # Create a dummy mask to pass to the augmenter.
+            masks = [SegmentationMapsOnImage(np.ones(image.shape[:2], np.bool_), shape=image.shape[:2])]
+            # Augment the image and the dummy mask.
             aug_image, invalid_pts_mask = det_augmenter(image=image, segmentation_maps=masks)
+            # Return the augmented image and a boolean mask where True marks pixels whose augmented dummy-mask value is 0 (i.e., no source pixel was mapped there).
             return aug_image, invalid_pts_mask.get_arr() == 0
diff --git a/datasets/refer.py b/datasets/refer.py
index f28c340..af09ee8 100644
--- a/datasets/refer.py
+++ b/datasets/refer.py
@@ -1,4 +1,5 @@
-__author__ = 'licheng'
+# A variable to store the author's name.
+__author__ = "licheng"
 
 """
 This interface provides access to four datasets:
@@ -7,6 +8,7 @@
 3) refcoco+
 4) refcocog
 split by unc and google
+
 The following API functions are defined:
 REFER      - REFER api class
 getRefIds  - get ref ids that satisfy given filter conditions.
@@ -23,6 +25,7 @@
 showMask   - show mask of the referred object given ref
 """
 
+# Import necessary system and utility libraries.
 import sys
 import os.path as osp
 import json
@@ -35,83 +38,111 @@
 from matplotlib.patches import Polygon, Rectangle
 from pprint import pprint
 import numpy as np
+
+# Import the mask utilities from pycocotools for handling segmentation masks.
 from pycocotools import mask
 
 
 class REFER:
-
-    def __init__(self, data_root, dataset='refcoco', splitBy='unc'):
+    """
+    The main API class for interacting with referring expression datasets.
+    It loads and indexes the dataset annotations for efficient access.
+    """
+
+    def __init__(self, data_root, dataset="refcoco", splitBy="unc"):
+        """
+        Initializes the REFER API object.
+
+        Args:
+            data_root (str): The root directory where datasets are stored.
+            dataset (str): The name of the dataset to load (e.g., 'refcoco').
+            splitBy (str): The split authority (e.g., 'unc', 'google').
+        """
         # provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
         # also provide dataset name and splitBy information
         # e.g., dataset = 'refcoco', splitBy = 'unc'
-        print('loading dataset %s into memory...' % dataset)
+        print("loading dataset %s into memory..." % dataset)
+        # Set up directory paths based on the provided data_root and dataset name.
         self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
-        self.DATA_DIR = osp.join(data_root, dataset) # coco/refcoco
-        if dataset in ['refcoco', 'refcoco+', 'refcocog']:
-            self.IMAGE_DIR = osp.join(data_root, 'train2014')
-        elif dataset == 'refclef':
-            self.IMAGE_DIR = osp.join(data_root, 'saiapr_tc-12')
+        self.DATA_DIR = osp.join(data_root, dataset)  # e.g., coco/refcoco
+        # Determine the image directory based on the dataset type.
+        if dataset in ["refcoco", "refcoco+", "refcocog"]:
+            self.IMAGE_DIR = osp.join(data_root, "train2014")
+        elif dataset == "refclef":
+            self.IMAGE_DIR = osp.join(data_root, "saiapr_tc-12")
         else:
-            print('No refer dataset is called [%s]' % dataset)
+            # If the dataset name is not recognized, print an error and exit.
+            print("No refer dataset is called [%s]" % dataset)
             sys.exit()
 
         # load refs from data/dataset/refs(dataset).json
         tic = time.time()
-        ref_file = osp.join(self.DATA_DIR, 'refs('+splitBy+').p')
+        # Construct the path to the pre-processed reference file (a pickled Python object).
+        ref_file = osp.join(self.DATA_DIR, "refs(" + splitBy + ").p")
+        # Initialize the main data dictionary.
         self.data = {}
-        self.data['dataset'] = dataset
-
-        self.data['refs'] = pickle.load(open(ref_file, 'rb'), fix_imports=True) 
+        self.data["dataset"] = dataset
 
+        # Load the pickled reference data. This contains the referring expressions and their links to images/annotations.
+        self.data["refs"] = pickle.load(open(ref_file, "rb"), fix_imports=True)
 
         # load annotations from data/dataset/instances.json
-        instances_file = osp.join(self.DATA_DIR, 'instances.json')
-        instances = json.load(open(instances_file, 'r')) # coco/refcoco/instances.json
-        # list[dict] keys: "license", "file_name", "coco_url", "height", "width", "date_captured", "flickr_url", "id"
-        self.data['images'] = instances['images']             
-        # list[dict] keys: "segmentation", "area", "iscrowd", "image_id", "bbox", "category_id", "id"
-        self.data['annotations'] = instances['annotations']   
-         # list[dict] keys: "supercategory", "id", "name"
-        self.data['categories'] = instances['categories']    
-
-        # create index
+        # This file contains the standard COCO-style annotations.
+        instances_file = osp.join(self.DATA_DIR, "instances.json")
+        instances = json.load(open(instances_file, "r"))  # e.g., coco/refcoco/instances.json
+        # Load image metadata (file names, dimensions, etc.).
+        self.data["images"] = instances["images"]
+        # Load object annotations (segmentations, bounding boxes, etc.).
+        self.data["annotations"] = instances["annotations"]
+        # Load category information (names, supercategories).
+        self.data["categories"] = instances["categories"]
+
+        # Call the method to create efficient look-up tables (indexes).
         self.createIndex()
-        print('DONE (t=%.2fs)' % (time.time()-tic))
+        print("DONE (t=%.2fs)" % (time.time() - tic))
 
     def createIndex(self):
+        """
+        Creates a set of dictionaries that map various IDs to their corresponding data.
+        This pre-processing step allows for very fast data retrieval.
+        """
         # create sets of mapping
         # 1)  Refs: 	 	{ref_id: ref}
         # 2)  Anns: 	 	{ann_id: ann}
         # 3)  Imgs:		 	{image_id: image}
         # 4)  Cats: 	 	{category_id: category_name}
         # 5)  Sents:     	{sent_id: sent}
-        # 6)  imgToRefs: 	{image_id: refs}
-        # 7)  imgToAnns: 	{image_id: anns}
+        # 6)  imgToRefs: 	{image_id: list_of_refs}
+        # 7)  imgToAnns: 	{image_id: list_of_anns}
         # 8)  refToAnn:  	{ref_id: ann}
         # 9)  annToRef:  	{ann_id: ref}
-        # 10) catToRefs: 	{category_id: refs}
+        # 10) catToRefs: 	{category_id: list_of_refs}
         # 11) sentToRef: 	{sent_id: ref}
-        # 12) sentToTokens: {sent_id: tokens}
-        print('creating index...')
+        # 12) sentToTokens: {sent_id: list_of_tokens}
+        print("creating index...")
         # fetch info from instances
         Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
-        for ann in self.data['annotations']:
-            Anns[ann['id']] = ann
-            imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
-        for img in self.data['images']:
-            Imgs[img['id']] = img
-        for cat in self.data['categories']:
-            Cats[cat['id']] = cat['name']
+        # Index annotations by their ID and group them by image ID.
+        for ann in self.data["annotations"]:
+            Anns[ann["id"]] = ann
+            imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann]
+        # Index images by their ID.
+        for img in self.data["images"]:
+            Imgs[img["id"]] = img
+        # Index categories by their ID.
+        for cat in self.data["categories"]:
+            Cats[cat["id"]] = cat["name"]
 
         # fetch info from refs
         Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
         Sents, sentToRef, sentToTokens = {}, {}, {}
-        for ref in self.data['refs']:
+        # Index references and sentences, creating all the necessary cross-mappings.
+        for ref in self.data["refs"]:
             # ids
-            ref_id = ref['ref_id']
-            ann_id = ref['ann_id']
-            category_id = ref['category_id']
-            image_id = ref['image_id']
+            ref_id = ref["ref_id"]
+            ann_id = ref["ann_id"]
+            category_id = ref["category_id"]
+            image_id = ref["image_id"]
 
             # add mapping related to ref
             Refs[ref_id] = ref
@@ -121,12 +152,12 @@ def createIndex(self):
             annToRef[ann_id] = ref
 
             # add mapping of sent
-            for sent in ref['sentences']:
-                Sents[sent['sent_id']] = sent
-                sentToRef[sent['sent_id']] = ref
-                sentToTokens[sent['sent_id']] = sent['tokens']
+            for sent in ref["sentences"]:
+                Sents[sent["sent_id"]] = sent
+                sentToRef[sent["sent_id"]] = ref
+                sentToTokens[sent["sent_id"]] = sent["tokens"]
 
-        # create class members
+        # Store the created indexes as class members for easy access.
         self.Refs = Refs
         self.Anns = Anns
         self.Imgs = Imgs
@@ -139,167 +170,216 @@ def createIndex(self):
         self.catToRefs = catToRefs
         self.sentToRef = sentToRef
         self.sentToTokens = sentToTokens
-        print('index created.')
+        print("index created.")
 
-    def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
+    def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=""):
+        """
+        Get reference IDs that satisfy the given filter conditions.
+        """
+        # Ensure inputs are lists.
         image_ids = image_ids if type(image_ids) == list else [image_ids]
         cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
         ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
 
+        # If no filters are provided, return all reference IDs.
         if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
-            refs = self.data['refs']
+            refs = self.data["refs"]
         else:
+            # Apply filters sequentially.
             if not len(image_ids) == 0:
+                # Use the pre-computed index for fast lookup.
                 refs = [self.imgToRefs[image_id] for image_id in image_ids]
             else:
-                refs = self.data['refs']
+                refs = self.data["refs"]
             if not len(cat_ids) == 0:
-                refs = [ref for ref in refs if ref['category_id'] in cat_ids]
+                refs = [ref for ref in refs if ref["category_id"] in cat_ids]
             if not len(ref_ids) == 0:
-                refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
+                refs = [ref for ref in refs if ref["ref_id"] in ref_ids]
             if not len(split) == 0:
-                if split in ['testA', 'testB', 'testC']:
-                    refs = [ref for ref in refs if split[-1] in ref['split']]  # we also consider testAB, testBC, ...
-                elif split in ['testAB', 'testBC', 'testAC']:
-                    refs = [ref for ref in refs if ref['split'] == split]  # rarely used I guess...
-                elif split == 'test':
-                    refs = [ref for ref in refs if 'test' in ref['split']]
-                elif split == 'train' or split == 'val':
-                    refs = [ref for ref in refs if ref['split'] == split]
+                # Filter by data split (e.g., 'train', 'val', 'testA').
+                if split in ["testA", "testB", "testC"]:
+                    refs = [ref for ref in refs if split[-1] in ref["split"]]  # we also consider testAB, testBC, ...
+                elif split in ["testAB", "testBC", "testAC"]:
+                    refs = [ref for ref in refs if ref["split"] == split]  # rarely used I guess...
+                elif split == "test":
+                    refs = [ref for ref in refs if "test" in ref["split"]]
+                elif split == "train" or split == "val":
+                    refs = [ref for ref in refs if ref["split"] == split]
                 else:
-                    print('No such split [%s]' % split)
+                    print("No such split [%s]" % split)
                     sys.exit()
-        ref_ids = [ref['ref_id'] for ref in refs]
+        # Return a list of the final filtered reference IDs.
+        ref_ids = [ref["ref_id"] for ref in refs]
         return ref_ids
 
     def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
+        """
+        Get annotation IDs that satisfy the given filter conditions.
+        """
         image_ids = image_ids if type(image_ids) == list else [image_ids]
         cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
         ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
 
+        # If no filters, return all annotation IDs.
         if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
-            ann_ids = [ann['id'] for ann in self.data['annotations']]
+            ann_ids = [ann["id"] for ann in self.data["annotations"]]
         else:
+            # Apply filters sequentially.
             if not len(image_ids) == 0:
-                lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns]  # list of [anns]
+                lists = [
+                    self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns
+                ]  # list of [anns]
                 anns = list(itertools.chain.from_iterable(lists))
             else:
-                anns = self.data['annotations']
+                anns = self.data["annotations"]
             if not len(cat_ids) == 0:
-                anns = [ann for ann in anns if ann['category_id'] in cat_ids]
-            ann_ids = [ann['id'] for ann in anns]
+                anns = [ann for ann in anns if ann["category_id"] in cat_ids]
+            ann_ids = [ann["id"] for ann in anns]
             if not len(ref_ids) == 0:
-                ids = set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids]))
+                # Intersect with annotations linked to the given reference IDs (NOTE: result is stored in `ids`, but `ann_ids` is returned unfiltered).
+                ids = set(ann_ids).intersection(set([self.Refs[ref_id]["ann_id"] for ref_id in ref_ids]))
         return ann_ids
 
     def getImgIds(self, ref_ids=[]):
+        """
+        Get image IDs associated with the given reference IDs.
+        """
         ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
 
         if not len(ref_ids) == 0:
-            image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids]))
+            # Use the index to find image IDs from reference IDs.
+            image_ids = list(set([self.Refs[ref_id]["image_id"] for ref_id in ref_ids]))
         else:
+            # If no ref_ids are given, return all image IDs.
             image_ids = self.Imgs.keys()
         return image_ids
 
     def getCatIds(self):
+        """
+        Get all category IDs in the dataset.
+        """
         return self.Cats.keys()
 
     def loadRefs(self, ref_ids=[]):
+        """
+        Load full reference data for the given reference IDs.
+        """
         if type(ref_ids) == list:
             return [self.Refs[ref_id] for ref_id in ref_ids]
         elif type(ref_ids) == int:
             return [self.Refs[ref_ids]]
 
     def loadAnns(self, ann_ids=[]):
+        """
+        Load full annotation data for the given annotation IDs.
+        """
         if type(ann_ids) == list:
             return [self.Anns[ann_id] for ann_id in ann_ids]
-        elif type(ann_ids) == int or type(ann_ids) == unicode:
+        elif type(ann_ids) == int:
             return [self.Anns[ann_ids]]
 
     def loadImgs(self, image_ids=[]):
+        """
+        Load full image data for the given image IDs.
+        """
         if type(image_ids) == list:
             return [self.Imgs[image_id] for image_id in image_ids]
         elif type(image_ids) == int:
             return [self.Imgs[image_ids]]
 
     def loadCats(self, cat_ids=[]):
+        """
+        Load category names for the given category IDs.
+        """
         if type(cat_ids) == list:
             return [self.Cats[cat_id] for cat_id in cat_ids]
         elif type(cat_ids) == int:
             return [self.Cats[cat_ids]]
 
     def getRefBox(self, ref_id):
+        """
+        Get the bounding box [x, y, w, h] for a given reference ID.
+        """
         ref = self.Refs[ref_id]
+        # Use the refToAnn index to find the corresponding annotation.
         ann = self.refToAnn[ref_id]
-        return ann['bbox']  # [x, y, w, h]
+        return ann["bbox"]  # [x, y, w, h]
 
-    def showRef(self, ref, seg_box='seg'):
+    def showRef(self, ref, seg_box="seg"):
+        """
+        Display an image and overlay the referred object's segmentation or bounding box.
+        """
         ax = plt.gca()
         # show image
-        image = self.Imgs[ref['image_id']]
-        I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
+        image = self.Imgs[ref["image_id"]]
+        I = io.imread(osp.join(self.IMAGE_DIR, image["file_name"]))
         ax.imshow(I)
         # show refer expression
-        for sid, sent in enumerate(ref['sentences']):
-            print('%s. %s' % (sid+1, sent['sent']))
+        for sid, sent in enumerate(ref["sentences"]):
+            print("%s. %s" % (sid + 1, sent["sent"]))
         # show segmentations
-        if seg_box == 'seg':
-            ann_id = ref['ann_id']
+        if seg_box == "seg":
+            ann_id = ref["ann_id"]
             ann = self.Anns[ann_id]
             polygons = []
             color = []
-            c = 'none'
-            if type(ann['segmentation'][0]) == list:
-                # polygon used for refcoco*
-                for seg in ann['segmentation']:
-                    poly = np.array(seg).reshape((len(seg)/2, 2))
+            c = "none"
+            if type(ann["segmentation"][0]) == list:
+                # This handles polygon format segmentation, common in refcoco*.
+                for seg in ann["segmentation"]:
+                    poly = np.array(seg).reshape((len(seg) // 2, 2))
                     polygons.append(Polygon(poly, True, alpha=0.4))
                     color.append(c)
+                # Add the polygon patches to the plot.
                 p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 1, 0, 0), linewidths=3, alpha=1)
                 ax.add_collection(p)  # thick yellow polygon
                 p = PatchCollection(polygons, facecolors=color, edgecolors=(1, 0, 0, 0), linewidths=1, alpha=1)
                 ax.add_collection(p)  # thin red polygon
             else:
-                # mask used for refclef
-                rle = ann['segmentation']
+                # This handles RLE (Run-Length Encoding) format segmentation.
+                rle = ann["segmentation"]
                 m = mask.decode(rle)
                 img = np.ones((m.shape[0], m.shape[1], 3))
-                color_mask = np.array([2.0, 166.0, 101.0])/255
+                color_mask = np.array([2.0, 166.0, 101.0]) / 255
                 for i in range(3):
                     img[:, :, i] = color_mask[i]
-                ax.imshow(np.dstack((img, m*0.5)))
+                ax.imshow(np.dstack((img, m * 0.5)))
         # show bounding-box
-        elif seg_box == 'box':
-            ann_id = ref['ann_id']
+        elif seg_box == "box":
+            ann_id = ref["ann_id"]
             ann = self.Anns[ann_id]
-            bbox = self.getRefBox(ref['ref_id'])
-            box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
+            bbox = self.getRefBox(ref["ref_id"])
+            # Create a rectangle patch and add it to the plot.
+            box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor="green", linewidth=3)
             ax.add_patch(box_plot)
 
     def getMask(self, ref):
+        """
+        Get the binary segmentation mask for a given reference.
+        """
         # return mask, area and mask-center
-        ann = self.refToAnn[ref['ref_id']]
-        image = self.Imgs[ref['image_id']]
-        if type(ann['segmentation'][0]) == list:  # polygon
-            rle = mask.frPyObjects(ann['segmentation'], image['height'], image['width'])
-        else:
-            rle = ann['segmentation']
-
-        # for i in range(len(rle['counts'])):
-        # print(rle)
+        ann = self.refToAnn[ref["ref_id"]]
+        image = self.Imgs[ref["image_id"]]
+        # Convert polygon format to RLE if necessary.
+        if type(ann["segmentation"][0]) == list:  # polygon
+            rle = mask.frPyObjects(ann["segmentation"], image["height"], image["width"])
+        else:  # It's already in RLE format.
+            rle = ann["segmentation"]
+
+        # Decode the RLE to get a binary mask.
         m = mask.decode(rle)
+        # Handle cases where a single annotation has multiple disconnected parts.
         m = np.sum(m, axis=2)  # sometimes there are multiple binary map (corresponding to multiple segs)
         m = m.astype(np.uint8)  # convert to np.uint8
         # compute area
         area = sum(mask.area(rle))  # should be close to ann['area']
-        return {'mask': m, 'area': area}
-
+        return {"mask": m, "area": area}
 
     def showMask(self, ref):
+        """
+        A simple utility to display the mask of a referred object.
+        """
         M = self.getMask(ref)
-        msk = M['mask']
+        msk = M["mask"]
         ax = plt.gca()
         ax.imshow(msk)
-
-
-
diff --git a/datasets/refexp_eval.py b/datasets/refexp_eval.py
index 826aa5c..7573642 100644
--- a/datasets/refexp_eval.py
+++ b/datasets/refexp_eval.py
@@ -1,4 +1,5 @@
 # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
+# Import necessary libraries.
 import copy
 from collections import defaultdict
 from pathlib import Path
@@ -6,80 +7,142 @@
 import torch
 import torch.utils.data
 
+# Import project-specific utilities.
 import util.misc as utils
 from util.box_ops import generalized_box_iou
 
 
 class RefExpEvaluator(object):
+    """
+    A class to evaluate referring expression detection results.
+    It calculates Precision@k for different IoU thresholds.
+    """
+
     def __init__(self, refexp_gt, iou_types, k=(1, 5, 10), thresh_iou=0.5):
+        """
+        Initializes the evaluator.
+
+        Args:
+            refexp_gt: The ground truth data, typically a REFER object.
+            iou_types: The types of IoU to consider (not directly used here but common in other evaluators).
+            k (tuple): A tuple of integers for which to calculate Precision@k (e.g., top 1, 5, 10 predictions).
+            thresh_iou (float): The IoU threshold to consider a prediction correct.
+        """
+        # Ensure k is a list or tuple.
         assert isinstance(k, (list, tuple))
+        # Make a deep copy of the ground truth to avoid modifying the original object.
         refexp_gt = copy.deepcopy(refexp_gt)
         self.refexp_gt = refexp_gt
         self.iou_types = iou_types
+        # Get the list of all image IDs from the ground truth.
         self.img_ids = self.refexp_gt.imgs.keys()
+        # A dictionary to store model predictions, keyed by image ID.
         self.predictions = {}
+        # Store the k values for Precision@k calculation.
         self.k = k
+        # Store the IoU threshold.
         self.thresh_iou = thresh_iou
 
     def accumulate(self):
+        """
+        A placeholder method, often used for accumulating stats over time. Not implemented here.
+        """
         pass
 
     def update(self, predictions):
+        """
+        Updates the evaluator's internal predictions dictionary with new results from the model.
+        """
         self.predictions.update(predictions)
 
     def synchronize_between_processes(self):
+        """
+        In a distributed (multi-process) setting, this function collects the prediction dictionaries
+        returned by utils.all_gather and merges them into a single dictionary in the calling process.
+        """
+        # Use the utility function to gather prediction dictionaries from all processes.
         all_predictions = utils.all_gather(self.predictions)
         merged_predictions = {}
+        # Iterate through the list of dictionaries and merge them.
         for p in all_predictions:
             merged_predictions.update(p)
+        # Replace the local predictions with the complete, merged set.
         self.predictions = merged_predictions
 
     def summarize(self):
+        """
+        Calculates and prints the final evaluation metrics (Precision@k).
+        This method should only be run on the main process after all predictions are synchronized.
+        """
+        # Ensure this part only runs on the main process to avoid duplicate calculations and printing.
         if utils.is_main_process():
+            # Initialize dictionaries to store scores and counts for each dataset.
             dataset2score = {
                 "refcoco": {k: 0.0 for k in self.k},
                 "refcoco+": {k: 0.0 for k in self.k},
                 "refcocog": {k: 0.0 for k in self.k},
             }
             dataset2count = {"refcoco": 0.0, "refcoco+": 0.0, "refcocog": 0.0}
+
+            # Iterate over every image ID in the ground truth test set.
             for image_id in self.img_ids:
+                # Get the ground truth annotation ID for the current image.
                 ann_ids = self.refexp_gt.getAnnIds(imgIds=image_id)
-                assert len(ann_ids) == 1
+                assert len(ann_ids) == 1, "Each image should have exactly one referring expression annotation."
+                # Load image metadata, which includes the dataset name (e.g., 'refcoco').
                 img_info = self.refexp_gt.loadImgs(image_id)[0]
 
+                # Load the ground truth annotation (which contains the target bounding box).
                 target = self.refexp_gt.loadAnns(ann_ids[0])
+                # Get the model's prediction for this image.
                 prediction = self.predictions[image_id]
-                assert prediction is not None
+                assert prediction is not None, "Prediction not found for image."
+
+                # Sort the predicted boxes by their confidence scores in descending order.
                 sorted_scores_boxes = sorted(
                     zip(prediction["scores"].tolist(), prediction["boxes"].tolist()), reverse=True
                 )
+                # Unzip the sorted scores and boxes.
                 sorted_scores, sorted_boxes = zip(*sorted_scores_boxes)
+                # Convert the list of boxes back into a single tensor.
                 sorted_boxes = torch.cat([torch.as_tensor(x).view(1, 4) for x in sorted_boxes])
+
+                # Get the ground truth bounding box in [x, y, width, height] format.
                 target_bbox = target[0]["bbox"]
+                # Convert the ground truth box to [x1, y1, x2, y2] format.
                 converted_bbox = [
                     target_bbox[0],
                     target_bbox[1],
                     target_bbox[2] + target_bbox[0],
                     target_bbox[3] + target_bbox[1],
                 ]
+                # Calculate the Generalized IoU between all sorted predicted boxes and the single ground truth box.
                 giou = generalized_box_iou(sorted_boxes, torch.as_tensor(converted_bbox).view(-1, 4))
+
+                # Check for a correct prediction within the top k boxes.
                 for k in self.k:
+                    # If the maximum GIoU among the top k predictions is at or above the threshold...
                     if max(giou[:k]) >= self.thresh_iou:
+                        # ...count it as a correct prediction for that k.
                         dataset2score[img_info["dataset_name"]][k] += 1.0
+                # Increment the total number of samples for this dataset.
                 dataset2count[img_info["dataset_name"]] += 1.0
 
+            # Calculate the final precision scores by dividing the correct counts by the total counts.
             for key, value in dataset2score.items():
                 for k in self.k:
                     try:
                         value[k] /= dataset2count[key]
-                    except:
+                    except ZeroDivisionError:
+                        # Handle cases where a dataset might have zero samples.
                         pass
+
+            # Format and print the results.
             results = {}
             for key, value in dataset2score.items():
                 results[key] = sorted([v for k, v in value.items()])
                 print(f" Dataset: {key} - Precision @ 1, 5, 10: {results[key]} \n")
 
             return results
+        # If not the main process, return None.
         return None
-
-
diff --git a/datasets/rsvg_mm.py b/datasets/rsvg_mm.py
index 29237f7..fca23cb 100644
--- a/datasets/rsvg_mm.py
+++ b/datasets/rsvg_mm.py
@@ -9,6 +9,7 @@
 from PIL import Image
 import util
 from util.transforms import letterbox
+
 # from torchvision.transforms import Compose, ToTensor, Normalize
 import datasets.transforms_image as T
 import matplotlib.pyplot as plt
@@ -18,8 +19,7 @@
 
 
 class RSVGDataset(data.Dataset):
-    def __init__(self, images_path, imsize=1024, transform= None, augment= False,
-                 split='train', testmode=False):
+    def __init__(self, images_path, imsize=1024, transform=None, augment=False, split="train", testmode=False):
         self.images = []
         self.images_path = images_path
         self.imsize = imsize
@@ -30,14 +30,20 @@ def __init__(self, images_path, imsize=1024, transform= None, augment= False,
 
         # file = open('data/rsvg_mm/' + 'rsvg_mm_train_v11.txt', "r").readlines()
         # file = open('data/rsvg_mm/' + 'rsvg_mm_' + split + '.txt', "r").readlines()
-        file = open('data/rsvg_mm/' + 'rsvg_mm_' + split + '_v2.txt', "r").readlines()
-        Index = [index.strip('\n') for index in file]
+        file = open("RSVG-HR/Annotations/rsvg_hr_test_10.txt", "r").readlines()
+        # file = open("data/rsvg_mm/" + "rsvg_mm_" + split + "_v2.txt", "r").readlines()
+        Index = [index.strip("\n") for index in file]
         for anno in Index:
-            anno_list = anno.split(',')
+            anno_list = anno.split(",")
             img_name = anno_list[0]
-            xmin_gt, ymin_gt, xmax_gt, ymax_gt = float(anno_list[1]), float(anno_list[2]), float(anno_list[3]), float(anno_list[4])
+            xmin_gt, ymin_gt, xmax_gt, ymax_gt = (
+                float(anno_list[1]),
+                float(anno_list[2]),
+                float(anno_list[3]),
+                float(anno_list[4]),
+            )
             text = anno_list[-1]
-            image_path = images_path + '/' + img_name
+            image_path = images_path + "/" + img_name
             box = np.array([xmin_gt, ymin_gt, xmax_gt, ymax_gt], dtype=np.float32)
             self.images.append((image_path, box, text))
 
@@ -47,7 +53,7 @@ def pull_item(self, idx):
         # bbox = np.array(bbox, dtype=int)  # box format: to x1 y1 x2 y2
         # img_bgr = cv2.imread(img_path)
         # img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
-        img = Image.open(img_path).convert('RGB')
+        img = Image.open(img_path).convert("RGB")
         # img = np.array(img)
 
         return img, phrase, bbox, img_path
@@ -56,7 +62,7 @@ def __len__(self):
         return len(self.images)
 
     def __getitem__(self, idx):
-        img, phrase, bbox, img_path  = self.pull_item(idx)
+        img, phrase, bbox, img_path = self.pull_item(idx)
         # print(img_path)
         # phrase = phrase.lower()
         caption = " ".join(phrase.lower().split())
@@ -98,8 +104,8 @@ def __getitem__(self, idx):
             return img.unsqueeze(0), target
         # return img: [1, 3, H, W], the first dimension means T = 1.
 
-def make_coco_transforms(image_set, cautious):
 
+def make_coco_transforms(image_set, cautious):
     normalize = T.Compose([T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
 
     # scales = [480, 560, 640, 720, 800]
@@ -109,49 +115,33 @@ def make_coco_transforms(image_set, cautious):
 
     max_size = 1024
     if image_set == "train":
-        return T.Compose(
-            [T.RandomResize(scales, max_size=max_size),
-            normalize]
-        )
+        return T.Compose([T.RandomResize(scales, max_size=max_size), normalize])
 
     else:
-        return T.Compose([
-        T.ToTensor(),
-        T.Normalize(
-            mean=[0.485, 0.456, 0.406],
-            std=[0.229, 0.224, 0.225])
-    ])
-
+        return T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
 
     raise ValueError(f"unknown {image_set}")
 
+
 from pathlib import Path
 
+
 def build(image_set, args):
     root = Path(args.rsvg_mm_path)
-    assert root.exists(), f'provided rsvg_mm path {root} does not exist'
-    input_transform = T.Compose([
-        T.ToTensor(),
-        T.Normalize(
-            mean=[0.485, 0.456, 0.406],
-            std=[0.229, 0.224, 0.225])
-    ])
-
-    img_folder = 'data/rsvg_mm/images'
-    dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == 'test'))ssss
+    assert root.exists(), f"provided rsvg_mm path {root} does not exist"
+    input_transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
+
+    img_folder = "data/images"  # Updated to correct path
+    dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == "test"))
     return dataset
 
+
 # make_coco_transforms(image_set, False)
-if __name__ == '__main__':
-    input_transform = T.Compose([
-        T.ToTensor(),
-        T.Normalize(
-            mean=[0.485, 0.456, 0.406],
-            std=[0.229, 0.224, 0.225])
-    ])
-    img_folder = '../data/rsvg_mm/images'
-    image_set = 'train'
-    dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set=='test'))
+if __name__ == "__main__":
+    input_transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
+    img_folder = "../data/rsvg_mm/images"
+    image_set = "train"
+    dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == "test"))
     sample_num = dataset.__len__()
     sample_num = dataset.__getitem__(0)
     print(sample_num)
diff --git a/inference_rsvg.py b/inference_rsvg.py
index 5759399..acedef2 100644
--- a/inference_rsvg.py
+++ b/inference_rsvg.py
@@ -21,18 +21,17 @@
 from tools.colormap import colormap
 
 # os.environ.pop("QT_QPA_PLATFORM_PLUGIN_PATH")
-os.environ["CUDA_VISIBLE_DEVICES"] = '2'
+os.environ["CUDA_VISIBLE_DEVICES"] = "2"
 
 # colormap
 color_list = colormap()
-color_list = color_list.astype('uint8').tolist()
+color_list = color_list.astype("uint8").tolist()
 
-Visualize_bbox = False #False #True
+Visualize_bbox = True  # False #True
 save_visualize_path_prefix = "test_output"
 version = "test"
 
 
-
 def main(args):
     args.masks = False
     # args.batch_size == 1
@@ -44,13 +43,21 @@ def main(args):
     np.random.seed(seed)
     random.seed(seed)
 
+    # Override dataset settings for our custom setup
+    args.dataset_file = "rsvg_mm"
+    args.rsvg_mm_path = "."  # Current directory contains our setup
+    args.visualize = True
+
+    # Override model parameters to match the checkpoint
+    args.num_classes = 1  # Checkpoint has 1 class
+    args.num_queries = 10  # Checkpoint has 10 queries
+
     if args.visualize:
         if not os.path.exists(save_visualize_path_prefix):
             os.makedirs(save_visualize_path_prefix)
 
-    test_dataset = build_dataset(args.dataset_file, image_set='test', args=args)
-    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False,
-                             pin_memory=True, drop_last=True, num_workers=4)
+    test_dataset = build_dataset(args.dataset_file, image_set="test", args=args)
+    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, pin_memory=True, drop_last=True, num_workers=4)
 
     # model
     model, criterion, _ = build_model(args)
@@ -59,22 +66,23 @@ def main(args):
 
     # model_without_ddp = model
     n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    print('number of params:', n_parameters)
+    print("number of params:", n_parameters)
 
     if args.resume:
-        checkpoint = torch.load(args.resume, map_location='cpu')
-        missing_keys, unexpected_keys = model.load_state_dict(checkpoint['model'], strict=False)
-        unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
+        checkpoint = torch.load(args.resume, map_location="cpu", weights_only=False)
+        missing_keys, unexpected_keys = model.load_state_dict(checkpoint["model"], strict=False)
+        unexpected_keys = [k for k in unexpected_keys if not (k.endswith("total_params") or k.endswith("total_ops"))]
         if len(missing_keys) > 0:
-            print('Missing Keys: {}'.format(missing_keys))
+            print("Missing Keys: {}".format(missing_keys))
         if len(unexpected_keys) > 0:
-            print('Unexpected Keys: {}'.format(unexpected_keys))
+            print("Unexpected Keys: {}".format(unexpected_keys))
     else:
-        raise ValueError('Please specify the checkpoint for inference.')
+        raise ValueError("Please specify the checkpoint for inference.")
 
     # start inference
     evaluate(test_loader, model, args)
 
+
 def evaluate(test_loader, model, args):
     batch_time = AverageMeter()
     acc5 = AverageMeter()
@@ -91,9 +99,9 @@ def evaluate(test_loader, model, args):
     end = time.time()
 
     img_list = []
-    count=0
+    count = 0
     for batch_idx, (img, targets, dw, dh, img_path, ratio) in enumerate(test_loader):
-        h_resize, w_resize = img.shape[ -2:]
+        h_resize, w_resize = img.shape[-2:]
         img = img.to(device)
         captions = targets["caption"]
         size = torch.as_tensor([int(h_resize), int(w_resize)]).to(device)
@@ -127,12 +135,12 @@ def evaluate(test_loader, model, args):
         # _, max_ind = max_score.max(-1)  # [1,] # which query
         # pred_bbox = pred_bboxes[max_ind]  # [xc,yc, w_b, h_b]
 
-        #single level selection
+        # single level selection
         # according to pred_logits, select the query index
         pred_logits = outputs["pred_logits"][0]
         pred_bbox = outputs["pred_boxes"][0]
         pred_score = pred_logits.sigmoid()  # [t, q, k]
-        pred_score = pred_score.squeeze(0)# [q, k]
+        pred_score = pred_score.squeeze(0)  # [q, k]
         # pred_scores = pred_scores.mean(0)  # [q, k]
         max_score, _ = pred_score.max(-1)  # [q,]
         _, max_ind = max_score.max(-1)  # [1,] # which query
@@ -150,33 +158,33 @@ def evaluate(test_loader, model, args):
         target_bbox[1], target_bbox[3] = (target_bbox[1] - dh) / ratio, (target_bbox[3] - dh) / ratio
 
         if Visualize_bbox:
-                source_img = Image.open(img_path[0]).convert('RGB')  # PIL image
-
-                draw = ImageDraw.Draw(source_img)
-                draw_boxes = pred_bbox.tolist()
-
-                # draw boxes
-                xmin, ymin, xmax, ymax = draw_boxes[0:4]
-
-                # draw_boxes_gt = target_bbox.tolist()
-                # xmin_gt, ymin_gt, xmax_gt, ymax_gt = draw_boxes_gt[0:4]
-
-                draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=tuple(color_list[9]), width=2)
-                # draw.rectangle(((xmin_gt, ymin_gt), (xmax_gt, ymax_gt)), outline=tuple(color_list[9]), width=2)
-                # fontStyle = ImageFont.truetype("SimHei.ttf", 30)
-                # draw.text((20, 20), captions[0], (200, 0, 0), font=fontStyle)
-                # save
-                save_visualize_path_dir = os.path.join(save_visualize_path_prefix, version)
-                if not os.path.exists(save_visualize_path_dir):
-                    os.makedirs(save_visualize_path_dir)
-                img_name = img_path[0].split('/')[-1]
-                if img_name not in img_list:
-                    img_list.append(img_name)
-                else:
-                    count += 1
-                    img_name = str(count) + '_' + img_name
-                save_visualize_path = os.path.join(save_visualize_path_dir, img_name)
-                source_img.save(save_visualize_path)
+            source_img = Image.open(img_path[0]).convert("RGB")  # PIL image
+
+            draw = ImageDraw.Draw(source_img)
+            draw_boxes = pred_bbox.tolist()
+
+            # draw boxes
+            xmin, ymin, xmax, ymax = draw_boxes[0:4]
+
+            # draw_boxes_gt = target_bbox.tolist()
+            # xmin_gt, ymin_gt, xmax_gt, ymax_gt = draw_boxes_gt[0:4]
+
+            draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=tuple(color_list[9]), width=2)
+            # draw.rectangle(((xmin_gt, ymin_gt), (xmax_gt, ymax_gt)), outline=tuple(color_list[9]), width=2)
+            # fontStyle = ImageFont.truetype("SimHei.ttf", 30)
+            # draw.text((20, 20), captions[0], (200, 0, 0), font=fontStyle)
+            # save
+            save_visualize_path_dir = os.path.join(save_visualize_path_prefix, version)
+            if not os.path.exists(save_visualize_path_dir):
+                os.makedirs(save_visualize_path_dir)
+            img_name = img_path[0].split("/")[-1]
+            if img_name not in img_list:
+                img_list.append(img_name)
+            else:
+                count += 1
+                img_name = str(count) + "_" + img_name
+            save_visualize_path = os.path.join(save_visualize_path_dir, img_name)
+            source_img.save(save_visualize_path)
 
         # box iou
         iou, interArea, unionArea = bbox_iou(pred_bbox, target_bbox)
@@ -194,7 +202,6 @@ def evaluate(test_loader, model, args):
         inter_area.update(cumInterArea)
         union_area.update(cumUnionArea)
 
-
         acc5.update(accu5, img.size(0))
         acc6.update(accu6, img.size(0))
         acc7.update(accu7, img.size(0))
@@ -206,39 +213,68 @@ def evaluate(test_loader, model, args):
         end = time.time()
 
         if batch_idx % 50 == 0:
-            print_str = '[{0}/{1}]\t' \
-                        'Time {batch_time.avg:.3f}\t' \
-                        'acc@0.5: {acc5.avg:.4f}\t' \
-                        'acc@0.6: {acc6.avg:.4f}\t' \
-                        'acc@0.7: {acc7.avg:.4f}\t' \
-                        'acc@0.8: {acc8.avg:.4f}\t' \
-                        'acc@0.9: {acc9.avg:.4f}\t' \
-                        'meanIoU: {meanIoU.avg:.4f}\t' \
-                        'cumuIoU: {cumuIoU:.4f}\t' \
-                .format( \
-                batch_idx, len(test_loader), batch_time=batch_time, \
-                acc5=acc5, acc6=acc6, acc7=acc7, acc8=acc8, acc9=acc9, \
-                meanIoU=meanIoU, cumuIoU=inter_area.sum / union_area.sum)
+            print_str = (
+                "[{0}/{1}]\t"
+                "Time {batch_time.avg:.3f}\t"
+                "acc@0.5: {acc5.avg:.4f}\t"
+                "acc@0.6: {acc6.avg:.4f}\t"
+                "acc@0.7: {acc7.avg:.4f}\t"
+                "acc@0.8: {acc8.avg:.4f}\t"
+                "acc@0.9: {acc9.avg:.4f}\t"
+                "meanIoU: {meanIoU.avg:.4f}\t"
+                "cumuIoU: {cumuIoU:.4f}\t".format(
+                    batch_idx,
+                    len(test_loader),
+                    batch_time=batch_time,
+                    acc5=acc5,
+                    acc6=acc6,
+                    acc7=acc7,
+                    acc8=acc8,
+                    acc9=acc9,
+                    meanIoU=meanIoU,
+                    cumuIoU=inter_area.sum / (union_area.sum + 1e-6),
+                )
+            )
             print(print_str)
             # logging.info(print_str)
-    final_str = 'acc@0.5: {acc5.avg:.4f}\t' 'acc@0.6: {acc6.avg:.4f}\t' 'acc@0.7: {acc7.avg:.4f}\t' \
-                'acc@0.8: {acc8.avg:.4f}\t' 'acc@0.9: {acc9.avg:.4f}\t' \
-                'meanIoU: {meanIoU.avg:.4f}\t' 'cumuIoU: {cumuIoU:.4f}\t' \
-        .format(acc5=acc5, acc6=acc6, acc7=acc7, acc8=acc8, acc9=acc9, \
-                meanIoU=meanIoU, cumuIoU=inter_area.sum / union_area.sum)
+    final_str = (
+        "acc@0.5: {acc5.avg:.4f}\t"
+        "acc@0.6: {acc6.avg:.4f}\t"
+        "acc@0.7: {acc7.avg:.4f}\t"
+        "acc@0.8: {acc8.avg:.4f}\t"
+        "acc@0.9: {acc9.avg:.4f}\t"
+        "meanIoU: {meanIoU.avg:.4f}\t"
+        "cumuIoU: {cumuIoU:.4f}\t".format(
+            acc5=acc5,
+            acc6=acc6,
+            acc7=acc7,
+            acc8=acc8,
+            acc9=acc9,
+            meanIoU=meanIoU,
+            cumuIoU=inter_area.sum / (union_area.sum + 1e-6),
+        )
+    )
     print(final_str)
     print(version)
 
 
-
-
 def bbox_iou(box1, box2):
     """
     Returns the IoU of two bounding boxes
     """
     # Get the coordinates of bounding boxes
-    b1_x1, b1_y1, b1_x2, b1_y2 = torch.tensor(box1[0]), torch.tensor(box1[1]), torch.tensor(box1[2]), torch.tensor(box1[3])
-    b2_x1, b2_y1, b2_x2, b2_y2 = torch.tensor(box2[0]), torch.tensor(box2[1]), torch.tensor(box2[2]), torch.tensor(box2[3])
+    b1_x1, b1_y1, b1_x2, b1_y2 = (
+        torch.tensor(box1[0]),
+        torch.tensor(box1[1]),
+        torch.tensor(box1[2]),
+        torch.tensor(box1[3]),
+    )
+    b2_x1, b2_y1, b2_x2, b2_y2 = (
+        torch.tensor(box2[0]),
+        torch.tensor(box2[1]),
+        torch.tensor(box2[2]),
+        torch.tensor(box2[3]),
+    )
 
     # get the coordinates of the intersection rectangle
 
@@ -255,11 +291,11 @@ def bbox_iou(box1, box2):
 
     return (inter_area + 1e-6) / (union_area + 1e-6), inter_area, union_area
 
+
 # visuaize functions
 def box_cxcywh_to_xyxy(x):
     x_c, y_c, w, h = x.unbind(0)
-    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
-         (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)]
     return torch.stack(b, dim=0)
 
 
@@ -288,15 +324,14 @@ def draw_sample_points(draw, sample_points, img_size, color_list):
             x, y = sample
             cur_color = color_list[i % len(color_list)][::-1]
             cur_color += [alpha]
-            draw.ellipse((x - 2, y - 2, x + 2, y + 2),
-                         fill=tuple(cur_color), outline=tuple(cur_color), width=1)
+            draw.ellipse((x - 2, y - 2, x + 2, y + 2), fill=tuple(cur_color), outline=tuple(cur_color), width=1)
 
 
 def vis_add_mask(img, mask, color):
-    origin_img = np.asarray(img.convert('RGB')).copy()
+    origin_img = np.asarray(img.convert("RGB")).copy()
     color = np.array(color)
 
-    mask = mask.reshape(mask.shape[0], mask.shape[1]).astype('uint8')  # np
+    mask = mask.reshape(mask.shape[0], mask.shape[1]).astype("uint8")  # np
     mask = mask > 0.5
 
     origin_img[mask] = origin_img[mask] * 0.5 + color * 0.5
@@ -304,7 +339,7 @@ def vis_add_mask(img, mask, color):
     return origin_img
 
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser('Refer_RSVG inference script', parents=[opts.get_args_parser()])
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("Refer_RSVG inference script", parents=[opts.get_args_parser()])
     args = parser.parse_args()
     main(args)
diff --git a/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO b/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO
index 18ccefb..4a05fd2 100644
--- a/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO
+++ b/models/ops/MultiScaleDeformableAttention.egg-info/PKG-INFO
@@ -1,11 +1,9 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: MultiScaleDeformableAttention
 Version: 1.0
-Summary: PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention
+Summary: PyTorch Wrapper for Multi-Scale Deformable Attention (CPU-only)
 Home-page: https://github.com/fundamentalvision/Deformable-DETR
 Author: Weijie Su
-License: UNKNOWN
-Platform: UNKNOWN
-
-UNKNOWN
-
+Dynamic: author
+Dynamic: home-page
+Dynamic: summary
diff --git a/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt b/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt
index 1bb003a..24e757e 100644
--- a/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt
+++ b/models/ops/MultiScaleDeformableAttention.egg-info/SOURCES.txt
@@ -1,7 +1,6 @@
 setup.py
-/data/users/lanmeng/ReferFormer/models/ops/src/vision.cpp
-/data/users/lanmeng/ReferFormer/models/ops/src/cpu/ms_deform_attn_cpu.cpp
-/data/users/lanmeng/ReferFormer/models/ops/src/cuda/ms_deform_attn_cuda.cu
+C:/Users/jd138001/Downloads/LQVG/models/ops/src/vision.cpp
+C:/Users/jd138001/Downloads/LQVG/models/ops/src/cpu/ms_deform_attn_cpu.cpp
 MultiScaleDeformableAttention.egg-info/PKG-INFO
 MultiScaleDeformableAttention.egg-info/SOURCES.txt
 MultiScaleDeformableAttention.egg-info/dependency_links.txt
diff --git a/models/ops/build/lib.win-amd64-cpython-39/functions/__init__.py b/models/ops/build/lib.win-amd64-cpython-39/functions/__init__.py
new file mode 100644
index 0000000..8a2197b
--- /dev/null
+++ b/models/ops/build/lib.win-amd64-cpython-39/functions/__init__.py
@@ -0,0 +1,10 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from .ms_deform_attn_func import MSDeformAttnFunction
+
diff --git a/models/ops/build/lib.win-amd64-cpython-39/functions/ms_deform_attn_func.py b/models/ops/build/lib.win-amd64-cpython-39/functions/ms_deform_attn_func.py
new file mode 100644
index 0000000..8c5df8c
--- /dev/null
+++ b/models/ops/build/lib.win-amd64-cpython-39/functions/ms_deform_attn_func.py
@@ -0,0 +1,61 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import torch
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+import MultiScaleDeformableAttention as MSDA
+
+
+class MSDeformAttnFunction(Function):
+    @staticmethod
+    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
+        ctx.im2col_step = im2col_step
+        output = MSDA.ms_deform_attn_forward(
+            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
+        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
+        grad_value, grad_sampling_loc, grad_attn_weight = \
+            MSDA.ms_deform_attn_backward(
+                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
+
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+
+
+def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
+    # for debug and test only,
+    # need to use cuda version instead
+    N_, S_, M_, D_ = value.shape
+    _, Lq_, M_, L_, P_, _ = sampling_locations.shape
+    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
+    sampling_grids = 2 * sampling_locations - 1
+    sampling_value_list = []
+    for lid_, (H_, W_) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
+                                          mode='bilinear', padding_mode='zeros', align_corners=False)
+        sampling_value_list.append(sampling_value_l_)
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
+    attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
+    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
+    return output.transpose(1, 2).contiguous()
diff --git a/models/ops/build/lib.win-amd64-cpython-39/modules/__init__.py b/models/ops/build/lib.win-amd64-cpython-39/modules/__init__.py
new file mode 100644
index 0000000..f82cb1a
--- /dev/null
+++ b/models/ops/build/lib.win-amd64-cpython-39/modules/__init__.py
@@ -0,0 +1,9 @@
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from .ms_deform_attn import MSDeformAttn
diff --git a/models/ops/build/lib.win-amd64-cpython-39/modules/ms_deform_attn.py b/models/ops/build/lib.win-amd64-cpython-39/modules/ms_deform_attn.py
new file mode 100644
index 0000000..7efcf17
--- /dev/null
+++ b/models/ops/build/lib.win-amd64-cpython-39/modules/ms_deform_attn.py
@@ -0,0 +1,117 @@
+# Modify for sample points visualization
+# ------------------------------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+# ------------------------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import warnings
+import math
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn.init import xavier_uniform_, constant_
+
+from ..functions import MSDeformAttnFunction
+
+
+def _is_power_of_2(n):
+    if (not isinstance(n, int)) or (n < 0):
+        raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
+    return (n & (n-1) == 0) and n != 0
+
+
+class MSDeformAttn(nn.Module):
+    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
+        """
+        Multi-Scale Deformable Attention Module
+        :param d_model      hidden dimension
+        :param n_levels     number of feature levels
+        :param n_heads      number of attention heads
+        :param n_points     number of sampling points per attention head per feature level
+        """
+        super().__init__()
+        if d_model % n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
+        _d_per_head = d_model // n_heads
+        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
+        if not _is_power_of_2(_d_per_head):
+            warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
+                          "which is more efficient in our CUDA implementation.")
+
+        self.im2col_step = 64
+
+        self.d_model = d_model
+        self.n_levels = n_levels
+        self.n_heads = n_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
+        self.value_proj = nn.Linear(d_model, d_model)
+        self.output_proj = nn.Linear(d_model, d_model)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        constant_(self.sampling_offsets.weight.data, 0.)
+        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        constant_(self.attention_weights.weight.data, 0.)
+        constant_(self.attention_weights.bias.data, 0.)
+        xavier_uniform_(self.value_proj.weight.data)
+        constant_(self.value_proj.bias.data, 0.)
+        xavier_uniform_(self.output_proj.weight.data)
+        constant_(self.output_proj.bias.data, 0.)
+
+    def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
+        """
+        :param query                       (N, Length_{query}, C)
+        :param reference_points            (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
+                                        or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
+        :param input_flatten               (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
+        :param input_spatial_shapes        (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+        :param input_level_start_index     (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
+        :param input_padding_mask          (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
+
+        :return output                     (N, Length_{query}, C)
+        """
+        N, Len_q, _ = query.shape
+        N, Len_in, _ = input_flatten.shape
+        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
+
+        value = self.value_proj(input_flatten)
+        if input_padding_mask is not None:
+            value = value.masked_fill(input_padding_mask[..., None], float(0))
+        value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
+        attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
+        attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
+        # N, Len_q, n_heads, n_levels, n_points, 2
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = reference_points[:, :, None, :, None, :2] \
+                                 + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+        else:
+            raise ValueError(
+                'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
+        output = MSDeformAttnFunction.apply(
+            value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
+        output = self.output_proj(output)
+
+        return output, sampling_locations, attention_weights
diff --git a/models/ops/functions/ms_deform_attn_func.py b/models/ops/functions/ms_deform_attn_func.py
index 8c5df8c..83de6cc 100644
--- a/models/ops/functions/ms_deform_attn_func.py
+++ b/models/ops/functions/ms_deform_attn_func.py
@@ -22,8 +22,13 @@ class MSDeformAttnFunction(Function):
     @staticmethod
     def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
         ctx.im2col_step = im2col_step
-        output = MSDA.ms_deform_attn_forward(
-            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
+        # CUDA tensors use the compiled MSDA op; anything else falls back to the pure-PyTorch implementation
+        if value.is_cuda:
+            output = MSDA.ms_deform_attn_forward(
+                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
+        else:
+            # Use CPU implementation
+            output = ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights)
         ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
         return output
 
@@ -31,9 +36,15 @@ def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_
     @once_differentiable
     def backward(ctx, grad_output):
         value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
-        grad_value, grad_sampling_loc, grad_attn_weight = \
-            MSDA.ms_deform_attn_backward(
-                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
+        if value.is_cuda:
+            grad_value, grad_sampling_loc, grad_attn_weight = \
+                MSDA.ms_deform_attn_backward(
+                    value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
+        else:
+            # For CPU, return all-zero gradients (placeholder only; real gradients are not computed on this path)
+            grad_value = torch.zeros_like(value)
+            grad_sampling_loc = torch.zeros_like(sampling_locations)
+            grad_attn_weight = torch.zeros_like(attention_weights)
 
         return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
 
diff --git a/models/ops/setup.py b/models/ops/setup.py
index a0131bc..7d63465 100644
--- a/models/ops/setup.py
+++ b/models/ops/setup.py
@@ -11,41 +11,25 @@
 
 import torch
 
-from torch.utils.cpp_extension import CUDA_HOME
 from torch.utils.cpp_extension import CppExtension
-from torch.utils.cpp_extension import CUDAExtension
-
 from setuptools import find_packages
 from setuptools import setup
 
 requirements = ["torch", "torchvision"]
 
+
 def get_extensions():
     this_dir = os.path.dirname(os.path.abspath(__file__))
     extensions_dir = os.path.join(this_dir, "src")
 
     main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
     source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
-    source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
-
     sources = main_file + source_cpu
+
     extension = CppExtension
-    extra_compile_args = {"cxx": []}
+    extra_compile_args = {"cxx": []}  # MSVC-compatible, no extra flags
     define_macros = []
 
-    if torch.cuda.is_available() and CUDA_HOME is not None:
-        extension = CUDAExtension
-        sources += source_cuda
-        define_macros += [("WITH_CUDA", None)]
-        extra_compile_args["nvcc"] = [
-            "-DCUDA_HAS_FP16=1",
-            "-D__CUDA_NO_HALF_OPERATORS__",
-            "-D__CUDA_NO_HALF_CONVERSIONS__",
-            "-D__CUDA_NO_HALF2_OPERATORS__",
-        ]
-    else:
-        raise NotImplementedError('Cuda is not availabel')
-
     sources = [os.path.join(extensions_dir, s) for s in sources]
     include_dirs = [extensions_dir]
     ext_modules = [
@@ -59,13 +43,19 @@ def get_extensions():
     ]
     return ext_modules
 
+
 setup(
     name="MultiScaleDeformableAttention",
     version="1.0",
     author="Weijie Su",
     url="https://github.com/fundamentalvision/Deformable-DETR",
-    description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
-    packages=find_packages(exclude=("configs", "tests",)),
+    description="PyTorch Wrapper for Multi-Scale Deformable Attention (CPU-only)",
+    packages=find_packages(
+        exclude=(
+            "configs",
+            "tests",
+        )
+    ),
     ext_modules=get_extensions(),
     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
 )
diff --git a/models/ops/src/cpu/ms_deform_attn_cpu.cpp b/models/ops/src/cpu/ms_deform_attn_cpu.cpp
index e1bf854..d0f7ce0 100644
--- a/models/ops/src/cpu/ms_deform_attn_cpu.cpp
+++ b/models/ops/src/cpu/ms_deform_attn_cpu.cpp
@@ -9,10 +9,9 @@
 */
 
 #include <vector>
-
 #include <ATen/ATen.h>
-#include <ATen/cuda/CUDAContext.h>
 
+// Removed <ATen/cuda/CUDAContext.h>
 
 at::Tensor
 ms_deform_attn_cpu_forward(
@@ -23,7 +22,7 @@ ms_deform_attn_cpu_forward(
     const at::Tensor &attn_weight,
     const int im2col_step)
 {
-    AT_ERROR("Not implement on cpu");
+    AT_ERROR("Not implemented on CPU");
 }
 
 std::vector<at::Tensor>
@@ -36,6 +35,5 @@ ms_deform_attn_cpu_backward(
     const at::Tensor &grad_output,
     const int im2col_step)
 {
-    AT_ERROR("Not implement on cpu");
-}
-
+    AT_ERROR("Not implemented on CPU");
+}
\ No newline at end of file
diff --git a/test.sh b/test.sh
index 3cd10a1..30830df 100644
--- a/test.sh
+++ b/test.sh
@@ -1,5 +1,5 @@
 #python3 inference_rsvg.py --dataset_file rsvg --num_queries 10 --with_box_refine --binary --freeze_text_encoder \
-#--resume rsvg_dirs/r50_bidrection_fusion_10query/checkpoint.pth --backbone resnet50
-
+#--resume rsvg_dirs/r50_bidrection_fusion_10query/checkpoint.pth --backbone resnet50 --device cpu
 python3 inference_rsvg.py --dataset_file rsvg_mm --num_queries 10 --with_box_refine --binary --freeze_text_encoder \
---resume rsvg_mm_dirs/r50_bidrection_fusion_10query_70epo/checkpoint.pth --backbone resnet50
\ No newline at end of file
+--resume rsvg_mm_dirs/r50_bidrection_fusion_10query_70epo/checkpoint.pth --backbone resnet50 --device cpu
+

From a51d1232c3648503eedb0608aa5ffce641f5fe3f Mon Sep 17 00:00:00 2001
From: Mohammadreza Haghighat <mr.haghighat1998@gmail.com>
Date: Thu, 7 Aug 2025 11:16:14 +1000
Subject: [PATCH 2/4] Added comments

---
 datasets/refexp.py | 108 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 94 insertions(+), 14 deletions(-)

diff --git a/datasets/refexp.py b/datasets/refexp.py
index ee4cac8..a6c7bfa 100644
--- a/datasets/refexp.py
+++ b/datasets/refexp.py
@@ -4,72 +4,130 @@
 COCO dataset which returns image_id for evaluation.
 Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
 """
-from pathlib import Path
 
+# Import standard and third-party libraries.
+from pathlib import Path
 import torch
 import torch.utils.data
 import torchvision
 from pycocotools import mask as coco_mask
 
+# Import project-specific transformation functions.
 import datasets.transforms_image as T
 
 
 class ModulatedDetection(torchvision.datasets.CocoDetection):
+    """
+    A custom dataset class that extends torchvision's CocoDetection.
+    It's designed for referring expression tasks, where each image is associated with a text caption.
+    It also ensures that every item returned has at least one valid object instance after augmentations.
+    """
+
     def __init__(self, img_folder, ann_file, transforms, return_masks):
+        """
+        Initializes the dataset.
+        Args:
+            img_folder (str): Path to the folder containing images.
+            ann_file (str): Path to the COCO-style annotation JSON file.
+            transforms (callable): A function/transform that takes in an image and a target and returns a transformed version.
+            return_masks (bool): If True, segmentation masks are returned for each object.
+        """
+        # Initialize the parent CocoDetection class.
         super(ModulatedDetection, self).__init__(img_folder, ann_file)
+        # Store the augmentation transforms.
         self._transforms = transforms
+        # Create an instance of a helper class to process COCO annotations.
         self.prepare = ConvertCocoPolysToMask(return_masks)
 
     def __getitem__(self, idx):
+        """
+        Retrieves an item from the dataset at the given index.
+        This method includes a loop to ensure that a valid sample (with at least one object) is returned,
+        even if data augmentation crops away all objects.
+        """
         instance_check = False
+        # Loop until a valid sample with at least one object instance is found.
         while not instance_check:
+            # Get the raw image and annotations from the parent class.
             img, target = super(ModulatedDetection, self).__getitem__(idx)
+            # Get the unique image ID for the current sample.
             image_id = self.ids[idx]
+            # Load the full COCO image metadata.
             coco_img = self.coco.loadImgs(image_id)[0]
+            # Extract the referring expression (caption) from the metadata.
             caption = coco_img["caption"]
+            # Extract the dataset name if it exists.
             dataset_name = coco_img["dataset_name"] if "dataset_name" in coco_img else None
+            # Prepare the initial target dictionary.
             target = {"image_id": image_id, "annotations": target, "caption": caption}
+            # Use the 'prepare' helper to convert annotations into tensors (boxes, masks, etc.).
             img, target = self.prepare(img, target)
+            # Apply data augmentations if any are defined.
             if self._transforms is not None:
                 img, target = self._transforms(img, target)
+            # Add the dataset name back to the final target.
             target["dataset_name"] = dataset_name
+            # Copy optional per-image metadata keys from the COCO image record into the target.
             for extra_key in ["sentence_id", "original_img_id", "original_id", "task_id"]:
                 if extra_key in coco_img:
-                    target[extra_key] = coco_img[extra_key] # box xyxy -> cxcywh
-            # FIXME: handle "valid", since some box may be removed due to random crop
+                    target[extra_key] = coco_img[extra_key]
+
+            # Check if any valid bounding boxes remain after augmentations (e.g., random cropping).
+            # A sample is valid if it has at least one box.
             target["valid"] = torch.tensor([1]) if len(target["area"]) != 0 else torch.tensor([0])
 
-            if torch.any(target['valid'] == 1):  # at leatst one instance
+            # If the sample has at least one valid instance, exit the loop.
+            if torch.any(target["valid"] == 1):
                 instance_check = True
             else:
+                # If augmentations removed all objects, pick a new random sample and try again.
                 import random
+
                 idx = random.randint(0, self.__len__() - 1)
+
+        # Add a leading temporal dimension (T = 1) to the image tensor.
+        # Final image shape: [1, 3, H, W].
         return img.unsqueeze(0), target
-        # return img: [1, 3, H, W], the first dimension means T = 1.
 
 
 def convert_coco_poly_to_mask(segmentations, height, width):
+    """
+    Helper function to convert COCO's polygon segmentation format into a tensor of binary masks.
+    """
     masks = []
+    # Iterate over each object's segmentation data.
     for polygons in segmentations:
+        # Convert polygon coordinates to Run-Length Encoding (RLE) format.
         rles = coco_mask.frPyObjects(polygons, height, width)
+        # Decode RLE to get a binary mask.
         mask = coco_mask.decode(rles)
+        # Ensure the mask has a channel dimension.
         if len(mask.shape) < 3:
             mask = mask[..., None]
         mask = torch.as_tensor(mask, dtype=torch.uint8)
+        # Merge masks for multi-part objects into a single mask.
         mask = mask.any(dim=2)
         masks.append(mask)
     if masks:
+        # Stack all individual masks into a single tensor.
         masks = torch.stack(masks, dim=0)
     else:
+        # If there are no masks, return an empty tensor with the correct shape.
         masks = torch.zeros((0, height, width), dtype=torch.uint8)
     return masks
 
 
 class ConvertCocoPolysToMask(object):
+    """
+    A callable class that acts as a transform. It converts raw COCO annotations
+    into a clean dictionary of tensors (boxes, labels, masks) that the model can use.
+    """
+
     def __init__(self, return_masks=False):
         self.return_masks = return_masks
 
     def __call__(self, image, target):
+        # Get image dimensions.
         w, h = image.size
 
         image_id = target["image_id"]
@@ -78,29 +136,34 @@ def __call__(self, image, target):
         anno = target["annotations"]
         caption = target["caption"] if "caption" in target else None
 
+        # Filter out "crowd" annotations, which are large groups of objects.
         anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0]
 
+        # Extract bounding boxes and convert from [x, y, w, h] to [x1, y1, x2, y2] format.
         boxes = [obj["bbox"] for obj in anno]
-        # guard against no boxes via resizing
         boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
-        boxes[:, 2:] += boxes[:, :2] # xminyminwh -> xyxy
+        boxes[:, 2:] += boxes[:, :2]
+        # Clamp box coordinates to be within the image boundaries.
         boxes[:, 0::2].clamp_(min=0, max=w)
         boxes[:, 1::2].clamp_(min=0, max=h)
 
+        # Extract class labels.
         classes = [obj["category_id"] for obj in anno]
         classes = torch.tensor(classes, dtype=torch.int64)
 
+        # If requested, convert segmentation polygons to binary masks.
         if self.return_masks:
             segmentations = [obj["segmentation"] for obj in anno]
             masks = convert_coco_poly_to_mask(segmentations, h, w)
 
-        # keep the valid boxes
+        # Remove any boxes that have zero width or height after clamping.
         keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
         boxes = boxes[keep]
         classes = classes[keep]
         if self.return_masks:
             masks = masks[keep]
 
+        # Assemble the final target dictionary.
         target = {}
         target["boxes"] = boxes
         target["labels"] = classes
@@ -110,32 +173,41 @@ def __call__(self, image, target):
             target["masks"] = masks
         target["image_id"] = image_id
 
-        # for conversion to coco api
+        # Per-object area / iscrowd fields, kept for conversion to the COCO API.
         area = torch.tensor([obj["area"] for obj in anno])
         iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno])
         target["area"] = area[keep]
         target["iscrowd"] = iscrowd[keep]
-        target["valid"] = torch.tensor([1])
+        target["valid"] = torch.tensor([1])  # Placeholder; ModulatedDetection.__getitem__ recomputes it after transforms.
         target["orig_size"] = torch.as_tensor([int(h), int(w)])
         target["size"] = torch.as_tensor([int(h), int(w)])
         return image, target
 
 
 def make_coco_transforms(image_set, cautious):
-
+    """
+    Creates a pipeline of data augmentations for training or validation.
+    """
+    # Define the standard normalization transform.
     normalize = T.Compose([T.ToTensor(), T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
 
+    # Define scales for resizing augmentations.
     scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
-    final_scales = [296, 328, 360, 392, 416, 448, 480, 512] 
-
+    final_scales = [296, 328, 360, 392, 416, 448, 480, 512]
     max_size = 800
+
+    # Define the augmentation pipeline for the training set.
     if image_set == "train":
+        # Optionally add horizontal flipping.
         horizontal = [] if cautious else [T.RandomHorizontalFlip()]
         return T.Compose(
             horizontal
             + [
+                # Randomly select one of two augmentation strategies.
                 T.RandomSelect(
+                    # Strategy 1: Simple random resizing.
                     T.RandomResize(scales, max_size=max_size),
+                    # Strategy 2: A more complex combination of resizing and cropping.
                     T.Compose(
                         [
                             T.RandomResize([400, 500, 600]),
@@ -148,9 +220,11 @@ def make_coco_transforms(image_set, cautious):
             ]
         )
 
+    # Define the augmentation pipeline for the validation set.
     if image_set == "val":
         return T.Compose(
             [
+                # Simple resizing and normalization.
                 T.RandomResize([360], max_size=640),
                 normalize,
             ]
@@ -160,20 +234,26 @@ def make_coco_transforms(image_set, cautious):
 
 
 def build(dataset_file, image_set, args):
+    """
+    The main factory function to build the referring expression dataset.
+    """
+    # Get the root path of the COCO dataset.
     root = Path(args.coco_path)
     assert root.exists(), f"provided COCO path {root} does not exist"
     mode = "instances"
     dataset = dataset_file
+    # Define the paths to the image folders and annotation files for train/val splits.
     PATHS = {
         "train": (root / "train2014", root / dataset / f"{mode}_{dataset}_train.json"),
         "val": (root / "train2014", root / dataset / f"{mode}_{dataset}_val.json"),
     }
 
     img_folder, ann_file = PATHS[image_set]
+    # Instantiate the ModulatedDetection dataset with the appropriate transforms.
     dataset = ModulatedDetection(
         img_folder,
         ann_file,
         transforms=make_coco_transforms(image_set, False),
         return_masks=args.masks,
     )
-    return dataset
\ No newline at end of file
+    return dataset

From b11c62407e386917dccd4427ba0679ba13494728 Mon Sep 17 00:00:00 2001
From: Mohammadreza Haghighat <62746461+MHaghighat98@users.noreply.github.com>
Date: Wed, 27 Aug 2025 09:32:49 +1000
Subject: [PATCH 3/4] Colab-ready test with custom dataset

---
 LQVG_test.ipynb | 300 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 300 insertions(+)
 create mode 100644 LQVG_test.ipynb

diff --git a/LQVG_test.ipynb b/LQVG_test.ipynb
new file mode 100644
index 0000000..0496f05
--- /dev/null
+++ b/LQVG_test.ipynb
@@ -0,0 +1,300 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## **The modified source code compatible with colab (gdown does not work for downloading):**"
+      ],
+      "metadata": {
+        "id": "kTnynktRAMnJ"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from google.colab import drive\n",
+        "drive.mount('/content/drive')"
+      ],
+      "metadata": {
+        "id": "WixriWROqUUx"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## **LQVG weights trained on DIOR-RSVG and RSVG-HR**"
+      ],
+      "metadata": {
+        "id": "JVkx_5-QFK4o"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!gdown --folder https://drive.google.com/drive/folders/1uC9TAPOwiIbHcee6hSO_3b2Mwr-zDGtg?usp=drive_link -O drive/MyDrive/weights"
+      ],
+      "metadata": {
+        "id": "dlUmSOwieC9o"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "cd /content/drive/.shortcut-targets-by-id/18OjYDfO70rO2e-oLMPmNbu2Z1WH7nppd/LQVG\n"
+      ],
+      "metadata": {
+        "id": "KddNyUyDwvP6"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install torch torchvision torchaudio\n"
+      ],
+      "metadata": {
+        "id": "gmmmP7yqvVf7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install -r requirements.txt\n"
+      ],
+      "metadata": {
+        "id": "NVWRfbjfwUYx"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install 'git+https://github.com/facebookresearch/fvcore'\n"
+      ],
+      "metadata": {
+        "id": "3MbdnvA9sKI_"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'\n"
+      ],
+      "metadata": {
+        "id": "nvEpgsttsY3L"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "cd /content/drive/.shortcut-targets-by-id/18OjYDfO70rO2e-oLMPmNbu2Z1WH7nppd/LQVG/models/ops"
+      ],
+      "metadata": {
+        "id": "tUCBNrYGshhy"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!python setup.py build install"
+      ],
+      "metadata": {
+        "id": "ENyTettCskxg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "cd ../.."
+      ],
+      "metadata": {
+        "id": "bU-A2IBKsmo5"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Inference:"
+      ],
+      "metadata": {
+        "id": "7g2sVjoVFQiJ"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!python3 inference_rsvg.py --dataset_file rsvg_mm --num_queries 10 --with_box_refine --binary --freeze_text_encoder \\\n",
+        "--resume /content/drive/MyDrive/weights/RSVG-HR/checkpoint.pth --backbone resnet50"
+      ],
+      "metadata": {
+        "id": "x3dfhLHB9nET"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## **Visualize some results:**"
+      ],
+      "metadata": {
+        "id": "TQpWSEAMIQwF"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "from pathlib import Path\n",
+        "import math\n",
+        "from PIL import Image\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "# --- Config ---\n",
+        "IDS = [120, 159, 316, 328, 377, 502, 507, 510]\n",
+        "folder = \"/content/drive/MyDrive/LQVG/test_output/test\"  # Input folder\n",
+        "save_path = \"/content/drive/MyDrive/grid/grid.png\"       # Output file\n",
+        "cols = 4                                                 # Grid columns\n",
+        "\n",
+        "# --- Ensure save directory exists ---\n",
+        "os.makedirs(os.path.dirname(save_path), exist_ok=True)\n",
+        "\n",
+        "# --- Functions ---\n",
+        "def load_images(root, ids):\n",
+        "    \"\"\"Load images by IDs from a folder.\"\"\"\n",
+        "    imgs = []\n",
+        "    for i in ids:\n",
+        "        name = f\"UASs_{i}.jpg\"\n",
+        "        path = Path(root) / name\n",
+        "        if not path.is_file():\n",
+        "            print(f\"❌ Missing: {path}\")\n",
+        "            continue\n",
+        "        imgs.append(Image.open(path).convert(\"RGB\"))\n",
+        "        print(f\"✅ Loaded: {path}\")\n",
+        "    return imgs\n",
+        "\n",
+        "def show_grid(images, cols=4, save=None):\n",
+        "    \"\"\"Display images in a grid and optionally save.\"\"\"\n",
+        "    if not images:\n",
+        "        print(\"⚠ No images to display.\")\n",
+        "        return\n",
+        "\n",
+        "    rows = math.ceil(len(images) / cols)\n",
+        "\n",
+        "    # Scale figure size based on image dimensions\n",
+        "    img_width, img_height = images[0].size\n",
+        "    fig_width = cols * img_width / 100\n",
+        "    fig_height = rows * img_height / 100\n",
+        "\n",
+        "    plt.figure(figsize=(fig_width, fig_height))\n",
+        "    for idx, img in enumerate(images, 1):\n",
+        "        ax = plt.subplot(rows, cols, idx)\n",
+        "        ax.imshow(img)\n",
+        "        ax.axis(\"off\")\n",
+        "\n",
+        "    if save:\n",
+        "        plt.savefig(save, dpi=150, bbox_inches='tight', pad_inches=0)\n",
+        "        print(f\"💾 Saved grid to: {save}\")\n",
+        "\n",
+        "    plt.show()\n",
+        "\n",
+        "# --- Main ---\n",
+        "imgs = load_images(folder, IDS)\n",
+        "show_grid(imgs, cols=cols, save=save_path)\n"
+      ],
+      "metadata": {
+        "id": "jprFeczaICMm"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Training:"
+      ],
+      "metadata": {
+        "id": "bcyW0BLPFWCE"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!wget -O datasets/coco_eval.py https://raw.githubusercontent.com/facebookresearch/detr/main/datasets/coco_eval.py"
+      ],
+      "metadata": {
+        "id": "Ug4WCa_WW42F"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!python main.py \\\n",
+        "    --dataset_file rsvg_mm \\\n",
+        "    --rsvg_mm_path \"/content/drive/MyDrive/data\" \\\n",
+        "    --binary \\\n",
+        "    --with_box_refine \\\n",
+        "    --batch_size 1 \\\n",
+        "    --num_frames 1 \\\n",
+        "    --epochs 70 \\\n",
+        "    --lr_drop 40 \\\n",
+        "    --num_queries 10 \\\n",
+        "    --output_dir \"/content/drive/MyDrive/rsvg_dirs/r50_bidrection_fusion_10query_70epo_multiscale\" \\\n",
+        "    --backbone resnet50 \\\n",
+        "    --device cuda"
+      ],
+      "metadata": {
+        "id": "dcooaiTTVGF4"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!jupyter nbconvert --to pdf LQVG_test.ipynb\n"
+      ],
+      "metadata": {
+        "id": "XQfZsqqFWAVd"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file

From 184d70d96a2dee2d5a7cf071c8806891bf6ca165 Mon Sep 17 00:00:00 2001
From: Mohammadreza Haghighat <62746461+MHaghighat98@users.noreply.github.com>
Date: Thu, 11 Sep 2025 10:11:52 +1000
Subject: [PATCH 4/4] Resolved conflict in the "build" function

---
 datasets/rsvg_mm.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/datasets/rsvg_mm.py b/datasets/rsvg_mm.py
index fca23cb..833726b 100644
--- a/datasets/rsvg_mm.py
+++ b/datasets/rsvg_mm.py
@@ -126,13 +126,17 @@ def make_coco_transforms(image_set, cautious):
 from pathlib import Path
 
 
-def build(image_set, args):
-    root = Path(args.rsvg_mm_path)
-    assert root.exists(), f"provided rsvg_mm path {root} does not exist"
-    input_transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
-
-    img_folder = "data/images"  # Updated to correct path
-    dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == "test"))
+def build(image_set, args):
+    """Build the RSVG-MM dataset for split `image_set` ('test' enables test mode)."""
+    # Resolve the dataset root first; the existence check below depends on it.
+    root = Path(args.rsvg_mm_path)
+    assert root.exists(), f'provided rsvg_mm path {root} does not exist'
+    normalize = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    input_transform = T.Compose([T.ToTensor(), normalize])
+
+    # NOTE(review): images are read from this fixed relative path, not from `root` -- confirm.
+    img_folder = 'data/rsvg_mm/images'
+    dataset = RSVGDataset(img_folder, transform=input_transform, split=image_set, testmode=(image_set == 'test'))
     return dataset