From 3d4c5a3837a6540a769baf5f4684f0e764ec14a4 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Tue, 18 Jul 2023 19:02:41 +0000 Subject: [PATCH 01/18] Added the bbbc-download-plugin:0.1.0-dev1 --- utils/bbbc-download-plugin/.bumpversion. | 0 utils/bbbc-download-plugin/.bumpversion.cfg | 27 + utils/bbbc-download-plugin/README.md | 0 utils/bbbc-download-plugin/VERSION | 1 + utils/bbbc-download-plugin/plugin.json | 40 + utils/bbbc-download-plugin/pyproject.toml | 29 + utils/bbbc-download-plugin/run-plugin.sh | 19 + .../plugins/utils/bbbc_download/BBBC_model.py | 708 ++++++++++++++++++ .../plugins/utils/bbbc_download/__init__.py | 2 + .../plugins/utils/bbbc_download/__main__.py | 100 +++ .../plugins/utils/bbbc_download/download.py | 144 ++++ .../plugins/utils/bbbc_download/mapping.py | 9 + utils/bbbc-download-plugin/tests/__init__.py | 0 13 files changed, 1079 insertions(+) create mode 100644 utils/bbbc-download-plugin/.bumpversion. create mode 100644 utils/bbbc-download-plugin/.bumpversion.cfg create mode 100644 utils/bbbc-download-plugin/README.md create mode 100644 utils/bbbc-download-plugin/VERSION create mode 100644 utils/bbbc-download-plugin/plugin.json create mode 100644 utils/bbbc-download-plugin/pyproject.toml create mode 100644 utils/bbbc-download-plugin/run-plugin.sh create mode 100644 utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py create mode 100644 utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py create mode 100644 utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py create mode 100644 utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py create mode 100644 utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/mapping.py create mode 100644 utils/bbbc-download-plugin/tests/__init__.py diff --git a/utils/bbbc-download-plugin/.bumpversion. b/utils/bbbc-download-plugin/.bumpversion. 
new file mode 100644 index 000000000..e69de29bb diff --git a/utils/bbbc-download-plugin/.bumpversion.cfg b/utils/bbbc-download-plugin/.bumpversion.cfg new file mode 100644 index 000000000..8cc773f0b --- /dev/null +++ b/utils/bbbc-download-plugin/.bumpversion.cfg @@ -0,0 +1,27 @@ +[bumpversion] +current_version = 0.1.0-dev0 +commit = True +tag = False +parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:VERSION] + +[bumpversion:file:src/polus/plugins/utils/bbbc_download/__init__.py] \ No newline at end of file diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/utils/bbbc-download-plugin/VERSION b/utils/bbbc-download-plugin/VERSION new file mode 100644 index 000000000..15a06bec5 --- /dev/null +++ b/utils/bbbc-download-plugin/VERSION @@ -0,0 +1 @@ +0.1.0-dev0 \ No newline at end of file diff --git a/utils/bbbc-download-plugin/plugin.json b/utils/bbbc-download-plugin/plugin.json new file mode 100644 index 000000000..ba0c6ec5b --- /dev/null +++ b/utils/bbbc-download-plugin/plugin.json @@ -0,0 +1,40 @@ +{ + "name": "BBBC Download", + "version": "0.1.0-dev0", + "title": "BBBC Download", + "description": "Downloads the datasets on the BBBC website", + "author": "Saket Prem(saket.prem@axleinfo.com), Matthew", + "institution": "National Center for Advancing Translational Sciences, National Institutes of Health", + "repository": "https://github.com/PolusAI/polus-plugins", + "website": "https://ncats.nih.gov/preclinical/core/informatics", + "citation": "", + "containerId": "polusai/bbbc-download-plugin:0.1.0-dev0", + 
"baseCommand": [ + "python3", + "-m", + "polus.plugins.utils.bbbc_download" + ], + "inputs": [ + { + "name": "name", + "type": "string", + "description": "The name of the datasets to be downloaded(spereate the datasets with a comma. eg: BBBC001,BBBC002,BBBC003 )", + "required": true + } + + ], + "outputs": [ + { + "name": "outDir", + "type": "genericData", + "description": "Output collection" + } + ], + "ui": [ + { + "key": "inputs.name", + "title": "Input name of datasets as string", + "description": "Input the name of the datasets to be downloaded as a string" + } + ] + } \ No newline at end of file diff --git a/utils/bbbc-download-plugin/pyproject.toml b/utils/bbbc-download-plugin/pyproject.toml new file mode 100644 index 000000000..c2b76de04 --- /dev/null +++ b/utils/bbbc-download-plugin/pyproject.toml @@ -0,0 +1,29 @@ +[tool.poetry] +name = "polus-plugins-utils-bbbc-download-plugin" +version = "0.1.0" +description = "" +authors = ["Your Name "] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = "^3.9.16" +typer = "^0.9.0" +pyarrow = "11.0.0" +scikit-image = "0.20.0" +vaex = "4.16.0" +bfio = "2.3.1.dev0" +beautifulsoup4 = "4.12.0" +numpy = "1.24.2" +pandas = "1.5.3" +requests = "2.28.2" +pydantic = "1.10.7" +mapping = "^0.1.6" +bump2version = "1.0.1" +mypy = "1.0.1" +tqdm = "^4.65.0" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/utils/bbbc-download-plugin/run-plugin.sh b/utils/bbbc-download-plugin/run-plugin.sh new file mode 100644 index 000000000..637544c99 --- /dev/null +++ b/utils/bbbc-download-plugin/run-plugin.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +version=$( dict: + if not values["path"].exists(): + raise ValueError("No metadata") + + return values + + @property + def size(self) -> int: + """Returns the size of the dataset's metadata in bytes.""" + + raw_path = root.joinpath(self.name, "raw/Metadata") + standard_path = root.joinpath(self.name, 
"standard/Metadata") + raw_sum = sum(os.path.getsize(file) for file in raw_path.rglob("*")) + standard_sum = sum(os.path.getsize(file) for file in standard_path.rglob("*")) + + return raw_sum + standard_sum + + +class GroundTruth(pydantic.BaseModel): + """Class that contains information about a dataset's ground truth.""" + + path: Path + name: str + + @pydantic.root_validator() + @classmethod + def valid_data(cls, values: dict) -> dict: + if not values["path"].exists(): + raise ValueError("No ground truth") + + return values + + @property + def size(self) -> int: + """Returns the size of the dataset's ground truth in bytes.""" + + raw_path = root.joinpath(self.name, "raw/Ground Truth") + standard_path = root.joinpath(self.name, "standard/Ground Truth") + raw_sum = sum(os.path.getsize(file) for file in raw_path.rglob("*")) + standard_sum = sum(os.path.getsize(file) for file in standard_path.rglob("*")) + + return raw_sum + standard_sum + + +class Images(pydantic.BaseModel): + """Class that contains information about a dataset's images.""" + + path: Path + name: str + + @pydantic.root_validator() + @classmethod + def valid_data(cls, values: dict) -> dict: + if not values["path"].exists(): + raise ValueError("No images") + + return values + + @property + def size(self) -> int: + """Returns the size of the dataset's images in bytes.""" + + raw_path = root.joinpath(self.name, "raw/Images") + standard_path = root.joinpath(self.name, "standard/Images") + raw_sum = sum(os.path.getsize(file) for file in raw_path.rglob("*")) + standard_sum = sum(os.path.getsize(file) for file in standard_path.rglob("*")) + + return raw_sum + standard_sum + + +class BBBCDataset(pydantic.BaseModel): + """Class that models a BBBC dataset. + + Attributes: + name: The name of the dataset. 
+ images: An Images object that contains information about the dataset's images + ground_truth: A GroundTruth object that contains information about the dataset's ground truth + metadata: A Metadata object that contains information about the dataset's metadata + """ + + name: str + images: Optional[Images] = None + ground_truth: Optional[GroundTruth] = None + metadata: Optional[Metadata] = None + + @pydantic.validator("name") + @classmethod + def valid_name(cls, v: str) -> str: + """Validates the name of the dataset. + + Args: + v: The name of the dataset to be downloaded. + + Returns: + The name provided if validation is successful. + """ + + if v not in list(BBBC.combined_table["Accession"]): + raise ValueError( + v + + " is an invalid dataset name. Valid dataset names belong to an existing BBBC dataset." + ) + + return v + + @classmethod + def create_dataset(cls, name: str) -> Union["BBBCDataset", None]: + """Creates a dataset. + + Args: + name: The name of the dataset to be created. + + Returns: + A new instance of a Dataset object or None if the validation fails. + """ + + try: + if name in exception_sets: + dataset_class = globals()[name] + + return dataset_class(name=name) + else: + return BBBCDataset(name=name) + except ValueError as e: + print(e) + + return None + + @property + def info(self) -> Dict[str, Union[str, np.int64]]: + """Provides information about the dataset such as its description and total images. + + Returns: + A dictionary that contains information about the dataset. 
+ """ + + table = BBBC.combined_table + + row = table.loc[table["Accession"] == self.name] + + info = { + "Description": row["Description"].values[0], + "Mode": row["Mode"].values[0], + "Fields per sample": row["Fields per sample"].values[0], + "Total Fields": row["Total Fields"].values[0], + "Total Images": row["Total Images"].values[0], + "Ground truth types": self._ground_truth_types(), + } + + return info + + @property + def size(self) -> int: + """Returns the size of the dataset in bytes.""" + + dataset_path = root.joinpath(self.name) + + return sum(os.path.getsize(file) for file in dataset_path.rglob("*")) + + def _ground_truth_types(self) -> List[str]: + """Provides the types of ground truth used by the dataset. + + Returns: + A list of strings where each string is a type of ground truth. + """ + + res = requests.get("https://bbbc.broadinstitute.org/image_sets") + soup = bs4.BeautifulSoup(res.content, "html.parser") + types = [] + + for t in soup.find_all("table")[:3]: + for row in t.find_all("tr"): + cols = row.find_all("td") + + if len(cols) > 0 and cols[0].text == self.name: + for link in cols[6].find_all("a"): + types.append(link.attrs["href"].split("#")[-1]) + + return types + + def _init_data(self,download_path:Path) -> None: + """Initializes the images, ground_truth, and metadata attributes of the dataset.""" + download_path=download_path.joinpath("BBBC") + + images_path = download_path.joinpath(self.name, "raw/Images") + truth_path = download_path.joinpath(self.name, "raw/Ground Truth") + meta_path = download_path.joinpath(self.name, "raw/Metadata") + + try: + self.images = Images(path=images_path, name=self.name) + except ValueError: + pass + + try: + self.ground_truth = GroundTruth(path=truth_path, name=self.name) + except ValueError: + pass + + try: + self.metadata = Metadata(path=meta_path, name=self.name) + except ValueError: + pass + + if self.images == None: + print(self.name + " has no images.") + + if self.ground_truth == None and 
self.metadata == None: + print(self.name + " has no ground truth or metadata.") + + return + + def raw(self,download_path: Path) -> None: + """Download the dataset's raw data.""" + + download(self.name,download_path) + self._init_data(download_path) + + return + + def standard(self, extension: str) -> None: + """Standardize the dataset's raw data. + + Args: + extension: The extension of the standard image. Can be ".ome.tif" or ".ome.zarr". + """ + + if extension not in [".ome.tif", ".ome.zarr"]: + print( + f"ERROR: {extension} is an invalid extension for standardization. Must be .ome.tif or .ome.zarr." + ) + return + + if self.images == None: + print( + f"ERROR: Images for {self.name} have not been downloaded so they cannot be standardized." + ) + return + + standard_folder = Path(root, self.name, "standard") + arrow_file = Path("arrow", self.name + ".arrow") + arrow_table = pq.read_table(arrow_file) + df = vaex.from_arrow_table(arrow_table) + + if not standard_folder.exists(): + standard_folder.mkdir(parents=True, exist_ok=True) + + for i, row in df.iterrows(): + func = globals()[self.name + "_mapping"] + out_file = func(row, extension) + raw_image = io.imread(row["Path"]) + num_channels = 1 if len(raw_image.shape) == 2 else raw_image.shape[2] + + if row["Image Type"] == "Intensity": + sub_folder = "Images" + elif row["Image Type"] == "Ground Truth": + sub_folder = "Ground Truth" + elif row["Image Type"] == "Metadata": + sub_folder = "Metadata" + else: + print("ERROR: Invalid value for attribute Image Type") + return + + save_path = standard_folder.joinpath(sub_folder) + + if not save_path.exists(): + save_path.mkdir(parents=True, exist_ok=True) + + with BioWriter(save_path.joinpath(out_file)) as bw: + bw.X, bw.Y, bw.Z, bw.C = ( + raw_image.shape[1], + raw_image.shape[0], + num_channels, + 1, + ) + bw.dtype = raw_image.dtype + bw[:] = raw_image + + print(f"Finished standardizing {self.name}") + + return + + +class BBBC019(BBBCDataset): + def 
raw(self,download_path:Path) -> None: + download(self.name) + download_path=download_path.joinpath("BBBC") + + # Separate images from ground truth + save_location = download_path.joinpath("BBBC019") + images_folder = save_location.joinpath("raw/Images") + truth_folder = save_location.joinpath("raw/Ground Truth") + + for set in [ + x + for x in images_folder.iterdir() + if x.name not in [".DS_Store", "__MACOSX"] + ]: + for obj in [ + x + for x in set.iterdir() + if x.name not in ["images", "measures.mat", "desktop.ini", ".DS_Store"] + ]: + src = images_folder.joinpath(set.name, obj.name) + dst = truth_folder.joinpath(set.name, obj.name) + + if dst.exists(): + try: + shutil.rmtree(src) + except NotADirectoryError as e: + print(e) + else: + shutil.move(src, dst) + + self._init_data(download_path) + + return + + +class BBBC029(BBBCDataset): + def raw(self,download_path:Path) -> None: + print("Started downloading BBBC029") + download_path=download_path.joinpath("BBBC") + + save_location = download_path.joinpath("BBBC029", "raw") + + if not save_location.exists(): + save_location.mkdir(parents=True, exist_ok=True) + + file_path = save_location.joinpath("Images") + get_url( + "https://data.broadinstitute.org/bbbc/BBBC029/images.zip", + file_path, + "BBBC029", + ) + + file_path = save_location.joinpath("Ground Truth") + get_url( + "https://data.broadinstitute.org/bbbc/BBBC029/ground_truth.zip", + file_path, + "BBBC029", + ) + + print("BBBC029 has finished downloading") + + self._init_data(download_path) + + return + + +class BBBC041(BBBCDataset): + def raw(self,download_path:Path) -> None: + download(self.name) + download_path=download_path.joinpath("BBBC") + + # Separate images from ground truth + save_location = download_path.joinpath("BBBC041") + file_names = ["test.json", "training.json"] + + if not save_location.joinpath("raw/Ground Truth").exists(): + save_location.joinpath("raw/Ground Truth").mkdir( + parents=True, exist_ok=True + ) + + for file in file_names: + src 
= save_location.joinpath("raw/Images/malaria", file) + dst = save_location.joinpath("raw/Ground Truth") + + if dst.joinpath(file).exists(): + os.remove(src) + else: + shutil.move(src, dst) + + self._init_data(download_path) + + return + + +class BBBC042(BBBCDataset): + def raw(self,download_path:Path) -> None: + print("Started downloading BBBC042") + download_path=download_path.joinpath("BBBC") + + save_location = download_path.joinpath("BBBC042", "raw") + + if not save_location.exists(): + save_location.mkdir(parents=True, exist_ok=True) + + file_path = save_location.joinpath("Images") + get_url( + "https://data.broadinstitute.org/bbbc/BBBC042/images.zip", + file_path, + "BBBC042", + ) + + file_path = save_location.joinpath("Ground Truth") + get_url( + "https://data.broadinstitute.org/bbbc/BBBC042/positions.zip", + file_path, + "BBBC042", + ) + + print("BBBC042 has finished downloading") + + self._init_data(download_path) + + return + + +class BBBC046(BBBCDataset): + def raw(self, download_path: Path) -> None: + download(self.name) + download_path=download_path.joinpath("BBBC") + + # Separate images from ground truth + try: + save_location = download_path.joinpath(self.name) + images_folder = save_location.joinpath("raw/Images") + truth_folder = save_location.joinpath("raw/Ground Truth") + + # Extract these files because they do not extract automatically + for file in ["OE-ID350-AR-1.zip", "OE-ID350-AR-2.zip", "OE-ID350-AR-4.zip", "OE-ID350-AR-8.zip"]: + with ZipFile(images_folder.joinpath(file), "r") as zfile: + zfile.extractall(images_folder) + + os.remove(images_folder.joinpath(file)) + + if not truth_folder.exists(): + truth_folder.mkdir(parents=True, exist_ok=True) + + # Iterate over folders in the images folder + for folder in images_folder.iterdir(): + if not truth_folder.joinpath(folder.name).exists(): + truth_folder.joinpath(folder.name).mkdir( + parents=True, exist_ok=True + ) + + # Move ground truth data to Ground Truth folder + for obj in 
folder.iterdir(): + if obj.name.endswith((".txt", ".tif")): + src = obj + dst = truth_folder.joinpath(folder.name, obj.name) + + if dst.exists(): + os.remove(src) + else: + shutil.move(src, dst) + + self._init_data(download_path) + except Exception as e: + print( + "BBBC046 downloaded successfully but an error occurred when organizing raw data." + ) + print("ERROR: " + str(e)) + + return + + +class BBBC054(BBBCDataset): + def raw(self, download_path:Path) -> None: + download(self.name) + download_path=download_path.joinpath("BBBC") + + # Separate images from ground truth + save_location = download_path.joinpath(self.name) + src = save_location.joinpath("raw/Images", "Replicate1annotation.csv") + dst = save_location.joinpath("raw/Ground Truth", "Replicate1annotation.csv") + + if not dst.exists(): + dst.mkdir(parents=True, exist_ok=True) + + if dst.exists(): + os.remove(src) + else: + shutil.move(src, dst) + + self._init_data(download_path) + + return + + +class IDAndSegmentation: + """Class that models the Identification and segmentation table on https://bbbc.broadinstitute.org/image_sets. + + Attributes: + name: The name of the table as seen on the BBBC image set webpage + table: The Identification and segmentation table as a pandas DataFrame + """ + + name: str = "Identification and segmentation" + table: pd.DataFrame = tables[0] + + @classmethod + @property + def datasets(cls) -> List[BBBCDataset]: + """Returns a list of all datasets in the table. + + Returns: + A list containing a Dataset object for each dataset in the table. 
+ """ + + return [BBBCDataset.create_dataset(name) for name in cls.table["Accession"]] + + @classmethod + def raw(cls,download_path:Path) -> None: + """Downloads raw data for every dataset in this table""" + + num_workers = max(cpu_count(), 2) + threads = [] + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + for dataset in IDAndSegmentation.datasets: + threads.append(executor.submit(dataset.raw(download_path))) + + for f in tqdm( + as_completed(threads), desc=f"Downloading data", total=len(threads) + ): + f.result() + + +class PhenotypeClassification: + """Class that models the Phenotype classification table on https://bbbc.broadinstitute.org/image_sets. + + Attributes: + name: The name of the table as seen on the BBBC image set webpage + table: The Phenotype classification table as a pandas DataFrame + """ + + name: str = "Phenotype classification" + table: pd.DataFrame = tables[1] + + @classmethod + @property + def datasets(cls) -> List[BBBCDataset]: + """Returns a list of all datasets in the table. + + Returns: + A list containing a Dataset object for each dataset in the table. + """ + + return [BBBCDataset.create_dataset(name) for name in cls.table["Accession"]] + + @classmethod + def raw(cls,download_path:Path) -> None: + """Downloads raw data for every dataset in this table""" + + num_workers = max(cpu_count(), 2) + threads = [] + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + for dataset in PhenotypeClassification.datasets: + threads.append(executor.submit(dataset.raw(download_path))) + + for f in tqdm( + as_completed(threads), desc=f"Downloading data", total=len(threads) + ): + f.result() + + +class ImageBasedProfiling: + """Class that models the Image-based Profiling table on https://bbbc.broadinstitute.org/image_sets. 
+ + Attributes: + name: The name of the table as seen on the BBBC image set webpage + table: The Image-based Profiling table as a pandas DataFrame + """ + + name: str = "Image-based Profiling" + table: pd.DataFrame = tables[2] + + @classmethod + @property + def datasets(cls) -> List[BBBCDataset]: + """Returns a list of all datasets in the table. + + Returns: + A list containing a Dataset object for each dataset in the table. + """ + + return [BBBCDataset.create_dataset(name) for name in cls.table["Accession"]] + + @classmethod + def raw(cls,download_path:Path) -> None: + """Downloads raw data for every dataset in this table""" + + num_workers = max(cpu_count(), 2) + threads = [] + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + for dataset in ImageBasedProfiling.datasets: + threads.append(executor.submit(dataset.raw(download_path))) + + for f in tqdm( + as_completed(threads), desc=f"Downloading data", total=len(threads) + ): + f.result() + + +class BBBC: + """Class that models the Broad Bioimage Benchmark Collection (BBBC). + + BBBC has tables that contain datasets. Datasets are separated into tables + based on how they can be used. Each dataset has images and ground truth. + Read more about BBBC here: https://bbbc.broadinstitute.org. + """ + + @classmethod + @property + def datasets(cls) -> List[BBBCDataset]: + """Returns a list of all datasets in BBBC. + + Returns: + A list containing a Dataset object for each dataset in BBBC. + """ + + table = BBBC.combined_table + + return [BBBCDataset.create_dataset(name) for name in table["Accession"]] + + @classmethod + @property + def combined_table(cls) -> pd.DataFrame: + """Combines each table on https://bbbc.broadinstitute.org/image_sets into a single table. + + Returns: + A pandas DataFrame representation of the combined table. 
+ """ + + # Combine each table into one table + combined_table = ( + pd.concat(tables) + .drop(columns=["Ground truth"]) + .drop_duplicates("Accession") + ) + + return combined_table + + @classmethod + def raw(cls,download_path:Path) -> None: + """Downloads raw data for every dataset.""" + + num_workers = max(cpu_count(), 2) + threads = [] + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + for dataset in BBBC.datasets: + threads.append(executor.submit(dataset.raw(download_path))) + + for f in tqdm( + as_completed(threads), desc=f"Downloading data", total=len(threads) + ): + f.result() diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py new file mode 100644 index 000000000..3a98e7a8a --- /dev/null +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py @@ -0,0 +1,2 @@ +"""Bbbc Download.""" +__version__ = "0.1.0-dev0" \ No newline at end of file diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py new file mode 100644 index 000000000..79439a500 --- /dev/null +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py @@ -0,0 +1,100 @@ +import json +import os +import logging +from pathlib import Path +from concurrent.futures import ProcessPoolExecutor, as_completed +from typing import Any, Optional + +import typer +from tqdm import tqdm +from polus.plugins.utils.bbbc_download.BBBC_model import BBBC, BBBCDataset, IDAndSegmentation, PhenotypeClassification, ImageBasedProfiling +from sys import platform +from multiprocessing import cpu_count + + + +if platform == "linux" or platform == "linux2": + NUM_THREADS = len(os.sched_getaffinity(0)) # type: ignore +else: + NUM_THREADS = max(cpu_count() // 2, 2) + +app = typer.Typer() + +# Initialize the logger +logging.basicConfig( + 
format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +logger = logging.getLogger("polus.plugins.utils.bbbc_download") +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) + +@app.command() +def main( + name: str= typer.Option( + ..., "--name", help="The name of the dataset that is to be downloaded" + ), + out_dir: Path= typer.Option( + ...,"--outDir", help="The path for downloading the dataset" + ) + +)-> None: + """Download the required dataset from the BBBC dataaset.""" + logger.info(f"name = {name}") + logger.info(f"outDir = {out_dir}") + """Checking if output directory exists. If it does not exist then a designated path is created.""" + if not out_dir.exists(): + out_dir.mkdir() + logger.info(f"{out_dir} did not exists. Creating new path.") + + with ProcessPoolExecutor(max_workers=NUM_THREADS) as executor: + threads=[] + names=name.split(",") + for n in names: + if(n=='IDAndSegmentation'): + threads.append( + executor.submit(IDAndSegmentation.raw,out_dir) + ) + + elif(n=='PhenotypeClassification'): + threads.append( + executor.submit(PhenotypeClassification.raw,out_dir) + ) + + + + elif(n=='ImageBasedProfiling'): + threads.append( + executor.submit(ImageBasedProfiling.raw,out_dir) + ) + + elif(n=='BBBC'): + threads.append( + executor.submit(BBBC.raw,out_dir) + ) + + + else: + d=executor.submit(BBBCDataset.create_dataset, n) + d_name=d.result() + threads.append( + executor.submit(d_name.raw,out_dir) + ) + + + for f in tqdm( + as_completed(threads), + total=len(threads), + mininterval=5, + desc=f"donwloading the dataset", + initial=0, + unit_scale=True, + colour="cyan", + ): + f.result() + + + + + +if __name__ == "__main__": + app() \ No newline at end of file diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py new file mode 100644 index 000000000..b27493bde --- /dev/null +++ 
b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py @@ -0,0 +1,144 @@ +from pathlib import Path +import re +from urllib.request import urlretrieve +from urllib.error import URLError +from zipfile import ZipFile + +import bs4 +import shutil +import requests + +match_str = ( + "Images|Ground truth|Ground Truth|Metadata|Hand-annotated Ground Truth Images" +) +endings = (".txt", ".csv", ".tif", ".xlsx", ".xls", ".lst") + + +def get_lower_tags(tag: bs4.element.Tag) -> list: + """Get all tags between the tag argument and the next tag of the same type. + Args: + tag: Get tags between this tag and the next tag of the same type + """ + + tags = [] + + for sib in tag.find_next_siblings(): + if sib.name == tag.name: + break + else: + tags.append(sib) + + return tags + + +def get_url(url: str, save_location: Path, name: str) -> None: + """Get the given url and save it. + Args: + url: The url to get + save_location: The path where the files will be saved + name: The name of the dataset that the url is associated with + """ + + file_name = url.split("/")[-1] + + for download_attempts in range(10): + if url.endswith(endings): + try: + if not save_location.exists(): + save_location.mkdir(parents=True, exist_ok=True) + + urlretrieve(url, save_location.joinpath(file_name)) + except URLError as e: + if download_attempts == 9: + print("FAILED TO DOWNLOAD: " + url + " for " + name) + print("ERROR: " + str(e)) + + continue + elif url.endswith(".zip"): + try: + zip_path, _ = urlretrieve(url) + + with ZipFile(zip_path, "r") as zfile: + zfile.extractall(save_location) + except URLError as e: + if download_attempts == 9: + print("FAILED TO DOWNLOAD: " + url + " for " + name) + print("ERROR: " + str(e)) + + continue + except Exception as e: + print(e) + + continue + + break + + return + +def remove_macosx(name:str, save_location:Path)-> None: + images_path=save_location.joinpath("Images") + folders=[folders for folders in images_path.iterdir() if folders.is_dir()] 
+ for f in folders: + if f.name=="__MACOSX": + shutil.rmtree(f) + print("Deleted the __MACOSX folder in " + name) + + + + +def download(name: str,download_path:Path) -> None: + """Download a single dataset. + Args: + name: The name of the dataset to be downloaded + """ + + print("Started downloading " + name) + download_path=download_path.joinpath("BBBC") + + save_location = download_path.joinpath(name, "raw") + + if not save_location.exists(): + save_location.mkdir(parents=True, exist_ok=True) + + dataset_url = "https://bbbc.broadinstitute.org/" + name + + dataset_page = requests.get(dataset_url) + soup = bs4.BeautifulSoup(dataset_page.content, "html.parser") + + for heading in soup.find_all("h3"): + # Ignore headings that we aren't interested in + if re.match(match_str, heading.text.strip()) == None: + continue + + if heading.text.strip() == "Images": + sub_folder = "Images" + elif heading.text.strip() == "Metadata": + sub_folder = "Metadata" + else: + sub_folder = "Ground Truth" + + # Iterate over every tag under the current heading and above the next heading + for tag in get_lower_tags(heading): + links = tag.find_all("a") + data_links = [ + l for l in links if l.attrs["href"].endswith((".zip", *endings)) + ] + + for link in data_links: + data_url = link.attrs["href"] + file_path = save_location.joinpath(sub_folder) + + get_url(data_url, file_path, name) + + # Manually download BBBC018 ground truth because its webpage structure is incorrect + if name == "BBBC018" and re.match("Ground truth", heading.text.strip()): + url = "https://data.broadinstitute.org/bbbc/BBBC018/BBBC018_v1_outlines.zip" + + file_path = save_location.joinpath(sub_folder) + + get_url(url, file_path, "BBBC018") + + print(name + " has finished downloading") + remove_macosx(name,save_location) + + return diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/mapping.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/mapping.py new file mode 100644 index 
000000000..6650c6c55 --- /dev/null +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/mapping.py @@ -0,0 +1,9 @@ + + +def BBBC001_mapping(row: dict, extension: str) -> str: + # important attributes: plate, well, wel num, control, field, channel, treatment, image type + + return f"a01_w01_n01_p01_f0{row['Field'] + 1}_c01_t00_i01{extension}" + + +__all__ = ["BBBC001_mapping"] \ No newline at end of file diff --git a/utils/bbbc-download-plugin/tests/__init__.py b/utils/bbbc-download-plugin/tests/__init__.py new file mode 100644 index 000000000..e69de29bb From 5b056b8aaeb3976a1b679cb510de0f4279cb0542 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 21 Jul 2023 21:41:39 +0000 Subject: [PATCH 02/18] Tested bbbc plugin --- utils/bbbc-download-plugin/Dockerfile | 20 +++ utils/bbbc-download-plugin/README.md | 145 ++++++++++++++++++ utils/bbbc-download-plugin/build-docker.sh | 4 + utils/bbbc-download-plugin/plugin.json | 2 +- utils/bbbc-download-plugin/pyproject.toml | 8 +- utils/bbbc-download-plugin/run-plugin.sh | 2 +- .../plugins/utils/bbbc_download/BBBC_model.py | 65 +++++--- .../plugins/utils/bbbc_download/__main__.py | 26 +++- .../plugins/utils/bbbc_download/download.py | 11 +- utils/bbbc-download-plugin/tests/__init__.py | 1 + utils/bbbc-download-plugin/tests/test_main.py | 84 ++++++++++ 11 files changed, 334 insertions(+), 34 deletions(-) create mode 100644 utils/bbbc-download-plugin/Dockerfile create mode 100644 utils/bbbc-download-plugin/build-docker.sh create mode 100644 utils/bbbc-download-plugin/tests/test_main.py diff --git a/utils/bbbc-download-plugin/Dockerfile b/utils/bbbc-download-plugin/Dockerfile new file mode 100644 index 000000000..da89ce48c --- /dev/null +++ b/utils/bbbc-download-plugin/Dockerfile @@ -0,0 +1,20 @@ +FROM polusai/bfio:2.1.9 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" + +# Work directory 
defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.plugins.utils.bbbc_download"] +CMD ["--help"] \ No newline at end of file diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index e69de29bb..9e9e4b10c 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -0,0 +1,145 @@ +#BBBC Download (0.1.0-dev0) + +This plugin is designed to download the necessary datasets from the Broad Bioimage Benchmark Collection(BBBC) website. + +For information on the BBBC dataset, visit +[BBBC dataset information](https://bbbc.broadinstitute.org/image_sets/). + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Options + +This plugin takes 1 input arguments and +1 output argument: + +| Name | Description | I/O | Type | +| --------------- | ------------------------------------------------------------ | ------ | ----------- | +| `--name ` | The name of the datasets to be downloaded | Input | String | +| `--outDir` | Directory to store the downloaded datasets | Output | genericData | + +The Following are valid names for datasets: +"all"- To download all the datasets from the bbbc website +"IDAndSegmentation"- To download the datasets from the Identification and segmentation table +"PhenotypeClassification"- To download the datasets from the Phenotype classification table +"ImageBasedProfiling"- To download the datasets from the Image-based Profiling table + +To download specific datasets from the website, give the name of each dataset in the input argument seperated by a comma. 
eg: --name="BBBC001,BBBC002,BBBC003" + + +# BBBC Model +The classes in BBBC_model.py model the data from the [Broad Bioimage Benchmark Collection (BBBC)](https://bbbc.broadinstitute.org/image_sets). The tables on this webpage classify datasets by their biological application. Each dataset has a webpage that contains links to the data and describes information about the dataset. Almost every dataset has image data and ground truth data. There are a few datasets that have metadata rather than ground truth data. + +# Classes +This section describes the classes and functions used to model the BBBC. + +## BBBC +The `BBBC` class contains functions used for interacting with every dataset in the BBBC. + +### Functions +`datasets()`: Returns a list of all the datasets in the collection. + +`combined_table()`: Combines each table on the BBBC image set webpage into a single pandas DataFrame. + +`raw()`: Downloads all of the datasets in the collection. + +## Table Classes +There is a class for each table on the BBBC image set webpage. The classes are `IDAndSegmentation`, `PhenotypeClassification`, and `ImageBasedProfiling`. They have the same attributes and functions. + +### Attributes +`name`: The name of the table as it appears on the BBBC image set webpage. + +`table`: A pandas DataFrame representation of the table. + +### Functions +`datasets()`: Returns a list of all the datasets in the table. + +`raw()`: Downloads all of the datasets in the table. + +## BBBCDataset +The `BBBCDataset` class models individual datasets. + +*Note*: some datasets need specialized functionality so they cannot be modeled by the general BBBCDataset class. These datasets have their own classes with the specialized functionality implemented there. + +### Attributes +`name`: A string that represents the dataset's name. The provided name must be the name of an existing dataset or else an exception will be raised. + +`images`: An Images object that contains information about the dataset's images. 
Set to `None` until raw data is downloaded. + +`ground_truth`: A GroundTruth object that contains information about the dataset's ground truth. Set to `None` until raw data is downloaded. + +`metadata`: A Metadata object that contains information about the dataset's metadata. Set to `None` until raw data is downloaded. + +*Note*: The `images`, `ground_truth`, or `metadata` attributes will be `None` after downloading raw data if the dataset has no images, ground truth, or metadata. + +### Functions +`create_dataset(name)`: Takes in a name as a string and returns a BBBCDataset object for the dataset with that name. If there is no dataset with this name, then an error message is displayed and `None` is returned. + +`info()`: Returns a dictionary containing information about the dataset. The information includes: + +- A description of the dataset +- The microscopy technique used for the dataset +- The number of fields per sample +- The total number of fields +- The total number of images +- The types of ground truth used for the dataset + +`size()`: Computes and returns the total size of the dataset in bytes. + +`raw()`: Downloads the raw data for the dataset. Initializes the `images`, `ground_truth`, and `metadata` attributes. + +`standard(extension)`: Standardizes the dataset's raw data. The extension argument indicates which file format to save to. It can be `".ome.tif"` or `".ome.zarr"`. + +## Data Classes +Each dataset has image and ground truth data. A few datasets have metadata rather than ground truth. The `Images`, `GroundTruth`, and `Metadata` classes contain information about the dataset's images, ground truth, and metadata respectively. They have the same attributes and functions. + +### Attributes +`path`: The path to the folder where the data is stored. +`name`: The name of the dataset that the data belongs to. + +### Functions +`size()`: Computes and returns the size of the data in bytes. 
+ +# Example Workflow +This section provides an example of how to use these classes and functions. + +```python + from BBBC_model import BBBC, BBBCDataset, IDAndSegmentation + + # Print all datasets + for d in BBBC.datasets: + print(d.name) + + # Print all datasets in the Identification and segmentation table + print(IDAndSegmentation.name) + for d in IDAndSegmentation.datasets: + print(d.name) + + # Create a dataset + d = BBBCDataset.create_dataset("BBBC001") + + # Print some information about the dataset + print(d.name) + print(d.info) + + # Download dataset's raw data + d.raw() + + # Print information about the dataset after downloading its raw data + print(d.size) + print(d.images.size) + print(d.ground_truth.size) + + # This will print None because this dataset has no metadata + print(d.metadata) + + # Standardize the raw data + d.standard(".ome.tif") + + # Print information about the dataset after standardizing + print(d.size) + print(d.images.size) + print(d.ground_truth.size) +``` \ No newline at end of file diff --git a/utils/bbbc-download-plugin/build-docker.sh b/utils/bbbc-download-plugin/build-docker.sh new file mode 100644 index 000000000..3c751e602 --- /dev/null +++ b/utils/bbbc-download-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$("] +authors = [ + "Saket Prem ", + "Matthew McIntyre " + ] readme = "README.md" packages = [{include = "polus", from = "src"}] @@ -22,6 +25,7 @@ mapping = "^0.1.6" bump2version = "1.0.1" mypy = "1.0.1" tqdm = "^4.65.0" +pytest = "^7.4.0" [build-system] diff --git a/utils/bbbc-download-plugin/run-plugin.sh b/utils/bbbc-download-plugin/run-plugin.sh index 637544c99..57408ac85 100644 --- a/utils/bbbc-download-plugin/run-plugin.sh +++ b/utils/bbbc-download-plugin/run-plugin.sh @@ -10,7 +10,7 @@ name="BBBC001" outDir=/data/output # Show the help options -docker run polusai/bbbc-download-plugin:${version} +#docker run polusai/bbbc-download-plugin:${version} # Run the plugin docker run --mount 
type=bind,source=${datapath},target=/data/ \ diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py index de6d20fd2..617e33fd3 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py @@ -6,7 +6,7 @@ from pathlib import Path from zipfile import ZipFile -from polus.plugins.utils.bbbc_download.download import download, get_url +from polus.plugins.utils.bbbc_download.download import download, get_url, remove_macosx from polus.plugins.utils.bbbc_download.mapping import * import pydantic @@ -139,6 +139,7 @@ class BBBCDataset(pydantic.BaseModel): images: Optional[Images] = None ground_truth: Optional[GroundTruth] = None metadata: Optional[Metadata] = None + output_path: Optional[Path]= None @pydantic.validator("name") @classmethod @@ -210,7 +211,7 @@ def info(self) -> Dict[str, Union[str, np.int64]]: def size(self) -> int: """Returns the size of the dataset in bytes.""" - dataset_path = root.joinpath(self.name) + dataset_path = self.output_path.joinpath("BBBC",self.name) return sum(os.path.getsize(file) for file in dataset_path.rglob("*")) @@ -268,6 +269,7 @@ def _init_data(self,download_path:Path) -> None: def raw(self,download_path: Path) -> None: """Download the dataset's raw data.""" + self.output_path=download_path download(self.name,download_path) self._init_data(download_path) @@ -339,14 +341,14 @@ def standard(self, extension: str) -> None: class BBBC019(BBBCDataset): def raw(self,download_path:Path) -> None: - download(self.name) - download_path=download_path.joinpath("BBBC") + download(self.name,download_path) + self.output_path=download_path + save_location=download_path.joinpath("BBBC") # Separate images from ground truth - save_location = download_path.joinpath("BBBC019") + save_location = save_location.joinpath("BBBC019") 
images_folder = save_location.joinpath("raw/Images") truth_folder = save_location.joinpath("raw/Ground Truth") - for set in [ x for x in images_folder.iterdir() @@ -368,6 +370,7 @@ def raw(self,download_path:Path) -> None: else: shutil.move(src, dst) + self._init_data(download_path) return @@ -376,9 +379,10 @@ def raw(self,download_path:Path) -> None: class BBBC029(BBBCDataset): def raw(self,download_path:Path) -> None: print("Started downloading BBBC029") - download_path=download_path.joinpath("BBBC") + self.output_path=download_path + save_location=download_path.joinpath("BBBC") - save_location = download_path.joinpath("BBBC029", "raw") + save_location = save_location.joinpath("BBBC029", "raw") if not save_location.exists(): save_location.mkdir(parents=True, exist_ok=True) @@ -398,6 +402,21 @@ def raw(self,download_path:Path) -> None: ) print("BBBC029 has finished downloading") + images_folder=save_location.joinpath("Images") + truth_folder=save_location.joinpath("Ground Truth") + remove_macosx("BBBC029",images_folder) + remove_macosx("BBBC029",truth_folder) + source_directory=images_folder.joinpath("images") + for source_file in source_directory.glob("*"): + destination_file = images_folder / source_file.name + shutil.move(source_file, destination_file) + shutil.rmtree(source_directory) + + source_directory=truth_folder.joinpath("ground_truth") + for source_file in source_directory.glob("*"): + destination_file = truth_folder / source_file.name + shutil.move(source_file, destination_file) + shutil.rmtree(source_directory) self._init_data(download_path) @@ -406,11 +425,12 @@ def raw(self,download_path:Path) -> None: class BBBC041(BBBCDataset): def raw(self,download_path:Path) -> None: - download(self.name) - download_path=download_path.joinpath("BBBC") + download(self.name,download_path) + self.output_path=download_path + save_location=download_path.joinpath("BBBC") # Separate images from ground truth - save_location = download_path.joinpath("BBBC041") + 
save_location = save_location.joinpath("BBBC041") file_names = ["test.json", "training.json"] if not save_location.joinpath("raw/Ground Truth").exists(): @@ -435,9 +455,10 @@ def raw(self,download_path:Path) -> None: class BBBC042(BBBCDataset): def raw(self,download_path:Path) -> None: print("Started downloading BBBC042") - download_path=download_path.joinpath("BBBC") + self.output_path=download_path + save_location=download_path.joinpath("BBBC") - save_location = download_path.joinpath("BBBC042", "raw") + save_location = save_location.joinpath("BBBC042", "raw") if not save_location.exists(): save_location.mkdir(parents=True, exist_ok=True) @@ -457,6 +478,10 @@ def raw(self,download_path:Path) -> None: ) print("BBBC042 has finished downloading") + images_folder=save_location.joinpath("Images") + truth_folder=save_location.joinpath("Ground Truth") + remove_macosx("BBBC029",images_folder) + remove_macosx("BBBC029",truth_folder) self._init_data(download_path) @@ -465,12 +490,13 @@ def raw(self,download_path:Path) -> None: class BBBC046(BBBCDataset): def raw(self, download_path: Path) -> None: - download(self.name) - download_path=download_path.joinpath("BBBC") + download(self.name,download_path) + self.output_path=download_path + save_location=download_path.joinpath("BBBC") # Separate images from ground truth try: - save_location = download_path.joinpath(self.name) + save_location = save_location.joinpath(self.name) images_folder = save_location.joinpath("raw/Images") truth_folder = save_location.joinpath("raw/Ground Truth") @@ -514,11 +540,12 @@ def raw(self, download_path: Path) -> None: class BBBC054(BBBCDataset): def raw(self, download_path:Path) -> None: - download(self.name) - download_path=download_path.joinpath("BBBC") + download(self.name,download_path) + self.output_path=download_path + save_location=download_path.joinpath("BBBC") # Separate images from ground truth - save_location = download_path.joinpath(self.name) + save_location = 
save_location.joinpath(self.name) src = save_location.joinpath("raw/Images", "Replicate1annotation.csv") dst = save_location.joinpath("raw/Ground Truth", "Replicate1annotation.csv") diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py index 79439a500..84b023170 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py @@ -2,7 +2,7 @@ import os import logging from pathlib import Path -from concurrent.futures import ProcessPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Any, Optional import typer @@ -10,6 +10,7 @@ from polus.plugins.utils.bbbc_download.BBBC_model import BBBC, BBBCDataset, IDAndSegmentation, PhenotypeClassification, ImageBasedProfiling from sys import platform from multiprocessing import cpu_count +import time @@ -36,6 +37,7 @@ def main( out_dir: Path= typer.Option( ...,"--outDir", help="The path for downloading the dataset" ) + )-> None: """Download the required dataset from the BBBC dataaset.""" @@ -43,10 +45,15 @@ def main( logger.info(f"outDir = {out_dir}") """Checking if output directory exists. If it does not exist then a designated path is created.""" if not out_dir.exists(): - out_dir.mkdir() logger.info(f"{out_dir} did not exists. 
Creating new path.") + out_dir.mkdir() + if(not out_dir.exists): + raise ValueError("Directory does not exist") + + - with ProcessPoolExecutor(max_workers=NUM_THREADS) as executor: + with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: + start_time = time.time() threads=[] names=name.split(",") for n in names: @@ -67,7 +74,7 @@ def main( executor.submit(ImageBasedProfiling.raw,out_dir) ) - elif(n=='BBBC'): + elif(n=='All'): threads.append( executor.submit(BBBC.raw,out_dir) ) @@ -91,10 +98,15 @@ def main( colour="cyan", ): f.result() - + end_time = time.time() + execution_time = (end_time - start_time) + execution_time_min=execution_time/60 + logger.info(f"The execution time is {execution_time} in seconds") + logger.info(f"The execution time is {execution_time_min} in minutes") +if __name__ == "__main__": + app() + -if __name__ == "__main__": - app() \ No newline at end of file diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py index b27493bde..d77c4fde1 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py @@ -75,9 +75,8 @@ def get_url(url: str, save_location: Path, name: str) -> None: return -def remove_macosx(name:str, save_location:Path)-> None: - images_path=save_location.joinpath("Images") - folders=[folders for folders in images_path.iterdir() if folders.is_dir()] +def remove_macosx(name:str, location:Path)-> None: + folders=[folders for folders in location.iterdir() if folders.is_dir()] for f in folders: if f.name=="__MACOSX": shutil.rmtree(f) @@ -139,6 +138,10 @@ def download(name: str,download_path:Path) -> None: get_url(url, file_path, "BBBC018") print(name + " has finished downloading") - remove_macosx(name,save_location) + images_path=save_location.joinpath("Images") + remove_macosx(name,images_path) + 
ground_path=save_location.joinpath("Ground Truth") + if ground_path.exists(): + remove_macosx(name,ground_path) return diff --git a/utils/bbbc-download-plugin/tests/__init__.py b/utils/bbbc-download-plugin/tests/__init__.py index e69de29bb..fa93c893c 100644 --- a/utils/bbbc-download-plugin/tests/__init__.py +++ b/utils/bbbc-download-plugin/tests/__init__.py @@ -0,0 +1 @@ +"""bbbc download plugin.""" \ No newline at end of file diff --git a/utils/bbbc-download-plugin/tests/test_main.py b/utils/bbbc-download-plugin/tests/test_main.py new file mode 100644 index 000000000..852e2fb64 --- /dev/null +++ b/utils/bbbc-download-plugin/tests/test_main.py @@ -0,0 +1,84 @@ +import pathlib +import shutil +import tempfile +import numpy as np +import pytest +import requests +import skimage +from bfio import BioReader +from skimage import io +from typer.testing import CliRunner + +from polus.plugins.utils.bbbc_download.__main__ import app as app +from polus.plugins.utils.bbbc_download import BBBC_model,mapping,download + +runner = CliRunner() + +@pytest.fixture +def output_directory(): + """Generate random output directory.""" + out_dir = pathlib.Path(tempfile.mkdtemp(dir=pathlib.Path.cwd())) + yield out_dir + shutil.rmtree(out_dir) + +@pytest.fixture +def macosx_directory(): + """Generate random directory.""" + test_dir = pathlib.Path(tempfile.mkdtemp(dir=pathlib.Path.cwd())) + macosx_dir=test_dir.joinpath("Images","__MACOSX") + macosx_dir.mkdir(parents=True) + yield macosx_dir + shutil.rmtree(macosx_dir.parents[1]) + + +def test_delete_macosx(macosx_directory) -> None: + + mac_dir=macosx_directory + mac_dir=pathlib.Path(mac_dir) + + mac_dir_test= mac_dir.parent + macosx_test_name="testname" + download.remove_macosx(macosx_test_name,mac_dir_test) + assert mac_dir.exists()==False + + +def test_bbbc_datasets()->None: + d_test=BBBC_model.BBBC.datasets + assert len(d_test)==50 + +def test_raw(output_directory)->None: + d=BBBC_model.BBBCDataset.create_dataset("BBBC054") + 
output_dir=pathlib.Path(output_directory) + d.raw(output_dir) + assert d.size >0 + +def test_IDAndSegmentation()-> None: + d_test_IDAndSegmentation= BBBC_model.IDAndSegmentation.datasets + assert len(d_test_IDAndSegmentation)==32 + +def test_PhenotypeClassification()-> None: + d_test_PhenotypeClassification= BBBC_model.PhenotypeClassification.datasets + assert len(d_test_PhenotypeClassification)==14 + +def test_ImageBasedProfiling()-> None: + d_test_ImageBasedProfiling= BBBC_model.ImageBasedProfiling.datasets + assert len(d_test_ImageBasedProfiling)==6 + +def test_cli(output_directory) -> None: + """Test Cli.""" + name="BBBC001,BBBC002" + output_dir=pathlib.Path(output_directory) + + result = runner.invoke( + app, + [ + "--name", + name, + "--outDir", + output_dir, + ], + ) + + assert result.exit_code == 0 + + From 0b88e5e1ddeab4e962c8553fe5fde414f3e97206 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 24 Jul 2023 17:04:11 +0000 Subject: [PATCH 03/18] modified the readme file and the check the docker container files --- utils/bbbc-download-plugin/README.md | 130 ++---------------- utils/bbbc-download-plugin/pyproject.toml | 3 +- utils/bbbc-download-plugin/run-plugin.sh | 18 +-- .../plugins/utils/bbbc_download/download.py | 5 + 4 files changed, 29 insertions(+), 127 deletions(-) diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index 9e9e4b10c..cdbf1b8cb 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -1,14 +1,20 @@ -#BBBC Download (0.1.0-dev0) +# BBBC Download (0.1.0-dev0) This plugin is designed to download the necessary datasets from the Broad Bioimage Benchmark Collection(BBBC) website. For information on the BBBC dataset, visit [BBBC dataset information](https://bbbc.broadinstitute.org/image_sets/). +The tables on this webpage classify datasets by their biological application. 
Each dataset has a webpage that contains links to the data and describes information about the dataset. Almost every dataset has image data and ground truth data. There are a few datasets that have metadata rather than ground truth data. ## Building -To build the Docker image for the conversion plugin, run -`./build-docker.sh`. +To build the Docker image for the download plugin, run +`bash build-docker.sh`. + +## Executing + +To execute the build docker image for the download plugin, run +'bash run-plugin.sh' ## Options @@ -28,118 +34,8 @@ The Following are valid names for datasets: To download specific datasets from the website, give the name of each dataset in the input argument seperated by a comma. eg: --name="BBBC001,BBBC002,BBBC003" +### NOTE +There may be some errors while running th plugin for BBBC046 dataset. -# BBBC Model -The classes in BBBC_model.py model the data from the [Broad Bioimage Benchmark Collection (BBBC)](https://bbbc.broadinstitute.org/image_sets). The tables on this webpage classify datasets by their biological application. Each dataset has a webpage that contains links to the data and describes information about the dataset. Almost every dataset has image data and ground truth data. There are a few datasets that have metadata rather than ground truth data. - -# Classes -This section describes the classes and functions used to model the BBBC. - -## BBBC -The `BBBC` class contains functions used for interacting with every dataset in the BBBC. - -### Functions -`datasets()`: Returns a list of all the datasets in the collection. - -`combined_table()`: Combines each table on the BBBC image set webpage into a single pandas DataFrame. - -`raw()`: Downloads all of the datasets in the collection. - -## Table Classes -There is a class for each table on the BBBC image set webpage. The classes are `IDAndSegmentation`, `PhenotypeClassification`, and `ImageBasedProfiling`. They have the same attributes and functions. 
- -### Attributes -`name`: The name of the table as it appears on the BBBC image set webpage. - -`table`: A pandas DataFrame representation of the table. - -### Functions -`datasets()`: Returns a list of all the datasets in the table. - -`raw()`: Downloads all of the datasets in the table. - -## BBBCDataset -The `BBBCDataset` class models individual datasets. - -*Note*: some datasets need specialized functionality so they cannot be modeled by the general BBBCDataset class. These datasets have their own classes with the specialized functionality implemented there. - -### Attributes -`name`: A string that represents the dataset's name. The provided name must be the name of an existing dataset or else an exception will be raised. - -`images`: An Images object that contains information about the dataset's images. Set to `None` until raw data is downloaded. - -`ground_truth`: A GroundTruth object that contains information about the dataset's ground truth. Set to `None` until raw data is downloaded. - -`metadata`: A Metadata object that contains information about the dataset's metadata. Set to `None` until raw data is downloaded. - -*Note*: The `images`, `ground_truth`, or `metadata` attributes will be `None` after downloading raw data if the dataset has no images, ground truth, or metadata. - -### Functions -`create_dataset(name)`: Takes in a name as a string and returns a BBBCDataset object for the dataset with that name. If there is no dataset with this name, then an error message is displayed and `None` is returned. - -`info()`: Returns a dictionary containing information about the dataset. The information includes: - -- A description of the dataset -- The microscopy technique used for the dataset -- The number of fields per sample -- The total number of fields -- The total number of images -- The types of ground truth used for the dataset - -`size()`: Computes and returns the total size of the dataset in bytes. - -`raw()`: Downloads the raw data for the dataset. 
Initializes the `images`, `ground_truth`, and `metadata` attributes. - -`standard(extension)`: Standardizes the dataset's raw data. The extension argument indicates which file format to save to. It can be `".ome.tif"` or `".ome.zarr"`. - -## Data Classes -Each dataset has image and ground truth data. A few datasets have metadata rather than ground truth. The `Images`, `GroundTruth`, and `Metadata` classes contain information about the dataset's images, ground truth, and metadata respectively. They have the same attributes and functions. - -### Attributes -`path`: The path to the folder where the data is stored. -`name`: The name of the dataset that the data belongs to. - -### Functions -`size()`: Computes and returns the size of the data in bytes. - -# Example Workflow -This section provides an example of how to use these classes and functions. - -```python - from BBBC_model import BBBC, BBBCDataset, IDAndSegmentation - - # Print all datasets - for d in BBBC.datasets: - print(d.name) - - # Print all datasets in the Identification and segmentation table - print(IDAndSegmentation.name) - for d in IDAndSegmentation.datasets: - print(d.name) - - # Create a dataset - d = BBBCDataset.create_dataset("BBBC001") - - # Print some information about the dataset - print(d.name) - print(d.info) - - # Download dataset's raw data - d.raw() - - # Print information about the dataset after downloading its raw data - print(d.size) - print(d.images.size) - print(d.ground_truth.size) - - # This will print None because this dataset has no metadata - print(d.metadata) - - # Standardize the raw data - d.standard(".ome.tif") - - # Print information about the dataset after standardizing - print(d.size) - print(d.images.size) - print(d.ground_truth.size) -``` \ No newline at end of file +## Sample docker command: +docker run -v /home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/:/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/ polusai/bbbc-download-plugin:0.1.0-dev0 
--name="BBBC001" --outDir=/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data \ No newline at end of file diff --git a/utils/bbbc-download-plugin/pyproject.toml b/utils/bbbc-download-plugin/pyproject.toml index 061163daa..006521d58 100644 --- a/utils/bbbc-download-plugin/pyproject.toml +++ b/utils/bbbc-download-plugin/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" packages = [{include = "polus", from = "src"}] [tool.poetry.dependencies] -python = "^3.9.16" +python = ">=3.9,<4" typer = "^0.9.0" pyarrow = "11.0.0" scikit-image = "0.20.0" @@ -26,6 +26,7 @@ bump2version = "1.0.1" mypy = "1.0.1" tqdm = "^4.65.0" pytest = "^7.4.0" +xmlschema = "^2.3.1" [build-system] diff --git a/utils/bbbc-download-plugin/run-plugin.sh b/utils/bbbc-download-plugin/run-plugin.sh index 57408ac85..c78c4cb52 100644 --- a/utils/bbbc-download-plugin/run-plugin.sh +++ b/utils/bbbc-download-plugin/run-plugin.sh @@ -2,18 +2,18 @@ version=$( None: return def remove_macosx(name:str, location:Path)-> None: + """ Remove the __MACOSX folder from the downlpoaded dataset. + Args: + name: The name of the dataset + location: The partent directory of the __MACOSX folder. + """ folders=[folders for folders in location.iterdir() if folders.is_dir()] for f in folders: if f.name=="__MACOSX": From 7abf6ec22147fdc8aa23180ffc70af6f049e5559 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 24 Jul 2023 17:51:22 +0000 Subject: [PATCH 04/18] sample dockeer command in readme file updated --- utils/bbbc-download-plugin/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index cdbf1b8cb..0970c74bb 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -38,4 +38,4 @@ To download specific datasets from the website, give the name of each dataset in There may be some errors while running th plugin for BBBC046 dataset. 
## Sample docker command: -docker run -v /home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/:/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/ polusai/bbbc-download-plugin:0.1.0-dev0 --name="BBBC001" --outDir=/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data \ No newline at end of file +``` docker run -v /home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/:/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/ polusai/bbbc-download-plugin:0.1.0-dev0 --name="BBBC001" --outDir=/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data ``` \ No newline at end of file From 9e6b7d1b58e14664166ba8e07c6378c2d41b853b Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 24 Jul 2023 18:23:34 +0000 Subject: [PATCH 05/18] removed mapping.py adn the mapping dependency in project.toml --- utils/bbbc-download-plugin/README.md | 2 +- utils/bbbc-download-plugin/pyproject.toml | 1 - utils/bbbc-download-plugin/run-plugin.sh | 2 +- .../src/polus/plugins/utils/bbbc_download/BBBC_model.py | 2 +- .../src/polus/plugins/utils/bbbc_download/mapping.py | 9 --------- 5 files changed, 3 insertions(+), 13 deletions(-) delete mode 100644 utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/mapping.py diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index 0970c74bb..12d35d059 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -35,7 +35,7 @@ The Following are valid names for datasets: To download specific datasets from the website, give the name of each dataset in the input argument seperated by a comma. eg: --name="BBBC001,BBBC002,BBBC003" ### NOTE -There may be some errors while running th plugin for BBBC046 dataset. 
+BBBC046 dataset download is not supported by this plugin ## Sample docker command: ``` docker run -v /home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/:/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/ polusai/bbbc-download-plugin:0.1.0-dev0 --name="BBBC001" --outDir=/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data ``` \ No newline at end of file diff --git a/utils/bbbc-download-plugin/pyproject.toml b/utils/bbbc-download-plugin/pyproject.toml index 006521d58..5bcf26c72 100644 --- a/utils/bbbc-download-plugin/pyproject.toml +++ b/utils/bbbc-download-plugin/pyproject.toml @@ -21,7 +21,6 @@ numpy = "1.24.2" pandas = "1.5.3" requests = "2.28.2" pydantic = "1.10.7" -mapping = "^0.1.6" bump2version = "1.0.1" mypy = "1.0.1" tqdm = "^4.65.0" diff --git a/utils/bbbc-download-plugin/run-plugin.sh b/utils/bbbc-download-plugin/run-plugin.sh index c78c4cb52..1b85d5652 100644 --- a/utils/bbbc-download-plugin/run-plugin.sh +++ b/utils/bbbc-download-plugin/run-plugin.sh @@ -4,7 +4,7 @@ version=$( str: - # important attributes: plate, well, wel num, control, field, channel, treatment, image type - - return f"a01_w01_n01_p01_f0{row['Field'] + 1}_c01_t00_i01{extension}" - - -__all__ = ["BBBC001_mapping"] \ No newline at end of file From 787e409bf994f8f438ef802e28924a1d81e91e21 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 24 Jul 2023 19:18:34 +0000 Subject: [PATCH 06/18] removed mapping import from test_main --- utils/bbbc-download-plugin/tests/test_main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/bbbc-download-plugin/tests/test_main.py b/utils/bbbc-download-plugin/tests/test_main.py index 852e2fb64..5a1443690 100644 --- a/utils/bbbc-download-plugin/tests/test_main.py +++ b/utils/bbbc-download-plugin/tests/test_main.py @@ -10,7 +10,7 @@ from typer.testing import CliRunner from polus.plugins.utils.bbbc_download.__main__ import app as app -from polus.plugins.utils.bbbc_download import 
BBBC_model,mapping,download +from polus.plugins.utils.bbbc_download import BBBC_model,download runner = CliRunner() @@ -47,7 +47,7 @@ def test_bbbc_datasets()->None: assert len(d_test)==50 def test_raw(output_directory)->None: - d=BBBC_model.BBBCDataset.create_dataset("BBBC054") + d=BBBC_model.BBBCDataset.create_dataset("BBBC001") output_dir=pathlib.Path(output_directory) d.raw(output_dir) assert d.size >0 From 15218e44589abdb86e4ddcfc34bb5654d2ea55f6 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 27 Jul 2023 14:05:35 +0000 Subject: [PATCH 07/18] Changed the folder name of Ground Truth to Ground_Truth. --- .../plugins/utils/bbbc_download/BBBC_model.py | 26 +++++++++---------- .../plugins/utils/bbbc_download/download.py | 4 +-- utils/bbbc-download-plugin/tests/test_main.py | 9 +++++-- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py index 7ecd6f86d..9b71d04aa 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py @@ -91,8 +91,8 @@ def valid_data(cls, values: dict) -> dict: def size(self) -> int: """Returns the size of the dataset's ground truth in bytes.""" - raw_path = root.joinpath(self.name, "raw/Ground Truth") - standard_path = root.joinpath(self.name, "standard/Ground Truth") + raw_path = root.joinpath(self.name, "raw/Ground_Truth") + standard_path = root.joinpath(self.name, "standard/Ground_Truth") raw_sum = sum(os.path.getsize(file) for file in raw_path.rglob("*")) standard_sum = sum(os.path.getsize(file) for file in standard_path.rglob("*")) @@ -241,7 +241,7 @@ def _init_data(self,download_path:Path) -> None: download_path=download_path.joinpath("BBBC") images_path = download_path.joinpath(self.name, "raw/Images") - truth_path = 
download_path.joinpath(self.name, "raw/Ground Truth") + truth_path = download_path.joinpath(self.name, "raw/Ground_Truth") meta_path = download_path.joinpath(self.name, "raw/Metadata") try: @@ -312,7 +312,7 @@ def standard(self, extension: str) -> None: if row["Image Type"] == "Intensity": sub_folder = "Images" elif row["Image Type"] == "Ground Truth": - sub_folder = "Ground Truth" + sub_folder = "Ground_Truth" elif row["Image Type"] == "Metadata": sub_folder = "Metadata" else: @@ -348,7 +348,7 @@ def raw(self,download_path:Path) -> None: # Separate images from ground truth save_location = save_location.joinpath("BBBC019") images_folder = save_location.joinpath("raw/Images") - truth_folder = save_location.joinpath("raw/Ground Truth") + truth_folder = save_location.joinpath("raw/Ground_Truth") for set in [ x for x in images_folder.iterdir() @@ -394,7 +394,7 @@ def raw(self,download_path:Path) -> None: "BBBC029", ) - file_path = save_location.joinpath("Ground Truth") + file_path = save_location.joinpath("Ground_Truth") get_url( "https://data.broadinstitute.org/bbbc/BBBC029/ground_truth.zip", file_path, @@ -403,7 +403,7 @@ def raw(self,download_path:Path) -> None: print("BBBC029 has finished downloading") images_folder=save_location.joinpath("Images") - truth_folder=save_location.joinpath("Ground Truth") + truth_folder=save_location.joinpath("Ground_Truth") remove_macosx("BBBC029",images_folder) remove_macosx("BBBC029",truth_folder) source_directory=images_folder.joinpath("images") @@ -433,14 +433,14 @@ def raw(self,download_path:Path) -> None: save_location = save_location.joinpath("BBBC041") file_names = ["test.json", "training.json"] - if not save_location.joinpath("raw/Ground Truth").exists(): - save_location.joinpath("raw/Ground Truth").mkdir( + if not save_location.joinpath("raw/Ground_Truth").exists(): + save_location.joinpath("raw/Ground_Truth").mkdir( parents=True, exist_ok=True ) for file in file_names: src = save_location.joinpath("raw/Images/malaria", 
file) - dst = save_location.joinpath("raw/Ground Truth") + dst = save_location.joinpath("raw/Ground_Truth") if dst.joinpath(file).exists(): os.remove(src) @@ -479,7 +479,7 @@ def raw(self,download_path:Path) -> None: print("BBBC042 has finished downloading") images_folder=save_location.joinpath("Images") - truth_folder=save_location.joinpath("Ground Truth") + truth_folder=save_location.joinpath("Ground_Truth") remove_macosx("BBBC029",images_folder) remove_macosx("BBBC029",truth_folder) @@ -498,7 +498,7 @@ def raw(self, download_path: Path) -> None: try: save_location = save_location.joinpath(self.name) images_folder = save_location.joinpath("raw/Images") - truth_folder = save_location.joinpath("raw/Ground Truth") + truth_folder = save_location.joinpath("raw/Ground_Truth") # Extract these files because they do not extract automatically for file in ["OE-ID350-AR-1.zip", "OE-ID350-AR-2.zip", "OE-ID350-AR-4.zip", "OE-ID350-AR-8.zip"]: @@ -547,7 +547,7 @@ def raw(self, download_path:Path) -> None: # Separate images from ground truth save_location = save_location.joinpath(self.name) src = save_location.joinpath("raw/Images", "Replicate1annotation.csv") - dst = save_location.joinpath("raw/Ground Truth", "Replicate1annotation.csv") + dst = save_location.joinpath("raw/Ground_Truth", "Replicate1annotation.csv") if not dst.exists(): dst.mkdir(parents=True, exist_ok=True) diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py index 1ee02a44e..d9da36e11 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py @@ -119,7 +119,7 @@ def download(name: str,download_path:Path) -> None: elif heading.text.strip() == "Metadata": sub_folder = "Metadata" else: - sub_folder = "Ground Truth" + sub_folder = "Ground_Truth" # Iterate over every tag under the current 
heading and above the next heading for tag in get_lower_tags(heading): @@ -145,7 +145,7 @@ def download(name: str,download_path:Path) -> None: print(name + " has finished downloading") images_path=save_location.joinpath("Images") remove_macosx(name,images_path) - ground_path=save_location.joinpath("Ground Truth") + ground_path=save_location.joinpath("Ground_Truth") if ground_path.exists(): remove_macosx(name,ground_path) diff --git a/utils/bbbc-download-plugin/tests/test_main.py b/utils/bbbc-download-plugin/tests/test_main.py index 5a1443690..3514f15e0 100644 --- a/utils/bbbc-download-plugin/tests/test_main.py +++ b/utils/bbbc-download-plugin/tests/test_main.py @@ -23,7 +23,7 @@ def output_directory(): @pytest.fixture def macosx_directory(): - """Generate random directory.""" + """Generate random directory named __MACOSX.""" test_dir = pathlib.Path(tempfile.mkdtemp(dir=pathlib.Path.cwd())) macosx_dir=test_dir.joinpath("Images","__MACOSX") macosx_dir.mkdir(parents=True) @@ -32,7 +32,7 @@ def macosx_directory(): def test_delete_macosx(macosx_directory) -> None: - + """Testing the delete_macosx function in download.py""" mac_dir=macosx_directory mac_dir=pathlib.Path(mac_dir) @@ -43,24 +43,29 @@ def test_delete_macosx(macosx_directory) -> None: def test_bbbc_datasets()->None: + """Test to check if all the datasets on the BBBC website are recognized.""" d_test=BBBC_model.BBBC.datasets assert len(d_test)==50 def test_raw(output_directory)->None: + """A function to test the download functionality.""" d=BBBC_model.BBBCDataset.create_dataset("BBBC001") output_dir=pathlib.Path(output_directory) d.raw(output_dir) assert d.size >0 def test_IDAndSegmentation()-> None: + """Test to check if all the datasets on the Identification and segmentation table are recognized.""" d_test_IDAndSegmentation= BBBC_model.IDAndSegmentation.datasets assert len(d_test_IDAndSegmentation)==32 def test_PhenotypeClassification()-> None: + """Test to check if all the datasets on the Phenotype 
CLassification table are recognized.""" d_test_PhenotypeClassification= BBBC_model.PhenotypeClassification.datasets assert len(d_test_PhenotypeClassification)==14 def test_ImageBasedProfiling()-> None: + """Test to check if all the datasets on the Image based profiling table are recognized.""" d_test_ImageBasedProfiling= BBBC_model.ImageBasedProfiling.datasets assert len(d_test_ImageBasedProfiling)==6 From e67503f18a25431994f5ad623b51673aad5bbb56 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 27 Jul 2023 16:27:50 +0000 Subject: [PATCH 08/18] Changes to readme and added comments to test_main.py --- utils/bbbc-download-plugin/README.md | 22 +++++++++---------- .../plugins/utils/bbbc_download/BBBC_model.py | 2 +- utils/bbbc-download-plugin/tests/test_main.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index 12d35d059..cb4472eb8 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -11,10 +11,10 @@ The tables on this webpage classify datasets by their biological application. Ea To build the Docker image for the download plugin, run `bash build-docker.sh`. -## Executing +## Run the Docker image To execute the build docker image for the download plugin, run -'bash run-plugin.sh' +`bash run-plugin.sh`. 
## Options @@ -23,19 +23,19 @@ This plugin takes 1 input arguments and | Name | Description | I/O | Type | | --------------- | ------------------------------------------------------------ | ------ | ----------- | -| `--name ` | The name of the datasets to be downloaded | Input | String | +| `--name ` | The name of the datasets to be downloaded | Input | String | | `--outDir` | Directory to store the downloaded datasets | Output | genericData | -The Following are valid names for datasets: -"all"- To download all the datasets from the bbbc website -"IDAndSegmentation"- To download the datasets from the Identification and segmentation table -"PhenotypeClassification"- To download the datasets from the Phenotype classification table -"ImageBasedProfiling"- To download the datasets from the Image-based Profiling table +The following are valid names for datasets: +`"all"`- To download all the datasets from the bbbc website +`"IDAndSegmentation"`- To download the datasets from the Identification and segmentation table +`"PhenotypeClassification"`- To download the datasets from the Phenotype classification table +`"ImageBasedProfiling"`- To download the datasets from the Image-based Profiling table -To download specific datasets from the website, give the name of each dataset in the input argument seperated by a comma. eg: --name="BBBC001,BBBC002,BBBC003" +To download specific datasets from the website, give the name of each dataset in the input argument seperated by a comma. example: `--name="BBBC001,BBBC002,BBBC003"` ### NOTE -BBBC046 dataset download is not supported by this plugin +BBBC046 dataset download is not supported by this plugin. 
## Sample docker command: -``` docker run -v /home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/:/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data/ polusai/bbbc-download-plugin:0.1.0-dev0 --name="BBBC001" --outDir=/home/ec2-user/polus-plugins/utils/bbbc-download-plugin/data ``` \ No newline at end of file +```docker run -v /home/ec2-user/data/:/home/ec2-user/data/ polusai/bbbc-download-plugin:0.1.0-dev0 --name="BBBC001" --outDir=/home/ec2-user/data/output``` \ No newline at end of file diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py index 9b71d04aa..c0109d4ee 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py @@ -470,7 +470,7 @@ def raw(self,download_path:Path) -> None: "BBBC042", ) - file_path = save_location.joinpath("Ground Truth") + file_path = save_location.joinpath("Ground_Truth") get_url( "https://data.broadinstitute.org/bbbc/BBBC042/positions.zip", file_path, diff --git a/utils/bbbc-download-plugin/tests/test_main.py b/utils/bbbc-download-plugin/tests/test_main.py index 3514f15e0..24be86ae6 100644 --- a/utils/bbbc-download-plugin/tests/test_main.py +++ b/utils/bbbc-download-plugin/tests/test_main.py @@ -49,7 +49,7 @@ def test_bbbc_datasets()->None: def test_raw(output_directory)->None: """A function to test the download functionality.""" - d=BBBC_model.BBBCDataset.create_dataset("BBBC001") + d=BBBC_model.BBBCDataset.create_dataset("BBBC054") #change dataset name to test output_dir=pathlib.Path(output_directory) d.raw(output_dir) assert d.size >0 From badf30cc9b7f28d0745ba9b1c17a664706b165c7 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 27 Jul 2023 16:29:53 +0000 Subject: [PATCH 09/18] Changes all to All in readme --- utils/bbbc-download-plugin/README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index cb4472eb8..029d12793 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -27,7 +27,7 @@ This plugin takes 1 input arguments and | `--outDir` | Directory to store the downloaded datasets | Output | genericData | The following are valid names for datasets: -`"all"`- To download all the datasets from the bbbc website +`"All"`- To download all the datasets from the bbbc website `"IDAndSegmentation"`- To download the datasets from the Identification and segmentation table `"PhenotypeClassification"`- To download the datasets from the Phenotype classification table `"ImageBasedProfiling"`- To download the datasets from the Image-based Profiling table From dfd65e976eb11098bc2ec6981aa40dca54326da8 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 27 Jul 2023 16:49:29 +0000 Subject: [PATCH 10/18] Changed spelling error in readme file --- utils/bbbc-download-plugin/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index 029d12793..d1a2c39bc 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -13,7 +13,7 @@ To build the Docker image for the download plugin, run ## Run the Docker image -To execute the build docker image for the download plugin, run +To execute the built docker image for the download plugin, run `bash run-plugin.sh`. 
## Options From 52f10fb080983daa32da7b5564eecaf81537dbdb Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 28 Jul 2023 18:59:26 +0000 Subject: [PATCH 11/18] Changes made after checcking with pre-commit --- utils/bbbc-download-plugin/.bumpversion.cfg | 2 +- utils/bbbc-download-plugin/Dockerfile | 4 +- utils/bbbc-download-plugin/README.md | 18 +-- utils/bbbc-download-plugin/VERSION | 2 +- utils/bbbc-download-plugin/build-docker.sh | 2 +- utils/bbbc-download-plugin/plugin.json | 77 +++++++------ utils/bbbc-download-plugin/run-plugin.sh | 1 + .../plugins/utils/bbbc_download/__init__.py | 2 +- .../plugins/utils/bbbc_download/__main__.py | 106 ++++++++---------- utils/bbbc-download-plugin/tests/__init__.py | 2 +- utils/bbbc-download-plugin/tests/test_main.py | 63 ++++++----- 11 files changed, 136 insertions(+), 143 deletions(-) diff --git a/utils/bbbc-download-plugin/.bumpversion.cfg b/utils/bbbc-download-plugin/.bumpversion.cfg index 8cc773f0b..182a51988 100644 --- a/utils/bbbc-download-plugin/.bumpversion.cfg +++ b/utils/bbbc-download-plugin/.bumpversion.cfg @@ -24,4 +24,4 @@ replace = version = "{new_version}" [bumpversion:file:VERSION] -[bumpversion:file:src/polus/plugins/utils/bbbc_download/__init__.py] \ No newline at end of file +[bumpversion:file:src/polus/plugins/utils/bbbc_download/__init__.py] diff --git a/utils/bbbc-download-plugin/Dockerfile b/utils/bbbc-download-plugin/Dockerfile index da89ce48c..d89987fd6 100644 --- a/utils/bbbc-download-plugin/Dockerfile +++ b/utils/bbbc-download-plugin/Dockerfile @@ -3,7 +3,7 @@ FROM polusai/bfio:2.1.9 # environment variables defined in polusai/bfio ENV EXEC_DIR="/opt/executables" ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".csv" +ENV POLUS_TAB_EXT=".csv" ENV POLUS_LOG="INFO" # Work directory defined in the base container @@ -17,4 +17,4 @@ COPY src ${EXEC_DIR}/src RUN pip3 install ${EXEC_DIR} --no-cache-dir ENTRYPOINT ["python3", "-m", "polus.plugins.utils.bbbc_download"] -CMD ["--help"] \ No newline at end 
of file +CMD ["--help"] diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index d1a2c39bc..4218fc9f9 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -2,7 +2,7 @@ This plugin is designed to download the necessary datasets from the Broad Bioimage Benchmark Collection(BBBC) website. -For information on the BBBC dataset, visit +For information on the BBBC dataset, visit [BBBC dataset information](https://bbbc.broadinstitute.org/image_sets/). The tables on this webpage classify datasets by their biological application. Each dataset has a webpage that contains links to the data and describes information about the dataset. Almost every dataset has image data and ground truth data. There are a few datasets that have metadata rather than ground truth data. @@ -13,7 +13,7 @@ To build the Docker image for the download plugin, run ## Run the Docker image -To execute the built docker image for the download plugin, run +To execute the built docker image for the download plugin, run `bash run-plugin.sh`. 
## Options @@ -26,16 +26,16 @@ This plugin takes 1 input arguments and | `--name ` | The name of the datasets to be downloaded | Input | String | | `--outDir` | Directory to store the downloaded datasets | Output | genericData | -The following are valid names for datasets: -`"All"`- To download all the datasets from the bbbc website -`"IDAndSegmentation"`- To download the datasets from the Identification and segmentation table -`"PhenotypeClassification"`- To download the datasets from the Phenotype classification table +The following are valid names for datasets: +`"All"`- To download all the datasets from the bbbc website +`"IDAndSegmentation"`- To download the datasets from the Identification and segmentation table +`"PhenotypeClassification"`- To download the datasets from the Phenotype classification table `"ImageBasedProfiling"`- To download the datasets from the Image-based Profiling table -To download specific datasets from the website, give the name of each dataset in the input argument seperated by a comma. example: `--name="BBBC001,BBBC002,BBBC003"` +To download specific datasets from the website, give the name of each dataset in the input argument seperated by a comma. example: `--name="BBBC001,BBBC002,BBBC003"` ### NOTE -BBBC046 dataset download is not supported by this plugin. +BBBC046 dataset download is not supported by this plugin. 
## Sample docker command: -```docker run -v /home/ec2-user/data/:/home/ec2-user/data/ polusai/bbbc-download-plugin:0.1.0-dev0 --name="BBBC001" --outDir=/home/ec2-user/data/output``` \ No newline at end of file +```docker run -v /home/ec2-user/data/:/home/ec2-user/data/ polusai/bbbc-download-plugin:0.1.0-dev0 --name="BBBC001" --outDir=/home/ec2-user/data/output``` diff --git a/utils/bbbc-download-plugin/VERSION b/utils/bbbc-download-plugin/VERSION index 15a06bec5..206c0852b 100644 --- a/utils/bbbc-download-plugin/VERSION +++ b/utils/bbbc-download-plugin/VERSION @@ -1 +1 @@ -0.1.0-dev0 \ No newline at end of file +0.1.0-dev0 diff --git a/utils/bbbc-download-plugin/build-docker.sh b/utils/bbbc-download-plugin/build-docker.sh index 3c751e602..3bfcb041b 100644 --- a/utils/bbbc-download-plugin/build-docker.sh +++ b/utils/bbbc-download-plugin/build-docker.sh @@ -1,4 +1,4 @@ #!/bin/bash version=$( None: + out_dir: Path = typer.Option( + ..., + "--outDir", + help="The path for downloading the dataset", + ), +) -> None: """Download the required dataset from the BBBC dataaset.""" logger.info(f"name = {name}") logger.info(f"outDir = {out_dir}") - """Checking if output directory exists. If it does not exist then a designated path is created.""" + """Checking if output directory exists. + If it does not exist then a designated path is created.""" if not out_dir.exists(): logger.info(f"{out_dir} did not exists. 
Creating new path.") out_dir.mkdir() - if(not out_dir.exists): - raise ValueError("Directory does not exist") - - + if not out_dir.exists(): + msg = "Directory does not exist" + raise ValueError(msg) with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: start_time = time.time() - threads=[] - names=name.split(",") + threads = [] + names = name.split(",") for n in names: - if(n=='IDAndSegmentation'): - threads.append( - executor.submit(IDAndSegmentation.raw,out_dir) - ) - - elif(n=='PhenotypeClassification'): - threads.append( - executor.submit(PhenotypeClassification.raw,out_dir) - ) + if n == "IDAndSegmentation": + threads.append(executor.submit(IDAndSegmentation.raw, out_dir)) + elif n == "PhenotypeClassification": + threads.append(executor.submit(PhenotypeClassification.raw, out_dir)) + elif n == "ImageBasedProfiling": + threads.append(executor.submit(ImageBasedProfiling.raw, out_dir)) - elif(n=='ImageBasedProfiling'): - threads.append( - executor.submit(ImageBasedProfiling.raw,out_dir) - ) - - elif(n=='All'): - threads.append( - executor.submit(BBBC.raw,out_dir) - ) - + elif n == "All": + threads.append(executor.submit(BBBC.raw, out_dir)) else: - d=executor.submit(BBBCDataset.create_dataset, n) - d_name=d.result() - threads.append( - executor.submit(d_name.raw,out_dir) - ) + d = executor.submit(BBBCDataset.create_dataset, n) + d_name = d.result() + threads.append(executor.submit(d_name.raw, out_dir)) - for f in tqdm( as_completed(threads), total=len(threads), mininterval=5, - desc=f"donwloading the dataset", + desc="donwloading the dataset", initial=0, unit_scale=True, colour="cyan", ): f.result() end_time = time.time() - execution_time = (end_time - start_time) - execution_time_min=execution_time/60 + execution_time = end_time - start_time + execution_time_min = execution_time / 60 logger.info(f"The execution time is {execution_time} in seconds") - logger.info(f"The execution time is {execution_time_min} in minutes") - - -if __name__ == "__main__": - 
app() - + logger.info(f"The execution time is {execution_time_min} in minutes") +if __name__ == "__main__": + app() diff --git a/utils/bbbc-download-plugin/tests/__init__.py b/utils/bbbc-download-plugin/tests/__init__.py index fa93c893c..437dfbef1 100644 --- a/utils/bbbc-download-plugin/tests/__init__.py +++ b/utils/bbbc-download-plugin/tests/__init__.py @@ -1 +1 @@ -"""bbbc download plugin.""" \ No newline at end of file +"""bbbc download plugin.""" diff --git a/utils/bbbc-download-plugin/tests/test_main.py b/utils/bbbc-download-plugin/tests/test_main.py index 24be86ae6..6ea042992 100644 --- a/utils/bbbc-download-plugin/tests/test_main.py +++ b/utils/bbbc-download-plugin/tests/test_main.py @@ -10,10 +10,11 @@ from typer.testing import CliRunner from polus.plugins.utils.bbbc_download.__main__ import app as app -from polus.plugins.utils.bbbc_download import BBBC_model,download +from polus.plugins.utils.bbbc_download import BBBC_model, download runner = CliRunner() + @pytest.fixture def output_directory(): """Generate random output directory.""" @@ -21,11 +22,12 @@ def output_directory(): yield out_dir shutil.rmtree(out_dir) + @pytest.fixture def macosx_directory(): """Generate random directory named __MACOSX.""" test_dir = pathlib.Path(tempfile.mkdtemp(dir=pathlib.Path.cwd())) - macosx_dir=test_dir.joinpath("Images","__MACOSX") + macosx_dir = test_dir.joinpath("Images", "__MACOSX") macosx_dir.mkdir(parents=True) yield macosx_dir shutil.rmtree(macosx_dir.parents[1]) @@ -33,46 +35,51 @@ def macosx_directory(): def test_delete_macosx(macosx_directory) -> None: """Testing the delete_macosx function in download.py""" - mac_dir=macosx_directory - mac_dir=pathlib.Path(mac_dir) - - mac_dir_test= mac_dir.parent - macosx_test_name="testname" - download.remove_macosx(macosx_test_name,mac_dir_test) - assert mac_dir.exists()==False + mac_dir = macosx_directory + mac_dir = pathlib.Path(mac_dir) + + mac_dir_test = mac_dir.parent + macosx_test_name = "testname" + 
download.remove_macosx(macosx_test_name, mac_dir_test) + assert mac_dir.exists() == False -def test_bbbc_datasets()->None: +def test_bbbc_datasets() -> None: """Test to check if all the datasets on the BBBC website are recognized.""" - d_test=BBBC_model.BBBC.datasets - assert len(d_test)==50 + d_test = BBBC_model.BBBC.datasets + assert len(d_test) == 50 -def test_raw(output_directory)->None: + +def test_raw(output_directory) -> None: """A function to test the download functionality.""" - d=BBBC_model.BBBCDataset.create_dataset("BBBC054") #change dataset name to test - output_dir=pathlib.Path(output_directory) + d = BBBC_model.BBBCDataset.create_dataset("BBBC054") # change dataset name to test + output_dir = pathlib.Path(output_directory) d.raw(output_dir) - assert d.size >0 + assert d.size > 0 + -def test_IDAndSegmentation()-> None: +def test_IDAndSegmentation() -> None: """Test to check if all the datasets on the Identification and segmentation table are recognized.""" - d_test_IDAndSegmentation= BBBC_model.IDAndSegmentation.datasets - assert len(d_test_IDAndSegmentation)==32 + d_test_IDAndSegmentation = BBBC_model.IDAndSegmentation.datasets + assert len(d_test_IDAndSegmentation) == 32 -def test_PhenotypeClassification()-> None: + +def test_PhenotypeClassification() -> None: """Test to check if all the datasets on the Phenotype CLassification table are recognized.""" - d_test_PhenotypeClassification= BBBC_model.PhenotypeClassification.datasets - assert len(d_test_PhenotypeClassification)==14 + d_test_PhenotypeClassification = BBBC_model.PhenotypeClassification.datasets + assert len(d_test_PhenotypeClassification) == 14 + -def test_ImageBasedProfiling()-> None: +def test_ImageBasedProfiling() -> None: """Test to check if all the datasets on the Image based profiling table are recognized.""" - d_test_ImageBasedProfiling= BBBC_model.ImageBasedProfiling.datasets - assert len(d_test_ImageBasedProfiling)==6 + d_test_ImageBasedProfiling = 
BBBC_model.ImageBasedProfiling.datasets + assert len(d_test_ImageBasedProfiling) == 6 + def test_cli(output_directory) -> None: """Test Cli.""" - name="BBBC001,BBBC002" - output_dir=pathlib.Path(output_directory) + name = "BBBC001,BBBC002" + output_dir = pathlib.Path(output_directory) result = runner.invoke( app, @@ -85,5 +92,3 @@ def test_cli(output_directory) -> None: ) assert result.exit_code == 0 - - From e000834f804f129e2851862f05875b5e386394fd Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 21 Aug 2023 15:35:30 +0000 Subject: [PATCH 12/18] Added nested zip file extraction --- utils/bbbc-download-plugin/.bumpversion.cfg | 2 +- utils/bbbc-download-plugin/README.md | 2 +- utils/bbbc-download-plugin/VERSION | 2 +- utils/bbbc-download-plugin/plugin.json | 2 +- utils/bbbc-download-plugin/pyproject.toml | 2 +- .../plugins/utils/bbbc_download/__init__.py | 2 +- .../plugins/utils/bbbc_download/download.py | 33 +++++++++++++++++++ 7 files changed, 39 insertions(+), 6 deletions(-) diff --git a/utils/bbbc-download-plugin/.bumpversion.cfg b/utils/bbbc-download-plugin/.bumpversion.cfg index 182a51988..2495ab3c5 100644 --- a/utils/bbbc-download-plugin/.bumpversion.cfg +++ b/utils/bbbc-download-plugin/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0-dev0 +current_version = 0.1.0-dev1 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md index 4218fc9f9..d10719a2c 100644 --- a/utils/bbbc-download-plugin/README.md +++ b/utils/bbbc-download-plugin/README.md @@ -1,4 +1,4 @@ -# BBBC Download (0.1.0-dev0) +# BBBC Download (0.1.0-dev1) This plugin is designed to download the necessary datasets from the Broad Bioimage Benchmark Collection(BBBC) website. 
diff --git a/utils/bbbc-download-plugin/VERSION b/utils/bbbc-download-plugin/VERSION index 206c0852b..6b1a238a7 100644 --- a/utils/bbbc-download-plugin/VERSION +++ b/utils/bbbc-download-plugin/VERSION @@ -1 +1 @@ -0.1.0-dev0 +0.1.0-dev1 diff --git a/utils/bbbc-download-plugin/plugin.json b/utils/bbbc-download-plugin/plugin.json index dbe8689b2..355bc21ab 100644 --- a/utils/bbbc-download-plugin/plugin.json +++ b/utils/bbbc-download-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "BBBC Download", - "version": "0.1.0-dev0", + "version": "0.1.0-dev1", "title": "BBBC Download", "description": "Downloads the datasets on the Broad Bioimage Benchmark Collection website", "author": "Saket Prem(saket.prem@axleinfo.com), Matthew McIntyre(Matthew.McIntyre@axleinfo.com)", diff --git a/utils/bbbc-download-plugin/pyproject.toml b/utils/bbbc-download-plugin/pyproject.toml index 5bcf26c72..ad00e70f8 100644 --- a/utils/bbbc-download-plugin/pyproject.toml +++ b/utils/bbbc-download-plugin/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "polus-plugins-utils-bbbc-download-plugin" -version = "0.1.0-dev0" +version = "0.1.0-dev1" description = "" authors = [ "Saket Prem ", diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py index 6b548254d..223215dcf 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py @@ -1,2 +1,2 @@ """Bbbc Download.""" -__version__ = "0.1.0-dev0" +__version__ = "0.1.0-dev1" diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py index d9da36e11..89823680f 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py 
@@ -30,6 +30,29 @@ def get_lower_tags(tag: bs4.element.Tag) -> list: return tags +def extract_nested_zips(name: str,zip_path:Path, extract_path:Path): + """Unzip nested zip files. + Args: + name: Name of the dataset + zip_path: Path to the zip file + extract_path: The path where the unzipped files will be saved + """ + + with ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(extract_path) + zip_path.unlink() + print(zip_path) + print(zip_path.exists()) + extracted_folder_name = zip_path.stem # Name with .zip extension + extracted_folder_name = extract_path.joinpath(extracted_folder_name.replace('.zip','')) + remove_macosx(name,extract_path) + + nested_zip_files = list(extracted_folder_name.glob("*.zip")) + print(nested_zip_files) + for nested_zip_file in nested_zip_files: + nested_extract_path = nested_zip_file.parent + extract_nested_zips(nested_zip_file, nested_extract_path) + def get_url(url: str, save_location: Path, name: str) -> None: """Get the given url and save it. @@ -60,6 +83,9 @@ def get_url(url: str, save_location: Path, name: str) -> None: with ZipFile(zip_path, "r") as zfile: zfile.extractall(save_location) + + + except URLError as e: if download_attempts == 9: print("FAILED TO DOWNLOAD: " + url + " for " + name) @@ -148,5 +174,12 @@ def download(name: str,download_path:Path) -> None: ground_path=save_location.joinpath("Ground_Truth") if ground_path.exists(): remove_macosx(name,ground_path) + + # unzip nested zip files + zip_files = list(images_path.glob("**/*.zip")) + print(zip_files) + for zip_file in zip_files: + extract_path = zip_file.parent + extract_nested_zips(name,zip_file, extract_path) return From 09a3611d4ec5c68282e1422c0b0ce7015e1038fc Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 21 Aug 2023 15:40:43 +0000 Subject: [PATCH 13/18] Modified doc string in download function --- .../src/polus/plugins/utils/bbbc_download/download.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py index 89823680f..d98186b56 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py @@ -120,6 +120,7 @@ def download(name: str,download_path:Path) -> None: """Download a single dataset. Args: name: The name of the dataset to be downloaded + downlaod_path: Path to donwload the dataset """ print("Started downloading " + name) From 8f23b43f00190b3b6b02cbabba955b763276edf6 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 24 Aug 2023 20:27:20 +0000 Subject: [PATCH 14/18] plugin.json contaianerId modified to dev1 --- utils/bbbc-download-plugin/plugin.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bbbc-download-plugin/plugin.json b/utils/bbbc-download-plugin/plugin.json index 355bc21ab..a4f8af711 100644 --- a/utils/bbbc-download-plugin/plugin.json +++ b/utils/bbbc-download-plugin/plugin.json @@ -8,7 +8,7 @@ "repository": "https://github.com/PolusAI/polus-plugins", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", - "containerId": "polusai/bbbc-download-plugin:0.1.0-dev0", + "containerId": "polusai/bbbc-download-plugin:0.1.0-dev1", "baseCommand": [ "python3", "-m", From 08f74b36474e61269646462c125ca276e86cff6f Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Fri, 25 Aug 2023 18:49:52 +0000 Subject: [PATCH 15/18] Changes the print statements to logger in BBBC_model.py and download.py --- utils/bbbc-download-plugin/plugin.json | 2 +- .../plugins/utils/bbbc_download/BBBC_model.py | 33 ++++++++++--------- .../plugins/utils/bbbc_download/download.py | 28 +++++++--------- 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/utils/bbbc-download-plugin/plugin.json b/utils/bbbc-download-plugin/plugin.json index 
a4f8af711..8246dd3ac 100644 --- a/utils/bbbc-download-plugin/plugin.json +++ b/utils/bbbc-download-plugin/plugin.json @@ -33,7 +33,7 @@ { "key": "inputs.name", "title": "Input name of datasets as string", - "description": "Input the name of the datasets to be downloaded as a string" + "description": "Input the name of the datasets to be downloaded." } ] } diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py index c0109d4ee..d4e755422 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py @@ -20,7 +20,8 @@ from skimage import io import pyarrow as pa import pyarrow.parquet as pq - +import logging +logger = logging.getLogger(__name__) BASE_URL = "https://bbbc.broadinstitute.org/" @@ -180,7 +181,7 @@ def create_dataset(cls, name: str) -> Union["BBBCDataset", None]: else: return BBBCDataset(name=name) except ValueError as e: - print(e) + logger.info(f"{e}") return None @@ -260,10 +261,10 @@ def _init_data(self,download_path:Path) -> None: pass if self.images == None: - print(self.name + " has no images.") + logger.info(f"{self.name} has no images") if self.ground_truth == None and self.metadata == None: - print(self.name + " has no ground truth or metadata.") + logger.info(f"{self.name} has no ground truth or metadata") return @@ -284,13 +285,13 @@ def standard(self, extension: str) -> None: """ if extension not in [".ome.tif", ".ome.zarr"]: - print( + logger.info( f"ERROR: {extension} is an invalid extension for standardization. Must be .ome.tif or .ome.zarr." ) return if self.images == None: - print( + logger.info( f"ERROR: Images for {self.name} have not been downloaded so they cannot be standardized." 
) return @@ -316,7 +317,7 @@ def standard(self, extension: str) -> None: elif row["Image Type"] == "Metadata": sub_folder = "Metadata" else: - print("ERROR: Invalid value for attribute Image Type") + logger.info(f"ERROR: Invalid value for attribute Image Type") return save_path = standard_folder.joinpath(sub_folder) @@ -334,7 +335,7 @@ def standard(self, extension: str) -> None: bw.dtype = raw_image.dtype bw[:] = raw_image - print(f"Finished standardizing {self.name}") + logger.info(f"Finished standardizing {self.name}") return @@ -366,7 +367,7 @@ def raw(self,download_path:Path) -> None: try: shutil.rmtree(src) except NotADirectoryError as e: - print(e) + logger.info(f"{e}") else: shutil.move(src, dst) @@ -378,7 +379,7 @@ def raw(self,download_path:Path) -> None: class BBBC029(BBBCDataset): def raw(self,download_path:Path) -> None: - print("Started downloading BBBC029") + logger.info(f"Started downloading BBBC029") self.output_path=download_path save_location=download_path.joinpath("BBBC") @@ -401,7 +402,7 @@ def raw(self,download_path:Path) -> None: "BBBC029", ) - print("BBBC029 has finished downloading") + logger.info(f"BBBC029 has finished downloading") images_folder=save_location.joinpath("Images") truth_folder=save_location.joinpath("Ground_Truth") remove_macosx("BBBC029",images_folder) @@ -454,7 +455,7 @@ def raw(self,download_path:Path) -> None: class BBBC042(BBBCDataset): def raw(self,download_path:Path) -> None: - print("Started downloading BBBC042") + logger.info(f"Started downloading BBBC042") self.output_path=download_path save_location=download_path.joinpath("BBBC") @@ -477,7 +478,7 @@ def raw(self,download_path:Path) -> None: "BBBC042", ) - print("BBBC042 has finished downloading") + logger.info(f"BBBC042 has finished downloading") images_folder=save_location.joinpath("Images") truth_folder=save_location.joinpath("Ground_Truth") remove_macosx("BBBC029",images_folder) @@ -530,10 +531,10 @@ def raw(self, download_path: Path) -> None: 
self._init_data(download_path) except Exception as e: - print( - "BBBC046 downloaded successfully but an error occurred when organizing raw data." + logger.info( + f"BBBC046 downloaded successfully but an error occurred when organizing raw data." ) - print("ERROR: " + str(e)) + logger.info(f"ERROR: {str(e)}") return diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py index d98186b56..7936a8817 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py @@ -3,6 +3,7 @@ from urllib.request import urlretrieve from urllib.error import URLError from zipfile import ZipFile +import logging import bs4 import shutil @@ -12,7 +13,7 @@ "Images|Ground truth|Ground Truth|Metadata|Hand-annotated Ground Truth Images" ) endings = (".txt", ".csv", ".tif", ".xlsx", ".xls", ".lst") - +logger = logging.getLogger(__name__) def get_lower_tags(tag: bs4.element.Tag) -> list: """Get all tags between the tag argument and the next tag of the same type. 
@@ -41,14 +42,12 @@ def extract_nested_zips(name: str,zip_path:Path, extract_path:Path): with ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(extract_path) zip_path.unlink() - print(zip_path) - print(zip_path.exists()) extracted_folder_name = zip_path.stem # Name with .zip extension extracted_folder_name = extract_path.joinpath(extracted_folder_name.replace('.zip','')) remove_macosx(name,extract_path) nested_zip_files = list(extracted_folder_name.glob("*.zip")) - print(nested_zip_files) + for nested_zip_file in nested_zip_files: nested_extract_path = nested_zip_file.parent extract_nested_zips(nested_zip_file, nested_extract_path) @@ -73,8 +72,8 @@ def get_url(url: str, save_location: Path, name: str) -> None: urlretrieve(url, save_location.joinpath(file_name)) except URLError as e: if download_attempts == 9: - print("FAILED TO DOWNLOAD: " + url + " for " + name) - print("ERROR: " + str(e)) + logger.info(f"FAILED TO DOWNLOAD {url} for {name}") + logger.info(f"ERROR {str(e)}") continue elif url.endswith(".zip"): @@ -88,12 +87,12 @@ def get_url(url: str, save_location: Path, name: str) -> None: except URLError as e: if download_attempts == 9: - print("FAILED TO DOWNLOAD: " + url + " for " + name) - print("ERROR: " + str(e)) + logger.info(f"FAILED TO DOWNLOAD {url} for {name}") + logger.info(f"ERROR {str(e)}") continue except Exception as e: - print(e) + logger.info(f"{e}") continue @@ -111,10 +110,7 @@ def remove_macosx(name:str, location:Path)-> None: for f in folders: if f.name=="__MACOSX": shutil.rmtree(f) - print("Deleted the __MACOSX folder in " + name) - - - + logger.info(f"Deleted the __MACOSX folder in {name}") def download(name: str,download_path:Path) -> None: """Download a single dataset. 
@@ -123,7 +119,7 @@ def download(name: str,download_path:Path) -> None: downlaod_path: Path to donwload the dataset """ - print("Started downloading " + name) + logger.info(f"Started downloading {name}") download_path=download_path.joinpath("BBBC") save_location = download_path.joinpath(name, "raw") @@ -169,7 +165,8 @@ def download(name: str,download_path:Path) -> None: get_url(url, file_path, "BBBC018") - print(name + " has finished downloading") + logger.info(f"{name} has finished downloading") + images_path=save_location.joinpath("Images") remove_macosx(name,images_path) ground_path=save_location.joinpath("Ground_Truth") @@ -178,7 +175,6 @@ def download(name: str,download_path:Path) -> None: # unzip nested zip files zip_files = list(images_path.glob("**/*.zip")) - print(zip_files) for zip_file in zip_files: extract_path = zip_file.parent extract_nested_zips(name,zip_file, extract_path) From c7f019c162f42a4e97bcea89749bf1b0a3c0a556 Mon Sep 17 00:00:00 2001 From: Jane Van Lam <75lam@cua.edu> Date: Mon, 16 Mar 2026 12:21:15 -0400 Subject: [PATCH 16/18] update packeages to work with cp3.13,remove vaex update test_main.py, BBBC_model.py --- utils/bbbc-download-plugin/.bumpversion. | 0 utils/bbbc-download-plugin/Dockerfile | 2 +- utils/bbbc-download-plugin/pyproject.toml | 30 +++++----- .../plugins/utils/bbbc_download/BBBC_model.py | 58 ++++++++----------- utils/bbbc-download-plugin/tests/test_main.py | 8 +-- 5 files changed, 44 insertions(+), 54 deletions(-) delete mode 100644 utils/bbbc-download-plugin/.bumpversion. diff --git a/utils/bbbc-download-plugin/.bumpversion. b/utils/bbbc-download-plugin/.bumpversion. 
deleted file mode 100644 index e69de29bb..000000000 diff --git a/utils/bbbc-download-plugin/Dockerfile b/utils/bbbc-download-plugin/Dockerfile index d89987fd6..00653c1bc 100644 --- a/utils/bbbc-download-plugin/Dockerfile +++ b/utils/bbbc-download-plugin/Dockerfile @@ -1,4 +1,4 @@ -FROM polusai/bfio:2.1.9 +FROM python:3.13-slim # environment variables defined in polusai/bfio ENV EXEC_DIR="/opt/executables" diff --git a/utils/bbbc-download-plugin/pyproject.toml b/utils/bbbc-download-plugin/pyproject.toml index ad00e70f8..48275df4c 100644 --- a/utils/bbbc-download-plugin/pyproject.toml +++ b/utils/bbbc-download-plugin/pyproject.toml @@ -10,22 +10,22 @@ readme = "README.md" packages = [{include = "polus", from = "src"}] [tool.poetry.dependencies] -python = ">=3.9,<4" -typer = "^0.9.0" -pyarrow = "11.0.0" -scikit-image = "0.20.0" -vaex = "4.16.0" -bfio = "2.3.1.dev0" -beautifulsoup4 = "4.12.0" -numpy = "1.24.2" -pandas = "1.5.3" -requests = "2.28.2" -pydantic = "1.10.7" +python = ">=3.12" +typer = ">=0.24.0" +pyarrow = ">=23.0.0" +scikit-image = ">=0.25.0" +bfio = ">=2.5.0" +beautifulsoup4 = ">=4.14.3" +numpy = ">=1.26.0" +pandas = ">=2.2.3" +requests = ">=2.32.5" +pydantic = ">=2.12.5" bump2version = "1.0.1" -mypy = "1.0.1" -tqdm = "^4.65.0" -pytest = "^7.4.0" -xmlschema = "^2.3.1" +mypy = ">=1.19.1" +tqdm = ">=4.67.0" +pytest = ">=9.0.0" +xmlschema = ">=4.3.1" +lxml = ">=6.0.2" [build-system] diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py index d4e755422..11839b8e8 100644 --- a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/BBBC_model.py @@ -1,10 +1,11 @@ -from typing import List, Dict, Union, Optional +from typing import List, Dict, Union, Optional, Self import shutil import os from multiprocessing import cpu_count from concurrent.futures 
import ThreadPoolExecutor, as_completed from pathlib import Path from zipfile import ZipFile +from pydantic import model_validator, field_validator from polus.plugins.utils.bbbc_download.download import download, get_url, remove_macosx @@ -16,7 +17,6 @@ from tqdm import tqdm import bs4 from bfio import BioWriter -import vaex from skimage import io import pyarrow as pa import pyarrow.parquet as pq @@ -54,14 +54,12 @@ class Metadata(pydantic.BaseModel): path: Path name: str - @pydantic.root_validator() - @classmethod - def valid_data(cls, values: dict) -> dict: - if not values["path"].exists(): + @model_validator(mode="after") + def validate_data(self) -> Self: + if not self.path.exists(): raise ValueError("No metadata") - - return values - + return self + @property def size(self) -> int: """Returns the size of the dataset's metadata in bytes.""" @@ -80,13 +78,12 @@ class GroundTruth(pydantic.BaseModel): path: Path name: str - @pydantic.root_validator() - @classmethod - def valid_data(cls, values: dict) -> dict: - if not values["path"].exists(): + @model_validator(mode="after") + def validate_data(self) -> Self: + if not self.path.exists(): raise ValueError("No ground truth") - return values + return self @property def size(self) -> int: @@ -106,13 +103,12 @@ class Images(pydantic.BaseModel): path: Path name: str - @pydantic.root_validator() - @classmethod - def valid_data(cls, values: dict) -> dict: - if not values["path"].exists(): + + def validate_data(self) -> Self: + if not self.path.exists(): raise ValueError("No images") - return values + return self @property def size(self) -> int: @@ -142,7 +138,7 @@ class BBBCDataset(pydantic.BaseModel): metadata: Optional[Metadata] = None output_path: Optional[Path]= None - @pydantic.validator("name") + @field_validator("name") @classmethod def valid_name(cls, v: str) -> str: """Validates the name of the dataset. @@ -154,7 +150,7 @@ def valid_name(cls, v: str) -> str: The name provided if validation is successful. 
""" - if v not in list(BBBC.combined_table["Accession"]): + if v not in list(BBBC.combined_table()["Accession"]): raise ValueError( v + " is an invalid dataset name. Valid dataset names belong to an existing BBBC dataset." @@ -193,7 +189,7 @@ def info(self) -> Dict[str, Union[str, np.int64]]: A dictionary that contains information about the dataset. """ - table = BBBC.combined_table + table = BBBC.combined_table() row = table.loc[table["Accession"] == self.name] @@ -299,8 +295,7 @@ def standard(self, extension: str) -> None: standard_folder = Path(root, self.name, "standard") arrow_file = Path("arrow", self.name + ".arrow") arrow_table = pq.read_table(arrow_file) - df = vaex.from_arrow_table(arrow_table) - + df = arrow_table.to_pandas() if not standard_folder.exists(): standard_folder.mkdir(parents=True, exist_ok=True) @@ -575,7 +570,6 @@ class IDAndSegmentation: table: pd.DataFrame = tables[0] @classmethod - @property def datasets(cls) -> List[BBBCDataset]: """Returns a list of all datasets in the table. @@ -593,7 +587,7 @@ def raw(cls,download_path:Path) -> None: threads = [] with ThreadPoolExecutor(max_workers=num_workers) as executor: - for dataset in IDAndSegmentation.datasets: + for dataset in IDAndSegmentation.datasets(): threads.append(executor.submit(dataset.raw(download_path))) for f in tqdm( @@ -614,7 +608,6 @@ class PhenotypeClassification: table: pd.DataFrame = tables[1] @classmethod - @property def datasets(cls) -> List[BBBCDataset]: """Returns a list of all datasets in the table. 
@@ -632,7 +625,7 @@ def raw(cls,download_path:Path) -> None: threads = [] with ThreadPoolExecutor(max_workers=num_workers) as executor: - for dataset in PhenotypeClassification.datasets: + for dataset in PhenotypeClassification.datasets(): threads.append(executor.submit(dataset.raw(download_path))) for f in tqdm( @@ -653,7 +646,6 @@ class ImageBasedProfiling: table: pd.DataFrame = tables[2] @classmethod - @property def datasets(cls) -> List[BBBCDataset]: """Returns a list of all datasets in the table. @@ -671,7 +663,7 @@ def raw(cls,download_path:Path) -> None: threads = [] with ThreadPoolExecutor(max_workers=num_workers) as executor: - for dataset in ImageBasedProfiling.datasets: + for dataset in ImageBasedProfiling.datasets(): threads.append(executor.submit(dataset.raw(download_path))) for f in tqdm( @@ -689,7 +681,6 @@ class BBBC: """ @classmethod - @property def datasets(cls) -> List[BBBCDataset]: """Returns a list of all datasets in BBBC. @@ -697,12 +688,11 @@ def datasets(cls) -> List[BBBCDataset]: A list containing a Dataset object for each dataset in BBBC. """ - table = BBBC.combined_table + table = BBBC.combined_table() return [BBBCDataset.create_dataset(name) for name in table["Accession"]] @classmethod - @property def combined_table(cls) -> pd.DataFrame: """Combines each table on https://bbbc.broadinstitute.org/image_sets into a single table. 
@@ -727,7 +717,7 @@ def raw(cls,download_path:Path) -> None: threads = [] with ThreadPoolExecutor(max_workers=num_workers) as executor: - for dataset in BBBC.datasets: + for dataset in BBBC.datasets(): threads.append(executor.submit(dataset.raw(download_path))) for f in tqdm( diff --git a/utils/bbbc-download-plugin/tests/test_main.py b/utils/bbbc-download-plugin/tests/test_main.py index 6ea042992..fd65f7cb1 100644 --- a/utils/bbbc-download-plugin/tests/test_main.py +++ b/utils/bbbc-download-plugin/tests/test_main.py @@ -46,7 +46,7 @@ def test_delete_macosx(macosx_directory) -> None: def test_bbbc_datasets() -> None: """Test to check if all the datasets on the BBBC website are recognized.""" - d_test = BBBC_model.BBBC.datasets + d_test = BBBC_model.BBBC.datasets() assert len(d_test) == 50 @@ -60,19 +60,19 @@ def test_raw(output_directory) -> None: def test_IDAndSegmentation() -> None: """Test to check if all the datasets on the Identification and segmentation table are recognized.""" - d_test_IDAndSegmentation = BBBC_model.IDAndSegmentation.datasets + d_test_IDAndSegmentation = BBBC_model.IDAndSegmentation.datasets() assert len(d_test_IDAndSegmentation) == 32 def test_PhenotypeClassification() -> None: """Test to check if all the datasets on the Phenotype CLassification table are recognized.""" - d_test_PhenotypeClassification = BBBC_model.PhenotypeClassification.datasets + d_test_PhenotypeClassification = BBBC_model.PhenotypeClassification.datasets() assert len(d_test_PhenotypeClassification) == 14 def test_ImageBasedProfiling() -> None: """Test to check if all the datasets on the Image based profiling table are recognized.""" - d_test_ImageBasedProfiling = BBBC_model.ImageBasedProfiling.datasets + d_test_ImageBasedProfiling = BBBC_model.ImageBasedProfiling.datasets() assert len(d_test_ImageBasedProfiling) == 6 From aa561ff09f07685bef2c17bf645c4b2062fd6798 Mon Sep 17 00:00:00 2001 From: Jane Van Lam <75lam@cua.edu> Date: Mon, 16 Mar 2026 12:54:53 -0400 Subject: 
[PATCH 17/18] update gitignore and Dockerfile --- .gitignore | 1 + utils/bbbc-download-plugin/Dockerfile | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 9649798c4..975d1c690 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,4 @@ data src/polus/plugins/_plugins/manifests/* # allow python scripts insied manifests dir !src/polus/plugins/_plugins/manifests/*.py +uv.lock diff --git a/utils/bbbc-download-plugin/Dockerfile b/utils/bbbc-download-plugin/Dockerfile index 00653c1bc..4f10e8d3b 100644 --- a/utils/bbbc-download-plugin/Dockerfile +++ b/utils/bbbc-download-plugin/Dockerfile @@ -9,10 +9,11 @@ ENV POLUS_LOG="INFO" # Work directory defined in the base container WORKDIR ${EXEC_DIR} -COPY pyproject.toml ${EXEC_DIR} -COPY VERSION ${EXEC_DIR} -COPY README.md ${EXEC_DIR} -COPY src ${EXEC_DIR}/src +# When building from repo root: -f utils/bbbc-download-plugin/Dockerfile . +COPY utils/bbbc-download-plugin/pyproject.toml ${EXEC_DIR} +COPY utils/bbbc-download-plugin/VERSION ${EXEC_DIR} +COPY utils/bbbc-download-plugin/README.md ${EXEC_DIR} +COPY utils/bbbc-download-plugin/src ${EXEC_DIR}/src RUN pip3 install ${EXEC_DIR} --no-cache-dir From bd1ab204952616f97fc74795e88886ff7eaaa077 Mon Sep 17 00:00:00 2001 From: Jane Van Lam <75lam@cua.edu> Date: Mon, 16 Mar 2026 12:55:31 -0400 Subject: [PATCH 18/18] =?UTF-8?q?Bump=20version:=200.1.0-dev1=20=E2=86=92?= =?UTF-8?q?=200.1.1-dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- utils/bbbc-download-plugin/.bumpversion.cfg | 2 +- utils/bbbc-download-plugin/VERSION | 2 +- utils/bbbc-download-plugin/plugin.json | 4 ++-- utils/bbbc-download-plugin/pyproject.toml | 2 +- .../src/polus/plugins/utils/bbbc_download/__init__.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/utils/bbbc-download-plugin/.bumpversion.cfg b/utils/bbbc-download-plugin/.bumpversion.cfg index 2495ab3c5..ba5924e46 100644 --- 
a/utils/bbbc-download-plugin/.bumpversion.cfg +++ b/utils/bbbc-download-plugin/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0-dev1 +current_version = 0.1.1-dev0 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/utils/bbbc-download-plugin/VERSION b/utils/bbbc-download-plugin/VERSION index 6b1a238a7..44bf4db83 100644 --- a/utils/bbbc-download-plugin/VERSION +++ b/utils/bbbc-download-plugin/VERSION @@ -1 +1 @@ -0.1.0-dev1 +0.1.1-dev0 diff --git a/utils/bbbc-download-plugin/plugin.json b/utils/bbbc-download-plugin/plugin.json index 8246dd3ac..493442d4b 100644 --- a/utils/bbbc-download-plugin/plugin.json +++ b/utils/bbbc-download-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "BBBC Download", - "version": "0.1.0-dev1", + "version": "0.1.1-dev0", "title": "BBBC Download", "description": "Downloads the datasets on the Broad Bioimage Benchmark Collection website", "author": "Saket Prem(saket.prem@axleinfo.com), Matthew McIntyre(Matthew.McIntyre@axleinfo.com)", @@ -8,7 +8,7 @@ "repository": "https://github.com/PolusAI/polus-plugins", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", - "containerId": "polusai/bbbc-download-plugin:0.1.0-dev1", + "containerId": "polusai/bbbc-download-plugin:0.1.1-dev0", "baseCommand": [ "python3", "-m", diff --git a/utils/bbbc-download-plugin/pyproject.toml b/utils/bbbc-download-plugin/pyproject.toml index 48275df4c..f1b4ac73f 100644 --- a/utils/bbbc-download-plugin/pyproject.toml +++ b/utils/bbbc-download-plugin/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "polus-plugins-utils-bbbc-download-plugin" -version = "0.1.0-dev1" +version = "0.1.1-dev0" description = "" authors = [ "Saket Prem ", diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py index 223215dcf..1e5dffd13 100644 --- 
a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py @@ -1,2 +1,2 @@ """Bbbc Download.""" -__version__ = "0.1.0-dev1" +__version__ = "0.1.1-dev0"