From 84f0e6492ac1ec635ea1094925c3ce39034ba347 Mon Sep 17 00:00:00 2001 From: Abe Arab Date: Tue, 24 Dec 2024 04:20:45 -0800 Subject: [PATCH 1/6] update dataverse module transfer codes from #124 --- gget/__init__.py | 1 + gget/constants.py | 3 ++ gget/gget_dataverse.py | 89 ++++++++++++++++++++++++++++++++++++++++++ gget/main.py | 43 +++++++++++++++++++- gget/utils.py | 9 +++++ 5 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 gget/gget_dataverse.py diff --git a/gget/__init__.py b/gget/__init__.py index e0b42da6b..7c5494175 100644 --- a/gget/__init__.py +++ b/gget/__init__.py @@ -19,6 +19,7 @@ from .gget_opentargets import opentargets from .gget_cbio import cbio_plot, cbio_search from .gget_bgee import bgee +from .gget_dataverse import dataverse import logging # Mute numexpr threads info diff --git a/gget/constants.py b/gget/constants.py index a61f9a113..c2784c085 100644 --- a/gget/constants.py +++ b/gget/constants.py @@ -66,6 +66,9 @@ # OpenTargets API endpoint OPENTARGETS_GRAPHQL_API = "https://api.platform.opentargets.org/api/v4/graphql" +# Harvard dataverse API server +DATAVERSE_GET_URL = "https://dataverse.harvard.edu/api/access/datafile/" + # CBIO data CBIO_CANCER_TYPE_TO_TISSUE_DICTIONARY = { "Acute Leukemias of Ambiguous Lineage": "leukemia", diff --git a/gget/gget_dataverse.py b/gget/gget_dataverse.py new file mode 100644 index 000000000..7d63f58e2 --- /dev/null +++ b/gget/gget_dataverse.py @@ -0,0 +1,89 @@ +import os +import requests +from tqdm import tqdm +import pandas as pd +import pandas as pd +from .utils import print_sys +from .constants import DATAVERSE_GET_URL + +def dataverse_downloader(url, path, file_name): + """dataverse download helper with progress bar + + Args: + url (str): the url of the dataset to download + path (str): the path to save the dataset locally + file_name (str): the name of the file to save locally + """ + save_path = os.path.join(path, file_name) + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get("content-length", 0)) + block_size = 1024 + progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + with open(save_path, "wb") as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + + +def download_wrapper(entry, path, return_type=None): + """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files + + Args: + entry (dict): the entry of the dataset to download. Must include 'id', 'name', 'type' keys + path (str): the path to save the dataset locally + return_type (str, optional): the return type. Defaults to None. Can be "url", "filename", or ["url", "filename"] + + Returns: + str: the exact dataset query name + """ + url = DATAVERSE_GET_URL + str(entry['id']) + + if not os.path.exists(path): + os.mkdir(path) + + filename = f"{entry['name']}.{entry['type']}" + + if os.path.exists(os.path.join(path, filename)): + print_sys(f"Found local copy for {entry['id']} datafile as {filename} ...") + os.path.join(path, filename) + else: + print_sys(f"Downloading {entry['id']} datafile as {filename} ...") + dataverse_downloader(url, path, filename) + + if return_type == "url": + return url + elif return_type == "filename": + return filename + elif return_type == ["url", "filename"]: + return url, filename + + +def dataverse(df, path, sep=","): + """download datasets from dataverse for a given dataframe + Input dataframe must have 'name', 'id', 'type' columns. + - 'name' is the dataset name for single file + - 'id' is the unique identifier for the file + - 'type' is the file type (e.g. csv, tsv, pkl) + + Args: + df (pd.DataFrame or str): the dataframe or path to the csv/tsv file + path (str): the path to save the dataset locally + """ + if type(df) == str: + if os.path.exists(df): + df = pd.read_csv(df, sep=sep) + else: + raise FileNotFoundError(f"File {df} not found") + elif type(df) == pd.DataFrame: + pass + else: + raise ValueError("Input must be a pandas dataframe or a path to a csv / tsv file") + + print_sys(f"Searching for {len(df)} datafiles in dataverse ...") + + # run the download wrapper for each entry in the dataframe + for _, entry in df.iterrows(): + download_wrapper(entry, path) + + print_sys(f"Download completed, saved to `{path}`.") \ No newline at end of file diff --git a/gget/main.py b/gget/main.py index 876d16123..1a1093ea3 100644 --- a/gget/main.py +++ b/gget/main.py @@ -39,7 +39,7 @@ from .gget_opentargets import opentargets, OPENTARGETS_RESOURCES from .gget_cbio import cbio_plot, cbio_search from .gget_bgee import bgee - +from .gget_dataverse import dataverse # Custom formatter for help messages that preserved the text formatting and adds the default value to the end of the help message class CustomHelpFormatter(argparse.RawTextHelpFormatter): @@ -2335,6 +2335,32 @@ def main(): help="Does not print progress information.", ) + ## dataverse parser arguments + dataverse_desc = "Download datasets from the Dataverse repositories." + parser_dataverse = parent_subparsers.add_parser( + "dataverse", + parents=[parent], + description=dataverse_desc, + help=dataverse_desc, + add_help=True, + formatter_class=CustomHelpFormatter, + ) + parser_dataverse.add_argument( + "-o", + "--path", + type=str, + required=True, + help="Path to the directory the datasets will be saved in, e.g. 'path/to/directory'.", + ) + parser_dataverse.add_argument( + "-t", + "--table", + type=str, + default=None, + required=False, + help="File containing the dataset IDs to download, e.g. 'datasets.tsv'.", + ) + ### Define return values args = parent_parser.parse_args() @@ -2386,6 +2412,7 @@ def main(): "opentargets": parser_opentargets, "cbio": parser_cbio, "bgee": parser_bgee, + "dataverse": parser_dataverse, } if len(sys.argv) == 2: @@ -3295,3 +3322,17 @@ def main(): print( bgee_results.to_json(orient="records", force_ascii=False, indent=4) ) + + ## dataverse return + if args.command == "dataverse": + # Define separator based on file extension + if '.csv' in args.table: + sep = ',' + elif '.tsv' in args.table: + sep = '\t' + # Run gget dataverse function + dataverse( + df = args.table, + path = args.out, + sep = sep, + ) diff --git a/gget/utils.py b/gget/utils.py index 2483c3e1a..a85d42067 100644 --- a/gget/utils.py +++ b/gget/utils.py @@ -5,6 +5,7 @@ # import time import re import os +import sys import uuid import pandas as pd import numpy as np @@ -66,6 +67,14 @@ def flatten(xss): return [x for xs in xss for x in xs] +def print_sys(s): + """system print + Args: + s (str): the string to print + """ + print(s, flush = True, file = sys.stderr) + + def get_latest_cosmic(): html = requests.get(COSMIC_RELEASE_URL) if html.status_code != 200: From da4dacf4d6a274e34a2b5ed92144131ee76ade16 Mon Sep 17 00:00:00 2001 From: Abe Arab Date: Tue, 24 Dec 2024 04:23:33 -0800 Subject: [PATCH 2/6] draft unittest for dataverse --- tests/test_dataverse.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 tests/test_dataverse.py diff --git a/tests/test_dataverse.py b/tests/test_dataverse.py new file mode 100644 index 000000000..c19163540 --- /dev/null +++ b/tests/test_dataverse.py @@ -0,0 +1,25 @@ +import unittest +import pandas as pd +from gget.gget_dataverse import dataverse +import os +import shutil + +#TODO: Verify the test code, this is drafted using co-pilot! +class TestDataverse(unittest.TestCase): + def test_dataverse_download(self): + df = pd.DataFrame({ + 'id': [6180617], + 'name': ['nodes'], + 'type': ['tab'] + }) + + dataverse(df, 'temp_datasets') + + # Check if the file is downloaded + self.assertTrue(os.path.exists('temp_datasets/nodes.tab')) + + # Clean up by removing the datasets folder + shutil.rmtree('temp_datasets') + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 7e0adb02e864130942b55c0010bbeec3ec4281e8 Mon Sep 17 00:00:00 2001 From: "Laura Luebbert, Ph.D." <56094636+lauraluebbert@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:53:02 -0400 Subject: [PATCH 3/6] Bump dev version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index f3770fd54..92471b523 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = gget -version = 0.29.3 +version = 0.29.4 author = Laura Luebbert author_email = lauralubbert@gmail.com maintainer = Laura Luebbert From 9159971b9199f924b68c2b09d65048cce6e19e54 Mon Sep 17 00:00:00 2001 From: "Laura Luebbert, Ph.D." <56094636+lauraluebbert@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:58:21 -0400 Subject: [PATCH 4/6] Add missing biopython dependency --- gget/gget_setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gget/gget_setup.py b/gget/gget_setup.py index 4a97f35cb..45ab4673f 100644 --- a/gget/gget_setup.py +++ b/gget/gget_setup.py @@ -279,6 +279,7 @@ def setup(module, verbose=True, out=None): # Core AlphaFold dependencies (Colab/CPU friendly set) alphafold_deps = [ "absl-py>=2.1,<3", + "biopython", "dm-haiku<=0.0.12", # dont upgrade to avoid clash with jax "dm-tree>=0.1.8", "filelock>=3.12", From 90dab85143c084d519115d92ae8c2b460accf452 Mon Sep 17 00:00:00 2001 From: josephrich98 Date: Mon, 15 Sep 2025 14:21:56 -0700 Subject: [PATCH 5/6] removed type check from cellxgenes for python 3.9 --- gget/gget_cellxgene.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gget/gget_cellxgene.py b/gget/gget_cellxgene.py index 8d077a687..e2d45241c 100644 --- a/gget/gget_cellxgene.py +++ b/gget/gget_cellxgene.py @@ -22,7 +22,7 @@ def _listify(x): return [x] -def _build_obs_filter(filters: dict, is_primary_data: bool) -> str | None: +def _build_obs_filter(filters: dict, is_primary_data: bool): """ Build a SOMA obs value_filter string like: "is_primary_data == True and tissue in ['lung'] and cell_type in ['muscle cell']" From 5b49314f3564926e560953bd530880b70802d4b3 Mon Sep 17 00:00:00 2001 From: "Laura Luebbert, Ph.D." <56094636+lauraluebbert@users.noreply.github.com> Date: Sun, 28 Sep 2025 16:18:07 -0400 Subject: [PATCH 6/6] remove duplicated function --- gget/gget_search.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/gget/gget_search.py b/gget/gget_search.py index 4d67b1dd1..221bb6d96 100644 --- a/gget/gget_search.py +++ b/gget/gget_search.py @@ -21,17 +21,6 @@ from gget.constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_NV -def clean_cols(x): - if isinstance(x, list): - unique_list = list(set(x)) - if len(unique_list) == 1: - return unique_list[0] - else: - return unique_list - else: - return x - - def clean_cols(x): if isinstance(x, list): unique_list = list(set(x))