From 84f0e6492ac1ec635ea1094925c3ce39034ba347 Mon Sep 17 00:00:00 2001
From: Abe Arab <abarbiology@gmail.com>
Date: Tue, 24 Dec 2024 04:20:45 -0800
Subject: [PATCH 1/6] update dataverse module transfer codes from #124

---
 gget/__init__.py       |  1 +
 gget/constants.py      |  3 ++
 gget/gget_dataverse.py | 89 ++++++++++++++++++++++++++++++++++++++++++
 gget/main.py           | 43 +++++++++++++++++++-
 gget/utils.py          |  9 +++++
 5 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 gget/gget_dataverse.py

diff --git a/gget/__init__.py b/gget/__init__.py
index e0b42da6b..7c5494175 100644
--- a/gget/__init__.py
+++ b/gget/__init__.py
@@ -19,6 +19,7 @@
 from .gget_opentargets import opentargets
 from .gget_cbio import cbio_plot, cbio_search
 from .gget_bgee import bgee
+from .gget_dataverse import dataverse
 
 import logging
 # Mute numexpr threads info
diff --git a/gget/constants.py b/gget/constants.py
index a61f9a113..c2784c085 100644
--- a/gget/constants.py
+++ b/gget/constants.py
@@ -66,6 +66,9 @@
 # OpenTargets API endpoint
 OPENTARGETS_GRAPHQL_API = "https://api.platform.opentargets.org/api/v4/graphql"
 
+# Harvard dataverse API server
+DATAVERSE_GET_URL = "https://dataverse.harvard.edu/api/access/datafile/"
+
 # CBIO data
 CBIO_CANCER_TYPE_TO_TISSUE_DICTIONARY = {
     "Acute Leukemias of Ambiguous Lineage": "leukemia",
diff --git a/gget/gget_dataverse.py b/gget/gget_dataverse.py
new file mode 100644
index 000000000..7d63f58e2
--- /dev/null
+++ b/gget/gget_dataverse.py
@@ -0,0 +1,89 @@
+import os
+import requests
+from tqdm import tqdm
+import pandas as pd
+import pandas as pd
+from .utils import print_sys
+from .constants import DATAVERSE_GET_URL
+
+def dataverse_downloader(url, path, file_name):
+    """Stream a dataverse datafile to disk while showing a progress bar.
+
+    Args:
+        url (str): the url of the dataset to download
+        path (str): the path to save the dataset locally
+        file_name (str): the name of the file to save locally
+    Raises:
+        requests.HTTPError: if the server responds with a 4xx/5xx status
+    """
+    save_path = os.path.join(path, file_name)
+    with requests.get(url, stream=True) as response:
+        response.raise_for_status()  # never save an HTTP error page as the datafile
+        total = int(response.headers.get("content-length", 0))
+        with open(save_path, "wb") as file, tqdm(total=total, unit="iB", unit_scale=True) as bar:
+            for data in response.iter_content(1024):
+                bar.update(len(data))
+                file.write(data)
+
+
+def download_wrapper(entry, path, return_type=None):
+    """Download one datafile into `path` unless a local copy already exists.
+
+    Args:
+        entry (dict): the entry of the dataset to download. Must include 'id', 'name', 'type' keys
+        path (str): the path to save the dataset locally
+        return_type (str, optional): the return type. Defaults to None. Can be "url", "filename", or ["url", "filename"]
+
+    Returns:
+        str, tuple or None: the download url, the file name (without directory), or
+        (url, filename), as selected by `return_type`; None for any other value
+    """
+    url = DATAVERSE_GET_URL + str(entry['id'])
+
+    # makedirs (unlike mkdir) creates missing parents and tolerates an existing directory
+    os.makedirs(path, exist_ok=True)
+
+    filename = f"{entry['name']}.{entry['type']}"
+
+    if os.path.exists(os.path.join(path, filename)):
+        print_sys(f"Found local copy for {entry['id']} datafile as {filename} ...")
+    else:
+        print_sys(f"Downloading {entry['id']} datafile as {filename} ...")
+        dataverse_downloader(url, path, filename)
+
+    if return_type == "url":
+        return url
+    elif return_type == "filename":
+        return filename
+    elif return_type == ["url", "filename"]:
+        return url, filename
+
+
+def dataverse(df, path, sep=","):
+    """download datasets from dataverse for a given dataframe
+    Input dataframe must have 'name', 'id', 'type' columns.
+    - 'name' is the dataset name for single file
+    - 'id' is the unique identifier for the file
+    - 'type' is the file type (e.g. csv, tsv, pkl)
+
+    Args:
+        df (pd.DataFrame, str or os.PathLike): the dataframe or path to the csv/tsv file
+        path (str): the path to save the dataset locally
+        sep (str, optional): column separator used when `df` is a file path. Defaults to ",".
+    Raises:
+        FileNotFoundError: if `df` is a path that does not exist
+        ValueError: if `df` is neither a dataframe nor a path, or lacks a required column
+    """
+    if isinstance(df, (str, os.PathLike)):
+        if not os.path.exists(df):
+            raise FileNotFoundError(f"File {df} not found")
+        df = pd.read_csv(df, sep=sep)
+    elif not isinstance(df, pd.DataFrame):
+        raise ValueError("Input must be a pandas dataframe or a path to a csv / tsv file")
+    missing = [col for col in ("id", "name", "type") if col not in df.columns]
+    if missing:  # fail before any directory is created or download is started
+        raise ValueError(f"Input table is missing required column(s): {missing}")
+    print_sys(f"Searching for {len(df)} datafiles in dataverse ...")
+    for _, entry in df.iterrows():
+        download_wrapper(entry, path)
+    print_sys(f"Download completed, saved to `{path}`.")
\ No newline at end of file
diff --git a/gget/main.py b/gget/main.py
index 876d16123..1a1093ea3 100644
--- a/gget/main.py
+++ b/gget/main.py
@@ -39,7 +39,7 @@
 from .gget_opentargets import opentargets, OPENTARGETS_RESOURCES
 from .gget_cbio import cbio_plot, cbio_search
 from .gget_bgee import bgee
-
+from .gget_dataverse import dataverse
 
 # Custom formatter for help messages that preserved the text formatting and adds the default value to the end of the help message
 class CustomHelpFormatter(argparse.RawTextHelpFormatter):
@@ -2335,6 +2335,32 @@ def main():
         help="Does not print progress information.",
     )
 
+    ## dataverse parser arguments
+    dataverse_desc = "Download datasets from the Dataverse repositories."
+    parser_dataverse = parent_subparsers.add_parser(
+        "dataverse",
+        parents=[parent],
+        description=dataverse_desc,
+        help=dataverse_desc,
+        add_help=True,
+        formatter_class=CustomHelpFormatter,
+    )
+    parser_dataverse.add_argument(
+        "-o",
+        "--path",
+        type=str,
+        required=True,
+        help="Path to the directory the datasets will be saved in, e.g. 'path/to/directory'.",
+    )
+    parser_dataverse.add_argument(
+        "-t",
+        "--table",
+        type=str,
+        default=None,
+        required=False,
+        help="File containing the dataset IDs to download, e.g. 'datasets.tsv'.",
+    )
+    
     ### Define return values
     args = parent_parser.parse_args()
 
@@ -2386,6 +2412,7 @@ def main():
         "opentargets": parser_opentargets,
         "cbio": parser_cbio,
         "bgee": parser_bgee,
+        "dataverse": parser_dataverse,
     }
 
     if len(sys.argv) == 2:
@@ -3295,3 +3322,17 @@ def main():
                 print(
                     bgee_results.to_json(orient="records", force_ascii=False, indent=4)
                 )
+
+    ## dataverse return
+    if args.command == "dataverse":
+        if args.table is None:
+            parser_dataverse.error("the following arguments are required: -t/--table")
+        # Define separator based on file extension (comma unless the table is a .tsv)
+        sep = '\t' if ('.tsv' in args.table and '.csv' not in args.table) else ','
+        # Run gget dataverse function
+        dataverse(
+            df = args.table,
+            # -o/--path is stored by argparse as args.path; args.out is not defined
+            path = args.path,
+            sep = sep,
+        )
diff --git a/gget/utils.py b/gget/utils.py
index 2483c3e1a..a85d42067 100644
--- a/gget/utils.py
+++ b/gget/utils.py
@@ -5,6 +5,7 @@
 # import time
 import re
 import os
+import sys
 import uuid
 import pandas as pd
 import numpy as np
@@ -66,6 +67,14 @@ def flatten(xss):
     return [x for xs in xss for x in xs]
 
 
+def print_sys(s):
+    """Print a message to standard error and flush it immediately.
+    Args:
+        s (str): the string to print
+    """
+    print(s, file=sys.stderr, flush=True)
+
+
 def get_latest_cosmic():
     html = requests.get(COSMIC_RELEASE_URL)
     if html.status_code != 200:

From da4dacf4d6a274e34a2b5ed92144131ee76ade16 Mon Sep 17 00:00:00 2001
From: Abe Arab <abarbiology@gmail.com>
Date: Tue, 24 Dec 2024 04:23:33 -0800
Subject: [PATCH 2/6] draft unittest for dataverse

---
 tests/test_dataverse.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 tests/test_dataverse.py

diff --git a/tests/test_dataverse.py b/tests/test_dataverse.py
new file mode 100644
index 000000000..c19163540
--- /dev/null
+++ b/tests/test_dataverse.py
@@ -0,0 +1,25 @@
+import unittest
+import pandas as pd
+from gget.gget_dataverse import dataverse
+import os
+import shutil
+
+#TODO: Verify the test code, this is drafted using co-pilot!
+class TestDataverse(unittest.TestCase):
+    def test_dataverse_download(self):
+        """Download one datafile and check it is saved as <path>/<name>.<type>."""
+        out_dir = 'temp_datasets'
+        # Register cleanup first so the folder is removed even when the
+        # download or the assertion below fails
+        self.addCleanup(shutil.rmtree, out_dir, ignore_errors=True)
+
+        df = pd.DataFrame(
+            {'id': [6180617], 'name': ['nodes'], 'type': ['tab']}
+        )
+        dataverse(df, out_dir)
+
+        # Check if the file is downloaded
+        self.assertTrue(os.path.exists(os.path.join(out_dir, 'nodes.tab')))
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file

From 7e0adb02e864130942b55c0010bbeec3ec4281e8 Mon Sep 17 00:00:00 2001
From: "Laura Luebbert, Ph.D."
 <56094636+lauraluebbert@users.noreply.github.com>
Date: Thu, 11 Sep 2025 18:53:02 -0400
Subject: [PATCH 3/6] Bump dev version

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index f3770fd54..92471b523 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = gget
-version = 0.29.3
+version = 0.29.4
 author = Laura Luebbert
 author_email = lauralubbert@gmail.com
 maintainer = Laura Luebbert

From 9159971b9199f924b68c2b09d65048cce6e19e54 Mon Sep 17 00:00:00 2001
From: "Laura Luebbert, Ph.D."
 <56094636+lauraluebbert@users.noreply.github.com>
Date: Thu, 11 Sep 2025 18:58:21 -0400
Subject: [PATCH 4/6] Add missing biopython dependency

---
 gget/gget_setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gget/gget_setup.py b/gget/gget_setup.py
index 4a97f35cb..45ab4673f 100644
--- a/gget/gget_setup.py
+++ b/gget/gget_setup.py
@@ -279,6 +279,7 @@ def setup(module, verbose=True, out=None):
         # Core AlphaFold dependencies (Colab/CPU friendly set)
         alphafold_deps = [
             "absl-py>=2.1,<3",
+            "biopython",
             "dm-haiku<=0.0.12",          # dont upgrade to avoid clash with jax
             "dm-tree>=0.1.8",
             "filelock>=3.12",

From 90dab85143c084d519115d92ae8c2b460accf452 Mon Sep 17 00:00:00 2001
From: josephrich98 <josephrich98@gmail.com>
Date: Mon, 15 Sep 2025 14:21:56 -0700
Subject: [PATCH 5/6] removed type check from cellxgenes for python 3.9

---
 gget/gget_cellxgene.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gget/gget_cellxgene.py b/gget/gget_cellxgene.py
index 8d077a687..e2d45241c 100644
--- a/gget/gget_cellxgene.py
+++ b/gget/gget_cellxgene.py
@@ -22,7 +22,7 @@ def _listify(x):
         return [x]
 
 
-def _build_obs_filter(filters: dict, is_primary_data: bool) -> str | None:
+def _build_obs_filter(filters: dict, is_primary_data: bool):
     """
     Build a SOMA obs value_filter string like:
         "is_primary_data == True and tissue in ['lung'] and cell_type in ['muscle cell']"

From 5b49314f3564926e560953bd530880b70802d4b3 Mon Sep 17 00:00:00 2001
From: "Laura Luebbert, Ph.D."
 <56094636+lauraluebbert@users.noreply.github.com>
Date: Sun, 28 Sep 2025 16:18:07 -0400
Subject: [PATCH 6/6] remove duplicated function

---
 gget/gget_search.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/gget/gget_search.py b/gget/gget_search.py
index 4d67b1dd1..221bb6d96 100644
--- a/gget/gget_search.py
+++ b/gget/gget_search.py
@@ -21,17 +21,6 @@
 from gget.constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_NV
 
 
-def clean_cols(x):
-    if isinstance(x, list):
-        unique_list = list(set(x))
-        if len(unique_list) == 1:
-            return unique_list[0]
-        else:
-            return unique_list
-    else:
-        return x
-
-
 def clean_cols(x):
     if isinstance(x, list):
         unique_list = list(set(x))