From 2a6050728899a9a46d661ab9ce4da001665a2242 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 17 Feb 2026 23:48:58 -0800 Subject: [PATCH 01/19] feat: egfr interactome We get higher counts of UniProt -> STRING interactions than usual, though I haven't trimmed it yet. --- cache/directory.py | 7 ++ datasets/egfr/.gitignore | 2 + datasets/egfr/Snakefile | 24 +++++ .../egfr/scripts/process_gold_standard.py | 23 +++++ datasets/egfr/scripts/process_interactome.py | 14 +++ egfr/egfr-param-tuning.yaml | 87 ------------------- pyproject.toml | 4 + tools/mapping/__init__.py | 0 tools/mapping/ensembl_uniprot.py | 57 ++++++++++++ 9 files changed, 131 insertions(+), 87 deletions(-) create mode 100644 datasets/egfr/.gitignore create mode 100644 datasets/egfr/Snakefile create mode 100644 datasets/egfr/scripts/process_gold_standard.py create mode 100644 datasets/egfr/scripts/process_interactome.py delete mode 100644 egfr/egfr-param-tuning.yaml create mode 100644 tools/mapping/__init__.py create mode 100644 tools/mapping/ensembl_uniprot.py diff --git a/cache/directory.py b/cache/directory.py index b308f1b5..b94594b1 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -214,6 +214,13 @@ def download(self, output: str | PathLike): cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T" ), }, + "EGFR": { + "eight-egfr-reference-all.txt": CacheItem( + name="EGFR Gold Standard Reference", + online="https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt", + cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" + ) + }, } diff --git a/datasets/egfr/.gitignore b/datasets/egfr/.gitignore new file mode 100644 index 00000000..4b8fba9e --- /dev/null +++ b/datasets/egfr/.gitignore @@ -0,0 +1,2 @@ +raw +processed diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile new file mode 100644 index 00000000..0f77b504 --- /dev/null +++ b/datasets/egfr/Snakefile @@ 
-0,0 +1,24 @@ +include: "../../cache/Snakefile" + +rule all: + input: + "processed/gold-standard-nodes.txt", + "processed/interactome.tsv", + +produce_fetch_rules({ + "raw/eight-egfr-reference-all.txt": ["EGFR", "eight-egfr-reference-all.txt"], + "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), + "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True), +}) + +rule process_gold_standard: + input: + "raw/HUMAN_9606_idmapping_selected.tsv", + "raw/eight-egfr-reference-all.txt" + output: "processed/gold-standard-nodes.txt" + shell: "uv run scripts/process_gold_standard.py" + +rule process_interactome: + input: "raw/9606.protein.links.txt" + output: "processed/interactome.tsv" + shell: "uv run scripts/process_interactome.py" diff --git a/datasets/egfr/scripts/process_gold_standard.py b/datasets/egfr/scripts/process_gold_standard.py new file mode 100644 index 00000000..e9dd23dc --- /dev/null +++ b/datasets/egfr/scripts/process_gold_standard.py @@ -0,0 +1,23 @@ +import pandas +from pathlib import Path +from tools.mapping.ensembl_uniprot import idmapping_uniprot_mapping + +egfr_directory = Path(__file__).parent.resolve() / '..' + +def main(): + # First, we remove all PSUEDONODES (and any duplicates) + nodes = (egfr_directory / 'raw' / 'eight-egfr-reference-all.txt').read_text().splitlines() + nodes = list(set([node for node in nodes if not node.endswith("_PSEUDONODE")])) + + # Then, we map our UniProt nodes to ENSP. 
+ idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') + idmapping_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') + idmapping_df = idmapping_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl']) + idmapping_df = idmapping_df[~idmapping_df['Ensembl_PRO'].isna()] + nodes = idmapping_df['Ensembl_PRO'].astype(str).to_list() + + (egfr_directory / 'processed').mkdir(exist_ok=True) + (egfr_directory / 'processed' / 'gold-standard-nodes.txt').write_text("\n".join(nodes)) + +if __name__ == "__main__": + main() diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py new file mode 100644 index 00000000..b733e08a --- /dev/null +++ b/datasets/egfr/scripts/process_interactome.py @@ -0,0 +1,14 @@ +from pathlib import Path +import pandas + +egfr_directory = Path(__file__).parent.resolve() / '..' + +def main(): + interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep='\t') + interactome_df['Direction'] = 'U' + + (egfr_directory / 'processed').mkdir(exist_ok=True) + interactome_df.to_csv(egfr_directory / 'processed' / 'interactome.tsv', index=False, header=False, sep='\t') + +if __name__ == "__main__": + main() diff --git a/egfr/egfr-param-tuning.yaml b/egfr/egfr-param-tuning.yaml deleted file mode 100644 index 30cacedb..00000000 --- a/egfr/egfr-param-tuning.yaml +++ /dev/null @@ -1,87 +0,0 @@ -hash_length: 7 -container_framework: docker -unpack_singularity: false -container_registry: - base_url: docker.io - owner: reedcompbio -algorithms: - - name: omicsintegrator2 - params: - include: true - run1: - b: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - g: [2, 3, 4, 5, 6, 7] - w: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - - name: domino - params: - include: true - run1: - module_threshold: [0.001, 0.01, 0.02] - slice_threshold: [0.001, 0.1, 0.3, 0.9, 1] - - name: mincostflow - params: - include: true - run1: - 
capacity: [1, 5, 10, 15] - flow: [6, 8, 20, 50, 60, 70, 80, 90, 150] - - name: pathlinker - params: - include: true - run1: - k: [10, 20, 30, 40, 50, 60, 100, 200, 500] - - name: allpairs - params: - include: true - - name: meo - params: - include: true - run1: - local_search: ['No'] - max_path_length: [2] - rand_restarts: [10] - - name: omicsintegrator1 - params: - include: true - run1: - b: [0.01, 0.55, 2, 5, 10] - d: [10, 20, 30, 40] - g: [0.0001, 0.001] - mu: [0.001, 0.005, 0.008, 0.02, 0.03] - r: [0.01, 0.1, 1] - w: [0.001, 0.1, 0.5, 2, 8] -datasets: - - label: tps_egfr - node_files: - - tps-egfr-prizes.txt - edge_files: - - phosphosite-irefindex13.0-uniprot.txt - other_files: [] - data_dir: input -gold_standards: - - label: gs_egfr - node_files: - - gs-egfr.txt - data_dir: input - dataset_labels: - - tps_egfr -reconstruction_settings: - locations: - reconstruction_dir: output/tps_egfr - run: true -analysis: - summary: - include: true - graphspace: - include: false - cytoscape: - include: false - ml: - include: true - aggregate_per_algorithm: true - components: 4 - labels: false - linkage: ward - metric: euclidean - evaluation: - include: false - aggregate_per_algorithm: false diff --git a/pyproject.toml b/pyproject.toml index 9a071ecd..4df36774 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,3 +25,7 @@ package = true [tool.setuptools.packages] find = {namespaces = false} + +[build-system] +requires = ["setuptools", "wheel", "pip"] +build-backend = "setuptools.build_meta" diff --git a/tools/mapping/__init__.py b/tools/mapping/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/mapping/ensembl_uniprot.py b/tools/mapping/ensembl_uniprot.py new file mode 100644 index 00000000..73b548d9 --- /dev/null +++ b/tools/mapping/ensembl_uniprot.py @@ -0,0 +1,57 @@ +import pandas +import os + +""" +Utilities for mapping Ensembl and UniProt. 
+ +For example, +```py +idmapping_uniprot_ensembl = idmapping_uniprot_mapping(path / "HUMAN_9606_idmapping_selected.tsv") +``` + +then you can use the `idmapping_as_ensg_uniprot_mapping` or `idmapping_as_ensp_uniprot_mapping` to restrict the mapping to specifically +ENSG or ENSP. +""" + +def handle_ensembl_list( + idmapping_df: pandas.DataFrame, + column_name: str +) -> pandas.DataFrame: + idmapping_df = idmapping_df[idmapping_df[column_name].notnull()] + # Handle our ;-delimited list + idmapping_df[column_name] = idmapping_df[column_name].str.split("; ") + idmapping_df = idmapping_df.explode(column_name) + # Drop isoforms + idmapping_df[column_name] = idmapping_df[column_name].str.split(".").str[0] + idmapping_df = idmapping_df.reset_index(drop=True) + return idmapping_df + +def idmapping_uniprot_mapping( + path: str | os.PathLike + ) -> pandas.DataFrame: + """ + Gets the UniProt mapping file (`*_idmapping_selected`) as a dataframe with columns + UniProtKB-AC: High-quality UniProt IDs + Ensembl: ENSG + Ensembl_PRO: ENSG (Ensembl Protein IDs) + """ + # The very powerful UniProt-provided mapping file: its Ensembl mappings are a semicolon-delimeted list of Emsembl IDs containing + # attached isoforms (and not all UniProtKB-AC identifiers have those!) so we'll need to do some extra post-processing. + # This is `*_idmapping_selected`. + idmapping_selected_df = pandas.read_csv( + path, + header=None, + # See directory.py for the README associated with this mapping file. 
+ usecols=[0, 1, 18, 20], + names=["UniProtKB-AC", "UniProtKB-ID", "Ensembl", "Ensembl_PRO"], + sep="\t", + ) + idmapping_selected_df = handle_ensembl_list(idmapping_selected_df, "Ensembl") + idmapping_selected_df = handle_ensembl_list(idmapping_selected_df, "Ensembl_PRO") + return idmapping_selected_df + +def idmapping_as_ensg_uniprot_mapping(uniprot_mapping: pandas.DataFrame): + return uniprot_mapping.drop(columns=["Ensembl_PRO"]) + +def idmapping_as_ensp_uniprot_mapping(uniprot_mapping: pandas.DataFrame): + return uniprot_mapping.drop(columns=["Ensembl"]).rename(columns={"Ensembl_PRO": "Ensembl"}) From 8e1589b5421bb47d8c8e10857457e1659546b432 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 19 Feb 2026 00:25:29 +0000 Subject: [PATCH 02/19] feat: split egfr data --- cache/directory.py | 7 +++- configs/dmmm.yaml | 18 +++++++++++ datasets/egfr/Snakefile | 31 +++++++++++++++--- datasets/egfr/scripts/map_ensembl.py | 32 +++++++++++++++++++ .../egfr/scripts/process_gold_standard.py | 11 +------ datasets/egfr/scripts/process_interactome.py | 2 +- datasets/egfr/scripts/process_prizes.py | 15 +++++++++ run_snakemake.sh | 1 + 8 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 datasets/egfr/scripts/map_ensembl.py create mode 100644 datasets/egfr/scripts/process_prizes.py diff --git a/cache/directory.py b/cache/directory.py index b94594b1..23f6d4f4 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -80,7 +80,7 @@ def download(self, output: str | PathLike): "9606": { "9606.protein.links.txt.gz": CacheItem( name="STRING 9606 protein links", - cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj", + cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", ), "9606.protein.aliases.txt.gz": CacheItem( @@ -219,6 +219,11 @@ def download(self, output: str | PathLike): name="EGFR Gold Standard Reference", 
online="https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt", cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" + ), + "egfr-prizes.txt": CacheItem( + name="EGFR prizes", + online="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt", + cached="https://drive.google.com/file/d/1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj/view?usp=sharing" ) }, } diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index 8d4c9c0a..7c6e622d 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -77,6 +77,16 @@ datasets: edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] other_files: [] + - label: dmmmegfr_string + data_dir: datasets/egfr + edge_files: ["processed/interactome.tsv"] + node_files: ["processed/prizes-uniprot.txt"] + other_files: [] + - label: dmmmegfr_irefindex + data_dir: datasets/egfr + edge_files: ["processed/phosphosite-irefindex13.0-uniprot.txt"] + node_files: ["processed/prizes.txt"] + other_files: [] gold_standards: - label: gs0 node_files: ['GS_files/Alopecia_areata_GS.txt'] @@ -90,3 +100,11 @@ gold_standards: node_files: ["processed/FADU_gold_standard.txt"] data_dir: datasets/depmap dataset_labels: ["dmmmdepmap_cellline_fadu"] + - label: gs_egfr_string + node_files: ["processed/gold-standard-nodes.txt"] + data_dir: datasets/egfr + dataset_labels: ["dmmmegfr_string"] + - label: gs_egfr_irefindex + node_files: ["processed/gold-standard-nodes-uniprot.txt"] + data_dir: datasets/egfr + dataset_labels: ["dmmmegfr_irefindex"] diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile index 0f77b504..756c3d2e 100644 --- a/datasets/egfr/Snakefile +++ b/datasets/egfr/Snakefile @@ -2,23 +2,46 @@ include: "../../cache/Snakefile" rule all: input: + # Our UniProt based files over the older iRefIndex interactome + "processed/gold-standard-nodes-uniprot.txt", + 
"processed/prizes-uniprot.txt", + "processed/phosphosite-irefindex13.0-uniprot.txt", + + # Our Ensembl protein based files over the STRING interactome "processed/gold-standard-nodes.txt", + "processed/prizes.txt", "processed/interactome.tsv", produce_fetch_rules({ "raw/eight-egfr-reference-all.txt": ["EGFR", "eight-egfr-reference-all.txt"], + "raw/egfr-prizes.txt": ["EGFR", "egfr-prizes.txt"], "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True), + "processed/phosphosite-irefindex13.0-uniprot.txt": ["iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"] }) rule process_gold_standard: - input: - "raw/HUMAN_9606_idmapping_selected.tsv", - "raw/eight-egfr-reference-all.txt" - output: "processed/gold-standard-nodes.txt" + input: "raw/eight-egfr-reference-all.txt" + output: "processed/gold-standard-nodes-uniprot.txt", shell: "uv run scripts/process_gold_standard.py" rule process_interactome: input: "raw/9606.protein.links.txt" output: "processed/interactome.tsv" shell: "uv run scripts/process_interactome.py" + +rule process_prizes: + input: "raw/egfr-prizes.txt" + output: "processed/prizes-uniprot.txt" + shell: "uv run scripts/process_prizes.py" + +rule map_ensembl: + input: + "raw/HUMAN_9606_idmapping_selected.tsv", + + "processed/prizes-uniprot.txt", + "processed/gold-standard-nodes-uniprot.txt", + output: + "processed/prizes.txt", + "processed/gold-standard-nodes.txt" + shell: "uv run scripts/map_ensembl.py" \ No newline at end of file diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py new file mode 100644 index 00000000..f8396cb1 --- /dev/null +++ b/datasets/egfr/scripts/map_ensembl.py @@ -0,0 +1,32 @@ +import pandas +from pathlib import Path +from tools.mapping.ensembl_uniprot import idmapping_uniprot_mapping + +egfr_directory = 
Path(__file__).parent.resolve() / '..' + +def main(): + # Re-read the uniprot nodes from `process_gold_standard.py` + nodes = (egfr_directory / 'processed' / 'gold-standard-nodes-uniprot.txt').read_text().splitlines() + # and the prizes from `process_prizes.py` + prizes = pandas.read_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', sep='\t') + + # We grab our UniProt <-> ENSP mapping + idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') + + # and map the nodes + idmapping_nodes_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') + idmapping_nodes_df = idmapping_nodes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl']) + idmapping_nodes_df = idmapping_nodes_df[~idmapping_nodes_df['Ensembl_PRO'].isna()] + nodes = idmapping_nodes_df['Ensembl_PRO'].astype(str).to_list() + (egfr_directory / 'processed' / 'gold-standard-nodes.txt').write_text("\n".join(nodes)) + + # and the prizes + idmapping_prizes_df = prizes.merge(idmapping_df, left_on='NODEID', right_on="UniProtKB-ID", how='inner') + idmapping_prizes_df = idmapping_prizes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl', 'NODEID']) + idmapping_prizes_df = idmapping_prizes_df[~idmapping_prizes_df['Ensembl_PRO'].isna()] + idmapping_prizes_df = idmapping_prizes_df.rename(columns={'Ensembl_PRO': 'NODEID'}) + idmapping_prizes_df = idmapping_prizes_df[["NODEID", "prize"]] + idmapping_prizes_df.to_csv(egfr_directory / 'processed' / 'prizes.txt', sep='\t', index=False) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/datasets/egfr/scripts/process_gold_standard.py b/datasets/egfr/scripts/process_gold_standard.py index e9dd23dc..6f5c8997 100644 --- a/datasets/egfr/scripts/process_gold_standard.py +++ b/datasets/egfr/scripts/process_gold_standard.py @@ -1,6 +1,4 @@ -import pandas from pathlib import Path -from tools.mapping.ensembl_uniprot import 
idmapping_uniprot_mapping egfr_directory = Path(__file__).parent.resolve() / '..' @@ -9,15 +7,8 @@ def main(): nodes = (egfr_directory / 'raw' / 'eight-egfr-reference-all.txt').read_text().splitlines() nodes = list(set([node for node in nodes if not node.endswith("_PSEUDONODE")])) - # Then, we map our UniProt nodes to ENSP. - idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') - idmapping_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') - idmapping_df = idmapping_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl']) - idmapping_df = idmapping_df[~idmapping_df['Ensembl_PRO'].isna()] - nodes = idmapping_df['Ensembl_PRO'].astype(str).to_list() - (egfr_directory / 'processed').mkdir(exist_ok=True) - (egfr_directory / 'processed' / 'gold-standard-nodes.txt').write_text("\n".join(nodes)) + (egfr_directory / 'processed' / 'gold-standard-nodes-uniprot.txt').write_text("\n".join(nodes)) if __name__ == "__main__": main() diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py index b733e08a..32c6b8bd 100644 --- a/datasets/egfr/scripts/process_interactome.py +++ b/datasets/egfr/scripts/process_interactome.py @@ -6,7 +6,7 @@ def main(): interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep='\t') interactome_df['Direction'] = 'U' - + (egfr_directory / 'processed').mkdir(exist_ok=True) interactome_df.to_csv(egfr_directory / 'processed' / 'interactome.tsv', index=False, header=False, sep='\t') diff --git a/datasets/egfr/scripts/process_prizes.py b/datasets/egfr/scripts/process_prizes.py new file mode 100644 index 00000000..f66635cc --- /dev/null +++ b/datasets/egfr/scripts/process_prizes.py @@ -0,0 +1,15 @@ +import pandas +from pathlib import Path + +egfr_directory = Path(__file__).parent.resolve() / '..' 
+ +def main(): + prizes = pandas.read_csv( + egfr_directory / 'raw' / 'egfr-prizes.txt', sep='\t', + header=None, names=['NODEID', 'prize'] + ) + prizes['active'] = 'True' + prizes.to_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', index=False, sep='\t') + +if __name__ == "__main__": + main() diff --git a/run_snakemake.sh b/run_snakemake.sh index 24305244..137fff2f 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -18,6 +18,7 @@ main() { uv run snakemake --cores 1 -d datasets/diseases -s datasets/diseases/Snakefile uv run snakemake --cores 1 -d datasets/rn-muscle-skeletal -s datasets/rn-muscle-skeletal/Snakefile uv run snakemake --cores 1 -d datasets/depmap -s datasets/depmap/Snakefile + uv run snakemake --cores 1 -d datasets/egfr -s datasets/egfr/Snakefile } main "$@" From d0a9205ea21fd9e50f078d617126848b196ba46e Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 19 Feb 2026 00:26:02 +0000 Subject: [PATCH 03/19] style: fmt --- datasets/egfr/scripts/map_ensembl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py index f8396cb1..fd308a72 100644 --- a/datasets/egfr/scripts/map_ensembl.py +++ b/datasets/egfr/scripts/map_ensembl.py @@ -12,7 +12,7 @@ def main(): # We grab our UniProt <-> ENSP mapping idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') - + # and map the nodes idmapping_nodes_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') idmapping_nodes_df = idmapping_nodes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl']) From f68b7861e4da6c224a4ba76f0ad66c863c58f35b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Thu, 19 Feb 2026 03:34:06 +0000 Subject: [PATCH 04/19] fix: strip interactome prefix --- datasets/egfr/scripts/process_interactome.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py index 32c6b8bd..e0ab02c7 100644 --- a/datasets/egfr/scripts/process_interactome.py +++ b/datasets/egfr/scripts/process_interactome.py @@ -4,7 +4,9 @@ egfr_directory = Path(__file__).parent.resolve() / '..' def main(): - interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep='\t') + interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep=' ') + interactome_df['protein1'] = interactome_df['protein1'].astype(str).str.removeprefix("9606.") + interactome_df['protein2'] = interactome_df['protein2'].astype(str).str.removeprefix("9606.") interactome_df['Direction'] = 'U' (egfr_directory / 'processed').mkdir(exist_ok=True) From af335c841ba1d11c41f5a61c7edca23d491703bc Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 19 Feb 2026 04:32:09 +0000 Subject: [PATCH 05/19] chore: add dummy column --- datasets/egfr/scripts/process_prizes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datasets/egfr/scripts/process_prizes.py b/datasets/egfr/scripts/process_prizes.py index f66635cc..33549fed 100644 --- a/datasets/egfr/scripts/process_prizes.py +++ b/datasets/egfr/scripts/process_prizes.py @@ -9,6 +9,8 @@ def main(): header=None, names=['NODEID', 'prize'] ) prizes['active'] = 'True' + prizes['dummy'] = 'True' + prizes.to_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', index=False, sep='\t') if __name__ == "__main__": From 7a3f79a908c67936b2361b7596e486c851308a1b Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Thu, 19 Feb 2026 08:12:47 +0000 Subject: [PATCH 06/19] chore: update spras, docs, unpin commit --- cache/directory.py | 2 +- configs/dmmm.yaml | 119 ++++++++++++------------ configs/pra.yaml | 27 +++--- datasets/egfr/README.md | 18 ++++ datasets/egfr/scripts/map_ensembl.py | 2 +- datasets/egfr/scripts/process_prizes.py | 12 ++- spras | 2 +- 7 files changed, 105 insertions(+), 77 deletions(-) create mode 100644 datasets/egfr/README.md diff --git a/cache/directory.py b/cache/directory.py index 23f6d4f4..14373572 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -217,7 +217,7 @@ def download(self, output: str | PathLike): "EGFR": { "eight-egfr-reference-all.txt": CacheItem( name="EGFR Gold Standard Reference", - online="https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt", + online="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt", cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" ), "egfr-prizes.txt": CacheItem( diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index 7c6e622d..1086e722 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -25,81 +25,82 @@ analysis: # Custom settings algorithms: - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: - b: [2] - w: [.5] - d: [10] - mu: [2] + b: 2 + w: .5 + d: 10 + mu: 2 + # TODO: egfr prefers dummy_mode: ["file"] since we manually specify EGF_HUMAN as one. 
- name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: - b: [4] - g: [0] + b: 4 + g: 0 datasets: # TODO: use old paramaters for datasets # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml - - label: dmmmhiv_060 - node_files: ["processed_prize_060.txt"] - edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] - other_files: [] - data_dir: "datasets/hiv/processed" - - label: dmmmhiv_05 - node_files: ["processed_prize_05.txt"] - edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] - other_files: [] - data_dir: "datasets/hiv/processed" - # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml - - label: dmmmyeast - node_files: ["prizes1_dummies.txt"] - edge_files: ["network1.txt"] - other_files: [] - data_dir: "datasets/yeast-osmotic-stress/processed" - - label: dmmmdiseases_alopecia_areata - data_dir: datasets/diseases - edge_files: - - raw/string_interactome.txt - node_files: - - prize_files/alopecia_areata_prizes.txt - other_files: [] - - label: dmmmdiseases_diabetes_mellitus - data_dir: datasets/diseases - edge_files: - - raw/string_interactome.txt - node_files: - - prize_files/diabetes_mellitus_prizes.txt - other_files: [] - - label: dmmmdepmap_cellline_fadu - data_dir: datasets/depmap - edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] - node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] - other_files: [] + # - label: dmmmhiv_060 + # node_files: ["processed_prize_060.txt"] + # edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] + # other_files: [] + # data_dir: "datasets/hiv/processed" + # - label: dmmmhiv_05 + # node_files: ["processed_prize_05.txt"] + # edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] + # other_files: [] + # data_dir: "datasets/hiv/processed" + # # Yeast: 
https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml + # - label: dmmmyeast + # node_files: ["prizes1_dummies.txt"] + # edge_files: ["network1.txt"] + # other_files: [] + # data_dir: "datasets/yeast-osmotic-stress/processed" + # - label: dmmmdiseases_alopecia_areata + # data_dir: datasets/diseases + # edge_files: + # - raw/string_interactome.txt + # node_files: + # - prize_files/alopecia_areata_prizes.txt + # other_files: [] + # - label: dmmmdiseases_diabetes_mellitus + # data_dir: datasets/diseases + # edge_files: + # - raw/string_interactome.txt + # node_files: + # - prize_files/diabetes_mellitus_prizes.txt + # other_files: [] + # - label: dmmmdepmap_cellline_fadu + # data_dir: datasets/depmap + # edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] + # node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] + # other_files: [] - label: dmmmegfr_string data_dir: datasets/egfr edge_files: ["processed/interactome.tsv"] - node_files: ["processed/prizes-uniprot.txt"] + node_files: ["processed/prizes.txt"] other_files: [] - label: dmmmegfr_irefindex data_dir: datasets/egfr edge_files: ["processed/phosphosite-irefindex13.0-uniprot.txt"] - node_files: ["processed/prizes.txt"] + node_files: ["processed/prizes-uniprot.txt"] other_files: [] gold_standards: - - label: gs0 - node_files: ['GS_files/Alopecia_areata_GS.txt'] - data_dir: "datasets/diseases" - dataset_labels: ["dmmmdiseases_alopecia_areata"] - - label: gs1 - node_files: ['GS_files/Diabetes_mellitus_GS.txt'] - data_dir: "datasets/diseases" - dataset_labels: ["dmmmdiseases_diabetes_mellitus"] - - label: gs_fadu - node_files: ["processed/FADU_gold_standard.txt"] - data_dir: datasets/depmap - dataset_labels: ["dmmmdepmap_cellline_fadu"] + # - label: gs0 + # node_files: ['GS_files/Alopecia_areata_GS.txt'] + # data_dir: "datasets/diseases" + # dataset_labels: ["dmmmdiseases_alopecia_areata"] + # - label: gs1 + # node_files: 
['GS_files/Diabetes_mellitus_GS.txt'] + # data_dir: "datasets/diseases" + # dataset_labels: ["dmmmdiseases_diabetes_mellitus"] + # - label: gs_fadu + # node_files: ["processed/FADU_gold_standard.txt"] + # data_dir: datasets/depmap + # dataset_labels: ["dmmmdepmap_cellline_fadu"] - label: gs_egfr_string node_files: ["processed/gold-standard-nodes.txt"] data_dir: datasets/egfr diff --git a/configs/pra.yaml b/configs/pra.yaml index 76f14a3c..3ad77733 100644 --- a/configs/pra.yaml +++ b/configs/pra.yaml @@ -26,27 +26,26 @@ analysis: # Custom settings algorithms: - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: - b: [2] - w: [.5] - d: [10] - mu: [2] + b: 2 + w: .5 + d: 10 + mu: 2 - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: - b: [4] - g: [0] + b: 4 + g: 0 - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: [10, 20] - name: "allpairs" - params: - include: true + include: true datasets: - label: prarn_muscleskeletal2018 diff --git a/datasets/egfr/README.md b/datasets/egfr/README.md new file mode 100644 index 00000000..30398dee --- /dev/null +++ b/datasets/egfr/README.md @@ -0,0 +1,18 @@ +# EGFR + +EGFR dataset. This dataset does a lot less processing for raw files, and is mainly focused on creating the new STRING-based interactome. + +This data is from [_Synthesizing Signaling Pathways from Temporal Phosphoproteomic Data_](https://doi.org/10.1016/j.celrep.2018.08.085). + +## Overview + +This produces two sets of files: one based on the iRefIndex/PhosphoSite directed interactome of closed-source origin based off of UniProt identifiers, and another one based off of the more updated though undirected STRING interactome. 
+ +## Scripts + +- `process_prizes.py`: produces a `prizes-uniprot.txt` from +[egfr-prizes.txt](https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt), +trimming psuedonodes and manually injecting the `EGF_HUMAN` receptor as a dummy node for OmicsIntegrator1. +- `process_interactome.py`: Produces the STRING `interactome.tsv` file from the STRING links file. Note that the `phosphosite-irefindex13.0-uniprot.txt` is a magic (as in with closed-source origin) directed interactome produced with a combination of the now archived iRefIndex v13 interactome with extra PhosphoSite-provided nodes +- `process_gold_standard.py`: Produces the `gold-standard-nodes-uniprot.txt` file from the [EGFR prize file](https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt) from the above paper. +- `map_ensembl.py`: Maps UniProt identifiers to STRING identifiers for the STRING-based data. diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py index fd308a72..89913a07 100644 --- a/datasets/egfr/scripts/map_ensembl.py +++ b/datasets/egfr/scripts/map_ensembl.py @@ -25,7 +25,7 @@ def main(): idmapping_prizes_df = idmapping_prizes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl', 'NODEID']) idmapping_prizes_df = idmapping_prizes_df[~idmapping_prizes_df['Ensembl_PRO'].isna()] idmapping_prizes_df = idmapping_prizes_df.rename(columns={'Ensembl_PRO': 'NODEID'}) - idmapping_prizes_df = idmapping_prizes_df[["NODEID", "prize"]] + idmapping_prizes_df = idmapping_prizes_df[["NODEID", "prize", "active", "dummy", "source"]] idmapping_prizes_df.to_csv(egfr_directory / 'processed' / 'prizes.txt', sep='\t', index=False) if __name__ == "__main__": diff --git a/datasets/egfr/scripts/process_prizes.py b/datasets/egfr/scripts/process_prizes.py index 33549fed..8763ac58 100644 --- a/datasets/egfr/scripts/process_prizes.py +++ 
b/datasets/egfr/scripts/process_prizes.py @@ -8,8 +8,18 @@ def main(): egfr_directory / 'raw' / 'egfr-prizes.txt', sep='\t', header=None, names=['NODEID', 'prize'] ) + prizes = prizes.loc[~prizes['NODEID'].str.endswith('_PSEUDONODE')] + # TODO: prize: 10 is a magic value. + prizes = pandas.concat( + [prizes, pandas.DataFrame({ + 'NODEID': ['EGF_HUMAN'], + 'prize': [10], + 'dummy': ['True'], + 'source': ['True'] + })], + ignore_index=True + ) prizes['active'] = 'True' - prizes['dummy'] = 'True' prizes.to_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', index=False, sep='\t') diff --git a/spras b/spras index cd01e67e..18f2cf84 160000 --- a/spras +++ b/spras @@ -1 +1 @@ -Subproject commit cd01e67ea24f1817ba469335dfacb875ba2412bb +Subproject commit 18f2cf84cfac034b2962f47434d3f900288b6a97 From 69c6a8646a58a62cbb3ed92a05ca614c75a148dd Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 19 Feb 2026 08:20:07 +0000 Subject: [PATCH 07/19] chore: uncomment dmmm whoops --- configs/dmmm.yaml | 94 +++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index 1086e722..81535a42 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -43,41 +43,41 @@ algorithms: datasets: # TODO: use old paramaters for datasets # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml - # - label: dmmmhiv_060 - # node_files: ["processed_prize_060.txt"] - # edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] - # other_files: [] - # data_dir: "datasets/hiv/processed" - # - label: dmmmhiv_05 - # node_files: ["processed_prize_05.txt"] - # edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] - # other_files: [] - # data_dir: "datasets/hiv/processed" - # # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml - # - 
label: dmmmyeast - # node_files: ["prizes1_dummies.txt"] - # edge_files: ["network1.txt"] - # other_files: [] - # data_dir: "datasets/yeast-osmotic-stress/processed" - # - label: dmmmdiseases_alopecia_areata - # data_dir: datasets/diseases - # edge_files: - # - raw/string_interactome.txt - # node_files: - # - prize_files/alopecia_areata_prizes.txt - # other_files: [] - # - label: dmmmdiseases_diabetes_mellitus - # data_dir: datasets/diseases - # edge_files: - # - raw/string_interactome.txt - # node_files: - # - prize_files/diabetes_mellitus_prizes.txt - # other_files: [] - # - label: dmmmdepmap_cellline_fadu - # data_dir: datasets/depmap - # edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] - # node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] - # other_files: [] + - label: dmmmhiv_060 + node_files: ["processed_prize_060.txt"] + edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] + other_files: [] + data_dir: "datasets/hiv/processed" + - label: dmmmhiv_05 + node_files: ["processed_prize_05.txt"] + edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] + other_files: [] + data_dir: "datasets/hiv/processed" + # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml + - label: dmmmyeast + node_files: ["prizes1_dummies.txt"] + edge_files: ["network1.txt"] + other_files: [] + data_dir: "datasets/yeast-osmotic-stress/processed" + - label: dmmmdiseases_alopecia_areata + data_dir: datasets/diseases + edge_files: + - raw/string_interactome.txt + node_files: + - prize_files/alopecia_areata_prizes.txt + other_files: [] + - label: dmmmdiseases_diabetes_mellitus + data_dir: datasets/diseases + edge_files: + - raw/string_interactome.txt + node_files: + - prize_files/diabetes_mellitus_prizes.txt + other_files: [] + - label: dmmmdepmap_cellline_fadu + data_dir: datasets/depmap + edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] + node_files: 
["processed/FADU_cell_line_prizes_input_nonzero.txt"] + other_files: [] - label: dmmmegfr_string data_dir: datasets/egfr edge_files: ["processed/interactome.tsv"] @@ -89,18 +89,18 @@ datasets: node_files: ["processed/prizes-uniprot.txt"] other_files: [] gold_standards: - # - label: gs0 - # node_files: ['GS_files/Alopecia_areata_GS.txt'] - # data_dir: "datasets/diseases" - # dataset_labels: ["dmmmdiseases_alopecia_areata"] - # - label: gs1 - # node_files: ['GS_files/Diabetes_mellitus_GS.txt'] - # data_dir: "datasets/diseases" - # dataset_labels: ["dmmmdiseases_diabetes_mellitus"] - # - label: gs_fadu - # node_files: ["processed/FADU_gold_standard.txt"] - # data_dir: datasets/depmap - # dataset_labels: ["dmmmdepmap_cellline_fadu"] + - label: gs0 + node_files: ['GS_files/Alopecia_areata_GS.txt'] + data_dir: "datasets/diseases" + dataset_labels: ["dmmmdiseases_alopecia_areata"] + - label: gs1 + node_files: ['GS_files/Diabetes_mellitus_GS.txt'] + data_dir: "datasets/diseases" + dataset_labels: ["dmmmdiseases_diabetes_mellitus"] + - label: gs_fadu + node_files: ["processed/FADU_gold_standard.txt"] + data_dir: datasets/depmap + dataset_labels: ["dmmmdepmap_cellline_fadu"] - label: gs_egfr_string node_files: ["processed/gold-standard-nodes.txt"] data_dir: datasets/egfr From 5483cd4aff7272d6cdb218d06327d276676edf2d Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 20 Feb 2026 09:58:51 +0000 Subject: [PATCH 08/19] fix: do string interactome trimming! No trim.py :/ --- datasets/egfr/scripts/map_ensembl.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py index 89913a07..d8488012 100644 --- a/datasets/egfr/scripts/map_ensembl.py +++ b/datasets/egfr/scripts/map_ensembl.py @@ -5,6 +5,12 @@ egfr_directory = Path(__file__).parent.resolve() / '..' 
def main(): + # We get specifically the STRING nodes, as the mapping from UniProt overeagerly maps + string_nodes = pandas.read_csv( + egfr_directory / 'processed' / 'interactome.tsv', + header=None, sep='\t', names=['Interactor1', 'Interactor2', 'Weight', 'Direction']) + interactor_series = pandas.concat([string_nodes['Interactor1'], string_nodes['Interactor2']], ignore_index=True) + # Re-read the uniprot nodes from `process_gold_standard.py` nodes = (egfr_directory / 'processed' / 'gold-standard-nodes-uniprot.txt').read_text().splitlines() # and the prizes from `process_prizes.py` @@ -12,6 +18,8 @@ def main(): # We grab our UniProt <-> ENSP mapping idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') + # Trim it with the interactor series + idmapping_df = idmapping_df[idmapping_df["Ensembl_PRO"].isin(interactor_series)] # and map the nodes idmapping_nodes_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') From d5096ebf57c0074dcf61e067ad84151dfd1ef560 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 20 Feb 2026 10:08:20 +0000 Subject: [PATCH 09/19] fix: add interactome to map_ensembl input --- datasets/egfr/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile index 756c3d2e..6c4eefbd 100644 --- a/datasets/egfr/Snakefile +++ b/datasets/egfr/Snakefile @@ -38,6 +38,7 @@ rule process_prizes: rule map_ensembl: input: "raw/HUMAN_9606_idmapping_selected.tsv", + "processed/interactome.tsv", "processed/prizes-uniprot.txt", "processed/gold-standard-nodes-uniprot.txt", From 9ef2a3fbb0ce9796181f9d66296b1d7c04ef1335 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Fri, 20 Feb 2026 22:32:47 +0000 Subject: [PATCH 10/19] chore: add Service with proper header handling --- cache/__init__.py | 4 +- cache/directory.py | 114 ++++++++++++++++++++++++--------------------- 2 files changed, 62 insertions(+), 56 deletions(-) diff --git a/cache/__init__.py b/cache/__init__.py index 2f15fe4d..9e48cf44 100644 --- a/cache/__init__.py +++ b/cache/__init__.py @@ -67,9 +67,9 @@ def link(output: str, directive: list[str], uncompress=False): Path(output).unlink(missing_ok=True) - # Re-download if the directive has expired. + # Re-download if the file doesn't exist or the directive has expired. cache_item = get_cache_item(directive) - if has_expired(directive): + if not (artifacts_dir / artifact_name).exists() or has_expired(directive): (artifacts_dir / artifact_name).unlink(missing_ok=True) cache_item.download(artifacts_dir / artifact_name) diff --git a/cache/directory.py b/cache/directory.py index 14373572..459ddabc 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -2,29 +2,47 @@ from typing import Union from os import PathLike from tempfile import NamedTemporaryFile -from typing import Optional -import urllib.request +from typing import Optional, Mapping import filecmp -import urllib.parse -import os from pathlib import Path +import warnings +import requests +import shutil +import urllib.parse +import pydantic import gdown -dir_path = Path(os.path.dirname(os.path.realpath(__file__))) +dir_path = Path(__file__).parent.resolve() -def fetch_biomart_url(xml: str) -> str: +@dataclass +class Service: + url: str + headers: Optional[Mapping[str, str]] = None + + def download(self, output: str | PathLike) -> requests.Response: + """ + Downloads a URL, returning the response (to be used with `with`) and modifying the output path. + """ + # As per https://stackoverflow.com/a/39217788/7589775 to enable download streaming. 
+ with requests.get(self.url, stream=True, headers=self.headers) as response: + response.raw.decode_content = True + with open(output, 'wb') as f: + shutil.copyfileobj(response.raw, f) + return response + + +def fetch_biomart_service(xml: str) -> Service: """ Access BioMart data through the BioMart REST API: https://useast.ensembl.org/info/data/biomart/biomart_restful.html#biomartxml """ ROOT = "http://www.ensembl.org/biomart/martservice?query=" - return ROOT + urllib.parse.quote_plus(xml) + return Service(ROOT + urllib.parse.quote_plus(xml)) -@dataclass -class CacheItem: +class CacheItem(pydantic.BaseModel): """ Class for differentriating between offline and online items in a cache. @@ -35,41 +53,29 @@ class CacheItem: name: str """The display name of the artifact, used for human-printing.""" cached: str - online: str - online_headers: Optional[list[tuple[str, str]]] = None + online: Optional[Service] = None @classmethod + @warnings.deprecated("Pending for removal after the CONTRIBUTING guide is updated.") def cache_only(cls, name: str, cached: str) -> "CacheItem": """Wrapper method to explicitly declare a CacheItem as cached only.""" - return cls(name=name, online=cached, cached="") - - def download_online(self, output: str | PathLike): - # https://stackoverflow.com/a/45313194/7589775: this is to add optional headers to requests. - # We remove the opener at the end by re-installing the default opener. 
- opener = urllib.request.build_opener() - if self.online_headers: - opener.addheaders = self.online_headers - urllib.request.install_opener(opener) - urllib.request.urlretrieve(self.online, output) - urllib.request.install_opener(urllib.request.build_opener()) + return cls(name=name, cached=cached, online=None) def download(self, output: str | PathLike): print(f"Fetching {self.name}...") - print(f"Downloading {self.online}...") - - if self.cached == "": - # From CacheItem.cached_only - # (gdown doesn't take in Paths for the output_file, so we must stringify it here) - gdown.download(self.online, str(output)) - return - - self.download_online(output) with NamedTemporaryFile() as cached_file: print(f"Downloading cache {self.cached}...") gdown.download(self.cached, cached_file) + + if self.online is None: + return + + print(f"Downloading {self.online}...") + self.online.download(output) + print("Checking that downloaded artifact matches with cached artifact...") - filecmp.cmp(output, cached_file.name) + assert filecmp.cmp(output, cached_file.name) CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]] @@ -81,12 +87,12 @@ def download(self, output: str | PathLike): "9606.protein.links.txt.gz": CacheItem( name="STRING 9606 protein links", cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", - online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", + online=Service("http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz"), ), "9606.protein.aliases.txt.gz": CacheItem( name="STRING 9606 protein aliases", cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY", - online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz", + online=Service("https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz"), ), } }, @@ -98,19 +104,19 @@ def download(self, output: str | 
PathLike): "SwissProt_9606.tsv": CacheItem( name="UniProt 9606 SwissProt genes", cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk", - online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29", + online=Service("https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"), ), # idmapping FTP files. See the associated README: # https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README "HUMAN_9606_idmapping_selected.tab.gz": CacheItem( name="UniProt 9606 ID external database mapping", cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX", - online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz", + online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"), ), "HUMAN_9606_idmapping.dat.gz": CacheItem( name="UniProt 9606 internal id mapping", cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O", - online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz", + online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"), ), } }, @@ -120,56 +126,56 @@ def download(self, output: str | PathLike): "tiga_gene-trait_stats.tsv": CacheItem( name="TIGA data", cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK", - online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv", + online=Service("https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv"), ), 
"HumanDO.tsv": CacheItem( name="Disease ontology data", cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi", - online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv", + online=Service("https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv"), ), "human_disease_textmining_filtered.tsv": CacheItem( name="DISEASES textmining channel", cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D", - online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv", + online=Service("https://download.jensenlab.org/human_disease_textmining_filtered.tsv"), ), "human_disease_knowledge_filtered.tsv": CacheItem( name="DISEASES knowledge channel", cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld", - online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv", + online=Service("https://download.jensenlab.org/human_disease_knowledge_filtered.tsv"), ), }, "BioMart": { "ensg-ensp.tsv": CacheItem( name="BioMart ENSG <-> ENSP mapping", cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL", - online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()), + online=fetch_biomart_service((dir_path / "biomart" / "ensg-ensp.xml").read_text()), ) }, "DepMap": { "OmicsProfiles.csv": CacheItem( name="DepMap omics metadata", cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads", + 
online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"), ), "CRISPRGeneDependency.csv": CacheItem( name="DepMap gene dependency probability estimates", cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"), ), "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem( name="DepMap genotyped matrix", cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"), ), "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem( name="DepMap model-level TPMs", cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads", + 
online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"), ), "OmicsCNGeneWGS.csv": CacheItem( name="DepMap gene-level copy number data", cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"), ), }, "iRefIndex": { @@ -190,39 +196,39 @@ def download(self, output: str | PathLike): # The following files are from https://github.com/gitter-lab/osmotic-stress "prizes.txt": CacheItem( name="Osmotic Stress Prizes", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt"), cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg" ), "ChasmanNetwork-DirUndir.txt": CacheItem( name="Network Input", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt"), cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH" ), "dummy.txt": CacheItem( name="Dummy Nodes File", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt", + 
online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt"), cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU" ), "_edgeFreq.eda ": CacheItem( name="Case Study Omics Integrator Edge Frequencies", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda"), cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR" ), "goldStandardUnionDetailed.txt": CacheItem( name="Gold Standard Reference Pathways", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt"), cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T" ), }, "EGFR": { "eight-egfr-reference-all.txt": CacheItem( name="EGFR Gold Standard Reference", - online="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt"), cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" ), "egfr-prizes.txt": CacheItem( name="EGFR prizes", - online="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt"), cached="https://drive.google.com/file/d/1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj/view?usp=sharing" ) }, From e4a9d35137bc6253d22562305a8c78398a3979fc Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Sun, 22 Feb 2026 00:44:51 +0000
Subject: [PATCH 11/19] feat(cache): pinned & unpinned files

---
 cache/.gitignore   |   1 +
 cache/README.md    |  28 +++++++++-
 cache/directory.py | 131 ++++++++++++++++++++++++++++++---------------
 pyproject.toml     |   1 +
 spras              |   2 +-
 uv.lock            |  26 ++++++++-
 6 files changed, 141 insertions(+), 48 deletions(-)

diff --git a/cache/.gitignore b/cache/.gitignore
index de153db3..9554b8c6 100644
--- a/cache/.gitignore
+++ b/cache/.gitignore
@@ -1 +1,2 @@
 artifacts
+logs
diff --git a/cache/README.md b/cache/README.md
index 4aea94bc..654aa829 100644
--- a/cache/README.md
+++ b/cache/README.md
@@ -1,7 +1,31 @@
-# cache
+# Cache
 
-Handles artifact fetching and cache. This folder has:
+Handles artifact fetching and caching. The point of this is to prevent any kind of
+data rot (for as long as SPRAS is maintained), and to ensure that continuous benchmarking uses the latest available data.
+During benchmarking runs, data is fetched from all provided URLs in `directory.py`, where we get the most current version of data,
+and compare it to our cached data to check if the data has changed at all.
+
+All entries are provided with this template:
+
+```py
+"file-name.ext": CacheItem(
+    name="Short File Description",
+    cached="https://drive.google.com/uc?id=...",
+    # Either-or
+    pinned=Service("..."),
+    unpinned=Service("..."),
+),
+```
+
+When a file is requested, `cached`, `pinned`, and `unpinned` are all downloaded:
+- If the URLs linking to `pinned` and `unpinned` do not succeed (i.e. do not return a 2XX status code), we fail.
+- If the URL linking to `pinned` does not match `cached`, we fail.
+- If the URL linking to `unpinned` does not match `cached`, we warn that the data needs updating.
+
+## Layout
+
+This folder has:
 - `Snakefile` which only contains a function used for producing fetching rules.
 - `directory.py`, the actual location of file URLs and their cached counterparts.
- `cli.py`, a utility for manually fetching specific URLs from `directory.py`.
diff --git a/cache/directory.py b/cache/directory.py
index 459ddabc..71f0ffa8 100644
--- a/cache/directory.py
+++ b/cache/directory.py
@@ -10,12 +10,15 @@
 import shutil
 import urllib.parse
 
-import pydantic
 import gdown
-
+from loguru import logger
 
 dir_path = Path(__file__).parent.resolve()
 
+# Our cache emits warnings for files with unpinned versions that don't match the cache.
+(dir_path / 'logs').mkdir(exist_ok=True)
+logger.add(dir_path / 'logs' / "cache.log")
+
 @dataclass
 class Service:
     url: str
@@ -31,7 +34,12 @@ def download(self, output: str | PathLike) -> requests.Response:
         with open(output, 'wb') as f:
             shutil.copyfileobj(response.raw, f)
         return response
-
+
+    @staticmethod
+    def coerce(obj: 'Service | str') -> 'Service':
+        # TODO: This could also be replaced by coercing str to Service in CacheItem via pydantic.
+        if isinstance(obj, str): return Service(url=obj)
+        else: return obj
 
 def fetch_biomart_service(xml: str) -> Service:
     """
@@ -41,49 +49,70 @@ def fetch_biomart_service(xml: str) -> Service:
     ROOT = "http://www.ensembl.org/biomart/martservice?query="
     return Service(ROOT + urllib.parse.quote_plus(xml))
 
-
-class CacheItem(pydantic.BaseModel):
+@dataclass
+class CacheItem:
     """
-    Class for differentriating between offline and online items in a cache.
-
-    NOTE: If cached is "", we assume that online is a Google Drive URL (for cases where there is no
-    remaining online data source.)
+    Class for differentiating between different ways of fetching data.
+    As mentioned in the ./README.md, `cached` is always needed, and we differentiate between service outage (`pinned`)
+    and data needing updates (`unpinned`). There is no need to specify both keys at once, but the choice does matter
+    for how errors are presented during benchmarking runs.
""" name: str """The display name of the artifact, used for human-printing.""" + cached: str - online: Optional[Service] = None + """ + The URL of the cached file, which is currently a Google Drive URL. + """ + + pinned: Optional[Service | str] = None + """ + The Service (URL + headers) of the file, which is the 'pinned' file. + By a pinned file, we say that the file has a dedicated version, and should not change. + If this is None, we go for the `unpinned` file or `cached` if `unpinned` is None. + """ + + unpinned: Optional[Service | str] = None + """ + Analogously to `pinned`, this is a Service (URL + headers) which is 'unpinned,' + or lacks a dedicated version. When `pinned` matches `cached` but `unpinned` doesn't match `pinned`, + we say that the file has a new version. + + If `pinned` is None and `unpinned` doesn't match `cached`, we warn instead of erroring. + + We will still error if the status code is not 2XX (a successful request). + """ @classmethod @warnings.deprecated("Pending for removal after the CONTRIBUTING guide is updated.") def cache_only(cls, name: str, cached: str) -> "CacheItem": """Wrapper method to explicitly declare a CacheItem as cached only.""" - return cls(name=name, cached=cached, online=None) + return cls(name=name, cached=cached) def download(self, output: str | PathLike): - print(f"Fetching {self.name}...") + logger.info(f"Fetching {self.name}...") with NamedTemporaryFile() as cached_file: - print(f"Downloading cache {self.cached}...") + logger.info(f"Downloading cache {self.cached}...") gdown.download(self.cached, cached_file) - if self.online is None: - return + if self.pinned is not None: + logger.info(f"Downloading pinned URL {self.pinned}...") + Service.coerce(self.pinned).download(output) - print(f"Downloading {self.online}...") - self.online.download(output) + logger.info("Checking that the downloaded pinned artifact matches with cached artifact...") + assert filecmp.cmp(output, cached_file.name) + + if self.unpinned is not 
None: + logger.info(f"Downloading unpinned URL {self.unpinned}...") + with NamedTemporaryFile() as unpinned_file: + Service.coerce(self.unpinned).download(unpinned_file.name) - print("Checking that downloaded artifact matches with cached artifact...") - assert filecmp.cmp(output, cached_file.name) + logger.info("Checking that the downloaded unpinned artifact matches with cached artifact...") + if not filecmp.cmp(unpinned_file.name, cached_file.name): + # This gets saved to a file. Search for `logger.add` for more info. + logger.warning(f"Unpinned file {self.unpinned} for {self.name} does not match cache - this source should be updated!") CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]] @@ -87,12 +124,12 @@ def download(self, output: str | PathLike): "9606.protein.links.txt.gz": CacheItem( name="STRING 9606 protein links", cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", - online=Service("http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz"), + pinned="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", ), "9606.protein.aliases.txt.gz": CacheItem( name="STRING 9606 protein aliases", cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY", - online=Service("https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz"), + pinned="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz", ), } }, @@ -104,19 +141,19 @@ def download(self, output: str | PathLike): "SwissProt_9606.tsv": CacheItem( name="UniProt 9606 SwissProt genes", cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk", - online=Service("https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"), + 
unpinned="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29", ), # idmapping FTP files. See the associated README: # https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README "HUMAN_9606_idmapping_selected.tab.gz": CacheItem( name="UniProt 9606 ID external database mapping", cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX", - online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"), + unpinned="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz", ), "HUMAN_9606_idmapping.dat.gz": CacheItem( name="UniProt 9606 internal id mapping", cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O", - online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"), + unpinned="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz", ), } }, @@ -126,109 +163,115 @@ def download(self, output: str | PathLike): "tiga_gene-trait_stats.tsv": CacheItem( name="TIGA data", cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK", - online=Service("https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv"), + pinned="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv", ), "HumanDO.tsv": CacheItem( name="Disease ontology data", cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi", - online=Service("https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv"), + # DiseaseOntology is a decently updating repository! 
+ unpinned="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/refs/heads/main/DOreports/HumanDO.tsv", ), "human_disease_textmining_filtered.tsv": CacheItem( name="DISEASES textmining channel", cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D", - online=Service("https://download.jensenlab.org/human_disease_textmining_filtered.tsv"), + unpinned="https://download.jensenlab.org/human_disease_textmining_filtered.tsv", ), "human_disease_knowledge_filtered.tsv": CacheItem( name="DISEASES knowledge channel", cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld", - online=Service("https://download.jensenlab.org/human_disease_knowledge_filtered.tsv"), + unpinned="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv", ), }, "BioMart": { "ensg-ensp.tsv": CacheItem( name="BioMart ENSG <-> ENSP mapping", cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL", - online=fetch_biomart_service((dir_path / "biomart" / "ensg-ensp.xml").read_text()), + unpinned=fetch_biomart_service((dir_path / "biomart" / "ensg-ensp.xml").read_text()), ) }, "DepMap": { "OmicsProfiles.csv": CacheItem( name="DepMap omics metadata", cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL", - online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads", ), "CRISPRGeneDependency.csv": CacheItem( name="DepMap gene dependency probability estimates", cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz", - 
online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads", ), "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem( name="DepMap genotyped matrix", cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh", - online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads", ), "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem( name="DepMap model-level TPMs", cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP", - online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads", ), "OmicsCNGeneWGS.csv": CacheItem( name="DepMap gene-level copy number data", cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub", - 
online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads", ), }, "iRefIndex": { # This can also be obtained from the SPRAS repo # (https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt). # iRefIndex has been down for quite some time, so this is only from the cache. - "phosphosite-irefindex13.0-uniprot.txt": CacheItem.cache_only( + "phosphosite-irefindex13.0-uniprot.txt": CacheItem( name="iRefIndex v13.0 UniProt interactome", cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo" ) }, "OsmoticStress": { - "yeast_pcsf_network.sif": CacheItem.cache_only( + "yeast_pcsf_network.sif": CacheItem( # In the paper https://doi.org/10.1016/j.celrep.2018.08.085 name="Case Study Edge Results, from Supplementary Data 3", cached="https://drive.google.com/uc?id=1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h" ), - # The following files are from https://github.com/gitter-lab/osmotic-stress + # The following files are from https://github.com/gitter-lab/osmotic-stress. + # While the following files do point to the repository's main branch, + # they aren't expected to actually change. 
"prizes.txt": CacheItem( name="Osmotic Stress Prizes", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt", cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg" ), "ChasmanNetwork-DirUndir.txt": CacheItem( name="Network Input", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt", cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH" ), "dummy.txt": CacheItem( name="Dummy Nodes File", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt", cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU" ), "_edgeFreq.eda ": CacheItem( name="Case Study Omics Integrator Edge Frequencies", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda", cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR" ), "goldStandardUnionDetailed.txt": CacheItem( name="Gold Standard Reference Pathways", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt", cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T" 
), }, "EGFR": { + # The following files are from https://github.com/gitter-lab/tps. + # While the following files do point to the repository's main branch, + # they aren't expected to actually change. "eight-egfr-reference-all.txt": CacheItem( name="EGFR Gold Standard Reference", - online=Service("https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt", cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" ), "egfr-prizes.txt": CacheItem( name="EGFR prizes", - online=Service("https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt", cached="https://drive.google.com/file/d/1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj/view?usp=sharing" ) }, diff --git a/pyproject.toml b/pyproject.toml index 4df36774..b2112572 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ readme = "README.md" requires-python = ">=3.13" dependencies = [ "gdown>=5.2.0", + "loguru>=0.7.3", "more-itertools>=10.7.0", "networkx>=3.6.1", "pandas>=2.3.0", diff --git a/spras b/spras index 18f2cf84..479842d6 160000 --- a/spras +++ b/spras @@ -1 +1 @@ -Subproject commit 18f2cf84cfac034b2962f47434d3f900288b6a97 +Subproject commit 479842d6954f5df448f628259588c5a038e8efef diff --git a/uv.lock b/uv.lock index a00522cf..f3f25d5d 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.13" [[package]] @@ -310,6 +310,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" 
}, ] +[[package]] +name = "loguru" +version = "0.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "win32-setctime", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, +] + [[package]] name = "markupsafe" version = "3.0.2" @@ -846,6 +859,7 @@ version = "0.1.0" source = { editable = "." } dependencies = [ { name = "gdown" }, + { name = "loguru" }, { name = "more-itertools" }, { name = "networkx" }, { name = "pandas" }, @@ -860,6 +874,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "gdown", specifier = ">=5.2.0" }, + { name = "loguru", specifier = ">=0.7.3" }, { name = "more-itertools", specifier = ">=10.7.0" }, { name = "networkx", specifier = ">=3.6.1" }, { name = "pandas", specifier = ">=2.3.0" }, @@ -951,6 +966,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/06/04c8e804f813cf972e3262f3f8584c232de64f0cde9f703b46cf53a45090/virtualenv-20.34.0-py3-none-any.whl", hash = "sha256:341f5afa7eee943e4984a9207c025feedd768baff6753cd660c857ceb3e36026", size = 5983279, upload-time = "2025-08-13T14:24:05.111Z" }, ] +[[package]] +name = "win32-setctime" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = 
"sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, +] + [[package]] name = "wrapt" version = "1.17.3" From 2d7f2b1d39e6df9d03ee295124723e79beb29bc5 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sun, 22 Feb 2026 02:47:16 +0000 Subject: [PATCH 12/19] feat(cache): download_against_cache debugging --- cache/cli.py | 4 +- cache/directory.py | 94 ++++++++++++++------ datasets/diseases/Snakefile | 4 +- datasets/diseases/scripts/files.py | 2 +- datasets/egfr/Snakefile | 4 +- datasets/egfr/scripts/process_interactome.py | 4 +- 6 files changed, 76 insertions(+), 36 deletions(-) diff --git a/cache/cli.py b/cache/cli.py index ad82fb67..2e8d8201 100644 --- a/cache/cli.py +++ b/cache/cli.py @@ -3,7 +3,7 @@ This may be expanded in the future, so only depend on this file as a debugging utility. -For example, `python cache/cli.py KEGG/ko03250.xml ko03250.xm` allows running the KEGG query +For example, `python cache/cli.py KEGG/ko03250.xml ko03250.xml` allows running the KEGG query for ko03250.xml, which can not be normally accessed automatically in the browser. """ @@ -23,7 +23,7 @@ def main(): args = parse_args() cache_item = get_cache_item(args.path.split("/")) - cache_item.download_online(args.output) + cache_item.download(args.output) if __name__ == "__main__": main() diff --git a/cache/directory.py b/cache/directory.py index 71f0ffa8..e5c6c4fa 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -17,7 +17,10 @@ # Our cache emits warnings for files with unpinned versions that don't match the cache. 
(dir_path / 'logs').mkdir(exist_ok=True) -logger.add(dir_path / 'logs' / "cache.log") +logger.add(dir_path / 'logs' / "cache.log", level="WARNING") + +class DownloadFileCheckException(RuntimeError): + """See Service#download_against_cache for some motivation for this custom error""" @dataclass class Service: @@ -34,12 +37,51 @@ def download(self, output: str | PathLike) -> requests.Response: with open(output, 'wb') as f: shutil.copyfileobj(response.raw, f) return response - + + # NOTE: this is slightly yucky code deduplication. The only intended values of `downloaded_file_type` are `pinned` and `unpinned`. + def download_against_cache( + self, + cache: Path, + downloaded_file_type: str, + move_output: bool + ): + """ + Downloads `this` Service and checks it against the provided `cache` at path. In logs, + the file will be referred to as `downloaded_file_type`. + + @param move_output: Whether or not output should be irrecoverably moved instead of just copied. + """ + logger.info(f"Downloading {downloaded_file_type} file {self.url} to check against with artifact at {cache}...") + downloaded_file_path = Path(NamedTemporaryFile(delete=False).name) + + self.download(downloaded_file_path) + logger.info(f"Checking that the {downloaded_file_type} artifact {downloaded_file_path} matches with cached artifact at {cache}...") + + if not filecmp.cmp(cache, downloaded_file_path): + # This entire if-branch is debug shenanigans: we want to be able to easily compare our current cached file to the online file, + # especially since some `Service`s have special errors that can make the request hard to compare in the browser. + + debug_file_path = Path(NamedTemporaryFile(prefix="spras-benchmarking-debug-artifact", delete=False).name) + # We use shutil over Path#rename since temporary directories can be mounted to a different file system. 
+ if move_output: + shutil.move(cache, debug_file_path) + else: + shutil.copy(cache, debug_file_path) + # We use a custom error type to prevent any overlap with RuntimeError. I am not sure if there is any. + raise DownloadFileCheckException(f"The {downloaded_file_type} file {downloaded_file_path} and " + \ + f"cached file originally at {cache} do not match! " + \ + f"Compare the pinned {downloaded_file_path} and the cached {debug_file_path}.") + else: + # Since we don't clean up downloaded_file_path for the above branch's debugging, + # we need to clean it up here. + downloaded_file_path.unlink() + @staticmethod def coerce(obj: 'Service | str') -> 'Service': # TODO: This could also be replaced by coercing str to Service in CacheItem via pydantic. - if isinstance(obj, str): return Service(url=obj) - else: return obj + if isinstance(obj, str): + return Service(url=obj) + return obj def fetch_biomart_service(xml: str) -> Service: """ @@ -93,38 +135,29 @@ def cache_only(cls, name: str, cached: str) -> "CacheItem": def download(self, output: str | PathLike): logger.info(f"Fetching {self.name}...") - with NamedTemporaryFile() as cached_file: - logger.info(f"Downloading cache {self.cached}...") - gdown.download(self.cached, cached_file) - - if self.pinned is not None: - logger.info(f"Downloading pinned URL {self.pinned}...") - Service.coerce(self.pinned).download(output) - - logger.info("Checking that the downloaded pinned artifact matches with cached artifact...") - assert filecmp.cmp(output, cached_file.name) - - if self.unpinned is not None: - logger.info(f"Downloading unpinned URL {self.unpinned}...") - with NamedTemporaryFile() as unpinned_file: - Service.coerce(self.unpinned).download(unpinned_file.name) - - logger.info("Checking that the downloaded unpinned artifact matches with cached artifact...") - if not filecmp.cmp(unpinned_file.name, cached_file.name): - # This gets saved to a file. Search for `logger.add` for more info. 
- logger.warning(f"Unpinned file {self.unpinned} for {self.name} does not match cache - this source should be updated!") + logger.info(f"Downloading cache {self.cached} to {output}...") + gdown.download(self.cached, str(output)) # gdown doesn't have a type signature, but it expects a string :/ + if self.pinned is not None: + Service.coerce(self.pinned).download_against_cache(cache=Path(output), downloaded_file_type="pinned", move_output=True) + if self.unpinned is not None: + # Normally, download_against_cache raises a DownloadFileCheckException: we catch it and warn instead if that happens. + try: + Service.coerce(self.unpinned).download_against_cache(cache=Path(output), downloaded_file_type="unpinned", move_output=False) + except DownloadFileCheckException as err: + logger.warning(err) + # TODO: yikes! same with self.unpinned CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]] # An *unversioned* directory list. directory: CacheDirectory = { "STRING": { "9606": { - "9606.protein.links.txt.gz": CacheItem( - name="STRING 9606 protein links", + "9606.protein.links.full.txt.gz": CacheItem( + name="STRING 9606 full protein links", cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", - pinned="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", + pinned="http://stringdb-downloads.org/download/protein.links.full.v12.0/9606.protein.links.full.v12.0.txt.gz", ), "9606.protein.aliases.txt.gz": CacheItem( name="STRING 9606 protein aliases", @@ -272,7 +305,7 @@ def download(self, output: str | PathLike): "egfr-prizes.txt": CacheItem( name="EGFR prizes", pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt", - cached="https://drive.google.com/file/d/1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj/view?usp=sharing" + cached="https://drive.google.com/uc?id=1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj" ) }, } @@ -291,4 +324,9 @@ def get_cache_item(path: list[str]) -> CacheItem: if not 
isinstance(current_item, CacheItem): raise ValueError(f"Path {path} doesn't lead to a cache item") + # Google Drive validation. TODO: remove if move to OSDF. + if "uc?id=" not in current_item.cached or "/view?usp=sharing" in current_item.cached: + raise RuntimeError("Make sure your Google Drive URLs are in https://drive.google.com/uc?id=... format " + \ + "with no /view?usp=sharing at the end. See CONTRIBUTING.md for more info.") + return current_item diff --git a/datasets/diseases/Snakefile b/datasets/diseases/Snakefile index 0455b57a..93de50d5 100644 --- a/datasets/diseases/Snakefile +++ b/datasets/diseases/Snakefile @@ -13,7 +13,7 @@ produce_fetch_rules({ "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"], "raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"], "raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"], - "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), + "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True), "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True), }) @@ -42,7 +42,7 @@ rule files: input: "data/inputs.csv", "data/gold_standard.csv", - "raw/9606.protein.links.txt" + "raw/9606.protein.links.full.txt" output: # These are the two we use for the SPRAS run for now "GS_files/Alopecia_areata_GS.txt", diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py index dc5a949b..f8704461 100644 --- a/datasets/diseases/scripts/files.py +++ b/datasets/diseases/scripts/files.py @@ -42,7 +42,7 @@ def main(): # See /cache/directory.py for information on how this was grabbed. # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. 
- string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None) + string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ", skiprows=[0], header=None) # Threshold anything above a confidence score of 900 to trim down the background interactome string = string[string.iloc[:, 2] > 900] diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile index 6c4eefbd..f9af5435 100644 --- a/datasets/egfr/Snakefile +++ b/datasets/egfr/Snakefile @@ -15,7 +15,7 @@ rule all: produce_fetch_rules({ "raw/eight-egfr-reference-all.txt": ["EGFR", "eight-egfr-reference-all.txt"], "raw/egfr-prizes.txt": ["EGFR", "egfr-prizes.txt"], - "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), + "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True), "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True), "processed/phosphosite-irefindex13.0-uniprot.txt": ["iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"] }) @@ -26,7 +26,7 @@ rule process_gold_standard: shell: "uv run scripts/process_gold_standard.py" rule process_interactome: - input: "raw/9606.protein.links.txt" + input: "raw/9606.protein.links.full.txt" output: "processed/interactome.tsv" shell: "uv run scripts/process_interactome.py" diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py index e0ab02c7..3e1c8cd2 100644 --- a/datasets/egfr/scripts/process_interactome.py +++ b/datasets/egfr/scripts/process_interactome.py @@ -4,9 +4,11 @@ egfr_directory = Path(__file__).parent.resolve() / '..' 
def main(): - interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep=' ') + interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.full.txt', sep=' ') interactome_df['protein1'] = interactome_df['protein1'].astype(str).str.removeprefix("9606.") interactome_df['protein2'] = interactome_df['protein2'].astype(str).str.removeprefix("9606.") + # Since this is links.full vs links, we need to restrict to a subset of headers before saving the interactome. + interactome_df = interactome_df[["protein1", "protein2", "combined_score"]] interactome_df['Direction'] = 'U' (egfr_directory / 'processed').mkdir(exist_ok=True) From 117c4593aaf879f5b741645f9507f0e4b1cc2c76 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sun, 22 Feb 2026 08:09:03 +0000 Subject: [PATCH 13/19] fix: use new api for hiv --- cache/directory.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cache/directory.py b/cache/directory.py index cf262752..f18a771e 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -254,20 +254,24 @@ def download(self, output: str | PathLike): "ko03250.xml": CacheItem( name="KEGG 03250", cached="https://drive.google.com/uc?id=16dtWKHCQMp2qrLfFDE7nVhbwBCr2H5a9", - online="https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml", - online_headers = [('Referer', 'https://www.kegg.jp/pathway/ko03250')], + unpinned=Service( + "https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml", + headers={'Referer': 'https://www.kegg.jp/pathway/ko03250'}) ) }, "HIV1": { + # The following files are from https://github.com/gitter-lab/hiv1-aurkb. + # While the following files do point to the repository's main branch, + # they aren't expected to actually change. 
"prize_05.tsv": CacheItem( name="HIV_05 prizes", cached="https://drive.google.com/uc?id=1jVWNRPfYkbqimO44GdzXYB3-7NXhet1m", - online="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/ac9278d447e4188eea3bf4b24c4c4e0c19b0c6d9/Results/base_analysis/prize_05.csv" + pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_05.csv" ), "prize_060.tsv": CacheItem( name="HIV_060 prizes", cached="https://drive.google.com/uc?id=1Aucgp7pcooGr9oT4m2bvYEuYW6186WxQ", - online="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/ac9278d447e4188eea3bf4b24c4c4e0c19b0c6d9/Results/base_analysis/prize_060.csv" + pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_060.csv" ) }, "iRefIndex": { From 2cc1019fed6a3fab639eb69643b08286429aeba9 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sun, 22 Feb 2026 11:20:52 +0000 Subject: [PATCH 14/19] feat: use self-looping iRefIndex interactome --- cache/directory.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cache/directory.py b/cache/directory.py index f18a771e..38a58635 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -275,12 +275,15 @@ def download(self, output: str | PathLike): ) }, "iRefIndex": { - # This can also be obtained from the SPRAS repo + # This can also be obtained from the SPRAS repo, though the SPRAS repo removes self loops. We don't. # (https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt). - # iRefIndex has been down for quite some time, so this is only from the cache. + # iRefIndex has been down for quite some time, so we grab this from a repository instead. + # While the following files do point to the repository's main branch, + # they aren't expected to actually change, so we make them `pinned`. 
"phosphosite-irefindex13.0-uniprot.txt": CacheItem( name="iRefIndex v13.0 UniProt interactome", - cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo" + cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo", + pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/networks/phosphosite-irefindex13.0-uniprot.txt" ) }, "OsmoticStress": { From ab050fb79273d6a0d64355511fc781ec2d7ca14f Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 23 Feb 2026 01:35:39 +0000 Subject: [PATCH 15/19] fix: diseases links handling --- README.md | 2 +- datasets/diseases/scripts/files.py | 21 ++++++++++----------- datasets/diseases/scripts/gold_standard.py | 4 +--- datasets/diseases/scripts/inputs.py | 4 +--- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index c98180a6..12a6e84b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install ./spras To run the postprocess output scripts, we have a `pyproject.toml` which can be used with your desired python package manager. This separates the `spras` conda environment from the small scripts we have. (on CI, we use [`uv`](https://docs.astral.sh/uv/).) 
-To run the benchmarking pipeline, use: +To run the benchmarking pipeline, use (this example is specifically for disease module mining): ```sh snakemake --cores 1 --configfile configs/dmmm.yaml --show-failed-logs -s spras/Snakefile diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py index f8704461..9f810131 100644 --- a/datasets/diseases/scripts/files.py +++ b/datasets/diseases/scripts/files.py @@ -1,9 +1,7 @@ import pandas as pd from pathlib import Path -import os -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) +dir_path = Path(__file__).parent.resolve() diseases_path = Path(dir_path, "..") (diseases_path / "prize_files").mkdir(exist_ok=True, parents=True) @@ -16,7 +14,7 @@ def main(): GS_string_df = GS_string_df[GS_string_df["diseaseID"].isin(tiga_string_df["id"])] GS_combined_group = GS_string_df.groupby("diseaseName") - GS_combined_dict = {k: v for k, v in GS_combined_group} + GS_combined_dict = {str(k): v for k, v in GS_combined_group} tiga_filtered = tiga_string_df[tiga_string_df["id"].isin(GS_string_df["diseaseID"])] tiga_group = tiga_filtered.groupby("trait") @@ -27,7 +25,7 @@ def main(): tiga_threshold = tiga_filtered.loc[tiga_filtered["trait"].isin(list(tiga_count_threshold.keys()))] tiga_prizes = tiga_threshold.groupby("trait") - tiga_prize_dict = {k: v for k, v in tiga_prizes} + tiga_prize_dict = {str(k): v for k, v in tiga_prizes} for disease in tiga_prize_dict.keys(): df = tiga_prize_dict[disease] @@ -38,17 +36,18 @@ def main(): for disease in GS_combined_dict.keys(): df = GS_combined_dict[disease] df = df[["str_id"]] - df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=None) + df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=False) # See /cache/directory.py for information on how this was grabbed. 
# 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. - string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ", skiprows=[0], header=None) + string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ") + string = string[["protein1", "protein2", "combined_score"]] # Threshold anything above a confidence score of 900 to trim down the background interactome - string = string[string.iloc[:, 2] > 900] - string = string.iloc[:, [0, 1]] - string[len(string.columns)] = 1 - string.to_csv(diseases_path / "raw" / "string_interactome.txt", sep="\t", index=False, header=None) + string = string[string["combined_score"] > 900] + string = string[["protein1", "protein2"]] + # though we still keep the weight afterwards + string.to_csv(diseases_path / "raw" / "string_interactome.txt", sep="\t", index=False, header=False) if __name__ == "__main__": diff --git a/datasets/diseases/scripts/gold_standard.py b/datasets/diseases/scripts/gold_standard.py index 846eaba5..40ed6111 100644 --- a/datasets/diseases/scripts/gold_standard.py +++ b/datasets/diseases/scripts/gold_standard.py @@ -1,9 +1,7 @@ import pandas as pd -import os from pathlib import Path -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) +dir_path = Path(__file__).parent.resolve() diseases_path = Path(dir_path, "..") diff --git a/datasets/diseases/scripts/inputs.py b/datasets/diseases/scripts/inputs.py index 8dc6214d..b7319af9 100644 --- a/datasets/diseases/scripts/inputs.py +++ b/datasets/diseases/scripts/inputs.py @@ -1,9 +1,7 @@ from pathlib import Path import pandas as pd -import os -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) +dir_path = Path(__file__).parent.resolve() diseases_path = Path(dir_path, "..") (diseases_path / "data").mkdir(exist_ok=True, parents=True) From 9d9c11d7d3570514ac34ef6353a99c1d105202df Mon Sep 
17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 23 Feb 2026 01:55:43 +0000 Subject: [PATCH 16/19] refactor(diseases): split interactome handling --- configs/dmmm.yaml | 4 ++-- datasets/diseases/.gitignore | 4 ++-- datasets/diseases/Snakefile | 9 +++++++-- datasets/diseases/scripts/files.py | 11 ----------- datasets/diseases/scripts/interactome.py | 19 +++++++++++++++++++ 5 files changed, 30 insertions(+), 17 deletions(-) create mode 100644 datasets/diseases/scripts/interactome.py diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index b4e16bc0..eb1b7f60 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -62,14 +62,14 @@ datasets: - label: dmmmdiseases_alopecia_areata data_dir: datasets/diseases edge_files: - - raw/string_interactome.txt + - processed/string_interactome.tsv node_files: - prize_files/alopecia_areata_prizes.txt other_files: [] - label: dmmmdiseases_diabetes_mellitus data_dir: datasets/diseases edge_files: - - raw/string_interactome.txt + - processed/string_interactome.tsv node_files: - prize_files/diabetes_mellitus_prizes.txt other_files: [] diff --git a/datasets/diseases/.gitignore b/datasets/diseases/.gitignore index 70081635..f65ef927 100644 --- a/datasets/diseases/.gitignore +++ b/datasets/diseases/.gitignore @@ -3,7 +3,7 @@ data # prize and gold standard files +raw +processed GS_files prize_files -raw -Pickles diff --git a/datasets/diseases/Snakefile b/datasets/diseases/Snakefile index 93de50d5..aed94654 100644 --- a/datasets/diseases/Snakefile +++ b/datasets/diseases/Snakefile @@ -2,6 +2,7 @@ include: "../../cache/Snakefile" rule all: input: + "processed/string_interactome.tsv", "GS_files/Alopecia_areata_GS.txt", "GS_files/Diabetes_mellitus_GS.txt", "prize_files/alopecia_areata_prizes.txt", @@ -41,8 +42,7 @@ rule gold_standard: rule files: input: "data/inputs.csv", - "data/gold_standard.csv", - "raw/9606.protein.links.full.txt" + "data/gold_standard.csv" output: # These are the two we use for the SPRAS run for now 
"GS_files/Alopecia_areata_GS.txt", @@ -51,3 +51,8 @@ rule files: "prize_files/diabetes_mellitus_prizes.txt" shell: "uv run scripts/files.py" + +rule interactome: + input: "raw/9606.protein.links.full.txt" + output: "processed/string_interactome.tsv" + shell: "uv run scripts/interactome.py" diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py index 9f810131..31c631b2 100644 --- a/datasets/diseases/scripts/files.py +++ b/datasets/diseases/scripts/files.py @@ -38,17 +38,6 @@ def main(): df = df[["str_id"]] df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=False) - # See /cache/directory.py for information on how this was grabbed. - # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. - string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ") - string = string[["protein1", "protein2", "combined_score"]] - - # Threshold anything above a confidence score of 900 to trim down the background interactome - string = string[string["combined_score"] > 900] - string = string[["protein1", "protein2"]] - # though we still keep the weight afterwards - string.to_csv(diseases_path / "raw" / "string_interactome.txt", sep="\t", index=False, header=False) - if __name__ == "__main__": main() diff --git a/datasets/diseases/scripts/interactome.py b/datasets/diseases/scripts/interactome.py new file mode 100644 index 00000000..b0a40b6b --- /dev/null +++ b/datasets/diseases/scripts/interactome.py @@ -0,0 +1,19 @@ +from pathlib import Path +import pandas + +diseases_path = Path(__file__).parent.parent.resolve() + +def main(): + # See /cache/directory.py for information on how this was grabbed. + # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. 
+ string = pandas.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ") + string = string[["protein1", "protein2", "combined_score"]] + + # Threshold anything above a confidence score of 900 to trim down the background interactome + string = string[string["combined_score"] > 900] + # though we still keep the weight afterwards + (diseases_path / "processed").mkdir(exist_ok=True) + string.to_csv(diseases_path / "processed" / "string_interactome.tsv", sep="\t", index=False, header=False) + +if __name__ == "__main__": + main() From b8759da51fd863e414978b4cd8c8c7aa6ae32089 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 24 Feb 2026 19:57:05 +0000 Subject: [PATCH 17/19] disable ml i do not like this One Bit. --- configs/dmmm.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index eb1b7f60..9fe9dd94 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -17,7 +17,9 @@ analysis: cytoscape: include: false ml: - include: true + # TODO: we either need at least one non-empty pathway in all algorithm runs, + # or we need to get rid of validate_df hard-erroring. The latter seems better. + include: false aggregate_per_algorithm: true evaluation: include: false From 62596047ecb47cc0fa987f3e271d692dfe463c24 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 24 Feb 2026 20:30:59 +0000 Subject: [PATCH 18/19] fix: don't use import.meta.glob for data files --- web/package.json | 1 + web/pnpm-lock.yaml | 89 ++++++++++++++++++++++++++++++++++++++++++++ web/src/lib/paths.ts | 9 +++-- 3 files changed, 96 insertions(+), 3 deletions(-) diff --git a/web/package.json b/web/package.json index 74b19ed1..1cccf275 100644 --- a/web/package.json +++ b/web/package.json @@ -14,6 +14,7 @@ "@fontsource-variable/noto-sans": "^5.2.10", "astro": "^5.16.6", "dayjs": "^1.11.19", + "glob": "^13.0.6", "medium-zoom": "^1.1.0", "sass": "^1.97.1", "yaml": "^2.8.2" diff --git a/web/pnpm-lock.yaml b/web/pnpm-lock.yaml index c6e31365..be8d4b53 100644 --- a/web/pnpm-lock.yaml +++ b/web/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: dayjs: specifier: ^1.11.19 version: 1.11.19 + glob: + specifier: ^13.0.6 + version: 13.0.6 medium-zoom: specifier: ^1.1.0 version: 1.1.0 @@ -276,89 +279,105 @@ packages: resolution: {integrity: sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-arm@1.2.4': resolution: {integrity: sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-ppc64@1.2.4': resolution: {integrity: sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA==} cpu: [ppc64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-riscv64@1.2.4': resolution: {integrity: sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA==} cpu: [riscv64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-s390x@1.2.4': resolution: {integrity: sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ==} cpu: [s390x] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-x64@1.2.4': resolution: {integrity: 
sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.2.4': resolution: {integrity: sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.2.4': resolution: {integrity: sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-linux-arm64@0.34.5': resolution: {integrity: sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-linux-arm@0.34.5': resolution: {integrity: sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-linux-ppc64@0.34.5': resolution: {integrity: sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [ppc64] os: [linux] + libc: [glibc] '@img/sharp-linux-riscv64@0.34.5': resolution: {integrity: sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [riscv64] os: [linux] + libc: [glibc] '@img/sharp-linux-s390x@0.34.5': resolution: {integrity: sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [s390x] os: [linux] + libc: [glibc] '@img/sharp-linux-x64@0.34.5': resolution: {integrity: sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: 
[glibc] '@img/sharp-linuxmusl-arm64@0.34.5': resolution: {integrity: sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-linuxmusl-x64@0.34.5': resolution: {integrity: sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-wasm32@0.34.5': resolution: {integrity: sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw==} @@ -418,36 +437,42 @@ packages: engines: {node: '>= 10.0.0'} cpu: [arm] os: [linux] + libc: [glibc] '@parcel/watcher-linux-arm-musl@2.5.1': resolution: {integrity: sha512-6E+m/Mm1t1yhB8X412stiKFG3XykmgdIOqhjWj+VL8oHkKABfu/gjFj8DvLrYVHSBNC+/u5PeNrujiSQ1zwd1Q==} engines: {node: '>= 10.0.0'} cpu: [arm] os: [linux] + libc: [musl] '@parcel/watcher-linux-arm64-glibc@2.5.1': resolution: {integrity: sha512-LrGp+f02yU3BN9A+DGuY3v3bmnFUggAITBGriZHUREfNEzZh/GO06FF5u2kx8x+GBEUYfyTGamol4j3m9ANe8w==} engines: {node: '>= 10.0.0'} cpu: [arm64] os: [linux] + libc: [glibc] '@parcel/watcher-linux-arm64-musl@2.5.1': resolution: {integrity: sha512-cFOjABi92pMYRXS7AcQv9/M1YuKRw8SZniCDw0ssQb/noPkRzA+HBDkwmyOJYp5wXcsTrhxO0zq1U11cK9jsFg==} engines: {node: '>= 10.0.0'} cpu: [arm64] os: [linux] + libc: [musl] '@parcel/watcher-linux-x64-glibc@2.5.1': resolution: {integrity: sha512-GcESn8NZySmfwlTsIur+49yDqSny2IhPeZfXunQi48DMugKeZ7uy1FX83pO0X22sHntJ4Ub+9k34XQCX+oHt2A==} engines: {node: '>= 10.0.0'} cpu: [x64] os: [linux] + libc: [glibc] '@parcel/watcher-linux-x64-musl@2.5.1': resolution: {integrity: sha512-n0E2EQbatQ3bXhcH2D1XIAANAcTZkQICBPVaxMeaCVBtOpBZpWJuf7LwyWPSBDITb7In8mqQgJ7gH8CILCURXg==} engines: {node: '>= 10.0.0'} cpu: [x64] os: [linux] + libc: [musl] '@parcel/watcher-win32-arm64@2.5.1': resolution: {integrity: 
sha512-RFzklRvmc3PkjKjry3hLF9wD7ppR4AKcWNzH7kXR7GUe0Igb3Nz8fyPwtZCSquGrhU5HhUNDr/mKBqj7tqA2Vw==} @@ -514,56 +539,67 @@ packages: resolution: {integrity: sha512-EHMUcDwhtdRGlXZsGSIuXSYwD5kOT9NVnx9sqzYiwAc91wfYOE1g1djOEDseZJKKqtHAHGwnGPQu3kytmfaXLQ==} cpu: [arm] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.54.0': resolution: {integrity: sha512-+pBrqEjaakN2ySv5RVrj/qLytYhPKEUwk+e3SFU5jTLHIcAtqh2rLrd/OkbNuHJpsBgxsD8ccJt5ga/SeG0JmA==} cpu: [arm] os: [linux] + libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.54.0': resolution: {integrity: sha512-NSqc7rE9wuUaRBsBp5ckQ5CVz5aIRKCwsoa6WMF7G01sX3/qHUw/z4pv+D+ahL1EIKy6Enpcnz1RY8pf7bjwng==} cpu: [arm64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.54.0': resolution: {integrity: sha512-gr5vDbg3Bakga5kbdpqx81m2n9IX8M6gIMlQQIXiLTNeQW6CucvuInJ91EuCJ/JYvc+rcLLsDFcfAD1K7fMofg==} cpu: [arm64] os: [linux] + libc: [musl] '@rollup/rollup-linux-loong64-gnu@4.54.0': resolution: {integrity: sha512-gsrtB1NA3ZYj2vq0Rzkylo9ylCtW/PhpLEivlgWe0bpgtX5+9j9EZa0wtZiCjgu6zmSeZWyI/e2YRX1URozpIw==} cpu: [loong64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-ppc64-gnu@4.54.0': resolution: {integrity: sha512-y3qNOfTBStmFNq+t4s7Tmc9hW2ENtPg8FeUD/VShI7rKxNW7O4fFeaYbMsd3tpFlIg1Q8IapFgy7Q9i2BqeBvA==} cpu: [ppc64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.54.0': resolution: {integrity: sha512-89sepv7h2lIVPsFma8iwmccN7Yjjtgz0Rj/Ou6fEqg3HDhpCa+Et+YSufy27i6b0Wav69Qv4WBNl3Rs6pwhebQ==} cpu: [riscv64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.54.0': resolution: {integrity: sha512-ZcU77ieh0M2Q8Ur7D5X7KvK+UxbXeDHwiOt/CPSBTI1fBmeDMivW0dPkdqkT4rOgDjrDDBUed9x4EgraIKoR2A==} cpu: [riscv64] os: [linux] + libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.54.0': resolution: {integrity: sha512-2AdWy5RdDF5+4YfG/YesGDDtbyJlC9LHmL6rZw6FurBJ5n4vFGupsOBGfwMRjBYH7qRQowT8D/U4LoSvVwOhSQ==} cpu: [s390x] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.54.0': resolution: {integrity: 
sha512-WGt5J8Ij/rvyqpFexxk3ffKqqbLf9AqrTBbWDk7ApGUzaIs6V+s2s84kAxklFwmMF/vBNGrVdYgbblCOFFezMQ==} cpu: [x64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-musl@4.54.0': resolution: {integrity: sha512-JzQmb38ATzHjxlPHuTH6tE7ojnMKM2kYNzt44LO/jJi8BpceEC8QuXYA908n8r3CNuG/B3BV8VR3Hi1rYtmPiw==} cpu: [x64] os: [linux] + libc: [musl] '@rollup/rollup-openharmony-arm64@4.54.0': resolution: {integrity: sha512-huT3fd0iC7jigGh7n3q/+lfPcXxBi+om/Rs3yiFxjvSxbSB6aohDFXbWvlspaqjeOh+hx7DDHS+5Es5qRkWkZg==} @@ -693,6 +729,10 @@ packages: bail@2.0.2: resolution: {integrity: sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==} + balanced-match@4.0.4: + resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==} + engines: {node: 18 || 20 || >=22} + base-64@1.0.0: resolution: {integrity: sha512-kwDPIFCGx0NZHog36dj+tHiwP4QMzsZ3AgMViUBKI0+V5n4U0ufTCUMhnQ04diaRI8EX/QcPfql7zlhZ7j4zgg==} @@ -706,6 +746,10 @@ packages: resolution: {integrity: sha512-F3PH5k5juxom4xktynS7MoFY+NUWH5LC4CnH11YB8NPew+HLpmBLCybSAEyb2F+4pRXhuhWqFesoQd6DAyc2hw==} engines: {node: '>=18'} + brace-expansion@5.0.3: + resolution: {integrity: sha512-fy6KJm2RawA5RcHkLa1z/ScpBeA762UF9KmZQxwIbDtRJrgLzM10depAiEQ+CXYcoiqW1/m96OAAoke2nE9EeA==} + engines: {node: 18 || 20 || >=22} + braces@3.0.3: resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} engines: {node: '>=8'} @@ -948,6 +992,10 @@ packages: github-slugger@2.0.0: resolution: {integrity: sha512-IaOQ9puYtjrkq7Y0Ygl9KDZnrf/aiUJYUpVf89y8kyaxbRG7Y1SrX/jaumrv81vc61+kiMempujsM3Yw7w5qcw==} + glob@13.0.6: + resolution: {integrity: sha512-Wjlyrolmm8uDpm/ogGyXZXb1Z+Ca2B8NbJwqBVg0axK9GbBeoS7yGV6vjXnYdGm6X53iehEuxxbyiKp8QmN4Vw==} + engines: {node: 18 || 20 || >=22} + h3@1.15.4: resolution: {integrity: sha512-z5cFQWDffyOe4vQ9xIqNfCZdV4p//vy6fBnr8Q1AWnVZ0teurKMG66rLj++TKwKPUP3u7iMUvrvKaEUiQw2QWQ==} @@ -1050,6 
+1098,10 @@ packages: lru-cache@10.4.3: resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==} + lru-cache@11.2.6: + resolution: {integrity: sha512-ESL2CrkS/2wTPfuend7Zhkzo2u0daGJ/A2VucJOgQ/C48S/zB8MMeMHSGKYpXhIjbPxfuezITkaBH1wqv00DDQ==} + engines: {node: 20 || >=22} + magic-string@0.30.21: resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==} @@ -1195,6 +1247,14 @@ packages: resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==} engines: {node: '>=8.6'} + minimatch@10.2.2: + resolution: {integrity: sha512-+G4CpNBxa5MprY+04MbgOw1v7So6n5JY166pFi9KfYwT78fxScCeSNQSNzp6dpPSW2rONOps6Ocam1wFhCgoVw==} + engines: {node: 18 || 20 || >=22} + + minipass@7.1.3: + resolution: {integrity: sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==} + engines: {node: '>=16 || 14 >=14.17'} + mrmime@2.0.1: resolution: {integrity: sha512-Y3wQdFg2Va6etvQ5I82yUhGdsKrcYox6p7FfL1LbK2J4V01F9TGlepTIhnK24t7koZibmg82KGglhA1XK5IsLQ==} engines: {node: '>=10'} @@ -1266,6 +1326,10 @@ packages: parse5@7.3.0: resolution: {integrity: sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==} + path-scurry@2.0.2: + resolution: {integrity: sha512-3O/iVVsJAPsOnpwWIeD+d6z/7PmqApyQePUtCndjatj/9I5LylHvt5qluFaBT3I5h3r1ejfR056c+FCv+NnNXg==} + engines: {node: 18 || 20 || >=22} + piccolore@0.1.3: resolution: {integrity: sha512-o8bTeDWjE086iwKrROaDf31K0qC/BENdm15/uH9usSC/uZjJOKb2YGiVHfLY4GhwsERiPI1jmwI2XrA7ACOxVw==} @@ -2321,6 +2385,8 @@ snapshots: bail@2.0.2: {} + balanced-match@4.0.4: {} + base-64@1.0.0: {} base64-js@1.5.1: {} @@ -2338,6 +2404,10 @@ snapshots: widest-line: 5.0.0 wrap-ansi: 9.0.2 + brace-expansion@5.0.3: + dependencies: + balanced-match: 4.0.4 + braces@3.0.3: dependencies: fill-range: 7.1.1 @@ -2559,6 +2629,12 @@ 
snapshots: github-slugger@2.0.0: {} + glob@13.0.6: + dependencies: + minimatch: 10.2.2 + minipass: 7.1.3 + path-scurry: 2.0.2 + h3@1.15.4: dependencies: cookie-es: 1.2.2 @@ -2707,6 +2783,8 @@ snapshots: lru-cache@10.4.3: {} + lru-cache@11.2.6: {} + magic-string@0.30.21: dependencies: '@jridgewell/sourcemap-codec': 1.5.5 @@ -3042,6 +3120,12 @@ snapshots: picomatch: 2.3.1 optional: true + minimatch@10.2.2: + dependencies: + brace-expansion: 5.0.3 + + minipass@7.1.3: {} + mrmime@2.0.1: {} ms@2.1.3: {} @@ -3111,6 +3195,11 @@ snapshots: dependencies: entities: 6.0.1 + path-scurry@2.0.2: + dependencies: + lru-cache: 11.2.6 + minipass: 7.1.3 + piccolore@0.1.3: {} picocolors@1.1.1: {} diff --git a/web/src/lib/paths.ts b/web/src/lib/paths.ts index dd593072..edd01397 100644 --- a/web/src/lib/paths.ts +++ b/web/src/lib/paths.ts @@ -1,8 +1,11 @@ import { extractDatasetCategory, extractDatasetType } from "./outputStyle"; +import { globSync } from 'glob' -export function getDataFiles() { - const dataFiles = import.meta.glob("../../public/data/output/**", { query: "?raw" }); - return Object.keys(dataFiles).map((path) => path.substring("../../public/data/output/".length)); +export function getDataFiles(): string[] { + // We prefer this over import.meta.glob, as import.meta.glob currently + // leads to OOM for large raw imports, and OOM is especially plausible on CD. + const dataFiles = globSync("../../public/data/output/**"); + return dataFiles.map((path) => path.substring("../../public/data/output/".length)); } export function getDatasets() { From f3ef49f281e23f83ba16e8c75052f366a03d0cbf Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 24 Feb 2026 20:46:44 +0000 Subject: [PATCH 19/19] fix(web): add egfr data category, glob correct directory --- web/src/lib/outputStyle.ts | 14 +++++++++----- web/src/lib/paths.ts | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/web/src/lib/outputStyle.ts b/web/src/lib/outputStyle.ts index 3eb6f089..a65783c5 100644 --- a/web/src/lib/outputStyle.ts +++ b/web/src/lib/outputStyle.ts @@ -47,6 +47,10 @@ const dataCategories = { name: "Yeast", directory: "yeast-osmotic-stress", }, + egfr: { + name: "EGFR", + directory: "egfr" + } }; // TODO: replace this once we have proper dataset categories @@ -57,11 +61,11 @@ export function extractDatasetCategory(name: string): { category: string; name: export function parseOutputString(str: string): Output { const components = str.split("-"); - let dataType; - let datasetCategory; - let datasetName; - let algorithm; - let paramsHash; + let dataType: string | undefined; + let datasetCategory: string | undefined; + let datasetName: string | undefined; + let algorithm: string | undefined; + let paramsHash: string | undefined; if (components.length === 5) { // This is a slug URL (type-...) diff --git a/web/src/lib/paths.ts b/web/src/lib/paths.ts index edd01397..9048e8e4 100644 --- a/web/src/lib/paths.ts +++ b/web/src/lib/paths.ts @@ -4,8 +4,8 @@ import { globSync } from 'glob' export function getDataFiles(): string[] { // We prefer this over import.meta.glob, as import.meta.glob currently // leads to OOM for large raw imports, and OOM is especially plausible on CD. - const dataFiles = globSync("../../public/data/output/**"); - return dataFiles.map((path) => path.substring("../../public/data/output/".length)); + const dataFiles = globSync("public/data/output/**"); + return dataFiles.map((path) => path.substring("public/data/output/".length)); } export function getDatasets() {