From 2a6050728899a9a46d661ab9ce4da001665a2242 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 17 Feb 2026 23:48:58 -0800 Subject: [PATCH 01/19] feat: egfr interactome We get higher counts of UniProt -> STRING interactions than usual, though I haven't trimmed it yet. --- cache/directory.py | 7 ++ datasets/egfr/.gitignore | 2 + datasets/egfr/Snakefile | 24 +++++ .../egfr/scripts/process_gold_standard.py | 23 +++++ datasets/egfr/scripts/process_interactome.py | 14 +++ egfr/egfr-param-tuning.yaml | 87 ------------------- pyproject.toml | 4 + tools/mapping/__init__.py | 0 tools/mapping/ensembl_uniprot.py | 57 ++++++++++++ 9 files changed, 131 insertions(+), 87 deletions(-) create mode 100644 datasets/egfr/.gitignore create mode 100644 datasets/egfr/Snakefile create mode 100644 datasets/egfr/scripts/process_gold_standard.py create mode 100644 datasets/egfr/scripts/process_interactome.py delete mode 100644 egfr/egfr-param-tuning.yaml create mode 100644 tools/mapping/__init__.py create mode 100644 tools/mapping/ensembl_uniprot.py diff --git a/cache/directory.py b/cache/directory.py index b308f1b5..b94594b1 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -214,6 +214,13 @@ def download(self, output: str | PathLike): cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T" ), }, + "EGFR": { + "eight-egfr-reference-all.txt": CacheItem( + name="EGFR Gold Standard Reference", + online="https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt", + cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" + ) + }, } diff --git a/datasets/egfr/.gitignore b/datasets/egfr/.gitignore new file mode 100644 index 00000000..4b8fba9e --- /dev/null +++ b/datasets/egfr/.gitignore @@ -0,0 +1,2 @@ +raw +processed diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile new file mode 100644 index 00000000..0f77b504 --- /dev/null +++ b/datasets/egfr/Snakefile @@ 
-0,0 +1,24 @@ +include: "../../cache/Snakefile" + +rule all: + input: + "processed/gold-standard-nodes.txt", + "processed/interactome.tsv", + +produce_fetch_rules({ + "raw/eight-egfr-reference-all.txt": ["EGFR", "eight-egfr-reference-all.txt"], + "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), + "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True), +}) + +rule process_gold_standard: + input: + "raw/HUMAN_9606_idmapping_selected.tsv", + "raw/eight-egfr-reference-all.txt" + output: "processed/gold-standard-nodes.txt" + shell: "uv run scripts/process_gold_standard.py" + +rule process_interactome: + input: "raw/9606.protein.links.txt" + output: "processed/interactome.tsv" + shell: "uv run scripts/process_interactome.py" diff --git a/datasets/egfr/scripts/process_gold_standard.py b/datasets/egfr/scripts/process_gold_standard.py new file mode 100644 index 00000000..e9dd23dc --- /dev/null +++ b/datasets/egfr/scripts/process_gold_standard.py @@ -0,0 +1,23 @@ +import pandas +from pathlib import Path +from tools.mapping.ensembl_uniprot import idmapping_uniprot_mapping + +egfr_directory = Path(__file__).parent.resolve() / '..' + +def main(): + # First, we remove all PSUEDONODES (and any duplicates) + nodes = (egfr_directory / 'raw' / 'eight-egfr-reference-all.txt').read_text().splitlines() + nodes = list(set([node for node in nodes if not node.endswith("_PSEUDONODE")])) + + # Then, we map our UniProt nodes to ENSP. 
+ idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') + idmapping_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') + idmapping_df = idmapping_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl']) + idmapping_df = idmapping_df[~idmapping_df['Ensembl_PRO'].isna()] + nodes = idmapping_df['Ensembl_PRO'].astype(str).to_list() + + (egfr_directory / 'processed').mkdir(exist_ok=True) + (egfr_directory / 'processed' / 'gold-standard-nodes.txt').write_text("\n".join(nodes)) + +if __name__ == "__main__": + main() diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py new file mode 100644 index 00000000..b733e08a --- /dev/null +++ b/datasets/egfr/scripts/process_interactome.py @@ -0,0 +1,14 @@ +from pathlib import Path +import pandas + +egfr_directory = Path(__file__).parent.resolve() / '..' + +def main(): + interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep='\t') + interactome_df['Direction'] = 'U' + + (egfr_directory / 'processed').mkdir(exist_ok=True) + interactome_df.to_csv(egfr_directory / 'processed' / 'interactome.tsv', index=False, header=False, sep='\t') + +if __name__ == "__main__": + main() diff --git a/egfr/egfr-param-tuning.yaml b/egfr/egfr-param-tuning.yaml deleted file mode 100644 index 30cacedb..00000000 --- a/egfr/egfr-param-tuning.yaml +++ /dev/null @@ -1,87 +0,0 @@ -hash_length: 7 -container_framework: docker -unpack_singularity: false -container_registry: - base_url: docker.io - owner: reedcompbio -algorithms: - - name: omicsintegrator2 - params: - include: true - run1: - b: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - g: [2, 3, 4, 5, 6, 7] - w: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - - name: domino - params: - include: true - run1: - module_threshold: [0.001, 0.01, 0.02] - slice_threshold: [0.001, 0.1, 0.3, 0.9, 1] - - name: mincostflow - params: - include: true - run1: - 
capacity: [1, 5, 10, 15] - flow: [6, 8, 20, 50, 60, 70, 80, 90, 150] - - name: pathlinker - params: - include: true - run1: - k: [10, 20, 30, 40, 50, 60, 100, 200, 500] - - name: allpairs - params: - include: true - - name: meo - params: - include: true - run1: - local_search: ['No'] - max_path_length: [2] - rand_restarts: [10] - - name: omicsintegrator1 - params: - include: true - run1: - b: [0.01, 0.55, 2, 5, 10] - d: [10, 20, 30, 40] - g: [0.0001, 0.001] - mu: [0.001, 0.005, 0.008, 0.02, 0.03] - r: [0.01, 0.1, 1] - w: [0.001, 0.1, 0.5, 2, 8] -datasets: - - label: tps_egfr - node_files: - - tps-egfr-prizes.txt - edge_files: - - phosphosite-irefindex13.0-uniprot.txt - other_files: [] - data_dir: input -gold_standards: - - label: gs_egfr - node_files: - - gs-egfr.txt - data_dir: input - dataset_labels: - - tps_egfr -reconstruction_settings: - locations: - reconstruction_dir: output/tps_egfr - run: true -analysis: - summary: - include: true - graphspace: - include: false - cytoscape: - include: false - ml: - include: true - aggregate_per_algorithm: true - components: 4 - labels: false - linkage: ward - metric: euclidean - evaluation: - include: false - aggregate_per_algorithm: false diff --git a/pyproject.toml b/pyproject.toml index 9a071ecd..4df36774 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,3 +25,7 @@ package = true [tool.setuptools.packages] find = {namespaces = false} + +[build-system] +requires = ["setuptools", "wheel", "pip"] +build-backend = "setuptools.build_meta" diff --git a/tools/mapping/__init__.py b/tools/mapping/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tools/mapping/ensembl_uniprot.py b/tools/mapping/ensembl_uniprot.py new file mode 100644 index 00000000..73b548d9 --- /dev/null +++ b/tools/mapping/ensembl_uniprot.py @@ -0,0 +1,57 @@ +import pandas +import os + +""" +Utilities for mapping Ensembl and UniProt. 
+ +For example, +```py +idmapping_uniprot_ensembl = idmapping_uniprot_mapping(path / "HUMAN_9606_idmapping_selected.tsv") +``` + +then you can use the `idmapping_as_ensg_uniprot_mapping` or `idmapping_as_ensp_uniprot_mapping` to restrict the mapping to specifically +ENSG or ENSP. +""" + +def handle_ensembl_list( + idmapping_df: pandas.DataFrame, + column_name: str +) -> pandas.DataFrame: + idmapping_df = idmapping_df[idmapping_df[column_name].notnull()] + # Handle our ;-delimited list + idmapping_df[column_name] = idmapping_df[column_name].str.split("; ") + idmapping_df = idmapping_df.explode(column_name) + # Drop isoforms + idmapping_df[column_name] = idmapping_df[column_name].str.split(".").str[0] + idmapping_df = idmapping_df.reset_index(drop=True) + return idmapping_df + +def idmapping_uniprot_mapping( + path: str | os.PathLike + ) -> pandas.DataFrame: + """ + Gets the UniProt mapping file (`*_idmapping_selected`) as a dataframe with columns + UniProtKB-AC: High-quality UniProt IDs + Ensembl: ENSG + Ensembl_PRO: ENSG (Ensembl Protein IDs) + """ + # The very powerful UniProt-provided mapping file: its Ensembl mappings are a semicolon-delimeted list of Emsembl IDs containing + # attached isoforms (and not all UniProtKB-AC identifiers have those!) so we'll need to do some extra post-processing. + # This is `*_idmapping_selected`. + idmapping_selected_df = pandas.read_csv( + path, + header=None, + # See directory.py for the README associated with this mapping file. 
+ usecols=[0, 1, 18, 20], + names=["UniProtKB-AC", "UniProtKB-ID", "Ensembl", "Ensembl_PRO"], + sep="\t", + ) + idmapping_selected_df = handle_ensembl_list(idmapping_selected_df, "Ensembl") + idmapping_selected_df = handle_ensembl_list(idmapping_selected_df, "Ensembl_PRO") + return idmapping_selected_df + +def idmapping_as_ensg_uniprot_mapping(uniprot_mapping: pandas.DataFrame): + return uniprot_mapping.drop(columns=["Ensembl_PRO"]) + +def idmapping_as_ensp_uniprot_mapping(uniprot_mapping: pandas.DataFrame): + return uniprot_mapping.drop(columns=["Ensembl"]).rename(columns={"Ensembl_PRO": "Ensembl"}) From 8e1589b5421bb47d8c8e10857457e1659546b432 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 19 Feb 2026 00:25:29 +0000 Subject: [PATCH 02/19] feat: split egfr data --- cache/directory.py | 7 +++- configs/dmmm.yaml | 18 +++++++++++ datasets/egfr/Snakefile | 31 +++++++++++++++--- datasets/egfr/scripts/map_ensembl.py | 32 +++++++++++++++++++ .../egfr/scripts/process_gold_standard.py | 11 +------ datasets/egfr/scripts/process_interactome.py | 2 +- datasets/egfr/scripts/process_prizes.py | 15 +++++++++ run_snakemake.sh | 1 + 8 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 datasets/egfr/scripts/map_ensembl.py create mode 100644 datasets/egfr/scripts/process_prizes.py diff --git a/cache/directory.py b/cache/directory.py index b94594b1..23f6d4f4 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -80,7 +80,7 @@ def download(self, output: str | PathLike): "9606": { "9606.protein.links.txt.gz": CacheItem( name="STRING 9606 protein links", - cached="https://drive.google.com/uc?id=1fvjdIbgzbgJrdJxWRRRwwS1zuegf6DOj", + cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", ), "9606.protein.aliases.txt.gz": CacheItem( @@ -219,6 +219,11 @@ def download(self, output: str | PathLike): name="EGFR Gold Standard Reference", 
online="https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt", cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" + ), + "egfr-prizes.txt": CacheItem( + name="EGFR prizes", + online="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt", + cached="https://drive.google.com/file/d/1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj/view?usp=sharing" ) }, } diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index 8d4c9c0a..7c6e622d 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -77,6 +77,16 @@ datasets: edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] other_files: [] + - label: dmmmegfr_string + data_dir: datasets/egfr + edge_files: ["processed/interactome.tsv"] + node_files: ["processed/prizes-uniprot.txt"] + other_files: [] + - label: dmmmegfr_irefindex + data_dir: datasets/egfr + edge_files: ["processed/phosphosite-irefindex13.0-uniprot.txt"] + node_files: ["processed/prizes.txt"] + other_files: [] gold_standards: - label: gs0 node_files: ['GS_files/Alopecia_areata_GS.txt'] @@ -90,3 +100,11 @@ gold_standards: node_files: ["processed/FADU_gold_standard.txt"] data_dir: datasets/depmap dataset_labels: ["dmmmdepmap_cellline_fadu"] + - label: gs_egfr_string + node_files: ["processed/gold-standard-nodes.txt"] + data_dir: datasets/egfr + dataset_labels: ["dmmmegfr_string"] + - label: gs_egfr_irefindex + node_files: ["processed/gold-standard-nodes-uniprot.txt"] + data_dir: datasets/egfr + dataset_labels: ["dmmmegfr_irefindex"] diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile index 0f77b504..756c3d2e 100644 --- a/datasets/egfr/Snakefile +++ b/datasets/egfr/Snakefile @@ -2,23 +2,46 @@ include: "../../cache/Snakefile" rule all: input: + # Our UniProt based files over the older iRefIndex interactome + "processed/gold-standard-nodes-uniprot.txt", + 
"processed/prizes-uniprot.txt", + "processed/phosphosite-irefindex13.0-uniprot.txt", + + # Our Ensembl protein based files over the STRING interactome "processed/gold-standard-nodes.txt", + "processed/prizes.txt", "processed/interactome.tsv", produce_fetch_rules({ "raw/eight-egfr-reference-all.txt": ["EGFR", "eight-egfr-reference-all.txt"], + "raw/egfr-prizes.txt": ["EGFR", "egfr-prizes.txt"], "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True), + "processed/phosphosite-irefindex13.0-uniprot.txt": ["iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"] }) rule process_gold_standard: - input: - "raw/HUMAN_9606_idmapping_selected.tsv", - "raw/eight-egfr-reference-all.txt" - output: "processed/gold-standard-nodes.txt" + input: "raw/eight-egfr-reference-all.txt" + output: "processed/gold-standard-nodes-uniprot.txt", shell: "uv run scripts/process_gold_standard.py" rule process_interactome: input: "raw/9606.protein.links.txt" output: "processed/interactome.tsv" shell: "uv run scripts/process_interactome.py" + +rule process_prizes: + input: "raw/egfr-prizes.txt" + output: "processed/prizes-uniprot.txt" + shell: "uv run scripts/process_prizes.py" + +rule map_ensembl: + input: + "raw/HUMAN_9606_idmapping_selected.tsv", + + "processed/prizes-uniprot.txt", + "processed/gold-standard-nodes-uniprot.txt", + output: + "processed/prizes.txt", + "processed/gold-standard-nodes.txt" + shell: "uv run scripts/map_ensembl.py" \ No newline at end of file diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py new file mode 100644 index 00000000..f8396cb1 --- /dev/null +++ b/datasets/egfr/scripts/map_ensembl.py @@ -0,0 +1,32 @@ +import pandas +from pathlib import Path +from tools.mapping.ensembl_uniprot import idmapping_uniprot_mapping + +egfr_directory = 
Path(__file__).parent.resolve() / '..' + +def main(): + # Re-read the uniprot nodes from `process_gold_standard.py` + nodes = (egfr_directory / 'processed' / 'gold-standard-nodes-uniprot.txt').read_text().splitlines() + # and the prizes from `process_prizes.py` + prizes = pandas.read_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', sep='\t') + + # We grab our UniProt <-> ENSP mapping + idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') + + # and map the nodes + idmapping_nodes_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') + idmapping_nodes_df = idmapping_nodes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl']) + idmapping_nodes_df = idmapping_nodes_df[~idmapping_nodes_df['Ensembl_PRO'].isna()] + nodes = idmapping_nodes_df['Ensembl_PRO'].astype(str).to_list() + (egfr_directory / 'processed' / 'gold-standard-nodes.txt').write_text("\n".join(nodes)) + + # and the prizes + idmapping_prizes_df = prizes.merge(idmapping_df, left_on='NODEID', right_on="UniProtKB-ID", how='inner') + idmapping_prizes_df = idmapping_prizes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl', 'NODEID']) + idmapping_prizes_df = idmapping_prizes_df[~idmapping_prizes_df['Ensembl_PRO'].isna()] + idmapping_prizes_df = idmapping_prizes_df.rename(columns={'Ensembl_PRO': 'NODEID'}) + idmapping_prizes_df = idmapping_prizes_df[["NODEID", "prize"]] + idmapping_prizes_df.to_csv(egfr_directory / 'processed' / 'prizes.txt', sep='\t', index=False) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/datasets/egfr/scripts/process_gold_standard.py b/datasets/egfr/scripts/process_gold_standard.py index e9dd23dc..6f5c8997 100644 --- a/datasets/egfr/scripts/process_gold_standard.py +++ b/datasets/egfr/scripts/process_gold_standard.py @@ -1,6 +1,4 @@ -import pandas from pathlib import Path -from tools.mapping.ensembl_uniprot import 
idmapping_uniprot_mapping egfr_directory = Path(__file__).parent.resolve() / '..' @@ -9,15 +7,8 @@ def main(): nodes = (egfr_directory / 'raw' / 'eight-egfr-reference-all.txt').read_text().splitlines() nodes = list(set([node for node in nodes if not node.endswith("_PSEUDONODE")])) - # Then, we map our UniProt nodes to ENSP. - idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') - idmapping_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') - idmapping_df = idmapping_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl']) - idmapping_df = idmapping_df[~idmapping_df['Ensembl_PRO'].isna()] - nodes = idmapping_df['Ensembl_PRO'].astype(str).to_list() - (egfr_directory / 'processed').mkdir(exist_ok=True) - (egfr_directory / 'processed' / 'gold-standard-nodes.txt').write_text("\n".join(nodes)) + (egfr_directory / 'processed' / 'gold-standard-nodes-uniprot.txt').write_text("\n".join(nodes)) if __name__ == "__main__": main() diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py index b733e08a..32c6b8bd 100644 --- a/datasets/egfr/scripts/process_interactome.py +++ b/datasets/egfr/scripts/process_interactome.py @@ -6,7 +6,7 @@ def main(): interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep='\t') interactome_df['Direction'] = 'U' - + (egfr_directory / 'processed').mkdir(exist_ok=True) interactome_df.to_csv(egfr_directory / 'processed' / 'interactome.tsv', index=False, header=False, sep='\t') diff --git a/datasets/egfr/scripts/process_prizes.py b/datasets/egfr/scripts/process_prizes.py new file mode 100644 index 00000000..f66635cc --- /dev/null +++ b/datasets/egfr/scripts/process_prizes.py @@ -0,0 +1,15 @@ +import pandas +from pathlib import Path + +egfr_directory = Path(__file__).parent.resolve() / '..' 
+ +def main(): + prizes = pandas.read_csv( + egfr_directory / 'raw' / 'egfr-prizes.txt', sep='\t', + header=None, names=['NODEID', 'prize'] + ) + prizes['active'] = 'True' + prizes.to_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', index=False, sep='\t') + +if __name__ == "__main__": + main() diff --git a/run_snakemake.sh b/run_snakemake.sh index 24305244..137fff2f 100755 --- a/run_snakemake.sh +++ b/run_snakemake.sh @@ -18,6 +18,7 @@ main() { uv run snakemake --cores 1 -d datasets/diseases -s datasets/diseases/Snakefile uv run snakemake --cores 1 -d datasets/rn-muscle-skeletal -s datasets/rn-muscle-skeletal/Snakefile uv run snakemake --cores 1 -d datasets/depmap -s datasets/depmap/Snakefile + uv run snakemake --cores 1 -d datasets/egfr -s datasets/egfr/Snakefile } main "$@" From d0a9205ea21fd9e50f078d617126848b196ba46e Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 19 Feb 2026 00:26:02 +0000 Subject: [PATCH 03/19] style: fmt --- datasets/egfr/scripts/map_ensembl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py index f8396cb1..fd308a72 100644 --- a/datasets/egfr/scripts/map_ensembl.py +++ b/datasets/egfr/scripts/map_ensembl.py @@ -12,7 +12,7 @@ def main(): # We grab our UniProt <-> ENSP mapping idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') - + # and map the nodes idmapping_nodes_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') idmapping_nodes_df = idmapping_nodes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl']) From f68b7861e4da6c224a4ba76f0ad66c863c58f35b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Thu, 19 Feb 2026 03:34:06 +0000 Subject: [PATCH 04/19] fix: strip interactome prefix --- datasets/egfr/scripts/process_interactome.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py index 32c6b8bd..e0ab02c7 100644 --- a/datasets/egfr/scripts/process_interactome.py +++ b/datasets/egfr/scripts/process_interactome.py @@ -4,7 +4,9 @@ egfr_directory = Path(__file__).parent.resolve() / '..' def main(): - interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep='\t') + interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep=' ') + interactome_df['protein1'] = interactome_df['protein1'].astype(str).str.removeprefix("9606.") + interactome_df['protein2'] = interactome_df['protein2'].astype(str).str.removeprefix("9606.") interactome_df['Direction'] = 'U' (egfr_directory / 'processed').mkdir(exist_ok=True) From af335c841ba1d11c41f5a61c7edca23d491703bc Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 19 Feb 2026 04:32:09 +0000 Subject: [PATCH 05/19] chore: add dummy column --- datasets/egfr/scripts/process_prizes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datasets/egfr/scripts/process_prizes.py b/datasets/egfr/scripts/process_prizes.py index f66635cc..33549fed 100644 --- a/datasets/egfr/scripts/process_prizes.py +++ b/datasets/egfr/scripts/process_prizes.py @@ -9,6 +9,8 @@ def main(): header=None, names=['NODEID', 'prize'] ) prizes['active'] = 'True' + prizes['dummy'] = 'True' + prizes.to_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', index=False, sep='\t') if __name__ == "__main__": From 7a3f79a908c67936b2361b7596e486c851308a1b Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Thu, 19 Feb 2026 08:12:47 +0000 Subject: [PATCH 06/19] chore: update spras, docs, unpin commit --- cache/directory.py | 2 +- configs/dmmm.yaml | 119 ++++++++++++------------ configs/pra.yaml | 27 +++--- datasets/egfr/README.md | 18 ++++ datasets/egfr/scripts/map_ensembl.py | 2 +- datasets/egfr/scripts/process_prizes.py | 12 ++- spras | 2 +- 7 files changed, 105 insertions(+), 77 deletions(-) create mode 100644 datasets/egfr/README.md diff --git a/cache/directory.py b/cache/directory.py index 23f6d4f4..14373572 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -217,7 +217,7 @@ def download(self, output: str | PathLike): "EGFR": { "eight-egfr-reference-all.txt": CacheItem( name="EGFR Gold Standard Reference", - online="https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt", + online="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt", cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" ), "egfr-prizes.txt": CacheItem( diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index 7c6e622d..1086e722 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -25,81 +25,82 @@ analysis: # Custom settings algorithms: - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: - b: [2] - w: [.5] - d: [10] - mu: [2] + b: 2 + w: .5 + d: 10 + mu: 2 + # TODO: egfr prefers dummy_mode: ["file"] since we manually specify EGF_HUMAN as one. 
- name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: - b: [4] - g: [0] + b: 4 + g: 0 datasets: # TODO: use old paramaters for datasets # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml - - label: dmmmhiv_060 - node_files: ["processed_prize_060.txt"] - edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] - other_files: [] - data_dir: "datasets/hiv/processed" - - label: dmmmhiv_05 - node_files: ["processed_prize_05.txt"] - edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] - other_files: [] - data_dir: "datasets/hiv/processed" - # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml - - label: dmmmyeast - node_files: ["prizes1_dummies.txt"] - edge_files: ["network1.txt"] - other_files: [] - data_dir: "datasets/yeast-osmotic-stress/processed" - - label: dmmmdiseases_alopecia_areata - data_dir: datasets/diseases - edge_files: - - raw/string_interactome.txt - node_files: - - prize_files/alopecia_areata_prizes.txt - other_files: [] - - label: dmmmdiseases_diabetes_mellitus - data_dir: datasets/diseases - edge_files: - - raw/string_interactome.txt - node_files: - - prize_files/diabetes_mellitus_prizes.txt - other_files: [] - - label: dmmmdepmap_cellline_fadu - data_dir: datasets/depmap - edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] - node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] - other_files: [] + # - label: dmmmhiv_060 + # node_files: ["processed_prize_060.txt"] + # edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] + # other_files: [] + # data_dir: "datasets/hiv/processed" + # - label: dmmmhiv_05 + # node_files: ["processed_prize_05.txt"] + # edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] + # other_files: [] + # data_dir: "datasets/hiv/processed" + # # Yeast: 
https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml + # - label: dmmmyeast + # node_files: ["prizes1_dummies.txt"] + # edge_files: ["network1.txt"] + # other_files: [] + # data_dir: "datasets/yeast-osmotic-stress/processed" + # - label: dmmmdiseases_alopecia_areata + # data_dir: datasets/diseases + # edge_files: + # - raw/string_interactome.txt + # node_files: + # - prize_files/alopecia_areata_prizes.txt + # other_files: [] + # - label: dmmmdiseases_diabetes_mellitus + # data_dir: datasets/diseases + # edge_files: + # - raw/string_interactome.txt + # node_files: + # - prize_files/diabetes_mellitus_prizes.txt + # other_files: [] + # - label: dmmmdepmap_cellline_fadu + # data_dir: datasets/depmap + # edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] + # node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] + # other_files: [] - label: dmmmegfr_string data_dir: datasets/egfr edge_files: ["processed/interactome.tsv"] - node_files: ["processed/prizes-uniprot.txt"] + node_files: ["processed/prizes.txt"] other_files: [] - label: dmmmegfr_irefindex data_dir: datasets/egfr edge_files: ["processed/phosphosite-irefindex13.0-uniprot.txt"] - node_files: ["processed/prizes.txt"] + node_files: ["processed/prizes-uniprot.txt"] other_files: [] gold_standards: - - label: gs0 - node_files: ['GS_files/Alopecia_areata_GS.txt'] - data_dir: "datasets/diseases" - dataset_labels: ["dmmmdiseases_alopecia_areata"] - - label: gs1 - node_files: ['GS_files/Diabetes_mellitus_GS.txt'] - data_dir: "datasets/diseases" - dataset_labels: ["dmmmdiseases_diabetes_mellitus"] - - label: gs_fadu - node_files: ["processed/FADU_gold_standard.txt"] - data_dir: datasets/depmap - dataset_labels: ["dmmmdepmap_cellline_fadu"] + # - label: gs0 + # node_files: ['GS_files/Alopecia_areata_GS.txt'] + # data_dir: "datasets/diseases" + # dataset_labels: ["dmmmdiseases_alopecia_areata"] + # - label: gs1 + # node_files: 
['GS_files/Diabetes_mellitus_GS.txt'] + # data_dir: "datasets/diseases" + # dataset_labels: ["dmmmdiseases_diabetes_mellitus"] + # - label: gs_fadu + # node_files: ["processed/FADU_gold_standard.txt"] + # data_dir: datasets/depmap + # dataset_labels: ["dmmmdepmap_cellline_fadu"] - label: gs_egfr_string node_files: ["processed/gold-standard-nodes.txt"] data_dir: datasets/egfr diff --git a/configs/pra.yaml b/configs/pra.yaml index 76f14a3c..3ad77733 100644 --- a/configs/pra.yaml +++ b/configs/pra.yaml @@ -26,27 +26,26 @@ analysis: # Custom settings algorithms: - name: "omicsintegrator1" - params: - include: true + include: true + runs: run1: - b: [2] - w: [.5] - d: [10] - mu: [2] + b: 2 + w: .5 + d: 10 + mu: 2 - name: "omicsintegrator2" - params: - include: true + include: true + runs: run1: - b: [4] - g: [0] + b: 4 + g: 0 - name: "pathlinker" - params: - include: true + include: true + runs: run1: k: [10, 20] - name: "allpairs" - params: - include: true + include: true datasets: - label: prarn_muscleskeletal2018 diff --git a/datasets/egfr/README.md b/datasets/egfr/README.md new file mode 100644 index 00000000..30398dee --- /dev/null +++ b/datasets/egfr/README.md @@ -0,0 +1,18 @@ +# EGFR + +EGFR dataset. This dataset does a lot less processing for raw files, and is mainly focused on creating the new STRING-based interactome. + +This data is from [_Synthesizing Signaling Pathways from Temporal Phosphoproteomic Data_](https://doi.org/10.1016/j.celrep.2018.08.085). + +## Overview + +This produces two sets of files: one based on the iRefIndex/PhosphoSite directed interactome of closed-source origin based off of UniProt identifiers, and another one based off of the more updated though undirected STRING interactome. 
+ +## Scripts + +- `process_prizes.py`: produces a `prizes-uniprot.txt` from +[egfr-prizes.txt](https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt), +trimming psuedonodes and manually injecting the `EGF_HUMAN` receptor as a dummy node for OmicsIntegrator1. +- `process_interactome.py`: Produces the STRING `interactome.tsv` file from the STRING links file. Note that the `phosphosite-irefindex13.0-uniprot.txt` is a magic (as in with closed-source origin) directed interactome produced with a combination of the now archived iRefIndex v13 interactome with extra PhosphoSite-provided nodes +- `process_gold_standard.py`: Produces the `gold-standard-nodes-uniprot.txt` file from the [EGFR prize file](https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt) from the above paper. +- `map_ensembl.py`: Maps UniProt identifiers to STRING identifiers for the STRING-based data. diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py index fd308a72..89913a07 100644 --- a/datasets/egfr/scripts/map_ensembl.py +++ b/datasets/egfr/scripts/map_ensembl.py @@ -25,7 +25,7 @@ def main(): idmapping_prizes_df = idmapping_prizes_df.drop(columns=['UniProtKB-ID', 'UniProtKB-AC', 'Ensembl', 'NODEID']) idmapping_prizes_df = idmapping_prizes_df[~idmapping_prizes_df['Ensembl_PRO'].isna()] idmapping_prizes_df = idmapping_prizes_df.rename(columns={'Ensembl_PRO': 'NODEID'}) - idmapping_prizes_df = idmapping_prizes_df[["NODEID", "prize"]] + idmapping_prizes_df = idmapping_prizes_df[["NODEID", "prize", "active", "dummy", "source"]] idmapping_prizes_df.to_csv(egfr_directory / 'processed' / 'prizes.txt', sep='\t', index=False) if __name__ == "__main__": diff --git a/datasets/egfr/scripts/process_prizes.py b/datasets/egfr/scripts/process_prizes.py index 33549fed..8763ac58 100644 --- a/datasets/egfr/scripts/process_prizes.py +++ 
b/datasets/egfr/scripts/process_prizes.py @@ -8,8 +8,18 @@ def main(): egfr_directory / 'raw' / 'egfr-prizes.txt', sep='\t', header=None, names=['NODEID', 'prize'] ) + prizes = prizes.loc[~prizes['NODEID'].str.endswith('_PSEUDONODE')] + # TODO: prize: 10 is a magic value. + prizes = pandas.concat( + [prizes, pandas.DataFrame({ + 'NODEID': ['EGF_HUMAN'], + 'prize': [10], + 'dummy': ['True'], + 'source': ['True'] + })], + ignore_index=True + ) prizes['active'] = 'True' - prizes['dummy'] = 'True' prizes.to_csv(egfr_directory / 'processed' / 'prizes-uniprot.txt', index=False, sep='\t') diff --git a/spras b/spras index cd01e67e..18f2cf84 160000 --- a/spras +++ b/spras @@ -1 +1 @@ -Subproject commit cd01e67ea24f1817ba469335dfacb875ba2412bb +Subproject commit 18f2cf84cfac034b2962f47434d3f900288b6a97 From 69c6a8646a58a62cbb3ed92a05ca614c75a148dd Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 19 Feb 2026 08:20:07 +0000 Subject: [PATCH 07/19] chore: uncomment dmmm whoops --- configs/dmmm.yaml | 94 +++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index 1086e722..81535a42 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -43,41 +43,41 @@ algorithms: datasets: # TODO: use old paramaters for datasets # HIV: https://github.com/Reed-CompBio/spras-benchmarking/blob/0293ae4dc0be59502fac06b42cfd9796a4b4413e/hiv-benchmarking/spras-config/config.yaml - # - label: dmmmhiv_060 - # node_files: ["processed_prize_060.txt"] - # edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] - # other_files: [] - # data_dir: "datasets/hiv/processed" - # - label: dmmmhiv_05 - # node_files: ["processed_prize_05.txt"] - # edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] - # other_files: [] - # data_dir: "datasets/hiv/processed" - # # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml - # - 
label: dmmmyeast - # node_files: ["prizes1_dummies.txt"] - # edge_files: ["network1.txt"] - # other_files: [] - # data_dir: "datasets/yeast-osmotic-stress/processed" - # - label: dmmmdiseases_alopecia_areata - # data_dir: datasets/diseases - # edge_files: - # - raw/string_interactome.txt - # node_files: - # - prize_files/alopecia_areata_prizes.txt - # other_files: [] - # - label: dmmmdiseases_diabetes_mellitus - # data_dir: datasets/diseases - # edge_files: - # - raw/string_interactome.txt - # node_files: - # - prize_files/diabetes_mellitus_prizes.txt - # other_files: [] - # - label: dmmmdepmap_cellline_fadu - # data_dir: datasets/depmap - # edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] - # node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"] - # other_files: [] + - label: dmmmhiv_060 + node_files: ["processed_prize_060.txt"] + edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] + other_files: [] + data_dir: "datasets/hiv/processed" + - label: dmmmhiv_05 + node_files: ["processed_prize_05.txt"] + edge_files: ["../raw/phosphosite-irefindex13.0-uniprot.txt"] + other_files: [] + data_dir: "datasets/hiv/processed" + # Yeast: https://github.com/tristan-f-r/spras-benchmarking/blob/9477d85871024a5e3a4b0b8b9be7e78c0d0ee961/yeast-osmotic-stress/config.yaml + - label: dmmmyeast + node_files: ["prizes1_dummies.txt"] + edge_files: ["network1.txt"] + other_files: [] + data_dir: "datasets/yeast-osmotic-stress/processed" + - label: dmmmdiseases_alopecia_areata + data_dir: datasets/diseases + edge_files: + - raw/string_interactome.txt + node_files: + - prize_files/alopecia_areata_prizes.txt + other_files: [] + - label: dmmmdiseases_diabetes_mellitus + data_dir: datasets/diseases + edge_files: + - raw/string_interactome.txt + node_files: + - prize_files/diabetes_mellitus_prizes.txt + other_files: [] + - label: dmmmdepmap_cellline_fadu + data_dir: datasets/depmap + edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"] + node_files: 
["processed/FADU_cell_line_prizes_input_nonzero.txt"] + other_files: [] - label: dmmmegfr_string data_dir: datasets/egfr edge_files: ["processed/interactome.tsv"] @@ -89,18 +89,18 @@ datasets: node_files: ["processed/prizes-uniprot.txt"] other_files: [] gold_standards: - # - label: gs0 - # node_files: ['GS_files/Alopecia_areata_GS.txt'] - # data_dir: "datasets/diseases" - # dataset_labels: ["dmmmdiseases_alopecia_areata"] - # - label: gs1 - # node_files: ['GS_files/Diabetes_mellitus_GS.txt'] - # data_dir: "datasets/diseases" - # dataset_labels: ["dmmmdiseases_diabetes_mellitus"] - # - label: gs_fadu - # node_files: ["processed/FADU_gold_standard.txt"] - # data_dir: datasets/depmap - # dataset_labels: ["dmmmdepmap_cellline_fadu"] + - label: gs0 + node_files: ['GS_files/Alopecia_areata_GS.txt'] + data_dir: "datasets/diseases" + dataset_labels: ["dmmmdiseases_alopecia_areata"] + - label: gs1 + node_files: ['GS_files/Diabetes_mellitus_GS.txt'] + data_dir: "datasets/diseases" + dataset_labels: ["dmmmdiseases_diabetes_mellitus"] + - label: gs_fadu + node_files: ["processed/FADU_gold_standard.txt"] + data_dir: datasets/depmap + dataset_labels: ["dmmmdepmap_cellline_fadu"] - label: gs_egfr_string node_files: ["processed/gold-standard-nodes.txt"] data_dir: datasets/egfr From 5483cd4aff7272d6cdb218d06327d276676edf2d Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 20 Feb 2026 09:58:51 +0000 Subject: [PATCH 08/19] fix: do string interactome trimming! No trim.py :/ --- datasets/egfr/scripts/map_ensembl.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datasets/egfr/scripts/map_ensembl.py b/datasets/egfr/scripts/map_ensembl.py index 89913a07..d8488012 100644 --- a/datasets/egfr/scripts/map_ensembl.py +++ b/datasets/egfr/scripts/map_ensembl.py @@ -5,6 +5,12 @@ egfr_directory = Path(__file__).parent.resolve() / '..' 
def main(): + # We get specifically the STRING nodes, as the mapping from UniProt overeagerly maps + string_nodes = pandas.read_csv( + egfr_directory / 'processed' / 'interactome.tsv', + header=None, sep='\t', names=['Interactor1', 'Interactor2', 'Weight', 'Direction']) + interactor_series = pandas.concat([string_nodes['Interactor1'], string_nodes['Interactor2']], ignore_index=True) + # Re-read the uniprot nodes from `process_gold_standard.py` nodes = (egfr_directory / 'processed' / 'gold-standard-nodes-uniprot.txt').read_text().splitlines() # and the prizes from `process_prizes.py` @@ -12,6 +18,8 @@ def main(): # We grab our UniProt <-> ENSP mapping idmapping_df = idmapping_uniprot_mapping(egfr_directory / 'raw' / 'HUMAN_9606_idmapping_selected.tsv') + # Trim it with the interactor series + idmapping_df = idmapping_df[idmapping_df["Ensembl_PRO"].isin(interactor_series)] # and map the nodes idmapping_nodes_df = pandas.DataFrame(nodes, columns=['UniProtKB-ID']).merge(idmapping_df, on='UniProtKB-ID', how='left') From d5096ebf57c0074dcf61e067ad84151dfd1ef560 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 20 Feb 2026 10:08:20 +0000 Subject: [PATCH 09/19] fix: add interactome to map_ensembl input --- datasets/egfr/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile index 756c3d2e..6c4eefbd 100644 --- a/datasets/egfr/Snakefile +++ b/datasets/egfr/Snakefile @@ -38,6 +38,7 @@ rule process_prizes: rule map_ensembl: input: "raw/HUMAN_9606_idmapping_selected.tsv", + "processed/interactome.tsv", "processed/prizes-uniprot.txt", "processed/gold-standard-nodes-uniprot.txt", From 9ef2a3fbb0ce9796181f9d66296b1d7c04ef1335 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Fri, 20 Feb 2026 22:32:47 +0000 Subject: [PATCH 10/19] chore: add Service with proper header handling --- cache/__init__.py | 4 +- cache/directory.py | 114 ++++++++++++++++++++++++--------------------- 2 files changed, 62 insertions(+), 56 deletions(-) diff --git a/cache/__init__.py b/cache/__init__.py index 2f15fe4d..9e48cf44 100644 --- a/cache/__init__.py +++ b/cache/__init__.py @@ -67,9 +67,9 @@ def link(output: str, directive: list[str], uncompress=False): Path(output).unlink(missing_ok=True) - # Re-download if the directive has expired. + # Re-download if the file doesn't exist or the directive has expired. cache_item = get_cache_item(directive) - if has_expired(directive): + if not (artifacts_dir / artifact_name).exists() or has_expired(directive): (artifacts_dir / artifact_name).unlink(missing_ok=True) cache_item.download(artifacts_dir / artifact_name) diff --git a/cache/directory.py b/cache/directory.py index 14373572..459ddabc 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -2,29 +2,47 @@ from typing import Union from os import PathLike from tempfile import NamedTemporaryFile -from typing import Optional -import urllib.request +from typing import Optional, Mapping import filecmp -import urllib.parse -import os from pathlib import Path +import warnings +import requests +import shutil +import urllib.parse +import pydantic import gdown -dir_path = Path(os.path.dirname(os.path.realpath(__file__))) +dir_path = Path(__file__).parent.resolve() -def fetch_biomart_url(xml: str) -> str: +@dataclass +class Service: + url: str + headers: Optional[Mapping[str, str]] = None + + def download(self, output: str | PathLike) -> requests.Response: + """ + Downloads a URL, returning the response (to be used with `with`) and modifying the output path. + """ + # As per https://stackoverflow.com/a/39217788/7589775 to enable download streaming. 
+ with requests.get(self.url, stream=True, headers=self.headers) as response: + response.raw.decode_content = True + with open(output, 'wb') as f: + shutil.copyfileobj(response.raw, f) + return response + + +def fetch_biomart_service(xml: str) -> Service: """ Access BioMart data through the BioMart REST API: https://useast.ensembl.org/info/data/biomart/biomart_restful.html#biomartxml """ ROOT = "http://www.ensembl.org/biomart/martservice?query=" - return ROOT + urllib.parse.quote_plus(xml) + return Service(ROOT + urllib.parse.quote_plus(xml)) -@dataclass -class CacheItem: +class CacheItem(pydantic.BaseModel): """ Class for differentriating between offline and online items in a cache. @@ -35,41 +53,29 @@ class CacheItem: name: str """The display name of the artifact, used for human-printing.""" cached: str - online: str - online_headers: Optional[list[tuple[str, str]]] = None + online: Optional[Service] = None @classmethod + @warnings.deprecated("Pending for removal after the CONTRIBUTING guide is updated.") def cache_only(cls, name: str, cached: str) -> "CacheItem": """Wrapper method to explicitly declare a CacheItem as cached only.""" - return cls(name=name, online=cached, cached="") - - def download_online(self, output: str | PathLike): - # https://stackoverflow.com/a/45313194/7589775: this is to add optional headers to requests. - # We remove the opener at the end by re-installing the default opener. 
- opener = urllib.request.build_opener() - if self.online_headers: - opener.addheaders = self.online_headers - urllib.request.install_opener(opener) - urllib.request.urlretrieve(self.online, output) - urllib.request.install_opener(urllib.request.build_opener()) + return cls(name=name, cached=cached, online=None) def download(self, output: str | PathLike): print(f"Fetching {self.name}...") - print(f"Downloading {self.online}...") - - if self.cached == "": - # From CacheItem.cached_only - # (gdown doesn't take in Paths for the output_file, so we must stringify it here) - gdown.download(self.online, str(output)) - return - - self.download_online(output) with NamedTemporaryFile() as cached_file: print(f"Downloading cache {self.cached}...") gdown.download(self.cached, cached_file) + + if self.online is None: + return + + print(f"Downloading {self.online}...") + self.online.download(output) + print("Checking that downloaded artifact matches with cached artifact...") - filecmp.cmp(output, cached_file.name) + assert filecmp.cmp(output, cached_file.name) CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]] @@ -81,12 +87,12 @@ def download(self, output: str | PathLike): "9606.protein.links.txt.gz": CacheItem( name="STRING 9606 protein links", cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", - online="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", + online=Service("http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz"), ), "9606.protein.aliases.txt.gz": CacheItem( name="STRING 9606 protein aliases", cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY", - online="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz", + online=Service("https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz"), ), } }, @@ -98,19 +104,19 @@ def download(self, output: str | 
PathLike): "SwissProt_9606.tsv": CacheItem( name="UniProt 9606 SwissProt genes", cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk", - online="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29", + online=Service("https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"), ), # idmapping FTP files. See the associated README: # https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README "HUMAN_9606_idmapping_selected.tab.gz": CacheItem( name="UniProt 9606 ID external database mapping", cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX", - online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz", + online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"), ), "HUMAN_9606_idmapping.dat.gz": CacheItem( name="UniProt 9606 internal id mapping", cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O", - online="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz", + online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"), ), } }, @@ -120,56 +126,56 @@ def download(self, output: str | PathLike): "tiga_gene-trait_stats.tsv": CacheItem( name="TIGA data", cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK", - online="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv", + online=Service("https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv"), ), 
"HumanDO.tsv": CacheItem( name="Disease ontology data", cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi", - online="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv", + online=Service("https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv"), ), "human_disease_textmining_filtered.tsv": CacheItem( name="DISEASES textmining channel", cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D", - online="https://download.jensenlab.org/human_disease_textmining_filtered.tsv", + online=Service("https://download.jensenlab.org/human_disease_textmining_filtered.tsv"), ), "human_disease_knowledge_filtered.tsv": CacheItem( name="DISEASES knowledge channel", cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld", - online="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv", + online=Service("https://download.jensenlab.org/human_disease_knowledge_filtered.tsv"), ), }, "BioMart": { "ensg-ensp.tsv": CacheItem( name="BioMart ENSG <-> ENSP mapping", cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL", - online=fetch_biomart_url((dir_path / "biomart" / "ensg-ensp.xml").read_text()), + online=fetch_biomart_service((dir_path / "biomart" / "ensg-ensp.xml").read_text()), ) }, "DepMap": { "OmicsProfiles.csv": CacheItem( name="DepMap omics metadata", cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads", + 
online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"), ), "CRISPRGeneDependency.csv": CacheItem( name="DepMap gene dependency probability estimates", cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"), ), "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem( name="DepMap genotyped matrix", cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"), ), "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem( name="DepMap model-level TPMs", cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads", + 
online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"), ), "OmicsCNGeneWGS.csv": CacheItem( name="DepMap gene-level copy number data", cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub", - online="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads", + online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"), ), }, "iRefIndex": { @@ -190,39 +196,39 @@ def download(self, output: str | PathLike): # The following files are from https://github.com/gitter-lab/osmotic-stress "prizes.txt": CacheItem( name="Osmotic Stress Prizes", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt"), cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg" ), "ChasmanNetwork-DirUndir.txt": CacheItem( name="Network Input", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt"), cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH" ), "dummy.txt": CacheItem( name="Dummy Nodes File", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt", + 
online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt"), cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU" ), "_edgeFreq.eda ": CacheItem( name="Case Study Omics Integrator Edge Frequencies", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda"), cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR" ), "goldStandardUnionDetailed.txt": CacheItem( name="Gold Standard Reference Pathways", - online="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt"), cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T" ), }, "EGFR": { "eight-egfr-reference-all.txt": CacheItem( name="EGFR Gold Standard Reference", - online="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt"), cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" ), "egfr-prizes.txt": CacheItem( name="EGFR prizes", - online="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt", + online=Service("https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt"), cached="https://drive.google.com/file/d/1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj/view?usp=sharing" ) }, From e4a9d35137bc6253d22562305a8c78398a3979fc Mon Sep 17 00:00:00 2001 From: "Tristan F." 
Date: Sun, 22 Feb 2026 00:44:51 +0000
Subject: [PATCH 11/19] feat(cache): pinned & unpinned files

---
 cache/.gitignore   |   1 +
 cache/README.md    |  28 +++++++++-
 cache/directory.py | 131 ++++++++++++++++++++++++++++++---------------
 pyproject.toml     |   1 +
 spras              |   2 +-
 uv.lock            |  26 ++++++++-
 6 files changed, 141 insertions(+), 48 deletions(-)

diff --git a/cache/.gitignore b/cache/.gitignore
index de153db3..9554b8c6 100644
--- a/cache/.gitignore
+++ b/cache/.gitignore
@@ -1 +1,2 @@
 artifacts
+logs
diff --git a/cache/README.md b/cache/README.md
index 4aea94bc..654aa829 100644
--- a/cache/README.md
+++ b/cache/README.md
@@ -1,7 +1,31 @@
-# cache
+# Cache
 
-Handles artifact fetching and cache. This folder has:
+Handles artifact fetching and caching. The point of this is to prevent any kind of
+data rot (for as long as SPRAS is maintained), and to ensure that continuous benchmarking uses the latest available data.
+During benchmarking runs, data is fetched from all provided URLs in `directory.py`, where we get the most current version of data,
+and compare it to our cached data to check if the data has changed at all.
+
+All entries are provided with this template:
+
+```py
+"file-name.ext": CacheItem(
+    name="Short File Description",
+    cached="https://drive.google.com/uc?id=...",
+    # Either-or
+    pinned=Service("..."),
+    unpinned=Service("..."),
+),
+```
+
+When a file is requested, `cached`, `pinned`, and `unpinned` are all downloaded:
+- If the URLs linking to `pinned` and `unpinned` do not succeed (i.e. do not return a 2XX status code), we fail.
+- If the URL linking to `pinned` does not match `cached`, we fail.
+- If the URL linking to `unpinned` does not match `cached`, we warn that the data needs updating.
+
+## Layout
+
+This folder has:
 - `Snakefile` which only contains a function used for producing fetching rules.
 - `directory.py`, the actual location of file URLs and their cached counterparts.
- `cli.py`, a utility for manually fetching specific URLs from `directory.py`.
diff --git a/cache/directory.py b/cache/directory.py
index 459ddabc..71f0ffa8 100644
--- a/cache/directory.py
+++ b/cache/directory.py
@@ -10,12 +10,15 @@
 import shutil
 import urllib.parse
 
-import pydantic
 import gdown
-
+from loguru import logger
 
 dir_path = Path(__file__).parent.resolve()
 
+# Our cache emits warnings for files with unpinned versions that don't match the cache.
+(dir_path / 'logs').mkdir(exist_ok=True)
+logger.add(dir_path / 'logs' / "cache.log")
+
 @dataclass
 class Service:
     url: str
@@ -31,7 +34,12 @@ def download(self, output: str | PathLike) -> requests.Response:
         with open(output, 'wb') as f:
             shutil.copyfileobj(response.raw, f)
         return response
-
+
+    @staticmethod
+    def coerce(obj: 'Service | str') -> 'Service':
+        # TODO: This could also be replaced by coercing str to Service in CacheItem via pydantic.
+        if isinstance(obj, str): return Service(url=obj)
+        else: return obj
 
 def fetch_biomart_service(xml: str) -> Service:
     """
@@ -41,49 +49,70 @@ def fetch_biomart_service(xml: str) -> Service:
     ROOT = "http://www.ensembl.org/biomart/martservice?query="
     return Service(ROOT + urllib.parse.quote_plus(xml))
 
-
-class CacheItem(pydantic.BaseModel):
+@dataclass
+class CacheItem:
     """
-    Class for differentriating between offline and online items in a cache.
-
-    NOTE: If cached is "", we assume that online is a Google Drive URL (for cases where there is no
-    remaining online data source.)
+    Class for differentiating between different ways of fetching data.
+    As mentioned in the ./README.md, `cached` is always needed, and we differentiate between service outage (`pinned`)
+    and data needing updates (`unpinned`). There is no need to specify both keys at once, but the choice does matter
+    for how errors are presented during benchmarking runs.
""" name: str """The display name of the artifact, used for human-printing.""" + cached: str - online: Optional[Service] = None + """ + The URL of the cached file, which is currently a Google Drive URL. + """ + + pinned: Optional[Service | str] = None + """ + The Service (URL + headers) of the file, which is the 'pinned' file. + By a pinned file, we say that the file has a dedicated version, and should not change. + If this is None, we go for the `unpinned` file or `cached` if `unpinned` is None. + """ + + unpinned: Optional[Service | str] = None + """ + Analogously to `pinned`, this is a Service (URL + headers) which is 'unpinned,' + or lacks a dedicated version. When `pinned` matches `cached` but `unpinned` doesn't match `pinned`, + we say that the file has a new version. + + If `pinned` is None and `unpinned` doesn't match `cached`, we warn instead of erroring. + + We will still error if the status code is not 2XX (a successful request). + """ @classmethod @warnings.deprecated("Pending for removal after the CONTRIBUTING guide is updated.") def cache_only(cls, name: str, cached: str) -> "CacheItem": """Wrapper method to explicitly declare a CacheItem as cached only.""" - return cls(name=name, cached=cached, online=None) + return cls(name=name, cached=cached) def download(self, output: str | PathLike): - print(f"Fetching {self.name}...") + logger.info(f"Fetching {self.name}...") with NamedTemporaryFile() as cached_file: - print(f"Downloading cache {self.cached}...") + logger.info(f"Downloading cache {self.cached}...") gdown.download(self.cached, cached_file) - if self.online is None: - return + if self.pinned is not None: + logger.info(f"Downloading pinned URL {self.pinned}...") + Service.coerce(self.pinned).download(output) - print(f"Downloading {self.online}...") - self.online.download(output) + logger.info("Checking that the downloaded pinned artifact matches with cached artifact...") + assert filecmp.cmp(output, cached_file.name) + + if self.unpinned is not 
None: + logger.info(f"Downloading unpinned URL {self.unpinned}...") + with NamedTemporaryFile() as unpinned_file: + Service.coerce(self.unpinned).download(unpinned_file.name) - print("Checking that downloaded artifact matches with cached artifact...") - assert filecmp.cmp(output, cached_file.name) + logger.info("Checking that the downloaded unpinned artifact matches with cached artifact...") + if not filecmp.cmp(unpinned_file.name, cached_file.name): + # This gets saved to a file. Search for `logger.add` for more info. + logger.warning(f"Unpinned file {self.unpinned} for {self.name} does not match cache - this source should be updated!") CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]] @@ -87,12 +124,12 @@ def download(self, output: str | PathLike): "9606.protein.links.txt.gz": CacheItem( name="STRING 9606 protein links", cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", - online=Service("http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz"), + pinned="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", ), "9606.protein.aliases.txt.gz": CacheItem( name="STRING 9606 protein aliases", cached="https://drive.google.com/uc?id=1IWrQeTVCcw1A-jDk-4YiReWLnwP0S9bY", - online=Service("https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz"), + pinned="https://stringdb-downloads.org/download/protein.aliases.v12.0/9606.protein.aliases.v12.0.txt.gz", ), } }, @@ -104,19 +141,19 @@ def download(self, output: str | PathLike): "SwissProt_9606.tsv": CacheItem( name="UniProt 9606 SwissProt genes", cached="https://drive.google.com/uc?id=1h2Cl-60qcKse-djcsqlRXm_n60mVY7lk", - online=Service("https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29"), + 
unpinned="https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29", ), # idmapping FTP files. See the associated README: # https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README "HUMAN_9606_idmapping_selected.tab.gz": CacheItem( name="UniProt 9606 ID external database mapping", cached="https://drive.google.com/uc?id=1Oysa5COq31H771rVeyrs-6KFhE3VJqoX", - online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz"), + unpinned="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz", ), "HUMAN_9606_idmapping.dat.gz": CacheItem( name="UniProt 9606 internal id mapping", cached="https://drive.google.com/uc?id=1lGxrx_kGyNdupwIOUXzfIZScc7rQKP-O", - online=Service("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz"), + unpinned="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz", ), } }, @@ -126,109 +163,115 @@ def download(self, output: str | PathLike): "tiga_gene-trait_stats.tsv": CacheItem( name="TIGA data", cached="https://drive.google.com/uc?id=114qyuNDy4qdmYDHHJAW-yBeTxcGTDUnK", - online=Service("https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv"), + pinned="https://unmtid-dbs.net/download/TIGA/20250916/tiga_gene-trait_stats.tsv", ), "HumanDO.tsv": CacheItem( name="Disease ontology data", cached="https://drive.google.com/uc?id=1lfB1DGJgrXTxP_50L6gGu_Nq6OyDjiIi", - online=Service("https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/016a4ec33d1a1508d669650086cd92ccebe138e6/DOreports/HumanDO.tsv"), + # DiseaseOntology is a decently updating repository! 
+ unpinned="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/refs/heads/main/DOreports/HumanDO.tsv", ), "human_disease_textmining_filtered.tsv": CacheItem( name="DISEASES textmining channel", cached="https://drive.google.com/uc?id=1vD8KbT9sk04VEJx9r3_LglCTGYJdhN0D", - online=Service("https://download.jensenlab.org/human_disease_textmining_filtered.tsv"), + unpinned="https://download.jensenlab.org/human_disease_textmining_filtered.tsv", ), "human_disease_knowledge_filtered.tsv": CacheItem( name="DISEASES knowledge channel", cached="https://drive.google.com/uc?id=1qGUnjVwF9-8p5xvp8_6CfVsbMSM_wkld", - online=Service("https://download.jensenlab.org/human_disease_knowledge_filtered.tsv"), + unpinned="https://download.jensenlab.org/human_disease_knowledge_filtered.tsv", ), }, "BioMart": { "ensg-ensp.tsv": CacheItem( name="BioMart ENSG <-> ENSP mapping", cached="https://drive.google.com/uc?id=1-gPrDoluXIGydzWKjWEnW-nWhYu3YkHL", - online=fetch_biomart_service((dir_path / "biomart" / "ensg-ensp.xml").read_text()), + unpinned=fetch_biomart_service((dir_path / "biomart" / "ensg-ensp.xml").read_text()), ) }, "DepMap": { "OmicsProfiles.csv": CacheItem( name="DepMap omics metadata", cached="https://drive.google.com/uc?id=1i54aKfO0Ci2QKLTNJnuQ_jgGhH4c9rTL", - online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F2025-05-01-master-mapping-table-28c2.12%2Fpublic_release_date.2025-05-01.master_mapping_table.csv&dl_name=OmicsProfiles.csv&bucket=depmap-external-downloads", ), "CRISPRGeneDependency.csv": CacheItem( name="DepMap gene dependency probability estimates", cached="https://drive.google.com/uc?id=122rWNqT_u3M7B_11WYZMtOLiPbBykkaz", - 
online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2F25q2-public-557c.3%2FCRISPRGeneDependency.csv&dl_name=CRISPRGeneDependency.csv&bucket=depmap-external-downloads", ), "OmicsSomaticMutationsMatrixDamaging.csv": CacheItem( name="DepMap genotyped matrix", cached="https://drive.google.com/uc?id=1W7N2H0Qi7NwmTmNChcwa2ZZ4WxAuz-Xh", - online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.87%2FOmicsSomaticMutationsMatrixDamaging.csv&dl_name=OmicsSomaticMutationsMatrixDamaging.csv&bucket=depmap-external-downloads", ), "OmicsExpressionProteinCodingGenesTPMLogp1.csv": CacheItem( name="DepMap model-level TPMs", cached="https://drive.google.com/uc?id=1P0m88eXJ8GPdru8h9oOcHPeXKU7ljIrP", - online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.73%2FOmicsExpressionProteinCodingGenesTPMLogp1.csv&dl_name=OmicsExpressionProteinCodingGenesTPMLogp1.csv&bucket=depmap-external-downloads", ), "OmicsCNGeneWGS.csv": CacheItem( name="DepMap gene-level copy number data", cached="https://drive.google.com/uc?id=1TPp3cfK7OZUrftucr3fLO-krXSQAA6Ub", - 
online=Service("https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads"), + pinned="https://depmap.org/portal/download/api/download?file_name=downloads-by-canonical-id%2Fpublic-25q2-c5ef.104%2FOmicsCNGeneWGS.csv&dl_name=OmicsCNGeneWGS.csv&bucket=depmap-external-downloads", ), }, "iRefIndex": { # This can also be obtained from the SPRAS repo # (https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt). # iRefIndex has been down for quite some time, so this is only from the cache. - "phosphosite-irefindex13.0-uniprot.txt": CacheItem.cache_only( + "phosphosite-irefindex13.0-uniprot.txt": CacheItem( name="iRefIndex v13.0 UniProt interactome", cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo" ) }, "OsmoticStress": { - "yeast_pcsf_network.sif": CacheItem.cache_only( + "yeast_pcsf_network.sif": CacheItem( # In the paper https://doi.org/10.1016/j.celrep.2018.08.085 name="Case Study Edge Results, from Supplementary Data 3", cached="https://drive.google.com/uc?id=1Agte0Aezext-8jLhGP4GmaF3tS7gHX-h" ), - # The following files are from https://github.com/gitter-lab/osmotic-stress + # The following files are from https://github.com/gitter-lab/osmotic-stress. + # While the following files do point to the repository's main branch, + # they aren't expected to actually change. 
"prizes.txt": CacheItem( name="Osmotic Stress Prizes", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/prizes.txt", cached="https://drive.google.com/uc?id=16WDQs0Vjv6rI12-hbifsbnpH31jMGhJg" ), "ChasmanNetwork-DirUndir.txt": CacheItem( name="Network Input", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/ChasmanNetwork-DirUndir.txt", cached="https://drive.google.com/uc?id=1qYXPaWcPU72YYME7NaBzD7thYCHRzrLH" ), "dummy.txt": CacheItem( name="Dummy Nodes File", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Input%20Data/dummy.txt", cached="https://drive.google.com/uc?id=1dsFIhBrIEahggg0JPxw64JwS51pKxoQU" ), "_edgeFreq.eda ": CacheItem( name="Case Study Omics Integrator Edge Frequencies", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/Notebooks/Forest-TPS/_edgeFreq.eda", cached="https://drive.google.com/uc?id=1M_rxEzUCo_EVuFyM47OEH2J-4LB3eeCR" ), "goldStandardUnionDetailed.txt": CacheItem( name="Gold Standard Reference Pathways", - online=Service("https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/osmotic-stress/refs/heads/master/data/evaluation/goldStandardUnionDetailed.txt", cached="https://drive.google.com/uc?id=1-_zF9oKFCNmJbDCC2vq8OM17HJw80s2T" 
), }, "EGFR": { + # The following files are from https://github.com/gitter-lab/tps. + # While the following files do point to the repository's main branch, + # they aren't expected to actually change. "eight-egfr-reference-all.txt": CacheItem( name="EGFR Gold Standard Reference", - online=Service("https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/resources/eight-egfr-reference-all.txt", cached="https://drive.google.com/uc?id=15MqpIbH1GRA1tq0ZXH9oMnKytoFSzXyw" ), "egfr-prizes.txt": CacheItem( name="EGFR prizes", - online=Service("https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt"), + pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt", cached="https://drive.google.com/file/d/1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj/view?usp=sharing" ) }, diff --git a/pyproject.toml b/pyproject.toml index 4df36774..b2112572 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ readme = "README.md" requires-python = ">=3.13" dependencies = [ "gdown>=5.2.0", + "loguru>=0.7.3", "more-itertools>=10.7.0", "networkx>=3.6.1", "pandas>=2.3.0", diff --git a/spras b/spras index 18f2cf84..479842d6 160000 --- a/spras +++ b/spras @@ -1 +1 @@ -Subproject commit 18f2cf84cfac034b2962f47434d3f900288b6a97 +Subproject commit 479842d6954f5df448f628259588c5a038e8efef diff --git a/uv.lock b/uv.lock index a00522cf..f3f25d5d 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.13" [[package]] @@ -310,6 +310,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" 
}, ] +[[package]] +name = "loguru" +version = "0.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "win32-setctime", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, +] + [[package]] name = "markupsafe" version = "3.0.2" @@ -846,6 +859,7 @@ version = "0.1.0" source = { editable = "." } dependencies = [ { name = "gdown" }, + { name = "loguru" }, { name = "more-itertools" }, { name = "networkx" }, { name = "pandas" }, @@ -860,6 +874,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "gdown", specifier = ">=5.2.0" }, + { name = "loguru", specifier = ">=0.7.3" }, { name = "more-itertools", specifier = ">=10.7.0" }, { name = "networkx", specifier = ">=3.6.1" }, { name = "pandas", specifier = ">=2.3.0" }, @@ -951,6 +966,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/06/04c8e804f813cf972e3262f3f8584c232de64f0cde9f703b46cf53a45090/virtualenv-20.34.0-py3-none-any.whl", hash = "sha256:341f5afa7eee943e4984a9207c025feedd768baff6753cd660c857ceb3e36026", size = 5983279, upload-time = "2025-08-13T14:24:05.111Z" }, ] +[[package]] +name = "win32-setctime" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = 
"sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, +] + [[package]] name = "wrapt" version = "1.17.3" From 2d7f2b1d39e6df9d03ee295124723e79beb29bc5 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sun, 22 Feb 2026 02:47:16 +0000 Subject: [PATCH 12/19] feat(cache): download_against_cache debugging --- cache/cli.py | 4 +- cache/directory.py | 94 ++++++++++++++------ datasets/diseases/Snakefile | 4 +- datasets/diseases/scripts/files.py | 2 +- datasets/egfr/Snakefile | 4 +- datasets/egfr/scripts/process_interactome.py | 4 +- 6 files changed, 76 insertions(+), 36 deletions(-) diff --git a/cache/cli.py b/cache/cli.py index ad82fb67..2e8d8201 100644 --- a/cache/cli.py +++ b/cache/cli.py @@ -3,7 +3,7 @@ This may be expanded in the future, so only depend on this file as a debugging utility. -For example, `python cache/cli.py KEGG/ko03250.xml ko03250.xm` allows running the KEGG query +For example, `python cache/cli.py KEGG/ko03250.xml ko03250.xml` allows running the KEGG query for ko03250.xml, which can not be normally accessed automatically in the browser. """ @@ -23,7 +23,7 @@ def main(): args = parse_args() cache_item = get_cache_item(args.path.split("/")) - cache_item.download_online(args.output) + cache_item.download(args.output) if __name__ == "__main__": main() diff --git a/cache/directory.py b/cache/directory.py index 71f0ffa8..e5c6c4fa 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -17,7 +17,10 @@ # Our cache emits warnings for files with unpinned versions that don't match the cache. 
(dir_path / 'logs').mkdir(exist_ok=True) -logger.add(dir_path / 'logs' / "cache.log") +logger.add(dir_path / 'logs' / "cache.log", level="WARNING") + +class DownloadFileCheckException(RuntimeError): + """See Service#download_against_cache for some motivation for this custom error""" @dataclass class Service: @@ -34,12 +37,51 @@ def download(self, output: str | PathLike) -> requests.Response: with open(output, 'wb') as f: shutil.copyfileobj(response.raw, f) return response - + + # NOTE: this is slightly yucky code deduplication. The only intended values of `downloaded_file_type` are `pinned` and `unpinned`. + def download_against_cache( + self, + cache: Path, + downloaded_file_type: str, + move_output: bool + ): + """ + Downloads `this` Service and checks it against the provided `cache` at path. In logs, + the file will be referred to as `downloaded_file_type`. + + @param move_output: Whether or not output should be irrecoverably moved instead of just copied. + """ + logger.info(f"Downloading {downloaded_file_type} file {self.url} to check against with artifact at {cache}...") + downloaded_file_path = Path(NamedTemporaryFile(delete=False).name) + + self.download(downloaded_file_path) + logger.info(f"Checking that the {downloaded_file_type} artifact {downloaded_file_path} matches with cached artifact at {cache}...") + + if not filecmp.cmp(cache, downloaded_file_path): + # This entire if-branch is debug shenanigans: we want to be able to easily compare our current cached file to the online file, + # especially since some `Service`s have special errors that can make the request hard to compare in the browser. + + debug_file_path = Path(NamedTemporaryFile(prefix="spras-benchmarking-debug-artifact", delete=False).name) + # We use shutil over Path#rename since temporary directories can be mounted to a different file system. 
+ if move_output: + shutil.move(cache, debug_file_path) + else: + shutil.copy(cache, debug_file_path) + # We use a custom error type to prevent any overlap with RuntimeError. I am not sure if there is any. + raise DownloadFileCheckException(f"The {downloaded_file_type} file {downloaded_file_path} and " + \ + f"cached file originally at {cache} do not match! " + \ + f"Compare the pinned {downloaded_file_path} and the cached {debug_file_path}.") + else: + # Since we don't clean up downloaded_file_path for the above branch's debugging, + # we need to clean it up here. + downloaded_file_path.unlink() + @staticmethod def coerce(obj: 'Service | str') -> 'Service': # TODO: This could also be replaced by coercing str to Service in CacheItem via pydantic. - if isinstance(obj, str): return Service(url=obj) - else: return obj + if isinstance(obj, str): + return Service(url=obj) + return obj def fetch_biomart_service(xml: str) -> Service: """ @@ -93,38 +135,29 @@ def cache_only(cls, name: str, cached: str) -> "CacheItem": def download(self, output: str | PathLike): logger.info(f"Fetching {self.name}...") - with NamedTemporaryFile() as cached_file: - logger.info(f"Downloading cache {self.cached}...") - gdown.download(self.cached, cached_file) - - if self.pinned is not None: - logger.info(f"Downloading pinned URL {self.pinned}...") - Service.coerce(self.pinned).download(output) - - logger.info("Checking that the downloaded pinned artifact matches with cached artifact...") - assert filecmp.cmp(output, cached_file.name) - - if self.unpinned is not None: - logger.info(f"Downloading unpinned URL {self.unpinned}...") - with NamedTemporaryFile() as unpinned_file: - Service.coerce(self.unpinned).download(unpinned_file.name) - - logger.info("Checking that the downloaded unpinned artifact matches with cached artifact...") - if not filecmp.cmp(unpinned_file.name, cached_file.name): - # This gets saved to a file. Search for `logger.add` for more info. 
- logger.warning(f"Unpinned file {self.unpinned} for {self.name} does not match cache - this source should be updated!") + logger.info(f"Downloading cache {self.cached} to {output}...") + gdown.download(self.cached, str(output)) # gdown doesn't have a type signature, but it expects a string :/ + if self.pinned is not None: + Service.coerce(self.pinned).download_against_cache(cache=Path(output), downloaded_file_type="pinned", move_output=True) + if self.unpinned is not None: + # Normally, download_against_cache raises a DownloadFileCheckException: we catch it and warn instead if that happens. + try: + Service.coerce(self.unpinned).download_against_cache(cache=Path(output), downloaded_file_type="unpinned", move_output=False) + except DownloadFileCheckException as err: + logger.warning(err) + # TODO: yikes! same with self.unpinned CacheDirectory = dict[str, Union[CacheItem, "CacheDirectory"]] # An *unversioned* directory list. directory: CacheDirectory = { "STRING": { "9606": { - "9606.protein.links.txt.gz": CacheItem( - name="STRING 9606 protein links", + "9606.protein.links.full.txt.gz": CacheItem( + name="STRING 9606 full protein links", cached="https://drive.google.com/uc?id=13tE_-A6g7McZs_lZGz9As7iE-5cBFvqE", - pinned="http://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", + pinned="http://stringdb-downloads.org/download/protein.links.full.v12.0/9606.protein.links.full.v12.0.txt.gz", ), "9606.protein.aliases.txt.gz": CacheItem( name="STRING 9606 protein aliases", @@ -272,7 +305,7 @@ def download(self, output: str | PathLike): "egfr-prizes.txt": CacheItem( name="EGFR prizes", pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt", - cached="https://drive.google.com/file/d/1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj/view?usp=sharing" + cached="https://drive.google.com/uc?id=1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj" ) }, } @@ -291,4 +324,9 @@ def get_cache_item(path: list[str]) -> CacheItem: if not 
isinstance(current_item, CacheItem): raise ValueError(f"Path {path} doesn't lead to a cache item") + # Google Drive validation. TODO: remove if move to OSDF. + if "uc?id=" not in current_item.cached or "/view?usp=sharing" in current_item.cached: + raise RuntimeError("Make sure your Google Drive URLs are in https://drive.google.com/uc?id=... format " + \ + "with no /view?usp=sharing at the end. See CONTRIBUTING.md for more info.") + return current_item diff --git a/datasets/diseases/Snakefile b/datasets/diseases/Snakefile index 0455b57a..93de50d5 100644 --- a/datasets/diseases/Snakefile +++ b/datasets/diseases/Snakefile @@ -13,7 +13,7 @@ produce_fetch_rules({ "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"], "raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"], "raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"], - "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), + "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True), "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True), }) @@ -42,7 +42,7 @@ rule files: input: "data/inputs.csv", "data/gold_standard.csv", - "raw/9606.protein.links.txt" + "raw/9606.protein.links.full.txt" output: # These are the two we use for the SPRAS run for now "GS_files/Alopecia_areata_GS.txt", diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py index dc5a949b..f8704461 100644 --- a/datasets/diseases/scripts/files.py +++ b/datasets/diseases/scripts/files.py @@ -42,7 +42,7 @@ def main(): # See /cache/directory.py for information on how this was grabbed. # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. 
- string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None) + string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ", skiprows=[0], header=None) # Threshold anything above a confidence score of 900 to trim down the background interactome string = string[string.iloc[:, 2] > 900] diff --git a/datasets/egfr/Snakefile b/datasets/egfr/Snakefile index 6c4eefbd..f9af5435 100644 --- a/datasets/egfr/Snakefile +++ b/datasets/egfr/Snakefile @@ -15,7 +15,7 @@ rule all: produce_fetch_rules({ "raw/eight-egfr-reference-all.txt": ["EGFR", "eight-egfr-reference-all.txt"], "raw/egfr-prizes.txt": ["EGFR", "egfr-prizes.txt"], - "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True), + "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True), "raw/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True), "processed/phosphosite-irefindex13.0-uniprot.txt": ["iRefIndex", "phosphosite-irefindex13.0-uniprot.txt"] }) @@ -26,7 +26,7 @@ rule process_gold_standard: shell: "uv run scripts/process_gold_standard.py" rule process_interactome: - input: "raw/9606.protein.links.txt" + input: "raw/9606.protein.links.full.txt" output: "processed/interactome.tsv" shell: "uv run scripts/process_interactome.py" diff --git a/datasets/egfr/scripts/process_interactome.py b/datasets/egfr/scripts/process_interactome.py index e0ab02c7..3e1c8cd2 100644 --- a/datasets/egfr/scripts/process_interactome.py +++ b/datasets/egfr/scripts/process_interactome.py @@ -4,9 +4,11 @@ egfr_directory = Path(__file__).parent.resolve() / '..' 
def main(): - interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.txt', sep=' ') + interactome_df = pandas.read_csv(egfr_directory / 'raw' / '9606.protein.links.full.txt', sep=' ') interactome_df['protein1'] = interactome_df['protein1'].astype(str).str.removeprefix("9606.") interactome_df['protein2'] = interactome_df['protein2'].astype(str).str.removeprefix("9606.") + # Since this is links.full vs links, we need to restrict to a subset of headers before saving the interactome. + interactome_df = interactome_df[["protein1", "protein2", "combined_score"]] interactome_df['Direction'] = 'U' (egfr_directory / 'processed').mkdir(exist_ok=True) From 117c4593aaf879f5b741645f9507f0e4b1cc2c76 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sun, 22 Feb 2026 08:09:03 +0000 Subject: [PATCH 13/19] fix: use new api for hiv --- cache/directory.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cache/directory.py b/cache/directory.py index cf262752..f18a771e 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -254,20 +254,24 @@ def download(self, output: str | PathLike): "ko03250.xml": CacheItem( name="KEGG 03250", cached="https://drive.google.com/uc?id=16dtWKHCQMp2qrLfFDE7nVhbwBCr2H5a9", - online="https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml", - online_headers = [('Referer', 'https://www.kegg.jp/pathway/ko03250')], + unpinned=Service( + "https://www.kegg.jp/kegg-bin/download?entry=ko03250&format=kgml", + headers={'Referer': 'https://www.kegg.jp/pathway/ko03250'}) ) }, "HIV1": { + # The following files are from https://github.com/gitter-lab/hiv1-aurkb. + # While the following files do point to the repository's main branch, + # they aren't expected to actually change. 
"prize_05.tsv": CacheItem( name="HIV_05 prizes", cached="https://drive.google.com/uc?id=1jVWNRPfYkbqimO44GdzXYB3-7NXhet1m", - online="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/ac9278d447e4188eea3bf4b24c4c4e0c19b0c6d9/Results/base_analysis/prize_05.csv" + pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_05.csv" ), "prize_060.tsv": CacheItem( name="HIV_060 prizes", cached="https://drive.google.com/uc?id=1Aucgp7pcooGr9oT4m2bvYEuYW6186WxQ", - online="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/ac9278d447e4188eea3bf4b24c4c4e0c19b0c6d9/Results/base_analysis/prize_060.csv" + pinned="https://raw.githubusercontent.com/gitter-lab/hiv1-aurkb/refs/heads/main/Results/base_analysis/prize_060.csv" ) }, "iRefIndex": { From 2cc1019fed6a3fab639eb69643b08286429aeba9 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sun, 22 Feb 2026 11:20:52 +0000 Subject: [PATCH 14/19] feat: use self-looping iRefIndex interactome --- cache/directory.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cache/directory.py b/cache/directory.py index f18a771e..38a58635 100644 --- a/cache/directory.py +++ b/cache/directory.py @@ -275,12 +275,15 @@ def download(self, output: str | PathLike): ) }, "iRefIndex": { - # This can also be obtained from the SPRAS repo + # This can also be obtained from the SPRAS repo, though the SPRAS repo removes self loops. We don't. # (https://github.com/Reed-CompBio/spras/blob/b5d7a2499afa8eab14c60ce0f99fa7e8a23a2c64/input/phosphosite-irefindex13.0-uniprot.txt). - # iRefIndex has been down for quite some time, so this is only from the cache. + # iRefIndex has been down for quite some time, so we grab this from a repository instead. + # While the following files do point to the repository's main branch, + # they aren't expected to actually change, so we make them `pinned`. 
"phosphosite-irefindex13.0-uniprot.txt": CacheItem( name="iRefIndex v13.0 UniProt interactome", - cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo" + cached="https://drive.google.com/uc?id=1fQ8Z3FjEwUseEtsExO723zj7mAAtdomo", + pinned="https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/networks/phosphosite-irefindex13.0-uniprot.txt" ) }, "OsmoticStress": { From ab050fb79273d6a0d64355511fc781ec2d7ca14f Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 23 Feb 2026 01:35:39 +0000 Subject: [PATCH 15/19] fix: diseases links handling --- README.md | 2 +- datasets/diseases/scripts/files.py | 21 ++++++++++----------- datasets/diseases/scripts/gold_standard.py | 4 +--- datasets/diseases/scripts/inputs.py | 4 +--- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index c98180a6..12a6e84b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ pip install ./spras To run the postprocess output scripts, we have a `pyproject.toml` which can be used with your desired python package manager. This separates the `spras` conda environment from the small scripts we have. (on CI, we use [`uv`](https://docs.astral.sh/uv/).) 
-To run the benchmarking pipeline, use: +To run the benchmarking pipeline, use (this example is specifically for disease module mining): ```sh snakemake --cores 1 --configfile configs/dmmm.yaml --show-failed-logs -s spras/Snakefile diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py index f8704461..9f810131 100644 --- a/datasets/diseases/scripts/files.py +++ b/datasets/diseases/scripts/files.py @@ -1,9 +1,7 @@ import pandas as pd from pathlib import Path -import os -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) +dir_path = Path(__file__).parent.resolve() diseases_path = Path(dir_path, "..") (diseases_path / "prize_files").mkdir(exist_ok=True, parents=True) @@ -16,7 +14,7 @@ def main(): GS_string_df = GS_string_df[GS_string_df["diseaseID"].isin(tiga_string_df["id"])] GS_combined_group = GS_string_df.groupby("diseaseName") - GS_combined_dict = {k: v for k, v in GS_combined_group} + GS_combined_dict = {str(k): v for k, v in GS_combined_group} tiga_filtered = tiga_string_df[tiga_string_df["id"].isin(GS_string_df["diseaseID"])] tiga_group = tiga_filtered.groupby("trait") @@ -27,7 +25,7 @@ def main(): tiga_threshold = tiga_filtered.loc[tiga_filtered["trait"].isin(list(tiga_count_threshold.keys()))] tiga_prizes = tiga_threshold.groupby("trait") - tiga_prize_dict = {k: v for k, v in tiga_prizes} + tiga_prize_dict = {str(k): v for k, v in tiga_prizes} for disease in tiga_prize_dict.keys(): df = tiga_prize_dict[disease] @@ -38,17 +36,18 @@ def main(): for disease in GS_combined_dict.keys(): df = GS_combined_dict[disease] df = df[["str_id"]] - df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=None) + df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=False) # See /cache/directory.py for information on how this was grabbed. 
# 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. - string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ", skiprows=[0], header=None) + string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ") + string = string[["protein1", "protein2", "combined_score"]] # Threshold anything above a confidence score of 900 to trim down the background interactome - string = string[string.iloc[:, 2] > 900] - string = string.iloc[:, [0, 1]] - string[len(string.columns)] = 1 - string.to_csv(diseases_path / "raw" / "string_interactome.txt", sep="\t", index=False, header=None) + string = string[string["combined_score"] > 900] + string = string[["protein1", "protein2"]] + # though we still keep the weight afterwards + string.to_csv(diseases_path / "raw" / "string_interactome.txt", sep="\t", index=False, header=False) if __name__ == "__main__": diff --git a/datasets/diseases/scripts/gold_standard.py b/datasets/diseases/scripts/gold_standard.py index 846eaba5..40ed6111 100644 --- a/datasets/diseases/scripts/gold_standard.py +++ b/datasets/diseases/scripts/gold_standard.py @@ -1,9 +1,7 @@ import pandas as pd -import os from pathlib import Path -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) +dir_path = Path(__file__).parent.resolve() diseases_path = Path(dir_path, "..") diff --git a/datasets/diseases/scripts/inputs.py b/datasets/diseases/scripts/inputs.py index 8dc6214d..b7319af9 100644 --- a/datasets/diseases/scripts/inputs.py +++ b/datasets/diseases/scripts/inputs.py @@ -1,9 +1,7 @@ from pathlib import Path import pandas as pd -import os -# https://stackoverflow.com/a/5137509/7589775 -dir_path = os.path.dirname(os.path.realpath(__file__)) +dir_path = Path(__file__).parent.resolve() diseases_path = Path(dir_path, "..") (diseases_path / "data").mkdir(exist_ok=True, parents=True) From 9d9c11d7d3570514ac34ef6353a99c1d105202df Mon Sep 
17 00:00:00 2001 From: "Tristan F.-R." Date: Mon, 23 Feb 2026 01:55:43 +0000 Subject: [PATCH 16/19] refactor(diseases): split interactome handling --- configs/dmmm.yaml | 4 ++-- datasets/diseases/.gitignore | 4 ++-- datasets/diseases/Snakefile | 9 +++++++-- datasets/diseases/scripts/files.py | 11 ----------- datasets/diseases/scripts/interactome.py | 19 +++++++++++++++++++ 5 files changed, 30 insertions(+), 17 deletions(-) create mode 100644 datasets/diseases/scripts/interactome.py diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index b4e16bc0..eb1b7f60 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -62,14 +62,14 @@ datasets: - label: dmmmdiseases_alopecia_areata data_dir: datasets/diseases edge_files: - - raw/string_interactome.txt + - processed/string_interactome.tsv node_files: - prize_files/alopecia_areata_prizes.txt other_files: [] - label: dmmmdiseases_diabetes_mellitus data_dir: datasets/diseases edge_files: - - raw/string_interactome.txt + - processed/string_interactome.tsv node_files: - prize_files/diabetes_mellitus_prizes.txt other_files: [] diff --git a/datasets/diseases/.gitignore b/datasets/diseases/.gitignore index 70081635..f65ef927 100644 --- a/datasets/diseases/.gitignore +++ b/datasets/diseases/.gitignore @@ -3,7 +3,7 @@ data # prize and gold standard files +raw +processed GS_files prize_files -raw -Pickles diff --git a/datasets/diseases/Snakefile b/datasets/diseases/Snakefile index 93de50d5..aed94654 100644 --- a/datasets/diseases/Snakefile +++ b/datasets/diseases/Snakefile @@ -2,6 +2,7 @@ include: "../../cache/Snakefile" rule all: input: + "processed/string_interactome.tsv", "GS_files/Alopecia_areata_GS.txt", "GS_files/Diabetes_mellitus_GS.txt", "prize_files/alopecia_areata_prizes.txt", @@ -41,8 +42,7 @@ rule gold_standard: rule files: input: "data/inputs.csv", - "data/gold_standard.csv", - "raw/9606.protein.links.full.txt" + "data/gold_standard.csv" output: # These are the two we use for the SPRAS run for now 
"GS_files/Alopecia_areata_GS.txt", @@ -51,3 +51,8 @@ rule files: "prize_files/diabetes_mellitus_prizes.txt" shell: "uv run scripts/files.py" + +rule interactome: + input: "raw/9606.protein.links.full.txt" + output: "processed/string_interactome.tsv" + shell: "uv run scripts/interactome.py" diff --git a/datasets/diseases/scripts/files.py b/datasets/diseases/scripts/files.py index 9f810131..31c631b2 100644 --- a/datasets/diseases/scripts/files.py +++ b/datasets/diseases/scripts/files.py @@ -38,17 +38,6 @@ def main(): df = df[["str_id"]] df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=False) - # See /cache/directory.py for information on how this was grabbed. - # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. - string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ") - string = string[["protein1", "protein2", "combined_score"]] - - # Threshold anything above a confidence score of 900 to trim down the background interactome - string = string[string["combined_score"] > 900] - string = string[["protein1", "protein2"]] - # though we still keep the weight afterwards - string.to_csv(diseases_path / "raw" / "string_interactome.txt", sep="\t", index=False, header=False) - if __name__ == "__main__": main() diff --git a/datasets/diseases/scripts/interactome.py b/datasets/diseases/scripts/interactome.py new file mode 100644 index 00000000..b0a40b6b --- /dev/null +++ b/datasets/diseases/scripts/interactome.py @@ -0,0 +1,19 @@ +from pathlib import Path +import pandas + +diseases_path = Path(__file__).parent.parent.resolve() + +def main(): + # See /cache/directory.py for information on how this was grabbed. + # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES. 
+ string = pandas.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ") + string = string[["protein1", "protein2", "combined_score"]] + + # Threshold anything above a confidence score of 900 to trim down the background interactome + string = string[string["combined_score"] > 900] + # though we still keep the weight afterwards + (diseases_path / "processed").mkdir(exist_ok=True) + string.to_csv(diseases_path / "processed" / "string_interactome.tsv", sep="\t", index=False, header=False) + +if __name__ == "__main__": + main() From b8759da51fd863e414978b4cd8c8c7aa6ae32089 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 24 Feb 2026 19:57:05 +0000 Subject: [PATCH 17/19] disable ml i do not like this One Bit. --- configs/dmmm.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/dmmm.yaml b/configs/dmmm.yaml index eb1b7f60..9fe9dd94 100644 --- a/configs/dmmm.yaml +++ b/configs/dmmm.yaml @@ -17,7 +17,9 @@ analysis: cytoscape: include: false ml: - include: true + # TODO: we either need at least one non-empty pathway in all algorithm runs, + # or we need to get rid of validate_df hard-erroring. The latter seems better. + include: false aggregate_per_algorithm: true evaluation: include: false From 62596047ecb47cc0fa987f3e271d692dfe463c24 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 24 Feb 2026 20:30:59 +0000 Subject: [PATCH 18/19] fix: don't use import.meta.glob for data files --- web/package.json | 1 + web/pnpm-lock.yaml | 89 ++++++++++++++++++++++++++++++++++++++++++++ web/src/lib/paths.ts | 9 +++-- 3 files changed, 96 insertions(+), 3 deletions(-) diff --git a/web/package.json b/web/package.json index 74b19ed1..1cccf275 100644 --- a/web/package.json +++ b/web/package.json @@ -14,6 +14,7 @@ "@fontsource-variable/noto-sans": "^5.2.10", "astro": "^5.16.6", "dayjs": "^1.11.19", + "glob": "^13.0.6", "medium-zoom": "^1.1.0", "sass": "^1.97.1", "yaml": "^2.8.2" diff --git a/web/pnpm-lock.yaml b/web/pnpm-lock.yaml index c6e31365..be8d4b53 100644 --- a/web/pnpm-lock.yaml +++ b/web/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: dayjs: specifier: ^1.11.19 version: 1.11.19 + glob: + specifier: ^13.0.6 + version: 13.0.6 medium-zoom: specifier: ^1.1.0 version: 1.1.0 @@ -276,89 +279,105 @@ packages: resolution: {integrity: sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-arm@1.2.4': resolution: {integrity: sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-ppc64@1.2.4': resolution: {integrity: sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA==} cpu: [ppc64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-riscv64@1.2.4': resolution: {integrity: sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA==} cpu: [riscv64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-s390x@1.2.4': resolution: {integrity: sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ==} cpu: [s390x] os: [linux] + libc: [glibc] '@img/sharp-libvips-linux-x64@1.2.4': resolution: {integrity: 
sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==} cpu: [x64] os: [linux] + libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.2.4': resolution: {integrity: sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.2.4': resolution: {integrity: sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-linux-arm64@0.34.5': resolution: {integrity: sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [glibc] '@img/sharp-linux-arm@0.34.5': resolution: {integrity: sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] + libc: [glibc] '@img/sharp-linux-ppc64@0.34.5': resolution: {integrity: sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [ppc64] os: [linux] + libc: [glibc] '@img/sharp-linux-riscv64@0.34.5': resolution: {integrity: sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [riscv64] os: [linux] + libc: [glibc] '@img/sharp-linux-s390x@0.34.5': resolution: {integrity: sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [s390x] os: [linux] + libc: [glibc] '@img/sharp-linux-x64@0.34.5': resolution: {integrity: sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: 
[glibc] '@img/sharp-linuxmusl-arm64@0.34.5': resolution: {integrity: sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] + libc: [musl] '@img/sharp-linuxmusl-x64@0.34.5': resolution: {integrity: sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] + libc: [musl] '@img/sharp-wasm32@0.34.5': resolution: {integrity: sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw==} @@ -418,36 +437,42 @@ packages: engines: {node: '>= 10.0.0'} cpu: [arm] os: [linux] + libc: [glibc] '@parcel/watcher-linux-arm-musl@2.5.1': resolution: {integrity: sha512-6E+m/Mm1t1yhB8X412stiKFG3XykmgdIOqhjWj+VL8oHkKABfu/gjFj8DvLrYVHSBNC+/u5PeNrujiSQ1zwd1Q==} engines: {node: '>= 10.0.0'} cpu: [arm] os: [linux] + libc: [musl] '@parcel/watcher-linux-arm64-glibc@2.5.1': resolution: {integrity: sha512-LrGp+f02yU3BN9A+DGuY3v3bmnFUggAITBGriZHUREfNEzZh/GO06FF5u2kx8x+GBEUYfyTGamol4j3m9ANe8w==} engines: {node: '>= 10.0.0'} cpu: [arm64] os: [linux] + libc: [glibc] '@parcel/watcher-linux-arm64-musl@2.5.1': resolution: {integrity: sha512-cFOjABi92pMYRXS7AcQv9/M1YuKRw8SZniCDw0ssQb/noPkRzA+HBDkwmyOJYp5wXcsTrhxO0zq1U11cK9jsFg==} engines: {node: '>= 10.0.0'} cpu: [arm64] os: [linux] + libc: [musl] '@parcel/watcher-linux-x64-glibc@2.5.1': resolution: {integrity: sha512-GcESn8NZySmfwlTsIur+49yDqSny2IhPeZfXunQi48DMugKeZ7uy1FX83pO0X22sHntJ4Ub+9k34XQCX+oHt2A==} engines: {node: '>= 10.0.0'} cpu: [x64] os: [linux] + libc: [glibc] '@parcel/watcher-linux-x64-musl@2.5.1': resolution: {integrity: sha512-n0E2EQbatQ3bXhcH2D1XIAANAcTZkQICBPVaxMeaCVBtOpBZpWJuf7LwyWPSBDITb7In8mqQgJ7gH8CILCURXg==} engines: {node: '>= 10.0.0'} cpu: [x64] os: [linux] + libc: [musl] '@parcel/watcher-win32-arm64@2.5.1': resolution: {integrity: 
sha512-RFzklRvmc3PkjKjry3hLF9wD7ppR4AKcWNzH7kXR7GUe0Igb3Nz8fyPwtZCSquGrhU5HhUNDr/mKBqj7tqA2Vw==} @@ -514,56 +539,67 @@ packages: resolution: {integrity: sha512-EHMUcDwhtdRGlXZsGSIuXSYwD5kOT9NVnx9sqzYiwAc91wfYOE1g1djOEDseZJKKqtHAHGwnGPQu3kytmfaXLQ==} cpu: [arm] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.54.0': resolution: {integrity: sha512-+pBrqEjaakN2ySv5RVrj/qLytYhPKEUwk+e3SFU5jTLHIcAtqh2rLrd/OkbNuHJpsBgxsD8ccJt5ga/SeG0JmA==} cpu: [arm] os: [linux] + libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.54.0': resolution: {integrity: sha512-NSqc7rE9wuUaRBsBp5ckQ5CVz5aIRKCwsoa6WMF7G01sX3/qHUw/z4pv+D+ahL1EIKy6Enpcnz1RY8pf7bjwng==} cpu: [arm64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.54.0': resolution: {integrity: sha512-gr5vDbg3Bakga5kbdpqx81m2n9IX8M6gIMlQQIXiLTNeQW6CucvuInJ91EuCJ/JYvc+rcLLsDFcfAD1K7fMofg==} cpu: [arm64] os: [linux] + libc: [musl] '@rollup/rollup-linux-loong64-gnu@4.54.0': resolution: {integrity: sha512-gsrtB1NA3ZYj2vq0Rzkylo9ylCtW/PhpLEivlgWe0bpgtX5+9j9EZa0wtZiCjgu6zmSeZWyI/e2YRX1URozpIw==} cpu: [loong64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-ppc64-gnu@4.54.0': resolution: {integrity: sha512-y3qNOfTBStmFNq+t4s7Tmc9hW2ENtPg8FeUD/VShI7rKxNW7O4fFeaYbMsd3tpFlIg1Q8IapFgy7Q9i2BqeBvA==} cpu: [ppc64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.54.0': resolution: {integrity: sha512-89sepv7h2lIVPsFma8iwmccN7Yjjtgz0Rj/Ou6fEqg3HDhpCa+Et+YSufy27i6b0Wav69Qv4WBNl3Rs6pwhebQ==} cpu: [riscv64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.54.0': resolution: {integrity: sha512-ZcU77ieh0M2Q8Ur7D5X7KvK+UxbXeDHwiOt/CPSBTI1fBmeDMivW0dPkdqkT4rOgDjrDDBUed9x4EgraIKoR2A==} cpu: [riscv64] os: [linux] + libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.54.0': resolution: {integrity: sha512-2AdWy5RdDF5+4YfG/YesGDDtbyJlC9LHmL6rZw6FurBJ5n4vFGupsOBGfwMRjBYH7qRQowT8D/U4LoSvVwOhSQ==} cpu: [s390x] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.54.0': resolution: {integrity: 
sha512-WGt5J8Ij/rvyqpFexxk3ffKqqbLf9AqrTBbWDk7ApGUzaIs6V+s2s84kAxklFwmMF/vBNGrVdYgbblCOFFezMQ==} cpu: [x64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-musl@4.54.0': resolution: {integrity: sha512-JzQmb38ATzHjxlPHuTH6tE7ojnMKM2kYNzt44LO/jJi8BpceEC8QuXYA908n8r3CNuG/B3BV8VR3Hi1rYtmPiw==} cpu: [x64] os: [linux] + libc: [musl] '@rollup/rollup-openharmony-arm64@4.54.0': resolution: {integrity: sha512-huT3fd0iC7jigGh7n3q/+lfPcXxBi+om/Rs3yiFxjvSxbSB6aohDFXbWvlspaqjeOh+hx7DDHS+5Es5qRkWkZg==} @@ -693,6 +729,10 @@ packages: bail@2.0.2: resolution: {integrity: sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==} + balanced-match@4.0.4: + resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==} + engines: {node: 18 || 20 || >=22} + base-64@1.0.0: resolution: {integrity: sha512-kwDPIFCGx0NZHog36dj+tHiwP4QMzsZ3AgMViUBKI0+V5n4U0ufTCUMhnQ04diaRI8EX/QcPfql7zlhZ7j4zgg==} @@ -706,6 +746,10 @@ packages: resolution: {integrity: sha512-F3PH5k5juxom4xktynS7MoFY+NUWH5LC4CnH11YB8NPew+HLpmBLCybSAEyb2F+4pRXhuhWqFesoQd6DAyc2hw==} engines: {node: '>=18'} + brace-expansion@5.0.3: + resolution: {integrity: sha512-fy6KJm2RawA5RcHkLa1z/ScpBeA762UF9KmZQxwIbDtRJrgLzM10depAiEQ+CXYcoiqW1/m96OAAoke2nE9EeA==} + engines: {node: 18 || 20 || >=22} + braces@3.0.3: resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} engines: {node: '>=8'} @@ -948,6 +992,10 @@ packages: github-slugger@2.0.0: resolution: {integrity: sha512-IaOQ9puYtjrkq7Y0Ygl9KDZnrf/aiUJYUpVf89y8kyaxbRG7Y1SrX/jaumrv81vc61+kiMempujsM3Yw7w5qcw==} + glob@13.0.6: + resolution: {integrity: sha512-Wjlyrolmm8uDpm/ogGyXZXb1Z+Ca2B8NbJwqBVg0axK9GbBeoS7yGV6vjXnYdGm6X53iehEuxxbyiKp8QmN4Vw==} + engines: {node: 18 || 20 || >=22} + h3@1.15.4: resolution: {integrity: sha512-z5cFQWDffyOe4vQ9xIqNfCZdV4p//vy6fBnr8Q1AWnVZ0teurKMG66rLj++TKwKPUP3u7iMUvrvKaEUiQw2QWQ==} @@ -1050,6 
+1098,10 @@ packages: lru-cache@10.4.3: resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==} + lru-cache@11.2.6: + resolution: {integrity: sha512-ESL2CrkS/2wTPfuend7Zhkzo2u0daGJ/A2VucJOgQ/C48S/zB8MMeMHSGKYpXhIjbPxfuezITkaBH1wqv00DDQ==} + engines: {node: 20 || >=22} + magic-string@0.30.21: resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==} @@ -1195,6 +1247,14 @@ packages: resolution: {integrity: sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==} engines: {node: '>=8.6'} + minimatch@10.2.2: + resolution: {integrity: sha512-+G4CpNBxa5MprY+04MbgOw1v7So6n5JY166pFi9KfYwT78fxScCeSNQSNzp6dpPSW2rONOps6Ocam1wFhCgoVw==} + engines: {node: 18 || 20 || >=22} + + minipass@7.1.3: + resolution: {integrity: sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==} + engines: {node: '>=16 || 14 >=14.17'} + mrmime@2.0.1: resolution: {integrity: sha512-Y3wQdFg2Va6etvQ5I82yUhGdsKrcYox6p7FfL1LbK2J4V01F9TGlepTIhnK24t7koZibmg82KGglhA1XK5IsLQ==} engines: {node: '>=10'} @@ -1266,6 +1326,10 @@ packages: parse5@7.3.0: resolution: {integrity: sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==} + path-scurry@2.0.2: + resolution: {integrity: sha512-3O/iVVsJAPsOnpwWIeD+d6z/7PmqApyQePUtCndjatj/9I5LylHvt5qluFaBT3I5h3r1ejfR056c+FCv+NnNXg==} + engines: {node: 18 || 20 || >=22} + piccolore@0.1.3: resolution: {integrity: sha512-o8bTeDWjE086iwKrROaDf31K0qC/BENdm15/uH9usSC/uZjJOKb2YGiVHfLY4GhwsERiPI1jmwI2XrA7ACOxVw==} @@ -2321,6 +2385,8 @@ snapshots: bail@2.0.2: {} + balanced-match@4.0.4: {} + base-64@1.0.0: {} base64-js@1.5.1: {} @@ -2338,6 +2404,10 @@ snapshots: widest-line: 5.0.0 wrap-ansi: 9.0.2 + brace-expansion@5.0.3: + dependencies: + balanced-match: 4.0.4 + braces@3.0.3: dependencies: fill-range: 7.1.1 @@ -2559,6 +2629,12 @@ 
snapshots: github-slugger@2.0.0: {} + glob@13.0.6: + dependencies: + minimatch: 10.2.2 + minipass: 7.1.3 + path-scurry: 2.0.2 + h3@1.15.4: dependencies: cookie-es: 1.2.2 @@ -2707,6 +2783,8 @@ snapshots: lru-cache@10.4.3: {} + lru-cache@11.2.6: {} + magic-string@0.30.21: dependencies: '@jridgewell/sourcemap-codec': 1.5.5 @@ -3042,6 +3120,12 @@ snapshots: picomatch: 2.3.1 optional: true + minimatch@10.2.2: + dependencies: + brace-expansion: 5.0.3 + + minipass@7.1.3: {} + mrmime@2.0.1: {} ms@2.1.3: {} @@ -3111,6 +3195,11 @@ snapshots: dependencies: entities: 6.0.1 + path-scurry@2.0.2: + dependencies: + lru-cache: 11.2.6 + minipass: 7.1.3 + piccolore@0.1.3: {} picocolors@1.1.1: {} diff --git a/web/src/lib/paths.ts b/web/src/lib/paths.ts index dd593072..edd01397 100644 --- a/web/src/lib/paths.ts +++ b/web/src/lib/paths.ts @@ -1,8 +1,11 @@ import { extractDatasetCategory, extractDatasetType } from "./outputStyle"; +import { globSync } from 'glob' -export function getDataFiles() { - const dataFiles = import.meta.glob("../../public/data/output/**", { query: "?raw" }); - return Object.keys(dataFiles).map((path) => path.substring("../../public/data/output/".length)); +export function getDataFiles(): string[] { + // We prefer this over import.meta.glob, as import.meta.glob currently + // leads to OOM for large raw imports, and OOM is especially plausible on CD. + const dataFiles = globSync("../../public/data/output/**"); + return dataFiles.map((path) => path.substring("../../public/data/output/".length)); } export function getDatasets() { From f3ef49f281e23f83ba16e8c75052f366a03d0cbf Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." 
Date: Tue, 24 Feb 2026 20:46:44 +0000 Subject: [PATCH 19/19] fix(web): add egfr data category, glob correct directory --- web/src/lib/outputStyle.ts | 14 +++++++++----- web/src/lib/paths.ts | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/web/src/lib/outputStyle.ts b/web/src/lib/outputStyle.ts index 3eb6f089..a65783c5 100644 --- a/web/src/lib/outputStyle.ts +++ b/web/src/lib/outputStyle.ts @@ -47,6 +47,10 @@ const dataCategories = { name: "Yeast", directory: "yeast-osmotic-stress", }, + egfr: { + name: "EGFR", + directory: "egfr" + } }; // TODO: replace this once we have proper dataset categories @@ -57,11 +61,11 @@ export function extractDatasetCategory(name: string): { category: string; name: export function parseOutputString(str: string): Output { const components = str.split("-"); - let dataType; - let datasetCategory; - let datasetName; - let algorithm; - let paramsHash; + let dataType: string | undefined; + let datasetCategory: string | undefined; + let datasetName: string | undefined; + let algorithm: string | undefined; + let paramsHash: string | undefined; if (components.length === 5) { // This is a slug URL (type-...) diff --git a/web/src/lib/paths.ts b/web/src/lib/paths.ts index edd01397..9048e8e4 100644 --- a/web/src/lib/paths.ts +++ b/web/src/lib/paths.ts @@ -4,8 +4,8 @@ import { globSync } from 'glob' export function getDataFiles(): string[] { // We prefer this over import.meta.glob, as import.meta.glob currently // leads to OOM for large raw imports, and OOM is especially plausible on CD. - const dataFiles = globSync("../../public/data/output/**"); - return dataFiles.map((path) => path.substring("../../public/data/output/".length)); + const dataFiles = globSync("public/data/output/**"); + return dataFiles.map((path) => path.substring("public/data/output/".length)); } export function getDatasets() {