Merged
21 commits
2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ pip install ./spras
To run the postprocessing output scripts, we provide a `pyproject.toml` that can be used with your desired Python package manager. This separates
the `spras` conda environment from our small scripts. (On CI, we use [`uv`](https://docs.astral.sh/uv/).)

To run the benchmarking pipeline, use:
To run the benchmarking pipeline, use (this example is specifically for disease module mining):

```sh
snakemake --cores 1 --configfile configs/dmmm.yaml --show-failed-logs -s spras/Snakefile
1 change: 1 addition & 0 deletions cache/.gitignore
@@ -1 +1,2 @@
artifacts
logs
28 changes: 26 additions & 2 deletions cache/README.md
@@ -1,7 +1,31 @@
# cache
# Cache

Handles artifact fetching and cache. This folder has:
Handles artifact fetching and caching. The goal, for as long as SPRAS is maintained, is to prevent any kind of
data rot while encouraging continuous benchmarking to use the latest available data.

During benchmarking runs, data is fetched from every URL listed in `directory.py`; the most current version of each file is downloaded
and compared against our cached copy to check whether the data has changed at all.

All entries are provided with this template:

```py
"file-name.ext": CacheItem(
    name="Short File Description",
    cached="https://drive.google.com/uc?id=...",
    # Either-or
    pinned=Service("..."),
    unpinned=Service("..."),
),
```

When a file is requested, `cached`, `pinned`, and `unpinned` are all downloaded:
- If the downloads from `pinned` or `unpinned` do not succeed (i.e. do not return a 2XX status code), we fail.
- If the `pinned` data does not match `cached`, we fail.
- If the `unpinned` data does not match `cached`, we warn that the cached data needs updating.
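The comparison step above can be sketched roughly as follows (a minimal illustration with a hypothetical `verify` helper; the actual logic lives in `cache/__init__.py` and may differ):

```python
def verify(cached: bytes, pinned: bytes, unpinned: bytes) -> list[str]:
    """Compare freshly downloaded pinned/unpinned data against the cached copy.

    The download step itself (not shown) has already failed hard on any
    non-2XX response by the time we get here.
    """
    warnings = []
    # A pinned source should never drift from the cached copy: hard failure.
    if pinned != cached:
        raise RuntimeError("pinned data no longer matches the cached copy")
    # An unpinned source is expected to drift eventually: warn only.
    if unpinned != cached:
        warnings.append("cached data is out of date and needs updating")
    return warnings
```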

## Layout

This folder has:
- `Snakefile` which only contains a function used for producing fetching rules.
- `directory.py`, the actual location of file URLs and their cached counterparts.
- `cli.py`, a utility for manually fetching specific URLs from `directory.py`.
Expand Down
4 changes: 2 additions & 2 deletions cache/__init__.py
@@ -67,9 +67,9 @@ def link(output: str, directive: list[str], uncompress=False):

    Path(output).unlink(missing_ok=True)

    # Re-download if the directive has expired.
    # Re-download if the file doesn't exist or the directive has expired.
    cache_item = get_cache_item(directive)
    if has_expired(directive):
    if not (artifacts_dir / artifact_name).exists() or has_expired(directive):
        (artifacts_dir / artifact_name).unlink(missing_ok=True)
        cache_item.download(artifacts_dir / artifact_name)

4 changes: 2 additions & 2 deletions cache/cli.py
@@ -3,7 +3,7 @@

This may be expanded in the future, so only depend on this file as a debugging utility.

For example, `python cache/cli.py KEGG/ko03250.xml ko03250.xm` allows running the KEGG query
For example, `python cache/cli.py KEGG/ko03250.xml ko03250.xml` allows running the KEGG query
for ko03250.xml, which cannot normally be accessed automatically in the browser.
"""

@@ -23,7 +23,7 @@ def main():
    args = parse_args()
    cache_item = get_cache_item(args.path.split("/"))

    cache_item.download_online(args.output)
    cache_item.download(args.output)

if __name__ == "__main__":
    main()
252 changes: 179 additions & 73 deletions cache/directory.py

Large diffs are not rendered by default.

47 changes: 34 additions & 13 deletions configs/dmmm.yaml
@@ -17,27 +17,30 @@ analysis:
  cytoscape:
    include: false
  ml:
    include: true
    # TODO: we either need at least one non-empty pathway in all algorithm runs,
    # or we need to get rid of validate_df hard-erroring. The latter seems better.
    include: false
    aggregate_per_algorithm: true
  evaluation:
    include: false

# Custom settings
algorithms:
  - name: "omicsintegrator1"
    params:
      include: true
    include: true
    runs:
      run1:
        b: [2]
        w: [.5]
        d: [10]
        mu: [2]
        b: 2
        w: .5
        d: 10
        mu: 2
        # TODO: egfr prefers dummy_mode: ["file"] since we manually specify EGF_HUMAN as one.
  - name: "omicsintegrator2"
    params:
      include: true
    include: true
    runs:
      run1:
        b: [4]
        g: [0]
        b: 4
        g: 0

datasets:
  # TODO: use old parameters for datasets
@@ -61,14 +64,14 @@ datasets:
  - label: dmmmdiseases_alopecia_areata
    data_dir: datasets/diseases
    edge_files:
      - raw/string_interactome.txt
      - processed/string_interactome.tsv
    node_files:
      - prize_files/alopecia_areata_prizes.txt
    other_files: []
  - label: dmmmdiseases_diabetes_mellitus
    data_dir: datasets/diseases
    edge_files:
      - raw/string_interactome.txt
      - processed/string_interactome.tsv
    node_files:
      - prize_files/diabetes_mellitus_prizes.txt
    other_files: []
@@ -77,6 +80,16 @@
    edge_files: ["raw/phosphosite-irefindex13.0-uniprot.txt"]
    node_files: ["processed/FADU_cell_line_prizes_input_nonzero.txt"]
    other_files: []
  - label: dmmmegfr_string
    data_dir: datasets/egfr
    edge_files: ["processed/interactome.tsv"]
    node_files: ["processed/prizes.txt"]
    other_files: []
  - label: dmmmegfr_irefindex
    data_dir: datasets/egfr
    edge_files: ["processed/phosphosite-irefindex13.0-uniprot.txt"]
    node_files: ["processed/prizes-uniprot.txt"]
    other_files: []
gold_standards:
  - label: gs0
    node_files: ['GS_files/Alopecia_areata_GS.txt']
@@ -90,3 +103,11 @@
    node_files: ["processed/FADU_gold_standard.txt"]
    data_dir: datasets/depmap
    dataset_labels: ["dmmmdepmap_cellline_fadu"]
  - label: gs_egfr_string
    node_files: ["processed/gold-standard-nodes.txt"]
    data_dir: datasets/egfr
    dataset_labels: ["dmmmegfr_string"]
  - label: gs_egfr_irefindex
    node_files: ["processed/gold-standard-nodes-uniprot.txt"]
    data_dir: datasets/egfr
    dataset_labels: ["dmmmegfr_irefindex"]
27 changes: 13 additions & 14 deletions configs/pra.yaml
@@ -26,27 +26,26 @@ analysis:

# Custom settings
algorithms:
  - name: "omicsintegrator1"
    params:
      include: true
    include: true
    runs:
      run1:
        b: [2]
        w: [.5]
        d: [10]
        mu: [2]
        b: 2
        w: .5
        d: 10
        mu: 2
  - name: "omicsintegrator2"
    params:
      include: true
    include: true
    runs:
      run1:
        b: [4]
        g: [0]
        b: 4
        g: 0
  - name: "pathlinker"
    params:
      include: true
    include: true
    runs:
      run1:
        k: [10, 20]
  - name: "allpairs"
    params:
      include: true
    include: true

datasets:
- label: prarn_muscleskeletal2018
Expand Down
4 changes: 2 additions & 2 deletions datasets/diseases/.gitignore
@@ -3,7 +3,7 @@
data

# prize and gold standard files
raw
processed
GS_files
prize_files
raw
Pickles
11 changes: 8 additions & 3 deletions datasets/diseases/Snakefile
@@ -2,6 +2,7 @@ include: "../../cache/Snakefile"

rule all:
    input:
        "processed/string_interactome.tsv",
        "GS_files/Alopecia_areata_GS.txt",
        "GS_files/Diabetes_mellitus_GS.txt",
        "prize_files/alopecia_areata_prizes.txt",
@@ -13,7 +14,7 @@ produce_fetch_rules({
    "raw/HumanDO.tsv": ["DISEASES", "HumanDO.tsv"],
    "raw/tiga_gene-trait_stats.tsv": ["DISEASES", "tiga_gene-trait_stats.tsv"],
    "raw/ensg-ensp.tsv": ["BioMart", "ensg-ensp.tsv"],
    "raw/9606.protein.links.txt": FetchConfig(["STRING", "9606", "9606.protein.links.txt.gz"], uncompress=True),
    "raw/9606.protein.links.full.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
    "raw/9606.protein.aliases.txt": FetchConfig(["STRING", "9606", "9606.protein.aliases.txt.gz"], uncompress=True),
})
@@ -41,8 +42,7 @@ rule gold_standard:

rule files:
    input:
        "data/inputs.csv",
        "data/gold_standard.csv",
        "raw/9606.protein.links.txt"
        "data/gold_standard.csv"
    output:
        # These are the two we use for the SPRAS run for now
        "GS_files/Alopecia_areata_GS.txt",
@@ -51,3 +51,8 @@ rule files:
        "prize_files/diabetes_mellitus_prizes.txt"
    shell:
        "uv run scripts/files.py"

rule interactome:
    input: "raw/9606.protein.links.full.txt"
    output: "processed/string_interactome.tsv"
    shell: "uv run scripts/interactome.py"
20 changes: 4 additions & 16 deletions datasets/diseases/scripts/files.py
@@ -1,9 +1,7 @@
import pandas as pd
from pathlib import Path
import os

# https://stackoverflow.com/a/5137509/7589775
dir_path = os.path.dirname(os.path.realpath(__file__))
dir_path = Path(__file__).parent.resolve()

diseases_path = Path(dir_path, "..")
(diseases_path / "prize_files").mkdir(exist_ok=True, parents=True)
@@ -16,7 +14,7 @@ def main():

    GS_string_df = GS_string_df[GS_string_df["diseaseID"].isin(tiga_string_df["id"])]
    GS_combined_group = GS_string_df.groupby("diseaseName")
    GS_combined_dict = {k: v for k, v in GS_combined_group}
    GS_combined_dict = {str(k): v for k, v in GS_combined_group}

    tiga_filtered = tiga_string_df[tiga_string_df["id"].isin(GS_string_df["diseaseID"])]
    tiga_group = tiga_filtered.groupby("trait")
@@ -27,7 +25,7 @@ def main():
    tiga_threshold = tiga_filtered.loc[tiga_filtered["trait"].isin(list(tiga_count_threshold.keys()))]

    tiga_prizes = tiga_threshold.groupby("trait")
    tiga_prize_dict = {k: v for k, v in tiga_prizes}
    tiga_prize_dict = {str(k): v for k, v in tiga_prizes}

    for disease in tiga_prize_dict.keys():
        df = tiga_prize_dict[disease]
@@ -38,17 +36,7 @@ def main():
    for disease in GS_combined_dict.keys():
        df = GS_combined_dict[disease]
        df = df[["str_id"]]
        df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=None)

    # See /cache/directory.py for information on how this was grabbed.
    # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES.
    string = pd.read_csv(diseases_path / "raw" / "9606.protein.links.txt", sep=" ", skiprows=[0], header=None)

    # Threshold anything above a confidence score of 900 to trim down the background interactome
    string = string[string.iloc[:, 2] > 900]
    string = string.iloc[:, [0, 1]]
    string[len(string.columns)] = 1
    string.to_csv(diseases_path / "raw" / "string_interactome.txt", sep="\t", index=False, header=None)
        df.to_csv(diseases_path / "GS_files" / f"{disease.replace(' ', '_')}_GS.txt", sep="\t", index=False, header=False)


if __name__ == "__main__":
4 changes: 1 addition & 3 deletions datasets/diseases/scripts/gold_standard.py
@@ -1,9 +1,7 @@
import pandas as pd
import os
from pathlib import Path

# https://stackoverflow.com/a/5137509/7589775
dir_path = os.path.dirname(os.path.realpath(__file__))
dir_path = Path(__file__).parent.resolve()

diseases_path = Path(dir_path, "..")

4 changes: 1 addition & 3 deletions datasets/diseases/scripts/inputs.py
@@ -1,9 +1,7 @@
from pathlib import Path
import pandas as pd
import os

# https://stackoverflow.com/a/5137509/7589775
dir_path = os.path.dirname(os.path.realpath(__file__))
dir_path = Path(__file__).parent.resolve()

diseases_path = Path(dir_path, "..")
(diseases_path / "data").mkdir(exist_ok=True, parents=True)
19 changes: 19 additions & 0 deletions datasets/diseases/scripts/interactome.py
@@ -0,0 +1,19 @@
from pathlib import Path
import pandas

diseases_path = Path(__file__).parent.parent.resolve()

def main():
    # See /cache/directory.py for information on how this was grabbed.
    # 9606 is the organism code for homo sapiens and the required background interactome of DISEASES.
    string = pandas.read_csv(diseases_path / "raw" / "9606.protein.links.full.txt", sep=" ")
    string = string[["protein1", "protein2", "combined_score"]]

    # Keep only edges with a combined confidence score above 900 to trim down the
    # background interactome, though we still keep the score column as the edge weight
    string = string[string["combined_score"] > 900]
    (diseases_path / "processed").mkdir(exist_ok=True)
    string.to_csv(diseases_path / "processed" / "string_interactome.tsv", sep="\t", index=False, header=False)

if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions datasets/egfr/.gitignore
@@ -0,0 +1,2 @@
raw
processed
18 changes: 18 additions & 0 deletions datasets/egfr/README.md
@@ -0,0 +1,18 @@
# EGFR

EGFR dataset. This dataset requires much less processing of raw files and is mainly focused on creating the new STRING-based interactome.

This data is from [_Synthesizing Signaling Pathways from Temporal Phosphoproteomic Data_](https://doi.org/10.1016/j.celrep.2018.08.085).

## Overview

This produces two sets of files: one based on the closed-source iRefIndex/PhosphoSite directed interactome, which uses UniProt identifiers, and another based on the more up-to-date, though undirected, STRING interactome.

## Scripts

- `process_prizes.py`: produces a `prizes-uniprot.txt` from
  [egfr-prizes.txt](https://raw.githubusercontent.com/gitter-lab/tps/refs/heads/master/data/pcsf/egfr-prizes.txt),
  trimming pseudonodes and manually injecting the `EGF_HUMAN` receptor as a dummy node for OmicsIntegrator1.
- `process_interactome.py`: Produces the STRING `interactome.tsv` file from the STRING links file. Note that `phosphosite-irefindex13.0-uniprot.txt` is a directed interactome of closed-source origin, produced by combining the now-archived iRefIndex v13 interactome with extra PhosphoSite-provided nodes.
- `process_gold_standard.py`: Produces the `gold-standard-nodes-uniprot.txt` file from the [EGFR prize file](https://raw.githubusercontent.com/gitter-lab/tps/ca7cafd1e1c17f45ddea07c3fb54d0d70b86ff45/data/resources/eight-egfr-reference-all.txt) from the above paper.
- `map_ensembl.py`: Maps UniProt identifiers to STRING identifiers for the STRING-based data.
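As a rough sketch of what the UniProt-to-STRING mapping could look like (hypothetical: `build_uniprot_to_string_map` is not the actual `map_ensembl.py`, and it assumes the three-column STRING aliases file layout of `string_id`, `alias`, `source`):

```python
import pandas as pd

def build_uniprot_to_string_map(aliases_path: str) -> dict[str, str]:
    # STRING's aliases file maps each STRING protein id to many external
    # identifiers; rows sourced from UniProt give the UniProt -> STRING
    # direction needed to translate prize and gold standard files.
    aliases = pd.read_csv(
        aliases_path, sep="\t", names=["string_id", "alias", "source"], comment="#"
    )
    uniprot_rows = aliases[aliases["source"].str.contains("UniProt", na=False)]
    # Keep the first STRING id seen for each UniProt alias.
    return (
        uniprot_rows.drop_duplicates("alias").set_index("alias")["string_id"].to_dict()
    )
```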