Reed-CompBio · tristan-f-r · Jul 1, 2025 · Jul 28, 2025 · Jan 6, 2026 · Jan 6, 2026
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -16,6 +16,8 @@
 		// For web display
 		"ghcr.io/devcontainers/features/node:1": {},
 		// For scripting
-		"ghcr.io/va-h/devcontainers-features/uv:1": {}
+		"ghcr.io/va-h/devcontainers-features/uv:1": {},
+		// For paxtools
+		"ghcr.io/devcontainers/features/java:1": {}
 	}
 }
diff --git a/README.md b/README.md
@@ -39,21 +39,22 @@ uv run snakemake --cores 1
 
 ## Organization
 
-There are five primary folders in this repository:
+There are six primary folders in this repository:
 
 ```
 .
 ├── cache
 ├── configs
 ├── datasets
 ├── spras
+├── tools
 └── web
 ```
 
 `spras` is the cloned submodule of [SPRAS](https://github.com/reed-compbio/spras), `web` is an
 [astro](https://astro.build/) app which generates the `spras-benchmarking` [output](https://reed-compbio.github.io/spras-benchmarking/),
 `configs` is the YAML file used to talk to SPRAS, and `datasets` contains the raw data. `cache` is utility for `datasets` which provides a convenient
-way to fetch online files for further processing.
+way to fetch online files for further processing. `tools` contains miscellaneous utilities for processing tasks that are shared across datasets.
 
 The workflow runs as so:
 

diff --git a/cache/directory.py b/cache/directory.py
@@ -336,6 +336,46 @@ def download(self, output: str | PathLike):
             cached="https://drive.google.com/uc?id=1nI5hw-rYRZPs15UJiqokHpHEAabRq6Xj"
         )
     },
+    "Surfaceome": {
+        "table_S3_surfaceome.xlsx": CacheItem(
+            name="Human surfaceome",
+            unpinned="http://wlab.ethz.ch/surfaceome/table_S3_surfaceome.xlsx",
+            cached="https://docs.google.com/uc?id=1cBXYbDnAJVet0lv3BRrizV5FuqfMbBr0"
+        )
+    },
+    "TranscriptionFactors": {
+        "Homo_sapiens_TF.tsv": CacheItem(
+            name="Human transcription factors",
+            # This server has anti-bot protection, so to respect their wishes, we don't download from the server.
+            # The original URL is https://guolab.wchscu.cn/AnimalTFDB4_static/download/TF_list_final/Homo_sapiens_TF,
+            # which is accessible from https://guolab.wchscu.cn/AnimalTFDB4//#/Download -> Homo sapiens
+            # (also under the Internet Archive as of Feb 2nd, 2026. If the original artifact disappears, the drive link below should suffice.)
+            cached="https://drive.google.com/uc?id=1fVi18GpudUlquRPHgUJl3H1jy54gO-uz",
+        )
+    },
+    "PathwayCommons": {
+        "pc-biopax.owl.gz": CacheItem(
+            name="PathwayCommons Universal BioPAX file",
+            cached="https://drive.google.com/uc?id=1R7uE2ky7fGlZThIWCOblu7iqbpC-aRr0",
+            pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/pc-biopax.owl.gz"
+        ),
+        "pathways.txt.gz": CacheItem(
+            name="PathwayCommons Pathway Identifiers",
+            cached="https://drive.google.com/uc?id=1SMwuuohuZuNFnTev4zRNJrBnBsLlCHcK",
+            pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/pathways.txt.gz",
+        ),
+        "denylist.txt": CacheItem(
+            name="PathwayCommons small molecule denylist",
+            cached="https://drive.google.com/uc?id=1QmISJXPvVljA8oKuNYRUNbJJvZKPa_-u",
+            pinned="https://download.baderlab.org/PathwayCommons/PC2/v14/blacklist.txt"
+        ),
+        "intermediate": {
+            "pc-panther-biopax.owl": CacheItem(
+                name="PathwayCommons PANTHER-only BioPAX file",
+                cached="https://drive.google.com/uc?id=1MklrD8CJ1BIjh_wWr_g5rrIJ5XJB7FUI"
+            )
+        }
+    }
 }
 
 

diff --git a/datasets/README.md b/datasets/README.md
@@ -11,3 +11,8 @@ Many of the datasets here have been stripped of their extra post-analysis. Here,
 - [`diseases`](https://github.com/Reed-CompBio/spras-benchmarking/tree/3c0155567dbc43278531b91f9173f6d4f4486dd8/datasets/diseases)
 - [`depmap`](https://github.com/Reed-CompBio/spras-benchmarking/tree/b332c0ab53868f111cb89cd4e9f485e8c19aa9e3/datasets/depmap)
 - [`yeast-osmotic-stress`](https://github.com/Reed-CompBio/spras-benchmarking/tree/8f69dcdf4a52607347fe3a962b753df396e44cda/yeast-osmotic-stress)
+
+## `explore` folders
+
+To motivate certain decisions made in-code, such as `synthetic-data`'s PANTHER pathway choices, we provide scripts that use live data
+to assist in data curation. These folders can also contain exploratory CLIs for motivating e.g. magic constants.
diff --git a/datasets/__init__.py b/datasets/__init__.py
diff --git a/datasets/synthetic_data/.gitignore b/datasets/synthetic_data/.gitignore
@@ -0,0 +1,3 @@
+/intermediate
+/processed
+/raw
diff --git a/datasets/synthetic_data/README.md b/datasets/synthetic_data/README.md
@@ -0,0 +1,65 @@
+# Synthetic Data
+
+## PANTHER Pathway Fetching
+
+This dataset has a 'sub'-dataset of sorts: a separate Snakemake workflow
+that generates the pathway files and their associated metadata used by this dataset.
+
+Located under `./panther_pathways`, it provides TODO.
+
+## Download New PANTHER Pathways
+1. Visit [Pathway Commons](https://www.pathwaycommons.org/).
+2. Search for the desired pathway (e.g., "signaling") and filter the results by the **PANTHER pathway** data source.  
+   Example: [Search for "Signaling" filtered by PANTHER pathway](https://apps.pathwaycommons.org/search?datasource=panther&q=Signaling&type=Pathway)
+3. Click on the desired pathway and download the **Extended SIF** version of the pathway.
+4. In the `raw/pathway-data/` folder, create a new subfolder named after the pathway you downloaded.
+5. Move the downloaded Extended SIF file to this new folder (as a `.txt` file). Rename the file to match the subfolder name exactly.
+
+## Sources and Targets
+
+[Sources](http://wlab.ethz.ch/surfaceome/), or `table_S3_surfaceome.xlsx`, (see [original paper](https://doi.org/10.1073/pnas.1808790115))
+are receptors from the *in silico* human surfaceome.
+
+[Targets](https://guolab.wchscu.cn/AnimalTFDB4//#/), or `Homo_sapiens_TF.tsv`, (see [original paper](https://doi.org/10.1093/nar/gkac907))
+are human transcription factors.
+
+## Steps to Generate SPRAS-Compatible Pathways
+
+This entire workflow can also be done with `uv run snakemake --cores 1` inside this directory.
+
+### 1. Process PANTHER Pathways
+
+1. Open `Snakefile` and add the name of any new pathways to the `pathways` entry.
+2. Run the command:
+   ```sh
+   uv run scripts/process_panther_pathway.py <pathway>
+   ```
+3. This will create five new files in the respective `pathway` subfolder of the `pathway-data/` directory:
+- `edges.txt`
+- `nodes.txt`
+- `prizes-100.txt`
+- `sources.txt`
+- `targets.txt`
+
+### 2. Convert Pathways to SPRAS-Compatible Format
+1.	In `panther_spras_formatting.py`, add the name of any new pathways to the `pathway_dirs` list on **line 8**.
+2.	From the `synthetic_data/` directory, run the command:
+```
+python scripts/panther_spras_formatting.py
+```
+3. This will create a new folder named `spras-compatible-pathway-data`, containing subfolders for each PANTHER pathway in SPRAS-compatible format.  
+Each subfolder will include the following three files:
+- `<pathway_name>_gs_edges.txt`
+- `<pathway_name>_gs_nodes.txt`
+- `<pathway_name>_node_prizes.txt`
+
+# Pilot Data
+For the pilot data, use the list `["Wnt_signaling", "JAK_STAT_signaling", "Interferon_gamma_signaling", "FGF_signaling", "Ras"]` in both:
+- the list in `combine.py`
+- the list in `overlap_analytics.py`
+
+Make sure the same pathways, `["Wnt_signaling", "JAK_STAT_signaling", "Interferon_gamma_signaling", "FGF_signaling", "Ras"]`, are also added to:
+- the `pathways` vector in `ProcessPantherPathway.R`
+- the list in `panther_spras_formatting.py`
+
+**Once you’ve updated the pathway lists in all relevant scripts, run all the steps above to generate the Pilot dataset.**
diff --git a/datasets/synthetic_data/Snakefile b/datasets/synthetic_data/Snakefile
@@ -0,0 +1,83 @@
+# Shared cache workflow; `produce_fetch_rules` and `FetchConfig` used below are
+# presumably defined by this include — confirm against cache/Snakefile.
+include: "../../cache/Snakefile"
+from jsonc_parser.parser import JsoncParser
+
+# Pathway names, read from pathways.jsonc, that the `all` rule expands over.
+pathways = JsoncParser.parse_file("pathways.jsonc")
+
+# TODO: deduplicate from sampling.py
+# Threshold wildcard values as strings: "0.1", "0.2", ..., "1.0".
+thresholds = list(map(str, map(lambda x: (x + 1) / 10, range(10))))
+
+# Default target: the STRING links file plus, for every (threshold, pathway)
+# combination, the thresholded interactome and gold-standard edge files.
+rule all:
+    input:
+        "raw/9606.protein.links.full.v12.0.txt",
+        expand([
+            "thresholded/{threshold}/{pathway}/interactome.txt",
+            "thresholded/{threshold}/{pathway}/gold_standard_edges.txt",
+        ], pathway=pathways, threshold=thresholds)
+
+# Maps each local output path to a key path in the cache directory
+# (e.g. ["Surfaceome", "table_S3_surfaceome.xlsx"] in cache/directory.py).
+# FetchConfig(..., uncompress=True) is used for the .gz entries; presumably it
+# decompresses them after fetching — confirm against cache/Snakefile.
+produce_fetch_rules({
+    "raw/9606.protein.links.full.v12.0.txt": FetchConfig(["STRING", "9606", "9606.protein.links.full.txt.gz"], uncompress=True),
+    "raw/human-interactome/table_S3_surfaceome.xlsx": ["Surfaceome", "table_S3_surfaceome.xlsx"],
+    "raw/human-interactome/Homo_sapiens_TF.tsv": ["TranscriptionFactors", "Homo_sapiens_TF.tsv"],
+    "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv": FetchConfig(["UniProt", "9606", "HUMAN_9606_idmapping_selected.tab.gz"], uncompress=True),
+    "raw/pc-panther-biopax.owl": ["PathwayCommons", "intermediate", "pc-panther-biopax.owl"],
+    "raw/denylist.txt": ["PathwayCommons", "denylist.txt"],
+    "raw/pathways.txt": FetchConfig(["PathwayCommons", "pathways.txt.gz"], uncompress=True)
+})
+
+# Runs scripts/interactome.py; the declared inputs are the STRING links file and
+# a protein alias file, and the declared outputs land under processed/.
+# NOTE(review): no fetch rule in this Snakefile produces
+# raw/9606.protein.aliases.txt — confirm it is provided elsewhere.
+rule interactome:
+    input:
+        "raw/9606.protein.links.full.v12.0.txt",
+        "raw/9606.protein.aliases.txt"
+    output:
+        "processed/proteins_missing_aliases.csv",
+        "processed/removed_edges.txt",
+        "processed/interactome.tsv"
+    shell:
+        "uv run scripts/interactome.py"
+
+# Runs scripts/map_transcription_factors.py; declared to take the transcription
+# factor list and the UniProt ID-mapping table and produce
+# Homo_sapiens_TF_Uniprot.tsv.
+rule process_tfs:
+    input:
+        "raw/human-interactome/Homo_sapiens_TF.tsv",
+        "raw/human-interactome/HUMAN_9606_idmapping_selected.tsv"
+    output:
+        "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
+    shell:
+        "uv run scripts/map_transcription_factors.py"
+
+# Per-pathway rule: runs scripts/process_panther_pathway.py with the pathway
+# wildcard as its only argument, declaring the edge/node/source/target/prize
+# files under intermediate/{pathway}/ as outputs.
+# NOTE(review): no rule visible in this file produces
+# raw/pathway-data/{pathway}.txt — confirm how that input is obtained.
+rule process_panther_pathway:
+    input:
+        "raw/pathway-data/{pathway}.txt",
+        "raw/human-interactome/table_S3_surfaceome.xlsx",
+        "raw/human-interactome/Homo_sapiens_TF_Uniprot.tsv"
+    output:
+        "intermediate/{pathway}/edges.txt",
+        "intermediate/{pathway}/nodes.txt",
+        "intermediate/{pathway}/sources.txt",
+        "intermediate/{pathway}/targets.txt",
+        "intermediate/{pathway}/prizes.txt"
+    shell:
+        "uv run scripts/process_panther_pathway.py {wildcards.pathway}"
+
+# Per-pathway rule: runs scripts/panther_spras_formatting.py on the five
+# intermediate files of a pathway, declaring the node-prize and gold-standard
+# edge/node files under processed/{pathway}/ as outputs.
+rule make_spras_compatible:
+    input:
+        "intermediate/{pathway}/edges.txt",
+        "intermediate/{pathway}/nodes.txt",
+        "intermediate/{pathway}/sources.txt",
+        "intermediate/{pathway}/targets.txt",
+        "intermediate/{pathway}/prizes.txt"
+    output:
+        "processed/{pathway}/{pathway}_node_prizes.txt",
+        "processed/{pathway}/{pathway}_gs_edges.txt",
+        "processed/{pathway}/{pathway}_gs_nodes.txt"
+    shell:
+        "uv run scripts/panther_spras_formatting.py {wildcards.pathway}"
+
+# Per-pathway rule: one run of scripts/sampling.py is declared to produce the
+# outputs for every threshold at once. The doubled braces keep {pathway} as a
+# rule wildcard while expand() fills in {threshold}.
+rule threshold:
+    input:
+        "processed/{pathway}/{pathway}_node_prizes.txt",
+        "processed/{pathway}/{pathway}_gs_edges.txt"
+    output:
+        expand("thresholded/{threshold}/{{pathway}}/interactome.txt", threshold=thresholds),
+        expand("thresholded/{threshold}/{{pathway}}/gold_standard_edges.txt", threshold=thresholds)
+    shell:
+        "uv run scripts/sampling.py {wildcards.pathway}"
diff --git a/datasets/synthetic_data/__init__.py b/datasets/synthetic_data/__init__.py
diff --git a/datasets/synthetic_data/panther_pathways/.gitignore b/datasets/synthetic_data/panther_pathways/.gitignore
@@ -0,0 +1,3 @@
+/raw
+/intermediate
+/output
diff --git a/datasets/synthetic_data/panther_pathways/Snakefile b/datasets/synthetic_data/panther_pathways/Snakefile
@@ -0,0 +1,19 @@
+include: "../../../cache/Snakefile"
+
+# Default target: the single BioPAX file written by the fetch_from_owl rule.
+rule all:
+    input:
+        "output/pc-panther-biopax.owl"
+
+# Fetches the full PathwayCommons BioPAX file and the pathway identifier list
+# from the cache; both are stored gzipped, hence uncompress=True.
+produce_fetch_rules({
+    "raw/pc-biopax.owl": FetchConfig(["PathwayCommons", "pc-biopax.owl.gz"], uncompress=True),
+    "raw/pathways.txt": FetchConfig(["PathwayCommons", "pathways.txt.gz"], uncompress=True)
+})
+
+# Runs fetch_from_owl.py, which reads raw/pc-biopax.owl and raw/pathways.txt
+# and writes output/pc-panther-biopax.owl.
+rule fetch_from_owl:
+    input:
+        "raw/pc-biopax.owl",
+        "raw/pathways.txt"
+    output:
+        "output/pc-panther-biopax.owl"
+    shell:
+        "uv run fetch_from_owl.py"
diff --git a/datasets/synthetic_data/panther_pathways/__init__.py b/datasets/synthetic_data/panther_pathways/__init__.py
diff --git a/datasets/synthetic_data/panther_pathways/fetch_from_owl.py b/datasets/synthetic_data/panther_pathways/fetch_from_owl.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+from paxtools.fetch import fetch
+from datasets.synthetic_data.util.parse_pc_pathways import parse_pc_pathways
+
+current_directory = Path(__file__).parent.resolve()
+
+def main():
+    pathways_df = parse_pc_pathways(current_directory / 'raw' / 'pathways.txt')
+    print("Fetching pathways... [This may take some time. On the author's desktop machine, it took 15 minutes.]")
+    (current_directory / 'output').mkdir(exist_ok=True)
+    fetch(
+        current_directory / 'raw' / 'pc-biopax.owl',
+        output=(current_directory / 'output' / "pc-panther-biopax.owl"),
+        uris=list(pathways_df["PATHWAY_URI"]),
+        memory=f"{2**(16 - 1)}m" # this is why we don't run this in CI! This is 32gb of memory.
+    )
+
+if __name__ == "__main__":
+    main()
diff --git a/datasets/synthetic_data/pathways.jsonc b/datasets/synthetic_data/pathways.jsonc
@@ -0,0 +1,31 @@
+[
+    // "CCKR signaling map", TODO: not a valid pathway name?
+    "Wnt signaling pathway",
+    "VEGF signaling pathway",
+    "Toll receptor signaling pathway",
+    "TGF-beta signaling pathway",
+    "PDGF signaling pathway",
+    "Notch signaling pathway",
+    "JAK/STAT signaling pathway",
+    "Interleukin signaling pathway",
+    "Interferon-gamma signaling pathway",
+    "Integrin signalling pathway",
+    "Insulin/IGF pathway-protein kinase B signaling cascade",
+    "Inflammation mediated by chemokine and cytokine signaling pathway",
+    "Hedgehog signaling pathway",
+    "FGF signaling pathway",
+    "FAS signaling pathway",
+    // "Endothelin signaling pathway", TODO: not a valid pathway name?
+    "EGF receptor signaling pathway",
+    "Cadherin signaling pathway",
+    "Apoptosis signaling pathway",
+    "Ras Pathway",
+    "PI3 kinase pathway",
+    "p38 MAPK pathway",
+    "Insulin/IGF pathway-mitogen activated protein kinase kinase/MAP kinase cascade",
+    "p53 pathway",
+    "Hypoxia response via HIF activation",
+    "Oxidative stress response",
+    "B cell activation",
+    "T cell activation"
+]
diff --git a/datasets/synthetic_data/scripts/__init__.py b/datasets/synthetic_data/scripts/__init__.py
diff --git a/datasets/synthetic_data/scripts/fetch_pathway.py b/datasets/synthetic_data/scripts/fetch_pathway.py
@@ -0,0 +1,46 @@
+import argparse
+from pathlib import Path
+
+import pandas
+from paxtools.fetch import fetch
+from paxtools.sif import toSIF
+
+synthetic_directory = Path(__file__).parent.parent.resolve()
+
+def parser():
+    parser = argparse.ArgumentParser(prog="PANTHER pathway fetcher")
+
+    parser.add_argument("pathway_name", type=str)
+
+    return parser
+
+def main():
+    args = parser().parse_args()
+    curated_pathways_df = pandas.read_csv(synthetic_directory / 'intermediate' / 'curated_pathways.tsv', sep='\t')
+    associated_id = curated_pathways_df.loc[curated_pathways_df["Name"] == args.pathway_name].reset_index(drop=True).loc[0]["ID"]
+
+    pathway_data_dir = synthetic_directory / 'intermediate' / 'pathway-data'
+    pathway_data_dir.mkdir(exist_ok=True, parents=True)
+
+    fetch(
+        synthetic_directory / 'raw' / 'pc-panther-biopax.owl',
+        pathway_data_dir / Path(args.pathway_name).with_suffix(".owl"),
+        denylist=synthetic_directory / 'raw' / 'denylist.txt',
+        uris=[associated_id],
+        absolute=True
+    )
+
+    toSIF(
+        pathway_data_dir / Path(args.pathway_name).with_suffix(".owl"),
+        pathway_data_dir / Path(args.pathway_name).with_suffix(".sif"),
+        # See the paxtools library for information about how these settings were retrieved.
+        # These are directly from PathwayCommons.
+        denylist=str(synthetic_directory / 'raw' / 'denylist.txt'),
+        chemDb=["chebi"],
+        seqDb=["hgnc"],
+        exclude=["NEIGHBOR_OF"],
+        extended=True,
+    )
+
+if __name__ == "__main__":
+    main()