Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8,150 changes: 8,150 additions & 0 deletions docs/tutorials/cell_level_ldsc_analysis.ipynb

Large diffs are not rendered by default.

19,343 changes: 19,343 additions & 0 deletions docs/tutorials/sclinker.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/cellink/io/_sgkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,8 @@ def read_plink(

def read_bgen(
path: str | Path = None,
metafile_path: str | Path = None,
sample_path: str | Path = None,
*,
var_rename=None,
obs_rename=None,
Expand Down Expand Up @@ -415,7 +417,5 @@ def read_bgen(
raise ImportError("sgkit is required for `read_bgen`. Install with `pip install cellink[datasets]`.")

sgkit_dataset = sg_bgen.read_bgen(path=path, **kwargs)
gdata = from_sgkit_dataset(
sgkit_dataset, var_rename=var_rename, obs_rename=obs_rename, X_field=X_field, hard_call=hard_call, keep_multiallelic=keep_multiallelic, load_call_fields=load_call_fields
)
gdata = from_sgkit_dataset(sgkit_dataset, metafile_path=metafile_path, sample_path=sample_path, var_rename=var_rename, obs_rename=obs_rename, hard_call=hard_call)
return gdata
8 changes: 7 additions & 1 deletion src/cellink/resources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,10 @@
get_pgs_catalog_score,
get_pgs_catalog_scores,
)
from ._ld import get_1000genomes_ld_scores, get_1000genomes_ld_weights
from ._ld import (
get_1000genomes_ld_scores,
get_1000genomes_ld_weights,
get_1000genomes_plink_files,
get_1000genomes_frq,
get_1000genomes_hapmap3,
)
3 changes: 2 additions & 1 deletion src/cellink/resources/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
import pandas as pd

import cellink as cl
from cellink._core import DonorData
from cellink.io import read_h5_dd, read_zarr_dd
from cellink.resources._datasets_utils import plink_filter_prune, plink_kinship, preprocess_vcf_to_plink, try_liftover
from cellink.resources._utils import _download_file, _load_config, _run, get_data_home

from .._core import DonorData

logging.basicConfig(level=logging.INFO)


Expand Down
103 changes: 0 additions & 103 deletions src/cellink/resources/_gwas_prs_qtl.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,109 +98,6 @@ def get_gwas_catalog_study(accession_id: str, **params: Any) -> dict:
"""
return _fetch(f"studies/{accession_id}", params=params, paginate=False)

"""
def get_gwas_catalog_study_summary_stats(
accession_id: str, dest: str | Path | None = None, return_path: bool = False, **params: Any
) -> pd.DataFrame | Path:

study_meta = _fetch(f"{GWAS_API_BASE}/studies/{accession_id}", params=params, paginate=False)

if "full_summary_stats" not in study_meta:
raise ValueError(f"Study {accession_id} does not have full summary statistics available")

base_url = study_meta["full_summary_stats"]
harmonised_url = f"{base_url}/harmonised"

import re

try:
r = requests.get(harmonised_url)
r.raise_for_status()

all_files = re.findall(r'href="([^"]*\.tsv\.gz)"', r.text)

h_files = [f for f in all_files if f.endswith(".h.tsv.gz") and not f.endswith(".h.tsv.gz-meta.yaml")]

if h_files:

def build_priority(filename):
filename_lower = filename.lower()
if "build38" in filename_lower or "hg38" in filename_lower or "grch38" in filename_lower:
return 2
elif "build37" in filename_lower or "hg19" in filename_lower or "grch37" in filename_lower:
return 1
else:
return 0

h_files.sort(key=build_priority, reverse=True)
filename = h_files[0]
url = f"{harmonised_url}/{filename}"
logging.info(f"Found harmonised file: {filename}")
else:
raise ValueError("No harmonised .h.tsv.gz files found")

except Exception as e:
logging.warning(f"Could not find harmonised files ({e}), trying base directory")

try:
r = requests.get(base_url)
r.raise_for_status()
files = re.findall(r'href="([^"]*\.tsv\.gz)"', r.text)

if files:

def build_priority(filename):
filename_lower = filename.lower()
if "build38" in filename_lower or "hg38" in filename_lower or "grch38" in filename_lower:
return 2
elif "build37" in filename_lower or "hg19" in filename_lower or "grch37" in filename_lower:
return 1
else:
return 0

files.sort(key=build_priority, reverse=True)
filename = files[0]
url = f"{base_url}/{filename}"
else:
possible_files = [
f"{accession_id}_buildGRCh38.tsv.gz",
f"{accession_id}_buildGRCh37.tsv.gz",
f"{accession_id}.tsv.gz",
]

for filename in possible_files:
test_url = f"{base_url}/{filename}"
try:
test_r = requests.head(test_url)
if test_r.status_code == 200:
url = test_url
break
except:
continue
else:
raise ValueError(f"Could not find summary statistics file for {accession_id}")

except Exception as e2:
raise ValueError(f"Could not find summary statistics for {accession_id}: {e2}")

if not dest:
data_home = get_data_home()
dest = data_home / f"{accession_id}_summary_stats.tsv.gz"

logging.info(f"Downloading {url} to {dest}")

try:
urlretrieve(url, dest)
except Exception as e:
raise RuntimeError(f"Failed to download summary statistics from {url}: {e}")

if return_path:
return dest

data = pd.read_csv(dest, compression="gzip", delimiter="\t")
return data
"""

def get_gwas_catalog_study_summary_stats(
accession_id: str,
dest: str | Path | None = None,
Expand Down
Loading
Loading