Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
26099d9
added parsers from other repo
lspindler2509 Mar 18, 2024
4b34531
add new parsers to build.py
lspindler2509 Mar 18, 2024
259afcc
different formatting
lspindler2509 Mar 18, 2024
ba3710e
different formatting
lspindler2509 Mar 18, 2024
79a7ac9
different formatting
lspindler2509 Mar 18, 2024
27f4c73
updated uniprot parser
lspindler2509 Mar 18, 2024
aba8173
just quick call to download a specific file
lspindler2509 Mar 18, 2024
f45ba0a
temp: correct path
lspindler2509 Mar 18, 2024
0cb5004
adjust path
lspindler2509 Mar 18, 2024
e553804
add other path
lspindler2509 Mar 18, 2024
e236ddc
redo adjustments for local download
lspindler2509 Mar 18, 2024
fdaf77f
different order of parsers
lspindler2509 Mar 19, 2024
1caf32b
temp for debugging
lspindler2509 Mar 19, 2024
443974b
as path
lspindler2509 Mar 19, 2024
46d9d55
redo paths, add logs
lspindler2509 Mar 19, 2024
a782222
different order
lspindler2509 Mar 19, 2024
4a583ca
added log
lspindler2509 Mar 19, 2024
313ce7d
delete unnecessary prints
lspindler2509 Mar 19, 2024
98bf7e5
added test script for db
lspindler2509 Mar 19, 2024
6a82afa
use live version
lspindler2509 Mar 19, 2024
a939314
adjust test for db
lspindler2509 Mar 19, 2024
8b22a76
different test
lspindler2509 Mar 19, 2024
f86ad41
check used files
lspindler2509 Apr 4, 2024
9e23ace
analyse data structure
lspindler2509 Apr 4, 2024
72d561c
different log
lspindler2509 Apr 4, 2024
3e41244
add log
lspindler2509 Apr 4, 2024
e6229d2
create gene_disorder relation, not yet into db
lspindler2509 Apr 4, 2024
4669f23
correct attributes, write in DB
lspindler2509 Apr 4, 2024
e6b4cd5
deleted logs
lspindler2509 Apr 4, 2024
ac42549
adjusted edgetype
lspindler2509 May 21, 2024
0664fb0
adjusted type parsing
lspindler2509 May 21, 2024
619d64e
adjusted parsing
lspindler2509 May 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
sider,
uberon,
repotrial,
cosmic,
ncg,
intogen
)
from nedrexdb.post_integration import trim_uberon, drop_empty_collections

Expand Down Expand Up @@ -70,6 +73,9 @@ def update(conf, download):
ncbi.parse_gene_info()
uberon.parse()
uniprot.parse_proteins()

cosmic.parse_gene_disease_associations()
ncg.parse_gene_disease_associations()

# Sources that add node type but require existing nodes, too
clinvar.parse()
Expand Down Expand Up @@ -104,6 +110,9 @@ def update(conf, download):

sider.parse()
uniprot.parse_idmap()

intogen.parse_gene_disease_associations()


from nedrexdb.analyses import molecule_similarity

Expand Down
230 changes: 230 additions & 0 deletions nedrexdb/db/parsers/cosmic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
import gzip as _gzip
from csv import DictReader as _DictReader
from pathlib import Path as _Path

from more_itertools import chunked as _chunked
from tqdm import tqdm as _tqdm

from nedrexdb.db import MongoInstance
from nedrexdb.db.models.edges.variant_affects_gene import VariantAffectsGene
from nedrexdb.db.models.edges.variant_associated_with_disorder import VariantAssociatedWithDisorder
from nedrexdb.db.models.edges.gene_associated_with_disorder import GeneAssociatedWithDisorder
from nedrexdb.db.models.nodes.gene import Gene
from nedrexdb.db.models.nodes.genomic_variant import GenomicVariant
from nedrexdb.db.parsers import _get_file_location_factory
from nedrexdb.logger import logger

get_file_location = _get_file_location_factory("cosmic")
get_clinvar_file_location = _get_file_location_factory("clinvar")


# g_dot_re = re.compile("(..?):g\.(\d*)_?(\d*)(.*)")


def get_gdot2clinvar(fname: str) -> dict[str, str]:
    """Map 'CHROM:g.…' HGVS genomic strings to namespaced ClinVar IDs.

    Reads a ClinVar VCF via the project's ClinVarVCFParser and, for every row
    carrying a CLNHGVS INFO field, records "<CHROM>:<g-dot>" -> "clinvar.<ID>".

    Args:
        fname: path to the ClinVar VCF file.

    Returns:
        dict mapping "CHROM:g.…" strings to "clinvar.<row ID>" accessions.
    """
    # Local import keeps the cosmic and clinvar parser modules decoupled at
    # import time (avoids a potential circular import).
    from nedrexdb.db.parsers.clinvar import ClinVarVCFParser

    vcf_parser = ClinVarVCFParser(fname)
    gdot2clinvar: dict[str, str] = {}
    for row in vcf_parser.iter_rows():
        full_gdot = row["INFO"].get("CLNHGVS")
        if full_gdot:
            # CLNHGVS looks like "NC_000001.11:g.12345A>G"; keep only the
            # g-dot part and key by the VCF CHROM value instead of the
            # RefSeq accession. (Reuse full_gdot instead of a second lookup.)
            _, gdot = full_gdot.split(":", maxsplit=1)
            gdot2clinvar[f"{row['CHROM']}:{gdot}"] = f"clinvar.{row['ID']}"
    return gdot2clinvar


def get_cancer2mondo(mapping_fname: _Path) -> dict[tuple, str]:
    """Read a TSV mapping COSMIC site/histology tuples to MONDO CURIEs.

    Args:
        mapping_fname: path to a tab-separated file with the eight COSMIC
            site/histology columns plus a 'mapped_curie' column.

    Returns:
        dict mapping the 8-tuple of site/histology values to the MONDO CURIE.
        (The original annotated this as ``dict[tuple: str]``, which is a
        slice expression, not a mapping type.)
    """
    mapping_columns = ['SITE_PRIMARY_COSMIC', 'SITE_SUBTYPE1_COSMIC',
                       'SITE_SUBTYPE2_COSMIC', 'SITE_SUBTYPE3_COSMIC', 'HISTOLOGY_COSMIC',
                       'HIST_SUBTYPE1_COSMIC', 'HIST_SUBTYPE2_COSMIC', 'HIST_SUBTYPE3_COSMIC']
    with open(mapping_fname, newline='') as mapping_file:
        reader = _DictReader(mapping_file, delimiter="\t")
        return {
            tuple(row[column] for column in mapping_columns): row['mapped_curie']
            for row in reader
        }


class COSMICRow:
    """Wrapper around one row (dict) of the COSMIC census TSV, providing
    typed accessors and conversion into NeDRex node/edge models.

    Note: ~80 lines of commented-out variant-matching code (``get_variant``)
    were removed; see version control history if it needs to be revived.
    """

    def __init__(self, row):
        # row: dict mapping COSMIC column names to their string values.
        self._row = row

    def get_HGVSG(self):
        """Return the genomic HGVS (g-dot) string, e.g. '7:g.140453136A>T'."""
        return self._row["HGVSG"]

    def get_COSMIC(self):
        """Return the namespaced COSMIC variant accession ('cosmic.<ID>')."""
        return f"cosmic.{self._row['GENOMIC_MUTATION_ID']}"

    def get_symbol(self):
        """Return the gene symbol as given by COSMIC."""
        return self._row["Gene name"]

    def get_cancer_tuple(self) -> tuple:
        """Return the 8-part site/histology tuple used as key into the
        cancer->MONDO mapping produced by get_cancer2mondo()."""
        return tuple(
            self._row[column] for column in ['Primary site', 'Site subtype 1', 'Site subtype 2', 'Site subtype 3',
                                             'Primary histology', 'Histology subtype 1', 'Histology subtype 2',
                                             'Histology subtype 3'])

    def get_mutation_status(self):
        """Return COSMIC's somatic-status annotation for this mutation."""
        return self._row['Mutation somatic status']

    def parse(self, gdot2clinvar: dict[str, str], symbol2entrez: dict[str, str],
              cancer2mondo: dict[tuple, str]) -> \
            "tuple[GenomicVariant, VariantAffectsGene, VariantAssociatedWithDisorder]":
        """Build the node and edge models for this row.

        Args:
            gdot2clinvar: maps HGVSG strings to ClinVar variant IDs.
            symbol2entrez: maps gene symbols to NeDRex gene primaryDomainIds.
            cancer2mondo: maps site/histology tuples to MONDO disorder IDs.

        Returns:
            (genomic_variant, variant_gene, variant_disorder); all three are
            None when the row's HGVSG has no matching ClinVar variant.
        """
        variant_id = gdot2clinvar.get(self.get_HGVSG())
        genomic_variant = None
        variant_gene = None
        variant_disorder = None
        if variant_id:
            cosmic_id = self.get_COSMIC()
            asserted_by = ["cosmic"]
            genomic_variant = GenomicVariant(primaryDomainId=variant_id, domainIds=[
                cosmic_id], dataSources=asserted_by)
            # Raises KeyError if the symbol was not mapped beforehand;
            # COSMICParser asserts full symbol coverage before calling parse.
            gene_id = symbol2entrez[self.get_symbol()]
            variant_gene = VariantAffectsGene(sourceDomainId=variant_id, targetDomainId=gene_id,
                                              dataSources=asserted_by)
            mondo_id = cancer2mondo.get(self.get_cancer_tuple())
            # NOTE(review): mondo_id may be None here, producing an edge with
            # targetDomainId=None (a commented-out `if mondo_id:` guard existed
            # in the original) — confirm this is intended downstream.
            variant_disorder = VariantAssociatedWithDisorder(accession=cosmic_id, dataSources=asserted_by,
                                                             sourceDomainId=variant_id,
                                                             targetDomainId=mondo_id,
                                                             reviewStatus=self.get_mutation_status())

        return genomic_variant, variant_gene, variant_disorder


class COSMICParser:
COLUMN_NAMES = ['HGVSG', 'Gene name', 'GENOMIC_MUTATION_ID', 'Mutation somatic status', 'Primary site',
'Site subtype 1', 'Site subtype 2',
'Site subtype 3',
'Primary histology', 'Histology subtype 1', 'Histology subtype 2',
'Histology subtype 3']

def __init__(self, f: _Path):
self.f = f

if self.f.name.endswith(".gz") or self.f.name.endswith(".gzip"):
self.gzipped = True
else:
self.gzipped = False

def parse(self, mapping_fname: _Path):
if self.gzipped:
f = _gzip.open(self.f, "rt")
else:
f = self.f.open()

reader = _DictReader(f, delimiter="\t")
f_dict = [{column: row[column]
for column in self.COLUMN_NAMES} for row in reader]
f.close()

all_symbols = {row['Gene name'] for row in f_dict}
symbol2entrez = {gene["approvedSymbol"]: gene["primaryDomainId"] for gene in
Gene.find(MongoInstance.DB, {"approvedSymbol": {"$in": list(all_symbols)}})}
non_approved_symbols = all_symbols - symbol2entrez.keys()
for symbol in non_approved_symbols:
genes = [gene["primaryDomainId"]
for gene in Gene.find(MongoInstance.DB, {"symbols": symbol})]
assert len(genes) == 1, f"Multiple genes found for the symbol {symbol}"
symbol2entrez.update({symbol: genes[0]})
assert not (non_approved_symbols - symbol2entrez.keys()), \
f"Not all symbols could be mapped: {non_approved_symbols - symbol2entrez.keys()}"

# id2genomic_variant = {genomic_variant['primaryDomainId']: genomic_variant for genomic_variant in
# GenomicVariant.find(MongoInstance.DB)}
# chr_pos_type2id = defaultdict(list)
# this_id =
# genomic_variant['primaryDomainId'], genomic_variant
# chr_pos_type2id[
# (genomic_variant['chromosome'], genomic_variant['position'], genomic_variant['variantType'])].append(
# genomic_variant)
gdot2clinvar = get_gdot2clinvar(
get_clinvar_file_location("human_data"))
cancer2mondo = get_cancer2mondo(mapping_fname)

updates = (COSMICRow(row).parse(
gdot2clinvar, symbol2entrez, cancer2mondo) for row in f_dict)
for chunk in _tqdm(_chunked(updates, 10_000), leave=False, desc="Parsing COSMIC"):
if not chunk:
continue
genomic_variant_updates, variant_gene_updates, variant_disorder_updates, gene_disorder_updates = [], [], [], []
for genomic_variant, variant_gene, variant_disorder in chunk:
if genomic_variant:
genomic_variant_updates.append(
genomic_variant.generate_update())
variant_gene_updates.append(variant_gene.generate_update())
if variant_disorder:
variant_disorder_updates.append(
variant_disorder.generate_update())
gene_disorder = GeneAssociatedWithDisorder(dataSources=["cosmic"],
sourceDomainId=variant_gene.targetDomainId,
targetDomainId=variant_disorder.targetDomainId)
gene_disorder_updates.append(gene_disorder.generate_update())

for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name, GeneAssociatedWithDisorder.collection_name],
[genomic_variant_updates, variant_gene_updates, variant_disorder_updates, gene_disorder_updates]):
bulk_write_results = MongoInstance.DB[this_collection_name].bulk_write(
these_updates)
if bulk_write_results.bulk_api_result['writeErrors'] or bulk_write_results.bulk_api_result['writeConcernErrors']:
print(bulk_write_results.bulk_api_result)



def parse_gene_disease_associations():
    """Entry point: parse the COSMIC census data and write it into the DB."""
    logger.info("Parsing COSMIC")
    census_file = get_file_location("census")
    mapping_file = get_file_location("mapping")
    parser = COSMICParser(census_file)
    parser.parse(mapping_file)
12 changes: 12 additions & 0 deletions nedrexdb/db/parsers/db_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pymongo

# Connect to the MongoDB instance exposed by the NeDRexDB setup.
client = pymongo.MongoClient('localhost', 27020)
db = client['licensed_nedrex_live']  # select the live (licensed) database

# Fetch one document from the gene-disorder association collection as a
# quick smoke test that data was written. Note: find_one() returns the
# first matching document, not a random one.
document = db['gene_associated_with_disorder'].find_one()

# Print the result (the original printed it twice).
print(document)
105 changes: 105 additions & 0 deletions nedrexdb/db/parsers/intogen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import gzip as _gzip
from csv import DictReader as _DictReader
from itertools import chain as _chain
from pathlib import Path as _Path

import requests
from more_itertools import chunked as _chunked
from tqdm import tqdm as _tqdm

from nedrexdb.db import MongoInstance
from nedrexdb.db.models.edges.gene_associated_with_disorder import GeneAssociatedWithDisorder
from nedrexdb.db.models.nodes.gene import Gene
from nedrexdb.db.parsers import _get_file_location_factory

get_file_location = _get_file_location_factory("intogen")


def biomart_symbol_transcript_to_entrez(symbol_list: list[str], filter_by: str = "hgnc_symbol", batch_size: int = 100):
    """Query Ensembl BioMart to map gene identifiers to Entrez gene IDs.

    Args:
        symbol_list: identifiers to map (values for the *filter_by* attribute).
        filter_by: BioMart filter/attribute name to match on.
        batch_size: number of identifiers sent per BioMart request.

    Returns:
        dict mapping each returned input identifier to its Entrez gene ID.
        Identifiers BioMart does not report are absent from the result.

    Raises:
        requests.HTTPError: if any BioMart request fails.
    """
    import xml.etree.ElementTree as ET
    query = ET.Element("Query", virtualSchemaName="default", formatter="CSV", header="0", uniqueRows="0", count="",
                       datasetConfigVersion="0.6")
    dataset = ET.SubElement(
        query, "Dataset", name="hsapiens_gene_ensembl", interface="default")
    # "{tr_ids}" is a str.format placeholder filled with each batch below.
    ET.SubElement(dataset, "Filter", name=filter_by, value="{tr_ids}")
    ET.SubElement(dataset, "Attribute", name=filter_by)
    ET.SubElement(dataset, "Attribute", name="entrezgene_id")
    xml_template = (
        '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query>'
        f'{ET.tostring(query, encoding="unicode")}'
    )

    symbol2entrez: dict[str, str] = dict()
    for i in range(0, len(symbol_list), batch_size):
        batch = symbol_list[i:i + batch_size]
        # Pass the XML via params= so requests percent-encodes it, instead of
        # embedding it raw in the URL (the original also wrapped the string
        # in a redundant format() call).
        response = requests.get(
            'http://www.ensembl.org/biomart/martservice',
            params={'query': xml_template.format(tr_ids=",".join(batch))})
        response.raise_for_status()
        # BioMart CSV output is one "<input_id>,<entrez_id>" pair per line.
        symbol2entrez.update(dict(row.split(',', 1)
                                  for row in response.content.decode('utf-8').splitlines()))
    return symbol2entrez


class IntOGenRow:
    """A single row of the IntOGen drivers table (SYMBOL + CANCER_TYPE)."""

    def __init__(self, row):
        # row: dict with at least the keys "SYMBOL" and "CANCER_TYPE".
        self._row = row

    def parse(self, intogen2mondo: dict[str, list[str]], symbol2entrez: dict[str, str]) -> list[
            GeneAssociatedWithDisorder]:
        """Return one gene->disorder association edge per MONDO disorder
        mapped to this row's cancer type."""
        gene_id = symbol2entrez[self._row["SYMBOL"]]
        sources = ["intogen"]
        mondo_ids = intogen2mondo[self._row["CANCER_TYPE"]]

        edges = []
        for mondo_id in mondo_ids:
            # MONDO CURIEs are rewritten to the NeDRex "mondo." namespace.
            edge = GeneAssociatedWithDisorder(
                sourceDomainId=gene_id,
                targetDomainId=mondo_id.replace("MONDO:", "mondo."),
                dataSources=sources,
            )
            edges.append(edge)

        return edges


class IntOGenParser:
def __init__(self, f: _Path, mapping: _Path):
self.f = f

if self.f.name.endswith(".gz") or self.f.name.endswith(".gzip"):
self.gzipped = True
else:
self.gzipped = False

import json
self.intogen2mondo = json.load(open(mapping))["mondo_id"]

def parse(self):
if self.gzipped:
f = _gzip.open(self.f, "rt")
else:
f = self.f.open()

reader = _DictReader(f, delimiter="\t")
f_dict = [{"SYMBOL": row['SYMBOL'], "CANCER_TYPE": row['CANCER_TYPE']}
for row in reader]

symbol2entrez = {gene["approvedSymbol"]: gene["primaryDomainId"]
for gene in Gene.find(MongoInstance.DB)}

updates = (IntOGenRow(row).parse(self.intogen2mondo, symbol2entrez)
for row in f_dict)
for chunk in _tqdm(_chunked(updates, 1_000), leave=False, desc="Parsing IntOGen"):
chunk = list(_chain(*chunk))
chunk = [gawd.generate_update() for gawd in chunk]

if not chunk:
continue

MongoInstance.DB[GeneAssociatedWithDisorder.collection_name].bulk_write(
chunk)

f.close()


def parse_gene_disease_associations():
    """Entry point: parse the IntOGen drivers file and write it into the DB."""
    drivers_file = get_file_location("drivers")
    mapping_file = get_file_location("mapping")
    parser = IntOGenParser(drivers_file, mapping_file)
    parser.parse()
Loading