Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
26099d9
added parsers from other repo
lspindler2509 Mar 18, 2024
4b34531
add new parsers to build.py
lspindler2509 Mar 18, 2024
259afcc
different formatting
lspindler2509 Mar 18, 2024
ba3710e
different formatting
lspindler2509 Mar 18, 2024
79a7ac9
different formatting
lspindler2509 Mar 18, 2024
27f4c73
updated uniprot parser
lspindler2509 Mar 18, 2024
aba8173
just quick call to download a specific file
lspindler2509 Mar 18, 2024
f45ba0a
temp: correct path
lspindler2509 Mar 18, 2024
0cb5004
adjust path
lspindler2509 Mar 18, 2024
e553804
add other path
lspindler2509 Mar 18, 2024
e236ddc
redo adjustments for local download
lspindler2509 Mar 18, 2024
fdaf77f
different order of parsers
lspindler2509 Mar 19, 2024
1caf32b
temp for debugging
lspindler2509 Mar 19, 2024
443974b
as path
lspindler2509 Mar 19, 2024
46d9d55
redo paths, add logs
lspindler2509 Mar 19, 2024
a782222
different order
lspindler2509 Mar 19, 2024
4a583ca
added log
lspindler2509 Mar 19, 2024
313ce7d
delete unnecessary prints
lspindler2509 Mar 19, 2024
98bf7e5
added test script for db
lspindler2509 Mar 19, 2024
6a82afa
use live version
lspindler2509 Mar 19, 2024
a939314
adjust test for db
lspindler2509 Mar 19, 2024
8b22a76
different test
lspindler2509 Mar 19, 2024
f86ad41
check used files
lspindler2509 Apr 4, 2024
9e23ace
analyse data structure
lspindler2509 Apr 4, 2024
72d561c
different log
lspindler2509 Apr 4, 2024
3e41244
add log
lspindler2509 Apr 4, 2024
e6229d2
create gene_disorder relation, not yet into db
lspindler2509 Apr 4, 2024
4669f23
correct attributes, write in DB
lspindler2509 Apr 4, 2024
e6b4cd5
deleted logs
lspindler2509 Apr 4, 2024
ac42549
adjusted edgetype
lspindler2509 May 21, 2024
0664fb0
adjusted type parsing
lspindler2509 May 21, 2024
619d64e
adjusted parsing
lspindler2509 May 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
sider,
uberon,
repotrial,
cosmic,
ncg,
intogen
)
from nedrexdb.post_integration import trim_uberon, drop_empty_collections

Expand Down Expand Up @@ -70,6 +73,9 @@ def update(conf, download):
ncbi.parse_gene_info()
uberon.parse()
uniprot.parse_proteins()

cosmic.parse_gene_disease_associations()
ncg.parse_gene_disease_associations()

# Sources that add node type but require existing nodes, too
clinvar.parse()
Expand Down Expand Up @@ -104,6 +110,9 @@ def update(conf, download):

sider.parse()
uniprot.parse_idmap()

intogen.parse_gene_disease_associations()


from nedrexdb.analyses import molecule_similarity

Expand Down
230 changes: 230 additions & 0 deletions nedrexdb/db/parsers/cosmic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
import gzip as _gzip
from csv import DictReader as _DictReader
from pathlib import Path as _Path

from more_itertools import chunked as _chunked
from tqdm import tqdm as _tqdm

from nedrexdb.db import MongoInstance
from nedrexdb.db.models.edges.variant_affects_gene import VariantAffectsGene
from nedrexdb.db.models.edges.variant_associated_with_disorder import VariantAssociatedWithDisorder
from nedrexdb.db.models.edges.gene_associated_with_disorder import GeneAssociatedWithDisorder
from nedrexdb.db.models.nodes.gene import Gene
from nedrexdb.db.models.nodes.genomic_variant import GenomicVariant
from nedrexdb.db.parsers import _get_file_location_factory
from nedrexdb.logger import logger

get_file_location = _get_file_location_factory("cosmic")
get_clinvar_file_location = _get_file_location_factory("clinvar")


# g_dot_re = re.compile("(..?):g\.(\d*)_?(\d*)(.*)")


def get_gdot2clinvar(fname: str) -> dict[str, str]:
    """Map 'CHROM:g.…' HGVS genomic strings to namespaced ClinVar IDs.

    Reads a ClinVar VCF via the project's ClinVarVCFParser and, for every row
    carrying a CLNHGVS INFO field, records "<CHROM>:<g-dot>" -> "clinvar.<ID>".

    Args:
        fname: path to the ClinVar VCF file.

    Returns:
        dict mapping "CHROM:g.…" strings to "clinvar.<row ID>" accessions.
    """
    # Local import keeps the cosmic and clinvar parser modules decoupled at
    # import time (avoids a potential circular import).
    from nedrexdb.db.parsers.clinvar import ClinVarVCFParser

    vcf_parser = ClinVarVCFParser(fname)
    gdot2clinvar: dict[str, str] = {}
    for row in vcf_parser.iter_rows():
        full_gdot = row["INFO"].get("CLNHGVS")
        if full_gdot:
            # CLNHGVS looks like "NC_000001.11:g.12345A>G"; keep only the
            # g-dot part and key by the VCF CHROM value instead of the
            # RefSeq accession. (Reuse full_gdot instead of a second lookup.)
            _, gdot = full_gdot.split(":", maxsplit=1)
            gdot2clinvar[f"{row['CHROM']}:{gdot}"] = f"clinvar.{row['ID']}"
    return gdot2clinvar


def get_cancer2mondo(mapping_fname: _Path) -> dict[tuple, str]:
    """Read a TSV mapping COSMIC site/histology tuples to MONDO CURIEs.

    Args:
        mapping_fname: path to a tab-separated file with the eight COSMIC
            site/histology columns plus a 'mapped_curie' column.

    Returns:
        dict mapping the 8-tuple of site/histology values to the MONDO CURIE.
        (The original annotated this as ``dict[tuple: str]``, which is a
        slice expression, not a mapping type.)
    """
    mapping_columns = ['SITE_PRIMARY_COSMIC', 'SITE_SUBTYPE1_COSMIC',
                       'SITE_SUBTYPE2_COSMIC', 'SITE_SUBTYPE3_COSMIC', 'HISTOLOGY_COSMIC',
                       'HIST_SUBTYPE1_COSMIC', 'HIST_SUBTYPE2_COSMIC', 'HIST_SUBTYPE3_COSMIC']
    with open(mapping_fname, newline='') as mapping_file:
        reader = _DictReader(mapping_file, delimiter="\t")
        return {
            tuple(row[column] for column in mapping_columns): row['mapped_curie']
            for row in reader
        }


class COSMICRow:
    """Wrapper around one row (dict) of the COSMIC census TSV, providing
    typed accessors and conversion into NeDRex node/edge models.

    Note: ~80 lines of commented-out variant-matching code (``get_variant``)
    were removed; see version control history if it needs to be revived.
    """

    def __init__(self, row):
        # row: dict mapping COSMIC column names to their string values.
        self._row = row

    def get_HGVSG(self):
        """Return the genomic HGVS (g-dot) string, e.g. '7:g.140453136A>T'."""
        return self._row["HGVSG"]

    def get_COSMIC(self):
        """Return the namespaced COSMIC variant accession ('cosmic.<ID>')."""
        return f"cosmic.{self._row['GENOMIC_MUTATION_ID']}"

    def get_symbol(self):
        """Return the gene symbol as given by COSMIC."""
        return self._row["Gene name"]

    def get_cancer_tuple(self) -> tuple:
        """Return the 8-part site/histology tuple used as key into the
        cancer->MONDO mapping produced by get_cancer2mondo()."""
        return tuple(
            self._row[column] for column in ['Primary site', 'Site subtype 1', 'Site subtype 2', 'Site subtype 3',
                                             'Primary histology', 'Histology subtype 1', 'Histology subtype 2',
                                             'Histology subtype 3'])

    def get_mutation_status(self):
        """Return COSMIC's somatic-status annotation for this mutation."""
        return self._row['Mutation somatic status']

    def parse(self, gdot2clinvar: dict[str, str], symbol2entrez: dict[str, str],
              cancer2mondo: dict[tuple, str]) -> \
            "tuple[GenomicVariant, VariantAffectsGene, VariantAssociatedWithDisorder]":
        """Build the node and edge models for this row.

        Args:
            gdot2clinvar: maps HGVSG strings to ClinVar variant IDs.
            symbol2entrez: maps gene symbols to NeDRex gene primaryDomainIds.
            cancer2mondo: maps site/histology tuples to MONDO disorder IDs.

        Returns:
            (genomic_variant, variant_gene, variant_disorder); all three are
            None when the row's HGVSG has no matching ClinVar variant.
        """
        variant_id = gdot2clinvar.get(self.get_HGVSG())
        genomic_variant = None
        variant_gene = None
        variant_disorder = None
        if variant_id:
            cosmic_id = self.get_COSMIC()
            asserted_by = ["cosmic"]
            genomic_variant = GenomicVariant(primaryDomainId=variant_id, domainIds=[
                cosmic_id], dataSources=asserted_by)
            # Raises KeyError if the symbol was not mapped beforehand;
            # COSMICParser asserts full symbol coverage before calling parse.
            gene_id = symbol2entrez[self.get_symbol()]
            variant_gene = VariantAffectsGene(sourceDomainId=variant_id, targetDomainId=gene_id,
                                              dataSources=asserted_by)
            mondo_id = cancer2mondo.get(self.get_cancer_tuple())
            # NOTE(review): mondo_id may be None here, producing an edge with
            # targetDomainId=None (a commented-out `if mondo_id:` guard existed
            # in the original) — confirm this is intended downstream.
            variant_disorder = VariantAssociatedWithDisorder(accession=cosmic_id, dataSources=asserted_by,
                                                             sourceDomainId=variant_id,
                                                             targetDomainId=mondo_id,
                                                             reviewStatus=self.get_mutation_status())

        return genomic_variant, variant_gene, variant_disorder


class COSMICParser:
COLUMN_NAMES = ['HGVSG', 'Gene name', 'GENOMIC_MUTATION_ID', 'Mutation somatic status', 'Primary site',
'Site subtype 1', 'Site subtype 2',
'Site subtype 3',
'Primary histology', 'Histology subtype 1', 'Histology subtype 2',
'Histology subtype 3']

def __init__(self, f: _Path):
self.f = f

if self.f.name.endswith(".gz") or self.f.name.endswith(".gzip"):
self.gzipped = True
else:
self.gzipped = False

def parse(self, mapping_fname: _Path):
if self.gzipped:
f = _gzip.open(self.f, "rt")
else:
f = self.f.open()

reader = _DictReader(f, delimiter="\t")
f_dict = [{column: row[column]
for column in self.COLUMN_NAMES} for row in reader]
f.close()

all_symbols = {row['Gene name'] for row in f_dict}
symbol2entrez = {gene["approvedSymbol"]: gene["primaryDomainId"] for gene in
Gene.find(MongoInstance.DB, {"approvedSymbol": {"$in": list(all_symbols)}})}
non_approved_symbols = all_symbols - symbol2entrez.keys()
for symbol in non_approved_symbols:
genes = [gene["primaryDomainId"]
for gene in Gene.find(MongoInstance.DB, {"symbols": symbol})]
assert len(genes) == 1, f"Multiple genes found for the symbol {symbol}"
symbol2entrez.update({symbol: genes[0]})
assert not (non_approved_symbols - symbol2entrez.keys()), \
f"Not all symbols could be mapped: {non_approved_symbols - symbol2entrez.keys()}"

# id2genomic_variant = {genomic_variant['primaryDomainId']: genomic_variant for genomic_variant in
# GenomicVariant.find(MongoInstance.DB)}
# chr_pos_type2id = defaultdict(list)
# this_id =
# genomic_variant['primaryDomainId'], genomic_variant
# chr_pos_type2id[
# (genomic_variant['chromosome'], genomic_variant['position'], genomic_variant['variantType'])].append(
# genomic_variant)
gdot2clinvar = get_gdot2clinvar(
get_clinvar_file_location("human_data"))
cancer2mondo = get_cancer2mondo(mapping_fname)

updates = (COSMICRow(row).parse(
gdot2clinvar, symbol2entrez, cancer2mondo) for row in f_dict)
for chunk in _tqdm(_chunked(updates, 10_000), leave=False, desc="Parsing COSMIC"):
if not chunk:
continue
genomic_variant_updates, variant_gene_updates, variant_disorder_updates, gene_disorder_updates = [], [], [], []
for genomic_variant, variant_gene, variant_disorder in chunk:
if genomic_variant:
genomic_variant_updates.append(
genomic_variant.generate_update())
variant_gene_updates.append(variant_gene.generate_update())
if variant_disorder:
variant_disorder_updates.append(
variant_disorder.generate_update())
gene_disorder = GeneAssociatedWithDisorder(dataSources=["cosmic"],
sourceDomainId=variant_gene.targetDomainId,
targetDomainId=variant_disorder.targetDomainId)
gene_disorder_updates.append(gene_disorder.generate_update())

for this_collection_name, these_updates in zip([GenomicVariant.collection_name, VariantAffectsGene.collection_name, VariantAssociatedWithDisorder.collection_name, GeneAssociatedWithDisorder.collection_name],
[genomic_variant_updates, variant_gene_updates, variant_disorder_updates, gene_disorder_updates]):
bulk_write_results = MongoInstance.DB[this_collection_name].bulk_write(
these_updates)
if bulk_write_results.bulk_api_result['writeErrors'] or bulk_write_results.bulk_api_result['writeConcernErrors']:
print(bulk_write_results.bulk_api_result)



def parse_gene_disease_associations():
    """Entry point: parse the COSMIC census data and write it into the DB."""
    logger.info("Parsing COSMIC")
    census_file = get_file_location("census")
    mapping_file = get_file_location("mapping")
    parser = COSMICParser(census_file)
    parser.parse(mapping_file)
12 changes: 12 additions & 0 deletions nedrexdb/db/parsers/db_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pymongo

# Connect to the MongoDB instance exposed by the NeDRexDB setup.
client = pymongo.MongoClient('localhost', 27020)
db = client['licensed_nedrex_live']  # select the live (licensed) database

# Fetch one document from the gene-disorder association collection as a
# quick smoke test that data was written. Note: find_one() returns the
# first matching document, not a random one.
document = db['gene_associated_with_disorder'].find_one()

# Print the result (the original printed it twice).
print(document)
105 changes: 105 additions & 0 deletions nedrexdb/db/parsers/intogen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import gzip as _gzip
from csv import DictReader as _DictReader
from itertools import chain as _chain
from pathlib import Path as _Path

import requests
from more_itertools import chunked as _chunked
from tqdm import tqdm as _tqdm

from nedrexdb.db import MongoInstance
from nedrexdb.db.models.edges.gene_associated_with_disorder import GeneAssociatedWithDisorder
from nedrexdb.db.models.nodes.gene import Gene
from nedrexdb.db.parsers import _get_file_location_factory

get_file_location = _get_file_location_factory("intogen")


def biomart_symbol_transcript_to_entrez(symbol_list: list[str], filter_by: str = "hgnc_symbol", batch_size: int = 100):
    """Query Ensembl BioMart to map gene identifiers to Entrez gene IDs.

    Args:
        symbol_list: identifiers to map (values for the *filter_by* attribute).
        filter_by: BioMart filter/attribute name to match on.
        batch_size: number of identifiers sent per BioMart request.

    Returns:
        dict mapping each returned input identifier to its Entrez gene ID.
        Identifiers BioMart does not report are absent from the result.

    Raises:
        requests.HTTPError: if any BioMart request fails.
    """
    import xml.etree.ElementTree as ET
    query = ET.Element("Query", virtualSchemaName="default", formatter="CSV", header="0", uniqueRows="0", count="",
                       datasetConfigVersion="0.6")
    dataset = ET.SubElement(
        query, "Dataset", name="hsapiens_gene_ensembl", interface="default")
    # "{tr_ids}" is a str.format placeholder filled with each batch below.
    ET.SubElement(dataset, "Filter", name=filter_by, value="{tr_ids}")
    ET.SubElement(dataset, "Attribute", name=filter_by)
    ET.SubElement(dataset, "Attribute", name="entrezgene_id")
    xml_template = (
        '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query>'
        f'{ET.tostring(query, encoding="unicode")}'
    )

    symbol2entrez: dict[str, str] = dict()
    for i in range(0, len(symbol_list), batch_size):
        batch = symbol_list[i:i + batch_size]
        # Pass the XML via params= so requests percent-encodes it, instead of
        # embedding it raw in the URL (the original also wrapped the string
        # in a redundant format() call).
        response = requests.get(
            'http://www.ensembl.org/biomart/martservice',
            params={'query': xml_template.format(tr_ids=",".join(batch))})
        response.raise_for_status()
        # BioMart CSV output is one "<input_id>,<entrez_id>" pair per line.
        symbol2entrez.update(dict(row.split(',', 1)
                                  for row in response.content.decode('utf-8').splitlines()))
    return symbol2entrez


class IntOGenRow:
    """A single row of the IntOGen drivers table (SYMBOL + CANCER_TYPE)."""

    def __init__(self, row):
        # row: dict with at least the keys "SYMBOL" and "CANCER_TYPE".
        self._row = row

    def parse(self, intogen2mondo: dict[str, list[str]], symbol2entrez: dict[str, str]) -> list[
            GeneAssociatedWithDisorder]:
        """Return one gene->disorder association edge per MONDO disorder
        mapped to this row's cancer type."""
        gene_id = symbol2entrez[self._row["SYMBOL"]]
        sources = ["intogen"]
        mondo_ids = intogen2mondo[self._row["CANCER_TYPE"]]

        edges = []
        for mondo_id in mondo_ids:
            # MONDO CURIEs are rewritten to the NeDRex "mondo." namespace.
            edge = GeneAssociatedWithDisorder(
                sourceDomainId=gene_id,
                targetDomainId=mondo_id.replace("MONDO:", "mondo."),
                dataSources=sources,
            )
            edges.append(edge)

        return edges


class IntOGenParser:
def __init__(self, f: _Path, mapping: _Path):
self.f = f

if self.f.name.endswith(".gz") or self.f.name.endswith(".gzip"):
self.gzipped = True
else:
self.gzipped = False

import json
self.intogen2mondo = json.load(open(mapping))["mondo_id"]

def parse(self):
if self.gzipped:
f = _gzip.open(self.f, "rt")
else:
f = self.f.open()

reader = _DictReader(f, delimiter="\t")
f_dict = [{"SYMBOL": row['SYMBOL'], "CANCER_TYPE": row['CANCER_TYPE']}
for row in reader]

symbol2entrez = {gene["approvedSymbol"]: gene["primaryDomainId"]
for gene in Gene.find(MongoInstance.DB)}

updates = (IntOGenRow(row).parse(self.intogen2mondo, symbol2entrez)
for row in f_dict)
for chunk in _tqdm(_chunked(updates, 1_000), leave=False, desc="Parsing IntOGen"):
chunk = list(_chain(*chunk))
chunk = [gawd.generate_update() for gawd in chunk]

if not chunk:
continue

MongoInstance.DB[GeneAssociatedWithDisorder.collection_name].bulk_write(
chunk)

f.close()


def parse_gene_disease_associations():
    """Entry point: parse the IntOGen drivers file and write it into the DB."""
    drivers_file = get_file_location("drivers")
    mapping_file = get_file_location("mapping")
    parser = IntOGenParser(drivers_file, mapping_file)
    parser.parse()
Loading