diff --git a/src/pyBiodatafuse/annotators/intact.py b/src/pyBiodatafuse/annotators/intact.py index d3668d8b..029e382d 100644 --- a/src/pyBiodatafuse/annotators/intact.py +++ b/src/pyBiodatafuse/annotators/intact.py @@ -252,8 +252,7 @@ def get_filtered_interactions( if ( has_uniprot_a and has_uniprot_b - and id_a in valid_intact_acs - and id_b in valid_intact_acs + and (id_a in valid_intact_acs or id_b in valid_intact_acs) ): keep_interaction = True @@ -278,15 +277,13 @@ def get_filtered_interactions( is_gene_gene = ( has_uniprot_a and has_uniprot_b - and id_a in valid_intact_acs - and id_b in valid_intact_acs + and (id_a in valid_intact_acs or id_b in valid_intact_acs) ) is_gene_compound = (has_chebi_a and has_uniprot_b) or (has_chebi_b and has_uniprot_a) is_compound_compound = ( has_chebi_a and has_chebi_b - and id_a in valid_intact_acs - and id_b in valid_intact_acs + and (id_a in valid_intact_acs or id_b in valid_intact_acs) ) if is_gene_gene or is_gene_compound or is_compound_compound: keep_interaction = True diff --git a/src/pyBiodatafuse/annotators/stringdb.py b/src/pyBiodatafuse/annotators/stringdb.py index 887d785a..655cc67a 100644 --- a/src/pyBiodatafuse/annotators/stringdb.py +++ b/src/pyBiodatafuse/annotators/stringdb.py @@ -55,7 +55,8 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]: preferredName_B, and vice versa. :param row: Row from the input DataFrame (with at least 'identifier' column). - :param string_ids_df: DataFrame returned from get_string_ids (not used in this version). + :param string_ids_df: DataFrame returned from get_string_ids, used to resolve + the gene's preferredName (gene symbol) from its queryItem (input identifier). :param network_df: DataFrame returned from the network call. :returns: List of dictionaries describing the interactions. """ @@ -65,10 +66,22 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]: target = row[Cons.TARGET_COL] identifier = row[Cons.IDENTIFIER_COL] + # Build a set of names to match against preferredName_A/B in the network. + # STRING returns gene symbols as preferredNames, but the input identifiers may be + # Ensembl gene IDs (ENSG). Use the string_ids_df mapping (queryItem -> preferredName) + # to resolve the correct symbol for this gene. + preferred_names = {target, identifier} + if string_ids_df is not None and not string_ids_df.empty: + for query_col in ("queryItem", "stringId"): + if query_col in string_ids_df.columns: + matches = string_ids_df[string_ids_df[query_col].isin([target, identifier])] + if not matches.empty: + preferred_names.update(matches["preferredName"].tolist()) + for _, row_arr in network_df.iterrows(): prot_a = row_arr[Cons.STRING_PREFERRED_NAME_A] prot_b = row_arr[Cons.STRING_PREFERRED_NAME_B] - if (prot_a == target or prot_a == identifier) and prot_b not in target_links_set: + if prot_a in preferred_names and prot_b not in target_links_set: gene_ppi_links.append( { Cons.STRING_PPI_INTERACTS_WITH: row_arr[Cons.STRING_PREFERRED_NAME_B], @@ -79,7 +92,7 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]: ) target_links_set.add(row_arr[Cons.STRING_PREFERRED_NAME_B]) - elif (prot_b == target or prot_b == identifier) and prot_a not in target_links_set: + elif prot_b in preferred_names and prot_a not in target_links_set: gene_ppi_links.append( { Cons.STRING_PPI_INTERACTS_WITH: row_arr[Cons.STRING_PREFERRED_NAME_A], @@ -215,16 +228,43 @@ def get_ppi( # Record the start time start_time = datetime.datetime.now() + # Known fallback taxonomy IDs to avoid hard dependency on NCBI API + species_fallback = { + "human": "9606", + "homo sapiens": "9606", + } + # Retrieve NCBI taxonomy identifier using the given species term params = {"db": "taxonomy", "term": species, "retmode": "json"} - response = requests.get( - f"{Cons.NCBI_ENDPOINT}/entrez/eutils/esearch.fcgi", params=params - ).json() + species_id = None try: + ncbi_resp = requests.get(f"{Cons.NCBI_ENDPOINT}/entrez/eutils/esearch.fcgi", params=params) + ncbi_resp.raise_for_status() + response = ncbi_resp.json() species_id = response["esearchresult"]["idlist"][0] - except (KeyError, IndexError): - logger.error("NCBI taxonomy search did not return an ID for species: %s", species) - return pd.DataFrame(), {} + except Exception as e: + fallback = species_fallback.get(species.lower()) + if fallback: + logger.warning( + "NCBI taxonomy lookup failed for '%s' (%s). Using fallback ID: %s", + species, + e, + fallback, + ) + warnings.warn( + f"STRING annotator: NCBI taxonomy lookup failed for species '{species}' ({e}). " + f"Using hardcoded fallback taxonomy ID {fallback}.", + stacklevel=2, + ) + species_id = fallback + else: + logger.error("NCBI taxonomy search failed for species '%s': %s", species, e) + warnings.warn( + f"STRING annotator: NCBI taxonomy lookup failed for species '{species}' ({e}). " + "No fallback available. Please retry later.", + stacklevel=2, + ) + return pd.DataFrame(), {} data_df = get_identifier_of_interest( bridgedb_df, diff --git a/src/pyBiodatafuse/graph/generator.py b/src/pyBiodatafuse/graph/generator.py index 211f28ed..7661d695 100644 --- a/src/pyBiodatafuse/graph/generator.py +++ b/src/pyBiodatafuse/graph/generator.py @@ -268,6 +268,15 @@ def add_intact_interactions_subgraph(g, gene_node_label, annot_list): break merge_node(g, partner, compound_attrs) + else: + # Gene partner node — ensure it exists with proper attributes + gene_partner_attrs = { + Cons.ID: partner, + Cons.NAME: partner, + Cons.LABEL: Cons.GENE_NODE_LABEL, + Cons.DATASOURCE: Cons.INTACT, + } + merge_node(g, partner, gene_partner_attrs) edge_key = tuple(sorted([gene_node_label, partner])) edge_attrs = {k: v for k, v in Cons.INTACT_PPI_EDGE_ATTRS.items()} @@ -1758,15 +1767,24 @@ def add_stringdb_ppi_subgraph(g, gene_node_label, annot_list): x for x, y in edge_data.items() if y["attr_dict"][Cons.EDGE_HASH] == edge_hash ] if len(node_exists) == 0 and not pd.isna(ppi[Cons.STRING_PPI_INTERACTS_WITH]): + partner = ppi[Cons.STRING_PPI_INTERACTS_WITH] + partner_node_attrs = { + Cons.ID: partner, + Cons.NAME: partner, + Cons.LABEL: Cons.GENE_NODE_LABEL, + Cons.DATASOURCE: Cons.STRING, + } + merge_node(g, partner, partner_node_attrs) + g.add_edge( gene_node_label, - ppi[Cons.STRING_PPI_INTERACTS_WITH], + partner, label=edge_attrs[Cons.LABEL], attr_dict=edge_attrs, ) g.add_edge( - ppi[Cons.STRING_PPI_INTERACTS_WITH], + partner, gene_node_label, label=edge_attrs[Cons.LABEL], attr_dict=edge_attrs, @@ -2722,12 +2740,6 @@ def process_ppi(g, gene_node_label, row): if valid_ppi_list: add_stringdb_ppi_subgraph(g, gene_node_label, valid_ppi_list) - if not isinstance(ppi_list, float): - for item in ppi_list: - if pd.isna(item["stringdb_link_to"]): - ppi_list = [] - add_stringdb_ppi_subgraph(g, gene_node_label, ppi_list) - def process_tf_target(g, gene_node_label, row): """Process tf-target interactions and add them to the graph. @@ -2892,42 +2904,23 @@ def _built_gene_based_graph( # Process disease-compound relationships # Build mapping from disease IDs (EFO, MONDO) to disease node label + dnode_namespaces = [Cons.EFO, Cons.MONDO] dnodes = {} + for n, d in g.nodes(data=True): attr_dict = d.get("attr_dict", {}) if attr_dict.get(Cons.LABEL) == Cons.DISEASE_NODE_LABEL: - # Map by EFO ID (with and without colon/underscore normalization) - efo = attr_dict.get(Cons.EFO) - if efo is not None: + for nspace in dnode_namespaces: + val = attr_dict.get(nspace) + if val is None: + continue # Store with original format - dnodes[efo] = n - # Also store normalized format (EFO_xxx -> EFO:xxx and vice versa) - if ":" in efo: - dnodes[efo.replace(":", "_")] = n - elif "_" in efo: - dnodes[efo.replace("_", ":")] = n - # Map by MONDO ID - mondo = attr_dict.get(Cons.MONDO) - if mondo is not None: - dnodes[mondo] = n - # Also store normalized format - if ":" in mondo: - dnodes[mondo.replace(":", "_")] = n - elif "_" in mondo: - dnodes[mondo.replace("_", ":")] = n - dnode_namespaces = [Cons.EFO, Cons.MONDO] - dnodes = {} - - for nspace in dnode_namespaces: - dnodes.update( - { - d["attr_dict"][nspace]: n - for n, d in g.nodes(data=True) - if d["attr_dict"][Cons.LABEL] == Cons.DISEASE_NODE_LABEL - and nspace in d["attr_dict"] - and d["attr_dict"][nspace] is not None - } - ) + dnodes[val] = n + # Also store normalised variants (EFO:xxx <-> EFO_xxx) + if ":" in val: + dnodes[val.replace(":", "_")] = n + elif "_" in val: + dnodes[val.replace("_", ":")] = n if disease_compound is not None: process_disease_compound(g, disease_compound, disease_nodes=dnodes) diff --git a/tests/annotators/test_intact.py b/tests/annotators/test_intact.py index b21e58fd..944b5dba 100644 --- a/tests/annotators/test_intact.py +++ b/tests/annotators/test_intact.py @@ -16,8 +16,104 @@ class TestIntact(unittest.TestCase): """Test the IntAct class.""" def test_get_interactions(self): - """Test the get_interactions function.""" + """Test the get_interactions function with mocked API calls. + + Uses controlled mock data to verify: + - gene-gene interactions where only one interactor is the input gene are kept (OR logic) + - gene-compound interactions are kept under 'both' mode + - interactions where neither interactor is the input gene are excluded + """ intact.check_endpoint_intact = Mock(return_value=True) + # DAG1 AC is EBI-1755945 + intact.get_protein_intact_acs = Mock(return_value=["EBI-1755945"]) + + mock_interactions = [ + # Gene-gene: both sides are DAG1 (self-interaction) – should be kept + { + "interaction_id": "EBI-SELF", + "interactor_id_A": "EBI-1755945", + "interactor_id_B": "EBI-1755945", + "score": 0.56, + "biological_role_A": "unspecified role", + "biological_role_B": "unspecified role", + "type": "direct interaction", + "detection_method": "x-ray diffraction", + "host_organism": "In vitro", + "interactor_A_name": "dag1_human", + "interactor_B_name": "dag1_human", + "interactor_A_species": "Homo sapiens", + "interactor_B_species": "Homo sapiens", + "molecule_A": "DAG1", + "molecule_B": "DAG1", + "id_A": "uniprotkb:Q14118", + "id_B": "uniprotkb:Q14118", + "pubmed_publication_id": "11111111", + }, + # Gene-gene: only B is DAG1 – should be kept (OR logic) + { + "interaction_id": "EBI-PARTNER", + "interactor_id_A": "EBI-9999999", + "interactor_id_B": "EBI-1755945", + "score": 0.4, + "biological_role_A": "unspecified role", + "biological_role_B": "unspecified role", + "type": "association", + "detection_method": "anti tag coip", + "host_organism": "Homo sapiens HEK293T", + "interactor_A_name": "partner_human", + "interactor_B_name": "dag1_human", + "interactor_A_species": "Homo sapiens", + "interactor_B_species": "Homo sapiens", + "molecule_A": "PARTNER", + "molecule_B": "DAG1", + "id_A": "uniprotkb:P99999", + "id_B": "uniprotkb:Q14118", + "pubmed_publication_id": "22222222", + }, + # Gene-compound: A is a ChEBI compound, B is DAG1 – kept under 'both' + { + "interaction_id": "EBI-COMPOUND", + "interactor_id_A": "EBI-5327879", + "interactor_id_B": "EBI-1755945", + "score": 0.4, + "biological_role_A": "unspecified role", + "biological_role_B": "unspecified role", + "type": "physical association", + "detection_method": "biophysical", + "host_organism": "In vitro", + "interactor_A_name": "ganglioside_gm1", + "interactor_B_name": "dag1_human", + "interactor_A_species": "Chemical synthesis (Chemical synthesis)", + "interactor_B_species": "Homo sapiens", + "molecule_A": "ganglioside_gm1", + "molecule_B": "DAG1", + "id_A": "CHEBI:61048", + "id_B": "uniprotkb:Q14118", + "pubmed_publication_id": "33333333", + }, + # Gene-gene: neither A nor B is DAG1 – should be excluded + { + "interaction_id": "EBI-UNRELATED", + "interactor_id_A": "EBI-1111111", + "interactor_id_B": "EBI-2222222", + "score": 0.5, + "biological_role_A": "unspecified role", + "biological_role_B": "unspecified role", + "type": "association", + "detection_method": "anti tag coip", + "host_organism": "In vitro", + "interactor_A_name": "other_a", + "interactor_B_name": "other_b", + "interactor_A_species": "Homo sapiens", + "interactor_B_species": "Homo sapiens", + "molecule_A": "OTHERA", + "molecule_B": "OTHERB", + "id_A": "uniprotkb:P11111", + "id_B": "uniprotkb:P22222", + "pubmed_publication_id": "44444444", + }, + ] + intact.get_intact_interactions = Mock(return_value=mock_interactions) bridgedb_dataframe = pd.DataFrame( { @@ -32,78 +128,14 @@ def test_get_interactions(self): bridgedb_dataframe, interaction_type="both" ) - expected_data = pd.Series( - [ - [ - { - "interaction_id": "EBI-7882257", - "interactor_id_A": "EBI-1755945", - "interactor_id_B": "EBI-1755945", - "score": 0.56, - "biological_role_A": "unspecified role", - "biological_role_B": "unspecified role", - "type": "direct interaction", - "detection_method": "x-ray diffraction", - "host_organism": "In vitro", - "interactor_A_name": "dag1_human", - "interactor_B_name": "dag1_human", - "interactor_A_species": "Homo sapiens", - "interactor_B_species": "Homo sapiens", - "molecule_A": "DAG1", - "molecule_B": "DAG1", - "id_A": "uniprotkb:Q14118", - "id_B": "uniprotkb:Q14118", - "pubmed_publication_id": "11423118", - "intact_link_to": "DAG1", - }, - { - "interaction_id": "EBI-7882311", - "interactor_id_A": "EBI-1755945", - "interactor_id_B": "EBI-1755945", - "score": 0.56, - "biological_role_A": "unspecified role", - "biological_role_B": "unspecified role", - "type": "direct interaction", - "detection_method": "elisa", - "host_organism": "In vitro", - "interactor_A_name": "dag1_human", - "interactor_B_name": "dag1_human", - "interactor_A_species": "Homo sapiens", - "interactor_B_species": "Homo sapiens", - "molecule_A": "DAG1", - "molecule_B": "DAG1", - "id_A": "uniprotkb:Q14118", - "id_B": "uniprotkb:Q14118", - "pubmed_publication_id": "11423118", - "intact_link_to": "DAG1", - }, - { - "interaction_id": "EBI-5327885", - "interactor_id_A": "EBI-5327879", - "interactor_id_B": "EBI-1755945", - "score": 0.4, - "biological_role_A": "unspecified role", - "biological_role_B": "unspecified role", - "type": "physical association", - "detection_method": "biophysical", - "host_organism": "Homo sapiens HeLa S3 epitheloid cervical carcinoma cell", - "interactor_A_name": "ganglioside_gm1", - "interactor_B_name": "dag1_human", - "interactor_A_species": "Chemical synthesis (Chemical synthesis)", - "interactor_B_species": "Homo sapiens", - "molecule_A": "ganglioside_gm1", - "molecule_B": "DAG1", - "id_A": "CHEBI:61048", - "id_B": "uniprotkb:Q14118", - "pubmed_publication_id": "22106087", - "intact_link_to": "CHEBI:61048", - }, - ] - ] - ) - expected_data.name = INTACT_INTERACT_COL - - pd.testing.assert_series_equal(obtained_data[INTACT_INTERACT_COL], expected_data) + result = obtained_data[INTACT_INTERACT_COL].iloc[0] + # 3 interactions should be kept (self, partner, compound); unrelated excluded + self.assertEqual(len(result), 3) + interaction_ids = {r["interaction_id"] for r in result} + self.assertIn("EBI-SELF", interaction_ids) + self.assertIn("EBI-PARTNER", interaction_ids) + self.assertIn("EBI-COMPOUND", interaction_ids) + self.assertNotIn("EBI-UNRELATED", interaction_ids) self.assertIsInstance(metadata, dict) def test_get_compound_interactions(self):