BioDataFuse · jmillanacosta · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026
diff --git a/src/pyBiodatafuse/annotators/intact.py b/src/pyBiodatafuse/annotators/intact.py
@@ -252,8 +252,7 @@ def get_filtered_interactions(
             if (
                 has_uniprot_a
                 and has_uniprot_b
-                and id_a in valid_intact_acs
-                and id_b in valid_intact_acs
+                and (id_a in valid_intact_acs or id_b in valid_intact_acs)
             ):
                 keep_interaction = True
 
@@ -278,15 +277,13 @@ def get_filtered_interactions(
             is_gene_gene = (
                 has_uniprot_a
                 and has_uniprot_b
-                and id_a in valid_intact_acs
-                and id_b in valid_intact_acs
+                and (id_a in valid_intact_acs or id_b in valid_intact_acs)
             )
             is_gene_compound = (has_chebi_a and has_uniprot_b) or (has_chebi_b and has_uniprot_a)
             is_compound_compound = (
                 has_chebi_a
                 and has_chebi_b
-                and id_a in valid_intact_acs
-                and id_b in valid_intact_acs
+                and (id_a in valid_intact_acs or id_b in valid_intact_acs)
             )
             if is_gene_gene or is_gene_compound or is_compound_compound:
                 keep_interaction = True

diff --git a/src/pyBiodatafuse/annotators/stringdb.py b/src/pyBiodatafuse/annotators/stringdb.py
@@ -55,7 +55,8 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]:
     preferredName_B, and vice versa.
 
     :param row: Row from the input DataFrame (with at least 'identifier' column).
-    :param string_ids_df: DataFrame returned from get_string_ids (not used in this version).
+    :param string_ids_df: DataFrame returned from get_string_ids, used to resolve
+        the gene's preferredName (gene symbol) from its queryItem or stringId.
     :param network_df: DataFrame returned from the network call.
     :returns: List of dictionaries describing the interactions.
     """
@@ -65,10 +66,22 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]:
     target = row[Cons.TARGET_COL]
     identifier = row[Cons.IDENTIFIER_COL]
 
+    # Build a set of names to match against preferredName_A/B in the network.
+    # STRING returns gene symbols as preferredNames, but the input identifiers may be
+    # Ensembl gene IDs (ENSG). Use the string_ids_df mapping (queryItem or stringId ->
+    # preferredName) to resolve the correct symbol for this gene.
+    preferred_names = {target, identifier}
+    if string_ids_df is not None and not string_ids_df.empty:
+        for query_col in ("queryItem", "stringId"):
+            if query_col in string_ids_df.columns:
+                matches = string_ids_df[string_ids_df[query_col].isin([target, identifier])]
+                if not matches.empty:
+                    preferred_names.update(matches["preferredName"].tolist())
+
     for _, row_arr in network_df.iterrows():
         prot_a = row_arr[Cons.STRING_PREFERRED_NAME_A]
         prot_b = row_arr[Cons.STRING_PREFERRED_NAME_B]
-        if (prot_a == target or prot_a == identifier) and prot_b not in target_links_set:
+        if prot_a in preferred_names and prot_b not in target_links_set:
             gene_ppi_links.append(
                 {
                     Cons.STRING_PPI_INTERACTS_WITH: row_arr[Cons.STRING_PREFERRED_NAME_B],
@@ -79,7 +92,7 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]:
             )
             target_links_set.add(row_arr[Cons.STRING_PREFERRED_NAME_B])
 
-        elif (prot_b == target or prot_b == identifier) and prot_a not in target_links_set:
+        elif prot_b in preferred_names and prot_a not in target_links_set:
             gene_ppi_links.append(
                 {
                     Cons.STRING_PPI_INTERACTS_WITH: row_arr[Cons.STRING_PREFERRED_NAME_A],
@@ -215,16 +228,43 @@ def get_ppi(
     # Record the start time
     start_time = datetime.datetime.now()
 
+    # Known fallback taxonomy IDs to avoid hard dependency on NCBI API
+    species_fallback = {
+        "human": "9606",
+        "homo sapiens": "9606",
+    }
+
     # Retrieve NCBI taxonomy identifier using the given species term
     params = {"db": "taxonomy", "term": species, "retmode": "json"}
-    response = requests.get(
-        f"{Cons.NCBI_ENDPOINT}/entrez/eutils/esearch.fcgi", params=params
-    ).json()
+    species_id = None
     try:
+        ncbi_resp = requests.get(f"{Cons.NCBI_ENDPOINT}/entrez/eutils/esearch.fcgi", params=params)
+        ncbi_resp.raise_for_status()
+        response = ncbi_resp.json()
         species_id = response["esearchresult"]["idlist"][0]
-    except (KeyError, IndexError):
-        logger.error("NCBI taxonomy search did not return an ID for species: %s", species)
-        return pd.DataFrame(), {}
+    except Exception as e:
+        fallback = species_fallback.get(species.lower())
+        if fallback:
+            logger.warning(
+                "NCBI taxonomy lookup failed for '%s' (%s). Using fallback ID: %s",
+                species,
+                e,
+                fallback,
+            )
+            warnings.warn(
+                f"STRING annotator: NCBI taxonomy lookup failed for species '{species}' ({e}). "
+                f"Using hardcoded fallback taxonomy ID {fallback}.",
+                stacklevel=2,
+            )
+            species_id = fallback
+        else:
+            logger.error("NCBI taxonomy search failed for species '%s': %s", species, e)
+            warnings.warn(
+                f"STRING annotator: NCBI taxonomy lookup failed for species '{species}' ({e}). "
+                "No fallback available. Please retry later.",
+                stacklevel=2,
+            )
+            return pd.DataFrame(), {}
 
     data_df = get_identifier_of_interest(
         bridgedb_df,

diff --git a/src/pyBiodatafuse/graph/generator.py b/src/pyBiodatafuse/graph/generator.py
@@ -268,6 +268,15 @@ def add_intact_interactions_subgraph(g, gene_node_label, annot_list):
                         break
 
             merge_node(g, partner, compound_attrs)
+        else:
+            # Gene partner node — ensure it exists with proper attributes
+            gene_partner_attrs = {
+                Cons.ID: partner,
+                Cons.NAME: partner,
+                Cons.LABEL: Cons.GENE_NODE_LABEL,
+                Cons.DATASOURCE: Cons.INTACT,
+            }
+            merge_node(g, partner, gene_partner_attrs)
 
         edge_key = tuple(sorted([gene_node_label, partner]))
         edge_attrs = {k: v for k, v in Cons.INTACT_PPI_EDGE_ATTRS.items()}
@@ -1758,15 +1767,24 @@ def add_stringdb_ppi_subgraph(g, gene_node_label, annot_list):
             x for x, y in edge_data.items() if y["attr_dict"][Cons.EDGE_HASH] == edge_hash
         ]
         if len(node_exists) == 0 and not pd.isna(ppi[Cons.STRING_PPI_INTERACTS_WITH]):
+            partner = ppi[Cons.STRING_PPI_INTERACTS_WITH]
+            partner_node_attrs = {
+                Cons.ID: partner,
+                Cons.NAME: partner,
+                Cons.LABEL: Cons.GENE_NODE_LABEL,
+                Cons.DATASOURCE: Cons.STRING,
+            }
+            merge_node(g, partner, partner_node_attrs)
+
             g.add_edge(
                 gene_node_label,
-                ppi[Cons.STRING_PPI_INTERACTS_WITH],
+                partner,
                 label=edge_attrs[Cons.LABEL],
                 attr_dict=edge_attrs,
             )
 
             g.add_edge(
-                ppi[Cons.STRING_PPI_INTERACTS_WITH],
+                partner,
                 gene_node_label,
                 label=edge_attrs[Cons.LABEL],
                 attr_dict=edge_attrs,
@@ -2722,12 +2740,6 @@ def process_ppi(g, gene_node_label, row):
             if valid_ppi_list:
                 add_stringdb_ppi_subgraph(g, gene_node_label, valid_ppi_list)
 
-        if not isinstance(ppi_list, float):
-            for item in ppi_list:
-                if pd.isna(item["stringdb_link_to"]):
-                    ppi_list = []
-            add_stringdb_ppi_subgraph(g, gene_node_label, ppi_list)
-
 
 def process_tf_target(g, gene_node_label, row):
     """Process tf-target interactions and add them to the graph.
@@ -2892,42 +2904,23 @@ def _built_gene_based_graph(
 
     # Process disease-compound relationships
     # Build mapping from disease IDs (EFO, MONDO) to disease node label
+    dnode_namespaces = [Cons.EFO, Cons.MONDO]
     dnodes = {}
+
     for n, d in g.nodes(data=True):
         attr_dict = d.get("attr_dict", {})
         if attr_dict.get(Cons.LABEL) == Cons.DISEASE_NODE_LABEL:
-            # Map by EFO ID (with and without colon/underscore normalization)
-            efo = attr_dict.get(Cons.EFO)
-            if efo is not None:
+            for nspace in dnode_namespaces:
+                val = attr_dict.get(nspace)
+                if val is None:
+                    continue
                 # Store with original format
-                dnodes[efo] = n
-                # Also store normalized format (EFO_xxx -> EFO:xxx and vice versa)
-                if ":" in efo:
-                    dnodes[efo.replace(":", "_")] = n
-                elif "_" in efo:
-                    dnodes[efo.replace("_", ":")] = n
-            # Map by MONDO ID
-            mondo = attr_dict.get(Cons.MONDO)
-            if mondo is not None:
-                dnodes[mondo] = n
-                # Also store normalized format
-                if ":" in mondo:
-                    dnodes[mondo.replace(":", "_")] = n
-                elif "_" in mondo:
-                    dnodes[mondo.replace("_", ":")] = n
-    dnode_namespaces = [Cons.EFO, Cons.MONDO]
-    dnodes = {}
-
-    for nspace in dnode_namespaces:
-        dnodes.update(
-            {
-                d["attr_dict"][nspace]: n
-                for n, d in g.nodes(data=True)
-                if d["attr_dict"][Cons.LABEL] == Cons.DISEASE_NODE_LABEL
-                and nspace in d["attr_dict"]
-                and d["attr_dict"][nspace] is not None
-            }
-        )
+                dnodes[val] = n
+                # Also store normalised variants (EFO:xxx <-> EFO_xxx)
+                if ":" in val:
+                    dnodes[val.replace(":", "_")] = n
+                elif "_" in val:
+                    dnodes[val.replace("_", ":")] = n
 
     if disease_compound is not None:
         process_disease_compound(g, disease_compound, disease_nodes=dnodes)