Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions src/pyBiodatafuse/annotators/intact.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,7 @@ def get_filtered_interactions(
if (
has_uniprot_a
and has_uniprot_b
and id_a in valid_intact_acs
and id_b in valid_intact_acs
and (id_a in valid_intact_acs or id_b in valid_intact_acs)
):
keep_interaction = True

Expand All @@ -278,15 +277,13 @@ def get_filtered_interactions(
is_gene_gene = (
has_uniprot_a
and has_uniprot_b
and id_a in valid_intact_acs
and id_b in valid_intact_acs
and (id_a in valid_intact_acs or id_b in valid_intact_acs)
)
is_gene_compound = (has_chebi_a and has_uniprot_b) or (has_chebi_b and has_uniprot_a)
is_compound_compound = (
has_chebi_a
and has_chebi_b
and id_a in valid_intact_acs
and id_b in valid_intact_acs
and (id_a in valid_intact_acs or id_b in valid_intact_acs)
)
if is_gene_gene or is_gene_compound or is_compound_compound:
keep_interaction = True
Expand Down
58 changes: 49 additions & 9 deletions src/pyBiodatafuse/annotators/stringdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]:
preferredName_B, and vice versa.

:param row: Row from the input DataFrame (with at least 'identifier' column).
:param string_ids_df: DataFrame returned from get_string_ids (not used in this version).
:param string_ids_df: DataFrame returned from get_string_ids, used to resolve
the gene's preferredName (gene symbol) from its queryItem (input identifier).
:param network_df: DataFrame returned from the network call.
:returns: List of dictionaries describing the interactions.
"""
Expand All @@ -65,10 +66,22 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]:
target = row[Cons.TARGET_COL]
identifier = row[Cons.IDENTIFIER_COL]

# Build a set of names to match against preferredName_A/B in the network.
# STRING returns gene symbols as preferredNames, but the input identifiers may be
# Ensembl gene IDs (ENSG). Use the string_ids_df mapping (queryItem -> preferredName)
# to resolve the correct symbol for this gene.
preferred_names = {target, identifier}
if string_ids_df is not None and not string_ids_df.empty:
for query_col in ("queryItem", "stringId"):
if query_col in string_ids_df.columns:
matches = string_ids_df[string_ids_df[query_col].isin([target, identifier])]
if not matches.empty:
preferred_names.update(matches["preferredName"].tolist())

for _, row_arr in network_df.iterrows():
prot_a = row_arr[Cons.STRING_PREFERRED_NAME_A]
prot_b = row_arr[Cons.STRING_PREFERRED_NAME_B]
if (prot_a == target or prot_a == identifier) and prot_b not in target_links_set:
if prot_a in preferred_names and prot_b not in target_links_set:
gene_ppi_links.append(
{
Cons.STRING_PPI_INTERACTS_WITH: row_arr[Cons.STRING_PREFERRED_NAME_B],
Expand All @@ -79,7 +92,7 @@ def _format_data(row, string_ids_df, network_df) -> List[Dict[str, Any]]:
)
target_links_set.add(row_arr[Cons.STRING_PREFERRED_NAME_B])

elif (prot_b == target or prot_b == identifier) and prot_a not in target_links_set:
elif prot_b in preferred_names and prot_a not in target_links_set:
gene_ppi_links.append(
{
Cons.STRING_PPI_INTERACTS_WITH: row_arr[Cons.STRING_PREFERRED_NAME_A],
Expand Down Expand Up @@ -215,16 +228,43 @@ def get_ppi(
# Record the start time
start_time = datetime.datetime.now()

# Known fallback taxonomy IDs to avoid hard dependency on NCBI API
species_fallback = {
"human": "9606",
"homo sapiens": "9606",
}

# Retrieve NCBI taxonomy identifier using the given species term
params = {"db": "taxonomy", "term": species, "retmode": "json"}
response = requests.get(
f"{Cons.NCBI_ENDPOINT}/entrez/eutils/esearch.fcgi", params=params
).json()
species_id = None
try:
ncbi_resp = requests.get(f"{Cons.NCBI_ENDPOINT}/entrez/eutils/esearch.fcgi", params=params)
ncbi_resp.raise_for_status()
response = ncbi_resp.json()
species_id = response["esearchresult"]["idlist"][0]
except (KeyError, IndexError):
logger.error("NCBI taxonomy search did not return an ID for species: %s", species)
return pd.DataFrame(), {}
except Exception as e:
fallback = species_fallback.get(species.lower())
if fallback:
logger.warning(
"NCBI taxonomy lookup failed for '%s' (%s). Using fallback ID: %s",
species,
e,
fallback,
)
warnings.warn(
f"STRING annotator: NCBI taxonomy lookup failed for species '{species}' ({e}). "
f"Using hardcoded fallback taxonomy ID {fallback}.",
stacklevel=2,
)
species_id = fallback
else:
logger.error("NCBI taxonomy search failed for species '%s': %s", species, e)
warnings.warn(
f"STRING annotator: NCBI taxonomy lookup failed for species '{species}' ({e}). "
"No fallback available. Please retry later.",
stacklevel=2,
)
return pd.DataFrame(), {}

data_df = get_identifier_of_interest(
bridgedb_df,
Expand Down
71 changes: 32 additions & 39 deletions src/pyBiodatafuse/graph/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,15 @@ def add_intact_interactions_subgraph(g, gene_node_label, annot_list):
break

merge_node(g, partner, compound_attrs)
else:
# Gene partner node — ensure it exists with proper attributes
gene_partner_attrs = {
Cons.ID: partner,
Cons.NAME: partner,
Cons.LABEL: Cons.GENE_NODE_LABEL,
Cons.DATASOURCE: Cons.INTACT,
}
merge_node(g, partner, gene_partner_attrs)

edge_key = tuple(sorted([gene_node_label, partner]))
edge_attrs = {k: v for k, v in Cons.INTACT_PPI_EDGE_ATTRS.items()}
Expand Down Expand Up @@ -1758,15 +1767,24 @@ def add_stringdb_ppi_subgraph(g, gene_node_label, annot_list):
x for x, y in edge_data.items() if y["attr_dict"][Cons.EDGE_HASH] == edge_hash
]
if len(node_exists) == 0 and not pd.isna(ppi[Cons.STRING_PPI_INTERACTS_WITH]):
partner = ppi[Cons.STRING_PPI_INTERACTS_WITH]
partner_node_attrs = {
Cons.ID: partner,
Cons.NAME: partner,
Cons.LABEL: Cons.GENE_NODE_LABEL,
Cons.DATASOURCE: Cons.STRING,
}
merge_node(g, partner, partner_node_attrs)

g.add_edge(
gene_node_label,
ppi[Cons.STRING_PPI_INTERACTS_WITH],
partner,
label=edge_attrs[Cons.LABEL],
attr_dict=edge_attrs,
)

g.add_edge(
ppi[Cons.STRING_PPI_INTERACTS_WITH],
partner,
gene_node_label,
label=edge_attrs[Cons.LABEL],
attr_dict=edge_attrs,
Expand Down Expand Up @@ -2722,12 +2740,6 @@ def process_ppi(g, gene_node_label, row):
if valid_ppi_list:
add_stringdb_ppi_subgraph(g, gene_node_label, valid_ppi_list)

if not isinstance(ppi_list, float):
for item in ppi_list:
if pd.isna(item["stringdb_link_to"]):
ppi_list = []
add_stringdb_ppi_subgraph(g, gene_node_label, ppi_list)


def process_tf_target(g, gene_node_label, row):
"""Process tf-target interactions and add them to the graph.
Expand Down Expand Up @@ -2892,42 +2904,23 @@ def _built_gene_based_graph(

# Process disease-compound relationships
# Build mapping from disease IDs (EFO, MONDO) to disease node label
dnode_namespaces = [Cons.EFO, Cons.MONDO]
dnodes = {}

for n, d in g.nodes(data=True):
attr_dict = d.get("attr_dict", {})
if attr_dict.get(Cons.LABEL) == Cons.DISEASE_NODE_LABEL:
# Map by EFO ID (with and without colon/underscore normalization)
efo = attr_dict.get(Cons.EFO)
if efo is not None:
for nspace in dnode_namespaces:
val = attr_dict.get(nspace)
if val is None:
continue
# Store with original format
dnodes[efo] = n
# Also store normalized format (EFO_xxx -> EFO:xxx and vice versa)
if ":" in efo:
dnodes[efo.replace(":", "_")] = n
elif "_" in efo:
dnodes[efo.replace("_", ":")] = n
# Map by MONDO ID
mondo = attr_dict.get(Cons.MONDO)
if mondo is not None:
dnodes[mondo] = n
# Also store normalized format
if ":" in mondo:
dnodes[mondo.replace(":", "_")] = n
elif "_" in mondo:
dnodes[mondo.replace("_", ":")] = n
dnode_namespaces = [Cons.EFO, Cons.MONDO]
dnodes = {}

for nspace in dnode_namespaces:
dnodes.update(
{
d["attr_dict"][nspace]: n
for n, d in g.nodes(data=True)
if d["attr_dict"][Cons.LABEL] == Cons.DISEASE_NODE_LABEL
and nspace in d["attr_dict"]
and d["attr_dict"][nspace] is not None
}
)
dnodes[val] = n
# Also store normalised variants (EFO:xxx <-> EFO_xxx)
if ":" in val:
dnodes[val.replace(":", "_")] = n
elif "_" in val:
dnodes[val.replace("_", ":")] = n

if disease_compound is not None:
process_disease_compound(g, disease_compound, disease_nodes=dnodes)
Expand Down
Loading
Loading