From 73bee66dd56cfc26e2cb5cd5035488b1d043d75c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pablo=20S=C3=A1nchez-Izquierdo=20Besora?=
 <sanizbe.pablo@gmail.com>
Date: Thu, 28 May 2026 16:40:49 +0200
Subject: [PATCH] Avoid per-mutation DataFrame filtering in pfam, efoldmine and ted modules

---
 mavisp/modules.py | 46 +++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/mavisp/modules.py b/mavisp/modules.py
index c2c8c9c..4039624 100644
--- a/mavisp/modules.py
+++ b/mavisp/modules.py
@@ -907,15 +907,21 @@ def ingest(self, mutations):
 
         # Compare with mutations:
         result = []
-
+        
+        efoldmine_parsed = efoldmine_parsed.copy()
+        efoldmine_parsed['mavisp_residue_index'] = efoldmine_parsed['residue_index'] + 1
+        residue_counts = efoldmine_parsed['mavisp_residue_index'].value_counts().to_dict()
+        efoldmine_by_residue = {row.mavisp_residue_index: row for row in efoldmine_parsed.itertuples(index=False)}
+        
         for mut in mutations:
             mut_resn = int(mut[1:-1])
-            row = efoldmine_parsed[efoldmine_parsed['residue_index']+1 == mut_resn]
-            if len(row) != 1:
-                this_error = f"Expected exactly one row for residue index {mut_resn}, but found {len(row)} rows."
+            row_count = residue_counts.get(mut_resn, 0)
+            if row_count != 1:
+                this_error = f"Expected exactly one row for residue index {mut_resn}, but found {row_count} rows."
                 raise MAVISpMultipleError(warning=warnings, critical=[MAVISpCriticalError(this_error)])
-            is_early_folding = row['is_early_folding'].iloc[0]
-            efoldmine_score = row['earlyFolding'].iloc[0]
+            row = efoldmine_by_residue[mut_resn]
+            is_early_folding = row.is_early_folding
+            efoldmine_score = row.earlyFolding
             result.append((is_early_folding, efoldmine_score))
 
         # Create DataFrame:
@@ -2625,13 +2631,15 @@ def ingest(self, mutations):
         # Dictionary of muts + res_numbers
         mutation_residues = {mut: int(mut[1:-1]) for mut in mutations}
 
+        pfam_intervals = [(row.start, row.end, f"{row.pfam_domain} ({row.accession})")
+            for row in pfam.itertuples(index=False)]
+
         # Map res_numbers to PFAM domains
         pfam_annotations = {}
         for mutation, resn in mutation_residues.items():
-            matching_domains = pfam[(pfam['start'] <= resn) & (pfam['end'] >= resn)]
-            pfam_annotations[mutation] = ", ".join(
-                f"{row['pfam_domain']} ({row['accession']})" for _, row in matching_domains.iterrows()) if not matching_domains.empty else None
-
+            matching_domains = [label for start, end, label in pfam_intervals if start <= resn <= end]
+            pfam_annotations[mutation] = ", ".join(matching_domains) if matching_domains else None
+        
         # Add new column to data
         self.data = pd.DataFrame.from_dict(pfam_annotations, orient='index', columns=['Pfam domain classification'])
 
@@ -2688,21 +2696,21 @@ def ingest(self, mutations):
                     raise MAVISpMultipleError(warning=warnings,
                                       critical=[MAVISpCriticalError(this_error)])
 
-        ted_expanded = pd.DataFrame(multi_boundaries)
+        ted_intervals = [(row['start'], row['end'], row['CATH_label']) for row in multi_boundaries]
+
         # Dictionary of muts + res_numbers
         mutation_residues = {mut: int(mut[1:-1]) for mut in mutations}
 
         # Map res_numbers to TED domains
         ted_annotations = {}
         for mutation, resn in mutation_residues.items():
-            matching = ted_expanded[(ted_expanded['start'] <= resn) & (ted_expanded['end'] >= resn)]
-            if matching.empty:
-                continue
-            else:
-                labels = matching['CATH_label'].dropna().astype(str)
-                labels = labels[labels.str.strip() != ""]
-                if labels.empty:
-                    continue
+            labels = []
+            for start, end, cath_label in ted_intervals:
+                if start <= resn <= end and not pd.isna(cath_label):
+                    label = str(cath_label)
+                    if label.strip() != "":
+                        labels.append(label)
+            if labels:
                 ted_annotations[mutation] = " | ".join(labels)
 
         # Add new column to data