From e25e6b6112770d56a40c27b82f2f04ce20fcc66b Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Rai Date: Fri, 27 Mar 2026 14:31:19 +0530 Subject: [PATCH] feat: add Python PSI-MI TAB v2.7 parser for openPIP 2.0 (GSoC 2025) - mitab_parser.py: full PSI-MI TAB v2.7 parser mapping to openpip.sql schema - csv_parser.py: new CSV format support normalizing to same ParsedInteraction model - models.py: Python dataclasses mirroring protein, interaction, dataset, organism tables - validator.py: per-row validation with errors and warnings before DB insertion - tests/: 34 passing tests covering all parsers and validator - Updated requirements.txt with pytest --- data-upload/csv_parser.py | 91 +++++ data-upload/mitab_parser.py | 287 ++++++++++++++ data-upload/models.py | 72 ++++ data-upload/requirements.txt | 1 + .../test_parser.cpython-311-pytest-9.0.2.pyc | Bin 0 -> 53715 bytes data-upload/tests/sample.mitab27.txt | 4 + data-upload/tests/test_parser.py | 350 ++++++++++++++++++ data-upload/validator.py | 83 +++++ 8 files changed, 888 insertions(+) create mode 100644 data-upload/csv_parser.py create mode 100644 data-upload/mitab_parser.py create mode 100644 data-upload/models.py create mode 100644 data-upload/tests/__pycache__/test_parser.cpython-311-pytest-9.0.2.pyc create mode 100644 data-upload/tests/sample.mitab27.txt create mode 100644 data-upload/tests/test_parser.py create mode 100644 data-upload/validator.py diff --git a/data-upload/csv_parser.py b/data-upload/csv_parser.py new file mode 100644 index 00000000..1a108b32 --- /dev/null +++ b/data-upload/csv_parser.py @@ -0,0 +1,91 @@ +""" +CSV Parser for openPIP 2.0 +Accepts a simplified flat CSV and normalizes it to the same +ParsedInteraction model used by the PSI-MI TAB parser so that +the same validation and DB insertion pipeline handles both formats. + +Minimum required CSV columns: protein_a, protein_b +Optional columns: interaction_type, score, publication, + author, dataset, year +""" + +import csv +import re +from models import ParsedInteraction, Protein, Dataset, InteractionCategory + + +REQUIRED_CSV_COLUMNS = {"protein_a", "protein_b"} + + +def parse_csv(filepath: str) -> list: + """ + Parse a CSV file and return a list of ParsedInteraction objects. + """ + interactions = [] + + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + headers = set(reader.fieldnames or []) + + missing = REQUIRED_CSV_COLUMNS - headers + if missing: + raise ValueError( + f"CSV is missing required columns: {missing}. " + f"At minimum 'protein_a' and 'protein_b' are required." + ) + + for row in reader: + protein_a_raw = row.get("protein_a", "").strip() + protein_b_raw = row.get("protein_b", "").strip() + + if not protein_a_raw or not protein_b_raw: + continue + + is_uniprot_a = _looks_like_uniprot(protein_a_raw) + is_uniprot_b = _looks_like_uniprot(protein_b_raw) + + protein_a = Protein( + uniprot_id=protein_a_raw if is_uniprot_a else None, + gene_name=None if is_uniprot_a else protein_a_raw, + ) + protein_b = Protein( + uniprot_id=protein_b_raw if is_uniprot_b else None, + gene_name=None if is_uniprot_b else protein_b_raw, + ) + + score_raw = row.get("score", "").strip() + score = score_raw if score_raw else None + + interaction_type_raw = row.get("interaction_type", "").strip() + category = InteractionCategory( + category_name=interaction_type_raw + ) if interaction_type_raw else None + + dataset = Dataset( + pubmed_id=row.get("publication", "").strip() or None, + author=row.get("author", "").strip() or None, + name=row.get("dataset", "").strip() or None, + year=row.get("year", "").strip() or None, + ) + + interactions.append(ParsedInteraction( + protein_a=protein_a, + protein_b=protein_b, + score=score, + category=category, + dataset=dataset, + raw=dict(row), + )) + + return interactions + + +def _looks_like_uniprot(s: str) -> bool: + """ + Rough check for UniProt accession format. + Examples: P12345, Q67890, A0A000 + """ + return bool(re.match( + r'^[OPQ][0-9][A-Z0-9]{3}[0-9]$|^[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$', + s + )) \ No newline at end of file diff --git a/data-upload/mitab_parser.py b/data-upload/mitab_parser.py new file mode 100644 index 00000000..d1ba44c9 --- /dev/null +++ b/data-upload/mitab_parser.py @@ -0,0 +1,287 @@ +""" +PSI-MI TAB v2.7 Parser for openPIP 2.0 +Replaces the PHP upload parsing logic with a clean Python implementation. +Spec: https://psicquic.github.io/MITAB27Format.html + +Output maps directly to openpip.sql schema via models.py +""" + +import csv +import re +import io +from dataclasses import dataclass +from typing import Optional +from models import ParsedInteraction, Protein, Organism, Dataset, InteractionCategory + + +MITAB27_COLUMNS = [ + "unique_id_a", "unique_id_b", + "alt_id_a", "alt_id_b", + "alias_a", "alias_b", + "interaction_detection_method", + "author", + "publication_id", + "taxid_a", "taxid_b", + "interaction_type", + "source_database", + "interaction_id", + "confidence_score", + "complex_expansion", + "bio_role_a", "bio_role_b", + "exp_role_a", "exp_role_b", + "interactor_type_a", "interactor_type_b", + "xref_a", "xref_b", "xref_interaction", + "annotation_a", "annotation_b", "annotation_interaction", + "host_organism_taxid", + "parameters", + "creation_date", + "update_date", + "checksum_a", "checksum_b", "checksum_interaction", + "negative", + "features_a", "features_b", + "stoichiometry_a", "stoichiometry_b", + "participant_identification_a", "participant_identification_b", +] + +MITAB27_COLUMN_COUNT = 42 + + +@dataclass +class MITABField: + """Represents a parsed db:value(description) field.""" + db: str + value: str + description: Optional[str] = None + + +class MITABParseError(Exception): + pass + + +# ───────────────────────────────────────────────────────────── +# Field-level parsing +# ───────────────────────────────────────────────────────────── + +def parse_field(raw: str) -> list: + """ + Parse one PSI-MI TAB cell into a list of MITABField objects. + Handles: db:value(description) | db:value | - (empty) + """ + if raw.strip() in ("-", ""): + return [] + + results = [] + entries = _split_pipe(raw) + + for entry in entries: + entry = entry.strip() + if not entry or entry == "-": + continue + # Remove surrounding quotes from value if present + match = re.match( + r'^([^:(]+):"?([^"(|]+?)"?(?:\((.+)\))?$', + entry + ) + if match: + db, value, desc = match.groups() + results.append(MITABField( + db=db.strip(), + value=value.strip(), + description=desc.strip() if desc else None + )) + else: + results.append(MITABField(db="unknown", value=entry)) + return results + + +def _split_pipe(s: str) -> list: + """Split by | while respecting quoted strings.""" + parts = [] + current = [] + in_quotes = False + for char in s: + if char == '"': + in_quotes = not in_quotes + current.append(char) + elif char == '|' and not in_quotes: + parts.append(''.join(current)) + current = [] + else: + current.append(char) + if current: + parts.append(''.join(current)) + return parts + + +# ───────────────────────────────────────────────────────────── +# Extraction helpers — map MITABFields to openpip.sql columns +# ───────────────────────────────────────────────────────────── + +def _extract_id(fields: list, db_names: tuple) -> Optional[str]: + """Extract value from MITABFields matching any of the given db names.""" + for f in fields: + if f.db.lower() in db_names: + return f.value + return None + + +def _extract_description(fields: list) -> Optional[str]: + """ + Extract gene name from alias fields. + In PSI-MI TAB alias fields, the VALUE is the gene name + and the description tells us the type e.g. (gene name). + We return the value only when description confirms it is a gene name. + """ + for f in fields: + if f.description and "gene name" in f.description.lower(): + return f.value + # fallback — return first value if no gene name type found + for f in fields: + if f.value: + return f.value + return None + + +def _extract_taxid(fields: list) -> Optional[str]: + """Extract numeric taxid from taxid:9606(human) format.""" + for f in fields: + if f.db.lower() == "taxid": + return f.value + return None + + +def _extract_common_name(fields: list) -> Optional[str]: + """Extract organism common name from taxid field description.""" + for f in fields: + if f.db.lower() == "taxid" and f.description: + return f.description + return None + + +# ───────────────────────────────────────────────────────────── +# Core builder — maps one parsed row to openpip.sql schema +# ───────────────────────────────────────────────────────────── + +def _build_interaction(row: list, raw: dict) -> ParsedInteraction: + """ + Build a ParsedInteraction from a raw tab-separated row. + Maps PSI-MI TAB 2.7 columns directly to openpip.sql schema fields. + """ + uid_a = parse_field(row[0]) + uid_b = parse_field(row[1]) + alt_a = parse_field(row[2]) + alt_b = parse_field(row[3]) + alias_a = parse_field(row[4]) + alias_b = parse_field(row[5]) + author = parse_field(row[7]) + pub_id = parse_field(row[8]) + taxid_a = parse_field(row[9]) + taxid_b = parse_field(row[10]) + int_type = parse_field(row[11]) + conf = parse_field(row[14]) + + # protein table + protein_a = Protein( + uniprot_id=_extract_id(uid_a + alt_a, ("uniprotkb", "uniprot")), + ensembl_id=_extract_id(uid_a + alt_a, ("ensembl",)), + entrez_id=_extract_id(uid_a + alt_a, ("entrez", "entrezgene")), + gene_name=_extract_description(alias_a), + ) + protein_b = Protein( + uniprot_id=_extract_id(uid_b + alt_b, ("uniprotkb", "uniprot")), + ensembl_id=_extract_id(uid_b + alt_b, ("ensembl",)), + entrez_id=_extract_id(uid_b + alt_b, ("entrez", "entrezgene")), + gene_name=_extract_description(alias_b), + ) + + # organism table + organism_a = Organism( + taxid_id=_extract_taxid(taxid_a), + common_name=_extract_common_name(taxid_a), + ) + organism_b = Organism( + taxid_id=_extract_taxid(taxid_b), + common_name=_extract_common_name(taxid_b), + ) + + # interaction_category table + category = None + if int_type: + category = InteractionCategory( + category_name=int_type[0].description or int_type[0].value + ) + + # dataset table + pubmed = _extract_id(pub_id, ("pubmed",)) + author_str = author[0].value if author else None + dataset = Dataset( + pubmed_id=pubmed, + author=author_str, + ) if (pubmed or author_str) else None + + # interaction.score — varchar(10) in DB, keep as string + score = None + if conf: + try: + score = str(round(float(conf[0].value), 6))[:10] + except ValueError: + score = None + + return ParsedInteraction( + protein_a=protein_a, + protein_b=protein_b, + organism_a=organism_a, + organism_b=organism_b, + score=score, + category=category, + dataset=dataset, + raw=raw, + ) + + +# ───────────────────────────────────────────────────────────── +# Public API +# ───────────────────────────────────────────────────────────── + +def parse_mitab27(filepath: str) -> list: + """ + Parse a PSI-MI TAB 2.7 file. + Returns a list of ParsedInteraction objects. + """ + interactions = [] + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.reader(f, delimiter='\t') + for line_num, row in enumerate(reader, start=1): + if not row or row[0].startswith('#'): + continue + if len(row) < 15: + raise MITABParseError( + f"Line {line_num}: Expected at least 15 columns, " + f"got {len(row)}." + ) + while len(row) < MITAB27_COLUMN_COUNT: + row.append('-') + raw = dict(zip(MITAB27_COLUMNS, row)) + interactions.append(_build_interaction(row, raw)) + return interactions + + +def parse_mitab27_from_string(content: str) -> list: + """ + Parse PSI-MI TAB 2.7 from a raw string. + Useful for API upload endpoints that receive file content directly. + """ + interactions = [] + reader = csv.reader(io.StringIO(content), delimiter='\t') + for line_num, row in enumerate(reader, start=1): + if not row or row[0].startswith('#'): + continue + if len(row) < 15: + raise MITABParseError( + f"Line {line_num}: Too few columns ({len(row)})" + ) + while len(row) < MITAB27_COLUMN_COUNT: + row.append('-') + raw = dict(zip(MITAB27_COLUMNS, row)) + interactions.append(_build_interaction(row, raw)) + return interactions \ No newline at end of file diff --git a/data-upload/models.py b/data-upload/models.py new file mode 100644 index 00000000..0839cdaf --- /dev/null +++ b/data-upload/models.py @@ -0,0 +1,72 @@ +""" +Data models for openPIP 2.0 — Python dataclasses that directly mirror +the openpip.sql database schema. These are the target output objects +of the PSI-MI TAB and CSV parsers in this directory. + +Table mappings: + Protein -> protein table + Organism -> organism table + Dataset -> dataset table + InteractionCategory -> interaction_category table + ParsedInteraction -> interaction table + interaction_dataset + + interaction_interaction_category + + interaction_support_information +""" + +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class Protein: + """Maps to the `protein` table in openpip.sql""" + gene_name: Optional[str] = None + protein_name: Optional[str] = None + uniprot_id: Optional[str] = None + ensembl_id: Optional[str] = None + entrez_id: Optional[str] = None + sequence: Optional[str] = None + description: Optional[str] = None + + +@dataclass +class Organism: + """Maps to the `organism` table in openpip.sql""" + taxid_id: Optional[str] = None + common_name: Optional[str] = None + scientific_name: Optional[str] = None + + +@dataclass +class Dataset: + """Maps to the `dataset` table in openpip.sql""" + name: Optional[str] = None + pubmed_id: Optional[str] = None + author: Optional[str] = None + year: Optional[str] = None + interaction_status: Optional[str] = None + description: Optional[str] = None + + +@dataclass +class InteractionCategory: + """Maps to the `interaction_category` table in openpip.sql""" + category_name: Optional[str] = None + + +@dataclass +class ParsedInteraction: + """ + Maps to the `interaction` table plus related junction tables: + interaction_dataset, interaction_interaction_category, + interaction_support_information + """ + protein_a: Protein = field(default_factory=Protein) + protein_b: Protein = field(default_factory=Protein) + organism_a: Organism = field(default_factory=Organism) + organism_b: Organism = field(default_factory=Organism) + score: Optional[str] = None + category: Optional[InteractionCategory] = None + dataset: Optional[Dataset] = None + support_info: dict = field(default_factory=dict) + raw: dict = field(default_factory=dict) \ No newline at end of file diff --git a/data-upload/requirements.txt b/data-upload/requirements.txt index 7cb6656b..9deb423b 100644 --- a/data-upload/requirements.txt +++ b/data-upload/requirements.txt @@ -1 +1,2 @@ selenium +pytest \ No newline at end of file diff --git a/data-upload/tests/__pycache__/test_parser.cpython-311-pytest-9.0.2.pyc b/data-upload/tests/__pycache__/test_parser.cpython-311-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72d767964fbcedec5543c0d267231677419f8a20 GIT binary patch literal 53715 zcmeHwdvIIVnIAx0fFQt!M9F$f21PwciK0kJltft%OR*h0wq)6IvKv(i0`fu z>rFS~{?p%gzI)F-_X3xKQ!8xC?x!sKOM$cN^)|o_dJJ%*9tYf{w*x+`cK{~zPQV_$3vjc(46s*U4!A|{2HdKz0Nkdp z1l+Da1lXsq0_@jU1Mbk*0PfV+0uJcw0C(x@0SEOBfV=gLfJ6Ewz+wGiz&&~baIfA2 zxKG~Qpg zoegjU4RE^};07Dub~nHcHNYL!ce+QGxN}GK0WVe$p24hgCeXj@$8?uZ6MwWb(OM>uDojNM~}j=(EO!R4!vq)LI`)6;fupQ0z8S6O-BWpjFpk;c8)AK5z)M z;upg2?Uo2lf(1F11kiBz1xqdbfKP`?A@gqwVVgb>xFmDy zNIM_YHKIP>@(oPC-@=bSdbprWsh4Da_ieZwFxR@uM(g=gkdt4)(?Y|YgRWeB{Bz`j zdRUJ%*`Ae++vDvg!%Zh!;iW^}5m$*0rRTo*P_?-4kKTGyE25XCH9e}gm9$Jy#KMK; zc0U9Hu9Dn?zE&1E9VtaF$;EFvsJGWWAzyUW>3u>v z8a*Md98Rx9-0kwAda%Chycv2Q`!2F*|4G&W{nvG${kLqX|1L?!3H=x2{+kMj*~#zO zTP|j&?vGmE=`k!+D~fB*6jC`oW#|cg>=0I;sdVCe-k3-gip!>QnMo`)FOD5LJ-lb{ zzL6i(YH=(0PRo%aw}Qo?t$mZ%_V$~x&s%}SkpzW?`prK2KiqHj$EFiU7`wmUtc9}a zTrFax&8ckRmR1XamcTH9y#)5Z)0r~Ov{4WR9i%Li*AAXCXpXP7$>yN)9X^$#(m{R9 zRqd^iu@d=>hX|~qk|KOLC}!6p#fZLemlUkEWX%3xaqlA8^t3r{WF~2)M&^tuE(KSA ztHYp0&sat+h&?)at&lbg$&^uBo*YkQvq>X8X^iJ5zzn2o?MP0JO=Yr$OwO!DvUyOL zwKYleVty*C3)19;Yv2qL%QoE)mv-y;M zjylT}Fe%t2V{r0XZI$RkW}gH%l^jc%nQ>zuYFv$~#H%;hgX^712PZT)Nyz|14FJ$K^k5#sN>w!UhI8AB8ZRqO<&kWnNt=9Rq?SBFr` zmQ*t5%w>t=oC?ArdW4CXVzx(S0R_TbgM^F$Et4}1E=mfstjU9a-9Vpc~xG>(a)99+RbwhrYD&pa%h;(uZ&3rfxiz-{;2d zqq_OnVDCBl6gITq-hgbB9KW>z={xcK$BqvtpZw&>TsijKo@?OzM?P;bKy` zUSkw=3CgVSs|PC`qt%YlC4O~@Usaf8<}rz*{e|LJhb?|}c z8?M@63rQphRzwP?6TeDy7B%s!PIT@h$2dE}RylTnF~}223J(!=ohV!@Nbx4b=TbyoCkkKbDqD)o zy&`3Zo{8hOK9{7(+)D@UQ}dzptipQuW{VJvhV+QuS_)-YsH_!e0=7P`lBC|Wq`5g> zcML-1D1@Rd5ZOf`lxizJyepBINT+gUB9~9FP%4v4oIdl^u9HtCUO0ZD7ThIfe$3b1 zB#VK}$rdz0A|+OVTP?;4G)ahkTwa7VB^B9k(!CL)4_*t_*R2KViOIsXq@FS_8jpjvA3yUy0|=?b?21aq z#%jmL>rX5^w7RUVp4orn`QOC8oWl6dYpbhvm@!0vP{mGQ3K^AoX7)=QlPCyDbmMtR z#AO1iU!*dvYzNb;xm0?Dmxfv60r~S9W{Jn?@k^KmBa^$3fQ&Kk8Fe*_!8(jblArz~ z6go=a7{DpxQ3@Uh@ILDseV%o$%y<%6c2l-006*>BUrxT9{3tx@PhgtA*OH=6quvP{ z)8vxw6^{*n{1>(*-K$GG@2CG_?fjDNHF9Z5_o^6|)v;)#_jl{AmoK9G@4inF$Vc({ z^Y)+m{H?gp{`<)3{)PT}WwGZZw84~xIp~p|1IX%Rm;UCi`?cK|hgjlN9YSmC{SF7l6cXYr;liLo1_5{Wb0*wVsi8@AW z!s?Nv*g2WTNo1xCt~rc7rKJyjw9;|B+HrhIzO*D?`o)(oZDD=%mRabgw?NxGdMCaG zA^y&5TdH=LF+_n-MI>-C373e>ia{Uk#AdBD&WsceNXQuElr?c~hLWs4AobA$Xg~DP z1G6~*NbhoY;sXfrcU~K)+F`~J1ws{(z{%8pBC}%9M?10E9LG2_!W@v0F~})vVtO@~ zB(4uEvT0v3BE_t_q|2pF5T8p+Mx>Ay^XG%p<-$xf;xWW+Eww6}nU+$d)N+|*)Irm4 z6A1PZa_Qwn>6J_SnASFDfZOWcM);KFHD2!iKn!qO!D81NFspjp`(?scek6@ZA1Xd+ ziROhBo|(v>FQjvci>b@$#7k4~RnTqTJezqjotQK;yCyP+HlKXz(9qEE!OeYzEBVC5 zYhy-6?=NmxEcO0WzQqWaONsgT8ueuHx6Rp3r%o;)X@|egM=kXr!hnTUTFbi4}hvd*lUuagl!}f z3Dy;Cs$=VMHSi%xa*rA>AP0$HRi*KPO2@%!$H670z8R*zmzm1m*@I=R7cAq!JMmtG z_&cxlR_!ojhytOCNZ@2*%81O0fvI$2vj;iGnGxoIgp5H>SrgN%xg@a~SctDIYujfd zvoGCjqd?ouF8rGro$V^ewh%Hdm~RW1JabS+t5l zPFWMvtGPT`I4YUqAxmjIH1|wd8zQCgGk4-c6cpciZK!I8EhLd3SP?0lPM1J*7Bwl2 zo#@;%9OLW=TjkgR#vrS#i|N%|o-Ar~5ytp5WC?BwzrWWJ{H&x37luzBb`r9FC`M^n zars#_R1`zGOnz3OMvD+syi&RZ<+4-;*D06Rx|Gneo&rP%_Q$=X!_P`mp=7D@_0K^H zXZU?J*PfNG(q&EFeuAuDrm3y)(vhj952Z&7=R@UXRU*>Qs#T9FepaM>wmzs z%^lkuKOwyhYVla{ku$=(FG0S6SYFM<-UJ!RCye|RGtoDkxR_4qX=69s2*{Q`k}~c=F^^(7dymRg8!dz;J%+sO@uIiZwdiHjNnBZSJU^8y7@r2|Gx(V!V6h>@T9NpJ4V4vx)fI!+ zpCOz6QYF5j8sAXXdgg)v<@koV;GH<^_O0){)>E~^7LrI1tk@|`BBMxR4B7Qtnd=Z- zlwVG?%AHd|$l}pdrs7iOf-(;`j;oLmshpL`nFg1p=90u}V3GNXMs)M@6`1CrBKgO* z!ZC2cT-D-ueolv=snklLl4i|b;ZoTAYGV&e%v)j1UV^%A3nDj`>fBXlB-)tDGh$t} z%XzRAa?ff$R30-H7{subWkeXnL@a}tR+mAH+A29amOL9HckPxz40&0$;1|L#OrD&= zIA)Zbvf%%lObuFs?PNwzjOPs_JzmIOOQf$B$W<6z_jw~fkpRt0Q)!UE*+_`Eh3O;N zO~QKSIbwQY5yMO`0v6NTz^{v3t7OfwX{Bz*NeV7ow=~7wiWpR3g!xXF;@C9|bDf$7 z$FHGN{cU27F@6<<@PxK6tQK>wOc9iCqvEy`=mWSF<=KU}X`W(;%~ttp%Ponk-Y z_N`(^rQzWQe7DG`V1s~L}yXQA>34>yVdAzu%1qt2+O)S zz!;Pyt7dvNmnTbobZIidy@w06I=Da7lgXGepqrUY#@H(}^4$Id2ScjCt>D8BRB@v0rRkVJxDMWk>#tz<-JQPWDMMzNAP z(c4Ek#-$-_6>A3=gOX&`Ot0qhWT}rXjcGgpzXU7Uy~nh*WT_?@d*0>XriIHHqRkKP zOXzW9g1~#YWZ9$IK)E`VohPugWT6*6{EXV33+Y^19QRPaVr5chGL=wV${-=a;ZnTa zJ3tQsZTGN*oqq7`o~*w4hg_^+Z8GD1Kk9CG32en=QP|2(;fiP!nTu`0j#*3BF|l+F zmbGKIcL9{+gSU6xiDQ>$edo1fRXc1Ui3GulNa1u^x`@uArlm`b4yw`HyEw+BAuQ`s zF(^q^&Gc$6PnP=V(%8lW@DUi>V?JZM7JK3J$X@n`eV0R=7BV@!njhj~9V_4-yqAM} z%(#jYC-GZ!a38e>_fdOrAH5CJC&UJC58R1^@3g-2+R>^Vwva@EU`3>GIt^~3v#4os zt5FPYCwd#USx%V<%epwg7?dQdW_mT3Crf>Fk-=T>len0AX>oBa1hkKLiEaqwBg7;_#%ca(QBkY zOP%#`wMv#+@>s4cDtgr*!e+7Sq49j~d`3^_#?uKC($=&hB@t&3=?Q3LEMF=J7!!}3 zKrC;hE~LdmT-1gJ4~~do3c>4QCr^peoiPYg1ol~9jmu(}c8ETOI!B02!-C#-0OC}q zU!jbSn6!aOLcQPmLta%3*YX7vbs0bNzX15jK6lW_#Q8Nl=AHl`)$S8_;yWlPzVq6S zsvWkFM1o*Nq;NV7NTRc-Nww=l=bqpgXGhp7#||(CS!G>JujcY(QKO5Dy-V+*s>B|9 zIG}E6+`*~MQh3sT8h0&FcSMUDCNic;{)s&3Xh=b|fuVLPpGzB`z+-sI*aQgnwwXbd zktecw29@!76k<#R)DH%?Gb1=r(H2wz5=Fr12XVv%gN`cCkI=K^nJP%i6(md}}q1J>T?V>U@@3{(@`Yfcd@|^JKrXbA`z7DDLe16Y;e`?F#R5O-iIpc zX3l-6+M1$@xvBK{&4@V6B(AsX9i_OnfoN}H1JUj-$sL33hRVpaTJ{@sz)9P5OfWT_ z#S@C*70!Pxq%Y)+YlC)x#e9?b$*HVxWE6HBYL3FSN#0bP>JPpX0ZWt17r_7QBk`V( z>4sf4=3u6`?VUYrP;uqt#cMc4HI)UAW#-2-;>cLzSv*^);mcHmVlO?s$F?1zkYzkV z_8n|cLLm!@8J}?oSSTN3Yp12|4#7f_&0k3yvFSb>EDYPSm1vVeh8V&YTkMR4j#${1 z>?yiTPGH3?(EIvh%GN1r*X6sk)H-YuK592Cn$X|T#GcAHLroh4P;F91*#d=aQig|J z_9%N@u@RHDvbI&Y_|H~yTfAE{_0h1KYkUQnzl@)W*G%~C)OOS22HWJ_Z|nmo$9wUg zj8&}fytcb)hb^Qa(V(WI6i%l#&55B9HTuRrj&XK`W#K9Y1k3^6M^T-Tc4|JJyYF!hN8%aKwN|o)y~XoXR0DV zsr+Rc?My`^Sp;#|*bya1=aOm+)w-ikyQ9y!qgKIO8dVKGxQKQ6Rlt6 zNgN_pd78c6t5tr<_HX+Awf);o_P%)0$Kd4D*hE@yV84KbEM{*y=NqZa$VbKT?J(4_ zrBA2sPY+~OiV0X&+dM1ZYUPG`pWSBP*|up$-YPT6MH;W+VZK3g-C~UVA)4zpm$gIX z_~vRH^4~+ZVw?E<%LtwM}BIQ0~^k zXYi7ri>%|dxU;7{+*$KHmovh-X7E>*9j}MKYHWySd=1Y+Ey8(yM>qW997(ibj)Wwd zI3%{;70z)wq76*->h|*X6kt{QPm%4PP}YW1KzCW&cOyGD{A!*8`RXd@cK4N6?W(Q< z-?MM#KpFqz6y)!`wy$c38N(zRm>Ejp^qB){>_%RVp=@{5%E_f6N!F!eP_C?+vorJ% z`$g-g0J1k1tNe8g-}_nR9}CKT8(A$S_8IixMUe-3#2wCJHG+q2SDx@?bEi3LxihOO|@q3#g8 zuGGoq{`*4hZ%pFOtN(oa!Z!zbUHP=!GmT2GhEPHEVp6XMh2&&A8z z^$3Ldt;JoBr^YXmzf!_T7p4px@R>@S_P*f^&bTICN;95*XGHNDzz&W#kO){CNbD}{ zSIrEF?JV}7*c#Si87K_62>AJ4HRxW#FM%#bY#PadcaPXPp0b>rgad-{J48Ul%1NL- z6x~W-8-eWv$ol~EqnONomk>0WsmFkhm?UPQEgSFHc6fJe*46XNOu)uDZdk@YL(ZG{ znZvl@UJYC4y1Rj3c=#s-W?J6tT06UWe$(zs*HE={i% z4Hi(Wm#R-6FQV`Q#3IUzAw5X_VO}|Z4`CL!l3O3Gp$ASC-KRJ3l{m37(av@lBLKtFyTgY_E4^ zUbUHb7o)VhfRJFl(l{wG~cd-gW5}|#P3P^-G$*RU~u1W&}^}Ddt2UdU7b0Nf4(lh-w zZJV0sLYk1p1WO@EUz$;>g0fEfLhZ6Xm*n=Ojt^)|<>3R=wM{9@BYpV*?DjO0tH6G! z?VfV1rM?sQUU4_K`;KfLW5w;yr?6AQZcWdO7f5e7X{7P`#>?be4)K(p9-F#w0k&DS zc%7H|vnUw~!<`5e^s==d}`h}Qj7Zong zU~0m~3w6Nx+qz!k+sOXU@iSj_cTlKjZY_f;|Z5{q60TTKVz%}F9P zpnImapzB@eZkd={;^B@>dl-ICYDf=Z;&z{o@wbg7MmQHE4CCuCQF|wGyg6Kz&B)2R z!6ren`RW-t{wzN>32F`*_RoP$0?e_SY|lzp>2lLj^7K9=Zn9>yTZX-KsJqpN(z6Kp zPr;Z+lxad=-su6kDd8cf?f~? z`PfTS8M1U8&u6D5ASe?y=B8tG{!Wzm)xmM|a`9_6r-(8%s%U}$i3z}6ndPLCcuuzzIWxncSY=x83T3Xbdw)FT5#vTyI6;h3<^ z6&gq71#N;U@5WeD=K22iTI`CEDWsDHQg{#>QVYS5{{GGyio#HEViFRy!o^w?y^^F; zl6Vs!rv64&@ASx#l#V_usbjNu$}albuwE(rM7FF^+#5eG_|(bnmRIO0EE+-y&3DLTxEB4qdA9C!qK2;*O% z{&yTgXC8D~{!=Np25?#LTmsLSqER2{>9#(xQmDM*#H*G-U#k7jJwIx`K8W~%B8Zl_0=Xv5Hg^0H{ zG6e29Pmtu%_#6ShdBXTtNHyd}nuog;*6V}3rSPN;J=8t ztyM0nYD`?$;cQ(z2J4HiauIL~F);t};#&4?bCxt|_#+lJMJwPeY|PKQmp5Y9M|h1R zchgaJH#LgrF^`!youlPh$|Ew zd|#{DFNp)HI>_k}69VM_i$U{XgS_nlr5M2g`P?L%wcs;Xr@A^-Q2QP*`R)4gaj<}>Z zo|aFy;spq;5U0b#23H(4S3`*0wrGx6sZDs=#7lANqZ=(|r>pgH!hr-KiYYEmHPlUU z@)}(|FxRaLbkZD=6pQOlF+XMZ~>$a&Npa$5G!0a**YW}0k|`)p>}9cQkIVSMGwBaPax8?RuBKtkiBjYaONVoFXxEo|RIE`U_CnZ8P2U0P<pA!`fc0Y*y=$6MAikgt*!;ru%aVjs(6{O1I70Yi8qV5JjV8*GLg2ozst)h5tCF~ zp5#-6liWoU<^_*PpcYeZgAf_?(xq-8u$4-Pq$VfRIlb0tmo%2oXY-T&-3qHkQUuj% zVUO{zDgOxqLbxF03M650`IYT)#SXY)`y2^`%iqlMOyjHC}JW~Kn@Sas&_eV`IeKK~^8l}p&-MJ)`J3lOeD-{~YYRf^_l-DW-!;EK{D+tS_%ecT z#BX0N_qj+a(IfNGBUN|4na`uN=n0ir*{hiqhz;=f6f?3>4jPjm(T ztSkQX!O)+rduqkgdqe+mZy1rrf(4R>JlLK?eUG_|kM=nDxYtSIq}?KSb=@w7TqdJv zo8pH+hA?m5$NNRxW{`3t@2}x}Aj8S9DmqX!m>#_(_DrhqTnalHaQI!gi`GXm6PE4w z8pisz+BXNdvWQQsJ-a5U6bP&P0aspmRrh@B3GqYFid%n2uMcd6yD?Ok)7O^=RibR--W8+ z1QkzfxbYZ3-CI%Edq7!G7x5nWa};ow3fZ-oG^IHmfi&lP<8aD;oYWdY2UquTtn7fyES(Un2r?ck~t5`YRe% zdHHk8iFzKz{XhK7ZHrAEy&i6SarNRy&gYtzY$2xoK02{^J6{)T)Zi6xUka3hj|JRp zz`cyi;Cb}5`*44BJ6FoFy(lB32+nOU&!!@d|}iL5o7=WtTMjJ+VzV zDted}H^T|RjPS;W)o>UZnMiT_@objgtA;m&n#ya}Cb8Y8*O_MEe~dF0T>LUP(ZF8( z-ljeN(}AGinLiUJJ42F|@aI#3*Mh;o%k6pq6$NpCPc51;lcL1_mRif;P%UVfRFY+q z^iJDhlHOdMG>#Sr#L%^B&l{Xe7&;9e&gOAuk9l-ZQg`ABG^r&&!r%2kxpODJv#FVv zUM_N2g<|z94k+TD2JRvXzeH}9p;R{8A5u7^=TVv4qmmv4{*>UShdy z)LLP4JC%jMgs!`o|YU zpUfRVPgFX0RuR*7zOt8M@txH;#mWp3(00yiJFD8xihz`4W9a>Gl+p+xaa5?F;!;qq z%mf;QnbJzNt5In{pnjQO!3D~XR1y6RW+2bpLvxb#anB;^22Brs+hcn9?T}k6Ivt>` zRkmd!G>wNC2anN75s`1VdZ@5{Q*5GOFV)T-3Rg*=>RR{ZT;XmN+{0J8HV|@~;+-j= zG*GufQataa!`##6L+PDWVmDia@b|sg&Im(K*od_#UVxFpdi!4B+)xj{Gu;ZUd9>78 zioU23)pT2_4UYBh4WtxY>+PjBC?PtC-m8SrJDcojzb)`Or)+mQ@&IDugT zI)F)9(uDG1*?Pjm0NA2+x%n{uH}Wl~wXYv{5o=%Dx;MMmex?7_e*ByJ)pGY>rF*d2 zJ$N@9ibZ}B0Dz@nEOPzC-Dn^TXA+!m>U65m*jZ$!j;59~(eJ+Lr4mTygUUX2xoYlhkMm74ry%mXdV%@z<0l z=CdYeMI6$(=p0xl>b%$dHfH=Ev|%I7Z-0Y;J-;avn>E}0a?Ni%%heB3$?^XPL-j!i zsUE81`ZW(fk5doyh==QM{dUwtRmb(4S9Lf}HOYm#`S|yO-;cS5nX@zS^80a%-`ASp z&F&>dXs5TSD7=2y#p`<{pzv@Av&oso>%;795njJT7?5=~G9c5tn(XBdhS%@<1-$E? zzV~@H9%x6PFSB_4HhsC?U23y<{a6#vlgGfRj@NGopD#^?us3Xn&}fB=*U#oXGfcfD=8yK9@=4H_tPIV=xt3kjb0q(y9l`CK;Yt=VCN07S{Oe<;o##> zlDW?P^YK<&8*=i-U!iD6Ca&`A_rEUHqIOjZzaDS2R;ji(d+))r(?VXyvGb`Sa#N9{ zip^v$FJCbHQ{?KS1+7P6*ylc8?jERg4^+Dch;M(Q+`Y5Xy|dcA^IfcK@#XYh@^v!= z_&cwyt7_{i0;-sq{SwC{3IcQ%4hNu8cFu?}r*I;|oT}0QX>A^Q!Ki<+5`VZFf4Hpm z&fy^E-g5lmxt=?87Bzh|y~?qVBq^~zu60sg)jdt|aqMHkahZjj z*z^2pu2iqa%i}eKC+w=L9a8UvH@}hJP?Wrk>g1ozN}B!d!Wz2|n!XFZIdB)+ex6UH ztM?pE*u%{>EusCOdhFfq9X!vK?-=&b;cofu9Xwp5e2EA+Gm^qvoL{sF&d(z+c8?E8 zc}c=czR=_)Mu(?%7Pm`zEFEni?V(bMY{o1k^5=nJ#o6n4aQVozHxa67ly1x+cuLq9 zfI$-nz&DVl+8RDcRY{n7gk`MK*4i<(wk@qKb8hmt5pK`e@voq;N8q}Oa5Op;4zANV z(8r*uoLl_&6zV0gg+LF12FgN(Z);Fw(j64KQ%3JDRbBUq)UUvgnX*3|3bR;w%SMnF!d=^W7{QlPCz4PErBMgf}ioTpeQZoM@uEP6Z*0=f=y{A_pgNT?lg- zvLvqZ&3Na`l`kKD`EXeqczy3|sj_~cx*jL(56m1cug6PSDTr@H8<^Jysv^u7n}icN zg-Jjw2^mA&orSAIaA|%yXRdg1DhTA{Y%_;FL^3K#UiY>@K`h<^bcws|3PKi-u98$h zBFu$K$QU%2Tg)`*B{i2MRs({2tVNT_iBu++OimravrQJ3bkwAL{Dln8v=5+3OkN|M zL*j?mUriWOxkGlGkml~<+87y1Or&yCcwvyh!Cccc)B0flv08L&Dw8eXC}uIBYipBS z)*vg4U_CKtq{YE6wKj<&^}oQ0p^tN5u9>x#Jh@`f!7dRLmNp8t*3_6O5Za}DCRc04 zSv0xSM4B#*=Eh{Q){fKhQ)AXCE`gD{0VtMg2hifqn>28qnU|Vz+TI z-Mte%j0>I0rjHu!nAh+Pm^aWjFc1$07f7h;^B21={uTml*Tvu6NFbwC97u&n##EB~nUo2A(kH$VQn z&;ItaZ#-GqFjCzxQarR$k$*E84K?{-Fl{dWTrbVq`FC~AT3P;d>-(y&3h3<+F& zYjD@yfCMWeK^#Rb!MadzGv&1a@*bwB1?r(YC0G-s z%a&l33<(^fXt4yV;z4-WNU$MFbw~hWv`>OfT5y2swLqt=QdEw5A0}c8pjBI_RTfwi zrrRgMnt;|mQ@j&eTMn&#O9(Y8+J>sOp{#9K(C`{ays&Y5RohG2jc zh%uba|AJUCE84cIhIc@2ThQ?48N4ILTI?}&!q8np#rJSk6jN8Wr>e!Ho`!g$Ys=c& zinhM0tuJfq>ze6m*Minp*81dIl0B`ii;}9iq77BGp|Uo#U`cMaRz*K~DxxcnU4OhB md#a*+qN;tOtbO7wt&I?P%ca$$=u~{~9toh|Lu 0 + + +def test_validator_missing_protein_b(): + """Row with no protein_b identifier fails validation""" + ix = ParsedInteraction( + protein_a=Protein(uniprot_id="P12345"), + protein_b=Protein() + ) + from validator import validate_interaction + result = validate_interaction(ix, line_num=1) + assert not result.is_valid + + +def test_validator_warns_no_uniprot(): + """Row with non-UniProt identifier generates warning""" + ix = ParsedInteraction( + protein_a=Protein(gene_name="BRCA1"), # has gene_name but no uniprot_id + protein_b=Protein(gene_name="TP53") + ) + from validator import validate_interaction + result = validate_interaction(ix, line_num=1) + assert result.is_valid # not an error, just a warning + assert len(result.warnings) > 0 + + +def test_validator_returns_list(): + """validate_file returns a list of results""" + interactions = parse_mitab27(SAMPLE_MITAB) + results = validate_file(interactions) + assert isinstance(results, list) + assert len(results) == len(interactions) + + +# ───────────────────────────────────────────── +# Run all tests manually if needed +# ───────────────────────────────────────────── + +if __name__ == "__main__": + # parse_field tests + test_parse_field_basic() + test_parse_field_with_description() + test_parse_field_empty_dash() + test_parse_field_empty_string() + test_parse_field_multiple_values() + test_parse_field_psi_mi_quoted() + + # mitab parser tests + test_parse_file_row_count() + test_parse_file_protein_a_uniprot() + test_parse_file_protein_b_uniprot() + test_parse_file_protein_a_ensembl() + test_parse_file_protein_b_ensembl() + test_parse_file_gene_name_a() + test_parse_file_gene_name_b() + test_parse_file_score() + test_parse_file_score_missing() + test_parse_file_organism_taxid() + test_parse_file_interaction_category() + test_parse_file_dataset_pubmed() + test_parse_file_dataset_author() + test_parse_file_returns_parsed_interaction() + test_parse_file_protein_objects() + test_parse_file_raw_preserved() + + # CSV tests need tmp_path — skip in manual run + print("CSV and validator tests require pytest — run: pytest tests/") + + print("\nAll manual tests passed.") \ No newline at end of file diff --git a/data-upload/validator.py b/data-upload/validator.py new file mode 100644 index 00000000..a56e53bb --- /dev/null +++ b/data-upload/validator.py @@ -0,0 +1,83 @@ +""" +Validation layer for openPIP 2.0 +Validates ParsedInteraction objects before DB insertion. +Works for both PSI-MI TAB and CSV parsed data since both +produce the same ParsedInteraction model. +""" + +from dataclasses import dataclass +from models import ParsedInteraction + + +@dataclass +class ValidationResult: + is_valid: bool + errors: list + warnings: list + + +def validate_interaction(ix: ParsedInteraction, line_num: int) -> ValidationResult: + """ + Validate a single ParsedInteraction. + Errors block DB insertion. Warnings are logged but allowed. + """ + errors = [] + warnings = [] + + # At least one identifier required for each interactor + has_a = ( + ix.protein_a.uniprot_id or + ix.protein_a.gene_name or + ix.protein_a.ensembl_id + ) + has_b = ( + ix.protein_b.uniprot_id or + ix.protein_b.gene_name or + ix.protein_b.ensembl_id + ) + + if not has_a: + errors.append( + f"Line {line_num}: Interactor A has no identifier " + f"(uniprot_id, gene_name, or ensembl_id required)" + ) + if not has_b: + errors.append( + f"Line {line_num}: Interactor B has no identifier " + f"(uniprot_id, gene_name, or ensembl_id required)" + ) + + # Warn if no UniProt ID — UniProt REST annotation will be skipped + if has_a and not ix.protein_a.uniprot_id: + warnings.append( + f"Line {line_num}: No UniProt ID for interactor A " + f"— UniProt annotation fetch will be skipped" + ) + if has_b and not ix.protein_b.uniprot_id: + warnings.append( + f"Line {line_num}: No UniProt ID for interactor B " + f"— UniProt annotation fetch will be skipped" + ) + + # Warn if score present but not numeric + if ix.score is not None: + try: + float(ix.score) + except ValueError: + warnings.append( + f"Line {line_num}: Score '{ix.score}' is not numeric" + ) + + return ValidationResult( + is_valid=len(errors) == 0, + errors=errors, + warnings=warnings, + ) + + +def validate_file(interactions: list) -> list: + """Validate all interactions from a parsed file.""" + return [ + validate_interaction(ix, i + 1) + for i, ix in enumerate(interactions) + ] \ No newline at end of file