REQUIRED_CSV_COLUMNS = {"protein_a", "protein_b"}

# UniProtKB accession pattern, compiled once because it is applied to
# every protein cell of every row.
_UNIPROT_ACCESSION_RE = re.compile(
    r'[OPQ][0-9][A-Z0-9]{3}[0-9]'
    r'|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}'
)


def _cell(row: dict, column: str) -> str:
    """
    Return the stripped text of one CSV cell, or "" when it is absent.
    csv.DictReader stores None for cells missing from a short row, so the
    value cannot be stripped unconditionally.
    """
    value = row.get(column)
    return value.strip() if isinstance(value, str) else ""


def _protein_from_identifier(identifier: str) -> "Protein":
    """
    Build a Protein from one raw identifier: it is stored as uniprot_id
    when it looks like a UniProt accession, otherwise as gene_name.
    """
    if _looks_like_uniprot(identifier):
        return Protein(uniprot_id=identifier, gene_name=None)
    return Protein(uniprot_id=None, gene_name=identifier)


def parse_csv(filepath: str) -> list:
    """
    Parse a CSV file and return a list of ParsedInteraction objects.

    Rows whose protein_a or protein_b cell is empty (or missing because
    the row is shorter than the header) are skipped silently.

    Raises:
        ValueError: if the header lacks protein_a or protein_b.
    """
    interactions = []

    # utf-8-sig drops the BOM that spreadsheet exports prepend (it would
    # otherwise become part of the first header name); newline='' is the
    # mode the csv module asks for so embedded newlines survive.
    with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)

        missing = REQUIRED_CSV_COLUMNS - set(reader.fieldnames or [])
        if missing:
            # sorted() keeps the message stable between runs.
            raise ValueError(
                f"CSV is missing required columns: {sorted(missing)}. "
                f"At minimum 'protein_a' and 'protein_b' are required."
            )

        for row in reader:
            protein_a_raw = _cell(row, "protein_a")
            protein_b_raw = _cell(row, "protein_b")
            if not protein_a_raw or not protein_b_raw:
                continue

            interaction_type = _cell(row, "interaction_type")
            category = (
                InteractionCategory(category_name=interaction_type)
                if interaction_type else None
            )

            dataset = Dataset(
                pubmed_id=_cell(row, "publication") or None,
                author=_cell(row, "author") or None,
                name=_cell(row, "dataset") or None,
                year=_cell(row, "year") or None,
            )

            interactions.append(ParsedInteraction(
                protein_a=_protein_from_identifier(protein_a_raw),
                protein_b=_protein_from_identifier(protein_b_raw),
                # Kept as the raw string; the validator checks numeric-ness.
                score=_cell(row, "score") or None,
                category=category,
                dataset=dataset,
                raw=dict(row),
            ))

    return interactions


def _looks_like_uniprot(s: str) -> bool:
    """
    Rough check for UniProt accession format.
    Examples: P12345, Q67890, A0A024R161

    The whole string must match, so isoform suffixes (P12345-2) and
    surrounding whitespace are rejected.
    """
    return _UNIPROT_ACCESSION_RE.fullmatch(s) is not None
MITAB27_COLUMNS = [
    "unique_id_a", "unique_id_b",
    "alt_id_a", "alt_id_b",
    "alias_a", "alias_b",
    "interaction_detection_method",
    "author",
    "publication_id",
    "taxid_a", "taxid_b",
    "interaction_type",
    "source_database",
    "interaction_id",
    "confidence_score",
    "complex_expansion",
    "bio_role_a", "bio_role_b",
    "exp_role_a", "exp_role_b",
    "interactor_type_a", "interactor_type_b",
    "xref_a", "xref_b", "xref_interaction",
    "annotation_a", "annotation_b", "annotation_interaction",
    "host_organism_taxid",
    "parameters",
    "creation_date",
    "update_date",
    "checksum_a", "checksum_b", "checksum_interaction",
    "negative",
    "features_a", "features_b",
    "stoichiometry_a", "stoichiometry_b",
    "participant_identification_a", "participant_identification_b",
]

MITAB27_COLUMN_COUNT = 42

# db:"value"(description) where the value is fully quoted; the quoted part
# may contain reserved characters ( ) | : and backslash-escaped quotes.
_QUOTED_ENTRY_RE = re.compile(
    r'^([^:(]+):"((?:[^"\\]|\\.)+)"(?:\((.+)\))?$'
)
# db:value(description) with an unquoted (or loosely quoted) value.
_PLAIN_ENTRY_RE = re.compile(
    r'^([^:(]+):"?([^"(|]+?)"?(?:\((.+)\))?$'
)

# Database prefixes (lower-case) recognised for each protein identifier.
_UNIPROT_DBS = ("uniprotkb", "uniprot")
_ENSEMBL_DBS = ("ensembl",)
_ENTREZ_DBS = ("entrez", "entrezgene", "entrez gene/locuslink")


@dataclass
class MITABField:
    """Represents a parsed db:value(description) field."""
    db: str
    value: str
    description: Optional[str] = None


class MITABParseError(Exception):
    """Raised when a PSI-MI TAB row cannot be parsed."""


# ─────────────────────────────────────────────────────────────
# Field-level parsing
# ─────────────────────────────────────────────────────────────

def _clean_description(desc: Optional[str]) -> Optional[str]:
    """Strip whitespace and one pair of surrounding quotes from a description."""
    if not desc:
        return None
    text = desc.strip()
    if len(text) >= 2 and text[0] == text[-1] == '"':
        text = text[1:-1].strip()
    return text


def parse_field(raw: str) -> list:
    """
    Parse one PSI-MI TAB cell into a list of MITABField objects.
    Handles: db:value(description) | db:value | - (empty)

    Quoted values may contain reserved characters, e.g.
    uniprotkb:"foo(bar)"(gene name). Entries that match neither form
    (such as free-text authors) come back with db="unknown" and the
    whole entry as the value.
    """
    if raw.strip() in ("-", ""):
        return []

    results = []
    for entry in _split_pipe(raw):
        entry = entry.strip()
        if not entry or entry == "-":
            continue

        # Try the strictly quoted form first so that ( ) | inside the
        # quotes are not mistaken for the description or a separator.
        match = _QUOTED_ENTRY_RE.match(entry)
        if match:
            db, value, desc = match.groups()
            value = value.replace('\\"', '"')
        else:
            match = _PLAIN_ENTRY_RE.match(entry)
            if match is None:
                results.append(MITABField(db="unknown", value=entry))
                continue
            db, value, desc = match.groups()

        results.append(MITABField(
            db=db.strip(),
            value=value.strip(),
            description=_clean_description(desc),
        ))
    return results


def _split_pipe(s: str) -> list:
    """
    Split by | while respecting quoted strings.
    A backslash inside quotes escapes the next character, so \\" does
    not end the quoted section. A trailing empty part is dropped.
    """
    parts = []
    current = []
    in_quotes = False
    escaped = False
    for char in s:
        if escaped:
            current.append(char)
            escaped = False
        elif char == '\\' and in_quotes:
            current.append(char)
            escaped = True
        elif char == '"':
            in_quotes = not in_quotes
            current.append(char)
        elif char == '|' and not in_quotes:
            parts.append(''.join(current))
            current = []
        else:
            current.append(char)
    if current:
        parts.append(''.join(current))
    return parts


# ─────────────────────────────────────────────────────────────
# Extraction helpers — map MITABFields to openpip.sql columns
# ─────────────────────────────────────────────────────────────

def _extract_id(fields: list, db_names: tuple) -> Optional[str]:
    """
    Extract value from MITABFields matching any of the given db names.
    db_names must be lower-case; the field's db is lower-cased first.
    """
    return next(
        (f.value for f in fields if f.db.lower() in db_names),
        None,
    )


def _extract_description(fields: list) -> Optional[str]:
    """
    Extract gene name from alias fields.
    The VALUE of the first alias whose description contains "gene name"
    is returned; if no alias is typed that way, the first non-empty
    value is used as a fallback.
    """
    for f in fields:
        if f.description and "gene name" in f.description.lower():
            return f.value
    return next((f.value for f in fields if f.value), None)


def _extract_taxid(fields: list) -> Optional[str]:
    """Extract the taxid value from taxid:9606(human) format."""
    return next(
        (f.value for f in fields if f.db.lower() == "taxid"),
        None,
    )


def _extract_common_name(fields: list) -> Optional[str]:
    """Extract the description of the first taxid entry that has one."""
    return next(
        (f.description for f in fields
         if f.db.lower() == "taxid" and f.description),
        None,
    )


# ─────────────────────────────────────────────────────────────
# Core builder — maps one parsed row to openpip.sql schema
# ─────────────────────────────────────────────────────────────

def _build_protein(ids: list, aliases: list) -> "Protein":
    """Build a Protein from the unique+alternative id fields and the aliases."""
    return Protein(
        uniprot_id=_extract_id(ids, _UNIPROT_DBS),
        ensembl_id=_extract_id(ids, _ENSEMBL_DBS),
        entrez_id=_extract_id(ids, _ENTREZ_DBS),
        gene_name=_extract_description(aliases),
    )


def _build_organism(taxid_fields: list) -> "Organism":
    """Build an Organism from the parsed taxid cell."""
    return Organism(
        taxid_id=_extract_taxid(taxid_fields),
        common_name=_extract_common_name(taxid_fields),
    )


def _first_numeric_score(conf_fields: list) -> Optional[str]:
    """
    Return the first confidence value that parses as a float, rounded to
    six decimals and cut to 10 characters (interaction.score is
    varchar(10) in the DB), or None when no entry is numeric.
    """
    for f in conf_fields:
        try:
            number = float(f.value)
        except ValueError:
            continue
        return str(round(number, 6))[:10]
    return None


def _build_interaction(row: list, raw: dict) -> "ParsedInteraction":
    """
    Build a ParsedInteraction from a raw tab-separated row.
    Maps PSI-MI TAB 2.7 columns directly to openpip.sql schema fields.

    row must hold at least 15 cells (through confidence_score); raw is
    stored unchanged on the result.
    """
    # Unique and alternative ids are searched together for each interactor.
    ids_a = parse_field(row[0]) + parse_field(row[2])
    ids_b = parse_field(row[1]) + parse_field(row[3])
    author = parse_field(row[7])
    pub_id = parse_field(row[8])
    int_type = parse_field(row[11])

    # interaction_category table
    category = None
    if int_type:
        category = InteractionCategory(
            category_name=int_type[0].description or int_type[0].value
        )

    # dataset table — only created when there is something to store
    pubmed = _extract_id(pub_id, ("pubmed",))
    author_str = author[0].value if author else None
    dataset = Dataset(
        pubmed_id=pubmed,
        author=author_str,
    ) if (pubmed or author_str) else None

    return ParsedInteraction(
        protein_a=_build_protein(ids_a, parse_field(row[4])),
        protein_b=_build_protein(ids_b, parse_field(row[5])),
        organism_a=_build_organism(parse_field(row[9])),
        organism_b=_build_organism(parse_field(row[10])),
        score=_first_numeric_score(parse_field(row[14])),
        category=category,
        dataset=dataset,
        raw=raw,
    )
# ─────────────────────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────────────────────

# _build_interaction reads cells up to index 14 (confidence_score), so a
# row needs at least this many columns.
_MIN_MITAB_COLUMNS = 15


def _parse_rows(reader) -> list:
    """
    Turn tab-split rows into ParsedInteraction objects.
    Shared by the file and string entry points so both skip, pad and
    report errors identically.
    """
    interactions = []
    for line_num, row in enumerate(reader, start=1):
        # Skip empty lines, whitespace-only lines and # comment/header lines.
        if not row or row[0].startswith('#'):
            continue
        if not any(cell.strip() for cell in row):
            continue
        if len(row) < _MIN_MITAB_COLUMNS:
            raise MITABParseError(
                f"Line {line_num}: Expected at least "
                f"{_MIN_MITAB_COLUMNS} columns, got {len(row)}."
            )
        # Pad short rows with the MITAB empty marker; a non-positive
        # multiplier yields no padding.
        row = row + ['-'] * (MITAB27_COLUMN_COUNT - len(row))
        raw = dict(zip(MITAB27_COLUMNS, row))
        interactions.append(_build_interaction(row, raw))
    return interactions


def _tab_reader(lines):
    """
    Build a tab-delimited reader with CSV quoting disabled.
    MITAB cells use double quotes as part of their own syntax
    (psi-mi:"MI:0018"), so a cell starting with a quote must not be
    treated as a CSV-quoted field.
    """
    return csv.reader(lines, delimiter='\t', quoting=csv.QUOTE_NONE)


def parse_mitab27(filepath: str) -> list:
    """
    Parse a PSI-MI TAB 2.7 file.
    Returns a list of ParsedInteraction objects.

    Raises:
        MITABParseError: if a data row has fewer than 15 columns.
    """
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        return _parse_rows(_tab_reader(f))


def parse_mitab27_from_string(content: str) -> list:
    """
    Parse PSI-MI TAB 2.7 from a raw string.
    Useful for API upload endpoints that receive file content directly.

    Raises:
        MITABParseError: if a data row has fewer than 15 columns.
    """
    return _parse_rows(_tab_reader(io.StringIO(content)))
# Every scalar column is modelled as an optional string.
_OptStr = Optional[str]


@dataclass
class Protein:
    """One row of the `protein` table in openpip.sql; every column defaults to None."""
    gene_name: _OptStr = None
    protein_name: _OptStr = None
    # External database identifiers
    uniprot_id: _OptStr = None
    ensembl_id: _OptStr = None
    entrez_id: _OptStr = None
    sequence: _OptStr = None
    description: _OptStr = None


@dataclass
class Organism:
    """One row of the `organism` table in openpip.sql; every column defaults to None."""
    taxid_id: _OptStr = None
    common_name: _OptStr = None
    scientific_name: _OptStr = None


@dataclass
class Dataset:
    """One row of the `dataset` table in openpip.sql; every column defaults to None."""
    name: _OptStr = None
    pubmed_id: _OptStr = None
    author: _OptStr = None
    year: _OptStr = None
    interaction_status: _OptStr = None
    description: _OptStr = None


@dataclass
class InteractionCategory:
    """One row of the `interaction_category` table in openpip.sql."""
    category_name: _OptStr = None


@dataclass
class ParsedInteraction:
    """
    Parser output for a single interaction. Corresponds to the
    `interaction` table and its junction tables: interaction_dataset,
    interaction_interaction_category, interaction_support_information.
    """
    # Interactors and organisms get a fresh empty object per instance.
    protein_a: Protein = field(default_factory=Protein)
    protein_b: Protein = field(default_factory=Protein)
    organism_a: Organism = field(default_factory=Organism)
    organism_b: Organism = field(default_factory=Organism)
    score: _OptStr = None
    category: Optional[InteractionCategory] = None
    dataset: Optional[Dataset] = None
    # Per-instance dicts; default_factory prevents sharing between instances.
    support_info: dict = field(default_factory=dict)
    raw: dict = field(default_factory=dict)
a/data-upload/requirements.txt b/data-upload/requirements.txt index 7cb6656b..9deb423b 100644 --- a/data-upload/requirements.txt +++ b/data-upload/requirements.txt @@ -1 +1,2 @@ selenium +pytest \ No newline at end of file diff --git a/data-upload/tests/__pycache__/test_parser.cpython-311-pytest-9.0.2.pyc b/data-upload/tests/__pycache__/test_parser.cpython-311-pytest-9.0.2.pyc new file mode 100644 index 00000000..72d76796 Binary files /dev/null and b/data-upload/tests/__pycache__/test_parser.cpython-311-pytest-9.0.2.pyc differ diff --git a/data-upload/tests/sample.mitab27.txt b/data-upload/tests/sample.mitab27.txt new file mode 100644 index 00000000..788a5a4b --- /dev/null +++ b/data-upload/tests/sample.mitab27.txt @@ -0,0 +1,4 @@ +#unique_id_a unique_id_b alt_id_a alt_id_b alias_a alias_b interaction_detection_method author publication_id taxid_a taxid_b interaction_type source_database interaction_id confidence_score complex_expansion bio_role_a bio_role_b exp_role_a exp_role_b interactor_type_a interactor_type_b xref_a xref_b xref_interaction annotation_a annotation_b annotation_interaction host_organism_taxid parameters creation_date update_date checksum_a checksum_b checksum_interaction negative features_a features_b stoichiometry_a stoichiometry_b participant_identification_a participant_identification_b +uniprotkb:P12345 uniprotkb:Q67890 ensembl:ENSP00000001 ensembl:ENSP00000002 uniprotkb:BRCA1_HUMAN(gene name) uniprotkb:TP53_HUMAN(gene name) psi-mi:"MI:0018"(two hybrid) Smith et al. (2020) pubmed:12345678 taxid:9606(human) taxid:9606(human) psi-mi:"MI:0915"(physical association) psi-mi:"MI:0469"(IntAct) intact:EBI-12345 author-score:0.85 - - - - - - - - - - - - - - - - - - - - - - - - - - - +uniprotkb:P98765 uniprotkb:Q11111 - - uniprotkb:MYC_HUMAN(gene name) uniprotkb:MAX_HUMAN(gene name) psi-mi:"MI:0096"(pull down) Jones et al. 
# ─────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────

SAMPLE_MITAB = os.path.join(os.path.dirname(__file__), "sample.mitab27.txt")


def _first_sample_row():
    """Parse the sample file and return its first interaction."""
    return parse_mitab27(SAMPLE_MITAB)[0]


def _write_csv(tmp_path, text):
    """Write text to tmp_path/test.csv as UTF-8 and return the path as str."""
    csv_file = tmp_path / "test.csv"
    # Explicit encoding: the parser reads UTF-8 regardless of platform default.
    csv_file.write_text(text, encoding="utf-8")
    return str(csv_file)


# ─────────────────────────────────────────────
# parse_field tests
# ─────────────────────────────────────────────

def test_parse_field_basic():
    """Standard db:value format"""
    result = parse_field("uniprotkb:P12345")
    assert len(result) == 1
    assert result[0].db == "uniprotkb"
    assert result[0].value == "P12345"
    assert result[0].description is None


def test_parse_field_with_description():
    """db:value(description) format"""
    result = parse_field("uniprotkb:P12345(BRCA1_HUMAN)")
    assert len(result) == 1
    assert result[0].db == "uniprotkb"
    assert result[0].value == "P12345"
    assert result[0].description == "BRCA1_HUMAN"


def test_parse_field_empty_dash():
    """- means no value in PSI-MI TAB"""
    assert parse_field("-") == []


def test_parse_field_empty_string():
    """Empty string also means no value"""
    assert parse_field("") == []


def test_parse_field_multiple_values():
    """Multiple values separated by pipe"""
    result = parse_field("uniprotkb:P12345|ensembl:ENSP00000001")
    assert len(result) == 2
    assert result[0].db == "uniprotkb"
    assert result[0].value == "P12345"
    assert result[1].db == "ensembl"
    assert result[1].value == "ENSP00000001"


def test_parse_field_psi_mi_quoted():
    """PSI-MI fields often have quoted values like psi-mi:"MI:0018"(two hybrid)"""
    result = parse_field('psi-mi:"MI:0018"(two hybrid)')
    assert len(result) == 1
    assert result[0].db == "psi-mi"
    assert "MI:0018" in result[0].value
    assert result[0].description == "two hybrid"


# ─────────────────────────────────────────────
# parse_mitab27 file tests
# ─────────────────────────────────────────────

def test_parse_file_row_count():
    """Sample file has 3 data rows (1 header/comment line ignored)"""
    assert len(parse_mitab27(SAMPLE_MITAB)) == 3


def test_parse_file_protein_a_uniprot():
    """protein_a.uniprot_id correctly extracted from unique_id_a"""
    assert _first_sample_row().protein_a.uniprot_id == "P12345"


def test_parse_file_protein_b_uniprot():
    """protein_b.uniprot_id correctly extracted from unique_id_b"""
    assert _first_sample_row().protein_b.uniprot_id == "Q67890"


def test_parse_file_protein_a_ensembl():
    """ensembl_id extracted from alt_id_a"""
    assert _first_sample_row().protein_a.ensembl_id == "ENSP00000001"


def test_parse_file_protein_b_ensembl():
    """ensembl_id extracted from alt_id_b"""
    assert _first_sample_row().protein_b.ensembl_id == "ENSP00000002"


def test_parse_file_gene_name_a():
    """gene_name extracted from the alias_a value typed (gene name)"""
    assert _first_sample_row().protein_a.gene_name == "BRCA1_HUMAN"


def test_parse_file_gene_name_b():
    """gene_name extracted from the alias_b value typed (gene name)"""
    assert _first_sample_row().protein_b.gene_name == "TP53_HUMAN"


def test_parse_file_score():
    """confidence score correctly parsed as string for DB storage"""
    assert _first_sample_row().score == "0.85"


def test_parse_file_score_missing():
    """missing score stored as None"""
    interactions = parse_mitab27(SAMPLE_MITAB)
    # third row has no score
    assert interactions[2].score is None


def test_parse_file_organism_taxid():
    """organism taxid correctly extracted"""
    assert _first_sample_row().organism_a.taxid_id == "9606"


def test_parse_file_interaction_category():
    """interaction_category.category_name populated from interaction_type"""
    category = _first_sample_row().category
    assert category is not None
    assert "physical association" in category.category_name.lower()


def test_parse_file_dataset_pubmed():
    """dataset.pubmed_id extracted from publication_id"""
    assert _first_sample_row().dataset.pubmed_id == "12345678"


def test_parse_file_dataset_author():
    """dataset.author extracted from author field"""
    assert "Smith" in _first_sample_row().dataset.author


def test_parse_file_returns_parsed_interaction():
    """Each row returns a ParsedInteraction instance"""
    for ix in parse_mitab27(SAMPLE_MITAB):
        assert isinstance(ix, ParsedInteraction)


def test_parse_file_protein_objects():
    """protein_a and protein_b are Protein instances"""
    first = _first_sample_row()
    assert isinstance(first.protein_a, Protein)
    assert isinstance(first.protein_b, Protein)


def test_parse_file_raw_preserved():
    """Raw row dict is preserved for debugging"""
    raw = _first_sample_row().raw
    assert "unique_id_a" in raw
    assert "unique_id_b" in raw


# ─────────────────────────────────────────────
# CSV parser tests
# ─────────────────────────────────────────────

def test_csv_parser_basic(tmp_path):
    """Basic CSV with required columns parses correctly"""
    path = _write_csv(
        tmp_path,
        "protein_a,protein_b,interaction_type,score,publication\n"
        "P12345,Q67890,physical association,0.9,12345678\n"
        "P98765,Q11111,direct interaction,0.5,87654321\n"
    )
    assert len(parse_csv(path)) == 2


def test_csv_parser_protein_a(tmp_path):
    """protein_a uniprot_id correctly set"""
    path = _write_csv(tmp_path, "protein_a,protein_b\nP12345,Q67890\n")
    assert parse_csv(path)[0].protein_a.uniprot_id == "P12345"


def test_csv_parser_protein_b(tmp_path):
    """protein_b uniprot_id correctly set"""
    path = _write_csv(tmp_path, "protein_a,protein_b\nP12345,Q67890\n")
    assert parse_csv(path)[0].protein_b.uniprot_id == "Q67890"


def test_csv_parser_score(tmp_path):
    """score correctly extracted"""
    path = _write_csv(tmp_path, "protein_a,protein_b,score\nP12345,Q67890,0.95\n")
    assert parse_csv(path)[0].score == "0.95"


def test_csv_parser_missing_required_columns(tmp_path):
    """Missing protein_a or protein_b raises ValueError"""
    path = _write_csv(tmp_path, "gene,score\nBRCA1,0.9\n")
    try:
        parse_csv(path)
    except ValueError as e:
        assert "protein_a" in str(e) or "protein_b" in str(e)
    else:
        assert False, "Should have raised ValueError"


def test_csv_parser_returns_parsed_interaction(tmp_path):
    """CSV parser returns ParsedInteraction instances"""
    path = _write_csv(tmp_path, "protein_a,protein_b\nP12345,Q67890\n")
    assert isinstance(parse_csv(path)[0], ParsedInteraction)


def test_csv_parser_pubmed(tmp_path):
    """publication column maps to dataset.pubmed_id"""
    path = _write_csv(
        tmp_path,
        "protein_a,protein_b,publication\nP12345,Q67890,12345678\n"
    )
    assert parse_csv(path)[0].dataset.pubmed_id == "12345678"


# ─────────────────────────────────────────────
# Validator tests
# ─────────────────────────────────────────────

def test_validator_valid_file():
    """All rows in sample file pass validation"""
    results = validate_file(parse_mitab27(SAMPLE_MITAB))
    assert all(r.is_valid for r in results)


def test_validator_missing_protein_a():
    """Row with no protein_a identifier fails validation"""
    from validator import validate_interaction
    ix = ParsedInteraction(
        protein_a=Protein(),  # no uniprot_id, no gene_name
        protein_b=Protein(uniprot_id="Q67890")
    )
    result = validate_interaction(ix, line_num=1)
    assert not result.is_valid
    assert len(result.errors) > 0


def test_validator_missing_protein_b():
    """Row with no protein_b identifier fails validation"""
    from validator import validate_interaction
    ix = ParsedInteraction(
        protein_a=Protein(uniprot_id="P12345"),
        protein_b=Protein()
    )
    result = validate_interaction(ix, line_num=1)
    assert not result.is_valid


def test_validator_warns_no_uniprot():
    """Row with non-UniProt identifier generates warning"""
    from validator import validate_interaction
    ix = ParsedInteraction(
        protein_a=Protein(gene_name="BRCA1"),  # has gene_name but no uniprot_id
        protein_b=Protein(gene_name="TP53")
    )
    result = validate_interaction(ix, line_num=1)
    assert result.is_valid  # not an error, just a warning
    assert len(result.warnings) > 0


def test_validator_returns_list():
    """validate_file returns a list of results"""
    interactions = parse_mitab27(SAMPLE_MITAB)
    results = validate_file(interactions)
    assert isinstance(results, list)
    assert len(results) == len(interactions)


# ─────────────────────────────────────────────
# Run all tests manually if needed
# ─────────────────────────────────────────────

if __name__ == "__main__":
    import inspect
    import tempfile
    from pathlib import Path

    # Discover tests by name so the manual run cannot drift out of date,
    # and give tests that ask for tmp_path a throwaway directory.
    tests = sorted(
        (name, fn) for name, fn in globals().items()
        if name.startswith("test_") and callable(fn)
    )
    for name, fn in tests:
        if "tmp_path" in inspect.signature(fn).parameters:
            with tempfile.TemporaryDirectory() as tmp:
                fn(Path(tmp))
        else:
            fn()

    print(f"\nAll {len(tests)} manual tests passed.")
@dataclass
class ValidationResult:
    """Outcome of validating one interaction."""
    is_valid: bool   # True when errors is empty
    errors: list     # messages that block DB insertion
    warnings: list   # messages that are logged but allowed


def _has_identifier(protein) -> bool:
    """Return True when the protein has a uniprot_id, gene_name or ensembl_id."""
    return bool(
        protein.uniprot_id or protein.gene_name or protein.ensembl_id
    )


def _is_finite_number(value) -> bool:
    """Return True when value converts to a float that is neither NaN nor infinite."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return False
    # NaN is the only float unequal to itself; float() also accepts "inf".
    if number != number:
        return False
    return number not in (float("inf"), float("-inf"))


def validate_interaction(ix: "ParsedInteraction", line_num: int) -> ValidationResult:
    """
    Validate a single ParsedInteraction.
    Errors block DB insertion. Warnings are logged but allowed.

    line_num is only used as the "Line N" prefix of every message.
    """
    errors = []
    warnings = []

    for label, protein in (("A", ix.protein_a), ("B", ix.protein_b)):
        # At least one identifier required for each interactor
        if not _has_identifier(protein):
            errors.append(
                f"Line {line_num}: Interactor {label} has no identifier "
                f"(uniprot_id, gene_name, or ensembl_id required)"
            )
        # Warn if no UniProt ID — UniProt REST annotation will be skipped
        elif not protein.uniprot_id:
            warnings.append(
                f"Line {line_num}: No UniProt ID for interactor {label} "
                f"— UniProt annotation fetch will be skipped"
            )

    # Warn if score present but not a usable number
    if ix.score is not None and not _is_finite_number(ix.score):
        warnings.append(
            f"Line {line_num}: Score '{ix.score}' is not numeric"
        )

    return ValidationResult(
        is_valid=not errors,
        errors=errors,
        warnings=warnings,
    )


def validate_file(interactions: list) -> list:
    """
    Validate all interactions from a parsed file.
    Results are in input order; the reported line number is the
    1-based position in the list, not the line in the source file.
    """
    return [
        validate_interaction(ix, position)
        for position, ix in enumerate(interactions, start=1)
    ]