frapercan · frapercan · Mar 16, 2026 · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026
diff --git a/.gitignore b/.gitignore
@@ -35,3 +35,7 @@ CLAUDE.md
 # Local data
 static/
 storage/
+
+# Large embedding caches and test artifacts
+data/ref_cache/
+apps/web/test-results/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute
+this software, either in source code form or as a compiled binary, for any
+purpose, commercial or non-commercial, and by any means.
+
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <https://unlicense.org/>
diff --git a/README.md b/README.md
@@ -39,7 +39,9 @@ PROTEA provides a unified backend for ingesting protein data from UniProt, compu
 
 ## Getting started
 
-### Docker (recommended)
+### Docker
+
+> **Not yet validated.** The Docker configuration exists but has not been tested end-to-end. It will likely need adjustments before it works out of the box — contributions welcome.
 
 ```bash
 git clone https://github.com/frapercan/PROTEA.git
@@ -52,7 +54,7 @@ Services available at:
 - API: http://localhost:8000
 - RabbitMQ management: http://localhost:15672 (guest/guest)
 
-### From source
+### From source (recommended)
 
 **Requirements:** Python 3.12, PostgreSQL 16 + pgvector, RabbitMQ 3.x
 
@@ -118,4 +120,4 @@ PROTEA is the natural evolution of two prior systems developed at **Ana Rojas' L
 
 PROTEA was designed to unify and supersede both systems under a single, maintainable codebase — removing the tight coupling between infrastructure, orchestration, and domain logic that accumulated across those projects.
 
-The evaluation pipeline and scoring methodology are directly informed by our participation in **CAFA6** (Critical Assessment of protein Function Annotation, 6th edition). The competition provided real-world benchmarking experience that shaped PROTEA's prediction and evaluation architecture, including the integration of [cafaeval](https://github.com/claradepaolis/CAFA-evaluator-PK) for standardised GO term prediction assessment.
+The evaluation pipeline and scoring methodology are directly informed by following the **CAFA** (Critical Assessment of protein Function Annotation) competition series. This benchmarking framework shaped PROTEA's prediction and evaluation architecture, including the integration of [cafaeval](https://github.com/claradepaolis/CAFA-evaluator-PK) for standardised GO term prediction assessment.
diff --git a/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py b/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py
@@ -0,0 +1,32 @@
+"""add_composite_index_pga_set_accession
+
+Revision ID: 489835ed5b31
+Revises: 7737a352d4fe
+Create Date: 2026-03-15 11:17:30.865922
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '489835ed5b31'
+down_revision: Union[str, Sequence[str], None] = '7737a352d4fe'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Create composite index ix_pga_set_accession on protein_go_annotation (annotation_set_id, protein_accession)."""
+    op.create_index(
+        "ix_pga_set_accession",
+        "protein_go_annotation",
+        ["annotation_set_id", "protein_accession"],
+    )
+
+
+def downgrade() -> None:
+    """Drop the ix_pga_set_accession index from protein_go_annotation."""
+    op.drop_index("ix_pga_set_accession", table_name="protein_go_annotation")
diff --git a/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py b/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py
@@ -0,0 +1,38 @@
+"""add scoring_config_id to evaluation_result
+
+Revision ID: 513355a1d933
+Revises: 489835ed5b31
+Create Date: 2026-03-15 12:37:19.930750
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '513355a1d933'
+down_revision: Union[str, Sequence[str], None] = '489835ed5b31'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Add nullable evaluation_result.scoring_config_id with an index and a SET NULL foreign key to scoring_config."""
+    # The FK is named explicitly (PostgreSQL's default <table>_<column>_fkey) so downgrade() can drop it by name.
+    op.add_column('evaluation_result', sa.Column('scoring_config_id', sa.UUID(), nullable=True))
+    op.create_index(op.f('ix_evaluation_result_scoring_config_id'), 'evaluation_result', ['scoring_config_id'], unique=False)
+    op.create_foreign_key(op.f('evaluation_result_scoring_config_id_fkey'), 'evaluation_result', 'scoring_config', ['scoring_config_id'], ['id'], ondelete='SET NULL')
+    op.drop_index(op.f('ix_pga_set_accession'), table_name='protein_go_annotation')
+    # NOTE(review): the drop_index above removes the index created by revision 489835ed5b31 — confirm this is intended.
+
+
+def downgrade() -> None:
+    """Reverse upgrade(): recreate ix_pga_set_accession, then remove the foreign key, index and column."""
+    # drop_constraint needs a real name; None cannot be rendered as DROP CONSTRAINT and made this downgrade fail.
+    op.create_index(op.f('ix_pga_set_accession'), 'protein_go_annotation', ['annotation_set_id', 'protein_accession'], unique=False)
+    op.drop_constraint(op.f('evaluation_result_scoring_config_id_fkey'), 'evaluation_result', type_='foreignkey')
+    op.drop_index(op.f('ix_evaluation_result_scoring_config_id'), table_name='evaluation_result')
+    op.drop_column('evaluation_result', 'scoring_config_id')
+    # The column is dropped last, after the constraint and index defined on it.
diff --git a/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py b/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py
@@ -0,0 +1,41 @@
+"""add_ia_url_to_ontology_snapshot
+
+Revision ID: 54e758c210c8
+Revises: c1d2e3f4a5b6
+Create Date: 2026-03-16 11:42:10.636169
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = '54e758c210c8'
+down_revision: Union[str, Sequence[str], None] = 'c1d2e3f4a5b6'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Add nullable ontology_snapshot.ia_url and clear the comment on scoring_config.evidence_weights."""
+    # ia_url is nullable, so existing ontology_snapshot rows need no backfill.
+    op.add_column('ontology_snapshot', sa.Column('ia_url', sa.String(), nullable=True, comment='URL of the Information Accretion TSV for this ontology release (two columns: go_id, ia_value). Used by run_cafa_evaluation to weight GO terms by information content. NULL means uniform IC=1.'))
+    op.alter_column('scoring_config', 'evidence_weights',
+               existing_type=postgresql.JSONB(astext_type=sa.Text()),
+               comment=None,
+               existing_comment='Optional per-GO-evidence-code quality multipliers in [0, 1]. NULL means use the system defaults defined in DEFAULT_EVIDENCE_WEIGHTS. Partial dicts are allowed; absent codes fall back to the system table.',
+               existing_nullable=True)
+    # NOTE(review): the alter_column above clears the column comment added in revision c1d2e3f4a5b6 — confirm this is intended.
+
+
+def downgrade() -> None:
+    """Restore the comment on scoring_config.evidence_weights and drop ontology_snapshot.ia_url."""
+    # Re-applies the comment text that upgrade() cleared.
+    op.alter_column('scoring_config', 'evidence_weights',
+               existing_type=postgresql.JSONB(astext_type=sa.Text()),
+               comment='Optional per-GO-evidence-code quality multipliers in [0, 1]. NULL means use the system defaults defined in DEFAULT_EVIDENCE_WEIGHTS. Partial dicts are allowed; absent codes fall back to the system table.',
+               existing_nullable=True)
+    op.drop_column('ontology_snapshot', 'ia_url')
+    # Dropping ia_url discards any URLs stored in it.
diff --git a/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py b/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py
@@ -0,0 +1,28 @@
+"""merge_scoring_config_branch
+
+Revision ID: 7737a352d4fe
+Revises: 47de89cf6fec, b1c2d3e4f5a6
+Create Date: 2026-03-15 10:11:56.507967
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '7737a352d4fe'
+down_revision: Union[str, Sequence[str], None] = ('47de89cf6fec', 'b1c2d3e4f5a6')
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """No-op: this revision only merges heads 47de89cf6fec and b1c2d3e4f5a6."""
+    pass
+
+
+def downgrade() -> None:
+    """No-op: the merge revision makes no schema changes to undo."""
+    pass
diff --git a/alembic/versions/7c19ca08d5d4_add_support_entry_table.py b/alembic/versions/7c19ca08d5d4_add_support_entry_table.py
@@ -0,0 +1,37 @@
+"""add support_entry table
+
+Revision ID: 7c19ca08d5d4
+Revises: 513355a1d933
+Create Date: 2026-03-15 12:42:43.832417
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '7c19ca08d5d4'
+down_revision: Union[str, Sequence[str], None] = '513355a1d933'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Create the support_entry table (UUID primary key, optional comment, created_at)."""
+    # created_at is timezone-aware and filled by the database through server_default now().
+    op.create_table('support_entry',
+    sa.Column('id', sa.UUID(), nullable=False),
+    sa.Column('comment', sa.Text(), nullable=True),
+    sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False),
+    sa.PrimaryKeyConstraint('id')
+    )
+    # No indexes or foreign keys are created for this table in this revision.
+
+
+def downgrade() -> None:
+    """Drop the support_entry table."""
+    # Dropping the table discards any rows stored in it.
+    op.drop_table('support_entry')
+    # No other objects were created by upgrade(), so nothing else needs removing.
diff --git a/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py b/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py
@@ -0,0 +1,38 @@
+"""add scoring_config table
+
+Revision ID: b1c2d3e4f5a6
+Revises: a7b8c9d0e1f2
+Create Date: 2026-03-15
+"""
+from __future__ import annotations
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+revision = "b1c2d3e4f5a6"
+down_revision = "a7b8c9d0e1f2"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:  # Create the scoring_config table; formula defaults to "linear" and created_at to now() server-side.
+    op.create_table(
+        "scoring_config",
+        sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column("name", sa.String(255), nullable=False),
+        sa.Column("formula", sa.String(50), nullable=False, server_default="linear"),
+        sa.Column("weights", postgresql.JSONB(astext_type=sa.Text()), nullable=False),
+        sa.Column("description", sa.Text(), nullable=True),
+        sa.Column(
+            "created_at",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            nullable=False,
+        ),
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+
+def downgrade() -> None:  # Drop the scoring_config table together with all rows stored in it.
+    op.drop_table("scoring_config")
diff --git a/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py b/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py
@@ -0,0 +1,49 @@
+"""Add evidence_weights column to scoring_config.
+
+Revision ID: c1d2e3f4a5b6
+Revises: 7c19ca08d5d4
+Create Date: 2026-03-16
+
+Motivation
+----------
+``ScoringConfig`` previously hard-coded the per-evidence-code quality
+weights inside the Python scoring engine, making them invisible to users
+and impossible to customise without a code change.
+
+This migration adds an optional ``evidence_weights`` JSONB column that
+stores per-code overrides at the config level.  Existing rows receive
+``NULL``, which is interpreted by the engine as "use system defaults"
+(:data:`protea.infrastructure.orm.models.embedding.scoring_config.DEFAULT_EVIDENCE_WEIGHTS`).
+The change is therefore fully backwards-compatible with all existing
+``ScoringConfig`` rows.
+"""
+from __future__ import annotations
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects import postgresql
+
+revision = "c1d2e3f4a5b6"
+down_revision = "7c19ca08d5d4"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:  # Add nullable JSONB column scoring_config.evidence_weights; existing rows receive NULL.
+    op.add_column(
+        "scoring_config",
+        sa.Column(
+            "evidence_weights",
+            postgresql.JSONB(astext_type=sa.Text()),
+            nullable=True,
+            comment=(
+                "Optional per-GO-evidence-code quality multipliers in [0, 1]. "
+                "NULL means use the system defaults defined in DEFAULT_EVIDENCE_WEIGHTS. "
+                "Partial dicts are allowed; absent codes fall back to the system table."
+            ),
+        ),
+    )
+
+
+def downgrade() -> None:  # Drop scoring_config.evidence_weights, discarding any stored per-code overrides.
+    op.drop_column("scoring_config", "evidence_weights")