From 4c531582fbc4ed2691555c4c14cb25405dbe85b4 Mon Sep 17 00:00:00 2001 From: frapercan Date: Sat, 14 Mar 2026 22:29:49 +0100 Subject: [PATCH 01/17] chore: commit pending scripts, migrations and favicon --- ...dd_feature_engineering_to_go_prediction.py | 66 ++++ .../f1a2b3c4d5e6_add_go_term_relationship.py | 39 +++ apps/web/app/favicon.ico | Bin 25931 -> 781 bytes scripts/manage.sh | 321 ++++++++++++++++++ scripts/setup_vast.sh | 160 +++++++++ scripts/sync_db_vast.sh | 127 +++++++ 6 files changed, 713 insertions(+) create mode 100644 alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py create mode 100644 alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py create mode 100755 scripts/manage.sh create mode 100755 scripts/setup_vast.sh create mode 100755 scripts/sync_db_vast.sh diff --git a/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py b/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py new file mode 100644 index 0000000..7d88cb1 --- /dev/null +++ b/alembic/versions/a7b8c9d0e1f2_add_feature_engineering_to_go_prediction.py @@ -0,0 +1,66 @@ +"""add feature engineering columns to go_prediction + +Revision ID: a7b8c9d0e1f2 +Revises: f1a2b3c4d5e6 +Create Date: 2026-03-11 00:00:00.000000 +""" +from __future__ import annotations + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +revision: str = "a7b8c9d0e1f2" +down_revision: Union[str, Sequence[str], None] = "f1a2b3c4d5e6" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Alignment — Needleman–Wunsch (global) + op.add_column("go_prediction", sa.Column("identity_nw", sa.Float(), nullable=True)) + op.add_column("go_prediction", sa.Column("similarity_nw", sa.Float(), nullable=True)) + op.add_column("go_prediction", sa.Column("alignment_score_nw", sa.Float(), nullable=True)) + op.add_column("go_prediction", 
sa.Column("gaps_pct_nw", sa.Float(), nullable=True)) + op.add_column("go_prediction", sa.Column("alignment_length_nw", sa.Float(), nullable=True)) + + # Alignment — Smith–Waterman (local) + op.add_column("go_prediction", sa.Column("identity_sw", sa.Float(), nullable=True)) + op.add_column("go_prediction", sa.Column("similarity_sw", sa.Float(), nullable=True)) + op.add_column("go_prediction", sa.Column("alignment_score_sw", sa.Float(), nullable=True)) + op.add_column("go_prediction", sa.Column("gaps_pct_sw", sa.Float(), nullable=True)) + op.add_column("go_prediction", sa.Column("alignment_length_sw", sa.Float(), nullable=True)) + + # Sequence lengths + op.add_column("go_prediction", sa.Column("length_query", sa.Integer(), nullable=True)) + op.add_column("go_prediction", sa.Column("length_ref", sa.Integer(), nullable=True)) + + # Taxonomy + op.add_column("go_prediction", sa.Column("query_taxonomy_id", sa.Integer(), nullable=True)) + op.add_column("go_prediction", sa.Column("ref_taxonomy_id", sa.Integer(), nullable=True)) + op.add_column("go_prediction", sa.Column("taxonomic_lca", sa.Integer(), nullable=True)) + op.add_column("go_prediction", sa.Column("taxonomic_distance", sa.Integer(), nullable=True)) + op.add_column("go_prediction", sa.Column("taxonomic_common_ancestors", sa.Integer(), nullable=True)) + op.add_column("go_prediction", sa.Column("taxonomic_relation", sa.String(length=20), nullable=True)) + + +def downgrade() -> None: + op.drop_column("go_prediction", "taxonomic_relation") + op.drop_column("go_prediction", "taxonomic_common_ancestors") + op.drop_column("go_prediction", "taxonomic_distance") + op.drop_column("go_prediction", "taxonomic_lca") + op.drop_column("go_prediction", "ref_taxonomy_id") + op.drop_column("go_prediction", "query_taxonomy_id") + op.drop_column("go_prediction", "length_ref") + op.drop_column("go_prediction", "length_query") + op.drop_column("go_prediction", "alignment_length_sw") + op.drop_column("go_prediction", "gaps_pct_sw") + 
op.drop_column("go_prediction", "alignment_score_sw") + op.drop_column("go_prediction", "similarity_sw") + op.drop_column("go_prediction", "identity_sw") + op.drop_column("go_prediction", "alignment_length_nw") + op.drop_column("go_prediction", "gaps_pct_nw") + op.drop_column("go_prediction", "alignment_score_nw") + op.drop_column("go_prediction", "similarity_nw") + op.drop_column("go_prediction", "identity_nw") diff --git a/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py b/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py new file mode 100644 index 0000000..a794190 --- /dev/null +++ b/alembic/versions/f1a2b3c4d5e6_add_go_term_relationship.py @@ -0,0 +1,39 @@ +"""add go_term_relationship table + +Revision ID: f1a2b3c4d5e6 +Revises: e5f6a7b8c9d0 +Create Date: 2026-03-10 00:00:00.000000 +""" +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + +revision = "f1a2b3c4d5e6" +down_revision = "e5f6a7b8c9d0" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "go_term_relationship", + sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False), + sa.Column("child_go_term_id", sa.BigInteger(), nullable=False), + sa.Column("parent_go_term_id", sa.BigInteger(), nullable=False), + sa.Column("relation_type", sa.String(40), nullable=False), + sa.Column("ontology_snapshot_id", sa.dialects.postgresql.UUID(as_uuid=True), nullable=False), + sa.ForeignKeyConstraint(["child_go_term_id"], ["go_term.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["parent_go_term_id"], ["go_term.id"], ondelete="CASCADE"), + sa.ForeignKeyConstraint(["ontology_snapshot_id"], ["ontology_snapshot.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("child_go_term_id", "parent_go_term_id", "relation_type", + name="uq_go_term_relationship"), + ) + op.create_index("ix_go_term_relationship_child", "go_term_relationship", ["child_go_term_id"]) + 
op.create_index("ix_go_term_relationship_parent", "go_term_relationship", ["parent_go_term_id"]) + op.create_index("ix_go_term_relationship_snapshot", "go_term_relationship", ["ontology_snapshot_id"]) + + +def downgrade() -> None: + op.drop_table("go_term_relationship") diff --git a/apps/web/app/favicon.ico b/apps/web/app/favicon.ico index 718d6fea4835ec2d246af9800eddb7ffb276240c..0fc9c3001b643f94998231dc1011a7f99aceac29 100644 GIT binary patch literal 781 zcmV+o1M>U;0096201yxW0000W0QUj_02TlM0EtjeM-2)Z3IG5A4M|8uQUCw|5C8xG z5C{eU001BJ|6u?C0=`K^K~#90ZIew)DsdRa&wrAbplM|eg=C8gC+OD7%62UjtQHYs z2(@pF3Ztpd;Kyk1DrjROL@ywbr3pqih@hr)p#_1jbME4j)a^d-z{BDEp4B<9EQ_sH ziFFu3va<5+^Wx$n3xxv1;V?6q46m=R84ib;%jFmh2ETbjp%8`B>EzSX z698ahVggG`OK=Sg;Me##4i67uSr+E!=aJ23@%Z?Ncs!0wCIbNI?d^r__4Nfp2w0W{ zMNzP|wFSd4Pv-u3W-HxN9BTPd zj}M%joB$*ljYbK8v$L}#$x5ZdR4PTM)5-q+ex_0>mdh2Aq|fIg0B&q-kR;J;G+D3L zxv;Q60QC8M%;j=4O_RE=Q`dEx<_q)rd^_%eK!CMs?QfL7mT7NquN~Rp;bD3_9(p_; zj*N`7b;B^em+9X+XR}$Zudj1(aFDVrQ%i@8hT6>)&Gu{h#Oeyszu?xtw#Zb1mO{pgX9699l+Qppw7jXaYf~-84xW z)w4x8?=youko|}Vr~(D$UXIbiXABHh`p1?nn8Po~fxRJv}|0e(BPs|G`(TT%kKVJAdg5*Z|x0leQq0 zkdUBvb#>9F()jo|T~kx@OM8$9wzs~t2l;K=woNssA3l6|sx2r3+kdfVW@e^8e*E}v zA1y5{bRi+3Z`uD3{F7LgFJDdvm;nJilkzDku>BwXH(8ItVCXk*-lSJnR?-2UN%hJ){&rlvg`CDTj z)Bzo!3v7Ou#83zEDEFcKt(f1E0~=rqeEbTnMvWR#{+9pg%7G8y>u1OVRUSoox-ovF z2Ydma(;=YuBY(eI|04{hXzZD6_f(v~H;C~y5=DhAC{MMS>2fm~1H_t2$56pc$NH8( z5bH|<)71dV-_oCHIrzrT`2s-5w_+2CM0$95I6X8p^r!gHp+j_gd;9O<1~CEQQGS8) zS9Qh3#p&JM-G8rHekNmKVewU;pJRcTAog68KYo^dRo}(M>36U4Us zfgYWSiHZL3;lpWT=zNAW>Dh#mB!_@Lg%$ms8N-;aPqMn+C2HqZgz&9~Eu z4|Kp<`$q)Uw1R?y(~S>ePdonHxpV1#eSP1B;Ogo+-Pk}6#0GsZZ5!||ev2MGdh}_m z{DeR7?0-1^zVs&`AV6Vt;r3`I`OI_wgs*w=eO%_#7Kepl{B@xiyCANc(l zzIyd4y|c6PXWq9-|KM8(zIk8LPk(>a)zyFWjhT!$HJ$qX1vo@d25W<fvZQ2zUz5WRc(UnFMKHwe1| zWmlB1qdbiA(C0jmnV<}GfbKtmcu^2*P^O?MBLZKt|As~ge8&AAO~2K@zbXelK|4T<{|y4`raF{=72kC2Kn(L4YyenWgrPiv z@^mr$t{#X5VuIMeL!7Ab6_kG$&#&5p*Z{+?5U|TZ`B!7llpVmp@skYz&n^8QfPJzL 
z0G6K_OJM9x+Wu2gfN45phANGt{7=C>i34CV{Xqlx(fWpeAoj^N0Biu`w+MVcCUyU* zDZuzO0>4Z6fbu^T_arWW5n!E45vX8N=bxTVeFoep_G#VmNlQzAI_KTIc{6>c+04vr zx@W}zE5JNSU>!THJ{J=cqjz+4{L4A{Ob9$ZJ*S1?Ggg3klFp!+Y1@K+pK1DqI|_gq z5ZDXVpge8-cs!o|;K73#YXZ3AShj50wBvuq3NTOZ`M&qtjj#GOFfgExjg8Gn8>Vq5 z`85n+9|!iLCZF5$HJ$Iu($dm?8~-ofu}tEc+-pyke=3!im#6pk_Wo8IA|fJwD&~~F zc16osQ)EBo58U7XDuMexaPRjU@h8tXe%S{fA0NH3vGJFhuyyO!Uyl2^&EOpX{9As0 zWj+P>{@}jxH)8|r;2HdupP!vie{sJ28b&bo!8`D^x}TE$%zXNb^X1p@0PJ86`dZyj z%ce7*{^oo+6%&~I!8hQy-vQ7E)0t0ybH4l%KltWOo~8cO`T=157JqL(oq_rC%ea&4 z2NcTJe-HgFjNg-gZ$6!Y`SMHrlj}Etf7?r!zQTPPSv}{so2e>Fjs1{gzk~LGeesX%r(Lh6rbhSo_n)@@G-FTQy93;l#E)hgP@d_SGvyCp0~o(Y;Ee8{ zdVUDbHm5`2taPUOY^MAGOw*>=s7=Gst=D+p+2yON!0%Hk` zz5mAhyT4lS*T3LS^WSxUy86q&GnoHxzQ6vm8)VS}_zuqG?+3td68_x;etQAdu@sc6 zQJ&5|4(I?~3d-QOAODHpZ=hlSg(lBZ!JZWCtHHSj`0Wh93-Uk)_S%zsJ~aD>{`A0~ z9{AG(e|q3g5B%wYKRxiL2Y$8(4w6bzchKuloQW#e&S3n+P- z8!ds-%f;TJ1>)v)##>gd{PdS2Oc3VaR`fr=`O8QIO(6(N!A?pr5C#6fc~Ge@N%Vvu zaoAX2&(a6eWy_q&UwOhU)|P3J0Qc%OdhzW=F4D|pt0E4osw;%<%Dn58hAWD^XnZD= z>9~H(3bmLtxpF?a7su6J7M*x1By7YSUbxGi)Ot0P77`}P3{)&5Un{KD?`-e?r21!4vTTnN(4Y6Lin?UkSM z`MXCTC1@4A4~mvz%Rh2&EwY))LeoT=*`tMoqcEXI>TZU9WTP#l?uFv+@Dn~b(>xh2 z;>B?;Tz2SR&KVb>vGiBSB`@U7VIWFSo=LDSb9F{GF^DbmWAfpms8Sx9OX4CnBJca3 zlj9(x!dIjN?OG1X4l*imJNvRCk}F%!?SOfiOq5y^mZW)jFL@a|r-@d#f7 z2gmU8L3IZq0ynIws=}~m^#@&C%J6QFo~Mo4V`>v7MI-_!EBMMtb%_M&kvAaN)@ZVw z+`toz&WG#HkWDjnZE!6nk{e-oFdL^$YnbOCN}JC&{$#$O27@|Tn-skXr)2ml2~O!5 zX+gYoxhoc7qoU?C^3~&!U?kRFtnSEecWuH0B0OvLodgUAi}8p1 zrO6RSXHH}DMc$&|?D004DiOVMHV8kXCP@7NKB zgaZq^^O<7PoKEp72kby@W0Z!Y*Ay{&vfg#C&gG@YVR9g?FEocMUi1gSN$+V+ayF45{a zuDZDTN}mS|;BO%gEf}pjBfN2-gIrU#G5~cucA;dokXW89%>AyXJJI z9X4UlIWA|ZYHgbI z5?oFk@A=Ik7lrEQPDH!H+b`7_Y~aDb_qa=B2^Y&Ow41cU=4WDd40dp5(QS-WMN-=Y z9g;6_-JdNU;|6cPwf$ak*aJIcwL@1n$#l~zi{c{EW?T;DaW*E8DYq?Umtz{nJ&w-M zEMyTDrC&9K$d|kZe2#ws6)L=7K+{ zQw{XnV6UC$6-rW0emqm8wJoeZK)wJIcV?dST}Z;G0Arq{dVDu0&4kd%N!3F1*;*pW zR&qUiFzK=@44#QGw7k1`3t_d8&*kBV->O##t|tonFc2YWrL7_eqg+=+k;!F-`^b8> 
z#KWCE8%u4k@EprxqiV$VmmtiWxDLgnGu$Vs<8rppV5EajBXL4nyyZM$SWVm!wnCj-B!Wjqj5-5dNXukI2$$|Bu3Lrw}z65Lc=1G z^-#WuQOj$hwNGG?*CM_TO8Bg-1+qc>J7k5c51U8g?ZU5n?HYor;~JIjoWH-G>AoUP ztrWWLbRNqIjW#RT*WqZgPJXU7C)VaW5}MiijYbABmzoru6EmQ*N8cVK7a3|aOB#O& zBl8JY2WKfmj;h#Q!pN%9o@VNLv{OUL?rixHwOZuvX7{IJ{(EdPpuVFoQqIOa7giLVkBOKL@^smUA!tZ1CKRK}#SSM)iQHk)*R~?M!qkCruaS!#oIL1c z?J;U~&FfH#*98^G?i}pA{ z9Jg36t4=%6mhY(quYq*vSxptes9qy|7xSlH?G=S@>u>Ebe;|LVhs~@+06N<4CViBk zUiY$thvX;>Tby6z9Y1edAMQaiH zm^r3v#$Q#2T=X>bsY#D%s!bhs^M9PMAcHbCc0FMHV{u-dwlL;a1eJ63v5U*?Q_8JO zT#50!RD619#j_Uf))0ooADz~*9&lN!bBDRUgE>Vud-i5ck%vT=r^yD*^?Mp@Q^v+V zG#-?gKlr}Eeqifb{|So?HM&g91P8|av8hQoCmQXkd?7wIJwb z_^v8bbg`SAn{I*4bH$u(RZ6*xUhuA~hc=8czK8SHEKTzSxgbwi~9(OqJB&gwb^l4+m`k*Q;_?>Y-APi1{k zAHQ)P)G)f|AyjSgcCFps)Fh6Bca*Xznq36!pV6Az&m{O8$wGFD? zY&O*3*J0;_EqM#jh6^gMQKpXV?#1?>$ml1xvh8nSN>-?H=V;nJIwB07YX$e6vLxH( zqYwQ>qxwR(i4f)DLd)-$P>T-no_c!LsN@)8`e;W@)-Hj0>nJ-}Kla4-ZdPJzI&Mce zv)V_j;(3ERN3_@I$N<^|4Lf`B;8n+bX@bHbcZTopEmDI*Jfl)-pFDvo6svPRoo@(x z);_{lY<;);XzT`dBFpRmGrr}z5u1=pC^S-{ce6iXQlLGcItwJ^mZx{m$&DA_oEZ)B{_bYPq-HA zcH8WGoBG(aBU_j)vEy+_71T34@4dmSg!|M8Vf92Zj6WH7Q7t#OHQqWgFE3ARt+%!T z?oLovLVlnf?2c7pTc)~cc^($_8nyKwsN`RA-23ed3sdj(ys%pjjM+9JrctL;dy8a( z@en&CQmnV(()bu|Y%G1-4a(6x{aLytn$T-;(&{QIJB9vMox11U-1HpD@d(QkaJdEb zG{)+6Dos_L+O3NpWo^=gR?evp|CqEG?L&Ut#D*KLaRFOgOEK(Kq1@!EGcTfo+%A&I z=dLbB+d$u{sh?u)xP{PF8L%;YPPW53+@{>5W=Jt#wQpN;0_HYdw1{ksf_XhO4#2F= zyPx6Lx2<92L-;L5PD`zn6zwIH`Jk($?Qw({erA$^bC;q33hv!d!>%wRhj# zal^hk+WGNg;rJtb-EB(?czvOM=H7dl=vblBwAv>}%1@{}mnpUznfq1cE^sgsL0*4I zJ##!*B?=vI_OEVis5o+_IwMIRrpQyT_Sq~ZU%oY7c5JMIADzpD!Upz9h@iWg_>>~j zOLS;wp^i$-E?4<_cp?RiS%Rd?i;f*mOz=~(&3lo<=@(nR!_Rqiprh@weZlL!t#NCc zO!QTcInq|%#>OVgobj{~ixEUec`E25zJ~*DofsQdzIa@5^nOXj2T;8O`l--(QyU^$t?TGY^7#&FQ+2SS3B#qK*k3`ye?8jUYSajE5iBbJls75CCc(m3dk{t?- zopcER9{Z?TC)mk~gpi^kbbu>b-+a{m#8-y2^p$ka4n60w;Sc2}HMf<8JUvhCL0B&Btk)T`ctE$*qNW8L$`7!r^9T+>=<=2qaq-;ll2{`{Rg zc5a0ZUI$oG&j-qVOuKa=*v4aY#IsoM+1|c4Z)<}lEDvy;5huB@1RJPquU2U*U-;gu 
z=En2m+qjBzR#DEJDO`WU)hdd{Vj%^0V*KoyZ|5lzV87&g_j~NCjwv0uQVqXOb*QrQ zy|Qn`hxx(58c70$E;L(X0uZZ72M1!6oeg)(cdKO ze0gDaTz+ohR-#d)NbAH4x{I(21yjwvBQfmpLu$)|m{XolbgF!pmsqJ#D}(ylp6uC> z{bqtcI#hT#HW=wl7>p!38sKsJ`r8}lt-q%Keqy%u(xk=yiIJiUw6|5IvkS+#?JTBl z8H5(Q?l#wzazujH!8o>1xtn8#_w+397*_cy8!pQGP%K(Ga3pAjsaTbbXJlQF_+m+-UpUUent@xM zg%jqLUExj~o^vQ3Gl*>wh=_gOr2*|U64_iXb+-111aH}$TjeajM+I20xw(((>fej-@CIz4S1pi$(#}P7`4({6QS2CaQS4NPENDp>sAqD z$bH4KGzXGffkJ7R>V>)>tC)uax{UsN*dbeNC*v}#8Y#OWYwL4t$ePR?VTyIs!wea+ z5Urmc)X|^`MG~*dS6pGSbU+gPJoq*^a=_>$n4|P^w$sMBBy@f*Z^Jg6?n5?oId6f{ z$LW4M|4m502z0t7g<#Bx%X;9<=)smFolV&(V^(7Cv2-sxbxopQ!)*#ZRhTBpx1)Fc zNm1T%bONzv6@#|dz(w02AH8OXe>kQ#1FMCzO}2J_mST)+ExmBr9cva-@?;wnmWMOk z{3_~EX_xadgJGv&H@zK_8{(x84`}+c?oSBX*Ge3VdfTt&F}yCpFP?CpW+BE^cWY0^ zb&uBN!Ja3UzYHK-CTyA5=L zEMW{l3Usky#ly=7px648W31UNV@K)&Ub&zP1c7%)`{);I4b0Q<)B}3;NMG2JH=X$U zfIW4)4n9ZM`-yRj67I)YSLDK)qfUJ_ij}a#aZN~9EXrh8eZY2&=uY%2N0UFF7<~%M zsB8=erOWZ>Ct_#^tHZ|*q`H;A)5;ycw*IcmVxi8_0Xk}aJA^ath+E;xg!x+As(M#0=)3!NJR6H&9+zd#iP(m0PIW8$ z1Y^VX`>jm`W!=WpF*{ioM?C9`yOR>@0q=u7o>BP-eSHqCgMDj!2anwH?s%i2p+Q7D zzszIf5XJpE)IG4;d_(La-xenmF(tgAxK`Y4sQ}BSJEPs6N_U2vI{8=0C_F?@7<(G; zo$~G=8p+076G;`}>{MQ>t>7cm=zGtfbdDXm6||jUU|?X?CaE?(<6bKDYKeHlz}DA8 zXT={X=yp_R;HfJ9h%?eWvQ!dRgz&Su*JfNt!Wu>|XfU&68iRikRrHRW|ZxzRR^`eIGt zIeiDgVS>IeExKVRWW8-=A=yA`}`)ZkWBrZD`hpWIxBGkh&f#ijr449~m`j6{4jiJ*C!oVA8ZC?$1RM#K(_b zL9TW)kN*Y4%^-qPpMP7d4)o?Nk#>aoYHT(*g)qmRUb?**F@pnNiy6Fv9rEiUqD(^O zzyS?nBrX63BTRYduaG(0VVG2yJRe%o&rVrLjbxTaAFTd8s;<<@Qs>u(<193R8>}2_ zuwp{7;H2a*X7_jryzriZXMg?bTuegABb^87@SsKkr2)0Gyiax8KQWstw^v#ix45EVrcEhr>!NMhprl$InQMzjSFH54x5k9qHc`@9uKQzvL4ihcq{^B zPrVR=o_ic%Y>6&rMN)hTZsI7I<3&`#(nl+3y3ys9A~&^=4?PL&nd8)`OfG#n zwAMN$1&>K++c{^|7<4P=2y(B{jJsQ0a#U;HTo4ZmWZYvI{+s;Td{Yzem%0*k#)vjpB zia;J&>}ICate44SFYY3vEelqStQWFihx%^vQ@Do(sOy7yR2@WNv7Y9I^yL=nZr3mb zXKV5t@=?-Sk|b{XMhA7ZGB@2hqsx}4xwCW!in#C zI@}scZlr3-NFJ@NFaJlhyfcw{k^vvtGl`N9xSo**rDW4S}i zM9{fMPWo%4wYDG~BZ18BD+}h|GQKc-g^{++3MY>}W_uq7jGHx{mwE9fZiPCoxN$+7 
zrODGGJrOkcPQUB(FD5aoS4g~7#6NR^ma7-!>mHuJfY5kTe6PpNNKC9GGRiu^L31uG z$7v`*JknQHsYB!Tm_W{a32TM099djW%5e+j0Ve_ct}IM>XLF1Ap+YvcrLV=|CKo6S zb+9Nl3_YdKP6%Cxy@6TxZ>;4&nTneadr z_ES90ydCev)LV!dN=#(*f}|ZORFdvkYBni^aLbUk>BajeWIOcmHP#8S)*2U~QKI%S zyrLmtPqb&TphJ;>yAxri#;{uyk`JJqODDw%(Z=2`1uc}br^V%>j!gS)D*q*f_-qf8&D;W1dJgQMlaH5er zN2U<%Smb7==vE}dDI8K7cKz!vs^73o9f>2sgiTzWcwY|BMYHH5%Vn7#kiw&eItCqa zIkR2~Q}>X=Ar8W|^Ms41Fm8o6IB2_j60eOeBB1Br!boW7JnoeX6Gs)?7rW0^5psc- zjS16yb>dFn>KPOF;imD}e!enuIniFzv}n$m2#gCCv4jM#ArwlzZ$7@9&XkFxZ4n!V zj3dyiwW4Ki2QG{@i>yuZXQizw_OkZI^-3otXC{!(lUpJF33gI60ak;Uqitp74|B6I zgg{b=Iz}WkhCGj1M=hu4#Aw173YxIVbISaoc z-nLZC*6Tgivd5V`K%GxhBsp@SUU60-rfc$=wb>zdJzXS&-5(NRRodFk;Kxk!S(O(a0e7oY=E( zAyS;Ow?6Q&XA+cnkCb{28_1N8H#?J!*$MmIwLq^*T_9-z^&UE@A(z9oGYtFy6EZef LrJugUA?W`A8`#=m diff --git a/scripts/manage.sh b/scripts/manage.sh new file mode 100755 index 0000000..0974502 --- /dev/null +++ b/scripts/manage.sh @@ -0,0 +1,321 @@ +#!/usr/bin/env bash +# scripts/manage.sh — PROTEA dev stack manager +# +# Usage: +# bash scripts/manage.sh start [N] Start stack (N = embed+predict batch workers, default 1) +# bash scripts/manage.sh stop Stop all processes +# bash scripts/manage.sh status Show worker status table +# bash scripts/manage.sh logs [name] Tail logs (no name = pick from menu) +# bash scripts/manage.sh scale [N] Add N extra workers to a queue (default 1) + +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +LOG_DIR="$ROOT/logs" +PID_DIR="$ROOT/logs/pids" + +# ── colours ────────────────────────────────────────────────────────────────── +GREEN="\033[32m"; RED="\033[31m"; YELLOW="\033[33m" +CYAN="\033[36m"; BOLD="\033[1m"; RESET="\033[0m" + +# ── helpers ─────────────────────────────────────────────────────────────────── +_start_bg() { + local name="$1"; shift + mkdir -p "$LOG_DIR" "$PID_DIR" + "$@" >> "$LOG_DIR/${name}.log" 2>&1 & + local pid=$! 
+ echo "$pid" > "$PID_DIR/${name}.pid" + printf " ${GREEN}✓${RESET} %-35s PID %s\n" "$name" "$pid" +} + +_stop_pid() { + local name="$1" pidfile="$PID_DIR/$1.pid" + if [[ -f "$pidfile" ]]; then + local pid; pid=$(cat "$pidfile") + if kill -0 "$pid" 2>/dev/null; then + kill -15 "$pid" 2>/dev/null # SIGTERM — lets current job finish + printf " ${RED}✗${RESET} %-35s stopping (PID %s) — SIGTERM sent\n" "$name" "$pid" + fi + rm -f "$pidfile" + fi +} + +_worker_name() { + # Generate a unique name for scaled workers: worker-- + local queue="$1" n="${2:-1}" + local slug="${queue//protea./}"; slug="${slug//./-}" + echo "worker-${slug}-${n}" +} + +_pid_rss_mb() { + local pid="$1" + awk '/VmRSS/{printf "%d", $2/1024}' "/proc/$pid/status" 2>/dev/null || echo "?" +} + +# ── start ───────────────────────────────────────────────────────────────────── +cmd_start() { + local BATCH_WORKERS="${1:-1}" + + printf "\n${BOLD}=== PROTEA dev stack (${BATCH_WORKERS} batch worker(s)) ===${RESET}\n\n" + + # Stop survivors + printf "${BOLD}[1] Stopping previous processes...${RESET}\n" + for f in "$PID_DIR"/*.pid; do + [[ -e "$f" ]] && _stop_pid "$(basename "$f" .pid)" + done + # Kill API and frontend (no long-running jobs, safe to force-kill) + kill -9 $(pgrep -f "uvicorn protea.api" 2>/dev/null) 2>/dev/null || true + kill -9 $(pgrep -f "next-server" 2>/dev/null) 2>/dev/null || true + # Workers that were tracked received SIGTERM above; untracked ones are left + # running so long-running jobs (e.g. run_cafa_evaluation) are not interrupted. 
+ sleep 1 + + # API + printf "\n${BOLD}[2] API${RESET}\n" + cd "$ROOT" + _start_bg api poetry run uvicorn protea.api.app:create_app \ + --factory --host 0.0.0.0 --port 8000 --root-path /api-proxy + sleep 3 + curl -sf http://localhost:8000/jobs > /dev/null \ + && printf " ${GREEN}API OK${RESET} → http://localhost:8000\n" \ + || { printf " ${RED}API FAILED${RESET} — check logs/api.log\n"; exit 1; } + + # Core workers + printf "\n${BOLD}[3] Core workers${RESET}\n" + _start_bg worker-ping poetry run python scripts/worker.py --queue protea.ping + _start_bg worker-jobs poetry run python scripts/worker.py --queue protea.jobs + + # Embeddings pipeline + printf "\n${BOLD}[4] Embeddings pipeline${RESET}\n" + _start_bg worker-embeddings-coord poetry run python scripts/worker.py --queue protea.embeddings + for i in $(seq 1 "$BATCH_WORKERS"); do + _start_bg "worker-embeddings-batch-${i}" \ + poetry run python scripts/worker.py --queue protea.embeddings.batch + done + _start_bg worker-embeddings-write poetry run python scripts/worker.py --queue protea.embeddings.write + + # Predictions pipeline + printf "\n${BOLD}[5] Predictions pipeline${RESET}\n" + for i in $(seq 1 "$BATCH_WORKERS"); do + _start_bg "worker-predictions-batch-${i}" \ + poetry run python scripts/worker.py --queue protea.predictions.batch + done + _start_bg worker-predictions-write poetry run python scripts/worker.py --queue protea.predictions.write + + # Frontend + printf "\n${BOLD}[6] Frontend${RESET}\n" + cd "$ROOT/apps/web" + _start_bg frontend npm run dev + sleep 6 + curl -sf http://localhost:3000 -o /dev/null \ + && printf " ${GREEN}Frontend OK${RESET} → http://localhost:3000\n" \ + || printf " ${YELLOW}Frontend not ready yet${RESET} — check logs/frontend.log\n" + + printf "\n${BOLD}=== Stack running ===${RESET}\n" + printf " Frontend → http://localhost:3000\n" + printf " API → http://localhost:8000\n" + printf " RabbitMQ → http://localhost:15672 (guest/guest)\n" + printf "\n ${CYAN}bash scripts/manage.sh 
status${RESET} — show worker status\n" + printf " ${CYAN}bash scripts/manage.sh logs${RESET} — browse logs\n" + printf " ${CYAN}bash scripts/manage.sh stop${RESET} — stop everything\n\n" +} + +# ── stop ────────────────────────────────────────────────────────────────────── +cmd_stop() { + printf "\n${BOLD}=== Stopping PROTEA dev stack ===${RESET}\n\n" + + # Collect all worker PIDs before removing pid files + local worker_pids=() + local stopped=0 + for f in "$PID_DIR"/*.pid; do + [[ -e "$f" ]] || continue + local pid; pid=$(cat "$f") + _stop_pid "$(basename "$f" .pid)" + kill -0 "$pid" 2>/dev/null && worker_pids+=("$pid") + (( stopped++ )) || true + done + + # Also catch any untracked worker.py processes (manual launches etc.) + while IFS= read -r pid; do + kill -15 "$pid" 2>/dev/null && worker_pids+=("$pid") + done < <(pgrep -f "scripts/worker.py" 2>/dev/null || true) + + # Force-kill API and frontend immediately (no long-running state) + kill -9 $(pgrep -f "uvicorn protea.api" 2>/dev/null) 2>/dev/null || true + kill -9 $(pgrep -f "next-server" 2>/dev/null) 2>/dev/null || true + + # Wait up to 60 s for workers to finish current job, then force-kill + if [[ ${#worker_pids[@]} -gt 0 ]]; then + printf " Waiting up to 60s for workers to finish current jobs...\n" + local deadline=$(( $(date +%s) + 60 )) + for pid in "${worker_pids[@]}"; do + while kill -0 "$pid" 2>/dev/null && (( $(date +%s) < deadline )); do + sleep 2 + done + if kill -0 "$pid" 2>/dev/null; then + kill -9 "$pid" 2>/dev/null + printf " ${YELLOW}⚠${RESET} PID %s force-killed (job still running)\n" "$pid" + fi + done + fi + + [[ $stopped -eq 0 ]] && printf " (nothing was running)\n" + printf "\n${GREEN}Done.${RESET}\n\n" +} + +# ── status ──────────────────────────────────────────────────────────────────── +cmd_status() { + printf "\n${BOLD}=== PROTEA worker status ===${RESET}\n\n" + printf " ${BOLD}%-35s %-8s %-8s %s${RESET}\n" "NAME" "PID" "RAM" "STATUS" + printf " %s\n" "$(printf '─%.0s' {1..60})" + + 
for f in "$PID_DIR"/*.pid; do + [[ -e "$f" ]] || continue + local name; name="$(basename "$f" .pid)" + local pid; pid="$(cat "$f")" + if kill -0 "$pid" 2>/dev/null; then + local rss; rss="$(_pid_rss_mb "$pid") MB" + printf " ${GREEN}●${RESET} %-35s %-8s %-8s ${GREEN}running${RESET}\n" "$name" "$pid" "$rss" + else + printf " ${RED}●${RESET} %-35s %-8s %-8s ${RED}dead${RESET}\n" "$name" "$pid" "-" + fi + done + + # Check for untracked workers + local untracked + untracked=$(pgrep -f "scripts/worker.py" 2>/dev/null || true) + if [[ -n "$untracked" ]]; then + local tracked_pids + tracked_pids=$(cat "$PID_DIR"/*.pid 2>/dev/null | tr '\n' '|' | sed 's/|$//') + while IFS= read -r pid; do + if [[ -n "$tracked_pids" ]] && echo "$pid" | grep -qE "^(${tracked_pids})$"; then + continue + fi + local queue; queue=$(ps -p "$pid" -o args= 2>/dev/null | grep -o '\-\-queue [^ ]*' | awk '{print $2}') + local rss; rss="$(_pid_rss_mb "$pid") MB" + printf " ${YELLOW}●${RESET} %-35s %-8s %-8s ${YELLOW}untracked${RESET}\n" \ + "worker (${queue})" "$pid" "$rss" + done <<< "$untracked" + fi + + printf "\n" + + # API + if curl -sf http://localhost:8000/jobs > /dev/null 2>&1; then + printf " ${GREEN}●${RESET} API → http://localhost:8000 ${GREEN}up${RESET}\n" + else + printf " ${RED}●${RESET} API → http://localhost:8000 ${RED}down${RESET}\n" + fi + + # Frontend + if curl -sf http://localhost:3000 -o /dev/null 2>&1; then + printf " ${GREEN}●${RESET} Frontend → http://localhost:3000 ${GREEN}up${RESET}\n" + else + printf " ${RED}●${RESET} Frontend → http://localhost:3000 ${RED}down${RESET}\n" + fi + + printf "\n" +} + +# ── logs ────────────────────────────────────────────────────────────────────── +cmd_logs() { + local target="${1:-}" + + if [[ -n "$target" ]]; then + # Direct: find log file matching the given name fragment + local match + match=$(find "$LOG_DIR" -maxdepth 1 -name "*.log" | grep -i "$target" | head -1) + if [[ -z "$match" ]]; then + printf "${RED}No log found matching 
'%s'${RESET}\n" "$target" + printf "Available logs:\n" + find "$LOG_DIR" -maxdepth 1 -name "*.log" -exec basename {} \; | sort | sed 's/^/ /' + exit 1 + fi + printf "${CYAN}=== %s ===${RESET}\n" "$(basename "$match")" + tail -f "$match" + return + fi + + # Interactive picker + local logs + mapfile -t logs < <(find "$LOG_DIR" -maxdepth 1 -name "*.log" | sort | xargs -I{} basename {}) + + if [[ ${#logs[@]} -eq 0 ]]; then + printf "No log files found in %s\n" "$LOG_DIR" + exit 1 + fi + + printf "\n${BOLD}Available logs:${RESET}\n\n" + for i in "${!logs[@]}"; do + printf " ${CYAN}%2d${RESET} %s\n" "$((i+1))" "${logs[$i]}" + done + printf "\n ${CYAN} a${RESET} all (tail -f all logs)\n" + printf "\nSelect [1-%d / a]: " "${#logs[@]}" + read -r choice + + if [[ "$choice" == "a" ]]; then + tail -f "$LOG_DIR"/*.log + elif [[ "$choice" =~ ^[0-9]+$ ]] && (( choice >= 1 && choice <= ${#logs[@]} )); then + local selected="$LOG_DIR/${logs[$((choice-1))]}" + printf "\n${CYAN}=== %s ===${RESET}\n" "${logs[$((choice-1))]}" + tail -f "$selected" + else + printf "${RED}Invalid choice.${RESET}\n" + exit 1 + fi +} + +# ── scale ───────────────────────────────────────────────────────────────────── +cmd_scale() { + local queue="${1:-}" + local n="${2:-1}" + + if [[ -z "$queue" ]]; then + printf "Usage: manage.sh scale [N]\n" + printf "Example: manage.sh scale protea.predictions.batch 2\n" + exit 1 + fi + + printf "\n${BOLD}Adding %s worker(s) to %s${RESET}\n\n" "$n" "$queue" + cd "$ROOT" + for i in $(seq 1 "$n"); do + # Find a free index + local idx=1 + while [[ -f "$PID_DIR/$(_worker_name "$queue" "$idx").pid" ]]; do + (( idx++ )) + done + _start_bg "$(_worker_name "$queue" "$idx")" \ + poetry run python scripts/worker.py --queue "$queue" + done + printf "\n" +} + +# ── dispatch ────────────────────────────────────────────────────────────────── +CMD="${1:-help}" +shift || true + +case "$CMD" in + start) cmd_start "${1:-1}" ;; + stop) cmd_stop ;; + status) cmd_status ;; + logs) cmd_logs 
"${1:-}" ;; + scale) cmd_scale "${1:-}" "${2:-1}" ;; + help|--help|-h) + printf "\n${BOLD}PROTEA dev stack manager${RESET}\n\n" + printf " ${CYAN}bash scripts/manage.sh start [N]${RESET} Start stack (N batch workers per pipeline)\n" + printf " ${CYAN}bash scripts/manage.sh stop${RESET} Stop all processes\n" + printf " ${CYAN}bash scripts/manage.sh status${RESET} Show worker status + RAM\n" + printf " ${CYAN}bash scripts/manage.sh logs [name]${RESET} Tail logs (interactive if no name)\n" + printf " ${CYAN}bash scripts/manage.sh scale [N]${RESET} Add N extra workers to a queue\n\n" + printf "Examples:\n" + printf " bash scripts/manage.sh start # 1 batch worker per pipeline\n" + printf " bash scripts/manage.sh start 2 # 2 batch workers per pipeline\n" + printf " bash scripts/manage.sh scale protea.predictions.batch 2\n" + printf " bash scripts/manage.sh logs predictions\n\n" + ;; + *) + printf "${RED}Unknown command: %s${RESET}\n" "$CMD" + printf "Run ${CYAN}bash scripts/manage.sh help${RESET} for usage.\n" + exit 1 + ;; +esac diff --git a/scripts/setup_vast.sh b/scripts/setup_vast.sh new file mode 100755 index 0000000..5b77f2e --- /dev/null +++ b/scripts/setup_vast.sh @@ -0,0 +1,160 @@ +#!/usr/bin/env bash +# scripts/setup_vast.sh — Bootstrap PROTEA on a fresh vast.ai instance +# +# Usage (on the vast.ai instance, from the repo root): +# bash scripts/setup_vast.sh [DB_PASSWORD] [BATCH_WORKERS] +# +# After running: +# Frontend → http://:3000 +# API → http://:8000 +# RabbitMQ → http://:15672 (guest/guest) + +set -euo pipefail + +DB_PASSWORD="${1:-protea}" +BATCH_WORKERS="${2:-1}" + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +GREEN="\033[32m"; YELLOW="\033[33m"; BOLD="\033[1m"; RESET="\033[0m" +step() { printf "\n${BOLD}==> %s${RESET}\n" "$*"; } +ok() { printf " ${GREEN}✓${RESET} %s\n" "$*"; } + +# ── 0. 
Detect public IP ─────────────────────────────────────────────────────── +step "Detecting public IP" +PUBLIC_IP=$(curl -sf https://ifconfig.me || curl -sf https://api.ipify.org || echo "127.0.0.1") +ok "Public IP: $PUBLIC_IP" + +# ── 1. System packages + Python 3.12 ───────────────────────────────────────── +step "Installing system packages + Python 3.12" +export DEBIAN_FRONTEND=noninteractive +apt-get update -qq +apt-get install -y -qq \ + curl wget gnupg lsb-release ca-certificates \ + build-essential git software-properties-common libpq-dev + +if ! python3.12 --version &>/dev/null; then + add-apt-repository -y ppa:deadsnakes/ppa + apt-get update -qq + apt-get install -y -qq python3.12 python3.12-dev python3.12-venv +fi +curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 +ok "Python $(python3.12 --version)" + +# ── 2. PostgreSQL 16 + pgvector ─────────────────────────────────────────────── +step "Installing PostgreSQL 16 + pgvector" +if ! command -v psql &>/dev/null; then + curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc \ + | gpg --dearmor -o /usr/share/keyrings/postgresql.gpg + echo "deb [signed-by=/usr/share/keyrings/postgresql.gpg] \ +https://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" \ + > /etc/apt/sources.list.d/pgdg.list + apt-get update -qq + apt-get install -y -qq postgresql-16 postgresql-16-pgvector +fi +ok "PostgreSQL installed" + +service postgresql start || pg_ctlcluster 16 main start +sleep 2 + +su -c "psql -tc \"SELECT 1 FROM pg_roles WHERE rolname='protea'\" | grep -q 1 || \ + psql -c \"CREATE USER protea WITH PASSWORD '${DB_PASSWORD}';\"" postgres +su -c "psql -tc \"SELECT 1 FROM pg_database WHERE datname='protea'\" | grep -q 1 || \ + psql -c \"CREATE DATABASE protea OWNER protea;\"" postgres +su -c "psql -d protea -c \"CREATE EXTENSION IF NOT EXISTS vector;\"" postgres +ok "Database 'protea' ready" + +# ── 3. 
Erlang 26 + RabbitMQ ─────────────────────────────────────────────────── +step "Installing Erlang 26 + RabbitMQ" +if ! command -v rabbitmqctl &>/dev/null; then + # Erlang 26 from RabbitMQ's Cloudsmith repo (manual setup, no helper script) + rm -f /usr/share/keyrings/rabbitmq-erlang.gpg /usr/share/keyrings/rabbitmq-server.gpg + curl -fsSL https://github.com/rabbitmq/signing-keys/releases/download/3.0/cloudsmith.rabbitmq-erlang.E495BB49CC4BBE5B.key \ + -o /tmp/rabbitmq-erlang.key + gpg --batch --no-tty --dearmor < /tmp/rabbitmq-erlang.key > /usr/share/keyrings/rabbitmq-erlang.gpg + curl -fsSL https://github.com/rabbitmq/signing-keys/releases/download/3.0/cloudsmith.rabbitmq-server.9F4587F226208342.key \ + -o /tmp/rabbitmq-server.key + gpg --batch --no-tty --dearmor < /tmp/rabbitmq-server.key > /usr/share/keyrings/rabbitmq-server.gpg + cat > /etc/apt/sources.list.d/rabbitmq.list <<'APTEOF' +deb [arch=amd64 signed-by=/usr/share/keyrings/rabbitmq-erlang.gpg] https://ppa1.rabbitmq.com/rabbitmq/rabbitmq-erlang/deb/ubuntu jammy main +deb [arch=amd64 signed-by=/usr/share/keyrings/rabbitmq-server.gpg] https://ppa1.rabbitmq.com/rabbitmq/rabbitmq-server/deb/ubuntu jammy main +APTEOF + apt-get update -qq + # Pin RabbitMQ's Erlang over Ubuntu's older version + apt-get install -y -qq -o Dpkg::Options::="--force-overwrite" \ + erlang-base erlang-asn1 erlang-crypto erlang-eldap \ + erlang-ftp erlang-inets erlang-mnesia erlang-os-mon erlang-parsetools \ + erlang-public-key erlang-runtime-tools erlang-snmp erlang-ssl \ + erlang-syntax-tools erlang-tftp erlang-tools erlang-xmerl + apt-get install -y -qq rabbitmq-server +fi + +rabbitmq-plugins enable rabbitmq_management +service rabbitmq-server start || rabbitmq-server -detached +sleep 3 +ok "RabbitMQ running (UI on :15672)" + +# ── 4. Node.js 20 ───────────────────────────────────────────────────────────── +step "Installing Node.js 20" +if ! 
node --version 2>/dev/null | grep -q "^v2[0-9]"; then + curl -fsSL https://deb.nodesource.com/setup_20.x | bash - + apt-get install -y -qq nodejs +fi +ok "Node $(node --version)" + +# ── 5. Poetry ───────────────────────────────────────────────────────────────── +step "Installing Poetry" +export PATH="$HOME/.local/bin:$PATH" +if ! command -v poetry &>/dev/null; then + curl -sSL https://install.python-poetry.org | python3.12 - + echo 'export PATH="$HOME/.local/bin:$PATH"' >> "$HOME/.bashrc" +fi +ok "Poetry $(poetry --version)" + +# ── 6. Python dependencies ──────────────────────────────────────────────────── +step "Installing Python dependencies (torch + ESM, ~10 min)" +cd "$ROOT" +poetry env use python3.12 +poetry install --without dev +ok "Python deps installed" + +# ── 7. Configure system.yaml ────────────────────────────────────────────────── +step "Writing protea/config/system.yaml" +mkdir -p "$ROOT/protea/config" +cat > "$ROOT/protea/config/system.yaml" < .env.local < [OPTIONS] +# +# Options: +# --local-db Local database name (default: BioData) +# --local-user Local PostgreSQL user (default: usuario) +# --remote-db Remote database name (default: protea) +# --remote-user Remote PostgreSQL user (default: protea) +# --full-reset Drop and recreate remote DB before restore (default: true) +# +# Examples: +# bash scripts/sync_db_vast.sh 173.206.147.184 41624 +# bash scripts/sync_db_vast.sh 173.206.147.184 41624 --full-reset + +set -euo pipefail + +IP="${1:?Usage: sync_db_vast.sh }" +PORT="${2:?Usage: sync_db_vast.sh }" +shift 2 + +# Defaults +LOCAL_DB="BioData" +LOCAL_USER="usuario" +REMOTE_DB="protea" +REMOTE_USER="protea" +FULL_RESET=true + +while [[ $# -gt 0 ]]; do + case "$1" in + --local-db) LOCAL_DB="$2"; shift 2 ;; + --local-user) LOCAL_USER="$2"; shift 2 ;; + --remote-db) REMOTE_DB="$2"; shift 2 ;; + --remote-user) REMOTE_USER="$2"; shift 2 ;; + --no-full-reset) FULL_RESET=false; shift ;; + --full-reset) FULL_RESET=true; shift ;; + *) printf "Unknown 
option: %s\n" "$1"; exit 1 ;; + esac +done + +SSH="ssh -p $PORT root@$IP" +DUMP_FILE="/tmp/protea_dump_$(date +%Y%m%d_%H%M%S).pgdump" + +GREEN="\033[32m"; YELLOW="\033[33m"; RED="\033[31m"; BOLD="\033[1m"; RESET="\033[0m" +step() { printf "\n${BOLD}==> %s${RESET}\n" "$*"; } +ok() { printf " ${GREEN}✓${RESET} %s\n" "$*"; } +warn() { printf " ${YELLOW}⚠${RESET} %s\n" "$*"; } + +# ── 0. Verify SSH ────────────────────────────────────────────────────────────── +step "Checking SSH connectivity" +if ! $SSH "echo ok" &>/dev/null; then + printf "${RED}ERROR${RESET}: Cannot reach root@$IP on port $PORT\n" + exit 1 +fi +ok "Connected to $IP:$PORT" + +# ── 1. Dump local DB ─────────────────────────────────────────────────────────── +step "Dumping local database '$LOCAL_DB' → $DUMP_FILE" +pg_dump \ + --username="$LOCAL_USER" \ + --host=localhost \ + --port=5432 \ + --format=custom \ + --compress=9 \ + --no-privileges \ + --no-owner \ + "$LOCAL_DB" \ + > "$DUMP_FILE" + +DUMP_SIZE=$(du -sh "$DUMP_FILE" | cut -f1) +ok "Dump complete ($DUMP_SIZE)" + +# ── 2. Transfer to instance ──────────────────────────────────────────────────── +step "Transferring dump to instance" +REMOTE_DUMP="/tmp/$(basename "$DUMP_FILE")" +rsync -az --progress -e "ssh -p $PORT" "$DUMP_FILE" "root@$IP:$REMOTE_DUMP" +ok "Transferred to $REMOTE_DUMP" + +# ── 3. Stop the PROTEA stack (to avoid writes during restore) ───────────────── +step "Stopping PROTEA stack on instance" +$SSH "cd /root/PROTEA && export PATH=\$HOME/.local/bin:\$PATH && bash scripts/manage.sh stop 2>/dev/null || true" +ok "Stack stopped" + +# ── 4. 
Restore on remote ─────────────────────────────────────────────────────── +step "Restoring database on instance" + +if [[ "$FULL_RESET" == "true" ]]; then + warn "Full reset: dropping and recreating '$REMOTE_DB'" + $SSH "su -c \"psql -c 'DROP DATABASE IF EXISTS $REMOTE_DB;'\" postgres" + $SSH "su -c \"psql -c \\\"CREATE DATABASE $REMOTE_DB OWNER $REMOTE_USER;\\\"\" postgres" + $SSH "su -c \"psql -d $REMOTE_DB -c 'CREATE EXTENSION IF NOT EXISTS vector;'\" postgres" + ok "Database recreated" +fi + +$SSH "export PGPASSWORD=protea && pg_restore \ + --username=$REMOTE_USER \ + --host=localhost \ + --port=5432 \ + --dbname=$REMOTE_DB \ + --no-privileges \ + --no-owner \ + --exit-on-error \ + $REMOTE_DUMP" +ok "Restore complete" + +# ── 5. Run pending migrations (in case code is newer than dump) ──────────────── +step "Running Alembic migrations (to apply any new schema changes)" +$SSH "cd /root/PROTEA && export PATH=\$HOME/.local/bin:\$PATH && poetry run alembic upgrade head" +ok "Schema up to date" + +# ── 6. Restart stack ─────────────────────────────────────────────────────────── +step "Restarting PROTEA stack" +$SSH "cd /root/PROTEA && export PATH=\$HOME/.local/bin:\$PATH && bash scripts/manage.sh start 1" +ok "Stack restarted" + +# ── 7. 
Cleanup ──────────────────────────────────────────────────────────────── +rm -f "$DUMP_FILE" +$SSH "rm -f $REMOTE_DUMP" +ok "Temporary dump files removed" + +printf "\n${BOLD}╔══════════════════════════════════════════════════╗${RESET}\n" +printf "${BOLD}║ Database synced successfully ║${RESET}\n" +printf "${BOLD}╚══════════════════════════════════════════════════╝${RESET}\n\n" +printf " Source: ${LOCAL_USER}@localhost/${LOCAL_DB}\n" +printf " Target: ${REMOTE_USER}@${IP}/${REMOTE_DB}\n\n" From f15ae799a755352cf54b2df939afb3663eaf5b24 Mon Sep 17 00:00:00 2001 From: frapercan Date: Sat, 14 Mar 2026 22:31:09 +0100 Subject: [PATCH 02/17] docs: add quality baseline assessment (2026-03-14) --- docs/quality_baseline.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 docs/quality_baseline.md diff --git a/docs/quality_baseline.md b/docs/quality_baseline.md new file mode 100644 index 0000000..82e807e --- /dev/null +++ b/docs/quality_baseline.md @@ -0,0 +1,31 @@ +# Quality Baseline — 2026-03-14 + +Initial code quality assessment. Objective: track improvement over time. + +## Scores + +| Area | Score | +|---|---| +| Architecture | 8.5/10 | +| Code Quality | 7/10 | +| Tests | 7/10 (65.8% coverage) | +| API Design | 7.5/10 | +| Database | 8.5/10 | +| Frontend | 8/10 | +| Documentation | 9/10 | +| **Overall** | **7.7/10** | + +**Status:** Beta-ready. Not yet production-ready. 
+ +## Open risks (priority order) + +| # | Risk | Area | +|---|---|---| +| 1 | `emit()` failures swallowed silently in `OperationConsumer` — progress errors are never logged | Workers | +| 2 | No transaction retries — a deadlock or timeout kills the job with no recovery | Database | +| 3 | CORS wildcard (`*`) | API | +| 4 | 16 nullable columns in `GOPrediction` — feature engineering coupled to ORM model | Database | +| 5 | Missing indexes on `ProteinGOAnnotation(protein_id, go_term_id)` — slow queries at scale | Database | +| 6 | No API versioning (`/v1/`) — breaking changes would affect external integrations | API | +| 7 | Duplicate validation in embeddings router (manual checks + Pydantic) | API | +| 8 | No pagination on endpoints that can return thousands of results | API | From dce1b4ffc3a6e6954925f3c4838f641176539d9f Mon Sep 17 00:00:00 2001 From: frapercan Date: Sat, 14 Mar 2026 22:38:16 +0100 Subject: [PATCH 03/17] fix: add migrate service and pgvector init to docker-compose - Add 'migrate' one-shot service that runs alembic upgrade head before API starts - Mount docker/init.sql to postgres initdb.d to enable pgvector extension automatically - All workers and API depend on migrate completing successfully --- docker-compose.yml | 25 +++++++++++++++++++------ docker/init.sql | 1 + 2 files changed, 20 insertions(+), 6 deletions(-) create mode 100644 docker/init.sql diff --git a/docker-compose.yml b/docker-compose.yml index 88d731d..cb925b8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,6 +8,7 @@ services: POSTGRES_DB: protea volumes: - postgres_data:/var/lib/postgresql/data + - ./docker/init.sql:/docker-entrypoint-initdb.d/init.sql ports: - "5432:5432" healthcheck: @@ -30,6 +31,16 @@ services: timeout: 5s retries: 5 + migrate: + build: . + environment: + PROTEA_DB_URL: postgresql+psycopg://protea:protea@postgres/protea + command: alembic upgrade head + depends_on: + postgres: + condition: service_healthy + restart: "no" + api: build: . 
environment: @@ -42,6 +53,8 @@ services: condition: service_healthy rabbitmq: condition: service_healthy + migrate: + condition: service_completed_successfully worker-jobs: build: . @@ -50,8 +63,8 @@ services: PROTEA_AMQP_URL: amqp://guest:guest@rabbitmq/ command: python scripts/worker.py --queue protea.jobs depends_on: - postgres: - condition: service_healthy + migrate: + condition: service_completed_successfully rabbitmq: condition: service_healthy @@ -62,8 +75,8 @@ services: PROTEA_AMQP_URL: amqp://guest:guest@rabbitmq/ command: python scripts/worker.py --queue protea.embeddings depends_on: - postgres: - condition: service_healthy + migrate: + condition: service_completed_successfully rabbitmq: condition: service_healthy @@ -74,8 +87,8 @@ services: PROTEA_AMQP_URL: amqp://guest:guest@rabbitmq/ command: python scripts/worker.py --queue protea.predictions.batch depends_on: - postgres: - condition: service_healthy + migrate: + condition: service_completed_successfully rabbitmq: condition: service_healthy diff --git a/docker/init.sql b/docker/init.sql new file mode 100644 index 0000000..0aa0fc2 --- /dev/null +++ b/docker/init.sql @@ -0,0 +1 @@ +CREATE EXTENSION IF NOT EXISTS vector; From fd72cb5c73afebd388475db56b2397733f67ab7a Mon Sep 17 00:00:00 2001 From: frapercan Date: Sat, 14 Mar 2026 22:53:36 +0100 Subject: [PATCH 04/17] feat: Docker-based production deployment via ghcr.io - Add docker-compose.prod.yml: uses pre-built ghcr.io images, GPU support for worker-embeddings - Rewrite deploy_vast.sh: sync compose files + docker compose pull + up (no rsync of source code) - Migrations run automatically via the migrate service on every deploy --- docker-compose.prod.yml | 30 +++++++++++++ scripts/deploy_vast.sh | 94 +++++++++++++++++++---------------------- 2 files changed, 73 insertions(+), 51 deletions(-) create mode 100644 docker-compose.prod.yml diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..e5f9db4 --- /dev/null 
+++ b/docker-compose.prod.yml @@ -0,0 +1,30 @@ +# Production overrides: pull pre-built images from ghcr.io instead of building locally. +# Use with: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d +# +# The worker-embeddings service gets GPU access via the NVIDIA container runtime. + +services: + migrate: + image: ghcr.io/frapercan/protea:latest + + api: + image: ghcr.io/frapercan/protea:latest + + worker-jobs: + image: ghcr.io/frapercan/protea:latest + + worker-embeddings: + image: ghcr.io/frapercan/protea:latest + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + + worker-predictions: + image: ghcr.io/frapercan/protea:latest + + frontend: + image: ghcr.io/frapercan/protea-frontend:latest diff --git a/scripts/deploy_vast.sh b/scripts/deploy_vast.sh index 13ae5df..3f8499b 100755 --- a/scripts/deploy_vast.sh +++ b/scripts/deploy_vast.sh @@ -1,28 +1,31 @@ #!/usr/bin/env bash -# scripts/deploy_vast.sh — Push code updates to a running vast.ai instance +# scripts/deploy_vast.sh — Deploy PROTEA to a vast.ai instance via Docker # # Usage: -# bash scripts/deploy_vast.sh [BATCH_WORKERS] +# bash scripts/deploy_vast.sh [GHCR_TOKEN] # # Examples: # bash scripts/deploy_vast.sh 173.206.147.184 41624 -# bash scripts/deploy_vast.sh 173.206.147.184 41624 2 +# bash scripts/deploy_vast.sh 173.206.147.184 41624 ghp_xxxxx # # What it does: -# 1. rsync code to /root/PROTEA (excludes venvs, node_modules, logs, local config) -# 2. poetry install --without dev (only if pyproject.toml changed) -# 3. npm install (only if package.json changed) -# 4. alembic upgrade head -# 5. restart the full PROTEA stack +# 1. Sync docker-compose files to the remote (no source code needed) +# 2. Login to ghcr.io on the remote +# 3. Pull latest images from ghcr.io +# 4. 
Run migrations and restart the stack (migrate service runs automatically) +# +# Requirements on the remote: +# - Docker with NVIDIA Container Toolkit (standard vast.ai images) set -euo pipefail -IP="${1:?Usage: deploy_vast.sh [BATCH_WORKERS]}" -PORT="${2:?Usage: deploy_vast.sh [BATCH_WORKERS]}" -BATCH_WORKERS="${3:-1}" +IP="${1:?Usage: deploy_vast.sh [GHCR_TOKEN]}" +PORT="${2:?Usage: deploy_vast.sh [GHCR_TOKEN]}" +GHCR_TOKEN="${3:-${GITHUB_TOKEN:-}}" ROOT="$(cd "$(dirname "$0")/.." && pwd)" SSH="ssh -p $PORT root@$IP" + GREEN="\033[32m"; YELLOW="\033[33m"; BOLD="\033[1m"; RESET="\033[0m" step() { printf "\n${BOLD}==> %s${RESET}\n" "$*"; } ok() { printf " ${GREEN}✓${RESET} %s\n" "$*"; } @@ -37,51 +40,40 @@ if ! $SSH "echo ok" &>/dev/null; then fi ok "Connected to $IP:$PORT" -# ── 1. Sync code ─────────────────────────────────────────────────────────────── -step "Syncing code → /root/PROTEA" - -rsync -az --delete \ - --exclude='.git/' \ - --exclude='__pycache__/' \ - --exclude='*.pyc' \ - --exclude='*.egg-info/' \ - --exclude='.venv/' \ - --exclude='logs/' \ - --exclude='node_modules/' \ - --exclude='.next/' \ - --exclude='storage/' \ - --exclude='protea/config/system.yaml' \ - --exclude='apps/web/.env.local' \ - -e "ssh -p $PORT" \ - "$ROOT/" "root@$IP:/root/PROTEA/" - -ok "Code synced" +# ── 1. Sync compose files (no source code needed) ───────────────────────────── +step "Syncing compose files → /root/PROTEA" +$SSH "mkdir -p /root/PROTEA/docker" +rsync -az -e "ssh -p $PORT" \ + "$ROOT/docker-compose.yml" \ + "$ROOT/docker-compose.prod.yml" \ + "root@$IP:/root/PROTEA/" +rsync -az -e "ssh -p $PORT" \ + "$ROOT/docker/init.sql" \ + "root@$IP:/root/PROTEA/docker/" +ok "Compose files synced" -# ── 2. Install Python deps (only if pyproject.toml changed) ─────────────────── -step "Installing Python dependencies" -$SSH "cd /root/PROTEA && export PATH=\$HOME/.local/bin:\$PATH && poetry install --without dev" -ok "Python deps up to date" - -# ── 3. 
Install frontend deps (only if package.json changed) ─────────────────── -step "Installing frontend dependencies" -$SSH "cd /root/PROTEA/apps/web && npm install --silent" -ok "Frontend deps up to date" +# ── 2. Login to ghcr.io ─────────────────────────────────────────────────────── +step "Logging in to ghcr.io" +if [[ -n "$GHCR_TOKEN" ]]; then + $SSH "echo '$GHCR_TOKEN' | docker login ghcr.io -u frapercan --password-stdin" + ok "Logged in to ghcr.io" +else + warn "No GHCR_TOKEN provided — assuming images are public or already logged in" +fi -# ── 4. Run Alembic migrations ────────────────────────────────────────────────── -step "Running database migrations" -$SSH "cd /root/PROTEA && export PATH=\$HOME/.local/bin:\$PATH && poetry run alembic upgrade head" -ok "Schema up to date" +# ── 3. Pull latest images ───────────────────────────────────────────────────── +step "Pulling latest images from ghcr.io" +$SSH "cd /root/PROTEA && docker compose -f docker-compose.yml -f docker-compose.prod.yml pull" +ok "Images up to date" -# ── 5. Restart stack ─────────────────────────────────────────────────────────── -step "Restarting PROTEA stack ($BATCH_WORKERS batch worker(s))" -$SSH "cd /root/PROTEA && export PATH=\$HOME/.local/bin:\$PATH && bash scripts/manage.sh start $BATCH_WORKERS" +# ── 4. 
Restart stack (migrate runs automatically before API/workers) ─────────── +step "Restarting PROTEA stack" +$SSH "cd /root/PROTEA && docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d" ok "Stack restarted" -# ── Done ─────────────────────────────────────────────────────────────────────── -FRONTEND_PORT=$($SSH "vastai show instance --raw 2>/dev/null | python3 -c \"import sys,json; p=json.load(sys.stdin).get('ports',{}); print(p.get('3000/tcp',[{'HostPort':'3000'}])[0]['HostPort'])\" 2>/dev/null || echo '3000'") - +# ── Done ────────────────────────────────────────────────────────────────────── printf "\n${BOLD}╔══════════════════════════════════════════════════╗${RESET}\n" printf "${BOLD}║ PROTEA deployed successfully ║${RESET}\n" printf "${BOLD}╚══════════════════════════════════════════════════╝${RESET}\n\n" -printf " Logs: $SSH 'bash /root/PROTEA/scripts/manage.sh logs'\n" -printf " Status: $SSH 'bash /root/PROTEA/scripts/manage.sh status'\n\n" +printf " Logs: $SSH 'cd /root/PROTEA && docker compose -f docker-compose.yml -f docker-compose.prod.yml logs -f'\n" +printf " Status: $SSH 'cd /root/PROTEA && docker compose -f docker-compose.yml -f docker-compose.prod.yml ps'\n\n" From 18719bd950f35a23f3ebfc2d324e8760f89fe48e Mon Sep 17 00:00:00 2001 From: frapercan Date: Mon, 16 Mar 2026 21:54:04 +0100 Subject: [PATCH 05/17] feat(orm): add ScoringConfig, SupportEntry models and related migrations - ScoringConfig: reproducible scoring recipe with signal weights, formula type and optional per-evidence-code quality overrides - EvaluationResult: add scoring_config_id FK and results JSONB column - OntologySnapshot: add ia_url field for Information Accretion file - SupportEntry: new model for user-facing support/contact entries - 6 new Alembic migrations covering all model additions and indexes Co-Authored-By: Claude Sonnet 4.6 --- ...1_add_composite_index_pga_set_accession.py | 32 +++ ...33_add_scoring_config_id_to_evaluation_.py | 38 ++++ 
...8c210c8_add_ia_url_to_ontology_snapshot.py | 41 ++++ ...737a352d4fe_merge_scoring_config_branch.py | 28 +++ .../7c19ca08d5d4_add_support_entry_table.py | 37 ++++ .../b1c2d3e4f5a6_add_scoring_config.py | 38 ++++ ..._add_evidence_weights_to_scoring_config.py | 49 +++++ protea/infrastructure/orm/models/__init__.py | 2 + .../models/annotation/evaluation_result.py | 8 + .../models/annotation/ontology_snapshot.py | 18 ++ .../orm/models/embedding/scoring_config.py | 196 ++++++++++++++++++ .../orm/models/support_entry.py | 25 +++ 12 files changed, 512 insertions(+) create mode 100644 alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py create mode 100644 alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py create mode 100644 alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py create mode 100644 alembic/versions/7737a352d4fe_merge_scoring_config_branch.py create mode 100644 alembic/versions/7c19ca08d5d4_add_support_entry_table.py create mode 100644 alembic/versions/b1c2d3e4f5a6_add_scoring_config.py create mode 100644 alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py create mode 100644 protea/infrastructure/orm/models/embedding/scoring_config.py create mode 100644 protea/infrastructure/orm/models/support_entry.py diff --git a/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py b/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py new file mode 100644 index 0000000..b99dc62 --- /dev/null +++ b/alembic/versions/489835ed5b31_add_composite_index_pga_set_accession.py @@ -0,0 +1,32 @@ +"""add_composite_index_pga_set_accession + +Revision ID: 489835ed5b31 +Revises: 7737a352d4fe +Create Date: 2026-03-15 11:17:30.865922 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '489835ed5b31' +down_revision: Union[str, Sequence[str], None] = '7737a352d4fe' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + op.create_index( + "ix_pga_set_accession", + "protein_go_annotation", + ["annotation_set_id", "protein_accession"], + ) + + +def downgrade() -> None: + """Downgrade schema.""" + op.drop_index("ix_pga_set_accession", table_name="protein_go_annotation") diff --git a/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py b/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py new file mode 100644 index 0000000..1890a22 --- /dev/null +++ b/alembic/versions/513355a1d933_add_scoring_config_id_to_evaluation_.py @@ -0,0 +1,38 @@ +"""add scoring_config_id to evaluation_result + +Revision ID: 513355a1d933 +Revises: 489835ed5b31 +Create Date: 2026-03-15 12:37:19.930750 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '513355a1d933' +down_revision: Union[str, Sequence[str], None] = '489835ed5b31' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('evaluation_result', sa.Column('scoring_config_id', sa.UUID(), nullable=True)) + op.create_index(op.f('ix_evaluation_result_scoring_config_id'), 'evaluation_result', ['scoring_config_id'], unique=False) + op.create_foreign_key(None, 'evaluation_result', 'scoring_config', ['scoring_config_id'], ['id'], ondelete='SET NULL') + op.drop_index(op.f('ix_pga_set_accession'), table_name='protein_go_annotation') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.create_index(op.f('ix_pga_set_accession'), 'protein_go_annotation', ['annotation_set_id', 'protein_accession'], unique=False) + op.drop_constraint(None, 'evaluation_result', type_='foreignkey') + op.drop_index(op.f('ix_evaluation_result_scoring_config_id'), table_name='evaluation_result') + op.drop_column('evaluation_result', 'scoring_config_id') + # ### end Alembic commands ### diff --git a/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py b/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py new file mode 100644 index 0000000..cde8fca --- /dev/null +++ b/alembic/versions/54e758c210c8_add_ia_url_to_ontology_snapshot.py @@ -0,0 +1,41 @@ +"""add_ia_url_to_ontology_snapshot + +Revision ID: 54e758c210c8 +Revises: c1d2e3f4a5b6 +Create Date: 2026-03-16 11:42:10.636169 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = '54e758c210c8' +down_revision: Union[str, Sequence[str], None] = 'c1d2e3f4a5b6' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('ontology_snapshot', sa.Column('ia_url', sa.String(), nullable=True, comment='URL of the Information Accretion TSV for this ontology release (two columns: go_id, ia_value). Used by run_cafa_evaluation to weight GO terms by information content. NULL means uniform IC=1.')) + op.alter_column('scoring_config', 'evidence_weights', + existing_type=postgresql.JSONB(astext_type=sa.Text()), + comment=None, + existing_comment='Optional per-GO-evidence-code quality multipliers in [0, 1]. NULL means use the system defaults defined in DEFAULT_EVIDENCE_WEIGHTS. 
Partial dicts are allowed; absent codes fall back to the system table.', + existing_nullable=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('scoring_config', 'evidence_weights', + existing_type=postgresql.JSONB(astext_type=sa.Text()), + comment='Optional per-GO-evidence-code quality multipliers in [0, 1]. NULL means use the system defaults defined in DEFAULT_EVIDENCE_WEIGHTS. Partial dicts are allowed; absent codes fall back to the system table.', + existing_nullable=True) + op.drop_column('ontology_snapshot', 'ia_url') + # ### end Alembic commands ### diff --git a/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py b/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py new file mode 100644 index 0000000..f759c30 --- /dev/null +++ b/alembic/versions/7737a352d4fe_merge_scoring_config_branch.py @@ -0,0 +1,28 @@ +"""merge_scoring_config_branch + +Revision ID: 7737a352d4fe +Revises: 47de89cf6fec, b1c2d3e4f5a6 +Create Date: 2026-03-15 10:11:56.507967 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '7737a352d4fe' +down_revision: Union[str, Sequence[str], None] = ('47de89cf6fec', 'b1c2d3e4f5a6') +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + pass + + +def downgrade() -> None: + """Downgrade schema.""" + pass diff --git a/alembic/versions/7c19ca08d5d4_add_support_entry_table.py b/alembic/versions/7c19ca08d5d4_add_support_entry_table.py new file mode 100644 index 0000000..599214c --- /dev/null +++ b/alembic/versions/7c19ca08d5d4_add_support_entry_table.py @@ -0,0 +1,37 @@ +"""add support_entry table + +Revision ID: 7c19ca08d5d4 +Revises: 513355a1d933 +Create Date: 2026-03-15 12:42:43.832417 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '7c19ca08d5d4' +down_revision: Union[str, Sequence[str], None] = '513355a1d933' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('support_entry', + sa.Column('id', sa.UUID(), nullable=False), + sa.Column('comment', sa.Text(), nullable=True), + sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('now()'), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('support_entry') + # ### end Alembic commands ### diff --git a/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py b/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py new file mode 100644 index 0000000..5eae559 --- /dev/null +++ b/alembic/versions/b1c2d3e4f5a6_add_scoring_config.py @@ -0,0 +1,38 @@ +"""add scoring_config table + +Revision ID: b1c2d3e4f5a6 +Revises: a7b8c9d0e1f2 +Create Date: 2026-03-15 +""" +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +revision = "b1c2d3e4f5a6" +down_revision = "a7b8c9d0e1f2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "scoring_config", + sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("name", sa.String(255), nullable=False), + sa.Column("formula", sa.String(50), nullable=False, server_default="linear"), + sa.Column("weights", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("description", sa.Text(), nullable=True), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), + ) + + +def downgrade() -> None: + op.drop_table("scoring_config") diff --git a/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py b/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py new file mode 100644 index 0000000..4a3a4c9 --- /dev/null +++ b/alembic/versions/c1d2e3f4a5b6_add_evidence_weights_to_scoring_config.py @@ -0,0 +1,49 @@ +"""Add evidence_weights column to scoring_config. + +Revision ID: c1d2e3f4a5b6 +Revises: 7c19ca08d5d4 +Create Date: 2026-03-16 + +Motivation +---------- +``ScoringConfig`` previously hard-coded the per-evidence-code quality +weights inside the Python scoring engine, making them invisible to users +and impossible to customise without a code change. 
+ +This migration adds an optional ``evidence_weights`` JSONB column that +stores per-code overrides at the config level. Existing rows receive +``NULL``, which is interpreted by the engine as "use system defaults" +(:data:`protea.infrastructure.orm.models.embedding.scoring_config.DEFAULT_EVIDENCE_WEIGHTS`). +The change is therefore fully backwards-compatible with all existing +``ScoringConfig`` rows. +""" +from __future__ import annotations + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +revision = "c1d2e3f4a5b6" +down_revision = "7c19ca08d5d4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "scoring_config", + sa.Column( + "evidence_weights", + postgresql.JSONB(astext_type=sa.Text()), + nullable=True, + comment=( + "Optional per-GO-evidence-code quality multipliers in [0, 1]. " + "NULL means use the system defaults defined in DEFAULT_EVIDENCE_WEIGHTS. " + "Partial dicts are allowed; absent codes fall back to the system table." 
+ ), + ), + ) + + +def downgrade() -> None: + op.drop_column("scoring_config", "evidence_weights") diff --git a/protea/infrastructure/orm/models/__init__.py b/protea/infrastructure/orm/models/__init__.py index 62d114e..cc0bff2 100644 --- a/protea/infrastructure/orm/models/__init__.py +++ b/protea/infrastructure/orm/models/__init__.py @@ -7,6 +7,7 @@ from .annotation.protein_go_annotation import ProteinGOAnnotation # noqa: F401 from .embedding.embedding_config import EmbeddingConfig # noqa: F401 from .embedding.go_prediction import GOPrediction # noqa: F401 +from .embedding.scoring_config import ScoringConfig # noqa: F401 from .embedding.prediction_set import PredictionSet # noqa: F401 from .embedding.sequence_embedding import SequenceEmbedding # noqa: F401 from .job import Job, JobEvent # noqa: F401 @@ -14,3 +15,4 @@ from .protein.protein_metadata import ProteinUniProtMetadata # noqa: F401 from .query.query_set import QuerySet, QuerySetEntry # noqa: F401 from .sequence.sequence import Sequence # noqa: F401 +from .support_entry import SupportEntry # noqa: F401 diff --git a/protea/infrastructure/orm/models/annotation/evaluation_result.py b/protea/infrastructure/orm/models/annotation/evaluation_result.py index e254875..9054e49 100644 --- a/protea/infrastructure/orm/models/annotation/evaluation_result.py +++ b/protea/infrastructure/orm/models/annotation/evaluation_result.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from protea.infrastructure.orm.models.annotation.evaluation_set import EvaluationSet from protea.infrastructure.orm.models.embedding.prediction_set import PredictionSet + from protea.infrastructure.orm.models.embedding.scoring_config import ScoringConfig from protea.infrastructure.orm.models.job import Job @@ -54,6 +55,12 @@ class EvaluationResult(Base): nullable=False, index=True, ) + scoring_config_id: Mapped[uuid.UUID | None] = mapped_column( + UUID(as_uuid=True), + ForeignKey("scoring_config.id", ondelete="SET NULL"), + nullable=True, + index=True, + ) job_id: 
Mapped[uuid.UUID | None] = mapped_column( UUID(as_uuid=True), ForeignKey("job.id", ondelete="SET NULL"), @@ -67,4 +74,5 @@ class EvaluationResult(Base): evaluation_set: Mapped[EvaluationSet] = relationship("EvaluationSet") prediction_set: Mapped[PredictionSet] = relationship("PredictionSet") + scoring_config: Mapped[ScoringConfig | None] = relationship("ScoringConfig") job: Mapped[Job | None] = relationship("Job") diff --git a/protea/infrastructure/orm/models/annotation/ontology_snapshot.py b/protea/infrastructure/orm/models/annotation/ontology_snapshot.py index dfea4c3..167e329 100644 --- a/protea/infrastructure/orm/models/annotation/ontology_snapshot.py +++ b/protea/infrastructure/orm/models/annotation/ontology_snapshot.py @@ -23,6 +23,16 @@ class OntologySnapshot(Base): file was downloaded, providing full provenance. Multiple ``AnnotationSet`` rows can reference the same snapshot when they were built against the same ontology release. + + ``ia_url`` optionally points to the Information Accretion (IA) TSV file + associated with this ontology release. IA files are published alongside + each CAFA benchmark (e.g. ``IA_cafa6.tsv``) and contain per-term + information-content weights that make cafaeval penalise predictions of + common, easy-to-predict terms less than rare, specific ones. When present, + ``run_cafa_evaluation`` downloads and passes this file to cafaeval + automatically — no manual path is required in the job payload. Set a new + ``ia_url`` on each future snapshot (CAFA7, etc.) to keep evaluations + comparable across benchmark generations. 
""" __tablename__ = "ontology_snapshot" @@ -32,6 +42,14 @@ class OntologySnapshot(Base): ) obo_url: Mapped[str] = mapped_column(String, nullable=False) obo_version: Mapped[str] = mapped_column(String, nullable=False, unique=True, index=True) + ia_url: Mapped[str | None] = mapped_column( + String, nullable=True, + comment=( + "URL of the Information Accretion TSV for this ontology release " + "(two columns: go_id, ia_value). Used by run_cafa_evaluation to " + "weight GO terms by information content. NULL means uniform IC=1." + ), + ) loaded_at: Mapped[datetime] = mapped_column( DateTime(timezone=True), nullable=False, server_default=func.now() ) diff --git a/protea/infrastructure/orm/models/embedding/scoring_config.py b/protea/infrastructure/orm/models/embedding/scoring_config.py new file mode 100644 index 0000000..4a45ff7 --- /dev/null +++ b/protea/infrastructure/orm/models/embedding/scoring_config.py @@ -0,0 +1,196 @@ +"""ORM model for ScoringConfig — reproducible scoring formulas for GOPrediction rows. + +A ScoringConfig stores two complementary layers of configuration: + +1. **Signal weights** (``weights`` field): a dict mapping each composite signal + (e.g. ``embedding_similarity``, ``identity_nw``) to a relative weight [0, 1]. + Missing signals — because the corresponding feature-engineering flag was not + enabled at prediction time — are automatically excluded from the denominator, + so the remaining active signals always produce a normalised [0, 1] score. + +2. **Evidence-code weights** (``evidence_weights`` field, optional JSONB): + a per-GO-evidence-code quality multiplier, also in [0, 1]. When ``None``, + :data:`DEFAULT_EVIDENCE_WEIGHTS` is used as the fallback. Supplying a + partial dict overrides only the codes present; codes absent from the dict + fall back to the system default, making partial overrides safe. 
+ +This two-layer design separates *how much a signal matters* (signal weights) +from *how trustworthy the underlying annotation is* (evidence weights), which +are independent research decisions. + +Formulas +-------- +linear + score = Σ(w_i · s_i) / Σ(w_i) for all active (w_i > 0, s_i available) signals. +evidence_weighted + Same as linear, but the resolved evidence weight is always applied as a + final multiplier on top of the weighted average — even when the + ``evidence_weight`` signal weight is 0. This allows down-ranking IEA-sourced + predictions regardless of how strong the embedding or alignment signals are. +""" +from __future__ import annotations + +import uuid +from datetime import datetime +from typing import Any + +from sqlalchemy import DateTime, String, Text +from sqlalchemy.dialects.postgresql import JSONB, UUID +from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy.sql import func + +from protea.infrastructure.orm.base import Base + +# --------------------------------------------------------------------------- +# Formula identifiers +# --------------------------------------------------------------------------- + +FORMULA_LINEAR = "linear" +FORMULA_EVIDENCE_WEIGHTED = "evidence_weighted" +VALID_FORMULAS = (FORMULA_LINEAR, FORMULA_EVIDENCE_WEIGHTED) + +# --------------------------------------------------------------------------- +# Signal weight defaults +# --------------------------------------------------------------------------- +# Pure embedding similarity by default; all other signals must be opted in. + +DEFAULT_WEIGHTS: dict[str, float] = { + "embedding_similarity": 1.0, + "identity_nw": 0.0, + "identity_sw": 0.0, + "evidence_weight": 0.0, + "taxonomic_proximity": 0.0, +} + +# --------------------------------------------------------------------------- +# Evidence-code quality weights (default table) +# --------------------------------------------------------------------------- +# These defaults reflect the GO Annotation quality hierarchy.
A ScoringConfig +# may store a full or partial override in its ``evidence_weights`` column. +# +# Sources: +# GO evidence code definitions — https://geneontology.org/docs/guide-go-evidence-codes/ +# CAFA community consensus: experimental codes provide the highest confidence. +# +# Default tier mapping: +# Experimental (EXP, IDA, IPI, IMP, IGI, IEP, HTP, HDA, HMP, HGI, HEP, +# IC, TAS) → 1.0 +# Computational / Phylogenetic (ISS, ISO, ISA, ISM, IGC, IBA, +# IBD, IKR, IRD, RCA) → 0.7 +# Non-traceable author statement (NAS) → 0.5 +# Electronic annotation (IEA) → 0.3 +# No biological data (ND) → 0.1 + +DEFAULT_EVIDENCE_WEIGHTS: dict[str, float] = { + # Experimental — direct biological evidence + "EXP": 1.0, # Inferred from Experiment + "IDA": 1.0, # Inferred from Direct Assay + "IPI": 1.0, # Inferred from Physical Interaction + "IMP": 1.0, # Inferred from Mutant Phenotype + "IGI": 1.0, # Inferred from Genetic Interaction + "IEP": 1.0, # Inferred from Expression Pattern + "HTP": 1.0, # High-Throughput experiment (umbrella) + "HDA": 1.0, # High-Throughput Direct Assay + "HMP": 1.0, # High-Throughput Mutant Phenotype + "HGI": 1.0, # High-Throughput Genetic Interaction + "HEP": 1.0, # High-Throughput Expression Pattern + "IC": 1.0, # Inferred by Curator + "TAS": 1.0, # Traceable Author Statement + # Computational / Phylogenetic — derived from sequence or phylogeny + "ISS": 0.7, # Inferred from Sequence or Structural Similarity + "ISO": 0.7, # Inferred from Sequence Orthology + "ISA": 0.7, # Inferred from Sequence Alignment + "ISM": 0.7, # Inferred from Sequence Model + "IGC": 0.7, # Inferred from Genomic Context + "IBA": 0.7, # Inferred from Biological aspect of Ancestor + "IBD": 0.7, # Inferred from Biological aspect of Descendant + "IKR": 0.7, # Inferred from Key Residues + "IRD": 0.7, # Inferred from Rapid Divergence + "RCA": 0.7, # Inferred from Reviewed Computational Analysis + # Electronic / author statement — lowest-effort annotation + "NAS": 0.5, # Non-traceable 
Author Statement + "IEA": 0.3, # Inferred from Electronic Annotation (automated, bulk) + # No biological data — used only as a placeholder + "ND": 0.1, # No biological Data available +} + +#: Ordered grouping of evidence codes used for UI rendering and documentation. +#: Preserves the biological meaning of each tier. +EVIDENCE_CODE_GROUPS: dict[str, list[str]] = { + "Experimental": [ + "EXP", "IDA", "IPI", "IMP", "IGI", "IEP", + "HTP", "HDA", "HMP", "HGI", "HEP", "IC", "TAS", + ], + "Computational / Phylogenetic": [ + "ISS", "ISO", "ISA", "ISM", "IGC", + "IBA", "IBD", "IKR", "IRD", "RCA", + ], + "Electronic": ["NAS", "IEA"], + "No data": ["ND"], +} + +#: Fallback weight applied when a code is not found in any lookup table. +DEFAULT_EVIDENCE_WEIGHT_FALLBACK: float = 0.5 + + +# --------------------------------------------------------------------------- +# ORM model +# --------------------------------------------------------------------------- + +class ScoringConfig(Base): + """Persistent scoring formula definition. + + Instances are stored in the ``scoring_config`` table and referenced by + evaluation endpoints and the UI scoring selector. Every field that + influences score computation is serialised, making any result fully + reproducible by re-applying the same ``ScoringConfig`` to the raw + ``GOPrediction`` rows. + + Attributes + ---------- + id: + UUID primary key. + name: + Human-readable label shown in the UI dropdown. + formula: + One of :data:`VALID_FORMULAS` — controls how the weighted average is + combined with the evidence multiplier. + weights: + JSONB dict mapping signal keys to their relative weights. Valid keys + are the ones in :data:`DEFAULT_WEIGHTS`. Weights of 0 deactivate a + signal; absent keys are treated as 0. + evidence_weights: + Optional JSONB dict mapping GO evidence codes (e.g. ``"IEA"``) to + per-code quality multipliers in [0, 1]. When ``None`` the system falls + back to :data:`DEFAULT_EVIDENCE_WEIGHTS`. 
Partial dicts are allowed: + codes absent from the override still resolve via the default table. + description: + Free-text description shown as a tooltip in the UI. + created_at: + UTC timestamp set by the database at insert time. + """ + + __tablename__ = "scoring_config" + + id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) + name: Mapped[str] = mapped_column(String(255), nullable=False) + formula: Mapped[str] = mapped_column( + String(50), nullable=False, default=FORMULA_LINEAR + ) + weights: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False) + evidence_weights: Mapped[dict[str, Any] | None] = mapped_column( + JSONB, nullable=True + ) + description: Mapped[str | None] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + + def __repr__(self) -> str: + return ( + f"" + ) diff --git a/protea/infrastructure/orm/models/support_entry.py b/protea/infrastructure/orm/models/support_entry.py new file mode 100644 index 0000000..bad46f6 --- /dev/null +++ b/protea/infrastructure/orm/models/support_entry.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +import uuid +from datetime import datetime + +from sqlalchemy import DateTime, Text +from sqlalchemy.dialects.postgresql import UUID +from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy.sql import func + +from protea.infrastructure.orm.base import Base + + +class SupportEntry(Base): + """A thumbs-up + optional comment submitted by a visitor.""" + + __tablename__ = "support_entry" + + id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) + comment: Mapped[str | None] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), nullable=False, server_default=func.now() + ) From d192b3dced88abc98736a736a4dbf2a45c64ac38 Mon Sep 17 00:00:00 
2001 From: frapercan Date: Mon, 16 Mar 2026 21:55:04 +0100 Subject: [PATCH 06/17] feat(core): add scoring engine and enhance CAFA evaluation pipeline - scoring.py: configurable multi-signal scoring engine combining embedding similarity, NW/SW alignment identity, taxonomic proximity and per-evidence-code quality weights; supports linear (weighted-average) and evidence_weighted formulas - metrics.py: shared metric computation utilities - evaluation.py: propagate scoring_config through evaluation runs, store per-namespace Fmax/precision/recall/coverage in results JSONB - run_cafa_evaluation: download IA file from OntologySnapshot.ia_url, apply scoring config to generate CAFA-format prediction scores - predict_go_terms: wire scoring_config_id into prediction batch payload Co-Authored-By: Claude Sonnet 4.6 --- protea/core/evaluation.py | 198 ++++--- protea/core/metrics.py | 149 +++++ protea/core/operations/predict_go_terms.py | 547 +++++++++++++++--- protea/core/operations/run_cafa_evaluation.py | 133 ++++- protea/core/scoring.py | 193 ++++++ 5 files changed, 1063 insertions(+), 157 deletions(-) create mode 100644 protea/core/metrics.py create mode 100644 protea/core/scoring.py diff --git a/protea/core/evaluation.py b/protea/core/evaluation.py index b19ae75..28cc1ab 100644 --- a/protea/core/evaluation.py +++ b/protea/core/evaluation.py @@ -1,16 +1,26 @@ """CAFA-style evaluation data computation. This module computes the ground-truth delta between two AnnotationSets -(old → new) following the CAFA evaluation protocol: +(old → new) following the official CAFA5 evaluation protocol: 1. Experimental evidence codes only (EXP, IDA, IMP, …) 2. NOT-qualifier annotations are excluded — including their GO descendants propagated transitively through the is_a / part_of DAG. - 3. Delta proteins = proteins that gained ≥ 1 new (protein, go_term) pair. - 4. NK = delta proteins with ZERO experimental annotations in OLD. - 5. LK = delta proteins with ≥ 1 experimental annotation in OLD. - 6.
PK = same annotation set as LK; use with known-terms for evaluation. - 7. known-terms = ALL experimental annotations from OLD (not delta-filtered). + 3. Classification is per (protein, namespace), not globally per protein: + + NK — protein had NO experimental annotations in ANY namespace at t0. + All novel terms across all namespaces are ground truth. + + LK — protein had annotations in SOME namespaces at t0, but NOT in + namespace S. Novel terms in S are ground truth for LK. + + PK — protein had annotations in namespace S at t0 AND gained new terms + in S at t1. Novel terms in S are ground truth for PK; old terms + in S are the ``-known`` file for the CAFA evaluator. + + Note: the same protein can be LK in one namespace and PK in another + simultaneously (e.g. had MFO+BPO at t0, gains CCO → LK in CCO, gains new + BPO → PK in BPO). Output format (matching CAFA evaluator): 2-column TSV, no header. protein_accession \\t go_id @@ -34,6 +44,8 @@ EXPERIMENTAL | {eco for eco, go in ECO_TO_CODE.items() if go in EXPERIMENTAL} ) +_NAMESPACES = ("F", "P", "C") + # --------------------------------------------------------------------------- # Data structures @@ -46,8 +58,11 @@ class EvaluationData: # {protein_accession: {go_id}} — delta annotations per category nk: dict[str, set[str]] = field(default_factory=dict) lk: dict[str, set[str]] = field(default_factory=dict) - # known-terms: ALL experimental annotations from OLD (all proteins) + pk: dict[str, set[str]] = field(default_factory=dict) + # known-terms: ALL experimental annotations from OLD (for reference download) known: dict[str, set[str]] = field(default_factory=dict) + # pk_known: old terms in PK namespaces only — passed as -known to cafaeval + pk_known: dict[str, set[str]] = field(default_factory=dict) @property def nk_proteins(self) -> int: @@ -57,6 +72,10 @@ def nk_proteins(self) -> int: def lk_proteins(self) -> int: return len(self.lk) + @property + def pk_proteins(self) -> int: + return len(self.pk) + @property 
def nk_annotations(self) -> int: return sum(len(v) for v in self.nk.values()) @@ -65,21 +84,27 @@ def nk_annotations(self) -> int: def lk_annotations(self) -> int: return sum(len(v) for v in self.lk.values()) + @property + def pk_annotations(self) -> int: + return sum(len(v) for v in self.pk.values()) + @property def known_terms_count(self) -> int: return sum(len(v) for v in self.known.values()) @property def delta_proteins(self) -> int: - return self.nk_proteins + self.lk_proteins + return len(set(self.nk) | set(self.lk) | set(self.pk)) def stats(self) -> dict: return { "delta_proteins": self.delta_proteins, "nk_proteins": self.nk_proteins, "lk_proteins": self.lk_proteins, + "pk_proteins": self.pk_proteins, "nk_annotations": self.nk_annotations, "lk_annotations": self.lk_annotations, + "pk_annotations": self.pk_annotations, "known_terms_count": self.known_terms_count, } @@ -120,33 +145,20 @@ def _get_descendants(term_id: int, children_map: dict[int, set[int]]) -> set[int return visited -def _load_experimental_annotations( - session: Session, - annotation_set_id: uuid.UUID, - negative_keys: set[tuple[str, int]], - go_id_map: dict[int, str], -) -> dict[str, set[str]]: - """Load all experimental, non-negated annotations from an annotation set. +def _load_go_maps( + session: Session, snapshot_id: uuid.UUID +) -> tuple[dict[int, str], dict[int, str]]: + """Load {go_term.id: go_id} and {go_term.id: aspect} for the snapshot. - Returns {protein_accession: {go_id}}. - negative_keys contains (protein_accession, go_term_db_id) pairs to exclude. + aspect is 'F' (molecular function), 'P' (biological process), or + 'C' (cellular component). 
""" rows = session.execute(text(""" - SELECT pga.protein_accession, pga.go_term_id - FROM protein_go_annotation pga - WHERE pga.annotation_set_id = :set_id - AND pga.evidence_code = ANY(:exp_codes) - AND (pga.qualifier IS NULL OR pga.qualifier NOT LIKE '%NOT%') - """), {"set_id": annotation_set_id, "exp_codes": _EXP_CODES}).fetchall() - - result: dict[str, set[str]] = defaultdict(set) - for protein_accession, go_term_id in rows: - if (protein_accession, go_term_id) in negative_keys: - continue - go_id = go_id_map.get(go_term_id) - if go_id: - result[protein_accession].add(go_id) - return dict(result) + SELECT id, go_id, aspect FROM go_term WHERE ontology_snapshot_id = :snap_id + """), {"snap_id": snapshot_id}).fetchall() + id_map = {db_id: go_id for db_id, go_id, _ in rows} + aspect_map = {db_id: aspect for db_id, _, aspect in rows if aspect} + return id_map, aspect_map def _build_negative_keys( @@ -166,7 +178,6 @@ def _build_negative_keys( AND qualifier LIKE '%NOT%' """), {"set_ids": set_ids}).fetchall() - # Group negated terms by protein, expand to descendants negated_by_protein: dict[str, set[int]] = defaultdict(set) for protein_accession, go_term_id in not_rows: negated_by_protein[protein_accession].add(go_term_id) @@ -182,12 +193,35 @@ def _build_negative_keys( return negative_keys -def _load_go_id_map(session: Session, snapshot_id: uuid.UUID) -> dict[int, str]: - """Load {go_term.id: go_term.go_id} for the snapshot.""" +def _load_experimental_annotations_by_ns( + session: Session, + annotation_set_id: uuid.UUID, + negative_keys: set[tuple[str, int]], + go_id_map: dict[int, str], + aspect_map: dict[int, str], +) -> dict[str, dict[str, set[str]]]: + """Load experimental, non-negated annotations grouped by namespace. + + Returns {protein_accession: {aspect: {go_id}}} where aspect ∈ {'F', 'P', 'C'}. + Terms without a known aspect are silently dropped. 
+ """ rows = session.execute(text(""" - SELECT id, go_id FROM go_term WHERE ontology_snapshot_id = :snap_id - """), {"snap_id": snapshot_id}).fetchall() - return {db_id: go_id for db_id, go_id in rows} + SELECT pga.protein_accession, pga.go_term_id + FROM protein_go_annotation pga + WHERE pga.annotation_set_id = :set_id + AND pga.evidence_code = ANY(:exp_codes) + AND (pga.qualifier IS NULL OR pga.qualifier NOT LIKE '%NOT%') + """), {"set_id": annotation_set_id, "exp_codes": _EXP_CODES}).fetchall() + + result: dict[str, dict[str, set[str]]] = defaultdict(lambda: defaultdict(set)) + for protein_accession, go_term_id in rows: + if (protein_accession, go_term_id) in negative_keys: + continue + go_id = go_id_map.get(go_term_id) + aspect = aspect_map.get(go_term_id) + if go_id and aspect: + result[protein_accession][aspect].add(go_id) + return {p: dict(ns_terms) for p, ns_terms in result.items()} # --------------------------------------------------------------------------- @@ -200,20 +234,20 @@ def compute_evaluation_data( new_annotation_set_id: uuid.UUID, ontology_snapshot_id: uuid.UUID, ) -> EvaluationData: - """Compute NK/LK ground truth and known-terms from two annotation sets. - - This is the main entry point used both by the generation operation - (to persist stats) and by the download endpoints (to stream TSV data). - - Steps: - 1. Load GO DAG children map for NOT propagation. - 2. Build negative_keys from NOT annotations in both sets. - 3. Load experimental annotations from OLD and NEW (excluding negatives). - 4. Compute delta = new - old per protein. - 5. Classify delta proteins into NK / LK. - 6. Collect known-terms from OLD (all proteins, no delta filter). + """Compute NK/LK/PK ground truth following the CAFA5 protocol. + + Classification is per (protein, namespace): + + NK — protein had no experimental annotations in any namespace at t0. 
+ LK — protein had annotations in some namespaces at t0, but not in + namespace S; gained new terms in S → those terms are LK ground truth. + PK — protein had annotations in namespace S at t0 and gained new terms + in S → those novel terms are PK ground truth; old terms in S are + stored in ``pk_known`` for the cafaeval ``-known`` flag. + + The same protein can be simultaneously LK in one namespace and PK in another. """ - go_id_map = _load_go_id_map(session, ontology_snapshot_id) + go_id_map, aspect_map = _load_go_maps(session, ontology_snapshot_id) children_map = _load_children_map(session, ontology_snapshot_id) negative_keys = _build_negative_keys( @@ -222,31 +256,59 @@ def compute_evaluation_data( children_map, ) - old_terms = _load_experimental_annotations( - session, old_annotation_set_id, negative_keys, go_id_map + old_by_ns = _load_experimental_annotations_by_ns( + session, old_annotation_set_id, negative_keys, go_id_map, aspect_map ) - new_terms = _load_experimental_annotations( - session, new_annotation_set_id, negative_keys, go_id_map + new_by_ns = _load_experimental_annotations_by_ns( + session, new_annotation_set_id, negative_keys, go_id_map, aspect_map ) nk: dict[str, set[str]] = {} - lk: dict[str, set[str]] = {} + lk: dict[str, set[str]] = defaultdict(set) + pk: dict[str, set[str]] = defaultdict(set) + pk_known: dict[str, set[str]] = defaultdict(set) - all_proteins = set(old_terms) | set(new_terms) + all_proteins = set(old_by_ns) | set(new_by_ns) for protein in all_proteins: - old_set = old_terms.get(protein, set()) - new_set = new_terms.get(protein, set()) + old_ns_map = old_by_ns.get(protein, {}) + new_ns_map = new_by_ns.get(protein, {}) - if not new_set: + new_all = {go for terms in new_ns_map.values() for go in terms} + if not new_all: continue - novel = new_set - old_set - if not novel: - continue + had_anything_old = bool(old_ns_map) - if not old_set: - nk[protein] = novel + if not had_anything_old: + # NK: no experimental annotations anywhere 
at t0. + # Novel = all new terms (nothing to subtract). + nk[protein] = new_all else: - lk[protein] = novel - - return EvaluationData(nk=nk, lk=lk, known=old_terms) + # Classify per namespace. + for ns in _NAMESPACES: + old_ns = old_ns_map.get(ns, set()) + new_ns = new_ns_map.get(ns, set()) + delta_ns = new_ns - old_ns + if not delta_ns: + continue + if not old_ns: + # LK: protein had nothing in this namespace at t0. + lk[protein] |= delta_ns + else: + # PK: protein had annotations in this namespace at t0. + pk[protein] |= delta_ns + pk_known[protein] |= old_ns + + # known = all old experimental annotations flattened (for reference download) + known = { + p: {go for terms in ns_map.values() for go in terms} + for p, ns_map in old_by_ns.items() + } + + return EvaluationData( + nk=nk, + lk=dict(lk), + pk=dict(pk), + pk_known=dict(pk_known), + known=known, + ) diff --git a/protea/core/metrics.py b/protea/core/metrics.py new file mode 100644 index 0000000..0e3cada --- /dev/null +++ b/protea/core/metrics.py @@ -0,0 +1,149 @@ +"""CAFA-style precision-recall metrics for GO term prediction evaluation. + +Takes scored GOPrediction rows and EvaluationData (ground truth) and computes +Fmax, AUC-PR, and the full precision-recall curve following the CAFA protocol. + +CAFA protocol summary +--------------------- +- Evaluate only on proteins present in the ground truth (NK or LK). +- At each score threshold t: + precision(t) = mean over proteins-with-predictions of |pred ∩ true| / |pred| + recall(t) = mean over ALL ground-truth proteins of |pred ∩ true| / |true| +- Fmax = max_t(2 * P(t) * R(t) / (P(t) + R(t))) +- AUC-PR via trapezoidal integration of the PR curve. + +Note: This implementation uses exact GO term matching (no DAG propagation). +Ancestor propagation is intentionally left for a future iteration. 
+""" +from __future__ import annotations + +import numpy as np +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any + +from protea.core.evaluation import EvaluationData + +_N_THRESHOLDS = 101 # sweep [0.0, 0.01, …, 1.0] + + +@dataclass +class PRPoint: + threshold: float + precision: float + recall: float + f1: float + + +@dataclass +class CAFAMetrics: + """CAFA evaluation results for one (PredictionSet, ScoringConfig, category) triple.""" + + category: str # "nk" or "lk" + fmax: float + threshold_at_fmax: float + auc_pr: float + n_ground_truth_proteins: int # proteins in the chosen NK/LK category + n_predicted_proteins: int # proteins that received at least 1 prediction + n_predictions: int # total scored predictions passed in + curve: list[PRPoint] = field(default_factory=list) + + def summary(self) -> dict[str, Any]: + return { + "category": self.category, + "fmax": self.fmax, + "threshold_at_fmax": self.threshold_at_fmax, + "auc_pr": self.auc_pr, + "n_ground_truth_proteins": self.n_ground_truth_proteins, + "n_predicted_proteins": self.n_predicted_proteins, + "n_predictions": self.n_predictions, + } + + +def compute_cafa_metrics( + scored_predictions: list[dict[str, Any]], + evaluation_data: EvaluationData, + category: str = "nk", +) -> CAFAMetrics: + """Compute CAFA Fmax and PR curve. + + Parameters + ---------- + scored_predictions: + List of dicts, each must have: + - ``protein_accession`` (str) + - ``go_id`` (str, e.g. "GO:0005488") + - ``score`` (float in [0, 1]) + evaluation_data: + Ground truth from ``compute_evaluation_data()``. + category: + ``"nk"`` (no-knowledge) or ``"lk"`` (limited-knowledge). 
+ + Returns + ------- + CAFAMetrics + """ + if category not in ("nk", "lk"): + raise ValueError(f"category must be 'nk' or 'lk', got {category!r}") + + ground_truth: dict[str, set[str]] = ( + evaluation_data.nk if category == "nk" else evaluation_data.lk + ) + + # Group predictions by protein, keep only proteins in ground truth + preds_by_protein: dict[str, list[tuple[float, str]]] = defaultdict(list) + for p in scored_predictions: + acc = p["protein_accession"] + if acc in ground_truth: + preds_by_protein[acc].append((float(p["score"]), str(p["go_id"]))) + + n_gt = len(ground_truth) + n_predicted = len(preds_by_protein) + + thresholds = np.linspace(0.0, 1.0, _N_THRESHOLDS) + curve: list[PRPoint] = [] + best_f = 0.0 + best_t = 0.0 + + for t in thresholds: + t = float(t) + tp_sum = 0 + pred_sum = 0 + rc_num = 0 + n_with_preds = 0 + + for acc, true_terms in ground_truth.items(): + predicted = {go for score, go in preds_by_protein.get(acc, []) if score >= t} + tp = len(predicted & true_terms) + rc_num += tp + if predicted: + n_with_preds += 1 + tp_sum += tp + pred_sum += len(predicted) + + pr = (tp_sum / pred_sum) if pred_sum > 0 else 0.0 + rc = (rc_num / sum(len(v) for v in ground_truth.values())) if n_gt > 0 else 0.0 + f1 = (2 * pr * rc / (pr + rc)) if (pr + rc) > 0 else 0.0 + + curve.append(PRPoint(threshold=round(t, 4), precision=round(pr, 6), + recall=round(rc, 6), f1=round(f1, 6))) + + if f1 > best_f: + best_f = f1 + best_t = t + + # AUC-PR: trapezoidal integration (recall on x-axis, precision on y-axis) + recalls = [p.recall for p in curve] + precisions = [p.precision for p in curve] + auc = float(abs(np.trapz(precisions, recalls))) + + return CAFAMetrics( + category=category, + fmax=round(best_f, 4), + threshold_at_fmax=round(best_t, 4), + auc_pr=round(auc, 4), + n_ground_truth_proteins=n_gt, + n_predicted_proteins=n_predicted, + n_predictions=len(scored_predictions), + curve=curve, + ) diff --git a/protea/core/operations/predict_go_terms.py 
b/protea/core/operations/predict_go_terms.py index 6914f51..104464b 100644 --- a/protea/core/operations/predict_go_terms.py +++ b/protea/core/operations/predict_go_terms.py @@ -1,7 +1,9 @@ from __future__ import annotations +import os import time import uuid +from pathlib import Path from typing import Annotated, Any from uuid import UUID @@ -16,6 +18,7 @@ from protea.core.knn_search import search_knn from protea.core.utils import utcnow from protea.infrastructure.orm.models.annotation.annotation_set import AnnotationSet +from protea.infrastructure.orm.models.annotation.go_term import GOTerm from protea.infrastructure.orm.models.annotation.ontology_snapshot import OntologySnapshot from protea.infrastructure.orm.models.annotation.protein_go_annotation import ProteinGOAnnotation from protea.infrastructure.orm.models.embedding.embedding_config import EmbeddingConfig @@ -32,6 +35,13 @@ _ANNOTATION_CHUNK_SIZE = 10_000 _BATCH_QUEUE = "protea.predictions.batch" _WRITE_QUEUE = "protea.predictions.write" +# Rows fetched per round-trip when streaming reference embeddings from PostgreSQL. +# At 1280 dims × 2 bytes (float16) × 2000 rows = ~5 MB per chunk — keeps Python +# object pressure negligible while amortising cursor round-trips. +_STREAM_CHUNK_SIZE = 2_000 + +# GO aspect single-character codes used in GOTerm.aspect +_ASPECTS = ("P", "F", "C") # biological_process, molecular_function, cellular_component # --------------------------------------------------------------------------- # Process-level reference cache @@ -46,6 +56,65 @@ _REF_CACHE: dict[tuple[str, str], dict[str, Any]] = {} _REF_CACHE_MAX = 1 +# --------------------------------------------------------------------------- +# Disk cache for reference embeddings +# Survives worker restarts — avoids re-fetching GB of vectors from PostgreSQL. 
+# Files: {cache_dir}/{emb_config_id}__{ann_set_id}_embeddings.npy +# {cache_dir}/{emb_config_id}__{ann_set_id}_accessions.npy +# Invalidation: annotation sets are immutable once loaded, so the cache is +# valid as long as the file exists. Delete files manually to force a reload. +# --------------------------------------------------------------------------- +_DISK_CACHE_DIR = Path(os.environ.get("PROTEA_REF_CACHE_DIR", "data/ref_cache")) + + +def _disk_cache_paths( + embedding_config_id: uuid.UUID, + annotation_set_id: uuid.UUID, +) -> tuple[Path, Path]: + """Return (embeddings_path, accessions_path) for the unified reference cache.""" + key = f"{embedding_config_id}__{annotation_set_id}" + return ( + _DISK_CACHE_DIR / f"{key}_embeddings.npy", + _DISK_CACHE_DIR / f"{key}_accessions.npy", + ) + + +def _aspect_index_path( + embedding_config_id: uuid.UUID, + annotation_set_id: uuid.UUID, + aspect: str, +) -> Path: + """Return the path for the per-aspect index array (int32 indices into the unified cache).""" + key = f"{embedding_config_id}__{annotation_set_id}" + return _DISK_CACHE_DIR / f"{key}__{aspect}_indices.npy" + + +def _load_from_disk_cache( + embedding_config_id: uuid.UUID, + annotation_set_id: uuid.UUID, +) -> dict[str, Any] | None: + emb_path, acc_path = _disk_cache_paths(embedding_config_id, annotation_set_id) + if not emb_path.exists() or not acc_path.exists(): + return None + try: + embeddings = np.load(emb_path) + accessions = list(np.load(acc_path)) + return {"accessions": accessions, "embeddings": embeddings} + except Exception: + return None + + +def _save_to_disk_cache( + embedding_config_id: uuid.UUID, + annotation_set_id: uuid.UUID, + accessions: list[str], + embeddings: np.ndarray, +) -> None: + emb_path, acc_path = _disk_cache_paths(embedding_config_id, annotation_set_id) + emb_path.parent.mkdir(parents=True, exist_ok=True) + np.save(emb_path, embeddings) + np.save(acc_path, np.array(accessions)) + # 
--------------------------------------------------------------------------- # Payloads @@ -76,6 +145,16 @@ class PredictGOTermsPayload(ProteaPayload, frozen=True): compute_alignments: bool = False compute_taxonomy: bool = False + # Per-aspect KNN indices (opt-in) + # When True, three separate KNN indices are built — one per GO aspect (P/F/C). + # Each index contains only reference proteins annotated in that aspect, and only + # annotations of that aspect are transferred from matched neighbors. + # This guarantees that every query protein receives BPO, MFO, and CCO candidates + # even if its nearest neighbors in a unified index happen to be annotated only in + # one or two aspects (a common cause of BPO recall ceilings). + # Memory cost: 3× the reference embedding array; search time: 3 KNN calls per batch. + aspect_separated_knn: bool = True + @field_validator("embedding_config_id", "annotation_set_id", "ontology_snapshot_id", mode="before") @classmethod def must_be_non_empty(cls, v: str) -> str: @@ -104,6 +183,7 @@ class PredictGOTermsBatchPayload(ProteaPayload, frozen=True): faiss_hnsw_ef_search: int = 64 compute_alignments: bool = False compute_taxonomy: bool = False + aspect_separated_knn: bool = True class StorePredictionsPayload(ProteaPayload, frozen=True): @@ -213,6 +293,7 @@ def execute( "faiss_hnsw_ef_search": p.faiss_hnsw_ef_search, "compute_alignments": p.compute_alignments, "compute_taxonomy": p.compute_taxonomy, + "aspect_separated_knn": p.aspect_separated_knn, }, })) @@ -304,8 +385,10 @@ def execute( {"parent_job_id": str(parent_job_id)}, "warning") return OperationResult(result={"skipped": True}) - # --- reference cache (load once per process per config+annotation_set) --- - cache_key = (p.embedding_config_id, p.annotation_set_id) + # --- reference cache (load once per process per config+annotation_set+mode) --- + # The cache key includes aspect_separated_knn so that switching modes on the + # same worker process does not serve stale data from a 
previous run. + cache_key = (p.embedding_config_id, p.annotation_set_id, p.aspect_separated_knn) if cache_key not in _REF_CACHE: # Evict oldest entry when cache is full to free numpy arrays from memory. if len(_REF_CACHE) >= _REF_CACHE_MAX: @@ -314,16 +397,16 @@ def execute( emit("predict_go_terms_batch.loading_reference", None, { "embedding_config_id": p.embedding_config_id, "annotation_set_id": p.annotation_set_id, + "aspect_separated_knn": p.aspect_separated_knn, }, "info") - _REF_CACHE[cache_key] = self._load_reference_data( - session, embedding_config_id, annotation_set_id, emit - ) - - ref_data = _REF_CACHE[cache_key] - - if not ref_data["embeddings"].size: - emit("predict_go_terms_batch.no_references", None, {}, "warning") - return OperationResult(result={"predictions": 0}) + if p.aspect_separated_knn: + _REF_CACHE[cache_key] = self._load_reference_data_per_aspect( + session, embedding_config_id, annotation_set_id, emit + ) + else: + _REF_CACHE[cache_key] = self._load_reference_data( + session, embedding_config_id, annotation_set_id, emit + ) # --- query embeddings for this batch --- query_embeddings, valid_accessions = self._load_query_embeddings( @@ -332,59 +415,71 @@ def execute( if not query_embeddings.size: return OperationResult(result={"predictions": 0}) - # --- KNN: convert float16 cache → float32 for search --- t0 = time.perf_counter() - ref_embeddings_f32 = ref_data["embeddings"].astype(np.float32) - neighbors = search_knn( - query_embeddings, - ref_embeddings_f32, - ref_data["accessions"], - k=p.limit_per_entry, - distance_threshold=p.distance_threshold, - backend=p.search_backend, - metric=p.metric, - faiss_index_type=p.faiss_index_type, - faiss_nlist=p.faiss_nlist, - faiss_nprobe=p.faiss_nprobe, - faiss_hnsw_m=p.faiss_hnsw_m, - faiss_hnsw_ef_search=p.faiss_hnsw_ef_search, - ) - # --- lazy GO annotation load: only for neighbors actually found --- - unique_neighbors: set[str] = set() - for top_refs in neighbors: - for ref_acc, _ in top_refs: - 
unique_neighbors.add(ref_acc) - go_map = self._load_annotations_for(session, annotation_set_id, unique_neighbors) - - # --- feature engineering sequences / taxonomy (opt-in) --- - ref_sequences: dict[str, str] = {} - query_sequences: dict[str, str] = {} - ref_tax_ids: dict[str, int | None] = {} - query_tax_ids: dict[str, int | None] = {} + if p.aspect_separated_knn: + prediction_dicts = self._run_aspect_separated_knn( + session, valid_accessions, query_embeddings, + _REF_CACHE[cache_key], annotation_set_id, prediction_set_id, p, + ) + else: + ref_data = _REF_CACHE[cache_key] + if not ref_data["embeddings"].size: + emit("predict_go_terms_batch.no_references", None, {}, "warning") + return OperationResult(result={"predictions": 0}) - if p.compute_alignments: - ref_sequences = self._load_sequences_for_proteins(session, unique_neighbors) - query_sequences = self._load_sequences_for_queries(session, p, valid_accessions) + # --- KNN: convert float16 cache → float32 for search --- + ref_embeddings_f32 = ref_data["embeddings"].astype(np.float32) + neighbors = search_knn( + query_embeddings, + ref_embeddings_f32, + ref_data["accessions"], + k=p.limit_per_entry, + distance_threshold=p.distance_threshold, + backend=p.search_backend, + metric=p.metric, + faiss_index_type=p.faiss_index_type, + faiss_nlist=p.faiss_nlist, + faiss_nprobe=p.faiss_nprobe, + faiss_hnsw_m=p.faiss_hnsw_m, + faiss_hnsw_ef_search=p.faiss_hnsw_ef_search, + ) - if p.compute_taxonomy: - ref_tax_ids = self._load_taxonomy_ids_for_proteins(session, unique_neighbors) - query_tax_ids = self._load_taxonomy_ids_for_queries(session, p, valid_accessions) + # --- lazy GO annotation load: only for neighbors actually found --- + unique_neighbors: set[str] = set() + for top_refs in neighbors: + for ref_acc, _ in top_refs: + unique_neighbors.add(ref_acc) + go_map = self._load_annotations_for(session, annotation_set_id, unique_neighbors) + + # --- feature engineering sequences / taxonomy (opt-in) --- + ref_sequences: 
dict[str, str] = {} + query_sequences: dict[str, str] = {} + ref_tax_ids: dict[str, int | None] = {} + query_tax_ids: dict[str, int | None] = {} + + if p.compute_alignments: + ref_sequences = self._load_sequences_for_proteins(session, unique_neighbors) + query_sequences = self._load_sequences_for_queries(session, p, valid_accessions) + + if p.compute_taxonomy: + ref_tax_ids = self._load_taxonomy_ids_for_proteins(session, unique_neighbors) + query_tax_ids = self._load_taxonomy_ids_for_queries(session, p, valid_accessions) + + ref_data_with_annotations = { + "accessions": ref_data["accessions"], + "embeddings": ref_embeddings_f32, + "go_map": go_map, + } + prediction_dicts = self._predict_batch( + valid_accessions, query_embeddings, ref_data_with_annotations, prediction_set_id, p, + neighbors=neighbors, + ref_sequences=ref_sequences, + query_sequences=query_sequences, + ref_tax_ids=ref_tax_ids, + query_tax_ids=query_tax_ids, + ) - # --- assemble ref_data with lazily-loaded annotations for _predict_batch --- - ref_data_with_annotations = { - "accessions": ref_data["accessions"], - "embeddings": ref_embeddings_f32, - "go_map": go_map, - } - prediction_dicts = self._predict_batch( - valid_accessions, query_embeddings, ref_data_with_annotations, prediction_set_id, p, - neighbors=neighbors, - ref_sequences=ref_sequences, - query_sequences=query_sequences, - ref_tax_ids=ref_tax_ids, - query_tax_ids=query_tax_ids, - ) elapsed = time.perf_counter() - t0 emit("predict_go_terms_batch.done", None, { @@ -417,6 +512,9 @@ def _load_reference_data( ) -> dict[str, Any]: """Load reference accessions and embeddings (float16) into the process cache. + Checks the disk cache first (survives worker restarts). On miss, fetches + from PostgreSQL and writes the result to disk for future restarts. + GO annotations are NOT loaded here — they are fetched lazily per batch for only the unique neighbors found by KNN, saving several GB of RAM. 
Embeddings are stored as float16 (half the memory of float32); they are @@ -424,54 +522,332 @@ def _load_reference_data( """ emit("predict_go_terms_batch.load_references_start", None, {}, "info") - rows = ( - session.query( - Protein.accession, - SequenceEmbedding.embedding, - ) - .join(Protein.sequence) + cached = _load_from_disk_cache(embedding_config_id, annotation_set_id) + if cached is not None: + emit("predict_go_terms_batch.load_references_done", None, { + "references": len(cached["accessions"]), + "embeddings_mb": round(cached["embeddings"].nbytes / 1024 / 1024), + "source": "disk_cache", + }, "info") + return cached + + annotated_accessions_sq = ( + session.query(ProteinGOAnnotation.protein_accession) + .filter(ProteinGOAnnotation.annotation_set_id == annotation_set_id) + .distinct() + .subquery() + ) + base_q = ( + session.query(Protein.accession, SequenceEmbedding.embedding) .join( SequenceEmbedding, (SequenceEmbedding.sequence_id == Protein.sequence_id) & (SequenceEmbedding.embedding_config_id == embedding_config_id), ) - .filter( - Protein.accession.in_( - session.query(ProteinGOAnnotation.protein_accession) - .filter(ProteinGOAnnotation.annotation_set_id == annotation_set_id) - .distinct() - ) - ) - .all() + .join(annotated_accessions_sq, + Protein.accession == annotated_accessions_sq.c.protein_accession) ) - if not rows: + # Count first so we can pre-allocate the numpy array and never build a + # list-of-lists in Python. Without pre-allocation, .all() on 400k rows + # materialises ~14 GB of Python float objects and hits swap. + total = base_q.count() + if total == 0: return {"accessions": [], "embeddings": np.empty((0,), dtype=np.float16)} - accessions = [r[0] for r in rows] - # float16: half the memory of float32, sufficient precision for cosine KNN - embeddings = np.array([list(r[1]) for r in rows], dtype=np.float16) + # Determine embedding dimension from a single row. 
+ first_emb = base_q.limit(1).one()[1] + dim = len(first_emb) + + # Pre-allocate float16 array; fill row-by-row via yield_per so the + # cursor fetches _STREAM_CHUNK_SIZE rows at a time — peak Python-object + # memory stays at ~chunk_size × dim × 28 bytes ≈ tens of MB, not 14 GB. + embeddings = np.empty((total, dim), dtype=np.float16) + accessions: list[str] = [] + for i, (acc, emb) in enumerate(base_q.yield_per(_STREAM_CHUNK_SIZE)): + embeddings[i] = emb + accessions.append(acc) + + _save_to_disk_cache(embedding_config_id, annotation_set_id, accessions, embeddings) emit("predict_go_terms_batch.load_references_done", None, { "references": len(accessions), "embeddings_mb": round(embeddings.nbytes / 1024 / 1024), + "source": "database", }, "info") return {"accessions": accessions, "embeddings": embeddings} + def _load_reference_data_per_aspect( + self, + session: Session, + embedding_config_id: uuid.UUID, + annotation_set_id: uuid.UUID, + emit: EmitFn, + ) -> dict[str, dict[str, Any]]: + """Build per-aspect views over the single unified reference cache. + + Strategy — one array, three index slices: + + 1. Load (or build) the **unified** reference embeddings exactly as + :meth:`_load_reference_data` does — a single 1 GB float16 array shared + across all three aspects. No embeddings are duplicated on disk or in RAM. + 2. For each aspect (P / F / C) load (or build) a tiny **index array** — a + 1-D int32 array of row positions inside the unified array that correspond + to proteins annotated in that aspect. Index arrays are ~2 MB each and + are built with a lightweight accession-only query (no embedding data fetched). + 3. Return per-aspect sub-arrays as numpy fancy-index results (a copy in + float16, ~300 MB per aspect at most). 
+ + Disk layout:: + + {key}_embeddings.npy ← unified, ~1 GB float16 (shared with non-aspect path) + {key}_accessions.npy ← unified accession list (shared) + {key}__P_indices.npy ← int32 row indices, ~2 MB + {key}__F_indices.npy + {key}__C_indices.npy + """ + emit("predict_go_terms_batch.load_references_per_aspect_start", None, { + "embedding_config_id": str(embedding_config_id), + "annotation_set_id": str(annotation_set_id), + }, "info") + + # ── step 1: unified embeddings (reuses existing disk cache or builds it once) ── + unified = self._load_reference_data(session, embedding_config_id, annotation_set_id, emit) + if not unified["accessions"]: + return {asp: {"accessions": [], "embeddings": np.empty((0,), dtype=np.float16)} + for asp in _ASPECTS} + + acc_to_idx: dict[str, int] = {acc: i for i, acc in enumerate(unified["accessions"])} + + # ── step 2: per-aspect index arrays ────────────────────────────────────────── + result: dict[str, dict[str, Any]] = {} + total_refs = 0 + + # Determine which aspects still need DB queries + missing_aspects = [ + asp for asp in _ASPECTS + if not _aspect_index_path(embedding_config_id, annotation_set_id, asp).exists() + ] + + # Single-pass query for ALL missing aspects to avoid repeated table scans + # (5M+ annotation rows — scanning 3× would waste ~21 min) + aspect_to_accset: dict[str, set[str]] = {asp: set() for asp in missing_aspects} + if missing_aspects: + rows = ( + session.query(ProteinGOAnnotation.protein_accession, GOTerm.aspect) + .join(ProteinGOAnnotation.go_term) + .filter( + ProteinGOAnnotation.annotation_set_id == annotation_set_id, + GOTerm.aspect.in_(missing_aspects), + ) + .distinct() + .all() + ) + for acc, asp in rows: + if asp in aspect_to_accset: + aspect_to_accset[asp].add(acc) + + for asp in missing_aspects: + idx_path = _aspect_index_path(embedding_config_id, annotation_set_id, asp) + indices = np.array( + [acc_to_idx[acc] for acc in aspect_to_accset[asp] if acc in acc_to_idx], + dtype=np.int32, + ) + 
idx_path.parent.mkdir(parents=True, exist_ok=True) + np.save(idx_path, indices) + + for aspect in _ASPECTS: + idx_path = _aspect_index_path(embedding_config_id, annotation_set_id, aspect) + indices = np.load(idx_path) + source = "disk_cache" if aspect not in missing_aspects else "database" + + aspect_accessions = [unified["accessions"][i] for i in indices] + aspect_embeddings = unified["embeddings"][indices] # float16 copy, ~300 MB max + + result[aspect] = {"accessions": aspect_accessions, "embeddings": aspect_embeddings} + total_refs += len(indices) + emit("predict_go_terms_batch.load_references_per_aspect_done", None, { + "aspect": aspect, + "references": len(indices), + "source": source, + }, "info") + + emit("predict_go_terms_batch.load_references_per_aspect_all_done", None, { + "total_references": total_refs, + }, "info") + return result + + def _run_aspect_separated_knn( + self, + session: Session, + valid_accessions: list[str], + query_embeddings: np.ndarray, + ref_data_by_aspect: dict[str, dict[str, Any]], + annotation_set_id: uuid.UUID, + prediction_set_id: uuid.UUID, + p: "PredictGOTermsBatchPayload", + ) -> list[dict[str, Any]]: + """Run three independent KNN searches (one per GO aspect) and merge results. + + For each aspect ``a`` in (P, F, C): + 1. Build a KNN index from the aspect-filtered reference embeddings. + 2. Find the ``limit_per_entry`` nearest neighbors for every query. + 3. Load only aspect-``a`` GO annotations for those neighbors. + 4. Transfer those annotations as predictions. + + This guarantees that every query protein can receive BPO, MFO, and CCO + candidates even if its globally nearest neighbors happen to carry + annotations in only one or two aspects — the dominant cause of the BPO + recall ceiling observed with a unified index. + + Feature engineering (alignments / taxonomy) is computed for the union of + neighbors across all aspects to avoid redundant work on shared neighbors. 
+ """ + # Collect all unique neighbors across aspects so feature engineering + # is computed once per pair regardless of how many aspects reference it. + neighbors_by_aspect: dict[str, list[list[tuple[str, float]]]] = {} + all_unique_neighbors: set[str] = set() + + for aspect in _ASPECTS: + aspect_refs = ref_data_by_aspect[aspect] + if not aspect_refs["accessions"]: + neighbors_by_aspect[aspect] = [[] for _ in valid_accessions] + continue + + ref_f32 = aspect_refs["embeddings"].astype(np.float32) + aspect_neighbors = search_knn( + query_embeddings, + ref_f32, + aspect_refs["accessions"], + k=p.limit_per_entry, + distance_threshold=p.distance_threshold, + backend=p.search_backend, + metric=p.metric, + faiss_index_type=p.faiss_index_type, + faiss_nlist=p.faiss_nlist, + faiss_nprobe=p.faiss_nprobe, + faiss_hnsw_m=p.faiss_hnsw_m, + faiss_hnsw_ef_search=p.faiss_hnsw_ef_search, + ) + neighbors_by_aspect[aspect] = aspect_neighbors + for top_refs in aspect_neighbors: + for ref_acc, _ in top_refs: + all_unique_neighbors.add(ref_acc) + + # Feature engineering — computed over the union of all neighbors + ref_sequences: dict[str, str] = {} + query_sequences: dict[str, str] = {} + ref_tax_ids: dict[str, int | None] = {} + query_tax_ids: dict[str, int | None] = {} + + if p.compute_alignments: + ref_sequences = self._load_sequences_for_proteins(session, all_unique_neighbors) + query_sequences = self._load_sequences_for_queries(session, p, valid_accessions) + + if p.compute_taxonomy: + ref_tax_ids = self._load_taxonomy_ids_for_proteins(session, all_unique_neighbors) + query_tax_ids = self._load_taxonomy_ids_for_queries(session, p, valid_accessions) + + # Build predictions per aspect, merging into a single list. + # seen_terms is keyed per query protein to deduplicate across aspects. 
+ predictions: list[dict[str, Any]] = [] + seen_per_query: dict[str, set[int]] = {acc: set() for acc in valid_accessions} + pair_features: dict[tuple[str, str], dict[str, Any]] = {} + + for aspect in _ASPECTS: + unique_neighbors_aspect: set[str] = set() + for top_refs in neighbors_by_aspect[aspect]: + for ref_acc, _ in top_refs: + unique_neighbors_aspect.add(ref_acc) + + go_map = self._load_annotations_for( + session, annotation_set_id, unique_neighbors_aspect, aspect=aspect + ) + + for q_acc, top_refs in zip(valid_accessions, neighbors_by_aspect[aspect], strict=False): + seen_terms = seen_per_query[q_acc] + + for ref_acc, distance in top_refs: + pair_key = (q_acc, ref_acc) + if pair_key not in pair_features: + feats: dict[str, Any] = {} + if p.compute_alignments: + q_seq = query_sequences.get(q_acc, "") + r_seq = ref_sequences.get(ref_acc, "") + if q_seq and r_seq: + feats.update(compute_alignment(q_seq, r_seq)) + if p.compute_taxonomy: + q_tid = query_tax_ids.get(q_acc) + r_tid = ref_tax_ids.get(ref_acc) + feats.update(compute_taxonomy(q_tid, r_tid)) + feats["query_taxonomy_id"] = q_tid + feats["ref_taxonomy_id"] = r_tid + pair_features[pair_key] = feats + + feats = pair_features[pair_key] + + for ann in go_map.get(ref_acc, []): + go_term_id = ann["go_term_id"] + if go_term_id in seen_terms: + continue + seen_terms.add(go_term_id) + pred: dict[str, Any] = { + "prediction_set_id": str(prediction_set_id), + "protein_accession": q_acc, + "go_term_id": go_term_id, + "ref_protein_accession": ref_acc, + "distance": distance, + } + if ann.get("qualifier"): + pred["qualifier"] = ann["qualifier"] + if ann.get("evidence_code"): + pred["evidence_code"] = ann["evidence_code"] + for key in ( + "identity_nw", "similarity_nw", "alignment_score_nw", + "gaps_pct_nw", "alignment_length_nw", + "identity_sw", "similarity_sw", "alignment_score_sw", + "gaps_pct_sw", "alignment_length_sw", + "length_query", "length_ref", + "query_taxonomy_id", "ref_taxonomy_id", + "taxonomic_lca", 
"taxonomic_distance", + "taxonomic_common_ancestors", "taxonomic_relation", + ): + val = feats.get(key) + if val is not None: + pred[key] = val + predictions.append(pred) + + return predictions + def _load_annotations_for( self, session: Session, annotation_set_id: uuid.UUID, accessions: set[str], + aspect: str | None = None, ) -> dict[str, list[dict[str, Any]]]: - """Load GO annotations for the given accessions, chunked to avoid param limits.""" + """Load GO annotations for the given accessions, chunked to avoid param limits. + + Only non-negated annotations are loaded: rows with a NOT qualifier (e.g. + ``'NOT'``, ``'NOT|involved_in'``) assert that the protein does *not* have + the annotated function and must never be transferred as positive predictions. + Although NOT annotations are rare in GOA/QuickGO (~0.1 % of rows), including + them would introduce false positives that are silently penalised by cafaeval + without any obvious trace in the prediction artefacts. + + When ``aspect`` is given (``'P'``, ``'F'``, or ``'C'``), only annotations + whose GO term belongs to that aspect are returned. This is used by the + per-aspect KNN mode so that BPO-index neighbors transfer only BPO terms, + MFO-index neighbors transfer only MFO terms, etc. The join to ``go_term`` + is added only when needed to keep the no-aspect path as fast as before. + """ go_map: dict[str, list[dict[str, Any]]] = {} accessions_list = list(accessions) for i in range(0, len(accessions_list), _ANNOTATION_CHUNK_SIZE): chunk = accessions_list[i: i + _ANNOTATION_CHUNK_SIZE] - rows = ( + q = ( session.query( ProteinGOAnnotation.protein_accession, ProteinGOAnnotation.go_term_id, @@ -481,9 +857,20 @@ def _load_annotations_for( .filter( ProteinGOAnnotation.annotation_set_id == annotation_set_id, ProteinGOAnnotation.protein_accession.in_(chunk), + # Exclude NOT-qualified annotations (e.g. 'NOT', 'NOT|involved_in'). 
+ # qualifier IS NULL must be preserved explicitly because SQL LIKE + # returns NULL for NULL inputs, which would silently drop those rows. + ( + ProteinGOAnnotation.qualifier.is_(None) + | ~ProteinGOAnnotation.qualifier.like("%NOT%") + ), ) - .all() ) + if aspect is not None: + # Join go_term only when aspect filtering is requested to avoid + # an unnecessary join on the common (non-aspect-separated) path. + q = q.join(ProteinGOAnnotation.go_term).filter(GOTerm.aspect == aspect) + rows = q.all() for acc, go_term_id, qualifier, evidence_code in rows: go_map.setdefault(acc, []).append({ "go_term_id": go_term_id, diff --git a/protea/core/operations/run_cafa_evaluation.py b/protea/core/operations/run_cafa_evaluation.py index b6c8866..a571039 100644 --- a/protea/core/operations/run_cafa_evaluation.py +++ b/protea/core/operations/run_cafa_evaluation.py @@ -13,6 +13,7 @@ from protea.core.contracts.operation import EmitFn, OperationResult, ProteaPayload from protea.core.evaluation import compute_evaluation_data +from protea.core.scoring import compute_score from protea.infrastructure.orm.models.annotation.annotation_set import AnnotationSet from protea.infrastructure.orm.models.annotation.evaluation_result import EvaluationResult from protea.infrastructure.orm.models.annotation.evaluation_set import EvaluationSet @@ -20,6 +21,7 @@ from protea.infrastructure.orm.models.annotation.ontology_snapshot import OntologySnapshot from protea.infrastructure.orm.models.embedding.go_prediction import GOPrediction from protea.infrastructure.orm.models.embedding.prediction_set import PredictionSet +from protea.infrastructure.orm.models.embedding.scoring_config import ScoringConfig # Namespace labels used by cafaeval OBO parser _NS_LABELS = { @@ -35,6 +37,18 @@ class RunCafaEvaluationPayload(ProteaPayload, frozen=True): prediction_set_id: str max_distance: float | None = Field(default=None, ge=0.0, le=2.0) artifacts_dir: str | None = Field(default=None) + scoring_config_id: str | None = 
Field(default=None) + ia_file: str | None = Field( + default=None, + description=( + "Path to an Information Accretion (IA) TSV file (two columns: go_id, ia_value). " + "When provided, cafaeval weights each GO term by its IC so that rare, specific " + "terms contribute more to the score than common, easy-to-predict terms. " + "Without this file cafaeval assigns uniform weight (IC=1) to every term, which " + "inflates Fmax because high-frequency terms dominate the metric. " + "For CAFA6 evaluations use the IA_cafa6.tsv file supplied with the benchmark." + ), + ) @field_validator("evaluation_set_id", "prediction_set_id", mode="before") @classmethod @@ -51,10 +65,20 @@ class RunCafaEvaluationOperation: 1. Load EvaluationSet and PredictionSet from DB. 2. Compute evaluation data (delta NK/LK + known-terms) with full NOT propagation. 3. Download the OBO file from the ontology snapshot URL. - 4. Write temp files: ground-truth NK/LK, known-terms, predictions (CAFA format). - 5. Call ``cafa_eval`` for each setting (NK, LK, PK). - 6. Parse per-namespace Fmax / precision / recall / coverage from results. - 7. Persist an EvaluationResult row with all metrics. + 4. Resolve the Information Accretion (IA) file: + - If ``ia_file`` is set in the payload, use that path directly. + - Otherwise, if the OntologySnapshot has an ``ia_url``, download it to + a temporary file and pass it to cafaeval. + - If neither is set, cafaeval runs with uniform IC=1 for all terms. + IA weights make rare, specific GO terms count more than common ones and + are strongly recommended for publishable evaluations. Each CAFA benchmark + ships its own IA file (e.g. ``IA_cafa6.tsv``); store its URL in the + corresponding OntologySnapshot so future evaluations pick it up + automatically without touching the job payload. + 5. Write temp files: ground-truth NK/LK, known-terms, predictions (CAFA format). + 6. Call ``cafa_eval`` for each setting (NK, LK, PK). + 7. 
Parse per-namespace Fmax / precision / recall / coverage from results. + 8. Persist an EvaluationResult row with all metrics. """ name = "run_cafa_evaluation" @@ -97,11 +121,23 @@ def execute( emit("run_cafa_evaluation.delta_done", None, { "nk_proteins": data.nk_proteins, "lk_proteins": data.lk_proteins, + "pk_proteins": data.pk_proteins, }, "info") if data.delta_proteins == 0: raise ValueError("No delta proteins found — cannot evaluate") + # Load and snapshot ScoringConfig before the no-op commit below + scoring_config_snapshot: ScoringConfig | None = None + if p.scoring_config_id: + sc = session.get(ScoringConfig, uuid.UUID(p.scoring_config_id)) + if sc is None: + raise ValueError(f"ScoringConfig {p.scoring_config_id} not found") + scoring_config_snapshot = ScoringConfig( + formula=sc.formula, + weights=dict(sc.weights), + ) + # Pre-generate result_id so the artifact directory name matches the DB row. result_id = uuid.uuid4() @@ -122,25 +158,48 @@ def execute( obo_path = os.path.join(tmpdir, "go.obo") self._download_obo(snapshot.obo_url, obo_path) + # Resolve IA file: explicit payload path > snapshot ia_url > None (uniform IC). + # Priority: an explicit ia_file in the payload overrides the snapshot URL so + # that one-off experiments can use a custom IA without touching the snapshot. + # When ia_file is absent but the snapshot carries an ia_url, the file is + # downloaded once into tmpdir and used for all three settings (NK/LK/PK). + ia_path: str | None = p.ia_file + if ia_path is None and snapshot.ia_url: + ia_path = os.path.join(tmpdir, "ia.tsv") + emit("run_cafa_evaluation.downloading_ia", None, {"url": snapshot.ia_url}, "info") + self._download_tsv(snapshot.ia_url, ia_path) + if ia_path: + emit("run_cafa_evaluation.ia_resolved", None, {"ia_path": ia_path}, "info") + else: + emit("run_cafa_evaluation.ia_missing", None, { + "warning": "No IA file available; cafaeval will use uniform IC=1 for all " + "GO terms. 
Set ia_url on the OntologySnapshot or pass ia_file " + "in the payload for information-content-weighted metrics.", + }, "warning") + # Write ground truth files gt_dir = str(artifacts_root) if artifacts_root else tmpdir nk_path = os.path.join(gt_dir, "gt_NK.tsv") lk_path = os.path.join(gt_dir, "gt_LK.tsv") + pk_path = os.path.join(gt_dir, "gt_PK.tsv") known_path = os.path.join(gt_dir, "known_terms.tsv") + pk_known_path = os.path.join(gt_dir, "pk_known_terms.tsv") self._write_gt(data.nk, nk_path) self._write_gt(data.lk, lk_path) + self._write_gt(data.pk, pk_path) self._write_gt(data.known, known_path) + self._write_gt(data.pk_known, pk_known_path) # Write predictions (CAFA format) filtered to delta proteins pred_dir = os.path.join(gt_dir, "predictions") os.makedirs(pred_dir, exist_ok=True) pred_path = os.path.join(pred_dir, "predictions.tsv") - delta_proteins = set(data.nk) | set(data.lk) + delta_proteins = set(data.nk) | set(data.lk) | set(data.pk) emit("run_cafa_evaluation.writing_predictions", None, { "delta_proteins": len(delta_proteins), }, "info") - self._write_predictions(session, pred_set_id, delta_proteins, p.max_distance, pred_path) + self._write_predictions(session, pred_set_id, delta_proteins, p.max_distance, pred_path, scoring_config_snapshot) # No-op commit: releases the DB connection back to the pool before # cafaeval forks worker processes via multiprocessing.Pool. 
Forked @@ -154,7 +213,7 @@ def execute( for setting, gt_file, known_file in [ ("NK", nk_path, None), ("LK", lk_path, None), - ("PK", lk_path, known_path), + ("PK", pk_path, pk_known_path), ]: emit("run_cafa_evaluation.evaluating", None, {"setting": setting}, "info") try: @@ -167,6 +226,7 @@ def execute( try: df, dfs_best = cafa_eval( obo_path, pred_dir, gt_file, + ia=ia_path, exclude=known_file, prop="max", norm="cafa", @@ -201,6 +261,7 @@ def execute( id=result_id, evaluation_set_id=eval_set_id, prediction_set_id=pred_set_id, + scoring_config_id=uuid.UUID(p.scoring_config_id) if p.scoring_config_id else None, results=results, ) session.add(eval_result) @@ -230,6 +291,44 @@ def _download_obo(self, url: str, dest: str) -> None: with open(dest, "w", encoding="utf-8") as f: f.write(resp.text) + def _download_tsv(self, url: str, dest: str) -> None: + """Copy or download a plain-text TSV file (gzip-transparent) to dest. + + Accepts both HTTP(S) URLs and local filesystem paths (absolute or + ``file://`` scheme). Local paths are resolved without any network + request, which is useful during development when the IA file lives + inside the repository (``data/benchmarks/IA_cafa6.tsv``) and + ``ia_url`` is set to its absolute path. Once the file is pushed to + GitHub the URL can be switched to the raw.githubusercontent.com + address and the same code path handles it transparently. + """ + import gzip as _gzip + import shutil + + # Resolve local paths (absolute or file:// scheme) without HTTP. 
+ local_path: str | None = None + if url.startswith("file://"): + local_path = url[len("file://"):] + elif url.startswith("/"): + local_path = url + + if local_path is not None: + if url.endswith(".gz"): + with _gzip.open(local_path, "rb") as src, open(dest, "wb") as f: + shutil.copyfileobj(src, f) + else: + shutil.copy2(local_path, dest) + return + + resp = requests.get(url, stream=True, timeout=300) + resp.raise_for_status() + if url.endswith(".gz"): + with open(dest, "wb") as f: + f.write(_gzip.decompress(resp.content)) + else: + with open(dest, "w", encoding="utf-8") as f: + f.write(resp.text) + def _write_gt(self, annotations: dict[str, set[str]], path: str) -> None: """Write {protein: {go_id}} to a 2-column TSV (no header).""" with open(path, "w") as f: @@ -244,8 +343,14 @@ def _write_predictions( delta_proteins: set[str], max_distance: float | None, path: str, + scoring_config: ScoringConfig | None = None, ) -> None: - """Write CAFA-format predictions (protein\\tgo_id\\tscore) for delta proteins.""" + """Write CAFA-format predictions (protein\\tgo_id\\tscore) for delta proteins. + + If a ScoringConfig is provided, scores are computed via compute_score() + using all available signals (embedding similarity, evidence, alignment, + taxonomy). Otherwise falls back to ``1 - cosine_distance / 2``. 
+ """ q = ( session.query(GOPrediction, GOTerm) .join(GOTerm, GOPrediction.go_term_id == GOTerm.id) @@ -263,7 +368,17 @@ def _write_predictions( if key in seen: continue seen.add(key) - score = max(0.0, 1.0 - pred.distance) + if scoring_config is not None: + pred_dict = { + "distance": pred.distance, + "identity_nw": pred.identity_nw, + "identity_sw": pred.identity_sw, + "evidence_code": pred.evidence_code, + "taxonomic_distance": pred.taxonomic_distance, + } + score = compute_score(pred_dict, scoring_config) + else: + score = max(0.0, 1.0 - (pred.distance or 0.0) / 2.0) f.write(f"{pred.protein_accession}\t{gt.go_id}\t{score:.4f}\n") def _parse_results(self, dfs_best: dict) -> dict[str, Any]: diff --git a/protea/core/scoring.py b/protea/core/scoring.py new file mode 100644 index 0000000..0c0fded --- /dev/null +++ b/protea/core/scoring.py @@ -0,0 +1,193 @@ +"""Scoring engine for GOPrediction rows. + +Applies a :class:`~protea.infrastructure.orm.models.embedding.scoring_config.ScoringConfig` +formula to raw prediction signals and returns a normalised [0, 1] confidence score. + +The engine is intentionally *stateless*: every call to :func:`compute_score` +is self-contained, which means any ``ScoringConfig`` can be applied to an +existing ``PredictionSet`` at any time without re-running the KNN search. + +Evidence-code weights +--------------------- +Evidence code quality is resolved through a two-level lookup: + +1. If ``config.evidence_weights`` is not ``None``, that dict is checked first. +2. For codes absent from the override (or when no override exists), the module- + level :data:`DEFAULT_EVIDENCE_WEIGHTS` table is used. +3. Codes unknown to both tables fall back to + :data:`DEFAULT_EVIDENCE_WEIGHT_FALLBACK` (0.5). + +This means a ``ScoringConfig`` may carry a *partial* override — e.g. only +changing the IEA weight from 0.3 to 0.0 — without having to redeclare every +other code. 
The resolution order ensures backwards compatibility: configs +stored without ``evidence_weights`` behave identically to older configs. +""" +from __future__ import annotations + +from typing import Any + +from protea.core.evidence_codes import ECO_TO_CODE +from protea.infrastructure.orm.models.embedding.scoring_config import ( + DEFAULT_EVIDENCE_WEIGHT_FALLBACK, + DEFAULT_EVIDENCE_WEIGHTS, + FORMULA_EVIDENCE_WEIGHTED, + ScoringConfig, +) + + +# --------------------------------------------------------------------------- +# Evidence-code weight resolution +# --------------------------------------------------------------------------- + +def evidence_weight( + code: str | None, + *, + overrides: dict[str, float] | None = None, +) -> float: + """Resolve the [0, 1] quality weight for a GO evidence code or ECO ID. + + Resolution order + ---------------- + 1. Normalise *code* from ECO ID to GO code via :data:`ECO_TO_CODE` if needed. + 2. Look up the normalised code in *overrides* (if provided). + 3. Fall back to :data:`DEFAULT_EVIDENCE_WEIGHTS`. + 4. If still not found, return :data:`DEFAULT_EVIDENCE_WEIGHT_FALLBACK`. + + Parameters + ---------- + code: + A GO evidence code (e.g. ``"IEA"``) or an ECO ID + (e.g. ``"ECO:0000501"``). ``None`` returns the fallback weight. + overrides: + Optional per-config evidence weight table. May be a partial dict; + codes not present here are resolved via :data:`DEFAULT_EVIDENCE_WEIGHTS`. + + Returns + ------- + float in [0, 1]. + """ + if not code: + return DEFAULT_EVIDENCE_WEIGHT_FALLBACK + + # Normalise ECO IDs to canonical GO evidence codes. + normalized = ECO_TO_CODE.get(code, code) + + # Config-level override takes precedence over the system default.
+ if overrides and normalized in overrides: + return float(overrides[normalized]) + + return DEFAULT_EVIDENCE_WEIGHTS.get(normalized, DEFAULT_EVIDENCE_WEIGHT_FALLBACK) + + +# --------------------------------------------------------------------------- +# Score computation +# --------------------------------------------------------------------------- + +def compute_score(pred: dict[str, Any], config: ScoringConfig) -> float: + """Compute a [0, 1] confidence score for a single GOPrediction dict. + + All signals are normalised to [0, 1] before weighting. Signals whose + value is ``None`` (because the corresponding feature-engineering flag was + not enabled at prediction time) are *silently excluded* from both the + numerator and the denominator, so the remaining signals still produce a + valid normalised score. + + Parameters + ---------- + pred: + Dict with raw prediction fields. Recognised keys: + + - ``distance`` (float): cosine distance in [0, 2]. + - ``identity_nw`` (float | None): NW global identity in [0, 1]. + - ``identity_sw`` (float | None): SW local identity in [0, 1]. + - ``evidence_code`` (str | None): GO or ECO evidence code. + - ``taxonomic_distance`` (float | None): raw taxonomic distance. + + config: + A :class:`ScoringConfig` instance defining the formula, signal + weights, and optional per-code evidence weight overrides. + + Returns + ------- + float in [0, 1]. Higher values indicate higher predicted confidence. + The result is rounded to 6 decimal places. + """ + signal_weights = config.weights + ev_overrides: dict[str, float] | None = config.evidence_weights or None + + total_w = 0.0 + weighted_sum = 0.0 + + def _add(key: str, value: float | None) -> None: + """Add one signal's contribution to the running weighted average.""" + nonlocal total_w, weighted_sum + w = float(signal_weights.get(key, 0.0)) + if w == 0.0 or value is None: + return + total_w += w + weighted_sum += w * max(0.0, min(1.0, value)) + + # 1. 
Embedding similarity: cosine distance [0, 2] → similarity [0, 1]. + distance = pred.get("distance") + if distance is not None: + _add("embedding_similarity", 1.0 - distance / 2.0) + + # 2. Global sequence identity (Needleman-Wunsch). + _add("identity_nw", pred.get("identity_nw")) + + # 3. Local sequence identity (Smith-Waterman). + _add("identity_sw", pred.get("identity_sw")) + + # 4. Evidence code quality — resolved with per-config overrides. + ev_w = evidence_weight(pred.get("evidence_code"), overrides=ev_overrides) + _add("evidence_weight", ev_w) + + # 5. Taxonomic proximity: 1 / (1 + d) maps [0, ∞) → (0, 1]. + tax_dist = pred.get("taxonomic_distance") + if tax_dist is not None: + _add("taxonomic_proximity", 1.0 / (1.0 + float(tax_dist))) + + if total_w == 0.0: + return 0.0 + + base_score = weighted_sum / total_w + + # evidence_weighted formula: multiply the final score by the resolved + # evidence quality so that low-confidence annotations (IEA, ND) are + # down-ranked even when other signals are strong — and regardless of + # whether the evidence_weight signal is active (its signal weight may be 0). + if config.formula == FORMULA_EVIDENCE_WEIGHTED: + base_score *= ev_w + + return round(base_score, 6) + + +# --------------------------------------------------------------------------- +# Batch helper +# --------------------------------------------------------------------------- + +def score_predictions( + predictions: list[dict[str, Any]], + config: ScoringConfig, +) -> list[dict[str, Any]]: + """Add a ``score`` key to each prediction dict and return them sorted descending. + + Parameters + ---------- + predictions: + List of raw prediction dicts (same format as accepted by + :func:`compute_score`). + config: + The :class:`ScoringConfig` to apply. + + Returns + ------- + A new list with a ``score`` key added to each item, sorted by score in + descending order. The original list is not modified. 
+ """ + scored = [ + {**p, "score": compute_score(p, config)} + for p in predictions + ] + scored.sort(key=lambda x: x["score"], reverse=True) + return scored From d0821d4a98eeedcd6d1f4ec9a1cb4a5ee224c326 Mon Sep 17 00:00:00 2001 From: frapercan Date: Mon, 16 Mar 2026 21:56:12 +0100 Subject: [PATCH 07/17] feat(api): add scoring and support routers, extend evaluation endpoints - scoring.py: CRUD for ScoringConfig + preset factory + scored TSV streaming endpoint (/scoring/prediction-sets/{id}/score.tsv) - support.py: contact/support entry submission endpoint - annotations.py: evaluation-set DELETE cascade, result DELETE, artifacts ZIP download, IA-url PATCH on snapshots, ground-truth FASTA and TSV download endpoints - embeddings.py: propagate scoring_config_id in prediction launch - app.py: register scoring and support routers - base_worker.py: surface scoring_config_id in worker dispatch --- protea/api/app.py | 6 + protea/api/routers/annotations.py | 190 ++++++++++- protea/api/routers/embeddings.py | 4 +- protea/api/routers/scoring.py | 544 ++++++++++++++++++++++++++++++ protea/api/routers/support.py | 70 ++++ protea/workers/base_worker.py | 7 +- 6 files changed, 815 insertions(+), 6 deletions(-) create mode 100644 protea/api/routers/scoring.py create mode 100644 protea/api/routers/support.py diff --git a/protea/api/app.py b/protea/api/app.py index a52e2e5..31a4dfb 100644 --- a/protea/api/app.py +++ b/protea/api/app.py @@ -14,6 +14,8 @@ from protea.api.routers import maintenance as maintenance_router from protea.api.routers import proteins as proteins_router from protea.api.routers import query_sets as query_sets_router +from protea.api.routers import scoring as scoring_router +from protea.api.routers import support as support_router from protea.infrastructure.session import build_session_factory from protea.infrastructure.settings import load_settings @@ -44,6 +46,8 @@ def create_app(project_root: Path | None = None) -> FastAPI: {"name": "query-sets", 
"description": "User-uploaded FASTA datasets for custom prediction queries."}, {"name": "maintenance", "description": "Housekeeping — identify and remove orphaned sequences or embeddings."}, {"name": "admin", "description": "Destructive admin operations (DB reset). Use with caution."}, + {"name": "scoring", "description": "Scoring configs, scored prediction export, and CAFA metrics."}, + {"name": "support", "description": "Community thumbs-up and comments."}, ], ) app.state.session_factory = factory @@ -65,6 +69,8 @@ def create_app(project_root: Path | None = None) -> FastAPI: app.include_router(query_sets_router.router) app.include_router(maintenance_router.router) app.include_router(admin_router.router) + app.include_router(scoring_router.router) + app.include_router(support_router.router) sphinx_build = project_root / "docs" / "build" / "html" if sphinx_build.exists(): diff --git a/protea/api/routers/annotations.py b/protea/api/routers/annotations.py index 3617501..faae4d5 100644 --- a/protea/api/routers/annotations.py +++ b/protea/api/routers/annotations.py @@ -29,6 +29,8 @@ from protea.infrastructure.orm.models.annotation.ontology_snapshot import OntologySnapshot from protea.infrastructure.orm.models.annotation.protein_go_annotation import ProteinGOAnnotation from protea.infrastructure.orm.models.job import Job, JobEvent +from protea.infrastructure.orm.models.protein.protein import Protein +from protea.infrastructure.orm.models.sequence.sequence import Sequence from protea.infrastructure.queue.publisher import publish_job from protea.infrastructure.session import session_scope @@ -85,6 +87,7 @@ def list_snapshots( "id": str(s.id), "obo_url": s.obo_url, "obo_version": s.obo_version, + "ia_url": s.ia_url, "loaded_at": s.loaded_at.isoformat(), "go_term_count": cnt or 0, } @@ -113,11 +116,49 @@ def get_snapshot( "id": str(s.id), "obo_url": s.obo_url, "obo_version": s.obo_version, + "ia_url": s.ia_url, "loaded_at": s.loaded_at.isoformat(), "go_term_count": 
term_count, } +@router.patch("/snapshots/{snapshot_id}/ia-url", summary="Set IA URL on an ontology snapshot") +def set_snapshot_ia_url( + snapshot_id: UUID, + body: dict[str, Any], + factory: sessionmaker[Session] = Depends(get_session_factory), +) -> dict[str, Any]: + """Associate an Information Accretion (IA) file URL with an existing ontology snapshot. + + The IA file contains per-term information-content weights (two columns: + ``go_id``, ``ia_value``) and is published alongside each CAFA benchmark + (e.g. ``IA_cafa6.tsv``). Once set, ``run_cafa_evaluation`` picks it up + automatically for every evaluation that uses this snapshot — no need to + pass ``ia_file`` in the job payload. + + Pass ``{"ia_url": null}`` to clear the association (evaluations will fall + back to uniform IC=1). + + This endpoint only touches ``ia_url``; the OBO file and GO term data are + not affected. + """ + ia_url = body.get("ia_url") + if "ia_url" not in body: + raise HTTPException(status_code=422, detail="Body must contain 'ia_url' key (string or null)") + + with session_scope(factory) as session: + s = session.get(OntologySnapshot, snapshot_id) + if s is None: + raise HTTPException(status_code=404, detail="OntologySnapshot not found") + s.ia_url = ia_url or None + session.flush() + return { + "id": str(s.id), + "obo_version": s.obo_version, + "ia_url": s.ia_url, + } + + @router.post("/snapshots/load", summary="Trigger ontology snapshot load") def load_ontology_snapshot( body: dict[str, Any], @@ -357,6 +398,30 @@ def list_evaluation_sets( ] +@router.delete("/evaluation-sets/{eval_id}", summary="Delete an evaluation set", status_code=204) +def delete_evaluation_set( + eval_id: UUID, + factory: sessionmaker[Session] = Depends(get_session_factory), + artifacts_dir: Path = Depends(get_artifacts_dir), +) -> None: + """Delete an evaluation set and all its results. 
Cascades to EvaluationResult rows.""" + with session_scope(factory) as session: + e = session.get(EvaluationSet, eval_id) + if e is None: + raise HTTPException(status_code=404, detail="EvaluationSet not found") + # Collect result IDs to clean up artifact dirs + result_ids = [str(r.id) for r in session.query(EvaluationResult).filter( + EvaluationResult.evaluation_set_id == eval_id + ).all()] + session.delete(e) + + import shutil + for rid in result_ids: + result_dir = artifacts_dir / rid + if result_dir.exists(): + shutil.rmtree(result_dir, ignore_errors=True) + + @router.get("/evaluation-sets/{eval_id}", summary="Get evaluation set details") def get_evaluation_set( eval_id: UUID, @@ -448,17 +513,34 @@ def download_gt_lk( @router.get( "/evaluation-sets/{eval_id}/ground-truth-PK.tsv", response_class=StreamingResponse, - summary="Download PK ground truth (CAFA format, same as LK)", + summary="Download PK ground truth (CAFA format)", ) def download_gt_pk( eval_id: UUID, factory: sessionmaker[Session] = Depends(get_session_factory), ) -> StreamingResponse: - """Download Partial-Knowledge ground truth (identical to LK at annotation level). - Pass ``known-terms.tsv`` as ``-known`` to the CAFA evaluator for PK scoring. + """Download Partial-Knowledge ground truth: proteins that gained new terms in a + namespace where they already had experimental annotations at t0. + Use together with ``known-terms.tsv`` passed as ``-known`` to the CAFA evaluator. Format: ``protein_accession\\tgo_id`` (no header, 2 columns). 
""" - return download_gt_lk(eval_id=eval_id, factory=factory) + with session_scope(factory) as session: + e = _eval_set_or_404(session, eval_id) + ann_old = session.get(AnnotationSet, e.old_annotation_set_id) + data = compute_evaluation_data( + session, e.old_annotation_set_id, e.new_annotation_set_id, + ann_old.ontology_snapshot_id, + ) + lines = [ + f"{protein}\t{go_id}\n" + for protein, go_ids in sorted(data.pk.items()) + for go_id in sorted(go_ids) + ] + return StreamingResponse( + iter(lines), + media_type="text/tab-separated-values", + headers={"Content-Disposition": 'attachment; filename="ground_truth_PK.tsv"'}, + ) @router.get( @@ -493,6 +575,81 @@ def download_known_terms( ) +@router.get( + "/evaluation-sets/{eval_id}/delta-proteins.fasta", + response_class=StreamingResponse, + summary="Download delta proteins as FASTA", +) +def download_delta_fasta( + eval_id: UUID, + category: str = Query(default="all", description="Which proteins to include: `nk`, `lk`, or `all` (default)."), + factory: sessionmaker[Session] = Depends(get_session_factory), +) -> StreamingResponse: + """Download the amino-acid sequences of delta proteins (NK and/or LK) as FASTA. + + Only proteins whose sequence is already stored in the database are included. 
+ Header format: ``>ACCESSION entry_name OS=organism OX=taxonomy_id (NK|LK)`` + """ + with session_scope(factory) as session: + e = _eval_set_or_404(session, eval_id) + ann_old = session.get(AnnotationSet, e.old_annotation_set_id) + data = compute_evaluation_data( + session, e.old_annotation_set_id, e.new_annotation_set_id, + ann_old.ontology_snapshot_id, + ) + + # Collect requested accessions with their NK/LK/PK label + accession_label: dict[str, str] = {} + if category in ("nk", "all"): + for acc in data.nk: + accession_label[acc] = "NK" + if category in ("lk", "all"): + for acc in data.lk: + accession_label[acc] = "LK" + if category in ("pk", "all"): + for acc in data.pk: + accession_label.setdefault(acc, "PK") # may also be LK in another ns + + if not accession_label: + return StreamingResponse( + iter([]), + media_type="text/plain", + headers={"Content-Disposition": f'attachment; filename="delta_proteins_{category}.fasta"'}, + ) + + # Fetch proteins + sequences in one query + rows = ( + session.query(Protein, Sequence) + .join(Sequence, Protein.sequence_id == Sequence.id) + .filter(Protein.accession.in_(list(accession_label.keys()))) + .order_by(Protein.accession) + .all() + ) + + lines: list[str] = [] + for protein, seq in rows: + label = accession_label.get(protein.accession, "") + parts = [protein.accession] + if protein.entry_name: + parts.append(protein.entry_name) + if protein.organism: + parts.append(f"OS={protein.organism}") + if protein.taxonomy_id: + parts.append(f"OX={protein.taxonomy_id}") + parts.append(f"({label})") + lines.append(f">{' '.join(parts)}\n") + # Wrap sequence at 60 chars per line (standard FASTA) + s = seq.sequence + for i in range(0, len(s), 60): + lines.append(s[i : i + 60] + "\n") + + return StreamingResponse( + iter(lines), + media_type="text/plain", + headers={"Content-Disposition": f'attachment; filename="delta_proteins_{category}.fasta"'}, + ) + + # ── CAFA Evaluation Results 
─────────────────────────────────────────────────── @router.post( @@ -625,6 +782,7 @@ def list_evaluation_results( "id": str(r.id), "evaluation_set_id": str(r.evaluation_set_id), "prediction_set_id": str(r.prediction_set_id), + "scoring_config_id": str(r.scoring_config_id) if r.scoring_config_id else None, "job_id": str(r.job_id) if r.job_id else None, "created_at": r.created_at.isoformat(), "results": r.results, @@ -633,6 +791,30 @@ def list_evaluation_results( ] +@router.delete( + "/evaluation-sets/{eval_id}/results/{result_id}", + summary="Delete an evaluation result", + status_code=204, +) +def delete_evaluation_result( + eval_id: UUID, + result_id: UUID, + factory: sessionmaker[Session] = Depends(get_session_factory), + artifacts_dir: Path = Depends(get_artifacts_dir), +) -> None: + with session_scope(factory) as session: + result = session.get(EvaluationResult, result_id) + if result is None or result.evaluation_set_id != eval_id: + raise HTTPException(status_code=404, detail="EvaluationResult not found") + session.delete(result) + + # Remove artifact directory if present (best-effort) + result_dir = artifacts_dir / str(result_id) + if result_dir.exists(): + import shutil + shutil.rmtree(result_dir, ignore_errors=True) + + # ── GO subgraph ─────────────────────────────────────────────────────────────── @router.get("/snapshots/{snapshot_id}/subgraph") diff --git a/protea/api/routers/embeddings.py b/protea/api/routers/embeddings.py index 2d8a936..3176668 100644 --- a/protea/api/routers/embeddings.py +++ b/protea/api/routers/embeddings.py @@ -281,7 +281,9 @@ def predict_go_terms( Required body fields: `embedding_config_id`, `annotation_set_id`, `ontology_snapshot_id`. Optional: `query_set_id` (FASTA upload), `limit_per_entry`, `distance_threshold`, - `batch_size`, `search_backend`, `compute_alignments`, `compute_taxonomy`. 
+ `batch_size`, `search_backend`, `compute_alignments`, `compute_taxonomy`, + `aspect_separated_knn` (bool, default false — builds one KNN index per GO aspect to + guarantee BPO/MFO/CCO coverage even when unified nearest neighbours carry only one aspect). """ def _parse_uuid(key: str) -> UUID: raw = body.get(key) diff --git a/protea/api/routers/scoring.py b/protea/api/routers/scoring.py new file mode 100644 index 0000000..c17a2ed --- /dev/null +++ b/protea/api/routers/scoring.py @@ -0,0 +1,544 @@ +"""Scoring configuration management and analytical endpoints. + +Provides full CRUD for :class:`ScoringConfig` and two read-only analytical +endpoints that apply a stored config to an existing ``PredictionSet``: + +``GET /scoring/prediction-sets/{id}/score.tsv`` + Stream a TSV of scored predictions. The score column is computed on-the-fly + by applying the selected ``ScoringConfig`` formula to the raw signals stored + in ``GOPrediction`` rows — no re-running of the KNN pipeline is required. + +``GET /scoring/prediction-sets/{id}/metrics`` + Compute CAFA Fmax / AUC-PR for a (PredictionSet, ScoringConfig, category) + triple. Requires two ``AnnotationSet`` IDs to build the NK/LK ground truth + following the CAFA4 protocol. + +Evidence weights +---------------- +Each ``ScoringConfig`` may carry an optional ``evidence_weights`` dict that +overrides the system-default per-GO-evidence-code quality multipliers. The +API validates that: + +- Every key in the dict is a known GO evidence code (one of the codes in + :data:`DEFAULT_EVIDENCE_WEIGHTS`). +- Every value is a float in [0, 1]. + +Partial overrides are allowed: codes absent from the submitted dict will +continue to use the system default at score-computation time. 
+""" +from __future__ import annotations + +import uuid +from typing import Any, Iterator + +from fastapi import APIRouter, Depends, HTTPException, Query +from fastapi.requests import Request +from fastapi.responses import StreamingResponse +from pydantic import BaseModel, Field, field_validator +from sqlalchemy.orm import Session + +from protea.core.evaluation import compute_evaluation_data +from protea.core.metrics import compute_cafa_metrics +from protea.core.scoring import compute_score, score_predictions +from protea.infrastructure.orm.models.annotation.go_term import GOTerm +from protea.infrastructure.orm.models.embedding.go_prediction import GOPrediction +from protea.infrastructure.orm.models.embedding.prediction_set import PredictionSet +from protea.infrastructure.orm.models.embedding.scoring_config import ( + DEFAULT_EVIDENCE_WEIGHTS, + DEFAULT_WEIGHTS, + VALID_FORMULAS, + ScoringConfig, +) +from protea.infrastructure.session import session_scope + +router = APIRouter(prefix="/scoring", tags=["scoring"]) + +# --------------------------------------------------------------------------- +# Built-in preset configurations +# --------------------------------------------------------------------------- +# These cover the most common use-cases and are designed to be instructive +# as reference points for custom configs. None of them override evidence +# weights so they document what the system defaults produce. + +_PRESET_CONFIGS: list[dict[str, Any]] = [ + { + "name": "embedding_only", + "formula": "linear", + "weights": { + "embedding_similarity": 1.0, + "identity_nw": 0.0, + "identity_sw": 0.0, + "evidence_weight": 0.0, + "taxonomic_proximity": 0.0, + }, + "description": ( + "Pure cosine similarity converted to [0, 1]. " + "Baseline config — no alignment, evidence, or taxonomy signals." 
+ ), + }, + { + "name": "embedding_plus_evidence", + "formula": "evidence_weighted", + "weights": { + "embedding_similarity": 1.0, + "identity_nw": 0.0, + "identity_sw": 0.0, + "evidence_weight": 1.0, + "taxonomic_proximity": 0.0, + }, + "description": ( + "Embedding similarity multiplied by evidence code quality (evidence_weighted formula). " + "Penalises IEA-sourced annotations regardless of embedding distance." + ), + }, + { + "name": "alignment_weighted", + "formula": "linear", + "weights": { + "embedding_similarity": 0.5, + "identity_nw": 0.3, + "identity_sw": 0.2, + "evidence_weight": 0.0, + "taxonomic_proximity": 0.0, + }, + "description": ( + "Combines embedding similarity (50 %) with global NW identity (30 %) " + "and local SW identity (20 %). " + "Requires PredictionSet computed with compute_alignments=True." + ), + }, + { + "name": "composite", + "formula": "evidence_weighted", + "weights": { + "embedding_similarity": 0.4, + "identity_nw": 0.2, + "identity_sw": 0.1, + "evidence_weight": 0.2, + "taxonomic_proximity": 0.1, + }, + "description": ( + "Full composite: embedding + alignment + evidence quality + taxonomic proximity. " + "Requires compute_alignments=True and compute_taxonomy=True." + ), + }, + { + "name": "evidence_primary", + "formula": "linear", + "weights": { + "embedding_similarity": 0.2, + "identity_nw": 0.0, + "identity_sw": 0.0, + "evidence_weight": 0.8, + "taxonomic_proximity": 0.0, + }, + "description": ( + "Evidence quality as primary signal (80%), embedding similarity as tiebreaker (20%). " + "Designed for datasets where cosine distances cluster tightly (>99% of predictions " + "within distance < 0.1), making distance a poor tau discriminator. " + "Creates three well-separated score tiers: " + "EXP/IDA → ~1.0, ISS/IBA → ~0.76, IEA → ~0.46. " + "Recommended when compute_alignments and compute_taxonomy are not available." 
+ ), + }, +] + + +# --------------------------------------------------------------------------- +# FastAPI dependency +# --------------------------------------------------------------------------- + +def get_session_factory(request: Request): + return request.app.state.session_factory + + +# --------------------------------------------------------------------------- +# Request / response models +# --------------------------------------------------------------------------- + +class ScoringConfigCreate(BaseModel): + """Request body for POST /scoring/configs. + + Attributes + ---------- + name: + Unique display name (1–255 characters). + formula: + Aggregation formula. One of ``"linear"`` or ``"evidence_weighted"``. + weights: + Signal weights dict. Valid keys: ``embedding_similarity``, + ``identity_nw``, ``identity_sw``, ``evidence_weight``, + ``taxonomic_proximity``. Missing keys default to 0. + evidence_weights: + Optional per-GO-evidence-code quality overrides. Keys must be valid + GO evidence codes (e.g. ``"IEA"``); values must be in [0, 1]. + When ``None`` the system defaults from + :data:`DEFAULT_EVIDENCE_WEIGHTS` are used at score-computation time. + Partial dicts are allowed. + description: + Free-text description stored for display in the UI. + """ + + name: str = Field(..., min_length=1, max_length=255) + formula: str = Field("linear") + weights: dict[str, float] = Field( + default_factory=lambda: dict(DEFAULT_WEIGHTS) + ) + evidence_weights: dict[str, float] | None = Field( + default=None, + description=( + "Per-GO-evidence-code quality overrides in [0, 1]. " + "NULL means use system defaults. Partial dicts are valid." 
+ ), + ) + description: str | None = None + + model_config = {"extra": "forbid"} + + @field_validator("evidence_weights") + @classmethod + def validate_evidence_weights( + cls, v: dict[str, float] | None + ) -> dict[str, float] | None: + """Ensure all keys are known GO codes and all values are in [0, 1].""" + if v is None: + return None + known_codes = set(DEFAULT_EVIDENCE_WEIGHTS.keys()) + unknown = set(v.keys()) - known_codes + if unknown: + raise ValueError( + f"Unknown evidence codes: {sorted(unknown)}. " + f"Valid codes: {sorted(known_codes)}" + ) + out_of_range = {k: val for k, val in v.items() if not (0.0 <= val <= 1.0)} + if out_of_range: + raise ValueError( + f"Evidence weights must be in [0, 1]. Out-of-range: {out_of_range}" + ) + return v + + +class ScoringConfigResponse(BaseModel): + """Serialised representation of a stored ScoringConfig.""" + + id: uuid.UUID + name: str + formula: str + weights: dict[str, Any] + evidence_weights: dict[str, Any] | None + description: str | None + created_at: Any + + +def _to_response(c: ScoringConfig) -> ScoringConfigResponse: + """Convert an ORM ScoringConfig to its API response model.""" + return ScoringConfigResponse( + id=c.id, + name=c.name, + formula=c.formula, + weights=c.weights, + evidence_weights=c.evidence_weights, + description=c.description, + created_at=c.created_at, + ) + + +def _snapshot(c: ScoringConfig) -> ScoringConfig: + """Create a detached ScoringConfig copy safe to use after a session closes. + + The scoring endpoints close the DB session before streaming the response + body. This helper captures all scoring-relevant fields into a plain ORM + instance that does not require an open session. 
+ """ + return ScoringConfig( + id=c.id, + name=c.name, + formula=c.formula, + weights=c.weights, + evidence_weights=c.evidence_weights, + description=c.description, + ) + + +# --------------------------------------------------------------------------- +# ScoringConfig CRUD +# --------------------------------------------------------------------------- + +@router.get("/configs", response_model=list[ScoringConfigResponse]) +def list_scoring_configs(factory=Depends(get_session_factory)): + """Return all stored ScoringConfigs ordered by creation time.""" + with session_scope(factory) as session: + configs = ( + session.query(ScoringConfig) + .order_by(ScoringConfig.created_at) + .all() + ) + return [_to_response(c) for c in configs] + + +@router.post("/configs", response_model=ScoringConfigResponse, status_code=201) +def create_scoring_config( + body: ScoringConfigCreate, + factory=Depends(get_session_factory), +): + """Create a new ScoringConfig. + + Validates that ``formula`` is one of the supported values and that every + key in ``weights`` is a recognised signal name. Evidence weight validation + is handled by the Pydantic model. + """ + if body.formula not in VALID_FORMULAS: + raise HTTPException( + status_code=422, + detail=f"Invalid formula {body.formula!r}. Valid options: {list(VALID_FORMULAS)}", + ) + known_signals = set(DEFAULT_WEIGHTS.keys()) + unknown_signals = set(body.weights.keys()) - known_signals + if unknown_signals: + raise HTTPException( + status_code=422, + detail=( + f"Unknown signal weight keys: {sorted(unknown_signals)}. 
" + f"Valid keys: {sorted(known_signals)}" + ), + ) + + with session_scope(factory) as session: + config = ScoringConfig( + name=body.name, + formula=body.formula, + weights=body.weights, + evidence_weights=body.evidence_weights, + description=body.description, + ) + session.add(config) + session.flush() + return _to_response(config) + + +@router.post("/configs/presets", status_code=201) +def create_preset_configs(factory=Depends(get_session_factory)): + """Seed the database with the five built-in preset ScoringConfigs. + + Idempotent — presets that already exist (matched by name) are silently + skipped. Returns the list of names that were actually created. + """ + created: list[str] = [] + with session_scope(factory) as session: + existing_names = { + row[0] for row in session.query(ScoringConfig.name).all() + } + for preset in _PRESET_CONFIGS: + if preset["name"] in existing_names: + continue + session.add(ScoringConfig(**preset)) + created.append(preset["name"]) + return {"created": created} + + +@router.get("/configs/{config_id}", response_model=ScoringConfigResponse) +def get_scoring_config( + config_id: uuid.UUID, + factory=Depends(get_session_factory), +): + """Retrieve a single ScoringConfig by UUID.""" + with session_scope(factory) as session: + config = session.get(ScoringConfig, config_id) + if config is None: + raise HTTPException(status_code=404, detail="ScoringConfig not found") + return _to_response(config) + + +@router.delete("/configs/{config_id}", status_code=204) +def delete_scoring_config( + config_id: uuid.UUID, + factory=Depends(get_session_factory), +): + """Delete a ScoringConfig by UUID.""" + with session_scope(factory) as session: + config = session.get(ScoringConfig, config_id) + if config is None: + raise HTTPException(status_code=404, detail="ScoringConfig not found") + session.delete(config) + + +# --------------------------------------------------------------------------- +# Scored TSV endpoint +# 
--------------------------------------------------------------------------- + +@router.get("/prediction-sets/{set_id}/score.tsv") +def download_scored_predictions( + set_id: uuid.UUID, + scoring_config_id: uuid.UUID = Query(...), + min_score: float | None = Query(None, ge=0.0, le=1.0), + accession: str | None = Query(None), + factory=Depends(get_session_factory), +): + """Stream a TSV of predictions with computed confidence scores. + + The score is computed on-the-fly for every row using the selected + ``ScoringConfig``, including any custom evidence-weight overrides stored + in that config. The validation session is closed before streaming starts; + the generator opens its own session and holds it while rows are streamed. + + Query parameters + ---------------- + scoring_config_id: + UUID of the ``ScoringConfig`` to apply. + min_score: + Optional score threshold — rows below this value are omitted. + accession: + Optional protein accession filter. + + TSV columns + ----------- + protein_accession, go_id, score, distance, ref_protein_accession, + evidence_code, qualifier, identity_nw, identity_sw, taxonomic_distance. 
+ """ + with session_scope(factory) as session: + if session.get(PredictionSet, set_id) is None: + raise HTTPException(status_code=404, detail="PredictionSet not found") + config = session.get(ScoringConfig, scoring_config_id) + if config is None: + raise HTTPException(status_code=404, detail="ScoringConfig not found") + config_snap = _snapshot(config) + + def _generate() -> Iterator[bytes]: + header = "\t".join([ + "protein_accession", "go_id", "score", "distance", + "ref_protein_accession", "evidence_code", "qualifier", + "identity_nw", "identity_sw", "taxonomic_distance", + ]) + "\n" + yield header.encode() + + with session_scope(factory) as session: + q = ( + session.query(GOPrediction, GOTerm.go_id) + .join(GOTerm, GOPrediction.go_term_id == GOTerm.id) + .filter(GOPrediction.prediction_set_id == set_id) + ) + if accession: + q = q.filter(GOPrediction.protein_accession == accession) + + for pred, go_id in q.yield_per(1000): + pred_dict = { + "distance": pred.distance, + "identity_nw": pred.identity_nw, + "identity_sw": pred.identity_sw, + "evidence_code": pred.evidence_code, + "taxonomic_distance": pred.taxonomic_distance, + } + score = compute_score(pred_dict, config_snap) + if min_score is not None and score < min_score: + continue + + row = "\t".join([ + pred.protein_accession, + go_id, + str(score), + str(pred.distance) if pred.distance is not None else "", + pred.ref_protein_accession or "", + pred.evidence_code or "", + pred.qualifier or "", + str(pred.identity_nw) if pred.identity_nw is not None else "", + str(pred.identity_sw) if pred.identity_sw is not None else "", + str(pred.taxonomic_distance) if pred.taxonomic_distance is not None else "", + ]) + "\n" + yield row.encode() + + filename = f"scored_{set_id}_{scoring_config_id}.tsv" + return StreamingResponse( + _generate(), + media_type="text/tab-separated-values", + headers={"Content-Disposition": f'attachment; filename="{filename}"'}, + ) + + +# 
--------------------------------------------------------------------------- +# CAFA metrics endpoint +# --------------------------------------------------------------------------- + +@router.get("/prediction-sets/{set_id}/metrics") +def compute_metrics( + set_id: uuid.UUID, + scoring_config_id: uuid.UUID = Query(...), + old_annotation_set_id: uuid.UUID = Query(...), + new_annotation_set_id: uuid.UUID = Query(...), + ontology_snapshot_id: uuid.UUID = Query(...), + category: str = Query("nk", pattern="^(nk|lk)$"), + factory=Depends(get_session_factory), +): + """Compute CAFA Fmax and AUC-PR for a PredictionSet under a ScoringConfig. + + Ground truth is the NK or LK delta between *old_annotation_set* and + *new_annotation_set*, following the CAFA4 protocol: only experimental + evidence codes, NOT-qualifier annotations excluded with full DAG propagation. + + The selected ``ScoringConfig`` — including any custom ``evidence_weights`` + — is applied to every ``GOPrediction`` row before computing the + precision-recall curve. + + Parameters + ---------- + scoring_config_id: + Which stored ScoringConfig formula (and evidence weights) to apply. + old_annotation_set_id / new_annotation_set_id: + The two AnnotationSets used to compute the temporal ground-truth delta. + ontology_snapshot_id: + GO DAG snapshot used for NOT-qualifier propagation. + category: + ``"nk"`` (no-knowledge) or ``"lk"`` (limited-knowledge) protein set. 
+ """ + with session_scope(factory) as session: + if session.get(PredictionSet, set_id) is None: + raise HTTPException(status_code=404, detail="PredictionSet not found") + config = session.get(ScoringConfig, scoring_config_id) + if config is None: + raise HTTPException(status_code=404, detail="ScoringConfig not found") + config_snap = _snapshot(config) + + eval_data = compute_evaluation_data( + session, + old_annotation_set_id=old_annotation_set_id, + new_annotation_set_id=new_annotation_set_id, + ontology_snapshot_id=ontology_snapshot_id, + ) + + rows = ( + session.query(GOPrediction, GOTerm.go_id) + .join(GOTerm, GOPrediction.go_term_id == GOTerm.id) + .filter(GOPrediction.prediction_set_id == set_id) + .all() + ) + + scored: list[dict[str, Any]] = [] + for pred, go_id in rows: + pred_dict: dict[str, Any] = { + "protein_accession": pred.protein_accession, + "go_id": go_id, + "distance": pred.distance, + "identity_nw": pred.identity_nw, + "identity_sw": pred.identity_sw, + "evidence_code": pred.evidence_code, + "taxonomic_distance": pred.taxonomic_distance, + } + pred_dict["score"] = compute_score(pred_dict, config_snap) + scored.append(pred_dict) + + metrics = compute_cafa_metrics(scored, eval_data, category=category) + + return { + "prediction_set_id": str(set_id), + "scoring_config_id": str(scoring_config_id), + "scoring_config_name": config_snap.name, + **metrics.summary(), + "curve": [ + { + "threshold": p.threshold, + "precision": p.precision, + "recall": p.recall, + "f1": p.f1, + } + for p in metrics.curve + ], + } diff --git a/protea/api/routers/support.py b/protea/api/routers/support.py new file mode 100644 index 0000000..79f3dbe --- /dev/null +++ b/protea/api/routers/support.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from typing import Any + +from fastapi import APIRouter, Depends, Query +from pydantic import BaseModel, Field +from starlette.requests import Request + +from protea.infrastructure.orm.models.support_entry import SupportEntry 
+from protea.infrastructure.session import session_scope + +router = APIRouter(prefix="/support", tags=["support"]) + +_MAX_COMMENT_LENGTH = 500 +_RECENT_LIMIT = 20 +_PAGE_LIMIT = 100 + + +def get_session_factory(request: Request): + return request.app.state.session_factory + + +class SupportCreate(BaseModel): + comment: str | None = Field(default=None, max_length=_MAX_COMMENT_LENGTH) + + +@router.get("") +def get_support( + all_comments: bool = Query(False), + factory=Depends(get_session_factory), +) -> dict[str, Any]: + """Return total thumbs-up count and comments. + + Pass ``all_comments=true`` to get all comments (up to 100) instead of the 20 most recent. + """ + with session_scope(factory) as session: + total = session.query(SupportEntry).count() + limit = _PAGE_LIMIT if all_comments else _RECENT_LIMIT + recent = ( + session.query(SupportEntry) + .filter(SupportEntry.comment.isnot(None)) + .order_by(SupportEntry.created_at.desc()) + .limit(limit) + .all() + ) + return { + "count": total, + "comments": [ + {"id": str(e.id), "comment": e.comment, "created_at": e.created_at.isoformat()} + for e in recent + ], + } + + +@router.post("", status_code=201) +def post_support( + body: SupportCreate, + factory=Depends(get_session_factory), +) -> dict[str, Any]: + """Submit a thumbs-up with an optional comment.""" + comment = body.comment.strip() if body.comment else None + if comment == "": + comment = None + + with session_scope(factory) as session: + entry = SupportEntry(comment=comment) + session.add(entry) + session.flush() + total = session.query(SupportEntry).count() + return {"count": total, "id": str(entry.id)} diff --git a/protea/workers/base_worker.py b/protea/workers/base_worker.py index 80f2ccf..ba5c544 100644 --- a/protea/workers/base_worker.py +++ b/protea/workers/base_worker.py @@ -14,7 +14,7 @@ from protea.core.contracts.registry import OperationRegistry from protea.core.utils import utcnow from protea.infrastructure.orm.models.job import Job, JobEvent, 
JobStatus -from protea.infrastructure.queue.publisher import publish_job +from protea.infrastructure.queue.publisher import publish_job, publish_operation logger = logging.getLogger(__name__) @@ -141,6 +141,11 @@ def emit( for queue_name, child_job_id in result.publish_after_commit: publish_job(self._amqp_url, queue_name, child_job_id) + # Publish ephemeral operation messages (e.g. embedding batches). + if result.publish_operations and self._amqp_url: + for queue_name, op_payload in result.publish_operations: + publish_operation(self._amqp_url, queue_name, op_payload) + except RetryLaterError as e: # Resource busy — reset to QUEUED so the consumer can re-publish. job.status = JobStatus.QUEUED From 90e1b52f57437e5beb7800b06560debd8084138e Mon Sep 17 00:00:00 2001 From: frapercan Date: Mon, 16 Mar 2026 21:56:55 +0100 Subject: [PATCH 08/17] feat(frontend): add scoring config UI, support page, and evaluation overhaul MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New /scoring page: create/delete ScoringConfig with signal weights, evidence overrides, formula selector and preset loader - New /support page: contact form backed by support router - SupportButton + UsagePolicyModal components - evaluation/page.tsx: full rewrite — per-result metrics table (NK/LK/PK × BPO/MFO/CCO), scoring config selector, artifact ZIP download, result polling after job submit, IA-url management on snapshots - functional-annotation: scoring config selector in predict form, enriched per-protein prediction detail view - annotations/page.tsx: IA-url PATCH UI on ontology snapshots - NavLinks: add Scoring and Support entries - lib/api.ts: ScoringConfig CRUD, scored TSV URL helper, IA-url PATCH, evaluation result DELETE and artifact endpoints - fix: NaN in limit-per-entry input (guard parseInt) - fix: DELETE 204 response parsed as JSON (use fetch directly) --- apps/web/app/annotations/page.tsx | 86 ++- apps/web/app/embeddings/page.tsx | 8 +- 
apps/web/app/evaluation/page.tsx | 320 +++++++- .../app/functional-annotation/[id]/page.tsx | 421 ++++++++++- apps/web/app/functional-annotation/page.tsx | 26 +- apps/web/app/jobs/[id]/page.tsx | 2 +- apps/web/app/jobs/page.tsx | 27 +- apps/web/app/layout.tsx | 12 +- apps/web/app/proteins/[accession]/page.tsx | 4 +- apps/web/app/proteins/page.tsx | 10 +- apps/web/app/query-sets/page.tsx | 2 +- apps/web/app/scoring/page.tsx | 706 ++++++++++++++++++ apps/web/app/support/page.tsx | 72 ++ apps/web/components/NavLinks.tsx | 100 ++- apps/web/components/ResetDbButton.tsx | 6 +- apps/web/components/SupportButton.tsx | 132 ++++ apps/web/components/UsagePolicyModal.tsx | 75 ++ apps/web/lib/api.ts | 97 ++- apps/web/next.config.ts | 3 + apps/web/package-lock.json | 60 ++ apps/web/package.json | 1 + 21 files changed, 2031 insertions(+), 139 deletions(-) create mode 100644 apps/web/app/scoring/page.tsx create mode 100644 apps/web/app/support/page.tsx create mode 100644 apps/web/components/SupportButton.tsx create mode 100644 apps/web/components/UsagePolicyModal.tsx diff --git a/apps/web/app/annotations/page.tsx b/apps/web/app/annotations/page.tsx index d352126..7cbefc3 100644 --- a/apps/web/app/annotations/page.tsx +++ b/apps/web/app/annotations/page.tsx @@ -5,6 +5,7 @@ import Link from "next/link"; import { listAnnotationSets, listOntologySnapshots, + setSnapshotIaUrl, deleteAnnotationSet, createJob, AnnotationSet, @@ -36,6 +37,27 @@ export default function AnnotationsPage() { const [loadingSets, setLoadingSets] = useState(true); const [loadingSnaps, setLoadingSnaps] = useState(true); + // IA URL inline edit state: snapshotId → current input value (undefined = not editing) + const [iaEditId, setIaEditId] = useState(null); + const [iaEditValue, setIaEditValue] = useState(""); + const [iaSaving, setIaSaving] = useState(false); + + async function handleSaveIa(snapshotId: string) { + setIaSaving(true); + try { + const result = await setSnapshotIaUrl(snapshotId, iaEditValue.trim() 
|| null); + setSnapshots((prev) => + prev.map((s) => (s.id === snapshotId ? { ...s, ia_url: result.ia_url } : s)) + ); + setIaEditId(null); + toast("IA URL saved", "success"); + } catch (err: any) { + toast(String(err), "error"); + } finally { + setIaSaving(false); + } + } + // Load Snapshot form const [oboUrl, setOboUrl] = useState("http://purl.obolibrary.org/obo/go/go-basic.obo"); const [snapResult, setSnapResult] = useState<{ id: string } | null>(null); @@ -177,7 +199,7 @@ export default function AnnotationsPage() {

Annotations

-
+
{tabs.map((t) => (
-
+
ID
Source
Version
Annotations
Meta
Created
@@ -254,21 +276,65 @@ export default function AnnotationsPage() { Refresh
-
-
-
ID
Version
GO Terms
Loaded
+
+
+
ID
Version
GO Terms
IA URL
Loaded
- {loadingSnaps && Array.from({ length: 2 }).map((_, i) => )} + {loadingSnaps && Array.from({ length: 2 }).map((_, i) => )} {!loadingSnaps && snapshots.length === 0 && (
No ontology snapshots yet. Use the Load Snapshot tab.
)} {snapshots.map((s) => ( -
+
{shortId(s.id)}
{s.obo_version}
{(s.go_term_count ?? 0).toLocaleString()}
+
+ {iaEditId === s.id ? ( +
+ setIaEditValue(e.target.value)} + placeholder="https://…/IA_cafa6.tsv or file path" + className="flex-1 min-w-0 rounded border px-2 py-1 text-xs focus:outline-none focus:ring-1 focus:ring-blue-500" + onKeyDown={(e) => { + if (e.key === "Enter") handleSaveIa(s.id); + if (e.key === "Escape") setIaEditId(null); + }} + /> + + +
+ ) : ( + + )} +
{formatDate(s.loaded_at)}
))} @@ -278,7 +344,7 @@ export default function AnnotationsPage() { {/* ── Load Ontology Snapshot ── */} {activeTab === "load-snapshot" && ( -
+

Load Ontology Snapshot

Downloads a GO OBO file and populates GOTerm rows.

@@ -313,7 +379,7 @@ export default function AnnotationsPage() { {/* ── Load GOA Annotations ── */} {activeTab === "load-goa" && ( -
+

Load GOA Annotations

Bulk-loads GO annotations from a GAF file.

@@ -372,7 +438,7 @@ export default function AnnotationsPage() { {/* ── Load QuickGO Annotations ── */} {activeTab === "load-quickgo" && ( -
+

Load QuickGO Annotations

Streams GO annotations from the QuickGO bulk download API.

diff --git a/apps/web/app/embeddings/page.tsx b/apps/web/app/embeddings/page.tsx index 718c4fb..f25630b 100644 --- a/apps/web/app/embeddings/page.tsx +++ b/apps/web/app/embeddings/page.tsx @@ -230,7 +230,7 @@ export default function EmbeddingsPage() { )} {/* Tab bar */} -
+
{tabs.map((t) => ( +
+
+ + + +
@@ -234,7 +351,10 @@ function EvaluationSetCard({ {/* Downloads */}
-

Ground truth files

+

+ Ground truth files + +

+

+ Known terms + + : passed to cafaeval as -known for the PK pass only. +

+
+ + {/* FASTA downloads */} +
+

+ Delta protein sequences (FASTA) + +

+
+ + + + +
{/* Run evaluation */}

Run CAFA evaluator

-
+
setScoringConfigId(ev.target.value)} + className={selectClass} + > + + {scoringConfigs.map((c) => ( + + ))} + +
{runError && ( -

{runError}

+

+ {runError} +

)} + + {/* Success banner — shown after a job is successfully submitted */} + {pendingJobId && ( +
+ + Job queued.{" "} + {pollingResults + ? "Checking for results every 10 s…" + : "Results will appear below when the job completes."} + + + View job → + +
+ )} + +
{loadingResults ? (

Loading…

) : results.length === 0 ? (

No evaluations run yet.

) : (
- {results.map((r) => ( -
-
-
- Pred: {r.prediction_set_id.slice(0, 8)}… · {new Date(r.created_at).toLocaleString()} + {results.map((r) => { + const pred = predictionSets.find((p) => p.id === r.prediction_set_id); + const sc = scoringConfigs.find((c) => c.id === r.scoring_config_id); + return ( +
+ {/* Meta header */} +
+
+
+ Prediction set: + {pred + ? {r.prediction_set_id.slice(0, 8)}… · {new Date(pred.created_at).toLocaleDateString()}{pred.prediction_count != null ? ` · ${pred.prediction_count.toLocaleString()} preds.` : ""} + : {r.prediction_set_id.slice(0, 8)}… + } +
+
+ Scoring: + {sc ? sc.name : fallback (1−d/2)} + {sc?.description && } +
+
{new Date(r.created_at).toLocaleString()}
+
+
+ + ↓ Artifacts (.zip) + + +
- - ↓ Artifacts (.zip) - +
- -
- ))} + ); + })}
)}
@@ -337,6 +573,7 @@ export default function EvaluationPage() { const [annotationSets, setAnnotationSets] = useState([]); const [predictionSets, setPredictionSets] = useState([]); const [evaluationSets, setEvaluationSets] = useState([]); + const [scoringConfigs, setScoringConfigs] = useState([]); const [loading, setLoading] = useState(true); const [oldSetId, setOldSetId] = useState(""); @@ -346,11 +583,12 @@ export default function EvaluationPage() { const [selectedEvalId, setSelectedEvalId] = useState(""); const reload = () => - Promise.all([listAnnotationSets(), listPredictionSets(), listEvaluationSets()]) - .then(([ann, pred, ev]) => { + Promise.all([listAnnotationSets(), listPredictionSets(), listEvaluationSets(), listScoringConfigs()]) + .then(([ann, pred, ev, sc]) => { setAnnotationSets(ann); setPredictionSets(pred); setEvaluationSets(ev); + setScoringConfigs(sc); }) .finally(() => setLoading(false)); @@ -434,8 +672,10 @@ export default function EvaluationPage() { e={e} annotationSets={annotationSets} predictionSets={predictionSets} + scoringConfigs={scoringConfigs} isSelected={selectedEvalId === e.id} onSelect={() => setSelectedEvalId(e.id === selectedEvalId ? "" : e.id)} + onDeleted={() => setEvaluationSets((prev) => prev.filter((x) => x.id !== e.id))} /> ))} diff --git a/apps/web/app/functional-annotation/[id]/page.tsx b/apps/web/app/functional-annotation/[id]/page.tsx index 2056b36..28c0ff7 100644 --- a/apps/web/app/functional-annotation/[id]/page.tsx +++ b/apps/web/app/functional-annotation/[id]/page.tsx @@ -11,10 +11,234 @@ import { getProteinAnnotations, getGoSubgraph, getGoTermDistribution, + listScoringConfigs, + getScoredTsvUrl, + createScoringConfig, Prediction, ProteinAnnotation, GoSubgraph, + ScoringConfig, } from "@/lib/api"; + +// ── Scoring engine (mirrors protea/core/scoring.py) ────────────────────────── +// +// Evidence-code quality weights: same default table as the Python backend. 
+// When a ScoringConfig carries custom `evidence_weights`, those overrides take +// precedence; codes absent from the override still resolve via this table. + +const DEFAULT_EVIDENCE_WEIGHTS: Record = { + // Experimental — direct biological evidence + EXP: 1.0, IDA: 1.0, IPI: 1.0, IMP: 1.0, IGI: 1.0, IEP: 1.0, + HTP: 1.0, HDA: 1.0, HMP: 1.0, HGI: 1.0, HEP: 1.0, + IC: 1.0, TAS: 1.0, + // Computational / Phylogenetic + ISS: 0.7, ISO: 0.7, ISA: 0.7, ISM: 0.7, IGC: 0.7, + IBA: 0.7, IBD: 0.7, IKR: 0.7, IRD: 0.7, RCA: 0.7, + // Electronic / author statement + NAS: 0.5, + IEA: 0.3, + // No biological data + ND: 0.1, +}; + +/** Fallback weight for codes not found in any lookup table. */ +const DEFAULT_EVIDENCE_WEIGHT_FALLBACK = 0.5; + +/** + * Resolve the quality weight for a single GO evidence code. + * + * Resolution order: + * 1. Config-level override (config.evidence_weights), if present. + * 2. Module-level DEFAULT_EVIDENCE_WEIGHTS table. + * 3. DEFAULT_EVIDENCE_WEIGHT_FALLBACK (0.5). + */ +function resolveEvidenceWeight( + code: string | null | undefined, + overrides: Record | null | undefined, +): number { + if (!code) return DEFAULT_EVIDENCE_WEIGHT_FALLBACK; + if (overrides && code in overrides) return overrides[code]; + return DEFAULT_EVIDENCE_WEIGHTS[code] ?? DEFAULT_EVIDENCE_WEIGHT_FALLBACK; +} + +/** + * Compute a [0, 1] confidence score for a prediction row. + * + * Mirrors the logic in `protea/core/scoring.py::compute_score()`, including + * the two-level evidence weight resolution so the UI score always matches + * the backend TSV export exactly. + */ +function computeScore(pred: Prediction, config: ScoringConfig): number { + const evWeight = resolveEvidenceWeight(pred.evidence_code, config.evidence_weights); + + const signals: Record = { + embedding_similarity: 1 - pred.distance / 2, + identity_nw: pred.identity_nw, + identity_sw: pred.identity_sw, + evidence_weight: pred.evidence_code != null ? 
evWeight : null, + taxonomic_proximity: + pred.taxonomic_distance != null ? 1 / (1 + pred.taxonomic_distance) : null, + }; + + let weightedSum = 0; + let totalWeight = 0; + for (const [signal, weight] of Object.entries(config.weights)) { + if (weight <= 0) continue; + const val = signals[signal]; + if (val == null) continue; + weightedSum += weight * Math.max(0, Math.min(1, val)); + totalWeight += weight; + } + + if (totalWeight === 0) return 0; + let score = weightedSum / totalWeight; + + if (config.formula === "evidence_weighted") { + score *= evWeight; + } + + return score; +} +// ── Scoring signals ─────────────────────────────────────────────────────────── + +const SIGNALS: { key: string; label: string; hint: string }[] = [ + { key: "embedding_similarity", label: "Emb. similarity", hint: "1 − cosine_distance / 2, always available" }, + { key: "identity_nw", label: "Identity NW", hint: "Global identity Needleman-Wunsch (requires compute_alignments)" }, + { key: "identity_sw", label: "Identity SW", hint: "Local identity Smith-Waterman (requires compute_alignments)" }, + { key: "evidence_weight", label: "Evidence", hint: "GO evidence code quality (EXP→1.0, IEA→0.3)" }, + { key: "taxonomic_proximity", label: "Tax. 
proximity", hint: "1/(1+tax_dist) (requires compute_taxonomy)" }, +]; + +const DEFAULT_CUSTOM_WEIGHTS: Record = { + embedding_similarity: 1.0, + identity_nw: 0.0, + identity_sw: 0.0, + evidence_weight: 0.0, + taxonomic_proximity: 0.0, +}; + +export const CUSTOM_ID = "__custom__"; + +// ── WeightPanel ─────────────────────────────────────────────────────────────── + +function WeightPanel({ + config, + isCustom, + customWeights, + customFormula, + onWeightChange, + onFormulaChange, + onSave, + saving, +}: { + config?: ScoringConfig; + isCustom: boolean; + customWeights: Record; + customFormula: string; + onWeightChange: (key: string, val: number) => void; + onFormulaChange: (f: string) => void; + onSave: (name: string) => void; + saving: boolean; +}) { + const [saveName, setSaveName] = useState(""); + const [showSaveForm, setShowSaveForm] = useState(false); + + if (!isCustom && config) { + return ( +
+ + {config.formula} + + {SIGNALS.map(({ key, label }) => { + const w = config.weights[key] ?? 0; + return ( + 0 ? "bg-gray-100 text-gray-700" : "text-gray-300"}`} + > + {label} {w} + + ); + })} + {config.description && ( + {config.description} + )} +
+ ); + } + + if (isCustom) { + return ( +
+
+ Custom weights + +
+
+ {SIGNALS.map(({ key, label, hint }) => { + const val = customWeights[key] ?? 0; + return ( +
+ {label} + onWeightChange(key, parseFloat(e.target.value))} + className="flex-1 accent-blue-500" + /> + {val.toFixed(2)} +
+ ); + })} +
+
+ {!showSaveForm ? ( + + ) : ( +
+ setSaveName(e.target.value)} + placeholder="Config name" + className="flex-1 rounded border px-2 py-1 text-xs focus:outline-none focus:ring-1 focus:ring-blue-500" + /> + + +
+ )} +
+
+ ); + } + + return null; +} + import dynamic from "next/dynamic"; const GoGraph = dynamic(() => import("@/components/GoGraph"), { ssr: false }); @@ -64,19 +288,37 @@ function pct(v: number | null) { return `${(v * 100).toFixed(1)}%`; } -function PredictionTable({ preds, annotatedGoIds }: { preds: Prediction[]; annotatedGoIds: Set }) { +function scoreColor(score: number): string { + // green for high scores, yellow for mid, red for low + if (score >= 0.75) return "text-green-700 font-semibold"; + if (score >= 0.5) return "text-yellow-700"; + return "text-red-500"; +} + +function PredictionTable({ preds, annotatedGoIds, scoringConfig }: { preds: Prediction[]; annotatedGoIds: Set; scoringConfig?: ScoringConfig }) { const hasAlignment = preds.some((p) => p.identity_nw != null); const hasTaxonomy = preds.some((p) => p.taxonomic_relation != null); + const hasScore = !!scoringConfig; const [expanded, setExpanded] = useState(null); + // Sort by score descending when a config is active + const sortedPreds = hasScore + ? [...preds].sort((a, b) => computeScore(b, scoringConfig!) - computeScore(a, scoringConfig!)) + : preds; + // Column layout adapts to available features - const baseGrid = hasAlignment || hasTaxonomy - ? "grid-cols-[80px_1fr_100px_65px_70px_70px]" - : "grid-cols-[90px_1fr_110px_75px]"; + const baseGrid = hasScore + ? (hasAlignment || hasTaxonomy + ? "grid-cols-[60px_80px_1fr_100px_65px_70px_70px]" + : "grid-cols-[60px_90px_1fr_110px_75px]") + : (hasAlignment || hasTaxonomy + ? "grid-cols-[80px_1fr_100px_65px_70px_70px]" + : "grid-cols-[90px_1fr_110px_75px]"); return (
+ {hasScore &&
Score
}
GO ID
Name
Ref. Protein
@@ -84,9 +326,9 @@ function PredictionTable({ preds, annotatedGoIds }: { preds: Prediction[]; annot {hasAlignment && <>
NW id%
SW id%
} {hasTaxonomy && !hasAlignment && <>
Relation
Tax dist
}
- {preds.length === 0 ? ( + {sortedPreds.length === 0 ? (

- ) : preds.map((pred, i) => ( + ) : sortedPreds.map((pred, i) => (
(hasAlignment || hasTaxonomy) ? setExpanded(expanded === i ? null : i) : undefined} > + {hasScore && ( + + {computeScore(pred, scoringConfig!).toFixed(3)} + + )} {pred.go_id} {pred.name ?? "—"} void; ontologySnapshotId: string | null; + scoringConfig?: ScoringConfig; }) { const [subgraph, setSubgraph] = useState(null); const [loadingGraph, setLoadingGraph] = useState(false); @@ -294,7 +543,7 @@ function ProteinDetail({
{/* Predictions */} - + {/* Known annotations */} @@ -320,14 +569,15 @@ function ProteinDetail({ ); } -function DownloadButton({ setId }: { setId: string }) { +function DownloadButton({ setId, scoringConfigId, customBlocked }: { setId: string; scoringConfigId?: string; customBlocked?: boolean }) { const [open, setOpen] = useState(false); const [aspect, setAspect] = useState(""); const [maxDist, setMaxDist] = useState(""); + const [minScore, setMinScore] = useState(""); const apiBase = process.env.NEXT_PUBLIC_API_URL ?? ""; - function buildUrl() { + function buildRawUrl() { const params = new URLSearchParams(); if (aspect) params.set("aspect", aspect); if (maxDist) params.set("max_distance", maxDist); @@ -335,10 +585,17 @@ function DownloadButton({ setId }: { setId: string }) { return `${apiBase}/embeddings/prediction-sets/${setId}/predictions.tsv${qs ? `?${qs}` : ""}`; } + function buildScoredUrl() { + return getScoredTsvUrl(setId, scoringConfigId!, minScore ? { minScore: parseFloat(minScore) } : undefined); + } + + const isScored = !!scoringConfigId; + return (
-
+
{tabs.map((t) => (
-
+
Accession
Predicted
@@ -603,6 +949,7 @@ export default function PredictionSetDetailPage({ params }: { params: Promise<{ loading={loadingDetail} onClose={() => setSelectedAccession(null)} ontologySnapshotId={ontologySnapshotId} + scoringConfig={selectedConfig} />
)} @@ -653,7 +1000,7 @@ export default function PredictionSetDetailPage({ params }: { params: Promise<{ {ASPECT_LABELS[asp]} top {terms.length} terms

-
+
{terms.map((t) => (
{t.go_id} diff --git a/apps/web/app/functional-annotation/page.tsx b/apps/web/app/functional-annotation/page.tsx index d23feec..0cab7b9 100644 --- a/apps/web/app/functional-annotation/page.tsx +++ b/apps/web/app/functional-annotation/page.tsx @@ -60,6 +60,7 @@ export default function FunctionalAnnotationPage() { const [predFaissNprobe, setPredFaissNprobe] = useState(10); const [predFaissHnswM, setPredFaissHnswM] = useState(32); const [predFaissHnswEf, setPredFaissHnswEf] = useState(64); + const [predAspectSeparatedKnn, setPredAspectSeparatedKnn] = useState(true); const [predComputeAlignments, setPredComputeAlignments] = useState(false); const [predComputeTaxonomy, setPredComputeTaxonomy] = useState(false); const [predResult, setPredResult] = useState<{ id: string; status: string } | null>(null); @@ -126,6 +127,7 @@ export default function FunctionalAnnotationPage() { faiss_nprobe: predFaissNprobe, faiss_hnsw_m: predFaissHnswM, faiss_hnsw_ef_search: predFaissHnswEf, + aspect_separated_knn: predAspectSeparatedKnn, compute_alignments: predComputeAlignments, compute_taxonomy: predComputeTaxonomy, }); @@ -166,7 +168,7 @@ export default function FunctionalAnnotationPage() {

Functional Annotation

-
+
{tabs.map((t) => (
-
+
ID
Config
diff --git a/apps/web/app/jobs/[id]/page.tsx b/apps/web/app/jobs/[id]/page.tsx index bbc986f..0b4f447 100644 --- a/apps/web/app/jobs/[id]/page.tsx +++ b/apps/web/app/jobs/[id]/page.tsx @@ -229,7 +229,7 @@ export default function JobDetail({ params }: { params: Promise<{ id: string }> return {s}: {n}; })}
-
+
Status
Job ID
diff --git a/apps/web/app/jobs/page.tsx b/apps/web/app/jobs/page.tsx index 08f6655..fb1286e 100644 --- a/apps/web/app/jobs/page.tsx +++ b/apps/web/app/jobs/page.tsx @@ -136,7 +136,32 @@ export default function JobsPage() { )} -
+ {/* Mobile card list */} +
+ {loading && Array.from({ length: 5 }).map((_, i) => ( +
+
+
+
+ ))} + {!loading && jobs.length === 0 && ( +
No jobs found.
+ )} + {!loading && jobs.map((j) => ( + +
+ + {formatDate(j.created_at)} +
+

{j.operation}

+ +

{j.id}

+ + ))} +
+ + {/* Desktop table */} +
Status
Operation
diff --git a/apps/web/app/layout.tsx b/apps/web/app/layout.tsx index 5982d69..8f9f055 100644 --- a/apps/web/app/layout.tsx +++ b/apps/web/app/layout.tsx @@ -3,7 +3,9 @@ import { Geist, Geist_Mono } from "next/font/google"; import "./globals.css"; import { ResetDbButton } from "@/components/ResetDbButton"; import { NavLinks } from "@/components/NavLinks"; +import { SupportButton } from "@/components/SupportButton"; import { ToastProvider } from "@/components/Toast"; +import { UsagePolicyModal } from "@/components/UsagePolicyModal"; const geistSans = Geist({ variable: "--font-geist-sans", @@ -28,16 +30,18 @@ export default function RootLayout({ return ( + -
+
PROTEA - | + | -
+
+
-
+
{children}
diff --git a/apps/web/app/proteins/[accession]/page.tsx b/apps/web/app/proteins/[accession]/page.tsx index a08c286..dbd253e 100644 --- a/apps/web/app/proteins/[accession]/page.tsx +++ b/apps/web/app/proteins/[accession]/page.tsx @@ -114,7 +114,7 @@ export default function ProteinDetailPage({ params }: { params: Promise<{ access
{/* Tabs */} -
+
{tabs.map((t) => (
{/* Table */} -
+
Accession
Entry Name
@@ -340,7 +340,7 @@ export default function ProteinsPage() { {/* ── Insert Proteins ── */} {activeTab === "insert" && ( -
+

Insert Proteins from UniProt

Downloads FASTA sequences and upserts Protein + Sequence rows.

@@ -382,7 +382,7 @@ export default function ProteinsPage() { {/* ── Fetch Metadata ── */} {activeTab === "metadata" && ( -
+

Fetch UniProt Metadata

Downloads TSV annotations and upserts ProteinUniProtMetadata rows.

diff --git a/apps/web/app/query-sets/page.tsx b/apps/web/app/query-sets/page.tsx index 29d6c3a..82a8ab7 100644 --- a/apps/web/app/query-sets/page.tsx +++ b/apps/web/app/query-sets/page.tsx @@ -114,7 +114,7 @@ export default function QuerySetsPage() { )} {/* List */} -
+
Name
Sequences
diff --git a/apps/web/app/scoring/page.tsx b/apps/web/app/scoring/page.tsx new file mode 100644 index 0000000..fdc00dd --- /dev/null +++ b/apps/web/app/scoring/page.tsx @@ -0,0 +1,706 @@ +"use client"; + +/** + * Scoring Configs management page. + * + * Allows users to: + * - View all stored ScoringConfigs with their signal and evidence weights. + * - Load the four built-in preset configs in one click. + * - Create new configs by tuning signal sliders, formula, and optionally + * overriding per-evidence-code quality weights. + * - Delete configs that are no longer needed. + * + * The scoring system has two independent layers of configuration: + * + * 1. **Signal weights** — how much each composite signal contributes to + * the weighted average (embedding similarity, NW/SW identity, evidence + * quality signal, taxonomic proximity). + * + * 2. **Evidence-code weights** — per-GO-evidence-code quality multipliers + * that define what each code tier is worth. The system ships with + * sensible defaults (EXP/IDA → 1.0, ISS/IBA → 0.7, IEA → 0.3 …). + * Leaving the override null means "use the system defaults". + */ + +import { useEffect, useState } from "react"; +import { useToast } from "@/components/Toast"; +import { + listScoringConfigs, + createScoringConfig, + deleteScoringConfig, + createPresetScoringConfigs, + ScoringConfig, +} from "@/lib/api"; + +// ── Signal definitions ──────────────────────────────────────────────────────── + +const SIGNALS: { key: string; label: string; hint: string }[] = [ + { + key: "embedding_similarity", + label: "Embedding similarity", + hint: "1 − cosine_distance / 2 — always available, no flags required.", + }, + { + key: "identity_nw", + label: "Identity NW", + hint: "Needleman-Wunsch global sequence identity [0, 1]. Requires compute_alignments=True.", + }, + { + key: "identity_sw", + label: "Identity SW", + hint: "Smith-Waterman local sequence identity [0, 1]. 
Requires compute_alignments=True.", + }, + { + key: "evidence_weight", + label: "Evidence weight", + hint: "Quality of the reference annotation's GO evidence code, resolved via the evidence-weight table below.", + }, + { + key: "taxonomic_proximity", + label: "Taxonomic proximity", + hint: "1 / (1 + taxonomic_distance) — requires compute_taxonomy=True.", + }, +]; + +const DEFAULT_SIGNAL_WEIGHTS: Record = { + embedding_similarity: 1.0, + identity_nw: 0.0, + identity_sw: 0.0, + evidence_weight: 0.0, + taxonomic_proximity: 0.0, +}; + +// ── Evidence-code definitions ───────────────────────────────────────────────── +// Mirrors DEFAULT_EVIDENCE_WEIGHTS in scoring_config.py (single source of truth +// for defaults; this table is used to initialise the form sliders). + +const EVIDENCE_CODE_GROUPS: { + label: string; + description: string; + codes: { code: string; label: string }[]; +}[] = [ + { + label: "Experimental", + description: + "Annotations backed by direct experimental evidence. Highest confidence tier.", + codes: [ + { code: "EXP", label: "Inferred from Experiment" }, + { code: "IDA", label: "Direct Assay" }, + { code: "IPI", label: "Physical Interaction" }, + { code: "IMP", label: "Mutant Phenotype" }, + { code: "IGI", label: "Genetic Interaction" }, + { code: "IEP", label: "Expression Pattern" }, + { code: "HTP", label: "High-Throughput (umbrella)" }, + { code: "HDA", label: "HT Direct Assay" }, + { code: "HMP", label: "HT Mutant Phenotype" }, + { code: "HGI", label: "HT Genetic Interaction" }, + { code: "HEP", label: "HT Expression Pattern" }, + { code: "IC", label: "Inferred by Curator" }, + { code: "TAS", label: "Traceable Author Statement" }, + ], + }, + { + label: "Computational / Phylogenetic", + description: + "Annotations derived from sequence similarity, orthology, or phylogenetic inference.", + codes: [ + { code: "ISS", label: "Sequence or Structural Similarity" }, + { code: "ISO", label: "Sequence Orthology" }, + { code: "ISA", label: "Sequence 
Alignment" }, + { code: "ISM", label: "Sequence Model" }, + { code: "IGC", label: "Genomic Context" }, + { code: "IBA", label: "Biological aspect of Ancestor" }, + { code: "IBD", label: "Biological aspect of Descendant" }, + { code: "IKR", label: "Key Residues" }, + { code: "IRD", label: "Rapid Divergence" }, + { code: "RCA", label: "Reviewed Computational Analysis" }, + ], + }, + { + label: "Electronic", + description: + "Automated annotations (IEA) or non-traceable author statements (NAS). Lower confidence.", + codes: [ + { code: "NAS", label: "Non-traceable Author Statement" }, + { code: "IEA", label: "Inferred from Electronic Annotation" }, + ], + }, + { + label: "No data", + description: "Placeholder code indicating no biological data is available.", + codes: [{ code: "ND", label: "No biological Data" }], + }, +]; + +/** System defaults — matches DEFAULT_EVIDENCE_WEIGHTS in scoring_config.py. */ +const SYSTEM_EVIDENCE_DEFAULTS: Record = { + EXP: 1.0, IDA: 1.0, IPI: 1.0, IMP: 1.0, IGI: 1.0, IEP: 1.0, + HTP: 1.0, HDA: 1.0, HMP: 1.0, HGI: 1.0, HEP: 1.0, IC: 1.0, TAS: 1.0, + ISS: 0.7, ISO: 0.7, ISA: 0.7, ISM: 0.7, IGC: 0.7, + IBA: 0.7, IBD: 0.7, IKR: 0.7, IRD: 0.7, RCA: 0.7, + NAS: 0.5, + IEA: 0.3, + ND: 0.1, +}; + +// ── Helpers ─────────────────────────────────────────────────────────────────── + +function allCodes(): string[] { + return EVIDENCE_CODE_GROUPS.flatMap((g) => g.codes.map((c) => c.code)); +} + +// ── WeightBar ───────────────────────────────────────────────────────────────── + +function WeightBar({ + label, + value, + hint, +}: { + label: string; + value: number; + hint: string; +}) { + return ( +
+ {label} +
+
+
+ 0 ? "text-gray-700" : "text-gray-300" + }`} + > + {value.toFixed(2)} + +
+ ); +} + +// ── ConfigCard ──────────────────────────────────────────────────────────────── + +function ConfigCard({ + config, + onDelete, +}: { + config: ScoringConfig; + onDelete: (id: string) => void; +}) { + const [deleting, setDeleting] = useState(false); + const [showEvidenceWeights, setShowEvidenceWeights] = useState(false); + + async function handleDelete() { + if (!confirm(`Delete scoring config "${config.name}"?`)) return; + setDeleting(true); + try { + await deleteScoringConfig(config.id); + onDelete(config.id); + } catch { + setDeleting(false); + } + } + + const hasCustomEvidence = config.evidence_weights != null; + + return ( +
+ {/* Header */} +
+
+ {config.name} + + {config.formula} + + {hasCustomEvidence && ( + + custom evidence weights + + )} + {config.description && ( +

{config.description}

+ )} +
+ +
+ + {/* Signal weights */} +
+ {SIGNALS.map(({ key, label, hint }) => ( + + ))} +
+ + {/* Evidence weights */} +
+ + + {showEvidenceWeights && ( +
+ {EVIDENCE_CODE_GROUPS.map((group) => ( +
+

+ {group.label} +

+
+ {group.codes.map(({ code, label }) => { + const val = + config.evidence_weights?.[code] ?? + SYSTEM_EVIDENCE_DEFAULTS[code] ?? + 0.5; + const isOverridden = config.evidence_weights?.[code] != null; + return ( +
+ + {code} + + + {label} + +
+
+
+ + {val.toFixed(2)} + +
+ ); + })} +
+
+ ))} +
+ )} +
+ +

+ Created {new Date(config.created_at).toLocaleDateString()} +

+
+ ); +} + +// ── NewConfigForm ───────────────────────────────────────────────────────────── + +function NewConfigForm({ onCreated }: { onCreated: (c: ScoringConfig) => void }) { + const [open, setOpen] = useState(false); + const [name, setName] = useState(""); + const [formula, setFormula] = useState("linear"); + const [weights, setWeights] = useState>({ + ...DEFAULT_SIGNAL_WEIGHTS, + }); + const [description, setDescription] = useState(""); + + // Evidence weights — null means "use system defaults"; toggling the + // checkbox allocates the override dict with a copy of the defaults. + const [useCustomEvidence, setUseCustomEvidence] = useState(false); + const [evidenceWeights, setEvidenceWeights] = useState>({ + ...SYSTEM_EVIDENCE_DEFAULTS, + }); + + const [saving, setSaving] = useState(false); + const toast = useToast(); + + function reset() { + setName(""); + setFormula("linear"); + setWeights({ ...DEFAULT_SIGNAL_WEIGHTS }); + setDescription(""); + setUseCustomEvidence(false); + setEvidenceWeights({ ...SYSTEM_EVIDENCE_DEFAULTS }); + } + + async function handleSubmit(e: React.FormEvent) { + e.preventDefault(); + if (!name.trim()) return; + setSaving(true); + try { + const c = await createScoringConfig({ + name: name.trim(), + formula, + weights, + evidence_weights: useCustomEvidence ? evidenceWeights : null, + description: description || undefined, + }); + onCreated(c); + reset(); + setOpen(false); + toast("Scoring config created", "success"); + } catch (err: any) { + toast(err.message ?? "Failed to create config", "error"); + } finally { + setSaving(false); + } + } + + function setGroupWeight(codes: string[], value: number) { + setEvidenceWeights((prev) => { + const next = { ...prev }; + for (const code of codes) next[code] = value; + return next; + }); + } + + if (!open) { + return ( + + ); + } + + return ( +
+ {/* ── Header ── */} +
+ New config + +
+ + {/* ── Name + formula ── */} +
+
+ + setName(e.target.value)} + placeholder="my_config" + className="w-full rounded border px-2 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + required + /> +
+
+ + +
+
+ + {formula === "evidence_weighted" && ( +

+ evidence_weighted: the weighted average is multiplied + by the resolved evidence quality at the end — even when the + “Evidence weight” signal slider is set to 0. This + down-ranks IEA-sourced predictions regardless of embedding strength. +

+ )} + +
+ + setDescription(e.target.value)} + placeholder="What this config is for…" + className="w-full rounded border px-2 py-1.5 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500" + /> +
+ + {/* ── Signal weights ── */} +
+ +
+ {SIGNALS.map(({ key, label, hint }) => { + const val = weights[key] ?? 0; + return ( +
+ {label} + + setWeights((prev) => ({ ...prev, [key]: parseFloat(e.target.value) })) + } + className="flex-1 accent-blue-500" + /> + + {val.toFixed(2)} + +
+ ); + })} +
+
+ + {/* ── Evidence-code weights ── */} +
+
+ setUseCustomEvidence(e.target.checked)} + className="accent-blue-500" + /> + +
+ + {!useCustomEvidence && ( +

+ Using system defaults — EXP/IDA → 1.0 · ISS/IBA → 0.7 · IEA → 0.3 · ND → 0.1 +

+ )} + + {useCustomEvidence && ( +
+ {EVIDENCE_CODE_GROUPS.map((group) => ( +
+
+
+

{group.label}

+

{group.description}

+
+ {/* Group-level shortcuts */} +
+ {[0, 0.5, 1].map((v) => ( + + ))} +
+
+
+ {group.codes.map(({ code, label }) => { + const val = evidenceWeights[code] ?? SYSTEM_EVIDENCE_DEFAULTS[code] ?? 0.5; + const isDefault = + Math.abs(val - (SYSTEM_EVIDENCE_DEFAULTS[code] ?? 0.5)) < 0.001; + return ( +
+ + {code} + + + {label} + + + setEvidenceWeights((prev) => ({ + ...prev, + [code]: parseFloat(e.target.value), + })) + } + className="flex-1 accent-blue-500" + /> + + {val.toFixed(2)} + +
+ ); + })} +
+
+ ))} + + +
+ )} +
+ + {/* ── Actions ── */} +
+ + +
+
+ ); +} + +// ── Page ────────────────────────────────────────────────────────────────────── + +export default function ScoringPage() { + const [configs, setConfigs] = useState([]); + const [loading, setLoading] = useState(true); + const [loadingPresets, setLoadingPresets] = useState(false); + const toast = useToast(); + + useEffect(() => { + listScoringConfigs() + .then(setConfigs) + .catch(() => toast("Failed to load scoring configs", "error")) + .finally(() => setLoading(false)); + }, []); + + async function handleLoadPresets() { + setLoadingPresets(true); + try { + const result = await createPresetScoringConfigs(); + const updated = await listScoringConfigs(); + setConfigs(updated); + toast( + result.created.length > 0 + ? `Presets created: ${result.created.join(", ")}` + : "All presets already exist", + result.created.length > 0 ? "success" : "info", + ); + } catch (err: any) { + toast(err.message ?? "Failed to load presets", "error"); + } finally { + setLoadingPresets(false); + } + } + + return ( + <> + {/* ── Page header ── */} +
+
+

Scoring Configs

+ +
+

+ A ScoringConfig defines how raw prediction signals are combined into a + single [0, 1] confidence score — without re-running the KNN pipeline. + Two independent layers: signal weights (which signals + matter and how much) and evidence-code weights (the + quality value assigned to each GO evidence tier). +

+
+ + {/* ── Reference card ── */} +
+

+ Available signals +

+
+ {SIGNALS.map(({ key, label, hint }) => ( +
+ + {label} + + {hint} +
+ ))} +
+
+
+ linear:{" "} + Σ(w_i · s_i) / Σ(w_i) over all active (w_i > 0 and signal available) signals. +
+
+ evidence_weighted:{" "} + Same as linear, then multiplied by the resolved evidence weight — down-ranks IEA + even when other signals are strong. +
+
+
+ + {loading &&

Loading…

} + + {!loading && ( +
+ setConfigs((prev) => [...prev, c])} /> + + {configs.length === 0 && ( +

+ No configs yet. Load the presets or create one above. +

+ )} + + {configs.map((c) => ( + setConfigs((prev) => prev.filter((x) => x.id !== id))} + /> + ))} +
+ )} + + ); +} diff --git a/apps/web/app/support/page.tsx b/apps/web/app/support/page.tsx new file mode 100644 index 0000000..acd4c2e --- /dev/null +++ b/apps/web/app/support/page.tsx @@ -0,0 +1,72 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { baseUrl } from "@/lib/api"; + +type Comment = { id: string; comment: string; created_at: string }; +type SupportData = { count: number; comments: Comment[] }; + +function timeAgo(iso: string): string { + const diff = Math.floor((Date.now() - new Date(iso).getTime()) / 1000); + if (diff < 60) return "just now"; + if (diff < 3600) return `${Math.floor(diff / 60)}m ago`; + if (diff < 86400) return `${Math.floor(diff / 3600)}h ago`; + return `${Math.floor(diff / 86400)}d ago`; +} + +export default function SupportPage() { + const [data, setData] = useState(null); + const [loading, setLoading] = useState(true); + + useEffect(() => { + fetch(`${baseUrl()}/support?all_comments=true`, { cache: "no-store" }) + .then((r) => r.json()) + .then(setData) + .finally(() => setLoading(false)); + }, []); + + if (loading) return
Loading…
; + if (!data) return
Could not load support data.
; + + const withComments = data.comments.length; + const anonymous = data.count - withComments; + + return ( +
+ + {/* Hero */} +
+
👍
+
{data.count.toLocaleString()}
+
people support this project
+
+ {withComments} with comments · {anonymous} anonymous +
+
+ + {/* Comments */} + {data.comments.length > 0 && ( +
+

+ What people are saying +

+
+ {data.comments.map((c) => ( +
+

{c.comment}

+

{timeAgo(c.created_at)}

+
+ ))} +
+
+ )} + + {data.comments.length === 0 && ( +

No comments yet. Be the first!

+ )} +
+ ); +} diff --git a/apps/web/components/NavLinks.tsx b/apps/web/components/NavLinks.tsx index 26b9041..4ebc71c 100644 --- a/apps/web/components/NavLinks.tsx +++ b/apps/web/components/NavLinks.tsx @@ -2,6 +2,7 @@ import Link from "next/link"; import { usePathname } from "next/navigation"; +import { useState, useEffect } from "react"; import { DocLinks } from "./DocLinks"; const NAV_GROUPS = [ @@ -13,6 +14,7 @@ const NAV_GROUPS = [ [ { href: "/embeddings", label: "Embeddings" }, { href: "/functional-annotation", label: "Functional Annotation" }, + { href: "/scoring", label: "Scoring" }, { href: "/evaluation", label: "Evaluation" }, ], [ @@ -21,33 +23,81 @@ const NAV_GROUPS = [ ], ]; +const ALL_LINKS = NAV_GROUPS.flat(); + export function NavLinks() { const pathname = usePathname(); + const [open, setOpen] = useState(false); + + // Close menu on route change + useEffect(() => { setOpen(false); }, [pathname]); + return ( - + <> + {/* Desktop nav */} + + + {/* Mobile hamburger */} + + + {/* Mobile dropdown */} + {open && ( +
+ +
+ )} + ); } diff --git a/apps/web/components/ResetDbButton.tsx b/apps/web/components/ResetDbButton.tsx index 9a66443..6508e6c 100644 --- a/apps/web/components/ResetDbButton.tsx +++ b/apps/web/components/ResetDbButton.tsx @@ -45,7 +45,7 @@ export function ResetDbButton() {

Reset database?

- Se borrarán todos los datos: proteínas, anotaciones, embeddings, predicciones y jobs. Esta acción es irreversible. + This will permanently delete all data: proteins, annotations, embeddings, predictions and jobs. This action cannot be undone.

diff --git a/apps/web/components/SupportButton.tsx b/apps/web/components/SupportButton.tsx new file mode 100644 index 0000000..adacd55 --- /dev/null +++ b/apps/web/components/SupportButton.tsx @@ -0,0 +1,132 @@ +"use client"; + +import { useEffect, useRef, useState } from "react"; +import Link from "next/link"; +import { baseUrl } from "@/lib/api"; + +type SupportData = { + count: number; + comments: { id: string; comment: string; created_at: string }[]; +}; + +export function SupportButton() { + const [data, setData] = useState(null); + const [open, setOpen] = useState(false); + const [comment, setComment] = useState(""); + const [submitting, setSubmitting] = useState(false); + const [submitted, setSubmitted] = useState(false); + const ref = useRef(null); + + useEffect(() => { + fetch(`${baseUrl()}/support`, { cache: "no-store" }) + .then((r) => r.json()) + .then(setData) + .catch(() => {}); + }, []); + + // Close on outside click + useEffect(() => { + if (!open) return; + function handler(e: MouseEvent) { + if (ref.current && !ref.current.contains(e.target as Node)) setOpen(false); + } + document.addEventListener("mousedown", handler); + return () => document.removeEventListener("mousedown", handler); + }, [open]); + + async function handleSubmit() { + setSubmitting(true); + try { + const res = await fetch(`${baseUrl()}/support`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ comment: comment.trim() || null }), + }); + const json = await res.json(); + setData((prev) => prev ? { ...prev, count: json.count } : { count: json.count, comments: [] }); + setSubmitted(true); + setComment(""); + setTimeout(() => setOpen(false), 1500); + } finally { + setSubmitting(false); + } + } + + const count = data?.count ?? null; + + return ( +
+ {/* Trigger button with tooltip */} +
+ + + Comments and metrics are public and visible to all visitors. + +
+ + {/* Popover */} + {open && ( +
+
+ {submitted ? ( +

+ Thanks for the support! 🎉 +

+ ) : ( + <> +

Support the project!

+