From 50f05dc43bcd56f6c59fa0f65809cc236e2ee0c1 Mon Sep 17 00:00:00 2001 From: Sukanth Gunda Date: Sat, 18 Apr 2026 13:51:26 -0400 Subject: [PATCH 1/2] Add browser sync, detection, and parsers Introduce browser-based sync by adding a new mindmark.browsers package (paths, chromium, firefox, __init__) to detect installed browsers and parse bookmarks (Chromium JSON and Firefox places.sqlite). Add CLI `sync` command and related UI hints, plus README/CONTRIBUTING updates documenting the sync workflow. Extend Index to support incremental sync: content hashing, bookmark_sources table, schema migration to v2, sync/rebuild logic, and model-change handling; implement safe Firefox DB snapshotting and Chromium tree walking. Add comprehensive tests for detection, parsing, and orchestration and include .coverage capture. --- .coverage | Bin 0 -> 90112 bytes CONTRIBUTING.md | 9 +- README.md | 143 +++++++++--- src/mindmark/browsers/__init__.py | 56 +++++ src/mindmark/browsers/chromium.py | 80 +++++++ src/mindmark/browsers/firefox.py | 147 ++++++++++++ src/mindmark/browsers/paths.py | 165 +++++++++++++ src/mindmark/cli.py | 75 +++++- src/mindmark/index.py | 265 ++++++++++++++++++++- tests/test_browser_detection.py | 88 +++++++ tests/test_browsers_init.py | 143 ++++++++++++ tests/test_chromium_parser.py | 135 +++++++++++ tests/test_firefox_parser.py | 137 +++++++++++ tests/test_incremental_sync.py | 370 ++++++++++++++++++++++++++++++ 14 files changed, 1774 insertions(+), 39 deletions(-) create mode 100644 .coverage create mode 100644 src/mindmark/browsers/__init__.py create mode 100644 src/mindmark/browsers/chromium.py create mode 100644 src/mindmark/browsers/firefox.py create mode 100644 src/mindmark/browsers/paths.py create mode 100644 tests/test_browser_detection.py create mode 100644 tests/test_browsers_init.py create mode 100644 tests/test_chromium_parser.py create mode 100644 tests/test_firefox_parser.py create mode 100644 tests/test_incremental_sync.py diff --git a/.coverage b/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..e65aedba98f9e50bdf88d37d0ec34efbb7b64453 GIT binary patch literal 90112 zcmeHw33L?4op*Il&pp#ULZAZ))CdW2=;Sb9473CoaoAu^AwV-DX@q8xX5^WX0h^#6 zfgN6KZxV9Y1P91|S;zZwZX6#ujK{<;uxo#|le}cJ`##%p>DuG@9;Sbi;J1TkEO!xJ&{x*+!0AANB@dVs~Weg61OxqtzRW7apFvm2(G3^Y!nmm zePUlEA$CQ3B4RYw8SMzCqOm<WiaVKik;7XWp{6P;N5_Cz?=(H%)vi($Qm9dXcee@ba>BHmk< ziiP3XfA#&8`;i z0_9?HjaH)|t{Msxon!sI?Lhh8^!N7R&Vdq2#i{tUN?xmMDy}y6y8C9FzBJmK)14aS zxJQ*z{juns{Sm$Mt3{(^HEBjHUM=EAfCW@~9fgbY83a}t*`K_#2Tq-W;r>*dgoDue z!3H?^j^w$Vg=J;T$L}PDg6RpiM^j1NNGOK2|UomxSA0({j*vdRu*&++k4{exwb|uCK-VN0&{_ut*bxQfrm;G zx1l3`S0oYM6RA~Gm1*J6ZAf4`4Q@x_sTdHv8 zn2^?!<4mJkQg0-j>`!PD$sd{|wGzfGsf`ge1)5XibQCt0T&p>1zF>puaiyJ{vvATR zCasJJJTKutdrl*8j4=d8EdOH@8; zBkcFWq>uk>IV$5AL*;dm12uPrdtf%{3r7<$WMK|gRsa!a>+Fv1%5kZTsn4#Rt6JeiEbvWl0jFSk)a);#nwJCAP=oJ&|xsSsS3lnk3X2 zO^QT^v8&Q@10#6=W9+%Bis+;*E_j;X0)tK)U*PHO4aYjw^$IV;3M$O~cyQu+lt}~% zid8TulQ2X_VQQdiDxR@4{MtU?1f%wbJC)&%JH>>!uRAgZPFamshEtJLv^OH+EJPV% zSo^|Bu_F3(Yy-^FaEfF$E=;(ILmX6Uz%!!K?d=M6S}yej z$ur9jS~9{_1ph5(F<>!ZF<>!ZF<>!ZF<>!ZF<>!ZF<>!ZF<>$9*T;a3VHqdh|Fgmn z65fV`BY#%-H4=U;y!RK`Kuc+h0gC~P0gC~P0gC~P0gC~P0gC~P0gC~P0gC~h z0iSIOqg@7IJ+>0JdKJJ9IauK`&NorGDqQ9t;aB@F`9JGl?t9aB*jMR&$(!^Rc%Joy zJ*@ki?#-_EU8h`<^MdnUXQku3Bj(7nKW=Z~u5e%EB-@bfh;0u0V>ZEhnXfV%(0k|+ zU4a|$Y_H}}dR|?;FICqa?~Sx4BKzuAM0$F{i8^??tM5SF+}ioI4Rt-y_PV|UsqT1e zUTwqtI(Tj}Qqvdi*b7gACDn{LOE4IXMN`2bJXz8*2Y?sQ01iUMkz@-0MG}DAIvbD| zQ$dCYM~0G>|Yk{(c=YQlz& zdZNAfDNZn&>3~hqXtakdgYDsD1n_s31O5_vy%|muB(i1G`fISc;QiZ+}lJsx3V`cmSulsu?G? zTm7`?2&zjNIbYu816VqJ*cruhQ#-sfBSF*#3u*HLI-S{79gDCE!>U!bTEAWb+y>-b%hY=v} zV*#}qIY4mU2tprhR>+QSy$v9y+z3QFJf%-mH|;!Euz=>h5i}FXS_VLhuM-4sO_O!_ z7*rJsc1B<`p0^DSfoc|xtgsUA&L;_97FG&Rcwgq5y(u=y`nZ$MUo&6xv@;v+JDm49 zXK){IjgEh^f5-NQ?Ot28x6l!HHM+NWZ0t`Q6I`c#GyI?POX#xiE#DE>75BG18T3W} z?=bKUyB01zkD+0xqxS#RJGk@$D!`-n|8sV7>BV%A$L#-S2f1`3P3%$o|5;sJdIL@T z@%R5T+qm@dkEj|L@>TTucGUiV1~qdWwg0cAY&7Qme+6Zu8NL6XzKcsQq1c-_`~UJ* zF1?T9&!^mjqxSz3D9={8`~SRcT)K`5A36K~0Oi>vXa6tIUK)%Y6Hkqfoc+I_ zULTIL|M$_GUB})3d+Ciw72hD-vxQ5~p<;ig{lA;uh#YVK@7m0zXHn7F*#A4}@nDea zXynp%<~sMe|MOz-IHiP0#-q;OGB^L4o71^8dmA zl>ZU`4F4VeA^tA^34g!8)qlIc(O>Sj`!4%V`g(jjeJi{tJ-4`@az5ewTjwfgjboMl ziv3CZpq*n5GwYc}=qdCldgume^H+0>yR@1O!KV~MjWxm?2#RyyVFkG|1dvY;%(gS` z^r6{<@dxqGBLeClLec~Hi3V^p*GxdD{YP+h1GxerglzRP14#_RA7=!Izt4#OE@rde z%w~h9@&?hvE(WLMI2X>}D07lXqLsoN5+}?8jb_XOL@^Jia1xRi%!Ig}nP(ltUqC3s z9Dn7dlMgXRkBa5kH61w$xs(bRBtq4zH#V>n8`zrr4pl*89z0mZzJeSWOGYC6A(sUH zBKZ3wPKHLwh?C(#4iT{d6hfL)eJ+e9BMJVAK$VD-Gx#vtk=m6=SosZNYe}tMJ^d(9D;go}!;O`-D2aGj zY$qbHL_(aY2(|+ymW4tt@*gwiLQDVw5iI<}d5D{a(9k3Eit3AyErfhBV`xK~k8li$ z*4v5mAx8!d;7E3oq+~}Zh4>5%7aKvtSu>DLbEFm=FgiplSA?d>wtD2qAO%_N7o~q} zAw-;^of1|H%M7Sc6arlX3Moa9QL{A;xWnWDMetvQf|wkL$aO%B>J;v@kRA_(vD3pq zp-5uj5Yjv#B~lNTkRt#QP!OnxDkYhus=Wgv8ImA3^b%x&G!X*C$#7&5Bjd;r_=&4f z7*U#4gpO6LIR;V`gV!d#IfWkixKp~u@I=!eXr%yt2D`D2?s(kW$pVV_`MHDHS z>_r8F;VFyZl?8$}*^6ec9Q2Mi!@!pFqlj1OO&t0pkFP^oHGs%+3wQ8Dou zVRL5i%ceH!W#;KV>;id#r1-ke792f|k)gsIdg041UDuwl!2TY^Ir4D5c zMAj?9G3Jm2;UYcvn>blI3P^P?j4;G1c73Ru7(HzjdG?G6=7*HSXc2 zt)r5Rph&#q7lo;o;Ww~($w_s*Iw`n#SkwRv&%p+Ji9aX zMG(uOCV^9;4wiQD2)76Da5+ukA5Zf)<)m{Kia2z>w zb`pVO@{y@S)Zqc+i@5pc-YfeU>n{LJN()$cuL~N=mX%C`;G#)vC9=zKre{AN&a#i$ zrN9u(2#2Ojgv47Wwzb$^!z(3^ny;S2%|R!fPknttnKvJji}FFk=6|z!CP1KI0{ipq zai%yA{PXizmU-mOyo;_y7hUkL2=ggd1R!ZGX{qb8u=*6A57_2nNN}eSr!-4Sw+PVs z!}S6?nayOg=Z3QbBiVgtjpdrOm=uD8WDaSK))pZx6Hgp_GR@$2bEPCSu;Q<`8x!jWp_5L?Cr0TR!CB97z14c*UNzSJaTgI72gCvCQs z_#wk0f63T-x~-f6#%gY`;UdGK4O?^LN)Z zNQg$mQx)5_fnY_%As2(#Ok*f>-|NiJve`R7_ZFOZt~n-4EMp&H`;fP61IpYj+a0KK z1>3k3Hq7pW_RCw{7-i_E|LK1EMXt{EuP%M|_uh*e{5FU!wGF=Zn};%2 z2N(wnL7sI47|!jkXgdUI=WmC@glc^MKkVO$VDJAs;lG7n39k#k5PmHDv+zUV2f`D= zH-xVUr-kFfG2wpUi10DtE+H;-3t^#E*et9URtd|5g+jeBODGpggo%P*aPV3F3jYpV zKX{Y>fBet*m-ushhW{@AZT_qLm-#R9pXWcxALY~h-F%Yo;Ujz-zn$O6ui=~cCH#E8 zhM&Qg@kM+d@8NC!VgGynxBVCVL;ipD|HS`-|9Ss2{wMw4^grhRlK%_-&-fqk-|Ii* z-{>rkjXZuz-m9^NE-@kBGT* ziD+mbqQ0Jpx;i3iYl*0-A)>mPh&gkJm_3__S+j_kIg^O0Dk5ggAfmF8h>8j#rcWoL zyqt(>(})m7BFf5$m^zh+DN~3jEhVC)gow$Li6|~6qNs?7!a^bn3W%6AiHM04iOA0< zV!{L>^74oX1c(p>B6yw%zn=)7j|i`q2#<#dx0?u;iwLKa2#13RyPXJ*Bf@4Qf@L|l z7U0Iu|C6h8f38tt)oU?eF<>!ZF<>!ZF<>!ZF<>!ZF<>!ZF<>!ZG4PQyfbaiX`~Q!e zW~%~=0gC~P0gC~P0gC~P0gC~P0gC~P0gC~Pfj=t)*8TrKEB#jG76TRo76TRo76TRo z76TRo76TRo76TRo76bn)40zDb(JXW~6233=+Wy`qaQ6yx_+Rs1OoUe(Fkh$gh07p(sW}?kQx}Z2M^nM} za54h$3+R2t+6)iKH`DtRs)kIy^Be4m?!}rfrjOsz9Ztaa>^mb#z+ZGD_!5`4{oxO6 z*7ZhXoxS12-nvvInM#slQ2#<_a4nZEya9k{tRoTWjl@#no?!AotON9KThFCSt`{oj zi=*v{_`YN$k*w>ACL&$&{Q$PRnM>ziFPN!eDuU6#U4Q%&qy|IdYt_-6i1$YOdqJ(8 zYq<2J8&FGwF!X9&&872h07>JDeCZW5+VQ7!;P~z93#YoV0k%;7aPvp{_4-5vzVF?M z)t*7`i@)TP;iJF7O>eFp`$clfs*nBvJiWP=sII~T=cSZQCHEWOEnIpYofU|v-5crc z4flc8m6W@3PbAe7?Tx0e6;|BH>|K|PrXn?c;f}rGJ&|M`@Y{Ew4i1e|mQyCXe)u%to_#jRX=AWmZnN~0e(EJuIJ%2jm#(7(s?1tMS3jH=7Q5@VlJn|h|K4yc)xD=b))}tb7f!kQB?&U;NK(CYbR zurrd1kOc_Z-NdDx*L$j(bBd{NGq^@BZNDK1*t|sWhQR(mKaUR@q}x_kZ+g zu*4tmpJ%3`XI!&=Pw~ItEA8)lv)+HOowF}-3XbnMUvTbqJs9HBE2vmSPqe)b@5Cd) zj_ydu-e9!14^OYR(XF66noPy_B*MK(t%9ZwF1?0c3v$kfv^pB8EuMrW&JsgROC;K)vy~b> zSa&$%O~jLZkxnkX&OFaV_SLP3^uQ^Tx+|PpJHNJ}4wr!5>W;_e)i%r@fBUFeV+{9a zXD}JLvp*8+z*jJDqBqazl~|W{b2slAF}as^+qv{=D!ohADRNGvn!9F|wced*3zaJF zf^8%0lE7Y^)a-?*t?b*wotnxu)b>aO-k2Jzshv{i>=^@_xO5d2ipgdU&s6#JHqhj> zr4C4d$)qC%*puija4I%zFTgmoEwq=q?nV!K{5{MrDpUvY;!=w78PA>JR5%&$ zPmrZ+EnkV~7zDx2Imcf^?KF*Gcf}KMx}dR*KaBG@HQQ6L!Ae%s<(k;Ert+sUwWvCg5+Nh;O>6X}Wg0y9%mV{?Swe zvr5YhfTz>9qz8dq;E zPO(FCenHVv3g8q6r*ZdpX9-=kbMF4O!FQr)4kG1Rws8YBR1Em5==JE>8>rihpazO_ zmwqpAcOd{%oXyo9g-a0R9^|$HfN!A3Oh3K~1~W_RB*3M!PLJ8NwoasmP1cDxY*_Kd zLAbFO`@&KOVY?XTtd}{#E>uB(ibD3nP0I(D%!JOX) z?cf2N=Bj3#*lzXHq9X|BDEQ|iE7<|mex3LKb8gyecibB} zKycm&f_~e7y$v9y+z5pBGuJxP{q_|spm}cu%>=TR0g&SB1o!=U zf1MxTr~A+Q_xioQulZJc|I7P;x7za)Prqlf`)T(U*A>_2U2~m3cE+7P$3Hl3vtPC! zw=d=1;Er-twjbHLY>bSSnJ+Wz(c1ue18&zgXd9KO2JiVt-&NbSl_vV=yK39E z06LYE6{5Rx%MKR(Et>(ihQ@%!xH0wG@77I#N@eN9sQR_v9d`gMmDLTyYPS~2-GJ>I z0h-E5EQSsmvsZ8f0I#4$aM0NDXqO|~Zifn}EOA70^5X)0k6yzM-bS?n`0P9Pr#8X$ zR4W*JMSuG`sDNJI8dvmN*8(t=)t^wGb49;(4OQ%%EBZT|0h`JxDA7CjivHHs08VY} z>sRz!Z>7oynW;1r53d63)zrr5cvtkpE2&m7-WC1O3aEn0PM`EDsUr7^ey|B@pf{I| z{ZU+dBUG@4T1VtuP}Dj*B+;iSm@~XumqQKIrUlXuN(Hc9w=bj0P|it=&d^e-45jwP zJEOhiqOMlMj$5DxDhs*(5T*8`xSdM?n#$a!LmNMeYq=S~>0D#feh7mf{)B#QTMWol zR&iZz?Wf7@v7Jv=~4oq`|dixr(!Y&zG){Ot_65{Cne+kC~kKRtri$(41LBA*p&PK4@wF|AYT2|0Dhx{yY3b{9XPN{(gU}|8{?)zua&4UG|;y_4s!BR(MZ( zZgD^5e8Ty+&Q;DD$13|3`;+!TJI5Sm)-#LHQ|M9j&<$Jyz(xGk@HxrSYBt0n6l$yy z=0H%K!y+VCh9LRrf!TJ(ojx>sF#h1J5Dut+2uTk>ChTUenSfCHj}T&zD-c4+RxdM< z#2`S#&ZYic%x1rt%?3~94Wfr#3`sH{TsVKD%t<1NRSI)RoG=SCnlTFy#XOwCNl0EW z6XJSio^=d=0ig_Y{FRqZKExb7Dwbo{bmS=HQYv7O2-UCN*uYL~U~BF>R0WNB@L(1D z3UXj98HwFp?>eZNfEy;%?RE~)aupKj{>!# z5yC#)SXqmbh=;{?A_7Y!#F>gItJu5Xo_sB zM~)0qkkx)s`o|VR#2MNtVYRT#fC@z+&^4ftQUn<_TjPK`Ode1K|3xT>$$^Mm2gImO z;Z6(b@lY5$Jq#3zBnA#4%>z;*^p(RrDqA^BtUlCzJg%-u=SqY}t9_ zR7#7G73Y>vef1G2gj^z$HI7LZNNvO(fI%YG3l|6|{Nn%=!bz&ri@I|9Bs8}YW{s}O zC%^YeJ%?6Ak&-zc3{nDKDzL_=;wO7qc4{`8%qbu(<%%xvL_;4wc)WNzNdZM|sFZl+ z{*%nNF0g@h(=kH9^fOPrxY$&>=+!r#6MUyDKa-;)I$~;vnmc{W_dCD^H^QVN<$!j0 zZFv)mV!NCNSvAjkRgMs*7g$3#$?Xt$>lb6I#Y}^hNI0{}aZ_=E z;c!9CoYgU>CfZQ(FTs#9NMvm&z;+x(a@vQdP1HrUr55l8D(&hzR$5E4lQoUNMCNQ^9 zscKfLY|v6sG4UB;b7t_%rZ(zj=IK7{2`M!waiY25rUX+3y7&1U7zmuzBFqhft~OyR zh4q*ROryo64rL8Q)+@p>=8y#8B0cw;I9YLqMtTikefza!B4rzTBPRm!=!i)ToTk;B z!M#+Z(qMoCXDG8M7X;PwqayU+n`w3mSim4RC2>OS4i!t+QZnozq?faqY~)j#iO5x= zmLafG@yUt^AX!;jv~*o^h(&?HU@0hZ@3PX7|0W5U*hqGVrt_~<<91d3P^&1UHBj<+ zH6+8Dv@TWcQR#5QUJM@;?b3=PRHc*p|`J|I5+Up)b#GxUhoRV z5HA+9%Rn(ayEFAg5Yb-Ll>H=PT%3qOluB^A1Qvx|01NnIP!p226+-jV)rCqDB*+U8 z+b!DFUI0yS965A$5`km#k*Pz};Q`}|xcTSaEBhGhF91zS3t0GF7aGczl}v)*qDgEe zvdeI$XFng#vX9xNz!1y`ho($~#9Jn|wb)+6D^f%UGT36^C?#ZAZaaWsq3?_`V^lJ*ydtL zaHkQcG)qdi2+;b&^#VJY&1AFZhO+}B*?nh?<(jmZ6oP|f4rz_n79lMYQi;ou2y!$p z4$920o=)N{BnL=RECe2elxaip>*+>LJduG@nqWf0k!s}-TgC$c63>1jj^n`%-OpUU z)FfqtS2!0ZZMK#8A;Thn(TnaEvLhqIXGY#Xb&su)WECAwA0!5R>_w*+d%R79UJrPR zJZuYl@lJM<8v@mC7HyenLyz5Giu%CX5G%MKT6CSw4rjBcUJqVc?sVeP&bRo34s2!n z9lDhvbm5)9yRJb(G#Z|&*scu(DUK$R=l#-*@fb|17~-s;9ELqGjb_tP(Ob*>)|pE&p1_q_7WAN=6I&OE++ z$&<(b_1&I#dRDv|U;XUWXCHlH;KIung4yi8i5wK4&0QW|dT99N7hipK>9fE0Ufkff zL2RjQ@U`DOl({;&8~JkDABf^<%o%#CP~Jr9E`+V$HH4BK0D~fji2X;*H>$zmbh#keOssgFOe5j#V(}DH}k^!J9W+ zcK$$iBlJAZVshH@4^g9&a!%IFOGE}$ka)vmMVzEYF}v2{`WCGPXmlDb zUg1KuAYiv*$9WXn7G+5cAdXy@!pjie9e_eas3GMPBrrh)B&-!yhzqa5=uK-zYPBVu zaFszvFaR`2CISvhLpYRiAvG?fC7Gm}q@a`aJ&<-b;~wXmaSKMfi{Ci2|5G8%o_0!^ zQL}&^qV^xQNistc$hw3zhcwk?G!N?2eb|H9*PR4DpJO$cJHMJ8TyZOCbg$#qcL$zz zqUUhS%UuDLNeJ}WQ3WF&178J_0`MM6@Qvg2;Up8S)EvfXAMOXlLKig4uxnNLQ!|Yf z8DVVPs2I0qzoX)npgB>_UPI+^Oa7^mV4~pUbYc~-c(`*FTzC0cJ~UGzK-lEewh{uA z%{)m`4rfO(@w^hPjdxMo{^Dq3zoe7hu4Fgu%WwpX%^B(otEpIQe6i3J(!CvLbg#+t zt`mt$P=dC-X-YA9hZdEt1QWV>C0nf#EHN;zBXaS^5lH8+z@|Q91&c1v&^5;Ci{MJc z_%5T-;X0y~P^BuA>0H#tC!k2gFO9|fk%pxt5lREL15Kb*I??p*&zqrVM8XqLMS2lJ zq`M?6FK$p`0flrOj>C>sR8Rt=j^R;ZQ&2*Vd)G8V-SB78g*Map}T?*-O#FH}&jlDH8QUjXH4&B)0MNhU^cPSu^&JW6>8$qk|rm*abY(OQ-htp%`! zieyrepzSWi4&oFUABg2*&nWl&sPMQw-#$8%623;XZuxSNQ-Yx%kP^j?<#-uc&aPgD z#}&V9@C6qW(orA4=gxn?i9jpq1qsQ@IQFF{EydHu?4{qfq0n=WXy<|%6~(EcHS6O= z?TsNNMOGW0A;BWOk6$>$DT}_6%{VF6PA49N{Q=c z?7MmPfRMi!d@~nsLqp}RzS~zy%}+lSbcu@~vT4!e*S`PZhtJ$Y^aw#M(0@bOk>9@l z+sjVpLdX$VcoQSs1Rn9G+5b+G;Z9iI0*I(s@Oh=Rk3RYSYxm^Ohp3tJV}*C8FQG%F z2OaOc{ytYS4gcDAsi`KQHeAJYLouyZEbT z{{HpG3RL>s^l6;wVxcxVu_ng3VZBZxj(&U8sjA8RA=S8ubPY3tJbK>&8|JZQipLdU zXX+It!Zev_5vo*rEmxb-*J5+31HO!5r&bg)>odHk_)O^NZGivcsJb8sKef&9Uza8; zDXtuf@bo0&1SzM1rrM41X`<<9-dy&)f`>X)Nknm81Gb0;R*#f_mUh?UkustFd(#=k zw8?hvw30;vYN37d%5p{oWn=`7!v|0a9G*=@?7;-!08BAhkb{IhB0k_$8GCg5#WdmA z4E)L1a|LwrIqN{ZK;5?|r){-ZDM#&V1Nnv};6kU#89I1WqLtm$E1lq@hX-gaA@^ z3PTW-wJ6Sq=P=5$!bNm6m-v6fmVgGjDRfx8n#el_t#el_t#el_t#el_t z#el_t#lZg>13SVTrQfwD!m-YHZ(Xnt-mu$)UnScb0%$6~YuBKS-?c9f0=Df>e?(h< z?0iid02Kb|0ONerYTa&tD!ERmoX6<(C)kaTTCLs%VENYxW_tE~%tx)(?*yR#*>m;U z7pb5xn_B^G(x0xD24Q^E>ed~Al=r72X^nDWQ=c;t%qZ{onQP@O|j}vai|uj`#E4<(}VoKJA(3{ulQ__Y~JN@TI?R zyQ2SV!{4gNV!&d+V!&d+V!&d6FhK3o4cb|&enK3k_CjV2+BhK&ZRXOm7#a@<8?S9~ zQhPhR@o=a4;n8|(-wrVFO%IROQR~SukL=E+){`2v(UT2Szf6+zasxPR-nN}fH&C$? z{J5+7*%QFs@)z~~e2SJa>K#gVP<`XS{^l3D?{d$5oyKyyFZ(yp!x?hn9W!;Ia!3U6?;F`O?cT|I~qgQ*if&;PPjdFMmcDc;cJ6Tlo#@m&z?0e@DQ_u=AwXJO<@4vsNYxzCDd}*L7n8LD>=Dnvc)zABbLESAX@{y8jOkv?bpF literal 0 HcmV?d00001 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a4fc6a6..355c7b2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -83,8 +83,13 @@ src/mindmark/ ├── __init__.py # Package initialisation ├── __main__.py # Entry point for `python -m mindmark` ├── cli.py # CLI entry point (argparse) -├── index.py # Indexing logic -└── parser.py # Bookmark / document parser +├── index.py # Embedding index + incremental sync logic +├── parser.py # Netscape HTML bookmark parser +└── browsers/ # Direct browser bookmark reading + ├── __init__.py # detect + collect bookmarks from all browsers + ├── paths.py # OS-specific browser path resolution + ├── chromium.py # Chrome / Edge / Brave JSON parser + └── firefox.py # Firefox places.sqlite parser ``` ## Questions? diff --git a/README.md b/README.md index a5c3055..f3474fd 100644 --- a/README.md +++ b/README.md @@ -38,13 +38,25 @@ Ask in natural language — mindmark remembers what you saved. | Command | What it does | |---|---| -| `mindmark index ` | Parse an exported bookmarks HTML file, embed every bookmark locally, store vectors in SQLite | +| `mindmark sync` | **Auto-detect** installed browsers and sync bookmarks directly — no export needed | | `mindmark find "query"` | Semantic search over titles, folders, domains, and URL slugs — returns top-K with similarity scores | | `mindmark open "query"` | Search and open the best match in your default browser | | `mindmark stats` | Show index size, model info, top domains, and top folders | +| `mindmark index ` | Import bookmarks from an exported HTML file (legacy workflow) | > 🔌 **Works offline** after the first run. Embeddings run on-device via [fastembed](https://github.com/qdrant/fastembed) (ONNX Runtime, ~130 MB one-time model download). +### Supported Browsers + +| Browser | macOS | Linux | Windows | +|---|---|---|---| +| **Chrome** | ✅ | ✅ | ✅ | +| **Edge** | ✅ | ✅ | ✅ | +| **Brave** | ✅ | ✅ | ✅ | +| **Firefox** | ✅ | ✅ | ✅ | + +mindmark reads bookmark files directly from browser data directories — no export step, no browser extension. + --- ## 📋 Prerequisites @@ -121,16 +133,58 @@ pip install -e .[dev] ## ⚡ Quick Start -### 1️⃣ Export your bookmarks +### 1️⃣ Sync your bookmarks (no export needed!) + +```bash +mindmark sync +``` + +That's it — mindmark auto-detects your installed browsers, reads their bookmark files directly, and builds a searchable index. **No manual export required.** + +> First run downloads the embedding model (~130 MB) and caches it locally. Every run after that is instant and fully offline. + +
+💡 See which browsers were detected + +```bash +mindmark sync --list-browsers +``` + +Example output: + +``` +Browser Profile Path +------- ------- ---- +Chrome Default ~/Library/.../Google/Chrome/Default/Bookmarks +Chrome Profile 3 ~/Library/.../Google/Chrome/Profile 3/Bookmarks +Edge Default ~/Library/.../Microsoft Edge/Default/Bookmarks +``` + +
+ +
+💡 Sync a specific browser only + +```bash +mindmark sync --browser chrome +mindmark sync --browser firefox +mindmark sync --browser edge +mindmark sync --browser brave +``` + +
-| Browser | How | +
+💡 Alternative — import from an exported HTML file + +If you prefer the manual export workflow, or need to import bookmarks from an unsupported browser: + +| Browser | How to export | |---|---| | **Edge** | `edge://favorites` → `⋯` → **Export favorites** → save as HTML | | **Chrome** | `chrome://bookmarks` → `⋮` → **Export bookmarks** → save as HTML | | **Firefox** | `Ctrl+Shift+O` (`Cmd+Shift+O` on macOS) → **Import and Backup** → **Export Bookmarks to HTML** | -### 2️⃣ Build the index - ```bash # macOS / Linux mindmark index ~/Downloads/bookmarks.html @@ -139,9 +193,9 @@ mindmark index ~/Downloads/bookmarks.html mindmark index "$env:USERPROFILE\Downloads\bookmarks.html" ``` -> First run downloads the embedding model (~130 MB) and caches it locally. Every run after that is instant and fully offline. +
-### 3️⃣ Search in natural language +### 2️⃣ Search in natural language

mindmark find demo @@ -154,7 +208,7 @@ mindmark find "helm chart examples" --domain github.com mindmark find "docker compose setup" --folder devops ``` -### 4️⃣ Open a result directly +### 3️⃣ Open a result directly ```bash mindmark open "k8s cheat sheet" # opens the best match @@ -179,7 +233,7 @@ mm open "docker setup" ``` -### 5️⃣ JSON output for scripting +### 4️⃣ JSON output for scripting Pipe results into **fzf**, **jq**, **Alfred**, **Raycast**, **PowerToys Run**, or any tool that accepts JSON: @@ -195,6 +249,21 @@ mindmark find "istio service mesh" --json | ConvertFrom-Json | ForEach-Object { ## 📖 Usage +### Syncing + +`mindmark sync` reads bookmarks directly from your browser data directories. It's **incremental** — only new or changed bookmarks are re-embedded, making re-syncs near-instant. + +```bash +mindmark sync # sync all detected browsers +mindmark sync --browser chrome # sync only Chrome +mindmark sync --browser firefox # sync only Firefox +mindmark sync --list-browsers # list detected browsers and profiles +``` + +When you add new bookmarks in your browser, just run `mindmark sync` again — it will pick up only the changes. + +> 💡 **Note:** If you change the embedding model with `--model`, all bookmarks will be re-embedded on the next sync. Browser names are case-insensitive (e.g., `--browser Chrome` and `--browser chrome` both work). + ### Filters Narrow down results without changing your query: @@ -207,42 +276,47 @@ mindmark find "useful tools" -k 20 # return top 20 instead of ### Re-indexing -Just rerun `mindmark index `. It clears and rebuilds the index. The model is cached, so re-indexing 800+ bookmarks takes only seconds. +For the `sync` workflow, just rerun `mindmark sync`. It's incremental — only changed bookmarks are re-embedded. + +For the `index` workflow, rerun `mindmark index `. It clears and rebuilds the index. The model is cached, so re-indexing 800+ bookmarks takes only seconds. ### Swap the embedding model ```bash -mindmark index bookmarks.html --model BAAI/bge-small-en-v1.5 # default, 384-dim -mindmark index bookmarks.html --model sentence-transformers/all-MiniLM-L6-v2 -mindmark index bookmarks.html --model BAAI/bge-base-en-v1.5 # 768-dim, higher quality +mindmark sync --model BAAI/bge-small-en-v1.5 # default, 384-dim +mindmark sync --model sentence-transformers/all-MiniLM-L6-v2 +mindmark sync --model BAAI/bge-base-en-v1.5 # 768-dim, higher quality ``` -Switching models triggers a full re-embed automatically. See the [fastembed supported models list](https://qdrant.github.io/fastembed/examples/Supported_Models/). +The `--model` flag also works with `mindmark index`. Switching models triggers a full re-embed automatically. See the [fastembed supported models list](https://qdrant.github.io/fastembed/examples/Supported_Models/). --- ## 🧠 How It Works ``` -Bookmarks HTML "python async tutorial" - │ │ - ▼ ▼ - ┌────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ Parse │───▶│ Embed │───▶│ Store │ │ Embed │ - │ HTML │ │ (ONNX) │ │ (SQLite) │◀────│ query │ - └────────┘ └──────────┘ └──────────┘ └──────────┘ - │ │ - ▼ ▼ - ┌──────────────────────────┐ - │ Dot-product similarity │ - │ → top-K results │ +Browser data files "python async tutorial" +(Chrome JSON / Firefox SQLite) │ + │ │ + ▼ ▼ + ┌────────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ Detect & │─▶│ Embed │─▶│ Store │ │ Embed │ + │ Parse │ │ (ONNX) │ │ (SQLite) │◀────│ query │ + └────────────┘ └──────────┘ └──────────┘ └──────────┘ + ▲ │ │ + │ ▼ ▼ + only new/ ┌──────────────────────────┐ + changed │ Dot-product similarity │ + bookmarks │ → top-K results │ └──────────────────────────┘ ``` -1. **Parse** — A stateful tokenizer reads the Netscape bookmarks HTML and extracts every link with its full folder path. -2. **Embed** — Each bookmark becomes a rich text string (`title | folder | domain | path`) and is passed through a BGE/MiniLM ONNX model. Vectors are L2-normalized. -3. **Store** — Vectors live as `float32` blobs in a single SQLite file. For 800–10,000 bookmarks this is simpler than a vector DB and still sub-millisecond. -4. **Search** — Encode the query, compute dot products against all vectors, return the top-K. +1. **Detect** — Auto-discover installed browsers (Chrome, Edge, Brave, Firefox) and their profiles across macOS, Linux, and Windows. +2. **Parse** — Read bookmark files natively: Chromium JSON format or Firefox `places.sqlite`. No export step needed. +3. **Diff** — Hash each bookmark's content and compare against the existing index. Only new or changed bookmarks proceed to embedding. +4. **Embed** — Each bookmark becomes a rich text string (`title | folder | domain | path`) and is passed through a BGE/MiniLM ONNX model. Vectors are L2-normalized. +5. **Store** — Vectors live as `float32` blobs in a single SQLite file. A `bookmark_sources` table tracks which browser contributed each bookmark, so multi-browser syncs don't conflict. +6. **Search** — Encode the query, compute dot products against all vectors, return the top-K. --- @@ -357,6 +431,15 @@ ENTRYPOINT ["mindmark"] ```bash docker build -t mindmark . + +# Sync from browser bookmarks (mount browser data directories) +# Note: browser data paths vary — this example is for macOS Chrome +docker run --rm \ + -v $HOME/.mindmark:/root/.mindmark \ + -v "$HOME/Library/Application Support/Google/Chrome":/chrome:ro \ + mindmark sync + +# Or import from an exported HTML file docker run --rm -v $HOME/.mindmark:/root/.mindmark \ -v $HOME/Downloads:/downloads mindmark \ index /downloads/bookmarks.html diff --git a/src/mindmark/browsers/__init__.py b/src/mindmark/browsers/__init__.py new file mode 100644 index 0000000..af265a8 --- /dev/null +++ b/src/mindmark/browsers/__init__.py @@ -0,0 +1,56 @@ +"""Browser detection, path resolution, and bookmark parsing. + +Provides auto-detection of installed browsers and their bookmark files, +with parsers that produce the same ``Bookmark`` dataclass used by the +rest of mindmark. +""" +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +import json +import sqlite3 + +from ..parser import Bookmark +from ..index import SyncResult +from .paths import detect_browsers, BrowserProfile, SUPPORTED_BROWSERS + + +def parse_browser_bookmarks(profile: BrowserProfile) -> list[Bookmark]: + """Parse bookmarks from a detected browser profile.""" + if profile.browser_type == "chromium": + from .chromium import parse_chromium_json + return parse_chromium_json(profile.bookmark_path) + elif profile.browser_type == "firefox": + from .firefox import parse_firefox_places + return parse_firefox_places(profile.bookmark_path) + else: + raise ValueError(f"Unsupported browser type: {profile.browser_type}") + + +def collect_all_bookmarks( + browser_filter: str | None = None, +) -> list[tuple[BrowserProfile, list[Bookmark]]]: + """Detect browsers and parse bookmarks from all (or filtered) profiles. + + Returns a list of (profile, bookmarks) pairs. + """ + profiles = detect_browsers() + if browser_filter: + filt = browser_filter.lower() + profiles = [p for p in profiles if p.browser_name.lower() == filt] + + results: list[tuple[BrowserProfile, list[Bookmark]]] = [] + for profile in profiles: + try: + bookmarks = parse_browser_bookmarks(profile) + results.append((profile, bookmarks)) + except (OSError, ValueError, KeyError, json.JSONDecodeError, + sqlite3.Error) as e: + import sys + print( + f"warning: failed to read {profile.browser_name} " + f"({profile.profile_name}): {e}", + file=sys.stderr, + ) + return results diff --git a/src/mindmark/browsers/chromium.py b/src/mindmark/browsers/chromium.py new file mode 100644 index 0000000..6e037f2 --- /dev/null +++ b/src/mindmark/browsers/chromium.py @@ -0,0 +1,80 @@ +"""Parse Chromium-based browser bookmarks (Chrome, Edge, Brave). + +The ``Bookmarks`` file is JSON with this structure:: + + { + "roots": { + "bookmark_bar": { "children": [...] }, + "other": { "children": [...] }, + "synced": { "children": [...] } + } + } + +Each node is either a **folder** (``"type": "folder"``, has ``children``) +or a **url** (``"type": "url"``, has ``url`` + ``name``). +""" +from __future__ import annotations + +import json +from pathlib import Path + +from ..parser import Bookmark + + +def parse_chromium_json(path: Path) -> list[Bookmark]: + """Parse a Chromium ``Bookmarks`` JSON file into a list of Bookmark objects. + + Deduplicates by URL (keeps the first occurrence). + """ + path = Path(path) + with open(path, "r", encoding="utf-8", errors="replace") as f: + data = json.load(f) + + roots = data.get("roots", {}) + bookmarks: list[Bookmark] = [] + seen: set[str] = set() + + for root_name in ("bookmark_bar", "other", "synced"): + node = roots.get(root_name) + if node and isinstance(node, dict): + _walk(node, [], bookmarks, seen) + + return bookmarks + + +def _walk( + node: dict, + folder_stack: list[str], + out: list[Bookmark], + seen: set[str], +) -> None: + """Recursively walk a Chromium bookmark tree node.""" + node_type = node.get("type", "") + + if node_type == "url": + url = node.get("url", "") + if not url or url in seen: + return + seen.add(url) + + name = node.get("name", url) + try: + add_date_str = node.get("date_added", "0") + # Chromium stores dates as microseconds since 1601-01-01 + add_date = int(add_date_str) if add_date_str else 0 + except (ValueError, TypeError): + add_date = 0 + + out.append(Bookmark( + title=name, + url=url, + folder_path="/".join(folder_stack), + add_date=add_date, + icon=None, + )) + + elif node_type == "folder": + folder_name = node.get("name", "Unnamed") + children = node.get("children", []) + for child in children: + _walk(child, folder_stack + [folder_name], out, seen) diff --git a/src/mindmark/browsers/firefox.py b/src/mindmark/browsers/firefox.py new file mode 100644 index 0000000..e2a77e6 --- /dev/null +++ b/src/mindmark/browsers/firefox.py @@ -0,0 +1,147 @@ +"""Parse Firefox bookmarks from ``places.sqlite``. + +Firefox stores bookmarks in an SQLite database. The browser holds a lock +on the file while running, so we **copy** it (including WAL/SHM files) to +a temporary directory before reading. +""" +from __future__ import annotations + +import shutil +import sqlite3 +import tempfile +from pathlib import Path + +from ..parser import Bookmark + +# Firefox bookmark types (moz_bookmarks.type) +_TYPE_BOOKMARK = 1 +_TYPE_FOLDER = 2 + +# Built-in root folder IDs to skip as folder-path components +_ROOT_IDS = {1, 2, 3, 4, 5, 6} # root, menu, toolbar, tags, unfiled, mobile + + +def parse_firefox_places(path: Path) -> list[Bookmark]: + """Parse bookmarks from a Firefox ``places.sqlite`` file. + + Uses SQLite's backup API to create a consistent snapshot, which is + safer than filesystem copies when Firefox is running (especially on + Windows where file locking is stricter). + """ + path = Path(path) + if not path.is_file(): + raise FileNotFoundError(f"Firefox places.sqlite not found: {path}") + + with tempfile.TemporaryDirectory(prefix="mindmark_ff_") as tmpdir: + dst = Path(tmpdir) / "places.sqlite" + try: + # SQLite backup API: creates a consistent snapshot even with WAL + src_con = sqlite3.connect( + path.resolve().as_uri() + "?mode=ro", uri=True + ) + dst_con = sqlite3.connect(str(dst)) + src_con.backup(dst_con) + src_con.close() + dst_con.close() + except (sqlite3.OperationalError, OSError): + # Fallback: filesystem copy if backup fails (e.g. locked by OS) + shutil.copy2(path, dst) + for suffix in ("-wal", "-shm"): + sidecar = path.parent / (path.name + suffix) + if sidecar.is_file(): + try: + shutil.copy2(sidecar, Path(tmpdir) / (dst.name + suffix)) + except OSError: + pass + + return _read_places(dst) + + +def _read_places(db_path: Path) -> list[Bookmark]: + """Read bookmarks from a copied places.sqlite.""" + # Use Path.as_uri() for Windows-safe URI (handles drive letters, spaces) + uri = db_path.resolve().as_uri() + "?mode=ro" + con = sqlite3.connect(uri, uri=True) + con.row_factory = sqlite3.Row + try: + return _query_bookmarks(con) + finally: + con.close() + + +def _build_folder_map(con: sqlite3.Connection) -> dict[int, str]: + """Build a mapping from folder id → full folder path string.""" + cur = con.execute( + "SELECT id, parent, title, type FROM moz_bookmarks WHERE type = ?", + (_TYPE_FOLDER,), + ) + folders: dict[int, tuple[int, str]] = {} + for row in cur: + fid = row["id"] + parent = row["parent"] + title = row["title"] or "" + folders[fid] = (parent, title) + + # Resolve full paths by walking up parent chain + cache: dict[int, str] = {} + + def resolve(fid: int) -> str: + if fid in cache: + return cache[fid] + if fid not in folders or fid in _ROOT_IDS: + cache[fid] = "" + return "" + parent_id, title = folders[fid] + parent_path = resolve(parent_id) + if parent_path: + full = f"{parent_path}/{title}" if title else parent_path + else: + full = title + cache[fid] = full + return full + + return {fid: resolve(fid) for fid in folders} + + +def _query_bookmarks(con: sqlite3.Connection) -> list[Bookmark]: + """Query bookmarks from a places.sqlite connection.""" + folder_map = _build_folder_map(con) + + cur = con.execute(""" + SELECT b.id, b.title, b.parent, b.dateAdded, + p.url + FROM moz_bookmarks b + JOIN moz_places p ON b.fk = p.id + WHERE b.type = ? + AND p.url IS NOT NULL + AND p.url NOT LIKE 'place:%' + """, (_TYPE_BOOKMARK,)) + + seen: set[str] = set() + bookmarks: list[Bookmark] = [] + + for row in cur: + url = row["url"] + if not url or url in seen: + continue + seen.add(url) + + title = row["title"] or url + parent_id = row["parent"] + folder_path = folder_map.get(parent_id, "") + + try: + # Firefox stores dates as microseconds since epoch + add_date = int(row["dateAdded"] or 0) + except (ValueError, TypeError): + add_date = 0 + + bookmarks.append(Bookmark( + title=title, + url=url, + folder_path=folder_path, + add_date=add_date, + icon=None, + )) + + return bookmarks diff --git a/src/mindmark/browsers/paths.py b/src/mindmark/browsers/paths.py new file mode 100644 index 0000000..4cf00ab --- /dev/null +++ b/src/mindmark/browsers/paths.py @@ -0,0 +1,165 @@ +"""OS-specific browser bookmark path resolution and detection.""" +from __future__ import annotations + +import os +import sys +from dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class BrowserProfile: + """A detected browser profile with its bookmark file path.""" + browser_name: str # e.g. "Chrome", "Firefox" + browser_type: str # "chromium" or "firefox" + profile_name: str # e.g. "Default", "Profile 1", "default-release" + bookmark_path: Path # full path to the bookmark file + source_id: str = "" # unique id like "chrome:Default" + + def __post_init__(self): + if not self.source_id: + sid = f"{self.browser_name.lower()}:{self.profile_name}" + object.__setattr__(self, "source_id", sid) + + +# --------------------------------------------------------------------------- +# Browser path definitions per OS +# --------------------------------------------------------------------------- + +def _home() -> Path: + return Path.home() + + +def _local_app_data() -> Path: + """Windows %LOCALAPPDATA%.""" + val = os.environ.get("LOCALAPPDATA") + return Path(val) if val else Path.home() / "AppData" / "Local" + + +def _app_data() -> Path: + """Windows %APPDATA%.""" + val = os.environ.get("APPDATA") + return Path(val) if val else Path.home() / "AppData" / "Roaming" + + +# Each entry: (browser_name, browser_type, path_parts_tuple) +# Path parts are joined with Path.joinpath() — no OS-specific separators. +_CHROMIUM_BOOKMARK_FILE = "Bookmarks" + +_BROWSER_DEFS: dict[str, list[tuple[str, str, tuple[str, ...]]]] = { + "darwin": [ + ("Chrome", "chromium", + ("Library", "Application Support", "Google", "Chrome")), + ("Edge", "chromium", + ("Library", "Application Support", "Microsoft Edge")), + ("Brave", "chromium", + ("Library", "Application Support", "BraveSoftware", "Brave-Browser")), + ("Firefox", "firefox", + ("Library", "Application Support", "Firefox", "Profiles")), + ], + "linux": [ + ("Chrome", "chromium", (".config", "google-chrome")), + ("Edge", "chromium", (".config", "microsoft-edge")), + ("Brave", "chromium", (".config", "BraveSoftware", "Brave-Browser")), + ("Firefox", "firefox", (".mozilla", "firefox")), + ], + "win32": [ + ("Chrome", "chromium", ("Google", "Chrome", "User Data")), + ("Edge", "chromium", ("Microsoft", "Edge", "User Data")), + ("Brave", "chromium", ("BraveSoftware", "Brave-Browser", "User Data")), + ("Firefox", "firefox", ()), # handled specially + ], +} + + +SUPPORTED_BROWSERS = ["chrome", "edge", "brave", "firefox"] + + +def _chromium_base(path_parts: tuple[str, ...]) -> Path | None: + """Resolve the Chromium base directory for the current OS.""" + if sys.platform == "win32": + base = _local_app_data().joinpath(*path_parts) + else: + base = _home().joinpath(*path_parts) + return base if base.is_dir() else None + + +def _firefox_base(path_parts: tuple[str, ...]) -> Path | None: + """Resolve the Firefox profiles directory for the current OS.""" + if sys.platform == "win32": + base = _app_data() / "Mozilla" / "Firefox" / "Profiles" + else: + base = _home().joinpath(*path_parts) if path_parts else None + return base if base and base.is_dir() else None + + +def _discover_chromium_profiles(base: Path) -> list[BrowserProfile]: + """Find all Chromium profiles in a browser's data directory.""" + profiles: list[BrowserProfile] = [] + # Chromium profile dirs: "Default", "Profile 1", "Profile 2", etc. + candidates = sorted(base.iterdir()) if base.is_dir() else [] + for d in candidates: + if not d.is_dir(): + continue + bookmark_file = d / _CHROMIUM_BOOKMARK_FILE + if bookmark_file.is_file(): + profiles.append(BrowserProfile( + browser_name="", # filled by caller + browser_type="chromium", + profile_name=d.name, + bookmark_path=bookmark_file, + )) + return profiles + + +def _discover_firefox_profiles(base: Path) -> list[BrowserProfile]: + """Find all Firefox profiles in the profiles directory.""" + profiles: list[BrowserProfile] = [] + if not base.is_dir(): + return profiles + for d in sorted(base.iterdir()): + if not d.is_dir(): + continue + places = d / "places.sqlite" + if places.is_file(): + profiles.append(BrowserProfile( + browser_name="Firefox", + browser_type="firefox", + profile_name=d.name, + bookmark_path=places, + )) + return profiles + + +def detect_browsers() -> list[BrowserProfile]: + """Auto-detect installed browsers and their profiles. + + Returns a list of ``BrowserProfile`` instances for every discovered + profile that contains a bookmark file. + """ + platform = sys.platform + if platform.startswith("linux"): + platform = "linux" + + defs = _BROWSER_DEFS.get(platform, []) + found: list[BrowserProfile] = [] + + for browser_name, browser_type, path_parts in defs: + if browser_type == "chromium": + base = _chromium_base(path_parts) + if base is None: + continue + for p in _discover_chromium_profiles(base): + found.append(BrowserProfile( + browser_name=browser_name, + browser_type=p.browser_type, + profile_name=p.profile_name, + bookmark_path=p.bookmark_path, + )) + elif browser_type == "firefox": + base = _firefox_base(path_parts) + if base is None: + continue + found.extend(_discover_firefox_profiles(base)) + + return found diff --git a/src/mindmark/cli.py b/src/mindmark/cli.py index 9df3251..79aeee5 100644 --- a/src/mindmark/cli.py +++ b/src/mindmark/cli.py @@ -9,7 +9,7 @@ from . import __version__ from .parser import parse_file -from .index import Index, default_db_path, DEFAULT_MODEL +from .index import Index, SyncResult, default_db_path, DEFAULT_MODEL def _cmd_index(args): @@ -28,14 +28,25 @@ def _cmd_index(args): return 0 +def _auto_sync_hint(idx: Index) -> None: + """Print a hint when the index is empty.""" + if not idx.is_empty(): + return + print("index is empty — run 'mindmark sync' to import bookmarks from your browsers,") + print("or run 'mindmark index ' to import from an exported file.") + print() + + def _cmd_find(args): idx = Index(db_path=args.db) + if not getattr(args, 'json', False): + _auto_sync_hint(idx) results = idx.search( query=args.query, k=args.top, domain=args.domain, folder=args.folder, ) if not results: - print("no results (is the index empty? run: mindmark index )") + print("no results (is the index empty? run: mindmark sync)") return 1 if args.open is not None: @@ -79,6 +90,7 @@ def _cmd_stats(args): def _cmd_open(args): idx = Index(db_path=args.db) + _auto_sync_hint(idx) results = idx.search(args.query, k=1) if not results: print("no results") @@ -88,6 +100,49 @@ def _cmd_open(args): return 0 +def _cmd_sync(args): + from .browsers import collect_all_bookmarks, detect_browsers + + if args.list_browsers: + profiles = detect_browsers() + if not profiles: + print("no supported browsers detected") + return 1 + print(f"{'Browser':<12} {'Profile':<24} Path") + print(f"{'-------':<12} {'-------':<24} ----") + for p in profiles: + print(f"{p.browser_name:<12} {p.profile_name:<24} {p.bookmark_path}") + return 0 + + print("detecting browsers...") + pairs = collect_all_bookmarks(browser_filter=args.browser) + + if not pairs: + if args.browser: + print(f"no bookmarks found for browser: {args.browser}", file=sys.stderr) + else: + print("no supported browsers detected", file=sys.stderr) + return 1 + + idx = Index(db_path=args.db, model_name=args.model) + total_result = SyncResult() + + for profile, bookmarks in pairs: + source_id = profile.source_id + print(f"syncing {profile.browser_name} ({profile.profile_name}): " + f"{len(bookmarks)} bookmarks...") + result = idx.sync(bookmarks, source=source_id, batch_size=args.batch_size) + total_result.added += result.added + total_result.updated += result.updated + total_result.removed += result.removed + total_result.unchanged += result.unchanged + if result.total_changed > 0: + print(f" {result}") + + print(f"\ndone. {total_result}") + return 0 + + def build_parser(): p = argparse.ArgumentParser( prog="mindmark", @@ -123,6 +178,22 @@ def build_parser(): po.add_argument("query") po.set_defaults(func=_cmd_open) + psync = sub.add_parser( + "sync", + help="sync bookmarks directly from installed browsers (no export needed)", + ) + psync.add_argument( + "--browser", type=str, default=None, + help="sync only this browser (chrome, edge, brave, firefox)", + ) + psync.add_argument( + "--list-browsers", action="store_true", + help="list detected browsers and profiles, then exit", + ) + psync.add_argument("--model", default=DEFAULT_MODEL) + psync.add_argument("--batch-size", type=int, default=64) + psync.set_defaults(func=_cmd_sync) + return p diff --git a/src/mindmark/index.py b/src/mindmark/index.py index fd48de6..50fd430 100644 --- a/src/mindmark/index.py +++ b/src/mindmark/index.py @@ -1,6 +1,7 @@ """Embedding + SQLite-backed vector index for bookmarks.""" from __future__ import annotations +import hashlib import os import sqlite3 from dataclasses import dataclass @@ -12,6 +13,8 @@ DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" +_SCHEMA_VERSION = 2 + def default_db_path() -> Path: env = os.environ.get("MINDMARK_HOME") @@ -41,7 +44,14 @@ def default_db_path() -> Path: add_date INTEGER NOT NULL, icon TEXT, embedding BLOB NOT NULL, - dim INTEGER NOT NULL + dim INTEGER NOT NULL, + content_hash TEXT NOT NULL DEFAULT '' +); +CREATE TABLE IF NOT EXISTS bookmark_sources ( + url TEXT NOT NULL, + source TEXT NOT NULL, + content_hash TEXT NOT NULL DEFAULT '', + PRIMARY KEY (url, source) ); CREATE INDEX IF NOT EXISTS idx_bookmarks_domain ON bookmarks(domain); CREATE INDEX IF NOT EXISTS idx_bookmarks_folder ON bookmarks(folder_path); @@ -51,9 +61,40 @@ def default_db_path() -> Path: def _connect(db_path: Path) -> sqlite3.Connection: con = sqlite3.connect(db_path) con.executescript(_SCHEMA) + _migrate(con) return con +def _migrate(con: sqlite3.Connection) -> None: + """Run schema migrations for existing databases.""" + cur = con.cursor() + cur.execute("SELECT value FROM meta WHERE key = 'schema_version'") + row = cur.fetchone() + version = int(row[0]) if row else 1 + + if version < 2: + # Add content_hash column if missing (pre-v2 databases) + cols = {r[1] for r in cur.execute("PRAGMA table_info(bookmarks)")} + if "content_hash" not in cols: + cur.execute( + "ALTER TABLE bookmarks ADD COLUMN content_hash TEXT NOT NULL DEFAULT ''" + ) + # Create bookmark_sources table + cur.execute(""" + CREATE TABLE IF NOT EXISTS bookmark_sources ( + url TEXT NOT NULL, + source TEXT NOT NULL, + content_hash TEXT NOT NULL DEFAULT '', + PRIMARY KEY (url, source) + ) + """) + cur.execute( + "INSERT OR REPLACE INTO meta(key, value) VALUES ('schema_version', ?)", + (str(_SCHEMA_VERSION),), + ) + con.commit() + + def _vec_to_blob(v: np.ndarray) -> bytes: return v.astype(np.float32).tobytes() @@ -88,6 +129,32 @@ def embed_one(self, text: str) -> np.ndarray: return self.embed([text])[0] +def _content_hash(b: Bookmark) -> str: + """Hash the fields that affect embedding text.""" + payload = f"{b.url}\0{b.title}\0{b.folder_path}\0{b.domain}" + return hashlib.sha256(payload.encode()).hexdigest()[:16] + + +@dataclass +class SyncResult: + """Result of an incremental sync operation.""" + added: int = 0 + updated: int = 0 + removed: int = 0 + unchanged: int = 0 + source: str = "" + + @property + def total_changed(self) -> int: + return self.added + self.updated + self.removed + + def __str__(self) -> str: + return ( + f"{self.added} new, {self.updated} updated, " + f"{self.removed} removed, {self.unchanged} unchanged" + ) + + class Index: def __init__(self, db_path: Path | None = None, model_name: str = DEFAULT_MODEL): self.db_path = Path(db_path) if db_path else default_db_path() @@ -95,9 +162,189 @@ def __init__(self, db_path: Path | None = None, model_name: str = DEFAULT_MODEL) self.con = _connect(self.db_path) self.embedder = Embedder(model_name=model_name) + def is_empty(self) -> bool: + cur = self.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmarks") + return cur.fetchone()[0] == 0 + + def _model_changed(self) -> bool: + """Check if the stored model differs from the current one.""" + cur = self.con.cursor() + cur.execute("SELECT value FROM meta WHERE key = 'model'") + row = cur.fetchone() + if row is None: + return False # no model stored yet + return row[0] != self.model_name + + def sync( + self, + bookmarks: list[Bookmark], + source: str = "html", + batch_size: int = 64, + ) -> SyncResult: + """Incrementally sync bookmarks from a source. + + Only embeds new/changed bookmarks. Bookmarks removed from *this* + source are deleted from the index only if no other source + references them. + """ + result = SyncResult(source=source) + + if not bookmarks: + # Delete all bookmarks from this source + removed_urls = self._remove_source(source) + result.removed = len(removed_urls) + return result + + # If the embedding model changed, force full re-embed + force_reembed = self._model_changed() + + # 1. Hash incoming bookmarks + new_map: dict[str, tuple[Bookmark, str]] = {} + for b in bookmarks: + h = _content_hash(b) + if b.url not in new_map: # dedup by URL + new_map[b.url] = (b, h) + + # 2. Load existing hashes for this source + cur = self.con.cursor() + cur.execute( + "SELECT url, content_hash FROM bookmark_sources WHERE source = ?", + (source,), + ) + existing: dict[str, str] = {r[0]: r[1] for r in cur.fetchall()} + + # 3. Compute diff + new_urls = set(new_map.keys()) + old_urls = set(existing.keys()) + + to_add_urls = new_urls - old_urls + to_delete_urls = old_urls - new_urls + common_urls = new_urls & old_urls + + to_update_urls: set[str] = set() + for url in common_urls: + _, new_hash = new_map[url] + if force_reembed or new_hash != existing[url]: + to_update_urls.add(url) + + result.unchanged = len(common_urls) - len(to_update_urls) + result.added = len(to_add_urls) + result.updated = len(to_update_urls) + + # 4. Embed only what changed + to_embed_urls = to_add_urls | to_update_urls + embed_list = [new_map[u] for u in to_embed_urls] + + embedded: dict[str, tuple[Bookmark, str, bytes, int]] = {} + for start in range(0, len(embed_list), batch_size): + chunk = embed_list[start:start + batch_size] + texts = [b.embedding_text() for b, _h in chunk] + vecs = self.embedder.embed(texts) + for (b, h), v in zip(chunk, vecs): + embedded[b.url] = (b, h, _vec_to_blob(v), int(v.shape[0])) + + # 5. Apply all DB changes in a single transaction + cur = self.con.cursor() + try: + # Update model metadata + cur.execute( + "INSERT OR REPLACE INTO meta(key, value) VALUES ('model', ?)", + (self.model_name,), + ) + + # Insert/update bookmarks and sources + for url in to_add_urls: + b, h, vec_blob, dim = embedded[url] + # Check if URL exists from another source + cur.execute("SELECT 1 FROM bookmarks WHERE url = ?", (url,)) + if cur.fetchone(): + # URL already indexed by another source — update metadata + cur.execute( + "UPDATE bookmarks SET title=?, folder_path=?, domain=?, " + "add_date=?, icon=?, embedding=?, dim=?, content_hash=? " + "WHERE url=?", + (b.title, b.folder_path, b.domain, b.add_date, + b.icon, vec_blob, dim, h, url), + ) + else: + cur.execute( + "INSERT INTO bookmarks " + "(url, title, folder_path, domain, add_date, icon, " + "embedding, dim, content_hash) VALUES (?,?,?,?,?,?,?,?,?)", + (url, b.title, b.folder_path, b.domain, b.add_date, + b.icon, vec_blob, dim, h), + ) + cur.execute( + "INSERT OR REPLACE INTO bookmark_sources (url, source, content_hash) " + "VALUES (?, ?, ?)", + (url, source, h), + ) + + for url in to_update_urls: + b, h, vec_blob, dim = embedded[url] + cur.execute( + "UPDATE bookmarks SET title=?, folder_path=?, domain=?, " + "add_date=?, icon=?, embedding=?, dim=?, content_hash=? " + "WHERE url=?", + (b.title, b.folder_path, b.domain, b.add_date, + b.icon, vec_blob, dim, h, url), + ) + cur.execute( + "UPDATE bookmark_sources SET content_hash=? " + "WHERE url=? AND source=?", + (h, url, source), + ) + + # Delete bookmarks removed from this source + for url in to_delete_urls: + cur.execute( + "DELETE FROM bookmark_sources WHERE url=? AND source=?", + (url, source), + ) + # Only delete from bookmarks if no other source references it + cur.execute( + "SELECT COUNT(*) FROM bookmark_sources WHERE url=?", (url,) + ) + if cur.fetchone()[0] == 0: + cur.execute("DELETE FROM bookmarks WHERE url=?", (url,)) + + result.removed = len(to_delete_urls) + self.con.commit() + except Exception: + self.con.rollback() + raise + + return result + + def _remove_source(self, source: str) -> list[str]: + """Remove all bookmarks from a source, cleaning up orphans.""" + cur = self.con.cursor() + cur.execute( + "SELECT url FROM bookmark_sources WHERE source = ?", (source,) + ) + urls = [r[0] for r in cur.fetchall()] + try: + for url in urls: + cur.execute( + "DELETE FROM bookmark_sources WHERE url=? AND source=?", + (url, source), + ) + cur.execute( + "SELECT COUNT(*) FROM bookmark_sources WHERE url=?", (url,) + ) + if cur.fetchone()[0] == 0: + cur.execute("DELETE FROM bookmarks WHERE url=?", (url,)) + self.con.commit() + except Exception: + self.con.rollback() + raise + return urls + def rebuild(self, bookmarks: list[Bookmark], batch_size: int = 64) -> dict: cur = self.con.cursor() cur.execute("DELETE FROM bookmarks") + cur.execute("DELETE FROM bookmark_sources") cur.execute("INSERT OR REPLACE INTO meta(key, value) VALUES ('model', ?)", (self.model_name,)) self.con.commit() @@ -106,24 +353,32 @@ def rebuild(self, bookmarks: list[Bookmark], batch_size: int = 64) -> dict: return {"indexed": 0, "model": self.model_name, "dim": 0} rows = [] + source_rows = [] for start in range(0, total, batch_size): chunk = bookmarks[start:start + batch_size] texts = [b.embedding_text() for b in chunk] vecs = self.embedder.embed(texts) for b, v in zip(chunk, vecs): + h = _content_hash(b) rows.append(( b.url, b.title, b.folder_path, b.domain, - b.add_date, b.icon, _vec_to_blob(v), int(v.shape[0]), + b.add_date, b.icon, _vec_to_blob(v), int(v.shape[0]), h, )) + source_rows.append((b.url, "html", h)) cur.executemany( "INSERT OR REPLACE INTO bookmarks " - "(url, title, folder_path, domain, add_date, icon, embedding, dim) " - "VALUES (?,?,?,?,?,?,?,?)", + "(url, title, folder_path, domain, add_date, icon, embedding, dim, content_hash) " + "VALUES (?,?,?,?,?,?,?,?,?)", rows, ) + cur.executemany( + "INSERT OR REPLACE INTO bookmark_sources (url, source, content_hash) " + "VALUES (?,?,?)", + source_rows, + ) self.con.commit() - return {"indexed": total, "model": self.model_name, "dim": rows[0][-1]} + return {"indexed": total, "model": self.model_name, "dim": rows[0][-2]} def stats(self) -> dict: cur = self.con.cursor() diff --git a/tests/test_browser_detection.py b/tests/test_browser_detection.py new file mode 100644 index 0000000..f61a189 --- /dev/null +++ b/tests/test_browser_detection.py @@ -0,0 +1,88 @@ +"""Tests for browser detection and path resolution.""" +import sys +from pathlib import Path +from unittest.mock import patch + +from mindmark.browsers.paths import detect_browsers, BrowserProfile + + +def test_browser_profile_source_id(): + p = BrowserProfile( + browser_name="Chrome", + browser_type="chromium", + profile_name="Default", + bookmark_path=Path("/fake/path"), + ) + assert p.source_id == "chrome:Default" + + +def test_browser_profile_custom_source_id(): + p = BrowserProfile( + browser_name="Chrome", + browser_type="chromium", + profile_name="Default", + bookmark_path=Path("/fake/path"), + source_id="custom:id", + ) + assert p.source_id == "custom:id" + + +def test_detect_browsers_returns_list(tmp_path): + """detect_browsers should return a list (possibly empty) on any platform.""" + # With a fake home, no browsers should be detected + with patch("mindmark.browsers.paths._home", return_value=tmp_path): + with patch("mindmark.browsers.paths._local_app_data", return_value=tmp_path / "Local"): + with patch("mindmark.browsers.paths._app_data", return_value=tmp_path / "Roaming"): + profiles = detect_browsers() + assert isinstance(profiles, list) + + +def test_detect_chromium_with_fake_profile(tmp_path): + """Simulate a Chrome installation with a Default profile.""" + if sys.platform == "darwin": + chrome_dir = tmp_path / "Library" / "Application Support" / "Google" / "Chrome" + elif sys.platform.startswith("linux"): + chrome_dir = tmp_path / ".config" / "google-chrome" + else: + chrome_dir = tmp_path / "Google" / "Chrome" / "User Data" + + default_profile = chrome_dir / "Default" + default_profile.mkdir(parents=True) + (default_profile / "Bookmarks").write_text('{"roots":{}}') + + with patch("mindmark.browsers.paths._home", return_value=tmp_path): + with patch("mindmark.browsers.paths._local_app_data", return_value=tmp_path): + profiles = detect_browsers() + + chrome_profiles = [p for p in profiles if p.browser_name == "Chrome"] + assert len(chrome_profiles) >= 1 + assert chrome_profiles[0].profile_name == "Default" + assert chrome_profiles[0].browser_type == "chromium" + + +def test_detect_firefox_with_fake_profile(tmp_path): + """Simulate a Firefox installation with a profile.""" + if sys.platform == "darwin": + ff_dir = tmp_path / "Library" / "Application Support" / "Firefox" / "Profiles" + elif sys.platform.startswith("linux"): + ff_dir = tmp_path / ".mozilla" / "firefox" + else: + ff_dir = tmp_path / "Roaming" / "Mozilla" / "Firefox" / "Profiles" + + profile_dir = ff_dir / "abc12345.default-release" + profile_dir.mkdir(parents=True) + # Create a minimal places.sqlite + import sqlite3 + db = profile_dir / "places.sqlite" + con = sqlite3.connect(db) + con.execute("CREATE TABLE moz_places (id INTEGER PRIMARY KEY, url TEXT)") + con.close() + + with patch("mindmark.browsers.paths._home", return_value=tmp_path): + with patch("mindmark.browsers.paths._app_data", return_value=tmp_path / "Roaming"): + profiles = detect_browsers() + + ff_profiles = [p for p in profiles if p.browser_name == "Firefox"] + assert len(ff_profiles) >= 1 + assert ff_profiles[0].browser_type == "firefox" + assert "default-release" in ff_profiles[0].profile_name diff --git a/tests/test_browsers_init.py b/tests/test_browsers_init.py new file mode 100644 index 0000000..a04341b --- /dev/null +++ b/tests/test_browsers_init.py @@ -0,0 +1,143 @@ +"""Tests for the browsers orchestration layer (__init__.py).""" +import json +import tempfile +from pathlib import Path +from unittest.mock import patch, MagicMock + +from mindmark.browsers import ( + parse_browser_bookmarks, + collect_all_bookmarks, +) +from mindmark.browsers.paths import BrowserProfile + + +def _make_chromium_profile(tmp_path: Path) -> BrowserProfile: + """Create a fake Chromium profile with a Bookmarks JSON file.""" + bookmark_file = tmp_path / "Bookmarks" + data = { + "roots": { + "bookmark_bar": { + "children": [ + {"name": "Example", "type": "url", "url": "https://example.com"}, + {"name": "Test", "type": "url", "url": "https://test.com"}, + ], + "name": "Bookmarks Bar", + "type": "folder", + }, + "other": {"children": [], "name": "Other", "type": "folder"}, + "synced": {"children": [], "name": "Synced", "type": "folder"}, + } + } + bookmark_file.write_text(json.dumps(data)) + return BrowserProfile( + browser_name="Chrome", + browser_type="chromium", + profile_name="Default", + bookmark_path=bookmark_file, + ) + + +def _make_firefox_profile(tmp_path: Path) -> BrowserProfile: + """Create a fake Firefox profile with a places.sqlite file.""" + import sqlite3 + + db_path = tmp_path / "places.sqlite" + con = sqlite3.connect(db_path) + con.executescript(""" + CREATE TABLE moz_places (id INTEGER PRIMARY KEY, url TEXT); + CREATE TABLE moz_bookmarks ( + id INTEGER PRIMARY KEY, type INTEGER, fk INTEGER, + parent INTEGER, title TEXT, dateAdded INTEGER + ); + INSERT INTO moz_bookmarks (id, type, fk, parent, title) VALUES + (1, 2, NULL, 0, 'root'), (2, 2, NULL, 1, 'menu'); + INSERT INTO moz_places (id, url) VALUES (1, 'https://firefox.example.com'); + INSERT INTO moz_bookmarks (id, type, fk, parent, title, dateAdded) VALUES + (100, 1, 1, 2, 'Firefox Example', 0); + """) + con.close() + return BrowserProfile( + browser_name="Firefox", + browser_type="firefox", + profile_name="default-release", + bookmark_path=db_path, + ) + + +def test_parse_browser_bookmarks_chromium(tmp_path): + profile = _make_chromium_profile(tmp_path) + bookmarks = parse_browser_bookmarks(profile) + assert len(bookmarks) == 2 + urls = {b.url for b in bookmarks} + assert "https://example.com" in urls + assert "https://test.com" in urls + + +def test_parse_browser_bookmarks_firefox(tmp_path): + profile = _make_firefox_profile(tmp_path) + bookmarks = parse_browser_bookmarks(profile) + assert len(bookmarks) == 1 + assert bookmarks[0].url == "https://firefox.example.com" + + +def test_parse_browser_bookmarks_unsupported(): + profile = BrowserProfile( + browser_name="Safari", + browser_type="safari", + profile_name="Default", + bookmark_path=Path("/fake"), + ) + try: + parse_browser_bookmarks(profile) + assert False, "Should have raised ValueError" + except ValueError as e: + assert "Unsupported" in str(e) + + +def test_collect_all_bookmarks_with_filter(tmp_path): + chrome_dir = tmp_path / "chrome" + chrome_dir.mkdir() + chrome_profile = _make_chromium_profile(chrome_dir) + + ff_dir = tmp_path / "firefox" + ff_dir.mkdir() + ff_profile = _make_firefox_profile(ff_dir) + + fake_profiles = [chrome_profile, ff_profile] + + with patch("mindmark.browsers.detect_browsers", return_value=fake_profiles): + # Filter to Chrome only + results = collect_all_bookmarks(browser_filter="Chrome") + assert len(results) == 1 + assert results[0][0].browser_name == "Chrome" + + # Filter to Firefox only + results = collect_all_bookmarks(browser_filter="firefox") + assert len(results) == 1 + assert results[0][0].browser_name == "Firefox" + + # No filter — gets all + results = collect_all_bookmarks(browser_filter=None) + assert len(results) == 2 + + +def test_collect_all_bookmarks_no_browsers(): + with patch("mindmark.browsers.detect_browsers", return_value=[]): + results = collect_all_bookmarks() + assert results == [] + + +def test_collect_all_bookmarks_handles_parse_error(tmp_path, capsys): + """A broken profile should print a warning and not crash.""" + bad_profile = BrowserProfile( + browser_name="Chrome", + browser_type="chromium", + profile_name="Corrupt", + bookmark_path=tmp_path / "nonexistent", + ) + with patch("mindmark.browsers.detect_browsers", return_value=[bad_profile]): + results = collect_all_bookmarks() + assert results == [] + captured = capsys.readouterr() + assert "warning" in captured.err + assert "Chrome" in captured.err diff --git a/tests/test_chromium_parser.py b/tests/test_chromium_parser.py new file mode 100644 index 0000000..fe6a709 --- /dev/null +++ b/tests/test_chromium_parser.py @@ -0,0 +1,135 @@ +"""Tests for the Chromium JSON bookmark parser.""" +import json +import tempfile +from pathlib import Path + +from mindmark.browsers.chromium import parse_chromium_json + + +SAMPLE_CHROMIUM = { + "checksum": "abc123", + "roots": { + "bookmark_bar": { + "children": [ + { + "date_added": "13300000000000000", + "name": "Python Docs", + "type": "url", + "url": "https://docs.python.org/3/", + }, + { + "children": [ + { + "date_added": "13300000000000001", + "name": "GitHub", + "type": "url", + "url": "https://github.com", + }, + { + "children": [ + { + "date_added": "13300000000000002", + "name": "Kusto Guide", + "type": "url", + "url": "https://eng.ms/docs/kusto", + } + ], + "name": "Internal", + "type": "folder", + }, + ], + "name": "Work", + "type": "folder", + }, + ], + "name": "Bookmarks Bar", + "type": "folder", + }, + "other": { + "children": [ + { + "name": "Stack Overflow", + "type": "url", + "url": "https://stackoverflow.com", + } + ], + "name": "Other bookmarks", + "type": "folder", + }, + "synced": { + "children": [], + "name": "Mobile bookmarks", + "type": "folder", + }, + }, + "version": 1, +} + + +def _write_json(data: dict) -> Path: + f = tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False, encoding="utf-8" + ) + json.dump(data, f) + f.close() + return Path(f.name) + + +def test_parses_urls_and_titles(): + path = _write_json(SAMPLE_CHROMIUM) + bms = parse_chromium_json(path) + by_url = {b.url: b for b in bms} + assert "https://docs.python.org/3/" in by_url + assert by_url["https://docs.python.org/3/"].title == "Python Docs" + assert "https://github.com" in by_url + assert "https://stackoverflow.com" in by_url + path.unlink() + + +def test_folder_paths(): + path = _write_json(SAMPLE_CHROMIUM) + bms = parse_chromium_json(path) + by_url = {b.url: b for b in bms} + # Top-level bar bookmark + assert by_url["https://docs.python.org/3/"].folder_path == "Bookmarks Bar" + # Nested in Work + assert by_url["https://github.com"].folder_path == "Bookmarks Bar/Work" + # Nested in Work/Internal + assert by_url["https://eng.ms/docs/kusto"].folder_path == "Bookmarks Bar/Work/Internal" + # "Other" root + assert by_url["https://stackoverflow.com"].folder_path == "Other bookmarks" + path.unlink() + + +def test_deduplicates_by_url(): + data = json.loads(json.dumps(SAMPLE_CHROMIUM)) + # Add a duplicate URL in "other" + data["roots"]["other"]["children"].append({ + "name": "Python Docs Dup", + "type": "url", + "url": "https://docs.python.org/3/", + }) + path = _write_json(data) + bms = parse_chromium_json(path) + python_urls = [b for b in bms if b.url == "https://docs.python.org/3/"] + assert len(python_urls) == 1 + path.unlink() + + +def test_empty_roots(): + data = {"roots": {"bookmark_bar": {"children": [], "name": "Bar", "type": "folder"}}} + path = _write_json(data) + bms = parse_chromium_json(path) + assert bms == [] + path.unlink() + + +def test_embedding_text_contains_key_fields(): + path = _write_json(SAMPLE_CHROMIUM) + bms = parse_chromium_json(path) + k = next(b for b in bms if "kusto" in b.url) + t = k.embedding_text() + assert "Kusto Guide" in t + assert "eng.ms" in t + assert "Work/Internal" in t + path.unlink() diff --git a/tests/test_firefox_parser.py b/tests/test_firefox_parser.py new file mode 100644 index 0000000..2e2feb9 --- /dev/null +++ b/tests/test_firefox_parser.py @@ -0,0 +1,137 @@ +"""Tests for the Firefox places.sqlite bookmark parser.""" +import sqlite3 +import tempfile +from pathlib import Path + +from mindmark.browsers.firefox import parse_firefox_places + + +def _create_places_db() -> Path: + """Create a minimal Firefox places.sqlite with test bookmarks.""" + tmp = tempfile.NamedTemporaryFile( + suffix=".sqlite", delete=False, prefix="mindmark_test_ff_" + ) + tmp.close() + db_path = Path(tmp.name) + + con = sqlite3.connect(db_path) + con.executescript(""" + CREATE TABLE moz_places ( + id INTEGER PRIMARY KEY, + url TEXT + ); + CREATE TABLE moz_bookmarks ( + id INTEGER PRIMARY KEY, + type INTEGER, + fk INTEGER, + parent INTEGER, + title TEXT, + dateAdded INTEGER + ); + + -- Root folders (IDs 1-6 are built-in roots) + INSERT INTO moz_bookmarks (id, type, fk, parent, title) VALUES + (1, 2, NULL, 0, 'root'), + (2, 2, NULL, 1, 'menu'), + (3, 2, NULL, 1, 'toolbar'), + (4, 2, NULL, 1, 'tags'), + (5, 2, NULL, 1, 'unfiled'), + (6, 2, NULL, 1, 'mobile'); + + -- User folders + INSERT INTO moz_bookmarks (id, type, fk, parent, title) VALUES + (100, 2, NULL, 3, 'Work'), + (101, 2, NULL, 100, 'Internal'); + + -- Places (URLs) + INSERT INTO moz_places (id, url) VALUES + (1, 'https://docs.python.org/3/'), + (2, 'https://github.com'), + (3, 'https://eng.ms/docs/kusto'), + (4, 'https://stackoverflow.com'); + + -- Bookmarks referencing places + INSERT INTO moz_bookmarks (id, type, fk, parent, title, dateAdded) VALUES + (200, 1, 1, 3, 'Python Docs', 1700000000000000), + (201, 1, 2, 100, 'GitHub', 1700000000000001), + (202, 1, 3, 101, 'Kusto Guide', 1700000000000002), + (203, 1, 4, 5, 'Stack Overflow', 1700000000000003); + """) + con.close() + return db_path + + +def test_parses_urls_and_titles(): + path = _create_places_db() + bms = parse_firefox_places(path) + by_url = {b.url: b for b in bms} + assert "https://docs.python.org/3/" in by_url + assert by_url["https://docs.python.org/3/"].title == "Python Docs" + assert "https://github.com" in by_url + assert "https://stackoverflow.com" in by_url + path.unlink() + + +def test_folder_paths(): + path = _create_places_db() + bms = parse_firefox_places(path) + by_url = {b.url: b for b in bms} + # toolbar > Work + assert by_url["https://github.com"].folder_path == "Work" + # toolbar > Work > Internal + assert by_url["https://eng.ms/docs/kusto"].folder_path == "Work/Internal" + path.unlink() + + +def test_skips_place_urls(): + """Firefox internal place: URLs should be excluded.""" + tmp = tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) + tmp.close() + db_path = Path(tmp.name) + con = sqlite3.connect(db_path) + con.executescript(""" + CREATE TABLE moz_places (id INTEGER PRIMARY KEY, url TEXT); + CREATE TABLE moz_bookmarks ( + id INTEGER PRIMARY KEY, type INTEGER, fk INTEGER, + parent INTEGER, title TEXT, dateAdded INTEGER + ); + INSERT INTO moz_bookmarks (id, type, fk, parent, title) VALUES + (1, 2, NULL, 0, 'root'), + (2, 2, NULL, 1, 'menu'); + INSERT INTO moz_places (id, url) VALUES + (1, 'place:sort=8&maxResults=10'), + (2, 'https://example.com'); + INSERT INTO moz_bookmarks (id, type, fk, parent, title, dateAdded) VALUES + (100, 1, 1, 2, 'Recent Tags', 0), + (101, 1, 2, 2, 'Example', 0); + """) + con.close() + bms = parse_firefox_places(db_path) + urls = [b.url for b in bms] + assert "https://example.com" in urls + assert not any(u.startswith("place:") for u in urls) + db_path.unlink() + + +def test_deduplicates_by_url(): + tmp = tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) + tmp.close() + db_path = Path(tmp.name) + con = sqlite3.connect(db_path) + con.executescript(""" + CREATE TABLE moz_places (id INTEGER PRIMARY KEY, url TEXT); + CREATE TABLE moz_bookmarks ( + id INTEGER PRIMARY KEY, type INTEGER, fk INTEGER, + parent INTEGER, title TEXT, dateAdded INTEGER + ); + INSERT INTO moz_bookmarks (id, type, fk, parent, title) VALUES + (1, 2, NULL, 0, 'root'), (2, 2, NULL, 1, 'menu'); + INSERT INTO moz_places (id, url) VALUES (1, 'https://example.com'); + INSERT INTO moz_bookmarks (id, type, fk, parent, title, dateAdded) VALUES + (100, 1, 1, 2, 'Example A', 0), + (101, 1, 1, 2, 'Example B', 0); + """) + con.close() + bms = parse_firefox_places(db_path) + assert len([b for b in bms if b.url == "https://example.com"]) == 1 + db_path.unlink() diff --git a/tests/test_incremental_sync.py b/tests/test_incremental_sync.py new file mode 100644 index 0000000..66e9943 --- /dev/null +++ b/tests/test_incremental_sync.py @@ -0,0 +1,370 @@ +"""Tests for incremental sync logic in Index.""" +import sqlite3 +import tempfile +from pathlib import Path +from unittest.mock import patch, MagicMock + +import numpy as np + +from mindmark.parser import Bookmark +from mindmark.index import Index, SyncResult, _content_hash + + +def _make_bookmark(url: str, title: str = "T", folder: str = "") -> Bookmark: + return Bookmark(title=title, url=url, folder_path=folder, add_date=0, icon=None) + + +def _make_index(tmp: Path) -> Index: + """Create an Index with a mock embedder to avoid loading the real model.""" + idx = Index(db_path=tmp / "test.db") + # Replace embedder with a mock that returns deterministic vectors + mock_embedder = MagicMock() + dim = 4 + def fake_embed(texts): + vecs = np.random.RandomState(42).randn(len(texts), dim).astype(np.float32) + norms = np.linalg.norm(vecs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + return vecs / norms + mock_embedder.embed.side_effect = fake_embed + mock_embedder.embed_one.side_effect = lambda t: fake_embed([t])[0] + idx.embedder = mock_embedder + return idx + + +def test_sync_adds_new_bookmarks(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [ + _make_bookmark("https://a.com", "A"), + _make_bookmark("https://b.com", "B"), + ] + result = idx.sync(bms, source="chrome:Default") + assert result.added == 2 + assert result.updated == 0 + assert result.removed == 0 + assert result.unchanged == 0 + assert not idx.is_empty() + + +def test_sync_unchanged_skips_embedding(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="test") + + # Reset call count + idx.embedder.embed.reset_mock() + + # Sync again with same data + result = idx.sync(bms, source="test") + assert result.added == 0 + assert result.unchanged == 1 + # embed should NOT be called for unchanged bookmarks + idx.embedder.embed.assert_not_called() + + +def test_sync_updates_changed_bookmarks(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [_make_bookmark("https://a.com", "A", "Folder1")] + idx.sync(bms, source="test") + + # Change the title + bms2 = [_make_bookmark("https://a.com", "A Updated", "Folder1")] + result = idx.sync(bms2, source="test") + assert result.updated == 1 + assert result.added == 0 + assert result.unchanged == 0 + + # Verify the title was updated in the DB + cur = idx.con.cursor() + cur.execute("SELECT title FROM bookmarks WHERE url = ?", ("https://a.com",)) + assert cur.fetchone()[0] == "A Updated" + + +def test_sync_removes_deleted_bookmarks(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [ + _make_bookmark("https://a.com", "A"), + _make_bookmark("https://b.com", "B"), + ] + idx.sync(bms, source="test") + + # Remove one bookmark + bms2 = [_make_bookmark("https://a.com", "A")] + result = idx.sync(bms2, source="test") + assert result.removed == 1 + assert result.unchanged == 1 + + # Verify b.com is gone + cur = idx.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://b.com",)) + assert cur.fetchone()[0] == 0 + + +def test_multi_source_no_cross_deletion(): + """Syncing source A should not delete bookmarks from source B.""" + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + + # Source A adds url X + bms_a = [_make_bookmark("https://shared.com", "Shared")] + idx.sync(bms_a, source="chrome:Default") + + # Source B also adds url X + bms_b = [_make_bookmark("https://shared.com", "Shared")] + idx.sync(bms_b, source="firefox:default") + + # Source A removes url X + result = idx.sync([], source="chrome:Default") + assert result.removed == 1 # removed from source A + + # But the bookmark should still exist (source B still references it) + cur = idx.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://shared.com",)) + assert cur.fetchone()[0] == 1 + + # Now remove from source B too + result = idx.sync([], source="firefox:default") + cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://shared.com",)) + assert cur.fetchone()[0] == 0 # now truly gone + + +def test_sync_result_str(): + r = SyncResult(added=3, updated=1, removed=2, unchanged=10) + s = str(r) + assert "3 new" in s + assert "1 updated" in s + assert "2 removed" in s + + +def test_content_hash_deterministic(): + b = _make_bookmark("https://a.com", "A", "Work") + h1 = _content_hash(b) + h2 = _content_hash(b) + assert h1 == h2 + assert len(h1) == 16 # truncated sha256 + + +def test_content_hash_changes_on_title_change(): + b1 = _make_bookmark("https://a.com", "A", "Work") + b2 = _make_bookmark("https://a.com", "B", "Work") + assert _content_hash(b1) != _content_hash(b2) + + +def test_schema_migration_on_old_db(): + """Ensure opening a v1 database migrates cleanly.""" + with tempfile.TemporaryDirectory() as tmpdir: + db_path = Path(tmpdir) / "old.db" + # Create a v1 database (no content_hash, no bookmark_sources) + con = sqlite3.connect(db_path) + con.executescript(""" + CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL); + CREATE TABLE bookmarks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + folder_path TEXT NOT NULL, + domain TEXT NOT NULL, + add_date INTEGER NOT NULL, + icon TEXT, + embedding BLOB NOT NULL, + dim INTEGER NOT NULL + ); + """) + con.close() + + # Opening with Index should trigger migration + idx = Index(db_path=db_path) + # Verify new columns/tables exist + cur = idx.con.cursor() + cols = {r[1] for r in cur.execute("PRAGMA table_info(bookmarks)")} + assert "content_hash" in cols + + tables = {r[0] for r in cur.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + )} + assert "bookmark_sources" in tables + + +# ---- rebuild() tests ---- + +def test_rebuild_populates_content_hash(): + """rebuild() must set content_hash so sync() can do incremental diffs.""" + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [_make_bookmark("https://a.com", "A", "Work")] + idx.rebuild(bms) + + cur = idx.con.cursor() + cur.execute("SELECT content_hash FROM bookmarks WHERE url = ?", ("https://a.com",)) + h = cur.fetchone()[0] + assert h and len(h) == 16 # non-empty, truncated sha256 + + +def test_rebuild_populates_bookmark_sources(): + """rebuild() must populate bookmark_sources with source='html'.""" + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [ + _make_bookmark("https://a.com", "A"), + _make_bookmark("https://b.com", "B"), + ] + idx.rebuild(bms) + + cur = idx.con.cursor() + cur.execute("SELECT url, source FROM bookmark_sources ORDER BY url") + rows = cur.fetchall() + assert len(rows) == 2 + assert rows[0] == ("https://a.com", "html") + assert rows[1] == ("https://b.com", "html") + + +def test_rebuild_clears_previous_data(): + """rebuild() should clear old bookmarks and sources before inserting.""" + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + idx.rebuild([_make_bookmark("https://old.com", "Old")]) + idx.rebuild([_make_bookmark("https://new.com", "New")]) + + cur = idx.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmarks") + assert cur.fetchone()[0] == 1 + cur.execute("SELECT url FROM bookmarks") + assert cur.fetchone()[0] == "https://new.com" + cur.execute("SELECT COUNT(*) FROM bookmark_sources") + assert cur.fetchone()[0] == 1 + + +def test_rebuild_empty_list(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + result = idx.rebuild([]) + assert result["indexed"] == 0 + assert idx.is_empty() + + +def test_rebuild_then_sync_detects_unchanged(): + """rebuild() followed by sync() with the same data should show all unchanged.""" + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [_make_bookmark("https://a.com", "A", "Work")] + idx.rebuild(bms) + + idx.embedder.embed.reset_mock() + result = idx.sync(bms, source="html") + assert result.unchanged == 1 + assert result.added == 0 + idx.embedder.embed.assert_not_called() + + +# ---- stats() tests ---- + +def test_stats_on_populated_index(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [ + _make_bookmark("https://github.com/a", "Repo A", "Work"), + _make_bookmark("https://github.com/b", "Repo B", "Work"), + _make_bookmark("https://docs.python.org", "Python Docs", "Ref"), + ] + idx.rebuild(bms) + + s = idx.stats() + assert s["total"] == 3 + assert s["model"] is not None + assert str(idx.db_path) in s["db_path"] + # github.com should be top domain with count 2 + domains = dict(s["top_domains"]) + assert domains.get("github.com") == 2 + + +def test_stats_on_empty_index(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + s = idx.stats() + assert s["total"] == 0 + + +# ---- search() tests ---- + +def test_search_returns_results(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [ + _make_bookmark("https://a.com", "Alpha"), + _make_bookmark("https://b.com", "Beta"), + ] + idx.rebuild(bms) + + results = idx.search("anything", k=10) + assert len(results) == 2 + assert all("score" in r for r in results) + assert all("url" in r for r in results) + + +def test_search_empty_index(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + results = idx.search("test") + assert results == [] + + +def test_search_domain_filter(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [ + _make_bookmark("https://github.com/x", "GitHub"), + _make_bookmark("https://docs.python.org", "Docs"), + ] + idx.rebuild(bms) + + results = idx.search("test", domain="github.com") + assert all("github.com" in r["domain"] for r in results) + + +def test_search_folder_filter(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [ + _make_bookmark("https://a.com", "A", "Work/Internal"), + _make_bookmark("https://b.com", "B", "Personal"), + ] + idx.rebuild(bms) + + results = idx.search("test", folder="work") + assert all("work" in r["folder_path"].lower() for r in results) + + +def test_search_k_limit(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [_make_bookmark(f"https://{i}.com", f"Site {i}") for i in range(20)] + idx.rebuild(bms) + + results = idx.search("test", k=5) + assert len(results) == 5 + + +# ---- _remove_source() tests ---- + +def test_remove_source_cleans_orphans(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="chrome:Default") + + removed = idx._remove_source("chrome:Default") + assert len(removed) == 1 + assert idx.is_empty() + + +def test_remove_source_preserves_other_sources(): + with tempfile.TemporaryDirectory() as tmpdir: + idx = _make_index(Path(tmpdir)) + idx.sync([_make_bookmark("https://a.com", "A")], source="chrome:Default") + idx.sync([_make_bookmark("https://a.com", "A")], source="firefox:default") + + idx._remove_source("chrome:Default") + assert not idx.is_empty() # firefox still references it From 330f40dca59dc7ff1438b70d330dd9a1ddcef785 Mon Sep 17 00:00:00 2001 From: Sukanth Gunda Date: Sat, 18 Apr 2026 13:57:42 -0400 Subject: [PATCH 2/2] Add Index.close and use pytest fixture in tests Add a close() method to Index to explicitly close the SQLite connection. Refactor tests to use pytest (import pytest) and a shared idx fixture that creates an Index via _make_index(tmp_path/"test.db") and ensures the DB is closed on teardown. Update _make_index signature to accept a db_path and convert many tests to use the idx fixture (removing tempfile usage and simplifying setup/teardown). Also ensure the migration test closes the index after assertions. These changes improve test hygiene and ensure DB connections are properly cleaned up. --- src/mindmark/index.py | 4 + tests/test_incremental_sync.py | 469 +++++++++++++++------------------ 2 files changed, 223 insertions(+), 250 deletions(-) diff --git a/src/mindmark/index.py b/src/mindmark/index.py index 50fd430..5d0da1d 100644 --- a/src/mindmark/index.py +++ b/src/mindmark/index.py @@ -162,6 +162,10 @@ def __init__(self, db_path: Path | None = None, model_name: str = DEFAULT_MODEL) self.con = _connect(self.db_path) self.embedder = Embedder(model_name=model_name) + def close(self) -> None: + """Close the underlying database connection.""" + self.con.close() + def is_empty(self) -> bool: cur = self.con.cursor() cur.execute("SELECT COUNT(*) FROM bookmarks") diff --git a/tests/test_incremental_sync.py b/tests/test_incremental_sync.py index 66e9943..41e8f36 100644 --- a/tests/test_incremental_sync.py +++ b/tests/test_incremental_sync.py @@ -1,9 +1,9 @@ """Tests for incremental sync logic in Index.""" import sqlite3 -import tempfile from pathlib import Path from unittest.mock import patch, MagicMock +import pytest import numpy as np from mindmark.parser import Bookmark @@ -14,10 +14,9 @@ def _make_bookmark(url: str, title: str = "T", folder: str = "") -> Bookmark: return Bookmark(title=title, url=url, folder_path=folder, add_date=0, icon=None) -def _make_index(tmp: Path) -> Index: +def _make_index(db_path: Path) -> Index: """Create an Index with a mock embedder to avoid loading the real model.""" - idx = Index(db_path=tmp / "test.db") - # Replace embedder with a mock that returns deterministic vectors + idx = Index(db_path=db_path) mock_embedder = MagicMock() dim = 4 def fake_embed(texts): @@ -31,104 +30,101 @@ def fake_embed(texts): return idx -def test_sync_adds_new_bookmarks(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [ - _make_bookmark("https://a.com", "A"), - _make_bookmark("https://b.com", "B"), - ] - result = idx.sync(bms, source="chrome:Default") - assert result.added == 2 - assert result.updated == 0 - assert result.removed == 0 - assert result.unchanged == 0 - assert not idx.is_empty() - - -def test_sync_unchanged_skips_embedding(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [_make_bookmark("https://a.com", "A")] - idx.sync(bms, source="test") - - # Reset call count - idx.embedder.embed.reset_mock() - - # Sync again with same data - result = idx.sync(bms, source="test") - assert result.added == 0 - assert result.unchanged == 1 - # embed should NOT be called for unchanged bookmarks - idx.embedder.embed.assert_not_called() - - -def test_sync_updates_changed_bookmarks(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [_make_bookmark("https://a.com", "A", "Folder1")] - idx.sync(bms, source="test") - - # Change the title - bms2 = [_make_bookmark("https://a.com", "A Updated", "Folder1")] - result = idx.sync(bms2, source="test") - assert result.updated == 1 - assert result.added == 0 - assert result.unchanged == 0 - - # Verify the title was updated in the DB - cur = idx.con.cursor() - cur.execute("SELECT title FROM bookmarks WHERE url = ?", ("https://a.com",)) - assert cur.fetchone()[0] == "A Updated" - - -def test_sync_removes_deleted_bookmarks(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [ - _make_bookmark("https://a.com", "A"), - _make_bookmark("https://b.com", "B"), - ] - idx.sync(bms, source="test") - - # Remove one bookmark - bms2 = [_make_bookmark("https://a.com", "A")] - result = idx.sync(bms2, source="test") - assert result.removed == 1 - assert result.unchanged == 1 - - # Verify b.com is gone - cur = idx.con.cursor() - cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://b.com",)) - assert cur.fetchone()[0] == 0 +@pytest.fixture +def idx(tmp_path): + """Yield an Index with a mock embedder; close DB on teardown.""" + index = _make_index(tmp_path / "test.db") + yield index + index.close() -def test_multi_source_no_cross_deletion(): - """Syncing source A should not delete bookmarks from source B.""" - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) +def test_sync_adds_new_bookmarks(idx): + bms = [ + _make_bookmark("https://a.com", "A"), + _make_bookmark("https://b.com", "B"), + ] + result = idx.sync(bms, source="chrome:Default") + assert result.added == 2 + assert result.updated == 0 + assert result.removed == 0 + assert result.unchanged == 0 + assert not idx.is_empty() - # Source A adds url X - bms_a = [_make_bookmark("https://shared.com", "Shared")] - idx.sync(bms_a, source="chrome:Default") - # Source B also adds url X - bms_b = [_make_bookmark("https://shared.com", "Shared")] - idx.sync(bms_b, source="firefox:default") +def test_sync_unchanged_skips_embedding(idx): + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="test") - # Source A removes url X - result = idx.sync([], source="chrome:Default") - assert result.removed == 1 # removed from source A + # Reset call count + idx.embedder.embed.reset_mock() - # But the bookmark should still exist (source B still references it) - cur = idx.con.cursor() - cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://shared.com",)) - assert cur.fetchone()[0] == 1 + # Sync again with same data + result = idx.sync(bms, source="test") + assert result.added == 0 + assert result.unchanged == 1 + # embed should NOT be called for unchanged bookmarks + idx.embedder.embed.assert_not_called() + + +def test_sync_updates_changed_bookmarks(idx): + bms = [_make_bookmark("https://a.com", "A", "Folder1")] + idx.sync(bms, source="test") + + # Change the title + bms2 = [_make_bookmark("https://a.com", "A Updated", "Folder1")] + result = idx.sync(bms2, source="test") + assert result.updated == 1 + assert result.added == 0 + assert result.unchanged == 0 + + # Verify the title was updated in the DB + cur = idx.con.cursor() + cur.execute("SELECT title FROM bookmarks WHERE url = ?", ("https://a.com",)) + assert cur.fetchone()[0] == "A Updated" + + +def test_sync_removes_deleted_bookmarks(idx): + bms = [ + _make_bookmark("https://a.com", "A"), + _make_bookmark("https://b.com", "B"), + ] + idx.sync(bms, source="test") + + # Remove one bookmark + bms2 = [_make_bookmark("https://a.com", "A")] + result = idx.sync(bms2, source="test") + assert result.removed == 1 + assert result.unchanged == 1 - # Now remove from source B too - result = idx.sync([], source="firefox:default") - cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://shared.com",)) - assert cur.fetchone()[0] == 0 # now truly gone + # Verify b.com is gone + cur = idx.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://b.com",)) + assert cur.fetchone()[0] == 0 + + +def test_multi_source_no_cross_deletion(idx): + """Syncing source A should not delete bookmarks from source B.""" + # Source A adds url X + bms_a = [_make_bookmark("https://shared.com", "Shared")] + idx.sync(bms_a, source="chrome:Default") + + # Source B also adds url X + bms_b = [_make_bookmark("https://shared.com", "Shared")] + idx.sync(bms_b, source="firefox:default") + + # Source A removes url X + result = idx.sync([], source="chrome:Default") + assert result.removed == 1 # removed from source A + + # But the bookmark should still exist (source B still references it) + cur = idx.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://shared.com",)) + assert cur.fetchone()[0] == 1 + + # Now remove from source B too + result = idx.sync([], source="firefox:default") + cur.execute("SELECT COUNT(*) FROM bookmarks WHERE url = ?", ("https://shared.com",)) + assert cur.fetchone()[0] == 0 # now truly gone def test_sync_result_str(): @@ -153,31 +149,30 @@ def test_content_hash_changes_on_title_change(): assert _content_hash(b1) != _content_hash(b2) -def test_schema_migration_on_old_db(): +def test_schema_migration_on_old_db(tmp_path): """Ensure opening a v1 database migrates cleanly.""" - with tempfile.TemporaryDirectory() as tmpdir: - db_path = Path(tmpdir) / "old.db" - # Create a v1 database (no content_hash, no bookmark_sources) - con = sqlite3.connect(db_path) - con.executescript(""" - CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL); - CREATE TABLE bookmarks ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - url TEXT UNIQUE NOT NULL, - title TEXT NOT NULL, - folder_path TEXT NOT NULL, - domain TEXT NOT NULL, - add_date INTEGER NOT NULL, - icon TEXT, - embedding BLOB NOT NULL, - dim INTEGER NOT NULL - ); - """) - con.close() - - # Opening with Index should trigger migration - idx = Index(db_path=db_path) - # Verify new columns/tables exist + db_path = tmp_path / "old.db" + # Create a v1 database (no content_hash, no bookmark_sources) + con = sqlite3.connect(db_path) + con.executescript(""" + CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL); + CREATE TABLE bookmarks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + folder_path TEXT NOT NULL, + domain TEXT NOT NULL, + add_date INTEGER NOT NULL, + icon TEXT, + embedding BLOB NOT NULL, + dim INTEGER NOT NULL + ); + """) + con.close() + + # Opening with Index should trigger migration + idx = Index(db_path=db_path) + try: cur = idx.con.cursor() cols = {r[1] for r in cur.execute("PRAGMA table_info(bookmarks)")} assert "content_hash" in cols @@ -186,185 +181,159 @@ def test_schema_migration_on_old_db(): "SELECT name FROM sqlite_master WHERE type='table'" )} assert "bookmark_sources" in tables + finally: + idx.close() # ---- rebuild() tests ---- -def test_rebuild_populates_content_hash(): +def test_rebuild_populates_content_hash(idx): """rebuild() must set content_hash so sync() can do incremental diffs.""" - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [_make_bookmark("https://a.com", "A", "Work")] - idx.rebuild(bms) + bms = [_make_bookmark("https://a.com", "A", "Work")] + idx.rebuild(bms) - cur = idx.con.cursor() - cur.execute("SELECT content_hash FROM bookmarks WHERE url = ?", ("https://a.com",)) - h = cur.fetchone()[0] - assert h and len(h) == 16 # non-empty, truncated sha256 + cur = idx.con.cursor() + cur.execute("SELECT content_hash FROM bookmarks WHERE url = ?", ("https://a.com",)) + h = cur.fetchone()[0] + assert h and len(h) == 16 # non-empty, truncated sha256 -def test_rebuild_populates_bookmark_sources(): +def test_rebuild_populates_bookmark_sources(idx): """rebuild() must populate bookmark_sources with source='html'.""" - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [ - _make_bookmark("https://a.com", "A"), - _make_bookmark("https://b.com", "B"), - ] - idx.rebuild(bms) + bms = [ + _make_bookmark("https://a.com", "A"), + _make_bookmark("https://b.com", "B"), + ] + idx.rebuild(bms) - cur = idx.con.cursor() - cur.execute("SELECT url, source FROM bookmark_sources ORDER BY url") - rows = cur.fetchall() - assert len(rows) == 2 - assert rows[0] == ("https://a.com", "html") - assert rows[1] == ("https://b.com", "html") + cur = idx.con.cursor() + cur.execute("SELECT url, source FROM bookmark_sources ORDER BY url") + rows = cur.fetchall() + assert len(rows) == 2 + assert rows[0] == ("https://a.com", "html") + assert rows[1] == ("https://b.com", "html") -def test_rebuild_clears_previous_data(): +def test_rebuild_clears_previous_data(idx): """rebuild() should clear old bookmarks and sources before inserting.""" - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - idx.rebuild([_make_bookmark("https://old.com", "Old")]) - idx.rebuild([_make_bookmark("https://new.com", "New")]) + idx.rebuild([_make_bookmark("https://old.com", "Old")]) + idx.rebuild([_make_bookmark("https://new.com", "New")]) - cur = idx.con.cursor() - cur.execute("SELECT COUNT(*) FROM bookmarks") - assert cur.fetchone()[0] == 1 - cur.execute("SELECT url FROM bookmarks") - assert cur.fetchone()[0] == "https://new.com" - cur.execute("SELECT COUNT(*) FROM bookmark_sources") - assert cur.fetchone()[0] == 1 + cur = idx.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmarks") + assert cur.fetchone()[0] == 1 + cur.execute("SELECT url FROM bookmarks") + assert cur.fetchone()[0] == "https://new.com" + cur.execute("SELECT COUNT(*) FROM bookmark_sources") + assert cur.fetchone()[0] == 1 -def test_rebuild_empty_list(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - result = idx.rebuild([]) - assert result["indexed"] == 0 - assert idx.is_empty() +def test_rebuild_empty_list(idx): + result = idx.rebuild([]) + assert result["indexed"] == 0 + assert idx.is_empty() -def test_rebuild_then_sync_detects_unchanged(): +def test_rebuild_then_sync_detects_unchanged(idx): """rebuild() followed by sync() with the same data should show all unchanged.""" - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [_make_bookmark("https://a.com", "A", "Work")] - idx.rebuild(bms) + bms = [_make_bookmark("https://a.com", "A", "Work")] + idx.rebuild(bms) - idx.embedder.embed.reset_mock() - result = idx.sync(bms, source="html") - assert result.unchanged == 1 - assert result.added == 0 - idx.embedder.embed.assert_not_called() + idx.embedder.embed.reset_mock() + result = idx.sync(bms, source="html") + assert result.unchanged == 1 + assert result.added == 0 + idx.embedder.embed.assert_not_called() # ---- stats() tests ---- -def test_stats_on_populated_index(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [ - _make_bookmark("https://github.com/a", "Repo A", "Work"), - _make_bookmark("https://github.com/b", "Repo B", "Work"), - _make_bookmark("https://docs.python.org", "Python Docs", "Ref"), - ] - idx.rebuild(bms) +def test_stats_on_populated_index(idx): + bms = [ + _make_bookmark("https://github.com/a", "Repo A", "Work"), + _make_bookmark("https://github.com/b", "Repo B", "Work"), + _make_bookmark("https://docs.python.org", "Python Docs", "Ref"), + ] + idx.rebuild(bms) - s = idx.stats() - assert s["total"] == 3 - assert s["model"] is not None - assert str(idx.db_path) in s["db_path"] - # github.com should be top domain with count 2 - domains = dict(s["top_domains"]) - assert domains.get("github.com") == 2 + s = idx.stats() + assert s["total"] == 3 + assert s["model"] is not None + assert str(idx.db_path) in s["db_path"] + # github.com should be top domain with count 2 + domains = dict(s["top_domains"]) + assert domains.get("github.com") == 2 -def test_stats_on_empty_index(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - s = idx.stats() - assert s["total"] == 0 +def test_stats_on_empty_index(idx): + s = idx.stats() + assert s["total"] == 0 # ---- search() tests ---- -def test_search_returns_results(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [ - _make_bookmark("https://a.com", "Alpha"), - _make_bookmark("https://b.com", "Beta"), - ] - idx.rebuild(bms) +def test_search_returns_results(idx): + bms = [ + _make_bookmark("https://a.com", "Alpha"), + _make_bookmark("https://b.com", "Beta"), + ] + idx.rebuild(bms) - results = idx.search("anything", k=10) - assert len(results) == 2 - assert all("score" in r for r in results) - assert all("url" in r for r in results) + results = idx.search("anything", k=10) + assert len(results) == 2 + assert all("score" in r for r in results) + assert all("url" in r for r in results) -def test_search_empty_index(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - results = idx.search("test") - assert results == [] +def test_search_empty_index(idx): + results = idx.search("test") + assert results == [] -def test_search_domain_filter(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [ - _make_bookmark("https://github.com/x", "GitHub"), - _make_bookmark("https://docs.python.org", "Docs"), - ] - idx.rebuild(bms) +def test_search_domain_filter(idx): + bms = [ + _make_bookmark("https://github.com/x", "GitHub"), + _make_bookmark("https://docs.python.org", "Docs"), + ] + idx.rebuild(bms) - results = idx.search("test", domain="github.com") - assert all("github.com" in r["domain"] for r in results) + results = idx.search("test", domain="github.com") + assert all("github.com" in r["domain"] for r in results) -def test_search_folder_filter(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [ - _make_bookmark("https://a.com", "A", "Work/Internal"), - _make_bookmark("https://b.com", "B", "Personal"), - ] - idx.rebuild(bms) +def test_search_folder_filter(idx): + bms = [ + _make_bookmark("https://a.com", "A", "Work/Internal"), + _make_bookmark("https://b.com", "B", "Personal"), + ] + idx.rebuild(bms) - results = idx.search("test", folder="work") - assert all("work" in r["folder_path"].lower() for r in results) + results = idx.search("test", folder="work") + assert all("work" in r["folder_path"].lower() for r in results) -def test_search_k_limit(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [_make_bookmark(f"https://{i}.com", f"Site {i}") for i in range(20)] - idx.rebuild(bms) +def test_search_k_limit(idx): + bms = [_make_bookmark(f"https://{i}.com", f"Site {i}") for i in range(20)] + idx.rebuild(bms) - results = idx.search("test", k=5) - assert len(results) == 5 + results = idx.search("test", k=5) + assert len(results) == 5 # ---- _remove_source() tests ---- -def test_remove_source_cleans_orphans(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - bms = [_make_bookmark("https://a.com", "A")] - idx.sync(bms, source="chrome:Default") +def test_remove_source_cleans_orphans(idx): + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="chrome:Default") - removed = idx._remove_source("chrome:Default") - assert len(removed) == 1 - assert idx.is_empty() + removed = idx._remove_source("chrome:Default") + assert len(removed) == 1 + assert idx.is_empty() -def test_remove_source_preserves_other_sources(): - with tempfile.TemporaryDirectory() as tmpdir: - idx = _make_index(Path(tmpdir)) - idx.sync([_make_bookmark("https://a.com", "A")], source="chrome:Default") - idx.sync([_make_bookmark("https://a.com", "A")], source="firefox:default") +def test_remove_source_preserves_other_sources(idx): + idx.sync([_make_bookmark("https://a.com", "A")], source="chrome:Default") + idx.sync([_make_bookmark("https://a.com", "A")], source="firefox:default") - idx._remove_source("chrome:Default") - assert not idx.is_empty() # firefox still references it + idx._remove_source("chrome:Default") + assert not idx.is_empty() # firefox still references it