diff --git a/.gitignore b/.gitignore index 5f551af..ca9ec84 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ dist/ .venv/ .pytest_cache/ .bookmarks-search/ +uv.lock +.vscode/ diff --git a/README.md b/README.md index 5620910..dc71a1f 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ Ask in natural language — mindmark remembers what you saved. | `mindmark sync` | **Auto-detect** installed browsers and sync bookmarks directly — no export needed | | `mindmark find "query"` | Semantic search over titles, folders, domains, and URL slugs — returns top-K with similarity scores | | `mindmark open "query"` | Search and open the best match in your default browser | +| `mindmark enrich` | Fetch page content, extract text, embed summaries, and improve search relevance with page context | | `mindmark stats` | Show index size, model info, top domains, and top folders | | `mindmark index ` | Import bookmarks from an exported HTML file (legacy workflow) | | `mindmark validate` | Check indexed bookmark URLs for stale links (HTTP 4xx/5xx or unreachable) and report them | @@ -266,7 +267,7 @@ When you add new bookmarks in your browser, just run `mindmark sync` again — i > 💡 **Note:** If you change the embedding model with `--model`, all bookmarks will be re-embedded on the next sync. Browser names are case-insensitive (e.g., `--browser Chrome` and `--browser chrome` both work). -### Filters +### Filters and options Narrow down results without changing your query: @@ -274,8 +275,11 @@ Narrow down results without changing your query: mindmark find "useful tools" --domain github.com # only github.com results mindmark find "useful tools" --folder work/kusto # only bookmarks in matching folders mindmark find "useful tools" -k 20 # return top 20 instead of 10 +mindmark find "useful tools" --excerpt # include excerpts from enriched pages ``` +> 💡 **Note:** The `--excerpt` flag requires you to run `mindmark enrich` first to fetch and embed page content.
See [Augmented Index](#-augmented-index-with-page-summaries) for details. + ### Re-indexing For the `sync` workflow, just rerun `mindmark sync`. It's incremental — only changed bookmarks are re-embedded. @@ -344,7 +348,90 @@ Browser data files "python async tutorial" --- -## 🗂️ Storage Layout +## 🎯 Augmented Index with Page Summaries + +By default, mindmark indexes only bookmark metadata: titles, folders, domains, and URL slugs. If you want **deeper page context** in search results, use the enrichment pipeline to fetch page content and embed summaries. + +> 💡 **Note:** To stay 100% local and lightweight, enrichment uses **extractive summarization** (first 500 chars of page text) — no LLM, no text generation. This means: +> - Only the opening content is embedded (relevant if key info is early; may miss content further down) +> - Page content must already be well-written for excerpts to be useful (relies on natural sentence structure) +> - Privacy and speed are preserved (no cloud calls, runs entirely locally) + +### Why enrich? + +Without enrichment, searching for **"authentication strategies"** on a bookmark titled **"AWS Services"** may miss it, even though the page discusses authentication. With enrichment, the page content is fetched and summarized, improving relevance. + +### Quick start + +1. **Enrich bookmarks** (fetch page content and embed summaries): + +```bash +mindmark enrich --limit 100 --workers 4 +``` + +Options: +- `--limit N` — Process top N pending URLs (default: all) +- `--workers N` — Parallel fetch workers (default: 8) +- `--timeout S` — Per-request timeout in seconds (default: 10.0) +- `--refresh-failed` — Retry previously failed enrichments + +2. **Search with page context**: + +```bash +mindmark find "authentication strategies" --excerpt +``` + +With `--excerpt`, results display the most relevant excerpt from the enriched page: + +``` + 1. 
AWS Services + aws.amazon.com + ⤵ To control user access to AWS resources, you must have an authentication strategy. AWS IAM provides fine-grained access control... + + 2. Auth0 Documentation + auth0.com + ⤵ Authentication is the process of verifying the identity of a user or service. Authorization is the process of granting permissions... +``` + +The `⤵` symbol indicates content from the enriched page. Without enrichment, the symbol won't appear. + +### How it works + +1. **Fetch** — GET each bookmark URL with a mindmark user-agent; HTTP 4xx/5xx responses and non-HTML content types are recorded as failed rather than summarized. +2. **Extract** — Strip boilerplate (nav, footer, scripts, styles) and extract plain text. +3. **Summarize** — Use the first 500 characters of extracted text as the summary (extractive, no LLM). +4. **Embed** — Embed the summary using the same ONNX model as bookmark metadata. +5. **Blend** — At search time, combine base (bookmark metadata) and summary similarity scores: + - **Blended score = 0.65 × base_score + 0.35 × summary_score** + - Falls back to base-only if no summary exists. +6. **Excerpt** — For readability, find and display the sentence from the summary most similar to the query. + +### Status and monitoring + +Check enrichment status: + +```bash +python -c " +from mindmark.index import Index +idx = Index() +print(idx.enrichment_stats()) +idx.close() +" +``` + +Example output: +```python +{'pending': 1234, 'complete': 450, 'failed': 23} +``` + +### Notes + +- **100% local** — Page fetching happens on your machine; no cloud service is used. +- **Smart caching** — Pages are re-embedded only if the extracted page text changes (detected via content hash). +- **Failure resilience** — HTTP errors, timeouts, and JavaScript-only pages are logged as failed; sync and search continue without interruption. +- **Privacy** — No page content is sent to any third-party service; the only network traffic is the GET request to each bookmarked URL, and all extraction and embedding run locally. 
+ +--- | What | macOS / Linux | Windows | Override | |---|---|---|---| diff --git a/pyproject.toml b/pyproject.toml index 0aeecce..4754f16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mindmark" -version = "0.1.5" +version = "0.1.6" description = "Local semantic search over your browser bookmarks โ€” on-device embeddings, no cloud." readme = "README.md" requires-python = ">=3.9" diff --git a/src/mindmark/cli.py b/src/mindmark/cli.py index 9400558..de42bb0 100644 --- a/src/mindmark/cli.py +++ b/src/mindmark/cli.py @@ -153,6 +153,7 @@ def _clear_index_contents(db_path: Path) -> bool: try: con = sqlite3.connect(str(db_path), timeout=1.0) cur = con.cursor() + cur.execute("DELETE FROM bookmark_enrichment") cur.execute("DELETE FROM bookmark_sources") cur.execute("DELETE FROM bookmarks") cur.execute("DELETE FROM meta") @@ -192,9 +193,11 @@ def _cmd_find(args): idx = Index(db_path=args.db) if not getattr(args, 'json', False): _auto_sync_hint(idx) + include_excerpt = getattr(args, 'excerpt', False) results = idx.search( query=args.query, k=args.top, domain=args.domain, folder=args.folder, + include_excerpt=include_excerpt, ) if not results: print("no results (is the index empty? run: mindmark sync)") @@ -219,6 +222,9 @@ def _cmd_find(args): path = f"{folder}/" if folder else "" print(f"{i:2d}. 
def _cmd_enrich(args):
    """Run the page-enrichment pipeline for pending bookmarks.

    Reads ``args.db``, ``args.limit``, ``args.workers``, ``args.timeout`` and
    ``args.refresh_failed`` (see the ``enrich`` sub-parser).

    Returns the process exit code: ``0`` when the batch finished or there
    was nothing to do, ``1`` when the user interrupted it with Ctrl-C. The
    index is always closed before returning.
    """
    from .enricher import enrich_pending

    idx = Index(db_path=args.db)
    try:
        # Reset failed rows *before* looking at the queue, so the pending
        # list is fetched exactly once and already includes the retried rows.
        # (Previously every pending URL was loaded with limit=None and then
        # immediately discarded whenever --refresh-failed was given.)
        if args.refresh_failed:
            reset = idx.reset_failed_enrichment()
            if reset:
                print(f"reset {reset} failed enrichment rows to pending")

        # Used only for the emptiness check and the progress banner;
        # enrich_pending() selects its own batch with the same limit.
        pending = idx.pending_enrichment_urls(limit=args.limit)
        if not pending:
            print("nothing to enrich — run 'mindmark sync' first, or use --refresh-failed")
            return 0

        total_pending = idx.enrichment_stats().get("pending", 0)
        print(
            f"enriching {len(pending)} bookmarks "
            f"(pending={total_pending} workers={args.workers} timeout={args.timeout}s)"
        )

        result = enrich_pending(
            idx,
            limit=args.limit,
            workers=args.workers,
            timeout=args.timeout,
            refresh_failed=False,  # already handled above
        )
        print(f"done. {result}")
        return 0
    except KeyboardInterrupt:
        print("\n\nCancelled by user.")
        return 1
    finally:
        idx.close()
# Characters of extracted text used as the embedding corpus for each page.
# 500 chars fits a dense paragraph and keeps embedding latency negligible.
SUMMARY_CHARS = 500


@dataclass
class EnrichResult:
    """Outcome of enriching a single URL."""

    url: str
    status: str  # 'complete' | 'failed' | 'skipped'
    error: str | None = None
    http_status: int | None = None


def _fetch_and_extract(url: str, timeout: float):
    """Download *url* and extract its summary text (network/CPU only).

    This half of the pipeline never touches the index, so it is the only
    part that runs on worker threads.

    Returns a ``(page, text, error)`` tuple:

    * ``page`` is the fetch result, or ``None`` if ``fetch_page`` raised.
    * ``text`` is the extracted summary text (empty on failure).
    * ``error`` is ``None`` on success, otherwise a short failure message.
    """
    try:
        page = fetch_page(url, timeout=timeout)
    except Exception as exc:  # pragma: no cover — defensive
        return None, "", str(exc)

    if not page.ok:
        return page, "", page.error or "unknown fetch error"

    text = extract_text(page.html or "", max_chars=SUMMARY_CHARS)
    if not text.strip():
        return page, "", "no extractable text"
    return page, text, None


def _content_unchanged(idx: "Index", url: str, content_hash: str) -> bool:
    """Return True if *url* is already enriched with this exact content hash."""
    try:
        cur = idx.con.cursor()
        cur.execute(
            "SELECT page_content_hash, status FROM bookmark_enrichment WHERE url=?",
            (url,),
        )
        row = cur.fetchone()
    except Exception:
        # Deliberate best effort: if the lookup fails we simply re-embed
        # rather than abort the enrichment of this URL.
        return False
    return bool(row) and row[1] == "complete" and row[0] == content_hash


def _persist(idx: "Index", url: str, page, text: str, error: str | None) -> EnrichResult:
    """Record the outcome of :func:`_fetch_and_extract` in the index.

    Embeds *text* and saves it, or records the failure. Touches the shared
    SQLite connection and the embedder, so callers must invoke it from one
    thread at a time.
    """
    if error is not None:
        if page is None:
            idx.fail_enrichment(url, error=error, fetched_at=int(time.time()))
            return EnrichResult(url=url, status="failed", error=error)
        idx.fail_enrichment(
            url,
            error=error,
            http_status=page.http_status,
            fetched_at=page.fetched_at,
        )
        return EnrichResult(
            url=url, status="failed",
            error=error, http_status=page.http_status,
        )

    content_hash = text_content_hash(text)

    # Skip re-embedding if content hasn't changed since last enrichment
    if _content_unchanged(idx, url, content_hash):
        return EnrichResult(url=url, status="skipped")

    # Embed the extractive summary
    try:
        vec = idx.embedder.embed_one(text)
    except Exception as exc:
        message = f"embedding error: {exc}"
        idx.fail_enrichment(
            url,
            error=message,
            http_status=page.http_status,
            fetched_at=page.fetched_at,
        )
        return EnrichResult(url=url, status="failed", error=message)

    idx.save_enrichment(
        url=url,
        summary_text=text,
        summary_embedding=vec,
        model_name=idx.model_name,
        content_hash=content_hash,
        http_status=page.http_status,
        fetched_at=page.fetched_at,
        summarized_at=int(time.time()),
    )
    return EnrichResult(url=url, status="complete", http_status=page.http_status)


def _enrich_one(url: str, idx: "Index", timeout: float) -> EnrichResult:
    """Fetch, extract, embed, and persist enrichment for a single URL.

    Returns an :class:`EnrichResult` describing the outcome. Fetch and
    embedding failures are recorded in the index instead of being raised.
    Not safe to call concurrently with the same *idx*; batch processing
    goes through :func:`enrich_pending`, which serializes index access.
    """
    page, text, error = _fetch_and_extract(url, timeout)
    return _persist(idx, url, page, text, error)


@dataclass
class BatchEnrichResult:
    """Per-status counters for one :func:`enrich_pending` run."""

    complete: int = 0
    failed: int = 0
    skipped: int = 0

    @property
    def total(self) -> int:
        return self.complete + self.failed + self.skipped

    def __str__(self) -> str:
        return (
            f"complete={self.complete} failed={self.failed} skipped={self.skipped}"
        )


def enrich_pending(
    idx: "Index",
    limit: int | None = None,
    workers: int = 8,
    timeout: float = 10.0,
    refresh_failed: bool = False,
) -> BatchEnrichResult:
    """Process pending enrichment rows from the index.

    Only the network fetch and text extraction run on the worker pool.
    Every index read/write and every embedding call happens on the calling
    thread as fetches complete, because the index shares a single SQLite
    connection (opened with ``check_same_thread=False``) whose writes must
    not be issued from several threads at once.

    Parameters
    ----------
    idx:
        Open :class:`~mindmark.index.Index` instance.
    limit:
        Maximum number of pending URLs to process. ``None`` means all.
    workers:
        Number of parallel fetch threads.
    timeout:
        Per-request fetch timeout in seconds.
    refresh_failed:
        If ``True``, reset all ``failed`` rows to ``pending`` before
        processing so they are retried.

    Returns
    -------
    BatchEnrichResult
        Counts of completed, failed and skipped URLs.
    """
    if refresh_failed:
        idx.reset_failed_enrichment()

    urls = idx.pending_enrichment_urls(limit=limit)
    batch = BatchEnrichResult()

    if not urls:
        return batch

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(_fetch_and_extract, url, timeout): url for url in urls
        }
        for fut in concurrent.futures.as_completed(futures):
            url = futures[fut]
            try:
                page, text, error = fut.result()
                outcome = _persist(idx, url, page, text, error)
            except Exception:  # pragma: no cover — defensive
                # A single bad URL (or a failed write) must not abort the batch.
                batch.failed += 1
                continue
            if outcome.status == "complete":
                batch.complete += 1
            elif outcome.status == "skipped":
                batch.skipped += 1
            else:
                batch.failed += 1

    return batch
+ +* ``extract_text(html)`` โ€” Strips boilerplate (scripts, nav, footer, etc.) from + raw HTML and returns normalised plain text suitable for summarisation. + +* ``text_content_hash(text)`` โ€” Short deterministic hash for change detection. +""" +from __future__ import annotations + +import hashlib +import html as _html_lib +import re +import time +from dataclasses import dataclass, field +from html.parser import HTMLParser +from urllib.error import HTTPError, URLError +from urllib.parse import urlparse +from urllib.request import Request, urlopen + +# Tags whose subtree should be completely ignored (no text collected) +_SKIP_TAGS: frozenset[str] = frozenset({ + "script", "style", "noscript", "template", + "nav", "header", "footer", "aside", + "form", "button", "select", "textarea", + "menu", "dialog", "figure", "figcaption", + "iframe", "object", "embed", "svg", "canvas", + "head", +}) + +# Tags that act as block separators (emit a space boundary around their text) +_BLOCK_TAGS: frozenset[str] = frozenset({ + "p", "div", "section", "article", "main", + "h1", "h2", "h3", "h4", "h5", "h6", + "li", "dt", "dd", "blockquote", "pre", "code", + "td", "th", "tr", "caption", + "br", "hr", +}) + +_MAX_FETCH_BYTES = 512 * 1024 # 512 KB hard cap +_MAX_TEXT_CHARS = 4_000 # characters returned by extract_text +_DEFAULT_TIMEOUT = 10.0 # seconds per request +_USER_AGENT = "mindmark/0.x (+bookmark-enrichment)" + + +# --------------------------------------------------------------------------- +# Data types +# --------------------------------------------------------------------------- + +@dataclass +class FetchResult: + url: str + html: str | None + http_status: int | None + content_type: str | None + error: str | None + fetched_at: int = field(default_factory=lambda: int(time.time())) + + @property + def ok(self) -> bool: + return self.html is not None and self.error is None + + +# --------------------------------------------------------------------------- +# Fetcher +# 
--------------------------------------------------------------------------- + +def _is_http_url(url: str) -> bool: + p = urlparse(url) + return p.scheme.lower() in {"http", "https"} and bool(p.netloc) + + +def _read_capped(resp, max_bytes: int) -> bytes: + """Read up to *max_bytes* from an HTTP response body.""" + chunks: list[bytes] = [] + remaining = max_bytes + while remaining > 0: + chunk = resp.read(min(8192, remaining)) + if not chunk: + break + chunks.append(chunk) + remaining -= len(chunk) + return b"".join(chunks) + + +def _decode(raw: bytes, content_type: str | None) -> str: + """Best-effort decode to str, honouring charset from Content-Type.""" + charset = "utf-8" + if content_type: + for part in content_type.split(";"): + part = part.strip() + if part.lower().startswith("charset="): + charset = part[len("charset="):].strip().strip('"') + break + return raw.decode(charset, errors="replace") + + +def fetch_page( + url: str, + timeout: float = _DEFAULT_TIMEOUT, + max_bytes: int = _MAX_FETCH_BYTES, +) -> FetchResult: + """Fetch *url* and return a :class:`FetchResult`. + + * Only ``http`` / ``https`` URLs are fetched; others are skipped. + * Response body is capped at *max_bytes* to avoid downloading huge pages. + * Non-HTML content types (e.g. PDF, image) are skipped immediately. + * On any network or HTTP error, returns a result with ``error`` set. 
+ """ + if not _is_http_url(url): + return FetchResult( + url=url, html=None, http_status=None, + content_type=None, error="skipped: non-http URL", + ) + + headers = {"User-Agent": _USER_AGENT, "Accept": "text/html,*/*;q=0.8"} + + def _do_get() -> FetchResult: + try: + req = Request(url, headers=headers, method="GET") + with urlopen(req, timeout=timeout) as resp: + status = int(getattr(resp, "status", 0) or 0) + ct = resp.headers.get("Content-Type", "") + # Skip non-HTML content types early (before reading body) + if ct and not _looks_like_html(ct): + return FetchResult( + url=url, html=None, http_status=status, + content_type=ct, error=f"skipped: non-HTML content-type ({ct})", + ) + raw = _read_capped(resp, max_bytes) + html = _decode(raw, ct or None) + return FetchResult( + url=url, html=html, http_status=status, + content_type=ct or None, error=None, + ) + except HTTPError as e: + return FetchResult( + url=url, html=None, http_status=int(e.code), + content_type=None, error=f"HTTP {e.code}: {e.reason or 'error'}", + ) + except URLError as e: + reason = str(e.reason) if e.reason else str(e) + return FetchResult( + url=url, html=None, http_status=None, + content_type=None, error=f"connection error: {reason}", + ) + except TimeoutError: + return FetchResult( + url=url, html=None, http_status=None, + content_type=None, error="timeout", + ) + except Exception as e: # pragma: no cover โ€” defensive + return FetchResult( + url=url, html=None, http_status=None, + content_type=None, error=str(e), + ) + + return _do_get() + + +def _looks_like_html(content_type: str) -> bool: + ct_lower = content_type.lower() + return "text/html" in ct_lower or "application/xhtml" in ct_lower + + +# --------------------------------------------------------------------------- +# Text extractor +# --------------------------------------------------------------------------- + +class _TextExtractor(HTMLParser): + """SAX-style HTML parser that collects visible text, skipping boilerplate.""" + + 
class _TextExtractor(HTMLParser):
    """SAX-style HTML parser that collects visible text, skipping boilerplate.

    Text inside any tag listed in ``_SKIP_TAGS`` is ignored, including
    nested content. Tags listed in ``_BLOCK_TAGS`` contribute a single
    separating space so adjacent blocks do not run together.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._parts: list[str] = []
        self._skip_depth: int = 0  # > 0 while inside a skip tag subtree
        self._last_was_block = False

    def _mark_block_boundary(self) -> None:
        """Insert one separator between block elements (never two in a row)."""
        if self._parts and not self._last_was_block:
            self._parts.append(" ")
        self._last_was_block = True

    def handle_starttag(self, tag: str, attrs: list) -> None:
        tag = tag.lower()
        if tag in _SKIP_TAGS:
            self._skip_depth += 1
        elif tag in _BLOCK_TAGS and not self._skip_depth:
            self._mark_block_boundary()

    def handle_endtag(self, tag: str) -> None:
        tag = tag.lower()
        if tag in _SKIP_TAGS and self._skip_depth:
            # Guarded so a stray closing tag cannot push the depth negative.
            self._skip_depth -= 1
        elif tag in _BLOCK_TAGS and not self._skip_depth:
            self._mark_block_boundary()

    def handle_data(self, data: str) -> None:
        if self._skip_depth:
            return
        if data.strip():
            self._parts.append(data)
            self._last_was_block = False

    def get_text(self) -> str:
        """Return the collected text with whitespace runs collapsed."""
        raw = "".join(self._parts)
        # Collapse all whitespace sequences to a single space
        return re.sub(r"\s+", " ", raw).strip()


def extract_text(html: str, max_chars: int = _MAX_TEXT_CHARS) -> str:
    """Extract human-readable text from *html*, stripping boilerplate.

    Returns at most *max_chars* characters. Always returns a str (empty
    string on empty or unparseable input).
    """
    if not html or not html.strip():
        return ""
    try:
        parser = _TextExtractor()
        parser.feed(html)
        # feed() may hold back trailing input (for example text ending in an
        # unterminated "&" reference) while waiting for more data; close()
        # forces it through so the end of the document is not dropped.
        parser.close()
        return parser.get_text()[:max_chars]
    except Exception:  # pragma: no cover - defensive
        return ""


# ---------------------------------------------------------------------------
# Content hashing
# ---------------------------------------------------------------------------

def text_content_hash(text: str) -> str:
    """Return a short SHA-256 hex digest of *text* for change detection.

    The digest is truncated to its first 16 hex characters.
    """
    return hashlib.sha256(text.encode()).hexdigest()[:16]
('schema_version', ?)", - (str(_SCHEMA_VERSION),), + ("2",), + ) + con.commit() + + if version < 3: + cur.execute(""" + CREATE TABLE IF NOT EXISTS bookmark_enrichment ( + url TEXT PRIMARY KEY, + status TEXT NOT NULL DEFAULT 'pending', + page_content_hash TEXT NOT NULL DEFAULT '', + summary_text TEXT, + summary_embedding BLOB, + summary_dim INTEGER, + summary_model TEXT, + llm_model TEXT, + fetched_at INTEGER, + summarized_at INTEGER, + error TEXT, + http_status INTEGER + ) + """) + cur.execute( + "CREATE INDEX IF NOT EXISTS idx_enrichment_status " + "ON bookmark_enrichment(status)" + ) + cur.execute( + "CREATE INDEX IF NOT EXISTS idx_enrichment_summarized_at " + "ON bookmark_enrichment(summarized_at)" + ) + cur.execute( + "INSERT OR REPLACE INTO meta(key, value) VALUES ('schema_version', ?)", + ("3",), ) con.commit() @@ -109,6 +158,76 @@ def _l2_normalize(mat: np.ndarray) -> np.ndarray: return mat / norms +def _find_relevant_excerpt( + query_embedding: np.ndarray, + summary_text: str, + embedder: "object", +) -> str: + """Find the most relevant sentence(s) in summary text for the query. + + Splits summary into sentences, embeds each, finds the most similar to + the query embedding, and returns that sentence with surrounding context + (max 200 chars total). + + Parameters + ---------- + query_embedding : np.ndarray + Normalized embedding of the search query. + summary_text : str + Full extracted summary text (up to 500 chars). + embedder : object + BGE/MiniLM embedder with embed_one() method. + + Returns + ------- + str + Most relevant sentence with surrounding context, or first 150 chars + if sentence detection fails. + """ + if not summary_text or not summary_text.strip(): + return "" + + # Split on sentence boundaries (., !, ?) 
+ sentences = [] + current = [] + for char in summary_text: + current.append(char) + if char in ".!?": + s = "".join(current).strip() + if s and len(s) > 3: # Skip very short fragments + sentences.append(s) + current = [] + if current: + s = "".join(current).strip() + if s and len(s) > 3: + sentences.append(s) + + if not sentences: + # Fallback: return first 150 chars if no sentences found + return summary_text[:150] + + # If only one sentence, return it (up to 200 chars) + if len(sentences) == 1: + return sentences[0][:200] + + # Embed each sentence and find most similar to query + try: + sentence_embeddings = embedder.embed(sentences) + similarities = sentence_embeddings @ query_embedding + best_idx = int(np.argmax(similarities)) + except Exception: + # Fallback if embedding fails + return sentences[0][:200] + + # Return best sentence + surrounding context (up to 200 chars) + best_sentence = sentences[best_idx] + if len(best_sentence) <= 200: + return best_sentence + # Truncate with ellipsis + return best_sentence[:197] + "..." 
+ + + @dataclass class Embedder: model_name: str = DEFAULT_MODEL @@ -166,6 +285,21 @@ def close(self) -> None: """Close the underlying database connection.""" self.con.close() + def _queue_enrichment_pending(self, cur: sqlite3.Cursor, urls: set[str]) -> None: + """Mark URLs for content-enrichment work.""" + if not urls: + return + cur.executemany( + """ + INSERT INTO bookmark_enrichment (url, status) + VALUES (?, 'pending') + ON CONFLICT(url) DO UPDATE SET + status='pending', + error=NULL + """, + [(u,) for u in sorted(urls)], + ) + def is_empty(self) -> bool: cur = self.con.cursor() cur.execute("SELECT COUNT(*) FROM bookmarks") @@ -300,6 +434,8 @@ def sync( (h, url, source), ) + self._queue_enrichment_pending(cur, to_embed_urls) + # Delete bookmarks removed from this source for url in to_delete_urls: cur.execute( @@ -312,6 +448,7 @@ def sync( ) if cur.fetchone()[0] == 0: cur.execute("DELETE FROM bookmarks WHERE url=?", (url,)) + cur.execute("DELETE FROM bookmark_enrichment WHERE url=?", (url,)) result.removed = len(to_delete_urls) self.con.commit() @@ -339,6 +476,7 @@ def _remove_source(self, source: str) -> list[str]: ) if cur.fetchone()[0] == 0: cur.execute("DELETE FROM bookmarks WHERE url=?", (url,)) + cur.execute("DELETE FROM bookmark_enrichment WHERE url=?", (url,)) self.con.commit() except Exception: self.con.rollback() @@ -349,6 +487,7 @@ def rebuild(self, bookmarks: list[Bookmark], batch_size: int = 64) -> dict: cur = self.con.cursor() cur.execute("DELETE FROM bookmarks") cur.execute("DELETE FROM bookmark_sources") + cur.execute("DELETE FROM bookmark_enrichment") cur.execute("INSERT OR REPLACE INTO meta(key, value) VALUES ('model', ?)", (self.model_name,)) self.con.commit() @@ -381,6 +520,7 @@ def rebuild(self, bookmarks: list[Bookmark], batch_size: int = 64) -> dict: "VALUES (?,?,?)", source_rows, ) + self._queue_enrichment_pending(cur, {r[0] for r in rows}) self.con.commit() return {"indexed": total, "model": self.model_name, "dim": rows[0][-2]} @@ 
# ------------------------------------------------------------------
# Enrichment helpers
# ------------------------------------------------------------------

def pending_enrichment_urls(self, limit: int | None = None) -> list[str]:
    """Return URLs whose enrichment status is 'pending', ordered by URL.

    *limit* caps the number of URLs returned; ``None`` or a non-positive
    value means no cap. The explicit ordering makes a limited batch
    well defined instead of depending on the query plan.
    """
    # SQLite treats a negative LIMIT as "no upper bound", so one statement
    # serves both the capped and the uncapped case.
    effective_limit = limit if limit is not None and limit > 0 else -1
    cur = self.con.cursor()
    cur.execute(
        "SELECT url FROM bookmark_enrichment WHERE status='pending' "
        "ORDER BY url LIMIT ?",
        (effective_limit,),
    )
    return [r[0] for r in cur.fetchall()]


def reset_failed_enrichment(self) -> int:
    """Mark all 'failed' enrichment rows back to 'pending' for retry.

    Returns the number of rows that were reset. The change is committed
    on success and rolled back (then re-raised) on any error.
    """
    cur = self.con.cursor()
    try:
        cur.execute(
            "UPDATE bookmark_enrichment SET status='pending', error=NULL "
            "WHERE status='failed'"
        )
        count = cur.rowcount
        self.con.commit()
        return count
    except Exception:
        self.con.rollback()
        raise


def save_enrichment(
    self,
    url: str,
    summary_text: str,
    summary_embedding: "np.ndarray",
    model_name: str,
    content_hash: str,
    http_status: int | None,
    summarized_at: int,
    fetched_at: int,
) -> None:
    """Persist a successful enrichment result.

    Updates the existing ``bookmark_enrichment`` row for *url*, setting
    ``status='complete'`` and clearing ``error``. This is an UPDATE only:
    when no row exists for *url*, nothing is written.
    The change is committed on success and rolled back (then re-raised)
    on any error.
    """
    vec_blob = _vec_to_blob(summary_embedding)
    # The length of the first axis is stored as the embedding dimension.
    dim = int(summary_embedding.shape[0])
    cur = self.con.cursor()
    try:
        cur.execute(
            """
            UPDATE bookmark_enrichment SET
                status='complete',
                page_content_hash=?,
                summary_text=?,
                summary_embedding=?,
                summary_dim=?,
                summary_model=?,
                http_status=?,
                fetched_at=?,
                summarized_at=?,
                error=NULL
            WHERE url=?
            """,
            (
                content_hash, summary_text, vec_blob, dim,
                model_name, http_status, fetched_at, summarized_at,
                url,
            ),
        )
        self.con.commit()
    except Exception:
        self.con.rollback()
        raise
def fail_enrichment(
    self,
    url: str,
    error: str,
    http_status: int | None = None,
    fetched_at: int | None = None,
) -> None:
    """Record a fetch/extraction failure for a URL.

    Sets ``status='failed'`` on the row for *url* and stores *error*,
    *http_status* and *fetched_at* as given. Commits on success; rolls
    back and re-raises on error.
    """
    failure_sql = """
        UPDATE bookmark_enrichment SET
            status='failed',
            error=?,
            http_status=?,
            fetched_at=?
        WHERE url=?
        """
    failure_params = (error, http_status, fetched_at, url)
    cursor = self.con.cursor()
    try:
        cursor.execute(failure_sql, failure_params)
        self.con.commit()
    except Exception:
        self.con.rollback()
        raise


def enrichment_stats(self) -> dict:
    """Return a summary of enrichment table status counts.

    The result maps each status value present in the table to its row count.
    """
    cursor = self.con.cursor()
    cursor.execute(
        "SELECT status, COUNT(*) FROM bookmark_enrichment GROUP BY status"
    )
    return {status: count for status, count in cursor.fetchall()}


def _load_enrichments(self) -> dict:
    """Load all complete enrichments as url -> (embedding, text, dim).

    Rows with an empty embedding blob or a zero/NULL dimension are skipped.
    """
    cursor = self.con.cursor()
    cursor.execute(
        "SELECT url, summary_embedding, summary_text, summary_dim "
        "FROM bookmark_enrichment WHERE status='complete' AND summary_embedding IS NOT NULL"
    )
    return {
        row_url: (_blob_to_vec(blob, dim), text, dim)
        for row_url, blob, text, dim in cursor.fetchall()
        if blob and dim
    }
None, - folder: str | None = None) -> list[dict]: + folder: str | None = None, include_excerpt: bool = False) -> list[dict]: + """Search bookmarks by query, with optional summary blending. + + Blends base bookmark embedding similarity with summary embedding + similarity (when available) using weights: 0.65 * base + 0.35 * summary. + + Parameters + ---------- + query : str + Search query text. + k : int + Maximum number of results. + domain : str, optional + Filter by domain substring. + folder : str, optional + Filter by folder substring. + include_excerpt : bool + If True, include relevant excerpt from summary_text in results. + + Returns + ------- + list[dict] + List of results with keys: url, title, folder_path, domain, + score (blended if summary available). If include_excerpt=True, + also includes 'relevant_excerpt' key. + """ mat, rows = self._load_matrix() if len(rows) == 0: return [] + q = self.embedder.embed_one(query) - sims = mat @ q - order = np.argsort(-sims) + base_sims = mat @ q + + # Load complete enrichments for summary blending + enrichments = self._load_enrichments() + + # Compute blended scores for each bookmark + blended_sims = np.array(base_sims, copy=True) + for idx, r in enumerate(rows): + url = r["url"] + if url in enrichments: + summary_embedding, summary_text, summary_dim = enrichments[url] + summary_sim = float(np.dot(summary_embedding, q)) + # Blend: 0.65 * base + 0.35 * summary + blended_sims[idx] = 0.65 * base_sims[idx] + 0.35 * summary_sim + + order = np.argsort(-blended_sims) results: list[dict] = [] for idx in order: r = rows[int(idx)] @@ -480,13 +785,19 @@ def search(self, query: str, k: int = 10, domain: str | None = None, continue if folder and folder.lower() not in r["folder_path"].lower(): continue - results.append({ - "score": float(sims[int(idx)]), + result = { + "score": float(blended_sims[int(idx)]), "title": r["title"], "url": r["url"], "folder_path": r["folder_path"], "domain": r["domain"], - }) + } + if include_excerpt 
and r["url"] in enrichments: + summary_text = enrichments[r["url"]][1] + # Find and include the most relevant excerpt for the query + if summary_text: + result["relevant_excerpt"] = _find_relevant_excerpt(q, summary_text, self.embedder) + results.append(result) if len(results) >= k: break return results diff --git a/tests/test_drop_index_cli.py b/tests/test_drop_index_cli.py new file mode 100644 index 0000000..a23a3cc --- /dev/null +++ b/tests/test_drop_index_cli.py @@ -0,0 +1,50 @@ +"""Tests for dropping the local index via CLI flag.""" +from __future__ import annotations + +from pathlib import Path + +import pytest + +from mindmark import cli + + +def test_main_drop_index_deletes_file(tmp_path): + db = tmp_path / "drop.db" + db.write_text("placeholder", encoding="utf-8") + + rc = cli.main(["--db", str(db), "drop-index", "--yes"]) + assert rc == 0 + assert not db.exists() + + +def test_main_drop_index_cancelled_by_prompt(tmp_path, monkeypatch): + db = tmp_path / "drop_cancel.db" + db.write_text("placeholder", encoding="utf-8") + monkeypatch.setattr("builtins.input", lambda _prompt: "n") + + rc = cli.main(["--db", str(db), "drop-index"]) + assert rc == 0 + assert db.exists() + + +def test_main_drop_index_rejects_subcommand(tmp_path): + db = tmp_path / "drop_reject.db" + with pytest.raises(SystemExit): + cli.main(["drop-index", "stats", "--db", str(db)]) + + +def test_main_drop_index_rejects_validate_combo(tmp_path): + db = tmp_path / "drop_validate_combo.db" + with pytest.raises(SystemExit): + cli.main(["drop-index", "--validate", "--db", str(db)]) + + +def test_main_drop_index_permission_fallback(tmp_path, monkeypatch): + db = tmp_path / "locked.db" + db.write_text("placeholder", encoding="utf-8") + + monkeypatch.setattr(Path, "unlink", lambda _self: (_ for _ in ()).throw(PermissionError("in use"))) + monkeypatch.setattr(cli, "_clear_index_contents", lambda _path: True) + + rc = cli.main(["--db", str(db), "drop-index", "--yes"]) + assert rc == 0 diff --git 
a/tests/test_enricher.py b/tests/test_enricher.py new file mode 100644 index 0000000..29df4c5 --- /dev/null +++ b/tests/test_enricher.py @@ -0,0 +1,242 @@ +"""Tests for the enrichment pipeline (enricher.py).""" +from __future__ import annotations + +import time +from dataclasses import dataclass +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest + +from mindmark.enricher import ( + SUMMARY_CHARS, + BatchEnrichResult, + EnrichResult, + _enrich_one, + enrich_pending, +) +from mindmark.fetcher import FetchResult + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _fake_fetch_result( + url: str = "https://example.com", + html: str = "

Hello world content here.

", + http_status: int = 200, + error: str | None = None, + content_type: str = "text/html", +) -> FetchResult: + return FetchResult( + url=url, + html=html, + http_status=http_status, + content_type=content_type, + error=error, + fetched_at=int(time.time()), + ) + + +def _make_mock_idx(pending: list[str] | None = None) -> MagicMock: + """Return a minimal mock Index.""" + idx = MagicMock() + idx.model_name = "test-model" + # Default pending queue + idx.pending_enrichment_urls.return_value = pending or [] + # embedder.embed_one returns a small float32 vector + idx.embedder.embed_one.side_effect = lambda text: np.ones(4, dtype=np.float32) + # con.cursor() for the skip-check path + cur = MagicMock() + cur.fetchone.return_value = None # no existing row by default + idx.con.cursor.return_value = cur + return idx + + +# --------------------------------------------------------------------------- +# _enrich_one โ€” success path +# --------------------------------------------------------------------------- + +class TestEnrichOne: + def test_complete_status_on_success(self): + idx = _make_mock_idx() + + with patch("mindmark.enricher.fetch_page", return_value=_fake_fetch_result()): + result = _enrich_one("https://example.com", idx, timeout=5.0) + + assert result.status == "complete" + assert result.url == "https://example.com" + assert result.error is None + idx.save_enrichment.assert_called_once() + + def test_save_enrichment_receives_correct_url(self): + idx = _make_mock_idx() + + with patch("mindmark.enricher.fetch_page", return_value=_fake_fetch_result(url="https://a.com")): + _enrich_one("https://a.com", idx, timeout=5.0) + + call_kwargs = idx.save_enrichment.call_args.kwargs + assert call_kwargs["url"] == "https://a.com" + assert call_kwargs["model_name"] == "test-model" + + def test_http_status_propagated_on_success(self): + idx = _make_mock_idx() + fetch = _fake_fetch_result(http_status=200) + + with patch("mindmark.enricher.fetch_page", return_value=fetch): + result = 
_enrich_one("https://example.com", idx, timeout=5.0) + + assert result.http_status == 200 + + # --------------------------------------------------------------------------- + # _enrich_one โ€” failure paths + # --------------------------------------------------------------------------- + + def test_failed_status_on_http_error(self): + idx = _make_mock_idx() + bad = _fake_fetch_result(html=None, http_status=404, error="Not Found") + # Simulate a non-ok result: FetchResult.ok is False when error is set + bad = FetchResult( + url="https://example.com", + html=None, + http_status=404, + content_type="text/html", + error="Not Found", + fetched_at=int(time.time()), + ) + + with patch("mindmark.enricher.fetch_page", return_value=bad): + result = _enrich_one("https://example.com", idx, timeout=5.0) + + assert result.status == "failed" + assert result.http_status == 404 + idx.fail_enrichment.assert_called_once() + + def test_failed_status_on_empty_text(self): + idx = _make_mock_idx() + # Page with no visible text (only tags) + empty_html = "x" + + with patch("mindmark.enricher.fetch_page", return_value=_fake_fetch_result(html=empty_html)): + with patch("mindmark.enricher.extract_text", return_value=" "): + result = _enrich_one("https://example.com", idx, timeout=5.0) + + assert result.status == "failed" + assert "no extractable text" in (result.error or "") + idx.fail_enrichment.assert_called_once() + + def test_failed_status_on_embedding_error(self): + idx = _make_mock_idx() + idx.embedder.embed_one.side_effect = RuntimeError("ONNX runtime error") + + with patch("mindmark.enricher.fetch_page", return_value=_fake_fetch_result()): + result = _enrich_one("https://example.com", idx, timeout=5.0) + + assert result.status == "failed" + assert "embedding error" in (result.error or "") + + # --------------------------------------------------------------------------- + # _enrich_one โ€” skip path + # --------------------------------------------------------------------------- + + def 
test_skipped_when_content_hash_unchanged(self): + idx = _make_mock_idx() + html = "

Stable content

" + + # First call to extract_text returns a known string; we pre-compute its hash + from mindmark.fetcher import extract_text, text_content_hash + text = extract_text(html, max_chars=SUMMARY_CHARS) + content_hash = text_content_hash(text) + + # Simulate existing DB row with matching hash and status=complete + cur = MagicMock() + cur.fetchone.return_value = (content_hash, "complete") + idx.con.cursor.return_value = cur + + with patch("mindmark.enricher.fetch_page", return_value=_fake_fetch_result(html=html)): + result = _enrich_one("https://example.com", idx, timeout=5.0) + + assert result.status == "skipped" + idx.save_enrichment.assert_not_called() + + def test_not_skipped_when_hash_differs(self): + idx = _make_mock_idx() + + # Row exists but hash is different โ†’ should re-embed + cur = MagicMock() + cur.fetchone.return_value = ("oldhash", "complete") + idx.con.cursor.return_value = cur + + with patch("mindmark.enricher.fetch_page", return_value=_fake_fetch_result()): + result = _enrich_one("https://example.com", idx, timeout=5.0) + + assert result.status == "complete" + idx.save_enrichment.assert_called_once() + + +# --------------------------------------------------------------------------- +# enrich_pending โ€” batch orchestration +# --------------------------------------------------------------------------- + +class TestEnrichPending: + def test_empty_pending_returns_zero_batch(self): + idx = _make_mock_idx(pending=[]) + result = enrich_pending(idx, limit=None, workers=2, timeout=5.0) + + assert result.total == 0 + assert result.complete == 0 + assert result.failed == 0 + assert result.skipped == 0 + + def test_batch_counts_mixed_results(self): + urls = [ + "https://ok.com", + "https://fail.com", + "https://skip.com", + ] + idx = _make_mock_idx(pending=urls) + + def side_enrich(url, index, timeout): + if "ok" in url: + return EnrichResult(url=url, status="complete") + if "fail" in url: + return EnrichResult(url=url, status="failed", error="err") + return 
EnrichResult(url=url, status="skipped") + + with patch("mindmark.enricher._enrich_one", side_effect=side_enrich): + result = enrich_pending(idx, workers=2, timeout=5.0) + + assert result.complete == 1 + assert result.failed == 1 + assert result.skipped == 1 + assert result.total == 3 + + def test_limit_passed_to_pending_urls(self): + idx = _make_mock_idx(pending=[]) + enrich_pending(idx, limit=10, workers=2, timeout=5.0) + idx.pending_enrichment_urls.assert_called_once_with(limit=10) + + def test_refresh_failed_calls_reset(self): + idx = _make_mock_idx(pending=[]) + idx.reset_failed_enrichment.return_value = 3 + + enrich_pending(idx, refresh_failed=True, workers=2, timeout=5.0) + + idx.reset_failed_enrichment.assert_called_once() + + def test_refresh_failed_false_does_not_reset(self): + idx = _make_mock_idx(pending=[]) + enrich_pending(idx, refresh_failed=False, workers=2, timeout=5.0) + idx.reset_failed_enrichment.assert_not_called() + + def test_str_representation(self): + r = BatchEnrichResult(complete=3, failed=1, skipped=2) + s = str(r) + assert "complete=3" in s + assert "failed=1" in s + assert "skipped=2" in s + + def test_total_property(self): + r = BatchEnrichResult(complete=2, failed=1, skipped=4) + assert r.total == 7 diff --git a/tests/test_fetcher.py b/tests/test_fetcher.py new file mode 100644 index 0000000..25c12ca --- /dev/null +++ b/tests/test_fetcher.py @@ -0,0 +1,386 @@ +"""Deterministic tests for fetcher.py โ€” no network access.""" +from __future__ import annotations + +import io +import time +from http.client import HTTPMessage +from types import SimpleNamespace +from unittest.mock import MagicMock, patch +from urllib.error import HTTPError, URLError + +import pytest + +from mindmark.fetcher import ( + FetchResult, + _looks_like_html, + extract_text, + fetch_page, + text_content_hash, +) + + +# --------------------------------------------------------------------------- +# Helpers +# 
--------------------------------------------------------------------------- + +def _make_response(body: bytes, status: int = 200, content_type: str = "text/html; charset=utf-8"): + """Build a minimal mock urlopen response.""" + headers = HTTPMessage() + headers["Content-Type"] = content_type + resp = MagicMock() + resp.status = status + resp.headers = headers + # Simulate streaming read + _stream = io.BytesIO(body) + resp.read.side_effect = lambda n: _stream.read(n) + resp.__enter__ = lambda s: s + resp.__exit__ = MagicMock(return_value=False) + return resp + + +# --------------------------------------------------------------------------- +# fetch_page โ€” non-HTTP skipping +# --------------------------------------------------------------------------- + +def test_fetch_non_http_url_skipped(): + result = fetch_page("file:///etc/hosts") + assert not result.ok + assert "non-http" in result.error + + +def test_fetch_ftp_url_skipped(): + result = fetch_page("ftp://example.com/file.txt") + assert not result.ok + + +def test_fetch_empty_string_skipped(): + result = fetch_page("") + assert not result.ok + + +# --------------------------------------------------------------------------- +# fetch_page โ€” successful fetch +# --------------------------------------------------------------------------- + +def test_fetch_returns_html_on_200(monkeypatch): + body = b"

Hello world

" + resp = _make_response(body) + + with patch("mindmark.fetcher.urlopen", return_value=resp): + result = fetch_page("https://example.com/", timeout=5) + + assert result.ok + assert result.http_status == 200 + assert "Hello world" in result.html + assert result.error is None + + +def test_fetch_sets_fetched_at(monkeypatch): + before = int(time.time()) + body = b"ok" + resp = _make_response(body) + + with patch("mindmark.fetcher.urlopen", return_value=resp): + result = fetch_page("https://example.com/") + + assert result.fetched_at >= before + + +def test_fetch_decodes_charset_from_content_type(): + body = "hรฉllo".encode("latin-1") + resp = _make_response(body, content_type="text/html; charset=iso-8859-1") + + with patch("mindmark.fetcher.urlopen", return_value=resp): + result = fetch_page("https://example.com/") + + assert result.ok + assert "hรฉllo" in result.html + + +# --------------------------------------------------------------------------- +# fetch_page โ€” HTTP errors +# --------------------------------------------------------------------------- + +def test_fetch_404_returns_error(): + with patch("mindmark.fetcher.urlopen", side_effect=HTTPError( + url="https://example.com/", code=404, msg="Not Found", hdrs=None, fp=None + )): + result = fetch_page("https://example.com/missing") + + assert not result.ok + assert result.http_status == 404 + assert "404" in result.error + + +def test_fetch_500_returns_error(): + with patch("mindmark.fetcher.urlopen", side_effect=HTTPError( + url="https://example.com/", code=500, msg="Internal Server Error", hdrs=None, fp=None + )): + result = fetch_page("https://example.com/boom") + + assert not result.ok + assert result.http_status == 500 + + +# --------------------------------------------------------------------------- +# fetch_page โ€” connection/timeout errors +# --------------------------------------------------------------------------- + +def test_fetch_url_error_returns_error(): + with 
patch("mindmark.fetcher.urlopen", side_effect=URLError("Name or service not known")): + result = fetch_page("https://nonexistent.invalid/") + + assert not result.ok + assert result.http_status is None + assert "connection error" in result.error + + +def test_fetch_timeout_returns_error(): + with patch("mindmark.fetcher.urlopen", side_effect=TimeoutError()): + result = fetch_page("https://slow.example.com/") + + assert not result.ok + assert "timeout" in result.error + + +# --------------------------------------------------------------------------- +# fetch_page โ€” non-HTML content types +# --------------------------------------------------------------------------- + +def test_fetch_pdf_skipped(): + resp = _make_response(b"%PDF-1.4", content_type="application/pdf") + + with patch("mindmark.fetcher.urlopen", return_value=resp): + result = fetch_page("https://example.com/doc.pdf") + + assert not result.ok + assert "non-HTML" in result.error + + +def test_fetch_image_skipped(): + resp = _make_response(b"\x89PNG\r\n", content_type="image/png") + + with patch("mindmark.fetcher.urlopen", return_value=resp): + result = fetch_page("https://example.com/img.png") + + assert not result.ok + + +def test_fetch_json_skipped(): + resp = _make_response(b'{"key": "val"}', content_type="application/json") + + with patch("mindmark.fetcher.urlopen", return_value=resp): + result = fetch_page("https://example.com/api") + + assert not result.ok + + +# --------------------------------------------------------------------------- +# fetch_page โ€” size cap +# --------------------------------------------------------------------------- + +def test_fetch_caps_body_size(): + # 600 KB body โ€” only first 512 KB should be kept + body = b"A" * (600 * 1024) + resp = _make_response(body) + + with patch("mindmark.fetcher.urlopen", return_value=resp): + result = fetch_page("https://example.com/big", max_bytes=512 * 1024) + + assert result.ok + assert len(result.html.encode()) <= 512 * 1024 + 100 # 
small tolerance for encoding + + +# --------------------------------------------------------------------------- +# _looks_like_html helper +# --------------------------------------------------------------------------- + +@pytest.mark.parametrize("ct,expected", [ + ("text/html; charset=utf-8", True), + ("text/html", True), + ("application/xhtml+xml", True), + ("application/pdf", False), + ("image/png", False), + ("application/json", False), + ("text/plain", False), +]) +def test_looks_like_html(ct, expected): + assert _looks_like_html(ct) is expected + + +# --------------------------------------------------------------------------- +# extract_text โ€” basic extraction +# --------------------------------------------------------------------------- + +def test_extract_basic_paragraph(): + html = "

Hello world

" + assert "Hello world" in extract_text(html) + + +def test_extract_heading_and_paragraph(): + html = "

Title

Body text here.

" + text = extract_text(html) + assert "Title" in text + assert "Body text here." in text + + +def test_extract_multiple_paragraphs_joined(): + html = "

First.

Second.

Third.

" + text = extract_text(html) + assert "First." in text + assert "Second." in text + assert "Third." in text + + +# --------------------------------------------------------------------------- +# extract_text โ€” boilerplate stripping +# --------------------------------------------------------------------------- + +def test_extract_strips_script(): + html = "

Good

Also good

" + text = extract_text(html) + assert "Good" in text + assert "alert" not in text + + +def test_extract_strips_style(): + html = "

Content

" + text = extract_text(html) + assert "Content" in text + assert "color" not in text + + +def test_extract_strips_nav(): + html = "

Article text

" + text = extract_text(html) + assert "Article text" in text + assert "Home" not in text + assert "About" not in text + + +def test_extract_strips_header(): + html = "

Site Logo

Main content

" + text = extract_text(html) + assert "Main content" in text + assert "Site Logo" not in text + + +def test_extract_strips_footer(): + html = "

Body

" + text = extract_text(html) + assert "Body" in text + assert "Copyright" not in text + + +def test_extract_strips_nested_skip_tags(): + html = "

Real content

" + text = extract_text(html) + assert "Real content" in text + assert "Nested nav link" not in text + + +def test_extract_strips_head_section(): + html = "Page Title

Body content

" + text = extract_text(html) + assert "Body content" in text + assert "Page Title" not in text + + +# --------------------------------------------------------------------------- +# extract_text — whitespace normalisation +# --------------------------------------------------------------------------- + +def test_extract_normalises_whitespace(): + html = "

Too many spaces

" + text = extract_text(html) + assert " " not in text + assert "Too many spaces" in text + + +def test_extract_collapses_newlines(): + html = "

Line\n\n\none

" + text = extract_text(html) + assert "\n\n" not in text + + +# --------------------------------------------------------------------------- +# extract_text — edge cases +# --------------------------------------------------------------------------- + +def test_extract_empty_string(): + assert extract_text("") == "" + + +def test_extract_whitespace_only(): + assert extract_text(" \n\t ") == "" + + +def test_extract_no_visible_text(): + html = "" + # All text is inside nav, so result should be empty or very short + text = extract_text(html) + assert "x" not in text + + +def test_extract_html_entities_decoded(): + html = "

Caf&eacute; &amp; Boulangerie &lt;Paris&gt;

" + text = extract_text(html) + assert "Café" in text + assert "&eacute;" not in text + assert "&amp;" not in text + + +def test_extract_truncates_to_max_chars(): + html = "

" + "word " * 5000 + "

" + text = extract_text(html, max_chars=100) + assert len(text) <= 100 + + +def test_extract_real_world_structure(): + html = """ + + My Blog + +
+
+
+

Understanding Python Async

+

Async programming in Python uses the asyncio module.

+

It allows concurrent IO-bound tasks without threads.

+
+
+ + + + + """ + text = extract_text(html) + assert "Understanding Python Async" in text + assert "asyncio" in text + assert "concurrent IO-bound" in text + # boilerplate + assert "Home" not in text + assert "Related" not in text + assert "© 2026" not in text + + +# --------------------------------------------------------------------------- +# text_content_hash +# --------------------------------------------------------------------------- + +def test_hash_is_deterministic(): + assert text_content_hash("hello") == text_content_hash("hello") + + +def test_hash_changes_on_different_input(): + assert text_content_hash("hello") != text_content_hash("world") + + +def test_hash_is_16_chars(): + h = text_content_hash("some text") + assert len(h) == 16 + + +def test_hash_empty_string(): + h = text_content_hash("") + assert len(h) == 16 diff --git a/tests/test_incremental_sync.py b/tests/test_incremental_sync.py index 41e8f36..5e826f6 100644 --- a/tests/test_incremental_sync.py +++ b/tests/test_incremental_sync.py @@ -181,6 +181,51 @@ def test_schema_migration_on_old_db(tmp_path): "SELECT name FROM sqlite_master WHERE type='table'" )} assert "bookmark_sources" in tables + assert "bookmark_enrichment" in tables + + cur.execute("SELECT value FROM meta WHERE key='schema_version'") + assert cur.fetchone()[0] == "3" + finally: + idx.close() + + +def test_schema_v2_db_migrates_to_v3(tmp_path): + """Ensure opening a v2 database adds enrichment table and bumps version.""" + db_path = tmp_path / "v2.db" + con = sqlite3.connect(db_path) + con.executescript(""" + CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL); + CREATE TABLE bookmarks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + folder_path TEXT NOT NULL, + domain TEXT NOT NULL, + add_date INTEGER NOT NULL, + icon TEXT, + embedding BLOB NOT NULL, + dim INTEGER NOT NULL, + content_hash TEXT NOT NULL DEFAULT '' + ); + CREATE TABLE bookmark_sources ( + url TEXT NOT NULL, 
source TEXT NOT NULL, + content_hash TEXT NOT NULL DEFAULT '', + PRIMARY KEY (url, source) + ); + INSERT INTO meta(key, value) VALUES ('schema_version', '2'); + """) + con.close() + + idx = Index(db_path=db_path) + try: + cur = idx.con.cursor() + tables = {r[0] for r in cur.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + )} + assert "bookmark_enrichment" in tables + cur.execute("SELECT value FROM meta WHERE key='schema_version'") + assert cur.fetchone()[0] == "3" finally: idx.close() @@ -214,6 +259,22 @@ def test_rebuild_populates_bookmark_sources(idx): assert rows[1] == ("https://b.com", "html") +def test_rebuild_marks_enrichment_pending(idx): + bms = [ + _make_bookmark("https://a.com", "A"), + _make_bookmark("https://b.com", "B"), + ] + idx.rebuild(bms) + + cur = idx.con.cursor() + cur.execute("SELECT url, status FROM bookmark_enrichment ORDER BY url") + rows = cur.fetchall() + assert rows == [ + ("https://a.com", "pending"), + ("https://b.com", "pending"), + ] + + def test_rebuild_clears_previous_data(idx): """rebuild() should clear old bookmarks and sources before inserting.""" idx.rebuild([_make_bookmark("https://old.com", "Old")]) @@ -226,6 +287,8 @@ def test_rebuild_clears_previous_data(idx): assert cur.fetchone()[0] == "https://new.com" cur.execute("SELECT COUNT(*) FROM bookmark_sources") assert cur.fetchone()[0] == 1 + cur.execute("SELECT COUNT(*) FROM bookmark_enrichment") + assert cur.fetchone()[0] == 1 def test_rebuild_empty_list(idx): @@ -320,6 +383,44 @@ def test_search_k_limit(idx): assert len(results) == 5 +def test_sync_adds_enrichment_pending_for_changed_urls(idx): + bms = [ + _make_bookmark("https://a.com", "A"), + _make_bookmark("https://b.com", "B"), + ] + idx.sync(bms, source="test") + + cur = idx.con.cursor() + cur.execute("SELECT url, status FROM bookmark_enrichment ORDER BY url") + rows = cur.fetchall() + assert rows == [ + ("https://a.com", "pending"), + ("https://b.com", "pending"), + ] + + +def 
test_sync_removes_enrichment_when_bookmark_orphaned(idx): + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="chrome:Default") + idx.sync([], source="chrome:Default") + + cur = idx.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmark_enrichment WHERE url = ?", ("https://a.com",)) + assert cur.fetchone()[0] == 0 + + +def test_remove_urls_also_clears_enrichment(idx): + bms = [_make_bookmark("https://a.com", "A")] + idx.rebuild(bms) + + removed = idx.remove_urls(["https://a.com"]) + assert removed == 1 + + cur = idx.con.cursor() + cur.execute("SELECT COUNT(*) FROM bookmark_enrichment WHERE url = ?", ("https://a.com",)) + assert cur.fetchone()[0] == 0 + + # ---- _remove_source() tests ---- def test_remove_source_cleans_orphans(idx): @@ -337,3 +438,331 @@ def test_remove_source_preserves_other_sources(idx): idx._remove_source("chrome:Default") assert not idx.is_empty() # firefox still references it + + +# ---- Phase 4: Search Fusion with Summary Blending ---- + +class TestSearchFusion: + def test_search_with_excerpt(self, idx): + """include_excerpt flag adds relevant_excerpt to results.""" + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="test") + + # Manually add a complete enrichment + summary_text = "This is a test summary of the page content." + test_vec = np.ones(4, dtype=np.float32) / 2.0 + idx.save_enrichment( + url="https://a.com", + summary_text=summary_text, + summary_embedding=test_vec, + model_name="test-model", + content_hash="testhash", + http_status=200, + fetched_at=123, + summarized_at=124, + ) + + results = idx.search("a", include_excerpt=True) + assert len(results) >= 1 + assert "relevant_excerpt" in results[0] + + def test_search_without_excerpt_flag_omits_it(self, idx): + """include_excerpt=False does not add relevant_excerpt.""" + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="test") + + summary_text = "This is a test summary." 
+ test_vec = np.ones(4, dtype=np.float32) / 2.0 + idx.save_enrichment( + url="https://a.com", + summary_text=summary_text, + summary_embedding=test_vec, + model_name="test-model", + content_hash="testhash", + http_status=200, + fetched_at=123, + summarized_at=124, + ) + + results = idx.search("a", include_excerpt=False) + assert len(results) >= 1 + assert "relevant_excerpt" not in results[0] + + def test_search_with_excerpt_generates_sentence(self, idx): + """Excerpt extraction finds most relevant sentence.""" + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="test") + + # Long summary + summary_text = "x" * 200 + test_vec = np.ones(4, dtype=np.float32) / 2.0 + idx.save_enrichment( + url="https://a.com", + summary_text=summary_text, + summary_embedding=test_vec, + model_name="test-model", + content_hash="testhash", + http_status=200, + fetched_at=123, + summarized_at=124, + ) + + results = idx.search("a", include_excerpt=True) + assert len(results) >= 1 + # Should have relevant_excerpt (from sentence extraction) + assert "relevant_excerpt" in results[0] + + def test_blended_score_computation(self, idx): + """Verify blended score = 0.65*base + 0.35*summary.""" + bms = [_make_bookmark("https://example.com", "Example")] + idx.sync(bms, source="test") + + # Add a summary embedding + summary_vec = np.array([0.5, 0.5, 0.5, 0.5], dtype=np.float32) + summary_vec /= np.linalg.norm(summary_vec) + idx.save_enrichment( + url="https://example.com", + summary_text="test summary", + summary_embedding=summary_vec, + model_name="test-model", + content_hash="testhash", + http_status=200, + fetched_at=123, + summarized_at=124, + ) + + # Search and get score + results = idx.search("example", k=1) + assert len(results) == 1 + blended = results[0]["score"] + assert -1 <= blended <= 1 + + def test_search_respects_domain_filter_with_summaries(self, idx): + """Domain filter works alongside summary blending.""" + bms = [ + _make_bookmark("https://a.com", "A"), + 
_make_bookmark("https://b.com", "B"), + ] + idx.sync(bms, source="test") + + # Add summaries to both + test_vec = np.ones(4, dtype=np.float32) / 2.0 + for url in ["https://a.com", "https://b.com"]: + idx.save_enrichment( + url=url, + summary_text="summary", + summary_embedding=test_vec, + model_name="test-model", + content_hash="hash", + http_status=200, + fetched_at=123, + summarized_at=124, + ) + + # Filter by domain + results = idx.search("", domain="b.com") + assert all("b.com" in r["domain"] for r in results) + + def test_search_respects_folder_filter_with_summaries(self, idx): + """Folder filter works alongside summary blending.""" + bms = [ + _make_bookmark("https://a.com", "A", folder="Folder1"), + _make_bookmark("https://b.com", "B", folder="Folder2"), + ] + idx.sync(bms, source="test") + + # Add summaries + test_vec = np.ones(4, dtype=np.float32) / 2.0 + for url in ["https://a.com", "https://b.com"]: + idx.save_enrichment( + url=url, + summary_text="summary", + summary_embedding=test_vec, + model_name="test-model", + content_hash="hash", + http_status=200, + fetched_at=123, + summarized_at=124, + ) + + # Filter by folder + results = idx.search("", folder="Folder1") + assert len(results) >= 1 + assert "Folder1" in results[0]["folder_path"] + + def test_failed_enrichment_not_used_in_blend(self, idx): + """Failed enrichment rows are skipped (only 'complete' used).""" + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="test") + + # Mark as failed + idx.fail_enrichment("https://a.com", error="test error") + + # Search should still work, using only base embedding + results = idx.search("a") + assert len(results) >= 1 + # No excerpt should be included + assert "relevant_excerpt" not in results[0] + + def test_search_with_relevant_excerpt(self, idx): + """include_excerpt with relevant_excerpt shows most relevant sentence.""" + bms = [_make_bookmark("https://a.com", "A")] + idx.sync(bms, source="test") + + summary_text = "First sentence here. 
Second sentence with details. Third sentence." + test_vec = np.ones(4, dtype=np.float32) / 2.0 + idx.save_enrichment( + url="https://a.com", + summary_text=summary_text, + summary_embedding=test_vec, + model_name="test-model", + content_hash="testhash", + http_status=200, + fetched_at=123, + summarized_at=124, + ) + + results = idx.search("a", include_excerpt=True) + assert len(results) >= 1 + # Should have relevant_excerpt with one of the sentences + assert "relevant_excerpt" in results[0] + excerpt = results[0]["relevant_excerpt"] + assert excerpt in [ + "First sentence here.", + "Second sentence with details.", + "Third sentence.", + ] + + +# ---- Phase 5: UX Improvements (Excerpt Extraction) ---- + +from mindmark.index import _find_relevant_excerpt + + +class TestRelevantExcerpt: + def test_empty_summary_returns_empty(self): + embedder_mock = MagicMock() + query_vec = np.ones(4, dtype=np.float32) + result = _find_relevant_excerpt(query_vec, "", embedder_mock) + assert result == "" + + def test_whitespace_only_returns_empty(self): + embedder_mock = MagicMock() + query_vec = np.ones(4, dtype=np.float32) + result = _find_relevant_excerpt(query_vec, " \n\t ", embedder_mock) + assert result == "" + + def test_single_sentence_returned_as_is(self): + embedder_mock = MagicMock() + query_vec = np.ones(4, dtype=np.float32) + text = "This is a single sentence." + result = _find_relevant_excerpt(query_vec, text, embedder_mock) + assert result == text + + def test_long_single_sentence_truncated(self): + embedder_mock = MagicMock() + query_vec = np.ones(4, dtype=np.float32) + text = "x" * 300 + "." 
+ result = _find_relevant_excerpt(query_vec, text, embedder_mock) + assert len(result) <= 200 + + def test_multiple_sentences_picks_best_match(self): + """With multiple sentences, should return the best-matching one.""" + embedder_mock = MagicMock() + query_vec = np.array([1.0, 0.0, 0.0, 0.0], dtype=np.float32) + + def fake_embed(texts): + dim = 4 + vecs = np.zeros((len(texts), dim), dtype=np.float32) + for i, text in enumerate(texts): + seed = sum(ord(c) for c in text) % 100 + rng = np.random.RandomState(seed) + v = rng.randn(dim).astype(np.float32) + v = v / (np.linalg.norm(v) + 1e-8) + vecs[i] = v + return vecs + + embedder_mock.embed.side_effect = fake_embed + + text = "First sentence here. Second sentence there. Third sentence too." + result = _find_relevant_excerpt(query_vec, text, embedder_mock) + + # Should be one of the sentences + assert result in [ + "First sentence here.", + "Second sentence there.", + "Third sentence too.", + ] + + def test_no_sentence_markers_returns_first_150(self): + """If no sentence markers and text is too short, return as-is (up to 200).""" + embedder_mock = MagicMock() + query_vec = np.ones(4, dtype=np.float32) + text = "x" * 80 # No punctuation, shorter than 200 + result = _find_relevant_excerpt(query_vec, text, embedder_mock) + # Unseparated text is treated as single sentence, returned up to 200 chars + assert result == text + assert len(result) == 80 + + def test_long_unseparated_text_truncated_to_200(self): + """Long text with no sentence markers gets treated as one sentence.""" + embedder_mock = MagicMock() + query_vec = np.ones(4, dtype=np.float32) + text = "x" * 300 # No punctuation, longer than 200 + result = _find_relevant_excerpt(query_vec, text, embedder_mock) + # Unseparated text is single "sentence", truncated to 200 chars + assert result == "x" * 200 + assert len(result) == 200 + + def test_short_fragments_ignored(self): + """Sentence fragments < 3 chars should be skipped.""" + embedder_mock = MagicMock() + 
query_vec = np.ones(4, dtype=np.float32) + + def fake_embed(texts): + dim = 4 + vecs = np.zeros((len(texts), dim), dtype=np.float32) + for i in range(len(texts)): + vecs[i] = np.ones(dim) + return vecs + + embedder_mock.embed.side_effect = fake_embed + + text = "a. b. This is a real sentence." + result = _find_relevant_excerpt(query_vec, text, embedder_mock) + # Should return the real sentence, not the fragments + assert "This is a real sentence" in result + + def test_embedding_error_fallback(self): + """If embedding fails, fallback to first sentence.""" + + class FailingEmbedder: + def embed(self, texts): + raise RuntimeError("Embedding failed") + + query_vec = np.ones(4, dtype=np.float32) + text = "First sentence. Second sentence." + result = _find_relevant_excerpt(query_vec, text, FailingEmbedder()) + # Should return first sentence + assert result == "First sentence." + + def test_long_excerpt_truncated_with_ellipsis(self): + """Excerpts longer than 200 chars should be truncated with ...""" + embedder_mock = MagicMock() + query_vec = np.ones(4, dtype=np.float32) + + def fake_embed(texts): + dim = 4 + vecs = np.zeros((len(texts), dim), dtype=np.float32) + # Make first sentence most similar + vecs[0] = query_vec + return vecs + + embedder_mock.embed.side_effect = fake_embed + + long_sentence = "A" * 250 + "." + text = long_sentence + " Short." + result = _find_relevant_excerpt(query_vec, text, embedder_mock) + # Should end with ... + assert result.endswith("...") + assert len(result) <= 200