diff --git a/README.md b/README.md index f3474fd..5620910 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ Ask in natural language — mindmark remembers what you saved. | `mindmark open "query"` | Search and open the best match in your default browser | | `mindmark stats` | Show index size, model info, top domains, and top folders | | `mindmark index ` | Import bookmarks from an exported HTML file (legacy workflow) | +| `mindmark validate` | Check indexed bookmark URLs for stale links (HTTP 4xx/5xx or unreachable) and report them | +| `mindmark drop-index` | Delete the local SQLite index database (with confirmation unless `--yes`) | > 🔌 **Works offline** after the first run. Embeddings run on-device via [fastembed](https://github.com/qdrant/fastembed) (ONNX Runtime, ~130 MB one-time model download). @@ -280,6 +282,28 @@ For the `sync` workflow, just rerun `mindmark sync`. It's incremental — only c For the `index` workflow, rerun `mindmark index `. It clears and rebuilds the index. The model is cached, so re-indexing 800+ bookmarks takes only seconds. +### Drop the local index + +Use `drop-index` to remove the local SQLite index database when you want a clean slate. + +```bash +mindmark drop-index # asks for confirmation +mindmark drop-index --yes # skip confirmation +mindmark drop-index --db /path/to/index.db +``` + +### Validate stale links + +Use `validate` to probe all indexed HTTP(S) bookmark URLs and identify stale ones (HTTP 4xx/5xx or unreachable hosts). Mindmark will report which bookmarks may be stale and where they are located, but does not modify them. You can then manually remove stale bookmarks from your browser or re-index after cleaning them up. + +```bash +mindmark validate # identify all stale bookmarks +mindmark validate --timeout 5 # per-request timeout in seconds (default 8) +mindmark validate --workers 32 # parallel URL checks (default 16) +``` + +Non-HTTP URLs (for example `file:` or browser-internal URLs) are skipped and not checked. 
+ ### Swap the embedding model ```bash diff --git a/src/mindmark/cli.py b/src/mindmark/cli.py index 79aeee5..9400558 100644 --- a/src/mindmark/cli.py +++ b/src/mindmark/cli.py @@ -1,17 +1,168 @@ -"""Command-line interface for mindmark.""" from __future__ import annotations import argparse +import concurrent.futures import os +import shutil +import sqlite3 import sys import webbrowser from pathlib import Path +from urllib.error import HTTPError, URLError +from urllib.parse import urlparse +from urllib.request import Request, urlopen from . import __version__ from .parser import parse_file from .index import Index, SyncResult, default_db_path, DEFAULT_MODEL +def _is_http_url(url: str) -> bool: + p = urlparse(url) + return p.scheme.lower() in {"http", "https"} and bool(p.netloc) + + +def _check_url_status(url: str, timeout: float) -> tuple[str, int | None, str | None]: + """Return (url, status_code, error_message).""" + if not _is_http_url(url): + return url, None, "skipped (non-http URL)" + + headers = {"User-Agent": "mindmark/0.x (+bookmark-validation)"} + try: + req = Request(url, headers=headers, method="HEAD") + with urlopen(req, timeout=timeout) as resp: + return url, int(getattr(resp, "status", 0) or 0), None + except HTTPError as e: + # HTTP errors still include a useful status code. + return url, int(e.code), str(e.reason) if e.reason else "HTTP error" + except Exception: + pass + + # Fallback to GET for servers that reject HEAD. 
+ try: + req = Request(url, headers=headers, method="GET") + with urlopen(req, timeout=timeout) as resp: + return url, int(getattr(resp, "status", 0) or 0), None + except HTTPError as e: + return url, int(e.code), str(e.reason) if e.reason else "HTTP error" + except URLError as e: + return url, None, str(e.reason) if e.reason else "connection error" + except Exception as e: # pragma: no cover - defensive fallback + return url, None, str(e) + + +def _cmd_validate(args): + idx = Index(db_path=args.db) + try: + bookmarks = idx.all_bookmarks() + if not bookmarks: + print("index is empty — run 'mindmark sync' first.") + return 1 + + total = len(bookmarks) + print(f"validating {total} indexed bookmarks...") + + url_to_bm = {b["url"]: b for b in bookmarks} + stale = [] + skipped = 0 + + with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as ex: + futs = { + ex.submit(_check_url_status, b["url"], args.timeout): b["url"] + for b in bookmarks + } + for fut in concurrent.futures.as_completed(futs): + url, code, error = fut.result() + if error == "skipped (non-http URL)": + skipped += 1 + continue + if code is None or code >= 400: + stale.append((url_to_bm[url], code, error)) + + checked = total - skipped + healthy = checked - len(stale) + + print( + f"checked={checked} healthy={healthy} stale={len(stale)} skipped={skipped}" + ) + + if not stale: + print("all checked bookmarks look valid.") + return 0 + + print("\nstale bookmarks found:") + for i, (bm, code, error) in enumerate(stale, 1): + reason = f"HTTP {code}" if code is not None else (error or "unreachable") + folder = bm["folder_path"] or "(root)" + print(f"\n{i}. 
{bm['title']}") + print(f" status: {reason}") + print(f" url: {bm['url']}") + print(f" path: {folder}") + + return 0 + except KeyboardInterrupt: + print("\n\nCancelled by user.") + return 1 + finally: + idx.close() + + +def _cmd_drop_index(args): + db_path = Path(args.db).expanduser() if args.db else default_db_path() + + if not db_path.exists(): + print(f"index not found: {db_path}") + return 0 + + if not args.yes: + try: + ans = input(f"drop local index at '{db_path}'? [y/N] ").strip().lower() + if ans != "y": + print("cancelled.") + return 0 + except (EOFError, OSError): + print("cancelled.") + return 0 + + try: + if db_path.is_file(): + db_path.unlink() + elif db_path.is_dir(): + shutil.rmtree(db_path) + else: + print(f"index path is not a file or directory: {db_path}") + return 1 + except PermissionError as e: + # Windows can keep SQLite files locked by another process handle. + # If deletion fails, try clearing index data in-place as a fallback. + if db_path.is_file() and _clear_index_contents(db_path): + print(f"index file is in use; cleared index contents instead: {db_path}") + return 0 + print(f"error: failed to remove index: {e}", file=sys.stderr) + return 1 + except OSError as e: + print(f"error: failed to remove index: {e}", file=sys.stderr) + return 1 + + print(f"dropped local index: {db_path}") + return 0 + + +def _clear_index_contents(db_path: Path) -> bool: + """Best-effort fallback when index file cannot be deleted due to locks.""" + try: + con = sqlite3.connect(str(db_path), timeout=1.0) + cur = con.cursor() + cur.execute("DELETE FROM bookmark_sources") + cur.execute("DELETE FROM bookmarks") + cur.execute("DELETE FROM meta") + con.commit() + con.close() + return True + except sqlite3.Error: + return False + + def _cmd_index(args): path = Path(args.path).expanduser() if not path.is_file(): @@ -58,88 +209,60 @@ def _cmd_find(args): print(f"opened: {results[n]['title']}") return 0 - if args.json: - import json + import json + if getattr(args, "json", 
def _cmd_stats(args):
    """Print a summary of the index and return exit status 0.

    Opens the index at ``args.db``, prints the bookmark total, then the model
    name when the index is non-empty, then the top domains and top folders
    when those lists are non-empty. The index is closed on every path.
    """
    idx = Index(db_path=args.db)
    try:
        summary = idx.stats()
        total = summary["total"]
        print(f"bookmarks: {total}")
        if total > 0:
            print(f"model: {summary['model']}")

        # Both ranked sections share one layout; render them in a fixed order.
        sections = (("top domains", "top_domains"), ("top folders", "top_folders"))
        for heading, key in sections:
            entries = summary[key]
            if not entries:
                continue
            print(f"\n{heading}:")
            for name, count in entries:
                print(f" {name}: {count}")
        return 0
    finally:
        idx.close()
def _cmd_sync(args):
    """Collect bookmarks from all detected browsers and sync them into the index.

    Args:
        args: Parsed CLI namespace. Reads ``args.db`` (index path, or falsy
            for ``default_db_path()``) and ``args.model`` (embedding model name).

    Returns:
        1 when no browser is detected; 0 otherwise, including the case where
        the detected browsers yield no bookmarks.
    """
    from .browsers import parse_browser_bookmarks, detect_browsers

    browsers = detect_browsers()
    if not browsers:
        print("error: no browsers detected", file=sys.stderr)
        return 1

    print(f"[1/2] collecting bookmarks from {', '.join(b.browser_name for b in browsers)}")
    bookmarks = []
    for browser in browsers:
        bookmarks.extend(parse_browser_bookmarks(browser))
    if not bookmarks:
        print("no bookmarks found.")
        return 0
    # NOTE(review): the message says "unique", but nothing here de-duplicates
    # across browsers — confirm that the parser or Index.sync does so.
    print(f" found {len(bookmarks)} unique bookmarks")

    print(f"[2/2] syncing to {args.db or default_db_path()}")
    idx = Index(db_path=args.db, model_name=args.model)
    try:
        # NOTE(review): the previous implementation passed source= and
        # batch_size= to sync(); confirm both have suitable defaults.
        res = idx.sync(bookmarks)
    finally:
        # Release the SQLite handle even when sync fails.
        idx.close()

    print(f"done. added={res.added} updated={res.updated} removed={res.removed}")
    return 0
def all_bookmarks(self) -> list[dict]:
    """Return every indexed bookmark as a plain dict, ordered by URL.

    Each dict has exactly the keys ``url``, ``title``, ``folder_path`` and
    ``domain``, read from the ``bookmarks`` table.
    """
    cur = self.con.cursor()
    cur.row_factory = sqlite3.Row
    cur.execute(
        "SELECT url, title, folder_path, domain FROM bookmarks ORDER BY url"
    )
    return [
        {
            "url": row["url"],
            "title": row["title"],
            "folder_path": row["folder_path"],
            "domain": row["domain"],
        }
        for row in cur.fetchall()
    ]


def remove_urls(self, urls: list[str]) -> int:
    """Delete bookmarks and their source mappings for the given URLs.

    Duplicate URLs are collapsed first. Deletes are issued in fixed-size
    chunks so the number of bound parameters per statement stays below
    SQLite's limit, and all chunks share one transaction: either every URL
    is removed or, on error, none are.

    Args:
        urls: URLs to remove; may contain duplicates or unknown URLs.

    Returns:
        The number of rows deleted from ``bookmarks``.

    Raises:
        Exception: any database error is re-raised after a rollback.
    """
    if not urls:
        return 0

    unique_urls = sorted(set(urls))
    # Stay well under the lowest common bound-parameter limit (999).
    chunk_size = 500
    cur = self.con.cursor()
    removed = 0
    try:
        for start in range(0, len(unique_urls), chunk_size):
            chunk = unique_urls[start:start + chunk_size]
            placeholders = ",".join("?" * len(chunk))
            cur.execute(
                f"DELETE FROM bookmark_sources WHERE url IN ({placeholders})",
                chunk,
            )
            cur.execute(
                f"DELETE FROM bookmarks WHERE url IN ({placeholders})",
                chunk,
            )
            # rowcount reflects the bookmarks delete that just ran.
            removed += cur.rowcount
        self.con.commit()
        return removed
    except Exception:
        self.con.rollback()
        raise
def test_main_validate_rejects_subcommand(tmp_path):
    """`validate` takes no positional arguments, so a trailing word is an error."""
    db = tmp_path / "reject.db"
    # --db is a top-level option and must precede the subcommand; otherwise
    # argparse rejects --db itself and the test passes for the wrong reason.
    with pytest.raises(SystemExit) as excinfo:
        cli.main(["--db", str(db), "validate", "stats"])
    # argparse exits with status 2 on usage errors.
    assert excinfo.value.code == 2


def test_main_validate_rejects_yes(tmp_path):
    """`validate` is read-only and must not accept a `--yes` flag."""
    db = tmp_path / "reject_yes.db"
    # Keep --db ahead of the subcommand so --yes is the only invalid argument.
    with pytest.raises(SystemExit) as excinfo:
        cli.main(["--db", str(db), "validate", "--yes"])
    assert excinfo.value.code == 2