Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions docs/user-scripts.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Data Source Indexing Commands

LEANN provides `index-*` CLI commands for indexing common personal data sources. Each command reads from a specific data source and builds a searchable LEANN index.

## Available Commands

| Command | Description | Platform |
|---------|-------------|----------|
| `leann index-browser [chrome\|brave]` | Browser history | macOS |
| `leann index-email` | Apple Mail | macOS |
| `leann index-calendar` | Apple Calendar | macOS |
| `leann index-imessage` | iMessage conversations | macOS |
| `leann index-wechat --export-dir <path>` | WeChat chat history | Any |
| `leann index-chatgpt --export-path <path>` | ChatGPT export | Any |
| `leann index-claude --export-path <path>` | Claude export | Any |

## Common Options

All `index-*` commands accept these options:

```bash
--index-name NAME # Custom index name (each command has a sensible default)
--embedding-model MODEL # Embedding model (default: facebook/contriever)
--embedding-mode MODE # Backend: sentence-transformers (default), openai, mlx, ollama
--embedding-host URL # Ollama embedding host
--embedding-api-base URL # OpenAI-compatible embedding base URL
--embedding-api-key KEY # Embedding API key
--max-count N # Max items to index (default: 1000)
--no-recompute # Store full embeddings instead of using recomputation
```

## Examples

### Index browser history (Chrome or Brave)

```bash
leann index-browser chrome
leann index-browser brave --index-name brave_history
```

### Index Apple Mail

```bash
leann index-email
```

### Index iMessage

```bash
leann index-imessage
```

### Index Apple Calendar

```bash
leann index-calendar
```

### Index ChatGPT or Claude exports

```bash
# ChatGPT: export from https://chat.openai.com → Settings → Export data
leann index-chatgpt --export-path ~/Downloads/chatgpt-export.zip

# Claude: export from https://claude.ai → Settings → Export data
leann index-claude --export-path ~/Downloads/claude-export.json
```

### Index WeChat

```bash
# Requires exported JSON files from wechat-exporter
leann index-wechat --export-dir ~/wechat-export/
```

## Daily Automation

You can schedule indexing with cron for automatic daily updates:

```bash
# Edit crontab
crontab -e

# Add entries (run daily, staggered from 2:00 AM so the jobs do not overlap):
0 2 * * * cd /path/to/LEANN && leann index-browser chrome
5 2 * * * cd /path/to/LEANN && leann index-email
10 2 * * * cd /path/to/LEANN && leann index-imessage
```

## Searching Indexed Data

After indexing, search with the standard `leann search` command:

```bash
leann search browser_history "github pull request review"
leann search email "meeting notes from last week"
leann search imessage "dinner plans"
```
274 changes: 274 additions & 0 deletions packages/leann-core/src/leann/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@ def create_parser(self) -> argparse.ArgumentParser:
leann search my-docs "query" # Search in my-docs index
leann ask my-docs "question" # Ask my-docs index
leann react my-docs "complex question" # Use ReAct agent for multiturn retrieval
leann index-browser chrome # Index Chrome browser history
leann index-email # Index Apple Mail
leann index-imessage # Index iMessage conversations
leann index-chatgpt --export-path ~/chatgpt-export.zip # Index ChatGPT export
leann list # List all stored indexes
leann remove my-docs # Remove an index (local first, then global)
""",
Expand Down Expand Up @@ -590,6 +594,85 @@ def create_parser(self) -> argparse.ArgumentParser:
help="API key for cloud LLM providers (OpenAI, Anthropic)",
)

# ── index-* commands: data source indexing ──────────────────────

def _add_index_args(p, default_name):
"""Add common embedding and index args to an index-* subparser."""
p.add_argument(
"--index-name",
type=str,
default=default_name,
help=f"Index name (default: {default_name})",
)
p.add_argument(
"--embedding-model", type=str, default="facebook/contriever", help="Embedding model"
)
p.add_argument(
"--embedding-mode",
type=str,
default="sentence-transformers",
choices=["sentence-transformers", "openai", "mlx", "ollama"],
help="Embedding backend",
)
p.add_argument("--embedding-host", type=str, default=None, help="Ollama embedding host")
p.add_argument(
"--embedding-api-base",
type=str,
default=None,
help="OpenAI-compatible embedding base URL",
)
p.add_argument("--embedding-api-key", type=str, default=None, help="Embedding API key")
p.add_argument(
"--max-count", type=int, default=1000, help="Max items to index (default: 1000)"
)
p.add_argument(
"--no-recompute",
action="store_true",
help="Disable embedding recomputation (stores full embeddings)",
)

idx_browser = subparsers.add_parser(
"index-browser", help="Index browser history (Chrome/Brave)"
)
_add_index_args(idx_browser, "browser_history")
idx_browser.add_argument(
"browser",
nargs="?",
default="chrome",
choices=["chrome", "brave"],
help="Browser to index (default: chrome)",
)

idx_email = subparsers.add_parser("index-email", help="Index Apple Mail")
_add_index_args(idx_email, "email")

idx_calendar = subparsers.add_parser("index-calendar", help="Index Apple Calendar events")
_add_index_args(idx_calendar, "calendar")

idx_imessage = subparsers.add_parser("index-imessage", help="Index iMessage conversations")
_add_index_args(idx_imessage, "imessage")

idx_wechat = subparsers.add_parser("index-wechat", help="Index WeChat chat history")
_add_index_args(idx_wechat, "wechat")
idx_wechat.add_argument(
"--export-dir", type=str, required=True, help="Path to WeChat JSON export directory"
)

idx_chatgpt = subparsers.add_parser("index-chatgpt", help="Index ChatGPT export")
_add_index_args(idx_chatgpt, "chatgpt")
idx_chatgpt.add_argument(
"--export-path",
type=str,
required=True,
help="Path to ChatGPT export (chat.html or .zip)",
)

idx_claude = subparsers.add_parser("index-claude", help="Index Claude export")
_add_index_args(idx_claude, "claude")
idx_claude.add_argument(
"--export-path", type=str, required=True, help="Path to Claude export (.json or .zip)"
)

# List command
subparsers.add_parser("list", help="List all indexes")

Expand Down Expand Up @@ -2753,6 +2836,177 @@ def _ask_once(prompt: str) -> None:

_ask_once(query)

# ── index-* shared handler ──────────────────────────────────────

async def _build_index_from_documents(self, args, documents: list):
"""Shared builder for all index-* commands.

Accepts a list of llama-index Document objects (or dicts with 'text' and
'metadata' keys), builds a LEANN index with proper embedding and
recomputation settings.
"""
if not documents:
print("No documents loaded — nothing to index.")
return

index_name = args.index_name
index_path = self.get_index_path(index_name)
is_recompute = not getattr(args, "no_recompute", False)

print(f"Building index '{index_name}' with {len(documents)} documents...")
print(f" Embedding: {args.embedding_model} ({args.embedding_mode})")
print(f" Recompute: {is_recompute}")

for attr in (
"embedding_prompt_template",
"query_prompt_template",
"embedding_host",
"embedding_api_base",
"embedding_api_key",
):
if not hasattr(args, attr):
setattr(args, attr, None)

embedding_options = self._build_embedding_options(args) or None

builder = LeannBuilder(
backend_name="hnsw",
embedding_model=args.embedding_model,
embedding_mode=args.embedding_mode,
embedding_options=embedding_options,
is_recompute=is_recompute,
)

for doc in documents:
if hasattr(doc, "text"):
builder.add_text(
doc.text, metadata=doc.metadata if hasattr(doc, "metadata") else {}
)
elif isinstance(doc, dict):
builder.add_text(doc["text"], metadata=doc.get("metadata", {}))

builder.build_index(index_path)
self.register_project_dir()
print(f"Index '{index_name}' built at {index_path}")

async def index_browser(self, args):
    """Load Chrome or Brave history and build an index from it.

    Reads ``args.browser`` (falling back to ``"chrome"`` when absent) and
    ``args.max_count``; any browser value other than ``"brave"`` resolves to
    the Chrome profile directory.
    """
    from apps.history_data.history import ChromeHistoryReader

    chrome_dir = "~/Library/Application Support/Google/Chrome/Default"
    brave_dir = "~/Library/Application Support/BraveSoftware/Brave-Browser/Default"

    browser = getattr(args, "browser", "chrome")
    if browser == "brave":
        raw_profile = brave_dir
    else:
        raw_profile = chrome_dir
    profile = os.path.expanduser(raw_profile)

    history_docs = ChromeHistoryReader().load_data(
        chrome_profile_path=profile,
        max_count=args.max_count,
    )
    print(f"Loaded {len(history_docs)} {browser} history entries")
    await self._build_index_from_documents(args, history_docs)

async def index_email(self, args):
    """Index Apple Mail messages from every discovered Messages directory.

    Args:
        args: Parsed CLI namespace; ``max_count`` caps the total number of
            documents collected across all directories.
    """
    from apps.email_data.LEANN_email_reader import EmlxReader, find_all_messages_directories

    msg_dirs = find_all_messages_directories()
    if not msg_dirs:
        print("No Apple Mail Messages directories found.")
        return

    reader = EmlxReader()
    limit = args.max_count
    # Only a positive limit is treated as a cap to budget across directories.
    # Other values are forwarded unchanged, as before.
    # NOTE(review): how the reader interprets non-positive values is not
    # visible here — confirm.
    capped = limit is not None and limit > 0

    docs = []
    for msg_dir in msg_dirs:
        if capped:
            remaining = limit - len(docs)
            if remaining <= 0:
                break
        else:
            remaining = limit
        # Pass only the remaining budget so the cap applies to the whole run
        # instead of restarting for each directory.
        docs.extend(reader.load_data(str(msg_dir), max_count=remaining))

    print(f"Loaded {len(docs)} emails")
    await self._build_index_from_documents(args, docs)

async def index_calendar(self, args):
    """Index Apple Calendar events read from the local Calendar Cache database.

    The cache file is copied into a private temporary directory and queried
    from that copy. Each event with a non-empty summary becomes one document.

    Args:
        args: Parsed CLI namespace; ``max_count`` limits how many events are
            read, ordered by ``start_date`` descending.

    Returns:
        None. Returns early (after printing a message) when the cache file is
        missing or cannot be copied or queried.
    """
    import contextlib
    import shutil
    import sqlite3
    import tempfile

    calendar_cache = Path.home() / "Library/Calendars/Calendar Cache"
    if not calendar_cache.exists():
        print("Apple Calendar Cache not found.")
        return

    # Imported only once there is a cache to read.
    from llama_index.core import Document

    # 978307200 is the number of seconds from 1970-01-01 to 2001-01-01 (UTC);
    # adding it lets SQLite's 'unixepoch' modifier interpret the stored values.
    # NOTE(review): presumably start_date/end_date are seconds since 2001 —
    # confirm against the Calendar Cache schema.
    # Empty summaries are filtered in SQL so LIMIT counts only usable events.
    query = """
        SELECT summary, description, location,
               datetime(start_date + 978307200, 'unixepoch', 'localtime') as start,
               datetime(end_date + 978307200, 'unixepoch', 'localtime') as end_time
        FROM CI_EVENT
        WHERE summary IS NOT NULL AND summary != ''
        ORDER BY start_date DESC LIMIT ?
    """

    try:
        # A private temporary directory avoids the predictable shared /tmp
        # path and is removed automatically, even when the query fails.
        with tempfile.TemporaryDirectory(prefix="leann_calendar_") as tmp_dir:
            temp_db = os.path.join(tmp_dir, "calendar_cache_copy")
            shutil.copy2(calendar_cache, temp_db)
            # closing() guarantees the connection is released on every path.
            with contextlib.closing(sqlite3.connect(temp_db)) as conn:
                rows = conn.execute(query, (args.max_count,)).fetchall()
    except (OSError, sqlite3.Error) as e:
        print(f"Error reading Apple Calendar: {e}")
        return

    docs = []
    for summary, description, location, start, end_time in rows:
        if not summary:
            continue
        text = (
            f"Event: {summary}\nStart: {start}\nEnd: {end_time}\n"
            f"Location: {location or ''}\nDescription: {description or ''}"
        )
        docs.append(Document(text=text, metadata={"event": summary, "start": start}))

    await self._build_index_from_documents(args, docs)

async def index_imessage(self, args):
    """Index iMessage conversations.

    Args:
        args: Parsed CLI namespace; a positive ``max_count`` caps how many
            loaded conversations are indexed.
    """
    from apps.imessage_data.imessage_reader import IMessageReader

    reader = IMessageReader(concatenate_conversations=True)
    docs = reader.load_data()

    # Honour --max-count, which was previously ignored by this command. The cap
    # is applied after loading because the reader's own options are not visible
    # here. Only positive values are treated as a cap.
    limit = getattr(args, "max_count", None)
    if limit is not None and limit > 0 and len(docs) > limit:
        docs = docs[:limit]

    print(f"Loaded {len(docs)} iMessage conversations")
    await self._build_index_from_documents(args, docs)

async def index_wechat(self, args):
    """Index WeChat chat history from an exported JSON directory.

    Args:
        args: Parsed CLI namespace; reads ``export_dir`` and ``max_count``.
    """
    from apps.history_data.wechat_history import WeChatHistoryReader

    # Expand "~" here: the shell leaves it literal in forms such as
    # ``--export-dir=~/wechat-export`` or when the value is quoted.
    export_dir = os.path.expanduser(args.export_dir)

    reader = WeChatHistoryReader()
    docs = reader.load_data(
        input_dir=export_dir,
        max_count=args.max_count,
        concatenate_messages=True,
    )
    print(f"Loaded {len(docs)} WeChat conversations")
    await self._build_index_from_documents(args, docs)

async def index_chatgpt(self, args):
    """Index conversations from a ChatGPT data export.

    Args:
        args: Parsed CLI namespace; reads ``export_path`` and ``max_count``.
    """
    from apps.chatgpt_data.chatgpt_reader import ChatGPTReader

    # Expand "~" here: the shell leaves it literal in forms such as
    # ``--export-path=~/Downloads/export.zip`` or when the value is quoted.
    export_path = os.path.expanduser(args.export_path)

    reader = ChatGPTReader(concatenate_conversations=True)
    docs = reader.load_data(
        input_dir=export_path,
        max_count=args.max_count,
    )
    print(f"Loaded {len(docs)} ChatGPT conversations")
    await self._build_index_from_documents(args, docs)

async def index_claude(self, args):
    """Index conversations from a Claude data export.

    Args:
        args: Parsed CLI namespace; reads ``export_path`` and ``max_count``.
    """
    from apps.claude_data.claude_reader import ClaudeReader

    # Expand "~" here: the shell leaves it literal in forms such as
    # ``--export-path=~/Downloads/export.json`` or when the value is quoted.
    export_path = os.path.expanduser(args.export_path)

    reader = ClaudeReader(concatenate_conversations=True)
    docs = reader.load_data(
        input_dir=export_path,
        max_count=args.max_count,
    )
    print(f"Loaded {len(docs)} Claude conversations")
    await self._build_index_from_documents(args, docs)

async def react_agent(self, args):
"""Run ReAct agent for multiturn retrieval."""
index_name = args.index_name
Expand Down Expand Up @@ -2892,6 +3146,26 @@ async def run(self, args=None):
await self.react_agent(args)
elif args.command == "serve":
await self.serve_api(args)
elif args.command in (
"index-browser",
"index-email",
"index-calendar",
"index-imessage",
"index-wechat",
"index-chatgpt",
"index-claude",
):
handler = {
"index-browser": self.index_browser,
"index-email": self.index_email,
"index-calendar": self.index_calendar,
"index-imessage": self.index_imessage,
"index-wechat": self.index_wechat,
"index-chatgpt": self.index_chatgpt,
"index-claude": self.index_claude,
}[args.command]
with suppress_cpp_output(suppress):
await handler(args)
else:
parser.print_help()

Expand Down
Loading