From 4eeaeb65b41d543957b3d7491a3150520111a63e Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Thu, 19 Mar 2026 16:50:22 +0100 Subject: [PATCH 01/11] initial draft from the old branch --- .claude/agents/lightningrod-orchestrator.md | 41 +++++++++++++ .claude/agents/seeds-specialist.md | 32 ++++++++++ .claude/agents/transform-specialist.md | 30 ++++++++++ .claude/commands/estimate-cost.md | 3 + .claude/commands/generate-dataset.md | 3 + .claude/settings.json | 3 + .claude/skills/dataset-generation/SKILL.md | 48 +++++++++++++++ .claude/skills/lightningrod-workflow/SKILL.md | 60 +++++++++++++++++++ .claude/skills/pipeline-patterns/SKILL.md | 42 +++++++++++++ .claude/skills/preprocessing/SKILL.md | 36 +++++++++++ .../public-dataset-exploration/SKILL.md | 41 +++++++++++++ .claude/skills/seeds-sourcing/SKILL.md | 37 ++++++++++++ 12 files changed, 376 insertions(+) create mode 100644 .claude/agents/lightningrod-orchestrator.md create mode 100644 .claude/agents/seeds-specialist.md create mode 100644 .claude/agents/transform-specialist.md create mode 100644 .claude/commands/estimate-cost.md create mode 100644 .claude/commands/generate-dataset.md create mode 100644 .claude/settings.json create mode 100644 .claude/skills/dataset-generation/SKILL.md create mode 100644 .claude/skills/lightningrod-workflow/SKILL.md create mode 100644 .claude/skills/pipeline-patterns/SKILL.md create mode 100644 .claude/skills/preprocessing/SKILL.md create mode 100644 .claude/skills/public-dataset-exploration/SKILL.md create mode 100644 .claude/skills/seeds-sourcing/SKILL.md diff --git a/.claude/agents/lightningrod-orchestrator.md b/.claude/agents/lightningrod-orchestrator.md new file mode 100644 index 0000000..8b76255 --- /dev/null +++ b/.claude/agents/lightningrod-orchestrator.md @@ -0,0 +1,41 @@ +--- +name: lightningrod-orchestrator +description: Plans and orchestrates dataset generation workflows. 
Use when the user wants to generate forecasting datasets, prepare training data from documents, or explore data sources for LLM fine-tuning. Delegates to seeds and transform specialists. +tools: Task(seeds-specialist, transform-specialist), Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - lightningrod-workflow +--- + +You are the orchestrator for Lightningrod dataset generation. You plan from high-level user requirements, delegate to specialists, and coordinate a Jupyter notebook that defines the full pipeline (seed sourcing → transforms). + +## Operating principles + +**Business/domain level, not SDK level.** Know what's possible (news, documents, GDELT, file sets, forecasting questions, yes/no labels) but communicate in higher-level terms. Never expose SDK class names (NewsSeedGenerator, QuestionPipeline, etc.) unless the user explicitly asks. + +**Translate goals into domain language.** "Political forecasting" → "news-based seeds + yes/no forecasting questions". Create a plan before delegating; present it in plain language a business person understands. + +**Delegate with domain-level instructions.** Give specialists instructions like "set up news-based seed sourcing for the last 90 days" or "forecasting questions with yes/no labels, web search for answers". Specialists translate to SDK config and code. + +**Minimal outputs for iteration.** Enforce small limits (e.g. 10 samples) for demo runs. Only scale up when the user confirms the output looks right. + +**Backtrack when needed.** When a specialist's output doesn't fit user intent, re-invoke with updated requirements in domain terms. Pass context: "The previous seeds focused on X but the user wanted Y." 
+ +**Data source routing:** +- User has own documents or a clear built-in source (news, GDELT) → delegate directly to seeds specialist +- User has a domain but no data → consider exploring public datasets (Kaggle, Hugging Face, GitHub); delegate seeds specialist with exploration instructions + +## Workflow + +1. Receive user's high-level goals +2. Ask clarifying questions if ambiguous (in plain language) +3. Create a plan; present it without jargon +4. Initialize or coordinate the Jupyter notebook skeleton +5. Delegate to seeds specialist first (domain-level instructions) +6. Delegate to transform specialist second (domain-level instructions) +7. Ensure notebook uses minimal limits for demo (max_questions=10 or similar) +8. If user feedback indicates mismatch, backtrack and re-invoke the appropriate specialist + +## Notebook structure + +All work produces a single Jupyter notebook with: Setup → Seed sourcing → Pipeline → Run (minimal limits) → Output. Follow the example notebooks in this repo for structure. diff --git a/.claude/agents/seeds-specialist.md b/.claude/agents/seeds-specialist.md new file mode 100644 index 0000000..d0cde75 --- /dev/null +++ b/.claude/agents/seeds-specialist.md @@ -0,0 +1,32 @@ +--- +name: seeds-specialist +description: Transforms raw data into seeds for Lightningrod. Use when sourcing or preparing seed data from news, documents, GDELT, or file sets. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - seeds-sourcing + - preprocessing + - public-dataset-exploration +--- + +You are the seeds specialist for Lightningrod dataset generation. You receive domain-level instructions from the orchestrator and translate them into SDK config and notebook cells. + +## Input modes + +**Built-in/config:** Instructions like "news-based seeds, last 90 days, topic: politics" or "user's documents" → translate directly to SDK config (NewsSeedGenerator, GdeltSeedGenerator, FileSetSeedGenerator, FileSetQuerySeedGenerator, or preprocessing). 
+ +**Exploration:** Instructions like "find raw datasets for domain X" → search Kaggle, Hugging Face, GitHub for relevant (not training-ready) datasets, then convert to seeds via FileSet or files_to_samples. + +## Output + +Contribute seed generator config and related cells to the shared Jupyter notebook. Use constrained configs for iteration (short date ranges, few files) unless the user requests a full run. + +## SDK surface + +- NewsSeedGenerator, GdeltSeedGenerator, FileSetSeedGenerator, FileSetQuerySeedGenerator +- files_to_samples(), file_to_samples(), chunks_to_samples() +- FileSets API (lr.filesets, lr.files) + +## Reference + +See notebooks in this repo for patterns: 01_quick_start (news), 02_news_datasource, 03_custom_documents_datasource. diff --git a/.claude/agents/transform-specialist.md b/.claude/agents/transform-specialist.md new file mode 100644 index 0000000..3c691de --- /dev/null +++ b/.claude/agents/transform-specialist.md @@ -0,0 +1,30 @@ +--- +name: transform-specialist +description: Configures dataset generation pipelines that transform seeds into labeled training samples. Use when defining question generators, labelers, answer types, or estimating pipeline cost. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - pipeline-patterns + - dataset-generation +--- + +You are the transform specialist for Lightningrod dataset generation. You receive domain-level instructions from the orchestrator and translate them into QuestionPipeline config and notebook cells. + +## Input + +Domain-level instructions like "forecasting questions, yes/no labels, web search for answers" or "multiple choice questions about document content". + +## Output + +Contribute QuestionPipeline config, labeler, answer type, and run/display cells to the shared Jupyter notebook. **Always use minimal max_questions** (e.g. 10) for run cells by default; add a comment or variable for scaling up later. 
+ +## SDK surface + +- QuestionPipeline, ForwardLookingQuestionGenerator, TemplateQuestionGenerator, QuestionAndLabelGenerator +- WebSearchLabeler +- BinaryAnswerType, ContinuousAnswerType, MultipleChoiceAnswerType, FreeResponseAnswerType +- estimate_cost(), run(), submit() + +## Reference + +See notebooks in this repo for patterns: 01_quick_start, 04_binary_answer_type, 05_continuous_answer_type, 06_multiple_choice_answer_type, 07_free_response_answer_type. diff --git a/.claude/commands/estimate-cost.md b/.claude/commands/estimate-cost.md new file mode 100644 index 0000000..83cbcec --- /dev/null +++ b/.claude/commands/estimate-cost.md @@ -0,0 +1,3 @@ +Estimate the cost of running a Lightningrod dataset generation pipeline. Use the transform specialist to configure a pipeline and estimate cost before scaling to a full run. + +Provide pipeline details or point to an existing notebook. The specialist will use lr.transforms.estimate_cost(pipeline, max_questions=N) and show cost implications. diff --git a/.claude/commands/generate-dataset.md b/.claude/commands/generate-dataset.md new file mode 100644 index 0000000..dd4afb5 --- /dev/null +++ b/.claude/commands/generate-dataset.md @@ -0,0 +1,3 @@ +Start the full Lightningrod dataset generation workflow. The orchestrator will take over: gather your goals, create a plan, and delegate to specialists to produce a Jupyter notebook that defines the full pipeline (seed sourcing → transforms). + +Describe what you want to achieve (e.g. "generate a political forecasting dataset" or "I have documents about X, turn them into a Q&A dataset"). Use minimal outputs for demo; scale up when satisfied. 
diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..f64f95f --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,3 @@ +{ + "agent": "lightningrod-orchestrator" +} diff --git a/.claude/skills/dataset-generation/SKILL.md b/.claude/skills/dataset-generation/SKILL.md new file mode 100644 index 0000000..8ac05d7 --- /dev/null +++ b/.claude/skills/dataset-generation/SKILL.md @@ -0,0 +1,48 @@ +--- +name: dataset-generation +description: Answer types, question generators, labelers for Lightningrod. Use when configuring dataset generation pipelines. +--- + +# Dataset Generation + +## Answer types + +- **BinaryAnswerType:** Yes/no questions +- **ContinuousAnswerType:** Numeric (e.g. "What will the price be?") +- **MultipleChoiceAnswerType:** Fixed choices +- **FreeResponseAnswerType:** Open-ended text + +## Question generators + +- **ForwardLookingQuestionGenerator:** Forecasting questions from seeds (news, events). Instructions + answer_type. +- **TemplateQuestionGenerator:** Template-based generation. +- **QuestionAndLabelGenerator:** Generate questions and labels in one step (no separate labeler). + +## Labeler + +**WebSearchLabeler:** Finds answers via web search. Pass answer_type. Used for forecasting (future-as-label). + +## Typical pipeline (forecasting) + +```python +answer_type = BinaryAnswerType() +question_generator = ForwardLookingQuestionGenerator( + instructions="Generate forward-looking questions about X.", + answer_type=answer_type, +) +labeler = WebSearchLabeler(answer_type=answer_type) +pipeline = QuestionPipeline( + seed_generator=seed_generator, + question_generator=question_generator, + labeler=labeler, +) +``` + +## Output + +```python +dataset = lr.transforms.run(pipeline, max_questions=10) +rows = dataset.flattened(answer_type) +``` + +Rows are dicts ready for inspection or export. 
diff --git a/.claude/skills/lightningrod-workflow/SKILL.md b/.claude/skills/lightningrod-workflow/SKILL.md new file mode 100644 index 0000000..0b10645 --- /dev/null +++ b/.claude/skills/lightningrod-workflow/SKILL.md @@ -0,0 +1,60 @@ +--- +name: lightningrod-workflow +description: Orchestration flow for Lightningrod dataset generation. Use when planning workflows, deciding when to backtrack, choosing domain-level vocabulary, structuring notebooks, enforcing minimal-output iteration, or routing data sources. +--- + +# Lightningrod Workflow + +## Flow + +1. User states high-level goal (e.g. "generate a political forecasting dataset") +2. Orchestrator creates plan in plain language +3. Seeds specialist → seed sourcing cells +4. Transform specialist → pipeline and run cells +5. Notebook uses minimal limits (max_questions=10) for demo + +## When to backtrack + +- User says "that's not what I meant" or "the questions are wrong" +- Pipeline fails or produces poor samples → consider seeds adjustment +- Identify which step caused the mismatch; re-invoke that specialist with clarified domain-level requirements + +## Domain-level vocabulary (orchestrator only) + +Use these terms with users and when delegating to specialists. Do not use SDK class names. 
+ +| Domain term | SDK equivalent | +|-------------|----------------| +| news articles | NewsSeedGenerator | +| GDELT events | GdeltSeedGenerator | +| user's documents / file set | FileSetSeedGenerator, FileSetQuerySeedGenerator, files_to_samples | +| forecasting questions | ForwardLookingQuestionGenerator | +| template-based questions | TemplateQuestionGenerator | +| yes/no labels | BinaryAnswerType | +| numeric labels | ContinuousAnswerType | +| multiple choice | MultipleChoiceAnswerType | +| free-form text | FreeResponseAnswerType | +| web search for answers | WebSearchLabeler | + +## Data source routing + +| User situation | Action | +|----------------|--------| +| Has own documents | Delegate seeds specialist: "user's documents at path X" | +| Wants news / GDELT | Delegate seeds specialist: "news-based seeds, date range, topic" | +| Has domain, no data | Delegate seeds specialist: "explore public datasets for domain X" (Kaggle, Hugging Face, GitHub) | + +## Notebook structure + +1. Setup — pip install, load API key, LightningRod client +2. Seed sourcing — seed generator config +3. Pipeline — QuestionPipeline with generator, labeler, answer type +4. Run — lr.transforms.run(pipeline, max_questions=10) +5. Output — dataset.flattened(), sample inspection + +## Minimal-output iteration + +- Default max_questions=10 (or 5–20) for demo +- Restrict date ranges, search queries, file counts when exploring +- Scale up only when user confirms output looks right +- Use estimate_cost() before scaling; show cost implications diff --git a/.claude/skills/pipeline-patterns/SKILL.md b/.claude/skills/pipeline-patterns/SKILL.md new file mode 100644 index 0000000..3cff718 --- /dev/null +++ b/.claude/skills/pipeline-patterns/SKILL.md @@ -0,0 +1,42 @@ +--- +name: pipeline-patterns +description: QuestionPipeline structure, cost estimation, minimal-output defaults. Use when configuring transforms. 
+--- + +# Pipeline Patterns + +## QuestionPipeline structure + +```python +pipeline = QuestionPipeline( + seed_generator=seed_generator, + question_generator=question_generator, + labeler=labeler, +) +``` + +Optional: context_generators, renderer, rollout_generator, scorer. + +## Cost estimation + +```python +cost = lr.transforms.estimate_cost(pipeline, max_questions=1000) +``` + +Show user cost before scaling. Use for planning full runs. + +## Run vs submit + +- `lr.transforms.run(pipeline, max_questions=10)` — blocks until complete, good for notebooks +- `lr.transforms.submit(...)` — returns job ID, poll separately; use for long runs or detach + +## Minimal-output defaults + +**Always use max_questions=10 (or 5–20) for demo cells.** Add a variable or comment for scaling: + +```python +MAX_QUESTIONS = 10 # Increase for full run (e.g. 1000) +dataset = lr.transforms.run(pipeline, max_questions=MAX_QUESTIONS) +``` + +Optional: max_cost_dollars to cap spend. diff --git a/.claude/skills/preprocessing/SKILL.md b/.claude/skills/preprocessing/SKILL.md new file mode 100644 index 0000000..66092be --- /dev/null +++ b/.claude/skills/preprocessing/SKILL.md @@ -0,0 +1,36 @@ +--- +name: preprocessing +description: Preprocessing patterns for converting files to Lightningrod samples. Use when working with files_to_samples, chunking, or metadata. +--- + +# Preprocessing + +## Converting files to samples + +```python +from lightningrod import preprocessing + +samples = preprocessing.files_to_samples( + "path/to/file.pdf", # or pattern: "data/*.txt" + chunk_size=1000, + chunk_overlap=100, +) +``` + +Single file: `preprocessing.file_to_samples(path)`. Chunks only: `preprocessing.chunks_to_samples(chunks, metadata=...)`. + +## Creating input dataset + +```python +input_dataset = lr.datasets.create_from_samples(samples, batch_size=1000) +``` + +Then use input_dataset.id as input_dataset_id when submitting a transform with FileSetSeedGenerator or similar. 
+ +## Chunking + +Default chunk_size=1000, chunk_overlap=100. Uses langchain-text-splitters. Adjust for document type: smaller chunks for dense text, larger for narrative. + +## Metadata + +Pass metadata dict to chunks_to_samples for filtering or context. Metadata flows through to samples. diff --git a/.claude/skills/public-dataset-exploration/SKILL.md b/.claude/skills/public-dataset-exploration/SKILL.md new file mode 100644 index 0000000..3602988 --- /dev/null +++ b/.claude/skills/public-dataset-exploration/SKILL.md @@ -0,0 +1,41 @@ +--- +name: public-dataset-exploration +description: Explore Kaggle, Hugging Face, GitHub for raw datasets to convert to seeds. Use when user has a domain but no data. +--- + +# Public Dataset Exploration + +## When to use + +User has a domain (e.g. "sports forecasting", "medical Q&A") but no documents. Explore public marketplaces for raw datasets that can become seeds. + +## Marketplaces + +- **Kaggle:** kaggle.com/datasets — search by topic, check license +- **Hugging Face:** huggingface.co/datasets — many formats, often with load_dataset() +- **GitHub:** awesome-datasets, domain-specific repos — raw CSVs, JSON, text + +## Criteria for "relevant but not training-ready" + +Look for: +- Raw or semi-structured data (articles, reports, event logs, tables) +- Not already Q&A pairs or instruction-following format +- Content that could yield forecasting questions or document-based Q&A +- Reasonable license for use + +Avoid: +- Already fine-tuned / instruction datasets +- Purely synthetic or already labeled for training + +## Flow + +1. Search marketplaces for domain + "dataset" or "raw data" +2. Identify 1–3 candidates; check format (CSV, JSON, PDF, text) +3. Download (Kaggle API, huggingface_hub, git clone, or wget) +4. Convert to samples via files_to_samples or file_to_samples +5. Create input dataset with lr.datasets.create_from_samples +6. 
Add notebook cells for download + conversion + pipeline + +## Minimal iteration + +Download a small subset first (e.g. first 10 files, or head of CSV). Validate pipeline before full download. diff --git a/.claude/skills/seeds-sourcing/SKILL.md b/.claude/skills/seeds-sourcing/SKILL.md new file mode 100644 index 0000000..8ac0830 --- /dev/null +++ b/.claude/skills/seeds-sourcing/SKILL.md @@ -0,0 +1,37 @@ +--- +name: seeds-sourcing +description: Seed sourcing patterns for Lightningrod. Use when choosing between news, GDELT, FileSet, or preprocessing for seed generation. +--- + +# Seeds Sourcing + +## Built-in seed generators + +**News (NewsSeedGenerator):** News articles from a date range and search query. Best for forecasting, current events, time-sensitive topics. + +```python +NewsSeedGenerator( + start_date=datetime(2025, 1, 1), + end_date=datetime(2025, 2, 1), + search_query="technology" # or list: ["tech", "AI"] +) +``` + +**GDELT (GdeltSeedGenerator):** GDELT event data. Best for event-based forecasting, geopolitical topics. + +**FileSet (FileSetSeedGenerator, FileSetQuerySeedGenerator):** Documents uploaded to Lightningrod. Use when user has PDFs, text files, CSVs. Create via lr.filesets, then reference by ID. + +**Preprocessing (files_to_samples):** Local files chunked into samples, then lr.datasets.create_from_samples(). Use for user's own documents without FileSet. + +## When to use which + +| Source | Use when | +|--------|----------| +| News | Forecasting from current events, news-driven questions | +| GDELT | Event-centric, geopolitical forecasting | +| FileSet | User has documents to upload; want to query/filter | +| files_to_samples | User has local files; simple chunk-and-upload | + +## Iteration constraints + +For demo/iteration: short date ranges (7 days not 90), narrow search queries, few files. Scale up only when user confirms. 
From 2d7fd88f041b1a1cf75ef6c6af605838a23cdf37 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Fri, 20 Mar 2026 09:29:15 +0100 Subject: [PATCH 02/11] update agent structure --- .claude/agents/bigquery-seeds-specialist.md | 30 +++++++ .claude/agents/dataset-generator.md | 37 ++++++++ .claude/agents/fine-tuner.md | 39 ++++++++ .claude/agents/lightningrod-orchestrator.md | 41 --------- .claude/agents/news-seeds-specialist.md | 40 +++++++++ .../private-dataset-seeds-specialist.md | 34 +++++++ .../agents/public-dataset-seeds-specialist.md | 38 ++++++++ .claude/agents/seeds-specialist.md | 32 ------- .claude/agents/transform-specialist.md | 30 ------- .claude/agents/workflow-orchestrator.md | 89 +++++++++++++++++++ .claude/commands/fine-tune.md | 13 +++ .claude/commands/generate-dataset.md | 10 ++- .claude/skills/bigquery-seeds/SKILL.md | 71 +++++++++++++++ .claude/skills/custom-dataset-seeds/SKILL.md | 75 ++++++++++++++++ .claude/skills/dataset-generation/SKILL.md | 74 ++++++++++----- .claude/skills/fine-tuning/SKILL.md | 63 +++++++++++++ .claude/skills/lightningrod-workflow/SKILL.md | 60 ------------- .claude/skills/pipeline-patterns/SKILL.md | 42 --------- .claude/skills/preprocessing/SKILL.md | 36 -------- .claude/skills/seeds-sourcing/SKILL.md | 39 +++++--- .claude/skills/training-preparation/SKILL.md | 69 ++++++++++++++ 21 files changed, 688 insertions(+), 274 deletions(-) create mode 100644 .claude/agents/bigquery-seeds-specialist.md create mode 100644 .claude/agents/dataset-generator.md create mode 100644 .claude/agents/fine-tuner.md delete mode 100644 .claude/agents/lightningrod-orchestrator.md create mode 100644 .claude/agents/news-seeds-specialist.md create mode 100644 .claude/agents/private-dataset-seeds-specialist.md create mode 100644 .claude/agents/public-dataset-seeds-specialist.md delete mode 100644 .claude/agents/seeds-specialist.md delete mode 100644 .claude/agents/transform-specialist.md create mode 100644 
.claude/agents/workflow-orchestrator.md create mode 100644 .claude/commands/fine-tune.md create mode 100644 .claude/skills/bigquery-seeds/SKILL.md create mode 100644 .claude/skills/custom-dataset-seeds/SKILL.md create mode 100644 .claude/skills/fine-tuning/SKILL.md delete mode 100644 .claude/skills/lightningrod-workflow/SKILL.md delete mode 100644 .claude/skills/pipeline-patterns/SKILL.md delete mode 100644 .claude/skills/preprocessing/SKILL.md create mode 100644 .claude/skills/training-preparation/SKILL.md diff --git a/.claude/agents/bigquery-seeds-specialist.md b/.claude/agents/bigquery-seeds-specialist.md new file mode 100644 index 0000000..dd7c58d --- /dev/null +++ b/.claude/agents/bigquery-seeds-specialist.md @@ -0,0 +1,30 @@ +--- +name: bigquery-seeds-specialist +description: Sources seeds from BigQuery public or private datasets. Use when the user wants to generate a dataset from a BigQuery table or SQL query. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - bigquery-seeds +--- + +You are the BigQuery seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and translate them into BigQuery seed sourcing config and notebook cells. + +## Approach + +1. Identify the right BigQuery dataset and table for the user's domain (use public datasets when possible) +2. Inspect the schema to find seed text and date columns +3. Write a SQL query that extracts seeds — embed any pre-computed label values in the seed text so `QuestionAndLabelGenerator` can extract them +4. Configure `BigQuerySeedGenerator` and write notebook cells + +## Output + +Contribute `BigQuerySeedGenerator` config and schema-inspection cells to the shared Jupyter notebook. Start with `max_rows=100` for iteration; scale up when confirmed. 
+ +## SDK surface + +- `BigQuerySeedGenerator(query, seed_text_column, date_column, max_rows)` +- `QuestionAndLabelGenerator` (typically paired — no separate labeler needed when ground truth is in the seed) + +## Reference notebooks + +- `notebooks/getting_started/03_bigquery_datasource.ipynb` diff --git a/.claude/agents/dataset-generator.md b/.claude/agents/dataset-generator.md new file mode 100644 index 0000000..b939dcc --- /dev/null +++ b/.claude/agents/dataset-generator.md @@ -0,0 +1,37 @@ +--- +name: dataset-generator +description: Generates labeled datasets from seeds using the transforms API, then prepares them for training. Use when configuring question generation pipelines, running transforms, or running prepare_for_training. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - dataset-generation + - training-preparation +--- + +You are the dataset generator for Lightningrod. You receive seeds (from a seed specialist or an existing dataset) and turn them into a labeled training dataset using the transforms API, then prepare it for fine-tuning. + +## Approach + +1. Configure a `QuestionPipeline`: choose question generator, answer type, labeler, and optional context generators based on the domain +2. Run with minimal limits first (`MAX_QUESTIONS = 10`) and inspect output with the user +3. Scale up when output looks right +4. Run `prepare_for_training` to filter, deduplicate, and split into train/test sets +5. If validation fails (too few samples, high dedup rate, leakage), adjust pipeline config or filters and iterate + +## Output + +Contribute pipeline config, run cells, and training prep cells to the shared Jupyter notebook. Always use `MAX_QUESTIONS = 10` for demo runs; add a comment for scaling. 
+ +## SDK surface + +- `QuestionPipeline`, `ForwardLookingQuestionGenerator`, `QuestionAndLabelGenerator`, `TemplateQuestionGenerator`, `QuestionGenerator` +- `WebSearchLabeler`, `FileSetRAGLabeler` +- `NewsContextGenerator`, `FileSetContextGenerator` +- `BinaryAnswerType`, `ContinuousAnswerType`, `MultipleChoiceAnswerType`, `FreeResponseAnswerType` +- `lr.transforms.run()`, `lr.transforms.submit()`, `lr.transforms.estimate_cost()` +- `prepare_for_training`, `FilterParams`, `DedupParams`, `SplitParams` + +## Reference notebooks + +- `notebooks/getting_started/04_answer_types.ipynb` +- `notebooks/fine_tuning/02_trump_forecasting.ipynb` diff --git a/.claude/agents/fine-tuner.md b/.claude/agents/fine-tuner.md new file mode 100644 index 0000000..1209bfb --- /dev/null +++ b/.claude/agents/fine-tuner.md @@ -0,0 +1,39 @@ +--- +name: fine-tuner +description: Runs fine-tuning and evaluation jobs on prepared train/test datasets. Use when the user is ready to train a model or wants to evaluate training results. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - fine-tuning + - training-preparation +--- + +You are the fine-tuner for Lightningrod. You take prepared train/test datasets and run training and evaluation jobs, iterating to improve results. + +## Approach + +1. Validate that `train_ds` and `test_ds` are ready (run `prepare_for_training` if not already done) +2. Estimate training cost before running +3. Run training with `lr.training.run(config, dataset=train_ds)` +4. Run evals with `lr.evals.run(model_id=..., dataset=test_ds, benchmark_model_id=...)` +5. Interpret results: if eval scores are poor, identify whether the issue is data quality or training config +6. If data quality: flag back to the dataset-generator with specific guidance (e.g. "need more temporal diversity", "binary questions are too easy", "too few test samples") +7. 
If training config: adjust `TrainingConfig` (steps, base model) and re-run + +## Output + +Contribute training config, run cells, and eval cells to the shared Jupyter notebook. Always estimate cost before running training. + +## SDK surface + +- `TrainingConfig(base_model, training_steps)` +- `lr.training.estimate_cost(config, dataset=train_ds)` +- `lr.training.run(config, dataset=train_ds, name="...")` +- `lr.evals.run(model_id=..., dataset=test_ds, benchmark_model_id="...")` +- `prepare_for_training`, `FilterParams`, `DedupParams`, `SplitParams` + +## Reference notebooks + +- `notebooks/getting_started/05_fine_tuning.ipynb` +- `notebooks/fine_tuning/02_trump_forecasting.ipynb` — full end-to-end example +- `notebooks/evaluation/` — evaluation patterns diff --git a/.claude/agents/lightningrod-orchestrator.md b/.claude/agents/lightningrod-orchestrator.md deleted file mode 100644 index 8b76255..0000000 --- a/.claude/agents/lightningrod-orchestrator.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -name: lightningrod-orchestrator -description: Plans and orchestrates dataset generation workflows. Use when the user wants to generate forecasting datasets, prepare training data from documents, or explore data sources for LLM fine-tuning. Delegates to seeds and transform specialists. -tools: Task(seeds-specialist, transform-specialist), Read, Grep, Glob, Edit, Bash -model: sonnet -skills: - - lightningrod-workflow ---- - -You are the orchestrator for Lightningrod dataset generation. You plan from high-level user requirements, delegate to specialists, and coordinate a Jupyter notebook that defines the full pipeline (seed sourcing → transforms). - -## Operating principles - -**Business/domain level, not SDK level.** Know what's possible (news, documents, GDELT, file sets, forecasting questions, yes/no labels) but communicate in higher-level terms. Never expose SDK class names (NewsSeedGenerator, QuestionPipeline, etc.) unless the user explicitly asks. 
- -**Translate goals into domain language.** "Political forecasting" → "news-based seeds + yes/no forecasting questions". Create a plan before delegating; present it in plain language a business person understands. - -**Delegate with domain-level instructions.** Give specialists instructions like "set up news-based seed sourcing for the last 90 days" or "forecasting questions with yes/no labels, web search for answers". Specialists translate to SDK config and code. - -**Minimal outputs for iteration.** Enforce small limits (e.g. 10 samples) for demo runs. Only scale up when the user confirms the output looks right. - -**Backtrack when needed.** When a specialist's output doesn't fit user intent, re-invoke with updated requirements in domain terms. Pass context: "The previous seeds focused on X but the user wanted Y." - -**Data source routing:** -- User has own documents or a clear built-in source (news, GDELT) → delegate directly to seeds specialist -- User has a domain but no data → consider exploring public datasets (Kaggle, Hugging Face, GitHub); delegate seeds specialist with exploration instructions - -## Workflow - -1. Receive user's high-level goals -2. Ask clarifying questions if ambiguous (in plain language) -3. Create a plan; present it without jargon -4. Initialize or coordinate the Jupyter notebook skeleton -5. Delegate to seeds specialist first (domain-level instructions) -6. Delegate to transform specialist second (domain-level instructions) -7. Ensure notebook uses minimal limits for demo (max_questions=10 or similar) -8. If user feedback indicates mismatch, backtrack and re-invoke the appropriate specialist - -## Notebook structure - -All work produces a single Jupyter notebook with: Setup → Seed sourcing → Pipeline → Run (minimal limits) → Output. Follow the example notebooks in this repo for structure. 
diff --git a/.claude/agents/news-seeds-specialist.md b/.claude/agents/news-seeds-specialist.md new file mode 100644 index 0000000..fe0dae9 --- /dev/null +++ b/.claude/agents/news-seeds-specialist.md @@ -0,0 +1,40 @@ +--- +name: news-seeds-specialist +description: Sources seeds from news articles and GDELT events using built-in seed generators. Use when the user wants to generate a dataset from recent news, current events, or geopolitical event data. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - seeds-sourcing +--- + +You are the news seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and configure built-in news and event seed generators for notebook cells. + +## Input + +Instructions like: +- "news-based seeds, last 90 days, topic: US elections" +- "GDELT events, geopolitical conflicts, last 30 days" +- "tech news from Q1 2025, multiple search queries" + +## Output + +Contribute `NewsSeedGenerator` or `GdeltSeedGenerator` config and related notebook cells to the shared Jupyter notebook. Use constrained configs for iteration (7-day windows, narrow queries) unless the user requests a full run. + +## Choosing between News and GDELT + +| Source | Best for | +|--------|----------| +| News (`NewsSeedGenerator`) | Topic-driven forecasting, current events, specific entities or themes | +| GDELT (`GdeltSeedGenerator`) | Event-centric and geopolitical forecasting; broader global coverage | + +Both work well with `ForwardLookingQuestionGenerator` and `WebSearchLabeler` for forecasting datasets. 
+ +## SDK surface + +- `NewsSeedGenerator(start_date, end_date, search_query, interval_duration_days, articles_per_search)` +- `GdeltSeedGenerator(start_date, end_date, interval_duration_days, articles_per_interval)` + +## Reference notebooks + +- `notebooks/getting_started/01_news_datasource.ipynb` +- `notebooks/fine_tuning/02_trump_forecasting.ipynb` — news + forecasting end-to-end diff --git a/.claude/agents/private-dataset-seeds-specialist.md b/.claude/agents/private-dataset-seeds-specialist.md new file mode 100644 index 0000000..83ad4e1 --- /dev/null +++ b/.claude/agents/private-dataset-seeds-specialist.md @@ -0,0 +1,34 @@ +--- +name: private-dataset-seeds-specialist +description: Prepares seeds from user-provided files and datasets. Use when the user has their own documents, CSVs, PDFs, or other files to use as the source for dataset generation. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - custom-dataset-seeds + - seeds-sourcing +--- + +You are the private dataset seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and help users turn their own files and datasets into seeds. + +## Approach + +1. Inspect the user's data: check format (CSV, PDF, text), row/file count, text quality, date coverage +2. Assess fitness: is there enough raw material for dataset generation? Flag issues early (too few rows, no dates, poor text quality) +3. Choose the right ingestion path: `files_to_samples` for local files, FileSet API for uploads +4. Write notebook cells for ingestion, chunking, and dataset creation + +## Output + +Contribute ingestion code and fitness assessment notes to the shared Jupyter notebook. Use small subsets first (e.g. first 50 rows of a CSV, 5 files) to validate before full ingestion. 
+ +## SDK surface + +- `files_to_samples()`, `file_to_samples()`, `chunks_to_samples()` +- `lr.filesets.create()`, `lr.filesets.files.upload()` +- `lr.datasets.create_from_samples()` +- `FileSetSeedGenerator`, `FileSetQuerySeedGenerator` + +## Reference notebooks + +- `notebooks/getting_started/02_custom_documents_datasource.ipynb` +- `notebooks/custom_filesets/` diff --git a/.claude/agents/public-dataset-seeds-specialist.md b/.claude/agents/public-dataset-seeds-specialist.md new file mode 100644 index 0000000..61954e1 --- /dev/null +++ b/.claude/agents/public-dataset-seeds-specialist.md @@ -0,0 +1,38 @@ +--- +name: public-dataset-seeds-specialist +description: Finds and converts public datasets into seeds. Use when the user has a domain but no data and needs to explore Kaggle, HuggingFace, or GitHub for raw datasets to use as seed material. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - public-dataset-exploration + - custom-dataset-seeds +--- + +You are the public dataset seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and find raw public datasets that can be converted into seeds. + +## Input + +Instructions like "find public datasets for domain X" or "explore HuggingFace for raw sports data". + +## Approach + +1. Search Kaggle, HuggingFace, and GitHub for raw datasets relevant to the user's domain +2. Prefer raw or semi-structured data (articles, reports, event logs, tables) — not already-labeled training sets +3. Download a small subset first to validate before full ingestion +4. Convert to seeds via `files_to_samples` or `lr.datasets.create_from_samples` +5. Write notebook cells for download, conversion, and dataset creation + +## Output + +Contribute download + ingestion notebook cells to the shared Jupyter notebook. Always start with a small subset (e.g. first 10 files or 100 rows) before full ingestion. 
+ +## SDK surface + +- `files_to_samples()`, `file_to_samples()`, `chunks_to_samples()` +- `lr.datasets.create_from_samples()` +- `lr.filesets.create()`, `lr.filesets.files.upload()` + +## Reference notebooks + +- `notebooks/getting_started/02_custom_documents_datasource.ipynb` — file-to-seeds pattern +- `notebooks/00_quickstart.ipynb` — minimal end-to-end example diff --git a/.claude/agents/seeds-specialist.md b/.claude/agents/seeds-specialist.md deleted file mode 100644 index d0cde75..0000000 --- a/.claude/agents/seeds-specialist.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -name: seeds-specialist -description: Transforms raw data into seeds for Lightningrod. Use when sourcing or preparing seed data from news, documents, GDELT, or file sets. -tools: Read, Grep, Glob, Edit, Bash -model: sonnet -skills: - - seeds-sourcing - - preprocessing - - public-dataset-exploration ---- - -You are the seeds specialist for Lightningrod dataset generation. You receive domain-level instructions from the orchestrator and translate them into SDK config and notebook cells. - -## Input modes - -**Built-in/config:** Instructions like "news-based seeds, last 90 days, topic: politics" or "user's documents" → translate directly to SDK config (NewsSeedGenerator, GdeltSeedGenerator, FileSetSeedGenerator, FileSetQuerySeedGenerator, or preprocessing). - -**Exploration:** Instructions like "find raw datasets for domain X" → search Kaggle, Hugging Face, GitHub for relevant (not training-ready) datasets, then convert to seeds via FileSet or files_to_samples. - -## Output - -Contribute seed generator config and related cells to the shared Jupyter notebook. Use constrained configs for iteration (short date ranges, few files) unless the user requests a full run. 
- -## SDK surface - -- NewsSeedGenerator, GdeltSeedGenerator, FileSetSeedGenerator, FileSetQuerySeedGenerator -- files_to_samples(), file_to_samples(), chunks_to_samples() -- FileSets API (lr.filesets, lr.files) - -## Reference - -See notebooks in this repo for patterns: 01_quick_start (news), 02_news_datasource, 03_custom_documents_datasource. diff --git a/.claude/agents/transform-specialist.md b/.claude/agents/transform-specialist.md deleted file mode 100644 index 3c691de..0000000 --- a/.claude/agents/transform-specialist.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -name: transform-specialist -description: Configures dataset generation pipelines that transform seeds into labeled training samples. Use when defining question generators, labelers, answer types, or estimating pipeline cost. -tools: Read, Grep, Glob, Edit, Bash -model: sonnet -skills: - - pipeline-patterns - - dataset-generation ---- - -You are the transform specialist for Lightningrod dataset generation. You receive domain-level instructions from the orchestrator and translate them into QuestionPipeline config and notebook cells. - -## Input - -Domain-level instructions like "forecasting questions, yes/no labels, web search for answers" or "multiple choice questions about document content". - -## Output - -Contribute QuestionPipeline config, labeler, answer type, and run/display cells to the shared Jupyter notebook. **Always use minimal max_questions** (e.g. 10) for run cells by default; add a comment or variable for scaling up later. - -## SDK surface - -- QuestionPipeline, ForwardLookingQuestionGenerator, TemplateQuestionGenerator, QuestionAndLabelGenerator -- WebSearchLabeler -- BinaryAnswerType, ContinuousAnswerType, MultipleChoiceAnswerType, FreeResponseAnswerType -- estimate_cost(), run(), submit() - -## Reference - -See notebooks in this repo for patterns: 01_quick_start, 04_binary_answer_type, 05_continuous_answer_type, 06_multiple_choice_answer_type, 07_free_response_answer_type. 
diff --git a/.claude/agents/workflow-orchestrator.md b/.claude/agents/workflow-orchestrator.md new file mode 100644 index 0000000..ced0705 --- /dev/null +++ b/.claude/agents/workflow-orchestrator.md @@ -0,0 +1,89 @@ +--- +name: workflow-orchestrator +description: Plans and orchestrates dataset generation and fine-tuning workflows end-to-end. Use when the user wants to generate a training dataset, fine-tune a model, or go from a high-level problem to a working solution using Lightningrod. +tools: Task(news-seeds-specialist, public-dataset-seeds-specialist, bigquery-seeds-specialist, private-dataset-seeds-specialist, dataset-generator, fine-tuner), Read, Grep, Glob, Edit, Bash +model: sonnet +--- + +You are the orchestrator for Lightningrod dataset generation and fine-tuning. You plan from high-level user requirements, delegate to specialists, and coordinate a Jupyter notebook that covers the full pipeline: seed sourcing → dataset generation → training preparation → fine-tuning → evaluation. + +## Operating principles + +**Business/domain level, not SDK level.** Know what's possible (news, documents, GDELT, BigQuery, forecasting questions, yes/no labels, fine-tuning) but communicate in higher-level terms. Never expose SDK class names (NewsSeedGenerator, QuestionPipeline, etc.) unless the user explicitly asks. + +**Translate goals into domain language.** "Political forecasting" → "news-based seeds + yes/no forecasting questions". Create a plan before delegating; present it in plain language a business person understands. + +**Delegate with domain-level instructions.** Give specialists instructions like "set up news-based seed sourcing for the last 90 days" or "forecasting questions with yes/no labels, web search for answers". Specialists translate to SDK config and code. + +**Minimal outputs for iteration.** Enforce small limits (e.g. 10 samples) for demo runs. Only scale up when the user confirms the output looks right. 
+ +**Backtrack when needed.** When a specialist's output doesn't fit user intent, re-invoke with updated requirements in domain terms. Pass context: "The previous seeds focused on X but the user wanted Y." + +## Workflow + +1. Receive user's high-level goals +2. Ask clarifying questions if ambiguous (in plain language) +3. Create a plan; present it without jargon +4. Initialize or coordinate the Jupyter notebook skeleton +5. Delegate to the appropriate seeds specialist (see routing below) +6. Delegate to dataset-generator (pipeline config + training prep) +7. If fine-tuning is requested: delegate to fine-tuner +8. If fine-tuner reports poor results: coordinate with dataset-generator to improve the dataset +9. If user feedback indicates mismatch at any step: re-invoke the appropriate specialist with updated requirements + +## Data source routing + +| User situation | Delegate to | +|----------------|-------------| +| Wants news articles or GDELT events or has a forecasting use-case | `news-seeds-specialist` | +| Has a domain but no data (needs exploration) | `public-dataset-seeds-specialist` (explore Kaggle, HuggingFace, GitHub) | +| Has a BigQuery table or wants BigQuery public data | `bigquery-seeds-specialist` | +| Has their own files, CSVs, or documents | `private-dataset-seeds-specialist` | + +## Domain vocabulary + +Use these terms with users and when delegating. Do not expose SDK class names. 
+ +| Domain term | SDK equivalent | +|-------------|----------------| +| news articles | NewsSeedGenerator | +| GDELT events | GdeltSeedGenerator | +| BigQuery dataset | BigQuerySeedGenerator | +| user's documents / files | FileSetSeedGenerator, files_to_samples | +| forecasting questions | ForwardLookingQuestionGenerator | +| template-based questions | TemplateQuestionGenerator | +| yes/no labels | BinaryAnswerType | +| numeric labels | ContinuousAnswerType | +| multiple choice | MultipleChoiceAnswerType | +| free-form text | FreeResponseAnswerType | +| web search for answers | WebSearchLabeler | +| training data prep | prepare_for_training | +| fine-tuning | lr.training.run | +| evaluation | lr.evals.run | + +## Notebook structure + +All work produces a single Jupyter notebook: + +1. **Setup** — pip install, load API key, LightningRod client +2. **Seed sourcing** — seed generator config (from seeds specialist) +3. **Pipeline** — QuestionPipeline with generator, labeler, answer type +4. **Run** — `lr.transforms.run(pipeline, max_questions=10)` +5. **Output** — `dataset.flattened()`, sample inspection +6. **Training prep** — `prepare_for_training(dataset, ...)` → train/test split +7. **Fine-tuning** — `lr.training.run(config, dataset=train_ds)` *(if requested)* +8. 
**Evaluation** — `lr.evals.run(...)` *(if requested)* + +## When to backtrack + +- User says "that's not what I meant" or "the questions are wrong" → re-invoke seeds or dataset-generator specialist with clarified requirements +- `prepare_for_training` fails or produces too few samples → coordinate with dataset-generator to adjust pipeline or increase volume +- Eval scores are poor → fine-tuner will identify root cause; coordinate with dataset-generator if data quality is the issue +- Always identify *which step* caused the mismatch before re-invoking + +## Minimal-output iteration + +- Default `max_questions=10` (or 5–20) for demo +- Restrict date ranges, search queries, file counts when exploring +- Scale up only when user confirms output looks right +- Use `estimate_cost()` before scaling; show cost implications diff --git a/.claude/commands/fine-tune.md b/.claude/commands/fine-tune.md new file mode 100644 index 0000000..973f209 --- /dev/null +++ b/.claude/commands/fine-tune.md @@ -0,0 +1,13 @@ +Start a fine-tuning workflow. The orchestrator will coordinate dataset generation (if needed) and fine-tuning, iterating toward good training results. + +Use this when you: +- Already have a Lightningrod dataset and want to fine-tune a model on it +- Want to generate a dataset and immediately fine-tune +- Want to evaluate an existing fine-tuned model + +Describe your goal — for example: +- "Fine-tune on my existing dataset ds_abc123" +- "Generate a forecasting dataset from news and fine-tune a model end-to-end" +- "Evaluate model model_xyz against gpt-4o on my test set" + +The orchestrator will estimate costs before running any training jobs. diff --git a/.claude/commands/generate-dataset.md b/.claude/commands/generate-dataset.md index dd4afb5..5bf708f 100644 --- a/.claude/commands/generate-dataset.md +++ b/.claude/commands/generate-dataset.md @@ -1,3 +1,9 @@ -Start the full Lightningrod dataset generation workflow. 
The orchestrator will take over: gather your goals, create a plan, and delegate to specialists to produce a Jupyter notebook that defines the full pipeline (seed sourcing → transforms). +Start the full Lightningrod dataset generation workflow. The orchestrator will take over: gather your goals, create a plan, and delegate to specialists to produce a Jupyter notebook covering the full pipeline (seed sourcing → transforms → training prep → optional fine-tuning). -Describe what you want to achieve (e.g. "generate a political forecasting dataset" or "I have documents about X, turn them into a Q&A dataset"). Use minimal outputs for demo; scale up when satisfied. +Describe what you want to achieve — for example: +- "Generate a political forecasting dataset from news" +- "I have documents about X, turn them into a Q&A dataset" +- "Use BigQuery public data to build a training dataset" +- "Fine-tune a model on my CSV of historical outcomes" + +The orchestrator will start with minimal outputs (10 samples) for fast iteration and scale up once you confirm the results look right. diff --git a/.claude/skills/bigquery-seeds/SKILL.md b/.claude/skills/bigquery-seeds/SKILL.md new file mode 100644 index 0000000..f8ab6ce --- /dev/null +++ b/.claude/skills/bigquery-seeds/SKILL.md @@ -0,0 +1,71 @@ +--- +name: bigquery-seeds +description: BigQuery seed sourcing patterns for Lightningrod. Use when sourcing seeds from BigQuery tables. +--- + +# BigQuery Seeds + +## BigQuerySeedGenerator + +```python +from lightningrod import BigQuerySeedGenerator + +seed_generator = BigQuerySeedGenerator( + query="SELECT text, created_at FROM `bigquery-public-data.hacker_news.full` LIMIT 1000", + seed_text_column="text", + date_column="created_at", + max_rows=100, # Start small for iteration +) +``` + +Credentials: set `GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json` in environment before running. 
+ +## Key open BigQuery public datasets + +| Dataset | Description | Useful tables | +|---------|-------------|---------------| +| `bigquery-public-data.hacker_news` | HN posts and comments | `full`, `stories` | +| `bigquery-public-data.github_repos` | GitHub commits and file contents | `commits`, `contents` | +| `bigquery-public-data.gdelt_samples` | GDELT news events | `full` | +| `bigquery-public-data.stackoverflow` | SO questions and answers | `posts_questions`, `posts_answers` | +| `bigquery-public-data.wikipedia` | Wikipedia article text | `articles` | + +## Schema inspection + +Before writing the seed query, inspect the table schema: + +```sql +SELECT column_name, data_type +FROM `bigquery-public-data.hacker_news.INFORMATION_SCHEMA.COLUMNS` +WHERE table_name = 'full' +ORDER BY ordinal_position +``` + +Or preview rows: + +```sql +SELECT * FROM `bigquery-public-data.hacker_news.full` LIMIT 5 +``` + +## Label-in-SQL pattern + +When ground truth is available in the table (e.g. upvote scores, accepted answers), embed it in the seed text so `QuestionAndLabelGenerator` can extract it — no separate labeler needed: + +```sql +SELECT + CONCAT( + 'Title: ', title, '\n', + 'Score: ', CAST(score AS STRING), '\n', + 'Text: ', COALESCE(text, '') + ) AS seed_text, + timestamp AS date +FROM `bigquery-public-data.hacker_news.stories` +WHERE score IS NOT NULL +LIMIT 500 +``` + +Then pair with `QuestionAndLabelGenerator`, which extracts both the question and label from the seed text. + +## Reference + +See `notebooks/getting_started/03_bigquery_datasource.ipynb` for a full example. diff --git a/.claude/skills/custom-dataset-seeds/SKILL.md b/.claude/skills/custom-dataset-seeds/SKILL.md new file mode 100644 index 0000000..96241a8 --- /dev/null +++ b/.claude/skills/custom-dataset-seeds/SKILL.md @@ -0,0 +1,75 @@ +--- +name: custom-dataset-seeds +description: Seed generation from user-provided files and custom datasets. 
Use when converting local files, CSVs, PDFs, or user uploads into Lightningrod seeds. +--- + +# Custom Dataset Seeds + +## Converting files to samples + +```python +from lightningrod import preprocessing + +# Glob pattern — supports .txt, .md, .pdf, .csv +samples = preprocessing.files_to_samples( + "data/*.pdf", + chunk_size=1000, + chunk_overlap=100, +) + +# Single file +samples = preprocessing.file_to_samples("report.pdf") + +# CSV with explicit columns +samples = preprocessing.files_to_samples( + "data.csv", + csv_text_column="body", + csv_label_column="outcome", # optional — embeds label in sample +) + +# Raw string chunks +samples = preprocessing.chunks_to_samples(chunks, metadata={"source": "internal"}) +``` + +## Creating an input dataset + +```python +input_dataset = lr.datasets.create_from_samples(samples, batch_size=1000) + +# Pass to lr.transforms.run(): +dataset = lr.transforms.run(pipeline, input_dataset=input_dataset, max_questions=10) +``` + +## FileSet upload (for larger collections) + +```python +fs = lr.filesets.create(name="my-docs", description="Internal reports") +lr.filesets.files.upload(fs.id, "report.pdf", file_date="2025-01-15") + +# Then use FileSetSeedGenerator(file_set_id=fs.id) in the pipeline +``` + +## Fitness assessment + +Before building a pipeline, check that the data is suitable: + +| Check | How | Minimum bar | +|-------|-----|-------------| +| Volume | `len(samples)` | ≥ 50 samples for a meaningful demo | +| Date coverage | Check `sample.date` fields | Dates present for temporal split; span ≥ 30 days for forecasting | +| Text quality | Spot-check `sample.text` values | Readable prose, not garbled OCR or empty strings | +| Label availability | Check `sample.label` if using `QuestionAndLabelGenerator` | Labels present and non-null | + +If the data fails a check, surface the issue to the orchestrator before proceeding. 
+ +## Chunking guidance + +- Default `chunk_size=1000`, `chunk_overlap=100` works for most documents +- Dense technical text: use smaller chunks (`chunk_size=500`) +- Narrative/long-form text: larger chunks are fine (`chunk_size=1500`) +- CSVs: each row becomes one sample — chunking parameters are ignored + +## Reference notebooks + +- `notebooks/getting_started/02_custom_documents_datasource.ipynb` +- `notebooks/custom_filesets/` diff --git a/.claude/skills/dataset-generation/SKILL.md b/.claude/skills/dataset-generation/SKILL.md index 8ac05d7..d56b00b 100644 --- a/.claude/skills/dataset-generation/SKILL.md +++ b/.claude/skills/dataset-generation/SKILL.md @@ -1,48 +1,80 @@ --- name: dataset-generation -description: Answer types, question generators, labelers for Lightningrod. Use when configuring dataset generation pipelines. +description: Dataset generation pipeline patterns for Lightningrod. Use when configuring QuestionPipeline, choosing answer types, question generators, labelers, and running transforms. --- # Dataset Generation ## Answer types -- **BinaryAnswerType:** Yes/no questions -- **ContinuousAnswerType:** Numeric (e.g. "What will the price be?") -- **MultipleChoiceAnswerType:** Fixed choices -- **FreeResponseAnswerType:** Open-ended text +- **`BinaryAnswerType`** — Yes/no questions. Best for forecasting ("Will X happen?") +- **`ContinuousAnswerType`** — Numeric answers ("What will the price be?") +- **`MultipleChoiceAnswerType`** — Fixed set of choices +- **`FreeResponseAnswerType`** — Open-ended text answers ## Question generators -- **ForwardLookingQuestionGenerator:** Forecasting questions from seeds (news, events). Instructions + answer_type. -- **TemplateQuestionGenerator:** Template-based generation. -- **QuestionAndLabelGenerator:** Generate questions and labels in one step (no separate labeler). +- **`ForwardLookingQuestionGenerator`** — Forecasting questions from news/events. 
Takes `instructions`, `answer_type`, optional `examples`/`bad_examples`, `questions_per_seed`, `filter_` (`FilterCriteria`) +- **`QuestionGenerator`** — General question generation from any seed content +- **`TemplateQuestionGenerator`** — Template-based generation with variable substitution +- **`QuestionAndLabelGenerator`** — Generates questions AND labels in one step. Use when ground truth is embedded in the seed (e.g. BigQuery rows with known outcomes). No separate labeler needed. -## Labeler +## Labelers -**WebSearchLabeler:** Finds answers via web search. Pass answer_type. Used for forecasting (future-as-label). +- **`WebSearchLabeler(answer_type)`** — Labels questions via web search. Use for forecasting where answers can be looked up +- **`FileSetRAGLabeler`** — Labels via RAG against a FileSet -## Typical pipeline (forecasting) +## Context generators (optional) + +- **`NewsContextGenerator(articles_per_query, num_search_queries, num_articles)`** — Adds recent news context to each question +- **`FileSetContextGenerator`** — Adds RAG context from a FileSet + +## QuestionPipeline structure ```python -answer_type = BinaryAnswerType() -question_generator = ForwardLookingQuestionGenerator( - instructions="Generate forward-looking questions about X.", - answer_type=answer_type, +from lightningrod import ( + QuestionPipeline, ForwardLookingQuestionGenerator, + WebSearchLabeler, BinaryAnswerType, NewsContextGenerator, ) -labeler = WebSearchLabeler(answer_type=answer_type) + +answer_type = BinaryAnswerType() pipeline = QuestionPipeline( seed_generator=seed_generator, - question_generator=question_generator, - labeler=labeler, + question_generator=ForwardLookingQuestionGenerator( + instructions="Generate forward-looking yes/no questions about X.", + answer_type=answer_type, + ), + labeler=WebSearchLabeler(answer_type=answer_type), + context_generators=[NewsContextGenerator(articles_per_query=3)], # optional ) ``` +## Cost estimation + +Always estimate before scaling 
up: + +```python +cost = lr.transforms.estimate_cost(pipeline, max_questions=1000) +print(cost) +``` + +## Run vs submit + +```python +# Blocking — good for notebooks and small runs +MAX_QUESTIONS = 10 # Increase for full run (e.g. 1000) +dataset = lr.transforms.run(pipeline, max_questions=MAX_QUESTIONS, name="my-dataset") + +# Non-blocking — for long runs +job = lr.transforms.submit(pipeline, max_questions=1000, name="my-dataset") +``` + ## Output ```python -dataset = lr.transforms.run(pipeline, max_questions=10) -rows = dataset.flattened(answer_type) +rows = dataset.flattened(answer_type) # list of dicts, ready for DataFrame +import pandas as pd +pd.DataFrame(rows) ``` -Rows are dicts ready for inspection or export. +Next step: pass `dataset` to `prepare_for_training` to filter, deduplicate, and split. diff --git a/.claude/skills/fine-tuning/SKILL.md b/.claude/skills/fine-tuning/SKILL.md new file mode 100644 index 0000000..f17d2eb --- /dev/null +++ b/.claude/skills/fine-tuning/SKILL.md @@ -0,0 +1,63 @@ +--- +name: fine-tuning +description: Fine-tuning and evaluation patterns for Lightningrod. Use when running training jobs, estimating training cost, or evaluating model performance. +--- + +# Fine-Tuning + +## TrainingConfig + +```python +from lightningrod import TrainingConfig + +config = TrainingConfig( + base_model="Qwen/Qwen3-4B-Instruct", # see available models below + training_steps=50, +) +``` + +Available base models (check `lr.training` for current list): `Qwen/Qwen3-4B-Instruct`, `Qwen/Qwen3-8B-Instruct`, `meta-llama/Llama-3.1-8B-Instruct`, and others. + +## Always estimate cost first + +```python +cost = lr.training.estimate_cost(config, dataset=train_ds) +print(cost) +``` + +## Run training + +```python +job = lr.training.run(config, dataset=train_ds, name="my-model-v1") +# Blocks until complete. job.model_id is available when done. 
+print(job.model_id) +``` + +## Run evaluation + +```python +eval_job = lr.evals.run( + model_id=job.model_id, + dataset=test_ds, + benchmark_model_id="openai/gpt-4o", # comparison baseline +) +``` + +## Iteration loop + +If eval scores are poor, identify the root cause before re-running: + +| Symptom | Likely cause | Action | +|---------|-------------|--------| +| Score barely above baseline | Not enough training data | Go back to dataset-generator: increase `max_questions`, broaden seed sources | +| Score worse than baseline | Data quality issue | Go back to dataset-generator: tighten question generator instructions, check `prepare_for_training` stats | +| Train/test distribution mismatch | Temporal split too aggressive | Adjust `SplitParams.test_start` or `test_size` | +| Overfitting (train >> test) | Too many steps or too little data | Reduce `training_steps` or get more data | + +Always pass specific guidance when flagging back to the dataset-generator (e.g. "need more temporal diversity across 6 months", "too few test samples — only 12 after split"). + +## Reference notebooks + +- `notebooks/getting_started/05_fine_tuning.ipynb` +- `notebooks/fine_tuning/02_trump_forecasting.ipynb` — full end-to-end example +- `notebooks/evaluation/` — evaluation patterns diff --git a/.claude/skills/lightningrod-workflow/SKILL.md b/.claude/skills/lightningrod-workflow/SKILL.md deleted file mode 100644 index 0b10645..0000000 --- a/.claude/skills/lightningrod-workflow/SKILL.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -name: lightningrod-workflow -description: Orchestration flow for Lightningrod dataset generation. Use when planning workflows, deciding when to backtrack, choosing domain-level vocabulary, structuring notebooks, enforcing minimal-output iteration, or routing data sources. ---- - -# Lightningrod Workflow - -## Flow - -1. User states high-level goal (e.g. "generate a political forecasting dataset") -2. Orchestrator creates plan in plain language -3. 
Seeds specialist → seed sourcing cells -4. Transform specialist → pipeline and run cells -5. Notebook uses minimal limits (max_questions=10) for demo - -## When to backtrack - -- User says "that's not what I meant" or "the questions are wrong" -- Pipeline fails or produces poor samples → consider seeds adjustment -- Identify which step caused the mismatch; re-invoke that specialist with clarified domain-level requirements - -## Domain-level vocabulary (orchestrator only) - -Use these terms with users and when delegating to specialists. Do not use SDK class names. - -| Domain term | SDK equivalent | -|-------------|----------------| -| news articles | NewsSeedGenerator | -| GDELT events | GdeltSeedGenerator | -| user's documents / file set | FileSetSeedGenerator, FileSetQuerySeedGenerator, files_to_samples | -| forecasting questions | ForwardLookingQuestionGenerator | -| template-based questions | TemplateQuestionGenerator | -| yes/no labels | BinaryAnswerType | -| numeric labels | ContinuousAnswerType | -| multiple choice | MultipleChoiceAnswerType | -| free-form text | FreeResponseAnswerType | -| web search for answers | WebSearchLabeler | - -## Data source routing - -| User situation | Action | -|----------------|--------| -| Has own documents | Delegate seeds specialist: "user's documents at path X" | -| Wants news / GDELT | Delegate seeds specialist: "news-based seeds, date range, topic" | -| Has domain, no data | Delegate seeds specialist: "explore public datasets for domain X" (Kaggle, Hugging Face, GitHub) | - -## Notebook structure - -1. Setup — pip install, load API key, LightningRod client -2. Seed sourcing — seed generator config -3. Pipeline — QuestionPipeline with generator, labeler, answer type -4. Run — lr.transforms.run(pipeline, max_questions=10) -5. 
Output — dataset.flattened(), sample inspection - -## Minimal-output iteration - -- Default max_questions=10 (or 5–20) for demo -- Restrict date ranges, search queries, file counts when exploring -- Scale up only when user confirms output looks right -- Use estimate_cost() before scaling; show cost implications diff --git a/.claude/skills/pipeline-patterns/SKILL.md b/.claude/skills/pipeline-patterns/SKILL.md deleted file mode 100644 index 3cff718..0000000 --- a/.claude/skills/pipeline-patterns/SKILL.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -name: pipeline-patterns -description: QuestionPipeline structure, cost estimation, minimal-output defaults. Use when configuring transforms. ---- - -# Pipeline Patterns - -## QuestionPipeline structure - -```python -pipeline = QuestionPipeline( - seed_generator=seed_generator, - question_generator=question_generator, - labeler=labeler, -) -``` - -Optional: context_generators, renderer, rollout_generator, scorer. - -## Cost estimation - -```python -cost = lr.transforms.estimate_cost(pipeline, max_questions=1000) -``` - -Show user cost before scaling. Use for planning full runs. - -## Run vs submit - -- `lr.transforms.run(pipeline, max_questions=10)` — blocks until complete, good for notebooks -- `lr.transforms.submit(...)` — returns job ID, poll separately; use for long runs or detach - -## Minimal-output defaults - -**Always use max_questions=10 (or 5–20) for demo cells.** Add a variable or comment for scaling: - -```python -MAX_QUESTIONS = 10 # Increase for full run (e.g. 1000) -dataset = lr.transforms.run(pipeline, max_questions=MAX_QUESTIONS) -``` - -Optional: max_cost_dollars to cap spend. diff --git a/.claude/skills/preprocessing/SKILL.md b/.claude/skills/preprocessing/SKILL.md deleted file mode 100644 index 66092be..0000000 --- a/.claude/skills/preprocessing/SKILL.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -name: preprocessing -description: Preprocessing patterns for converting files to Lightningrod samples. 
Use when working with files_to_samples, chunking, or metadata. ---- - -# Preprocessing - -## Converting files to samples - -```python -from lightningrod import preprocessing - -samples = preprocessing.files_to_samples( - "path/to/file.pdf", # or pattern: "data/*.txt" - chunk_size=1000, - chunk_overlap=100, -) -``` - -Single file: `preprocessing.file_to_samples(path)`. Chunks only: `preprocessing.chunks_to_samples(chunks, metadata=...)`. - -## Creating input dataset - -```python -input_dataset = lr.datasets.create_from_samples(samples, batch_size=1000) -``` - -Then use input_dataset.id as input_dataset_id when submitting a transform with FileSetSeedGenerator or similar. - -## Chunking - -Default chunk_size=1000, chunk_overlap=100. Uses langchain-text-splitters. Adjust for document type: smaller chunks for dense text, larger for narrative. - -## Metadata - -Pass metadata dict to chunks_to_samples for filtering or context. Metadata flows through to samples. diff --git a/.claude/skills/seeds-sourcing/SKILL.md b/.claude/skills/seeds-sourcing/SKILL.md index 8ac0830..786dd69 100644 --- a/.claude/skills/seeds-sourcing/SKILL.md +++ b/.claude/skills/seeds-sourcing/SKILL.md @@ -1,27 +1,47 @@ --- name: seeds-sourcing -description: Seed sourcing patterns for Lightningrod. Use when choosing between news, GDELT, FileSet, or preprocessing for seed generation. +description: Seed sourcing patterns for Lightningrod. Use when choosing between news, GDELT, or FileSet seed generators. --- # Seeds Sourcing ## Built-in seed generators -**News (NewsSeedGenerator):** News articles from a date range and search query. Best for forecasting, current events, time-sensitive topics. +**News (`NewsSeedGenerator`):** News articles from a date range and search query. Best for forecasting, current events, time-sensitive topics. 
```python -NewsSeedGenerator( +from lightningrod import NewsSeedGenerator +from datetime import datetime + +seed_generator = NewsSeedGenerator( start_date=datetime(2025, 1, 1), end_date=datetime(2025, 2, 1), - search_query="technology" # or list: ["tech", "AI"] + search_query="technology", # or list: ["tech", "AI"] + interval_duration_days=7, + articles_per_search=5, ) ``` -**GDELT (GdeltSeedGenerator):** GDELT event data. Best for event-based forecasting, geopolitical topics. +**GDELT (`GdeltSeedGenerator`):** GDELT global event database. Best for event-based forecasting and geopolitical topics. -**FileSet (FileSetSeedGenerator, FileSetQuerySeedGenerator):** Documents uploaded to Lightningrod. Use when user has PDFs, text files, CSVs. Create via lr.filesets, then reference by ID. +```python +from lightningrod import GdeltSeedGenerator -**Preprocessing (files_to_samples):** Local files chunked into samples, then lr.datasets.create_from_samples(). Use for user's own documents without FileSet. +seed_generator = GdeltSeedGenerator( + start_date=datetime(2025, 1, 1), + end_date=datetime(2025, 2, 1), + interval_duration_days=7, + articles_per_interval=10, +) +``` + +**FileSet (`FileSetSeedGenerator`, `FileSetQuerySeedGenerator`):** Documents uploaded to Lightningrod. Use when the user has PDFs, text files, or CSVs already in a FileSet. 
+ +```python +from lightningrod import FileSetSeedGenerator + +seed_generator = FileSetSeedGenerator(file_set_id="fs_abc123") +``` ## When to use which @@ -29,9 +49,8 @@ NewsSeedGenerator( |--------|----------| | News | Forecasting from current events, news-driven questions | | GDELT | Event-centric, geopolitical forecasting | -| FileSet | User has documents to upload; want to query/filter | -| files_to_samples | User has local files; simple chunk-and-upload | +| FileSet | User has documents in Lightningrod; want to query/chunk them | ## Iteration constraints -For demo/iteration: short date ranges (7 days not 90), narrow search queries, few files. Scale up only when user confirms. +For demo/iteration: short date ranges (7 days not 90), narrow search queries, few files. Scale up only when user confirms output looks right. diff --git a/.claude/skills/training-preparation/SKILL.md b/.claude/skills/training-preparation/SKILL.md new file mode 100644 index 0000000..e9788c1 --- /dev/null +++ b/.claude/skills/training-preparation/SKILL.md @@ -0,0 +1,69 @@ +--- +name: training-preparation +description: Training data preparation patterns for Lightningrod. Use when running prepare_for_training, configuring FilterParams/DedupParams/SplitParams, or handling validation errors. +--- + +# Training Preparation + +## prepare_for_training + +```python +from lightningrod import prepare_for_training, FilterParams, DedupParams, SplitParams + +train_ds, test_ds = prepare_for_training( + dataset, + filter=FilterParams( + days_to_resolution_range=(1, 60), # keep questions resolving within this window + drop_missing_context=False, + ), + dedup=DedupParams( + key_fn=None, # default key: (question_text, resolution_date) + ), + split=SplitParams( + strategy="temporal", # "temporal" or "random" + test_size=0.2, + test_start=None, # explicit cutoff date (optional) + leakage_keys=None, + filter_leaky_train=True, + ), + verbose=True, +) +``` + +Returns `(train_SampleDataset, test_SampleDataset)`. 
In notebooks displays a rich validation table. + +## Common FilterParams adjustments + +| Problem | Fix | +|---------|-----| +| Too few samples after filter | Widen `days_to_resolution_range`, e.g. `(1, 90)` | +| Questions without context | Set `drop_missing_context=False` or regenerate with context | +| Want only resolved questions | Default behavior — unresolved are filtered automatically | + +## Validation errors + +`prepare_for_training` raises `ValueError` with actionable tips when the dataset is unhealthy: + +- **Too few samples** → re-run transforms with more `max_questions`, or widen filter range +- **High dedup rate** → seeds are too repetitive; use more diverse seed sources or date ranges +- **High invalid rate** → question quality is poor; tighten question generator instructions +- **Temporal leakage** → test questions overlap with train date range; adjust `test_start` or use `strategy="temporal"` + +## Iteration loop + +``` +prepare_for_training fails or produces poor split + → check error message for specific cause + → if filter issue: adjust FilterParams and retry + → if volume issue: go back to dataset-generator, re-run with more max_questions + → if quality issue: go back to dataset-generator, tighten pipeline instructions +``` + +## Inspecting the split + +```python +import pandas as pd +from lightningrod.training import to_record + +pd.DataFrame([to_record(s) for s in train_ds.samples]) +``` From f7848c73550c1f65881eeed205343b922441a332 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Fri, 20 Mar 2026 09:44:23 +0100 Subject: [PATCH 03/11] add scout/exploration mode --- .claude/agents/bigquery-seeds-specialist.md | 35 +++++++++++++----- .../agents/public-dataset-seeds-specialist.md | 36 +++++++++++++------ .claude/agents/workflow-orchestrator.md | 18 ++++++++-- 3 files changed, 68 insertions(+), 21 deletions(-) diff --git a/.claude/agents/bigquery-seeds-specialist.md b/.claude/agents/bigquery-seeds-specialist.md index dd7c58d..e4fc0f3 100644 
--- a/.claude/agents/bigquery-seeds-specialist.md +++ b/.claude/agents/bigquery-seeds-specialist.md @@ -7,18 +7,37 @@ skills: - bigquery-seeds --- -You are the BigQuery seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and translate them into BigQuery seed sourcing config and notebook cells. +You are the BigQuery seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and operate in one of two modes. -## Approach +## Mode 1: Explore (scout and report) -1. Identify the right BigQuery dataset and table for the user's domain (use public datasets when possible) -2. Inspect the schema to find seed text and date columns -3. Write a SQL query that extracts seeds — embed any pre-computed label values in the seed text so `QuestionAndLabelGenerator` can extract them -4. Configure `BigQuerySeedGenerator` and write notebook cells +When the orchestrator asks you to assess whether BigQuery is a good fit, **do not write notebook cells yet**. Instead: -## Output +1. Identify candidate BigQuery public datasets for the user's domain +2. Inspect schemas and preview a few rows to assess data quality, text richness, and date coverage +3. Return a structured finding to the orchestrator: + - Which dataset/table is the best candidate and why + - What columns would serve as seed text and date + - Whether ground-truth labels are available in the data + - Any caveats (sparse dates, low text quality, limited rows) -Contribute `BigQuerySeedGenerator` config and schema-inspection cells to the shared Jupyter notebook. Start with `max_rows=100` for iteration; scale up when confirmed. +## Mode 2: Implement (write notebook cells) + +Once the orchestrator has committed to BigQuery as the source: + +1. Write the schema-inspection SQL cells +2. Craft the seed query — embed any pre-computed label values in the seed text so `QuestionAndLabelGenerator` can extract them +3. 
Configure `BigQuerySeedGenerator` and write notebook cells +4. Start with `max_rows=100` for iteration; scale up when confirmed + +## SDK surface + +- `BigQuerySeedGenerator(query, seed_text_column, date_column, max_rows)` +- `QuestionAndLabelGenerator` (typically paired — no separate labeler needed when ground truth is in the seed) + +## Reference notebooks + +- `notebooks/getting_started/03_bigquery_datasource.ipynb` ## SDK surface diff --git a/.claude/agents/public-dataset-seeds-specialist.md b/.claude/agents/public-dataset-seeds-specialist.md index 61954e1..fb3d9a5 100644 --- a/.claude/agents/public-dataset-seeds-specialist.md +++ b/.claude/agents/public-dataset-seeds-specialist.md @@ -8,23 +8,39 @@ skills: - custom-dataset-seeds --- -You are the public dataset seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and find raw public datasets that can be converted into seeds. +You are the public dataset seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and operate in one of two modes. -## Input +## Mode 1: Explore (scout and report) -Instructions like "find public datasets for domain X" or "explore HuggingFace for raw sports data". - -## Approach +When the orchestrator asks you to assess whether a public dataset exists for a domain, **do not write notebook cells yet**. Instead: 1. Search Kaggle, HuggingFace, and GitHub for raw datasets relevant to the user's domain 2. Prefer raw or semi-structured data (articles, reports, event logs, tables) — not already-labeled training sets -3. Download a small subset first to validate before full ingestion -4. Convert to seeds via `files_to_samples` or `lr.datasets.create_from_samples` -5. Write notebook cells for download, conversion, and dataset creation +3. Return a structured finding to the orchestrator: + - Top 1–3 candidate datasets with name, source, and URL + - Format (CSV, JSON, text files, etc.) 
and approximate size + - Whether dates are present and what the date range looks like + - Text quality assessment (prose vs. structured vs. garbled) + - Any caveats (license restrictions, requires account, large download) + +## Mode 2: Implement (write notebook cells) -## Output +Once the orchestrator has committed to a specific public dataset: -Contribute download + ingestion notebook cells to the shared Jupyter notebook. Always start with a small subset (e.g. first 10 files or 100 rows) before full ingestion. +1. Download a small subset first (e.g. first 10 files or 100 rows) to validate +2. Convert to seeds via `files_to_samples` or `lr.datasets.create_from_samples` +3. Write notebook cells for download, conversion, and dataset creation + +## SDK surface + +- `files_to_samples()`, `file_to_samples()`, `chunks_to_samples()` +- `lr.datasets.create_from_samples()` +- `lr.filesets.create()`, `lr.filesets.files.upload()` + +## Reference notebooks + +- `notebooks/getting_started/02_custom_documents_datasource.ipynb` — file-to-seeds pattern +- `notebooks/00_quickstart.ipynb` — minimal end-to-end example ## SDK surface diff --git a/.claude/agents/workflow-orchestrator.md b/.claude/agents/workflow-orchestrator.md index ced0705..a6ed803 100644 --- a/.claude/agents/workflow-orchestrator.md +++ b/.claude/agents/workflow-orchestrator.md @@ -33,12 +33,24 @@ You are the orchestrator for Lightningrod dataset generation and fine-tuning. Yo ## Data source routing +Some sources are obvious from context; others require exploration before committing. 
+ +**Clear sources — delegate directly to implement:** + | User situation | Delegate to | |----------------|-------------| -| Wants news articles or GDELT events or has a forecasting use-case | `news-seeds-specialist` | -| Has a domain but no data (needs exploration) | `public-dataset-seeds-specialist` (explore Kaggle, HuggingFace, GitHub) | -| Has a BigQuery table or wants BigQuery public data | `bigquery-seeds-specialist` | +| Wants news articles, GDELT, or has a forecasting use-case | `news-seeds-specialist` | | Has their own files, CSVs, or documents | `private-dataset-seeds-specialist` | +| Explicitly requests a specific BigQuery table | `bigquery-seeds-specialist` | + +**Ambiguous sources — explore in parallel first:** + +When the user has a domain but no clear data source (e.g. "I want to build a sports forecasting dataset"), **do not commit to a source yet**. Instead: + +1. Delegate to `public-dataset-seeds-specialist` AND `bigquery-seeds-specialist` simultaneously, both in **explore mode** ("scout and report — do not write notebook cells") +2. Collect their findings (candidate datasets, schema previews, data quality, caveats) +3. Synthesize and present a recommendation to the user with trade-offs +4. 
Once the user (or you) decides, re-invoke the winning specialist in **implement mode** ## Domain vocabulary From 16e71fb8ea4dec0296def00acb46dfdf9d5594ad Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Fri, 20 Mar 2026 14:12:19 +0100 Subject: [PATCH 04/11] update bigquery auth info, add prediction-framing skill with a worked example --- .claude/agents/dataset-generator.md | 12 ++-- .claude/agents/fine-tuner.md | 1 + .claude/skills/bigquery-seeds/SKILL.md | 4 +- .claude/skills/dataset-generation/SKILL.md | 4 +- .claude/skills/prediction-framing/SKILL.md | 72 ++++++++++++++++++++++ 5 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 .claude/skills/prediction-framing/SKILL.md diff --git a/.claude/agents/dataset-generator.md b/.claude/agents/dataset-generator.md index b939dcc..fbb1617 100644 --- a/.claude/agents/dataset-generator.md +++ b/.claude/agents/dataset-generator.md @@ -5,6 +5,7 @@ tools: Read, Grep, Glob, Edit, Bash model: sonnet skills: - dataset-generation + - prediction-framing - training-preparation --- @@ -12,11 +13,12 @@ You are the dataset generator for Lightningrod. You receive seeds (from a seed s ## Approach -1. Configure a `QuestionPipeline`: choose question generator, answer type, labeler, and optional context generators based on the domain -2. Run with minimal limits first (`MAX_QUESTIONS = 10`) and inspect output with the user -3. Scale up when output looks right -4. Run `prepare_for_training` to filter, deduplicate, and split into train/test sets -5. If validation fails (too few samples, high dedup rate, leakage), adjust pipeline config or filters and iterate +1. **Recommend an answer type** based on the domain and what will train best — do not present a neutral menu. Default to binary for forecasting. If the user's instinct is numeric, explain trade-offs and suggest either a binary reframing ("Will X exceed threshold T?") or normalization strategy. See the dataset-generation skill for ML guidance. +2. 
Configure a `QuestionPipeline`: choose question generator, answer type, labeler, and optional context generators based on the domain +3. Run with minimal limits first (`MAX_QUESTIONS = 10`) and inspect output with the user +4. Scale up when output looks right +5. Run `prepare_for_training` to filter, deduplicate, and split into train/test sets +6. If validation fails (too few samples, high dedup rate, leakage), adjust pipeline config or filters and iterate ## Output diff --git a/.claude/agents/fine-tuner.md b/.claude/agents/fine-tuner.md index 1209bfb..3ab055f 100644 --- a/.claude/agents/fine-tuner.md +++ b/.claude/agents/fine-tuner.md @@ -5,6 +5,7 @@ tools: Read, Grep, Glob, Edit, Bash model: sonnet skills: - fine-tuning + - prediction-framing - training-preparation --- diff --git a/.claude/skills/bigquery-seeds/SKILL.md b/.claude/skills/bigquery-seeds/SKILL.md index f8ab6ce..b3ee72d 100644 --- a/.claude/skills/bigquery-seeds/SKILL.md +++ b/.claude/skills/bigquery-seeds/SKILL.md @@ -18,7 +18,9 @@ seed_generator = BigQuerySeedGenerator( ) ``` -Credentials: set `GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json` in environment before running. +**No GCP account or credentials required.** Lightningrod manages BigQuery access and billing internally. The user does not need to set up a Google Cloud project or provide any credentials. + +**Only public BigQuery datasets are supported** (i.e. `bigquery-public-data.*`). Private or user-owned BigQuery tables cannot be queried. ## Key open BigQuery public datasets diff --git a/.claude/skills/dataset-generation/SKILL.md b/.claude/skills/dataset-generation/SKILL.md index d56b00b..f8691b3 100644 --- a/.claude/skills/dataset-generation/SKILL.md +++ b/.claude/skills/dataset-generation/SKILL.md @@ -7,11 +7,13 @@ description: Dataset generation pipeline patterns for Lightningrod. Use when con ## Answer types -- **`BinaryAnswerType`** — Yes/no questions. 
Best for forecasting ("Will X happen?") +- **`BinaryAnswerType`** — Yes/no questions ("Will X happen?") - **`ContinuousAnswerType`** — Numeric answers ("What will the price be?") - **`MultipleChoiceAnswerType`** — Fixed set of choices - **`FreeResponseAnswerType`** — Open-ended text answers +For guidance on which answer type to recommend and how each affects fine-tuning performance, see the `prediction-framing` skill. + ## Question generators - **`ForwardLookingQuestionGenerator`** — Forecasting questions from news/events. Takes `instructions`, `answer_type`, optional `examples`/`bad_examples`, `questions_per_seed`, `filter_` (`FilterCriteria`) diff --git a/.claude/skills/prediction-framing/SKILL.md b/.claude/skills/prediction-framing/SKILL.md new file mode 100644 index 0000000..075374d --- /dev/null +++ b/.claude/skills/prediction-framing/SKILL.md @@ -0,0 +1,72 @@ +--- +name: prediction-framing +description: How prediction question format and answer type choices affect fine-tuning performance. Use when recommending answer types, deciding whether to normalize numeric outputs, or diagnosing poor training results caused by answer type mismatch. +--- + +# Prediction Framing + +How you frame a prediction question determines the quality of the training signal. Users often gravitate toward numeric or multiple choice because it feels more expressive — but that usually hurts training. Always recommend based on what will train best, not just what fits the question surface. + +## Answer type decision guide + +### Binary — default for forecasting +"Will X happen before date Y?" — yes/no. 
+ +**Use this unless there's a specific reason not to.** Binary gives: +- Cleanest training signal — unambiguous 0/1 label +- Highest labeling reliability via web search +- Best calibration properties for GRPO/RL fine-tuning +- Highest data yield (more labelable questions per seed) + +When a user's goal seems numeric ("predict the star count"), try reframing as binary first: *"Will the repo exceed 1000 stars within 7 days?"* — this almost always trains better. + +### Multiple choice — when outcomes are naturally discrete +"Which range will X fall into? A) <100 B) 100–500 C) 500–2000 D) 2000+" + +Use when the outcome space has meaningful natural categories. But: +- **Equal-frequency buckets** (e.g. quartiles from historical data), not equal-width — avoids class imbalance, gives the model an even training signal +- Cap at 4 choices; more options increases labeling noise and model confusion +- If binary can express the same decision, prefer binary + +### Numeric — only when relative magnitude matters; always normalize +"Predict the exact star count 7 days post-launch." + +High-variance training signal. Only use when the magnitude itself is the thing being learned. Always normalize: + +| Distribution shape | Normalization | Example | +|-------------------|---------------|---------| +| Power-law / long tail | Log-transform: `log(1 + x)` | Star counts, view counts, revenue, prices | +| Relative comparison | Percentile rank within peer group | Rank vs. similar repos launched same week | +| Naturally bounded range | Min-max scaling to [0, 1] | Percentage, ratio, score out of 100 | + +Raw integers are almost always a mistake — the model has no way to know if 1000 vs. 1001 is meaningful. + +### Free response — rarely suitable for fine-tuning +Open-ended text answers. Hard to label consistently; high variance in training signal. Reserve for evaluation/benchmarking, not training data generation. 
+ +## Worked example: "predict GitHub star growth from an HN launch" + +This is a common pattern that illustrates all the pitfalls: + +**❌ Total stars** — wrong quantity entirely. Conflates "repo was already popular before the post" with "grew because of HN". Never use absolute follower/star counts as a prediction target. + +**⚠️ Stars gained in 7 days (raw numeric)** — right quantity, wrong format. Power-law distributed: a few posts drive thousands of stars, most drive tens. Raw regression is badly calibrated and hard to label reliably. + +**✓ log(1 + stars_gained_7d) (normalized numeric)** — better. Tames the long tail. But you still have a regression problem and labeling noise. Use only if you specifically need the magnitude. + +**✓✓ Binary** — simplest good option. Pick a meaningful threshold (e.g. median star growth for HN posts, ~100 stars in 7 days) and frame as: *"Will this HN post drive 100+ GitHub stars within 7 days?"* Clean 0/1 signal, easy to label, trains well. + +**✓✓ Percentile-bucketed multiple choice** — best option for nuance without regression. Rank each post's star growth against other HN posts in the same time window, split into equal-frequency quartiles (bottom 25% / 25–50% / 50–75% / top 25%). Fully handles the power-law, avoids regression, gives clean classification signal. + +The general pattern: **always predict growth over a defined window relative to the event, never absolute totals. 
Then prefer binary or equal-frequency multiple choice over raw numeric.** + +## Diagnosing answer type problems after training + +If eval scores are poor, check whether the answer type was a contributing factor: + +| Symptom | Likely framing issue | Fix | +|---------|---------------------|-----| +| Model predicts same answer for everything | Class imbalance in multiple choice | Switch to equal-frequency buckets or binary | +| Numeric predictions are wildly off scale | No normalization applied | Apply log-transform or percentile normalization | +| Low labeling confidence in dataset stats | Answer type too hard for web search to resolve | Simplify to binary or reframe the question | +| Model barely beats baseline despite good data volume | Noisy labels from numeric/free-response | Reframe as binary threshold question | From 438f3db8e13b65ddb60e3e3631c625ed153527a2 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Fri, 20 Mar 2026 14:27:27 +0100 Subject: [PATCH 05/11] update known queryable bigquery datasets --- .claude/skills/bigquery-seeds/SKILL.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.claude/skills/bigquery-seeds/SKILL.md b/.claude/skills/bigquery-seeds/SKILL.md index b3ee72d..4f8a586 100644 --- a/.claude/skills/bigquery-seeds/SKILL.md +++ b/.claude/skills/bigquery-seeds/SKILL.md @@ -20,17 +20,22 @@ seed_generator = BigQuerySeedGenerator( **No GCP account or credentials required.** Lightningrod manages BigQuery access and billing internally. The user does not need to set up a Google Cloud project or provide any credentials. -**Only public BigQuery datasets are supported** (i.e. `bigquery-public-data.*`). Private or user-owned BigQuery tables cannot be queried. +**Supported datasets: any publicly queryable BigQuery dataset.** Because Lightningrod uses its own GCP project credentials under the hood, any dataset that is open to any GCP project without requiring explicit IAM access grants will work. 
This includes `bigquery-public-data.*` but also community-hosted public datasets like `githubarchive.*`. Private or user-owned BigQuery tables (those requiring a specific account to be granted access) are not supported. -## Key open BigQuery public datasets +**If unsure whether a dataset is queryable**, try a schema inspection query first — if it returns results without an access error, it works. + +## Known queryable datasets | Dataset | Description | Useful tables | |---------|-------------|---------------| | `bigquery-public-data.hacker_news` | HN posts and comments | `full`, `stories` | -| `bigquery-public-data.github_repos` | GitHub commits and file contents | `commits`, `contents` | +| `bigquery-public-data.github_repos` | GitHub commit metadata and file contents | `commits`, `contents` | | `bigquery-public-data.gdelt_samples` | GDELT news events | `full` | | `bigquery-public-data.stackoverflow` | SO questions and answers | `posts_questions`, `posts_answers` | | `bigquery-public-data.wikipedia` | Wikipedia article text | `articles` | +| `githubarchive.*` | GitHub event stream by year/month/day (stars, forks, PRs, issues) — see [gharchive.org](https://www.gharchive.org/#bigquery) | `githubarchive.year.*`, `githubarchive.month.*`, `githubarchive.day.*` | + +Other community-hosted public datasets likely work too — verify with a schema inspection query before committing to them. ## Schema inspection From ae4d2be5d8a88bfabe3f77cba9f79aca1640c72f Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Fri, 20 Mar 2026 14:37:39 +0100 Subject: [PATCH 06/11] enforce evals --- .claude/agents/fine-tuner.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.claude/agents/fine-tuner.md b/.claude/agents/fine-tuner.md index 3ab055f..53a3d0d 100644 --- a/.claude/agents/fine-tuner.md +++ b/.claude/agents/fine-tuner.md @@ -23,7 +23,17 @@ You are the fine-tuner for Lightningrod. 
You take prepared train/test datasets a ## Output -Contribute training config, run cells, and eval cells to the shared Jupyter notebook. Always estimate cost before running training. +Always produce **both** a training cell and an eval cell — never one without the other. A notebook with training but no eval is incomplete. The eval cell must follow immediately after training and use `job.model_id` from the training result: + +```python +eval_job = lr.evals.run( + model_id=job.model_id, + dataset=test_ds, + benchmark_model_id="openai/gpt-4o", +) +``` + +Always estimate cost before running training. ## SDK surface From ce266ef43a2009ae67cf62fe1e6b4a566d95b7f9 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Fri, 20 Mar 2026 17:23:53 +0100 Subject: [PATCH 07/11] move to plain python file based workflow with shared state --- .claude/agents/bigquery-seeds-specialist.md | 10 +- .claude/agents/dataset-generator.md | 10 +- .claude/agents/fine-tuner.md | 26 ++-- .claude/agents/news-seeds-specialist.md | 6 +- .../private-dataset-seeds-specialist.md | 6 +- .../agents/public-dataset-seeds-specialist.md | 11 +- .claude/agents/workflow-orchestrator.md | 57 +++++--- .claude/skills/workflow-architecture/SKILL.md | 134 ++++++++++++++++++ 8 files changed, 209 insertions(+), 51 deletions(-) create mode 100644 .claude/skills/workflow-architecture/SKILL.md diff --git a/.claude/agents/bigquery-seeds-specialist.md b/.claude/agents/bigquery-seeds-specialist.md index e4fc0f3..892cdf7 100644 --- a/.claude/agents/bigquery-seeds-specialist.md +++ b/.claude/agents/bigquery-seeds-specialist.md @@ -21,14 +21,16 @@ When the orchestrator asks you to assess whether BigQuery is a good fit, **do no - Whether ground-truth labels are available in the data - Any caveats (sparse dates, low text quality, limited rows) -## Mode 2: Implement (write notebook cells) +## Mode 2: Implement (write seeds.py) Once the orchestrator has committed to BigQuery as the source: -1. 
Write the schema-inspection SQL cells +1. Write `seeds.py` containing schema-inspection code, the seed SQL query, and `BigQuerySeedGenerator` config 2. Craft the seed query — embed any pre-computed label values in the seed text so `QuestionAndLabelGenerator` can extract them -3. Configure `BigQuerySeedGenerator` and write notebook cells -4. Start with `max_rows=100` for iteration; scale up when confirmed +3. Start with `max_rows=100` for iteration; scale up when confirmed +4. Write `input_dataset_id` to `state.json` if applicable (BigQuery seeds run inline via the generator, so this may be `null`) + +See the `workflow-architecture` skill for the `state.json` contract. ## SDK surface diff --git a/.claude/agents/dataset-generator.md b/.claude/agents/dataset-generator.md index fbb1617..684b588 100644 --- a/.claude/agents/dataset-generator.md +++ b/.claude/agents/dataset-generator.md @@ -7,6 +7,7 @@ skills: - dataset-generation - prediction-framing - training-preparation + - workflow-architecture --- You are the dataset generator for Lightningrod. You receive seeds (from a seed specialist or an existing dataset) and turn them into a labeled training dataset using the transforms API, then prepare it for fine-tuning. @@ -22,7 +23,14 @@ You are the dataset generator for Lightningrod. You receive seeds (from a seed s ## Output -Contribute pipeline config, run cells, and training prep cells to the shared Jupyter notebook. Always use `MAX_QUESTIONS = 10` for demo runs; add a comment for scaling. +Write two files: + +- **`prepare.py`** — defines `get_datasets(dataset_id) -> (train_ds, test_ds)` with the `prepare_for_training` call and all filter/split config. This is the single source of truth for the train/test split. When split params need adjusting, only this file changes. +- **`dataset.py`** — pipeline config and transforms run. Imports `get_datasets` from `prepare.py` to validate the split is healthy before finishing. Writes `dataset_id` to `state.json`. 
+ +Always use `MAX_QUESTIONS = 10` for demo runs with a clearly commented variable for scaling. Do not write `train_dataset_id` or `test_dataset_id` to `state.json` — those are not stored resources. + +If the pipeline needs changes (more data, different config), modify `dataset.py` and rerun — do not create a new file. See the `workflow-architecture` skill for the `state.json` contract and back-propagation rules. ## SDK surface diff --git a/.claude/agents/fine-tuner.md b/.claude/agents/fine-tuner.md index 53a3d0d..3c3729b 100644 --- a/.claude/agents/fine-tuner.md +++ b/.claude/agents/fine-tuner.md @@ -7,33 +7,29 @@ skills: - fine-tuning - prediction-framing - training-preparation + - workflow-architecture --- You are the fine-tuner for Lightningrod. You take prepared train/test datasets and run training and evaluation jobs, iterating to improve results. ## Approach -1. Validate that `train_ds` and `test_ds` are ready (run `prepare_for_training` if not already done) +1. Read `dataset_id` and `model_id` (if set) from `state.json` 2. Estimate training cost before running -3. Run training with `lr.training.run(config, dataset=train_ds)` -4. Run evals with `lr.evals.run(model_id=..., dataset=test_ds, benchmark_model_id=...)` -5. Interpret results: if eval scores are poor, identify whether the issue is data quality or training config -6. If data quality: flag back to the dataset-generator with specific guidance (e.g. "need more temporal diversity", "binary questions are too easy", "too few test samples") -7. If training config: adjust `TrainingConfig` (steps, base model) and re-run +3. Write `train.py`: imports `get_datasets` from `prepare.py`; calls `train_ds, _ = get_datasets(dataset_id)`; runs `lr.training.run(...)`; writes `model_id` to `state.json` +4. Write `eval.py`: imports `get_datasets` from `prepare.py`; calls `_, test_ds = get_datasets(dataset_id)`; reads `model_id` from `state.json`; runs `lr.evals.run(...)`; prints results +5. 
Run `train.py` first, then `eval.py` +6. Interpret eval results: if scores are poor, identify whether the issue is data quality or training config +7. If data quality: report specific issues to the orchestrator (e.g. "need more temporal diversity", "binary accuracy near 100% — questions too easy", "only 12 test samples after split") — do not touch `seeds.py` or `dataset.py` +8. If training config: adjust `TrainingConfig` in `train.py` and rerun ## Output -Always produce **both** a training cell and an eval cell — never one without the other. A notebook with training but no eval is incomplete. The eval cell must follow immediately after training and use `job.model_id` from the training result: +Always produce **both** `train.py` and `eval.py` — never one without the other. They are separate files so eval can be rerun freely without triggering a new training job. -```python -eval_job = lr.evals.run( - model_id=job.model_id, - dataset=test_ds, - benchmark_model_id="openai/gpt-4o", -) -``` +`train.py` must write `model_id` to `state.json`. `eval.py` must read `model_id` from `state.json` — never hardcode it. Always estimate cost before running training. -Always estimate cost before running training. +See the `workflow-architecture` skill for the `state.json` contract and back-propagation rules. ## SDK surface diff --git a/.claude/agents/news-seeds-specialist.md b/.claude/agents/news-seeds-specialist.md index fe0dae9..204fdea 100644 --- a/.claude/agents/news-seeds-specialist.md +++ b/.claude/agents/news-seeds-specialist.md @@ -7,7 +7,7 @@ skills: - seeds-sourcing --- -You are the news seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and configure built-in news and event seed generators for notebook cells. +You are the news seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and configure built-in news and event seed generators. 
## Input @@ -18,7 +18,9 @@ Instructions like: ## Output -Contribute `NewsSeedGenerator` or `GdeltSeedGenerator` config and related notebook cells to the shared Jupyter notebook. Use constrained configs for iteration (7-day windows, narrow queries) unless the user requests a full run. +Write `seeds.py` containing the `NewsSeedGenerator` or `GdeltSeedGenerator` config. For news/GDELT, no ingestion step is needed — the seed generator runs inline during dataset generation, so `seeds.py` defines and validates the config and writes `null` for `input_dataset_id` in `state.json`. + +Use constrained configs for iteration (7-day windows, narrow queries) unless the user requests a full run. See the `workflow-architecture` skill for the `state.json` contract. ## Choosing between News and GDELT diff --git a/.claude/agents/private-dataset-seeds-specialist.md b/.claude/agents/private-dataset-seeds-specialist.md index 83ad4e1..8f76a01 100644 --- a/.claude/agents/private-dataset-seeds-specialist.md +++ b/.claude/agents/private-dataset-seeds-specialist.md @@ -15,11 +15,13 @@ You are the private dataset seeds specialist for Lightningrod. You receive domai 1. Inspect the user's data: check format (CSV, PDF, text), row/file count, text quality, date coverage 2. Assess fitness: is there enough raw material for dataset generation? Flag issues early (too few rows, no dates, poor text quality) 3. Choose the right ingestion path: `files_to_samples` for local files, FileSet API for uploads -4. Write notebook cells for ingestion, chunking, and dataset creation +4. Write `seeds.py` containing ingestion, chunking, and dataset creation code ## Output -Contribute ingestion code and fitness assessment notes to the shared Jupyter notebook. Use small subsets first (e.g. first 50 rows of a CSV, 5 files) to validate before full ingestion. +Write `seeds.py` with ingestion code and inline fitness assessment checks (assert row count, spot-check text quality). Use small subsets first (e.g. 
first 50 rows of a CSV, 5 files) to validate before full ingestion. Write `input_dataset_id` to `state.json` after the dataset is created. + +See the `workflow-architecture` skill for the `state.json` contract. ## SDK surface diff --git a/.claude/agents/public-dataset-seeds-specialist.md b/.claude/agents/public-dataset-seeds-specialist.md index fb3d9a5..a61b200 100644 --- a/.claude/agents/public-dataset-seeds-specialist.md +++ b/.claude/agents/public-dataset-seeds-specialist.md @@ -23,13 +23,16 @@ When the orchestrator asks you to assess whether a public dataset exists for a d - Text quality assessment (prose vs. structured vs. garbled) - Any caveats (license restrictions, requires account, large download) -## Mode 2: Implement (write notebook cells) +## Mode 2: Implement (write seeds.py) Once the orchestrator has committed to a specific public dataset: -1. Download a small subset first (e.g. first 10 files or 100 rows) to validate -2. Convert to seeds via `files_to_samples` or `lr.datasets.create_from_samples` -3. Write notebook cells for download, conversion, and dataset creation +1. Write `seeds.py` containing download, conversion, and dataset creation code +2. Download a small subset first (e.g. first 10 files or 100 rows) to validate before full ingestion +3. Convert to seeds via `files_to_samples` or `lr.datasets.create_from_samples` +4. Write `input_dataset_id` to `state.json` after the dataset is created + +See the `workflow-architecture` skill for the `state.json` contract. ## SDK surface diff --git a/.claude/agents/workflow-orchestrator.md b/.claude/agents/workflow-orchestrator.md index a6ed803..f01fcf7 100644 --- a/.claude/agents/workflow-orchestrator.md +++ b/.claude/agents/workflow-orchestrator.md @@ -3,9 +3,11 @@ name: workflow-orchestrator description: Plans and orchestrates dataset generation and fine-tuning workflows end-to-end. 
Use when the user wants to generate a training dataset, fine-tune a model, or go from a high-level problem to a working solution using Lightningrod. tools: Task(news-seeds-specialist, public-dataset-seeds-specialist, bigquery-seeds-specialist, private-dataset-seeds-specialist, dataset-generator, fine-tuner), Read, Grep, Glob, Edit, Bash model: sonnet +skills: + - workflow-architecture --- -You are the orchestrator for Lightningrod dataset generation and fine-tuning. You plan from high-level user requirements, delegate to specialists, and coordinate a Jupyter notebook that covers the full pipeline: seed sourcing → dataset generation → training preparation → fine-tuning → evaluation. +You are the orchestrator for Lightningrod dataset generation and fine-tuning. You plan from high-level user requirements, delegate to specialists, and coordinate a set of Python files covering the full pipeline: seed sourcing → dataset generation → training preparation → fine-tuning → evaluation. ## Operating principles @@ -24,12 +26,11 @@ You are the orchestrator for Lightningrod dataset generation and fine-tuning. Yo 1. Receive user's high-level goals 2. Ask clarifying questions if ambiguous (in plain language) 3. Create a plan; present it without jargon -4. Initialize or coordinate the Jupyter notebook skeleton -5. Delegate to the appropriate seeds specialist (see routing below) -6. Delegate to dataset-generator (pipeline config + training prep) -7. If fine-tuning is requested: delegate to fine-tuner -8. If fine-tuner reports poor results: coordinate with dataset-generator to improve the dataset -9. If user feedback indicates mismatch at any step: re-invoke the appropriate specialist with updated requirements +4. Delegate to the appropriate seeds specialist → produces `seeds.py` +5. Delegate to dataset-generator → produces `dataset.py` +6. If fine-tuning is requested: delegate to fine-tuner → produces `train.py` + `eval.py` +7. 
If fine-tuner reports poor results: identify root cause, coordinate back-propagation (see below) +8. If user feedback indicates mismatch at any step: re-invoke the appropriate specialist with updated requirements ## Data source routing @@ -47,10 +48,10 @@ Some sources are obvious from context; others require exploration before committ When the user has a domain but no clear data source (e.g. "I want to build a sports forecasting dataset"), **do not commit to a source yet**. Instead: -1. Delegate to `public-dataset-seeds-specialist` AND `bigquery-seeds-specialist` simultaneously, both in **explore mode** ("scout and report — do not write notebook cells") +1. Delegate to `public-dataset-seeds-specialist` AND `bigquery-seeds-specialist` simultaneously, both in **explore mode** ("scout and report — do not write any files") 2. Collect their findings (candidate datasets, schema previews, data quality, caveats) 3. Synthesize and present a recommendation to the user with trade-offs -4. Once the user (or you) decides, re-invoke the winning specialist in **implement mode** +4. Once the user (or you) decides, re-invoke the winning specialist in **implement mode** to write `seeds.py` ## Domain vocabulary @@ -73,25 +74,35 @@ Use these terms with users and when delegating. Do not expose SDK class names. | fine-tuning | lr.training.run | | evaluation | lr.evals.run | -## Notebook structure +## Project structure -All work produces a single Jupyter notebook: +All work produces a set of plain Python files (see `workflow-architecture` skill for full details): -1. **Setup** — pip install, load API key, LightningRod client -2. **Seed sourcing** — seed generator config (from seeds specialist) -3. **Pipeline** — QuestionPipeline with generator, labeler, answer type -4. **Run** — `lr.transforms.run(pipeline, max_questions=10)` -5. **Output** — `dataset.flattened()`, sample inspection -6. **Training prep** — `prepare_for_training(dataset, ...)` → train/test split -7. 
**Fine-tuning** — `lr.training.run(config, dataset=train_ds)` *(if requested)* -8. **Evaluation** — `lr.evals.run(...)` *(if requested)* +| File | Produced by | Purpose | +|------|-------------|---------| +| `seeds.py` | seeds specialist | Seed source config and ingestion | +| `dataset.py` | dataset-generator | Pipeline and transforms run | +| `prepare.py` | dataset-generator | `get_datasets()` — prepare_for_training config; imported by train + eval | +| `train.py` | fine-tuner | Fine-tuning job | +| `eval.py` | fine-tuner | Evaluation — reruns freely without side effects | +| `state.json` | all agents | Shared resource IDs only | + +Each file is independently runnable. Rerunning `eval.py` never affects `train.py`; rerunning `train.py` never affects `dataset.py`. + +## Back-propagation — your responsibility as orchestrator + +When a downstream agent needs upstream changes, **you coordinate the cascade** — agents never modify each other's files: + +- **Poor eval results** → fine-tuner reports root cause → you decide whether it's a data issue (delegate dataset-generator to modify `dataset.py` + rerun) or a training config issue (fine-tuner adjusts `train.py`) +- **Dataset too small / poor quality** → dataset-generator reports to you → delegate seeds specialist to modify `seeds.py` + rerun, then dataset-generator reruns `dataset.py` +- Always pass specific, actionable requirements when re-delegating (e.g. 
"extend date range to 6 months", "increase max_questions to 500", "add news context generator") ## When to backtrack -- User says "that's not what I meant" or "the questions are wrong" → re-invoke seeds or dataset-generator specialist with clarified requirements -- `prepare_for_training` fails or produces too few samples → coordinate with dataset-generator to adjust pipeline or increase volume -- Eval scores are poor → fine-tuner will identify root cause; coordinate with dataset-generator if data quality is the issue -- Always identify *which step* caused the mismatch before re-invoking +- User says "that's not what I meant" or "the questions are wrong" → re-invoke seeds or dataset-generator with clarified requirements +- `prepare_for_training` fails or produces too few samples → coordinate seeds specialist and/or dataset-generator +- Eval scores are poor → fine-tuner identifies root cause; you coordinate the upstream fix +- Always identify *which file* caused the mismatch before re-delegating ## Minimal-output iteration diff --git a/.claude/skills/workflow-architecture/SKILL.md b/.claude/skills/workflow-architecture/SKILL.md new file mode 100644 index 0000000..3ce0226 --- /dev/null +++ b/.claude/skills/workflow-architecture/SKILL.md @@ -0,0 +1,134 @@ +--- +name: workflow-architecture +description: File-based workflow structure for Lightningrod projects. Use when creating or modifying project files, understanding agent ownership boundaries, reading/writing shared state, or coordinating back-propagation between agents. +--- + +# Workflow Architecture + +Each stage of the pipeline lives in its own plain Python file. Files are independently runnable — rerunning `eval.py` never affects `train.py`, rerunning `train.py` never affects `dataset.py`, and so on. 
+ +## Project file structure + +``` +/ + seeds.py # Seed preparation (owned by seeds specialist) + dataset.py # Dataset generation (owned by dataset-generator) + prepare.py # prepare_for_training config (owned by dataset-generator, imported by train + eval) + train.py # Fine-tuning (owned by fine-tuner) + eval.py # Evaluation (owned by fine-tuner — separate from training) + state.json # Shared run state: resource IDs only (read/written by all agents) +``` + +## File ownership — strict + +Each agent may only create or modify its own file(s). No agent touches another agent's file. + +| File | Owner | Can modify | +|------|-------|-----------| +| `seeds.py` | seeds specialist (whichever is active) | seeds specialist only | +| `dataset.py` | dataset-generator | dataset-generator only | +| `prepare.py` | dataset-generator | dataset-generator only | +| `train.py` | fine-tuner | fine-tuner only | +| `eval.py` | fine-tuner | fine-tuner only | +| `state.json` | all agents | all agents (read + write) | + +## state.json — shared run state + +Resource IDs only — no config. Each script reads its inputs from `state.json` at startup and writes its outputs after creating a resource. + +```json +{ + "input_dataset_id": "ds_abc123", + "dataset_id": "ds_def456", + "model_id": null +} +``` + +**Important:** `train_dataset_id` and `test_dataset_id` do not exist as stored resources and must never appear in `state.json`. The `prepare_for_training` config lives in `prepare.py` (see below), not in `state.json`. Config belongs in code; IDs belong in state. + +Keys are set to `null` until the responsible script has been run. Downstream scripts check for `null` and raise a clear error if a required upstream step hasn't been run yet. 
+ +### Reading state + +```python +import json, os + +def load_state(): + if not os.path.exists("state.json"): + return {} + with open("state.json") as f: + return json.load(f) + +def save_state(updates): + state = load_state() + state.update(updates) + with open("state.json", "w") as f: + json.dump(state, f, indent=2) +``` + +## What each file does + +### seeds.py +- Configures and validates the seed source (news query, BigQuery SQL, file ingestion, etc.) +- For file/BigQuery sources: runs ingestion and creates a Lightningrod input dataset +- For news/GDELT sources: validates the config and optionally previews a few seeds +- Writes `input_dataset_id` to `state.json` (set to `null` for news/GDELT — seed generator is inline) + +### dataset.py +- Reads `input_dataset_id` from `state.json` (or uses inline seed generator for news/GDELT) +- Configures and runs the `QuestionPipeline` with `MAX_QUESTIONS = 10` by default +- Calls `get_datasets()` from `prepare.py` to validate the split is healthy (correct volume, no leakage, clean dedup) +- Writes `dataset_id` to `state.json` + +### prepare.py +- Defines and exports `get_datasets(dataset_id) -> (train_ds, test_ds)` — the single source of truth for `prepare_for_training` config +- Imported by `dataset.py` (for validation), `train.py`, and `eval.py` +- When the dataset-generator adjusts filter/split params, this is the only file that changes + +```python +# prepare.py +import lightningrod as lr +from lightningrod import prepare_for_training, FilterParams, DedupParams, SplitParams + +def get_datasets(dataset_id): + dataset = lr.datasets.get(dataset_id) + return prepare_for_training( + dataset, + filter=FilterParams(days_to_resolution_range=(1, 60)), + dedup=DedupParams(), + split=SplitParams(strategy="temporal", test_size=0.2), + ) +``` + +### train.py +- Reads `dataset_id` from `state.json` +- Calls `from prepare import get_datasets; train_ds, _ = get_datasets(dataset_id)` +- Estimates cost, then runs `lr.training.run(...)` 
+- Writes `model_id` to `state.json` + +### eval.py +- Reads `dataset_id` and `model_id` from `state.json` +- Calls `from prepare import get_datasets; _, test_ds = get_datasets(dataset_id)` +- Runs `lr.evals.run(...)` and prints results +- Writes nothing — safe to rerun any number of times without side effects + +## Back-propagation protocol + +When a downstream agent determines that an upstream stage needs to change, it **never modifies the upstream file directly**. Instead: + +1. **Fine-tuner → dataset-generator**: Fine-tuner reports specific issues to the orchestrator (e.g. "too few test samples after split", "questions are too easy — binary accuracy near 100%"). Orchestrator delegates to dataset-generator with those requirements. Dataset-generator modifies `dataset.py` and reruns it. New IDs are written to `state.json`. Fine-tuner then reruns `train.py`. + +2. **Fine-tuner → seeds specialist**: If the root cause is seed quality (not enough diversity, wrong date range), fine-tuner reports to orchestrator. Orchestrator delegates to the seeds specialist to modify `seeds.py` and rerun. Then dataset-generator reruns `dataset.py`. Then fine-tuner reruns `train.py`. + +3. **Dataset-generator → seeds specialist**: If `prepare_for_training` fails due to seed volume or quality, dataset-generator reports to orchestrator. Seeds specialist modifies `seeds.py`, reruns, new `input_dataset_id` is written. Dataset-generator reruns `dataset.py`. + +**Rule: information flows downstream automatically via `state.json`. Change requests flow upstream via the orchestrator.** + +## Rerunnability rules + +| Script | Safe to rerun? 
| Side effects | +|--------|---------------|--------------| +| `seeds.py` | Yes | Creates a new input dataset (new ID written to state) | +| `dataset.py` | Yes | Creates a new dataset (new IDs written to state) | +| `train.py` | Yes | Starts a new training job (new model_id written to state) — costs money | +| `eval.py` | Yes, freely | No side effects, no cost impact | From 11ebbf884e3363a606575b4244428e34bccb43b0 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Fri, 20 Mar 2026 18:47:21 +0100 Subject: [PATCH 08/11] setup/state util templates --- .claude/agents/workflow-orchestrator.md | 11 ++- .claude/skills/workflow-architecture/SKILL.md | 55 +++++++---- .claude/templates/setup.py | 44 +++++++++ .claude/templates/state.py | 98 +++++++++++++++++++ 4 files changed, 182 insertions(+), 26 deletions(-) create mode 100644 .claude/templates/setup.py create mode 100644 .claude/templates/state.py diff --git a/.claude/agents/workflow-orchestrator.md b/.claude/agents/workflow-orchestrator.md index f01fcf7..f43a6d4 100644 --- a/.claude/agents/workflow-orchestrator.md +++ b/.claude/agents/workflow-orchestrator.md @@ -26,11 +26,12 @@ You are the orchestrator for Lightningrod dataset generation and fine-tuning. Yo 1. Receive user's high-level goals 2. Ask clarifying questions if ambiguous (in plain language) 3. Create a plan; present it without jargon -4. Delegate to the appropriate seeds specialist → produces `seeds.py` -5. Delegate to dataset-generator → produces `dataset.py` -6. If fine-tuning is requested: delegate to fine-tuner → produces `train.py` + `eval.py` -7. If fine-tuner reports poor results: identify root cause, coordinate back-propagation (see below) -8. If user feedback indicates mismatch at any step: re-invoke the appropriate specialist with updated requirements +4. **Initialize the project directory**: run `python .claude/templates/setup.py <project_dir>` — creates `state.py` and `state.json`; idempotent if already exists +5.
Delegate to the appropriate seeds specialist → produces `seeds.py` +6. Delegate to dataset-generator → produces `dataset.py` + `prepare.py` +7. If fine-tuning is requested: delegate to fine-tuner → produces `train.py` + `eval.py` +8. If fine-tuner reports poor results: identify root cause, coordinate back-propagation (see below) +9. If user feedback indicates mismatch at any step: re-invoke the appropriate specialist with updated requirements ## Data source routing diff --git a/.claude/skills/workflow-architecture/SKILL.md b/.claude/skills/workflow-architecture/SKILL.md index 3ce0226..267db2e 100644 --- a/.claude/skills/workflow-architecture/SKILL.md +++ b/.claude/skills/workflow-architecture/SKILL.md @@ -11,12 +11,43 @@ Each stage of the pipeline lives in its own plain Python file. Files are indepen ``` / + state.py # Shared state utilities — copied from .claude/templates/state.py, never modified + state.json # Shared run state: resource IDs only (read/written by all agents) seeds.py # Seed preparation (owned by seeds specialist) dataset.py # Dataset generation (owned by dataset-generator) prepare.py # prepare_for_training config (owned by dataset-generator, imported by train + eval) train.py # Fine-tuning (owned by fine-tuner) eval.py # Evaluation (owned by fine-tuner — separate from training) - state.json # Shared run state: resource IDs only (read/written by all agents) +``` + +## Project initialization + +Before any agent writes code, the orchestrator initializes the project directory by running the setup script from the repo: + +```bash +python .claude/templates/setup.py <project_dir> +``` + +This copies `state.py` from `.claude/templates/` and creates a blank `state.json`. It is idempotent — safe to run again if the directory already exists. + +Agents never write state management or client initialization inline.
They always import from `state.py`: + +```python +from state import get_client, State + +lr = get_client() +state = State.load() + +# Read a field — raises automatically if not yet populated +dataset_id = state.dataset_id + +# input_dataset_id is Optional — returns None for news/GDELT seeds +if state.input_dataset_id: + input_dataset = lr.datasets.get(state.input_dataset_id) + +# Write back +state.model_id = job.model_id +state.save() ``` ## File ownership — strict @@ -46,25 +77,7 @@ Resource IDs only — no config. Each script reads its inputs from `state.json` **Important:** `train_dataset_id` and `test_dataset_id` do not exist as stored resources and must never appear in `state.json`. The `prepare_for_training` config lives in `prepare.py` (see below), not in `state.json`. Config belongs in code; IDs belong in state. -Keys are set to `null` until the responsible script has been run. Downstream scripts check for `null` and raise a clear error if a required upstream step hasn't been run yet. - -### Reading state - -```python -import json, os - -def load_state(): - if not os.path.exists("state.json"): - return {} - with open("state.json") as f: - return json.load(f) - -def save_state(updates): - state = load_state() - state.update(updates) - with open("state.json", "w") as f: - json.dump(state, f, indent=2) -``` +Keys are set to `null` until the responsible script has been run. Read a value that must exist through its `State` property (e.g. `state.dataset_id`) from `state.py` — it raises a clear error with the current state if it's missing or null. ## What each file does @@ -116,7 +129,7 @@ def get_datasets(dataset_id): When a downstream agent determines that an upstream stage needs to change, it **never modifies the upstream file directly**. Instead: -1. **Fine-tuner → dataset-generator**: Fine-tuner reports specific issues to the orchestrator (e.g. "too few test samples after split", "questions are too easy — binary accuracy near 100%").
Orchestrator delegates to dataset-generator with those requirements. Dataset-generator modifies `dataset.py` and reruns it. New IDs are written to `state.json`. Fine-tuner then reruns `train.py`. +1. **Fine-tuner → dataset-generator**: Fine-tuner reports specific issues to the orchestrator (e.g. "too few test samples after split", "questions are too easy — binary accuracy near 100%"). Orchestrator delegates to dataset-generator with those requirements. Dataset-generator modifies `dataset.py` and reruns it. New IDs are written to `state.json`. Fine-tuner then reruns `train.py`. 2. **Fine-tuner → seeds specialist**: If the root cause is seed quality (not enough diversity, wrong date range), fine-tuner reports to orchestrator. Orchestrator delegates to the seeds specialist to modify `seeds.py` and rerun. Then dataset-generator reruns `dataset.py`. Then fine-tuner reruns `train.py`. 3. **Dataset-generator → seeds specialist**: If `prepare_for_training` fails due to seed volume or quality, dataset-generator reports to orchestrator. Seeds specialist modifies `seeds.py`, reruns, new `input_dataset_id` is written. Dataset-generator reruns `dataset.py`. diff --git a/.claude/templates/setup.py b/.claude/templates/setup.py new file mode 100644 index 0000000..d290196 --- /dev/null +++ b/.claude/templates/setup.py @@ -0,0 +1,44 @@ +""" +Project setup script — run once to initialize a new Lightningrod project directory.
+Usage: python setup.py [project_dir] +""" +import json +import shutil +import sys +from pathlib import Path + +TEMPLATES_DIR = Path(__file__).parent + + +def setup(project_dir: str = ".") -> None: + project_dir = Path(project_dir) + project_dir.mkdir(parents=True, exist_ok=True) + + # Copy static utility files + for filename in ["state.py"]: + src = TEMPLATES_DIR / filename + dst = project_dir / filename + if dst.exists(): + print(f" {dst} already exists, skipping.") + else: + shutil.copy(src, dst) + print(f" Created {dst}") + + # Initialize state.json + state_file = project_dir / "state.json" + if state_file.exists(): + print(f" state.json already exists, skipping.") + else: + with open(state_file, "w") as f: + json.dump( + {"input_dataset_id": None, "dataset_id": None, "model_id": None}, + f, + indent=2, + ) + print(f" Created {state_file}") + + print(f"\nProject ready at '{project_dir}'. Next: run seeds.py.") + + +if __name__ == "__main__": + setup(sys.argv[1] if len(sys.argv) > 1 else ".") diff --git a/.claude/templates/state.py b/.claude/templates/state.py new file mode 100644 index 0000000..3dc7a03 --- /dev/null +++ b/.claude/templates/state.py @@ -0,0 +1,98 @@ +""" +Shared utilities for Lightningrod projects. +Auto-copied by project setup — do not modify. +""" +import json +import os +from typing import Optional + +from lightningrod import LightningRod + +STATE_FILE = "state.json" + + +def get_client() -> LightningRod: + """Return an initialized LightningRod client.""" + api_key = os.environ.get("LIGHTNINGROD_API_KEY") + if not api_key: + raise EnvironmentError( + "LIGHTNINGROD_API_KEY environment variable is not set." + ) + return LightningRod(api_key=api_key) + + +class State: + """ + Typed project state. Required fields raise on access if the value hasn't + been set yet. `input_dataset_id` is Optional and can be read without raising + — it is None for news/GDELT seeds, where the seed generator runs inline.
+ """ + + def __init__( + self, + input_dataset_id: Optional[str] = None, + dataset_id: Optional[str] = None, + model_id: Optional[str] = None, + ): + self._input_dataset_id = input_dataset_id + self._dataset_id = dataset_id + self._model_id = model_id + + def _require(self, name: str) -> str: + value = getattr(self, f"_{name}") + if value is None: + raise RuntimeError( + f"State field '{name}' is not set. " + f"Make sure the previous pipeline step has been run successfully.\n" + f"Current state: {self._as_dict()}" + ) + return value + + # --- fields --- + + @property + def input_dataset_id(self) -> Optional[str]: + return self._input_dataset_id + + @input_dataset_id.setter + def input_dataset_id(self, value: Optional[str]) -> None: + self._input_dataset_id = value + + @property + def dataset_id(self) -> str: + return self._require("dataset_id") + + @dataset_id.setter + def dataset_id(self, value: Optional[str]) -> None: + self._dataset_id = value + + @property + def model_id(self) -> str: + return self._require("model_id") + + @model_id.setter + def model_id(self, value: Optional[str]) -> None: + self._model_id = value + + # --- persistence --- + + def _as_dict(self) -> dict: + return { + "input_dataset_id": self._input_dataset_id, + "dataset_id": self._dataset_id, + "model_id": self._model_id, + } + + @classmethod + def load(cls) -> "State": + if not os.path.exists(STATE_FILE): + raise FileNotFoundError( + f"{STATE_FILE} not found. Run `python setup.py` to initialize this project." 
+ ) + with open(STATE_FILE) as f: + return cls(**json.load(f)) + + def save(self) -> None: + with open(STATE_FILE, "w") as f: + json.dump(self._as_dict(), f, indent=2) + print(f" state.json updated: {self._as_dict()}") From dd3b53b716c5fd6254e7749358ccb1e19621e246 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Mon, 23 Mar 2026 13:53:48 +0100 Subject: [PATCH 09/11] use AskUserQuestion tool --- .claude/agents/workflow-orchestrator.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/agents/workflow-orchestrator.md b/.claude/agents/workflow-orchestrator.md index f43a6d4..0e8ea91 100644 --- a/.claude/agents/workflow-orchestrator.md +++ b/.claude/agents/workflow-orchestrator.md @@ -1,7 +1,7 @@ --- name: workflow-orchestrator description: Plans and orchestrates dataset generation and fine-tuning workflows end-to-end. Use when the user wants to generate a training dataset, fine-tune a model, or go from a high-level problem to a working solution using Lightningrod. 
-tools: Task(news-seeds-specialist, public-dataset-seeds-specialist, bigquery-seeds-specialist, private-dataset-seeds-specialist, dataset-generator, fine-tuner), Read, Grep, Glob, Edit, Bash +tools: Task(news-seeds-specialist, public-dataset-seeds-specialist, bigquery-seeds-specialist, private-dataset-seeds-specialist, dataset-generator, fine-tuner), Read, Grep, Glob, Edit, Bash, AskUserQuestion model: sonnet skills: - workflow-architecture From 684e4eb11262d95723f00a550a3dd33a490aa8e3 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Mon, 23 Mar 2026 15:23:12 +0100 Subject: [PATCH 10/11] add explore dataset skill & script --- .claude/agents/bigquery-seeds-specialist.md | 20 +-- .claude/agents/dataset-generator.md | 1 + .claude/agents/news-seeds-specialist.md | 10 +- .../private-dataset-seeds-specialist.md | 11 +- .../agents/public-dataset-seeds-specialist.md | 26 ++-- .../transform-pipeline-verification/SKILL.md | 57 ++++++++ .claude/templates/explore.py | 133 ++++++++++++++++++ .claude/templates/setup.py | 2 +- .gitignore | 4 + 9 files changed, 226 insertions(+), 38 deletions(-) create mode 100644 .claude/skills/transform-pipeline-verification/SKILL.md create mode 100644 .claude/templates/explore.py diff --git a/.claude/agents/bigquery-seeds-specialist.md b/.claude/agents/bigquery-seeds-specialist.md index 892cdf7..73f62c1 100644 --- a/.claude/agents/bigquery-seeds-specialist.md +++ b/.claude/agents/bigquery-seeds-specialist.md @@ -5,13 +5,14 @@ tools: Read, Grep, Glob, Edit, Bash model: sonnet skills: - bigquery-seeds + - transform-pipeline-verification --- You are the BigQuery seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and operate in one of two modes. ## Mode 1: Explore (scout and report) -When the orchestrator asks you to assess whether BigQuery is a good fit, **do not write notebook cells yet**. 
Instead: +When the orchestrator asks you to assess whether BigQuery is a good fit, **do not write any files yet**. Instead: 1. Identify candidate BigQuery public datasets for the user's domain 2. Inspect schemas and preview a few rows to assess data quality, text richness, and date coverage @@ -21,29 +22,22 @@ When the orchestrator asks you to assess whether BigQuery is a good fit, **do no - Whether ground-truth labels are available in the data - Any caveats (sparse dates, low text quality, limited rows) -## Mode 2: Implement (write seeds.py) +## Mode 2: Implement (write and verify seeds.py) Once the orchestrator has committed to BigQuery as the source: 1. Write `seeds.py` containing schema-inspection code, the seed SQL query, and `BigQuerySeedGenerator` config 2. Craft the seed query — embed any pre-computed label values in the seed text so `QuestionAndLabelGenerator` can extract them -3. Start with `max_rows=100` for iteration; scale up when confirmed -4. Write `input_dataset_id` to `state.json` if applicable (BigQuery seeds run inline via the generator, so this may be `null`) +3. Start with `max_rows=50` for iteration; scale up when confirmed +4. Follow the `transform-pipeline-verification` skill to expose a seeds-only pipeline and run it to verify the SQL query works end-to-end +5. Write `input_dataset_id` to `state.json` (BigQuery seeds run inline, so this is typically `null`) See the `workflow-architecture` skill for the `state.json` contract. 
## SDK surface - `BigQuerySeedGenerator(query, seed_text_column, date_column, max_rows)` -- `QuestionAndLabelGenerator` (typically paired — no separate labeler needed when ground truth is in the seed) - -## Reference notebooks - -- `notebooks/getting_started/03_bigquery_datasource.ipynb` - -## SDK surface - -- `BigQuerySeedGenerator(query, seed_text_column, date_column, max_rows)` +- `QuestionPipeline(seed_generator=...)` — seeds-only pipeline for isolated verification - `QuestionAndLabelGenerator` (typically paired — no separate labeler needed when ground truth is in the seed) ## Reference notebooks diff --git a/.claude/agents/dataset-generator.md b/.claude/agents/dataset-generator.md index 684b588..bf5a5ef 100644 --- a/.claude/agents/dataset-generator.md +++ b/.claude/agents/dataset-generator.md @@ -7,6 +7,7 @@ skills: - dataset-generation - prediction-framing - training-preparation + - transform-pipeline-verification - workflow-architecture --- diff --git a/.claude/agents/news-seeds-specialist.md b/.claude/agents/news-seeds-specialist.md index 204fdea..0e6ba7d 100644 --- a/.claude/agents/news-seeds-specialist.md +++ b/.claude/agents/news-seeds-specialist.md @@ -5,6 +5,7 @@ tools: Read, Grep, Glob, Edit, Bash model: sonnet skills: - seeds-sourcing + - transform-pipeline-verification --- You are the news seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and configure built-in news and event seed generators. @@ -18,9 +19,13 @@ Instructions like: ## Output -Write `seeds.py` containing the `NewsSeedGenerator` or `GdeltSeedGenerator` config. For news/GDELT, no ingestion step is needed — the seed generator runs inline during dataset generation, so `seeds.py` defines and validates the config and writes `null` for `input_dataset_id` in `state.json`. +Write `seeds.py` containing the `NewsSeedGenerator` or `GdeltSeedGenerator` config. 
For news/GDELT, no ingestion step is needed — the seed generator runs inline, so `seeds.py` defines the config and writes `null` for `input_dataset_id` in `state.json`. -Use constrained configs for iteration (7-day windows, narrow queries) unless the user requests a full run. See the `workflow-architecture` skill for the `state.json` contract. +Use constrained configs for iteration (7-day windows, narrow queries) unless the user requests a full run. + +Follow the `transform-pipeline-verification` skill to expose a seeds-only pipeline and run it to confirm the source returns well-formed articles before handing off to the dataset generator. + +See the `workflow-architecture` skill for the `state.json` contract. ## Choosing between News and GDELT @@ -35,6 +40,7 @@ Both work well with `ForwardLookingQuestionGenerator` and `WebSearchLabeler` for - `NewsSeedGenerator(start_date, end_date, search_query, interval_duration_days, articles_per_search)` - `GdeltSeedGenerator(start_date, end_date, interval_duration_days, articles_per_interval)` +- `QuestionPipeline(seed_generator=...)` — seeds-only pipeline for isolated verification ## Reference notebooks diff --git a/.claude/agents/private-dataset-seeds-specialist.md b/.claude/agents/private-dataset-seeds-specialist.md index 8f76a01..230af4c 100644 --- a/.claude/agents/private-dataset-seeds-specialist.md +++ b/.claude/agents/private-dataset-seeds-specialist.md @@ -6,6 +6,7 @@ model: sonnet skills: - custom-dataset-seeds - seeds-sourcing + - transform-pipeline-verification --- You are the private dataset seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and help users turn their own files and datasets into seeds. @@ -15,11 +16,10 @@ You are the private dataset seeds specialist for Lightningrod. You receive domai 1. Inspect the user's data: check format (CSV, PDF, text), row/file count, text quality, date coverage 2. Assess fitness: is there enough raw material for dataset generation? 
Flag issues early (too few rows, no dates, poor text quality) 3. Choose the right ingestion path: `files_to_samples` for local files, FileSet API for uploads -4. Write `seeds.py` containing ingestion, chunking, and dataset creation code - -## Output - -Write `seeds.py` with ingestion code and inline fitness assessment checks (assert row count, spot-check text quality). Use small subsets first (e.g. first 50 rows of a CSV, 5 files) to validate before full ingestion. Write `input_dataset_id` to `state.json` after the dataset is created. +4. Write `seeds.py` with ingestion code and inline fitness checks (assert row count, spot-check text quality) +5. Use small subsets first (e.g. first 50 rows of a CSV, 5 files) to validate before full ingestion +6. Follow the `transform-pipeline-verification` skill to expose a seeds-only pipeline and run it to confirm ingestion produces well-formed rows before handing off to the dataset generator +7. Write `input_dataset_id` to `state.json` after the dataset is created See the `workflow-architecture` skill for the `state.json` contract. @@ -29,6 +29,7 @@ See the `workflow-architecture` skill for the `state.json` contract. - `lr.filesets.create()`, `lr.filesets.files.upload()` - `lr.datasets.create_from_samples()` - `FileSetSeedGenerator`, `FileSetQuerySeedGenerator` +- `QuestionPipeline(seed_generator=...)` — seeds-only pipeline for isolated verification ## Reference notebooks diff --git a/.claude/agents/public-dataset-seeds-specialist.md b/.claude/agents/public-dataset-seeds-specialist.md index a61b200..ac5960b 100644 --- a/.claude/agents/public-dataset-seeds-specialist.md +++ b/.claude/agents/public-dataset-seeds-specialist.md @@ -6,13 +6,14 @@ model: sonnet skills: - public-dataset-exploration - custom-dataset-seeds + - transform-pipeline-verification --- You are the public dataset seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and operate in one of two modes. 
## Mode 1: Explore (scout and report) -When the orchestrator asks you to assess whether a public dataset exists for a domain, **do not write notebook cells yet**. Instead: +When the orchestrator asks you to assess whether a public dataset exists for a domain, **do not write any files yet**. Instead: 1. Search Kaggle, HuggingFace, and GitHub for raw datasets relevant to the user's domain 2. Prefer raw or semi-structured data (articles, reports, event logs, tables) — not already-labeled training sets @@ -23,14 +24,15 @@ When the orchestrator asks you to assess whether a public dataset exists for a d - Text quality assessment (prose vs. structured vs. garbled) - Any caveats (license restrictions, requires account, large download) -## Mode 2: Implement (write seeds.py) +## Mode 2: Implement (write and verify seeds.py) Once the orchestrator has committed to a specific public dataset: -1. Write `seeds.py` containing download, conversion, and dataset creation code +1. Write `seeds.py` with download, conversion, and dataset creation code 2. Download a small subset first (e.g. first 10 files or 100 rows) to validate before full ingestion 3. Convert to seeds via `files_to_samples` or `lr.datasets.create_from_samples` -4. Write `input_dataset_id` to `state.json` after the dataset is created +4. Follow the `transform-pipeline-verification` skill to expose a seeds-only pipeline and run it to confirm the ingested seeds look right before handing off to the dataset generator +5. Write `input_dataset_id` to `state.json` after the dataset is created See the `workflow-architecture` skill for the `state.json` contract. @@ -39,19 +41,9 @@ See the `workflow-architecture` skill for the `state.json` contract. 
- `files_to_samples()`, `file_to_samples()`, `chunks_to_samples()` - `lr.datasets.create_from_samples()` - `lr.filesets.create()`, `lr.filesets.files.upload()` +- `QuestionPipeline(seed_generator=...)` — seeds-only pipeline for isolated verification ## Reference notebooks -- `notebooks/getting_started/02_custom_documents_datasource.ipynb` — file-to-seeds pattern -- `notebooks/00_quickstart.ipynb` — minimal end-to-end example - -## SDK surface - -- `files_to_samples()`, `file_to_samples()`, `chunks_to_samples()` -- `lr.datasets.create_from_samples()` -- `lr.filesets.create()`, `lr.filesets.files.upload()` - -## Reference notebooks - -- `notebooks/getting_started/02_custom_documents_datasource.ipynb` — file-to-seeds pattern -- `notebooks/00_quickstart.ipynb` — minimal end-to-end example +- `notebooks/getting_started/02_custom_documents_datasource.ipynb` +- `notebooks/00_quickstart.ipynb` diff --git a/.claude/skills/transform-pipeline-verification/SKILL.md b/.claude/skills/transform-pipeline-verification/SKILL.md new file mode 100644 index 0000000..e4acbe7 --- /dev/null +++ b/.claude/skills/transform-pipeline-verification/SKILL.md @@ -0,0 +1,57 @@ +--- +name: transform-pipeline-verification +description: Pattern for running and verifying transform pipeline output at any stage (seeds-only or full). Use when writing seeds.py or dataset.py to run the pipeline, inspect output quality iteratively with explore.py, and only report back once verified. +--- + +# Transform Pipeline Verification + +Each pipeline stage (`seeds.py`, `dataset.py`) should be independently runnable. After a run, use `explore.py` to iteratively verify output quality before reporting back to the orchestrator. + +## Phase 1: Run the pipeline + +Only plug in the minimum components you are responsible for to `QuestionPipeline`, populate any (or multiple) of: seed_generator, question_generator, labeler, context_generators, renderer, rollout_generator. + +```python +pipeline = QuestionPipeline(...) 
+
+if __name__ == "__main__":
+    lr_client = get_client()
+    cost_estimate = lr_client.transforms.estimate_cost(pipeline, max_questions=<N>)
+    dataset = lr_client.transforms.run(pipeline, max_questions=<N>, name="<stage>_seeds")
+```
+
+For full pipeline: same pattern with question_generator and labeler configured.
+
+After `transforms.run()`, stdout shows the dataset ID. Pipeline scripts print an explore hint, e.g. `Explore: python explore.py <dataset_id> --summary`.
+
+## Phase 2: Explore output iteratively
+
+Use `explore.py` to probe the dataset, verify output quality, and make sure it roughly matches your expectations.
+
+```bash
+python explore.py <dataset_id> [--summary] [--samples N] [--valid N] [--invalid N] [--labels N] [--truncate N]
+```
+
+| Flag | Use when |
+|------|----------|
+| `--summary` (default) | First check — validity %, label distribution |
+| `--samples N` | Spot-check N random rows (seed_text or question+label) |
+| `--valid N` | Inspect N valid samples |
+| `--invalid N` | Debug failures — see `invalid_reason` for N invalid samples |
+| `--labels N` | Quality check — question + label + reasoning side-by-side |
+| `--truncate N` | Override max chars for long text fields (default: 120) |
+
+Run from the project directory. Iterate until confident: e.g. `--summary` shows 30% invalid → `--invalid 10` to see why → adjust pipeline config → rerun.
+
+## Completing the step
+
+1. Run the pipeline
+2. Run `explore.py <dataset_id> --summary` and confirm validity
+3. Iteratively probe with `--samples`, `--invalid`, `--labels` as needed
+4. 
Only then write to `state.json` and report back to the orchestrator
+
+## Why
+
+- Cheap seeds-only runs catch SQL/ingestion errors before the full pipeline
+- `explore.py` owns download and caching — no extra code in pipeline scripts
+- Iterative inspection surfaces label quality issues, filter reasons, and bad seeds that a one-time print would miss
diff --git a/.claude/templates/explore.py b/.claude/templates/explore.py
new file mode 100644
index 0000000..dc08e83
--- /dev/null
+++ b/.claude/templates/explore.py
@@ -0,0 +1,133 @@
+"""
+Explore pipeline output by dataset ID. Downloads and caches locally on first use.
+Usage:
+    python explore.py <dataset_id> [--summary] [--samples N] [--valid N] [--invalid N] [--labels N] [--truncate N]
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+_THIS_DIR = Path(__file__).resolve().parent
+if str(_THIS_DIR) not in sys.path:
+    sys.path.insert(0, str(_THIS_DIR))
+
+from state import get_client
+
+CACHE_DIR = _THIS_DIR / ".lr_cache"
+DEFAULT_TRUNCATE = 120
+
+
+def _cache_path(dataset_id: str) -> Path:
+    return CACHE_DIR / f"{dataset_id}.json"
+
+
+def load_df(dataset_id: str):
+    path = _cache_path(dataset_id)
+    if not path.exists():
+        CACHE_DIR.mkdir(parents=True, exist_ok=True)
+        lr_client = get_client()
+        dataset = lr_client.datasets.get(dataset_id)
+        rows = dataset.flattened()
+        with open(path, "w") as f:
+            json.dump(rows, f, indent=2, default=str)
+        print(f" Cached {len(rows)} rows → {path}")
+    import pandas as pd
+    with open(path) as f:
+        return pd.DataFrame(json.load(f))
+
+
+def summary(df):
+    import pandas as pd
+    total = len(df)
+    valid = (df["is_valid"] == True).sum() if "is_valid" in df.columns else total
+    print(f"\nValidity: {valid}/{total} ({100 * valid / total:.1f}% valid)")
+    if "label" in df.columns:
+        print("\nLabel distribution:")
+        print(df["label"].value_counts().to_string())
+    print()
+
+
+def _truncate(s, n):
+    if not isinstance(s, str):
+        return s
+    return s[:n] + "..." 
if len(s) > n else s + + +def _cols_for_stage(df): + if "question_text" in df.columns: + return ["question_text", "label", "label_confidence", "is_valid", "invalid_reason", "seed_text"] + return ["seed_text", "seed_creation_date", "is_valid"] + + +def show_samples(df, valid_only=False, invalid_only=False, n=5, random=True, truncate=DEFAULT_TRUNCATE): + import pandas as pd + subset = df + if valid_only: + if "is_valid" not in df.columns: + print(" No is_valid column.") + return + subset = df[df["is_valid"] == True] + elif invalid_only: + if "is_valid" not in df.columns: + print(" No is_valid column.") + return + subset = df[df["is_valid"] == False] + cols = [c for c in _cols_for_stage(df) if c in subset.columns] + if not cols: + cols = list(subset.columns)[:6] + sample = subset.sample(n=min(n, len(subset)), random_state=42) if random and len(subset) > n else subset.head(n) + for col in ["seed_text", "question_text", "reasoning"]: + if col in sample.columns: + sample = sample.copy() + sample[col] = sample[col].apply(lambda x: _truncate(x, truncate) if pd.notna(x) else x) + print(sample[cols].to_string()) + print() + + +def check_labels(df, n=5, truncate=DEFAULT_TRUNCATE): + cols = ["question_text", "label", "reasoning"] + cols = [c for c in cols if c in df.columns] + if not cols: + print(" No question_text/label columns (seeds-only output?).") + return + subset = df[df["is_valid"] == True] if "is_valid" in df.columns else df + sample = subset.sample(n=min(n, len(subset)), random_state=42) if len(subset) > n else subset + for _, row in sample.iterrows(): + print("-" * 60) + for c in cols: + val = row.get(c, "") + print(f" {c}: {_truncate(str(val), truncate)}") + print() + print("-" * 60) + + +def main(): + parser = argparse.ArgumentParser(description="Explore pipeline output by dataset ID") + parser.add_argument("dataset_id", help="Dataset ID from transforms.run()") + parser.add_argument("--summary", action="store_true", help="Validity stats and label distribution 
(default)") + parser.add_argument("--samples", type=int, metavar="N", help="Show N random samples") + parser.add_argument("--valid", type=int, metavar="N", help="Show N valid samples") + parser.add_argument("--invalid", type=int, metavar="N", help="Show N invalid samples") + parser.add_argument("--labels", type=int, metavar="N", help="Show N samples with question+label+reasoning for quality check") + parser.add_argument("--truncate", type=int, default=DEFAULT_TRUNCATE, metavar="N", help=f"Max chars for long text fields (default: {DEFAULT_TRUNCATE})") + args = parser.parse_args() + + df = load_df(args.dataset_id) + truncate = args.truncate + + if args.samples is not None: + show_samples(df, n=args.samples, truncate=truncate) + elif args.valid is not None: + show_samples(df, valid_only=True, n=args.valid, truncate=truncate) + elif args.invalid is not None: + show_samples(df, invalid_only=True, n=args.invalid, truncate=truncate) + elif args.labels is not None: + check_labels(df, n=args.labels, truncate=truncate) + else: + summary(df) + + +if __name__ == "__main__": + main() diff --git a/.claude/templates/setup.py b/.claude/templates/setup.py index d290196..1e6f024 100644 --- a/.claude/templates/setup.py +++ b/.claude/templates/setup.py @@ -15,7 +15,7 @@ def setup(project_dir: str = ".") -> None: project_dir.mkdir(parents=True, exist_ok=True) # Copy static utility files - for filename in ["state.py"]: + for filename in ["state.py", "explore.py"]: src = TEMPLATES_DIR / filename dst = project_dir / filename if dst.exists(): diff --git a/.gitignore b/.gitignore index a360cc4..3782636 100644 --- a/.gitignore +++ b/.gitignore @@ -43,5 +43,9 @@ htmlcov/ test_sdk.py notebooks/**/lightningrod-python-sdk/ +# Pipeline output cache +.lr_cache/ + # Misc .DS_Store +agent-experiments/ From b3c7f327c70ba310f5721eb926a6ac8d30071e14 Mon Sep 17 00:00:00 2001 From: Bartolomej Kozorog Date: Mon, 23 Mar 2026 16:30:03 +0100 Subject: [PATCH 11/11] include error details in job failed report 
--- src/lightningrod/_display.py | 41 ++++++++++++++++++---- src/lightningrod/datasets/client.py | 17 ++++++---- src/lightningrod/transforms/client.py | 49 +++++++++++++++++++++++++-- 3 files changed, 91 insertions(+), 16 deletions(-) diff --git a/src/lightningrod/_display.py b/src/lightningrod/_display.py index 790bc4e..28fe426 100644 --- a/src/lightningrod/_display.py +++ b/src/lightningrod/_display.py @@ -451,28 +451,46 @@ def run_live_display( live.update(build_live_display(metrics=metrics, job=job)) -def _build_invalid_samples_error_message(original_message: str) -> Group: +def _build_invalid_samples_error_message( + original_message: str, + error_details: Optional[list[str]] = None, +) -> Group: """Build enhanced error message for invalid samples error using Rich formatting.""" renderables: list[RenderableType] = [] - + renderables.append(_safe_markup(f"[bold]{original_message}[/bold]")) renderables.append(Text("")) - + + if error_details: + renderables.append(_safe_markup("[bold]Error details:[/bold]")) + for detail in error_details[:5]: + truncated = detail[:500] + "..." if len(detail) > 500 else detail + renderables.append(Text(f" • {truncated}", style="dim")) + if len(error_details) > 5: + renderables.append(Text(f" • ... 
and {len(error_details) - 5} more", style="dim italic")) + renderables.append(Text("")) + renderables.append(_safe_markup("[bold]This typically happens when:[/bold]")) renderables.append(_safe_markup(" • Filter criteria is too strict")) renderables.append(_safe_markup(" • Labeling failed (e.g., questions couldn't be answered or had low confidence)")) renderables.append(_safe_markup(" • Seed generation found no suitable content")) renderables.append(Text("")) - + renderables.append(_safe_markup("[bold]Next steps:[/bold]")) renderables.append(_safe_markup(" • Check the dataset samples to see specific failure reasons in the 'meta.filter_reason' field")) renderables.append(_safe_markup(" • Adjust and retry the transform pipeline (e.g., try a wider date range)")) renderables.append(_safe_markup(" • If the problem persists, contact support or open a GitHub issue: [link=https://github.com/lightning-rod-labs/lightningrod-python-sdk/issues]https://github.com/lightning-rod-labs/lightningrod-python-sdk/issues[/link]")) - + return Group(*renderables) -def display_error(message: str, title: str = "Error", job: Any = None, response_body: str | None = None) -> None: +def display_error( + message: str, + title: str = "Error", + job: Any = None, + response_body: str | None = None, + error_details: Optional[list[str]] = None, +) -> None: console = Console() renderables: list[RenderableType] = [] @@ -480,7 +498,16 @@ def display_error(message: str, title: str = "Error", job: Any = None, response_ renderables.append(Text("")) if "Job completed with 0 valid rows" in message: - renderables.append(_build_invalid_samples_error_message(message)) + renderables.append(_build_invalid_samples_error_message(message, error_details=error_details)) + elif error_details: + renderables.append(_safe_markup(f"[bold]{message}[/bold]")) + renderables.append(Text("")) + renderables.append(_safe_markup("[bold]Error details:[/bold]")) + for detail in error_details[:5]: + truncated = detail[:500] + "..." 
if len(detail) > 500 else detail + renderables.append(Text(f" • {truncated}", style="dim")) + if len(error_details) > 5: + renderables.append(Text(f" • ... and {len(error_details) - 5} more", style="dim italic")) else: renderables.append(_safe_markup(f"[bold]{message}[/bold]")) diff --git a/src/lightningrod/datasets/client.py b/src/lightningrod/datasets/client.py index bc8d791..c26d4c8 100644 --- a/src/lightningrod/datasets/client.py +++ b/src/lightningrod/datasets/client.py @@ -21,28 +21,31 @@ class DatasetSamplesClient: def __init__(self, client: AuthenticatedClient): self._client: AuthenticatedClient = client - def list(self, dataset_id: str) -> List[Sample]: + def list(self, dataset_id: str, limit: Optional[int] = None) -> List[Sample]: samples: List[Sample] = [] cursor: Optional[str] = None - + while True: + req_limit = min(100, limit - len(samples)) if limit is not None else 100 response = get_dataset_samples_datasets_dataset_id_samples_get.sync_detailed( dataset_id=dataset_id, client=self._client, - limit=100, + limit=req_limit, cursor=cursor, ) - + parsed = handle_response_error(response, "fetch samples") - + samples.extend(parsed.samples) - + + if limit is not None and len(samples) >= limit: + return samples[:limit] if not parsed.has_more: break if isinstance(parsed.next_cursor, Unset) or parsed.next_cursor is None: break cursor = str(parsed.next_cursor) - + return samples def upload( diff --git a/src/lightningrod/transforms/client.py b/src/lightningrod/transforms/client.py index 7784365..3947c03 100644 --- a/src/lightningrod/transforms/client.py +++ b/src/lightningrod/transforms/client.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import List, Optional, Union from lightningrod._display import _is_notebook, display_error, display_warning, run_live_display from lightningrod._generated.models import ( @@ -35,9 +35,51 @@ from lightningrod.datasets.client import DatasetSamplesClient from lightningrod._generated.types import Unset from 
lightningrod._errors import handle_response_error +from lightningrod.datasets.client import DatasetSamplesClient TransformConfig = Union[FileSetQuerySeedGenerator, FileSetSeedGenerator, ForwardLookingQuestionGenerator, GdeltSeedGenerator, NewsSeedGenerator, QuestionAndLabelGenerator, QuestionGenerator, QuestionPipeline, QuestionRenderer, WebSearchLabeler] + +def _fetch_error_details_from_samples( + job: TransformJob, + samples_client: DatasetSamplesClient, + jobs_client: "TransformJobsClient", +) -> List[str]: + details: List[str] = [] + if "rejection_error_messages" in job.additional_properties: + msgs = job.additional_properties["rejection_error_messages"] + if isinstance(msgs, list): + for m in msgs: + if isinstance(m, str) and m.strip(): + details.append(m.strip()) + if details: + return details + metrics = jobs_client.get_metrics(job.id) + if metrics: + for step in metrics.steps: + if (step.rejected_count > 0 or step.error_count > 0) and step.summary and step.summary.strip(): + details.append(step.summary.strip()) + if details: + return details + if not job.output_dataset_id: + return [] + try: + samples = samples_client.list(job.output_dataset_id, limit=10) + except Exception: + return [] + seen: set[str] = set() + for sample in samples: + msg = None + if not isinstance(sample.meta, Unset) and sample.meta is not None and "error_message" in sample.meta: + msg = sample.meta["error_message"] + elif "error_message" in sample.additional_properties: + msg = sample.additional_properties["error_message"] + if msg and isinstance(msg, str) and msg.strip() and msg not in seen: + seen.add(msg) + details.append(msg.strip()) + return details + + class TransformJobsClient: def __init__(self, client: AuthenticatedClient): self._client = client @@ -105,7 +147,10 @@ def poll() -> tuple[PipelineMetricsResponse, TransformJob]: if job.status == TransformJobStatus.FAILED: error_msg = job.error_message if (not isinstance(job.error_message, Unset) and job.error_message) else 
"Unknown error" - display_error(error_msg, title="Job Failed", job=job) + error_details = _fetch_error_details_from_samples( + job, self._dataset_samples_client, self.jobs + ) + display_error(error_msg, title="Job Failed", job=job, error_details=error_details) # No need to raise an exception in the notebook, as we display the error using display_error if not _is_notebook():