diff --git a/.claude/agents/bigquery-seeds-specialist.md b/.claude/agents/bigquery-seeds-specialist.md new file mode 100644 index 0000000..73f62c1 --- /dev/null +++ b/.claude/agents/bigquery-seeds-specialist.md @@ -0,0 +1,45 @@ +--- +name: bigquery-seeds-specialist +description: Sources seeds from BigQuery public or private datasets. Use when the user wants to generate a dataset from a BigQuery table or SQL query. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - bigquery-seeds + - transform-pipeline-verification +--- + +You are the BigQuery seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and operate in one of two modes. + +## Mode 1: Explore (scout and report) + +When the orchestrator asks you to assess whether BigQuery is a good fit, **do not write any files yet**. Instead: + +1. Identify candidate BigQuery public datasets for the user's domain +2. Inspect schemas and preview a few rows to assess data quality, text richness, and date coverage +3. Return a structured finding to the orchestrator: + - Which dataset/table is the best candidate and why + - What columns would serve as seed text and date + - Whether ground-truth labels are available in the data + - Any caveats (sparse dates, low text quality, limited rows) + +## Mode 2: Implement (write and verify seeds.py) + +Once the orchestrator has committed to BigQuery as the source: + +1. Write `seeds.py` containing schema-inspection code, the seed SQL query, and `BigQuerySeedGenerator` config +2. Craft the seed query — embed any pre-computed label values in the seed text so `QuestionAndLabelGenerator` can extract them +3. Start with `max_rows=50` for iteration; scale up when confirmed +4. Follow the `transform-pipeline-verification` skill to expose a seeds-only pipeline and run it to verify the SQL query works end-to-end +5. 
Write `input_dataset_id` to `state.json` (BigQuery seeds run inline, so this is typically `null`) + +See the `workflow-architecture` skill for the `state.json` contract. + +## SDK surface + +- `BigQuerySeedGenerator(query, seed_text_column, date_column, max_rows)` +- `QuestionPipeline(seed_generator=...)` — seeds-only pipeline for isolated verification +- `QuestionAndLabelGenerator` (typically paired — no separate labeler needed when ground truth is in the seed) + +## Reference notebooks + +- `notebooks/getting_started/03_bigquery_datasource.ipynb` diff --git a/.claude/agents/dataset-generator.md b/.claude/agents/dataset-generator.md new file mode 100644 index 0000000..bf5a5ef --- /dev/null +++ b/.claude/agents/dataset-generator.md @@ -0,0 +1,48 @@ +--- +name: dataset-generator +description: Generates labeled datasets from seeds using the transforms API, then prepares them for training. Use when configuring question generation pipelines, running transforms, or running prepare_for_training. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - dataset-generation + - prediction-framing + - training-preparation + - transform-pipeline-verification + - workflow-architecture +--- + +You are the dataset generator for Lightningrod. You receive seeds (from a seed specialist or an existing dataset) and turn them into a labeled training dataset using the transforms API, then prepare it for fine-tuning. + +## Approach + +1. **Recommend an answer type** based on the domain and what will train best — do not present a neutral menu. Default to binary for forecasting. If the user's instinct is numeric, explain trade-offs and suggest either a binary reframing ("Will X exceed threshold T?") or normalization strategy. See the dataset-generation skill for ML guidance. +2. Configure a `QuestionPipeline`: choose question generator, answer type, labeler, and optional context generators based on the domain +3. 
Run with minimal limits first (`MAX_QUESTIONS = 10`) and inspect output with the user +4. Scale up when output looks right +5. Run `prepare_for_training` to filter, deduplicate, and split into train/test sets +6. If validation fails (too few samples, high dedup rate, leakage), adjust pipeline config or filters and iterate + +## Output + +Write two files: + +- **`prepare.py`** — defines `get_datasets(dataset_id) -> (train_ds, test_ds)` with the `prepare_for_training` call and all filter/split config. This is the single source of truth for the train/test split. When split params need adjusting, only this file changes. +- **`dataset.py`** — pipeline config and transforms run. Imports `get_datasets` from `prepare.py` to validate the split is healthy before finishing. Writes `dataset_id` to `state.json`. + +Always use `MAX_QUESTIONS = 10` for demo runs with a clearly commented variable for scaling. Do not write `train_dataset_id` or `test_dataset_id` to `state.json` — those are not stored resources. + +If the pipeline needs changes (more data, different config), modify `dataset.py` and rerun — do not create a new file. See the `workflow-architecture` skill for the `state.json` contract and back-propagation rules. 
+ +## SDK surface + +- `QuestionPipeline`, `ForwardLookingQuestionGenerator`, `QuestionAndLabelGenerator`, `TemplateQuestionGenerator`, `QuestionGenerator` +- `WebSearchLabeler`, `FileSetRAGLabeler` +- `NewsContextGenerator`, `FileSetContextGenerator` +- `BinaryAnswerType`, `ContinuousAnswerType`, `MultipleChoiceAnswerType`, `FreeResponseAnswerType` +- `lr.transforms.run()`, `lr.transforms.submit()`, `lr.transforms.estimate_cost()` +- `prepare_for_training`, `FilterParams`, `DedupParams`, `SplitParams` + +## Reference notebooks + +- `notebooks/getting_started/04_answer_types.ipynb` +- `notebooks/fine_tuning/02_trump_forecasting.ipynb` diff --git a/.claude/agents/fine-tuner.md b/.claude/agents/fine-tuner.md new file mode 100644 index 0000000..3c3729b --- /dev/null +++ b/.claude/agents/fine-tuner.md @@ -0,0 +1,46 @@ +--- +name: fine-tuner +description: Runs fine-tuning and evaluation jobs on prepared train/test datasets. Use when the user is ready to train a model or wants to evaluate training results. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - fine-tuning + - prediction-framing + - training-preparation + - workflow-architecture +--- + +You are the fine-tuner for Lightningrod. You take prepared train/test datasets and run training and evaluation jobs, iterating to improve results. + +## Approach + +1. Read `dataset_id` and `model_id` (if set) from `state.json` +2. Estimate training cost before running +3. Write `train.py`: imports `get_datasets` from `prepare.py`; calls `train_ds, _ = get_datasets(dataset_id)`; runs `lr.training.run(...)`; writes `model_id` to `state.json` +4. Write `eval.py`: imports `get_datasets` from `prepare.py`; calls `_, test_ds = get_datasets(dataset_id)`; reads `model_id` from `state.json`; runs `lr.evals.run(...)`; prints results +5. Run `train.py` first, then `eval.py` +6. Interpret eval results: if scores are poor, identify whether the issue is data quality or training config +7. 
If data quality: report specific issues to the orchestrator (e.g. "need more temporal diversity", "binary accuracy near 100% — questions too easy", "only 12 test samples after split") — do not touch `seeds.py` or `dataset.py` +8. If training config: adjust `TrainingConfig` in `train.py` and rerun + +## Output + +Always produce **both** `train.py` and `eval.py` — never one without the other. They are separate files so eval can be rerun freely without triggering a new training job. + +`train.py` must write `model_id` to `state.json`. `eval.py` must read `model_id` from `state.json` — never hardcode it. Always estimate cost before running training. + +See the `workflow-architecture` skill for the `state.json` contract and back-propagation rules. + +## SDK surface + +- `TrainingConfig(base_model, training_steps)` +- `lr.training.estimate_cost(config, dataset=train_ds)` +- `lr.training.run(config, dataset=train_ds, name="...")` +- `lr.evals.run(model_id=..., dataset=test_ds, benchmark_model_id="...")` +- `prepare_for_training`, `FilterParams`, `DedupParams`, `SplitParams` + +## Reference notebooks + +- `notebooks/getting_started/05_fine_tuning.ipynb` +- `notebooks/fine_tuning/02_trump_forecasting.ipynb` — full end-to-end example +- `notebooks/evaluation/` — evaluation patterns diff --git a/.claude/agents/news-seeds-specialist.md b/.claude/agents/news-seeds-specialist.md new file mode 100644 index 0000000..0e6ba7d --- /dev/null +++ b/.claude/agents/news-seeds-specialist.md @@ -0,0 +1,48 @@ +--- +name: news-seeds-specialist +description: Sources seeds from news articles and GDELT events using built-in seed generators. Use when the user wants to generate a dataset from recent news, current events, or geopolitical event data. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - seeds-sourcing + - transform-pipeline-verification +--- + +You are the news seeds specialist for Lightningrod. 
You receive domain-level instructions from the orchestrator and configure built-in news and event seed generators. + +## Input + +Instructions like: +- "news-based seeds, last 90 days, topic: US elections" +- "GDELT events, geopolitical conflicts, last 30 days" +- "tech news from Q1 2025, multiple search queries" + +## Output + +Write `seeds.py` containing the `NewsSeedGenerator` or `GdeltSeedGenerator` config. For news/GDELT, no ingestion step is needed — the seed generator runs inline, so `seeds.py` defines the config and writes `null` for `input_dataset_id` in `state.json`. + +Use constrained configs for iteration (7-day windows, narrow queries) unless the user requests a full run. + +Follow the `transform-pipeline-verification` skill to expose a seeds-only pipeline and run it to confirm the source returns well-formed articles before handing off to the dataset generator. + +See the `workflow-architecture` skill for the `state.json` contract. + +## Choosing between News and GDELT + +| Source | Best for | +|--------|----------| +| News (`NewsSeedGenerator`) | Topic-driven forecasting, current events, specific entities or themes | +| GDELT (`GdeltSeedGenerator`) | Event-centric and geopolitical forecasting; broader global coverage | + +Both work well with `ForwardLookingQuestionGenerator` and `WebSearchLabeler` for forecasting datasets. 
+ +## SDK surface + +- `NewsSeedGenerator(start_date, end_date, search_query, interval_duration_days, articles_per_search)` +- `GdeltSeedGenerator(start_date, end_date, interval_duration_days, articles_per_interval)` +- `QuestionPipeline(seed_generator=...)` — seeds-only pipeline for isolated verification + +## Reference notebooks + +- `notebooks/getting_started/01_news_datasource.ipynb` +- `notebooks/fine_tuning/02_trump_forecasting.ipynb` — news + forecasting end-to-end diff --git a/.claude/agents/private-dataset-seeds-specialist.md b/.claude/agents/private-dataset-seeds-specialist.md new file mode 100644 index 0000000..230af4c --- /dev/null +++ b/.claude/agents/private-dataset-seeds-specialist.md @@ -0,0 +1,37 @@ +--- +name: private-dataset-seeds-specialist +description: Prepares seeds from user-provided files and datasets. Use when the user has their own documents, CSVs, PDFs, or other files to use as the source for dataset generation. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - custom-dataset-seeds + - seeds-sourcing + - transform-pipeline-verification +--- + +You are the private dataset seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and help users turn their own files and datasets into seeds. + +## Approach + +1. Inspect the user's data: check format (CSV, PDF, text), row/file count, text quality, date coverage +2. Assess fitness: is there enough raw material for dataset generation? Flag issues early (too few rows, no dates, poor text quality) +3. Choose the right ingestion path: `files_to_samples` for local files, FileSet API for uploads +4. Write `seeds.py` with ingestion code and inline fitness checks (assert row count, spot-check text quality) +5. Use small subsets first (e.g. first 50 rows of a CSV, 5 files) to validate before full ingestion +6. 
Follow the `transform-pipeline-verification` skill to expose a seeds-only pipeline and run it to confirm ingestion produces well-formed rows before handing off to the dataset generator +7. Write `input_dataset_id` to `state.json` after the dataset is created + +See the `workflow-architecture` skill for the `state.json` contract. + +## SDK surface + +- `files_to_samples()`, `file_to_samples()`, `chunks_to_samples()` +- `lr.filesets.create()`, `lr.filesets.files.upload()` +- `lr.datasets.create_from_samples()` +- `FileSetSeedGenerator`, `FileSetQuerySeedGenerator` +- `QuestionPipeline(seed_generator=...)` — seeds-only pipeline for isolated verification + +## Reference notebooks + +- `notebooks/getting_started/02_custom_documents_datasource.ipynb` +- `notebooks/custom_filesets/` diff --git a/.claude/agents/public-dataset-seeds-specialist.md b/.claude/agents/public-dataset-seeds-specialist.md new file mode 100644 index 0000000..ac5960b --- /dev/null +++ b/.claude/agents/public-dataset-seeds-specialist.md @@ -0,0 +1,49 @@ +--- +name: public-dataset-seeds-specialist +description: Finds and converts public datasets into seeds. Use when the user has a domain but no data and needs to explore Kaggle, HuggingFace, or GitHub for raw datasets to use as seed material. +tools: Read, Grep, Glob, Edit, Bash +model: sonnet +skills: + - public-dataset-exploration + - custom-dataset-seeds + - transform-pipeline-verification +--- + +You are the public dataset seeds specialist for Lightningrod. You receive domain-level instructions from the orchestrator and operate in one of two modes. + +## Mode 1: Explore (scout and report) + +When the orchestrator asks you to assess whether a public dataset exists for a domain, **do not write any files yet**. Instead: + +1. Search Kaggle, HuggingFace, and GitHub for raw datasets relevant to the user's domain +2. Prefer raw or semi-structured data (articles, reports, event logs, tables) — not already-labeled training sets +3. 
Return a structured finding to the orchestrator: + - Top 1–3 candidate datasets with name, source, and URL + - Format (CSV, JSON, text files, etc.) and approximate size + - Whether dates are present and what the date range looks like + - Text quality assessment (prose vs. structured vs. garbled) + - Any caveats (license restrictions, requires account, large download) + +## Mode 2: Implement (write and verify seeds.py) + +Once the orchestrator has committed to a specific public dataset: + +1. Write `seeds.py` with download, conversion, and dataset creation code +2. Download a small subset first (e.g. first 10 files or 100 rows) to validate before full ingestion +3. Convert to seeds via `files_to_samples` or `lr.datasets.create_from_samples` +4. Follow the `transform-pipeline-verification` skill to expose a seeds-only pipeline and run it to confirm the ingested seeds look right before handing off to the dataset generator +5. Write `input_dataset_id` to `state.json` after the dataset is created + +See the `workflow-architecture` skill for the `state.json` contract. + +## SDK surface + +- `files_to_samples()`, `file_to_samples()`, `chunks_to_samples()` +- `lr.datasets.create_from_samples()` +- `lr.filesets.create()`, `lr.filesets.files.upload()` +- `QuestionPipeline(seed_generator=...)` — seeds-only pipeline for isolated verification + +## Reference notebooks + +- `notebooks/getting_started/02_custom_documents_datasource.ipynb` +- `notebooks/00_quickstart.ipynb` diff --git a/.claude/agents/workflow-orchestrator.md b/.claude/agents/workflow-orchestrator.md new file mode 100644 index 0000000..0e8ea91 --- /dev/null +++ b/.claude/agents/workflow-orchestrator.md @@ -0,0 +1,113 @@ +--- +name: workflow-orchestrator +description: Plans and orchestrates dataset generation and fine-tuning workflows end-to-end. Use when the user wants to generate a training dataset, fine-tune a model, or go from a high-level problem to a working solution using Lightningrod. 
+tools: Task(news-seeds-specialist, public-dataset-seeds-specialist, bigquery-seeds-specialist, private-dataset-seeds-specialist, dataset-generator, fine-tuner), Read, Grep, Glob, Edit, Bash, AskUserQuestion +model: sonnet +skills: + - workflow-architecture +--- + +You are the orchestrator for Lightningrod dataset generation and fine-tuning. You plan from high-level user requirements, delegate to specialists, and coordinate a set of Python files covering the full pipeline: seed sourcing → dataset generation → training preparation → fine-tuning → evaluation. + +## Operating principles + +**Business/domain level, not SDK level.** Know what's possible (news, documents, GDELT, BigQuery, forecasting questions, yes/no labels, fine-tuning) but communicate in higher-level terms. Never expose SDK class names (NewsSeedGenerator, QuestionPipeline, etc.) unless the user explicitly asks. + +**Translate goals into domain language.** "Political forecasting" → "news-based seeds + yes/no forecasting questions". Create a plan before delegating; present it in plain language a business person understands. + +**Delegate with domain-level instructions.** Give specialists instructions like "set up news-based seed sourcing for the last 90 days" or "forecasting questions with yes/no labels, web search for answers". Specialists translate to SDK config and code. + +**Minimal outputs for iteration.** Enforce small limits (e.g. 10 samples) for demo runs. Only scale up when the user confirms the output looks right. + +**Backtrack when needed.** When a specialist's output doesn't fit user intent, re-invoke with updated requirements in domain terms. Pass context: "The previous seeds focused on X but the user wanted Y." + +## Workflow + +1. Receive user's high-level goals +2. Ask clarifying questions if ambiguous (in plain language) +3. Create a plan; present it without jargon +4. 
**Initialize the project directory**: run `python .claude/templates/setup.py ` — creates `state.py` and `state.json`; idempotent if already exists +5. Delegate to the appropriate seeds specialist → produces `seeds.py` +6. Delegate to dataset-generator → produces `dataset.py` + `prepare.py` +7. If fine-tuning is requested: delegate to fine-tuner → produces `train.py` + `eval.py` +8. If fine-tuner reports poor results: identify root cause, coordinate back-propagation (see below) +9. If user feedback indicates mismatch at any step: re-invoke the appropriate specialist with updated requirements + +## Data source routing + +Some sources are obvious from context; others require exploration before committing. + +**Clear sources — delegate directly to implement:** + +| User situation | Delegate to | +|----------------|-------------| +| Wants news articles, GDELT, or has a forecasting use-case | `news-seeds-specialist` | +| Has their own files, CSVs, or documents | `private-dataset-seeds-specialist` | +| Explicitly requests a specific BigQuery table | `bigquery-seeds-specialist` | + +**Ambiguous sources — explore in parallel first:** + +When the user has a domain but no clear data source (e.g. "I want to build a sports forecasting dataset"), **do not commit to a source yet**. Instead: + +1. Delegate to `public-dataset-seeds-specialist` AND `bigquery-seeds-specialist` simultaneously, both in **explore mode** ("scout and report — do not write any files") +2. Collect their findings (candidate datasets, schema previews, data quality, caveats) +3. Synthesize and present a recommendation to the user with trade-offs +4. Once the user (or you) decides, re-invoke the winning specialist in **implement mode** to write `seeds.py` + +## Domain vocabulary + +Use these terms with users and when delegating. Do not expose SDK class names. 
+ +| Domain term | SDK equivalent | +|-------------|----------------| +| news articles | NewsSeedGenerator | +| GDELT events | GdeltSeedGenerator | +| BigQuery dataset | BigQuerySeedGenerator | +| user's documents / files | FileSetSeedGenerator, files_to_samples | +| forecasting questions | ForwardLookingQuestionGenerator | +| template-based questions | TemplateQuestionGenerator | +| yes/no labels | BinaryAnswerType | +| numeric labels | ContinuousAnswerType | +| multiple choice | MultipleChoiceAnswerType | +| free-form text | FreeResponseAnswerType | +| web search for answers | WebSearchLabeler | +| training data prep | prepare_for_training | +| fine-tuning | lr.training.run | +| evaluation | lr.evals.run | + +## Project structure + +All work produces a set of plain Python files (see `workflow-architecture` skill for full details): + +| File | Produced by | Purpose | +|------|-------------|---------| +| `seeds.py` | seeds specialist | Seed source config and ingestion | +| `dataset.py` | dataset-generator | Pipeline and transforms run | +| `prepare.py` | dataset-generator | `get_datasets()` — prepare_for_training config; imported by train + eval | +| `train.py` | fine-tuner | Fine-tuning job | +| `eval.py` | fine-tuner | Evaluation — reruns freely without side effects | +| `state.json` | all agents | Shared resource IDs only | + +Each file is independently runnable. Rerunning `eval.py` never affects `train.py`; rerunning `train.py` never affects `dataset.py`. 
+ +## Back-propagation — your responsibility as orchestrator + +When a downstream agent needs upstream changes, **you coordinate the cascade** — agents never modify each other's files: + +- **Poor eval results** → fine-tuner reports root cause → you decide whether it's a data issue (delegate dataset-generator to modify `dataset.py` + rerun) or a training config issue (fine-tuner adjusts `train.py`) +- **Dataset too small / poor quality** → dataset-generator reports to you → delegate seeds specialist to modify `seeds.py` + rerun, then dataset-generator reruns `dataset.py` +- Always pass specific, actionable requirements when re-delegating (e.g. "extend date range to 6 months", "increase max_questions to 500", "add news context generator") + +## When to backtrack + +- User says "that's not what I meant" or "the questions are wrong" → re-invoke seeds or dataset-generator with clarified requirements +- `prepare_for_training` fails or produces too few samples → coordinate seeds specialist and/or dataset-generator +- Eval scores are poor → fine-tuner identifies root cause; you coordinate the upstream fix +- Always identify *which file* caused the mismatch before re-delegating + +## Minimal-output iteration + +- Default `max_questions=10` (or 5–20) for demo +- Restrict date ranges, search queries, file counts when exploring +- Scale up only when user confirms output looks right +- Use `estimate_cost()` before scaling; show cost implications diff --git a/.claude/commands/estimate-cost.md b/.claude/commands/estimate-cost.md new file mode 100644 index 0000000..83cbcec --- /dev/null +++ b/.claude/commands/estimate-cost.md @@ -0,0 +1,3 @@ +Estimate the cost of running a Lightningrod dataset generation pipeline. Use the dataset-generator specialist to configure a pipeline and estimate cost before scaling to a full run. + +Provide pipeline details or point to an existing notebook. The specialist will use lr.transforms.estimate_cost(pipeline, max_questions=N) and show cost implications. 
diff --git a/.claude/commands/fine-tune.md b/.claude/commands/fine-tune.md new file mode 100644 index 0000000..973f209 --- /dev/null +++ b/.claude/commands/fine-tune.md @@ -0,0 +1,13 @@ +Start a fine-tuning workflow. The orchestrator will coordinate dataset generation (if needed) and fine-tuning, iterating toward good training results. + +Use this when you: +- Already have a Lightningrod dataset and want to fine-tune a model on it +- Want to generate a dataset and immediately fine-tune +- Want to evaluate an existing fine-tuned model + +Describe your goal — for example: +- "Fine-tune on my existing dataset ds_abc123" +- "Generate a forecasting dataset from news and fine-tune a model end-to-end" +- "Evaluate model model_xyz against gpt-4o on my test set" + +The orchestrator will estimate costs before running any training jobs. diff --git a/.claude/commands/generate-dataset.md b/.claude/commands/generate-dataset.md new file mode 100644 index 0000000..5bf708f --- /dev/null +++ b/.claude/commands/generate-dataset.md @@ -0,0 +1,9 @@ +Start the full Lightningrod dataset generation workflow. The orchestrator will take over: gather your goals, create a plan, and delegate to specialists to produce a set of Python files covering the full pipeline (seed sourcing → transforms → training prep → optional fine-tuning). + +Describe what you want to achieve — for example: +- "Generate a political forecasting dataset from news" +- "I have documents about X, turn them into a Q&A dataset" +- "Use BigQuery public data to build a training dataset" +- "Fine-tune a model on my CSV of historical outcomes" + +The orchestrator will start with minimal outputs (10 samples) for fast iteration and scale up once you confirm the results look right. 
diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..f64f95f --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,3 @@ +{ + "agent": "workflow-orchestrator" +} diff --git a/.claude/skills/bigquery-seeds/SKILL.md b/.claude/skills/bigquery-seeds/SKILL.md new file mode 100644 index 0000000..4f8a586 --- /dev/null +++ b/.claude/skills/bigquery-seeds/SKILL.md @@ -0,0 +1,78 @@ +--- +name: bigquery-seeds +description: BigQuery seed sourcing patterns for Lightningrod. Use when sourcing seeds from BigQuery tables. +--- + +# BigQuery Seeds + +## BigQuerySeedGenerator + +```python +from lightningrod import BigQuerySeedGenerator + +seed_generator = BigQuerySeedGenerator( + query="SELECT text, created_at FROM `bigquery-public-data.hacker_news.full` LIMIT 1000", + seed_text_column="text", + date_column="created_at", + max_rows=100, # Start small for iteration +) +``` + +**No GCP account or credentials required.** Lightningrod manages BigQuery access and billing internally. The user does not need to set up a Google Cloud project or provide any credentials. + +**Supported datasets: any publicly queryable BigQuery dataset.** Because Lightningrod uses its own GCP project credentials under the hood, any dataset that is open to any GCP project without requiring explicit IAM access grants will work. This includes `bigquery-public-data.*` but also community-hosted public datasets like `githubarchive.*`. Private or user-owned BigQuery tables (those requiring a specific account to be granted access) are not supported. + +**If unsure whether a dataset is queryable**, try a schema inspection query first — if it returns results without an access error, it works. 
+ +## Known queryable datasets + +| Dataset | Description | Useful tables | +|---------|-------------|---------------| +| `bigquery-public-data.hacker_news` | HN posts and comments | `full`, `stories` | +| `bigquery-public-data.github_repos` | GitHub commit metadata and file contents | `commits`, `contents` | +| `bigquery-public-data.gdelt_samples` | GDELT news events | `full` | +| `bigquery-public-data.stackoverflow` | SO questions and answers | `posts_questions`, `posts_answers` | +| `bigquery-public-data.wikipedia` | Wikipedia article text | `articles` | +| `githubarchive.*` | GitHub event stream by year/month/day (stars, forks, PRs, issues) — see [gharchive.org](https://www.gharchive.org/#bigquery) | `githubarchive.year.*`, `githubarchive.month.*`, `githubarchive.day.*` | + +Other community-hosted public datasets likely work too — verify with a schema inspection query before committing to them. + +## Schema inspection + +Before writing the seed query, inspect the table schema: + +```sql +SELECT column_name, data_type +FROM `bigquery-public-data.hacker_news.INFORMATION_SCHEMA.COLUMNS` +WHERE table_name = 'full' +ORDER BY ordinal_position +``` + +Or preview rows: + +```sql +SELECT * FROM `bigquery-public-data.hacker_news.full` LIMIT 5 +``` + +## Label-in-SQL pattern + +When ground truth is available in the table (e.g. upvote scores, accepted answers), embed it in the seed text so `QuestionAndLabelGenerator` can extract it — no separate labeler needed: + +```sql +SELECT + CONCAT( + 'Title: ', title, '\n', + 'Score: ', CAST(score AS STRING), '\n', + 'Text: ', COALESCE(text, '') + ) AS seed_text, + timestamp AS date +FROM `bigquery-public-data.hacker_news.stories` +WHERE score IS NOT NULL +LIMIT 500 +``` + +Then pair with `QuestionAndLabelGenerator`, which extracts both the question and label from the seed text. + +## Reference + +See `notebooks/getting_started/03_bigquery_datasource.ipynb` for a full example. 
diff --git a/.claude/skills/custom-dataset-seeds/SKILL.md b/.claude/skills/custom-dataset-seeds/SKILL.md new file mode 100644 index 0000000..96241a8 --- /dev/null +++ b/.claude/skills/custom-dataset-seeds/SKILL.md @@ -0,0 +1,75 @@ +--- +name: custom-dataset-seeds +description: Seed generation from user-provided files and custom datasets. Use when converting local files, CSVs, PDFs, or user uploads into Lightningrod seeds. +--- + +# Custom Dataset Seeds + +## Converting files to samples + +```python +from lightningrod import preprocessing + +# Glob pattern — supports .txt, .md, .pdf, .csv +samples = preprocessing.files_to_samples( + "data/*.pdf", + chunk_size=1000, + chunk_overlap=100, +) + +# Single file +samples = preprocessing.file_to_samples("report.pdf") + +# CSV with explicit columns +samples = preprocessing.files_to_samples( + "data.csv", + csv_text_column="body", + csv_label_column="outcome", # optional — embeds label in sample +) + +# Raw string chunks +samples = preprocessing.chunks_to_samples(chunks, metadata={"source": "internal"}) +``` + +## Creating an input dataset + +```python +input_dataset = lr.datasets.create_from_samples(samples, batch_size=1000) + +# Pass to lr.transforms.run(): +dataset = lr.transforms.run(pipeline, input_dataset=input_dataset, max_questions=10) +``` + +## FileSet upload (for larger collections) + +```python +fs = lr.filesets.create(name="my-docs", description="Internal reports") +lr.filesets.files.upload(fs.id, "report.pdf", file_date="2025-01-15") + +# Then use FileSetSeedGenerator(file_set_id=fs.id) in the pipeline +``` + +## Fitness assessment + +Before building a pipeline, check that the data is suitable: + +| Check | How | Minimum bar | +|-------|-----|-------------| +| Volume | `len(samples)` | ≥ 50 samples for a meaningful demo | +| Date coverage | Check `sample.date` fields | Dates present for temporal split; span ≥ 30 days for forecasting | +| Text quality | Spot-check `sample.text` values | Readable prose, not 
garbled OCR or empty strings | +| Label availability | Check `sample.label` if using `QuestionAndLabelGenerator` | Labels present and non-null | + +If the data fails a check, surface the issue to the orchestrator before proceeding. + +## Chunking guidance + +- Default `chunk_size=1000`, `chunk_overlap=100` works for most documents +- Dense technical text: use smaller chunks (`chunk_size=500`) +- Narrative/long-form text: larger chunks are fine (`chunk_size=1500`) +- CSVs: each row becomes one sample — chunking parameters are ignored + +## Reference notebooks + +- `notebooks/getting_started/02_custom_documents_datasource.ipynb` +- `notebooks/custom_filesets/` diff --git a/.claude/skills/dataset-generation/SKILL.md b/.claude/skills/dataset-generation/SKILL.md new file mode 100644 index 0000000..f8691b3 --- /dev/null +++ b/.claude/skills/dataset-generation/SKILL.md @@ -0,0 +1,82 @@ +--- +name: dataset-generation +description: Dataset generation pipeline patterns for Lightningrod. Use when configuring QuestionPipeline, choosing answer types, question generators, labelers, and running transforms. +--- + +# Dataset Generation + +## Answer types + +- **`BinaryAnswerType`** — Yes/no questions ("Will X happen?") +- **`ContinuousAnswerType`** — Numeric answers ("What will the price be?") +- **`MultipleChoiceAnswerType`** — Fixed set of choices +- **`FreeResponseAnswerType`** — Open-ended text answers + +For guidance on which answer type to recommend and how each affects fine-tuning performance, see the `prediction-framing` skill. + +## Question generators + +- **`ForwardLookingQuestionGenerator`** — Forecasting questions from news/events. 
Takes `instructions`, `answer_type`, optional `examples`/`bad_examples`, `questions_per_seed`, `filter_` (`FilterCriteria`) +- **`QuestionGenerator`** — General question generation from any seed content +- **`TemplateQuestionGenerator`** — Template-based generation with variable substitution +- **`QuestionAndLabelGenerator`** — Generates questions AND labels in one step. Use when ground truth is embedded in the seed (e.g. BigQuery rows with known outcomes). No separate labeler needed. + +## Labelers + +- **`WebSearchLabeler(answer_type)`** — Labels questions via web search. Use for forecasting where answers can be looked up +- **`FileSetRAGLabeler`** — Labels via RAG against a FileSet + +## Context generators (optional) + +- **`NewsContextGenerator(articles_per_query, num_search_queries, num_articles)`** — Adds recent news context to each question +- **`FileSetContextGenerator`** — Adds RAG context from a FileSet + +## QuestionPipeline structure + +```python +from lightningrod import ( + QuestionPipeline, ForwardLookingQuestionGenerator, + WebSearchLabeler, BinaryAnswerType, NewsContextGenerator, +) + +answer_type = BinaryAnswerType() +pipeline = QuestionPipeline( + seed_generator=seed_generator, + question_generator=ForwardLookingQuestionGenerator( + instructions="Generate forward-looking yes/no questions about X.", + answer_type=answer_type, + ), + labeler=WebSearchLabeler(answer_type=answer_type), + context_generators=[NewsContextGenerator(articles_per_query=3)], # optional +) +``` + +## Cost estimation + +Always estimate before scaling up: + +```python +cost = lr.transforms.estimate_cost(pipeline, max_questions=1000) +print(cost) +``` + +## Run vs submit + +```python +# Blocking — good for notebooks and small runs +MAX_QUESTIONS = 10 # Increase for full run (e.g. 
1000) +dataset = lr.transforms.run(pipeline, max_questions=MAX_QUESTIONS, name="my-dataset") + +# Non-blocking — for long runs +job = lr.transforms.submit(pipeline, max_questions=1000, name="my-dataset") +``` + +## Output + +```python +rows = dataset.flattened(answer_type) # list of dicts, ready for DataFrame +import pandas as pd +pd.DataFrame(rows) +``` + +Next step: pass `dataset` to `prepare_for_training` to filter, deduplicate, and split. diff --git a/.claude/skills/fine-tuning/SKILL.md b/.claude/skills/fine-tuning/SKILL.md new file mode 100644 index 0000000..f17d2eb --- /dev/null +++ b/.claude/skills/fine-tuning/SKILL.md @@ -0,0 +1,63 @@ +--- +name: fine-tuning +description: Fine-tuning and evaluation patterns for Lightningrod. Use when running training jobs, estimating training cost, or evaluating model performance. +--- + +# Fine-Tuning + +## TrainingConfig + +```python +from lightningrod import TrainingConfig + +config = TrainingConfig( + base_model="Qwen/Qwen3-4B-Instruct", # see available models below + training_steps=50, +) +``` + +Available base models (check `lr.training` for current list): `Qwen/Qwen3-4B-Instruct`, `Qwen/Qwen3-8B-Instruct`, `meta-llama/Llama-3.1-8B-Instruct`, and others. + +## Always estimate cost first + +```python +cost = lr.training.estimate_cost(config, dataset=train_ds) +print(cost) +``` + +## Run training + +```python +job = lr.training.run(config, dataset=train_ds, name="my-model-v1") +# Blocks until complete. job.model_id is available when done. 
+print(job.model_id) +``` + +## Run evaluation + +```python +eval_job = lr.evals.run( + model_id=job.model_id, + dataset=test_ds, + benchmark_model_id="openai/gpt-4o", # comparison baseline +) +``` + +## Iteration loop + +If eval scores are poor, identify the root cause before re-running: + +| Symptom | Likely cause | Action | +|---------|-------------|--------| +| Score barely above baseline | Not enough training data | Go back to dataset-generator: increase `max_questions`, broaden seed sources | +| Score worse than baseline | Data quality issue | Go back to dataset-generator: tighten question generator instructions, check `prepare_for_training` stats | +| Train/test distribution mismatch | Temporal split too aggressive | Adjust `SplitParams.test_start` or `test_size` | +| Overfitting (train >> test) | Too many steps or too little data | Reduce `training_steps` or get more data | + +Always pass specific guidance when flagging back to the dataset-generator (e.g. "need more temporal diversity across 6 months", "too few test samples — only 12 after split"). + +## Reference notebooks + +- `notebooks/getting_started/05_fine_tuning.ipynb` +- `notebooks/fine_tuning/02_trump_forecasting.ipynb` — full end-to-end example +- `notebooks/evaluation/` — evaluation patterns diff --git a/.claude/skills/prediction-framing/SKILL.md b/.claude/skills/prediction-framing/SKILL.md new file mode 100644 index 0000000..075374d --- /dev/null +++ b/.claude/skills/prediction-framing/SKILL.md @@ -0,0 +1,72 @@ +--- +name: prediction-framing +description: How prediction question format and answer type choices affect fine-tuning performance. Use when recommending answer types, deciding whether to normalize numeric outputs, or diagnosing poor training results caused by answer type mismatch. +--- + +# Prediction Framing + +How you frame a prediction question determines the quality of the training signal. 
Users often gravitate toward numeric or multiple choice because it feels more expressive — but that usually hurts training. Always recommend based on what will train best, not just what fits the question surface. + +## Answer type decision guide + +### Binary — default for forecasting +"Will X happen before date Y?" — yes/no. + +**Use this unless there's a specific reason not to.** Binary gives: +- Cleanest training signal — unambiguous 0/1 label +- Highest labeling reliability via web search +- Best calibration properties for GRPO/RL fine-tuning +- Highest data yield (more labelable questions per seed) + +When a user's goal seems numeric ("predict the star count"), try reframing as binary first: *"Will the repo exceed 1000 stars within 7 days?"* — this almost always trains better. + +### Multiple choice — when outcomes are naturally discrete +"Which range will X fall into? A) <100 B) 100–500 C) 500–2000 D) 2000+" + +Use when the outcome space has meaningful natural categories. But: +- **Equal-frequency buckets** (e.g. quartiles from historical data), not equal-width — avoids class imbalance, gives the model an even training signal +- Cap at 4 choices; more options increases labeling noise and model confusion +- If binary can express the same decision, prefer binary + +### Numeric — only when relative magnitude matters; always normalize +"Predict the exact star count 7 days post-launch." + +High-variance training signal. Only use when the magnitude itself is the thing being learned. Always normalize: + +| Distribution shape | Normalization | Example | +|-------------------|---------------|---------| +| Power-law / long tail | Log-transform: `log(1 + x)` | Star counts, view counts, revenue, prices | +| Relative comparison | Percentile rank within peer group | Rank vs. 
similar repos launched same week | +| Naturally bounded range | Min-max scaling to [0, 1] | Percentage, ratio, score out of 100 | + +Raw integers are almost always a mistake — the model has no way to know if 1000 vs. 1001 is meaningful. + +### Free response — rarely suitable for fine-tuning +Open-ended text answers. Hard to label consistently; high variance in training signal. Reserve for evaluation/benchmarking, not training data generation. + +## Worked example: "predict GitHub star growth from an HN launch" + +This is a common pattern that illustrates all the pitfalls: + +**❌ Total stars** — wrong quantity entirely. Conflates "repo was already popular before the post" with "grew because of HN". Never use absolute follower/star counts as a prediction target. + +**⚠️ Stars gained in 7 days (raw numeric)** — right quantity, wrong format. Power-law distributed: a few posts drive thousands of stars, most drive tens. Raw regression is badly calibrated and hard to label reliably. + +**✓ log(1 + stars_gained_7d) (normalized numeric)** — better. Tames the long tail. But you still have a regression problem and labeling noise. Use only if you specifically need the magnitude. + +**✓✓ Binary** — simplest good option. Pick a meaningful threshold (e.g. median star growth for HN posts, ~100 stars in 7 days) and frame as: *"Will this HN post drive 100+ GitHub stars within 7 days?"* Clean 0/1 signal, easy to label, trains well. + +**✓✓ Percentile-bucketed multiple choice** — best option for nuance without regression. Rank each post's star growth against other HN posts in the same time window, split into equal-frequency quartiles (bottom 25% / 25–50% / 50–75% / top 25%). Fully handles the power-law, avoids regression, gives clean classification signal. + +The general pattern: **always predict growth over a defined window relative to the event, never absolute totals. 
Then prefer binary or equal-frequency multiple choice over raw numeric.** + +## Diagnosing answer type problems after training + +If eval scores are poor, check whether the answer type was a contributing factor: + +| Symptom | Likely framing issue | Fix | +|---------|---------------------|-----| +| Model predicts same answer for everything | Class imbalance in multiple choice | Switch to equal-frequency buckets or binary | +| Numeric predictions are wildly off scale | No normalization applied | Apply log-transform or percentile normalization | +| Low labeling confidence in dataset stats | Answer type too hard for web search to resolve | Simplify to binary or reframe the question | +| Model barely beats baseline despite good data volume | Noisy labels from numeric/free-response | Reframe as binary threshold question | diff --git a/.claude/skills/public-dataset-exploration/SKILL.md b/.claude/skills/public-dataset-exploration/SKILL.md new file mode 100644 index 0000000..3602988 --- /dev/null +++ b/.claude/skills/public-dataset-exploration/SKILL.md @@ -0,0 +1,41 @@ +--- +name: public-dataset-exploration +description: Explore Kaggle, Hugging Face, GitHub for raw datasets to convert to seeds. Use when user has a domain but no data. +--- + +# Public Dataset Exploration + +## When to use + +User has a domain (e.g. "sports forecasting", "medical Q&A") but no documents. Explore public marketplaces for raw datasets that can become seeds. 
+ +## Marketplaces + +- **Kaggle:** kaggle.com/datasets — search by topic, check license +- **Hugging Face:** huggingface.co/datasets — many formats, often with load_dataset() +- **GitHub:** awesome-datasets, domain-specific repos — raw CSVs, JSON, text + +## Criteria for "relevant but not training-ready" + +Look for: +- Raw or semi-structured data (articles, reports, event logs, tables) +- Not already Q&A pairs or instruction-following format +- Content that could yield forecasting questions or document-based Q&A +- Reasonable license for use + +Avoid: +- Already fine-tuned / instruction datasets +- Purely synthetic or already labeled for training + +## Flow + +1. Search marketplaces for domain + "dataset" or "raw data" +2. Identify 1–3 candidates; check format (CSV, JSON, PDF, text) +3. Download (Kaggle API, huggingface_hub, git clone, or wget) +4. Convert to samples via files_to_samples or file_to_samples +5. Create input dataset with lr.datasets.create_from_samples +6. Add notebook cells for download + conversion + pipeline + +## Minimal iteration + +Download a small subset first (e.g. first 10 files, or head of CSV). Validate pipeline before full download. diff --git a/.claude/skills/seeds-sourcing/SKILL.md b/.claude/skills/seeds-sourcing/SKILL.md new file mode 100644 index 0000000..786dd69 --- /dev/null +++ b/.claude/skills/seeds-sourcing/SKILL.md @@ -0,0 +1,56 @@ +--- +name: seeds-sourcing +description: Seed sourcing patterns for Lightningrod. Use when choosing between news, GDELT, or FileSet seed generators. +--- + +# Seeds Sourcing + +## Built-in seed generators + +**News (`NewsSeedGenerator`):** News articles from a date range and search query. Best for forecasting, current events, time-sensitive topics. 
+ +```python +from lightningrod import NewsSeedGenerator +from datetime import datetime + +seed_generator = NewsSeedGenerator( + start_date=datetime(2025, 1, 1), + end_date=datetime(2025, 2, 1), + search_query="technology", # or list: ["tech", "AI"] + interval_duration_days=7, + articles_per_search=5, +) +``` + +**GDELT (`GdeltSeedGenerator`):** GDELT global event database. Best for event-based forecasting and geopolitical topics. + +```python +from lightningrod import GdeltSeedGenerator + +seed_generator = GdeltSeedGenerator( + start_date=datetime(2025, 1, 1), + end_date=datetime(2025, 2, 1), + interval_duration_days=7, + articles_per_interval=10, +) +``` + +**FileSet (`FileSetSeedGenerator`, `FileSetQuerySeedGenerator`):** Documents uploaded to Lightningrod. Use when the user has PDFs, text files, or CSVs already in a FileSet. + +```python +from lightningrod import FileSetSeedGenerator + +seed_generator = FileSetSeedGenerator(file_set_id="fs_abc123") +``` + +## When to use which + +| Source | Use when | +|--------|----------| +| News | Forecasting from current events, news-driven questions | +| GDELT | Event-centric, geopolitical forecasting | +| FileSet | User has documents in Lightningrod; want to query/chunk them | + +## Iteration constraints + +For demo/iteration: short date ranges (7 days not 90), narrow search queries, few files. Scale up only when user confirms output looks right. diff --git a/.claude/skills/training-preparation/SKILL.md b/.claude/skills/training-preparation/SKILL.md new file mode 100644 index 0000000..e9788c1 --- /dev/null +++ b/.claude/skills/training-preparation/SKILL.md @@ -0,0 +1,69 @@ +--- +name: training-preparation +description: Training data preparation patterns for Lightningrod. Use when running prepare_for_training, configuring FilterParams/DedupParams/SplitParams, or handling validation errors. 
+--- + +# Training Preparation + +## prepare_for_training + +```python +from lightningrod import prepare_for_training, FilterParams, DedupParams, SplitParams + +train_ds, test_ds = prepare_for_training( + dataset, + filter=FilterParams( + days_to_resolution_range=(1, 60), # keep questions resolving within this window + drop_missing_context=False, + ), + dedup=DedupParams( + key_fn=None, # default key: (question_text, resolution_date) + ), + split=SplitParams( + strategy="temporal", # "temporal" or "random" + test_size=0.2, + test_start=None, # explicit cutoff date (optional) + leakage_keys=None, + filter_leaky_train=True, + ), + verbose=True, +) +``` + +Returns `(train_SampleDataset, test_SampleDataset)`. In notebooks displays a rich validation table. + +## Common FilterParams adjustments + +| Problem | Fix | +|---------|-----| +| Too few samples after filter | Widen `days_to_resolution_range`, e.g. `(1, 90)` | +| Questions without context | Set `drop_missing_context=False` or regenerate with context | +| Want only resolved questions | Default behavior — unresolved are filtered automatically | + +## Validation errors + +`prepare_for_training` raises `ValueError` with actionable tips when the dataset is unhealthy: + +- **Too few samples** → re-run transforms with more `max_questions`, or widen filter range +- **High dedup rate** → seeds are too repetitive; use more diverse seed sources or date ranges +- **High invalid rate** → question quality is poor; tighten question generator instructions +- **Temporal leakage** → test questions overlap with train date range; adjust `test_start` or use `strategy="temporal"` + +## Iteration loop + +``` +prepare_for_training fails or produces poor split + → check error message for specific cause + → if filter issue: adjust FilterParams and retry + → if volume issue: go back to dataset-generator, re-run with more max_questions + → if quality issue: go back to dataset-generator, tighten pipeline instructions +``` + +## Inspecting the 
split + +```python +import pandas as pd +from lightningrod.training import to_record + +pd.DataFrame([to_record(s) for s in train_ds.samples]) +``` diff --git a/.claude/skills/transform-pipeline-verification/SKILL.md b/.claude/skills/transform-pipeline-verification/SKILL.md new file mode 100644 index 0000000..e4acbe7 --- /dev/null +++ b/.claude/skills/transform-pipeline-verification/SKILL.md @@ -0,0 +1,57 @@ +--- +name: transform-pipeline-verification +description: Pattern for running and verifying transform pipeline output at any stage (seeds-only or full). Use when writing seeds.py or dataset.py to run the pipeline, inspect output quality iteratively with explore.py, and only report back once verified. +--- + +# Transform Pipeline Verification + +Each pipeline stage (`seeds.py`, `dataset.py`) should be independently runnable. After a run, use `explore.py` to iteratively verify output quality before reporting back to the orchestrator. + +## Phase 1: Run the pipeline + +Only plug in the minimum components you are responsible for to `QuestionPipeline`, populate any (or multiple) of: seed_generator, question_generator, labeler, context_generators, renderer, rollout_generator. + +```python +pipeline = QuestionPipeline(...) + +if __name__ == "__main__": + lr_client = get_client() + cost_estimate = lr_client.transforms.estimate_cost(pipeline, max_questions=) + dataset = lr_client.transforms.run(pipeline, max_questions=, name="_seeds") +``` + +For full pipeline: same pattern with question_generator and labeler configured. + +After `transforms.run()`, stdout shows the dataset ID. Pipeline scripts print an explore hint, e.g. `Explore: python explore.py --summary`. + +## Phase 2: Explore output iteratively + +Use `explore.py` to probe the dataset and verify for quality and make sure the output roughly matches your expectations. 
+ +```bash +python explore.py [--summary] [--samples N] [--valid N] [--invalid N] [--labels N] [--truncate N] +``` + +| Flag | Use when | +|------|----------| +| `--summary` (default) | First check — validity %, label distribution | +| `--samples N` | Spot-check N random rows (seed_text or question+label) | +| `--valid N` | Inspect N valid samples | +| `--invalid N` | Debug failures — see `invalid_reason` for N invalid samples | +| `--labels N` | Quality check — question + label + reasoning side-by-side | +| `--truncate N` | Override max chars for long text fields (default: 120) | + +Run from the project directory. Iterate until confident: e.g. `--summary` shows 30% invalid → `--invalid 10` to see why → adjust pipeline config → rerun. + +## Completing the step + +1. Run the pipeline +2. Run `explore.py --summary` and confirm validity +3. Iteratively probe with `--samples`, `--invalid`, `--labels` as needed +4. Only then write to `state.json` and report back to the orchestrator + +## Why + +- Cheap seeds-only runs catch SQL/ingestion errors before the full pipeline +- `explore.py` owns download and caching — no extra code in pipeline scripts +- Iterative inspection surfaces label quality issues, filter reasons, and bad seeds that a one-time print would miss diff --git a/.claude/skills/workflow-architecture/SKILL.md b/.claude/skills/workflow-architecture/SKILL.md new file mode 100644 index 0000000..267db2e --- /dev/null +++ b/.claude/skills/workflow-architecture/SKILL.md @@ -0,0 +1,147 @@ +--- +name: workflow-architecture +description: File-based workflow structure for Lightningrod projects. Use when creating or modifying project files, understanding agent ownership boundaries, reading/writing shared state, or coordinating back-propagation between agents. +--- + +# Workflow Architecture + +Each stage of the pipeline lives in its own plain Python file. 
Files are independently runnable — rerunning `eval.py` never affects `train.py`, rerunning `train.py` never affects `dataset.py`, and so on. + +## Project file structure + +``` +/ + state.py # Shared state utilities — copied from .claude/templates/state.py, never modified + state.json # Shared run state: resource IDs only (read/written by all agents) + seeds.py # Seed preparation (owned by seeds specialist) + dataset.py # Dataset generation (owned by dataset-generator) + prepare.py # prepare_for_training config (owned by dataset-generator, imported by train + eval) + train.py # Fine-tuning (owned by fine-tuner) + eval.py # Evaluation (owned by fine-tuner — separate from training) +``` + +## Project initialization + +Before any agent writes code, the orchestrator initializes the project directory by running the setup script from the repo: + +```bash +python .claude/templates/setup.py +``` + +This copies `state.py` from `.claude/templates/` and creates a blank `state.json`. It is idempotent — safe to run again if the directory already exists. + +Agents never write state management or client initialization inline. They always import from `state.py`: + +```python +from state import get_client, State + +lr = get_client() +state = State.load() + +# Read a field — raises automatically if not yet populated +dataset_id = state.dataset_id + +# input_dataset_id is Optional — returns None for news/GDELT seeds +if state.input_dataset_id: + input_dataset = lr.datasets.get(state.input_dataset_id) + +# Write back +state.model_id = job.model_id +state.save() +``` + +## File ownership — strict + +Each agent may only create or modify its own file(s). No agent touches another agent's file. 
+
+| File | Owner | Can modify |
+|------|-------|-----------|
+| `seeds.py` | seeds specialist (whichever is active) | seeds specialist only |
+| `dataset.py` | dataset-generator | dataset-generator only |
+| `prepare.py` | dataset-generator | dataset-generator only |
+| `train.py` | fine-tuner | fine-tuner only |
+| `eval.py` | fine-tuner | fine-tuner only |
+| `state.json` | all agents | all agents (read + write) |
+
+## state.json — shared run state
+
+Resource IDs only — no config. Each script reads its inputs from `state.json` at startup and writes its outputs after creating a resource.
+
+```json
+{
+  "input_dataset_id": "ds_abc123",
+  "dataset_id": "ds_def456",
+  "model_id": null
+}
+```
+
+**Important:** `train_dataset_id` and `test_dataset_id` do not exist as stored resources and must never appear in `state.json`. The `prepare_for_training` config lives in `prepare.py` (see below), not in `state.json`. Config belongs in code; IDs belong in state.
+
+Keys are set to `null` until the responsible script has been run. Read a value that must exist via its `State` property from `state.py` (e.g. `State.load().dataset_id`) — it raises a clear error with the current state if it's missing or null.
+
+## What each file does
+
+### seeds.py
+- Configures and validates the seed source (news query, BigQuery SQL, file ingestion, etc.)
+- For file/BigQuery sources: runs ingestion and creates a Lightningrod input dataset +- For news/GDELT sources: validates the config and optionally previews a few seeds +- Writes `input_dataset_id` to `state.json` (set to `null` for news/GDELT — seed generator is inline) + +### dataset.py +- Reads `input_dataset_id` from `state.json` (or uses inline seed generator for news/GDELT) +- Configures and runs the `QuestionPipeline` with `MAX_QUESTIONS = 10` by default +- Calls `get_datasets()` from `prepare.py` to validate the split is healthy (correct volume, no leakage, clean dedup) +- Writes `dataset_id` to `state.json` + +### prepare.py +- Defines and exports `get_datasets(dataset_id) -> (train_ds, test_ds)` — the single source of truth for `prepare_for_training` config +- Imported by `dataset.py` (for validation), `train.py`, and `eval.py` +- When the dataset-generator adjusts filter/split params, this is the only file that changes + +```python +# prepare.py +import lightningrod as lr +from lightningrod import prepare_for_training, FilterParams, DedupParams, SplitParams + +def get_datasets(dataset_id): + dataset = lr.datasets.get(dataset_id) + return prepare_for_training( + dataset, + filter=FilterParams(days_to_resolution_range=(1, 60)), + dedup=DedupParams(), + split=SplitParams(strategy="temporal", test_size=0.2), + ) +``` + +### train.py +- Reads `dataset_id` from `state.json` +- Calls `from prepare import get_datasets; train_ds, _ = get_datasets(dataset_id)` +- Estimates cost, then runs `lr.training.run(...)` +- Writes `model_id` to `state.json` + +### eval.py +- Reads `dataset_id` and `model_id` from `state.json` +- Calls `from prepare import get_datasets; _, test_ds = get_datasets(dataset_id)` +- Runs `lr.evals.run(...)` and prints results +- Writes nothing — safe to rerun any number of times without side effects + +## Back-propagation protocol + +When a downstream agent determines that an upstream stage needs to change, it **never modifies the upstream file 
directly**. Instead:
+
+1. **Fine-tuner → dataset-generator**: Fine-tuner reports specific issues to the orchestrator (e.g. "too few test samples after split", "questions are too easy — binary accuracy near 100%"). Orchestrator delegates to dataset-generator with those statements. Dataset-generator modifies `dataset.py` and reruns it. New IDs are written to `state.json`. Fine-tuner then reruns `train.py`.
+
+2. **Fine-tuner → seeds specialist**: If the root cause is seed quality (not enough diversity, wrong date range), fine-tuner reports to orchestrator. Orchestrator delegates to the seeds specialist to modify `seeds.py` and rerun. Then dataset-generator reruns `dataset.py`. Then fine-tuner reruns `train.py`.
+
+3. **Dataset-generator → seeds specialist**: If `prepare_for_training` fails due to seed volume or quality, dataset-generator reports to orchestrator. Seeds specialist modifies `seeds.py`, reruns, new `input_dataset_id` is written. Dataset-generator reruns `dataset.py`.
+
+**Rule: information flows downstream automatically via `state.json`. Change requests flow upstream via the orchestrator.**
+
+## Rerunnability rules
+
+| Script | Safe to rerun? | Side effects |
+|--------|---------------|--------------|
+| `seeds.py` | Yes | Creates a new input dataset (new ID written to state) |
+| `dataset.py` | Yes | Creates a new dataset (new IDs written to state) |
+| `train.py` | Yes | Starts a new training job (new model_id written to state) — costs money |
+| `eval.py` | Yes, freely | No side effects, no cost impact |
diff --git a/.claude/templates/explore.py b/.claude/templates/explore.py
new file mode 100644
index 0000000..dc08e83
--- /dev/null
+++ b/.claude/templates/explore.py
@@ -0,0 +1,133 @@
+"""
+Explore pipeline output by dataset ID. Downloads and caches locally on first use.
+Usage: + python explore.py [--summary] [--samples N] [--valid N] [--invalid N] [--labels N] [--truncate N] +""" + +import argparse +import json +import sys +from pathlib import Path + +_THIS_DIR = Path(__file__).resolve().parent +if str(_THIS_DIR) not in sys.path: + sys.path.insert(0, str(_THIS_DIR)) + +from state import get_client + +CACHE_DIR = _THIS_DIR / ".lr_cache" +DEFAULT_TRUNCATE = 120 + + +def _cache_path(dataset_id: str) -> Path: + return CACHE_DIR / f"{dataset_id}.json" + + +def load_df(dataset_id: str): + path = _cache_path(dataset_id) + if not path.exists(): + CACHE_DIR.mkdir(parents=True, exist_ok=True) + lr_client = get_client() + dataset = lr_client.datasets.get(dataset_id) + rows = dataset.flattened() + with open(path, "w") as f: + json.dump(rows, f, indent=2, default=str) + print(f" Cached {len(rows)} rows → {path}") + import pandas as pd + with open(path) as f: + return pd.DataFrame(json.load(f)) + + +def summary(df): + import pandas as pd + total = len(df) + valid = (df["is_valid"] == True).sum() if "is_valid" in df.columns else total + print(f"\nValidity: {valid}/{total} ({100 * valid / total:.1f}% valid)") + if "label" in df.columns: + print("\nLabel distribution:") + print(df["label"].value_counts().to_string()) + print() + + +def _truncate(s, n): + if not isinstance(s, str): + return s + return s[:n] + "..." 
if len(s) > n else s + + +def _cols_for_stage(df): + if "question_text" in df.columns: + return ["question_text", "label", "label_confidence", "is_valid", "invalid_reason", "seed_text"] + return ["seed_text", "seed_creation_date", "is_valid"] + + +def show_samples(df, valid_only=False, invalid_only=False, n=5, random=True, truncate=DEFAULT_TRUNCATE): + import pandas as pd + subset = df + if valid_only: + if "is_valid" not in df.columns: + print(" No is_valid column.") + return + subset = df[df["is_valid"] == True] + elif invalid_only: + if "is_valid" not in df.columns: + print(" No is_valid column.") + return + subset = df[df["is_valid"] == False] + cols = [c for c in _cols_for_stage(df) if c in subset.columns] + if not cols: + cols = list(subset.columns)[:6] + sample = subset.sample(n=min(n, len(subset)), random_state=42) if random and len(subset) > n else subset.head(n) + for col in ["seed_text", "question_text", "reasoning"]: + if col in sample.columns: + sample = sample.copy() + sample[col] = sample[col].apply(lambda x: _truncate(x, truncate) if pd.notna(x) else x) + print(sample[cols].to_string()) + print() + + +def check_labels(df, n=5, truncate=DEFAULT_TRUNCATE): + cols = ["question_text", "label", "reasoning"] + cols = [c for c in cols if c in df.columns] + if not cols: + print(" No question_text/label columns (seeds-only output?).") + return + subset = df[df["is_valid"] == True] if "is_valid" in df.columns else df + sample = subset.sample(n=min(n, len(subset)), random_state=42) if len(subset) > n else subset + for _, row in sample.iterrows(): + print("-" * 60) + for c in cols: + val = row.get(c, "") + print(f" {c}: {_truncate(str(val), truncate)}") + print() + print("-" * 60) + + +def main(): + parser = argparse.ArgumentParser(description="Explore pipeline output by dataset ID") + parser.add_argument("dataset_id", help="Dataset ID from transforms.run()") + parser.add_argument("--summary", action="store_true", help="Validity stats and label distribution 
(default)") + parser.add_argument("--samples", type=int, metavar="N", help="Show N random samples") + parser.add_argument("--valid", type=int, metavar="N", help="Show N valid samples") + parser.add_argument("--invalid", type=int, metavar="N", help="Show N invalid samples") + parser.add_argument("--labels", type=int, metavar="N", help="Show N samples with question+label+reasoning for quality check") + parser.add_argument("--truncate", type=int, default=DEFAULT_TRUNCATE, metavar="N", help=f"Max chars for long text fields (default: {DEFAULT_TRUNCATE})") + args = parser.parse_args() + + df = load_df(args.dataset_id) + truncate = args.truncate + + if args.samples is not None: + show_samples(df, n=args.samples, truncate=truncate) + elif args.valid is not None: + show_samples(df, valid_only=True, n=args.valid, truncate=truncate) + elif args.invalid is not None: + show_samples(df, invalid_only=True, n=args.invalid, truncate=truncate) + elif args.labels is not None: + check_labels(df, n=args.labels, truncate=truncate) + else: + summary(df) + + +if __name__ == "__main__": + main() diff --git a/.claude/templates/setup.py b/.claude/templates/setup.py new file mode 100644 index 0000000..1e6f024 --- /dev/null +++ b/.claude/templates/setup.py @@ -0,0 +1,44 @@ +""" +Project setup script — run once to initialize a new Lightningrod project directory. 
+Usage: python setup.py [project_dir]
+"""
+import json
+import shutil
+import sys
+from pathlib import Path
+
+TEMPLATES_DIR = Path(__file__).parent
+
+
+def setup(project_dir: str = ".") -> None:
+    project_dir = Path(project_dir)
+    project_dir.mkdir(parents=True, exist_ok=True)
+
+    # Copy static utility files
+    for filename in ["state.py", "explore.py"]:
+        src = TEMPLATES_DIR / filename
+        dst = project_dir / filename
+        if dst.exists():
+            print(f" {dst} already exists, skipping.")
+        else:
+            shutil.copy(src, dst)
+            print(f" Created {dst}")
+
+    # Initialize state.json
+    state_file = project_dir / "state.json"
+    if state_file.exists():
+        print(f" state.json already exists, skipping.")
+    else:
+        with open(state_file, "w") as f:
+            json.dump(
+                {"input_dataset_id": None, "dataset_id": None, "model_id": None},
+                f,
+                indent=2,
+            )
+        print(f" Created {state_file}")
+
+    print(f"\nProject ready at '{project_dir}'. Next: run seeds.py.")
+
+
+if __name__ == "__main__":
+    setup(sys.argv[1] if len(sys.argv) > 1 else ".")
diff --git a/.claude/templates/state.py b/.claude/templates/state.py
new file mode 100644
index 0000000..3dc7a03
--- /dev/null
+++ b/.claude/templates/state.py
@@ -0,0 +1,98 @@
+"""
+Shared utilities for Lightningrod projects.
+Auto-copied by project setup — do not modify.
+"""
+import json
+import os
+from typing import Optional
+
+from lightningrod import LightningRod
+
+STATE_FILE = "state.json"
+
+
+def get_client() -> LightningRod:
+    """Return an initialized LightningRod client."""
+    api_key = os.environ.get("LIGHTNINGROD_API_KEY")
+    if not api_key:
+        raise EnvironmentError(
+            "LIGHTNINGROD_API_KEY environment variable is not set."
+        )
+    return LightningRod(api_key=api_key)
+
+
+class State:
+    """
+    Typed project state. Required fields (`dataset_id`, `model_id`) raise when
+    accessed before they have been set; `input_dataset_id` is optional and
+    simply returns None when unset (e.g. for news/GDELT seeds).
+ """ + + def __init__( + self, + input_dataset_id: Optional[str] = None, + dataset_id: Optional[str] = None, + model_id: Optional[str] = None, + ): + self._input_dataset_id = input_dataset_id + self._dataset_id = dataset_id + self._model_id = model_id + + def _require(self, name: str) -> str: + value = getattr(self, f"_{name}") + if value is None: + raise RuntimeError( + f"State field '{name}' is not set. " + f"Make sure the previous pipeline step has been run successfully.\n" + f"Current state: {self._as_dict()}" + ) + return value + + # --- fields --- + + @property + def input_dataset_id(self) -> Optional[str]: + return self._input_dataset_id + + @input_dataset_id.setter + def input_dataset_id(self, value: Optional[str]) -> None: + self._input_dataset_id = value + + @property + def dataset_id(self) -> str: + return self._require("dataset_id") + + @dataset_id.setter + def dataset_id(self, value: Optional[str]) -> None: + self._dataset_id = value + + @property + def model_id(self) -> str: + return self._require("model_id") + + @model_id.setter + def model_id(self, value: Optional[str]) -> None: + self._model_id = value + + # --- persistence --- + + def _as_dict(self) -> dict: + return { + "input_dataset_id": self._input_dataset_id, + "dataset_id": self._dataset_id, + "model_id": self._model_id, + } + + @classmethod + def load(cls) -> "State": + if not os.path.exists(STATE_FILE): + raise FileNotFoundError( + f"{STATE_FILE} not found. Run `python setup.py` to initialize this project." 
+ ) + with open(STATE_FILE) as f: + return cls(**json.load(f)) + + def save(self) -> None: + with open(STATE_FILE, "w") as f: + json.dump(self._as_dict(), f, indent=2) + print(f" state.json updated: {self._as_dict()}") diff --git a/.gitignore b/.gitignore index a360cc4..3782636 100644 --- a/.gitignore +++ b/.gitignore @@ -43,5 +43,9 @@ htmlcov/ test_sdk.py notebooks/**/lightningrod-python-sdk/ +# Pipeline output cache +.lr_cache/ + # Misc .DS_Store +agent-experiments/ diff --git a/src/lightningrod/_display.py b/src/lightningrod/_display.py index 790bc4e..28fe426 100644 --- a/src/lightningrod/_display.py +++ b/src/lightningrod/_display.py @@ -451,28 +451,46 @@ def run_live_display( live.update(build_live_display(metrics=metrics, job=job)) -def _build_invalid_samples_error_message(original_message: str) -> Group: +def _build_invalid_samples_error_message( + original_message: str, + error_details: Optional[list[str]] = None, +) -> Group: """Build enhanced error message for invalid samples error using Rich formatting.""" renderables: list[RenderableType] = [] - + renderables.append(_safe_markup(f"[bold]{original_message}[/bold]")) renderables.append(Text("")) - + + if error_details: + renderables.append(_safe_markup("[bold]Error details:[/bold]")) + for detail in error_details[:5]: + truncated = detail[:500] + "..." if len(detail) > 500 else detail + renderables.append(Text(f" • {truncated}", style="dim")) + if len(error_details) > 5: + renderables.append(Text(f" • ... 
and {len(error_details) - 5} more", style="dim italic")) + renderables.append(Text("")) + renderables.append(_safe_markup("[bold]This typically happens when:[/bold]")) renderables.append(_safe_markup(" • Filter criteria is too strict")) renderables.append(_safe_markup(" • Labeling failed (e.g., questions couldn't be answered or had low confidence)")) renderables.append(_safe_markup(" • Seed generation found no suitable content")) renderables.append(Text("")) - + renderables.append(_safe_markup("[bold]Next steps:[/bold]")) renderables.append(_safe_markup(" • Check the dataset samples to see specific failure reasons in the 'meta.filter_reason' field")) renderables.append(_safe_markup(" • Adjust and retry the transform pipeline (e.g., try a wider date range)")) renderables.append(_safe_markup(" • If the problem persists, contact support or open a GitHub issue: [link=https://github.com/lightning-rod-labs/lightningrod-python-sdk/issues]https://github.com/lightning-rod-labs/lightningrod-python-sdk/issues[/link]")) - + return Group(*renderables) -def display_error(message: str, title: str = "Error", job: Any = None, response_body: str | None = None) -> None: +def display_error( + message: str, + title: str = "Error", + job: Any = None, + response_body: str | None = None, + error_details: Optional[list[str]] = None, +) -> None: console = Console() renderables: list[RenderableType] = [] @@ -480,7 +498,16 @@ def display_error(message: str, title: str = "Error", job: Any = None, response_ renderables.append(Text("")) if "Job completed with 0 valid rows" in message: - renderables.append(_build_invalid_samples_error_message(message)) + renderables.append(_build_invalid_samples_error_message(message, error_details=error_details)) + elif error_details: + renderables.append(_safe_markup(f"[bold]{message}[/bold]")) + renderables.append(Text("")) + renderables.append(_safe_markup("[bold]Error details:[/bold]")) + for detail in error_details[:5]: + truncated = detail[:500] + "..." 
if len(detail) > 500 else detail + renderables.append(Text(f" • {truncated}", style="dim")) + if len(error_details) > 5: + renderables.append(Text(f" • ... and {len(error_details) - 5} more", style="dim italic")) else: renderables.append(_safe_markup(f"[bold]{message}[/bold]")) diff --git a/src/lightningrod/datasets/client.py b/src/lightningrod/datasets/client.py index bc8d791..c26d4c8 100644 --- a/src/lightningrod/datasets/client.py +++ b/src/lightningrod/datasets/client.py @@ -21,28 +21,31 @@ class DatasetSamplesClient: def __init__(self, client: AuthenticatedClient): self._client: AuthenticatedClient = client - def list(self, dataset_id: str) -> List[Sample]: + def list(self, dataset_id: str, limit: Optional[int] = None) -> List[Sample]: samples: List[Sample] = [] cursor: Optional[str] = None - + while True: + req_limit = min(100, limit - len(samples)) if limit is not None else 100 response = get_dataset_samples_datasets_dataset_id_samples_get.sync_detailed( dataset_id=dataset_id, client=self._client, - limit=100, + limit=req_limit, cursor=cursor, ) - + parsed = handle_response_error(response, "fetch samples") - + samples.extend(parsed.samples) - + + if limit is not None and len(samples) >= limit: + return samples[:limit] if not parsed.has_more: break if isinstance(parsed.next_cursor, Unset) or parsed.next_cursor is None: break cursor = str(parsed.next_cursor) - + return samples def upload( diff --git a/src/lightningrod/transforms/client.py b/src/lightningrod/transforms/client.py index 7784365..3947c03 100644 --- a/src/lightningrod/transforms/client.py +++ b/src/lightningrod/transforms/client.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import List, Optional, Union from lightningrod._display import _is_notebook, display_error, display_warning, run_live_display from lightningrod._generated.models import ( @@ -35,9 +35,51 @@ from lightningrod.datasets.client import DatasetSamplesClient from lightningrod._generated.types import Unset from 
lightningrod._errors import handle_response_error
+# NOTE: DatasetSamplesClient is already imported above; a duplicate re-import was removed here.
 
 TransformConfig = Union[FileSetQuerySeedGenerator, FileSetSeedGenerator, ForwardLookingQuestionGenerator, GdeltSeedGenerator, NewsSeedGenerator, QuestionAndLabelGenerator, QuestionGenerator, QuestionPipeline, QuestionRenderer, WebSearchLabeler]
 
+
+def _fetch_error_details_from_samples(
+    job: TransformJob,
+    samples_client: DatasetSamplesClient,
+    jobs_client: "TransformJobsClient",
+) -> List[str]:
+    details: List[str] = []
+    if "rejection_error_messages" in job.additional_properties:
+        msgs = job.additional_properties["rejection_error_messages"]
+        if isinstance(msgs, list):
+            for m in msgs:
+                if isinstance(m, str) and m.strip():
+                    details.append(m.strip())
+    if details:
+        return details
+    metrics = jobs_client.get_metrics(job.id)
+    if metrics:
+        for step in metrics.steps:
+            if (step.rejected_count > 0 or step.error_count > 0) and step.summary and step.summary.strip():
+                details.append(step.summary.strip())
+    if details:
+        return details
+    if not job.output_dataset_id:
+        return []
+    try:
+        samples = samples_client.list(job.output_dataset_id, limit=10)
+    except Exception:
+        return []
+    seen: set[str] = set()
+    for sample in samples:
+        msg = None
+        if not isinstance(sample.meta, Unset) and sample.meta is not None and "error_message" in sample.meta:
+            msg = sample.meta["error_message"]
+        elif "error_message" in sample.additional_properties:
+            msg = sample.additional_properties["error_message"]
+        if msg and isinstance(msg, str) and msg.strip() and msg not in seen:
+            seen.add(msg)
+            details.append(msg.strip())
+    return details
+
+
 class TransformJobsClient:
     def __init__(self, client: AuthenticatedClient):
         self._client = client
@@ -105,7 +147,10 @@ def poll() -> tuple[PipelineMetricsResponse, TransformJob]:
 
             if job.status == TransformJobStatus.FAILED:
                 error_msg = job.error_message if (not isinstance(job.error_message, Unset) and job.error_message) else "Unknown error"
-                display_error(error_msg, title="Job Failed", job=job)
+                error_details = _fetch_error_details_from_samples(
+                    job, self._dataset_samples_client, self.jobs
+                )
+                display_error(error_msg, title="Job Failed", job=job, error_details=error_details)
                 # No need to raise an exception in the notebook, as we display the error using display_error
                 if not _is_notebook():