diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ca02864 --- /dev/null +++ b/.gitignore @@ -0,0 +1,42 @@ +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +*.egg +dist/ +build/ +*.whl + +# Virtual environments +.venv/ +venv/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +.DS_Store + +# mypy +.mypy_cache/ + +# Test / pytest +.pytest_cache/ +.coverage +htmlcov/ +test_data.jsonl +test_tool_call.jsonl +results.jsonl + +# Config secrets (never commit credentials) +llm_eval_kit.yaml +.env + +# Blog drafts +blog/ + +# Lambda deployment artifacts +deploy_package/ +*.zip diff --git a/README.md b/README.md index f030b53..be6f60e 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,22 @@ # LLM Eval Kit A Python SDK for creating custom evaluation metrics for LLM model evaluation on Sagemaker Training Job with built-in Pydantic validation. + For the official integration with AWS Sagemaker training job, please view in the [Official AWS Sagemaker Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/nova-model-evaluation.html). + ## Installation -``` +```bash git clone https://github.com/aws/llm-eval-kit.git cd llm-eval-kit -pip install . +uv venv .venv && source .venv/bin/activate +uv pip install . ``` ## Architecture The SDK provides: + - **Pydantic Validation**: Automatic input/output validation using Pydantic models - **PreProcessor**: For input data transformation with validation - **PostProcessor**: For output data formatting with validation @@ -27,6 +31,7 @@ The SDK provides: See `example/run_example.py` for a complete working example to run locally. ### Run in AWS Lambda + You need to create a lambda (follow this [guide](https://docs.aws.amazon.com/lambda/latest/dg/getting-started.html)) and upload `llm-eval-kit` as a lambda layer in order to use it. In the [github release](https://github.com/aws/llm-eval-kit/releases), you should be able to find a pre-built llm-eval-kit-layer.zip file. 
@@ -35,10 +40,11 @@ Use below command to upload custom lambda layer. ``` aws lambda publish-layer-version \ - --layer-name llm-eval-kit-layer \ - --zip-file fileb://llm-eval-kit-layer.zip \ - --compatible-runtimes python3.12 python3.11 python3.10 python3.9 +--layer-name llm-eval-kit-layer \ +--zip-file fileb://llm-eval-kit-layer.zip \ +--compatible-runtimes python3.12 python3.11 python3.10 python3.9 ``` + You need to add this layer as custom layer along with the required AWS layer: `AWSLambdaPowertoolsPythonV3-python312-arm64` (because of pydantic depencency) to your lambda. Then update your lambda code with: @@ -72,7 +78,6 @@ def postprocessor(event: dict, context) -> dict: "metric": "inverted_accuracy_custom", "value": inverted_accuracy }) - # Add more metrics here return { @@ -92,31 +97,58 @@ lambda_handler = build_lambda_handler( The SDK automatically validates: ### Preprocessing Input + ```json { - "process_type": "preprocess", - "data": { - "prompt": "what can you do?", - "gold": "Hello! How can I help you today?", - "system": "You are a helpful assistant" - } + "process_type": "preprocess", + "data": { + "prompt": "what can you do?", + "gold": "Hello! How can I help you today?", + "system": "You are a helpful assistant" + } } ``` ### Postprocessing Input + ```json { - "process_type": "postprocess", - "data": [ - { - "prompt": "what can you do", - "inference_output": "Hello! How can I help you today?", - "gold": "Hello! How can I help you today?" - } - ] + "process_type": "postprocess", + "data": [ + { + "prompt": "what can you do", + "inference_output": "Hello! How can I help you today?", + "gold": "Hello! How can I help you today?" + } + ] } ``` +## RLVR Grader Framework + +llm-eval-kit also includes a grader framework for building and deploying reward functions for Reinforcement Learning with Verifiable Rewards (RLVR) on Amazon Bedrock. This extends the SDK beyond SageMaker evaluation into RFT (Reinforcement Fine-Tuning) workflows. 
+ +Features include: + +- Built-in graders for exact match, string similarity, and BFCL tool-calling evaluation +- A `@grader` decorator for writing custom reward functions +- Dataset loaders for JSONL, BFCL, and HuggingFace Hub +- One-command Lambda deployment of graders as reward functions +- A CLI for local evaluation, validation, and deployment + +```bash +uv pip install -e ".[dev,datasets,deploy]" +``` + +For full documentation on the grader framework, see the [src/llm_eval_kit README](src/llm_eval_kit/README.md). + +| Topic | Description | +|-------|-------------| +| [Graders](docs/graders.md) | Built-in graders, writing custom graders, the `@grader` decorator | +| [Datasets](docs/datasets.md) | Loading from JSONL, BFCL, and HuggingFace Hub | +| [Lambda Deployment](docs/deploy.md) | Deploy graders as AWS Lambda reward functions for RLVR | +| [CLI Reference](docs/cli.md) | All CLI commands and options | + ## Testing ```bash @@ -130,8 +162,9 @@ python example/run_example.py ## Development ```bash -# Install in development mode -pip install -e . +# Create venv and install in development mode +uv venv .venv && source .venv/bin/activate +uv pip install -e ".[dev,datasets,deploy]" # Run tests with coverage python -m pytest tests/ --cov=llm_eval_kit diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000..c82f56e --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,67 @@ +# CLI Reference + +``` +llm-eval-kit [options] +``` + +## `evaluate` + +Run a grader over a dataset. 
+ +```bash +llm-eval-kit evaluate --grader --data [options] +``` + +| Option | Description | +|--------|-------------| +| `--grader` | Built-in grader name (`exact_match`, `string_similarity`, `tool_call`) | +| `--grader-path` | Custom grader as `module.path:function_name` | +| `--data` | Path to JSONL dataset file (required) | +| `--format` | `jsonl` (default) or `bfcl` for BFCL-formatted files | +| `--output` | Write per-sample results to a JSONL file | +| `--max-samples` | Limit number of samples to evaluate | + +Examples: + +```bash +# Built-in grader +llm-eval-kit evaluate --grader exact_match --data samples.jsonl + +# Custom grader with output +llm-eval-kit evaluate --grader-path my_module:my_grader --data samples.jsonl --output results.jsonl + +# BFCL format with sample limit +llm-eval-kit evaluate --grader tool_call --data BFCL_v3_simple.json --format bfcl --max-samples 50 +``` + +## `list-graders` + +Show all registered graders. + +```bash +llm-eval-kit list-graders +``` + +## `validate` + +Check a dataset file for schema errors. + +```bash +llm-eval-kit validate --data +``` + +## `deploy` + +Deploy a grader as an AWS Lambda function. Requires `uv pip install -e ".[deploy]"`. + +```bash +llm-eval-kit deploy --grader [options] +``` + +| Option | Description | +|--------|-------------| +| `--grader` | Built-in grader name | +| `--grader-path` | Custom grader as `module.path:function_name` | +| `--config` | Path to `llm_eval_kit.yaml` config file | + +See [deploy.md](deploy.md) for the full deployment walkthrough. diff --git a/docs/datasets.md b/docs/datasets.md new file mode 100644 index 0000000..6fac452 --- /dev/null +++ b/docs/datasets.md @@ -0,0 +1,98 @@ +# Datasets + +llm-eval-kit supports loading evaluation data from JSONL files, BFCL-formatted files, and HuggingFace Hub. 
+ +## JSONL Format + +Each line is a JSON object with `id`, `messages`, and `ground_truth`: + +```jsonl +{"id": "1", "messages": [{"role": "user", "content": "2+2?"}, {"role": "assistant", "content": "4"}], "ground_truth": "4"} +{"id": "2", "messages": [{"role": "user", "content": "Capital of France?"}, {"role": "assistant", "content": "Paris"}], "ground_truth": "Paris"} +``` + +Load from CLI: + +```bash +llm-eval-kit evaluate --grader exact_match --data samples.jsonl +``` + +Load from Python: + +```python +from llm_eval_kit.datasets.loader import load_jsonl + +dataset = load_jsonl("samples.jsonl", max_samples=100) +``` + +Validate a file before running: + +```bash +llm-eval-kit validate --data samples.jsonl +``` + +## BFCL Format + +The [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html) uses a specific JSONL format with `id`, `question` (list of message dicts), and `function` (tool definitions). + +```bash +llm-eval-kit evaluate \ + --grader tool_call \ + --data BFCL_v3_multiple.json \ + --format bfcl +``` + +```python +from llm_eval_kit.datasets.loader import load_bfcl + +dataset = load_bfcl("BFCL_v3_multiple.json", max_samples=100) +``` + +## HuggingFace Hub + +Pull datasets directly from HuggingFace. Requires `uv pip install -e ".[datasets]"`. + +```python +from llm_eval_kit.datasets.loader import load_huggingface + +dataset = load_huggingface( + "gorilla-llm/Berkeley-Function-Calling-Leaderboard", + split="train", + max_samples=50, + data_files="BFCL_v3_exec_simple.json", # pick a specific file + prompt_key="question", + ground_truth_key="ground_truth", + id_key="id", + response_key=None, +) +``` + +### Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `dataset_name` | (required) | HF dataset name (e.g. 
`"gorilla-llm/Berkeley-Function-Calling-Leaderboard"`) | +| `split` | `"train"` | Dataset split | +| `max_samples` | `None` | Limit number of samples | +| `token` | `None` | HF API token (falls back to `HF_TOKEN` env var) | +| `data_files` | `None` | Specific file(s) to load from the repo | +| `config_name` | `None` | Dataset config/subset name | +| `prompt_key` | `"prompt"` | Column name for the prompt | +| `response_key` | `"response"` | Column name for model response (`None` to skip) | +| `ground_truth_key` | `"ground_truth"` | Column name for ground truth (`None` to skip) | +| `id_key` | `"id"` | Column name for sample ID (`None` to auto-generate) | + +### BFCL on HuggingFace + +The BFCL repo has ~49 files with different schemas. You must use `data_files` to select one — loading the entire repo will fail. + +Available files include: `BFCL_v3_simple.json`, `BFCL_v3_multiple.json`, `BFCL_v3_parallel.json`, `BFCL_v3_exec_simple.json`, `BFCL_v3_live_simple.json`, and more. + +### Private/Gated Datasets + +```python +dataset = load_huggingface( + "my-org/my-private-dataset", + token="hf_...", # or set HF_TOKEN env var +) +``` diff --git a/docs/deploy.md b/docs/deploy.md new file mode 100644 index 0000000..9feffa0 --- /dev/null +++ b/docs/deploy.md @@ -0,0 +1,116 @@ +# Lambda Deployment + +Deploy any grader as an AWS Lambda function for use as a reward function in Bedrock RFT jobs. The deploy command packages your grader with all dependencies (including pydantic), creates or updates the Lambda, and wires up the handler automatically. + +Requires `uv pip install -e ".[deploy]"`. + +## 1. Create a Lambda Execution Role + +If you don't already have one: + +```bash +aws iam create-role \ + --role-name llm-eval-kit-lambda-role \ + --assume-role-policy-document '{ + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Principal": {"Service": "lambda.amazonaws.com"}, + "Action": "sts:AssumeRole" + }] + }' +``` + +## 2. 
Create a Config File + +Create `llm_eval_kit.yaml` in your project root: + +```yaml +aws: + region: us-east-1 + account_id: "123456789012" + lambda: + function_name: my-reward-function + runtime: python3.12 + timeout: 60 + memory_size: 256 + role_arn: arn:aws:iam::123456789012:role/llm-eval-kit-lambda-role +``` + +Config values can also be set via environment variables: `AWS_REGION`, `AWS_DEFAULT_REGION`, `AWS_ACCOUNT_ID`. + +## 3. Deploy + +```bash +# Deploy a built-in grader +llm-eval-kit deploy --grader exact_match + +# Deploy a custom grader from a module path +llm-eval-kit deploy --grader-path my_module:my_grader + +# Deploy with a specific config file +llm-eval-kit deploy --grader tool_call --config my_config.yaml +``` + +## 4. Test the Deployed Function + +```bash +aws lambda invoke \ + --function-name my-reward-function \ + --payload '{ + "messages": [ + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "4"} + ], + "ground_truth": "4" + }' \ + /dev/stdout +``` + +Expected response: + +```json +{ + "statusCode": 200, + "body": { + "score": 1.0, + "reason": "Exact match", + "is_valid": true, + "metrics": {"exact_match": {"score": 1.0, "reason": "case_sensitive=False", "is_valid": true}}, + "metadata": {} + } +} +``` + +## What Happens Under the Hood + +1. Your grader + the `llm_eval_kit` package + dependencies (pydantic, etc.) are installed for the Lambda runtime and zipped into a deployment package +2. An auto-generated `handler.py` wraps your grader with Lambda-compatible request/response handling +3. The Lambda function is created or updated in your AWS account +4. 
The function accepts `{"messages": [...], "ground_truth": ..., "kwargs": {}}` payloads + +## Config Reference + +### `llm_eval_kit.yaml` + +```yaml +aws: + region: us-east-1 # AWS region + account_id: "123456789012" # AWS account ID (optional, for reference) + lambda: + function_name: my-reward-function # Lambda function name + runtime: python3.12 # Lambda runtime + handler: handler.lambda_handler # Handler path (default) + timeout: 60 # Timeout in seconds + memory_size: 256 # Memory in MB + role_arn: arn:aws:iam::... # Lambda execution role ARN + environment: {} # Extra env vars for the Lambda +``` + +### Environment Variable Overrides + +| Variable | Overrides | +|----------|-----------| +| `AWS_REGION` | `aws.region` | +| `AWS_DEFAULT_REGION` | `aws.region` | +| `AWS_ACCOUNT_ID` | `aws.account_id` | diff --git a/docs/graders.md b/docs/graders.md new file mode 100644 index 0000000..582a0e7 --- /dev/null +++ b/docs/graders.md @@ -0,0 +1,78 @@ +# Graders + +Graders are the core evaluation unit in llm-eval-kit. A grader takes a conversation (messages) and ground truth, then returns a scored `EvaluateResult`. 
+ +## Built-in Graders + +| Name | Description | +|------|-------------| +| `exact_match` | Exact string comparison (case-insensitive by default) | +| `string_similarity` | Levenshtein distance or token F1 fuzzy matching | +| `tool_call` | BFCL-style AST comparison of function calls with type coercion | + +List them from the CLI: + +```bash +llm-eval-kit list-graders +``` + +## Writing a Custom Grader + +Use the `@grader` decorator to register a function as a grader: + +```python +from llm_eval_kit.graders.decorator import grader +from llm_eval_kit.models.results import EvaluateResult + +@grader(name="my_grader", description="My custom grader") +def my_grader(messages, ground_truth, **kwargs): + response = messages[-1].content + match = response.strip().lower() == str(ground_truth).strip().lower() + return EvaluateResult( + score=1.0 if match else 0.0, + reason="Match" if match else "No match", + ) +``` + +Your function receives: +- `messages` — list of `Message` objects (role + content) +- `ground_truth` — the expected answer (str, list, or dict) +- `**kwargs` — any extra metadata from the sample + +It must return an `EvaluateResult` with at minimum a `score` (0.0–1.0). + +## Using a Custom Grader from CLI + +Point to your grader with `--grader-path`: + +```bash +llm-eval-kit evaluate \ + --grader-path my_module:my_grader \ + --data samples.jsonl +``` + +The format is `module.path:function_name`. The module must be importable from your current directory or installed in your environment. 
+ +## Grader Architecture + +- `Grader` (ABC) — base class with a `grade(messages, ground_truth, **kwargs)` method +- `@grader` decorator — wraps a plain function into a `_FunctionGrader` instance +- `GraderRegistry` — singleton that maps names to grader instances +- Built-in graders auto-register on import via `graders/__init__.py` + +## EvaluateResult + +```python +from llm_eval_kit.models.results import EvaluateResult, MetricResult + +result = EvaluateResult( + score=0.85, + reason="Partial match", + is_valid=True, + metrics={ + "name_accuracy": MetricResult(score=1.0, reason="Correct name"), + "value_accuracy": MetricResult(score=0.7, reason="2/3 values matched"), + }, + metadata={"debug": "extra info"}, +) +``` diff --git a/llm_eval_kit.example.yaml b/llm_eval_kit.example.yaml new file mode 100644 index 0000000..2e0cd4f --- /dev/null +++ b/llm_eval_kit.example.yaml @@ -0,0 +1,19 @@ +# llm-eval-kit deployment configuration +# Copy to llm_eval_kit.yaml and fill in your values. +# The CLI auto-discovers this file in the current or parent directories. + +aws: + region: us-east-1 + # account_id: "123456789012" # optional, auto-detected from creds + # profile: my-profile # AWS CLI profile name (optional) + + lambda: + function_name: llm-eval-reward-function + runtime: python3.12 + handler: handler.lambda_handler + timeout: 60 + memory_size: 256 + role_arn: arn:aws:iam::123456789012:role/your-lambda-execution-role + # layers: [] # optional Lambda layers + # environment: # extra env vars passed to the Lambda + # MY_VAR: my_value diff --git a/pyproject.toml b/pyproject.toml index d34cbc7..7d2b298 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,11 +9,11 @@ build-backend = "setuptools.build_meta" [project] name = "llm-eval-kit" license = "Apache-2.0" -version = "1.0" +version = "1.1.0" readme = "README.md" +requires-python = ">=3.10" dependencies = [ - "pydantic" - # Put your dependencies here! 
+ "pydantic>=2.0.0", ] [project.optional-dependencies] @@ -22,4 +22,17 @@ dev = [ "setuptools", "twine", "wheel", + "pytest>=7.0.0", + "mypy", ] +datasets = [ + "datasets>=2.0.0", + "huggingface-hub>=0.20.0", +] +deploy = [ + "boto3>=1.28.0", + "pyyaml>=6.0", +] + +[project.scripts] +llm-eval-kit = "llm_eval_kit.cli.main:main" diff --git a/src/llm_eval_kit/README.md b/src/llm_eval_kit/README.md new file mode 100644 index 0000000..05d8d61 --- /dev/null +++ b/src/llm_eval_kit/README.md @@ -0,0 +1,93 @@ +# llm-eval-kit — RLVR Grader Framework + +A grader framework for building evaluation functions, running them over datasets, and deploying them as AWS Lambda reward functions for RLVR workflows on Amazon Bedrock. + +## Install + +```bash +uv venv .venv +source .venv/bin/activate +uv pip install -e ".[dev]" +``` + +Optional extras: + +```bash +uv pip install -e ".[datasets]" # HuggingFace dataset support +uv pip install -e ".[deploy]" # AWS Lambda deployment +``` + +## Quick Start + +Write a grader: + +```python +from llm_eval_kit.graders.decorator import grader +from llm_eval_kit.models.results import EvaluateResult + +@grader +def my_grader(messages, ground_truth, **kwargs): + response = messages[-1].content + match = response.strip().lower() == str(ground_truth).strip().lower() + return EvaluateResult( + score=1.0 if match else 0.0, + reason="Match" if match else "No match", + ) +``` + +Run it: + +```bash +llm-eval-kit evaluate --grader exact_match --data samples.jsonl +``` + +Or from Python: + +```python +from llm_eval_kit.datasets.loader import load_jsonl +from llm_eval_kit.execution.pipeline import EvalPipeline +from llm_eval_kit.graders import exact_match_grader + +dataset = load_jsonl("samples.jsonl") +report = EvalPipeline(exact_match_grader, dataset).run_with_report() +print(report.summary()) +# Samples: 2 | Avg: 1.0000 | Min: 1.0000 | Max: 1.0000 +``` + +## Documentation + +| Topic | Description | +|-------|-------------| +| [Graders](../../docs/graders.md) | 
Built-in graders, writing custom graders, the `@grader` decorator | +| [Datasets](../../docs/datasets.md) | Loading from JSONL, BFCL, and HuggingFace Hub | +| [Lambda Deployment](../../docs/deploy.md) | Deploy graders as AWS Lambda reward functions for RLVR | +| [CLI Reference](../../docs/cli.md) | All CLI commands and options | + +## Built-in Graders + +| Name | Description | +|------|-------------| +| `exact_match` | Exact string comparison (case-insensitive by default) | +| `string_similarity` | Levenshtein distance or token F1 fuzzy matching | +| `tool_call` | BFCL-style AST comparison of function calls with type coercion | + +## Project Structure + +``` +llm_eval_kit/ +├── models/ # Pydantic data models (Message, EvaluateResult, EvalSample) +├── graders/ # Grader framework (ABC, decorator, registry) +│ └── builtins/ # Built-in grader implementations +├── datasets/ # Dataset loaders (JSONL, BFCL, HuggingFace) +├── execution/ # Evaluation pipeline and reporting +├── deploy/ # AWS Lambda deployment +├── cli/ # Command-line interface +├── utils/ # Dynamic module loading +├── processors/ # SageMaker pre/post processing (existing) +├── model/ # SageMaker payload models (existing) +└── lambda_handler.py # SageMaker Lambda handler (existing) +``` + +## License + +Apache-2.0 diff --git a/src/llm_eval_kit/__init__.py b/src/llm_eval_kit/__init__.py index e69de29..314bc04 100644 --- a/src/llm_eval_kit/__init__.py +++ b/src/llm_eval_kit/__init__.py @@ -0,0 +1,22 @@ +""" +llm-eval-kit — A Python SDK for LLM evaluation and RFT grader development. 
+ +This package provides: +- Grader framework: Define evaluation functions with @grader decorator +- Built-in graders: exact_match, string_similarity, llm_judge +- Dataset loading: JSONL and HuggingFace dataset support +- Evaluation pipeline: Run graders over datasets and collect results +- SageMaker integration: Pre/post processing for SageMaker eval jobs (Lambda) +- CLI: Command-line tools for running evaluations + +Quick start: + from llm_eval_kit.graders.decorator import grader + from llm_eval_kit.models.results import EvaluateResult + + @grader + def my_grader(messages, ground_truth, **kwargs): + # Your evaluation logic here + return EvaluateResult(score=1.0, reason="Perfect!") +""" + +__version__ = "1.1.0" diff --git a/src/llm_eval_kit/cli/__init__.py b/src/llm_eval_kit/cli/__init__.py new file mode 100644 index 0000000..a3ba1f6 --- /dev/null +++ b/src/llm_eval_kit/cli/__init__.py @@ -0,0 +1 @@ +"""CLI package.""" diff --git a/src/llm_eval_kit/cli/main.py b/src/llm_eval_kit/cli/main.py new file mode 100644 index 0000000..d459fd8 --- /dev/null +++ b/src/llm_eval_kit/cli/main.py @@ -0,0 +1,183 @@ +"""CLI entry point for llm-eval-kit.""" +import argparse +import json +import sys +from pathlib import Path + +from llm_eval_kit.datasets.loader import load_bfcl, load_jsonl +from llm_eval_kit.execution.pipeline import EvalPipeline +from llm_eval_kit.graders.registry import default_registry +from llm_eval_kit.models.datasets import EvalSample +from llm_eval_kit.utils.module_loader import load_function + +# Top-level imports so graders are registered once +import llm_eval_kit.graders # noqa: F401 + + +def _cmd_evaluate(args): + if args.grader: + try: + g = default_registry.get(args.grader) + except KeyError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + elif args.grader_path: + try: + g = load_function(args.grader_path) + except (ImportError, AttributeError) as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + else: + print("Error: --grader or 
--grader-path required", + file=sys.stderr) + sys.exit(1) + + if not Path(args.data).exists(): + print(f"Error: not found: {args.data}", file=sys.stderr) + sys.exit(1) + + if args.format == "bfcl": + ds = load_bfcl(args.data, max_samples=args.max_samples) + else: + ds = load_jsonl(args.data, max_samples=args.max_samples) + + report = EvalPipeline(g, ds).run_with_report() + print(report.summary()) + if args.output: + report.to_jsonl(args.output) + print(f"Results written to {args.output}") + + +def _cmd_list_graders(): + for name in default_registry.list_graders(): + g = default_registry.get(name) + print(f" {name}: {g.description}") + + +def _cmd_validate(args): + if not Path(args.data).exists(): + print(f"Error: not found: {args.data}", file=sys.stderr) + sys.exit(1) + errors = [] + count = 0 + with open(args.data) as f: + for ln, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + EvalSample(**json.loads(line)) + count += 1 + except Exception as e: + errors.append((ln, str(e))) + if errors: + for ln, err in errors: + print(f" Line {ln}: {err}", file=sys.stderr) + sys.exit(1) + print(f"Valid: {count} samples") + + +def _cmd_deploy(args): + """Deploy a grader as an AWS Lambda reward function.""" + try: + from llm_eval_kit.deploy.lambda_deploy import deploy_grader + from llm_eval_kit.deploy.config import load_deploy_config + except ImportError as e: + print(f"Error: {e}", file=sys.stderr) + print("Install deploy extras: pip install llm-eval-kit[deploy]", + file=sys.stderr) + sys.exit(1) + + # Resolve grader reference + if args.grader: + grader_ref = ( + f"llm_eval_kit.graders.builtins.{args.grader}" + f":{args.grader}_grader" + ) + elif args.grader_path: + grader_ref = args.grader_path + else: + print("Error: --grader or --grader-path required", file=sys.stderr) + sys.exit(1) + + try: + config = load_deploy_config(args.config) + # CLI flags override config file / env vars + if args.profile: + config.aws.profile = args.profile + if 
args.region: + config.aws.region = args.region + if args.role_arn: + config.aws.lambda_config.role_arn = args.role_arn + if args.function_name: + config.aws.lambda_config.function_name = ( + args.function_name + ) + result = deploy_grader(grader_ref, config=config) + print(f"Deployed: {result['function_name']}") + print(f" ARN: {result['function_arn']}") + print(f" Region: {result['region']}") + print(f" Grader: {result['grader_ref']}") + except ImportError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Deploy failed: {e}", file=sys.stderr) + sys.exit(1) + + +def main(): + """Entry point for llm-eval-kit CLI.""" + parser = argparse.ArgumentParser( + prog="llm-eval-kit", + description="LLM Evaluation Toolkit", + ) + sub = parser.add_subparsers(dest="command") + + ep = sub.add_parser("evaluate", help="Run evaluation") + ep.add_argument("--grader", type=str) + ep.add_argument("--grader-path", type=str) + ep.add_argument("--data", required=True) + ep.add_argument("--output", type=str) + ep.add_argument("--max-samples", type=int) + ep.add_argument("--format", choices=["bfcl", "jsonl"], + default="jsonl") + + sub.add_parser("list-graders", help="List graders") + + vp = sub.add_parser("validate", help="Validate dataset") + vp.add_argument("--data", required=True) + + dp = sub.add_parser("deploy", help="Deploy grader as Lambda") + dp.add_argument("--grader", type=str, + help="Built-in grader name (e.g. exact_match)") + dp.add_argument("--grader-path", type=str, + help="Module path (e.g. 
my_module:my_grader)") + dp.add_argument("--config", type=str, + help="Path to llm_eval_kit.yaml config file") + dp.add_argument("--profile", type=str, + help="AWS profile name (from ~/.aws/credentials)") + dp.add_argument("--region", type=str, + help="AWS region (overrides config/env)") + dp.add_argument("--role-arn", type=str, + help="IAM role ARN for the Lambda function") + dp.add_argument("--function-name", type=str, + help="Lambda function name (default: " + "llm-eval-reward-function)") + + args = parser.parse_args() + if args.command == "evaluate": + _cmd_evaluate(args) + elif args.command == "list-graders": + _cmd_list_graders() + elif args.command == "validate": + _cmd_validate(args) + elif args.command == "deploy": + _cmd_deploy(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/llm_eval_kit/datasets/__init__.py b/src/llm_eval_kit/datasets/__init__.py new file mode 100644 index 0000000..d06db58 --- /dev/null +++ b/src/llm_eval_kit/datasets/__init__.py @@ -0,0 +1,12 @@ +"""Dataset loading and formatting utilities.""" +from .loader import load_jsonl, load_bfcl, load_huggingface +from .formatter import export_rft_jsonl, upload_to_s3, SplitResult + +__all__ = [ + "load_jsonl", + "load_bfcl", + "load_huggingface", + "export_rft_jsonl", + "upload_to_s3", + "SplitResult", +] diff --git a/src/llm_eval_kit/datasets/formatter.py b/src/llm_eval_kit/datasets/formatter.py new file mode 100644 index 0000000..a320596 --- /dev/null +++ b/src/llm_eval_kit/datasets/formatter.py @@ -0,0 +1,266 @@ +""" +RFT dataset formatters — convert EvalDataset to Bedrock RFT training formats. + +Two output formats supported: + - Bedrock API: for create_model_customization_job (uploads to S3) + - OpenAI-compatible: for client.files.create (uploads via API) + +Both share the same core schema (messages + ground_truth) but differ +in metadata fields and how ground_truth is structured. 
+""" +import json +import logging +import random +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from llm_eval_kit.models.datasets import EvalDataset, EvalSample + +logger = logging.getLogger(__name__) + + +@dataclass +class SplitResult: + """Result of a train/val/test split with file paths.""" + + train_path: str + train_size: int + val_path: Optional[str] = None + val_size: int = 0 + test_path: Optional[str] = None + test_size: int = 0 + paths: Dict[str, str] = field(default_factory=dict) + + def summary(self) -> str: + parts = [f"train={self.train_size}"] + if self.val_size: + parts.append(f"val={self.val_size}") + if self.test_size: + parts.append(f"test={self.test_size}") + return " | ".join(parts) + + +def format_for_bedrock( + sample: EvalSample, + system_prompt: Optional[str] = None, + domain: Optional[str] = None, + data_source: Optional[str] = None, + split_name: str = "train", + index: int = 0, +) -> dict: + """ + Format a single EvalSample for the Bedrock API RFT schema. + + Output schema: + { + "messages": [{"role": ..., "content": ...}, ...], + "metadata": {"ground_truth": ...}, + "task_id": "...", + "domain": "...", + "data_source": "..." 
+ } + """ + messages = [] + + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + for msg in sample.messages: + messages.append({"role": msg.role, "content": msg.content}) + + row: Dict[str, Any] = { + "messages": messages, + "metadata": { + "ground_truth": sample.ground_truth, + }, + } + + # Optional metadata fields + task_id = sample.id or f"{split_name}_{index}" + row["task_id"] = task_id + + if domain: + row["domain"] = domain + if data_source: + row["data_source"] = data_source + + # Pass through tool_definitions if present + if "tool_definitions" in sample.metadata: + row["metadata"]["tool_definitions"] = ( + sample.metadata["tool_definitions"] + ) + + return row + + +def format_for_openai( + sample: EvalSample, + system_prompt: Optional[str] = None, +) -> dict: + """ + Format a single EvalSample for the OpenAI-compatible RFT schema. + + Output schema: + { + "messages": [{"role": ..., "content": ...}, ...], + "ground_truth": "..." + } + + The OpenAI-compatible path uses client.files.create() to upload, + so no S3 or task_id/domain fields are needed. + """ + messages = [] + + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + for msg in sample.messages: + messages.append({"role": msg.role, "content": msg.content}) + + # Ground truth — flatten to string if it's a list with one item + gt = sample.ground_truth + if isinstance(gt, list) and len(gt) == 1: + gt = str(gt[0]) + elif isinstance(gt, list): + gt = json.dumps(gt) + + return { + "messages": messages, + "ground_truth": gt, + } + + +def export_rft_jsonl( + dataset: EvalDataset, + output_dir: str, + fmt: str = "bedrock", + system_prompt: Optional[str] = None, + domain: Optional[str] = None, + data_source: Optional[str] = None, + train_ratio: float = 0.8, + val_ratio: float = 0.1, + seed: int = 42, + shuffle: bool = True, +) -> SplitResult: + """ + Export an EvalDataset to RFT-formatted JSONL files with + train/val/test split. 
+ + Args: + dataset: The EvalDataset to export. + output_dir: Directory to write JSONL files into. + fmt: "bedrock" for Bedrock API or "openai" for + OpenAI-compatible API. + system_prompt: Optional system message prepended to each + sample's messages. + domain: Domain tag (Bedrock format only). + data_source: Data source tag (Bedrock format only). + train_ratio: Fraction of data for training (default 0.8). + val_ratio: Fraction for validation (default 0.1). + Remainder goes to test. + seed: Random seed for shuffling. + shuffle: Whether to shuffle before splitting. + + Returns: + SplitResult with file paths and counts. + """ + out = Path(output_dir) + out.mkdir(parents=True, exist_ok=True) + + samples = list(dataset) + if shuffle: + rng = random.Random(seed) + rng.shuffle(samples) + + total = len(samples) + train_size = int(total * train_ratio) + val_size = int(total * val_ratio) + test_size = total - train_size - val_size + + splits: List[Tuple[str, List[EvalSample]]] = [ + ("train", samples[:train_size]), + ] + if val_size > 0: + splits.append( + ("val", samples[train_size:train_size + val_size]) + ) + if test_size > 0: + splits.append(("test", samples[train_size + val_size:])) + + paths: Dict[str, str] = {} + for split_name, split_samples in splits: + path = out / f"{split_name}.jsonl" + with open(path, "w", encoding="utf-8") as f: + for i, sample in enumerate(split_samples): + if fmt == "openai": + row = format_for_openai( + sample, + system_prompt=system_prompt, + ) + else: + row = format_for_bedrock( + sample, + system_prompt=system_prompt, + domain=domain, + data_source=data_source, + split_name=split_name, + index=i, + ) + f.write(json.dumps(row) + "\n") + paths[split_name] = str(path) + logger.info( + "Wrote %d samples to %s", len(split_samples), path, + ) + + return SplitResult( + train_path=paths["train"], + train_size=train_size, + val_path=paths.get("val"), + val_size=val_size, + test_path=paths.get("test"), + test_size=test_size, + paths=paths, + ) + 
+ +def upload_to_s3( + split_result: SplitResult, + bucket: str, + prefix: str, + session=None, +) -> Dict[str, str]: + """ + Upload split JSONL files to S3. + + Args: + split_result: Output from export_rft_jsonl. + bucket: S3 bucket name. + prefix: S3 key prefix (e.g. "rft-data/bfcl"). + session: Optional boto3.Session. Uses default if None. + + Returns: + Dict mapping split name to S3 URI. + """ + try: + import boto3 + except ImportError: + raise ImportError( + "boto3 not installed. " + "Run: uv pip install -e \".[deploy]\"" + ) + + if session is None: + session = boto3.Session() + + s3 = session.client("s3") + uris: Dict[str, str] = {} + + for split_name, local_path in split_result.paths.items(): + key = f"{prefix.rstrip('/')}/{split_name}.jsonl" + s3.upload_file(local_path, bucket, key) + uri = f"s3://{bucket}/{key}" + uris[split_name] = uri + logger.info("Uploaded %s → %s", local_path, uri) + + return uris diff --git a/src/llm_eval_kit/datasets/loader.py b/src/llm_eval_kit/datasets/loader.py new file mode 100644 index 0000000..cee3ba2 --- /dev/null +++ b/src/llm_eval_kit/datasets/loader.py @@ -0,0 +1,207 @@ +""" +Dataset loaders — load evaluation samples from various sources. + +load_jsonl: generic JSONL where each line maps to EvalSample fields. +load_bfcl: BFCL-specific JSONL with field mapping for the Berkeley + Function Calling Leaderboard dataset. +""" +import json +import logging +from typing import Optional + +from llm_eval_kit.models.messages import Message +from llm_eval_kit.models.datasets import EvalDataset, EvalSample + +logger = logging.getLogger(__name__) + + +def load_jsonl( + path: str, max_samples: Optional[int] = None +) -> EvalDataset: + """Load a generic JSONL file into an EvalDataset.""" + return EvalDataset.from_jsonl(path, max_samples=max_samples) + + +def load_bfcl( + path: str, max_samples: Optional[int] = None +) -> EvalDataset: + """ + Load a BFCL JSONL file with field mapping. 
+ + BFCL format (each line): + - "id": unique identifier + - "question": list of message dicts (the user prompt) + - "function": list of tool definition dicts (JSON schemas) + - ground truth: varies by file, often a separate answer file + + Note: BFCL files are NOT compatible with HuggingFace load_dataset. + """ + samples = [] + with open(path) as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + logger.warning("Skipping malformed JSON at line %d", line_num) + continue + + if "id" not in data or "question" not in data: + logger.warning( + "Skipping line %d: missing 'id' or 'question'", line_num + ) + continue + + # Map BFCL fields to EvalSample + question = data["question"] + if isinstance(question, list): + messages = [Message(**msg) if isinstance(msg, dict) else msg for msg in question] + else: + # Some BFCL entries have question as a string + messages = [Message(role="user", content=str(question))] + + sample = EvalSample( + id=str(data["id"]), + messages=messages, + ground_truth=data.get("ground_truth"), + metadata={ + "tool_definitions": data.get("function", []), + }, + ) + samples.append(sample) + + if max_samples and len(samples) >= max_samples: + break + + return EvalDataset(samples) + + + +def load_huggingface( + dataset_name: str, + split: str = "train", + max_samples: Optional[int] = None, + token: Optional[str] = None, + prompt_key: str = "prompt", + response_key: str = "response", + ground_truth_key: Optional[str] = "ground_truth", + id_key: Optional[str] = "id", + config_name: Optional[str] = None, + data_files: Optional[str] = None, +) -> EvalDataset: + """ + Load a dataset from HuggingFace Hub. + + Requires: pip install llm-eval-kit[datasets] + + Args: + dataset_name: HF dataset name + (e.g. 
"gorilla-llm/Berkeley-Function-Calling-Leaderboard") + split: Dataset split (default: "train") + max_samples: Max samples to load (None = all) + token: HuggingFace API token. Falls back to HF_TOKEN env var. + prompt_key: Column name containing the prompt/question + response_key: Column name for model response (None to skip) + ground_truth_key: Column name for ground truth (None to skip) + id_key: Column name for sample ID (None to auto-generate) + config_name: Dataset config/subset name (for multi-config + datasets) + data_files: Specific file(s) to load from the repo + (e.g. "BFCL_v3_simple.json"). Useful when a HF repo + contains multiple files with different schemas. + """ + try: + from datasets import load_dataset + except ImportError: + raise ImportError( + "HuggingFace datasets not installed. " + "Run: uv pip install -e \".[datasets]\"" + ) + + import os + hf_token = token or os.environ.get("HF_TOKEN") + + # Build kwargs for load_dataset + load_kwargs = { + "split": split, + "token": hf_token, + } + if config_name: + load_kwargs["name"] = config_name + if data_files: + load_kwargs["data_files"] = data_files + + logger.info( + "Loading %s (split=%s%s) from HuggingFace...", + dataset_name, + split, + f", file={data_files}" if data_files else "", + ) + ds = load_dataset(dataset_name, **load_kwargs) + + samples = [] + for i, row in enumerate(ds): + if max_samples and i >= max_samples: + break + + # Build sample ID + sample_id = str(row.get(id_key, i)) if id_key else str(i) + + # Build messages from available columns + messages = [] + if prompt_key and prompt_key in row: + prompt = row[prompt_key] + if isinstance(prompt, list): + # Handle nested lists (e.g. 
BFCL "question" + # is list[list[message_dict]]) + flat = prompt + if ( + flat + and isinstance(flat[0], list) + ): + flat = flat[0] + for msg in flat: + if isinstance(msg, dict): + messages.append(Message(**msg)) + else: + messages.append( + Message(role="user", content=str(msg)) + ) + else: + messages.append( + Message(role="user", content=str(prompt)) + ) + + if response_key and response_key in row: + messages.append( + Message( + role="assistant", + content=str(row[response_key]), + ) + ) + + # Ground truth + gt = row.get(ground_truth_key) if ground_truth_key else None + + # Collect remaining columns as metadata + skip_keys = { + prompt_key, response_key, ground_truth_key, id_key, + } + metadata = { + k: v for k, v in row.items() if k not in skip_keys + } + + samples.append(EvalSample( + id=sample_id, + messages=messages, + ground_truth=gt, + metadata=metadata, + )) + + logger.info( + "Loaded %d samples from %s", len(samples), dataset_name, + ) + return EvalDataset(samples) + diff --git a/src/llm_eval_kit/deploy/__init__.py b/src/llm_eval_kit/deploy/__init__.py new file mode 100644 index 0000000..5f104b9 --- /dev/null +++ b/src/llm_eval_kit/deploy/__init__.py @@ -0,0 +1,5 @@ +"""AWS Lambda deployment for grader/reward functions.""" +from .config import DeployConfig, load_deploy_config +from .lambda_deploy import deploy_grader + +__all__ = ["DeployConfig", "load_deploy_config", "deploy_grader"] diff --git a/src/llm_eval_kit/deploy/config.py b/src/llm_eval_kit/deploy/config.py new file mode 100644 index 0000000..df041c7 --- /dev/null +++ b/src/llm_eval_kit/deploy/config.py @@ -0,0 +1,103 @@ +""" +Deployment configuration — YAML-based config for AWS Lambda deployment. + +Reads from llm_eval_kit.yaml or a user-specified path. +Falls back to environment variables for AWS credentials. 
+""" +import os +import logging +from typing import Dict, Optional + +from pydantic import BaseModel, Field + +logger = logging.getLogger(__name__) + +CONFIG_FILE_NAME = "llm_eval_kit.yaml" + + +class LambdaConfig(BaseModel): + """Lambda function configuration.""" + function_name: str = "llm-eval-reward-function" + runtime: str = "python3.12" + handler: str = "handler.lambda_handler" + timeout: int = 60 + memory_size: int = 256 + role_arn: Optional[str] = None + layers: list = Field(default_factory=list) + environment: Dict[str, str] = Field(default_factory=dict) + + +class AWSConfig(BaseModel): + """AWS account configuration.""" + region: str = "us-east-1" + account_id: Optional[str] = None + profile: Optional[str] = None + lambda_config: LambdaConfig = Field( + default_factory=LambdaConfig, + alias="lambda", + ) + + model_config = {"populate_by_name": True} + + +class DeployConfig(BaseModel): + """Top-level deployment configuration.""" + aws: AWSConfig = Field(default_factory=AWSConfig) + + +def load_deploy_config( + config_path: Optional[str] = None, +) -> DeployConfig: + """ + Load deployment config from YAML file. + + Search order: + 1. Explicit config_path argument + 2. llm_eval_kit.yaml in current directory + 3. Walk up parent directories + 4. Fall back to defaults + env vars + """ + try: + import yaml + except ImportError: + raise ImportError( + "PyYAML not installed. 
Run: uv pip install -e \".[deploy]\"" + ) + + # Find config file + if config_path is None: + config_path = _find_config_file() + + if config_path and os.path.isfile(config_path): + logger.info("Loading config from %s", config_path) + with open(config_path) as f: + raw = yaml.safe_load(f) or {} + config = DeployConfig(**raw) + else: + logger.info("No config file found, using defaults") + config = DeployConfig() + + # Override from environment variables + if not config.aws.account_id: + config.aws.account_id = os.environ.get("AWS_ACCOUNT_ID") + if os.environ.get("AWS_DEFAULT_REGION"): + config.aws.region = os.environ["AWS_DEFAULT_REGION"] + if os.environ.get("AWS_REGION"): + config.aws.region = os.environ["AWS_REGION"] + if not config.aws.profile: + config.aws.profile = os.environ.get("AWS_PROFILE") + + return config + + +def _find_config_file() -> Optional[str]: + """Walk up from CWD looking for llm_eval_kit.yaml.""" + current = os.path.abspath(os.getcwd()) + while True: + candidate = os.path.join(current, CONFIG_FILE_NAME) + if os.path.isfile(candidate): + return candidate + parent = os.path.dirname(current) + if parent == current: + return None + current = parent diff --git a/src/llm_eval_kit/deploy/lambda_deploy.py b/src/llm_eval_kit/deploy/lambda_deploy.py new file mode 100644 index 0000000..5824ed1 --- /dev/null +++ b/src/llm_eval_kit/deploy/lambda_deploy.py @@ -0,0 +1,412 @@ +""" +AWS Lambda deployment for grader functions. + +Packages a grader as a Lambda function and deploys it using boto3. +The deployed Lambda accepts a JSON payload with messages and ground_truth, +runs the grader, and returns the EvaluateResult. 
+""" +import io +import logging +import zipfile +from pathlib import Path +from typing import Optional + +from .config import DeployConfig, load_deploy_config + +logger = logging.getLogger(__name__) + +# Template for the Lambda handler that wraps a grader +HANDLER_TEMPLATE = '''"""Auto-generated Lambda handler for llm-eval-kit grader.""" +import json +import sys +import os + +# Add the package to the path +sys.path.insert(0, os.path.dirname(__file__)) + +from llm_eval_kit.models.messages import Message +from llm_eval_kit.models.results import EvaluateResult +from llm_eval_kit.utils.module_loader import load_function + + +# Load the grader at cold start +_GRADER_REF = os.environ.get("GRADER_REF", "{grader_ref}") +_grader = load_function(_GRADER_REF) + + +def lambda_handler(event, context): + """ + Lambda handler for reward function evaluation. + + Expected payload: + {{ + "messages": [{{"role": "user", "content": "..."}}, ...], + "ground_truth": "expected answer" | ["call1()", "call2()"], + "kwargs": {{}} // optional extra args + }} + """ + try: + body = event if isinstance(event, dict) else json.loads(event) + + raw_messages = body.get("messages", []) + messages = [Message(**m) for m in raw_messages] + ground_truth = body.get("ground_truth") + kwargs = body.get("kwargs", {{}}) + + result = _grader.grade(messages, ground_truth, **kwargs) + + return {{ + "statusCode": 200, + "body": result.to_dict(), + }} + except Exception as e: + return {{ + "statusCode": 500, + "body": {{"error": str(e)}}, + }} +''' + + +def _build_deployment_package(grader_ref: str) -> bytes: + """ + Build a Lambda deployment zip containing: + - handler.py (generated from template) + - The llm_eval_kit package + - Third-party dependencies (pydantic, etc.) 
installed + for the Lambda runtime platform + """ + import shutil + import subprocess + import sys + import tempfile + + buf = io.BytesIO() + + # Find the llm_eval_kit package directory + import llm_eval_kit + pkg_dir = Path(llm_eval_kit.__file__).parent + + # Install dependencies into a temp dir for bundling. + # Prefer uv for speed; fall back to pip if uv isn't available. + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + logger.info("Installing dependencies into package...") + + pip_args = [ + "--target", str(tmp_path), + "--platform", "manylinux2014_x86_64", + "--implementation", "cp", + "--python-version", "3.12", + "--only-binary=:all:", + "--quiet", + "pydantic>=2.0.0", + "pydantic-core", + "annotated-types", + "typing_extensions", + ] + + uv_bin = shutil.which("uv") + if uv_bin: + subprocess.check_call( + [uv_bin, "pip", "install"] + pip_args, + stderr=subprocess.STDOUT, + ) + else: + subprocess.check_call( + [sys.executable, "-m", "pip", "install"] + pip_args, + stderr=subprocess.STDOUT, + ) + + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + # 1. Write the handler + handler_code = HANDLER_TEMPLATE.format( + grader_ref=grader_ref, + ) + zf.writestr("handler.py", handler_code) + + # 2. Bundle the llm_eval_kit package + for file_path in pkg_dir.rglob("*.py"): + arcname = str( + file_path.relative_to(pkg_dir.parent) + ) + zf.writestr(arcname, file_path.read_text()) + + # 3. Bundle pip-installed dependencies + for file_path in tmp_path.rglob("*"): + if file_path.is_file(): + arcname = str( + file_path.relative_to(tmp_path) + ) + zf.writestr( + arcname, file_path.read_bytes(), + ) + + buf.seek(0) + return buf.read() + + +def deploy_grader( + grader_ref: str, + config: Optional[DeployConfig] = None, + config_path: Optional[str] = None, +) -> dict: + """ + Deploy a grader as an AWS Lambda function. + + Args: + grader_ref: Module path to the grader + (e.g. 
"llm_eval_kit.graders.builtins.exact_match:exact_match_grader") + config: DeployConfig instance (loaded from YAML if not provided) + config_path: Path to config YAML file + + Returns: + dict with deployment info (function_name, function_arn, etc.) + """ + try: + import boto3 + import botocore.exceptions + except ImportError: + raise ImportError( + "boto3 not installed. Run: uv pip install -e \".[deploy]\"" + ) + + if config is None: + config = load_deploy_config(config_path) + + lc = config.aws.lambda_config + region = config.aws.region + profile = config.aws.profile + + # Build a session — supports named profiles, env vars, SSO, + # instance roles, and the full default credential chain. + session = boto3.Session( + profile_name=profile, + region_name=region, + ) + + # Validate credentials before doing any real work + try: + sts = session.client("sts") + identity = sts.get_caller_identity() + logger.info( + "Authenticated as %s (account %s)", + identity["Arn"], identity["Account"], + ) + except botocore.exceptions.NoCredentialsError: + raise RuntimeError( + "No AWS credentials found. Set them up using one of:\n" + " 1. aws configure " + "(writes ~/.aws/credentials)\n" + " 2. aws configure sso " + "(SSO login)\n" + " 3. Environment variables " + "(AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY)\n" + " 4. --profile flag or " + "aws.profile in llm_eval_kit.yaml" + ) + except botocore.exceptions.ClientError as e: + raise RuntimeError( + f"AWS credential check failed: {e}\n" + "Run 'aws sts get-caller-identity' to debug." 
+ ) + + logger.info("Building deployment package for %s...", grader_ref) + zip_bytes = _build_deployment_package(grader_ref) + logger.info("Package size: %.1f KB", len(zip_bytes) / 1024) + + # Merge grader ref into environment + env_vars = {**lc.environment, "GRADER_REF": grader_ref} + + client = session.client("lambda") + + # Check if function exists + try: + client.get_function(FunctionName=lc.function_name) + exists = True + except client.exceptions.ResourceNotFoundException: + exists = False + + if exists: + logger.info("Updating existing function: %s", lc.function_name) + client.update_function_code( + FunctionName=lc.function_name, + ZipFile=zip_bytes, + ) + # Wait for update to complete before updating config + waiter = client.get_waiter("function_updated_v2") + waiter.wait(FunctionName=lc.function_name) + + client.update_function_configuration( + FunctionName=lc.function_name, + Runtime=lc.runtime, + Handler=lc.handler, + Timeout=lc.timeout, + MemorySize=lc.memory_size, + Environment={"Variables": env_vars}, + ) + response = client.get_function(FunctionName=lc.function_name) + arn = response["Configuration"]["FunctionArn"] + else: + if not lc.role_arn: + raise ValueError( + "role_arn is required to create a new Lambda function. " + "Set it in llm_eval_kit.yaml under aws.lambda.role_arn " + "or provide an existing function name to update." 
+ ) + logger.info("Creating new function: %s", lc.function_name) + response = client.create_function( + FunctionName=lc.function_name, + Runtime=lc.runtime, + Role=lc.role_arn, + Handler=lc.handler, + Code={"ZipFile": zip_bytes}, + Timeout=lc.timeout, + MemorySize=lc.memory_size, + Environment={"Variables": env_vars}, + ) + arn = response["FunctionArn"] + + result = { + "function_name": lc.function_name, + "function_arn": arn, + "region": region, + "grader_ref": grader_ref, + } + logger.info("Deployed: %s (%s)", lc.function_name, arn) + return result + + +def deploy_reward_function( + source_file: str, + function_name: str, + role_arn: str, + handler: Optional[str] = None, + runtime: str = "python3.12", + timeout: int = 300, + memory_size: int = 512, + region: Optional[str] = None, + profile: Optional[str] = None, +) -> dict: + """ + Deploy a standalone reward function .py file as a Lambda. + + This is for zero-dependency reward functions that follow the + Bedrock RFT batch contract (receive list, return list with + id + aggregate_reward_score + reward_components). + + Unlike deploy_grader(), this does NOT bundle llm_eval_kit or + pydantic — it just zips the single .py file and deploys it. + + Args: + source_file: Path to the .py reward function file. + function_name: Lambda function name. + role_arn: IAM role ARN for the Lambda. + handler: Lambda handler string. Defaults to + ".lambda_handler". + runtime: Lambda runtime (default python3.12). + timeout: Timeout in seconds (default 300). + memory_size: Memory in MB (default 512). + region: AWS region (default from env/config). + profile: AWS profile name. + + Returns: + dict with function_name, function_arn, region. + """ + try: + import boto3 + import botocore.exceptions + except ImportError: + raise ImportError( + "boto3 not installed. 
" + "Run: uv pip install -e \".[deploy]\"" + ) + + from pathlib import Path as _Path + + src = _Path(source_file) + if not src.is_file(): + raise FileNotFoundError(f"Reward function not found: {src}") + + module_name = src.stem + if handler is None: + handler = f"{module_name}.lambda_handler" + + # Build zip with just the single file + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr(f"{module_name}.py", src.read_text()) + buf.seek(0) + zip_bytes = buf.read() + + logger.info( + "Package: %s (%.1f KB)", + module_name, len(zip_bytes) / 1024, + ) + + session = boto3.Session( + profile_name=profile, + region_name=region, + ) + + # Validate credentials + try: + sts = session.client("sts") + identity = sts.get_caller_identity() + logger.info( + "Authenticated as %s", identity["Arn"], + ) + except botocore.exceptions.NoCredentialsError: + raise RuntimeError( + "No AWS credentials found. " + "Run 'aws configure' or set env vars." + ) + + client = session.client("lambda") + + try: + client.get_function(FunctionName=function_name) + exists = True + except client.exceptions.ResourceNotFoundException: + exists = False + + if exists: + logger.info("Updating: %s", function_name) + client.update_function_code( + FunctionName=function_name, + ZipFile=zip_bytes, + ) + waiter = client.get_waiter("function_updated_v2") + waiter.wait(FunctionName=function_name) + client.update_function_configuration( + FunctionName=function_name, + Runtime=runtime, + Handler=handler, + Timeout=timeout, + MemorySize=memory_size, + ) + resp = client.get_function(FunctionName=function_name) + arn = resp["Configuration"]["FunctionArn"] + else: + logger.info("Creating: %s", function_name) + resp = client.create_function( + FunctionName=function_name, + Runtime=runtime, + Role=role_arn, + Handler=handler, + Code={"ZipFile": zip_bytes}, + Timeout=timeout, + MemorySize=memory_size, + ) + arn = resp["FunctionArn"] + + # Wait for function to be active + waiter = 
client.get_waiter("function_active_v2") + waiter.wait(FunctionName=function_name) + + result = { + "function_name": function_name, + "function_arn": arn, + "region": session.region_name, + } + logger.info("Deployed: %s (%s)", function_name, arn) + return result diff --git a/src/llm_eval_kit/execution/__init__.py b/src/llm_eval_kit/execution/__init__.py new file mode 100644 index 0000000..3b23571 --- /dev/null +++ b/src/llm_eval_kit/execution/__init__.py @@ -0,0 +1,4 @@ +"""Evaluation execution pipeline.""" +from .pipeline import EvalPipeline, EvalReport + +__all__ = ["EvalPipeline", "EvalReport"] diff --git a/src/llm_eval_kit/execution/pipeline.py b/src/llm_eval_kit/execution/pipeline.py new file mode 100644 index 0000000..0f9d939 --- /dev/null +++ b/src/llm_eval_kit/execution/pipeline.py @@ -0,0 +1,81 @@ +""" +Evaluation pipeline — runs graders over datasets and collects results. + +EvalPipeline is a plain class (composition: has a grader + dataset). +EvalReport is a @dataclass (internal summary container). 
+""" +import json +import logging +from dataclasses import dataclass, field +from typing import List + +from llm_eval_kit.graders.base import Grader +from llm_eval_kit.models.datasets import EvalDataset +from llm_eval_kit.models.results import EvaluateResult + +logger = logging.getLogger(__name__) + + +@dataclass +class EvalReport: + """Summary of an evaluation run.""" + + total_samples: int + avg_score: float + min_score: float + max_score: float + results: List[EvaluateResult] = field(repr=False) + + def to_jsonl(self, path: str) -> None: + """Write each EvaluateResult as a JSON line.""" + with open(path, "w") as f: + for result in self.results: + f.write(json.dumps(result.to_dict()) + "\n") + + def summary(self) -> str: + return ( + f"Samples: {self.total_samples} | " + f"Avg: {self.avg_score:.4f} | " + f"Min: {self.min_score:.4f} | " + f"Max: {self.max_score:.4f}" + ) + + +class EvalPipeline: + """Orchestrates running a grader over a dataset.""" + + def __init__(self, grader: Grader, dataset: EvalDataset) -> None: + self.grader = grader + self.dataset = dataset + + def run(self) -> List[EvaluateResult]: + results = [] + for sample in self.dataset: + try: + result = self.grader.grade( + messages=sample.messages, + ground_truth=sample.ground_truth, + **sample.metadata, + ) + results.append(result) + except Exception as e: + logger.warning("Grader failed on sample %s: %s", sample.id, e) + results.append( + EvaluateResult( + score=0.0, + is_valid=False, + reason=f"Error: {e}", + ) + ) + return results + + def run_with_report(self) -> EvalReport: + results = self.run() + scores = [r.score for r in results] + return EvalReport( + total_samples=len(results), + avg_score=sum(scores) / len(scores) if scores else 0.0, + min_score=min(scores) if scores else 0.0, + max_score=max(scores) if scores else 0.0, + results=results, + ) diff --git a/src/llm_eval_kit/graders/__init__.py b/src/llm_eval_kit/graders/__init__.py new file mode 100644 index 0000000..c0239e2 --- /dev/null 
+++ b/src/llm_eval_kit/graders/__init__.py @@ -0,0 +1,26 @@ +""" +Grader framework — the core evaluation engine of llm-eval-kit. + +Importing this package auto-populates the default_registry with built-in graders. +""" +from .base import Grader +from .decorator import grader +from .registry import GraderRegistry, default_registry +from .builtins.exact_match import exact_match_grader +from .builtins.string_similarity import string_similarity_grader +from .builtins.tool_call import tool_call_grader + +# Auto-register built-in graders +default_registry.register("exact_match", exact_match_grader) +default_registry.register("string_similarity", string_similarity_grader) +default_registry.register("tool_call", tool_call_grader) + +__all__ = [ + "Grader", + "grader", + "GraderRegistry", + "default_registry", + "exact_match_grader", + "string_similarity_grader", + "tool_call_grader", +] diff --git a/src/llm_eval_kit/graders/base.py b/src/llm_eval_kit/graders/base.py new file mode 100644 index 0000000..add7e0a --- /dev/null +++ b/src/llm_eval_kit/graders/base.py @@ -0,0 +1,39 @@ +"""Abstract base class for all graders.""" +from abc import ABC, abstractmethod +from typing import Any, List, Optional, Union + +from llm_eval_kit.models.messages import Message +from llm_eval_kit.models.results import EvaluateResult + + +class Grader(ABC): + """ + Interface that all graders must implement. + + Subclasses provide name, description, and grade(). + Every Grader is callable — __call__ delegates to grade(). + """ + + @property + @abstractmethod + def name(self) -> str: ... + + @property + @abstractmethod + def description(self) -> str: ... + + @abstractmethod + def grade( + self, + messages: List[Message], + ground_truth: Optional[Union[str, dict, list]] = None, + **kwargs: Any, + ) -> EvaluateResult: ... 
+ + def __call__( + self, + messages: List[Message], + ground_truth: Optional[Union[str, dict, list]] = None, + **kwargs: Any, + ) -> EvaluateResult: + return self.grade(messages, ground_truth, **kwargs) diff --git a/src/llm_eval_kit/graders/builtins/__init__.py b/src/llm_eval_kit/graders/builtins/__init__.py new file mode 100644 index 0000000..1ef7769 --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/__init__.py @@ -0,0 +1,10 @@ +"""Built-in grader implementations.""" +from .exact_match import exact_match_grader +from .string_similarity import string_similarity_grader +from .tool_call import tool_call_grader + +__all__ = [ + "exact_match_grader", + "string_similarity_grader", + "tool_call_grader", +] diff --git a/src/llm_eval_kit/graders/builtins/_helpers.py b/src/llm_eval_kit/graders/builtins/_helpers.py new file mode 100644 index 0000000..28440da --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/_helpers.py @@ -0,0 +1,12 @@ +"""Shared helpers for built-in graders.""" +from typing import List, Optional + +from llm_eval_kit.models.messages import Message + + +def get_last_assistant_content(messages: List[Message]) -> Optional[str]: + """Walk messages in reverse, return first assistant content found.""" + for msg in reversed(messages): + if msg.role == "assistant" and msg.content is not None: + return msg.content + return None diff --git a/src/llm_eval_kit/graders/builtins/exact_match.py b/src/llm_eval_kit/graders/builtins/exact_match.py new file mode 100644 index 0000000..9e77630 --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/exact_match.py @@ -0,0 +1,38 @@ +"""Exact match grader — checks if assistant response exactly matches ground truth.""" +from llm_eval_kit.graders.decorator import grader +from llm_eval_kit.models.results import EvaluateResult, MetricResult +from ._helpers import get_last_assistant_content + + +@grader( + name="exact_match", + description="Exact string match between response and ground truth", +) +def 
exact_match_grader(messages, ground_truth, *, case_sensitive=False, **kwargs): + response = get_last_assistant_content(messages) + if response is None: + return EvaluateResult( + score=0.0, + is_valid=False, + reason="No assistant message found", + ) + + response = response.strip() + expected = str(ground_truth).strip() if ground_truth is not None else "" + + if not case_sensitive: + match = response.lower() == expected.lower() + else: + match = response == expected + + score = 1.0 if match else 0.0 + return EvaluateResult( + score=score, + reason="Exact match" if match else "No match", + metrics={ + "exact_match": MetricResult( + score=score, + reason=f"case_sensitive={case_sensitive}", + ), + }, + ) diff --git a/src/llm_eval_kit/graders/builtins/llm_judge.py b/src/llm_eval_kit/graders/builtins/llm_judge.py new file mode 100644 index 0000000..1ebc2b0 --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/llm_judge.py @@ -0,0 +1,6 @@ +""" +LLM-as-judge grader — uses an LLM to evaluate responses. + +This is a placeholder for future implementation. +It would call an LLM (e.g., via Bedrock Converse API) to score responses. +""" diff --git a/src/llm_eval_kit/graders/builtins/string_similarity.py b/src/llm_eval_kit/graders/builtins/string_similarity.py new file mode 100644 index 0000000..cabc925 --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/string_similarity.py @@ -0,0 +1,114 @@ +""" +String similarity grader — fuzzy matching using edit distance or token overlap. + +The grader framework is complete. The two algorithm functions below are stubs +for YOU to implement as Leetcode practice. 
+""" +from llm_eval_kit.graders.decorator import grader +from llm_eval_kit.models.results import EvaluateResult, MetricResult +from ._helpers import get_last_assistant_content + + +# --------------------------------------------------------------------------- +# YOUR TASK: Implement these two functions +# --------------------------------------------------------------------------- + +def levenshtein_similarity(s1: str, s2: str) -> float: + """ + Compute normalized Levenshtein similarity between two strings. + Return: 1.0 - (edit_distance / max(len(s1), len(s2))) + + LEETCODE CONNECTION: This is Leetcode #72 (Edit Distance). + + Algorithm: + 1. Build a 2D DP table of size (len(s1)+1) x (len(s2)+1) + 2. dp[i][j] = minimum edits to convert s1[:i] into s2[:j] + 3. Base cases: dp[i][0] = i, dp[0][j] = j + 4. Transition: + - If s1[i-1] == s2[j-1]: dp[i][j] = dp[i-1][j-1] + - Else: dp[i][j] = 1 + min(dp[i-1][j], # delete + dp[i][j-1], # insert + dp[i-1][j-1]) # replace + 5. edit_distance = dp[len(s1)][len(s2)] + 6. Normalize: 1.0 - (edit_distance / max(len(s1), len(s2))) + + Edge cases: + - Both empty -> return 1.0 + - One empty -> return 0.0 + + Space optimization (optional stretch goal): + - You only need the previous row, so you can use O(min(m,n)) space + instead of O(m*n). This is a common follow-up in interviews. + """ + raise NotImplementedError("Implement levenshtein_similarity") + + +def token_f1_score(prediction: str, reference: str) -> float: + """ + Compute token-level F1 score between prediction and reference. + + Algorithm: + 1. Tokenize: prediction.lower().split(), reference.lower().split() + 2. Use collections.Counter to count token frequencies (multiset) + 3. Overlap = sum of min counts for each token (Counter intersection) + - In Python: sum((counter_pred & counter_ref).values()) + 4. precision = overlap / len(predicted_tokens) + 5. recall = overlap / len(reference_tokens) + 6. 
F1 = 2 * precision * recall / (precision + recall) + + LEETCODE CONNECTION: + - Counter intersection is related to array intersection problems + - Using Counter (multiset) is key — plain set loses duplicate info + - Think about: what if one string has "the the the" and the other + has "the"? Plain set says full overlap, Counter says 1/3. + + Edge cases: + - Both empty -> return 1.0 + - One empty -> return 0.0 + - No overlap -> return 0.0 (avoid division by zero in F1) + """ + raise NotImplementedError("Implement token_f1_score") + + +# --------------------------------------------------------------------------- +# Grader (framework code — complete) +# --------------------------------------------------------------------------- + +@grader( + name="string_similarity", + description="Fuzzy string matching via Levenshtein distance or token F1", +) +def string_similarity_grader( + messages, ground_truth, *, strategy="levenshtein", **kwargs +): + response = get_last_assistant_content(messages) + if response is None: + return EvaluateResult( + score=0.0, + is_valid=False, + reason="No assistant message found", + ) + + expected = str(ground_truth) if ground_truth is not None else "" + + # Both empty is a perfect match + if not response and not expected: + return EvaluateResult(score=1.0, reason="Both empty") + + if strategy == "levenshtein": + score = levenshtein_similarity(response, expected) + elif strategy == "token_f1": + score = token_f1_score(response, expected) + else: + raise ValueError(f"Unknown strategy: {strategy}") + + return EvaluateResult( + score=score, + reason=f"Similarity ({strategy}): {score:.4f}", + metrics={ + strategy: MetricResult( + score=score, + reason=f"Computed via {strategy} strategy", + ), + }, + ) diff --git a/src/llm_eval_kit/graders/builtins/tool_call.py b/src/llm_eval_kit/graders/builtins/tool_call.py new file mode 100644 index 0000000..f75b27a --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/tool_call.py @@ -0,0 +1,288 @@ +""" +Tool 
@dataclass
class ParsedCall:
    """Structured representation of a parsed function call."""

    # Possibly dotted callable name, e.g. "get_weather" or "api.get_weather".
    func_name: str
    # Keyword arguments by name; positional args stored under "_arg{i}" keys.
    params: Dict[str, Any]


def parse_function_call(call_str: str) -> ParsedCall:
    """
    Parse 'func_name(param1=value1, param2="str")' into a ParsedCall.

    Uses ast.parse in eval mode to safely parse the expression — no code
    is ever executed — then extracts the function name and its arguments.

    Raises:
        ValueError: if the string cannot be parsed, is not a single
            function call, or uses an unsupported callee form.
    """
    call_str = call_str.strip()
    try:
        tree = ast.parse(call_str, mode="eval")
    except SyntaxError as e:
        raise ValueError(f"Cannot parse function call: {call_str!r}") from e

    call_node = tree.body
    if not isinstance(call_node, ast.Call):
        raise ValueError(f"Expression is not a function call: {call_str!r}")

    def _qualified_name(node: ast.expr) -> str:
        # Recursively flatten Name / dotted Attribute chains ("a.b.c").
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return f"{_qualified_name(node.value)}.{node.attr}"
        raise ValueError(f"Unsupported function call format: {call_str!r}")

    def _literal_or_dump(node: ast.expr) -> Any:
        # literal_eval covers constants/tuples/lists/dicts; anything more
        # complex falls back to the AST dump string so that two identical
        # complex expressions still compare equal.
        try:
            return ast.literal_eval(node)
        except (ValueError, TypeError):
            return ast.dump(node)

    func_name = _qualified_name(call_node.func)

    params: Dict[str, Any] = {}
    # Positional args become synthetic "_arg{i}" entries.
    for index, positional in enumerate(call_node.args):
        params[f"_arg{index}"] = _literal_or_dump(positional)
    for keyword in call_node.keywords:
        if keyword.arg is not None:  # skip **kwargs expansion
            params[keyword.arg] = _literal_or_dump(keyword.value)

    return ParsedCall(func_name=func_name, params=params)


def format_function_call(parsed: ParsedCall) -> str:
    """Pretty-print a ParsedCall back to a function call string (params sorted)."""
    rendered = ", ".join(
        f"{key}={value!r}" for key, value in sorted(parsed.params.items())
    )
    return f"{parsed.func_name}({rendered})"
def _try_coerce_match(a: Any, b: Any) -> bool:
    """
    Return True when string `a`, coerced to the type of `b`, equals `b`.

    Only string -> bool/int/float coercions are attempted; anything else
    (including a non-string `a`) reports no match.
    """
    if not isinstance(a, str):
        return False
    if isinstance(b, bool):
        lowered = a.lower()
        if lowered in ("true", "false"):
            return (lowered == "true") == b
        return False
    if isinstance(b, int):  # bool already handled above (bool is an int)
        try:
            return int(a) == b
        except (ValueError, TypeError):
            return False
    if isinstance(b, float):
        try:
            return float(a) == b
        except (ValueError, TypeError):
            return False
    return False


def values_match(predicted: Any, expected: Any) -> bool:
    """
    Compare two values with type coercion.

    Equal values match outright; strings are coerced toward bool/int/float
    counterparts in either direction; lists are compared element-wise and
    dicts key-wise, both recursively.
    """
    if predicted == expected:
        return True
    if _try_coerce_match(predicted, expected) or _try_coerce_match(expected, predicted):
        return True
    if isinstance(predicted, list) and isinstance(expected, list):
        return len(predicted) == len(expected) and all(
            values_match(p, e) for p, e in zip(predicted, expected)
        )
    if isinstance(predicted, dict) and isinstance(expected, dict):
        # Dict views compare as sets, so this is exactly "same key set".
        return predicted.keys() == expected.keys() and all(
            values_match(predicted[k], v) for k, v in expected.items()
        )
    return False
if values_match(predicted.params[k], expected.params[k]) + ) + param_value_acc = value_matches / len(expected_keys) if expected_keys else 1.0 + + overall = ( + (1.0 if func_match else 0.0) * 0.33 + + param_name_acc * 0.33 + + param_value_acc * 0.34 + ) + return { + "func_name_match": func_match, + "param_name_accuracy": param_name_acc, + "param_value_accuracy": param_value_acc, + "overall": overall, + } + + +def _split_calls(text: str) -> List[str]: + """ + Split a string that may contain multiple function calls. + Handles newline-separated or list-formatted calls. + """ + text = text.strip() + # If it looks like a Python list, try to parse individual calls + if text.startswith("[") and text.endswith("]"): + text = text[1:-1].strip() + + # Split on newlines or comma-separated top-level calls + calls = [] + depth = 0 + current: List[str] = [] + for char in text: + if char == "(": + depth += 1 + elif char == ")": + depth -= 1 + if char == "\n" and depth == 0: + chunk = "".join(current).strip() + if chunk: + calls.append(chunk) + current = [] + else: + current.append(char) + chunk = "".join(current).strip() + if chunk: + calls.append(chunk) + + # Clean trailing commas + return [c.rstrip(",").strip() for c in calls if c.strip()] + + +# --------------------------------------------------------------------------- +# The grader +# --------------------------------------------------------------------------- + +@grader( + name="tool_call", + description="BFCL-style AST comparison of function calls", +) +def tool_call_grader(messages, ground_truth, **kwargs): + """ + Compare predicted function calls against ground truth. 
@grader(
    name="tool_call",
    description="BFCL-style AST comparison of function calls",
)
def tool_call_grader(messages, ground_truth, **kwargs):
    """
    Compare predicted function calls against ground truth.

    ground_truth: str or List[str] of function call strings
    messages: last assistant message content contains predicted call(s)
    """
    predicted_str = get_last_assistant_content(messages)
    if predicted_str is None:
        return EvaluateResult(
            score=0.0, is_valid=False, reason="No assistant message"
        )

    # Accept a single call string or a list of them.
    if isinstance(ground_truth, str):
        gt_strs = [ground_truth]
    elif isinstance(ground_truth, list):
        gt_strs = [str(g) for g in ground_truth]
    else:
        return EvaluateResult(
            score=0.0,
            is_valid=False,
            reason=f"Unexpected ground_truth type: {type(ground_truth)}",
        )

    try:
        predicted_calls = [parse_function_call(s) for s in _split_calls(predicted_str)]
    except (ValueError, SyntaxError) as e:
        return EvaluateResult(
            score=0.0, is_valid=False, reason=f"Parse error (predicted): {e}"
        )

    try:
        expected_calls = [parse_function_call(s) for s in gt_strs]
    except (ValueError, SyntaxError) as e:
        return EvaluateResult(
            score=0.0, is_valid=False, reason=f"Parse error (ground truth): {e}"
        )

    # Pairwise comparison by position; surplus calls on either side dilute
    # every average through the shared denominator n.
    comparisons = [
        compare_single_call(pred, exp)
        for pred, exp in zip(predicted_calls, expected_calls)
    ]
    n = max(len(expected_calls), len(predicted_calls), 1)

    def _mean(key):
        # Empty comparisons -> 0.0 (empty sum over n >= 1).
        return sum(c[key] for c in comparisons) / n

    return EvaluateResult(
        score=_mean("overall"),
        reason=f"Matched {len(comparisons)}/{n} function calls",
        metrics={
            "function_name_accuracy": MetricResult(
                score=_mean("func_name_match"),
                reason="Fraction of calls with correct function name",
            ),
            "parameter_name_accuracy": MetricResult(
                score=_mean("param_name_accuracy"),
                reason="Average parameter name accuracy across calls",
            ),
            "parameter_value_accuracy": MetricResult(
                score=_mean("param_value_accuracy"),
                reason="Average parameter value accuracy across calls",
            ),
        },
    )
"parameter_value_accuracy": MetricResult( + score=pv_acc, + reason="Average parameter value accuracy across calls", + ), + }, + ) diff --git a/src/llm_eval_kit/graders/decorator.py b/src/llm_eval_kit/graders/decorator.py new file mode 100644 index 0000000..59587de --- /dev/null +++ b/src/llm_eval_kit/graders/decorator.py @@ -0,0 +1,87 @@ +""" +The @grader decorator — wraps plain functions into Grader instances. + +Supports both @grader and @grader(name=..., description=...) syntax. +""" +import functools +import inspect +from typing import Any, Callable, List, Optional, Union + +from llm_eval_kit.models.messages import Message +from llm_eval_kit.models.results import EvaluateResult +from .base import Grader + + +class _FunctionGrader(Grader): + """Internal: wraps a plain function as a Grader. Created by @grader.""" + + def __init__( + self, fn: Callable, grader_name: str, grader_desc: str + ) -> None: + functools.update_wrapper(self, fn) + self._fn = fn + self._name = grader_name + self._description = grader_desc + + @property + def name(self) -> str: + return self._name + + @property + def description(self) -> str: + return self._description + + def grade( + self, + messages: List[Message], + ground_truth: Optional[Union[str, dict, list]] = None, + **kwargs: Any, + ) -> EvaluateResult: + return self._fn(messages, ground_truth, **kwargs) + + +def _validate_signature(fn: Callable) -> None: + """Check that fn accepts (messages, ground_truth, **kwargs).""" + sig = inspect.signature(fn) + params = list(sig.parameters.keys()) + if len(params) < 2: + raise TypeError( + f"Grader function '{fn.__name__}' must accept at least " + f"(messages, ground_truth, **kwargs), got: ({', '.join(params)})" + ) + if params[0] != "messages" or params[1] != "ground_truth": + raise TypeError( + f"Grader function '{fn.__name__}' first two parameters must be " + f"'messages' and 'ground_truth', got: ({', '.join(params[:2])})" + ) + + +def grader( + func: Optional[Callable] = None, + *, + 
def grader(
    func: Optional[Callable] = None,
    *,
    name: Optional[str] = None,
    description: Optional[str] = None,
) -> "Union[Grader, Callable[..., Grader]]":
    """
    Decorator to turn a function into a Grader.

    Usage:
        @grader
        def my_grader(messages, ground_truth, **kwargs): ...

        @grader(name="custom", description="My grader")
        def my_grader(messages, ground_truth, **kwargs): ...
    """
    def _build(fn: Callable) -> "_FunctionGrader":
        _validate_signature(fn)
        return _FunctionGrader(
            fn,
            grader_name=name or fn.__name__,
            grader_desc=description or fn.__doc__ or "",
        )

    # Bare @grader hands us the function directly; @grader(...) hands us
    # nothing yet and expects the wrapper back.
    return _build if func is None else _build(func)


class GraderRegistry:
    """Maps string names to Grader instances."""

    def __init__(self) -> None:
        self._graders: Dict[str, "Grader"] = {}

    def register(self, name: str, grader: "Grader") -> None:
        # Last registration wins; duplicates are silently replaced.
        self._graders[name] = grader

    def get(self, name: str) -> "Grader":
        if name in self._graders:
            return self._graders[name]
        available = ", ".join(sorted(self._graders))
        raise KeyError(
            f"Grader '{name}' not found. Available: {available}"
        )

    def list_graders(self) -> List[str]:
        return sorted(self._graders)

    def __contains__(self, name: str) -> bool:
        return name in self._graders


# Shared module-level registry used by default.
default_registry = GraderRegistry()
+ +Pydantic models for validating Lambda pre/post processing payloads. +""" diff --git a/src/llm_eval_kit/models/__init__.py b/src/llm_eval_kit/models/__init__.py new file mode 100644 index 0000000..0d14429 --- /dev/null +++ b/src/llm_eval_kit/models/__init__.py @@ -0,0 +1,18 @@ +""" +Core data models for llm-eval-kit. + +Pydantic v2 BaseModel: Message, MetricResult, EvaluateResult, EvalSample +Plain class: Conversation, EvalDataset +""" +from .messages import Conversation, Message +from .results import EvaluateResult, MetricResult +from .datasets import EvalDataset, EvalSample + +__all__ = [ + "Message", + "Conversation", + "MetricResult", + "EvaluateResult", + "EvalSample", + "EvalDataset", +] diff --git a/src/llm_eval_kit/models/datasets.py b/src/llm_eval_kit/models/datasets.py new file mode 100644 index 0000000..d5c954a --- /dev/null +++ b/src/llm_eval_kit/models/datasets.py @@ -0,0 +1,67 @@ +""" +Dataset sample models. + +EvalSample is a Pydantic v2 BaseModel (boundary — validated from external data). +EvalDataset is a plain Python class (internal container — lightweight). 
+""" +import json +import logging +from typing import Any, Dict, Iterator, List, Optional, Union + +from pydantic import BaseModel, Field + +from .messages import Message + +logger = logging.getLogger(__name__) + + +class EvalSample(BaseModel): + """One evaluation sample with messages, ground truth, and metadata.""" + + id: str + messages: List[Message] + ground_truth: Optional[Union[str, dict, list]] = None + metadata: Dict[str, Any] = Field(default_factory=dict) + + +class EvalDataset: + """Container wrapping a list of EvalSample with Pythonic iteration.""" + + def __init__(self, samples: List[EvalSample]) -> None: + self.samples = list(samples) + + def __len__(self) -> int: + return len(self.samples) + + def __getitem__(self, index: int) -> EvalSample: + return self.samples[index] + + def __iter__(self) -> Iterator[EvalSample]: + return iter(self.samples) + + @classmethod + def from_jsonl( + cls, + path: str, + max_samples: Optional[int] = None, + ) -> "EvalDataset": + """Read a JSONL file into EvalSample objects. Skips malformed lines.""" + samples: List[EvalSample] = [] + with open(path) as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + logger.warning("Skipping malformed JSON at line %d", line_num) + continue + try: + samples.append(EvalSample(**data)) + except Exception as e: + logger.warning("Skipping invalid sample at line %d: %s", line_num, e) + continue + if max_samples and len(samples) >= max_samples: + break + return cls(samples) diff --git a/src/llm_eval_kit/models/messages.py b/src/llm_eval_kit/models/messages.py new file mode 100644 index 0000000..055245d --- /dev/null +++ b/src/llm_eval_kit/models/messages.py @@ -0,0 +1,73 @@ +""" +Message and Conversation models for representing LLM interactions. + +Message is a Pydantic v2 BaseModel (boundary model — validated, serializable). 
class Message(BaseModel):
    """Provider-agnostic chat message model."""

    role: str
    content: Optional[str] = None
    name: Optional[str] = None
    tool_call_id: Optional[str] = None
    tool_calls: Optional[List[dict]] = None

    @model_validator(mode="after")
    def validate_tool_message(self) -> "Message":
        # Tool responses must reference the tool call they answer.
        if self.role == "tool" and not self.tool_call_id:
            raise ValueError(
                "Messages with role 'tool' must include tool_call_id"
            )
        return self

    def to_openai_format(self) -> dict:
        """Return dict with only non-None fields, compatible with OpenAI API."""
        payload: Dict[str, Any] = {"role": self.role}
        optional_fields = (
            ("content", self.content),
            ("name", self.name),
            ("tool_call_id", self.tool_call_id),
            ("tool_calls", self.tool_calls),
        )
        payload.update(
            (key, value) for key, value in optional_fields if value is not None
        )
        return payload

    @classmethod
    def from_openai(cls, data: dict) -> "Message":
        """Construct a Message from an OpenAI-format dict."""
        return cls(**data)


class Conversation:
    """Lightweight wrapper around a list of Messages with helper accessors."""

    def __init__(self, messages: List[Message]) -> None:
        # Defensive copy: callers mutating their list won't affect us.
        self.messages = list(messages)

    def get_last_assistant_message(self) -> Optional[Message]:
        """Most recent assistant turn, or None if there is none."""
        return next(
            (m for m in reversed(self.messages) if m.role == "assistant"),
            None,
        )

    def get_system_prompt(self) -> Optional[str]:
        """Content of the first system message, or None."""
        return next(
            (m.content for m in self.messages if m.role == "system"),
            None,
        )

    def to_openai_format(self) -> List[dict]:
        return [m.to_openai_format() for m in self.messages]

    def __len__(self) -> int:
        return len(self.messages)

    def __iter__(self) -> Iterator[Message]:
        return iter(self.messages)
class MetricResult(BaseModel):
    """A single named metric score with explanation."""

    score: float = Field(ge=0.0, le=1.0)
    reason: str
    is_valid: bool = True

    def to_dict(self) -> dict:
        """Plain-dict form for JSON serialization."""
        return {
            "score": self.score,
            "reason": self.reason,
            "is_valid": self.is_valid,
        }


class EvaluateResult(BaseModel):
    """Complete output of a grader — overall score, sub-metrics, and metadata."""

    score: float = Field(ge=0.0, le=1.0)
    reason: Optional[str] = None
    is_valid: bool = True
    metrics: Dict[str, MetricResult] = Field(default_factory=dict)
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def summary(self) -> str:
        """Human-readable multi-line summary of the result."""
        lines = [f"Score: {self.score:.4f}"]
        if self.reason:
            lines.append(f"Reason: {self.reason}")
        lines.extend(
            f"  {name}: {metric.score:.4f} ({metric.reason})"
            for name, metric in self.metrics.items()
        )
        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Plain-dict form, recursing into sub-metrics."""
        return {
            "score": self.score,
            "reason": self.reason,
            "is_valid": self.is_valid,
            "metrics": {key: m.to_dict() for key, m in self.metrics.items()},
            "metadata": self.metadata,
        }

    @classmethod
    def aggregate(cls, results: List["EvaluateResult"]) -> "EvaluateResult":
        """Compute mean score across a list of results."""
        if not results:
            return cls(score=0.0, reason="No results to aggregate")
        mean_score = sum(r.score for r in results) / len(results)
        return cls(
            score=mean_score,
            reason=f"Aggregated over {len(results)} samples",
        )
def load_function(path: str) -> Any:
    """
    Load a Python object from a string path.

    Supports:
      - "module.submodule:func_name" (colon format, preferred)
      - "module.submodule.func_name" (dot format, last component is attribute)

    Raises:
        ImportError: if the path has no separator or the module import fails.
        AttributeError: if the module lacks the named attribute.
    """
    if ":" in path:
        # Colon format: everything before the first ':' is the module.
        module_path, _, attr_name = path.partition(":")
    else:
        # Dot format: the final dotted component is the attribute.
        module_path, sep, attr_name = path.rpartition(".")
        if not sep:
            raise ImportError(
                f"Invalid path format: {path!r}. "
                f"Expected 'module.path:func' or 'module.path.func'"
            )

    try:
        module = importlib.import_module(module_path)
    except ImportError as e:
        raise ImportError(
            f"Cannot import module '{module_path}': {e}"
        ) from e

    try:
        return getattr(module, attr_name)
    except AttributeError:
        raise AttributeError(
            f"Module '{module_path}' has no attribute '{attr_name}'"
        ) from None