diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ca02864 --- /dev/null +++ b/.gitignore @@ -0,0 +1,42 @@ +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +*.egg +dist/ +build/ +*.whl + +# Virtual environments +.venv/ +venv/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +.DS_Store + +# mypy +.mypy_cache/ + +# Test / pytest +.pytest_cache/ +.coverage +htmlcov/ +test_data.jsonl +test_tool_call.jsonl +results.jsonl + +# Config secrets (never commit credentials) +llm_eval_kit.yaml +.env + +# Blog drafts +blog/ + +# Lambda deployment artifacts +deploy_package/ +*.zip diff --git a/README.md b/README.md index f030b53..be6f60e 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,22 @@ # LLM Eval Kit A Python SDK for creating custom evaluation metrics for LLM model evaluation on Sagemaker Training Job with built-in Pydantic validation. + For the official integration with AWS Sagemaker training job, please view in the [Official AWS Sagemaker Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/nova-model-evaluation.html). + ## Installation -``` +```bash git clone https://github.com/aws/llm-eval-kit.git cd llm-eval-kit -pip install . +uv venv .venv && source .venv/bin/activate +uv pip install . ``` ## Architecture The SDK provides: + - **Pydantic Validation**: Automatic input/output validation using Pydantic models - **PreProcessor**: For input data transformation with validation - **PostProcessor**: For output data formatting with validation @@ -27,6 +31,7 @@ The SDK provides: See `example/run_example.py` for a complete working example to run locally. ### Run in AWS Lambda + You need to create a lambda (follow this [guide](https://docs.aws.amazon.com/lambda/latest/dg/getting-started.html)) and upload `llm-eval-kit` as a lambda layer in order to use it. In the [github release](https://github.com/aws/llm-eval-kit/releases), you should be able to find a pre-built llm-eval-kit-layer.zip file. 
@@ -35,10 +40,11 @@ Use below command to upload custom lambda layer. ``` aws lambda publish-layer-version \ - --layer-name llm-eval-kit-layer \ - --zip-file fileb://llm-eval-kit-layer.zip \ - --compatible-runtimes python3.12 python3.11 python3.10 python3.9 +--layer-name llm-eval-kit-layer \ +--zip-file fileb://llm-eval-kit-layer.zip \ +--compatible-runtimes python3.12 python3.11 python3.10 python3.9 ``` + You need to add this layer as custom layer along with the required AWS layer: `AWSLambdaPowertoolsPythonV3-python312-arm64` (because of pydantic depencency) to your lambda. Then update your lambda code with: @@ -72,7 +78,6 @@ def postprocessor(event: dict, context) -> dict: "metric": "inverted_accuracy_custom", "value": inverted_accuracy }) - # Add more metrics here return { @@ -92,31 +97,58 @@ lambda_handler = build_lambda_handler( The SDK automatically validates: ### Preprocessing Input + ```json { - "process_type": "preprocess", - "data": { - "prompt": "what can you do?", - "gold": "Hello! How can I help you today?", - "system": "You are a helpful assistant" - } + "process_type": "preprocess", + "data": { + "prompt": "what can you do?", + "gold": "Hello! How can I help you today?", + "system": "You are a helpful assistant" + } } ``` ### Postprocessing Input + ```json { - "process_type": "postprocess", - "data": [ - { - "prompt": "what can you do", - "inference_output": "Hello! How can I help you today?", - "gold": "Hello! How can I help you today?" - } - ] + "process_type": "postprocess", + "data": [ + { + "prompt": "what can you do", + "inference_output": "Hello! How can I help you today?", + "gold": "Hello! How can I help you today?" + } + ] } ``` +## RLVR Grader Framework + +llm-eval-kit also includes a grader framework for building and deploying reward functions for Reinforcement Learning with Verifiable Rewards (RLVR) on Amazon Bedrock. This extends the SDK beyond SageMaker evaluation into RFT (Reinforcement Fine-Tuning) workflows. 
+ +Features include: + +- Built-in graders for exact match, string similarity, and BFCL tool-calling evaluation +- A `@grader` decorator for writing custom reward functions +- Dataset loaders for JSONL, BFCL, and HuggingFace Hub +- One-command Lambda deployment of graders as reward functions +- A CLI for local evaluation, validation, and deployment + +```bash +uv pip install -e ".[dev,datasets,deploy]" +``` + +For full documentation on the grader framework, see the [src/llm_eval_kit README](src/llm_eval_kit/README.md). + +| Topic | Description | +|-------|-------------| +| [Graders](docs/graders.md) | Built-in graders, writing custom graders, the `@grader` decorator | +| [Datasets](docs/datasets.md) | Loading from JSONL, BFCL, and HuggingFace Hub | +| [Lambda Deployment](docs/deploy.md) | Deploy graders as AWS Lambda reward functions for RLVR | +| [CLI Reference](docs/cli.md) | All CLI commands and options | + ## Testing ```bash @@ -130,8 +162,9 @@ python example/run_example.py ## Development ```bash -# Install in development mode -pip install -e . +# Create venv and install in development mode +uv venv .venv && source .venv/bin/activate +uv pip install -e ".[dev,datasets,deploy]" # Run tests with coverage python -m pytest tests/ --cov=llm_eval_kit diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000..c82f56e --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,67 @@ +# CLI Reference + +``` +llm-eval-kit [options] +``` + +## `evaluate` + +Run a grader over a dataset. 
+ +```bash +llm-eval-kit evaluate --grader --data [options] +``` + +| Option | Description | +|--------|-------------| +| `--grader` | Built-in grader name (`exact_match`, `string_similarity`, `tool_call`) | +| `--grader-path` | Custom grader as `module.path:function_name` | +| `--data` | Path to JSONL dataset file (required) | +| `--format` | `jsonl` (default) or `bfcl` for BFCL-formatted files | +| `--output` | Write per-sample results to a JSONL file | +| `--max-samples` | Limit number of samples to evaluate | + +Examples: + +```bash +# Built-in grader +llm-eval-kit evaluate --grader exact_match --data samples.jsonl + +# Custom grader with output +llm-eval-kit evaluate --grader-path my_module:my_grader --data samples.jsonl --output results.jsonl + +# BFCL format with sample limit +llm-eval-kit evaluate --grader tool_call --data BFCL_v3_simple.json --format bfcl --max-samples 50 +``` + +## `list-graders` + +Show all registered graders. + +```bash +llm-eval-kit list-graders +``` + +## `validate` + +Check a dataset file for schema errors. + +```bash +llm-eval-kit validate --data +``` + +## `deploy` + +Deploy a grader as an AWS Lambda function. Requires `uv pip install -e ".[deploy]"`. + +```bash +llm-eval-kit deploy --grader [options] +``` + +| Option | Description | +|--------|-------------| +| `--grader` | Built-in grader name | +| `--grader-path` | Custom grader as `module.path:function_name` | +| `--config` | Path to `llm_eval_kit.yaml` config file | + +See [deploy.md](deploy.md) for the full deployment walkthrough. diff --git a/docs/datasets.md b/docs/datasets.md new file mode 100644 index 0000000..6fac452 --- /dev/null +++ b/docs/datasets.md @@ -0,0 +1,98 @@ +# Datasets + +llm-eval-kit supports loading evaluation data from JSONL files, BFCL-formatted files, and HuggingFace Hub. 
+ +## JSONL Format + +Each line is a JSON object with `id`, `messages`, and `ground_truth`: + +```jsonl +{"id": "1", "messages": [{"role": "user", "content": "2+2?"}, {"role": "assistant", "content": "4"}], "ground_truth": "4"} +{"id": "2", "messages": [{"role": "user", "content": "Capital of France?"}, {"role": "assistant", "content": "Paris"}], "ground_truth": "Paris"} +``` + +Load from CLI: + +```bash +llm-eval-kit evaluate --grader exact_match --data samples.jsonl +``` + +Load from Python: + +```python +from llm_eval_kit.datasets.loader import load_jsonl + +dataset = load_jsonl("samples.jsonl", max_samples=100) +``` + +Validate a file before running: + +```bash +llm-eval-kit validate --data samples.jsonl +``` + +## BFCL Format + +The [Berkeley Function Calling Leaderboard](https://gorilla.cs.berkeley.edu/leaderboard.html) uses a specific JSONL format with `id`, `question` (list of message dicts), and `function` (tool definitions). + +```bash +llm-eval-kit evaluate \ + --grader tool_call \ + --data BFCL_v3_multiple.json \ + --format bfcl +``` + +```python +from llm_eval_kit.datasets.loader import load_bfcl + +dataset = load_bfcl("BFCL_v3_multiple.json", max_samples=100) +``` + +## HuggingFace Hub + +Pull datasets directly from HuggingFace. Requires `uv pip install -e ".[datasets]"`. + +```python +from llm_eval_kit.datasets.loader import load_huggingface + +dataset = load_huggingface( + "gorilla-llm/Berkeley-Function-Calling-Leaderboard", + split="train", + max_samples=50, + data_files="BFCL_v3_exec_simple.json", # pick a specific file + prompt_key="question", + ground_truth_key="ground_truth", + id_key="id", + response_key=None, +) +``` + +### Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `dataset_name` | (required) | HF dataset name (e.g. 
`"gorilla-llm/Berkeley-Function-Calling-Leaderboard"`) | +| `split` | `"train"` | Dataset split | +| `max_samples` | `None` | Limit number of samples | +| `token` | `None` | HF API token (falls back to `HF_TOKEN` env var) | +| `data_files` | `None` | Specific file(s) to load from the repo | +| `config_name` | `None` | Dataset config/subset name | +| `prompt_key` | `"prompt"` | Column name for the prompt | +| `response_key` | `"response"` | Column name for model response (`None` to skip) | +| `ground_truth_key` | `"ground_truth"` | Column name for ground truth (`None` to skip) | +| `id_key` | `"id"` | Column name for sample ID (`None` to auto-generate) | + +### BFCL on HuggingFace + +The BFCL repo has ~49 files with different schemas. You must use `data_files` to select one — loading the entire repo will fail. + +Available files include: `BFCL_v3_simple.json`, `BFCL_v3_multiple.json`, `BFCL_v3_parallel.json`, `BFCL_v3_exec_simple.json`, `BFCL_v3_live_simple.json`, and more. + +### Private/Gated Datasets + +```python +dataset = load_huggingface( + "my-org/my-private-dataset", + token="hf_...", # or set HF_TOKEN env var +) +``` diff --git a/docs/deploy.md b/docs/deploy.md new file mode 100644 index 0000000..9feffa0 --- /dev/null +++ b/docs/deploy.md @@ -0,0 +1,116 @@ +# Lambda Deployment + +Deploy any grader as an AWS Lambda function for use as a reward function in Bedrock RFT jobs. The deploy command packages your grader with all dependencies (including pydantic), creates or updates the Lambda, and wires up the handler automatically. + +Requires `uv pip install -e ".[deploy]"`. + +## 1. Create a Lambda Execution Role + +If you don't already have one: + +```bash +aws iam create-role \ + --role-name llm-eval-kit-lambda-role \ + --assume-role-policy-document '{ + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Principal": {"Service": "lambda.amazonaws.com"}, + "Action": "sts:AssumeRole" + }] + }' +``` + +## 2. 
Create a Config File + +Create `llm_eval_kit.yaml` in your project root: + +```yaml +aws: + region: us-east-1 + account_id: "123456789012" + lambda: + function_name: my-reward-function + runtime: python3.12 + timeout: 60 + memory_size: 256 + role_arn: arn:aws:iam::123456789012:role/llm-eval-kit-lambda-role +``` + +Config values can also be set via environment variables: `AWS_REGION`, `AWS_DEFAULT_REGION`, `AWS_ACCOUNT_ID`. + +## 3. Deploy + +```bash +# Deploy a built-in grader +llm-eval-kit deploy --grader exact_match + +# Deploy a custom grader from a module path +llm-eval-kit deploy --grader-path my_module:my_grader + +# Deploy with a specific config file +llm-eval-kit deploy --grader tool_call --config my_config.yaml +``` + +## 4. Test the Deployed Function + +```bash +aws lambda invoke \ + --function-name my-reward-function \ + --payload '{ + "messages": [ + {"role": "user", "content": "What is 2+2?"}, + {"role": "assistant", "content": "4"} + ], + "ground_truth": "4" + }' \ + /dev/stdout +``` + +Expected response: + +```json +{ + "statusCode": 200, + "body": { + "score": 1.0, + "reason": "Exact match", + "is_valid": true, + "metrics": {"exact_match": {"score": 1.0, "reason": "case_sensitive=False", "is_valid": true}}, + "metadata": {} + } +} +``` + +## What Happens Under the Hood + +1. Your grader + the `llm_eval_kit` package + dependencies (pydantic, etc.) are installed for the Lambda runtime and zipped into a deployment package +2. An auto-generated `handler.py` wraps your grader with Lambda-compatible request/response handling +3. The Lambda function is created or updated in your AWS account +4. 
The function accepts `{"messages": [...], "ground_truth": ..., "kwargs": {}}` payloads + +## Config Reference + +### `llm_eval_kit.yaml` + +```yaml +aws: + region: us-east-1 # AWS region + account_id: "123456789012" # AWS account ID (optional, for reference) + lambda: + function_name: my-reward-function # Lambda function name + runtime: python3.12 # Lambda runtime + handler: handler.lambda_handler # Handler path (default) + timeout: 60 # Timeout in seconds + memory_size: 256 # Memory in MB + role_arn: arn:aws:iam::... # Lambda execution role ARN + environment: {} # Extra env vars for the Lambda +``` + +### Environment Variable Overrides + +| Variable | Overrides | +|----------|-----------| +| `AWS_REGION` | `aws.region` | +| `AWS_DEFAULT_REGION` | `aws.region` | +| `AWS_ACCOUNT_ID` | `aws.account_id` | diff --git a/docs/graders.md b/docs/graders.md new file mode 100644 index 0000000..582a0e7 --- /dev/null +++ b/docs/graders.md @@ -0,0 +1,78 @@ +# Graders + +Graders are the core evaluation unit in llm-eval-kit. A grader takes a conversation (messages) and ground truth, then returns a scored `EvaluateResult`. 
+ +## Built-in Graders + +| Name | Description | +|------|-------------| +| `exact_match` | Exact string comparison (case-insensitive by default) | +| `string_similarity` | Levenshtein distance or token F1 fuzzy matching | +| `tool_call` | BFCL-style AST comparison of function calls with type coercion | + +List them from the CLI: + +```bash +llm-eval-kit list-graders +``` + +## Writing a Custom Grader + +Use the `@grader` decorator to register a function as a grader: + +```python +from llm_eval_kit.graders.decorator import grader +from llm_eval_kit.models.results import EvaluateResult + +@grader(name="my_grader", description="My custom grader") +def my_grader(messages, ground_truth, **kwargs): + response = messages[-1].content + match = response.strip().lower() == str(ground_truth).strip().lower() + return EvaluateResult( + score=1.0 if match else 0.0, + reason="Match" if match else "No match", + ) +``` + +Your function receives: +- `messages` — list of `Message` objects (role + content) +- `ground_truth` — the expected answer (str, list, or dict) +- `**kwargs` — any extra metadata from the sample + +It must return an `EvaluateResult` with at minimum a `score` (0.0–1.0). + +## Using a Custom Grader from CLI + +Point to your grader with `--grader-path`: + +```bash +llm-eval-kit evaluate \ + --grader-path my_module:my_grader \ + --data samples.jsonl +``` + +The format is `module.path:function_name`. The module must be importable from your current directory or installed in your environment. 
+ +## Grader Architecture + +- `Grader` (ABC) — base class with a `grade(messages, ground_truth, **kwargs)` method +- `@grader` decorator — wraps a plain function into a `_FunctionGrader` instance +- `GraderRegistry` — singleton that maps names to grader instances +- Built-in graders auto-register on import via `graders/__init__.py` + +## EvaluateResult + +```python +from llm_eval_kit.models.results import EvaluateResult, MetricResult + +result = EvaluateResult( + score=0.85, + reason="Partial match", + is_valid=True, + metrics={ + "name_accuracy": MetricResult(score=1.0, reason="Correct name"), + "value_accuracy": MetricResult(score=0.7, reason="2/3 values matched"), + }, + metadata={"debug": "extra info"}, +) +``` diff --git a/llm_eval_kit.example.yaml b/llm_eval_kit.example.yaml new file mode 100644 index 0000000..2e0cd4f --- /dev/null +++ b/llm_eval_kit.example.yaml @@ -0,0 +1,19 @@ +# llm-eval-kit deployment configuration +# Copy to llm_eval_kit.yaml and fill in your values. +# The CLI auto-discovers this file in the current or parent directories. + +aws: + region: us-east-1 + # account_id: "123456789012" # optional, auto-detected from creds + # profile: my-profile # AWS CLI profile name (optional) + + lambda: + function_name: llm-eval-reward-function + runtime: python3.12 + handler: handler.lambda_handler + timeout: 60 + memory_size: 256 + role_arn: arn:aws:iam::123456789012:role/your-lambda-execution-role + # layers: [] # optional Lambda layers + # environment: # extra env vars passed to the Lambda + # MY_VAR: my_value diff --git a/pyproject.toml b/pyproject.toml index d34cbc7..7d2b298 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,11 +9,11 @@ build-backend = "setuptools.build_meta" [project] name = "llm-eval-kit" license = "Apache-2.0" -version = "1.0" +version = "1.1.0" readme = "README.md" +requires-python = ">=3.10" dependencies = [ - "pydantic" - # Put your dependencies here! 
+ "pydantic>=2.0.0", ] [project.optional-dependencies] @@ -22,4 +22,17 @@ dev = [ "setuptools", "twine", "wheel", + "pytest>=7.0.0", + "mypy", ] +datasets = [ + "datasets>=2.0.0", + "huggingface-hub>=0.20.0", +] +deploy = [ + "boto3>=1.28.0", + "pyyaml>=6.0", +] + +[project.scripts] +llm-eval-kit = "llm_eval_kit.cli.main:main" diff --git a/src/llm_eval_kit/README.md b/src/llm_eval_kit/README.md new file mode 100644 index 0000000..05d8d61 --- /dev/null +++ b/src/llm_eval_kit/README.md @@ -0,0 +1,93 @@ +# llm-eval-kit — RLVR Grader Framework + +A grader framework for building evaluation functions, running them over datasets, and deploying them as AWS Lambda reward functions for RLVR workflows on Amazon Bedrock. + +## Install + +```bash +uv venv .venv +source .venv/bin/activate +uv pip install -e ".[dev]" +``` + +Optional extras: + +```bash +uv pip install -e ".[datasets]" # HuggingFace dataset support +uv pip install -e ".[deploy]" # AWS Lambda deployment +``` + +## Quick Start + +Write a grader: + +```python +from llm_eval_kit.graders.decorator import grader +from llm_eval_kit.models.results import EvaluateResult + +@grader +def my_grader(messages, ground_truth, **kwargs): + response = messages[-1].content + match = response.strip().lower() == str(ground_truth).strip().lower() + return EvaluateResult( + score=1.0 if match else 0.0, + reason="Match" if match else "No match", + ) +``` + +Run it: + +```bash +llm-eval-kit evaluate --grader exact_match --data samples.jsonl +``` + +Or from Python: + +```python +from llm_eval_kit.datasets.loader import load_jsonl +from llm_eval_kit.execution.pipeline import EvalPipeline +from llm_eval_kit.graders import exact_match_grader + +dataset = load_jsonl("samples.jsonl") +report = EvalPipeline(exact_match_grader, dataset).run_with_report() +print(report.summary()) +# Samples: 2 | Avg: 1.0000 | Min: 1.0000 | Max: 1.0000 +``` + +## Documentation + +| Topic | Description | +|-------|-------------| +| [Graders](../../docs/graders.md) | 
Built-in graders, writing custom graders, the `@grader` decorator | +| [Datasets](../../docs/datasets.md) | Loading from JSONL, BFCL, and HuggingFace Hub | +| [Lambda Deployment](../../docs/deploy.md) | Deploy graders as AWS Lambda reward functions for RLVR | +| [CLI Reference](../../docs/cli.md) | All CLI commands and options | + +## Built-in Graders + +| Name | Description | +|------|-------------| +| `exact_match` | Exact string comparison (case-insensitive by default) | +| `string_similarity` | Levenshtein distance or token F1 fuzzy matching | +| `tool_call` | BFCL-style AST comparison of function calls with type coercion | + +## Project Structure + +``` +llm_eval_kit/ +├── models/ # Pydantic data models (Message, EvaluateResult, EvalSample) +├── graders/ # Grader framework (ABC, decorator, registry) +│ └── builtins/ # Built-in grader implementations +├── datasets/ # Dataset loaders (JSONL, BFCL, HuggingFace) +├── execution/ # Evaluation pipeline and reporting +├── deploy/ # AWS Lambda deployment +├── cli/ # Command-line interface +├── utils/ # Dynamic module loading +├── processors/ # SageMaker pre/post processing (existing) +├── model/ # SageMaker payload models (existing) +└── lambda_handler.py # SageMaker Lambda handler (existing) +``` + +## License + +Apache-2.0 diff --git a/src/llm_eval_kit/__init__.py b/src/llm_eval_kit/__init__.py index e69de29..314bc04 100644 --- a/src/llm_eval_kit/__init__.py +++ b/src/llm_eval_kit/__init__.py @@ -0,0 +1,22 @@ +""" +llm-eval-kit — A Python SDK for LLM evaluation and RFT grader development. 
+ +This package provides: +- Grader framework: Define evaluation functions with @grader decorator +- Built-in graders: exact_match, string_similarity, llm_judge +- Dataset loading: JSONL and HuggingFace dataset support +- Evaluation pipeline: Run graders over datasets and collect results +- SageMaker integration: Pre/post processing for SageMaker eval jobs (Lambda) +- CLI: Command-line tools for running evaluations + +Quick start: + from llm_eval_kit.graders.decorator import grader + from llm_eval_kit.models.results import EvaluateResult + + @grader + def my_grader(messages, ground_truth, **kwargs): + # Your evaluation logic here + return EvaluateResult(score=1.0, reason="Perfect!") +""" + +__version__ = "1.1.0" diff --git a/src/llm_eval_kit/cli/__init__.py b/src/llm_eval_kit/cli/__init__.py new file mode 100644 index 0000000..a3ba1f6 --- /dev/null +++ b/src/llm_eval_kit/cli/__init__.py @@ -0,0 +1 @@ +"""CLI package.""" diff --git a/src/llm_eval_kit/cli/main.py b/src/llm_eval_kit/cli/main.py new file mode 100644 index 0000000..d459fd8 --- /dev/null +++ b/src/llm_eval_kit/cli/main.py @@ -0,0 +1,183 @@ +"""CLI entry point for llm-eval-kit.""" +import argparse +import json +import sys +from pathlib import Path + +from llm_eval_kit.datasets.loader import load_bfcl, load_jsonl +from llm_eval_kit.execution.pipeline import EvalPipeline +from llm_eval_kit.graders.registry import default_registry +from llm_eval_kit.models.datasets import EvalSample +from llm_eval_kit.utils.module_loader import load_function + +# Top-level imports so graders are registered once +import llm_eval_kit.graders # noqa: F401 + + +def _cmd_evaluate(args): + if args.grader: + try: + g = default_registry.get(args.grader) + except KeyError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + elif args.grader_path: + try: + g = load_function(args.grader_path) + except (ImportError, AttributeError) as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + else: + print("Error: --grader or 
--grader-path required", + file=sys.stderr) + sys.exit(1) + + if not Path(args.data).exists(): + print(f"Error: not found: {args.data}", file=sys.stderr) + sys.exit(1) + + if args.format == "bfcl": + ds = load_bfcl(args.data, max_samples=args.max_samples) + else: + ds = load_jsonl(args.data, max_samples=args.max_samples) + + report = EvalPipeline(g, ds).run_with_report() + print(report.summary()) + if args.output: + report.to_jsonl(args.output) + print(f"Results written to {args.output}") + + +def _cmd_list_graders(): + for name in default_registry.list_graders(): + g = default_registry.get(name) + print(f" {name}: {g.description}") + + +def _cmd_validate(args): + if not Path(args.data).exists(): + print(f"Error: not found: {args.data}", file=sys.stderr) + sys.exit(1) + errors = [] + count = 0 + with open(args.data) as f: + for ln, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + EvalSample(**json.loads(line)) + count += 1 + except Exception as e: + errors.append((ln, str(e))) + if errors: + for ln, err in errors: + print(f" Line {ln}: {err}", file=sys.stderr) + sys.exit(1) + print(f"Valid: {count} samples") + + +def _cmd_deploy(args): + """Deploy a grader as an AWS Lambda reward function.""" + try: + from llm_eval_kit.deploy.lambda_deploy import deploy_grader + from llm_eval_kit.deploy.config import load_deploy_config + except ImportError as e: + print(f"Error: {e}", file=sys.stderr) + print("Install deploy extras: pip install llm-eval-kit[deploy]", + file=sys.stderr) + sys.exit(1) + + # Resolve grader reference + if args.grader: + grader_ref = ( + f"llm_eval_kit.graders.builtins.{args.grader}" + f":{args.grader}_grader" + ) + elif args.grader_path: + grader_ref = args.grader_path + else: + print("Error: --grader or --grader-path required", file=sys.stderr) + sys.exit(1) + + try: + config = load_deploy_config(args.config) + # CLI flags override config file / env vars + if args.profile: + config.aws.profile = args.profile + if 
args.region: + config.aws.region = args.region + if args.role_arn: + config.aws.lambda_config.role_arn = args.role_arn + if args.function_name: + config.aws.lambda_config.function_name = ( + args.function_name + ) + result = deploy_grader(grader_ref, config=config) + print(f"Deployed: {result['function_name']}") + print(f" ARN: {result['function_arn']}") + print(f" Region: {result['region']}") + print(f" Grader: {result['grader_ref']}") + except ImportError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Deploy failed: {e}", file=sys.stderr) + sys.exit(1) + + +def main(): + """Entry point for llm-eval-kit CLI.""" + parser = argparse.ArgumentParser( + prog="llm-eval-kit", + description="LLM Evaluation Toolkit", + ) + sub = parser.add_subparsers(dest="command") + + ep = sub.add_parser("evaluate", help="Run evaluation") + ep.add_argument("--grader", type=str) + ep.add_argument("--grader-path", type=str) + ep.add_argument("--data", required=True) + ep.add_argument("--output", type=str) + ep.add_argument("--max-samples", type=int) + ep.add_argument("--format", choices=["bfcl", "jsonl"], + default="jsonl") + + sub.add_parser("list-graders", help="List graders") + + vp = sub.add_parser("validate", help="Validate dataset") + vp.add_argument("--data", required=True) + + dp = sub.add_parser("deploy", help="Deploy grader as Lambda") + dp.add_argument("--grader", type=str, + help="Built-in grader name (e.g. exact_match)") + dp.add_argument("--grader-path", type=str, + help="Module path (e.g. 
my_module:my_grader)") + dp.add_argument("--config", type=str, + help="Path to llm_eval_kit.yaml config file") + dp.add_argument("--profile", type=str, + help="AWS profile name (from ~/.aws/credentials)") + dp.add_argument("--region", type=str, + help="AWS region (overrides config/env)") + dp.add_argument("--role-arn", type=str, + help="IAM role ARN for the Lambda function") + dp.add_argument("--function-name", type=str, + help="Lambda function name (default: " + "llm-eval-reward-function)") + + args = parser.parse_args() + if args.command == "evaluate": + _cmd_evaluate(args) + elif args.command == "list-graders": + _cmd_list_graders() + elif args.command == "validate": + _cmd_validate(args) + elif args.command == "deploy": + _cmd_deploy(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/llm_eval_kit/datasets/__init__.py b/src/llm_eval_kit/datasets/__init__.py new file mode 100644 index 0000000..d06db58 --- /dev/null +++ b/src/llm_eval_kit/datasets/__init__.py @@ -0,0 +1,12 @@ +"""Dataset loading and formatting utilities.""" +from .loader import load_jsonl, load_bfcl, load_huggingface +from .formatter import export_rft_jsonl, upload_to_s3, SplitResult + +__all__ = [ + "load_jsonl", + "load_bfcl", + "load_huggingface", + "export_rft_jsonl", + "upload_to_s3", + "SplitResult", +] diff --git a/src/llm_eval_kit/datasets/formatter.py b/src/llm_eval_kit/datasets/formatter.py new file mode 100644 index 0000000..a320596 --- /dev/null +++ b/src/llm_eval_kit/datasets/formatter.py @@ -0,0 +1,266 @@ +""" +RFT dataset formatters — convert EvalDataset to Bedrock RFT training formats. + +Two output formats supported: + - Bedrock API: for create_model_customization_job (uploads to S3) + - OpenAI-compatible: for client.files.create (uploads via API) + +Both share the same core schema (messages + ground_truth) but differ +in metadata fields and how ground_truth is structured. 
+""" +import json +import logging +import random +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from llm_eval_kit.models.datasets import EvalDataset, EvalSample + +logger = logging.getLogger(__name__) + + +@dataclass +class SplitResult: + """Result of a train/val/test split with file paths.""" + + train_path: str + train_size: int + val_path: Optional[str] = None + val_size: int = 0 + test_path: Optional[str] = None + test_size: int = 0 + paths: Dict[str, str] = field(default_factory=dict) + + def summary(self) -> str: + parts = [f"train={self.train_size}"] + if self.val_size: + parts.append(f"val={self.val_size}") + if self.test_size: + parts.append(f"test={self.test_size}") + return " | ".join(parts) + + +def format_for_bedrock( + sample: EvalSample, + system_prompt: Optional[str] = None, + domain: Optional[str] = None, + data_source: Optional[str] = None, + split_name: str = "train", + index: int = 0, +) -> dict: + """ + Format a single EvalSample for the Bedrock API RFT schema. + + Output schema: + { + "messages": [{"role": ..., "content": ...}, ...], + "metadata": {"ground_truth": ...}, + "task_id": "...", + "domain": "...", + "data_source": "..." 
+ } + """ + messages = [] + + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + for msg in sample.messages: + messages.append({"role": msg.role, "content": msg.content}) + + row: Dict[str, Any] = { + "messages": messages, + "metadata": { + "ground_truth": sample.ground_truth, + }, + } + + # Optional metadata fields + task_id = sample.id or f"{split_name}_{index}" + row["task_id"] = task_id + + if domain: + row["domain"] = domain + if data_source: + row["data_source"] = data_source + + # Pass through tool_definitions if present + if "tool_definitions" in sample.metadata: + row["metadata"]["tool_definitions"] = ( + sample.metadata["tool_definitions"] + ) + + return row + + +def format_for_openai( + sample: EvalSample, + system_prompt: Optional[str] = None, +) -> dict: + """ + Format a single EvalSample for the OpenAI-compatible RFT schema. + + Output schema: + { + "messages": [{"role": ..., "content": ...}, ...], + "ground_truth": "..." + } + + The OpenAI-compatible path uses client.files.create() to upload, + so no S3 or task_id/domain fields are needed. + """ + messages = [] + + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + + for msg in sample.messages: + messages.append({"role": msg.role, "content": msg.content}) + + # Ground truth — flatten to string if it's a list with one item + gt = sample.ground_truth + if isinstance(gt, list) and len(gt) == 1: + gt = str(gt[0]) + elif isinstance(gt, list): + gt = json.dumps(gt) + + return { + "messages": messages, + "ground_truth": gt, + } + + +def export_rft_jsonl( + dataset: EvalDataset, + output_dir: str, + fmt: str = "bedrock", + system_prompt: Optional[str] = None, + domain: Optional[str] = None, + data_source: Optional[str] = None, + train_ratio: float = 0.8, + val_ratio: float = 0.1, + seed: int = 42, + shuffle: bool = True, +) -> SplitResult: + """ + Export an EvalDataset to RFT-formatted JSONL files with + train/val/test split. 
+ + Args: + dataset: The EvalDataset to export. + output_dir: Directory to write JSONL files into. + fmt: "bedrock" for Bedrock API or "openai" for + OpenAI-compatible API. + system_prompt: Optional system message prepended to each + sample's messages. + domain: Domain tag (Bedrock format only). + data_source: Data source tag (Bedrock format only). + train_ratio: Fraction of data for training (default 0.8). + val_ratio: Fraction for validation (default 0.1). + Remainder goes to test. + seed: Random seed for shuffling. + shuffle: Whether to shuffle before splitting. + + Returns: + SplitResult with file paths and counts. + """ + out = Path(output_dir) + out.mkdir(parents=True, exist_ok=True) + + samples = list(dataset) + if shuffle: + rng = random.Random(seed) + rng.shuffle(samples) + + total = len(samples) + train_size = int(total * train_ratio) + val_size = int(total * val_ratio) + test_size = total - train_size - val_size + + splits: List[Tuple[str, List[EvalSample]]] = [ + ("train", samples[:train_size]), + ] + if val_size > 0: + splits.append( + ("val", samples[train_size:train_size + val_size]) + ) + if test_size > 0: + splits.append(("test", samples[train_size + val_size:])) + + paths: Dict[str, str] = {} + for split_name, split_samples in splits: + path = out / f"{split_name}.jsonl" + with open(path, "w", encoding="utf-8") as f: + for i, sample in enumerate(split_samples): + if fmt == "openai": + row = format_for_openai( + sample, + system_prompt=system_prompt, + ) + else: + row = format_for_bedrock( + sample, + system_prompt=system_prompt, + domain=domain, + data_source=data_source, + split_name=split_name, + index=i, + ) + f.write(json.dumps(row) + "\n") + paths[split_name] = str(path) + logger.info( + "Wrote %d samples to %s", len(split_samples), path, + ) + + return SplitResult( + train_path=paths["train"], + train_size=train_size, + val_path=paths.get("val"), + val_size=val_size, + test_path=paths.get("test"), + test_size=test_size, + paths=paths, + ) + 
+ +def upload_to_s3( + split_result: SplitResult, + bucket: str, + prefix: str, + session=None, +) -> Dict[str, str]: + """ + Upload split JSONL files to S3. + + Args: + split_result: Output from export_rft_jsonl. + bucket: S3 bucket name. + prefix: S3 key prefix (e.g. "rft-data/bfcl"). + session: Optional boto3.Session. Uses default if None. + + Returns: + Dict mapping split name to S3 URI. + """ + try: + import boto3 + except ImportError: + raise ImportError( + "boto3 not installed. " + "Run: uv pip install -e \".[deploy]\"" + ) + + if session is None: + session = boto3.Session() + + s3 = session.client("s3") + uris: Dict[str, str] = {} + + for split_name, local_path in split_result.paths.items(): + key = f"{prefix.rstrip('/')}/{split_name}.jsonl" + s3.upload_file(local_path, bucket, key) + uri = f"s3://{bucket}/{key}" + uris[split_name] = uri + logger.info("Uploaded %s → %s", local_path, uri) + + return uris diff --git a/src/llm_eval_kit/datasets/loader.py b/src/llm_eval_kit/datasets/loader.py new file mode 100644 index 0000000..cee3ba2 --- /dev/null +++ b/src/llm_eval_kit/datasets/loader.py @@ -0,0 +1,207 @@ +""" +Dataset loaders — load evaluation samples from various sources. + +load_jsonl: generic JSONL where each line maps to EvalSample fields. +load_bfcl: BFCL-specific JSONL with field mapping for the Berkeley + Function Calling Leaderboard dataset. +""" +import json +import logging +from typing import Optional + +from llm_eval_kit.models.messages import Message +from llm_eval_kit.models.datasets import EvalDataset, EvalSample + +logger = logging.getLogger(__name__) + + +def load_jsonl( + path: str, max_samples: Optional[int] = None +) -> EvalDataset: + """Load a generic JSONL file into an EvalDataset.""" + return EvalDataset.from_jsonl(path, max_samples=max_samples) + + +def load_bfcl( + path: str, max_samples: Optional[int] = None +) -> EvalDataset: + """ + Load a BFCL JSONL file with field mapping. 
+ + BFCL format (each line): + - "id": unique identifier + - "question": list of message dicts (the user prompt) + - "function": list of tool definition dicts (JSON schemas) + - ground truth: varies by file, often a separate answer file + + Note: BFCL files are NOT compatible with HuggingFace load_dataset. + """ + samples = [] + with open(path) as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + logger.warning("Skipping malformed JSON at line %d", line_num) + continue + + if "id" not in data or "question" not in data: + logger.warning( + "Skipping line %d: missing 'id' or 'question'", line_num + ) + continue + + # Map BFCL fields to EvalSample + question = data["question"] + if isinstance(question, list): + messages = [Message(**msg) if isinstance(msg, dict) else msg for msg in question] + else: + # Some BFCL entries have question as a string + messages = [Message(role="user", content=str(question))] + + sample = EvalSample( + id=str(data["id"]), + messages=messages, + ground_truth=data.get("ground_truth"), + metadata={ + "tool_definitions": data.get("function", []), + }, + ) + samples.append(sample) + + if max_samples and len(samples) >= max_samples: + break + + return EvalDataset(samples) + + + +def load_huggingface( + dataset_name: str, + split: str = "train", + max_samples: Optional[int] = None, + token: Optional[str] = None, + prompt_key: str = "prompt", + response_key: str = "response", + ground_truth_key: Optional[str] = "ground_truth", + id_key: Optional[str] = "id", + config_name: Optional[str] = None, + data_files: Optional[str] = None, +) -> EvalDataset: + """ + Load a dataset from HuggingFace Hub. + + Requires: pip install llm-eval-kit[datasets] + + Args: + dataset_name: HF dataset name + (e.g. 
"gorilla-llm/Berkeley-Function-Calling-Leaderboard") + split: Dataset split (default: "train") + max_samples: Max samples to load (None = all) + token: HuggingFace API token. Falls back to HF_TOKEN env var. + prompt_key: Column name containing the prompt/question + response_key: Column name for model response (None to skip) + ground_truth_key: Column name for ground truth (None to skip) + id_key: Column name for sample ID (None to auto-generate) + config_name: Dataset config/subset name (for multi-config + datasets) + data_files: Specific file(s) to load from the repo + (e.g. "BFCL_v3_simple.json"). Useful when a HF repo + contains multiple files with different schemas. + """ + try: + from datasets import load_dataset + except ImportError: + raise ImportError( + "HuggingFace datasets not installed. " + "Run: uv pip install -e \".[datasets]\"" + ) + + import os + hf_token = token or os.environ.get("HF_TOKEN") + + # Build kwargs for load_dataset + load_kwargs = { + "split": split, + "token": hf_token, + } + if config_name: + load_kwargs["name"] = config_name + if data_files: + load_kwargs["data_files"] = data_files + + logger.info( + "Loading %s (split=%s%s) from HuggingFace...", + dataset_name, + split, + f", file={data_files}" if data_files else "", + ) + ds = load_dataset(dataset_name, **load_kwargs) + + samples = [] + for i, row in enumerate(ds): + if max_samples and i >= max_samples: + break + + # Build sample ID + sample_id = str(row.get(id_key, i)) if id_key else str(i) + + # Build messages from available columns + messages = [] + if prompt_key and prompt_key in row: + prompt = row[prompt_key] + if isinstance(prompt, list): + # Handle nested lists (e.g. 
BFCL "question" + # is list[list[message_dict]]) + flat = prompt + if ( + flat + and isinstance(flat[0], list) + ): + flat = flat[0] + for msg in flat: + if isinstance(msg, dict): + messages.append(Message(**msg)) + else: + messages.append( + Message(role="user", content=str(msg)) + ) + else: + messages.append( + Message(role="user", content=str(prompt)) + ) + + if response_key and response_key in row: + messages.append( + Message( + role="assistant", + content=str(row[response_key]), + ) + ) + + # Ground truth + gt = row.get(ground_truth_key) if ground_truth_key else None + + # Collect remaining columns as metadata + skip_keys = { + prompt_key, response_key, ground_truth_key, id_key, + } + metadata = { + k: v for k, v in row.items() if k not in skip_keys + } + + samples.append(EvalSample( + id=sample_id, + messages=messages, + ground_truth=gt, + metadata=metadata, + )) + + logger.info( + "Loaded %d samples from %s", len(samples), dataset_name, + ) + return EvalDataset(samples) + diff --git a/src/llm_eval_kit/deploy/__init__.py b/src/llm_eval_kit/deploy/__init__.py new file mode 100644 index 0000000..5f104b9 --- /dev/null +++ b/src/llm_eval_kit/deploy/__init__.py @@ -0,0 +1,5 @@ +"""AWS Lambda deployment for grader/reward functions.""" +from .config import DeployConfig, load_deploy_config +from .lambda_deploy import deploy_grader + +__all__ = ["DeployConfig", "load_deploy_config", "deploy_grader"] diff --git a/src/llm_eval_kit/deploy/config.py b/src/llm_eval_kit/deploy/config.py new file mode 100644 index 0000000..df041c7 --- /dev/null +++ b/src/llm_eval_kit/deploy/config.py @@ -0,0 +1,103 @@ +""" +Deployment configuration — YAML-based config for AWS Lambda deployment. + +Reads from llm_eval_kit.yaml or a user-specified path. +Falls back to environment variables for AWS credentials. 
+""" +import os +import logging +from typing import Dict, Optional + +from pydantic import BaseModel, Field + +logger = logging.getLogger(__name__) + +CONFIG_FILE_NAME = "llm_eval_kit.yaml" + + +class LambdaConfig(BaseModel): + """Lambda function configuration.""" + function_name: str = "llm-eval-reward-function" + runtime: str = "python3.12" + handler: str = "handler.lambda_handler" + timeout: int = 60 + memory_size: int = 256 + role_arn: Optional[str] = None + layers: list = Field(default_factory=list) + environment: Dict[str, str] = Field(default_factory=dict) + + +class AWSConfig(BaseModel): + """AWS account configuration.""" + region: str = "us-east-1" + account_id: Optional[str] = None + profile: Optional[str] = None + lambda_config: LambdaConfig = Field( + default_factory=LambdaConfig, + alias="lambda", + ) + + model_config = {"populate_by_name": True} + + +class DeployConfig(BaseModel): + """Top-level deployment configuration.""" + aws: AWSConfig = Field(default_factory=AWSConfig) + + +def load_deploy_config( + config_path: Optional[str] = None, +) -> DeployConfig: + """ + Load deployment config from YAML file. + + Search order: + 1. Explicit config_path argument + 2. llm_eval_kit.yaml in current directory + 3. Walk up parent directories + 4. Fall back to defaults + env vars + """ + try: + import yaml + except ImportError: + raise ImportError( + "PyYAML not installed. 
Run: uv pip install -e \".[deploy]\"" + ) + + # Find config file + if config_path is None: + config_path = _find_config_file() + + if config_path and os.path.isfile(config_path): + logger.info("Loading config from %s", config_path) + with open(config_path) as f: + raw = yaml.safe_load(f) or {} + config = DeployConfig(**raw) + else: + logger.info("No config file found, using defaults") + config = DeployConfig() + + # Override from environment variables + if not config.aws.account_id: + config.aws.account_id = os.environ.get("AWS_ACCOUNT_ID") + if os.environ.get("AWS_DEFAULT_REGION"): + config.aws.region = os.environ["AWS_DEFAULT_REGION"] + if os.environ.get("AWS_REGION"): + config.aws.region = os.environ["AWS_REGION"] + if not config.aws.profile: + config.aws.profile = os.environ.get("AWS_PROFILE") + + return config + + +def _find_config_file() -> Optional[str]: + """Walk up from CWD looking for llm_eval_kit.yaml.""" + current = os.path.abspath(os.getcwd()) + while True: + candidate = os.path.join(current, CONFIG_FILE_NAME) + if os.path.isfile(candidate): + return candidate + parent = os.path.dirname(current) + if parent == current: + return None + current = parent diff --git a/src/llm_eval_kit/deploy/lambda_deploy.py b/src/llm_eval_kit/deploy/lambda_deploy.py new file mode 100644 index 0000000..5824ed1 --- /dev/null +++ b/src/llm_eval_kit/deploy/lambda_deploy.py @@ -0,0 +1,412 @@ +""" +AWS Lambda deployment for grader functions. + +Packages a grader as a Lambda function and deploys it using boto3. +The deployed Lambda accepts a JSON payload with messages and ground_truth, +runs the grader, and returns the EvaluateResult. 
+""" +import io +import logging +import zipfile +from pathlib import Path +from typing import Optional + +from .config import DeployConfig, load_deploy_config + +logger = logging.getLogger(__name__) + +# Template for the Lambda handler that wraps a grader +HANDLER_TEMPLATE = '''"""Auto-generated Lambda handler for llm-eval-kit grader.""" +import json +import sys +import os + +# Add the package to the path +sys.path.insert(0, os.path.dirname(__file__)) + +from llm_eval_kit.models.messages import Message +from llm_eval_kit.models.results import EvaluateResult +from llm_eval_kit.utils.module_loader import load_function + + +# Load the grader at cold start +_GRADER_REF = os.environ.get("GRADER_REF", "{grader_ref}") +_grader = load_function(_GRADER_REF) + + +def lambda_handler(event, context): + """ + Lambda handler for reward function evaluation. + + Expected payload: + {{ + "messages": [{{"role": "user", "content": "..."}}, ...], + "ground_truth": "expected answer" | ["call1()", "call2()"], + "kwargs": {{}} // optional extra args + }} + """ + try: + body = event if isinstance(event, dict) else json.loads(event) + + raw_messages = body.get("messages", []) + messages = [Message(**m) for m in raw_messages] + ground_truth = body.get("ground_truth") + kwargs = body.get("kwargs", {{}}) + + result = _grader.grade(messages, ground_truth, **kwargs) + + return {{ + "statusCode": 200, + "body": result.to_dict(), + }} + except Exception as e: + return {{ + "statusCode": 500, + "body": {{"error": str(e)}}, + }} +''' + + +def _build_deployment_package(grader_ref: str) -> bytes: + """ + Build a Lambda deployment zip containing: + - handler.py (generated from template) + - The llm_eval_kit package + - Third-party dependencies (pydantic, etc.) 
installed + for the Lambda runtime platform + """ + import shutil + import subprocess + import sys + import tempfile + + buf = io.BytesIO() + + # Find the llm_eval_kit package directory + import llm_eval_kit + pkg_dir = Path(llm_eval_kit.__file__).parent + + # Install dependencies into a temp dir for bundling. + # Prefer uv for speed; fall back to pip if uv isn't available. + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + logger.info("Installing dependencies into package...") + + pip_args = [ + "--target", str(tmp_path), + "--platform", "manylinux2014_x86_64", + "--implementation", "cp", + "--python-version", "3.12", + "--only-binary=:all:", + "--quiet", + "pydantic>=2.0.0", + "pydantic-core", + "annotated-types", + "typing_extensions", + ] + + uv_bin = shutil.which("uv") + if uv_bin: + subprocess.check_call( + [uv_bin, "pip", "install"] + pip_args, + stderr=subprocess.STDOUT, + ) + else: + subprocess.check_call( + [sys.executable, "-m", "pip", "install"] + pip_args, + stderr=subprocess.STDOUT, + ) + + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + # 1. Write the handler + handler_code = HANDLER_TEMPLATE.format( + grader_ref=grader_ref, + ) + zf.writestr("handler.py", handler_code) + + # 2. Bundle the llm_eval_kit package + for file_path in pkg_dir.rglob("*.py"): + arcname = str( + file_path.relative_to(pkg_dir.parent) + ) + zf.writestr(arcname, file_path.read_text()) + + # 3. Bundle pip-installed dependencies + for file_path in tmp_path.rglob("*"): + if file_path.is_file(): + arcname = str( + file_path.relative_to(tmp_path) + ) + zf.writestr( + arcname, file_path.read_bytes(), + ) + + buf.seek(0) + return buf.read() + + +def deploy_grader( + grader_ref: str, + config: Optional[DeployConfig] = None, + config_path: Optional[str] = None, +) -> dict: + """ + Deploy a grader as an AWS Lambda function. + + Args: + grader_ref: Module path to the grader + (e.g. 
"llm_eval_kit.graders.builtins.exact_match:exact_match_grader") + config: DeployConfig instance (loaded from YAML if not provided) + config_path: Path to config YAML file + + Returns: + dict with deployment info (function_name, function_arn, etc.) + """ + try: + import boto3 + import botocore.exceptions + except ImportError: + raise ImportError( + "boto3 not installed. Run: uv pip install -e \".[deploy]\"" + ) + + if config is None: + config = load_deploy_config(config_path) + + lc = config.aws.lambda_config + region = config.aws.region + profile = config.aws.profile + + # Build a session — supports named profiles, env vars, SSO, + # instance roles, and the full default credential chain. + session = boto3.Session( + profile_name=profile, + region_name=region, + ) + + # Validate credentials before doing any real work + try: + sts = session.client("sts") + identity = sts.get_caller_identity() + logger.info( + "Authenticated as %s (account %s)", + identity["Arn"], identity["Account"], + ) + except botocore.exceptions.NoCredentialsError: + raise RuntimeError( + "No AWS credentials found. Set them up using one of:\n" + " 1. aws configure " + "(writes ~/.aws/credentials)\n" + " 2. aws configure sso " + "(SSO login)\n" + " 3. Environment variables " + "(AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY)\n" + " 4. --profile flag or " + "aws.profile in llm_eval_kit.yaml" + ) + except botocore.exceptions.ClientError as e: + raise RuntimeError( + f"AWS credential check failed: {e}\n" + "Run 'aws sts get-caller-identity' to debug." 
+ ) + + logger.info("Building deployment package for %s...", grader_ref) + zip_bytes = _build_deployment_package(grader_ref) + logger.info("Package size: %.1f KB", len(zip_bytes) / 1024) + + # Merge grader ref into environment + env_vars = {**lc.environment, "GRADER_REF": grader_ref} + + client = session.client("lambda") + + # Check if function exists + try: + client.get_function(FunctionName=lc.function_name) + exists = True + except client.exceptions.ResourceNotFoundException: + exists = False + + if exists: + logger.info("Updating existing function: %s", lc.function_name) + client.update_function_code( + FunctionName=lc.function_name, + ZipFile=zip_bytes, + ) + # Wait for update to complete before updating config + waiter = client.get_waiter("function_updated_v2") + waiter.wait(FunctionName=lc.function_name) + + client.update_function_configuration( + FunctionName=lc.function_name, + Runtime=lc.runtime, + Handler=lc.handler, + Timeout=lc.timeout, + MemorySize=lc.memory_size, + Environment={"Variables": env_vars}, + ) + response = client.get_function(FunctionName=lc.function_name) + arn = response["Configuration"]["FunctionArn"] + else: + if not lc.role_arn: + raise ValueError( + "role_arn is required to create a new Lambda function. " + "Set it in llm_eval_kit.yaml under aws.lambda.role_arn " + "or provide an existing function name to update." 
+ ) + logger.info("Creating new function: %s", lc.function_name) + response = client.create_function( + FunctionName=lc.function_name, + Runtime=lc.runtime, + Role=lc.role_arn, + Handler=lc.handler, + Code={"ZipFile": zip_bytes}, + Timeout=lc.timeout, + MemorySize=lc.memory_size, + Environment={"Variables": env_vars}, + ) + arn = response["FunctionArn"] + + result = { + "function_name": lc.function_name, + "function_arn": arn, + "region": region, + "grader_ref": grader_ref, + } + logger.info("Deployed: %s (%s)", lc.function_name, arn) + return result + + +def deploy_reward_function( + source_file: str, + function_name: str, + role_arn: str, + handler: Optional[str] = None, + runtime: str = "python3.12", + timeout: int = 300, + memory_size: int = 512, + region: Optional[str] = None, + profile: Optional[str] = None, +) -> dict: + """ + Deploy a standalone reward function .py file as a Lambda. + + This is for zero-dependency reward functions that follow the + Bedrock RFT batch contract (receive list, return list with + id + aggregate_reward_score + reward_components). + + Unlike deploy_grader(), this does NOT bundle llm_eval_kit or + pydantic — it just zips the single .py file and deploys it. + + Args: + source_file: Path to the .py reward function file. + function_name: Lambda function name. + role_arn: IAM role ARN for the Lambda. + handler: Lambda handler string. Defaults to + ".lambda_handler". + runtime: Lambda runtime (default python3.12). + timeout: Timeout in seconds (default 300). + memory_size: Memory in MB (default 512). + region: AWS region (default from env/config). + profile: AWS profile name. + + Returns: + dict with function_name, function_arn, region. + """ + try: + import boto3 + import botocore.exceptions + except ImportError: + raise ImportError( + "boto3 not installed. 
" + "Run: uv pip install -e \".[deploy]\"" + ) + + from pathlib import Path as _Path + + src = _Path(source_file) + if not src.is_file(): + raise FileNotFoundError(f"Reward function not found: {src}") + + module_name = src.stem + if handler is None: + handler = f"{module_name}.lambda_handler" + + # Build zip with just the single file + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr(f"{module_name}.py", src.read_text()) + buf.seek(0) + zip_bytes = buf.read() + + logger.info( + "Package: %s (%.1f KB)", + module_name, len(zip_bytes) / 1024, + ) + + session = boto3.Session( + profile_name=profile, + region_name=region, + ) + + # Validate credentials + try: + sts = session.client("sts") + identity = sts.get_caller_identity() + logger.info( + "Authenticated as %s", identity["Arn"], + ) + except botocore.exceptions.NoCredentialsError: + raise RuntimeError( + "No AWS credentials found. " + "Run 'aws configure' or set env vars." + ) + + client = session.client("lambda") + + try: + client.get_function(FunctionName=function_name) + exists = True + except client.exceptions.ResourceNotFoundException: + exists = False + + if exists: + logger.info("Updating: %s", function_name) + client.update_function_code( + FunctionName=function_name, + ZipFile=zip_bytes, + ) + waiter = client.get_waiter("function_updated_v2") + waiter.wait(FunctionName=function_name) + client.update_function_configuration( + FunctionName=function_name, + Runtime=runtime, + Handler=handler, + Timeout=timeout, + MemorySize=memory_size, + ) + resp = client.get_function(FunctionName=function_name) + arn = resp["Configuration"]["FunctionArn"] + else: + logger.info("Creating: %s", function_name) + resp = client.create_function( + FunctionName=function_name, + Runtime=runtime, + Role=role_arn, + Handler=handler, + Code={"ZipFile": zip_bytes}, + Timeout=timeout, + MemorySize=memory_size, + ) + arn = resp["FunctionArn"] + + # Wait for function to be active + waiter = 
client.get_waiter("function_active_v2") + waiter.wait(FunctionName=function_name) + + result = { + "function_name": function_name, + "function_arn": arn, + "region": session.region_name, + } + logger.info("Deployed: %s (%s)", function_name, arn) + return result diff --git a/src/llm_eval_kit/execution/__init__.py b/src/llm_eval_kit/execution/__init__.py new file mode 100644 index 0000000..3b23571 --- /dev/null +++ b/src/llm_eval_kit/execution/__init__.py @@ -0,0 +1,4 @@ +"""Evaluation execution pipeline.""" +from .pipeline import EvalPipeline, EvalReport + +__all__ = ["EvalPipeline", "EvalReport"] diff --git a/src/llm_eval_kit/execution/pipeline.py b/src/llm_eval_kit/execution/pipeline.py new file mode 100644 index 0000000..0f9d939 --- /dev/null +++ b/src/llm_eval_kit/execution/pipeline.py @@ -0,0 +1,81 @@ +""" +Evaluation pipeline — runs graders over datasets and collects results. + +EvalPipeline is a plain class (composition: has a grader + dataset). +EvalReport is a @dataclass (internal summary container). 
+""" +import json +import logging +from dataclasses import dataclass, field +from typing import List + +from llm_eval_kit.graders.base import Grader +from llm_eval_kit.models.datasets import EvalDataset +from llm_eval_kit.models.results import EvaluateResult + +logger = logging.getLogger(__name__) + + +@dataclass +class EvalReport: + """Summary of an evaluation run.""" + + total_samples: int + avg_score: float + min_score: float + max_score: float + results: List[EvaluateResult] = field(repr=False) + + def to_jsonl(self, path: str) -> None: + """Write each EvaluateResult as a JSON line.""" + with open(path, "w") as f: + for result in self.results: + f.write(json.dumps(result.to_dict()) + "\n") + + def summary(self) -> str: + return ( + f"Samples: {self.total_samples} | " + f"Avg: {self.avg_score:.4f} | " + f"Min: {self.min_score:.4f} | " + f"Max: {self.max_score:.4f}" + ) + + +class EvalPipeline: + """Orchestrates running a grader over a dataset.""" + + def __init__(self, grader: Grader, dataset: EvalDataset) -> None: + self.grader = grader + self.dataset = dataset + + def run(self) -> List[EvaluateResult]: + results = [] + for sample in self.dataset: + try: + result = self.grader.grade( + messages=sample.messages, + ground_truth=sample.ground_truth, + **sample.metadata, + ) + results.append(result) + except Exception as e: + logger.warning("Grader failed on sample %s: %s", sample.id, e) + results.append( + EvaluateResult( + score=0.0, + is_valid=False, + reason=f"Error: {e}", + ) + ) + return results + + def run_with_report(self) -> EvalReport: + results = self.run() + scores = [r.score for r in results] + return EvalReport( + total_samples=len(results), + avg_score=sum(scores) / len(scores) if scores else 0.0, + min_score=min(scores) if scores else 0.0, + max_score=max(scores) if scores else 0.0, + results=results, + ) diff --git a/src/llm_eval_kit/graders/__init__.py b/src/llm_eval_kit/graders/__init__.py new file mode 100644 index 0000000..c0239e2 --- /dev/null 
+++ b/src/llm_eval_kit/graders/__init__.py @@ -0,0 +1,26 @@ +""" +Grader framework — the core evaluation engine of llm-eval-kit. + +Importing this package auto-populates the default_registry with built-in graders. +""" +from .base import Grader +from .decorator import grader +from .registry import GraderRegistry, default_registry +from .builtins.exact_match import exact_match_grader +from .builtins.string_similarity import string_similarity_grader +from .builtins.tool_call import tool_call_grader + +# Auto-register built-in graders +default_registry.register("exact_match", exact_match_grader) +default_registry.register("string_similarity", string_similarity_grader) +default_registry.register("tool_call", tool_call_grader) + +__all__ = [ + "Grader", + "grader", + "GraderRegistry", + "default_registry", + "exact_match_grader", + "string_similarity_grader", + "tool_call_grader", +] diff --git a/src/llm_eval_kit/graders/base.py b/src/llm_eval_kit/graders/base.py new file mode 100644 index 0000000..add7e0a --- /dev/null +++ b/src/llm_eval_kit/graders/base.py @@ -0,0 +1,39 @@ +"""Abstract base class for all graders.""" +from abc import ABC, abstractmethod +from typing import Any, List, Optional, Union + +from llm_eval_kit.models.messages import Message +from llm_eval_kit.models.results import EvaluateResult + + +class Grader(ABC): + """ + Interface that all graders must implement. + + Subclasses provide name, description, and grade(). + Every Grader is callable — __call__ delegates to grade(). + """ + + @property + @abstractmethod + def name(self) -> str: ... + + @property + @abstractmethod + def description(self) -> str: ... + + @abstractmethod + def grade( + self, + messages: List[Message], + ground_truth: Optional[Union[str, dict, list]] = None, + **kwargs: Any, + ) -> EvaluateResult: ... 
+ + def __call__( + self, + messages: List[Message], + ground_truth: Optional[Union[str, dict, list]] = None, + **kwargs: Any, + ) -> EvaluateResult: + return self.grade(messages, ground_truth, **kwargs) diff --git a/src/llm_eval_kit/graders/builtins/__init__.py b/src/llm_eval_kit/graders/builtins/__init__.py new file mode 100644 index 0000000..1ef7769 --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/__init__.py @@ -0,0 +1,10 @@ +"""Built-in grader implementations.""" +from .exact_match import exact_match_grader +from .string_similarity import string_similarity_grader +from .tool_call import tool_call_grader + +__all__ = [ + "exact_match_grader", + "string_similarity_grader", + "tool_call_grader", +] diff --git a/src/llm_eval_kit/graders/builtins/_helpers.py b/src/llm_eval_kit/graders/builtins/_helpers.py new file mode 100644 index 0000000..28440da --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/_helpers.py @@ -0,0 +1,12 @@ +"""Shared helpers for built-in graders.""" +from typing import List, Optional + +from llm_eval_kit.models.messages import Message + + +def get_last_assistant_content(messages: List[Message]) -> Optional[str]: + """Walk messages in reverse, return first assistant content found.""" + for msg in reversed(messages): + if msg.role == "assistant" and msg.content is not None: + return msg.content + return None diff --git a/src/llm_eval_kit/graders/builtins/exact_match.py b/src/llm_eval_kit/graders/builtins/exact_match.py new file mode 100644 index 0000000..9e77630 --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/exact_match.py @@ -0,0 +1,38 @@ +"""Exact match grader — checks if assistant response exactly matches ground truth.""" +from llm_eval_kit.graders.decorator import grader +from llm_eval_kit.models.results import EvaluateResult, MetricResult +from ._helpers import get_last_assistant_content + + +@grader( + name="exact_match", + description="Exact string match between response and ground truth", +) +def 
exact_match_grader(messages, ground_truth, *, case_sensitive=False, **kwargs): + response = get_last_assistant_content(messages) + if response is None: + return EvaluateResult( + score=0.0, + is_valid=False, + reason="No assistant message found", + ) + + response = response.strip() + expected = str(ground_truth).strip() if ground_truth is not None else "" + + if not case_sensitive: + match = response.lower() == expected.lower() + else: + match = response == expected + + score = 1.0 if match else 0.0 + return EvaluateResult( + score=score, + reason="Exact match" if match else "No match", + metrics={ + "exact_match": MetricResult( + score=score, + reason=f"case_sensitive={case_sensitive}", + ), + }, + ) diff --git a/src/llm_eval_kit/graders/builtins/llm_judge.py b/src/llm_eval_kit/graders/builtins/llm_judge.py new file mode 100644 index 0000000..1ebc2b0 --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/llm_judge.py @@ -0,0 +1,6 @@ +""" +LLM-as-judge grader — uses an LLM to evaluate responses. + +This is a placeholder for future implementation. +It would call an LLM (e.g., via Bedrock Converse API) to score responses. +""" diff --git a/src/llm_eval_kit/graders/builtins/string_similarity.py b/src/llm_eval_kit/graders/builtins/string_similarity.py new file mode 100644 index 0000000..cabc925 --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/string_similarity.py @@ -0,0 +1,114 @@ +""" +String similarity grader — fuzzy matching using edit distance or token overlap. + +The grader framework is complete. The two algorithm functions below are stubs +for YOU to implement as Leetcode practice. 
+""" +from llm_eval_kit.graders.decorator import grader +from llm_eval_kit.models.results import EvaluateResult, MetricResult +from ._helpers import get_last_assistant_content + + +# --------------------------------------------------------------------------- +# YOUR TASK: Implement these two functions +# --------------------------------------------------------------------------- + +def levenshtein_similarity(s1: str, s2: str) -> float: + """ + Compute normalized Levenshtein similarity between two strings. + Return: 1.0 - (edit_distance / max(len(s1), len(s2))) + + LEETCODE CONNECTION: This is Leetcode #72 (Edit Distance). + + Algorithm: + 1. Build a 2D DP table of size (len(s1)+1) x (len(s2)+1) + 2. dp[i][j] = minimum edits to convert s1[:i] into s2[:j] + 3. Base cases: dp[i][0] = i, dp[0][j] = j + 4. Transition: + - If s1[i-1] == s2[j-1]: dp[i][j] = dp[i-1][j-1] + - Else: dp[i][j] = 1 + min(dp[i-1][j], # delete + dp[i][j-1], # insert + dp[i-1][j-1]) # replace + 5. edit_distance = dp[len(s1)][len(s2)] + 6. Normalize: 1.0 - (edit_distance / max(len(s1), len(s2))) + + Edge cases: + - Both empty -> return 1.0 + - One empty -> return 0.0 + + Space optimization (optional stretch goal): + - You only need the previous row, so you can use O(min(m,n)) space + instead of O(m*n). This is a common follow-up in interviews. + """ + raise NotImplementedError("Implement levenshtein_similarity") + + +def token_f1_score(prediction: str, reference: str) -> float: + """ + Compute token-level F1 score between prediction and reference. + + Algorithm: + 1. Tokenize: prediction.lower().split(), reference.lower().split() + 2. Use collections.Counter to count token frequencies (multiset) + 3. Overlap = sum of min counts for each token (Counter intersection) + - In Python: sum((counter_pred & counter_ref).values()) + 4. precision = overlap / len(predicted_tokens) + 5. recall = overlap / len(reference_tokens) + 6. 
F1 = 2 * precision * recall / (precision + recall) + + LEETCODE CONNECTION: + - Counter intersection is related to array intersection problems + - Using Counter (multiset) is key — plain set loses duplicate info + - Think about: what if one string has "the the the" and the other + has "the"? Plain set says full overlap, Counter says 1/3. + + Edge cases: + - Both empty -> return 1.0 + - One empty -> return 0.0 + - No overlap -> return 0.0 (avoid division by zero in F1) + """ + raise NotImplementedError("Implement token_f1_score") + + +# --------------------------------------------------------------------------- +# Grader (framework code — complete) +# --------------------------------------------------------------------------- + +@grader( + name="string_similarity", + description="Fuzzy string matching via Levenshtein distance or token F1", +) +def string_similarity_grader( + messages, ground_truth, *, strategy="levenshtein", **kwargs +): + response = get_last_assistant_content(messages) + if response is None: + return EvaluateResult( + score=0.0, + is_valid=False, + reason="No assistant message found", + ) + + expected = str(ground_truth) if ground_truth is not None else "" + + # Both empty is a perfect match + if not response and not expected: + return EvaluateResult(score=1.0, reason="Both empty") + + if strategy == "levenshtein": + score = levenshtein_similarity(response, expected) + elif strategy == "token_f1": + score = token_f1_score(response, expected) + else: + raise ValueError(f"Unknown strategy: {strategy}") + + return EvaluateResult( + score=score, + reason=f"Similarity ({strategy}): {score:.4f}", + metrics={ + strategy: MetricResult( + score=score, + reason=f"Computed via {strategy} strategy", + ), + }, + ) diff --git a/src/llm_eval_kit/graders/builtins/tool_call.py b/src/llm_eval_kit/graders/builtins/tool_call.py new file mode 100644 index 0000000..f75b27a --- /dev/null +++ b/src/llm_eval_kit/graders/builtins/tool_call.py @@ -0,0 +1,288 @@ +""" +Tool 
@dataclass
class ParsedCall:
    """Structured representation of a parsed function call."""

    # Possibly dotted callable name, e.g. "get_weather" or "api.get_weather".
    func_name: str
    # Keyword arguments by name; positional args stored under "_arg{i}" keys.
    params: Dict[str, Any]


def parse_function_call(call_str: str) -> ParsedCall:
    """
    Parse 'func_name(param1=value1, param2="str")' into a ParsedCall.

    Uses ast.parse in eval mode to safely parse the expression — no code
    is ever executed — then extracts the function name and its arguments.

    Raises:
        ValueError: if the string cannot be parsed, is not a single
            function call, or uses an unsupported callee form.
    """
    call_str = call_str.strip()
    try:
        tree = ast.parse(call_str, mode="eval")
    except SyntaxError as e:
        raise ValueError(f"Cannot parse function call: {call_str!r}") from e

    call_node = tree.body
    if not isinstance(call_node, ast.Call):
        raise ValueError(f"Expression is not a function call: {call_str!r}")

    def _qualified_name(node: ast.expr) -> str:
        # Recursively flatten Name / dotted Attribute chains ("a.b.c").
        if isinstance(node, ast.Name):
            return node.id
        if isinstance(node, ast.Attribute):
            return f"{_qualified_name(node.value)}.{node.attr}"
        raise ValueError(f"Unsupported function call format: {call_str!r}")

    def _literal_or_dump(node: ast.expr) -> Any:
        # literal_eval covers constants/tuples/lists/dicts; anything more
        # complex falls back to the AST dump string so that two identical
        # complex expressions still compare equal.
        try:
            return ast.literal_eval(node)
        except (ValueError, TypeError):
            return ast.dump(node)

    func_name = _qualified_name(call_node.func)

    params: Dict[str, Any] = {}
    # Positional args become synthetic "_arg{i}" entries.
    for index, positional in enumerate(call_node.args):
        params[f"_arg{index}"] = _literal_or_dump(positional)
    for keyword in call_node.keywords:
        if keyword.arg is not None:  # skip **kwargs expansion
            params[keyword.arg] = _literal_or_dump(keyword.value)

    return ParsedCall(func_name=func_name, params=params)


def format_function_call(parsed: ParsedCall) -> str:
    """Pretty-print a ParsedCall back to a function call string (params sorted)."""
    rendered = ", ".join(
        f"{key}={value!r}" for key, value in sorted(parsed.params.items())
    )
    return f"{parsed.func_name}({rendered})"
def _try_coerce_match(a: Any, b: Any) -> bool:
    """
    Return True when string `a`, coerced to the type of `b`, equals `b`.

    Only string -> bool/int/float coercions are attempted; anything else
    (including a non-string `a`) reports no match.
    """
    if not isinstance(a, str):
        return False
    if isinstance(b, bool):
        lowered = a.lower()
        if lowered in ("true", "false"):
            return (lowered == "true") == b
        return False
    if isinstance(b, int):  # bool already handled above (bool is an int)
        try:
            return int(a) == b
        except (ValueError, TypeError):
            return False
    if isinstance(b, float):
        try:
            return float(a) == b
        except (ValueError, TypeError):
            return False
    return False


def values_match(predicted: Any, expected: Any) -> bool:
    """
    Compare two values with type coercion.

    Equal values match outright; strings are coerced toward bool/int/float
    counterparts in either direction; lists are compared element-wise and
    dicts key-wise, both recursively.
    """
    if predicted == expected:
        return True
    if _try_coerce_match(predicted, expected) or _try_coerce_match(expected, predicted):
        return True
    if isinstance(predicted, list) and isinstance(expected, list):
        return len(predicted) == len(expected) and all(
            values_match(p, e) for p, e in zip(predicted, expected)
        )
    if isinstance(predicted, dict) and isinstance(expected, dict):
        # Dict views compare as sets, so this is exactly "same key set".
        return predicted.keys() == expected.keys() and all(
            values_match(predicted[k], v) for k, v in expected.items()
        )
    return False
if values_match(predicted.params[k], expected.params[k]) + ) + param_value_acc = value_matches / len(expected_keys) if expected_keys else 1.0 + + overall = ( + (1.0 if func_match else 0.0) * 0.33 + + param_name_acc * 0.33 + + param_value_acc * 0.34 + ) + return { + "func_name_match": func_match, + "param_name_accuracy": param_name_acc, + "param_value_accuracy": param_value_acc, + "overall": overall, + } + + +def _split_calls(text: str) -> List[str]: + """ + Split a string that may contain multiple function calls. + Handles newline-separated or list-formatted calls. + """ + text = text.strip() + # If it looks like a Python list, try to parse individual calls + if text.startswith("[") and text.endswith("]"): + text = text[1:-1].strip() + + # Split on newlines or comma-separated top-level calls + calls = [] + depth = 0 + current: List[str] = [] + for char in text: + if char == "(": + depth += 1 + elif char == ")": + depth -= 1 + if char == "\n" and depth == 0: + chunk = "".join(current).strip() + if chunk: + calls.append(chunk) + current = [] + else: + current.append(char) + chunk = "".join(current).strip() + if chunk: + calls.append(chunk) + + # Clean trailing commas + return [c.rstrip(",").strip() for c in calls if c.strip()] + + +# --------------------------------------------------------------------------- +# The grader +# --------------------------------------------------------------------------- + +@grader( + name="tool_call", + description="BFCL-style AST comparison of function calls", +) +def tool_call_grader(messages, ground_truth, **kwargs): + """ + Compare predicted function calls against ground truth. 
@grader(
    name="tool_call",
    description="BFCL-style AST comparison of function calls",
)
def tool_call_grader(messages, ground_truth, **kwargs):
    """
    Compare predicted function calls against ground truth.

    ground_truth: str or List[str] of function call strings
    messages: last assistant message content contains predicted call(s)
    """
    predicted_str = get_last_assistant_content(messages)
    if predicted_str is None:
        return EvaluateResult(
            score=0.0, is_valid=False, reason="No assistant message"
        )

    # Accept a single call string or a list of them.
    if isinstance(ground_truth, str):
        gt_strs = [ground_truth]
    elif isinstance(ground_truth, list):
        gt_strs = [str(g) for g in ground_truth]
    else:
        return EvaluateResult(
            score=0.0,
            is_valid=False,
            reason=f"Unexpected ground_truth type: {type(ground_truth)}",
        )

    try:
        predicted_calls = [parse_function_call(s) for s in _split_calls(predicted_str)]
    except (ValueError, SyntaxError) as e:
        return EvaluateResult(
            score=0.0, is_valid=False, reason=f"Parse error (predicted): {e}"
        )

    try:
        expected_calls = [parse_function_call(s) for s in gt_strs]
    except (ValueError, SyntaxError) as e:
        return EvaluateResult(
            score=0.0, is_valid=False, reason=f"Parse error (ground truth): {e}"
        )

    # Pairwise comparison by position; surplus calls on either side dilute
    # every average through the shared denominator n.
    comparisons = [
        compare_single_call(pred, exp)
        for pred, exp in zip(predicted_calls, expected_calls)
    ]
    n = max(len(expected_calls), len(predicted_calls), 1)

    def _mean(key):
        # Empty comparisons -> 0.0 (empty sum over n >= 1).
        return sum(c[key] for c in comparisons) / n

    return EvaluateResult(
        score=_mean("overall"),
        reason=f"Matched {len(comparisons)}/{n} function calls",
        metrics={
            "function_name_accuracy": MetricResult(
                score=_mean("func_name_match"),
                reason="Fraction of calls with correct function name",
            ),
            "parameter_name_accuracy": MetricResult(
                score=_mean("param_name_accuracy"),
                reason="Average parameter name accuracy across calls",
            ),
            "parameter_value_accuracy": MetricResult(
                score=_mean("param_value_accuracy"),
                reason="Average parameter value accuracy across calls",
            ),
        },
    )
"parameter_value_accuracy": MetricResult( + score=pv_acc, + reason="Average parameter value accuracy across calls", + ), + }, + ) diff --git a/src/llm_eval_kit/graders/decorator.py b/src/llm_eval_kit/graders/decorator.py new file mode 100644 index 0000000..59587de --- /dev/null +++ b/src/llm_eval_kit/graders/decorator.py @@ -0,0 +1,87 @@ +""" +The @grader decorator — wraps plain functions into Grader instances. + +Supports both @grader and @grader(name=..., description=...) syntax. +""" +import functools +import inspect +from typing import Any, Callable, List, Optional, Union + +from llm_eval_kit.models.messages import Message +from llm_eval_kit.models.results import EvaluateResult +from .base import Grader + + +class _FunctionGrader(Grader): + """Internal: wraps a plain function as a Grader. Created by @grader.""" + + def __init__( + self, fn: Callable, grader_name: str, grader_desc: str + ) -> None: + functools.update_wrapper(self, fn) + self._fn = fn + self._name = grader_name + self._description = grader_desc + + @property + def name(self) -> str: + return self._name + + @property + def description(self) -> str: + return self._description + + def grade( + self, + messages: List[Message], + ground_truth: Optional[Union[str, dict, list]] = None, + **kwargs: Any, + ) -> EvaluateResult: + return self._fn(messages, ground_truth, **kwargs) + + +def _validate_signature(fn: Callable) -> None: + """Check that fn accepts (messages, ground_truth, **kwargs).""" + sig = inspect.signature(fn) + params = list(sig.parameters.keys()) + if len(params) < 2: + raise TypeError( + f"Grader function '{fn.__name__}' must accept at least " + f"(messages, ground_truth, **kwargs), got: ({', '.join(params)})" + ) + if params[0] != "messages" or params[1] != "ground_truth": + raise TypeError( + f"Grader function '{fn.__name__}' first two parameters must be " + f"'messages' and 'ground_truth', got: ({', '.join(params[:2])})" + ) + + +def grader( + func: Optional[Callable] = None, + *, + 
def grader(
    func: Optional[Callable] = None,
    *,
    name: Optional[str] = None,
    description: Optional[str] = None,
) -> "Union[Grader, Callable[..., Grader]]":
    """
    Decorator to turn a function into a Grader.

    Usage:
        @grader
        def my_grader(messages, ground_truth, **kwargs): ...

        @grader(name="custom", description="My grader")
        def my_grader(messages, ground_truth, **kwargs): ...
    """
    def _build(fn: Callable) -> "_FunctionGrader":
        _validate_signature(fn)
        return _FunctionGrader(
            fn,
            grader_name=name or fn.__name__,
            grader_desc=description or fn.__doc__ or "",
        )

    # Bare @grader hands us the function directly; @grader(...) hands us
    # nothing yet and expects the wrapper back.
    return _build if func is None else _build(func)


class GraderRegistry:
    """Maps string names to Grader instances."""

    def __init__(self) -> None:
        self._graders: Dict[str, "Grader"] = {}

    def register(self, name: str, grader: "Grader") -> None:
        # Last registration wins; duplicates are silently replaced.
        self._graders[name] = grader

    def get(self, name: str) -> "Grader":
        if name in self._graders:
            return self._graders[name]
        available = ", ".join(sorted(self._graders))
        raise KeyError(
            f"Grader '{name}' not found. Available: {available}"
        )

    def list_graders(self) -> List[str]:
        return sorted(self._graders)

    def __contains__(self, name: str) -> bool:
        return name in self._graders


# Shared module-level registry used by default.
default_registry = GraderRegistry()
+ +Pydantic models for validating Lambda pre/post processing payloads. +""" diff --git a/src/llm_eval_kit/models/__init__.py b/src/llm_eval_kit/models/__init__.py new file mode 100644 index 0000000..0d14429 --- /dev/null +++ b/src/llm_eval_kit/models/__init__.py @@ -0,0 +1,18 @@ +""" +Core data models for llm-eval-kit. + +Pydantic v2 BaseModel: Message, MetricResult, EvaluateResult, EvalSample +Plain class: Conversation, EvalDataset +""" +from .messages import Conversation, Message +from .results import EvaluateResult, MetricResult +from .datasets import EvalDataset, EvalSample + +__all__ = [ + "Message", + "Conversation", + "MetricResult", + "EvaluateResult", + "EvalSample", + "EvalDataset", +] diff --git a/src/llm_eval_kit/models/datasets.py b/src/llm_eval_kit/models/datasets.py new file mode 100644 index 0000000..d5c954a --- /dev/null +++ b/src/llm_eval_kit/models/datasets.py @@ -0,0 +1,67 @@ +""" +Dataset sample models. + +EvalSample is a Pydantic v2 BaseModel (boundary — validated from external data). +EvalDataset is a plain Python class (internal container — lightweight). 
+""" +import json +import logging +from typing import Any, Dict, Iterator, List, Optional, Union + +from pydantic import BaseModel, Field + +from .messages import Message + +logger = logging.getLogger(__name__) + + +class EvalSample(BaseModel): + """One evaluation sample with messages, ground truth, and metadata.""" + + id: str + messages: List[Message] + ground_truth: Optional[Union[str, dict, list]] = None + metadata: Dict[str, Any] = Field(default_factory=dict) + + +class EvalDataset: + """Container wrapping a list of EvalSample with Pythonic iteration.""" + + def __init__(self, samples: List[EvalSample]) -> None: + self.samples = list(samples) + + def __len__(self) -> int: + return len(self.samples) + + def __getitem__(self, index: int) -> EvalSample: + return self.samples[index] + + def __iter__(self) -> Iterator[EvalSample]: + return iter(self.samples) + + @classmethod + def from_jsonl( + cls, + path: str, + max_samples: Optional[int] = None, + ) -> "EvalDataset": + """Read a JSONL file into EvalSample objects. Skips malformed lines.""" + samples: List[EvalSample] = [] + with open(path) as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + logger.warning("Skipping malformed JSON at line %d", line_num) + continue + try: + samples.append(EvalSample(**data)) + except Exception as e: + logger.warning("Skipping invalid sample at line %d: %s", line_num, e) + continue + if max_samples and len(samples) >= max_samples: + break + return cls(samples) diff --git a/src/llm_eval_kit/models/messages.py b/src/llm_eval_kit/models/messages.py new file mode 100644 index 0000000..055245d --- /dev/null +++ b/src/llm_eval_kit/models/messages.py @@ -0,0 +1,73 @@ +""" +Message and Conversation models for representing LLM interactions. + +Message is a Pydantic v2 BaseModel (boundary model — validated, serializable). 
class Message(BaseModel):
    """Provider-agnostic chat message model."""

    role: str
    content: Optional[str] = None
    name: Optional[str] = None
    tool_call_id: Optional[str] = None
    tool_calls: Optional[List[dict]] = None

    @model_validator(mode="after")
    def validate_tool_message(self) -> "Message":
        # Tool responses must reference the tool call they answer.
        if self.role == "tool" and not self.tool_call_id:
            raise ValueError(
                "Messages with role 'tool' must include tool_call_id"
            )
        return self

    def to_openai_format(self) -> dict:
        """Return dict with only non-None fields, compatible with OpenAI API."""
        payload: Dict[str, Any] = {"role": self.role}
        optional_fields = (
            ("content", self.content),
            ("name", self.name),
            ("tool_call_id", self.tool_call_id),
            ("tool_calls", self.tool_calls),
        )
        payload.update(
            (key, value) for key, value in optional_fields if value is not None
        )
        return payload

    @classmethod
    def from_openai(cls, data: dict) -> "Message":
        """Construct a Message from an OpenAI-format dict."""
        return cls(**data)


class Conversation:
    """Lightweight wrapper around a list of Messages with helper accessors."""

    def __init__(self, messages: List[Message]) -> None:
        # Defensive copy: callers mutating their list won't affect us.
        self.messages = list(messages)

    def get_last_assistant_message(self) -> Optional[Message]:
        """Most recent assistant turn, or None if there is none."""
        return next(
            (m for m in reversed(self.messages) if m.role == "assistant"),
            None,
        )

    def get_system_prompt(self) -> Optional[str]:
        """Content of the first system message, or None."""
        return next(
            (m.content for m in self.messages if m.role == "system"),
            None,
        )

    def to_openai_format(self) -> List[dict]:
        return [m.to_openai_format() for m in self.messages]

    def __len__(self) -> int:
        return len(self.messages)

    def __iter__(self) -> Iterator[Message]:
        return iter(self.messages)
class MetricResult(BaseModel):
    """A single named metric score with explanation."""

    score: float = Field(ge=0.0, le=1.0)
    reason: str
    is_valid: bool = True

    def to_dict(self) -> dict:
        """Plain-dict form for JSON serialization."""
        return {
            "score": self.score,
            "reason": self.reason,
            "is_valid": self.is_valid,
        }


class EvaluateResult(BaseModel):
    """Complete output of a grader — overall score, sub-metrics, and metadata."""

    score: float = Field(ge=0.0, le=1.0)
    reason: Optional[str] = None
    is_valid: bool = True
    metrics: Dict[str, MetricResult] = Field(default_factory=dict)
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def summary(self) -> str:
        """Human-readable multi-line summary of the result."""
        lines = [f"Score: {self.score:.4f}"]
        if self.reason:
            lines.append(f"Reason: {self.reason}")
        lines.extend(
            f"  {name}: {metric.score:.4f} ({metric.reason})"
            for name, metric in self.metrics.items()
        )
        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Plain-dict form, recursing into sub-metrics."""
        return {
            "score": self.score,
            "reason": self.reason,
            "is_valid": self.is_valid,
            "metrics": {key: m.to_dict() for key, m in self.metrics.items()},
            "metadata": self.metadata,
        }

    @classmethod
    def aggregate(cls, results: List["EvaluateResult"]) -> "EvaluateResult":
        """Compute mean score across a list of results."""
        if not results:
            return cls(score=0.0, reason="No results to aggregate")
        mean_score = sum(r.score for r in results) / len(results)
        return cls(
            score=mean_score,
            reason=f"Aggregated over {len(results)} samples",
        )
def load_function(path: str) -> Any:
    """
    Load a Python object from a string path.

    Supports:
      - "module.submodule:func_name" (colon format, preferred)
      - "module.submodule.func_name" (dot format, last component is attribute)

    Raises:
        ImportError: if the path has no separator or the module import fails.
        AttributeError: if the module lacks the named attribute.
    """
    if ":" in path:
        # Colon format: everything before the first ':' is the module.
        module_path, _, attr_name = path.partition(":")
    else:
        # Dot format: the final dotted component is the attribute.
        module_path, sep, attr_name = path.rpartition(".")
        if not sep:
            raise ImportError(
                f"Invalid path format: {path!r}. "
                f"Expected 'module.path:func' or 'module.path.func'"
            )

    try:
        module = importlib.import_module(module_path)
    except ImportError as e:
        raise ImportError(
            f"Cannot import module '{module_path}': {e}"
        ) from e

    try:
        return getattr(module, attr_name)
    except AttributeError:
        raise AttributeError(
            f"Module '{module_path}' has no attribute '{attr_name}'"
        ) from None