Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,35 @@ We also release our IF-RLVR code, as part of [open-instruct](https://github.com/

The new training constraints and verification functions are here: https://github.com/allenai/open-instruct/tree/main/open_instruct/IFEvalG

### Reward records for RLVR loops

For lightweight RLVR integrations, `rlvr_env.py` exposes a per-completion reward
API around the existing IFBench verifiers:

```python
from evaluation_lib import read_prompt_list
from rlvr_env import score_response

example = read_prompt_list("data/IFBench_test.jsonl")[0]
result = score_response(example, "model completion", reward_mode="fraction")
print(result.reward, result.follow_instruction_list)
```

You can also convert a prompt file and response file into reward-labeled JSONL:

```bash
python -m rlvr_env \
--input_data data/IFBench_test.jsonl \
--input_response_data data/sample_output.jsonl \
--output_path eval/sample_rewards.jsonl \
--evaluation_mode loose \
--reward_mode fraction
```

Each output row contains the prompt, response, scalar reward, an overall
`follow_all_instructions` flag, per-instruction verifier decisions, and
instruction IDs. `--reward_mode all` gives a binary prompt-level reward;
`--reward_mode fraction` gives dense partial credit.

## 📊 Model Performance Leaderboard

| Rank | Model | IFBench Score | IFEval Score |
Expand Down
6 changes: 3 additions & 3 deletions evaluation_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class OutputExample:
def read_prompt_list(input_jsonl_filename):
"""Read inputs from jsonl."""
inputs = []
with open(input_jsonl_filename, "r") as f:
with open(input_jsonl_filename, "r", encoding="utf-8") as f:
for l in f:
example = json.loads(l)
inputs.append(
Expand All @@ -57,7 +57,7 @@ def read_prompt_list(input_jsonl_filename):
def write_outputs(output_jsonl_filename, outputs):
"""Writes outputs to jsonl."""
assert outputs
with open(output_jsonl_filename, "w") as f:
with open(output_jsonl_filename, "w", encoding="utf-8") as f:
for o in outputs:
f.write(
json.dumps(
Expand Down Expand Up @@ -169,7 +169,7 @@ def test_instruction_following_loose(
def read_prompt_to_response_dict(input_jsonl_filename):
"""Creates dictionary matching prompt and response."""
return_dict = {}
with open(input_jsonl_filename, "r") as f:
with open(input_jsonl_filename, "r", encoding="utf-8") as f:
for l in f:
example = json.loads(l)
return_dict[example["prompt"]] = example["response"]
Expand Down
196 changes: 196 additions & 0 deletions rlvr_env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
"""Train-ready reward helpers for IFBench / IF-RLVR style loops.

The main evaluator scores a whole response file at once. RLVR pipelines usually
need a smaller interface: given one prompt and one sampled completion, return a
verifiable scalar reward plus per-instruction diagnostics. This module keeps
that wrapper close to the existing IFBench verifier implementation.
"""

from __future__ import annotations

import argparse
import dataclasses
import json
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Literal

import evaluation_lib
import instructions_registry


# Verifier strictness: "strict" checks the completion exactly as given, while
# "loose" also tries relaxed variants of it (see ``_response_variants``).
EvaluationMode = Literal["strict", "loose"]
# Reward aggregation: "all" is a binary all-instructions-pass reward, while
# "fraction" is the mean per-instruction pass rate.
RewardMode = Literal["all", "fraction"]


@dataclasses.dataclass(frozen=True)
class RewardResult:
    """Verifier output for one prompt/completion pair.

    The dataclass is frozen, so attributes cannot be reassigned after
    construction. ``dataclasses.asdict`` turns an instance into the
    JSON-serializable record written by ``write_reward_records``.
    """

    # Prompt text the completion was scored against.
    prompt: str
    # Completion that was scored; ``score_response`` stores "" when given None.
    response: str
    # Scalar reward: 1.0/0.0 in "all" mode, mean pass rate in "fraction" mode.
    reward: float
    # True when every entry of ``follow_instruction_list`` is True.
    follow_all_instructions: bool
    # Per-instruction verifier decisions, in the order of ``instruction_id_list``.
    follow_instruction_list: list[bool]
    # Instruction IDs copied from the input example.
    instruction_id_list: list[str]


def _clean_kwargs(kwargs: dict[str, Any]) -> dict[str, Any]:
return {key: value for key, value in kwargs.items() if value is not None}


def _response_variants(response: str, mode: EvaluationMode) -> list[str]:
if mode == "strict":
return [response]

lines = response.split("\n")
response_remove_first = "\n".join(lines[1:]).strip()
response_remove_last = "\n".join(lines[:-1]).strip()
response_remove_both = "\n".join(lines[1:-1]).strip()
revised_response = response.replace("*", "")

return [
response,
revised_response,
response_remove_first,
response_remove_last,
response_remove_both,
response_remove_first.replace("*", ""),
response_remove_last.replace("*", ""),
response_remove_both.replace("*", ""),
]


def score_response(
    inp: evaluation_lib.InputExample,
    response: str | None,
    *,
    evaluation_mode: EvaluationMode = "strict",
    reward_mode: RewardMode = "all",
) -> RewardResult:
    """Score one sampled completion with IFBench verification functions.

    Args:
        inp: IFBench input example.
        response: Model completion for ``inp.prompt``. ``None`` is treated as
            an empty completion.
        evaluation_mode: ``strict`` uses the exact completion; ``loose`` mirrors
            the benchmark's loose scoring variants.
        reward_mode: ``all`` returns 1.0 only when all instructions pass;
            ``fraction`` returns the mean per-instruction pass rate.

    Returns:
        A ``RewardResult`` holding the scalar reward and the per-instruction
        decisions. An example without any instructions is vacuously satisfied
        and receives a reward of 1.0 in both reward modes.

    Raises:
        ValueError: If ``evaluation_mode`` or ``reward_mode`` is unsupported.
    """
    if evaluation_mode not in ("strict", "loose"):
        raise ValueError(f"unsupported evaluation_mode: {evaluation_mode}")
    if reward_mode not in ("all", "fraction"):
        raise ValueError(f"unsupported reward_mode: {reward_mode}")

    response = response or ""
    # Blank variants can never satisfy an instruction; filter them once here
    # instead of re-checking every variant for every instruction.
    candidates = [
        variant
        for variant in _response_variants(response, evaluation_mode)
        if variant.strip()
    ]

    follow_instruction_list: list[bool] = []
    for index, instruction_id in enumerate(inp.instruction_id_list):
        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
        instruction = instruction_cls(instruction_id)
        instruction.build_description(**_clean_kwargs(inp.kwargs[index]))
        # Instructions that list "prompt" among their args are rebuilt with
        # the prompt text before checking.
        args = instruction.get_instruction_args()
        if args and "prompt" in args:
            instruction.build_description(prompt=inp.prompt)

        # An instruction counts as followed if any candidate variant passes.
        follow_instruction_list.append(
            any(instruction.check_following(variant) for variant in candidates)
        )

    follow_all = all(follow_instruction_list)
    if reward_mode == "all" or not follow_instruction_list:
        # The empty-list guard avoids a ZeroDivisionError in "fraction" mode
        # and keeps both modes in agreement for instruction-free examples.
        reward = 1.0 if follow_all else 0.0
    else:
        reward = sum(follow_instruction_list) / len(follow_instruction_list)

    return RewardResult(
        prompt=inp.prompt,
        response=response,
        reward=reward,
        follow_all_instructions=follow_all,
        follow_instruction_list=follow_instruction_list,
        instruction_id_list=list(inp.instruction_id_list),
    )


def iter_reward_records(
    inputs: Iterable[evaluation_lib.InputExample],
    prompt_to_response: dict[str, str],
    *,
    evaluation_mode: EvaluationMode = "strict",
    reward_mode: RewardMode = "all",
) -> Iterable[dict[str, Any]]:
    """Lazily produce one JSON-serializable reward record per input example.

    The response is looked up by the exact prompt first and by the stripped
    prompt second. If neither key is present, ``None`` is handed to
    ``score_response``.
    """
    for example in inputs:
        completion = prompt_to_response.get(example.prompt)
        if completion is None:
            # Fall back to the whitespace-stripped prompt as the lookup key.
            completion = prompt_to_response.get(example.prompt.strip())

        scored = score_response(
            example,
            completion,
            evaluation_mode=evaluation_mode,
            reward_mode=reward_mode,
        )
        yield dataclasses.asdict(scored)


def write_reward_records(
    *,
    input_data: str | Path,
    input_response_data: str | Path,
    output_path: str | Path,
    evaluation_mode: EvaluationMode = "strict",
    reward_mode: RewardMode = "all",
) -> None:
    """Score a prompt file against a response file and write reward JSONL.

    Reads the examples from ``input_data`` and the prompt-to-response mapping
    from ``input_response_data``, then writes one JSON object per example to
    ``output_path`` as UTF-8. Missing parent directories of ``output_path``
    are created.
    """
    examples = evaluation_lib.read_prompt_list(str(input_data))
    responses = evaluation_lib.read_prompt_to_response_dict(str(input_response_data))

    # Also index each response under its stripped prompt, without overriding
    # an entry that already exists for that key. Iterate over a snapshot
    # because the mapping is modified inside the loop.
    for raw_prompt, completion in list(responses.items()):
        stripped_prompt = raw_prompt.strip()
        if stripped_prompt not in responses:
            responses[stripped_prompt] = completion

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    records = iter_reward_records(
        examples,
        responses,
        evaluation_mode=evaluation_mode,
        reward_mode=reward_mode,
    )
    with destination.open("w", encoding="utf-8") as sink:
        for record in records:
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")


def main() -> None:
    """Parse the command-line flags and write the reward-labeled JSONL file."""
    parser = argparse.ArgumentParser(
        description="Create reward-labeled IFBench JSONL records for RLVR loops."
    )
    for required_flag in ("--input_data", "--input_response_data", "--output_path"):
        parser.add_argument(required_flag, required=True)
    parser.add_argument(
        "--evaluation_mode", choices=("strict", "loose"), default="strict"
    )
    parser.add_argument("--reward_mode", choices=("all", "fraction"), default="all")

    options = parser.parse_args()
    # The parsed attribute names match write_reward_records' keyword-only
    # parameters one-to-one, so the namespace can be forwarded directly.
    write_reward_records(**vars(options))


# Run the command-line interface only when executed as a script
# (e.g. ``python -m rlvr_env``), not when imported as a module.
if __name__ == "__main__":
    main()
63 changes: 63 additions & 0 deletions tests/test_rlvr_env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import json
from pathlib import Path

import evaluation_lib
from rlvr_env import score_response
from rlvr_env import write_reward_records


# Parent of the directory that contains this test file; the tests read their
# fixtures from its ``data`` subdirectory.
ROOT = Path(__file__).resolve().parents[1]


def test_score_response_returns_scalar_reward_and_instruction_diagnostics():
    """Scoring the first benchmark example yields a bounded reward and aligned diagnostics."""
    data_dir = ROOT / "data"
    examples = evaluation_lib.read_prompt_list(data_dir / "IFBench_test.jsonl")
    responses = evaluation_lib.read_prompt_to_response_dict(
        data_dir / "sample_output.jsonl"
    )
    first = examples[0]

    scored = score_response(first, responses.get(first.prompt))

    assert scored.prompt == first.prompt
    assert 0.0 <= scored.reward <= 1.0
    # One verifier decision per instruction, with the IDs echoed back unchanged.
    assert len(scored.follow_instruction_list) == len(first.instruction_id_list)
    assert scored.instruction_id_list == first.instruction_id_list


def test_fraction_reward_gives_partial_credit_for_multi_instruction_prompts():
    """With one of two instructions satisfied, the fraction reward is 0.5."""
    instruction_kwargs = [
        {"word": "apple", "N": 1},
        {"word": "banana", "N": 1},
    ]
    example = evaluation_lib.InputExample(
        key=0,
        prompt="Say hello. Include keyword apple once and keyword banana twice.",
        instruction_id_list=["sentence:keyword", "sentence:keyword"],
        kwargs=instruction_kwargs,
    )
    completion = "hello apple here. banana appears later."

    scored = score_response(example, completion, reward_mode="fraction")

    assert scored.follow_instruction_list == [True, False]
    assert scored.reward == 0.5


def test_write_reward_records_outputs_jsonl(tmp_path):
    """The writer emits JSONL whose first row has the expected keys and a non-empty response."""
    destination = tmp_path / "rewards.jsonl"
    data_dir = ROOT / "data"

    write_reward_records(
        input_data=data_dir / "IFBench_test.jsonl",
        input_response_data=data_dir / "sample_output.jsonl",
        output_path=destination,
        evaluation_mode="loose",
        reward_mode="fraction",
    )

    rows = destination.read_text(encoding="utf-8").splitlines()
    first_record = json.loads(rows[0])
    for expected_key in ("prompt", "response", "reward", "follow_instruction_list"):
        assert expected_key in first_record
    assert first_record["response"]