From c3ab2d1fcf95da3d6b7651f38224cccf4e614291 Mon Sep 17 00:00:00 2001
From: Catnap7 <36249828+Catnap7@users.noreply.github.com>
Date: Sat, 23 May 2026 03:34:25 +0900
Subject: [PATCH] Add IFBench RLVR reward wrapper

---
Reviewer notes (this section is ignored by `git am`):
- score_response() no longer divides by zero in `fraction` mode when an
  example has no instructions; it now returns the same vacuous pass (1.0) that
  `all` mode already produced. A regression test covers this case.
- Line breaks and indentation were restored from a whitespace-collapsed copy of
  this patch. The indentation of the context lines in the README.md and
  evaluation_lib.py hunks is assumed, and the blob ids on the `index` lines
  predate the edits above -- confirm both (or apply with --ignore-whitespace)
  before merging.

 README.md              |  29 ++++++
 evaluation_lib.py      |   6 +-
 rlvr_env.py            | 207 +++++++++++++++++++++++++++++++++++++++++
 tests/test_rlvr_env.py |  78 +++++++++++++++
 4 files changed, 317 insertions(+), 3 deletions(-)
 create mode 100644 rlvr_env.py
 create mode 100644 tests/test_rlvr_env.py

diff --git a/README.md b/README.md
index 2810d4f..c05addf 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,35 @@ We also release our IF-RLVR code, as part of [open-instruct](https://github.com/
 
 The new training constraints and verification functions are here: https://github.com/allenai/open-instruct/tree/main/open_instruct/IFEvalG
 
+### Reward records for RLVR loops
+
+For lightweight RLVR integrations, `rlvr_env.py` exposes a per-completion reward
+API around the existing IFBench verifiers:
+
+```python
+from evaluation_lib import read_prompt_list
+from rlvr_env import score_response
+
+example = read_prompt_list("data/IFBench_test.jsonl")[0]
+result = score_response(example, "model completion", reward_mode="fraction")
+print(result.reward, result.follow_instruction_list)
+```
+
+You can also convert a prompt file and response file into reward-labeled JSONL:
+
+```bash
+python -m rlvr_env \
+  --input_data data/IFBench_test.jsonl \
+  --input_response_data data/sample_output.jsonl \
+  --output_path eval/sample_rewards.jsonl \
+  --evaluation_mode loose \
+  --reward_mode fraction
+```
+
+Each output row contains the prompt, response, scalar reward, per-instruction
+verifier decisions, and instruction IDs. `--reward_mode all` gives a binary
+prompt-level reward; `--reward_mode fraction` gives dense partial credit.
+
 ## 📊 Model Performance Leaderboard
 
 | Rank | Model | IFBench Score | IFEval Score |
diff --git a/evaluation_lib.py b/evaluation_lib.py
index ea99ca9..bf33afc 100644
--- a/evaluation_lib.py
+++ b/evaluation_lib.py
@@ -43,7 +43,7 @@ class OutputExample:
 def read_prompt_list(input_jsonl_filename):
     """Read inputs from jsonl."""
     inputs = []
-    with open(input_jsonl_filename, "r") as f:
+    with open(input_jsonl_filename, "r", encoding="utf-8") as f:
         for l in f:
             example = json.loads(l)
             inputs.append(
@@ -57,7 +57,7 @@ def read_prompt_list(input_jsonl_filename):
 def write_outputs(output_jsonl_filename, outputs):
     """Writes outputs to jsonl."""
     assert outputs
-    with open(output_jsonl_filename, "w") as f:
+    with open(output_jsonl_filename, "w", encoding="utf-8") as f:
         for o in outputs:
             f.write(
                 json.dumps(
@@ -169,7 +169,7 @@ def test_instruction_following_loose(
 def read_prompt_to_response_dict(input_jsonl_filename):
     """Creates dictionary matching prompt and response."""
     return_dict = {}
-    with open(input_jsonl_filename, "r") as f:
+    with open(input_jsonl_filename, "r", encoding="utf-8") as f:
         for l in f:
             example = json.loads(l)
             return_dict[example["prompt"]] = example["response"]
diff --git a/rlvr_env.py b/rlvr_env.py
new file mode 100644
index 0000000..0e9ea13
--- /dev/null
+++ b/rlvr_env.py
@@ -0,0 +1,207 @@
+"""Train-ready reward helpers for IFBench / IF-RLVR style loops.
+
+The main evaluator scores a whole response file at once. RLVR pipelines usually
+need a smaller interface: given one prompt and one sampled completion, return a
+verifiable scalar reward plus per-instruction diagnostics. This module keeps
+that wrapper close to the existing IFBench verifier implementation.
+"""
+
+from __future__ import annotations
+
+import argparse
+import dataclasses
+import json
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Literal
+
+import evaluation_lib
+import instructions_registry
+
+
+EvaluationMode = Literal["strict", "loose"]
+RewardMode = Literal["all", "fraction"]
+
+
+@dataclasses.dataclass(frozen=True)
+class RewardResult:
+    """Verifier output for one prompt/completion pair."""
+
+    prompt: str
+    response: str
+    reward: float
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+    instruction_id_list: list[str]
+
+
+def _clean_kwargs(kwargs: dict[str, Any]) -> dict[str, Any]:
+    return {key: value for key, value in kwargs.items() if value is not None}
+
+
+def _response_variants(response: str, mode: EvaluationMode) -> list[str]:
+    if mode == "strict":
+        return [response]
+
+    lines = response.split("\n")
+    response_remove_first = "\n".join(lines[1:]).strip()
+    response_remove_last = "\n".join(lines[:-1]).strip()
+    response_remove_both = "\n".join(lines[1:-1]).strip()
+    revised_response = response.replace("*", "")
+
+    return [
+        response,
+        revised_response,
+        response_remove_first,
+        response_remove_last,
+        response_remove_both,
+        response_remove_first.replace("*", ""),
+        response_remove_last.replace("*", ""),
+        response_remove_both.replace("*", ""),
+    ]
+
+
+def score_response(
+    inp: evaluation_lib.InputExample,
+    response: str | None,
+    *,
+    evaluation_mode: EvaluationMode = "strict",
+    reward_mode: RewardMode = "all",
+) -> RewardResult:
+    """Score one sampled completion with IFBench verification functions.
+
+    Args:
+        inp: IFBench input example.
+        response: Model completion for ``inp.prompt``.
+        evaluation_mode: ``strict`` uses the exact completion; ``loose`` mirrors the
+            benchmark's loose scoring variants.
+        reward_mode: ``all`` returns 1.0 only when all instructions pass; ``fraction``
+            returns the mean per-instruction pass rate.
+
+    Returns:
+        A ``RewardResult`` holding the scalar reward and per-instruction verdicts.
+        An example without instructions passes vacuously (reward 1.0) in both modes.
+
+    Raises:
+        ValueError: If ``evaluation_mode`` or ``reward_mode`` is not supported.
+    """
+
+    if evaluation_mode not in ("strict", "loose"):
+        raise ValueError(f"unsupported evaluation_mode: {evaluation_mode}")
+    if reward_mode not in ("all", "fraction"):
+        raise ValueError(f"unsupported reward_mode: {reward_mode}")
+
+    response = response or ""
+    response_variants = _response_variants(response, evaluation_mode)
+    follow_instruction_list: list[bool] = []
+
+    for index, instruction_id in enumerate(inp.instruction_id_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+        instruction.build_description(**_clean_kwargs(inp.kwargs[index]))
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:
+            instruction.build_description(prompt=inp.prompt)
+
+        follows = any(
+            variant.strip() and instruction.check_following(variant)
+            for variant in response_variants
+        )
+        follow_instruction_list.append(follows)
+
+    follow_all = all(follow_instruction_list)
+    if reward_mode == "all":
+        reward = 1.0 if follow_all else 0.0
+    elif follow_instruction_list:
+        reward = sum(follow_instruction_list) / len(follow_instruction_list)
+    else:
+        # No instructions to verify: mirror the vacuous pass of "all" mode
+        # instead of dividing by zero.
+        reward = 1.0
+
+    return RewardResult(
+        prompt=inp.prompt,
+        response=response,
+        reward=reward,
+        follow_all_instructions=follow_all,
+        follow_instruction_list=follow_instruction_list,
+        instruction_id_list=list(inp.instruction_id_list),
+    )
+
+
+def iter_reward_records(
+    inputs: Iterable[evaluation_lib.InputExample],
+    prompt_to_response: dict[str, str],
+    *,
+    evaluation_mode: EvaluationMode = "strict",
+    reward_mode: RewardMode = "all",
+) -> Iterable[dict[str, Any]]:
+    """Yield JSON-serializable training records with verifier rewards."""
+
+    for inp in inputs:
+        response = prompt_to_response.get(inp.prompt)
+        if response is None:
+            response = prompt_to_response.get(inp.prompt.strip())
+        result = score_response(
+            inp,
+            response,
+            evaluation_mode=evaluation_mode,
+            reward_mode=reward_mode,
+        )
+        yield dataclasses.asdict(result)
+
+
+def write_reward_records(
+    *,
+    input_data: str | Path,
+    input_response_data: str | Path,
+    output_path: str | Path,
+    evaluation_mode: EvaluationMode = "strict",
+    reward_mode: RewardMode = "all",
+) -> None:
+    """Convert IFBench prompt/response JSONL files to reward-labeled JSONL."""
+
+    inputs = evaluation_lib.read_prompt_list(str(input_data))
+    prompt_to_response = evaluation_lib.read_prompt_to_response_dict(
+        str(input_response_data)
+    )
+    for prompt, response in list(prompt_to_response.items()):
+        prompt_to_response.setdefault(prompt.strip(), response)
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with output_path.open("w", encoding="utf-8") as output_file:
+        for record in iter_reward_records(
+            inputs,
+            prompt_to_response,
+            evaluation_mode=evaluation_mode,
+            reward_mode=reward_mode,
+        ):
+            output_file.write(json.dumps(record, ensure_ascii=False))
+            output_file.write("\n")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Create reward-labeled IFBench JSONL records for RLVR loops."
+    )
+    parser.add_argument("--input_data", required=True)
+    parser.add_argument("--input_response_data", required=True)
+    parser.add_argument("--output_path", required=True)
+    parser.add_argument(
+        "--evaluation_mode", choices=("strict", "loose"), default="strict"
+    )
+    parser.add_argument("--reward_mode", choices=("all", "fraction"), default="all")
+    args = parser.parse_args()
+
+    write_reward_records(
+        input_data=args.input_data,
+        input_response_data=args.input_response_data,
+        output_path=args.output_path,
+        evaluation_mode=args.evaluation_mode,
+        reward_mode=args.reward_mode,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_rlvr_env.py b/tests/test_rlvr_env.py
new file mode 100644
index 0000000..fb218f0
--- /dev/null
+++ b/tests/test_rlvr_env.py
@@ -0,0 +1,78 @@
+import json
+from pathlib import Path
+
+import evaluation_lib
+from rlvr_env import score_response
+from rlvr_env import write_reward_records
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+def test_score_response_returns_scalar_reward_and_instruction_diagnostics():
+    inputs = evaluation_lib.read_prompt_list(ROOT / "data" / "IFBench_test.jsonl")
+    prompt_to_response = evaluation_lib.read_prompt_to_response_dict(
+        ROOT / "data" / "sample_output.jsonl"
+    )
+
+    result = score_response(inputs[0], prompt_to_response.get(inputs[0].prompt))
+
+    assert result.prompt == inputs[0].prompt
+    assert 0.0 <= result.reward <= 1.0
+    assert len(result.follow_instruction_list) == len(inputs[0].instruction_id_list)
+    assert result.instruction_id_list == inputs[0].instruction_id_list
+
+
+def test_fraction_reward_gives_partial_credit_for_multi_instruction_prompts():
+    inp = evaluation_lib.InputExample(
+        key=0,
+        prompt="Say hello. Include keyword apple once and keyword banana twice.",
+        instruction_id_list=["sentence:keyword", "sentence:keyword"],
+        kwargs=[
+            {"word": "apple", "N": 1},
+            {"word": "banana", "N": 1},
+        ],
+    )
+
+    result = score_response(
+        inp,
+        "hello apple here. banana appears later.",
+        reward_mode="fraction",
+    )
+
+    assert result.follow_instruction_list == [True, False]
+    assert result.reward == 0.5
+
+
+def test_fraction_reward_handles_examples_without_instructions():
+    inp = evaluation_lib.InputExample(
+        key=0,
+        prompt="Say hello.",
+        instruction_id_list=[],
+        kwargs=[],
+    )
+
+    result = score_response(inp, "hello", reward_mode="fraction")
+
+    assert result.follow_all_instructions
+    assert result.follow_instruction_list == []
+    assert result.reward == 1.0
+
+
+def test_write_reward_records_outputs_jsonl(tmp_path):
+    output_path = tmp_path / "rewards.jsonl"
+
+    write_reward_records(
+        input_data=ROOT / "data" / "IFBench_test.jsonl",
+        input_response_data=ROOT / "data" / "sample_output.jsonl",
+        output_path=output_path,
+        evaluation_mode="loose",
+        reward_mode="fraction",
+    )
+
+    first_record = json.loads(output_path.read_text(encoding="utf-8").splitlines()[0])
+    assert "prompt" in first_record
+    assert "response" in first_record
+    assert "reward" in first_record
+    assert "follow_instruction_list" in first_record
+    assert first_record["response"]