From c3ab2d1fcf95da3d6b7651f38224cccf4e614291 Mon Sep 17 00:00:00 2001
From: Catnap7 <36249828+Catnap7@users.noreply.github.com>
Date: Sat, 23 May 2026 03:34:25 +0900
Subject: [PATCH] Add IFBench RLVR reward wrapper

---
 README.md              |  29 ++++++
 evaluation_lib.py      |   6 +-
 rlvr_env.py            | 196 +++++++++++++++++++++++++++++++++++++++++
 tests/test_rlvr_env.py |  63 +++++++++++++
 4 files changed, 291 insertions(+), 3 deletions(-)
 create mode 100644 rlvr_env.py
 create mode 100644 tests/test_rlvr_env.py

diff --git a/README.md b/README.md
index 2810d4f..c05addf 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,35 @@ We also release our IF-RLVR code, as part of [open-instruct](https://github.com/
 
 The new training constraints and verification functions are here: https://github.com/allenai/open-instruct/tree/main/open_instruct/IFEvalG
 
+### Reward records for RLVR loops
+
+For lightweight RLVR integrations, `rlvr_env.py` exposes a per-completion reward
+API around the existing IFBench verifiers:
+
+```python
+from evaluation_lib import read_prompt_list
+from rlvr_env import score_response
+
+example = read_prompt_list("data/IFBench_test.jsonl")[0]
+result = score_response(example, "model completion", reward_mode="fraction")
+print(result.reward, result.follow_instruction_list)
+```
+
+You can also convert a prompt file and response file into reward-labeled JSONL:
+
+```bash
+python -m rlvr_env \
+  --input_data data/IFBench_test.jsonl \
+  --input_response_data data/sample_output.jsonl \
+  --output_path eval/sample_rewards.jsonl \
+  --evaluation_mode loose \
+  --reward_mode fraction
+```
+
+Each output row contains the prompt, response, scalar reward, the overall
+`follow_all_instructions` flag, per-instruction verifier decisions, and instruction IDs.
+`--reward_mode all` gives a binary prompt-level reward; `--reward_mode fraction` gives dense partial credit.
+
 ## 📊 Model Performance Leaderboard
 
 | Rank | Model | IFBench Score | IFEval Score |
diff --git a/evaluation_lib.py b/evaluation_lib.py
index ea99ca9..bf33afc 100644
--- a/evaluation_lib.py
+++ b/evaluation_lib.py
@@ -43,7 +43,7 @@ class OutputExample:
 def read_prompt_list(input_jsonl_filename):
   """Read inputs from jsonl."""
   inputs = []
-  with open(input_jsonl_filename, "r") as f:
+  with open(input_jsonl_filename, "r", encoding="utf-8") as f:
     for l in f:
       example = json.loads(l)
       inputs.append(
@@ -57,7 +57,7 @@ def read_prompt_list(input_jsonl_filename):
 def write_outputs(output_jsonl_filename, outputs):
   """Writes outputs to jsonl."""
   assert outputs
-  with open(output_jsonl_filename, "w") as f:
+  with open(output_jsonl_filename, "w", encoding="utf-8") as f:
     for o in outputs:
       f.write(
           json.dumps(
@@ -169,7 +169,7 @@ def test_instruction_following_loose(
 def read_prompt_to_response_dict(input_jsonl_filename):
   """Creates dictionary matching prompt and response."""
   return_dict = {}
-  with open(input_jsonl_filename, "r") as f:
+  with open(input_jsonl_filename, "r", encoding="utf-8") as f:
     for l in f:
       example = json.loads(l)
       return_dict[example["prompt"]] = example["response"]
diff --git a/rlvr_env.py b/rlvr_env.py
new file mode 100644
index 0000000..0e9ea13
--- /dev/null
+++ b/rlvr_env.py
@@ -0,0 +1,196 @@
+"""Train-ready reward helpers for IFBench / IF-RLVR style loops.
+
+The main evaluator scores a whole response file at once. RLVR pipelines usually
+need a smaller interface: given one prompt and one sampled completion, return a
+verifiable scalar reward plus per-instruction diagnostics. This module keeps
+that wrapper close to the existing IFBench verifier implementation.
+"""
+
+from __future__ import annotations
+
+import argparse
+import dataclasses
+import json
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Literal
+
+import evaluation_lib
+import instructions_registry
+
+
+EvaluationMode = Literal["strict", "loose"]  # exact text vs. relaxed variants
+RewardMode = Literal["all", "fraction"]  # binary reward vs. mean pass rate
+
+
+@dataclasses.dataclass(frozen=True)
+class RewardResult:
+  """Verifier output for one prompt/completion pair."""
+
+  prompt: str  # prompt text the completion answers
+  response: str  # completion that was scored
+  reward: float  # scalar reward in [0.0, 1.0]
+  follow_all_instructions: bool  # all() of the per-instruction verdicts
+  follow_instruction_list: list[bool]  # per-instruction verdicts, in order
+  instruction_id_list: list[str]  # instruction IDs, aligned with the verdicts
+
+
+def _clean_kwargs(kwargs: dict[str, Any]) -> dict[str, Any]:
+  return dict(pair for pair in kwargs.items() if pair[1] is not None)
+
+
+def _response_variants(response: str, mode: EvaluationMode) -> list[str]:
+  """Return the candidate texts to verify for ``response`` under ``mode``.
+
+  ``strict`` yields only the completion itself. Any other mode also yields
+  copies with the first, last, or both outer lines dropped (then stripped),
+  plus an asterisk-free copy of every candidate.
+  """
+  if mode == "strict":
+    return [response]
+
+  rows = response.split("\n")
+  # Slice bounds that drop the first line, the last line, or both.
+  trimmed = [
+      "\n".join(rows[start:stop]).strip()
+      for start, stop in ((1, None), (None, -1), (1, -1))
+  ]
+  unstarred = [text.replace("*", "") for text in trimmed]
+
+  # Ordering: raw, star-free raw, trimmed forms, star-free trimmed forms.
+  return [response, response.replace("*", ""), *trimmed, *unstarred]
+
+
+def score_response(
+    inp: evaluation_lib.InputExample,
+    response: str | None,
+    *,
+    evaluation_mode: EvaluationMode = "strict",
+    reward_mode: RewardMode = "all",
+) -> RewardResult:
+  """Score one sampled completion with IFBench verification functions.
+
+  Args:
+    inp: IFBench input example.
+    response: Model completion for ``inp.prompt``; ``None`` is scored as "".
+    evaluation_mode: ``strict`` uses the exact completion; ``loose`` mirrors the
+      benchmark's loose scoring variants.
+    reward_mode: ``all`` returns 1.0 only when all instructions pass; ``fraction``
+      returns the mean per-instruction pass rate.
+
+  Returns:
+    A ``RewardResult``; with no instructions the reward is 1.0 in both modes.
+
+  Raises:
+    ValueError: If ``evaluation_mode`` or ``reward_mode`` is unsupported.
+  """
+  if evaluation_mode not in ("strict", "loose"):
+    raise ValueError(f"unsupported evaluation_mode: {evaluation_mode}")
+  if reward_mode not in ("all", "fraction"):
+    raise ValueError(f"unsupported reward_mode: {reward_mode}")
+  response = response or ""
+  response_variants = _response_variants(response, evaluation_mode)
+  follow_instruction_list: list[bool] = []
+  for index, instruction_id in enumerate(inp.instruction_id_list):
+    instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+    instruction = instruction_cls(instruction_id)
+    instruction.build_description(**_clean_kwargs(inp.kwargs[index]))
+    args = instruction.get_instruction_args()
+    if args and "prompt" in args:
+      instruction.build_description(prompt=inp.prompt)
+    follow_instruction_list.append(any(
+        variant.strip() and instruction.check_following(variant)
+        for variant in response_variants
+    ))
+
+  follow_all = all(follow_instruction_list)
+  # An empty verdict list would divide by zero, so keep its all([]) reward.
+  reward = float(follow_all)
+  if reward_mode == "fraction" and follow_instruction_list:
+    reward = sum(follow_instruction_list) / len(follow_instruction_list)
+  return RewardResult(
+      prompt=inp.prompt,
+      response=response,
+      reward=reward,
+      follow_all_instructions=follow_all,
+      follow_instruction_list=follow_instruction_list,
+      instruction_id_list=list(inp.instruction_id_list),
+  )
+
+
+def iter_reward_records(
+    inputs: Iterable[evaluation_lib.InputExample],
+    prompt_to_response: dict[str, str],
+    *,
+    evaluation_mode: EvaluationMode = "strict",
+    reward_mode: RewardMode = "all",
+) -> Iterable[dict[str, Any]]:
+  """Lazily score each input and yield its reward record as a plain dict.
+
+  The response is looked up by the exact prompt first and by the stripped
+  prompt second; an input with no match is scored with ``None``.
+  """
+  modes = {"evaluation_mode": evaluation_mode, "reward_mode": reward_mode}
+
+  for example in inputs:
+    completion = prompt_to_response.get(example.prompt)
+    if completion is None:
+      # Fall back to the whitespace-trimmed prompt as the lookup key.
+      completion = prompt_to_response.get(example.prompt.strip())
+    yield dataclasses.asdict(score_response(example, completion, **modes))
+
+
+def write_reward_records(
+    *,
+    input_data: str | Path,
+    input_response_data: str | Path,
+    output_path: str | Path,
+    evaluation_mode: EvaluationMode = "strict",
+    reward_mode: RewardMode = "all",
+) -> None:
+  """Score a prompt file against a response file and write rewards as JSONL.
+
+  Each response is also indexed under its stripped prompt unless that key is
+  already present. Missing parent directories of ``output_path`` are created
+  and every record is written as one UTF-8 JSON line.
+  """
+  examples = evaluation_lib.read_prompt_list(str(input_data))
+  responses = evaluation_lib.read_prompt_to_response_dict(
+      str(input_response_data)
+  )
+  # setdefault keeps exact-prompt entries and the first stripped duplicate.
+  for raw_prompt in tuple(responses):
+    responses.setdefault(raw_prompt.strip(), responses[raw_prompt])
+
+  destination = Path(output_path)
+  destination.parent.mkdir(parents=True, exist_ok=True)
+  modes = {"evaluation_mode": evaluation_mode, "reward_mode": reward_mode}
+  with destination.open("w", encoding="utf-8") as sink:
+    for record in iter_reward_records(examples, responses, **modes):
+      sink.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+
+def main() -> None:
+  """Parse command-line flags and write the reward-labeled JSONL file."""
+  parser = argparse.ArgumentParser(
+      description="Create reward-labeled IFBench JSONL records for RLVR loops."
+  )
+  for flag in ("--input_data", "--input_response_data", "--output_path"):
+    parser.add_argument(flag, required=True)
+  parser.add_argument(
+      "--evaluation_mode", choices=("strict", "loose"), default="strict"
+  )
+  parser.add_argument("--reward_mode", choices=("all", "fraction"), default="all")
+  options = parser.parse_args()
+
+  write_reward_records(
+      input_data=options.input_data,
+      input_response_data=options.input_response_data,
+      output_path=options.output_path,
+      evaluation_mode=options.evaluation_mode,
+      reward_mode=options.reward_mode,
+  )
+
+
+if __name__ == "__main__":  # run the CLI only when this is the main module
+  main()
diff --git a/tests/test_rlvr_env.py b/tests/test_rlvr_env.py
new file mode 100644
index 0000000..fb218f0
--- /dev/null
+++ b/tests/test_rlvr_env.py
@@ -0,0 +1,63 @@
+import json
+from pathlib import Path
+
+import evaluation_lib
+from rlvr_env import score_response
+from rlvr_env import write_reward_records
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+def test_score_response_returns_scalar_reward_and_instruction_diagnostics():
+  """Scoring the first sample pair gives a bounded reward and aligned lists."""
+  data_dir = ROOT / "data"
+  first = evaluation_lib.read_prompt_list(data_dir / "IFBench_test.jsonl")[0]
+  responses = evaluation_lib.read_prompt_to_response_dict(
+      data_dir / "sample_output.jsonl"
+  )
+  scored = score_response(first, responses.get(first.prompt))
+  assert scored.prompt == first.prompt
+  assert 0.0 <= scored.reward <= 1.0
+  assert len(scored.follow_instruction_list) == len(first.instruction_id_list)
+  assert scored.instruction_id_list == first.instruction_id_list
+
+
+def test_fraction_reward_gives_partial_credit_for_multi_instruction_prompts():
+  """One of two keyword instructions passing yields a reward of 0.5."""
+  # NOTE(review): the prompt says "banana twice" while both kwargs use N=1 —
+  # confirm the prompt wording is intended.
+  keyword_kwargs = [
+      {"word": "apple", "N": 1},
+      {"word": "banana", "N": 1},
+  ]
+  example = evaluation_lib.InputExample(
+      key=0,
+      prompt="Say hello. Include keyword apple once and keyword banana twice.",
+      instruction_id_list=["sentence:keyword", "sentence:keyword"],
+      kwargs=keyword_kwargs,
+  )
+  completion = "hello apple here. banana appears later."
+
+  scored = score_response(example, completion, reward_mode="fraction")
+  assert scored.follow_instruction_list == [True, False]
+  assert scored.reward == 0.5
+
+
+def test_write_reward_records_outputs_jsonl(tmp_path):
+  """The written JSONL's first row carries a response and the reward fields."""
+  output_path = tmp_path / "rewards.jsonl"
+  write_reward_records(
+      input_data=ROOT / "data" / "IFBench_test.jsonl",
+      input_response_data=ROOT / "data" / "sample_output.jsonl",
+      output_path=output_path,
+      evaluation_mode="loose",
+      reward_mode="fraction",
+  )
+  # Split on "\n" only: rows are dumped with ensure_ascii=False, so a field may
+  # hold U+2028 or U+0085, which str.splitlines() would treat as row breaks.
+  first_row = output_path.read_text(encoding="utf-8").split("\n")[0]
+  first_record = json.loads(first_row)
+  for field in ("prompt", "response", "reward", "follow_instruction_list"):
+    assert field in first_record
+  assert first_record["response"]