From 37dd6e41e51b06eff63341dfcbcde247a315661c Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Fri, 15 May 2026 23:53:52 -0400 Subject: [PATCH 1/9] Add Human Evaluation Metric for Capability 3 --- scripts/human_evaluation/README.md | 131 ++++++ scripts/human_evaluation/task3_eval_ui.py | 462 ++++++++++++++++++++++ 2 files changed, 593 insertions(+) create mode 100644 scripts/human_evaluation/README.md create mode 100644 scripts/human_evaluation/task3_eval_ui.py diff --git a/scripts/human_evaluation/README.md b/scripts/human_evaluation/README.md new file mode 100644 index 0000000..5ef03ef --- /dev/null +++ b/scripts/human_evaluation/README.md @@ -0,0 +1,131 @@ +# 🧪 Query Evaluation UI + +Lightweight Flask UI for **human evaluation of LLM-generated multi-hop queries** with: + +* Side-by-side input + evaluation view +* Keyboard shortcuts ⚡ +* Auto-advance across metrics +* Hover tooltips for scoring + +--- + +# 🚀 Setup & Run + +```bash +pip install flask +python helpers/human_evaluation/task3_eval_ui.py --data task3_data.json --out annotations.json +``` + +Open: + +``` +http://127.0.0.1:5000 +``` + +--- + +# 📂 Data Format & Usage + +### Input JSON (`task3_data.json`) +Get the input file from https://ibm.box.com/s/2b9g7nq1r2h1ll866sebw5cuaf1az2xj + +```json +[ + { + "uuid": "ex-001", + "domain": "movies", + "component_queries": ["..."], + "query": "...", + "gt_tools": [...], + "gt_responses": [...], + "gt_answer": "..." + } +] +``` + +### Annotation Flow + +* Press **1–4** → score metric +* It auto-advances to next metric +* Press **N** → next example +* Press **P** → previous +* Press **T** → jump to notes + +### Output + +Saved to: + +``` +annotations.json +``` +annotations.json saved at the end of every sample. + +Contains scores + notes + timestamps. 
+ +### Metrics + +``` +METRICS = { + "faithfulness": { + "title": "Faithfulness Score", + "desc": "Does the merged query contain only information grounded in the component queries, without introducing unsupported facts?", + "scale": { + 1: "Completely hallucinated: major unsupported entities or relations introduced", + 2: "Harmful hallucination: incorrect info that breaks reasoning", + 3: "Minor hallucination: small issues but mostly correct", + 4: "No hallucination: fully grounded in component queries" + } + }, + "naturalness": { + "title": "Naturalness", + "desc": "Is the merged query fluent, natural, and phrased in a way a human would realistically understand?", + "scale": { + 1: "Completely unnatural or broken", + 2: "Awkward and clearly stitched", + 3: "Mostly natural but slightly awkward", + 4: "Fully natural and fluent" + } + }, + "logical_consistency": { + "title": "Logical Consistency", + "desc": "Is the merged query logically consistent, with no contradictions between its components, and aligned with the reasoning chain implied by intermediate queries?", + "scale": { + 1: "Completely inconsistent and contradictory leading it to be unanswerable", + 2: "Major inconsistency to the level of being misleading", + 3: "Minor inconsistency not affecting answerability", + 4: "Fully consistent" + } + }, + "answer_leakage": { + "title": "Answer Leakage", + "desc": "Does the merged query explicitly or implicitly reveal the answer entity, making the reasoning task trivial? (i.e., Is the answer entity accidentally mentioned in the query, or does the merged query leak any answer to the hop question?)", + "scale": { + 1: "Complete leakage (answer directly present)", + 2: "Strong leakage (very obvious answer)", + 3: "Minor hints present", + 4: "No leakage" + } + }, + "context_sufficiency": { + "title": "Context Sufficiency", + "desc": "Does the merged query contain sufficient context and constraints to be answerable via the intended tool or retrieval pipeline? 
(i.e., Does the merged question have enough context present to be answered as well as does the query have enough context to populate the arguments of the tools?)", + "scale": { + 1: "Not answerable", + 2: "Major missing context", + 3: "Mostly sufficient", + 4: "Fully sufficient" + } + }, + "reasoning_hops": { + "title": "Reasoning Steps / Hops", + "desc": "If you had to solve this query, how many reasoning steps would be required according to you?", + "scale": { + 1: "Single hop", + 2: "Two hops", + 3: "Moderate multi-hop", + 4: "Complex multi-hop", + 5: "Very complex reasoning" + } + } +} +``` diff --git a/scripts/human_evaluation/task3_eval_ui.py b/scripts/human_evaluation/task3_eval_ui.py new file mode 100644 index 0000000..86da970 --- /dev/null +++ b/scripts/human_evaluation/task3_eval_ui.py @@ -0,0 +1,462 @@ +import json +import argparse +from pathlib import Path +from datetime import datetime + +from flask import Flask, request, redirect, render_template_string, url_for + +METRICS = { + "faithfulness": { + "title": "Faithfulness Score", + "desc": "Does the merged query contain only information grounded in the component queries, without introducing unsupported facts?", + "scale": { + 1: "Completely hallucinated: major unsupported entities or relations introduced", + 2: "Harmful hallucination: incorrect info that breaks reasoning", + 3: "Minor hallucination: small issues but mostly correct", + 4: "No hallucination: fully grounded in component queries" + } + }, + "naturalness": { + "title": "Naturalness", + "desc": "Is the merged query fluent, natural, and phrased in a way a human would realistically understand?", + "scale": { + 1: "Completely unnatural or broken", + 2: "Awkward and clearly stitched", + 3: "Mostly natural but slightly awkward", + 4: "Fully natural and fluent" + } + }, + "logical_consistency": { + "title": "Logical Consistency", + "desc": "Is the merged query logically consistent, with no contradictions between its components, and aligned with 
the reasoning chain implied by intermediate queries?", + "scale": { + 1: "Completely inconsistent and contradictory leading it to be unanswerable", + 2: "Major inconsistency to the level of being misleading", + 3: "Minor inconsistency not affecting answerability", + 4: "Fully consistent" + } + }, + "answer_leakage": { + "title": "Answer Leakage", + "desc": "Does the merged query explicitly or implicitly reveal the answer entity, making the reasoning task trivial? (i.e., Is the answer entity accidentally mentioned in the query, or does the merged query leak any answer to the hop question?)", + "scale": { + 1: "Complete leakage (answer directly present)", + 2: "Strong leakage (very obvious answer)", + 3: "Minor hints present", + 4: "No leakage" + } + }, + "context_sufficiency": { + "title": "Context Sufficiency", + "desc": "Does the merged query contain sufficient context and constraints to be answerable via the intended tool or retrieval pipeline? (i.e., Does the merged question have enough context present to be answered as well as does the query have enough context to populate the arguments of the tools?)", + "scale": { + 1: "Not answerable", + 2: "Major missing context", + 3: "Mostly sufficient", + 4: "Fully sufficient" + } + }, + "reasoning_hops": { + "title": "Reasoning Steps / Hops", + "desc": "If you had to solve this query, how many reasoning steps would be required according to you?", + "scale": { + 1: "Single hop", + 2: "Two hops", + 3: "Moderate multi-hop", + 4: "Complex multi-hop", + 5: "Very complex reasoning" + } + } +} + +HTML = """ + + + + Query Evaluation + + + + + +
+ +
+

Input Data

+ +
+ UUID: {{ item.uuid }}
+ Domain: {{ item.domain }} +
+ +
+

Component Queries

+
    + {% for q in item.component_queries %} +
  • {{ q }}
  • + {% endfor %} +
+
+ +
+

Merged Query

+
{{ item.query }}
+
+ +
+

Ground Truth Tools

+
{{ item.gt_tools | tojson(indent=2) }}
+
+ +
+

Tool Responses

+
{{ item.gt_responses | tojson(indent=2) }}
+
+ +
+

Final Answer

+
{{ item.gt_answer }}
+
+
+ +
+

Evaluation

+ +
+ Item {{ idx + 1 }} / {{ total }} | Completed: {{ completed }} +
+ +
+ Hover over numbers for score explanations. Keyboard: 1–5 = score active metric, N = Save & Next, P = Save & Previous. 
+ +
+ + {% for key, metric in metrics.items() %} +
+

{{ loop.index }}. {{ metric.title }}

+
{{ metric.desc }}
+ + {% for score, explanation in metric.scale.items() %} + + {% endfor %} +
+ {% endfor %} + +
+

Notes

+ +
+ + + + + +
+
+ +
+ + + + + +""" + +def load_json(path): + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def save_json(path, data): + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(".tmp") + with open(tmp, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + tmp.replace(path) + + +def make_app(data_path, output_path): + app = Flask(__name__) + + data = load_json(data_path) + + if output_path.exists(): + results = load_json(output_path) + else: + results = {} + + def completed_count(): + return sum(1 for item in data if item["uuid"] in results) + + @app.route("/") + def home(): + return redirect(url_for("annotate", idx=0)) + + @app.route("/item/") + def annotate(idx): + idx = max(0, min(idx, len(data) - 1)) + item = data[idx] + existing = results.get(item["uuid"], {}) + + return render_template_string( + HTML, + item=item, + idx=idx, + total=len(data), + metrics=METRICS, + existing=existing, + completed=completed_count(), + ) + + @app.route("/save/", methods=["POST"]) + def save(idx): + item = data[idx] + uuid = item["uuid"] + + annotation = { + "uuid": uuid, + "domain": item.get("domain"), + "query": item.get("query"), + + "reasoning_hops": int(request.form["reasoning_hops"]), + "faithfulness": int(request.form["faithfulness"]), + "naturalness": int(request.form["naturalness"]), + "logical_consistency": int(request.form["logical_consistency"]), + "answer_leakage": int(request.form["answer_leakage"]), + "context_sufficiency": int(request.form["context_sufficiency"]), + + "notes": request.form.get("notes", ""), + "updated_at": datetime.utcnow().isoformat() + "Z", + } + + quality_keys = [ + "faithfulness", + "naturalness", + "logical_consistency", + "answer_leakage", + "context_sufficiency", + ] + + annotation["mean_quality_score_1_to_4"] = sum( + annotation[k] for k in quality_keys + ) / len(quality_keys) + + results[uuid] = annotation + save_json(output_path, results) + + action = 
request.form.get("action") + if action == "prev": + return redirect(url_for("annotate", idx=max(0, idx - 1))) + if action == "next": + return redirect(url_for("annotate", idx=min(len(data) - 1, idx + 1))) + return redirect(url_for("annotate", idx=idx)) + + return app + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data", required=True, help="Input JSON file") + parser.add_argument("--out", default="annotations.json", help="Output JSON file") + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", default=5000, type=int) + args = parser.parse_args() + + app = make_app(Path(args.data), Path(args.out)) + app.run(host=args.host, port=args.port, debug=True) \ No newline at end of file From bb8266a6620ccf62519ad64837ebc833dfcbbe99 Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Mon, 18 May 2026 12:24:08 -0400 Subject: [PATCH 2/9] Add requirements to setup evaluation UI. --- scripts/human_evaluation/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 scripts/human_evaluation/requirements.txt diff --git a/scripts/human_evaluation/requirements.txt b/scripts/human_evaluation/requirements.txt new file mode 100644 index 0000000..90ce11d --- /dev/null +++ b/scripts/human_evaluation/requirements.txt @@ -0,0 +1 @@ +Flask>=3.0,<4 From d136b6f0776aa7bb27ea74d331365b05f92aa3ff Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Mon, 18 May 2026 12:24:56 -0400 Subject: [PATCH 3/9] Add UI for Task3 & Task 4 evaluation. 
--- scripts/human_evaluation/README.md | 126 ++++-- scripts/human_evaluation/task4_eval_ui.py | 524 ++++++++++++++++++++++ 2 files changed, 619 insertions(+), 31 deletions(-) create mode 100644 scripts/human_evaluation/task4_eval_ui.py diff --git a/scripts/human_evaluation/README.md b/scripts/human_evaluation/README.md index 5ef03ef..93a5e70 100644 --- a/scripts/human_evaluation/README.md +++ b/scripts/human_evaluation/README.md @@ -1,19 +1,21 @@ # 🧪 Query Evaluation UI -Lightweight Flask UI for **human evaluation of LLM-generated multi-hop queries** with: - -* Side-by-side input + evaluation view -* Keyboard shortcuts ⚡ -* Auto-advance across metrics -* Hover tooltips for scoring +Lightweight Flask UI for **human evaluation of LLM-generated multi-hop queries**. --- # 🚀 Setup & Run +### Input JSON (`task_data.json`) +Get the input file from https://ibm.box.com/s/2b9g7nq1r2h1ll866sebw5cuaf1az2xj + +From this folder: + ```bash -pip install flask -python helpers/human_evaluation/task3_eval_ui.py --data task3_data.json --out annotations.json +cd vakra/scripts/human_evaluation +python3 -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +python task3_eval_ui.py --data task_data.json --out task_annotations.json # Task 4: run task4_eval_ui.py instead (serves on port 5001 by default) ``` Open: @@ -22,27 +24,6 @@ Open: http://127.0.0.1:5000 ``` ---- - -# 📂 Data Format & Usage - -### Input JSON (`task3_data.json`) -Get the input file from https://ibm.box.com/s/2b9g7nq1r2h1ll866sebw5cuaf1az2xj - -```json -[ - { - "uuid": "ex-001", - "domain": "movies", - "component_queries": ["..."], - "query": "...", - "gt_tools": [...], - "gt_responses": [...], - "gt_answer": "..." - } -] -``` - ### Annotation Flow * Press **1–4** → score metric @@ -62,9 +43,9 @@ annotations.json saved at the end of every sample. Contains scores + notes + timestamps. 
-### Metrics +### Task 3 Metrics -``` +```python METRICS = { "faithfulness": { "title": "Faithfulness Score", @@ -129,3 +110,86 @@ METRICS = { } } ``` + +### Task 4 Metrics + +```python +METRICS = { + "faithfulness": { + "title": "Faithfulness Score", + "desc": "Does the merged query contain only information grounded in the component queries, without introducing unsupported facts?", + "scale": { + 1: "Completely hallucinated: major unsupported entities or relations", + 2: "Harmful hallucination: incorrect info that breaks reasoning", + 3: "Minor hallucination: small issues but mostly correct", + 4: "No hallucination: fully grounded in component queries" + } + }, + "naturalness": { + "title": "Naturalness", + "desc": "Is the merged query fluent, natural, and phrased in a way a human would understand the ask? (i.e., check grammaticality, fluency, and overall natural phrasing of the merged query)", + "scale": { + 1: "Cannot understand what is being asked at all", + 2: "Highly awkward phrasing; clearly stitched but can understand the ask from the query", + 3: "Slightly awkward phrasing but can understand the ask from the query", + 4: "Fully natural and fluent" + } + }, + "logical_consistency": { + "title": "Logical Consistency", + "desc": ( + "Does the merged query avoid logically incompatible conditions or contradictions? " + "The different parts of the query should be simultaneously satisfiable and should " + "form a valid reasoning chain. For example, a question like " + "'Name a city on Earth which lies above the equator and is in Australia?' " + "contains logically incompatible constraints." 
+ ), + "scale": { + 1: "Completely inconsistent: logically incompatible component queries or conditions that cannot be satisfied together were merged", + 2: "Major inconsistency: the query contains impossible or directly contradictory conditions that make the question invalid or unanswerable", + 3: "Minor inconsistency: the query is mostly logically valid but contains small ambiguities, weak conflicts, or mildly confusing constraints", + 4: "Fully consistent" + } + }, + "answer_leakage": { + "title": "Answer Leakage", + "desc": "Does the merged query explicitly or implicitly reveal the answer entity, making the reasoning task trivial? (i.e., Is the answer entity accidentally mentioned in the query, or does the merged query leak any answer to the hop question?)", + "scale": { + 1: "Complete leakage (answer directly present)", + 2: "Strong leakage (very obvious answer)", + 3: "Minor hints present", + 4: "No leakage" + } + }, + "context_sufficiency": { + "title": "Context Sufficiency", + "desc": "Does the merged query contain sufficient context and constraints to be answerable via the intended tool or retrieval pipeline? 
(i.e., Does the merged question have enough context present to be answered as well as does the query have enough context to populate the arguments of the tools?)", + "scale": { + 1: "Not answerable: entities missing in the merged query or insufficient context to answer the question", + 2: "Major missing context: all entities present but insufficient context significantly hindering answerability", + 3: "Mostly sufficient: minor missing context that does not significantly hinder answerability", + 4: "Fully sufficient" + } + }, + "retrieval_sufficiency": { + "title": "Retrieval Sufficiency Score", + "desc": "Do the ground truth documents have sufficient information to answer the RAG query?", + "scale": { + 1: "GT document have no relevant information", + 2: "GT document have some missing information", + 3: "GT document have some missing information which is common sense knowledge", + 4: "No information missing" + } + }, + "cross_hop_entity_consistency": { + "title": "Cross-Hop Entity Consistency Score", + "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions?", + "scale": { + 1: "Not answerable", + 2: "Majorly missing context / entities cannot be answered without these entities", + 3: "Mostly sufficient have some missing information which is common sense knowledge", + 4: "Fully sufficient" + } + } +} +``` diff --git a/scripts/human_evaluation/task4_eval_ui.py b/scripts/human_evaluation/task4_eval_ui.py new file mode 100644 index 0000000..a3f62f9 --- /dev/null +++ b/scripts/human_evaluation/task4_eval_ui.py @@ -0,0 +1,524 @@ +import argparse +import json +from datetime import datetime +from pathlib import Path + +from flask import Flask, redirect, render_template_string, request, url_for + + +METRICS = { + "faithfulness": { + "title": "Faithfulness Score", + "desc": "Does the merged query contain only information grounded in the component queries, 
without introducing unsupported facts?", + "scale": { + 1: "Completely hallucinated: major unsupported entities or relations", + 2: "Harmful hallucination: incorrect info that breaks reasoning", + 3: "Minor hallucination: small issues but mostly correct", + 4: "No hallucination: fully grounded in component queries" + } + }, + "naturalness": { + "title": "Naturalness", + "desc": "Is the merged query fluent, natural, and phrased in a way a human would understand the ask? (i.e., check grammaticality, fluency, and overall natural phrasing of the merged query)", + "scale": { + 1: "Cannot understand what is being asked at all", + 2: "Highly awkward phrasing; clearly stitched but can understand the ask from the query", + 3: "Slightly awkward phrasing but can understand the ask from the query", + 4: "Fully natural and fluent" + } + }, + "logical_consistency": { + "title": "Logical Consistency", + "desc": ( + "Does the merged query avoid logically incompatible conditions or contradictions? " + "The different parts of the query should be simultaneously satisfiable and should " + "form a valid reasoning chain. For example, a question like " + "'Name a city on Earth which lies above the equator and is in Australia?' " + "contains logically incompatible constraints." + ), + "scale": { + 1: "Completely inconsistent: logically incompatible component queries or conditions that cannot be satisfied together were merged", + 2: "Major inconsistency: the query contains impossible or directly contradictory conditions that make the question invalid or unanswerable", + 3: "Minor inconsistency: the query is mostly logically valid but contains small ambiguities, weak conflicts, or mildly confusing constraints", + 4: "Fully consistent" + } + }, + "answer_leakage": { + "title": "Answer Leakage", + "desc": "Does the merged query explicitly or implicitly reveal the answer entity, making the reasoning task trivial? 
(i.e., Is the answer entity accidentally mentioned in the query, or does the merged query leak any answer to the hop question?)", + "scale": { + 1: "Complete leakage (answer directly present)", + 2: "Strong leakage (very obvious answer)", + 3: "Minor hints present", + 4: "No leakage" + } + }, + "context_sufficiency": { + "title": "Context Sufficiency", + "desc": "Does the merged query contain sufficient context and constraints to be answerable via the intended tool or retrieval pipeline? (i.e., Does the merged question have enough context present to be answered as well as does the query have enough context to populate the arguments of the tools?)", + "scale": { + 1: "Not answerable: entities missing in the merged query or insufficient context to answer the question", + 2: "Major missing context: all entities present but insufficient context significantly hindering answerability", + 3: "Mostly sufficient: minor missing context that does not significantly hinder answerability", + 4: "Fully sufficient" + } + }, + # "reasoning_hops": { + # "title": "Reasoning Steps / Hops", + # "desc": "If you had to solve this query, how many reasoning steps would be required according to you?", + # "scale": { + # 1: "Single step", + # 2: "Two steps", + # 3: "Moderate multi-hop", + # 4: "Complex multi-hop", + # 5: "Very complex reasoning", + # }, + # }, + "retrieval_sufficiency": { + "title": "Retrieval Sufficiency Score", + "desc": "Do the ground truth documents have sufficient information to answer the RAG query?", + "scale": { + 1: "GT document have no relevant information", + 2: "GT document have some missing information", + 3: "GT document have some missing information which is common sense knowledge", + 4: "No information missing", + }, + }, + "cross_hop_entity_consistency": { + "title": "Cross-Hop Entity Consistency Score", + "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents 
or retriever questions?", + "scale": { + 1: "Not answerable", + 2: "Majorly missing context / entities cannot be answered without these entities", + 3: "Mostly sufficient have some missing information which is common sense knowledge", + 4: "Fully sufficient", + }, + }, +} + +QUALITY_KEYS = [ + "faithfulness", + "naturalness", + "logical_consistency", + "answer_leakage", + "context_sufficiency", + "retrieval_sufficiency", + "cross_hop_entity_consistency", +] + +HTML = """ + + + + Task 4 Query Evaluation + + + + + +
+ +
+

Input Data

+ +
+ UUID: {{ item.uuid }}
+ Domain: {{ item.domain }}
+ Type: {{ item.type }}
+ Question Type: {{ item.question_type }} +
+ +
+

Dialogue Context

+ {% if item.dialogue %} + {% for turn in item.dialogue %} +
+ Turn {{ turn.turn_id }} +
{{ turn.query }}
+ {% if turn.answer is defined %} + Answer +
{{ turn.answer | tojson(indent=2) }}
+ {% endif %} +
+ {% endfor %} + {% else %} +
No prior dialogue turns.
+ {% endif %} +
+ +
+

Component Queries

+ {% if item.component_queries %} +
    + {% for q in item.component_queries %} +
  • {{ q }}
  • + {% endfor %} +
+ {% else %} +
No component queries for this sample.
+ {% endif %} +
+ +
+

Final Query

+
{{ item.query }}
+
+ +
+

Ground Truth Tools

+
{{ item.gt_tools | tojson(indent=2) }}
+
+ +
+

Tool Responses / Documents

+
{{ item.gt_responses | tojson(indent=2) }}
+
+ +
+

Final Answer

+
{{ item.gt_answer }}
+
+
+ +
+

Evaluation

+ +
+ Item {{ idx + 1 }} / {{ total }} | Completed: {{ completed }} +
+ +
+ Hover over numbers for score explanations. Keyboard: 1-5 = score active metric, N = Save & Next, P = Save & Previous. +
+ +
+ + {% for key, metric in metrics.items() %} +
+

{{ loop.index }}. {{ metric.title }}

+
{{ metric.desc }}
+ + {% for score, explanation in metric.scale.items() %} + + {% endfor %} +
+ {% endfor %} + +
+

Notes

+ +
+ + + + + +
+
+ +
+ + + + + +""" + + +def load_json(path): + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def save_json(path, data): + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(".tmp") + with open(tmp, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + tmp.replace(path) + + +def make_app(data_path, output_path): + app = Flask(__name__) + + data = load_json(data_path) + + if output_path.exists(): + results = load_json(output_path) + else: + results = {} + + def completed_count(): + return sum(1 for item in data if item["uuid"] in results) + + @app.route("/") + def home(): + return redirect(url_for("annotate", idx=0)) + + @app.route("/item/") + def annotate(idx): + idx = max(0, min(idx, len(data) - 1)) + item = data[idx] + existing = results.get(item["uuid"], {}) + + return render_template_string( + HTML, + item=item, + idx=idx, + total=len(data), + metrics=METRICS, + existing=existing, + completed=completed_count(), + ) + + @app.route("/save/", methods=["POST"]) + def save(idx): + item = data[idx] + uuid = item["uuid"] + + annotation = { + "uuid": uuid, + "domain": item.get("domain"), + "type": item.get("type"), + "question_type": item.get("question_type"), + "query": item.get("query"), + "notes": request.form.get("notes", ""), + "updated_at": datetime.utcnow().isoformat() + "Z", + } + + for key in METRICS: + annotation[key] = int(request.form[key]) + + annotation["mean_quality_score_1_to_4"] = sum( + annotation[k] for k in QUALITY_KEYS + ) / len(QUALITY_KEYS) + + results[uuid] = annotation + save_json(output_path, results) + + action = request.form.get("action") + if action == "prev": + return redirect(url_for("annotate", idx=max(0, idx - 1))) + if action == "next": + return redirect(url_for("annotate", idx=min(len(data) - 1, idx + 1))) + return redirect(url_for("annotate", idx=idx)) + + return app + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + 
parser.add_argument("--data", default="task4_data.json", help="Input JSON file") + parser.add_argument("--out", default="task4_annotations.json", help="Output JSON file") + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", default=5001, type=int) + args = parser.parse_args() + + app = make_app(Path(args.data), Path(args.out)) + app.run(host=args.host, port=args.port, debug=True) From a1f55eb09676c9e188c8a6d585f306fdede8d019 Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Mon, 18 May 2026 12:25:21 -0400 Subject: [PATCH 4/9] Update overlay of Task3 buttons in UI. --- scripts/human_evaluation/task3_eval_ui.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/human_evaluation/task3_eval_ui.py b/scripts/human_evaluation/task3_eval_ui.py index 86da970..5b0914b 100644 --- a/scripts/human_evaluation/task3_eval_ui.py +++ b/scripts/human_evaluation/task3_eval_ui.py @@ -98,10 +98,12 @@ .left { border-right: 2px solid #ddd; background: #ffffff; + z-index: 1; } .right { background: #f9f9f9; + z-index: 2; } .card { @@ -164,8 +166,7 @@ position: absolute; z-index: 9999; top: 140%; - left: 50%; - transform: translateX(-50%); + left: 0; opacity: 0; transition: opacity 0.2s; font-size: 12px; @@ -459,4 +460,4 @@ def save(idx): args = parser.parse_args() app = make_app(Path(args.data), Path(args.out)) - app.run(host=args.host, port=args.port, debug=True) \ No newline at end of file + app.run(host=args.host, port=args.port, debug=True) From e60b6282dea5453c6756394929829cca64f5f6a5 Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Mon, 18 May 2026 12:41:22 -0400 Subject: [PATCH 5/9] Add 0 for Task 4 where RAG queries are misisng. 
--- scripts/human_evaluation/task4_eval_ui.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/human_evaluation/task4_eval_ui.py b/scripts/human_evaluation/task4_eval_ui.py index a3f62f9..0dc017a 100644 --- a/scripts/human_evaluation/task4_eval_ui.py +++ b/scripts/human_evaluation/task4_eval_ui.py @@ -76,18 +76,20 @@ # }, "retrieval_sufficiency": { "title": "Retrieval Sufficiency Score", - "desc": "Do the ground truth documents have sufficient information to answer the RAG query?", + "desc": "Do the ground truth documents have sufficient information to answer the RAG query? (Mark '0' is no RAG component query.)", "scale": { + 0: "Not applicable (e.g., no retrieval needed for this query)", 1: "GT document have no relevant information", 2: "GT document have some missing information", 3: "GT document have some missing information which is common sense knowledge", - 4: "No information missing", + 4: "No information missing" }, }, "cross_hop_entity_consistency": { "title": "Cross-Hop Entity Consistency Score", - "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions?", + "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions? (Mark '0' is no RAG component query.)", "scale": { + 0: "Not applicable (e.g., no retrieval needed for this query)", 1: "Not answerable", 2: "Majorly missing context / entities cannot be answered without these entities", 3: "Mostly sufficient have some missing information which is common sense knowledge", From a966a3a672840e67ec922de12fe60cc1ce8bbe85 Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Mon, 18 May 2026 12:46:15 -0400 Subject: [PATCH 6/9] Add radio button shortcuts to Task 4 UI. 
--- scripts/human_evaluation/README.md | 7 +++++-- scripts/human_evaluation/task4_eval_ui.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/human_evaluation/README.md b/scripts/human_evaluation/README.md index 93a5e70..a7ad9b8 100644 --- a/scripts/human_evaluation/README.md +++ b/scripts/human_evaluation/README.md @@ -27,6 +27,7 @@ http://127.0.0.1:5000 ### Annotation Flow * Press **1–4** → score metric +* Press **0** → mark `retrieval_sufficiency` or `cross_hop_entity_consistency` as not applicable in Task 4 * It auto-advances to next metric * Press **N** → next example * Press **P** → previous @@ -173,8 +174,9 @@ METRICS = { }, "retrieval_sufficiency": { "title": "Retrieval Sufficiency Score", - "desc": "Do the ground truth documents have sufficient information to answer the RAG query?", + "desc": "Do the ground truth documents have sufficient information to answer the RAG query? (Mark '0' is no RAG component query.)", "scale": { + 0: "Not applicable (e.g., no retrieval needed for this query)", 1: "GT document have no relevant information", 2: "GT document have some missing information", 3: "GT document have some missing information which is common sense knowledge", @@ -183,8 +185,9 @@ METRICS = { }, "cross_hop_entity_consistency": { "title": "Cross-Hop Entity Consistency Score", - "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions?", + "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions? 
(Mark '0' is no RAG component query.)", "scale": { + 0: "Not applicable (e.g., no retrieval needed for this query)", 1: "Not answerable", 2: "Majorly missing context / entities cannot be answered without these entities", 3: "Mostly sufficient have some missing information which is common sense knowledge", diff --git a/scripts/human_evaluation/task4_eval_ui.py b/scripts/human_evaluation/task4_eval_ui.py index 0dc017a..ce46ebd 100644 --- a/scripts/human_evaluation/task4_eval_ui.py +++ b/scripts/human_evaluation/task4_eval_ui.py @@ -312,7 +312,7 @@
- Hover over numbers for score explanations. Keyboard: 1-5 = score active metric, N = Save & Next, P = Save & Previous. + Hover over numbers for score explanations. Keyboard: 0-5 = score active metric when available, N = Save & Next, P = Save & Previous.
@@ -401,7 +401,7 @@ return; } - if (["1", "2", "3", "4", "5"].includes(event.key)) { + if (["0", "1", "2", "3", "4", "5"].includes(event.key)) { event.preventDefault(); selectScore(event.key); } From 73037eb3658b6098ff8cee964d94086d7d5f3c79 Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Mon, 18 May 2026 12:55:08 -0400 Subject: [PATCH 7/9] Minor wording updates. --- scripts/human_evaluation/task4_eval_ui.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/human_evaluation/task4_eval_ui.py b/scripts/human_evaluation/task4_eval_ui.py index ce46ebd..690f1c8 100644 --- a/scripts/human_evaluation/task4_eval_ui.py +++ b/scripts/human_evaluation/task4_eval_ui.py @@ -76,7 +76,7 @@ # }, "retrieval_sufficiency": { "title": "Retrieval Sufficiency Score", - "desc": "Do the ground truth documents have sufficient information to answer the RAG query? (Mark '0' is no RAG component query.)", + "desc": "Do the ground truth documents have sufficient information to answer the RAG query? (Mark '0' if no RAG component in query.)", "scale": { 0: "Not applicable (e.g., no retrieval needed for this query)", 1: "GT document have no relevant information", @@ -87,7 +87,7 @@ }, "cross_hop_entity_consistency": { "title": "Cross-Hop Entity Consistency Score", - "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions? (Mark '0' is no RAG component query.)", + "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions? (Mark '0' if no RAG component in query.)", "scale": { 0: "Not applicable (e.g., no retrieval needed for this query)", 1: "Not answerable", From 222e6f557c3579d6da4c89fdc31622d487cc71e4 Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Mon, 18 May 2026 12:57:26 -0400 Subject: [PATCH 8/9] Update METRICS in README. 
--- scripts/human_evaluation/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/human_evaluation/README.md b/scripts/human_evaluation/README.md index a7ad9b8..fa135e3 100644 --- a/scripts/human_evaluation/README.md +++ b/scripts/human_evaluation/README.md @@ -139,12 +139,12 @@ METRICS = { "logical_consistency": { "title": "Logical Consistency", "desc": ( - "Does the merged query avoid logically incompatible conditions or contradictions? " - "The different parts of the query should be simultaneously satisfiable and should " - "form a valid reasoning chain. For example, a question like " - "'Name a city on Earth which lies above the equator and is in Australia?' " - "contains logically incompatible constraints." - ), + "Does the merged query avoid logically incompatible conditions or contradictions? " + "The different parts of the query should be simultaneously satisfiable and should " + "form a valid reasoning chain. For example, a question like " + "'Name a city on Earth which lies above the equator and is in Australia?' " + "contains logically incompatible constraints." + ), "scale": { 1: "Completely inconsistent: logically incompatible component queries or conditions that cannot be satisfied together were merged", 2: "Major inconsistency: the query contains impossible or directly contradictory conditions that make the question invalid or unanswerable", @@ -174,25 +174,25 @@ METRICS = { }, "retrieval_sufficiency": { "title": "Retrieval Sufficiency Score", - "desc": "Do the ground truth documents have sufficient information to answer the RAG query? (Mark '0' is no RAG component query.)", + "desc": "Do the ground truth documents have sufficient information to answer the RAG query? 
(Mark '0' if no RAG component in query.)", "scale": { - 0: "Not applicable (e.g., no retrieval needed for this query)", + 0: "Not applicable (e.g., no retrieval needed for this query)", 1: "GT document have no relevant information", 2: "GT document have some missing information", 3: "GT document have some missing information which is common sense knowledge", 4: "No information missing" - } + }, }, "cross_hop_entity_consistency": { "title": "Cross-Hop Entity Consistency Score", - "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions? (Mark '0' is no RAG component query.)", + "desc": "Are entities required by the arguments of the succeeding or preceding API tool calls correctly inferred and grounded in the retrieved documents or retriever questions? (Mark '0' if no RAG component in query.)", "scale": { 0: "Not applicable (e.g., no retrieval needed for this query)", 1: "Not answerable", 2: "Majorly missing context / entities cannot be answered without these entities", 3: "Mostly sufficient have some missing information which is common sense knowledge", - 4: "Fully sufficient" - } + 4: "Fully sufficient", + }, } } ``` From 34be339d98ad4ea5f798a8f7e70577b15e9b6b52 Mon Sep 17 00:00:00 2001 From: AnkitaNaik Date: Mon, 18 May 2026 13:20:06 -0400 Subject: [PATCH 9/9] Remove question around reasoning hops. 
--- scripts/human_evaluation/task3_eval_ui.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/scripts/human_evaluation/task3_eval_ui.py b/scripts/human_evaluation/task3_eval_ui.py index 5b0914b..afe66b5 100644 --- a/scripts/human_evaluation/task3_eval_ui.py +++ b/scripts/human_evaluation/task3_eval_ui.py @@ -55,17 +55,6 @@ 3: "Mostly sufficient", 4: "Fully sufficient" } - }, - "reasoning_hops": { - "title": "Reasoning Steps / Hops", - "desc": "If you had to solve this query, how many reasoning steps would be required according to you?", - "scale": { - 1: "Single hop", - 2: "Two hops", - 3: "Moderate multi-hop", - 4: "Complex multi-hop", - 5: "Very complex reasoning" - } } } @@ -238,7 +227,7 @@
- Hover over numbers for score explanations. Keyboard: 1–5 = score active metric, N = Save & Next, P = Save & Previous. + Hover over numbers for score explanations. Keyboard: 1-4 = score active metric, N = Save & Next, P = Save & Previous.
@@ -328,7 +317,7 @@ return; } - if (["1", "2", "3", "4", "5"].includes(event.key)) { + if (["1", "2", "3", "4"].includes(event.key)) { event.preventDefault(); selectScore(event.key); } @@ -415,7 +404,6 @@ def save(idx): "domain": item.get("domain"), "query": item.get("query"), - "reasoning_hops": int(request.form["reasoning_hops"]), "faithfulness": int(request.form["faithfulness"]), "naturalness": int(request.form["naturalness"]), "logical_consistency": int(request.form["logical_consistency"]),